import logging
import os
import os.path
import re
import shutil
import urllib.error
import urllib.request
from time import time
import patoolib
from base.cgraph import MyGraph
from utils import GRAPHS_DIR
FORMAT = 'ij'
TMP_GRAPHS_DIR = GRAPHS_DIR / 'tmp'
netrepo_metadata_path = GRAPHS_DIR / 'netrepo' / 'metadata'
# Graphs used in current session. Need this to avoid loading the same object several times.
current_graphs = {} # full_name -> MyGraph
def reformat_graph_file(path, out_path, ignore_lines_starting_with='#%',
remove_original=False, self_loops=False, renumerate=False):
"""
:param path:
:param out_path:
:param ignore_lines_starting_with: lines starting with these symbols will be ignored
:param remove_original: if True, original file will be removed
:param self_loops: if True, self loops will be removed
:param renumerate: if True, nodes are re-numerated from 0 to N-1
:return:
"""
in_format = None
out_format = FORMAT
renums = {}
separators = ' |\t|,'
assert out_path != path
with open(out_path, 'w') as out_file:
for line in open(path, 'r'):
if line[0] in ignore_lines_starting_with: # Filter comments
continue
line = line.rstrip('\n')
assert line[0].isdigit(), "expected alpha-numeric line: '%s'" % line
if not in_format:
# Define format
items = re.split(separators, line)
in_format = 'ijwt'[:len(items)]
if len(out_format) > len(in_format):
raise Exception("Could not reformat from '%s' to '%s'" % (in_format, out_format))
logging.info("Reformatting %s->%s for '%s' ..." % (in_format, out_format, path))
items = re.split(separators, line)
i, j = items[0], items[1]
if not self_loops and i == j:
continue
if renumerate:
if i not in renums:
renums[i] = len(renums)
if j not in renums:
renums[j] = len(renums)
items[0] = str(renums[i])
items[1] = str(renums[j])
res_line = '%s %s\n' % (items[0], items[1])
out_file.write(res_line)
if remove_original:
os.remove(path)
logging.info("Reformatting finished '%s'." % out_path)
[docs]class GraphCollections:
"""
Manager of graph data.
By calling method `get(graph_full_name)`, it loads graph from file if any or downloads a graph
from online graph collection.
`graph_full_name` is string or tuple ([collection], [subcollection], ... , name) containing at
least one element, the last one is treated as graph name.
Corresponding graph file is stored at `collection/subcollection/../name.format` file.
`networkrepository <http://networkrepository.com/>`_ collection is available.
Example:
>>> graph = GraphCollections.get('konect', 'dolphins')
"""
networkrepository_url_pattern = 'http://nrvis.com/download/data/%s/%s.zip'
[docs] @staticmethod
def get(*full_name, directed=False, giant_only=True, self_loops=False, not_load=False) -> MyGraph:
"""
Read graph from storage or download it from the specified collection. In order to apply
giant_only and self_loops, you need to remove the file manually. #TODO maybe make a rewrite?
:param full_name: string or sequence [collection], [subcollection], ... , name containing at
least one element, the last one is treated as graph name. In case of konect collection, graph
name could be any of e.g. 'CL' or 'Actor collaborations' or 'actor-collaborations'.
:param directed: undirected by default
:param giant_only: giant component instead of full graph. Component extraction is applied
only once when the graph is downloaded.
:param self_loops: self loops are removed by default. Applied only once when the graph is
downloaded.
:param not_load: if True do not load the graph (useful for stats exploring). Note: any graph
modification will lead to segfault
:return: MyGraph object
"""
if isinstance(full_name, str):
full_name = (full_name,) # root data directory
# Check if graph was already loaded
if full_name in current_graphs:
return current_graphs[full_name]
path = GraphCollections._full_name_to_path(*full_name)
if not os.path.exists(path):
# Download graph if absent
temp_path = path + '_tmp'
if len(full_name) == 2 and full_name[0] == 'netrepo':
GraphCollections._download_netrepo(temp_path, netrepo_name_ref_dict[full_name[-1]])
else:
raise FileNotFoundError("File '%s' not found. Check graph name, collection or file existence." % path)
reformat_graph_file(temp_path, path, remove_original=True, self_loops=self_loops)
if giant_only:
# Replace graph by its giant component
logging.info("Extracting giant component ...")
MyGraph(path, full_name, directed, format=FORMAT).giant_component(inplace=True)
logging.info("done.")
my_graph = MyGraph(path, full_name, directed, format=FORMAT, not_load=not_load)
current_graphs[full_name] = my_graph
return my_graph
@staticmethod
def _full_name_to_path(*full_name) -> str:
""" Convert MyGraph full_name into path it should be stored at"""
format = FORMAT
return os.path.join(GRAPHS_DIR, *full_name[:-1], "%s.%s" % (full_name[-1], format))
[docs] @staticmethod
def get_by_path(path: str, not_load=False, store=True) -> MyGraph:
""" Create and load graph from specified file path.
If the path is <GRAPHS_DIR>/a/b/name.ij the full_name will be ('a', 'b', 'name')
"""
from utils import GRAPHS_DIR
assert str(GRAPHS_DIR) in path, "Please, put your graph file to %s" % GRAPHS_DIR
parts = path.split(os.path.sep)[len(GRAPHS_DIR.parts):]
last_dot = parts[-1].rfind('.')
last = parts[-1][:last_dot] if last_dot > 0 else parts[-1]
full_name = tuple(parts[:-1]) + (last,)
my_graph = MyGraph(path, full_name, not_load=not_load)
if store:
current_graphs[full_name] = my_graph
return my_graph
[docs] @staticmethod
def register_new_graph(*full_name) -> MyGraph:
""" Create a new MyGraph object, define its path corresponding to the specified full_name.
NOTE: by default the graph is not loaded, call load() if want to use this object.
:param full_name: string or sequence [collection], [subcollection], ... , name containing
at least one element, the last one is treated as graph name.
:return: new MyGraph
"""
if len(full_name) == 0: # tmp graph, not gonna save
path = str(TMP_GRAPHS_DIR / f"{str(time())}.{FORMAT}")
return GraphCollections.get_by_path(path, not_load=True, store=False)
path = GraphCollections._full_name_to_path(*full_name)
if os.path.exists(path):
raise IOError("Path corresponding to specified graph full_name %s is not free: look at ")
return MyGraph(path=path, full_name=full_name, not_load=True)
@staticmethod
def _download_netrepo(graph_path, url):
"""
Downloads graph data from http://networkrepository.com/networks.php
:param graph_path: full path to edge list file
:param url: URL of graph data
:return:
"""
# 'http://nrvis.com/download/data/eco/eco-florida.zip'
# Convert "url.tar.*" -> "filename.tar"
if (url.rsplit('/', 1)[1]).rsplit('.', 2)[1] == "tar":
archive_name = (url.rsplit('/', 1)[1]).rsplit('.', 1)[0]
else:
archive_name = (url.rsplit('/', 1)[1])
graph_dir = os.path.dirname(graph_path)
if not os.path.exists(graph_dir):
os.makedirs(graph_dir)
# Download archive and extract graph file
logging.info("Downloading graph archive from %s..." % url)
filename = os.path.join(graph_dir, archive_name)
urllib.request.urlretrieve(url, filename=filename)
logging.info("done.")
archive_dir_name = archive_name.split('.', 1)[0]
patoolib.extract_archive(filename, outdir=os.path.join(graph_dir, archive_dir_name))
out_file_name = os.path.join(graph_dir, os.path.basename(graph_path))
# Rename extracted graph file
while True:
# todo are there else formats besides '.edges', '.mtx' ?
try:
netrepo_file_name = os.path.join(graph_dir, archive_dir_name, archive_dir_name + ".edges")
os.rename(netrepo_file_name, out_file_name)
break
except IOError: pass
try:
netrepo_file_name = os.path.join(graph_dir, archive_dir_name, archive_dir_name + ".mtx")
# Remove first two lines (solution from https://stackoverflow.com/a/2329972/8900030)
fro = open(netrepo_file_name, "rb")
fro.readline()
fro.readline()
frw = open(netrepo_file_name, "r+b")
chars = fro.readline()
while chars:
frw.write(chars)
chars = fro.readline()
fro.close()
frw.truncate()
frw.close()
os.rename(netrepo_file_name, out_file_name)
break
except IOError: pass
break
os.remove(os.path.join(graph_dir, archive_name))
shutil.rmtree(os.path.join(graph_dir, archive_dir_name))
def test_netrepo():
for name in ['soc-wiki-Vote', 'socfb-Bingham82']:
g = GraphCollections.get('netrepo', name, directed=False, giant_only=True, self_loops=False)
g = GraphCollections.get('netrepo', name, directed=False, giant_only=True, self_loops=False)
g = GraphCollections.get('netrepo', name, directed=False, giant_only=True, self_loops=False)
print("N=%s E=%s" % (g.nodes(), g.edges()))
class temp_dir(object):
"""
Creates a temporary directory to store some files, which will be removed by exit.
Current working directory is also changed to this directory.
"""
def __init__(self):
self.dir_name = os.path.join(TMP_GRAPHS_DIR, str(time()))
if not os.path.exists(self.dir_name):
os.makedirs(self.dir_name)
def __enter__(self):
os.chdir(self.dir_name)
return self.dir_name
def __exit__(self, type, value, traceback):
shutil.rmtree(self.dir_name)
if __name__ == '__main__':
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)
logging.getLogger().setLevel(logging.DEBUG)
# parse_netrepo_page()
test_netrepo()