| |
| |
|
|
| import itertools |
| import statistics |
| import sys |
|
|
| from graph import Graph |
| from treewidth import quickbb |
|
|
|
|
| class DepthFirstSearch(object): |
|
|
| def __init__(self, graph, undirected=False): |
| self._graph = graph |
| self._undirected = undirected |
|
|
| self._enter = dict() |
| self._leave = dict() |
| self.n_runs = 0 |
|
|
| def compute_timestamps(node, timestamp): |
| self._enter[node] = next(timestamp) |
| for edge in self._graph.find_node(node).outgoing_edges: |
| if not edge.tgt in self._enter: |
| compute_timestamps(edge.tgt, timestamp) |
| if self._undirected: |
| for edge in self._graph.find_node(node).incoming_edges: |
| if not edge.src in self._enter: |
| compute_timestamps(edge.src, timestamp) |
| self._leave[node] = next(timestamp) |
| timestamp = itertools.count() |
| for node in self._graph.nodes: |
| if not node.id in self._enter: |
| compute_timestamps(node.id, timestamp) |
| self.n_runs += 1 |
|
|
| def is_back_edge(self, edge): |
| return \ |
| self._enter[edge.tgt] < self._enter[edge.src] and \ |
| self._leave[edge.src] < self._leave[edge.tgt] |
|
|
|
|
| class InspectedGraph(object): |
|
|
| def __init__(self, graph): |
| self.graph = graph |
| self.n_nodes = len(graph.nodes) |
| self.dfs = DepthFirstSearch(graph) |
| self.undirected_dfs = DepthFirstSearch(graph, undirected=True) |
|
|
| def n_root_nodes(self): |
| return sum(1 for node in self.graph.nodes if node.is_root()) |
|
|
| def n_leaf_nodes(self): |
| return sum(1 for node in self.graph.nodes if node.is_leaf()) |
|
|
| def n_top_nodes(self): |
| return sum(1 for node in self.graph.nodes if node.is_top()) |
|
|
| def n_singleton_nodes(self): |
| return sum(1 for node in self.graph.nodes if node.is_singleton()) |
|
|
| def n_loops(self): |
| return sum(1 for edge in self.graph.edges if edge.is_loop()) |
|
|
| def n_components(self): |
| return self.undirected_dfs.n_runs - self.n_singleton_nodes() |
|
|
| def is_cyclic(self): |
| for edge in self.graph.edges: |
| if edge.is_loop() or self.dfs.is_back_edge(edge): |
| return True |
| return False |
|
|
| def is_forest(self): |
| if self.is_cyclic(): |
| return False |
| else: |
| for node in self.graph.nodes: |
| if len(node.incoming_edges) > 1: |
| return False |
| return True |
|
|
| def is_tree(self): |
| return self.is_forest() and self.n_components() == 1 |
|
|
| def treewidth(self): |
| n_nodes = len(self.graph.nodes) - self.n_singleton_nodes() |
| if n_nodes <= 1: |
| return 1 |
| else: |
| undirected_graph = {} |
| for node in self.graph.nodes: |
| if not node.is_singleton(): |
| undirected_graph[node.id] = set() |
| for edge in self.graph.edges: |
| if not edge.is_loop(): |
| undirected_graph[edge.src].add(edge.tgt) |
| undirected_graph[edge.tgt].add(edge.src) |
| decomposition = quickbb(undirected_graph) |
| return max(1, max(len(u)-1 for u in decomposition)) |
|
|
| def _crossing_pairs(self): |
| def endpoints(edge): |
| return (min(edge.src, edge.tgt), max(edge.src, edge.tgt)) |
| for edge1 in self.graph.edges: |
| min1, max1 = endpoints(edge1) |
| for edge2 in self.graph.edges: |
| min2, max2 = endpoints(edge2) |
| if min1 < min2 and min2 < max1 and max1 < max2: |
| yield (min1, max1), (min2, max2) |
|
|
| def _crossing_edges(self): |
| crossing_edges = set() |
| for edge1, edge2 in self._crossing_pairs(): |
| crossing_edges.add(edge1) |
| crossing_edges.add(edge2) |
| return crossing_edges |
|
|
| def is_noncrossing(self): |
| for _, _ in self._crossing_pairs(): |
| return False |
| return True |
|
|
| def is_page2(self): |
| crossing_graph = {u: set() for u in self._crossing_edges()} |
| for edge1, edge2 in self._crossing_pairs(): |
| crossing_graph[edge1].add(edge2) |
| crossing_graph[edge2].add(edge1) |
|
|
| |
| colors = {} |
|
|
| def inner(node, color1, color2): |
| colors[node] = color1 |
| for neighbour in crossing_graph[node]: |
| if neighbour in colors: |
| if colors[neighbour] == color1: |
| return False |
| else: |
| inner(neighbour, color2, color1) |
| return True |
|
|
| for node in crossing_graph: |
| if node not in colors: |
| if not inner(node, 0, 1): |
| return False |
| return True |
|
|
| def density(self): |
| n_nodes = len(self.graph.nodes) - self.n_singleton_nodes() |
| if n_nodes <= 1: |
| return 1 |
| else: |
| n_edges = 0 |
| for edge in self.graph.edges: |
| if edge.src != edge.tgt: |
| n_edges += 1 |
| return n_edges / (n_nodes - 1) |
|
|
|
|
| PROPERTY_COUNTER = itertools.count(1) |
|
|
|
|
| def report(msg, val): |
| print("(%02d)\t%s\t%s" % (next(PROPERTY_COUNTER), msg, val)) |
|
|
|
|
| def analyze(graphs, ids=None): |
| ordered = False |
| n_graphs = 0 |
| n_graphs_noncrossing = 0 |
| n_graphs_has_top_node = 0 |
| n_graphs_multirooted = 0 |
| n_nodes = 0 |
| n_nodes_with_reentrancies = 0 |
| n_singletons = 0 |
| n_top_nodes = 0 |
| n_edges = 0 |
| n_labels = 0; |
| n_properties = 0; |
| n_anchors = 0; |
| n_attributes = 0; |
| n_loops = 0 |
| labels = set() |
| non_functional_labels = set() |
| n_cyclic = 0 |
| n_connected = 0 |
| n_forests = 0 |
| n_trees = 0 |
| n_graphs_page2 = 0 |
| acc_treewidth = 0 |
| n_roots_nontop = 0 |
| acc_density = 0.0 |
| max_treewidth = 0 |
| acc_edge_length = 0 |
| n_treewidth_one = 0 |
| treewidths = [] |
| for graph in graphs: |
| if ids and not graph.id in ids: |
| continue |
|
|
| n_graphs += 1 |
| n_nodes += len(graph.nodes) |
| n_edges += len(graph.edges) |
|
|
| for node in graph.nodes: |
| if node.label is not None: n_labels += 1; |
| if node.properties is not None and node.values is not None: |
| n_properties += len(node.properties); |
| if node.anchors is not None: n_anchors += 1; |
| for edge in graph.edges: |
| if edge.attributes is not None and edge.values is not None: |
| n_attributes += len(edge.attributes); |
|
|
| inspected_graph = InspectedGraph(graph) |
|
|
| treewidth = inspected_graph.treewidth() |
|
|
| n_trees += inspected_graph.is_tree() |
| acc_density += inspected_graph.density() |
|
|
| has_reentrancies = False |
| has_top_node = False |
|
|
| n_loops += inspected_graph.n_loops() |
|
|
| for edge in graph.edges: |
| if edge.lab is not None: labels.add(edge.lab) |
| for node in graph.nodes: |
| n_top_nodes += node.is_top |
| if node.is_top: |
| has_top_node = True |
| n_singletons += node.is_singleton() |
| if len(node.incoming_edges) > 1: |
| n_nodes_with_reentrancies += 1 |
| has_reentrancies = True |
| outgoing_labels = set() |
| for edge in node.outgoing_edges: |
| if edge.lab in outgoing_labels: |
| non_functional_labels.add(edge.lab) |
| else: |
| outgoing_labels.add(edge.lab) |
| if not node.is_singleton() and node.is_root() and not node.is_top: |
| n_roots_nontop += 1 |
|
|
| n_cyclic += inspected_graph.is_cyclic() |
| n_connected += inspected_graph.n_components() == 1 |
| n_forests += inspected_graph.is_forest() |
| acc_treewidth += treewidth |
| max_treewidth = max(max_treewidth, treewidth) |
| n_treewidth_one += treewidth == 1 |
| treewidths.append(treewidth) |
|
|
| if graph.flavor == 0: |
| ordered = True |
| n_graphs_noncrossing += inspected_graph.is_noncrossing() |
| n_graphs_page2 += inspected_graph.is_page2() |
| acc_edge_length += sum(edge.length() for edge in graph.edges) |
| else: |
| if ordered: |
| print( |
| "analyzer.py: cannot mix graphs of different flavors in one file; exit.", file=sys.stderr) |
| sys.exit(1) |
|
|
| n_graphs_has_top_node += has_top_node |
| n_graphs_multirooted += inspected_graph.n_root_nodes() > 1 |
|
|
| n_nonsingletons = n_nodes - n_singletons |
|
|
| report("number of graphs", "%d" % n_graphs) |
| report("number of nodes", "%d" % n_nodes) |
| n_tuples = n_top_nodes + n_labels + n_properties + n_anchors + n_edges + n_attributes; |
| if n_tuples > 0: |
| report("number of tops (percentage)", |
| "{:d} ({:.2f})".format(n_top_nodes, 100 * n_top_nodes / n_tuples)); |
| report("number of node labels (percentage)", |
| "{:d} ({:.2f})".format(n_labels, 100 * n_labels / n_tuples)); |
| report("number of node properties (percentage)", |
| "{:d} ({:.2f})".format(n_properties, 100 * n_properties / n_tuples)); |
| report("number of node anchors (percentage)", |
| "{:d} ({:.2f})".format(n_anchors, 100 * n_anchors / n_tuples)); |
| report("number of edges (percentage)", |
| "{:d} ({:.2f})".format(n_edges, 100 * n_edges / n_tuples)); |
| report("number of edge attributes (percentage)", |
| "{:d} ({:.2f})".format(n_attributes, 100 * n_attributes / n_tuples)); |
| report("number of edge labels", "%d" % len(labels)) |
| |
| |
| report("\\percentgraph\\ trees", "%.2f" % (100 * n_trees / n_graphs)) |
| report("\\percentgraph\\ treewidth one", "%.2f" % |
| (100 * n_treewidth_one / n_graphs)) |
| report("average treewidth", "%.3f" % (acc_treewidth / n_graphs)) |
| |
| report("maximal treewidth", "%d" % max_treewidth) |
| |
| report("average edge density", "%.3f" % (acc_density / n_graphs)) |
| report("\\percentnode\\ reentrant", "%.2f" % |
| (100 * n_nodes_with_reentrancies / n_nonsingletons)) |
| |
| |
| |
| |
| |
| report("\\percentgraph\\ cyclic", "%.2f" % (100 * n_cyclic / n_graphs)) |
| |
| report("\\percentgraph\\ not connected", "%.2f" % |
| (100 * (n_graphs - n_connected) / n_graphs)) |
| |
| |
| report("\\percentgraph\\ multi-rooted", "%.2f" % |
| (100 * n_graphs_multirooted / n_graphs)) |
| report("percentage of non-top roots", "%.2f" % |
| (100 * n_roots_nontop / n_nonsingletons)) |
| if ordered: |
| report("average edge length", "%.3f" % (acc_edge_length / n_edges)) |
| report("\\percentgraph\\ noncrossing", "%.2f" % |
| (100 * n_graphs_noncrossing / n_graphs)) |
| report("\\percentgraph\\ pagenumber two", "%.2f" % |
| (100 * n_graphs_page2 / n_graphs)) |
| else: |
| report("average edge length", "--") |
| report("\\percentgraph\\ noncrossing", "--") |
| report("\\percentgraph\\ pagenumber two", "--") |
|
|
|
|
| def read_ids(file_name): |
| ids = set() |
| with open(file_name) as fp: |
| for line in fp: |
| ids.add(line.rstrip()) |
| return ids |
|
|
|
|
| def read_tokens(file_name): |
| with open(file_name) as fp: |
| for line in fp: |
| yield line.split() |
|
|
|
|
| def analyze_cmd(read_function, ordered=False): |
| import sys |
| ids = None |
| tokens = None |
| for arg in sys.argv[2:]: |
| x, y = tuple(arg.split(':')) |
| if x == 'ids': |
| print("Reading whitelisted IDs from %s" % y, file=sys.stderr) |
| ids = read_ids(y) |
| if x == 'tokens': |
| print("Reading tokens from %s" % y, file=sys.stderr) |
| tokens = read_tokens(y) |
| with open(sys.argv[1]) as fp: |
| analyze(read_function(fp), ordered=ordered, ids=ids, tokens=tokens) |
|
|