File size: 8,539 Bytes
bf620c6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import glob
import json
import os
import re
import shutil
import subprocess
from collections import defaultdict
from typing import Dict, List, Tuple, Iterable, Set
def ensure_dir(p: str):
    """Create directory *p* (including parents); a no-op if it already exists."""
    os.makedirs(p, exist_ok=True)
def read_edgelist(path: str) -> Iterable[Tuple[int, int]]:
    """Yield normalized undirected edges (min, max) from a whitespace edge list.

    Skips blank lines, '#' comments, rows with fewer than two fields,
    non-integer endpoints, and self-loops.
    """
    with open(path, 'r') as fh:
        for raw in fh:
            text = raw.strip()
            # Ignore blanks and comment lines.
            if not text or text.startswith('#'):
                continue
            fields = text.split()
            if len(fields) < 2:
                continue
            try:
                src, dst = int(fields[0]), int(fields[1])
            except ValueError:
                # Non-numeric endpoints are silently dropped.
                continue
            if src != dst:
                # Canonical order: smaller endpoint first.
                yield (src, dst) if src < dst else (dst, src)
def write_edgelist(path: str, edges: Iterable[Tuple[int, int]]):
    """Write *edges* to *path*, one 'u v' pair per line."""
    with open(path, 'w') as fh:
        fh.writelines(f"{a} {b}\n" for a, b in edges)
def parse_seeds(path: str) -> Tuple[Dict[int, int], List[int]]:
    """
    Return (node_to_cluster_index, sorted_cluster_ids).
    The cluster indices are 0..C-1, sorted by cluster_id.
    - On overlapping membership, choose the cluster with higher 'score', then smaller cluster_id.
    """
    with open(path, 'r') as fh:
        payload = json.load(fh)
    # Stable indexing: clusters ordered by their cluster_id.
    ordered = sorted(payload.get('clusters', []), key=lambda c: c.get('cluster_id', 0))
    cluster_ids = [c.get('cluster_id', pos) for pos, c in enumerate(ordered)]
    id_to_index = {cid: pos for pos, cid in enumerate(cluster_ids)}
    # node -> (winning_cluster_index, winning_score, winning_cluster_id)
    best: Dict[int, Tuple[int, float, int]] = {}
    for cluster in ordered:
        cid = cluster.get('cluster_id')
        if cid is None:
            # A cluster without an id cannot be indexed; skip its members.
            continue
        pos = id_to_index[cid]
        weight = float(cluster.get('score', 0.0))
        for node in cluster.get('members', []):
            current = best.get(node)
            # Higher score wins; ties broken by the smaller cluster_id.
            if current is None or weight > current[1] or (weight == current[1] and cid < current[2]):
                best[node] = (pos, weight, cid)
    return {node: pos for node, (pos, _w, _c) in best.items()}, cluster_ids
def coarsen_edgelist(prev_edgelist: str, seeds_json: str, out_edgelist: str) -> int:
    """Project an edge list onto cluster indices from *seeds_json*.

    Writes the deduplicated, sorted coarse edge list to *out_edgelist* and
    returns the number of edges skipped because an endpoint was absent
    from every cluster in the seeds file.
    """
    membership, _cluster_ids = parse_seeds(seeds_json)
    coarse: Set[Tuple[int, int]] = set()
    skipped = 0
    for u, v in read_edgelist(prev_edgelist):
        cu = membership.get(u)
        cv = membership.get(v)
        if cu is None or cv is None:
            # Endpoint not covered by any cluster: drop the edge, count it.
            skipped += 1
            continue
        if cu != cv:
            # Intra-cluster edges collapse away; keep canonical order.
            coarse.add((min(cu, cv), max(cu, cv)))
    write_edgelist(out_edgelist, sorted(coarse))
    return skipped
def run_java(java_exec: str, class_name: str, edgelist_path: str, out_json_path: str,
             epsilon: str, java_opts: List[str]) -> None:
    """Invoke the Java LRMC tool on one edge list, writing seeds to *out_json_path*.

    Echoes the command line, then raises subprocess.CalledProcessError if the
    process exits non-zero (check=True).
    """
    argv = [java_exec, *java_opts, class_name, edgelist_path, out_json_path, epsilon]
    print("[run]", " ".join(argv))
    subprocess.run(argv, check=True)
def build_single_graph_levels(args):
    """Single-graph pipeline: run the Java tool once per level, coarsening in between.

    Each level lvl gets out_dir/stage{lvl}/seeds.json; the edge list for
    level lvl+1 is the level-lvl edge list collapsed by its seeds.
    """
    ensure_dir(args.out_dir)
    stage0_dir = os.path.join(args.out_dir, "stage0")
    ensure_dir(stage0_dir)
    if args.copy_inputs:
        # Keep a copy of the original input under stage0 for provenance.
        shutil.copyfile(args.input_edgelist, os.path.join(stage0_dir, "edgelist_0.txt"))
    current_edgelist = args.input_edgelist
    for lvl in range(args.levels):
        stage_dir = os.path.join(args.out_dir, f"stage{lvl}")
        ensure_dir(stage_dir)
        seeds_path = os.path.join(stage_dir, "seeds.json")
        # Produce the seeds for this level from the current edge list.
        run_java(args.java, args.class_name, current_edgelist, seeds_path, args.epsilon, args.java_opts)
        if lvl == args.levels - 1:
            break  # last level: nothing left to coarsen
        next_dir = os.path.join(args.out_dir, f"stage{lvl+1}")
        ensure_dir(next_dir)
        next_edgelist = os.path.join(next_dir, f"edgelist_{lvl+1}.txt")
        dropped = coarsen_edgelist(current_edgelist, seeds_path, next_edgelist)
        if dropped > 0:
            print(f"[warn] stage{lvl}: {dropped} edges had nodes missing from seeds; skipped.")
        current_edgelist = next_edgelist
def build_multigraph_levels(args):
    """Multi-graph pipeline: run the Java tool per graph per level.

    Stage 0 consumes the raw graph files; each later stage first coarsens a
    graph's previous edge list using that graph's previous seeds, then runs
    the Java tool on the coarsened edge list.
    """
    ensure_dir(args.out_dir)
    graph_files = sorted(glob.glob(os.path.join(args.graphs_dir, args.glob)))
    if not graph_files:
        raise SystemExit(f"No graph files found in {args.graphs_dir} with pattern {args.glob}")
    numbered = re.compile(r'(.*?)(\d+)(\.\w+)$')  # capture numeric id

    def graph_id_from_path(p: str) -> str:
        # Prefer the numeric id embedded before the extension, zero-padded to 6.
        base = os.path.basename(p)
        hit = numbered.match(base)
        if hit:
            return hit.group(2).zfill(6)
        # Fallback: trailing digits of the stem, else the stem itself.
        stem = os.path.splitext(base)[0]
        tail = re.search(r'(\d+)$', stem)
        return tail.group(1).zfill(6) if tail else stem

    # gid -> edge list used as input at the most recent completed level.
    latest_edgelist: Dict[str, str] = {}
    for lvl in range(args.levels):
        stage_dir = os.path.join(args.out_dir, f"stage{lvl}")
        ensure_dir(stage_dir)
        for gpath in graph_files:
            gid = graph_id_from_path(gpath)
            seeds_out = os.path.join(stage_dir, f"graph_{gid}.json")
            if lvl == 0:
                source = gpath
            else:
                prior_seeds = os.path.join(args.out_dir, f"stage{lvl-1}", f"graph_{gid}.json")
                source = os.path.join(stage_dir, f"graph_{gid}.txt")
                dropped = coarsen_edgelist(latest_edgelist[gid], prior_seeds, source)
                if dropped > 0:
                    print(f"[warn] stage{lvl-1} graph_{gid}: {dropped} edges had nodes missing from seeds; skipped.")
            run_java(args.java, args.class_name, source, seeds_out, args.epsilon, args.java_opts)
            latest_edgelist[gid] = source
def main():
    """CLI entry point: parse arguments and dispatch to the selected mode."""
    parser = argparse.ArgumentParser(description="Build LRMC seeds across multiple levels by invoking the Java LRMC tool and coarsening between levels.")
    # Exactly one of the two input modes must be chosen.
    mode = parser.add_mutually_exclusive_group(required=True)
    mode.add_argument('--input_edgelist', type=str, help='Single-graph mode: path to original edgelist.txt')
    mode.add_argument('--graphs_dir', type=str, help='Multi-graph mode: directory containing per-graph edgelist files (e.g., graph_000000.txt)')
    parser.add_argument('--glob', type=str, default='graph_*.txt', help='Multi-graph mode: glob pattern for graph files (default: graph_*.txt)')
    parser.add_argument('--out_dir', type=str, required=True, help='Output directory (stages will be created here)')
    parser.add_argument('--levels', type=int, required=True, help='Number of levels to build (e.g., 3)')
    # Java settings
    parser.add_argument('--java', type=str, default='java', help='Java executable (default: java)')
    parser.add_argument('--class_name', type=str, default='LRMCGenerateSingleCluster', help='Fully qualified Java class name')
    parser.add_argument('--epsilon', type=str, default='1e6', help='Epsilon argument for the Java tool (default: 1e6)')
    parser.add_argument('--java_opts', type=str, default='', help='Extra options for java (e.g., "-Xmx16g -cp my.jar")')
    parser.add_argument('--copy_inputs', action='store_true', help='Copy original edgelist under stage0 for record (single-graph mode)')
    args = parser.parse_args()
    # Split the java_opts string into argv tokens (empty string -> no extra opts).
    args.java_opts = args.java_opts.split() if args.java_opts else []
    if args.input_edgelist:
        build_single_graph_levels(args)
    else:
        build_multigraph_levels(args)


if __name__ == '__main__':
    main()
|