|
|
import argparse |
|
|
from rich import print |
|
|
import glob |
|
|
import os |
|
|
import re |
|
|
import shutil |
|
|
import subprocess |
|
|
from typing import Dict, List, Tuple, Iterable, Set |
|
|
import random |
|
|
from iv2_utils.iv2 import json_read, json_write |
|
|
import tempfile |
|
|
|
|
|
def ensure_dir(p: str):
    """Create directory *p*, including missing parents; no-op if it already exists."""
    os.makedirs(p, exist_ok=True)
|
|
|
|
|
def read_edgelist(path: str) -> Iterable[Tuple[int, int]]:
    """Yield undirected edges from a whitespace-separated edge list file.

    Each yielded edge is normalized as ``(min, max)``. Blank lines, ``#``
    comment lines, rows with fewer than two fields, non-integer rows, and
    self-loops are all skipped silently.
    """
    with open(path, 'r') as fh:
        for raw in fh:
            text = raw.strip()
            if not text or text.startswith('#'):
                continue
            fields = text.split()
            if len(fields) < 2:
                continue
            try:
                u, v = int(fields[0]), int(fields[1])
            except ValueError:
                continue
            if u == v:
                continue
            yield (u, v) if u < v else (v, u)
|
|
|
|
|
def write_edgelist(path: str, edges: Iterable[Tuple[int, int]]):
    """Write one ``"u v"`` pair per line to *path*, overwriting any existing file."""
    lines = [f"{u} {v}\n" for u, v in edges]
    with open(path, 'w') as fh:
        fh.writelines(lines)
|
|
|
|
|
def parse_seeds(path: str) -> Tuple[Dict[int, int], List[int]]:
    """
    Return (node_to_cluster_index, sorted_cluster_ids).
    The cluster indices are 0..C-1, sorted by cluster_id.
    - On overlapping membership, choose the cluster with higher 'score', then smaller cluster_id.
    """
    data = json_read(path)
    clusters = data.get('clusters', [])

    ordered = sorted(clusters, key=lambda c: c.get('cluster_id', 0))
    cluster_ids = [c.get('cluster_id', pos) for pos, c in enumerate(ordered)]
    idx_of = {cid: pos for pos, cid in enumerate(cluster_ids)}

    # node -> (cluster_index, score, cluster_id) of the best assignment so far.
    best: Dict[int, Tuple[int, float, int]] = {}

    for c in ordered:
        cid = c.get('cluster_id', None)
        if cid is None:
            # NOTE(review): a cluster without a cluster_id still occupies a
            # positional slot in cluster_ids above, but contributes no members.
            continue
        pos = idx_of[cid]
        members = c.get('members')
        if members is None:
            # Fall back to seed_nodes when explicit membership is absent.
            members = c.get('seed_nodes', [])
        score = float(c.get('score', 0.0))
        for node in members:
            current = best.get(node, None)
            if current is None or score > current[1] or (score == current[1] and cid < current[2]):
                best[node] = (pos, score, cid)

    node_to_cluster = {node: pos for node, (pos, _score, _cid) in best.items()}
    return node_to_cluster, cluster_ids
|
|
|
|
|
def coarsen_edgelist(prev_edgelist: str, seeds_json: str, out_edgelist: str) -> int:
    """Collapse an edgelist onto the cluster indices defined by *seeds_json*.

    Writes the deduplicated, sorted coarse edges to *out_edgelist*. Edges whose
    endpoints map to the same cluster are dropped; edges with an endpoint that
    has no cluster assignment are counted and skipped. Returns that count.
    """
    node_to_cluster, _cluster_ids = parse_seeds(seeds_json)
    coarse: Set[Tuple[int, int]] = set()
    dropped = 0

    for u, v in read_edgelist(prev_edgelist):
        cu = node_to_cluster.get(u)
        cv = node_to_cluster.get(v)
        if cu is None or cv is None:
            dropped += 1
            continue
        if cu == cv:
            continue
        coarse.add((cu, cv) if cu < cv else (cv, cu))

    write_edgelist(out_edgelist, sorted(coarse))
    return dropped
|
|
|
|
|
def run_java(
    java_exec: str,
    class_name: str,
    edgelist_path: str,
    out_json_path: str,
    epsilon: str,
    java_opts: List[str],
    quiet: bool = False,
) -> None:
    """Compile ``<class_name>.java`` with javac, then execute the class.

    The Java program receives (edgelist_path, out_json_path, epsilon) as its
    argv. Raises FileNotFoundError when javac is not on PATH, and
    subprocess.CalledProcessError when either javac or java exits non-zero.
    When *quiet* is True, all diagnostic output is suppressed.
    """
    source_file = class_name + '.java'
    compile_cmd = ['javac', '-cp', '.', source_file]
    if not quiet:
        print(f"[blue]Compiling:[/blue] {' '.join(compile_cmd)}")
    try:
        subprocess.run(
            compile_cmd, check=True, capture_output=True, text=True, encoding='utf-8'
        )
    except FileNotFoundError:
        if not quiet:
            print("[red]Error: `javac` command not found. Is JDK installed and in your PATH?[/red]")
        raise
    except subprocess.CalledProcessError as err:
        if not quiet:
            print(f"[red]Java compilation failed. Return code: {err.returncode}[/red]")
            print("[red]stdout:[/red]\n" + err.stdout)
            print("[red]stderr:[/red]\n" + err.stderr)
        raise

    run_cmd = [java_exec, *java_opts, class_name, edgelist_path, out_json_path, epsilon]
    if not quiet:
        print("[blue]Running:[/blue]", " ".join(run_cmd))

    # Discard the tool's own output when running quietly.
    extra_kwargs = {'check': True}
    if quiet:
        extra_kwargs['stdout'] = subprocess.DEVNULL
        extra_kwargs['stderr'] = subprocess.DEVNULL
    subprocess.run(run_cmd, **extra_kwargs)
|
|
|
|
|
def generate_lrmc_cluster(edges: List[Tuple[int, int]], epsilon: float, java_exec: str = 'java', java_opts: List[str] = None, quiet: bool = True) -> Dict:
    """
    Generates a single cluster from an edge list using the L-RMC algorithm.

    Args:
        edges: A list of 0-indexed integer tuples representing the graph edges.
        epsilon: The epsilon value for the L-RMC algorithm.
        java_exec: Path to the java executable.
        java_opts: A list of options for the java executable.
        quiet: If True, suppress stdout from this function and subprocesses.

    Returns:
        A dictionary containing the 'seed_nodes' and 'score' of the found cluster.
        Node IDs in 'seed_nodes' are 0-indexed.
    """
    class_name = 'LRMCGenerateSingleCluster'

    # Create the named temp files and close the handles immediately so the
    # paths can be reopened by this process and by the Java subprocess.
    # Keeping delete=False NamedTemporaryFile handles open while reopening the
    # file by name fails on Windows; closing first is harmless elsewhere.
    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt', encoding='utf-8') as edge_file:
        edge_file_path = edge_file.name
    with tempfile.NamedTemporaryFile(mode='r', delete=False, suffix='.json', encoding='utf-8') as json_file:
        json_file_path = json_file.name

    try:
        # Write the edge list in the "u v" format the Java tool consumes.
        with open(edge_file_path, 'w', encoding='utf-8') as f:
            for u, v in edges:
                f.write(f"{u} {v}\n")

        run_java(java_exec, class_name, edge_file_path, json_file_path, str(epsilon), java_opts or [], quiet=quiet)

        json_output = json_read(json_file_path)
    finally:
        # Always clean up both temp files, even if compilation/run fails.
        os.remove(edge_file_path)
        os.remove(json_file_path)

    cluster = json_output['clusters'][0]

    # The tool emits 1-based node IDs; convert back to the caller's 0-based IDs.
    seed_nodes_1_based = cluster['seed_nodes']
    seed_nodes_0_based = [node - 1 for node in seed_nodes_1_based]

    return {
        "seed_nodes": seed_nodes_0_based,
        "score": cluster['score']
    }
|
|
|
|
|
|
|
|
def generate_random_seeds(edgelist_path: str, num_nodes: int, out_json_path: str):
    """Generates a seeds file with a single cluster of randomly selected nodes."""
    nodes = set()
    for a, b in read_edgelist(edgelist_path):
        nodes.update((a, b))

    if num_nodes > len(nodes):
        print(f"[yellow]Warning: requested {num_nodes} random nodes, but only {len(nodes)} unique nodes exist. Selecting all nodes.[/yellow]")
        selected_nodes = list(nodes)
    else:
        selected_nodes = random.sample(list(nodes), num_nodes)

    # The seeds JSON format uses 1-indexed node IDs.
    one_indexed = [n + 1 for n in selected_nodes]

    seed_data = {
        "clusters": [
            {
                "cluster_id": 1,
                "seed_nodes": one_indexed,
                "score": 1.0
            }
        ]
    }

    json_write(seed_data, out_json_path)
    print(f"[green]Generated random seeds with {len(selected_nodes)} nodes at {out_json_path}[/green]")
|
|
|
|
|
def build_single_graph_levels(args):
    """Build LRMC seed levels for a single input graph.

    For the 'random' baseline, a single stage of random seeds is generated and
    the function returns early. Otherwise the Java tool is run once per level,
    and between levels the graph is coarsened using that level's seeds.
    """
    if args.baseline == 'random':
        if not args.num_nodes:
            raise SystemExit("--num_nodes is required for --baseline random")

        rand_dir = os.path.join(args.out_dir, "stage0_rand")
        ensure_dir(rand_dir)

        if args.levels != 1:
            print("[yellow]Warning: For random baseline, only one level is generated. Ignoring --levels setting.[/yellow]")

        seeds_out = os.path.join(rand_dir, f"seeds_{args.num_nodes}.json")
        generate_random_seeds(args.input_edgelist, args.num_nodes, seeds_out)
        return

    ensure_dir(args.out_dir)

    stage0_dir = os.path.join(args.out_dir, "stage0")
    ensure_dir(stage0_dir)
    if args.copy_inputs:
        # Keep a copy of the original input alongside the stage0 outputs.
        shutil.copyfile(args.input_edgelist, os.path.join(stage0_dir, "edgelist_0.txt"))

    prev_edgelist = args.input_edgelist
    for lvl in range(args.levels):
        stage_dir = os.path.join(args.out_dir, f"stage{lvl}")
        ensure_dir(stage_dir)
        seeds_out = os.path.join(stage_dir, "seeds_" + str(args.epsilon) + ".json")

        run_java(args.java, args.class_name, prev_edgelist, seeds_out, args.epsilon, args.java_opts)

        # Coarsen for the next level, except after the final level.
        if lvl < args.levels - 1:
            next_stage_dir = os.path.join(args.out_dir, f"stage{lvl+1}")
            ensure_dir(next_stage_dir)
            next_edgelist = os.path.join(next_stage_dir, f"edgelist_{lvl+1}.txt")
            missing = coarsen_edgelist(prev_edgelist, seeds_out, next_edgelist)
            if missing > 0:
                print(f"[yellow]stage{lvl}: {missing} edges had nodes missing from seeds; skipped.[/yellow]")
            prev_edgelist = next_edgelist
|
|
|
|
|
def build_multigraph_levels(args):
    """Build LRMC seed levels for every graph matched in ``args.graphs_dir``.

    Level 0 runs the Java tool directly on each input graph; each later level
    first coarsens every graph with the previous level's seeds, then runs the
    tool on the coarsened edgelist.
    """
    ensure_dir(args.out_dir)

    graph_files = sorted(glob.glob(os.path.join(args.graphs_dir, args.glob)))
    if not graph_files:
        raise SystemExit(f"No graph files found in {args.graphs_dir} with pattern {args.glob}")

    pattern = re.compile(r'(.*?)(\d+)(\.\w+)$')

    def graph_id_from_path(p: str) -> str:
        # Prefer the digit run captured by the filename pattern, zero-padded to
        # six chars; fall back to trailing digits on the stem, then the stem.
        base = os.path.basename(p)
        m = pattern.match(base)
        if m:
            return m.group(2).zfill(6)
        stem = os.path.splitext(base)[0]
        m2 = re.search(r'(\d+)$', stem)
        return (m2.group(1).zfill(6) if m2 else stem)

    # graph id -> edgelist path produced by (or used as input to) the previous level.
    prev_stage_edgelists: Dict[str, str] = {}
    for lvl in range(args.levels):
        stage_dir = os.path.join(args.out_dir, f"stage{lvl}")
        ensure_dir(stage_dir)

        for gpath in graph_files:
            gid = graph_id_from_path(gpath)
            seeds_out = os.path.join(stage_dir, f"graph_{gid}.json")

            if lvl == 0:
                run_java(args.java, args.class_name, gpath, seeds_out, args.epsilon, args.java_opts)
                prev_stage_edgelists[gid] = gpath
            else:
                prev_edgelist = prev_stage_edgelists[gid]
                prev_seeds = os.path.join(args.out_dir, f"stage{lvl-1}", f"graph_{gid}.json")
                next_edgelist = os.path.join(stage_dir, f"graph_{gid}.txt")
                missing = coarsen_edgelist(prev_edgelist, prev_seeds, next_edgelist)
                if missing > 0:
                    print(f"[yellow]stage{lvl-1} graph_{gid}: {missing} edges had nodes missing from seeds; skipped.[/yellow]")

                run_java(args.java, args.class_name, next_edgelist, seeds_out, args.epsilon, args.java_opts)
                prev_stage_edgelists[gid] = next_edgelist
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments and dispatch to single- or multi-graph builds."""
    ap = argparse.ArgumentParser(description="Build LRMC seeds across multiple levels by invoking the Java LRMC tool and coarsening between levels.")
    # Exactly one input mode must be chosen.
    mode = ap.add_mutually_exclusive_group(required=True)
    mode.add_argument('--input_edgelist', type=str, help='Single-graph mode: path to original edgelist.txt')
    mode.add_argument('--graphs_dir', type=str, help='Multi-graph mode: directory containing per-graph edgelist files (e.g., graph_000000.txt)')
    ap.add_argument('--glob', type=str, default='graph_*.txt', help='Multi-graph mode: glob pattern for graph files (default: graph_*.txt)')
    ap.add_argument('--out_dir', type=str, required=True, help='Output directory (stages will be created here)')
    ap.add_argument('--levels', type=int, required=True, help='Number of levels to build (e.g., 3)')

    # Baseline options (random seeds instead of the LRMC tool).
    ap.add_argument('--baseline', type=str, choices=['random'], help='Use a baseline method for seed generation.')
    ap.add_argument('--num_nodes', type=int, help='Number of random nodes to select for random baseline.')

    # Java invocation options.
    ap.add_argument('--java', type=str, default='java', help='Java executable (default: java)')
    ap.add_argument('--class_name', type=str, default='LRMCGenerateSingleCluster', help='Fully qualified Java class name')
    ap.add_argument('--epsilon', type=str, default='1e6', help='Epsilon argument for the Java tool (default: 1e6)')
    ap.add_argument('--java_opts', type=str, default='', help='Extra options for java (e.g., "-Xmx16g -cp my.jar")')
    ap.add_argument('--copy_inputs', action='store_true', help='Copy original edgelist under stage0 for record (single-graph mode)')
    args = ap.parse_args()

    # Split the raw option string into an argv-style list for subprocess.
    args.java_opts = args.java_opts.split() if args.java_opts else []

    if args.input_edgelist:
        build_single_graph_levels(args)
    else:
        # Random baseline only makes sense for a single graph.
        if args.baseline == 'random':
            raise SystemExit("--baseline random is only supported in single-graph mode (--input_edgelist)")
        build_multigraph_levels(args)
|
|
|
|
|
class Args:
    """Plain namespace mirroring main()'s CLI options, for programmatic callers."""

    def __init__(self):
        # Input mode: callers set exactly one of these.
        self.input_edgelist = None
        self.graphs_dir = None
        self.glob = 'graph_*.txt'
        # Output / level configuration.
        self.out_dir = ''
        self.levels = 0
        # Baseline (random seed) options.
        self.baseline = None
        self.num_nodes = None
        # Java invocation settings.
        self.java = 'java'
        self.class_name = 'LRMCGenerateSingleCluster'
        self.epsilon = '1e6'
        self.java_opts = []
        self.copy_inputs = False
|
|
|
|
|
def build_lrmc_single_graph(input_edgelist: str, out_dir: str, levels: int, epsilon: str = '1e6',
                            java_exec: str = 'java', class_name: str = 'LRMCGenerateSingleCluster',
                            java_opts: List[str] = None, copy_inputs: bool = False) -> str:
    """
    Build LRMC levels for a single graph.

    Args:
        input_edgelist: Path to input edgelist file
        out_dir: Output directory
        levels: Number of levels to build
        epsilon: Epsilon value for Java tool
        java_exec: Java executable path
        class_name: Fully qualified Java class name
        java_opts: Extra options for Java
        copy_inputs: Whether to copy input edgelist to stage0

    Returns:
        Path to the generated seeds JSON file (stage0)
    """
    # Assemble an Args namespace and delegate to the level builder.
    cfg = Args()
    cfg.input_edgelist = input_edgelist
    cfg.out_dir = out_dir
    cfg.levels = levels
    cfg.epsilon = epsilon
    cfg.java = java_exec
    cfg.class_name = class_name
    cfg.java_opts = java_opts or []
    cfg.copy_inputs = copy_inputs

    build_single_graph_levels(cfg)

    return os.path.join(out_dir, "stage0", f"seeds_{epsilon}.json")
|
|
|
|
|
# Run the CLI only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|
|
|