import argparse from rich import print import glob import os import re import shutil import subprocess from typing import Dict, List, Tuple, Iterable, Set import random from iv2_utils.iv2 import json_read, json_write import tempfile def ensure_dir(p: str): os.makedirs(p, exist_ok=True) def read_edgelist(path: str) -> Iterable[Tuple[int, int]]: with open(path, 'r') as f: for line in f: s = line.strip() if not s or s.startswith('#'): continue parts = s.split() if len(parts) < 2: continue try: u = int(parts[0]); v = int(parts[1]) except ValueError: continue if u == v: continue a, b = (u, v) if u < v else (v, u) yield a, b def write_edgelist(path: str, edges: Iterable[Tuple[int, int]]): with open(path, 'w') as f: for u, v in edges: f.write(f"{u} {v}\n") def parse_seeds(path: str) -> Tuple[Dict[int, int], List[int]]: """ Return (node_to_cluster_index, sorted_cluster_ids). The cluster indices are 0..C-1, sorted by cluster_id. - On overlapping membership, choose the cluster with higher 'score', then smaller cluster_id. """ js = json_read(path) clusters = js.get('clusters', []) # Sort clusters by cluster_id for stable indexing clusters_sorted = sorted(clusters, key=lambda c: c.get('cluster_id', 0)) cluster_id_list = [c.get('cluster_id', i) for i, c in enumerate(clusters_sorted)] cluster_id_to_idx = {cid: i for i, cid in enumerate(cluster_id_list)} # Build node->(best_cluster_idx, best_score, best_cid) node_choice: Dict[int, Tuple[int, float, int]] = {} for c in clusters_sorted: cid = c.get('cluster_id', None) if cid is None: continue idx = cluster_id_to_idx[cid] members = c.get('members') if members is None: members = c.get('seed_nodes', []) score = float(c.get('score', 0.0)) for u in members: prev = node_choice.get(u, None) if prev is None or (score > prev[1]) or (score == prev[1] and cid < prev[2]): node_choice[u] = (idx, score, cid) node_to_cluster = {u: idx for u, (idx, score, cid) in node_choice.items()} return node_to_cluster, cluster_id_list def coarsen_edgelist(prev_edgelist: str, seeds_json: str, out_edgelist: str) -> int: node_to_cluster, cluster_id_list = parse_seeds(seeds_json) edges_set: Set[Tuple[int, int]] = set() missing_nodes = 0 for u, v in read_edgelist(prev_edgelist): cu = node_to_cluster.get(u, None) cv = node_to_cluster.get(v, None) if cu is None or cv is None: # If a node isn't present in any cluster JSON, skip or count as missing missing_nodes += 1 continue if cu == cv: continue a, b = (cu, cv) if cu < cv else (cv, cu) edges_set.add((a, b)) write_edgelist(out_edgelist, sorted(edges_set)) return missing_nodes def run_java( java_exec: str, class_name: str, edgelist_path: str, out_json_path: str, epsilon: str, java_opts: List[str], quiet: bool = False, ) -> None: java_file = class_name + '.java' compile_cmd = ['javac', '-cp', '.', java_file] if not quiet: print(f"[blue]Compiling:[/blue] {' '.join(compile_cmd)}") try: _ = subprocess.run( compile_cmd, check=True, capture_output=True, text=True, encoding='utf-8' ) except FileNotFoundError: if not quiet: print("[red]Error: `javac` command not found. Is JDK installed and in your PATH?[/red]") raise except subprocess.CalledProcessError as e: if not quiet: print(f"[red]Java compilation failed. Return code: {e.returncode}[/red]") print("[red]stdout:[/red]\n" + e.stdout) print("[red]stderr:[/red]\n" + e.stderr) raise cmd = [java_exec] + java_opts + [class_name, edgelist_path, out_json_path, epsilon] if not quiet: print("[blue]Running:[/blue]", " ".join(cmd)) run_kwargs = {'check': True} if quiet: run_kwargs['stdout'] = subprocess.DEVNULL run_kwargs['stderr'] = subprocess.DEVNULL subprocess.run(cmd, **run_kwargs) def generate_lrmc_cluster(edges: List[Tuple[int, int]], epsilon: float, java_exec: str = 'java', java_opts: List[str] = None, quiet: bool = True) -> Dict: """ Generates a single cluster from an edge list using the L-RMC algorithm. Args: edges: A list of 0-indexed integer tuples representing the graph edges. epsilon: The epsilon value for the L-RMC algorithm. java_exec: Path to the java executable. java_opts: A list of options for the java executable. quiet: If True, suppress stdout from this function and subprocesses. Returns: A dictionary containing the 'seed_nodes' and 'score' of the found cluster. Node IDs in 'seed_nodes' are 0-indexed. """ class_name = 'LRMCGenerateSingleCluster' # Create temporary files for communication with the Java subprocess with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt', encoding='utf-8') as edge_file, \ tempfile.NamedTemporaryFile(mode='r', delete=False, suffix='.json', encoding='utf-8') as json_file: edge_file_path = edge_file.name json_file_path = json_file.name try: # Write 0-indexed edgelist for the Java code to consume with open(edge_file_path, 'w', encoding='utf-8') as f: for u, v in edges: f.write(f"{u} {v}\n") # Run the Java process which reads the edge file and writes the JSON output run_java(java_exec, class_name, edge_file_path, json_file_path, str(epsilon), java_opts or [], quiet=quiet) # Read and parse the JSON output from the temporary file json_output = json_read(json_file_path) finally: # Ensure temporary files are cleaned up os.remove(edge_file_path) os.remove(json_file_path) # Extract results from the first cluster cluster = json_output['clusters'][0] # The Java code returns 1-based node IDs, so convert them to 0-based seed_nodes_1_based = cluster['seed_nodes'] seed_nodes_0_based = [node - 1 for node in seed_nodes_1_based] return { "seed_nodes": seed_nodes_0_based, "score": cluster['score'] } def generate_random_seeds(edgelist_path: str, num_nodes: int, out_json_path: str): """Generates a seeds file with a single cluster of randomly selected nodes.""" nodes = set() for u, v in read_edgelist(edgelist_path): nodes.add(u) nodes.add(v) if num_nodes > len(nodes): print(f"[yellow]Warning: requested {num_nodes} random nodes, but only {len(nodes)} unique nodes exist. Selecting all nodes.[/yellow]") selected_nodes = list(nodes) else: selected_nodes = random.sample(list(nodes), num_nodes) # The downstream script expects 1-indexed nodes under "seed_nodes" selected_nodes_1_indexed = [n + 1 for n in selected_nodes] seed_data = { "clusters": [ { "cluster_id": 1, "seed_nodes": selected_nodes_1_indexed, "score": 1.0 } ] } json_write(seed_data, out_json_path) print(f"[green]Generated random seeds with {len(selected_nodes)} nodes at {out_json_path}[/green]") def build_single_graph_levels(args): if args.baseline == 'random': if not args.num_nodes: raise SystemExit("--num_nodes is required for --baseline random") stage_dir = os.path.join(args.out_dir, "stage0_rand") ensure_dir(stage_dir) if args.levels != 1: print("[yellow]Warning: For random baseline, only one level is generated. Ignoring --levels setting.[/yellow]") seeds_out = os.path.join(stage_dir, f"seeds_{args.num_nodes}.json") generate_random_seeds(args.input_edgelist, args.num_nodes, seeds_out) return ensure_dir(args.out_dir) # Stage 0 edgelist is the input; optionally copy for record stage0_dir = os.path.join(args.out_dir, "stage0") ensure_dir(stage0_dir) e0_copy = os.path.join(stage0_dir, "edgelist_0.txt") if args.copy_inputs: shutil.copyfile(args.input_edgelist, e0_copy) prev_edgelist = args.input_edgelist for lvl in range(args.levels): stage_dir = os.path.join(args.out_dir, f"stage{lvl}") ensure_dir(stage_dir) seeds_out = os.path.join(stage_dir, "seeds_"+str(args.epsilon)+".json") # Run Java to produce seeds at this level run_java(args.java, args.class_name, prev_edgelist, seeds_out, args.epsilon, args.java_opts) # Prepare next-level edgelist (unless last level) if lvl < args.levels - 1: next_stage_dir = os.path.join(args.out_dir, f"stage{lvl+1}") ensure_dir(next_stage_dir) next_edgelist = os.path.join(next_stage_dir, f"edgelist_{lvl+1}.txt") missing = coarsen_edgelist(prev_edgelist, seeds_out, next_edgelist) if missing > 0: print(f"[yellow]stage{lvl}: {missing} edges had nodes missing from seeds; skipped.[/yellow]") prev_edgelist = next_edgelist def build_multigraph_levels(args): ensure_dir(args.out_dir) # Enumerate graph files graph_files = sorted(glob.glob(os.path.join(args.graphs_dir, args.glob))) if not graph_files: raise SystemExit(f"No graph files found in {args.graphs_dir} with pattern {args.glob}") pattern = re.compile(r'(.*?)(\d+)(\.\w+)$') # capture numeric id def graph_id_from_path(p: str) -> str: base = os.path.basename(p) m = pattern.match(base) if m: return m.group(2).zfill(6) # zero-pad to 6 for consistency # fallback: strip extension stem = os.path.splitext(base)[0] m2 = re.search(r'(\d+)$', stem) return (m2.group(1).zfill(6) if m2 else stem) # Stage 0: run Java for each graph prev_stage_edgelists: Dict[str, str] = {} for lvl in range(args.levels): stage_dir = os.path.join(args.out_dir, f"stage{lvl}") ensure_dir(stage_dir) if lvl == 0: for gpath in graph_files: gid = graph_id_from_path(gpath) seeds_out = os.path.join(stage_dir, f"graph_{gid}.json") run_java(args.java, args.class_name, gpath, seeds_out, args.epsilon, args.java_opts) prev_stage_edgelists[gid] = gpath else: # For each graph, coarsen previous edgelist using previous seeds, then run Java for gpath in graph_files: gid = graph_id_from_path(gpath) prev_edgelist = prev_stage_edgelists[gid] prev_seeds = os.path.join(args.out_dir, f"stage{lvl-1}", f"graph_{gid}.json") next_edgelist = os.path.join(stage_dir, f"graph_{gid}.txt") missing = coarsen_edgelist(prev_edgelist, prev_seeds, next_edgelist) if missing > 0: print(f"[yellow]stage{lvl-1} graph_{gid}: {missing} edges had nodes missing from seeds; skipped.[/yellow]") seeds_out = os.path.join(stage_dir, f"graph_{gid}.json") run_java(args.java, args.class_name, next_edgelist, seeds_out, args.epsilon, args.java_opts) prev_stage_edgelists[gid] = next_edgelist def main(): ap = argparse.ArgumentParser(description="Build LRMC seeds across multiple levels by invoking the Java LRMC tool and coarsening between levels.") mode = ap.add_mutually_exclusive_group(required=True) mode.add_argument('--input_edgelist', type=str, help='Single-graph mode: path to original edgelist.txt') mode.add_argument('--graphs_dir', type=str, help='Multi-graph mode: directory containing per-graph edgelist files (e.g., graph_000000.txt)') ap.add_argument('--glob', type=str, default='graph_*.txt', help='Multi-graph mode: glob pattern for graph files (default: graph_*.txt)') ap.add_argument('--out_dir', type=str, required=True, help='Output directory (stages will be created here)') ap.add_argument('--levels', type=int, required=True, help='Number of levels to build (e.g., 3)') # Baseline arguments ap.add_argument('--baseline', type=str, choices=['random'], help='Use a baseline method for seed generation.') ap.add_argument('--num_nodes', type=int, help='Number of random nodes to select for random baseline.') # Java settings ap.add_argument('--java', type=str, default='java', help='Java executable (default: java)') ap.add_argument('--class_name', type=str, default='LRMCGenerateSingleCluster', help='Fully qualified Java class name') ap.add_argument('--epsilon', type=str, default='1e6', help='Epsilon argument for the Java tool (default: 1e6)') ap.add_argument('--java_opts', type=str, default='', help='Extra options for java (e.g., "-Xmx16g -cp my.jar")') ap.add_argument('--copy_inputs', action='store_true', help='Copy original edgelist under stage0 for record (single-graph mode)') args = ap.parse_args() # Parse java_opts into a list if provided args.java_opts = args.java_opts.split() if args.java_opts else [] if args.input_edgelist: build_single_graph_levels(args) else: if args.baseline == 'random': raise SystemExit("--baseline random is only supported in single-graph mode (--input_edgelist)") build_multigraph_levels(args) class Args: def __init__(self): self.input_edgelist = None self.graphs_dir = None self.glob = 'graph_*.txt' self.out_dir = '' self.levels = 0 self.java = 'java' self.class_name = 'LRMCGenerateSingleCluster' self.epsilon = '1e6' self.java_opts = [] self.copy_inputs = False self.baseline = None self.num_nodes = None def build_lrmc_single_graph(input_edgelist: str, out_dir: str, levels: int, epsilon: str = '1e6', java_exec: str = 'java', class_name: str = 'LRMCGenerateSingleCluster', java_opts: List[str] = None, copy_inputs: bool = False) -> str: """ Build LRMC levels for a single graph. Args: input_edgelist: Path to input edgelist file out_dir: Output directory levels: Number of levels to build epsilon: Epsilon value for Java tool java_exec: Java executable path class_name: Fully qualified Java class name java_opts: Extra options for Java copy_inputs: Whether to copy input edgelist to stage0 Returns: Path to the generated seeds JSON file """ args = Args() args.input_edgelist = input_edgelist args.out_dir = out_dir args.levels = levels args.epsilon = epsilon args.java = java_exec args.class_name = class_name args.java_opts = java_opts or [] args.copy_inputs = copy_inputs build_single_graph_levels(args) # Return path to the seeds file from stage0 return os.path.join(out_dir, "stage0", f"seeds_{epsilon}.json") if __name__ == '__main__': main()