File size: 8,539 Bytes
bf620c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import glob
import json
import os
import re
import shutil
import subprocess
from collections import defaultdict
from typing import Dict, List, Tuple, Iterable, Set

def ensure_dir(p: str):
    os.makedirs(p, exist_ok=True)

def read_edgelist(path: str) -> Iterable[Tuple[int, int]]:
    with open(path, 'r') as f:
        for line in f:
            s = line.strip()
            if not s or s.startswith('#'):
                continue
            parts = s.split()
            if len(parts) < 2:
                continue
            try:
                u = int(parts[0]); v = int(parts[1])
            except ValueError:
                continue
            if u == v:
                continue
            a, b = (u, v) if u < v else (v, u)
            yield a, b

def write_edgelist(path: str, edges: Iterable[Tuple[int, int]]):
    with open(path, 'w') as f:
        for u, v in edges:
            f.write(f"{u} {v}\n")

def parse_seeds(path: str) -> Tuple[Dict[int, int], List[int]]:
    """
    Return (node_to_cluster_index, sorted_cluster_ids).
    The cluster indices are 0..C-1, sorted by cluster_id.
    - On overlapping membership, choose the cluster with higher 'score', then smaller cluster_id.
    """
    with open(path, 'r') as f:
        js = json.load(f)
    clusters = js.get('clusters', [])
    # Sort clusters by cluster_id for stable indexing
    clusters_sorted = sorted(clusters, key=lambda c: c.get('cluster_id', 0))
    cluster_id_list = [c.get('cluster_id', i) for i, c in enumerate(clusters_sorted)]
    cluster_id_to_idx = {cid: i for i, cid in enumerate(cluster_id_list)}

    # Build node->(best_cluster_idx, best_score, best_cid)
    node_choice: Dict[int, Tuple[int, float, int]] = {}

    for c in clusters_sorted:
        cid = c.get('cluster_id', None)
        if cid is None:
            continue
        idx = cluster_id_to_idx[cid]
        members = c.get('members', [])
        score = float(c.get('score', 0.0))
        for u in members:
            prev = node_choice.get(u, None)
            if prev is None or (score > prev[1]) or (score == prev[1] and cid < prev[2]):
                node_choice[u] = (idx, score, cid)

    node_to_cluster = {u: idx for u, (idx, score, cid) in node_choice.items()}
    return node_to_cluster, cluster_id_list

def coarsen_edgelist(prev_edgelist: str, seeds_json: str, out_edgelist: str) -> int:
    node_to_cluster, cluster_id_list = parse_seeds(seeds_json)
    edges_set: Set[Tuple[int, int]] = set()
    missing_nodes = 0
    for u, v in read_edgelist(prev_edgelist):
        cu = node_to_cluster.get(u, None)
        cv = node_to_cluster.get(v, None)
        if cu is None or cv is None:
            # If a node isn't present in any cluster JSON, skip or count as missing
            missing_nodes += 1
            continue
        if cu == cv:
            continue
        a, b = (cu, cv) if cu < cv else (cv, cu)
        edges_set.add((a, b))

    write_edgelist(out_edgelist, sorted(edges_set))
    return missing_nodes

def run_java(java_exec: str, class_name: str, edgelist_path: str, out_json_path: str,
              epsilon: str, java_opts: List[str]) -> None:
    cmd = [java_exec] + java_opts + [class_name, edgelist_path, out_json_path, epsilon]
    print("[run]", " ".join(cmd))
    subprocess.run(cmd, check=True)

def build_single_graph_levels(args):
    ensure_dir(args.out_dir)
    # Stage 0 edgelist is the input; optionally copy for record
    stage0_dir = os.path.join(args.out_dir, "stage0")
    ensure_dir(stage0_dir)
    e0_copy = os.path.join(stage0_dir, "edgelist_0.txt")
    if args.copy_inputs:
        shutil.copyfile(args.input_edgelist, e0_copy)

    prev_edgelist = args.input_edgelist
    for lvl in range(args.levels):
        stage_dir = os.path.join(args.out_dir, f"stage{lvl}")
        ensure_dir(stage_dir)
        seeds_out = os.path.join(stage_dir, "seeds.json")
        # Run Java to produce seeds at this level
        run_java(args.java, args.class_name, prev_edgelist, seeds_out, args.epsilon, args.java_opts)

        # Prepare next-level edgelist (unless last level)
        if lvl < args.levels - 1:
            next_stage_dir = os.path.join(args.out_dir, f"stage{lvl+1}")
            ensure_dir(next_stage_dir)
            next_edgelist = os.path.join(next_stage_dir, f"edgelist_{lvl+1}.txt")
            missing = coarsen_edgelist(prev_edgelist, seeds_out, next_edgelist)
            if missing > 0:
                print(f"[warn] stage{lvl}: {missing} edges had nodes missing from seeds; skipped.")
            prev_edgelist = next_edgelist

def build_multigraph_levels(args):
    ensure_dir(args.out_dir)
    # Enumerate graph files
    graph_files = sorted(glob.glob(os.path.join(args.graphs_dir, args.glob)))
    if not graph_files:
        raise SystemExit(f"No graph files found in {args.graphs_dir} with pattern {args.glob}")

    pattern = re.compile(r'(.*?)(\d+)(\.\w+)$')  # capture numeric id
    def graph_id_from_path(p: str) -> str:
        base = os.path.basename(p)
        m = pattern.match(base)
        if m:
            return m.group(2).zfill(6)  # zero-pad to 6 for consistency
        # fallback: strip extension
        stem = os.path.splitext(base)[0]
        m2 = re.search(r'(\d+)$', stem)
        return (m2.group(1).zfill(6) if m2 else stem)

    # Stage 0: run Java for each graph
    prev_stage_edgelists: Dict[str, str] = {}
    for lvl in range(args.levels):
        stage_dir = os.path.join(args.out_dir, f"stage{lvl}")
        ensure_dir(stage_dir)

        if lvl == 0:
            for gpath in graph_files:
                gid = graph_id_from_path(gpath)
                seeds_out = os.path.join(stage_dir, f"graph_{gid}.json")
                run_java(args.java, args.class_name, gpath, seeds_out, args.epsilon, args.java_opts)
                prev_stage_edgelists[gid] = gpath
        else:
            # For each graph, coarsen previous edgelist using previous seeds, then run Java
            for gpath in graph_files:
                gid = graph_id_from_path(gpath)
                prev_edgelist = prev_stage_edgelists[gid]
                prev_seeds = os.path.join(args.out_dir, f"stage{lvl-1}", f"graph_{gid}.json")
                next_edgelist = os.path.join(stage_dir, f"graph_{gid}.txt")
                missing = coarsen_edgelist(prev_edgelist, prev_seeds, next_edgelist)
                if missing > 0:
                    print(f"[warn] stage{lvl-1} graph_{gid}: {missing} edges had nodes missing from seeds; skipped.")

                seeds_out = os.path.join(stage_dir, f"graph_{gid}.json")
                run_java(args.java, args.class_name, next_edgelist, seeds_out, args.epsilon, args.java_opts)
                prev_stage_edgelists[gid] = next_edgelist


def main():
    ap = argparse.ArgumentParser(description="Build LRMC seeds across multiple levels by invoking the Java LRMC tool and coarsening between levels.")
    mode = ap.add_mutually_exclusive_group(required=True)
    mode.add_argument('--input_edgelist', type=str, help='Single-graph mode: path to original edgelist.txt')
    mode.add_argument('--graphs_dir', type=str, help='Multi-graph mode: directory containing per-graph edgelist files (e.g., graph_000000.txt)')
    ap.add_argument('--glob', type=str, default='graph_*.txt', help='Multi-graph mode: glob pattern for graph files (default: graph_*.txt)')
    ap.add_argument('--out_dir', type=str, required=True, help='Output directory (stages will be created here)')
    ap.add_argument('--levels', type=int, required=True, help='Number of levels to build (e.g., 3)')
    # Java settings
    ap.add_argument('--java', type=str, default='java', help='Java executable (default: java)')
    ap.add_argument('--class_name', type=str, default='LRMCGenerateSingleCluster', help='Fully qualified Java class name')
    ap.add_argument('--epsilon', type=str, default='1e6', help='Epsilon argument for the Java tool (default: 1e6)')
    ap.add_argument('--java_opts', type=str, default='', help='Extra options for java (e.g., "-Xmx16g -cp my.jar")')
    ap.add_argument('--copy_inputs', action='store_true', help='Copy original edgelist under stage0 for record (single-graph mode)')
    args = ap.parse_args()

    # Parse java_opts into a list if provided
    args.java_opts = args.java_opts.split() if args.java_opts else []

    if args.input_edgelist:
        build_single_graph_levels(args)
    else:
        build_multigraph_levels(args)

if __name__ == '__main__':
    main()