| import csv |
| from collections import defaultdict |
| from statistics import median |
| import math |
|
|
|
|
def load_per_setting_f1(path):
    """Load per-setting F1 scores from a CSV file.

    The file needs a header row providing the columns ``ClusterSize``,
    ``InternalDensity``, ``ExternalDensity``, ``Method`` and ``F1``.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A ``defaultdict(dict)`` mapping each setting, keyed by the tuple
        ``(int(ClusterSize), float(InternalDensity), float(ExternalDensity))``,
        to a ``{method: f1}`` dict. When a (setting, method) pair occurs
        more than once, the last row wins.

    Rows with a missing column or an unparsable number are skipped.
    """
    data = defaultdict(dict)
    # newline='' lets the csv module handle line endings itself, so quoted
    # fields that contain line breaks are read back unchanged.
    with open(path, 'r', newline='') as f:
        for row in csv.DictReader(f):
            try:
                key = (
                    int(row['ClusterSize']),
                    float(row['InternalDensity']),
                    float(row['ExternalDensity']),
                )
                method = row['Method']
                f1 = float(row['F1'])
            except (KeyError, TypeError, ValueError):
                # KeyError: column absent from the header.
                # TypeError: short row (DictReader fills the gap with None).
                # ValueError: value is not a valid number.
                continue
            data[key][method] = f1
    return data
|
|
|
|
def wilcoxon_signed_rank(x, y):
    """Two-sided Wilcoxon signed-rank test via the normal approximation.

    Paired differences ``x[i] - y[i]`` whose magnitude is at most 1e-12 are
    treated as zero and discarded. The remaining differences are ranked by
    absolute value; magnitudes within 1e-12 of the first member of a run
    are considered tied and share the average of the ranks they span.

    The standard deviation of the statistic includes the usual tie
    correction (subtracting ``sum(t**3 - t) / 48`` from the variance, where
    ``t`` is the size of each tie group). No continuity correction is
    applied.

    Args:
        x: Sequence of numbers (first sample).
        y: Sequence of numbers paired element-wise with ``x``. Pairing uses
            ``zip``, so surplus elements of the longer input are ignored.

    Returns:
        A tuple ``(w_plus, n, z, p)``: the sum of ranks of the positive
        differences, the number of non-zero differences, the z-score of
        ``w_plus`` and the two-sided p-value. When every difference is
        zero the result is ``(0.0, 0, 0.0, 1.0)``.
    """
    # Each entry is (|d|, d); zero differences carry no sign information.
    pairs = sorted(
        ((abs(a - b), a - b) for a, b in zip(x, y) if abs(a - b) > 1e-12),
        key=lambda t: t[0],
    )
    n = len(pairs)
    if n == 0:
        return 0.0, 0, 0.0, 1.0

    rank_sum_pos = 0.0
    tie_term = 0  # sum of t**3 - t over all tie groups
    i = 0
    while i < n:
        # Extend j to the end of the run of magnitudes tied with pairs[i].
        j = i
        while j + 1 < n and abs(pairs[j + 1][0] - pairs[i][0]) <= 1e-12:
            j += 1
        # Positions i..j hold 1-based ranks i+1..j+1; ties share their mean.
        avg_rank = (i + 1 + j + 1) / 2.0
        positives = sum(1 for _, d in pairs[i:j + 1] if d > 0)
        rank_sum_pos += positives * avg_rank
        group_size = j - i + 1
        tie_term += group_size ** 3 - group_size
        i = j + 1

    mu = n * (n + 1) / 4.0
    # Var(W+) = n(n+1)(2n+1)/24 - sum(t^3 - t)/48. It equals a quarter of
    # the sum of squared ranks, hence is strictly positive for n >= 1.
    variance = (n * (n + 1) * (2 * n + 1) - tie_term / 2.0) / 24.0
    sigma = math.sqrt(variance)
    z = (rank_sum_pos - mu) / sigma
    # Two-sided tail probability of a standard normal: erfc(|z| / sqrt(2)).
    p = math.erfc(abs(z) / math.sqrt(2.0))
    return rank_sum_pos, n, z, p
|
|
|
|
def main():
    """Command-line entry point: compare L-RMC against every other method.

    Takes one positional argument, the path of the per-setting CSV. For each
    method other than ``L-RMC`` (in sorted order), the settings that have an
    F1 value for both methods are paired, and one line is printed with the
    number of non-zero differences, the median F1 difference, and the
    p-value and z-score returned by ``wilcoxon_signed_rank``.

    Raises:
        SystemExit: If no row in the CSV has the method ``L-RMC``.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('csv', help='Table 1 per-setting CSV')
    cli = parser.parse_args()

    per_setting = load_per_setting_f1(cli.csv)
    ordered_keys = sorted(per_setting)

    # Union of the method names seen in any setting.
    method_names = set()
    for setting in ordered_keys:
        method_names |= set(per_setting[setting])

    if 'L-RMC' not in method_names:
        raise SystemExit('L-RMC not found in CSV.')

    print('# Wilcoxon signed-rank: L-RMC vs baselines (per-setting F1)')
    for b in sorted(method_names - {'L-RMC'}):
        # Keep only the settings where both methods reported a score.
        paired = [
            (per_setting[setting]['L-RMC'], per_setting[setting][b])
            for setting in ordered_keys
            if 'L-RMC' in per_setting[setting] and b in per_setting[setting]
        ]
        if not paired:
            continue
        ours = [lhs for lhs, _ in paired]
        theirs = [rhs for _, rhs in paired]
        med = median([lhs - rhs for lhs, rhs in paired])
        _, n, z, p = wilcoxon_signed_rank(ours, theirs)
        print(f"Baseline={b:10s} n={n:2d} median ΔF1={med:.3f} p≈{p:.2e} z={z:.2f}")
|
|
|
|
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|