# clique/src/synthetic/Table1_utils/compute_wilcoxon.py
# qingy2024: Upload folder using huggingface_hub (bf620c6, verified)
import csv
from collections import defaultdict
from statistics import median
import math
def load_per_setting_f1(path):
    """Load per-setting F1 scores from a CSV file.

    The file needs a header row with the columns ``ClusterSize``,
    ``InternalDensity``, ``ExternalDensity``, ``Method`` and ``F1``.  Rows
    are grouped by the setting ``(ClusterSize, InternalDensity,
    ExternalDensity)``; if a setting/method pair occurs more than once, the
    last row wins.

    Rows that lack a required column or whose numeric fields cannot be
    parsed are skipped, so a few malformed lines do not abort the load.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A ``defaultdict(dict)`` mapping ``(k, pin, pout)`` -> method -> F1,
        where ``k`` is an ``int`` and ``pin``/``pout``/F1 are ``float``.
    """
    # key: (k, pin, pout) -> method -> f1
    data = defaultdict(dict)
    # 'utf-8-sig' strips a leading BOM (common in spreadsheet exports) that
    # would otherwise corrupt the first header name and make every row fail
    # the column lookup.  newline='' lets the csv module do its own newline
    # handling, as its documentation recommends.
    with open(path, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            try:
                key = (
                    int(row['ClusterSize']),
                    float(row['InternalDensity']),
                    float(row['ExternalDensity']),
                )
                method = row['Method']
                f1 = float(row['F1'])
            except (KeyError, TypeError, ValueError):
                # KeyError: column missing from the header.
                # TypeError: short row (DictReader fills the gap with None).
                # ValueError: field is not a valid number.
                continue
            data[key][method] = f1
    return data
def wilcoxon_signed_rank(x, y):
    """Two-sided Wilcoxon signed-rank test for paired samples.

    Uses the normal approximation with a tie correction and no continuity
    correction.  Differences ``x[i] - y[i]`` whose magnitude is at most
    1e-12 are treated as zero and discarded; the remaining differences are
    ranked by absolute value, with tied magnitudes (within 1e-12 of the
    first member of the group) sharing their average rank.

    Args:
        x: First sample (sequence of numbers).
        y: Second sample, paired element-wise with ``x``.  If the lengths
            differ, the extra elements of the longer one are ignored.

    Returns:
        A tuple ``(W+, n, z, p)`` where ``W+`` is the sum of ranks of the
        positive differences, ``n`` is the number of non-zero differences,
        ``z`` is the standardized statistic and ``p`` is the approximate
        two-sided p-value.  When every difference is zero the result is
        ``(0.0, 0, 0.0, 1.0)``.
    """
    tol = 1e-12

    # Zero differences carry no sign information and are dropped.
    nonzero = [d for d in (a - b for a, b in zip(x, y)) if abs(d) > tol]
    n = len(nonzero)
    if n == 0:
        return 0.0, 0, 0.0, 1.0

    # Rank by absolute difference (stable sort keeps input order in ties).
    nonzero.sort(key=abs)
    magnitudes = [abs(d) for d in nonzero]

    rank_sum_pos = 0.0
    tie_term = 0  # sum of t**3 - t over groups of t tied magnitudes
    start = 0
    while start < n:
        stop = start + 1
        while stop < n and abs(magnitudes[stop] - magnitudes[start]) <= tol:
            stop += 1
        # The group occupies 1-based ranks start+1 .. stop; each member
        # receives the average of those ranks.
        avg_rank = (start + 1 + stop) / 2.0
        positives = sum(1 for d in nonzero[start:stop] if d > 0)
        rank_sum_pos += avg_rank * positives
        group_size = stop - start
        tie_term += group_size ** 3 - group_size
        start = stop

    # Normal approximation.  Ties shrink the variance of W+ by
    # sum(t^3 - t) / 48; without this the z-score is understated.
    mu = n * (n + 1) / 4.0
    variance = (2 * n * (n + 1) * (2 * n + 1) - tie_term) / 48.0
    # variance > 0 for every n >= 1 (it is at least n * (n + 1)**2 / 16,
    # reached when all magnitudes tie), so the division below is safe.
    sigma = math.sqrt(variance)
    z = (rank_sum_pos - mu) / sigma
    # two-sided p using erfc
    p = math.erfc(abs(z) / math.sqrt(2.0))
    return rank_sum_pos, n, z, p
def main():
    """Command-line entry point.

    Reads the CSV path given as the single positional argument, loads the
    per-setting F1 scores, and for every method other than ``L-RMC`` prints
    one line with the number of non-zero paired differences, the median
    F1 difference (L-RMC minus baseline), and the approximate p-value and
    z-score from the signed-rank test.  Only settings where both methods
    have a score are paired; a baseline with no such setting is skipped.

    Raises:
        SystemExit: If no row in the CSV belongs to ``L-RMC``.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('csv', help='Table 1 per-setting CSV')
    cli = parser.parse_args()

    per_setting = load_per_setting_f1(cli.csv)
    ordered_keys = sorted(per_setting.keys())

    # Every method name that appears in at least one setting.
    seen_methods = {
        name for setting in ordered_keys for name in per_setting[setting]
    }
    if 'L-RMC' not in seen_methods:
        raise SystemExit('L-RMC not found in CSV.')

    print('# Wilcoxon signed-rank: L-RMC vs baselines (per-setting F1)')
    for b in sorted(seen_methods - {'L-RMC'}):
        # (L-RMC score, baseline score) for settings that report both.
        paired = [
            (per_setting[setting]['L-RMC'], per_setting[setting][b])
            for setting in ordered_keys
            if 'L-RMC' in per_setting[setting] and b in per_setting[setting]
        ]
        if not paired:
            continue

        ours = [lhs for lhs, _ in paired]
        theirs = [rhs for _, rhs in paired]
        med = median([lhs - rhs for lhs, rhs in paired])
        _, n, z, p = wilcoxon_signed_rank(ours, theirs)
        print(f"Baseline={b:10s} n={n:2d} median ΔF1={med:.3f} p≈{p:.2e} z={z:.2f}")
# Run the command-line interface only when executed as a script, not when
# this module is imported.
if __name__ == "__main__":
    main()