PULSE-code / experiments /analysis /aggregate_new_exps.py
velvet-pine-22's picture
Upload folder using huggingface_hub
b4b2877 verified
#!/usr/bin/env python3
"""Aggregate results from the three new benchmark experiments."""
import os
import json
import glob
import numpy as np
# Root directory that holds the per-experiment result folders. The path is
# written with a ${PULSE_ROOT} placeholder, which Python does not expand on
# its own; expandvars substitutes it from the environment and leaves the
# text untouched when PULSE_ROOT is not set.
ROOT = os.path.expandvars('${PULSE_ROOT}/results/exp_new')
def load_results(pattern):
    """Load every JSON file matching a glob pattern.

    Files are visited in sorted path order. A file that cannot be read or
    parsed is reported on stdout and skipped, so a single bad result file
    does not abort the aggregation.

    Args:
        pattern: Glob pattern handed to ``glob.glob``.

    Returns:
        List with one decoded JSON payload per successfully loaded file.
    """
    results = []
    for path in sorted(glob.glob(pattern)):
        try:
            # The context manager closes the handle even when parsing fails.
            with open(path, encoding='utf-8') as fh:
                results.append(json.load(fh))
        except (OSError, ValueError) as e:
            # OSError: unreadable file. ValueError: malformed JSON or bytes
            # that are not valid UTF-8 (both are ValueError subclasses).
            print(f" ERR: {path}: {e}")
    return results
def aggregate_expA():
    """Print the missing-modality robustness table (experiment A).

    For each of the ``expA_missing`` and ``expA_baseline`` folders under
    ``ROOT``, collects the ``f1`` and ``acc`` values of every entry in each
    run's ``eval_configs`` and prints mean±std per config name. Runs that
    have no ``eval_configs`` key are ignored.

    Rows are ordered ``full``, then ``drop_*`` (sorted), then ``only_*``
    (sorted), then any remaining config names (sorted) so that no
    aggregated config is silently left out of the table.

    Returns:
        None. The table is written to stdout.
    """
    print("\n" + "=" * 70)
    print("EXP A: Missing-modality robustness")
    print("=" * 70)
    for subdir in ['expA_missing', 'expA_baseline']:
        files = load_results(f'{ROOT}/{subdir}/*/results.json')
        if not files:
            print(f" No results yet for {subdir}")
            continue
        print(f"\n-- {subdir} (n seeds = {len(files)}) --")

        # Group by eval config name; accumulate F1/Acc over seeds. The
        # 'active' list shown in the table is the one from the first run
        # that reported the config.
        config_stats = {}
        for r in files:
            if 'eval_configs' not in r:
                continue
            for name, info in r['eval_configs'].items():
                config_stats.setdefault(
                    name, {'f1': [], 'acc': [], 'active': info['active']})
                config_stats[name]['f1'].append(info['f1'])
                config_stats[name]['acc'].append(info['acc'])

        # Order: full, leave-one-out, singletons, then everything else.
        full_names = [n for n in config_stats if n == 'full']
        drop_names = sorted(n for n in config_stats if n.startswith('drop_'))
        only_names = sorted(n for n in config_stats if n.startswith('only_'))
        listed = set(full_names) | set(drop_names) | set(only_names)
        other_names = sorted(n for n in config_stats if n not in listed)

        print(f" {'Config':<22s} {'Active modalities':<42s} "
              f"{'F1 mean±std':<14s} {'Acc mean±std':<14s}")
        print(' ' + '-' * 96)
        for grp in [full_names, drop_names, only_names, other_names]:
            for name in grp:
                d = config_stats[name]
                f1_m, f1_s = np.mean(d['f1']), np.std(d['f1'])
                ac_m, ac_s = np.mean(d['acc']), np.std(d['acc'])
                active = ','.join(d['active'])
                print(f" {name:<22s} {active:<42s} "
                      f"{f1_m:.3f}±{f1_s:.3f} {ac_m:.3f}±{ac_s:.3f}")
def aggregate_expB():
    """Print the grip-force regression table (experiment B).

    Loads every ``results.json`` under ``ROOT/expB_grip``, keeps the runs
    that carry ``best_test_metrics``, and groups them by
    ``(backbone, comma-joined modalities)``. For each group the mean and
    standard deviation across runs are computed for the per-hand and
    averaged metrics; rows are printed in descending order of mean
    ``avg_pearson_r``.

    Returns:
        None. The table is written to stdout.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("EXP B: Grip force regression")
    print(banner)

    loaded = load_results(f'{ROOT}/expB_grip/*/results.json')
    if not loaded:
        print(" No results yet")
        return

    # Bucket runs by (backbone, modality set); runs without metrics are skipped.
    by_config = {}
    for rec in loaded:
        if 'best_test_metrics' in rec:
            cfg = (rec['backbone'], ','.join(rec['modalities']))
            by_config.setdefault(cfg, []).append(rec)

    def mean_std(values):
        """Return (mean, std) of a list of per-run values."""
        return (np.mean(values), np.std(values))

    table = []
    for (backbone, mod_str), runs in by_config.items():
        metrics = [run['best_test_metrics'] for run in runs]

        def per_hand(hand, field):
            """Collect one field of one hand's metrics across all runs."""
            return [m[hand][field] for m in metrics]

        mae_right = per_hand('right_hand', 'mae_g')
        mae_left = per_hand('left_hand', 'mae_g')
        corr_right = per_hand('right_hand', 'pearson_r')
        corr_left = per_hand('left_hand', 'pearson_r')
        r2_right = per_hand('right_hand', 'r2')
        r2_left = per_hand('left_hand', 'r2')
        mae_mean = [m['avg_mae_g'] for m in metrics]
        corr_mean = [m['avg_pearson_r'] for m in metrics]

        table.append({
            'backbone': backbone,
            'modalities': mod_str,
            'n_seeds': len(runs),
            'mae_R': mean_std(mae_right),
            'mae_L': mean_std(mae_left),
            'mae_avg': mean_std(mae_mean),
            'r_R': mean_std(corr_right),
            'r_L': mean_std(corr_left),
            'r_avg': mean_std(corr_mean),
            'r2_R': mean_std(r2_right),
            'r2_L': mean_std(r2_left),
        })

    # Best average Pearson r first; ties keep their grouping order.
    ranked = sorted(table, key=lambda entry: entry['r_avg'][0], reverse=True)

    print(f" {'Backbone':<12s} {'Modalities':<30s} N "
          f"{'MAE(g) avg':<14s} {'Pearson r avg':<14s} {'R²(R)':<12s} {'R²(L)':<12s}")
    print(' ' + '-' * 102)
    for entry in ranked:
        mae_m, mae_s = entry['mae_avg']
        corr_m, corr_s = entry['r_avg']
        r2r_m, r2r_s = entry['r2_R']
        r2l_m, r2l_s = entry['r2_L']
        print(f" {entry['backbone']:<12s} {entry['modalities']:<30s} {entry['n_seeds']} "
              f"{mae_m:.1f}±{mae_s:.1f} "
              f"{corr_m:.3f}±{corr_s:.3f} "
              f"{r2r_m:.3f}±{r2r_s:.3f} "
              f"{r2l_m:.3f}±{r2l_s:.3f}")
def aggregate_expC():
    """Print the T5 cross-modal text retrieval table (experiment C).

    Loads every ``results.json`` under ``ROOT/expC_retrieval``, keeps the
    runs that carry ``final_avg_over_3_pool_seeds``, and groups them by the
    comma-joined modality list. For each group the mean and standard
    deviation across runs are computed for recall@1/5/10 and median rank;
    ``n_test_segments`` and ``K_pool`` are taken from the first run of the
    group (defaulting to 0 and 100). Rows are printed in descending order
    of mean recall@10.

    Returns:
        None. The table is written to stdout.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("EXP C: T5 Cross-modal text retrieval")
    print(banner)

    loaded = load_results(f'{ROOT}/expC_retrieval/*/results.json')
    if not loaded:
        print(" No results yet")
        return

    summary_key = 'final_avg_over_3_pool_seeds'

    # Bucket runs by modality set; runs without a final summary are skipped.
    by_mods = {}
    for rec in loaded:
        if summary_key in rec:
            by_mods.setdefault(','.join(rec['modalities']), []).append(rec)

    def mean_std(values):
        """Return (mean, std) of a list of per-run values."""
        return (np.mean(values), np.std(values))

    table = []
    for mod_str, runs in by_mods.items():
        summaries = [run[summary_key] for run in runs]
        recall_1 = [s['recall@1'] for s in summaries]
        recall_5 = [s['recall@5'] for s in summaries]
        recall_10 = [s['recall@10'] for s in summaries]
        median_rank = [s['median_rank'] for s in summaries]
        first_run = runs[0]
        table.append({
            'modalities': mod_str,
            'n_seeds': len(runs),
            'r1': mean_std(recall_1),
            'r5': mean_std(recall_5),
            'r10': mean_std(recall_10),
            'medR': mean_std(median_rank),
            'n_test': first_run.get('n_test_segments', 0),
            'K': first_run.get('K_pool', 100),
        })

    # Best recall@10 first; ties keep their grouping order.
    ranked = sorted(table, key=lambda entry: entry['r10'][0], reverse=True)

    print(f" {'Modalities':<30s} N N_test K "
          f"{'R@1':<12s} {'R@5':<12s} {'R@10':<12s} {'medR':<12s}")
    print(' ' + '-' * 100)
    for entry in ranked:
        r1_m, r1_s = entry['r1']
        r5_m, r5_s = entry['r5']
        r10_m, r10_s = entry['r10']
        med_m, med_s = entry['medR']
        print(f" {entry['modalities']:<30s} {entry['n_seeds']} {entry['n_test']:<6d} {entry['K']:<2d} "
              f"{r1_m:.3f}±{r1_s:.3f} "
              f"{r5_m:.3f}±{r5_s:.3f} "
              f"{r10_m:.3f}±{r10_s:.3f} "
              f"{med_m:.1f}±{med_s:.1f}")
def main():
    """Run the three experiment aggregations in order: A, then B, then C."""
    for report in (aggregate_expA, aggregate_expB, aggregate_expC):
        report()


# Only produce the reports when executed as a script, not when imported.
if __name__ == '__main__':
    main()