#!/usr/bin/env python3
"""Aggregate results from the three new benchmark experiments.

Each experiment stores one ``results.json`` per run directory below ``ROOT``.
This script loads those files, groups them by configuration, and prints the
mean and standard deviation of the reported metrics to stdout.
"""
import glob
import json
import os

import numpy as np

# Base directory of the experiment outputs. ``${PULSE_ROOT}`` is expanded from
# the environment; os.path.expandvars leaves the placeholder untouched when the
# variable is not set.
ROOT = os.path.expandvars('${PULSE_ROOT}/results/exp_new')

# Key under which the retrieval runs store their final metrics.
_RETRIEVAL_KEY = 'final_avg_over_3_pool_seeds'


def _banner(title):
    """Print a section title framed by two 70-character rules."""
    print("\n" + "=" * 70)
    print(title)
    print("=" * 70)


def _mean_std(values):
    """Return ``(mean, std)`` of *values*; std uses NumPy's default ddof=0."""
    return np.mean(values), np.std(values)


def load_results(pattern):
    """Load every JSON file matching a glob pattern.

    Files are visited in sorted path order. A file that cannot be opened,
    decoded, or parsed is reported on stdout and skipped, so one corrupt run
    does not abort the whole aggregation.

    Args:
        pattern: Glob pattern passed to ``glob.glob``.

    Returns:
        List of the decoded JSON documents, one per successfully loaded file.
    """
    results = []
    for path in sorted(glob.glob(pattern)):
        try:
            # The context manager closes the handle even when parsing fails.
            with open(path) as fh:
                results.append(json.load(fh))
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f" ERR: {path}: {e}")
    return results


def aggregate_expA():
    """Missing modality: average across seeds per eval config.

    Processes the ``expA_missing`` and ``expA_baseline`` sub-directories
    independently. For each, F1 and accuracy are collected per eval-config
    name over all loaded runs and printed as mean±std. Rows are ordered as
    ``full``, then ``drop_*`` (sorted), then ``only_*`` (sorted); configs with
    any other name are collected but not printed.
    """
    _banner("EXP A: Missing-modality robustness")
    for subdir in ['expA_missing', 'expA_baseline']:
        files = load_results(f'{ROOT}/{subdir}/*/results.json')
        if not files:
            print(f" No results yet for {subdir}")
            continue
        print(f"\n-- {subdir} (n seeds = {len(files)}) --")

        # Group by eval config name; accumulate F1/Acc over seeds. The
        # 'active' list is kept from the first run that mentions the config.
        config_stats = {}
        for r in files:
            if 'eval_configs' not in r:
                continue
            for name, info in r['eval_configs'].items():
                stats = config_stats.setdefault(
                    name, {'f1': [], 'acc': [], 'active': info['active']})
                stats['f1'].append(info['f1'])
                stats['acc'].append(info['acc'])

        # Order: full, leave-one-out, singletons
        full_names = [n for n in config_stats if n == 'full']
        drop_names = sorted(n for n in config_stats if n.startswith('drop_'))
        only_names = sorted(n for n in config_stats if n.startswith('only_'))

        print(f" {'Config':<22s} {'Active modalities':<42s} "
              f"{'F1 mean±std':<14s} {'Acc mean±std':<14s}")
        print(' ' + '-' * 96)
        for grp in [full_names, drop_names, only_names]:
            for name in grp:
                d = config_stats[name]
                f1_m, f1_s = _mean_std(d['f1'])
                ac_m, ac_s = _mean_std(d['acc'])
                active = ','.join(d['active'])
                print(f" {name:<22s} {active:<42s} "
                      f"{f1_m:.3f}±{f1_s:.3f} {ac_m:.3f}±{ac_s:.3f}")


def aggregate_expB():
    """Grip regression: group by (backbone, mod_config), average over seeds.

    Runs without a ``best_test_metrics`` entry are ignored. The printed table
    is sorted by mean average Pearson r, best first, and shows average MAE,
    average Pearson r, and the per-hand R² values as mean±std.
    """
    _banner("EXP B: Grip force regression")
    files = load_results(f'{ROOT}/expB_grip/*/results.json')
    if not files:
        print(" No results yet")
        return

    # Group
    groups = {}
    for r in files:
        if 'best_test_metrics' not in r:
            continue
        key = (r['backbone'], ','.join(r['modalities']))
        groups.setdefault(key, []).append(r)

    rows = []
    for (bb, mods), rs in groups.items():
        metrics = [r['best_test_metrics'] for r in rs]
        right = [m['right_hand'] for m in metrics]
        left = [m['left_hand'] for m in metrics]
        rows.append({
            'backbone': bb,
            'modalities': mods,
            'n_seeds': len(rs),
            'mae_R': _mean_std([h['mae_g'] for h in right]),
            'mae_L': _mean_std([h['mae_g'] for h in left]),
            'mae_avg': _mean_std([m['avg_mae_g'] for m in metrics]),
            'r_R': _mean_std([h['pearson_r'] for h in right]),
            'r_L': _mean_std([h['pearson_r'] for h in left]),
            'r_avg': _mean_std([m['avg_pearson_r'] for m in metrics]),
            'r2_R': _mean_std([h['r2'] for h in right]),
            'r2_L': _mean_std([h['r2'] for h in left]),
        })
    rows.sort(key=lambda r: r['r_avg'][0], reverse=True)

    print(f" {'Backbone':<12s} {'Modalities':<30s} N "
          f"{'MAE(g) avg':<14s} {'Pearson r avg':<14s} {'R²(R)':<12s} {'R²(L)':<12s}")
    print(' ' + '-' * 102)
    for row in rows:
        print(f" {row['backbone']:<12s} {row['modalities']:<30s} {row['n_seeds']} "
              f"{row['mae_avg'][0]:.1f}±{row['mae_avg'][1]:.1f} "
              f"{row['r_avg'][0]:.3f}±{row['r_avg'][1]:.3f} "
              f"{row['r2_R'][0]:.3f}±{row['r2_R'][1]:.3f} "
              f"{row['r2_L'][0]:.3f}±{row['r2_L'][1]:.3f}")


def aggregate_expC():
    """T5 retrieval: group by mod config, average over seeds.

    Runs without a ``final_avg_over_3_pool_seeds`` entry are ignored. The
    printed table is sorted by mean recall@10, best first. ``N_test`` and
    ``K`` are read from the first run of each group (defaults 0 and 100).
    """
    _banner("EXP C: T5 Cross-modal text retrieval")
    files = load_results(f'{ROOT}/expC_retrieval/*/results.json')
    if not files:
        print(" No results yet")
        return

    groups = {}
    for r in files:
        if _RETRIEVAL_KEY not in r:
            continue
        key = ','.join(r['modalities'])
        groups.setdefault(key, []).append(r)

    rows = []
    for mods, rs in groups.items():
        finals = [r[_RETRIEVAL_KEY] for r in rs]
        rows.append({
            'modalities': mods,
            'n_seeds': len(rs),
            'r1': _mean_std([f['recall@1'] for f in finals]),
            'r5': _mean_std([f['recall@5'] for f in finals]),
            'r10': _mean_std([f['recall@10'] for f in finals]),
            'medR': _mean_std([f['median_rank'] for f in finals]),
            'n_test': rs[0].get('n_test_segments', 0),
            'K': rs[0].get('K_pool', 100),
        })
    rows.sort(key=lambda r: r['r10'][0], reverse=True)

    print(f" {'Modalities':<30s} N N_test K "
          f"{'R@1':<12s} {'R@5':<12s} {'R@10':<12s} {'medR':<12s}")
    print(' ' + '-' * 100)
    for row in rows:
        print(f" {row['modalities']:<30s} {row['n_seeds']} {row['n_test']:<6d} {row['K']:<2d} "
              f"{row['r1'][0]:.3f}±{row['r1'][1]:.3f} "
              f"{row['r5'][0]:.3f}±{row['r5'][1]:.3f} "
              f"{row['r10'][0]:.3f}±{row['r10'][1]:.3f} "
              f"{row['medR'][0]:.1f}±{row['medR'][1]:.1f}")


def main():
    """Print the aggregated tables for experiments A, B and C in order."""
    aggregate_expA()
    aggregate_expB()
    aggregate_expC()


if __name__ == '__main__':
    main()