| |
| """Aggregate results from the three new benchmark experiments.""" |
| import os |
| import json |
| import glob |
| import numpy as np |
|
|
# Root directory that holds one sub-folder per experiment.  The path is written
# with a ``${PULSE_ROOT}`` placeholder, and glob does not expand shell-style
# variables, so resolve it from the environment here.  When PULSE_ROOT is not
# set, expandvars leaves the placeholder untouched (the previous behaviour).
ROOT = os.path.expandvars('${PULSE_ROOT}/results/exp_new')
|
|
|
|
def load_results(pattern):
    """Load every JSON file matching a glob pattern.

    Args:
        pattern: Glob pattern handed to ``glob.glob``.

    Returns:
        list: The parsed JSON documents, ordered by sorted file path. Files
        that cannot be read or parsed are reported on stdout and skipped, so
        the list may be shorter than the number of matching files.
    """
    results = []
    for path in sorted(glob.glob(pattern)):
        try:
            # Context manager closes the handle even when parsing fails.
            with open(path, encoding='utf-8') as fh:
                results.append(json.load(fh))
        except (OSError, ValueError) as e:
            # OSError: unreadable path; ValueError: invalid JSON or bad encoding.
            # Best-effort aggregation: report and carry on with the other files.
            print(f" ERR: {path}: {e}")
    return results
|
|
|
|
def aggregate_expA():
    """Print the missing-modality robustness tables (Exp A).

    For each of the ``expA_missing`` and ``expA_baseline`` folders under
    ``ROOT``, gathers the F1 and accuracy of every evaluation config across
    the loaded runs and prints their mean and standard deviation. Only configs
    named ``full`` or prefixed with ``drop_`` / ``only_`` are listed, in that
    group order.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("EXP A: Missing-modality robustness")
    print(banner)

    for subdir in ('expA_missing', 'expA_baseline'):
        runs = load_results(f'{ROOT}/{subdir}/*/results.json')
        if not runs:
            print(f" No results yet for {subdir}")
            continue
        print(f"\n-- {subdir} (n seeds = {len(runs)}) --")

        # config name -> collected scores; 'active' is kept from the first
        # run in which the config appears.
        per_config = {}
        for run in runs:
            if 'eval_configs' in run:
                for cfg_name, cfg in run['eval_configs'].items():
                    bucket = per_config.setdefault(
                        cfg_name,
                        {'f1': [], 'acc': [], 'active': cfg['active']},
                    )
                    bucket['f1'].append(cfg['f1'])
                    bucket['acc'].append(cfg['acc'])

        # Display order: the full config first, then drop_* and only_*
        # (each group alphabetically).
        ordered = [n for n in per_config if n == 'full']
        ordered += sorted(n for n in per_config if n.startswith('drop_'))
        ordered += sorted(n for n in per_config if n.startswith('only_'))

        header = (f" {'Config':<22s} {'Active modalities':<42s} "
                  f"{'F1 mean±std':<14s} {'Acc mean±std':<14s}")
        print(header)
        print(' ' + '-' * 96)
        for cfg_name in ordered:
            scores = per_config[cfg_name]
            f1_mean, f1_std = np.mean(scores['f1']), np.std(scores['f1'])
            acc_mean, acc_std = np.mean(scores['acc']), np.std(scores['acc'])
            active_str = ','.join(scores['active'])
            print(f" {cfg_name:<22s} {active_str:<42s} "
                  f"{f1_mean:.3f}±{f1_std:.3f} {acc_mean:.3f}±{acc_std:.3f}")
|
|
|
|
def aggregate_expB():
    """Print the grip-force regression table (Exp B).

    Runs loaded from ``ROOT/expB_grip`` are grouped by backbone and
    comma-joined modality list. Every metric is summarised as (mean, std) over
    the runs of a group, and groups are listed by descending mean of the
    average Pearson r.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("EXP B: Grip force regression")
    print(banner)

    runs = load_results(f'{ROOT}/expB_grip/*/results.json')
    if not runs:
        print(" No results yet")
        return

    # (backbone, "mod1,mod2,...") -> runs; runs without test metrics are skipped.
    by_setup = {}
    for run in runs:
        if 'best_test_metrics' in run:
            setup = (run['backbone'], ','.join(run['modalities']))
            by_setup.setdefault(setup, []).append(run)

    def mean_std(values):
        return (np.mean(values), np.std(values))

    table = []
    for (backbone, mods), members in by_setup.items():
        metrics = [run['best_test_metrics'] for run in members]
        mae_right = [m['right_hand']['mae_g'] for m in metrics]
        mae_left = [m['left_hand']['mae_g'] for m in metrics]
        r_right = [m['right_hand']['pearson_r'] for m in metrics]
        r_left = [m['left_hand']['pearson_r'] for m in metrics]
        r2_right = [m['right_hand']['r2'] for m in metrics]
        r2_left = [m['left_hand']['r2'] for m in metrics]
        mae_mean = [m['avg_mae_g'] for m in metrics]
        r_mean = [m['avg_pearson_r'] for m in metrics]
        table.append({
            'backbone': backbone,
            'modalities': mods,
            'n_seeds': len(members),
            'mae_R': mean_std(mae_right),
            'mae_L': mean_std(mae_left),
            'mae_avg': mean_std(mae_mean),
            'r_R': mean_std(r_right),
            'r_L': mean_std(r_left),
            'r_avg': mean_std(r_mean),
            'r2_R': mean_std(r2_right),
            'r2_L': mean_std(r2_left),
        })

    # Best average correlation first.
    table = sorted(table, key=lambda entry: entry['r_avg'][0], reverse=True)

    print(f" {'Backbone':<12s} {'Modalities':<30s} N "
          f"{'MAE(g) avg':<14s} {'Pearson r avg':<14s} {'R²(R)':<12s} {'R²(L)':<12s}")
    print(' ' + '-' * 102)
    for entry in table:
        mae_m, mae_s = entry['mae_avg']
        r_m, r_s = entry['r_avg']
        r2r_m, r2r_s = entry['r2_R']
        r2l_m, r2l_s = entry['r2_L']
        print(f" {entry['backbone']:<12s} {entry['modalities']:<30s} {entry['n_seeds']} "
              f"{mae_m:.1f}±{mae_s:.1f} "
              f"{r_m:.3f}±{r_s:.3f} "
              f"{r2r_m:.3f}±{r2r_s:.3f} "
              f"{r2l_m:.3f}±{r2l_s:.3f}")
|
|
|
|
def aggregate_expC():
    """Print the cross-modal text retrieval table (Exp C).

    Runs loaded from ``ROOT/expC_retrieval`` are grouped by comma-joined
    modality list. Recall@1/5/10 and median rank are summarised as
    (mean, std) over the runs of a group, and groups are listed by descending
    mean recall@10. The test-set size and pool size shown for a group are
    read from its first run.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("EXP C: T5 Cross-modal text retrieval")
    print(banner)

    runs = load_results(f'{ROOT}/expC_retrieval/*/results.json')
    if not runs:
        print(" No results yet")
        return

    summary_key = 'final_avg_over_3_pool_seeds'

    # "mod1,mod2,..." -> runs; runs without the final summary are skipped.
    by_mods = {}
    for run in runs:
        if summary_key in run:
            by_mods.setdefault(','.join(run['modalities']), []).append(run)

    def mean_std(values):
        return (np.mean(values), np.std(values))

    table = []
    for mods, members in by_mods.items():
        finals = [run[summary_key] for run in members]
        recall1 = [f['recall@1'] for f in finals]
        recall5 = [f['recall@5'] for f in finals]
        recall10 = [f['recall@10'] for f in finals]
        med_rank = [f['median_rank'] for f in finals]
        first = members[0]
        table.append({
            'modalities': mods,
            'n_seeds': len(members),
            'r1': mean_std(recall1),
            'r5': mean_std(recall5),
            'r10': mean_std(recall10),
            'medR': mean_std(med_rank),
            'n_test': first.get('n_test_segments', 0),
            'K': first.get('K_pool', 100),
        })

    # Highest recall@10 first.
    table = sorted(table, key=lambda entry: entry['r10'][0], reverse=True)

    print(f" {'Modalities':<30s} N N_test K "
          f"{'R@1':<12s} {'R@5':<12s} {'R@10':<12s} {'medR':<12s}")
    print(' ' + '-' * 100)
    for entry in table:
        r1_m, r1_s = entry['r1']
        r5_m, r5_s = entry['r5']
        r10_m, r10_s = entry['r10']
        med_m, med_s = entry['medR']
        print(f" {entry['modalities']:<30s} {entry['n_seeds']} {entry['n_test']:<6d} {entry['K']:<2d} "
              f"{r1_m:.3f}±{r1_s:.3f} "
              f"{r5_m:.3f}±{r5_s:.3f} "
              f"{r10_m:.3f}±{r10_s:.3f} "
              f"{med_m:.1f}±{med_s:.1f}")
|
|
|
|
def main():
    """Print the Exp A, Exp B and Exp C summaries, in that order."""
    for report in (aggregate_expA, aggregate_expB, aggregate_expC):
        report()


# Run the reports only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|
|