"""Per-subset evaluator.

Given a (modalities, t_obs, t_fut) triple, evaluate ALL trained seed dirs
across every row (of the 27) whose results.json matches that triple. Builds
the test dataset once per distinct original modality ordering found under the
triple (usually exactly once), then iterates over the matching seeds, loads
each model_best.pt, runs inference, and writes <seed_dir>/eval_macrof1.json
(per-head accuracy, macro-F1 and weighted-F1, plus action accuracy and
parameter count).

Used by dispatch_eval.sh to run 16 of these in parallel on the cluster.
"""

from __future__ import annotations

import argparse
import json
import sys
import time
from pathlib import Path

import torch
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader

# ${PULSE_ROOT} is shell syntax; the dispatcher (dispatch_eval.sh) presumably
# substitutes the repository root here before this script is executed.
REPO = Path("${PULSE_ROOT}")
sys.path.insert(0, str(REPO / "experiments"))

from dataset_seqpred import build_train_test, collate_triplet  # noqa: E402
from models_seqpred import build_model  # noqa: E402
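

# Directory layout assumed below (inferred from the glob patterns in
# find_matching_seeds; only the five table directories listed there are scanned):
#   <REPO>/<table*_...>/row*/seeds/seed*/results.json
#   <REPO>/<table*_...>/row*/seeds/seed*/model_best.pt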
def find_matching_seeds(mods_canon: str, t_obs: float, t_fut: float):
    """Return every seed dir (having both model_best.pt and results.json) that
    belongs to a row whose seed42 results.json matches the canonical triple."""
    out = []
    for tt in [
        "table1_main_comparison",
        "table3_horizon_curve",
        "table4_modality_ablation",
        "table5_component_ablation",
        "table7_missing_modality",
    ]:
        td = REPO / tt
        for row_dir in sorted(td.glob("row*")):
            # Each row's configuration is read from its seed42 results.json.
            seed42 = row_dir / "seeds" / "seed42" / "results.json"
            if not seed42.exists():
                continue
            with open(seed42) as f:
                d = json.load(f)
            a = d["args"]
            row_mods_canon = ",".join(sorted(a["modalities"].split(",")))
            if (row_mods_canon == mods_canon
                    and abs(float(a["t_obs"]) - t_obs) < 1e-6
                    and abs(float(a["t_fut"]) - t_fut) < 1e-6):
                for sd in sorted((row_dir / "seeds").glob("seed*")):
                    if (sd / "model_best.pt").exists() and (sd / "results.json").exists():
                        out.append(sd)
    return out


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--modalities", required=True,
                    help="Sorted comma-separated list, e.g. 'emg,eyetrack,imu,mocap,pressure'")
    ap.add_argument("--t_obs", type=float, required=True)
    ap.add_argument("--t_fut", type=float, required=True)
    args = ap.parse_args()

    seed_dirs = find_matching_seeds(args.modalities, args.t_obs, args.t_fut)
    print(f"Subset key=({args.modalities!r}, t_obs={args.t_obs}, t_fut={args.t_fut})", flush=True)
    print(f"Matched {len(seed_dirs)} seed dirs", flush=True)
    for sd in seed_dirs:
        print(f" {sd.relative_to(REPO)}", flush=True)
    if not seed_dirs:
        return

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"device={device}", flush=True)

    orders = {}
    for sd in seed_dirs:
        with open(sd / "results.json") as f:
            d = json.load(f)
        orig_mods = d["args"]["modalities"]
        orders.setdefault(orig_mods, []).append((sd, d))
    print(f"Distinct original modality orderings under this canonical key: {len(orders)}",
          flush=True)

    n_ok, n_fail = 0, 0
    t0 = time.time()
    for orig_mods, group in orders.items():
        mods_list = orig_mods.split(",")
        print(f"\n=== Building test loader for original order: {mods_list} ===",
              flush=True)
        tb0 = time.time()
        train_ds, test_ds = build_train_test(
            modalities=mods_list,
            t_obs_sec=args.t_obs, t_fut_sec=args.t_fut,
        )
        del train_ds  # only the test split is evaluated here
        test_loader = DataLoader(test_ds, batch_size=64, shuffle=False,
                                 collate_fn=collate_triplet, num_workers=0)
        modality_dims = test_ds.modality_dims
        print(f" build took {time.time()-tb0:.1f}s; test n={len(test_ds)}",
              flush=True)

        for sd, results in group:
            args_d = results["args"]
            try:
                model = build_model(args_d["model"], modality_dims).to(device)
                state = torch.load(sd / "model_best.pt", map_location=device,
                                   weights_only=False)
                model.load_state_dict(state["state_dict"])
                model.eval()

                # Accumulate logits and labels for every head over the full test set.
                all_logits = {k: [] for k in
                              ("verb_fine", "verb_composite", "noun", "hand")}
                all_y = {k: [] for k in
                         ("verb_fine", "verb_composite", "noun", "hand")}
                with torch.no_grad():
                    for x, mask, lens, y, meta in test_loader:
                        x = {m: t.to(device) for m, t in x.items()}
                        mask = mask.to(device)
                        logits = model(x, mask)
                        for k in all_logits:
                            all_logits[k].append(logits[k].cpu())
                            all_y[k].append(y[k])

                logits_cat = {k: torch.cat(v, dim=0) for k, v in all_logits.items()}
                y_cat = {k: torch.cat(v, dim=0).numpy() for k, v in all_y.items()}
                pred_cat = {k: logits_cat[k].argmax(dim=1).numpy() for k in logits_cat}

                out = {}
                for k in ("verb_fine", "verb_composite", "noun", "hand"):
                    out[f"{k}_acc"] = float(accuracy_score(y_cat[k], pred_cat[k]))
                    out[f"{k}_macro_f1"] = float(f1_score(y_cat[k], pred_cat[k],
                                                          average="macro", zero_division=0))
                    out[f"{k}_weighted_f1"] = float(f1_score(y_cat[k], pred_cat[k],
                                                             average="weighted", zero_division=0))
                # An "action" counts as correct only when verb_fine, noun and hand
                # are all predicted correctly for the same sample.
                correct = ((pred_cat["verb_fine"] == y_cat["verb_fine"]) &
                           (pred_cat["noun"] == y_cat["noun"]) &
                           (pred_cat["hand"] == y_cat["hand"]))
                out["action_acc"] = float(correct.mean())
                out["n_params"] = sum(p.numel() for p in model.parameters())

                with open(sd / "eval_macrof1.json", "w") as f:
                    json.dump(out, f, indent=2)
                print(f" OK {sd.relative_to(REPO)} action_acc={out['action_acc']:.4f}",
                      flush=True)
                n_ok += 1

                # Free GPU memory before loading the next checkpoint.
                del model
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except Exception as e:
                print(f" FAIL {sd.relative_to(REPO)}: {e}", flush=True)
                n_fail += 1

    print(f"\nSubset done. ok={n_ok} fail={n_fail} elapsed={time.time()-t0:.1f}s",
          flush=True)


if __name__ == "__main__":
    main()