"""Re-evaluate all 135 trained seeds with paper-style metrics.

For each <row>/seeds/seed*/model_best.pt:
- Reload the model with the right modalities
- Build the test loader for that modality subset
- Run inference, collect predictions
- Compute Acc, Macro-F1, Weighted-F1 per head (verb_fine, verb_composite,
  noun, hand) and Acc for the joint "action" (= verb_fine ∧ noun ∧ hand)
- Write <seed_dir>/eval_macrof1.json

Cache the test dataset per modality subset so we don't rebuild it 135 times.
"""
|
|
from __future__ import annotations
|
|
import json
import os
import sys
import time
from pathlib import Path
|
|
import torch
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader
|
|
REPO = Path(os.environ["PULSE_ROOT"])  # repo root, taken from the environment
sys.path.insert(0, str(REPO / "experiments"))
|
|
from dataset_seqpred import (
    TripletSeqPredDataset, build_train_test, collate_triplet,
    TRAIN_VOLS_V3, TEST_VOLS_V3,
)
from models_seqpred import build_model
|
|
def find_seed_dirs():
    """Collect every seed dir that has both a best checkpoint and a results.json."""
    out = []
    for table_name in [
        "table1_main_comparison",
        "table3_horizon_curve",
        "table4_modality_ablation",
        "table5_component_ablation",
        "table7_missing_modality",
    ]:
        td = REPO / table_name
        for row_dir in sorted(td.glob("row*")):
            for sd in sorted((row_dir / "seeds").glob("seed*")):
                if (sd / "model_best.pt").exists() and (sd / "results.json").exists():
                    out.append(sd)
    return out
|
|
# Cache of (test_loader, modality_dims), keyed by (modalities, t_obs, t_fut, downsample).
_test_cache = {}
|
|
def get_test_loader(modalities, t_obs, t_fut, downsample, num_workers=0):
    """Build (or fetch from the cache) the test DataLoader for one modality subset."""
    key = (tuple(modalities), float(t_obs), float(t_fut), int(downsample))
    if key in _test_cache:
        return _test_cache[key]
    print(f" [build test loader] modalities={modalities} t_obs={t_obs} t_fut={t_fut}",
          flush=True)
    train_ds, test_ds = build_train_test(
        modalities=list(modalities),
        t_obs_sec=t_obs, t_fut_sec=t_fut, downsample=downsample,
    )
    test_loader = DataLoader(test_ds, batch_size=64, shuffle=False,
                             collate_fn=collate_triplet, num_workers=num_workers)
    md = test_ds.modality_dims
    _test_cache[key] = (test_loader, md)
    return test_loader, md
|
|
def eval_one(seed_dir: Path, device: torch.device):
    """Evaluate one trained seed and write <seed_dir>/eval_macrof1.json."""
    res_p = seed_dir / "results.json"
    with open(res_p) as f:
        results = json.load(f)
    args = results["args"]
    model_name = args["model"]
    modalities = args["modalities"].split(",")
    t_obs = args["t_obs"]
    t_fut = args["t_fut"]
    downsample = args.get("downsample", 5)

    test_loader, modality_dims = get_test_loader(modalities, t_obs, t_fut, downsample)

    # Rebuild the model with the right modality dims and load the best checkpoint.
    model = build_model(model_name, modality_dims).to(device)
    state = torch.load(seed_dir / "model_best.pt", map_location=device,
                       weights_only=False)
    model.load_state_dict(state["state_dict"])
    model.eval()

    all_logits = {k: [] for k in ("verb_fine", "verb_composite", "noun", "hand")}
    all_y = {k: [] for k in ("verb_fine", "verb_composite", "noun", "hand")}

    with torch.no_grad():
        for x, mask, lens, y, meta in test_loader:
            x = {m: t.to(device) for m, t in x.items()}
            mask = mask.to(device)
            logits = model(x, mask)
            for k in all_logits:
                all_logits[k].append(logits[k].cpu())
                all_y[k].append(y[k])

    logits_cat = {k: torch.cat(v, dim=0) for k, v in all_logits.items()}
    y_cat = {k: torch.cat(v, dim=0).numpy() for k, v in all_y.items()}
    pred_cat = {k: logits_cat[k].argmax(dim=1).numpy() for k in logits_cat}

    # Per-head accuracy, macro-F1 and weighted-F1.
    out = {}
    for k in ("verb_fine", "verb_composite", "noun", "hand"):
        out[f"{k}_acc"] = float(accuracy_score(y_cat[k], pred_cat[k]))
        out[f"{k}_macro_f1"] = float(f1_score(y_cat[k], pred_cat[k],
                                              average="macro", zero_division=0))
        out[f"{k}_weighted_f1"] = float(f1_score(y_cat[k], pred_cat[k],
                                                 average="weighted", zero_division=0))

    # Joint "action" accuracy: verb_fine, noun and hand must all be correct.
    correct = ((pred_cat["verb_fine"] == y_cat["verb_fine"]) &
               (pred_cat["noun"] == y_cat["noun"]) &
               (pred_cat["hand"] == y_cat["hand"]))
    out["action_acc"] = float(correct.mean())

    # Total parameter count of the model.
    out["n_params"] = sum(p.numel() for p in model.parameters())

    out_p = seed_dir / "eval_macrof1.json"
    with open(out_p, "w") as f:
        json.dump(out, f, indent=2)
    return out
|
|
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"device={device}", flush=True)
    seed_dirs = find_seed_dirs()
    print(f"Found {len(seed_dirs)} seed dirs", flush=True)
    t0 = time.time()
    n_ok = 0
    n_fail = 0
    for i, sd in enumerate(seed_dirs, 1):
        try:
            res = eval_one(sd, device)
            n_ok += 1
            if i % 10 == 0 or i <= 3:
                rel = sd.relative_to(REPO)
                print(f" [{i:>3}/{len(seed_dirs)}] {rel} "
                      f"action_acc={res['action_acc']:.4f} "
                      f"verb_fine_macroF1={res['verb_fine_macro_f1']:.4f} "
                      f"noun_macroF1={res['noun_macro_f1']:.4f}",
                      flush=True)
        except Exception as e:
            n_fail += 1
            print(f" [{i:>3}/{len(seed_dirs)}] FAIL {sd.relative_to(REPO)}: {e}",
                  flush=True)
    dur = time.time() - t0
    print(f"Done. ok={n_ok} fail={n_fail} elapsed={dur:.1f}s", flush=True)
|
|
if __name__ == "__main__":
    main()
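
# Illustrative follow-up (not executed by this script): the per-seed JSONs can be
# collected into a single table afterwards, e.g. with pandas. The grouping below is
# an assumption about how results would be summarised, not part of this script.
#
#   import pandas as pd
#   rows = []
#   for sd in find_seed_dirs():
#       with open(sd / "eval_macrof1.json") as f:
#           rows.append({"seed_dir": str(sd.relative_to(REPO)), **json.load(f)})
#   df = pd.DataFrame(rows)
#   df["table"] = df["seed_dir"].str.split("/").str[0]
#   print(df.groupby("table").mean(numeric_only=True))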
|
|