File size: 5,856 Bytes
#!/usr/bin/env python3
"""Re-evaluate all 135 trained seeds with paper-style metrics.
For each <row>/seeds/seed*/model_best.pt:
- Reload the model with the right modalities
- Build the test loader for that modality subset
- Run inference, collect predictions
- Compute Acc, Macro-F1, Weighted-F1 per head (verb_fine, verb_composite,
noun, hand) and for the joint "action" (= verb_fine ∧ noun ∧ hand)
- Write <seed_dir>/eval_macrof1.json
Cache the test_ds per modality subset so we don't rebuild it 135 times.
"""
from __future__ import annotations
import json
import os
import sys
import time
from pathlib import Path
import pandas as pd # noqa: F401 (dataset_seqpred imports pandas first)
import numpy as np
import torch
from sklearn.metrics import f1_score, accuracy_score
from torch.utils.data import DataLoader
# Root of the results tree.  "${PULSE_ROOT}" is a shell-style placeholder that
# pathlib never expands by itself, so it is resolved from the environment here.
# os.path.expandvars leaves the text unchanged when PULSE_ROOT is not set, so
# the previous value is still what you get in that case.
REPO = Path(os.path.expandvars("${PULSE_ROOT}"))
# Put <REPO>/experiments first on the import path so the project modules
# imported right after this line can be found.
sys.path.insert(0, str(REPO / "experiments"))
from dataset_seqpred import ( # noqa: E402
TripletSeqPredDataset, build_train_test, collate_triplet,
TRAIN_VOLS_V3, TEST_VOLS_V3,
)
from models_seqpred import build_model # noqa: E402
def find_seed_dirs():
    """Collect every seed directory that is ready to be re-evaluated.

    Visits a fixed list of result tables under ``REPO``, then every ``row*``
    directory of a table, then every ``seeds/seed*`` entry of a row (rows and
    seeds are visited in sorted order).  A seed directory is kept only when
    it contains both ``model_best.pt`` and ``results.json``.

    Returns:
        A list of ``Path`` objects ordered by table, then row, then seed.
    """
    tables = (
        "table1_main_comparison",
        "table3_horizon_curve",
        "table4_modality_ablation",
        "table5_component_ablation",
        "table7_missing_modality",
    )
    # Checkpoint first, then the stored run description.
    required_files = ("model_best.pt", "results.json")

    found = []
    for table in tables:
        for row in sorted((REPO / table).glob("row*")):
            candidates = sorted((row / "seeds").glob("seed*"))
            found.extend(
                seed_dir
                for seed_dir in candidates
                if all((seed_dir / name).exists() for name in required_files)
            )
    return found
# Memo of already-built test loaders, filled and read by get_test_loader().
# (modalities_tuple, t_obs, t_fut, downsample) -> (test_loader, modality_dims)
_test_cache = {}
def get_test_loader(modalities, t_obs, t_fut, downsample, num_workers=0):
    """Return the test ``DataLoader`` for one data configuration, memoised.

    Results are stored in the module-level ``_test_cache`` under the key
    ``(tuple(modalities), float(t_obs), float(t_fut), int(downsample))``.
    ``num_workers`` is not part of that key, so a cached loader is returned
    as-is whatever ``num_workers`` value is passed on later calls.

    Args:
        modalities: Iterable of modality names.
        t_obs: Forwarded to ``build_train_test`` as ``t_obs_sec``.
        t_fut: Forwarded to ``build_train_test`` as ``t_fut_sec``.
        downsample: Forwarded to ``build_train_test`` as ``downsample``.
        num_workers: Worker count for a newly created ``DataLoader``.

    Returns:
        A ``(test_loader, modality_dims)`` tuple, where ``modality_dims`` is
        read from the test dataset's ``modality_dims`` attribute.
    """
    cache_key = (tuple(modalities), float(t_obs), float(t_fut), int(downsample))
    cached = _test_cache.get(cache_key)
    if cached is not None:
        return cached

    print(f" [build test loader] modalities={modalities} t_obs={t_obs} t_fut={t_fut}",
          flush=True)
    # build_train_test returns both splits; only the test split is used here.
    _unused_train, test_split = build_train_test(
        modalities=list(modalities),
        t_obs_sec=t_obs,
        t_fut_sec=t_fut,
        downsample=downsample,
    )
    loader = DataLoader(
        test_split,
        batch_size=64,
        shuffle=False,
        collate_fn=collate_triplet,
        num_workers=num_workers,
    )
    entry = (loader, test_split.modality_dims)
    _test_cache[cache_key] = entry
    return entry
def eval_one(seed_dir: Path, device: torch.device):
    """Re-evaluate one trained seed and write its metrics next to it.

    Reads the run configuration from ``<seed_dir>/results.json``, rebuilds
    the model, loads ``<seed_dir>/model_best.pt``, runs the test loader under
    ``torch.no_grad()`` and computes accuracy, macro-F1 and weighted-F1 for
    each head (``verb_fine``, ``verb_composite``, ``noun``, ``hand``).  It
    also computes the joint ``action_acc`` (a sample counts only when
    ``verb_fine``, ``noun`` and ``hand`` are all correct) and the parameter
    count.  The metrics are written to ``<seed_dir>/eval_macrof1.json``.

    Args:
        seed_dir: Directory holding ``results.json`` and ``model_best.pt``.
        device: Device the model and the input batches are moved to.

    Returns:
        The metrics dict that was written to disk.
    """
    heads = ("verb_fine", "verb_composite", "noun", "hand")

    # Recover the settings this seed was trained with.
    with open(seed_dir / "results.json") as fh:
        run_args = json.load(fh)["args"]
    model_name = run_args["model"]
    modality_names = run_args["modalities"].split(",")
    t_obs = run_args["t_obs"]
    t_fut = run_args["t_fut"]
    downsample = run_args.get("downsample", 5)

    test_loader, modality_dims = get_test_loader(
        modality_names, t_obs, t_fut, downsample
    )

    net = build_model(model_name, modality_dims).to(device)
    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
    # objects; only point this at checkpoints from a trusted source.
    checkpoint = torch.load(
        seed_dir / "model_best.pt", map_location=device, weights_only=False
    )
    net.load_state_dict(checkpoint["state_dict"])
    net.eval()

    # Per-head lists of per-batch tensors, concatenated after the loop.
    logit_chunks = {head: [] for head in heads}
    label_chunks = {head: [] for head in heads}
    with torch.no_grad():
        for inputs, mask, _lens, targets, _meta in test_loader:
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            outputs = net(inputs, mask.to(device))
            for head in heads:
                logit_chunks[head].append(outputs[head].cpu())
                label_chunks[head].append(targets[head])

    preds = {
        head: torch.cat(logit_chunks[head], dim=0).argmax(dim=1).numpy()
        for head in heads
    }
    labels = {
        head: torch.cat(label_chunks[head], dim=0).numpy()
        for head in heads
    }

    metrics = {}
    for head in heads:
        y_true = labels[head]
        y_pred = preds[head]
        metrics[f"{head}_acc"] = float(accuracy_score(y_true, y_pred))
        metrics[f"{head}_macro_f1"] = float(
            f1_score(y_true, y_pred, average="macro", zero_division=0)
        )
        metrics[f"{head}_weighted_f1"] = float(
            f1_score(y_true, y_pred, average="weighted", zero_division=0)
        )

    # Joint action = verb_fine AND noun AND hand all predicted correctly.
    hit = {head: preds[head] == labels[head] for head in ("verb_fine", "noun", "hand")}
    joint_hit = hit["verb_fine"] & hit["noun"] & hit["hand"]
    metrics["action_acc"] = float(joint_hit.mean())

    # Total number of elements over all model parameters.
    metrics["n_params"] = sum(param.numel() for param in net.parameters())

    with open(seed_dir / "eval_macrof1.json", "w") as fh:
        json.dump(metrics, fh, indent=2)
    return metrics
def main():
    """Re-evaluate every discovered seed directory and print a summary.

    Uses CUDA when ``torch.cuda.is_available()`` and the CPU otherwise.  Each
    seed is evaluated inside its own ``try`` so one failing seed is reported
    and counted without stopping the run.  A progress line is printed for the
    first three seeds and for every tenth seed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"device={device}", flush=True)

    seed_dirs = find_seed_dirs()
    total = len(seed_dirs)
    print(f"Found {total} seed dirs", flush=True)

    started = time.time()
    succeeded = 0
    failed = 0
    for idx, seed_dir in enumerate(seed_dirs, start=1):
        try:
            metrics = eval_one(seed_dir, device)
            succeeded += 1
            # Keep the log short: first three seeds, then every tenth one.
            if idx <= 3 or idx % 10 == 0:
                rel = seed_dir.relative_to(REPO)
                print(f" [{idx:>3}/{total}] {rel} "
                      f"action_acc={metrics['action_acc']:.4f} "
                      f"verb_fine_macroF1={metrics['verb_fine_macro_f1']:.4f} "
                      f"noun_macroF1={metrics['noun_macro_f1']:.4f}",
                      flush=True)
        except Exception as e:
            # Best-effort batch run: report the failure and move on.
            failed += 1
            print(f" [{idx:>3}/{total}] FAIL {seed_dir.relative_to(REPO)}: {e}",
                  flush=True)

    elapsed = time.time() - started
    print(f"Done. ok={succeeded} fail={failed} elapsed={elapsed:.1f}s", flush=True)
# Run the re-evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|