#!/usr/bin/env python3
"""Re-evaluate all 135 trained seeds with paper-style metrics.
For each <row>/seeds/seed*/model_best.pt:
- Reload the model with the right modalities
- Build the test loader for that modality subset
- Run inference, collect predictions
- Compute Acc, Macro-F1, Weighted-F1 per head (verb_fine, verb_composite,
noun, hand) and for the joint "action" (= verb_fine ∧ noun ∧ hand)
- Write <seed_dir>/eval_macrof1.json
Cache the test_ds per modality subset so we don't rebuild it 135 times.
"""
from __future__ import annotations
import json
import os
import sys
import time
from pathlib import Path
import pandas as pd # noqa: F401 (dataset_seqpred imports pandas first)
import numpy as np
import torch
from sklearn.metrics import f1_score, accuracy_score
from torch.utils.data import DataLoader
REPO = Path("${PULSE_ROOT}")
sys.path.insert(0, str(REPO / "experiments"))
from dataset_seqpred import (  # noqa: E402
    TripletSeqPredDataset, build_train_test, collate_triplet,
    TRAIN_VOLS_V3, TEST_VOLS_V3,
)
from models_seqpred import build_model  # noqa: E402


def find_seed_dirs():
    out = []
    for table_name in [
        "table1_main_comparison",
        "table3_horizon_curve",
        "table4_modality_ablation",
        "table5_component_ablation",
        "table7_missing_modality",
    ]:
        td = REPO / table_name
        for row_dir in sorted(td.glob("row*")):
            for sd in sorted((row_dir / "seeds").glob("seed*")):
                if (sd / "model_best.pt").exists() and (sd / "results.json").exists():
                    out.append(sd)
    return out
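
# find_seed_dirs() above matches directories shaped like the hypothetical
# example below (the row/seed names are illustrative; only the "row*" and
# "seed*" prefixes and the two files are required):
#   <REPO>/table1_main_comparison/row1/seeds/seed0/{model_best.pt, results.json}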


# Key: (modalities_tuple, t_obs, t_fut, downsample) -> (test_loader, modality_dims)
_test_cache = {}


def get_test_loader(modalities, t_obs, t_fut, downsample, num_workers=0):
    key = (tuple(modalities), float(t_obs), float(t_fut), int(downsample))
    if key in _test_cache:
        return _test_cache[key]
    print(f" [build test loader] modalities={modalities} t_obs={t_obs} t_fut={t_fut}",
          flush=True)
    train_ds, test_ds = build_train_test(
        modalities=list(modalities),
        t_obs_sec=t_obs, t_fut_sec=t_fut, downsample=downsample,
    )
    test_loader = DataLoader(test_ds, batch_size=64, shuffle=False,
                             collate_fn=collate_triplet, num_workers=num_workers)
    md = test_ds.modality_dims
    _test_cache[key] = (test_loader, md)
    return test_loader, md
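
# eval_one() below reads the training configuration back out of each seed's
# results.json. A minimal sketch of the fields it expects under "args"
# (key names taken from the code; the values shown are purely illustrative):
#   {"args": {"model": "<model_name>", "modalities": "<mod1>,<mod2>",
#             "t_obs": 2.0, "t_fut": 1.0, "downsample": 5}}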


def eval_one(seed_dir: Path, device: torch.device):
    res_p = seed_dir / "results.json"
    with open(res_p) as f:
        results = json.load(f)
    args = results["args"]
    model_name = args["model"]
    modalities = args["modalities"].split(",")
    t_obs = args["t_obs"]
    t_fut = args["t_fut"]
    downsample = args.get("downsample", 5)
    test_loader, modality_dims = get_test_loader(modalities, t_obs, t_fut, downsample)
    model = build_model(model_name, modality_dims).to(device)
    state = torch.load(seed_dir / "model_best.pt", map_location=device,
                       weights_only=False)
    model.load_state_dict(state["state_dict"])
    model.eval()
    all_logits = {k: [] for k in ("verb_fine", "verb_composite", "noun", "hand")}
    all_y = {k: [] for k in ("verb_fine", "verb_composite", "noun", "hand")}
    with torch.no_grad():
        for x, mask, lens, y, meta in test_loader:
            x = {m: t.to(device) for m, t in x.items()}
            mask = mask.to(device)
            logits = model(x, mask)
            for k in all_logits:
                all_logits[k].append(logits[k].cpu())
                all_y[k].append(y[k])
    logits_cat = {k: torch.cat(v, dim=0) for k, v in all_logits.items()}
    y_cat = {k: torch.cat(v, dim=0).numpy() for k, v in all_y.items()}
    pred_cat = {k: logits_cat[k].argmax(dim=1).numpy() for k in logits_cat}
    out = {}
    for k in ("verb_fine", "verb_composite", "noun", "hand"):
        out[f"{k}_acc"] = float(accuracy_score(y_cat[k], pred_cat[k]))
        out[f"{k}_macro_f1"] = float(f1_score(y_cat[k], pred_cat[k],
                                              average="macro", zero_division=0))
        out[f"{k}_weighted_f1"] = float(f1_score(y_cat[k], pred_cat[k],
                                                 average="weighted", zero_division=0))
    # Joint action = verb_fine AND noun AND hand correct
    correct = ((pred_cat["verb_fine"] == y_cat["verb_fine"]) &
               (pred_cat["noun"] == y_cat["noun"]) &
               (pred_cat["hand"] == y_cat["hand"]))
    out["action_acc"] = float(correct.mean())
    # n_params (cheap)
    out["n_params"] = sum(p.numel() for p in model.parameters())
    out_p = seed_dir / "eval_macrof1.json"
    with open(out_p, "w") as f:
        json.dump(out, f, indent=2)
    return out
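
# Shape of the eval_macrof1.json written above (key names taken from the code,
# numbers purely illustrative): the three metrics repeat for verb_fine,
# verb_composite, noun and hand, plus the joint action accuracy and param count.
#   {"verb_fine_acc": 0.61, "verb_fine_macro_f1": 0.34, "verb_fine_weighted_f1": 0.58,
#    ...,
#    "action_acc": 0.29, "n_params": 1234567}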


def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"device={device}", flush=True)
    seed_dirs = find_seed_dirs()
    print(f"Found {len(seed_dirs)} seed dirs", flush=True)
    t0 = time.time()
    n_ok = 0
    n_fail = 0
    for i, sd in enumerate(seed_dirs, 1):
        try:
            res = eval_one(sd, device)
            n_ok += 1
            if i % 10 == 0 or i <= 3:
                rel = sd.relative_to(REPO)
                print(f" [{i:>3}/{len(seed_dirs)}] {rel} "
                      f"action_acc={res['action_acc']:.4f} "
                      f"verb_fine_macroF1={res['verb_fine_macro_f1']:.4f} "
                      f"noun_macroF1={res['noun_macro_f1']:.4f}",
                      flush=True)
        except Exception as e:
            n_fail += 1
            print(f" [{i:>3}/{len(seed_dirs)}] FAIL {sd.relative_to(REPO)}: {e}",
                  flush=True)
    dur = time.time() - t0
    print(f"Done. ok={n_ok} fail={n_fail} elapsed={dur:.1f}s", flush=True)


if __name__ == "__main__":
    main()