#!/usr/bin/env python3
"""Re-evaluate all 135 trained seeds with paper-style metrics.
For each <row>/seeds/seed*/model_best.pt:
- Reload the model with the right modalities
- Build the test loader for that modality subset
- Run inference, collect predictions
- Compute Acc, Macro-F1, Weighted-F1 per head (verb_fine, verb_composite,
noun, hand) and for the joint "action" (= verb_fine ∧ noun ∧ hand)
- Write <seed_dir>/eval_macrof1.json
Cache the test_ds per modality subset so we don't rebuild it 135 times.
"""
from __future__ import annotations
import json
import os
import sys
import time
from pathlib import Path
import pandas as pd # noqa: F401 (dataset_seqpred imports pandas first)
import numpy as np
import torch
from sklearn.metrics import f1_score, accuracy_score
from torch.utils.data import DataLoader
REPO = Path("${PULSE_ROOT}")
sys.path.insert(0, str(REPO / "experiments"))
from dataset_seqpred import (  # noqa: E402
    TripletSeqPredDataset, build_train_test, collate_triplet,
    TRAIN_VOLS_V3, TEST_VOLS_V3,
)
from models_seqpred import build_model  # noqa: E402


def find_seed_dirs():
    out = []
    for table_name in [
        "table1_main_comparison",
        "table3_horizon_curve",
        "table4_modality_ablation",
        "table5_component_ablation",
        "table7_missing_modality",
    ]:
        td = REPO / table_name
        for row_dir in sorted(td.glob("row*")):
            for sd in sorted((row_dir / "seeds").glob("seed*")):
                if (sd / "model_best.pt").exists() and (sd / "results.json").exists():
                    out.append(sd)
    return out
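
# find_seed_dirs() above matches directories shaped like the hypothetical
# example below (the row/seed names are illustrative; only the "row*" and
# "seed*" prefixes and the two files are required):
#   <REPO>/table1_main_comparison/row1/seeds/seed0/{model_best.pt, results.json}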


# Key: (modalities_tuple, t_obs, t_fut, downsample) -> (test_loader, modality_dims)
_test_cache = {}


def get_test_loader(modalities, t_obs, t_fut, downsample, num_workers=0):
    key = (tuple(modalities), float(t_obs), float(t_fut), int(downsample))
    if key in _test_cache:
        return _test_cache[key]
    print(f" [build test loader] modalities={modalities} t_obs={t_obs} t_fut={t_fut}",
          flush=True)
    train_ds, test_ds = build_train_test(
        modalities=list(modalities),
        t_obs_sec=t_obs, t_fut_sec=t_fut, downsample=downsample,
    )
    test_loader = DataLoader(test_ds, batch_size=64, shuffle=False,
                             collate_fn=collate_triplet, num_workers=num_workers)
    md = test_ds.modality_dims
    _test_cache[key] = (test_loader, md)
    return test_loader, md
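
# eval_one() below reads the training configuration back out of each seed's
# results.json. A minimal sketch of the fields it expects under "args"
# (key names taken from the code; the values shown are purely illustrative):
#   {"args": {"model": "<model_name>", "modalities": "<mod1>,<mod2>",
#             "t_obs": 2.0, "t_fut": 1.0, "downsample": 5}}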


def eval_one(seed_dir: Path, device: torch.device):
    res_p = seed_dir / "results.json"
    with open(res_p) as f:
        results = json.load(f)
    args = results["args"]
    model_name = args["model"]
    modalities = args["modalities"].split(",")
    t_obs = args["t_obs"]
    t_fut = args["t_fut"]
    downsample = args.get("downsample", 5)
    test_loader, modality_dims = get_test_loader(modalities, t_obs, t_fut, downsample)
    model = build_model(model_name, modality_dims).to(device)
    state = torch.load(seed_dir / "model_best.pt", map_location=device,
                       weights_only=False)
    model.load_state_dict(state["state_dict"])
    model.eval()
    all_logits = {k: [] for k in ("verb_fine", "verb_composite", "noun", "hand")}
    all_y = {k: [] for k in ("verb_fine", "verb_composite", "noun", "hand")}
    with torch.no_grad():
        for x, mask, lens, y, meta in test_loader:
            x = {m: t.to(device) for m, t in x.items()}
            mask = mask.to(device)
            logits = model(x, mask)
            for k in all_logits:
                all_logits[k].append(logits[k].cpu())
                all_y[k].append(y[k])
    logits_cat = {k: torch.cat(v, dim=0) for k, v in all_logits.items()}
    y_cat = {k: torch.cat(v, dim=0).numpy() for k, v in all_y.items()}
    pred_cat = {k: logits_cat[k].argmax(dim=1).numpy() for k in logits_cat}
    out = {}
    for k in ("verb_fine", "verb_composite", "noun", "hand"):
        out[f"{k}_acc"] = float(accuracy_score(y_cat[k], pred_cat[k]))
        out[f"{k}_macro_f1"] = float(f1_score(y_cat[k], pred_cat[k],
                                              average="macro", zero_division=0))
        out[f"{k}_weighted_f1"] = float(f1_score(y_cat[k], pred_cat[k],
                                                 average="weighted", zero_division=0))
    # Joint action = verb_fine AND noun AND hand correct
    correct = ((pred_cat["verb_fine"] == y_cat["verb_fine"]) &
               (pred_cat["noun"] == y_cat["noun"]) &
               (pred_cat["hand"] == y_cat["hand"]))
    out["action_acc"] = float(correct.mean())
    # n_params (cheap)
    out["n_params"] = sum(p.numel() for p in model.parameters())
    out_p = seed_dir / "eval_macrof1.json"
    with open(out_p, "w") as f:
        json.dump(out, f, indent=2)
    return out
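
# Shape of the eval_macrof1.json written above (key names taken from the code,
# numbers purely illustrative): the three metrics repeat for verb_fine,
# verb_composite, noun and hand, plus the joint action accuracy and param count.
#   {"verb_fine_acc": 0.61, "verb_fine_macro_f1": 0.34, "verb_fine_weighted_f1": 0.58,
#    ...,
#    "action_acc": 0.29, "n_params": 1234567}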


def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"device={device}", flush=True)
    seed_dirs = find_seed_dirs()
    print(f"Found {len(seed_dirs)} seed dirs", flush=True)
    t0 = time.time()
    n_ok = 0
    n_fail = 0
    for i, sd in enumerate(seed_dirs, 1):
        try:
            res = eval_one(sd, device)
            n_ok += 1
            if i % 10 == 0 or i <= 3:
                rel = sd.relative_to(REPO)
                print(f" [{i:>3}/{len(seed_dirs)}] {rel} "
                      f"action_acc={res['action_acc']:.4f} "
                      f"verb_fine_macroF1={res['verb_fine_macro_f1']:.4f} "
                      f"noun_macroF1={res['noun_macro_f1']:.4f}",
                      flush=True)
        except Exception as e:
            n_fail += 1
            print(f" [{i:>3}/{len(seed_dirs)}] FAIL {sd.relative_to(REPO)}: {e}",
                  flush=True)
    dur = time.time() - t0
    print(f"Done. ok={n_ok} fail={n_fail} elapsed={dur:.1f}s", flush=True)


if __name__ == "__main__":
    main()