| """Evaluate a trained forensics-GRPO model on the AF test split. |
| |
| Adapted from evaluate.py (single-span Charades-style grounding) for: |
| - multi-segment localisation (list of (s, e) tuples per video) |
| - the forensics CoT prompt template (FORENSICS_COT toggle preserved) |
| - cached video_inputs.pt to avoid re-decoding |
| - multi-GPU sharding (one process per device) |
| - multiple matching metrics: soft_F1, mean_F1@{0.5,0.75,0.85,0.95}, hungarian_IoU |
| |
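Output:
    <out_dir>/rank_<r>.jsonl   one record per evaluated test video on this rank
    <out_dir>/summary.json     aggregate metrics (overall + per-generator); NOT
                               written by this script -- presumably produced by a
                               separate aggregation step over the rank_<r>.jsonl
                               files (TODO confirm)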
| """ |
| import argparse |
| import json |
| import os |
| import random |
| import sys |
| import time |
|
|
| import torch |
| from transformers import ( |
| AutoProcessor, |
| GenerationConfig, |
| Qwen2_5_VLForConditionalGeneration, |
| ) |
|
|
| sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) |
| from src.open_r1.data_loader import TEST_GENERATORS, build_examples |
| from src.open_r1.reward import ( |
| hungarian_iou_reward, |
| mean_f1_at_tiou, |
| parse_segments, |
| soft_f1, |
| ) |
| from src.open_r1.trainer.grpo_trainer_video_GT_soft import ( |
| SYSTEM_PROMPT, |
| get_question_template, |
| ) |
| from src.open_r1.verifier import ( |
| ForensicsVerifier, |
| format_verifier_scores, |
| sample_id_from_video_path, |
| ) |
|
|
# Dataset locations. The defaults are the original hard-coded paths; each can be
# overridden through an environment variable so the script also runs on other
# machines without editing the source.
#   ANNOT - passed to build_examples() as annot_dir.
#   VROOT - passed to build_examples() as video_root.
#   CACHE - passed to build_examples() as preprocessed_data_path, and used as
#           the root of the per-sample cache: <CACHE>/test/<generator>/<sample_id>/.
ANNOT = os.environ.get("FORENSICS_ANNOT_DIR", "/mnt/local-fast/zhangt/annot/annot")
VROOT = os.environ.get("FORENSICS_VIDEO_ROOT", "/mnt/local-fast/zhangt/video")
CACHE = os.environ.get(
    "FORENSICS_CACHE_DIR",
    "/mnt/local-fast/zhangt/forensics_grpo_cache_uniform3584_fps2.0",
)
|
|
|
|
def get_args(argv=None):
    """Parse and validate the command-line options for one evaluation shard.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what the script entry
            point relies on.

    Returns:
        The populated ``argparse.Namespace``.

    Exits (``SystemExit`` via ``parser.error``) when ``--world_size`` is < 1 or
    ``--rank`` is outside ``[0, world_size)``. Sharding selects examples with
    ``index % world_size == rank``, so a zero world size would divide by zero
    and an out-of-range rank would silently evaluate nothing.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--model_path", required=True)
    p.add_argument("--rank", type=int, default=0)
    p.add_argument("--world_size", type=int, default=1)
    p.add_argument("--device", type=int, default=0,
                   help="cuda device index (set CUDA_VISIBLE_DEVICES to pin physical GPU)")
    p.add_argument("--out_dir", default="eval_outputs/stage2_verifier_grounded")
    p.add_argument("--cot", choices=["true", "false"], default="true",
                   help="Use CoT prompt template ('true') or no-CoT ('false').")
    p.add_argument("--cot_variant",
                   choices=["descriptive", "counterfactual", "counterfactual_parsimonious"],
                   default="descriptive",
                   help="CoT prompt variant; must match the variant used at training time.")
    p.add_argument("--verifier_context", choices=["true", "false"], default="false",
                   help="If true, inject external verifier per-second scores into the prompt.")
    p.add_argument("--verifier_ckpt",
                   default="/mnt/local-fast/zhangt/forensics_verifier_clip_l14/verifier_temporal_best.pt")
    p.add_argument("--verifier_cache", default="/mnt/local-fast/zhangt/forensics_verifier_clip_l14")
    p.add_argument("--max_new_tokens", type=int, default=640)
    p.add_argument("--temperature", type=float, default=0.0,
                   help="Greedy if 0 else sample with this temp.")
    p.add_argument("--limit", type=int, default=0, help="Cap number of videos per rank (0=all)")
    args = p.parse_args(argv)

    # Fail fast on an impossible shard layout instead of crashing (or doing
    # nothing) only after the model has been loaded.
    if args.world_size < 1:
        p.error(f"--world_size must be >= 1, got {args.world_size}")
    if not 0 <= args.rank < args.world_size:
        p.error(f"--rank must be in [0, {args.world_size}), got {args.rank}")
    return args
|
|
|
|
def load_cached(sample_dir):
    """Load the pre-computed video inputs for one sample.

    Args:
        sample_dir: Directory holding ``video_inputs.pt`` and
            ``video_kwargs.json`` for a single video.

    Returns:
        A ``(feats, kwargs)`` tuple: the object stored in ``video_inputs.pt``
        and the dict parsed from ``video_kwargs.json``.

    Raises:
        FileNotFoundError: If either file is missing.
    """
    # SECURITY: weights_only=False runs the full pickle machinery, which can
    # execute arbitrary code. Only point this at caches produced by trusted
    # preprocessing runs.
    # map_location="cpu" makes loading independent of the device the cache was
    # written from; the caller moves the processed inputs to its own device.
    feats = torch.load(
        os.path.join(sample_dir, "video_inputs.pt"),
        map_location="cpu",
        weights_only=False,
    )
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(os.path.join(sample_dir, "video_kwargs.json"), "r", encoding="utf-8") as f:
        kwargs = json.load(f)
    return feats, kwargs
|
|
|
|
def _build_question_text(question, generator, mode, prob, generators, wrong_rng, matched_rng):
    """Return the user question, optionally prefixed with a generator hint.

    ``mode`` is the validated FORENSICS_GENCOND_MODE value:
      - "none":    the plain question.
      - "correct": always name the example's true generator.
      - "wrong":   name a generator drawn by ``wrong_rng`` from ``generators``
                   excluding the true one.
      - "matched": name the true generator when ``matched_rng.random() < prob``,
                   otherwise the plain question.

    Each RNG is consulted only in its own mode, exactly once per call.
    """
    if mode == "correct":
        named = generator
    elif mode == "wrong":
        named = wrong_rng.choice([g for g in generators if g != generator])
    elif mode == "matched" and matched_rng.random() < prob:
        named = generator
    else:
        named = None

    if named is None:
        return question
    return f"The forged segments in this video were generated by {named}. " + question


def main():
    """Evaluate this rank's shard of the test split and write per-video records.

    Loads the model and processor from ``--model_path``, keeps every
    ``world_size``-th test example starting at ``rank`` (optionally capped by
    ``--limit``), generates one response per video from the cached video
    inputs, scores the parsed segments against the ground truth and appends one
    JSON line per successfully evaluated video to ``<out_dir>/rank_<rank>.jsonl``.

    Videos whose cache is missing, or whose loading/generation raises, are
    logged, counted as failed and skipped.

    Raises:
        ValueError: If FORENSICS_GENCOND_MODE is not one of
            none|matched|correct|wrong.
    """
    args = get_args()
    device = f"cuda:{args.device}"
    os.makedirs(args.out_dir, exist_ok=True)

    # Exported before get_question_template() is called further down.
    # NOTE(review): presumably the prompt helpers read these at call time, not
    # at import time -- confirm against the trainer module.
    os.environ["FORENSICS_COT"] = args.cot
    os.environ["FORENSICS_COT_VARIANT"] = args.cot_variant

    print(f"[rank {args.rank}/{args.world_size}] device={device} model={args.model_path}", flush=True)
    print(f" cot={args.cot} cot_variant={args.cot_variant} max_new_tokens={args.max_new_tokens} temp={args.temperature}", flush=True)
    if args.verifier_context == "true":
        # The flag is accepted by the CLI but nothing below consumes it; say so
        # loudly rather than let a run be mislabelled as verifier-grounded.
        print(" [warn] --verifier_context=true is not implemented in this script; "
              "verifier scores are NOT injected into the prompt", flush=True)

    t0 = time.time()
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        args.model_path,
        torch_dtype=torch.bfloat16,
        use_sliding_window=True,
        attn_implementation="flash_attention_2",
        device_map=device,
    )
    model.eval()
    processor = AutoProcessor.from_pretrained(args.model_path)
    print(f" loaded model+processor in {time.time()-t0:.1f}s", flush=True)

    examples = build_examples(
        annot_dir=ANNOT, video_root=VROOT, generators=TEST_GENERATORS,
        split_prefix="test", preprocessed_data_path=CACHE, require_video_exists=True,
    )
    # Round-robin sharding: rank r keeps examples r, r + world_size, ...
    examples = [ex for i, ex in enumerate(examples) if i % args.world_size == args.rank]
    if args.limit > 0:
        examples = examples[: args.limit]
    print(f" rank {args.rank} processes {len(examples)} test videos", flush=True)

    question = get_question_template()

    gencond_mode = os.getenv("FORENSICS_GENCOND_MODE", "none").lower()
    if gencond_mode not in ("none", "matched", "correct", "wrong"):
        raise ValueError(f"FORENSICS_GENCOND_MODE must be none|matched|correct|wrong, got {gencond_mode!r}")
    gencond_prob = float(os.getenv("FORENSICS_GENCOND_PROB", "0.5"))
    # Rank-dependent fixed seeds: each shard draws a reproducible sequence.
    wrong_rng = random.Random(0xC0FFEE + args.rank)
    matched_rng = random.Random(0xBEEF00 + args.rank)
    print(f" FORENSICS_GENCOND_MODE = {gencond_mode}"
          + (f" PROB = {gencond_prob}" if gencond_mode == "matched" else ""), flush=True)

    gen_cfg = GenerationConfig(
        max_new_tokens=args.max_new_tokens,
        do_sample=args.temperature > 0,
        # Clamped so that a temperature of exactly 0 is never handed to generate().
        temperature=max(args.temperature, 1e-6),
        pad_token_id=processor.tokenizer.pad_token_id,
        use_cache=True,
    )
    model.config.use_cache = True
    if hasattr(model, "generation_config"):
        model.generation_config.use_cache = True

    out_path = os.path.join(args.out_dir, f"rank_{args.rank}.jsonl")
    t_start = time.time()
    done = failed = 0
    # Context manager: the file is closed even if scoring or JSON serialisation
    # raises part-way through the shard.
    with open(out_path, "w") as fout:
        for ex in examples:
            sample_id = os.path.splitext(os.path.basename(ex["video_path"]))[0]
            sample_dir = os.path.join(CACHE, "test", ex["generator"], sample_id)
            if not os.path.exists(os.path.join(sample_dir, "video_inputs.pt")):
                failed += 1
                print(f" [skip] {sample_id}: cached video_inputs.pt not found in {sample_dir}", flush=True)
                continue

            try:
                video_inputs, video_kwargs = load_cached(sample_dir)
                q_text = _build_question_text(
                    question, ex["generator"], gencond_mode, gencond_prob,
                    TEST_GENERATORS, wrong_rng, matched_rng,
                )

                # The chat is rendered to text only (tokenize=False); the frames
                # fed to the processor below come from the cache.
                chat = [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": [
                        {"type": "video",
                         "video": ex["video_path"],
                         "max_pixels": 3584 * 28 * 28,
                         "min_pixels": 200704,
                         "fps": 2.0,
                         "max_frames": 64,
                         },
                        {"type": "text", "text": q_text},
                    ],
                    },
                ]
                text = processor.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

                inputs = processor(
                    text=[text],
                    videos=[video_inputs[0]],
                    fps=[video_kwargs["fps"][0]],
                    padding=True,
                    return_tensors="pt",
                    padding_side="left",
                    add_special_tokens=False,
                )
                inputs = {k: v.to(device) if hasattr(v, "to") else v for k, v in inputs.items()}

                with torch.no_grad():
                    out_ids = model.generate(**inputs, generation_config=gen_cfg, use_cache=True)
                # Drop the prompt tokens; keep only the newly generated suffix.
                gen_ids = out_ids[0][inputs["input_ids"].shape[1]:]
                output_text = processor.tokenizer.decode(gen_ids, skip_special_tokens=True)
            except Exception as e:
                # Deliberate best-effort: one bad video must not abort the shard.
                failed += 1
                print(f" [skip] {sample_id}: {type(e).__name__}: {e}", flush=True)
                continue

            pred = parse_segments(output_text)
            gt = [tuple(s) for s in ex["solution"]]
            sf = soft_f1(pred, gt)
            mf = mean_f1_at_tiou(pred, gt)
            hg = hungarian_iou_reward(pred, gt)

            rec = {
                "sample_id": sample_id,
                "generator": ex["generator"],
                "gt": gt,
                "pred": pred,
                "output_text": output_text,
                "soft_F1": sf,
                "mean_F1_tIoU": mf,
                "hungarian_iou": hg,
                "n_pred": len(pred),
                "n_gt": len(gt),
                "parse_failed": len(pred) == 0,
            }
            fout.write(json.dumps(rec) + "\n")
            # Flush per record so partial results survive an interrupted run.
            fout.flush()
            done += 1

            if done % 20 == 0:
                elapsed = time.time() - t_start
                rate = done / max(1e-6, elapsed)
                remaining = (len(examples) - done - failed) / max(1e-6, rate)
                print(
                    f" rank={args.rank} done={done} fail={failed} "
                    f"rate={rate:.2f}/s eta={remaining/60:.1f}min",
                    flush=True,
                )

    print(f"[rank {args.rank}] DONE done={done} failed={failed} elapsed={time.time()-t_start:.0f}s", flush=True)


if __name__ == "__main__":
    main()
|
|