| """Forward-Backward Consistency (FBC) signal validation. |
| |
| Hypothesis: Real-world video has temporal asymmetry under reversal (gravity, |
| momentum, causal flow); AI-generated segments often lack this asymmetry. |
| So a model trained for forgery localization should produce SIMILAR predictions |
| on the forward and reversed versions of the same video — because the AI |
| artifact carries through reversal, while real content gets "weird" enough to |
| suppress false-positive detection. |
| |
| Quantitative test: run stage1_decomp_boundary ckpt on each test sample twice |
| (forward video / temporally-flipped video). Map reversed-prediction back to |
| original coordinates and measure: |
| |
| IoU(pred_F, GT) — forward accuracy (baseline) |
| IoU(pred_R_remapped, GT) — reverse accuracy |
| IoU(pred_F, pred_R_remapped) — KEY: model self-consistency under reversal |
| |
| For FBC to be a useful GRPO reward: |
| 1. mean IoU(F, R) should be substantially > 0 (i.e. model IS consistent |
| — if it's near 0, reverse video is just confusing the model and we |
| can't extract a forensic signal from it). |
| 2. corr(IoU(F, R), IoU(F, GT)) > 0 — consistent predictions correlate |
| with correct predictions. This is what makes "push toward consistency" |
| a valid training pressure. |
| 3. Per-generator analysis: AI-heavy generators (wan, ltx, vace, fcvg) |
| should have higher IoU(F, R) than less-AI generators if the hypothesis |
| about AI lacking temporal causality holds. |
| |
| If (1) and (2) fail, FBC is not a usable signal and we need a different idea. |
| """ |
| from __future__ import annotations |
|
|
| import argparse |
| import glob |
| import json |
| import os |
| import sys |
| import time |
| from pathlib import Path |
|
|
| import numpy as np |
| import torch |
| from transformers import ( |
| AutoProcessor, |
| GenerationConfig, |
| Qwen2_5_VLForConditionalGeneration, |
| ) |
|
|
# Root of the forensics_grpo project tree (hard-coded absolute path).
REPO = Path("/mnt/local-fast/zhangt/forensics_grpo")
# Prepend the project root and its src/ directory to the import path so the
# project-local `src.open_r1` imports that follow can be resolved.
sys.path.insert(0, str(REPO))
sys.path.insert(0, str(REPO / "src"))
|
|
| from src.open_r1.data_loader import TEST_GENERATORS, build_examples |
| from src.open_r1.reward import parse_segments |
| from src.open_r1.trainer.grpo_trainer_video_GT_soft import ( |
| SYSTEM_PROMPT, |
| get_question_template, |
| ) |
|
|
# Hard-coded data locations handed to build_examples() in main().
# VROOT is passed as video_root, ANNOT as annot_dir.
VROOT = "/mnt/local-fast/zhangt/video"
ANNOT = "/mnt/local-fast/zhangt/annot/annot"
# Passed as preprocessed_data_path; main() also reads the per-sample cached
# inputs from <CACHE>/test/<generator>/<sample_id>/.
CACHE = "/mnt/local-fast/zhangt/forensics_grpo_cache_uniform3584_fps2.0"
|
|
|
|
def iou_1d(a, b):
    """Return the 1-D intersection-over-union of two ``(start, end)`` intervals.

    The denominator is the extent from the earliest start to the latest end.
    A non-positive extent yields 0.0.
    """
    start_a, end_a = a
    start_b, end_b = b
    overlap = max(0.0, min(end_a, end_b) - max(start_a, start_b))
    extent = max(end_a, end_b) - min(start_a, start_b)
    if extent > 0:
        return overlap / extent
    return 0.0
|
|
|
|
def soft_f1_iou(preds, gts):
    """Score two interval sets with a soft F1 over their pairwise IoUs.

    Precision is the mean, over predictions, of the best IoU against any
    ground-truth interval; recall is the same from the ground-truth side.
    Two empty sets score 1.0; exactly one empty set scores 0.0. The original
    author notes this is meant to match the metric in reward.py.
    """
    if not preds:
        return 0.0 if gts else 1.0
    if not gts:
        return 0.0
    best_per_pred = [max(iou_1d(pred, gt) for gt in gts) for pred in preds]
    best_per_gt = [max(iou_1d(gt, pred) for pred in preds) for gt in gts]
    precision = sum(best_per_pred) / len(best_per_pred)
    recall = sum(best_per_gt) / len(best_per_gt)
    total = precision + recall
    if total > 0:
        return 2 * precision * recall / total
    return 0.0
|
|
|
|
def remap_reversed(segs, duration):
    """Translate intervals predicted on the reversed clip into original time.

    A reversed-time interval ``(s, e)`` becomes ``(duration - e, duration - s)``,
    with each endpoint floored at 0.0.
    """
    remapped = []
    for rev_start, rev_end in segs:
        orig_start = max(0.0, duration - rev_end)
        orig_end = max(0.0, duration - rev_start)
        remapped.append((orig_start, orig_end))
    return remapped
|
|
|
|
def run_inference(model, processor, video_tensor, fps, question, gen_cfg, device):
    """Generate the model's text answer for one video and one question.

    Builds a system + user chat (video placeholder followed by the question),
    runs the processor on the rendered prompt and the video tensor, generates
    under ``torch.no_grad()``, and decodes only the newly generated tokens.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": SYSTEM_PROMPT}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "video", "video": "placeholder"},
            {"type": "text", "text": question},
        ],
    }
    prompt = processor.apply_chat_template(
        [system_turn, user_turn], tokenize=False, add_generation_prompt=True
    )
    batch = processor(
        text=[prompt],
        videos=[video_tensor],
        fps=[fps],
        padding=True,
        return_tensors="pt",
        padding_side="left",
        add_special_tokens=False,
    )
    # Move anything that supports .to() onto the target device; leave the rest.
    on_device = {}
    for key, value in batch.items():
        on_device[key] = value.to(device) if hasattr(value, "to") else value
    prompt_len = on_device["input_ids"].shape[1]
    with torch.no_grad():
        generated = model.generate(**on_device, generation_config=gen_cfg, use_cache=True)
    # Drop the prompt tokens so only the continuation is decoded.
    new_tokens = generated[0][prompt_len:]
    return processor.tokenizer.decode(new_tokens, skip_special_tokens=True)
|
|
|
|
def _select_samples(examples, n, cache_root, seed=None):
    """Return up to ``n`` ``(example, sample_id, sample_dir)`` triples that have cached inputs.

    Examples are scanned in their given order unless ``seed`` is not None, in
    which case a seeded shuffle is applied first. Only examples whose cache
    directory contains ``video_inputs.pt`` are kept.
    """
    candidates = list(examples)
    if seed is not None:
        import random

        random.Random(seed).shuffle(candidates)
    selected = []
    for ex in candidates:
        # Check the limit first so that n <= 0 selects nothing.
        if len(selected) >= n:
            break
        sample_id = os.path.splitext(os.path.basename(ex["video_path"]))[0]
        sample_dir = os.path.join(cache_root, "test", ex["generator"], sample_id)
        if os.path.exists(os.path.join(sample_dir, "video_inputs.pt")):
            selected.append((ex, sample_id, sample_dir))
    return selected


def _evaluate_sample(model, processor, ex, sample_id, sample_dir, question, gen_cfg, device):
    """Run forward and reversed inference on one cached sample and return its record dict."""
    # NOTE(review): weights_only=False unpickles arbitrary objects; this is
    # only appropriate if the cache directory is trusted — confirm.
    feats = torch.load(os.path.join(sample_dir, "video_inputs.pt"), weights_only=False)
    with open(os.path.join(sample_dir, "video_kwargs.json")) as f:
        kw = json.load(f)
    video_f = feats[0]
    # Reverse along dim 0 (assumed to be the frame axis — TODO confirm).
    video_r = video_f.flip(0).contiguous()
    fps = kw["fps"][0]
    duration = video_f.shape[0] / fps

    pred_f = parse_segments(
        run_inference(model, processor, video_f, fps, question, gen_cfg, device)
    )
    pred_r = parse_segments(
        run_inference(model, processor, video_r, fps, question, gen_cfg, device)
    )
    # Bring the reversed-clip prediction back into original-time coordinates.
    pred_r_remapped = remap_reversed(pred_r, duration)
    gt = [tuple(s) for s in ex["solution"]]

    return {
        "sample_id": sample_id,
        "generator": ex["generator"],
        "duration": duration,
        "gt": gt,
        "pred_f": pred_f,
        "pred_r_remapped": pred_r_remapped,
        "iou_f_gt": soft_f1_iou(pred_f, gt),
        "iou_r_gt": soft_f1_iou(pred_r_remapped, gt),
        "iou_f_r": soft_f1_iou(pred_f, pred_r_remapped),
        "n_pred_f": len(pred_f),
        "n_pred_r": len(pred_r),
        "n_gt": len(gt),
    }


def _print_summary(records):
    """Print overall stats, the two pass/fail criteria, per-generator rows, and the verdict."""
    print(f"\n=== FBC SIGNAL VALIDATION SUMMARY (n={len(records)}) ===")
    if not records:
        # An empty record list gives a 1-D array, so the column indexing
        # below would raise IndexError; report and stop instead.
        print("No samples were evaluated successfully; nothing to summarize.")
        return

    A = np.array([(r["iou_f_gt"], r["iou_r_gt"], r["iou_f_r"]) for r in records])
    iou_f_gt, iou_r_gt, iou_f_r = A[:, 0], A[:, 1], A[:, 2]

    print("\nOverall:")
    print(f" iou_f_gt (forward acc) : mean={iou_f_gt.mean():.3f} median={np.median(iou_f_gt):.3f}")
    print(f" iou_r_gt (reverse acc) : mean={iou_r_gt.mean():.3f} median={np.median(iou_r_gt):.3f}")
    print(f" iou_f_r (consistency) : mean={iou_f_r.mean():.3f} median={np.median(iou_f_r):.3f} "
          f">0.5 frac={(iou_f_r > 0.5).mean()*100:.1f}%")

    # Criterion 1: the model must be self-consistent under reversal.
    crit1 = iou_f_r.mean() > 0.3
    print(f"\n[Criterion 1] mean iou_f_r > 0.3? {'PASS' if crit1 else 'FAIL'} "
          f"({iou_f_r.mean():.3f})")
    print(" Interpretation: " +
          ("model IS consistent under reversal — signal exists" if crit1 else
           "model produces unrelated predictions on reversed input — no useful signal"))

    # Criterion 2: consistency must correlate with forward accuracy. The
    # correlation is only computed with more than 3 samples and non-constant
    # inputs; otherwise it is treated as 0.
    if len(A) > 3 and iou_f_r.std() > 0 and iou_f_gt.std() > 0:
        corr = np.corrcoef(iou_f_r, iou_f_gt)[0, 1]
    else:
        corr = 0.0
    crit2 = corr > 0.2
    print(f"\n[Criterion 2] corr(iou_f_r, iou_f_gt) > 0.2? {'PASS' if crit2 else 'FAIL'} "
          f"({corr:.3f})")
    print(" Interpretation: " +
          ("consistency under reversal predicts correctness — FBC reward will steer toward right answers" if crit2 else
           "consistency is uncorrelated with correctness — FBC reward will push toward random consistency"))

    print("\nPer-generator (sorted by iou_f_r):")
    by_gen = {}
    for r in records:
        by_gen.setdefault(r["generator"], []).append(r)
    rows = []
    for g, rs in by_gen.items():
        arr = np.array([(x["iou_f_gt"], x["iou_f_r"]) for x in rs])
        rows.append((g, len(rs), arr[:, 0].mean(), arr[:, 1].mean()))
    for g, n, fg, fr in sorted(rows, key=lambda x: -x[3]):
        print(f" {g:<12s} n={n:3d} iou_f_gt={fg:.3f} iou_f_r={fr:.3f}")

    print(f"\n{'='*60}")
    if crit1 and crit2:
        print("VERDICT: FBC signal exists. Proceed to implement as GRPO reward.")
    elif crit1 and not crit2:
        print("VERDICT: model is consistent but not in a useful way. FBC alone "
              "won't steer training; combine with iou or rethink.")
    else:
        print("VERDICT: FBC signal absent. Reversed video doesn't elicit meaningful "
              "model behavior. Rethink the spatial / temporal causality framing.")


def main():
    """Evaluate forward/reversed prediction consistency and print a verdict.

    Loads the checkpoint, picks up to ``--n`` test samples that have cached
    inputs, runs inference on each video and on its frame-reversed copy,
    appends one JSON record per evaluated sample to ``--out``, and prints a
    summary. Samples that raise during evaluation are logged and skipped.
    ``--seed`` optionally shuffles the candidate list before selection; by
    default the order returned by ``build_examples`` is used.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model_path", default=str(REPO / "outputs_forensics/stage1_decomp_boundary"))
    ap.add_argument("--n", type=int, default=200, help="number of test samples to evaluate")
    ap.add_argument("--device", default="cuda:0")
    ap.add_argument("--max_new_tokens", type=int, default=64)
    ap.add_argument("--out", default=str(REPO / "fbc_signal_validation.jsonl"))
    ap.add_argument("--seed", type=int, default=None,
                    help="if set, shuffle candidate samples with this seed before taking the first n")
    args = ap.parse_args()

    # Presumably read by the project's prompt helpers to disable
    # chain-of-thought; TODO confirm it is read at call time, since the
    # project modules are already imported by this point.
    os.environ["FORENSICS_COT"] = "false"

    print(f"[fbc-validate] device={args.device} model={args.model_path} n={args.n}",
          flush=True)
    t0 = time.time()
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        args.model_path, torch_dtype=torch.bfloat16,
        use_sliding_window=True, attn_implementation="flash_attention_2",
        device_map=args.device,
    )
    model.eval()
    processor = AutoProcessor.from_pretrained(args.model_path)
    model.config.use_cache = True
    if hasattr(model, "generation_config"):
        model.generation_config.use_cache = True
    print(f" loaded in {time.time()-t0:.1f}s", flush=True)

    examples = build_examples(
        annot_dir=ANNOT, video_root=VROOT, generators=TEST_GENERATORS,
        split_prefix="test", preprocessed_data_path=CACHE, require_video_exists=True,
    )
    sampled = _select_samples(examples, args.n, CACHE, args.seed)
    print(f" using {len(sampled)} samples", flush=True)

    question = get_question_template()
    # do_sample=False requests non-sampling decoding; temperature is kept
    # from the original configuration.
    gen_cfg = GenerationConfig(
        max_new_tokens=args.max_new_tokens, do_sample=False,
        temperature=1e-6,
        pad_token_id=processor.tokenizer.pad_token_id, use_cache=True,
    )

    records = []
    n_skipped = 0
    t_start = time.time()
    # The context manager closes the output file even if the loop is
    # interrupted or an unexpected error escapes.
    with open(args.out, "w") as fout:
        for i, (ex, sample_id, sample_dir) in enumerate(sampled):
            try:
                rec = _evaluate_sample(
                    model, processor, ex, sample_id, sample_dir,
                    question, gen_cfg, args.device,
                )
            except Exception as e:
                # Deliberate best-effort: one bad sample must not abort the run.
                n_skipped += 1
                print(f" [skip] {sample_id}: {type(e).__name__}: {e}", flush=True)
                continue

            records.append(rec)
            # Flush per record so partial results survive a crash.
            fout.write(json.dumps(rec) + "\n")
            fout.flush()

            if (i + 1) % 20 == 0:
                elapsed = time.time() - t_start
                rate = (i + 1) / elapsed
                eta = (len(sampled) - i - 1) / rate
                cur = np.array([(r["iou_f_gt"], r["iou_r_gt"], r["iou_f_r"]) for r in records])
                print(f" i={i+1}/{len(sampled)} rate={rate:.2f}/s eta={eta/60:.1f}min "
                      f"f_gt={cur[:,0].mean():.3f} r_gt={cur[:,1].mean():.3f} f_r={cur[:,2].mean():.3f}",
                      flush=True)

    if n_skipped:
        print(f" skipped {n_skipped} of {len(sampled)} samples due to errors", flush=True)
    _print_summary(records)
|
|
|
|
# Run the validation only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|