# File size: 11,185 Bytes

# 33569f9

"""Forward-Backward Consistency (FBC) signal validation.

Hypothesis: Real-world video has temporal asymmetry under reversal (gravity,
momentum, causal flow); AI-generated segments often lack this asymmetry.
So a model trained for forgery localization should produce SIMILAR predictions
on the forward and reversed versions of the same video — because the AI
artifact carries through reversal, while real content gets "weird" enough to
suppress false-positive detection.

Quantitative test: run stage1_decomp_boundary ckpt on each test sample twice
(forward video / temporally-flipped video). Map reversed-prediction back to
original coordinates and measure:

  IoU(pred_F, GT)              — forward accuracy (baseline)
  IoU(pred_R_remapped, GT)     — reverse accuracy
  IoU(pred_F, pred_R_remapped) — KEY: model self-consistency under reversal

For FBC to be a useful GRPO reward:
  1. mean IoU(F, R) should be substantially > 0 — the script uses a 0.3
     threshold (i.e. the model IS consistent; if it's near 0, the reversed
     video is just confusing the model and we can't extract a forensic
     signal from it).
  2. corr(IoU(F, R), IoU(F, GT)) should be clearly positive — the script
     uses a 0.2 threshold — so that consistent predictions correlate with
     correct predictions. This is what makes "push toward consistency"
     a valid training pressure.
  3. Per-generator analysis: AI-heavy generators (wan, ltx, vace, fcvg)
     should have higher IoU(F, R) than less-AI generators if the hypothesis
     about AI lacking temporal causality holds.

If either (1) or (2) fails, FBC alone is not a usable signal and we need a
different idea.
"""
from __future__ import annotations

import argparse
import glob
import json
import os
import sys
import time
from pathlib import Path

import numpy as np
import torch
from transformers import (
    AutoProcessor,
    GenerationConfig,
    Qwen2_5_VLForConditionalGeneration,
)

# Absolute path of the project checkout. It is prepended to sys.path (both the
# repo root and its `src/` directory) so the `src.open_r1...` imports below
# resolve when this file is run directly as a script. It is also the base for
# the default --model_path and --out locations in main().
REPO = Path("/mnt/local-fast/zhangt/forensics_grpo")
sys.path.insert(0, str(REPO))
sys.path.insert(0, str(REPO / "src"))

from src.open_r1.data_loader import TEST_GENERATORS, build_examples
from src.open_r1.reward import parse_segments
from src.open_r1.trainer.grpo_trainer_video_GT_soft import (
    SYSTEM_PROMPT,
    get_question_template,
)

# Hard-coded data locations used by main().
# VROOT: passed to build_examples() as `video_root`.
VROOT = "/mnt/local-fast/zhangt/video"
# ANNOT: passed to build_examples() as `annot_dir`.
ANNOT = "/mnt/local-fast/zhangt/annot/annot"
# CACHE: passed to build_examples() as `preprocessed_data_path`; main() also
# reads per-sample `video_inputs.pt` / `video_kwargs.json` from
# CACHE/test/<generator>/<sample_id>/.
CACHE = "/mnt/local-fast/zhangt/forensics_grpo_cache_uniform3584_fps2.0"


def iou_1d(a, b):
    """Return the IoU of two 1-D intervals, each given as a ``(start, end)`` pair.

    The denominator is the span from the earliest start to the latest end.
    For overlapping intervals that span equals the true union; for disjoint
    intervals the intersection is already 0, so the ratio is 0 either way.
    A non-positive span yields 0.0 rather than a division by zero.
    """
    (start_a, end_a), (start_b, end_b) = a, b
    overlap = max(0.0, min(end_a, end_b) - max(start_a, start_b))
    span = max(end_a, end_b) - min(start_a, start_b)
    if span > 0:
        return overlap / span
    return 0.0


def soft_f1_iou(preds, gts):
    """Set-level soft IoU: a soft F1 built from best-match pairwise IoUs.

    Intended to match the metric in reward.py.

    Soft precision is the mean, over predictions, of each prediction's best
    IoU against any ground-truth interval; soft recall is the same with the
    roles swapped. The result is their harmonic mean.

    Returns 1.0 when both sets are empty, 0.0 when exactly one is empty, and
    0.0 when precision and recall are both zero.
    """
    if not (preds and gts):
        # At least one side is empty: perfect agreement only if both are.
        return 0.0 if (preds or gts) else 1.0

    best_per_pred = []
    for pred in preds:
        best_per_pred.append(max(iou_1d(pred, gt) for gt in gts))

    best_per_gt = []
    for gt in gts:
        best_per_gt.append(max(iou_1d(gt, pred) for pred in preds))

    precision = sum(best_per_pred) / len(best_per_pred)
    recall = sum(best_per_gt) / len(best_per_gt)
    if (precision + recall) > 0:
        return 2 * precision * recall / (precision + recall)
    return 0.0


def remap_reversed(segs, duration):
    """Map intervals from reversed-time coordinates back to original time.

    A reversed-time interval ``(s, e)`` becomes ``(duration - e, duration - s)``.
    Each endpoint is clamped below at 0.0; there is no upper clamp.
    Returns a new list of ``(start, end)`` tuples in the same order as `segs`.
    """
    remapped = []
    for rev_start, rev_end in segs:
        orig_start = max(0.0, duration - rev_end)
        orig_end = max(0.0, duration - rev_start)
        remapped.append((orig_start, orig_end))
    return remapped


def run_inference(model, processor, video_tensor, fps, question, gen_cfg, device):
    """Generate the model's text answer to `question` for a single video.

    Builds a two-turn chat (system prompt, then a user turn holding a video
    placeholder followed by the question), renders it with the processor's
    chat template, encodes the text together with `video_tensor`, moves every
    input that has a ``.to`` method onto `device`, and generates under
    ``torch.no_grad()``.

    Returns the decoded text of only the tokens generated after the prompt,
    with special tokens skipped.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": SYSTEM_PROMPT}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "video", "video": "placeholder"},
            {"type": "text", "text": question},
        ],
    }
    prompt = processor.apply_chat_template(
        [system_turn, user_turn], tokenize=False, add_generation_prompt=True
    )

    encoded = processor(
        text=[prompt],
        videos=[video_tensor],
        fps=[fps],
        padding=True,
        return_tensors="pt",
        padding_side="left",
        add_special_tokens=False,
    )

    # Non-tensor entries (anything without `.to`) are passed through untouched.
    on_device = {}
    for key, value in encoded.items():
        on_device[key] = value.to(device) if hasattr(value, "to") else value

    prompt_len = on_device["input_ids"].shape[1]
    with torch.no_grad():
        generated = model.generate(**on_device, generation_config=gen_cfg, use_cache=True)

    # Drop the echoed prompt; keep only the newly generated continuation.
    completion_ids = generated[0][prompt_len:]
    return processor.tokenizer.decode(completion_ids, skip_special_tokens=True)


def _select_cached_samples(examples, limit):
    """Return the first `limit` examples whose cached features exist on disk.

    Each returned item is ``(example, sample_id, sample_dir)`` where
    `sample_dir` is ``CACHE/test/<generator>/<sample_id>`` and contains a
    ``video_inputs.pt`` file. Selection is deterministic: examples are taken
    in the order given. A non-positive `limit` selects nothing.
    """
    selected = []
    for ex in examples:
        if len(selected) >= limit:
            break
        sample_id = os.path.splitext(os.path.basename(ex["video_path"]))[0]
        sample_dir = os.path.join(CACHE, "test", ex["generator"], sample_id)
        if os.path.exists(os.path.join(sample_dir, "video_inputs.pt")):
            selected.append((ex, sample_id, sample_dir))
    return selected


def _evaluate_sample(model, processor, ex, sample_id, sample_dir, question, gen_cfg, device):
    """Run forward and reversed inference on one cached sample and score it.

    Returns the JSON-serialisable record written to the output file. Any
    exception (missing file, malformed cache, generation failure, ...) is
    left to propagate so the caller can decide to skip the sample.
    """
    # NOTE(review): weights_only=False unpickles arbitrary objects. That is
    # only acceptable if the cache was produced locally — never point CACHE
    # at untrusted data.
    feats = torch.load(os.path.join(sample_dir, "video_inputs.pt"),
                       weights_only=False)
    with open(os.path.join(sample_dir, "video_kwargs.json")) as f:
        kw = json.load(f)

    video_f = feats[0]  # (T, C, H, W)
    # Temporal reversal: flip along the leading (frame) axis.
    video_r = video_f.flip(0).contiguous()
    fps = kw["fps"][0]
    # Duration in seconds, assuming `fps` is the rate of the cached frames
    # — TODO confirm against the cache builder.
    duration = video_f.shape[0] / fps

    out_f = run_inference(model, processor, video_f, fps, question, gen_cfg, device)
    pred_f = parse_segments(out_f)

    out_r = run_inference(model, processor, video_r, fps, question, gen_cfg, device)
    pred_r = parse_segments(out_r)
    # Express reversed-video predictions on the original time axis so they
    # can be compared with the forward predictions and the ground truth.
    pred_r_remapped = remap_reversed(pred_r, duration)

    gt = [tuple(s) for s in ex["solution"]]

    return {
        "sample_id": sample_id,
        "generator": ex["generator"],
        "duration": duration,
        "gt": gt,
        "pred_f": pred_f,
        "pred_r_remapped": pred_r_remapped,
        "iou_f_gt": soft_f1_iou(pred_f, gt),
        "iou_r_gt": soft_f1_iou(pred_r_remapped, gt),
        "iou_f_r": soft_f1_iou(pred_f, pred_r_remapped),
        "n_pred_f": len(pred_f),
        "n_pred_r": len(pred_r),
        "n_gt": len(gt),
    }


def _print_summary(records):
    """Print overall stats, the two pass/fail criteria, per-generator rows and a verdict."""
    print(f"\n=== FBC SIGNAL VALIDATION SUMMARY  (n={len(records)}) ===")
    if not records:
        # With zero records np.array([]) is 1-D and column indexing would
        # raise IndexError; there is nothing to analyse anyway.
        print("  no samples were evaluated successfully; nothing to summarize.")
        return

    A = np.array([(r["iou_f_gt"], r["iou_r_gt"], r["iou_f_r"]) for r in records])
    iou_f_gt, iou_r_gt, iou_f_r = A[:, 0], A[:, 1], A[:, 2]

    print("\nOverall:")
    print(f"  iou_f_gt (forward acc) : mean={iou_f_gt.mean():.3f}  median={np.median(iou_f_gt):.3f}")
    print(f"  iou_r_gt (reverse acc) : mean={iou_r_gt.mean():.3f}  median={np.median(iou_r_gt):.3f}")
    print(f"  iou_f_r (consistency)  : mean={iou_f_r.mean():.3f}   median={np.median(iou_f_r):.3f}  "
          f">0.5 frac={(iou_f_r > 0.5).mean()*100:.1f}%")

    # Validation criterion 1: is iou_f_r substantially > 0?
    crit1 = iou_f_r.mean() > 0.3
    print(f"\n[Criterion 1] mean iou_f_r > 0.3?  {'PASS' if crit1 else 'FAIL'}  "
          f"({iou_f_r.mean():.3f})")
    print("  Interpretation: " +
          ("model IS consistent under reversal — signal exists" if crit1 else
           "model produces unrelated predictions on reversed input — no useful signal"))

    # Validation criterion 2: does iou_f_r correlate with iou_f_gt?
    # The correlation is undefined for constant series and meaningless for a
    # handful of points, so those cases fall back to 0.0 (i.e. FAIL).
    if len(A) > 3 and iou_f_r.std() > 0 and iou_f_gt.std() > 0:
        corr = np.corrcoef(iou_f_r, iou_f_gt)[0, 1]
    else:
        corr = 0.0
    crit2 = corr > 0.2
    print(f"\n[Criterion 2] corr(iou_f_r, iou_f_gt) > 0.2?  {'PASS' if crit2 else 'FAIL'}  "
          f"({corr:.3f})")
    print("  Interpretation: " +
          ("consistency under reversal predicts correctness — FBC reward will steer toward right answers" if crit2 else
           "consistency is uncorrelated with correctness — FBC reward will push toward random consistency"))

    # Per-generator breakdown
    print("\nPer-generator (sorted by iou_f_r):")
    by_gen = {}
    for r in records:
        by_gen.setdefault(r["generator"], []).append(r)
    rows = []
    for g, rs in by_gen.items():
        arr = np.array([(x["iou_f_gt"], x["iou_f_r"]) for x in rs])
        rows.append((g, len(rs), arr[:, 0].mean(), arr[:, 1].mean()))
    for g, n, fg, fr in sorted(rows, key=lambda x: -x[3]):
        print(f"  {g:<12s} n={n:3d}  iou_f_gt={fg:.3f}  iou_f_r={fr:.3f}")

    # Verdict
    print(f"\n{'='*60}")
    if crit1 and crit2:
        print("VERDICT: FBC signal exists. Proceed to implement as GRPO reward.")
    elif crit1 and not crit2:
        print("VERDICT: model is consistent but not in a useful way. FBC alone "
              "won't steer training; combine with iou or rethink.")
    else:
        print("VERDICT: FBC signal absent. Reversed video doesn't elicit meaningful "
              "model behavior. Rethink the spatial / temporal causality framing.")


def main():
    """Run the forward/backward consistency validation end to end.

    Parses CLI arguments, loads the checkpoint and processor, selects the
    first ``--n`` test samples that have cached features, runs inference on
    each sample's forward and time-reversed video, appends one JSON record per
    successfully evaluated sample to ``--out`` (flushed after every line), and
    prints a summary. Samples that raise during evaluation are reported with a
    ``[skip]`` line and excluded from the results.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model_path", default=str(REPO / "outputs_forensics/stage1_decomp_boundary"))
    ap.add_argument("--n", type=int, default=200, help="number of test samples to evaluate")
    ap.add_argument("--device", default="cuda:0")
    ap.add_argument("--max_new_tokens", type=int, default=64)
    ap.add_argument("--out", default=str(REPO / "fbc_signal_validation.jsonl"))
    args = ap.parse_args()

    # No-CoT prompt since stage1 was trained without CoT.
    os.environ["FORENSICS_COT"] = "false"

    print(f"[fbc-validate] device={args.device}  model={args.model_path}  n={args.n}",
          flush=True)
    t0 = time.time()
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        args.model_path, torch_dtype=torch.bfloat16,
        use_sliding_window=True, attn_implementation="flash_attention_2",
        device_map=args.device,
    )
    model.eval()
    processor = AutoProcessor.from_pretrained(args.model_path)
    model.config.use_cache = True
    if hasattr(model, "generation_config"):
        model.generation_config.use_cache = True
    print(f"  loaded in {time.time()-t0:.1f}s", flush=True)

    examples = build_examples(
        annot_dir=ANNOT, video_root=VROOT, generators=TEST_GENERATORS,
        split_prefix="test", preprocessed_data_path=CACHE, require_video_exists=True,
    )
    # Deterministic sample: first N with cached features.
    sampled = _select_cached_samples(examples, args.n)
    print(f"  using {len(sampled)} samples", flush=True)

    question = get_question_template()
    # NOTE(review): with do_sample=False decoding is greedy, so `temperature`
    # should have no effect; it is kept unchanged here — confirm it does not
    # trip config validation in the installed transformers version.
    gen_cfg = GenerationConfig(
        max_new_tokens=args.max_new_tokens, do_sample=False,
        temperature=1e-6,
        pad_token_id=processor.tokenizer.pad_token_id, use_cache=True,
    )

    records = []
    t_start = time.time()
    # Context manager guarantees the results file is closed even if the run
    # is interrupted or an unexpected error escapes the loop.
    with open(args.out, "w", encoding="utf-8") as fout:
        for i, (ex, sample_id, sample_dir) in enumerate(sampled):
            try:
                rec = _evaluate_sample(model, processor, ex, sample_id, sample_dir,
                                       question, gen_cfg, args.device)
            except Exception as e:
                # Deliberate best-effort: one bad sample must not abort a
                # long evaluation run, but it is always reported.
                print(f"  [skip] {sample_id}: {type(e).__name__}: {e}", flush=True)
                continue

            records.append(rec)
            # Flush per record so partial results survive a crash.
            fout.write(json.dumps(rec) + "\n")
            fout.flush()

            if (i + 1) % 20 == 0:
                elapsed = time.time() - t_start
                rate = (i + 1) / elapsed
                eta = (len(sampled) - i - 1) / rate
                cur = np.array([(r["iou_f_gt"], r["iou_r_gt"], r["iou_f_r"]) for r in records])
                print(f"  i={i+1}/{len(sampled)}  rate={rate:.2f}/s  eta={eta/60:.1f}min  "
                      f"f_gt={cur[:,0].mean():.3f}  r_gt={cur[:,1].mean():.3f}  f_r={cur[:,2].mean():.3f}",
                      flush=True)

    _print_summary(records)


# Run the validation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()