File size: 3,421 Bytes

33569f9

"""Symlink forensics_grpo's preprocessed video cache into Time-R1 layout.

forensics_grpo preprocess_forensics.py writes:
    <forensics_cache>/<split>/<gen>/<sample_id>/{video_inputs.pt, video_kwargs.json}

Time-R1 finetune.py expects, for split_name in {"train", "eval"}:
    <preprocessed_data_path>/<split_name>/<video_id>_<sentence_id>/{video_inputs.pt, video_kwargs.json}

where video_id is the JSON top-level key and sentence_id is the segment index.

Since build_forensics_json.py emits video_id = "<gen>__<basename>" and each
sentence corresponds to one segment of the original multi-segment video, all
sentences of the same (gen, basename) point at the SAME underlying cached
tensor. So we symlink the same source dir K times (K = #segments).

Output split mapping (annotation file -> forensics cache split -> Time-R1 split):
    train.json -> forensics split "train" -> Time-R1 split "train"
    val.json   -> forensics split "test"  -> Time-R1 split "eval"
"""
import argparse
import json
import os
import sys


# forensics_grpo cache split name -> Time-R1 split directory name.
SPLIT_MAP = {"train": "train", "test": "eval"}


def link_one(src_dir: str, dst_dir: str, overwrite: bool) -> bool:
    """Create a symlink at ``dst_dir`` pointing to the directory ``src_dir``.

    Args:
        src_dir: Existing directory to link to. May be relative to the
            current working directory; the link always stores its absolute
            path.
        dst_dir: Path of the symlink to create. Missing parent directories
            are created.
        overwrite: If true, an existing symlink at ``dst_dir`` is replaced.
            A regular file or directory at ``dst_dir`` is never removed.

    Returns:
        False if ``src_dir`` is not a directory (nothing is created).
        True if a symlink was created, or if something already existed at
        ``dst_dir`` and was left in place.
    """
    if not os.path.isdir(src_dir):
        return False

    # lexists/islink do not follow links, so dangling symlinks are seen too.
    if os.path.lexists(dst_dir):
        if not (overwrite and os.path.islink(dst_dir)):
            # Either overwriting is off, or the entry is a real file/dir
            # that must be left alone; symlinking over it would raise.
            return True
        os.remove(dst_dir)

    parent = os.path.dirname(dst_dir)
    if parent:  # os.makedirs("") raises for a bare file name
        os.makedirs(parent, exist_ok=True)

    # A relative target would be resolved against the link's parent
    # directory rather than the cwd, producing a dangling link.
    os.symlink(os.path.abspath(src_dir), dst_dir)
    return True


def main(argv=None):
    """Build the Time-R1 cache symlink tree from the forensics cache.

    For each annotation file (``train.json``, ``val.json``) found in
    ``--annotation_json_dir``, every top-level ``video_id`` of the form
    ``<gen>__<basename>`` is resolved to
    ``<forensics_cache>/<split>/<gen>/<basename>`` and linked once per entry
    in its ``timestamps`` list, at
    ``<output_dir>/<time_r1_split>/<video_id>_<sid>``.

    Missing annotation files and malformed entries are reported on stderr
    and skipped; a summary line is printed to stdout at the end.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--forensics_cache", required=True,
                   help="Root containing <split>/<gen>/<sample_id>/")
    p.add_argument("--annotation_json_dir", required=True,
                   help="Dir holding train.json / val.json from build_forensics_json.py")
    p.add_argument("--output_dir", required=True,
                   help="Where to place Time-R1 cache symlinks "
                        "(<output>/train/<vid>_<sid>/, <output>/eval/<vid>_<sid>/)")
    p.add_argument("--overwrite", action="store_true",
                   help="Replace existing symlinks (regular files/dirs are left alone)")
    args = p.parse_args(argv)

    # (annotation file name, forensics cache split it describes)
    pairs = [("train.json", "train"), ("val.json", "test")]

    total_linked = 0
    total_missing = 0
    for json_name, forensics_split in pairs:
        json_path = os.path.join(args.annotation_json_dir, json_name)
        if not os.path.exists(json_path):
            print(f"[warn] missing {json_path}", file=sys.stderr)
            continue
        time_r1_split = SPLIT_MAP[forensics_split]
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        for video_id, info in data.items():
            if "__" not in video_id:
                print(f"[skip] unexpected video_id {video_id!r}", file=sys.stderr)
                continue
            # Split on the first "__" only; the basename may contain more.
            gen, basename = video_id.split("__", 1)
            src_dir = os.path.join(args.forensics_cache, forensics_split, gen, basename)
            if not os.path.isdir(src_dir):
                total_missing += 1
                continue
            try:
                n_segments = len(info["timestamps"])
            except (KeyError, TypeError):
                # One malformed entry should not abort the whole run.
                print(f"[skip] no usable 'timestamps' for {video_id!r}",
                      file=sys.stderr)
                continue
            # Every segment of a video shares the same cached tensor dir.
            for sid in range(n_segments):
                dst_dir = os.path.join(args.output_dir, time_r1_split,
                                       f"{video_id}_{sid}")
                if link_one(src_dir, dst_dir, args.overwrite):
                    total_linked += 1

    print(f"linked: {total_linked} cache entries, missing sources: {total_missing}")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()