"""Symlink forensics_grpo's preprocessed video cache into Time-R1 layout. forensics_grpo preprocess_forensics.py writes: ////{video_inputs.pt, video_kwargs.json} Time-R1 finetune.py expects, for split_name in {"train", "eval"}: //_/{video_inputs.pt, video_kwargs.json} where video_id is the JSON top-level key and sentence_id is the segment index. Since build_forensics_json.py emits video_id = "__" and each sentence corresponds to one segment of the original multi-segment video, all sentences of the same (gen, basename) point at the SAME underlying cached tensor. So we symlink the same source dir K times (K = #segments). Output split mapping: forensics split "train" -> Time-R1 split "train" forensics split "test" -> Time-R1 split "eval" """ import argparse import json import os import sys SPLIT_MAP = {"train": "train", "test": "eval"} def link_one(src_dir: str, dst_dir: str, overwrite: bool) -> bool: if not os.path.isdir(src_dir): return False if os.path.lexists(dst_dir): if overwrite: os.remove(dst_dir) if os.path.islink(dst_dir) else None else: return True os.makedirs(os.path.dirname(dst_dir), exist_ok=True) os.symlink(src_dir, dst_dir) return True def main(): p = argparse.ArgumentParser() p.add_argument("--forensics_cache", required=True, help="Root containing ///") p.add_argument("--annotation_json_dir", required=True, help="Dir holding train.json / val.json from build_forensics_json.py") p.add_argument("--output_dir", required=True, help="Where to place Time-R1 cache symlinks " "(/train/_/, /eval/_/)") p.add_argument("--overwrite", action="store_true", help="Replace existing symlinks (regular files/dirs are left alone)") args = p.parse_args() pairs = [("train.json", "train"), ("val.json", "test")] total_linked = 0 total_missing = 0 for json_name, forensics_split in pairs: json_path = os.path.join(args.annotation_json_dir, json_name) if not os.path.exists(json_path): print(f"[warn] missing {json_path}", file=sys.stderr) continue time_r1_split = SPLIT_MAP[forensics_split] with open(json_path, "r") as f: data = json.load(f) for video_id, info in data.items(): if "__" not in video_id: print(f"[skip] unexpected video_id {video_id!r}", file=sys.stderr) continue gen, basename = video_id.split("__", 1) src_dir = os.path.join(args.forensics_cache, forensics_split, gen, basename) if not os.path.isdir(src_dir): total_missing += 1 continue n_segments = len(info["timestamps"]) for sid in range(n_segments): dst_dir = os.path.join(args.output_dir, time_r1_split, f"{video_id}_{sid}") if link_one(src_dir, dst_dir, args.overwrite): total_linked += 1 print(f"linked: {total_linked} cache entries, missing sources: {total_missing}") if __name__ == "__main__": main()