| """Symlink forensics_grpo's preprocessed video cache into Time-R1 layout. |
| |
| forensics_grpo preprocess_forensics.py writes: |
| <forensics_cache>/<split>/<gen>/<sample_id>/{video_inputs.pt, video_kwargs.json} |
| |
| Time-R1 finetune.py expects, for split_name in {"train", "eval"}: |
| <preprocessed_data_path>/<split_name>/<video_id>_<sentence_id>/{video_inputs.pt, video_kwargs.json} |
| |
| where video_id is the JSON top-level key and sentence_id is the segment index. |
| |
| Since build_forensics_json.py emits video_id = "<gen>__<basename>" and each |
| sentence corresponds to one segment of the original multi-segment video, all |
| sentences of the same (gen, basename) point at the SAME underlying cached |
| tensor. So we symlink the same source dir K times (K = #segments). |
| |
| Output split mapping: |
| forensics split "train" -> Time-R1 split "train" |
| forensics split "test" -> Time-R1 split "eval" |
| """ |
| import argparse |
| import json |
| import os |
| import sys |
|
|
|
|
# Forensics split name -> name of the Time-R1 split directory it is linked under.
SPLIT_MAP = dict(train="train", test="eval")
|
|
|
|
def link_one(src_dir: str, dst_dir: str, overwrite: bool) -> bool:
    """Create a symlink at ``dst_dir`` that points to the directory ``src_dir``.

    Args:
        src_dir: Existing directory to link to. May be relative to the
            current working directory; it is made absolute before linking.
        dst_dir: Path of the symlink to create. Missing parent directories
            are created.
        overwrite: If true, an existing symlink at ``dst_dir`` is replaced.
            Existing regular files and directories are never removed.

    Returns:
        ``False`` if ``src_dir`` is not a directory (nothing is created).
        ``True`` if the link was created, or if an entry already exists at
        ``dst_dir`` and was kept.
    """
    if not os.path.isdir(src_dir):
        return False

    # lexists (rather than exists) so that dangling symlinks are seen too.
    if os.path.lexists(dst_dir):
        replaceable = overwrite and os.path.islink(dst_dir)
        if not replaceable:
            # Either overwriting is off, or the entry is a real file/dir that
            # must be left alone; calling os.symlink here would raise
            # FileExistsError.
            return True
        os.remove(dst_dir)

    parent = os.path.dirname(dst_dir)
    if parent:  # os.makedirs("") raises, so skip it for bare names
        os.makedirs(parent, exist_ok=True)

    # A relative link target is resolved against the link's own directory,
    # not the cwd that src_dir was validated against, so store it absolute.
    os.symlink(os.path.abspath(src_dir), dst_dir)
    return True
|
|
|
|
def main(argv: "list[str] | None" = None) -> None:
    """Link the forensics cache into the Time-R1 directory layout.

    For each annotation file (``train.json``, ``val.json``) found in
    ``--annotation_json_dir``, every top-level key is treated as a video id
    of the form ``<gen>__<basename>``. The matching source directory
    ``<forensics_cache>/<split>/<gen>/<basename>`` is symlinked once per
    entry in that video's ``"timestamps"`` list, as
    ``<output_dir>/<time_r1_split>/<video_id>_<index>``.

    Missing annotation files and malformed video ids are reported on stderr
    and skipped. Videos whose source directory does not exist are counted
    and skipped. A summary line is printed to stdout at the end.

    Args:
        argv: Command-line arguments excluding the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--forensics_cache", required=True,
                   help="Root containing <split>/<gen>/<sample_id>/")
    p.add_argument("--annotation_json_dir", required=True,
                   help="Dir holding train.json / val.json from build_forensics_json.py")
    p.add_argument("--output_dir", required=True,
                   help="Where to place Time-R1 cache symlinks "
                        "(<output>/train/<vid>_<sid>/, <output>/eval/<vid>_<sid>/)")
    p.add_argument("--overwrite", action="store_true",
                   help="Replace existing symlinks (regular files/dirs are left alone)")
    args = p.parse_args(argv)

    # (annotation file name, forensics split whose cache it refers to)
    pairs = [("train.json", "train"), ("val.json", "test")]

    total_linked = 0
    total_missing = 0
    for json_name, forensics_split in pairs:
        json_path = os.path.join(args.annotation_json_dir, json_name)
        if not os.path.exists(json_path):
            print(f"[warn] missing {json_path}", file=sys.stderr)
            continue
        time_r1_split = SPLIT_MAP[forensics_split]
        # JSON text is UTF-8 by specification; do not rely on the locale default.
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        for video_id, info in data.items():
            if "__" not in video_id:
                print(f"[skip] unexpected video_id {video_id!r}", file=sys.stderr)
                continue
            # Split on the first "__" only, so the basename may contain "__".
            gen, basename = video_id.split("__", 1)
            src_dir = os.path.join(args.forensics_cache, forensics_split, gen, basename)
            if not os.path.isdir(src_dir):
                total_missing += 1
                continue
            # One link per timestamp entry, all pointing at the same source dir.
            n_segments = len(info["timestamps"])
            for sid in range(n_segments):
                dst_dir = os.path.join(args.output_dir, time_r1_split,
                                       f"{video_id}_{sid}")
                if link_one(src_dir, dst_dir, args.overwrite):
                    total_linked += 1

    print(f"linked: {total_linked} cache entries, missing sources: {total_missing}")
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|