File size: 3,421 Bytes
"""Symlink forensics_grpo's preprocessed video cache into Time-R1 layout.

forensics_grpo preprocess_forensics.py writes:
    <forensics_cache>/<split>/<gen>/<sample_id>/{video_inputs.pt, video_kwargs.json}

Time-R1 finetune.py expects, for split_name in {"train", "eval"}:
    <preprocessed_data_path>/<split_name>/<video_id>_<sentence_id>/{video_inputs.pt, video_kwargs.json}
where video_id is the JSON top-level key and sentence_id is the segment index.

Since build_forensics_json.py emits video_id = "<gen>__<basename>" and each
sentence corresponds to one segment of the original multi-segment video, all
sentences of the same (gen, basename) point at the SAME underlying cached
tensor. So we symlink the same source dir K times (K = #segments).

Output split mapping:
    forensics split "train" -> Time-R1 split "train"
    forensics split "test"  -> Time-R1 split "eval"
"""
import argparse
import json
import os
import sys
# Translates a forensics cache split name (key) into the Time-R1 split
# directory name (value) used under the output directory.
SPLIT_MAP = {
    "train": "train",
    "test": "eval",
}
def link_one(src_dir: str, dst_dir: str, overwrite: bool) -> bool:
    """Create a symlink at ``dst_dir`` that points to the directory ``src_dir``.

    Args:
        src_dir: Existing directory to link to.
        dst_dir: Path of the symlink to create. Missing parent directories
            are created.
        overwrite: If true, an existing symlink at ``dst_dir`` is replaced.
            Anything else already at ``dst_dir`` (a regular file or
            directory) is never touched.

    Returns:
        False if ``src_dir`` is not a directory. True otherwise, i.e. when the
        link was created or when ``dst_dir`` was already occupied and kept.
    """
    if not os.path.isdir(src_dir):
        return False

    if os.path.lexists(dst_dir):
        # Only a symlink may be replaced. A real file/dir is left alone;
        # falling through to os.symlink() would raise FileExistsError.
        if not (overwrite and os.path.islink(dst_dir)):
            return True
        os.remove(dst_dir)

    parent = os.path.dirname(dst_dir)
    if parent:  # empty for a bare name; os.makedirs("") would raise
        os.makedirs(parent, exist_ok=True)

    # A relative link target is resolved against the link's own directory,
    # not the current working directory, so store an absolute target.
    os.symlink(os.path.abspath(src_dir), dst_dir)
    return True
def main():
    """Build the Time-R1 cache layout as symlinks into the forensics cache.

    For each annotation file (``train.json``, ``val.json``) found in
    ``--annotation_json_dir``, every top-level key is treated as a video id of
    the form ``<gen>__<basename>``. One symlink per entry of that video's
    ``"timestamps"`` list is created at
    ``<output_dir>/<time_r1_split>/<video_id>_<index>``, all pointing to the
    same ``<forensics_cache>/<forensics_split>/<gen>/<basename>`` directory.

    Missing annotation files and malformed video ids are reported on stderr
    and skipped. Videos whose source directory is absent are counted as
    missing. A summary line is printed to stdout at the end.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--forensics_cache", required=True,
                   help="Root containing <split>/<gen>/<sample_id>/")
    p.add_argument("--annotation_json_dir", required=True,
                   help="Dir holding train.json / val.json from build_forensics_json.py")
    p.add_argument("--output_dir", required=True,
                   help="Where to place Time-R1 cache symlinks "
                        "(<output>/train/<vid>_<sid>/, <output>/eval/<vid>_<sid>/)")
    p.add_argument("--overwrite", action="store_true",
                   help="Replace existing symlinks (regular files/dirs are left alone)")
    args = p.parse_args()

    # Symlink targets are resolved relative to the link's directory, so a
    # relative cache root would produce dangling links; make it absolute.
    cache_root = os.path.abspath(args.forensics_cache)

    # (annotation file name, forensics split whose cache it refers to)
    pairs = [("train.json", "train"), ("val.json", "test")]
    total_linked = 0
    total_missing = 0

    for json_name, forensics_split in pairs:
        json_path = os.path.join(args.annotation_json_dir, json_name)
        if not os.path.exists(json_path):
            print(f"[warn] missing {json_path}", file=sys.stderr)
            continue

        time_r1_split = SPLIT_MAP[forensics_split]
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        for video_id, info in data.items():
            if "__" not in video_id:
                print(f"[skip] unexpected video_id {video_id!r}", file=sys.stderr)
                continue

            # Split on the first "__" only; the basename may contain more.
            gen, basename = video_id.split("__", 1)
            src_dir = os.path.join(cache_root, forensics_split, gen, basename)
            if not os.path.isdir(src_dir):
                total_missing += 1
                continue

            # Every segment of a video shares the same cached directory.
            n_segments = len(info["timestamps"])
            for sid in range(n_segments):
                dst_dir = os.path.join(args.output_dir, time_r1_split,
                                       f"{video_id}_{sid}")
                if link_one(src_dir, dst_dir, args.overwrite):
                    total_linked += 1

    print(f"linked: {total_linked} cache entries, missing sources: {total_missing}")
# Run the linker only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|