sdzt
/

forensics-grpo

Video-Text-to-Text

temporal-grounding

Model card Files Files and versions

forensics-grpo / code /time_r1 /data_forensics /link_cache.py

sdzt's picture

Add source code

33569f9 verified 29 days ago

History Blame Contribute Delete

3.42 kB

	"""Symlink forensics_grpo's preprocessed video cache into Time-R1 layout.

	forensics_grpo preprocess_forensics.py writes:
	<forensics_cache>/<split>/<gen>/<sample_id>/{video_inputs.pt, video_kwargs.json}

	Time-R1 finetune.py expects, for split_name in {"train", "eval"}:
	<preprocessed_data_path>/<split_name>/<video_id>_<sentence_id>/{video_inputs.pt, video_kwargs.json}

	where video_id is the JSON top-level key and sentence_id is the segment index.

	Since build_forensics_json.py emits video_id = "<gen>__<basename>" and each
	sentence corresponds to one segment of the original multi-segment video, all
	sentences of the same (gen, basename) point at the SAME underlying cached
	tensor. So we symlink the same source dir K times (K = #segments).

	Output split mapping:
	forensics split "train" -> Time-R1 split "train"
	forensics split "test" -> Time-R1 split "eval"
	"""
	import argparse
	import json
	import os
	import sys


	# Maps a forensics_grpo split name (key) to the Time-R1 split directory
	# name (value) that the symlinks for that split are created under.
	SPLIT_MAP = {"train": "train", "test": "eval"}


	def link_one(src_dir: str, dst_dir: str, overwrite: bool) -> bool:
	    """Create a symlink at ``dst_dir`` that points to ``src_dir``.

	    Args:
	        src_dir: Directory to link to. A relative path is interpreted
	            relative to the current working directory.
	        dst_dir: Path of the symlink to create. Missing parent
	            directories are created.
	        overwrite: If true, an existing symlink at ``dst_dir`` is replaced.
	            Regular files and directories at ``dst_dir`` are never touched.

	    Returns:
	        False if ``src_dir`` is not a directory (nothing is created).
	        True if the link was created, or if ``dst_dir`` already existed
	        and was left in place.
	    """
	    if not os.path.isdir(src_dir):
	        return False

	    if os.path.lexists(dst_dir):
	        # Only symlinks are ever replaced; a real file/dir is left alone
	        # instead of letting os.symlink fail with FileExistsError.
	        if not (overwrite and os.path.islink(dst_dir)):
	            return True
	        os.remove(dst_dir)

	    parent = os.path.dirname(dst_dir)
	    if parent:  # os.makedirs("") raises, so skip it for bare names
	        os.makedirs(parent, exist_ok=True)

	    # A relative symlink target is resolved against the link's directory,
	    # not the CWD used by the isdir() check above, so store it absolute.
	    target = src_dir if os.path.isabs(src_dir) else os.path.abspath(src_dir)
	    os.symlink(target, dst_dir)
	    return True


	def main():
	    """Symlink cached forensics samples into the Time-R1 directory layout.

	    For each (annotation file, forensics split) pair, reads the annotation
	    JSON, and for every ``<gen>__<basename>`` key creates one symlink per
	    segment at ``<output_dir>/<time_r1_split>/<video_id>_<sid>`` pointing to
	    ``<forensics_cache>/<forensics_split>/<gen>/<basename>``.

	    Missing annotation files produce a warning and are skipped; keys without
	    ``"__"`` are skipped with a message; keys whose cache directory does not
	    exist are counted as missing. A summary line is printed at the end.
	    """
	    p = argparse.ArgumentParser()
	    p.add_argument("--forensics_cache", required=True,
	                   help="Root containing <split>/<gen>/<sample_id>/")
	    p.add_argument("--annotation_json_dir", required=True,
	                   help="Dir holding train.json / val.json from build_forensics_json.py")
	    p.add_argument("--output_dir", required=True,
	                   help="Where to place Time-R1 cache symlinks "
	                        "(<output>/train/<vid>_<sid>/, <output>/eval/<vid>_<sid>/)")
	    p.add_argument("--overwrite", action="store_true",
	                   help="Replace existing symlinks (regular files/dirs are left alone)")
	    args = p.parse_args()

	    # Symlink targets are resolved relative to the link's own directory, so
	    # a relative cache root would yield dangling links; make it absolute once.
	    cache_root = os.path.abspath(args.forensics_cache)

	    # (annotation file name, forensics split whose cache it refers to)
	    pairs = [("train.json", "train"), ("val.json", "test")]

	    total_linked = 0
	    total_missing = 0
	    for json_name, forensics_split in pairs:
	        json_path = os.path.join(args.annotation_json_dir, json_name)
	        if not os.path.exists(json_path):
	            print(f"[warn] missing {json_path}", file=sys.stderr)
	            continue
	        time_r1_split = SPLIT_MAP[forensics_split]
	        # JSON is UTF-8 by specification; do not depend on the locale default.
	        with open(json_path, "r", encoding="utf-8") as f:
	            data = json.load(f)
	        for video_id, info in data.items():
	            if "__" not in video_id:
	                print(f"[skip] unexpected video_id {video_id!r}", file=sys.stderr)
	                continue
	            gen, basename = video_id.split("__", 1)
	            src_dir = os.path.join(cache_root, forensics_split, gen, basename)
	            if not os.path.isdir(src_dir):
	                total_missing += 1
	                continue
	            # NOTE(review): assumes every entry has a "timestamps" sequence
	            # with one item per segment - confirm against the JSON builder.
	            n_segments = len(info["timestamps"])
	            # Every segment of a video shares the same cached source dir.
	            for sid in range(n_segments):
	                dst_dir = os.path.join(args.output_dir, time_r1_split,
	                                       f"{video_id}_{sid}")
	                if link_one(src_dir, dst_dir, args.overwrite):
	                    total_linked += 1

	    print(f"linked: {total_linked} cache entries, missing sources: {total_missing}")


	# Run the linker only when this file is executed as a script.
	if __name__ == "__main__":
	main()