# src/data/hf_blender_extract.py
"""Command-line entry point that runs the built-in extraction on one input.

Only the arguments that follow a literal ``--`` on the command line are
parsed (presumably because the script is launched through a host program
that consumes everything before ``--`` -- TODO confirm against the launcher).
"""
import os
import sys
import time
from pathlib import Path

_USAGE = "Usage: --input --output_dir [--target_count N]"


def _parse(argv):
    """Parse the script's own arguments into a dict.

    Args:
        argv: Sequence of argument strings that come after ``--``.

    Returns:
        A dict with keys ``"input"`` (str), ``"output_dir"`` (str) and
        ``"target_count"`` (int, default 50000).

    Raises:
        SystemExit: If ``--input`` or ``--output_dir`` is missing or empty,
            if a recognised flag has no value after it, or if the value of
            ``--target_count`` is not an integer.
    """
    args = {"input": None, "output_dir": None, "target_count": 50000}
    it = iter(argv)
    for flag in it:
        if flag not in ("--input", "--output_dir", "--target_count"):
            # Unrecognised tokens are skipped, as before.
            continue

        # Use a default so that a trailing flag without a value produces a
        # usage error instead of leaking StopIteration out of the loop.
        value = next(it, None)
        if value is None:
            raise SystemExit(f"Missing value for {flag}. {_USAGE}")

        if flag == "--input":
            args["input"] = value
        elif flag == "--output_dir":
            args["output_dir"] = value
        else:
            try:
                args["target_count"] = int(value)
            except ValueError:
                raise SystemExit(
                    f"--target_count expects an integer, got {value!r}. {_USAGE}"
                ) from None

    if not args["input"] or not args["output_dir"]:
        raise SystemExit(_USAGE)
    return args


def main():
    """Parse the command line, prepare the output directory and run extraction.

    Raises:
        SystemExit: If the required arguments are missing or malformed
            (including when no ``--`` separator is present at all).
        RuntimeError: If ``get_files`` returns nothing to extract.
    """
    argv = sys.argv
    # Everything up to and including `--` is not ours; with no separator
    # there are no script arguments, which makes _parse emit the usage error.
    if "--" in argv:
        argv = argv[argv.index("--") + 1:]
    else:
        argv = []
    args = _parse(argv)

    # Ensure output dir
    out = Path(args["output_dir"])
    out.mkdir(parents=True, exist_ok=True)

    # Imported lazily so argument errors are reported before the project
    # package is loaded.
    from src.data.extract import extract_builtin, get_files

    files = get_files(
        data_name="raw_data.npz",
        inputs=str(args["input"]),
        input_dataset_dir=None,
        output_dataset_dir=str(out),
        force_override=True,
        warning=False,
    )
    if not files:
        raise RuntimeError("No files to extract")

    # Whole-second Unix timestamp, passed through as a string.
    timestamp = str(int(time.time()))
    extract_builtin(
        output_folder=str(out),
        target_count=int(args["target_count"]),
        num_runs=1,
        id=0,
        time=timestamp,
        files=files,
    )


if __name__ == "__main__":
    main()