Spaces:
Running on Zero
Running on Zero
| # src/data/hf_blender_extract.py | |
| import os | |
| import sys | |
| import time | |
| from pathlib import Path | |
| def _parse(argv): | |
| # argv comes after `--` | |
| args = {"input": None, "output_dir": None, "target_count": 50000} | |
| it = iter(argv) | |
| for k in it: | |
| if k == "--input": | |
| args["input"] = next(it) | |
| elif k == "--output_dir": | |
| args["output_dir"] = next(it) | |
| elif k == "--target_count": | |
| args["target_count"] = int(next(it)) | |
| if not args["input"] or not args["output_dir"]: | |
| raise SystemExit("Usage: --input <file> --output_dir <dir> [--target_count N]") | |
| return args | |
def main():
    """Run one extraction pass driven by the command-line options.

    Only the arguments following a literal ``--`` in ``sys.argv`` are
    handed to ``_parse``; if there is no ``--``, an empty list is parsed
    (which makes ``_parse`` exit with its usage message). The output
    directory is created if needed, the input is resolved through
    ``get_files`` and the result is passed to ``extract_builtin``.

    Raises:
        RuntimeError: If ``get_files`` returns nothing to extract.
    """
    raw_argv = sys.argv
    script_argv = raw_argv[raw_argv.index("--") + 1:] if "--" in raw_argv else []
    opts = _parse(script_argv)

    # Make sure the destination exists before anything is written to it.
    output_dir = Path(opts["output_dir"])
    output_dir.mkdir(parents=True, exist_ok=True)

    # Imported here, after argument validation, as in the original layout.
    from src.data.extract import extract_builtin, get_files

    pending = get_files(
        data_name="raw_data.npz",
        inputs=str(opts["input"]),
        input_dataset_dir=None,
        output_dataset_dir=str(output_dir),
        force_override=True,
        warning=False,
    )
    if not pending:
        raise RuntimeError("No files to extract")

    # Whole-second Unix time, passed as a string to the extractor.
    run_stamp = str(int(time.time()))
    extract_builtin(
        output_folder=str(output_dir),
        target_count=int(opts["target_count"]),
        num_runs=1,
        id=0,
        time=run_stamp,
        files=pending,
    )


if __name__ == "__main__":
    main()