#!/usr/bin/env python3
"""Delete dataset directories that aren't referenced in filtered_index.json.

Positional args (both optional): path to the filtered index JSON, then the
dataset root directory.
"""
import json
import shutil
import sys
from pathlib import Path


def main():
    index_path = sys.argv[1] if len(sys.argv) > 1 else "filtered_index.json"
    dataset_dir = sys.argv[2] if len(sys.argv) > 2 else "/ephemeral/community_dataset_v3"

    with open(index_path) as f:
        index = json.load(f)

    # Build the set of needed "contributor/dataset" directory names from the index.
    needed_datasets = {ep["dataset"] for ep in index["episodes"]}

    # Walk the dataset root and delete every contributor/dataset dir not in the index.
    dataset_root = Path(dataset_dir)
    deleted_bytes = 0
    deleted_dirs = 0
    for contributor_dir in sorted(dataset_root.iterdir()):
        if not contributor_dir.is_dir() or contributor_dir.name.startswith("."):
            continue
        for ds_dir in sorted(contributor_dir.iterdir()):
            if not ds_dir.is_dir():
                continue
            dataset_name = f"{contributor_dir.name}/{ds_dir.name}"
            if dataset_name not in needed_datasets:
                # Measure size before deleting so we can report space freed.
                size = sum(f.stat().st_size for f in ds_dir.rglob("*") if f.is_file())
                shutil.rmtree(ds_dir)
                deleted_bytes += size
                deleted_dirs += 1
                if deleted_dirs % 50 == 0:
                    print(f"  Deleted {deleted_dirs} datasets, freed {deleted_bytes / 1024**3:.1f}GB", flush=True)
        # Remove the contributor dir itself if it is now empty.
        if contributor_dir.exists() and not any(contributor_dir.iterdir()):
            contributor_dir.rmdir()

    # Also delete the .cache dir (skipped above because hidden dirs are ignored).
    cache_dir = dataset_root / ".cache"
    if cache_dir.exists():
        cache_size = sum(f.stat().st_size for f in cache_dir.rglob("*") if f.is_file())
        shutil.rmtree(cache_dir)
        deleted_bytes += cache_size
        print(f"  Deleted .cache ({cache_size / 1024**3:.1f}GB)")

    print(f"\nDone: deleted {deleted_dirs} unused datasets, freed {deleted_bytes / 1024**3:.1f}GB")


if __name__ == "__main__":
    main()