File size: 2,179 Bytes
a8eb6e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env python3
"""Delete all files from the dataset that aren't in filtered_index.json."""

import json
import os
import shutil
import sys
from collections import defaultdict
from pathlib import Path


def _tree_size(root: Path) -> int:
    """Return the total size in bytes of all regular files found below *root*."""
    return sum(p.stat().st_size for p in root.rglob("*") if p.is_file())


def _remove_dir(path: Path) -> int:
    """Delete the directory *path* and return the number of bytes counted as freed.

    ``shutil.rmtree`` refuses to operate on a symbolic link and raises
    ``OSError``; a symlinked directory is therefore unlinked instead. Its
    target is left untouched and nothing is counted as freed.
    """
    if path.is_symlink():
        path.unlink()
        return 0
    # Measure before deleting: afterwards there is nothing left to stat.
    size = _tree_size(path)
    shutil.rmtree(path)
    return size


def main() -> None:
    """Prune dataset directories that the filtered index does not reference.

    Command-line arguments (both optional, positional):
        1. Path to the index JSON file (default ``filtered_index.json``).
           It must contain an ``"episodes"`` list whose items each carry a
           ``"dataset"`` key of the form ``contributor/dataset``.
        2. Dataset root directory
           (default ``/ephemeral/community_dataset_v3``).

    Every ``<root>/<contributor>/<dataset>`` directory whose
    ``contributor/dataset`` name is not listed in the index is deleted,
    contributor directories left empty are removed, and ``<root>/.cache``
    is deleted as well. Progress and a final summary are printed to stdout.
    """
    index_path = sys.argv[1] if len(sys.argv) > 1 else "filtered_index.json"
    dataset_dir = sys.argv[2] if len(sys.argv) > 2 else "/ephemeral/community_dataset_v3"

    with open(index_path, encoding="utf-8") as f:
        index = json.load(f)

    # Names ("contributor/dataset") of the datasets that must be kept.
    needed_datasets = {ep["dataset"] for ep in index["episodes"]}

    dataset_root = Path(dataset_dir)
    deleted_bytes = 0
    deleted_dirs = 0

    # sorted() materialises each listing up front, so deleting entries while
    # looping never disturbs the directory iterator.
    for contributor_dir in sorted(dataset_root.iterdir()):
        # Hidden entries (e.g. .cache) are not contributor directories.
        if not contributor_dir.is_dir() or contributor_dir.name.startswith("."):
            continue

        for ds_dir in sorted(contributor_dir.iterdir()):
            if not ds_dir.is_dir():
                continue

            dataset_name = f"{contributor_dir.name}/{ds_dir.name}"
            if dataset_name in needed_datasets:
                continue

            deleted_bytes += _remove_dir(ds_dir)
            deleted_dirs += 1
            if deleted_dirs % 50 == 0:
                print(f"  Deleted {deleted_dirs} datasets, freed {deleted_bytes / 1024**3:.1f}GB", flush=True)

        # Remove empty contributor dirs
        if contributor_dir.exists() and not any(contributor_dir.iterdir()):
            contributor_dir.rmdir()

    # Also delete the .cache dir
    cache_dir = dataset_root / ".cache"
    if cache_dir.exists():
        cache_size = _remove_dir(cache_dir)
        deleted_bytes += cache_size
        print(f"  Deleted .cache ({cache_size / 1024**3:.1f}GB)")

    print(f"\nDone: deleted {deleted_dirs} unused datasets, freed {deleted_bytes / 1024**3:.1f}GB")


# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()