File size: 2,668 Bytes
50d53bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python3
"""
Delete Humanity submissions (every file under `Humanity/`) and unlinked results
(`*_results_*.json` files with no matching `*_predictions_*.jsonl`) from backend-results.

Usage:
  export HF_TOKEN=...
  python streamlit_app/eval/delete_unlinked.py          # dry-run
  python streamlit_app/eval/delete_unlinked.py --apply  # actually delete
"""

import argparse
import os

from huggingface_hub import HfApi, list_repo_files


# Hugging Face repository id that this script lists and deletes files from.
RESULTS_REPO = "agentic-document-ai/backend-results"
# Access token taken from the environment; None when HF_TOKEN is not set.
TOKEN = os.getenv("HF_TOKEN")


def main() -> int:
    """Delete Humanity submissions and unlinked result files from RESULTS_REPO.

    Lists every file in the ``RESULTS_REPO`` dataset repository and selects:

    * result files (``*_results_*.json``) for which no prediction file
      (``*_predictions_*.jsonl``) shares the same prefix and suffix, and
    * every file whose path starts with ``Humanity/``.

    The selection is always printed.  Files are only deleted when ``--apply``
    is given on the command line; otherwise this is a dry run.

    Returns:
        Process exit status: 0 for a dry run, an empty selection, or a fully
        successful deletion; 1 if at least one deletion failed.
    """
    parser = argparse.ArgumentParser(description="Delete Humanity and unlinked files")
    parser.add_argument("--apply", action="store_true", help="Actually delete (default: dry-run)")
    args = parser.parse_args()

    api = HfApi(token=TOKEN)
    files = list_repo_files(RESULTS_REPO, repo_type="dataset", token=TOKEN)

    def pairing_key(name: str, marker: str, suffix: str) -> str:
        """Return the key that links a result file to its prediction file.

        The key is the text before the LAST ``marker`` joined by ``_`` with the
        text after it, minus the trailing ``suffix``.  Only the trailing
        extension is removed: stripping every occurrence would give a result
        and its prediction different keys whenever the name contains an inner
        ``.json``, and the result would then be deleted as "unlinked".
        """
        stem, _, tail = name.rpartition(marker)
        if tail.endswith(suffix):
            tail = tail[:-len(suffix)]
        return f"{stem}_{tail}"

    pred_keys = {
        pairing_key(f, "_predictions_", ".jsonl")
        for f in files
        if f.endswith(".jsonl") and "_predictions_" in f
    }

    # Find unlinked results (no matching prediction).  Each result file is
    # checked individually, so two results that happen to share a key are both
    # evaluated instead of one silently shadowing the other.
    unlinked_results = [
        f
        for f in files
        if f.endswith(".json")
        and "_results_" in f
        and pairing_key(f, "_results_", ".json") not in pred_keys
    ]

    # Find all Humanity files
    humanity_files = [f for f in files if f.startswith("Humanity/")]

    # Combine into deletion list (deduplicated, stable order for readable output)
    to_delete = sorted(set(unlinked_results + humanity_files))

    print(f"Files to delete: {len(to_delete)}")
    for f in to_delete:
        print(f"  {f}")

    if not to_delete:
        print("Nothing to delete.")
        return 0

    if not args.apply:
        print("\nDry-run mode. Add --apply to actually delete.")
        return 0

    print(f"\nDeleting {len(to_delete)} files...")
    failures = 0
    for f in to_delete:
        try:
            api.delete_file(
                path_in_repo=f,
                repo_id=RESULTS_REPO,
                repo_type="dataset",
                token=TOKEN,
                commit_message=f"Cleanup: delete {f}",
            )
        except Exception as e:
            # Deliberately best-effort: one failed deletion must not stop the
            # rest, but it is counted so the exit status reflects it.
            failures += 1
            print(f"  ✗ Error deleting {f}: {e}")
        else:
            print(f"  ✓ Deleted: {f}")

    if failures:
        print(f"\nFinished with {failures} failed deletion(s) out of {len(to_delete)}.")
        return 1

    print("\nDone!")
    return 0


# Script entry point: run the cleanup and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)