Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Delete Humanity submissions and unlinked results from backend-results. | |
| Usage: | |
| export HF_TOKEN=... | |
| python streamlit_app/eval/delete_unlinked.py # dry-run | |
| python streamlit_app/eval/delete_unlinked.py --apply # actually delete | |
| """ | |
| import argparse | |
| import os | |
| from huggingface_hub import HfApi, list_repo_files | |
# Hugging Face dataset repository holding the result and prediction files.
RESULTS_REPO = "agentic-document-ai/backend-results"
# Access token taken from the environment; None when HF_TOKEN is not set.
TOKEN = os.getenv("HF_TOKEN")
def main() -> int:
    """Delete Humanity submissions and unlinked result files from RESULTS_REPO.

    Lists every file in the dataset repository and selects for deletion:

    * result files (``*_results_*.json``) that have no matching prediction
      file (``*_predictions_*.jsonl``), and
    * every file whose path starts with ``Humanity/``.

    Without ``--apply`` the selection is only printed (dry-run). With
    ``--apply`` each file is deleted in its own commit; a failure on one file
    does not stop the remaining deletions.

    Returns:
        0 on success (including dry-run and "nothing to delete"), 1 if at
        least one deletion failed.
    """
    parser = argparse.ArgumentParser(description="Delete Humanity and unlinked files")
    parser.add_argument("--apply", action="store_true", help="Actually delete (default: dry-run)")
    args = parser.parse_args()

    api = HfApi(token=TOKEN)
    files = list_repo_files(RESULTS_REPO, repo_type="dataset", token=TOKEN)

    def pairing_key(name: str, marker: str, suffix: str):
        """Return the key shared by a result file and its prediction file.

        ``<prefix><marker><tail><suffix>`` maps to ``<prefix>_<tail>``. Only
        the trailing ``suffix`` is stripped, so an extension-like substring
        in the middle of ``tail`` cannot make two paired files disagree.
        Returns None when ``marker`` does not occur in ``name``.
        """
        prefix, found, tail = name.rpartition(marker)
        if not found:
            return None
        if tail.endswith(suffix):
            tail = tail[: -len(suffix)]
        return f"{prefix}_{tail}"

    result_keys = {}
    pred_keys = set()
    for path in files:
        if path.endswith(".json") and "_results_" in path:
            key = pairing_key(path, "_results_", ".json")
            if key:
                result_keys[key] = path
        elif path.endswith(".jsonl") and "_predictions_" in path:
            key = pairing_key(path, "_predictions_", ".jsonl")
            if key:
                pred_keys.add(key)

    # Find unlinked results (no matching prediction)
    unlinked_results = [path for key, path in result_keys.items() if key not in pred_keys]

    # Find all Humanity files
    humanity_files = [path for path in files if path.startswith("Humanity/")]

    # Combine into deletion list (deduplicated, deterministic order)
    to_delete = sorted(set(unlinked_results + humanity_files))

    print(f"Files to delete: {len(to_delete)}")
    for path in to_delete:
        print(f"  {path}")

    if not to_delete:
        print("Nothing to delete.")
        return 0

    if not args.apply:
        print("\nDry-run mode. Add --apply to actually delete.")
        return 0

    print(f"\nDeleting {len(to_delete)} files...")
    failures = 0
    for path in to_delete:
        try:
            api.delete_file(
                path_in_repo=path,
                repo_id=RESULTS_REPO,
                repo_type="dataset",
                token=TOKEN,
                commit_message=f"Cleanup: delete {path}",
            )
        except Exception as e:
            # Best-effort: keep deleting the remaining files, but remember
            # the failure so the exit status reflects it.
            failures += 1
            print(f"  ✗ Error deleting {path}: {e}")
        else:
            print(f"  ✓ Deleted: {path}")

    if failures:
        print(f"\nDone with errors: {failures} of {len(to_delete)} deletions failed.")
        return 1

    print("\nDone!")
    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)