#!/usr/bin/env python3
"""
Delete Humanity submissions and unlinked results from backend-results.

A results file (``*_results_*.json``) is "unlinked" when the repository holds
no predictions file (``*_predictions_*.jsonl``) with the same name prefix and
the same trailing identifier.

Usage:
    export HF_TOKEN=...
    python streamlit_app/eval/delete_unlinked.py           # dry-run
    python streamlit_app/eval/delete_unlinked.py --apply   # actually delete

Exit status is 0 when nothing failed (dry-run and "nothing to delete"
included) and 1 when at least one deletion failed.
"""

import argparse
import os
from typing import Iterable, List, Optional

RESULTS_REPO = "agentic-document-ai/backend-results"
TOKEN = os.environ.get("HF_TOKEN")

# Naming conventions used to pair a results file with its predictions file.
RESULT_MARKER = "_results_"
RESULT_SUFFIX = ".json"
PRED_MARKER = "_predictions_"
PRED_SUFFIX = ".jsonl"

# Every file under this prefix is deleted unconditionally.
HUMANITY_PREFIX = "Humanity/"


def pairing_key(name: str, marker: str, suffix: str) -> Optional[str]:
    """Return the key that links a results file to its predictions file.

    The key is ``<text before the last marker>_<text after it>`` with the
    file extension removed, so ``a/b_results_123.json`` and
    ``a/b_predictions_123.jsonl`` both map to ``a/b_123``.

    Args:
        name: Repository-relative file path.
        marker: Infix identifying the file kind (e.g. ``"_results_"``).
        suffix: Required file extension (e.g. ``".json"``).

    Returns:
        The pairing key, or ``None`` when *name* does not end with *suffix*
        or does not contain *marker*.
    """
    if not name.endswith(suffix):
        return None
    # Strip the extension from the end only; a blanket str.replace would also
    # remove the same text from the middle of the name and corrupt the key.
    stem = name[: len(name) - len(suffix)]
    head, found, tail = stem.rpartition(marker)
    if not found:
        return None
    return f"{head}_{tail}"


def plan_deletions(files: Iterable[str]) -> List[str]:
    """Select the files to delete from a repository listing.

    Args:
        files: Repository-relative paths of every file in the repo.

    Returns:
        A sorted, de-duplicated list containing every path under
        ``Humanity/`` plus every results file whose pairing key matches no
        predictions file.
    """
    listing = list(files)

    linked_keys = set()
    for name in listing:
        key = pairing_key(name, PRED_MARKER, PRED_SUFFIX)
        if key is not None:
            linked_keys.add(key)

    doomed = {name for name in listing if name.startswith(HUMANITY_PREFIX)}
    # Check each results file on its own so that two files sharing a key
    # cannot hide one another.
    for name in listing:
        key = pairing_key(name, RESULT_MARKER, RESULT_SUFFIX)
        if key is not None and key not in linked_keys:
            doomed.add(name)

    return sorted(doomed)


def delete_paths(api, paths: Iterable[str]) -> List[str]:
    """Delete each path from ``RESULTS_REPO``, one commit per file.

    Deletion is best-effort: a failure is reported and the loop moves on to
    the next file.

    Args:
        api: Object exposing ``delete_file`` with the keyword arguments used
            below (``main`` passes a ``huggingface_hub.HfApi``).
        paths: Repository-relative paths to delete.

    Returns:
        The paths that could not be deleted (empty when all succeeded).
    """
    failed = []
    for path in paths:
        try:
            api.delete_file(
                path_in_repo=path,
                repo_id=RESULTS_REPO,
                repo_type="dataset",
                token=TOKEN,
                commit_message=f"Cleanup: delete {path}",
            )
        except Exception as e:
            # Keep going so one bad file does not block the rest, but
            # remember the failure so the caller can report it.
            print(f" ✗ Error deleting {path}: {e}")
            failed.append(path)
        else:
            print(f" ✓ Deleted: {path}")
    return failed


def main() -> int:
    """Run the cleanup; return the process exit status.

    Lists the repository, prints the deletion plan and, only when ``--apply``
    is given, performs the deletions.

    Returns:
        0 on dry-run, when there is nothing to delete, or when every deletion
        succeeded; 1 when at least one deletion failed.
    """
    parser = argparse.ArgumentParser(description="Delete Humanity and unlinked files")
    parser.add_argument(
        "--apply",
        action="store_true",
        help="Actually delete (default: dry-run)",
    )
    args = parser.parse_args()

    # Imported here rather than at module level so the planning helpers above
    # stay importable without the Hub client installed.
    from huggingface_hub import HfApi, list_repo_files

    files = list_repo_files(RESULTS_REPO, repo_type="dataset", token=TOKEN)
    to_delete = plan_deletions(files)

    print(f"Files to delete: {len(to_delete)}")
    for path in to_delete:
        print(f" {path}")

    if not to_delete:
        print("Nothing to delete.")
        return 0

    if not args.apply:
        print("\nDry-run mode. Add --apply to actually delete.")
        return 0

    print(f"\nDeleting {len(to_delete)} files...")
    api = HfApi(token=TOKEN)
    failed = delete_paths(api, to_delete)

    if failed:
        print(f"\nDone with errors: {len(failed)} of {len(to_delete)} deletions failed.")
        return 1

    print("\nDone!")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())