MADQA-Leaderboard / eval /delete_unlinked.py
Borchmann's picture
Upload folder using huggingface_hub
50d53bd verified
raw
history blame
2.67 kB
#!/usr/bin/env python3
"""
Delete files under `Humanity/` and results files that have no matching predictions file from the backend-results dataset repo.
Usage:
export HF_TOKEN=...
python streamlit_app/eval/delete_unlinked.py # dry-run
python streamlit_app/eval/delete_unlinked.py --apply # actually delete
"""
import argparse
import os
from huggingface_hub import HfApi, list_repo_files
# Hub repository (accessed with repo_type="dataset") that this script lists and prunes.
RESULTS_REPO = "agentic-document-ai/backend-results"

# Access token taken from the environment; None when HF_TOKEN is not set.
TOKEN = os.getenv("HF_TOKEN")
def _link_key(name: str, marker: str, suffix: str):
    """Return the key that pairs a results file with its predictions file.

    The key is the text before the last ``marker`` joined with ``_`` to the
    text after it, with a trailing ``suffix`` removed.

    Returns ``None`` when ``marker`` does not occur in ``name``.
    """
    stem, found, tail = name.rpartition(marker)
    if not found:
        return None
    # Strip the extension only at the very end. A blanket str.replace would
    # also remove the same text from the middle of the name, which can make a
    # results file and its predictions file produce different keys.
    if tail.endswith(suffix):
        tail = tail[: -len(suffix)]
    return f"{stem}_{tail}"


def main() -> int:
    """List the results repo and delete Humanity files and unlinked results.

    A results file (``*_results_*.json``) is "unlinked" when no predictions
    file (``*_predictions_*.jsonl``) yields the same pairing key. Every file
    whose path starts with ``Humanity/`` is deleted as well.

    Without ``--apply`` the candidates are only printed (dry-run). With
    ``--apply`` each file is deleted in its own commit; a failure on one file
    is reported and the remaining files are still attempted.

    Returns:
        0 on success (including dry-run and nothing-to-delete), 1 when at
        least one deletion failed.
    """
    parser = argparse.ArgumentParser(description="Delete Humanity and unlinked files")
    parser.add_argument("--apply", action="store_true", help="Actually delete (default: dry-run)")
    args = parser.parse_args()

    api = HfApi(token=TOKEN)
    files = list_repo_files(RESULTS_REPO, repo_type="dataset", token=TOKEN)

    # Index results by pairing key and collect the keys of all predictions,
    # computing each key exactly once.
    result_keys = {}
    pred_keys = set()
    for f in files:
        if f.endswith('.json') and '_results_' in f:
            key = _link_key(f, '_results_', '.json')
            if key:
                result_keys[key] = f
        elif f.endswith('.jsonl') and '_predictions_' in f:
            key = _link_key(f, '_predictions_', '.jsonl')
            if key:
                pred_keys.add(key)

    # Find unlinked results (no matching prediction)
    unlinked_results = [path for key, path in result_keys.items() if key not in pred_keys]

    # Find all Humanity files
    humanity_files = [f for f in files if f.startswith("Humanity/")]

    # Combine into deletion list (deduplicated, stable order for readable output)
    to_delete = sorted(set(unlinked_results + humanity_files))

    print(f"Files to delete: {len(to_delete)}")
    for f in to_delete:
        print(f" {f}")

    if not to_delete:
        print("Nothing to delete.")
        return 0

    if not args.apply:
        print("\nDry-run mode. Add --apply to actually delete.")
        return 0

    print(f"\nDeleting {len(to_delete)} files...")
    failed = 0
    for f in to_delete:
        try:
            api.delete_file(
                path_in_repo=f,
                repo_id=RESULTS_REPO,
                repo_type="dataset",
                token=TOKEN,
                commit_message=f"Cleanup: delete {f}",
            )
        except Exception as e:
            # Best-effort: keep going, but remember the failure so the exit
            # status reflects it instead of silently reporting success.
            failed += 1
            print(f" ✗ Error deleting {f}: {e}")
        else:
            print(f" ✓ Deleted: {f}")

    if failed:
        print(f"\nDone with errors: {failed} of {len(to_delete)} files could not be deleted.")
        return 1

    print("\nDone!")
    return 0
# Run only when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)