# MADQA-Leaderboard / eval / link_file_search_predictions.py
# (Hugging Face Hub page metadata from the scrape: uploaded by Borchmann via
#  huggingface_hub, commit 50d53bd verified, raw / history / blame, 3.59 kB)
#!/usr/bin/env python3
"""
Upload missing File Search predictions and link them to existing results.
Usage:
export HF_TOKEN=...
python streamlit_app/eval/link_file_search_predictions.py --apply
"""
import argparse
import json
import os
from pathlib import Path
from huggingface_hub import HfApi, hf_hub_download
# Hugging Face dataset repo that holds the backend evaluation results.
RESULTS_REPO = "agentic-document-ai/backend-results"
# Hub auth token; None means anonymous (uploads with --apply will fail).
TOKEN = os.environ.get("HF_TOKEN")
BASE_DIR = Path(__file__).resolve().parents[2]  # Project root
# Local directory containing the prediction .jsonl files to upload.
FILE_SEARCH_DIR = BASE_DIR / "file_search_results"
# Map missing results -> local prediction file
# Keys are result-JSON paths inside RESULTS_REPO; values are filenames
# expected under FILE_SEARCH_DIR.
MISSING_RESULTS = {
    "Google/Gemini_2.5_Flash_with_File_Search_results_20260103_221253.json": "gemini-2.5-flash.jsonl",
    "Google/Gemini_2.5_Pro_with_File_Search_results_20260103_221943.json": "gemini-2.5-pro.jsonl",
    "Google/Gemini_3.0_Pro_(Preview)_with_File_Search_results_20260104_120431.json": "gemini-3-pro-preview.jsonl",
    "OpenAI/GPT-5.2_(2025-12-11)_with_File_Search_results_20260104_121551.json": "gpt-5.2-2025-12-11.jsonl",
    "OpenAI/GPT-5_Mini_(2025-08-07)_with_File_Search_results_20260104_122026.json": "gpt-5-mini-2025-08-07.jsonl",
}
def _pred_path_from_result(result_path: str) -> str:
# {org}/{model}_results_{ts}.json -> {org}/{model}_predictions_{ts}.jsonl
base, ts = result_path.rsplit("_results_", 1)
ts = ts.replace(".json", "")
return f"{base}_predictions_{ts}.jsonl"
def main() -> int:
    """Upload local File Search prediction files and link them to results.

    By default runs in dry-run mode and only prints the planned uploads.
    With ``--apply`` it uploads each predictions file to RESULTS_REPO and
    rewrites the corresponding results JSON with linkage fields.

    Returns:
        Process exit code (0 on success).

    Raises:
        FileNotFoundError: if FILE_SEARCH_DIR or any expected local
            prediction file is missing.
        RuntimeError: if ``--apply`` is requested but HF_TOKEN is unset.
    """
    parser = argparse.ArgumentParser(
        description="Upload file_search_results predictions and link them to results."
    )
    parser.add_argument("--apply", action="store_true", help="Apply uploads/updates (default: dry-run)")
    args = parser.parse_args()
    if not FILE_SEARCH_DIR.exists():
        raise FileNotFoundError(f"Missing directory: {FILE_SEARCH_DIR}")
    api = HfApi(token=TOKEN)
    # Validate every local input up front so even a dry run surfaces missing
    # files before any network traffic happens.
    actions = []
    for result_path, local_name in MISSING_RESULTS.items():
        local_file = FILE_SEARCH_DIR / local_name
        if not local_file.exists():
            raise FileNotFoundError(f"Missing local prediction file: {local_file}")
        pred_path = _pred_path_from_result(result_path)
        actions.append((result_path, pred_path, local_file))
    print(f"Planned uploads: {len(actions)}")
    for result_path, pred_path, local_file in actions:
        print(f"- {local_file.name} -> {pred_path}")
    if not args.apply:
        print("\nDry-run only. Re-run with --apply to execute.")
        return 0
    # Fail fast: every operation below needs an authenticated client.
    if not TOKEN:
        raise RuntimeError("HF_TOKEN is not set; cannot apply uploads.")
    for result_path, pred_path, local_file in actions:
        # Upload predictions file
        api.upload_file(
            path_or_fileobj=str(local_file),
            path_in_repo=pred_path,
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            token=TOKEN,
            commit_message=f"Add predictions for {pred_path}",
        )
        # Update results JSON with linkage fields: download the current
        # version, annotate it, and upload the rewritten bytes in place.
        local_result = hf_hub_download(
            repo_id=RESULTS_REPO,
            filename=result_path,
            repo_type="dataset",
            token=TOKEN,
        )
        # Explicit encoding: JSON is UTF-8 by spec, and the platform default
        # (e.g. cp1252 on Windows) could corrupt non-ASCII content.
        with open(local_result, encoding="utf-8") as f:
            data = json.load(f)
        data["source_predictions_file"] = pred_path
        data["result_file_path"] = result_path
        api.upload_file(
            path_or_fileobj=json.dumps(data, indent=2).encode("utf-8"),
            path_in_repo=result_path,
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            token=TOKEN,
            commit_message=f"Link result to predictions: {result_path}",
        )
    print("Done.")
    return 0
# SystemExit propagates main()'s integer return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())