Spaces:
Running
Running
import json
import os
import shutil
from datetime import datetime, timezone

from src.display.formatting import styled_error, styled_message, styled_warning
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, RESULTS_REPO
from src.submission.check_validity import already_submitted_models
| REQUESTED_MODELS = None | |
| USERS_TO_SUBMISSION_DATES = None | |
def validate_jsonl_submission(file_path):
    """Validate the structure and content of a submission JSONL file.

    Each non-blank line must be a JSON object with the fields:
    ``question`` (str), ``answer`` (list), ``citations`` (list of dicts with
    ``file`` and ``page`` keys), ``iterations`` (non-negative int, booleans
    rejected), and ``id`` (str). Blank lines are skipped.

    Args:
        file_path: Path to the uploaded ``.jsonl`` file.

    Returns:
        Tuple ``(is_valid, error_message, num_predictions)``. On failure
        ``error_message`` names the offending line and ``num_predictions`` is 0.
    """
    required_fields = ["question", "answer", "citations", "iterations", "id"]
    try:
        # Stream the file instead of readlines(): avoids holding the whole
        # upload in memory. line_num stays 0 only if the file has zero lines.
        with open(file_path, "r", encoding="utf-8") as f:
            num_predictions = 0
            line_num = 0
            for line_num, raw_line in enumerate(f, 1):
                line = raw_line.strip()
                if not line:
                    continue
                try:
                    pred = json.loads(line)
                except json.JSONDecodeError as e:
                    return False, f"Line {line_num}: Invalid JSON - {str(e)}", 0
                # Check required fields
                for field in required_fields:
                    if field not in pred:
                        return False, f"Line {line_num}: Missing required field '{field}'", 0
                # Validate field types
                if not isinstance(pred["question"], str):
                    return False, f"Line {line_num}: 'question' must be a string", 0
                if not isinstance(pred["answer"], list):
                    return False, f"Line {line_num}: 'answer' must be a list", 0
                if not isinstance(pred["citations"], list):
                    return False, f"Line {line_num}: 'citations' must be a list", 0
                # bool is a subclass of int, so exclude it explicitly —
                # otherwise "iterations": true would pass validation.
                if (
                    not isinstance(pred["iterations"], int)
                    or isinstance(pred["iterations"], bool)
                    or pred["iterations"] < 0
                ):
                    return False, f"Line {line_num}: 'iterations' must be a non-negative integer", 0
                if not isinstance(pred["id"], str):
                    return False, f"Line {line_num}: 'id' must be a string", 0
                # Validate citations structure
                for cit_idx, citation in enumerate(pred["citations"]):
                    if not isinstance(citation, dict):
                        return False, f"Line {line_num}, citation {cit_idx}: Must be a dict with 'file' and 'page'", 0
                    if "file" not in citation or "page" not in citation:
                        return False, f"Line {line_num}, citation {cit_idx}: Must have 'file' and 'page' fields", 0
                num_predictions += 1
            if line_num == 0:
                return False, "File is empty", 0
            return True, "", num_predictions
    except Exception as e:
        # Best-effort guard: surface any I/O/OS error as a validation failure
        # rather than crashing the submission UI.
        return False, f"Error reading file: {str(e)}", 0
def add_new_eval(
    model_name: str,
    organization: str,
    model_type: str,
    predictions_file,
    link: str = "",
):
    """Validate and register a new leaderboard submission.

    Validates the uploaded predictions JSONL, writes a placeholder results
    file plus an eval-request entry locally, uploads all three files to the
    results/queue dataset repos, then removes the local copies.

    Args:
        model_name: Display name of the submitted model/agent.
        organization: Submitting organization; becomes a subdirectory in both repos.
        model_type: e.g. "API" or "Open-weight" (stored lowercased in results metadata).
        predictions_file: Path to the uploaded ``.jsonl`` file (Gradio file path).
        link: Optional URL to the model/paper; stored as-is after stripping.

    Returns:
        A styled HTML message (error, warning, or success) for the UI.
    """
    global REQUESTED_MODELS
    global USERS_TO_SUBMISSION_DATES
    # Lazily load the already-submitted set on first use.
    if not REQUESTED_MODELS:
        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    # Validate inputs
    if not model_name or model_name.strip() == "":
        return styled_error("Please provide a model name.")
    if not organization or organization.strip() == "":
        return styled_error("Please provide your organization name.")
    if model_type is None or model_type == "":
        return styled_error("Please select a model type (API or Open-weight).")
    if predictions_file is None:
        return styled_error("Please upload a predictions JSONL file.")
    # Validate JSONL structure
    is_valid, error_msg, num_predictions = validate_jsonl_submission(predictions_file)
    if not is_valid:
        return styled_error(f"Invalid submission format: {error_msg}")
    print(f"Validated {num_predictions} predictions")
    # Create safe filename
    safe_model_name = model_name.replace("/", "_").replace(" ", "_")
    # Check for duplicate submission
    if safe_model_name in REQUESTED_MODELS:
        return styled_warning("This model has already been submitted.")
    print("Adding new eval")
    # Prepare directories
    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{organization}"
    PREDICTIONS_DIR = f"{EVAL_RESULTS_PATH}/{organization}"
    os.makedirs(OUT_DIR, exist_ok=True)
    os.makedirs(PREDICTIONS_DIR, exist_ok=True)
    # Save predictions file
    predictions_path = f"{PREDICTIONS_DIR}/{safe_model_name}_predictions_{current_time}.jsonl"
    # Copy the uploaded file (shutil is imported at module level)
    shutil.copy(predictions_file, predictions_path)
    # TODO: Run evaluation here
    # from src.evaluation.evaluator import evaluate_predictions
    # results = evaluate_predictions(predictions_path)
    # For now, create placeholder results
    # This will be replaced with actual evaluation
    placeholder_results = {
        "model_name": model_name,
        "results": {
            "overall": {"anls": 0.50},
            "single_evidence": {"anls": 0.50},
            "multi_evidence_same_doc": {"anls": 0.50},
            "multi_evidence_multi_doc": {"anls": 0.50},
        },
        "metadata": {
            "agent_steps": num_predictions,  # Use total predictions as placeholder
            "cost_usd": 0.0,  # Placeholder
            "model_type": model_type.lower(),
        },
        "organization": organization,
        "submission_date": current_time,
        "num_predictions": num_predictions,
        "link": link.strip() if link else "",
    }
    # Save results file
    results_path = f"{PREDICTIONS_DIR}/{safe_model_name}_results_{current_time}.json"
    with open(results_path, "w") as f:
        json.dump(placeholder_results, f, indent=2)
    # Create request entry for queue
    eval_request = {
        "model": model_name,
        "organization": organization,
        "model_type": model_type,
        "status": "PENDING",  # Will be set to FINISHED after evaluation
        "submitted_time": current_time,
        "link": link.strip() if link else "",
    }
    # Save request file
    request_path = f"{OUT_DIR}/{safe_model_name}_eval_request_{current_time}.json"
    with open(request_path, "w") as f:
        json.dump(eval_request, f, indent=2)
    print("Uploading files")
    # Repo paths are the paths relative to the local checkout roots
    # (org/filename). Computed with relpath instead of splitting on a
    # hard-coded directory name, which raised IndexError whenever the local
    # checkout was not named "eval-results"/"eval-queue".
    try:
        # Upload predictions file
        API.upload_file(
            path_or_fileobj=predictions_path,
            path_in_repo=os.path.relpath(predictions_path, EVAL_RESULTS_PATH),
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            commit_message=f"Add predictions for {model_name}",
        )
        # Upload results file
        API.upload_file(
            path_or_fileobj=results_path,
            path_in_repo=os.path.relpath(results_path, EVAL_RESULTS_PATH),
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            commit_message=f"Add results for {model_name}",
        )
        # Upload request file to queue repo
        API.upload_file(
            path_or_fileobj=request_path,
            path_in_repo=os.path.relpath(request_path, EVAL_REQUESTS_PATH),
            repo_id=QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model_name} to eval queue",
        )
    except Exception as e:
        # NOTE(review): local copies are intentionally kept on upload failure
        # (original behavior) — presumably useful for manual retry; confirm.
        return styled_error(f"Error uploading files: {str(e)}")
    # Remove local files only after all uploads succeeded
    os.remove(request_path)
    os.remove(predictions_path)
    os.remove(results_path)
    return styled_message(
        f"Your submission for '{model_name}' has been successfully submitted!\n"
        f"Validated {num_predictions} predictions.\n"
        f"⚠️ Note: Currently using placeholder scores. Implement the evaluator to compute actual ANLS scores.\n"
        f"Please wait for the leaderboard to refresh."
    )