"""Leaderboard submission handling.

Validates an uploaded predictions JSONL file, stages request/result
artifacts locally, and uploads them to the Hugging Face queue and
results dataset repositories.
"""

import json
import os
import shutil
from datetime import datetime, timezone

from src.display.formatting import styled_error, styled_message, styled_warning
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, RESULTS_REPO
from src.submission.check_validity import already_submitted_models

# Lazily populated caches of prior submissions; filled on the first call
# to add_new_eval (None means "not fetched yet").
REQUESTED_MODELS = None
USERS_TO_SUBMISSION_DATES = None


def validate_jsonl_submission(file_path):
    """Validate the structure and content of a submission JSONL file.

    Every non-blank line must be a JSON object containing:
      - ``question``: str
      - ``answer``: list
      - ``citations``: list of dicts, each with ``file`` and ``page`` keys
      - ``iterations``: non-negative int (booleans rejected)
      - ``id``: str

    Args:
        file_path: Path to the JSONL file to check.

    Returns:
        Tuple ``(is_valid, error_message, num_predictions)``.
        ``error_message`` is empty on success; ``num_predictions`` is 0
        on any failure.
    """
    required_fields = ["question", "answer", "citations", "iterations", "id"]
    try:
        with open(file_path, "r") as f:
            lines = f.readlines()

        if len(lines) == 0:
            return False, "File is empty", 0

        predictions = []
        for line_num, line in enumerate(lines, 1):
            line = line.strip()
            if not line:
                # Blank lines are tolerated and skipped.
                continue
            try:
                pred = json.loads(line)
            except json.JSONDecodeError as e:
                return False, f"Line {line_num}: Invalid JSON - {str(e)}", 0

            # Check required fields
            for field in required_fields:
                if field not in pred:
                    return False, f"Line {line_num}: Missing required field '{field}'", 0

            # Validate field types
            if not isinstance(pred["question"], str):
                return False, f"Line {line_num}: 'question' must be a string", 0
            if not isinstance(pred["answer"], list):
                return False, f"Line {line_num}: 'answer' must be a list", 0
            if not isinstance(pred["citations"], list):
                return False, f"Line {line_num}: 'citations' must be a list", 0
            # bool is a subclass of int, so an explicit bool check is
            # needed to reject JSON true/false here.
            if (
                not isinstance(pred["iterations"], int)
                or isinstance(pred["iterations"], bool)
                or pred["iterations"] < 0
            ):
                return False, f"Line {line_num}: 'iterations' must be a non-negative integer", 0
            if not isinstance(pred["id"], str):
                return False, f"Line {line_num}: 'id' must be a string", 0

            # Validate citations structure
            for cit_idx, citation in enumerate(pred["citations"]):
                if not isinstance(citation, dict):
                    return False, f"Line {line_num}, citation {cit_idx}: Must be a dict with 'file' and 'page'", 0
                if "file" not in citation or "page" not in citation:
                    return False, f"Line {line_num}, citation {cit_idx}: Must have 'file' and 'page' fields", 0

            predictions.append(pred)

        return True, "", len(predictions)
    except Exception as e:
        return False, f"Error reading file: {str(e)}", 0


def add_new_eval(
    model_name: str,
    organization: str,
    model_type: str,
    predictions_file,
    link: str = "",
):
    """Validate and register a new leaderboard submission.

    Stages the predictions, placeholder results, and a PENDING request
    entry locally, then uploads each to the corresponding Hub dataset
    repo. Local copies are removed only after all uploads succeed, so
    they remain on disk for inspection if an upload fails.

    Args:
        model_name: Display name of the submitted model.
        organization: Submitting organization (also used as a directory
            and repo path component).
        model_type: "API" or "Open-weight" style selector from the UI.
        predictions_file: Path to the uploaded JSONL predictions file.
        link: Optional URL associated with the submission.

    Returns:
        A styled HTML message (success, warning, or error) for the UI.
    """
    global REQUESTED_MODELS
    global USERS_TO_SUBMISSION_DATES
    # `is None`, not truthiness: a fetched-but-empty cache must not
    # trigger a refetch on every submission.
    if REQUESTED_MODELS is None:
        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)

    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # --- Input validation -------------------------------------------------
    if not model_name or model_name.strip() == "":
        return styled_error("Please provide a model name.")

    if not organization or organization.strip() == "":
        return styled_error("Please provide your organization name.")

    if model_type is None or model_type == "":
        return styled_error("Please select a model type (API or Open-weight).")

    if predictions_file is None:
        return styled_error("Please upload a predictions JSONL file.")

    # Validate JSONL structure
    is_valid, error_msg, num_predictions = validate_jsonl_submission(predictions_file)
    if not is_valid:
        return styled_error(f"Invalid submission format: {error_msg}")

    print(f"Validated {num_predictions} predictions")

    # Create safe filename
    safe_model_name = model_name.replace("/", "_").replace(" ", "_")

    # Check for duplicate submission
    if safe_model_name in REQUESTED_MODELS:
        return styled_warning("This model has already been submitted.")

    print("Adding new eval")

    # Prepare directories.
    # NOTE(review): `organization` is used verbatim as a path component;
    # a value containing "/" or ".." would escape the intended directory.
    # Consider sanitizing it like `safe_model_name` — confirm downstream
    # repo-layout consumers before changing.
    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{organization}"
    PREDICTIONS_DIR = f"{EVAL_RESULTS_PATH}/{organization}"
    os.makedirs(OUT_DIR, exist_ok=True)
    os.makedirs(PREDICTIONS_DIR, exist_ok=True)

    # Save predictions file (copy of the uploaded temp file).
    predictions_path = f"{PREDICTIONS_DIR}/{safe_model_name}_predictions_{current_time}.jsonl"
    shutil.copy(predictions_file, predictions_path)

    clean_link = link.strip() if link else ""

    # TODO: Run evaluation here
    # from src.evaluation.evaluator import evaluate_predictions
    # results = evaluate_predictions(predictions_path)

    # For now, create placeholder results. This will be replaced with
    # actual evaluation.
    placeholder_results = {
        "model_name": model_name,
        "results": {
            "overall": {"anls": 0.50},
            "single_evidence": {"anls": 0.50},
            "multi_evidence_same_doc": {"anls": 0.50},
            "multi_evidence_multi_doc": {"anls": 0.50},
        },
        "metadata": {
            "agent_steps": num_predictions,  # Use total predictions as placeholder
            "cost_usd": 0.0,  # Placeholder
            "model_type": model_type.lower(),
        },
        "organization": organization,
        "submission_date": current_time,
        "num_predictions": num_predictions,
        "link": clean_link,
    }

    # Save results file
    results_path = f"{PREDICTIONS_DIR}/{safe_model_name}_results_{current_time}.json"
    with open(results_path, "w") as f:
        json.dump(placeholder_results, f, indent=2)

    # Create request entry for queue
    eval_request = {
        "model": model_name,
        "organization": organization,
        "model_type": model_type,
        "status": "PENDING",  # Will be set to FINISHED after evaluation
        "submitted_time": current_time,
        "link": clean_link,
    }

    # Save request file
    request_path = f"{OUT_DIR}/{safe_model_name}_eval_request_{current_time}.json"
    with open(request_path, "w") as f:
        json.dump(eval_request, f, indent=2)

    print("Uploading files")

    def _repo_path(local_path, root):
        # Repo-relative destination computed from the configured root
        # rather than the brittle `path.split("eval-results/")[1]`, which
        # raised IndexError whenever the root was renamed. The Hub
        # requires forward slashes regardless of host OS.
        return os.path.relpath(local_path, root).replace(os.sep, "/")

    try:
        # Upload predictions file
        API.upload_file(
            path_or_fileobj=predictions_path,
            path_in_repo=_repo_path(predictions_path, EVAL_RESULTS_PATH),
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            commit_message=f"Add predictions for {model_name}",
        )

        # Upload results file
        API.upload_file(
            path_or_fileobj=results_path,
            path_in_repo=_repo_path(results_path, EVAL_RESULTS_PATH),
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            commit_message=f"Add results for {model_name}",
        )

        # Upload request file to queue repo
        API.upload_file(
            path_or_fileobj=request_path,
            path_in_repo=_repo_path(request_path, EVAL_REQUESTS_PATH),
            repo_id=QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model_name} to eval queue",
        )
    except Exception as e:
        # Local copies are intentionally kept on failure for debugging.
        return styled_error(f"Error uploading files: {str(e)}")

    # Remove local files only after all uploads succeeded.
    os.remove(request_path)
    os.remove(predictions_path)
    os.remove(results_path)

    return styled_message(
        f"Your submission for '{model_name}' has been successfully submitted!\n"
        f"Validated {num_predictions} predictions.\n"
        f"⚠️ Note: Currently using placeholder scores. Implement the evaluator to compute actual ANLS scores.\n"
        f"Please wait for the leaderboard to refresh."
    )