Spaces:
Running
Running
import json
import os
import shutil
from datetime import datetime, timezone

from src.display.formatting import styled_error, styled_message, styled_warning
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, RESULTS_REPO
from src.submission.check_validity import already_submitted_models
| REQUESTED_MODELS = None | |
| USERS_TO_SUBMISSION_DATES = None | |
def validate_jsonl_submission(file_path):
    """Validate the structure and content of a submission JSONL file.

    Each non-blank line must be a JSON object with the fields:
    ``question`` (str), ``answer`` (list), ``citations`` (list of dicts with
    ``file`` and ``page`` keys), ``iterations`` (non-negative int, booleans
    rejected), and ``id`` (str). Blank lines are skipped.

    Args:
        file_path: Path to the uploaded ``.jsonl`` file.

    Returns:
        Tuple ``(is_valid, error_message, num_predictions)``. On failure
        ``error_message`` names the offending line and ``num_predictions`` is 0.
    """
    required_fields = ["question", "answer", "citations", "iterations", "id"]
    try:
        # Stream the file instead of readlines(): avoids holding the whole
        # upload in memory. line_num stays 0 only if the file has zero lines.
        with open(file_path, "r", encoding="utf-8") as f:
            num_predictions = 0
            line_num = 0
            for line_num, raw_line in enumerate(f, 1):
                line = raw_line.strip()
                if not line:
                    continue
                try:
                    pred = json.loads(line)
                except json.JSONDecodeError as e:
                    return False, f"Line {line_num}: Invalid JSON - {str(e)}", 0
                # Check required fields
                for field in required_fields:
                    if field not in pred:
                        return False, f"Line {line_num}: Missing required field '{field}'", 0
                # Validate field types
                if not isinstance(pred["question"], str):
                    return False, f"Line {line_num}: 'question' must be a string", 0
                if not isinstance(pred["answer"], list):
                    return False, f"Line {line_num}: 'answer' must be a list", 0
                if not isinstance(pred["citations"], list):
                    return False, f"Line {line_num}: 'citations' must be a list", 0
                # bool is a subclass of int, so exclude it explicitly —
                # otherwise "iterations": true would pass validation.
                if (
                    not isinstance(pred["iterations"], int)
                    or isinstance(pred["iterations"], bool)
                    or pred["iterations"] < 0
                ):
                    return False, f"Line {line_num}: 'iterations' must be a non-negative integer", 0
                if not isinstance(pred["id"], str):
                    return False, f"Line {line_num}: 'id' must be a string", 0
                # Validate citations structure
                for cit_idx, citation in enumerate(pred["citations"]):
                    if not isinstance(citation, dict):
                        return False, f"Line {line_num}, citation {cit_idx}: Must be a dict with 'file' and 'page'", 0
                    if "file" not in citation or "page" not in citation:
                        return False, f"Line {line_num}, citation {cit_idx}: Must have 'file' and 'page' fields", 0
                num_predictions += 1
            if line_num == 0:
                return False, "File is empty", 0
            return True, "", num_predictions
    except Exception as e:
        # Best-effort guard: surface any I/O/OS error as a validation failure
        # rather than crashing the submission UI.
        return False, f"Error reading file: {str(e)}", 0
def add_new_eval(
    model_name: str,
    organization: str,
    model_type: str,
    predictions_file,
    link: str = "",
):
    """Validate and register a new leaderboard submission.

    Validates the uploaded predictions JSONL, writes a placeholder results
    file plus an eval-request entry locally, uploads all three files to the
    results/queue dataset repos, then removes the local copies.

    Args:
        model_name: Display name of the submitted model/agent.
        organization: Submitting organization; becomes a subdirectory in both repos.
        model_type: e.g. "API" or "Open-weight" (stored lowercased in results metadata).
        predictions_file: Path to the uploaded ``.jsonl`` file (Gradio file path).
        link: Optional URL to the model/paper; stored as-is after stripping.

    Returns:
        A styled HTML message (error, warning, or success) for the UI.
    """
    global REQUESTED_MODELS
    global USERS_TO_SUBMISSION_DATES
    # Lazily load the already-submitted set on first use.
    if not REQUESTED_MODELS:
        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    # Validate inputs
    if not model_name or model_name.strip() == "":
        return styled_error("Please provide a model name.")
    if not organization or organization.strip() == "":
        return styled_error("Please provide your organization name.")
    if model_type is None or model_type == "":
        return styled_error("Please select a model type (API or Open-weight).")
    if predictions_file is None:
        return styled_error("Please upload a predictions JSONL file.")
    # Validate JSONL structure
    is_valid, error_msg, num_predictions = validate_jsonl_submission(predictions_file)
    if not is_valid:
        return styled_error(f"Invalid submission format: {error_msg}")
    print(f"Validated {num_predictions} predictions")
    # Create safe filename
    safe_model_name = model_name.replace("/", "_").replace(" ", "_")
    # Check for duplicate submission
    if safe_model_name in REQUESTED_MODELS:
        return styled_warning("This model has already been submitted.")
    print("Adding new eval")
    # Prepare directories
    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{organization}"
    PREDICTIONS_DIR = f"{EVAL_RESULTS_PATH}/{organization}"
    os.makedirs(OUT_DIR, exist_ok=True)
    os.makedirs(PREDICTIONS_DIR, exist_ok=True)
    # Save predictions file
    predictions_path = f"{PREDICTIONS_DIR}/{safe_model_name}_predictions_{current_time}.jsonl"
    # Copy the uploaded file (shutil is imported at module level)
    shutil.copy(predictions_file, predictions_path)
    # TODO: Run evaluation here
    # from src.evaluation.evaluator import evaluate_predictions
    # results = evaluate_predictions(predictions_path)
    # For now, create placeholder results
    # This will be replaced with actual evaluation
    placeholder_results = {
        "model_name": model_name,
        "results": {
            "overall": {"anls": 0.50},
            "single_evidence": {"anls": 0.50},
            "multi_evidence_same_doc": {"anls": 0.50},
            "multi_evidence_multi_doc": {"anls": 0.50},
        },
        "metadata": {
            "agent_steps": num_predictions,  # Use total predictions as placeholder
            "cost_usd": 0.0,  # Placeholder
            "model_type": model_type.lower(),
        },
        "organization": organization,
        "submission_date": current_time,
        "num_predictions": num_predictions,
        "link": link.strip() if link else "",
    }
    # Save results file
    results_path = f"{PREDICTIONS_DIR}/{safe_model_name}_results_{current_time}.json"
    with open(results_path, "w") as f:
        json.dump(placeholder_results, f, indent=2)
    # Create request entry for queue
    eval_request = {
        "model": model_name,
        "organization": organization,
        "model_type": model_type,
        "status": "PENDING",  # Will be set to FINISHED after evaluation
        "submitted_time": current_time,
        "link": link.strip() if link else "",
    }
    # Save request file
    request_path = f"{OUT_DIR}/{safe_model_name}_eval_request_{current_time}.json"
    with open(request_path, "w") as f:
        json.dump(eval_request, f, indent=2)
    print("Uploading files")
    # Repo paths are the paths relative to the local checkout roots
    # (org/filename). Computed with relpath instead of splitting on a
    # hard-coded directory name, which raised IndexError whenever the local
    # checkout was not named "eval-results"/"eval-queue".
    try:
        # Upload predictions file
        API.upload_file(
            path_or_fileobj=predictions_path,
            path_in_repo=os.path.relpath(predictions_path, EVAL_RESULTS_PATH),
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            commit_message=f"Add predictions for {model_name}",
        )
        # Upload results file
        API.upload_file(
            path_or_fileobj=results_path,
            path_in_repo=os.path.relpath(results_path, EVAL_RESULTS_PATH),
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            commit_message=f"Add results for {model_name}",
        )
        # Upload request file to queue repo
        API.upload_file(
            path_or_fileobj=request_path,
            path_in_repo=os.path.relpath(request_path, EVAL_REQUESTS_PATH),
            repo_id=QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model_name} to eval queue",
        )
    except Exception as e:
        # NOTE(review): local copies are intentionally kept on upload failure
        # (original behavior) — presumably useful for manual retry; confirm.
        return styled_error(f"Error uploading files: {str(e)}")
    # Remove local files only after all uploads succeeded
    os.remove(request_path)
    os.remove(predictions_path)
    os.remove(results_path)
    return styled_message(
        f"Your submission for '{model_name}' has been successfully submitted!\n"
        f"Validated {num_predictions} predictions.\n"
        f"⚠️ Note: Currently using placeholder scores. Implement the evaluator to compute actual ANLS scores.\n"
        f"Please wait for the leaderboard to refresh."
    )