| import os |
| import json |
| import datetime |
| import requests |
| from email.utils import parseaddr |
|
|
| import gradio as gr |
| import pandas as pd |
| import numpy as np |
|
|
| from datasets import load_dataset, VerificationMode |
| from apscheduler.schedulers.background import BackgroundScheduler |
| from huggingface_hub import HfApi |
|
|
| |
| from scorer import question_scorer |
| from content import format_error, format_warning, format_log, TITLE, INTRODUCTION_TEXT, SUBMISSION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink |
|
|
# Hugging Face token used for all private dataset reads/writes (None → anonymous).
TOKEN = os.environ.get("TOKEN", None)


# Repositories involved in the evaluation pipeline.
OWNER="Blanca"
DATA_DATASET = f"{OWNER}/CQs-Gen_test"
INTERNAL_DATA_DATASET = f"{OWNER}/CQs-Gen_test"
SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
SUBMISSION_DATASET_PUBLIC = f"{OWNER}/submissions_public"

RESULTS_DATASET = f"{OWNER}/results_public"
LEADERBOARD_PATH = f"HiTZ/Critical_Questions_Leaderboard"
api = HfApi()


YEAR_VERSION = "2025"
# Number of reference questions per split; submissions are scored out of this
# total, so missing answers count as 0.
ref_scores_len = {"test": 34}



# Local working directory for scored copies of submissions.
os.makedirs("scored", exist_ok=True)



# When True, skip all hub uploads/pushes (dry-run mode).
LOCAL_DEBUG = False



# Current public leaderboard results for the test split, always re-downloaded
# so a restarted Space picks up the latest scores.
test_results = load_dataset(
    RESULTS_DATASET,
    YEAR_VERSION,
    split="test",
    token=TOKEN,
    download_mode="force_redownload",
    verification_mode=VerificationMode.NO_CHECKS,
    trust_remote_code=True,
)
eval_results = {"test": test_results}
| |
def get_dataframe_from_results(eval_results, split):
    """Build a display-ready leaderboard DataFrame from one results split.

    Turns the model name into a markdown hyperlink, drops internal columns,
    renames the rest to their display labels, and converts the stored
    fractional score into a percentage, sorted best-first.
    """
    ds = eval_results[split]
    ds = ds.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})
    ds = ds.remove_columns(["system_prompt", "url"])

    display_names = {
        "model": "Agent name",
        "model_family": "Model family",
        "score": "Score (%)",
        "date": "Submission date",
    }
    for raw, pretty in display_names.items():
        ds = ds.rename_column(raw, pretty)

    frame = pd.DataFrame(ds).sort_values(by=["Score (%)"], ascending=False)
    # Scores are stored as fractions in [0, 1]; render them as percentages.
    frame["Score (%)"] = frame["Score (%)"].multiply(100).round(2)
    return frame
|
|
|
|
# Leaderboard table rendered on app start-up.
eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")


# Gold answers keyed by intervention_id, per split; used when scoring submissions.
gold_results = {}
gold_dataset = load_dataset(INTERNAL_DATA_DATASET, "test", token=TOKEN, trust_remote_code=True)
gold_results = {"test": {row["intervention_id"]: row for row in gold_dataset["test"]}}
|
|
|
|
def restart_space():
    """Restart the leaderboard Space so it reloads fresh results on boot."""
    api.restart_space(
        repo_id=LEADERBOARD_PATH,
        token=TOKEN,
    )
|
|
# Gradio column datatypes for the leaderboard Dataframe component.
# NOTE(review): 8 entries, but the frame built by get_dataframe_from_results
# appears to have fewer columns — confirm against the results dataset schema.
TYPES = ["markdown", "number", "number", "number", "number", "str", "str", "str"]
|
|
def add_new_eval(
    val_or_test: str,
    model: str,
    model_family: str,
    system_prompt: str,
    url: str,
    path_to_file: str,
    organisation: str,
    mail: str,
    profile: gr.OAuthProfile,
):
    """Score an uploaded submission file and publish the result to the leaderboard.

    Validates the submitting account and form fields, scores every line of the
    uploaded JSONL file against the gold answers, archives the raw and scored
    files in the private submissions dataset, appends the aggregate score to the
    public results dataset, and returns an HTML status message (format_error /
    format_warning / format_log) for display in the UI.
    """
    # Reject accounts younger than 60 days to limit throwaway-account spam.
    user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
    creation_date = json.loads(user_data.content)["createdAt"]
    if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=60):
        return format_error("This account is not authorized to submit on this leaderboard.")

    # Only the test split is currently open for submissions, whatever the UI sent.
    val_or_test = "test"
    is_validation = False

    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email address.")

    print("Adding new eval")

    # Refuse resubmission of the same (model, organisation) *pair*. Checking the
    # two columns independently (as before) falsely rejected a new pair whose
    # model name and organisation each appeared in unrelated earlier rows.
    submitted_pairs = {
        (m.lower(), o.lower())
        for m, o in zip(eval_results[val_or_test]["model"], eval_results[val_or_test]["organisation"])
    }
    if (model.lower(), organisation.lower()) in submitted_pairs:
        return format_warning("This model has been already submitted.")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    # Archive the raw submission file before scoring.
    if LOCAL_DEBUG:
        print("mock uploaded submission")
    else:
        api.upload_file(
            repo_id=SUBMISSION_DATASET,
            path_or_fileobj=path_to_file.name,
            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
            repo_type="dataset",
            token=TOKEN
        )

    # NOTE(review): collected but currently not persisted anywhere — confirm
    # whether contact details were meant to be uploaded to the internal dataset.
    contact_info = {
        "model": model,
        "model_family": model_family,
        "url": url,
        "organisation": organisation,
        "username": profile.username,
        "mail": mail,
        "date": datetime.datetime.today().strftime('%Y-%m-%d')
    }

    # Score every line of the submission against the gold answers, writing a
    # per-question scored copy alongside.
    file_path = path_to_file.name
    scores = 0
    num_questions = 0
    task_ids = []

    with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
        with open(file_path, 'r') as f:
            for ix, line in enumerate(f):
                try:
                    task = json.loads(line)
                except Exception:
                    return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")
                if "model_answer" not in task:
                    return format_error(f"Line {ix} missing 'model_answer'.")
                # Guard the key lookup so a malformed line yields a readable
                # message instead of a server-side KeyError.
                if "task_id" not in task:
                    return format_error(f"Line {ix} missing 'task_id'.")
                answer = task["model_answer"]
                task_id = task["task_id"]

                if task_id not in gold_results[val_or_test]:
                    return format_error(f"{task_id} not found in gold set.")

                score = question_scorer(answer, gold_results[val_or_test][task_id]["Final answer"])

                scored_file.write(
                    json.dumps({
                        "id": task_id,
                        "model_answer": answer,
                        "score": score
                    }) + "\n"
                )

                task_ids.append(task_id)
                scores += score
                num_questions += 1

    if len(task_ids) != len(set(task_ids)):
        return format_error("There are duplicates in your submission. Please check your file and resubmit it.")

    # Archive the scored copy (internal, and public for validation runs).
    if LOCAL_DEBUG:
        print("mock uploaded scored submission")
    else:
        api.upload_file(
            repo_id=SUBMISSION_DATASET,
            path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
            repo_type="dataset",
            token=TOKEN
        )

    if is_validation:
        api.upload_file(
            repo_id=SUBMISSION_DATASET_PUBLIC,
            path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
            repo_type="dataset",
            token=TOKEN
        )

    eval_entry = {
        "model": model,
        "model_family": model_family,
        "system_prompt": system_prompt,
        "url": url,
        "organisation": organisation,
        # Score out of the reference split size, so unanswered questions count
        # as 0. Fix: the old code divided by the ref_scores_len *dict* itself,
        # raising TypeError on every successful submission.
        "score": scores / ref_scores_len[val_or_test],
        "date": datetime.datetime.today().strftime('%Y-%m-%d')
    }

    eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
    print(eval_results)
    if LOCAL_DEBUG:
        print("mock uploaded results to lb")
    else:
        # Fix: push the updated Dataset, not the plain dict wrapping it (dicts
        # have no push_to_hub). Push under the same split so refresh() reloads it.
        eval_results[val_or_test].push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, split=val_or_test, token=TOKEN)

    return format_log(f"Model {model} submitted by {organisation} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")
|
|
|
|
def refresh():
    """Re-download the public results dataset and return a fresh leaderboard table."""
    latest = load_dataset(
        RESULTS_DATASET,
        YEAR_VERSION,
        split="test",
        token=TOKEN,
        download_mode="force_redownload",
        verification_mode=VerificationMode.NO_CHECKS,
        trust_remote_code=True,
    )
    return get_dataframe_from_results(eval_results={"test": latest}, split="test")
|
|
|
|
|
|
def upload_file(files):
    """Return the local filesystem paths of the uploaded file objects."""
    paths = []
    for uploaded in files:
        paths.append(uploaded.name)
    return paths
|
|
|
|
# ---- Gradio UI -------------------------------------------------------------
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
            )

    with gr.Tab("Results: Test"):
        leaderboard_table_test = gr.components.Dataframe(
            value=eval_dataframe_test, datatype=TYPES, interactive=False,
            column_widths=["20%"]
        )

    refresh_button = gr.Button("Refresh")
    refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table_test])

    with gr.Accordion("Submit a new model for evaluation"):
        with gr.Row():
            gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
        with gr.Row():
            with gr.Column():
                level_of_test = gr.Radio(["test"], value="test", label="Split")
                model_name_textbox = gr.Textbox(label="Agent name")
                model_family_textbox = gr.Textbox(label="Model family")
                system_prompt_textbox = gr.Textbox(label="System prompt example")
                url_textbox = gr.Textbox(label="Url to model information")
            with gr.Column():
                organisation = gr.Textbox(label="Organisation")
                mail = gr.Textbox(label="Contact email (will be stored privately, & used if there is an issue with your submission)")
                file_output = gr.File()

    with gr.Row():
        gr.LoginButton()
        submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        # Fix: register add_new_eval directly instead of wrapping it in a
        # `lambda *args`. Gradio injects the logged-in gr.OAuthProfile by
        # inspecting the handler's type annotations; the lambda hid that
        # annotation, so `profile` was never supplied and every submit raised
        # TypeError. The (previously unused) split radio is wired in as the
        # first input — add_new_eval still forces "test" internally.
        submit_button.click(
            add_new_eval,
            [
                level_of_test,
                model_name_textbox,
                model_family_textbox,
                system_prompt_textbox,
                url_textbox,
                file_output,
                organisation,
                mail,
            ],
            submission_result,
        )


# Restart the Space hourly so the leaderboard re-downloads fresh results.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
demo.launch(debug=True)
|
|