diff --git a/.ipynb_checkpoints/README-checkpoint.md b/.ipynb_checkpoints/README-checkpoint.md new file mode 100644 index 0000000000000000000000000000000000000000..8ae1f7554e5330e51bd96427d39f32bc9e35908f --- /dev/null +++ b/.ipynb_checkpoints/README-checkpoint.md @@ -0,0 +1,12 @@ +--- +title: MediLingua Leaderboard +emoji: 🚀 +colorFrom: blue +colorTo: purple +sdk: gradio +sdk_version: 5.45.0 +app_file: app.py +pinned: false +--- + +Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..b5685772804c8af4235a8504dc6752bfc9ae5d1d --- /dev/null +++ b/Makefile @@ -0,0 +1,13 @@ +.PHONY: style format + + +style: + python -m black --line-length 119 . + python -m isort . + ruff check --fix . + + +quality: + python -m black --check --line-length 119 . + python -m isort --check-only . + ruff check . diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..bf29604dbf91719c764cc4fba3a1b7d3d8be4cc5 --- /dev/null +++ b/app.py @@ -0,0 +1,224 @@ +import gradio as gr +from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns +import pandas as pd +from apscheduler.schedulers.background import BackgroundScheduler +from huggingface_hub import snapshot_download +from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE +from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION +from src.display.css_html_js import custom_css +from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, ModelType, fields, WeightType, Precision +from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN +from src.populate import get_evaluation_queue_df, get_leaderboard_df +from src.submission.submit import add_new_eval +import random + +# Define task metadata (icons, names, descriptions) +TASK_METADATA_MULTIPLECHOICE = { +# "TE": {"icon": "📊", "name": "Textual Entailment", "tooltip": ""}, +# "SA": {"icon": "😃", "name": "Sentiment Analysis", "tooltip": ""}, +# "HS": {"icon": "⚠️", "name": "Hate Speech", "tooltip": ""}, +# "AT": {"icon": "🏥", "name": "Admission Test", "tooltip": ""}, +# "WIC": {"icon": "🔤", "name": "Word in Context", "tooltip": ""}, +# "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": ""} +} + +# Define task metadata (icons, names, descriptions) +TASK_METADATA_GENERATIVE = { +# "LS": {"icon": "🔄", "name": "Lexical Substitution", "tooltip": ""}, +# "SU": {"icon": "📝", "name": "Summarization", "tooltip": ""}, + "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""}, + "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": ""}, +} + +def restart_space(): + """Restart the Hugging Face space.""" + API.restart_space(repo_id=REPO_ID) + + +def init_leaderboard(dataframe, default_selection=None, hidden_columns=None): + """ + Initialize and return the leaderboard when it is first loaded or when 'benchmark' is selected. + The table is sorted based on the "Avg. Combined Performance" field. 
+ """ + if dataframe is None or dataframe.empty: + raise ValueError("Leaderboard DataFrame is empty or None.") + + field_list = fields(AutoEvalColumn) + + return Leaderboard( + value=dataframe, + datatype=[c.type for c in field_list], + #select_columns=SelectColumns( + # default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default], + # cant_deselect=[c.name for c in field_list if c.never_hidden], + # label="Select Columns to Display:", + #), + search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name], + hide_columns=hidden_columns or [c.name for c in field_list if c.hidden], + filter_columns=[ + ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"), + ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languges "), + # ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"), + ], + #filter_columns=[ + # ColumnFilter("IS_FS", type="checkbox", default=False, label="5-Few-Shot") + # #ColumnFilter("FS", type="dropdown", label="5-Few-Shot") + #], + bool_checkboxgroup_label="Evaluation Mode", + interactive=False, + ) + +def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None): + """ + Update and return the leaderboard when a specific task is selected. + The table is sorted based on the "Combined Performance" field. + """ + if dataframe is None or dataframe.empty: + raise ValueError("Leaderboard DataFrame is empty or None.") + print ("-----------") + print(dataframe) + print("columns : ", dataframe.columns) + print ("-----------") + + #sorted_dataframe = dataframe.sort_values(by="Combined Performance", ascending=False) + sorted_dataframe = dataframe.sort_values(by="Avg. 
Combined Performance ⬆️", ascending=False) + + #print(sorted_dataframe['Combined Performance']) + + field_list = fields(AutoEvalColumn) + + return Leaderboard( + value=sorted_dataframe, + datatype=[c.type for c in field_list], + #select_columns=SelectColumns( + # default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default], + # cant_deselect=[c.name for c in field_list if c.never_hidden], + # label="Select Columns to Display:", + #), + search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name], + hide_columns=hidden_columns or [c.name for c in field_list if c.hidden], + filter_columns=[ + ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"), + ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languges "), + ], + bool_checkboxgroup_label="Evaluation Mode", + interactive=False + ) + +''' +# Helper function for leaderboard initialization +def init_leaderboard(dataframe, default_selection=None, hidden_columns=None): + """Initialize and return a leaderboard.""" + if dataframe is None or dataframe.empty: + raise ValueError("Leaderboard DataFrame is empty or None.") + + return Leaderboard( + value=dataframe, + datatype=[c.type for c in fields(AutoEvalColumn)], + select_columns=SelectColumns( + default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default], + cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden], + label="Select Columns to Display:", + ), + search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name], + hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden], + filter_columns=[ + ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"), + ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"), + ], + bool_checkboxgroup_label="Hide models", + interactive=False, + ) +''' + +def download_snapshot(repo, local_dir): + """Try to download a snapshot from Hugging Face Hub.""" + try: + print(f"Downloading from {repo} to {local_dir}...") + snapshot_download(repo_id=repo, local_dir=local_dir, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN) + except Exception as e: + print(f"Error downloading {repo}: {e}") + restart_space() + + +# Initialize the app by downloading snapshots +#download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH) +#download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH) + +# Load leaderboard data +LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS) +finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS) + +# Prepare the main interface +demo = gr.Blocks(css=custom_css) +with demo: + gr.HTML(TITLE) + gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") + + with gr.Tabs(elem_classes="tab-buttons") as tabs: + + # Main leaderboard tab + with gr.TabItem("🏅 Benchmark"): + + leaderboard = init_leaderboard( + LEADERBOARD_DF, + default_selection=['LANG','FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"], + hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['LANG','FS', 'Model', "Avg. 
Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]] + ) + + # About tab + with gr.TabItem("📝 About"): + gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") + + # About tab + with gr.TabItem("║", interactive=False): + gr.Markdown("", elem_classes="markdown-text") + + # Task-specific leaderboards + for task, metadata in TASK_METADATA_MULTIPLECHOICE.items(): + + with gr.TabItem(f"{metadata['icon']}{task}"): + + task_description = TASK_DESCRIPTIONS.get(task, "Description not available.") + gr.Markdown(task_description, elem_classes="markdown-text") + + leaderboard = update_task_leaderboard( + LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", f"{task} Best Prompt": "Best Prompt", f"{task} Best Prompt Id": "Best Prompt Id", task: "Combined Performance"}), + default_selection=['LANG','FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'], + hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['LANG','FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']] + ) + + # About tab + with gr.TabItem("│", interactive=False): + gr.Markdown("", elem_classes="markdown-text") + + # Task-specific leaderboards + for task, metadata in TASK_METADATA_GENERATIVE.items(): + with gr.TabItem(f"{metadata['icon']}{task}"): + task_description = TASK_DESCRIPTIONS.get(task, "Description not available.") + gr.Markdown(task_description, elem_classes="markdown-text") + + leaderboard = update_task_leaderboard( + LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", + f"{task} Best Prompt": "Best Prompt", + f"{task} Best Prompt Id": "Best Prompt Id", + task: "Combined Performance"}), + default_selection=['LANG','FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', + 'Best Prompt Id'], + hidden_columns=[col for col in LEADERBOARD_DF.columns if + col not in ['LANG','FS', 'Model', 'Combined Performance', 'Prompt Average', + 'Best Prompt', 'Best Prompt Id']] + ) + + # Citation section + with gr.Accordion("📙 Citation", open=False): + gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True) + +# Background job to restart space +scheduler = BackgroundScheduler() +scheduler.add_job(restart_space, "interval", seconds=1800) +scheduler.start() + +# Launch the app with concurrent queueing +demo.queue(default_concurrency_limit=40).launch(debug=True, # Enable Gradio debug mode + show_error=True) \ No newline at end of file diff --git a/e3c_llm_requests/meta-llama/.ipynb_checkpoints/Llama-3.2-1B-Instruct-checkpoint.json b/e3c_llm_requests/meta-llama/.ipynb_checkpoints/Llama-3.2-1B-Instruct-checkpoint.json new file mode 100644 index 0000000000000000000000000000000000000000..dd0dab56bad582a995b770ac81c21b4ad4954553 --- /dev/null +++ b/e3c_llm_requests/meta-llama/.ipynb_checkpoints/Llama-3.2-1B-Instruct-checkpoint.json @@ -0,0 +1,8 @@ +{ + "model": "meta-llama/Llama-3.2-1B-Instruct", + "base_model": "LlamaForCausalLM", + "revision": "9213176726f574b556790deb65791e0c5aa438b6", + "submitted_time": "2024-09-18 15:12:47+00:00", + "num_params_billion": 1.2358144, + "language": "en_de_fr_it_pt_hi_es_th" +} \ No newline at end of file diff --git a/e3c_llm_requests/meta-llama/Llama-3.2-1B-Instruct.json b/e3c_llm_requests/meta-llama/Llama-3.2-1B-Instruct.json new file mode 100644 index 0000000000000000000000000000000000000000..dd0dab56bad582a995b770ac81c21b4ad4954553 --- /dev/null +++ 
b/e3c_llm_requests/meta-llama/Llama-3.2-1B-Instruct.json @@ -0,0 +1,8 @@ +{ + "model": "meta-llama/Llama-3.2-1B-Instruct", + "base_model": "LlamaForCausalLM", + "revision": "9213176726f574b556790deb65791e0c5aa438b6", + "submitted_time": "2024-09-18 15:12:47+00:00", + "num_params_billion": 1.2358144, + "language": "en_de_fr_it_pt_hi_es_th" +} \ No newline at end of file diff --git a/e3c_llm_results/meta-llama/.ipynb_checkpoints/Llama-3-1.2-1B-Instruct_5_it-checkpoint.json b/e3c_llm_results/meta-llama/.ipynb_checkpoints/Llama-3-1.2-1B-Instruct_5_it-checkpoint.json new file mode 100644 index 0000000000000000000000000000000000000000..8ab73cb0c201e780e965129ec17f0fd3ba5131be --- /dev/null +++ b/e3c_llm_results/meta-llama/.ipynb_checkpoints/Llama-3-1.2-1B-Instruct_5_it-checkpoint.json @@ -0,0 +1,39 @@ +{ + "average_CPS": 12.479999999999999, + "config": { + "model_name": "meta-llama/Llama-3.2-1B-Instruct", + "num_fewshot": "5", + "LANG":"EN", + "batch_size": 8 + }, + "tasks": { + "RE": { + "prompts": [ + { + "prompt": "prom_1", + "metric": "f1", + "value": 12.479999999999999, + "stderr": null + } + ], + "average_accuracy": 12.479999999999999, + "best_prompt": 12.479999999999999, + "prompt_id": "prom_1", + "CPS": 12.479999999999999 + }, + "NER": { + "prompts": [ + { + "prompt": "prom_1", + "metric": "f1", + "value": 20, + "stderr": null + } + ], + "average_accuracy": 20, + "best_prompt": 20, + "prompt_id": "prom_3", + "CPS": 20 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/meta-llama/.ipynb_checkpoints/Llama-3-1.2-1B-Instruct_5_sl-checkpoint.json b/e3c_llm_results/meta-llama/.ipynb_checkpoints/Llama-3-1.2-1B-Instruct_5_sl-checkpoint.json new file mode 100644 index 0000000000000000000000000000000000000000..95acebe7b56935f0e409c3d991831ed381197fba --- /dev/null +++ b/e3c_llm_results/meta-llama/.ipynb_checkpoints/Llama-3-1.2-1B-Instruct_5_sl-checkpoint.json @@ -0,0 +1,39 @@ +{ + "average_CPS": 5, + "config": { + "model_name": "meta-llama/Llama-3.2-1B-Instruct", + "num_fewshot": "5", + "LANG":"IT", + "batch_size": 8 + }, + "tasks": { + "RE": { + "prompts": [ + { + "prompt": "prom_1", + "metric": "f1", + "value": 5, + "stderr": null + } + ], + "average_accuracy": 5, + "best_prompt": 5, + "prompt_id": "prom_1", + "CPS": 5 + }, + "NER": { + "prompts": [ + { + "prompt": "prom_1", + "metric": "f1", + "value": 25, + "stderr": null + } + ], + "average_accuracy": 25, + "best_prompt": 25, + "prompt_id": "prom_3", + "CPS": 25 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/meta-llama/.ipynb_checkpoints/Llama-3.2-1B-Instruct_5-checkpoint.json b/e3c_llm_results/meta-llama/.ipynb_checkpoints/Llama-3.2-1B-Instruct_5-checkpoint.json new file mode 100644 index 0000000000000000000000000000000000000000..50c48aa8a1f4cefbea9b9b5546e2cb3fb1749154 --- /dev/null +++ b/e3c_llm_results/meta-llama/.ipynb_checkpoints/Llama-3.2-1B-Instruct_5-checkpoint.json @@ -0,0 +1,24 @@ +{ + "average_CPS": 12.479999999999999, + "config": { + "model_name": "meta-llama/Llama-3.2-1B-Instruct", + "num_fewshot": "5", + "batch_size": 8 + }, + "tasks": { + "RE": { + "prompts": [ + { + "prompt": "prom_1", + "metric": "f1", + "value": 12.479999999999999, + "stderr": null + } + ], + "average_accuracy": 12.479999999999999, + "best_prompt": 12.479999999999999, + "prompt_id": "prom_1", + "CPS": 12.479999999999999 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/meta-llama/Llama-3-1.2-1B-Instruct_5_it.json b/e3c_llm_results/meta-llama/Llama-3-1.2-1B-Instruct_5_it.json new file mode 100644 index 
0000000000000000000000000000000000000000..95acebe7b56935f0e409c3d991831ed381197fba --- /dev/null +++ b/e3c_llm_results/meta-llama/Llama-3-1.2-1B-Instruct_5_it.json @@ -0,0 +1,39 @@ +{ + "average_CPS": 5, + "config": { + "model_name": "meta-llama/Llama-3.2-1B-Instruct", + "num_fewshot": "5", + "LANG":"IT", + "batch_size": 8 + }, + "tasks": { + "RE": { + "prompts": [ + { + "prompt": "prom_1", + "metric": "f1", + "value": 5, + "stderr": null + } + ], + "average_accuracy": 5, + "best_prompt": 5, + "prompt_id": "prom_1", + "CPS": 5 + }, + "NER": { + "prompts": [ + { + "prompt": "prom_1", + "metric": "f1", + "value": 25, + "stderr": null + } + ], + "average_accuracy": 25, + "best_prompt": 25, + "prompt_id": "prom_3", + "CPS": 25 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/meta-llama/Llama-3-1.2-1B-Instruct_5_sl.json b/e3c_llm_results/meta-llama/Llama-3-1.2-1B-Instruct_5_sl.json new file mode 100644 index 0000000000000000000000000000000000000000..5d6184c793235a6e434f76bd74b25be2aa7acdb0 --- /dev/null +++ b/e3c_llm_results/meta-llama/Llama-3-1.2-1B-Instruct_5_sl.json @@ -0,0 +1,39 @@ +{ + "average_CPS": 5, + "config": { + "model_name": "meta-llama/Llama-3.2-1B-Instruct", + "num_fewshot": "5", + "LANG":"SL", + "batch_size": 8 + }, + "tasks": { + "RE": { + "prompts": [ + { + "prompt": "prom_1", + "metric": "f1", + "value": 8, + "stderr": null + } + ], + "average_accuracy": 8, + "best_prompt": 8, + "prompt_id": "prom_1", + "CPS": 8 + }, + "NER": { + "prompts": [ + { + "prompt": "prom_1", + "metric": "f1", + "value": 28, + "stderr": null + } + ], + "average_accuracy": 28, + "best_prompt": 28, + "prompt_id": "prom_3", + "CPS": 28 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/meta-llama/Llama-3.2-1B-Instruct_5.json b/e3c_llm_results/meta-llama/Llama-3.2-1B-Instruct_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8ab73cb0c201e780e965129ec17f0fd3ba5131be --- /dev/null +++ b/e3c_llm_results/meta-llama/Llama-3.2-1B-Instruct_5.json @@ -0,0 +1,39 @@ +{ + "average_CPS": 12.479999999999999, + "config": { + "model_name": "meta-llama/Llama-3.2-1B-Instruct", + "num_fewshot": "5", + "LANG":"EN", + "batch_size": 8 + }, + "tasks": { + "RE": { + "prompts": [ + { + "prompt": "prom_1", + "metric": "f1", + "value": 12.479999999999999, + "stderr": null + } + ], + "average_accuracy": 12.479999999999999, + "best_prompt": 12.479999999999999, + "prompt_id": "prom_1", + "CPS": 12.479999999999999 + }, + "NER": { + "prompts": [ + { + "prompt": "prom_1", + "metric": "f1", + "value": 20, + "stderr": null + } + ], + "average_accuracy": 20, + "best_prompt": 20, + "prompt_id": "prom_3", + "CPS": 20 + } + } +} \ No newline at end of file diff --git a/example_app.py b/example_app.py new file mode 100644 index 0000000000000000000000000000000000000000..e6e712f9ac66b7f5ae4305c0615540fde9141d85 --- /dev/null +++ b/example_app.py @@ -0,0 +1,324 @@ +import gradio as gr +from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns +import pandas as pd +from apscheduler.schedulers.background import BackgroundScheduler +from huggingface_hub import snapshot_download + +from src.about import ( + CITATION_BUTTON_LABEL, + CITATION_BUTTON_TEXT, + EVALUATION_QUEUE_TEXT, + INTRODUCTION_TEXT, + LLM_BENCHMARKS_TEXT, + TITLE, +) + +from src.tasks import ( + TE_DESCRIPTION, +) + +from src.display.css_html_js import custom_css +from src.display.utils import ( + BENCHMARK_COLS, + COLS, + EVAL_COLS, + EVAL_TYPES, + AutoEvalColumn, + ModelType, + fields, + WeightType, + 
Precision +) +from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN +from src.populate import get_evaluation_queue_df, get_leaderboard_df +from src.submission.submit import add_new_eval + + +def restart_space(): + API.restart_space(repo_id=REPO_ID) + +### Space initialisation +try: + print(EVAL_REQUESTS_PATH) + snapshot_download( + repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN + ) +except Exception: + restart_space() +try: + print(EVAL_RESULTS_PATH) + snapshot_download( + repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN + ) +except Exception: + restart_space() + + +LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS) + +( + finished_eval_queue_df, + running_eval_queue_df, + pending_eval_queue_df, +) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS) + +def init_leaderboard(dataframe): + print(dataframe) + if dataframe is None or dataframe.empty: + raise ValueError("Leaderboard DataFrame is empty or None.") + return Leaderboard( + value=dataframe, + datatype=[c.type for c in fields(AutoEvalColumn)], + select_columns=SelectColumns( + default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default], + cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden], + label="Select Columns to Display:", + ), + search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name], + hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden], + filter_columns=[ + ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"), + ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"), + ColumnFilter( + AutoEvalColumn.params.name, + type="slider", + min=0.01, + max=150, + label="Select the number of parameters (B)", + ), + ColumnFilter( + AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True + ), + ], + bool_checkboxgroup_label="Hide models", + interactive=False, + ) + + +def init_leaderboard2(dataframe, default_selection=None, hidden_columns=None): + + print("entrato===============================================") + + if dataframe is None or dataframe.empty: + raise ValueError("Leaderboard DataFrame is empty or None.") + return Leaderboard( + value=dataframe, + datatype=[c.type for c in fields(AutoEvalColumn)], + select_columns=SelectColumns( + default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default], + cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden], + label="Select Columns to Display:", + ), + search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name], + hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden], + filter_columns=[ + ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"), + ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"), + ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0.01, max=150, label="Select the number of parameters (B)"), + ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True), + ], + bool_checkboxgroup_label="Hide models", + interactive=False, + ) + + +demo = gr.Blocks(css=custom_css) +with demo: + gr.HTML(TITLE) + gr.Markdown(INTRODUCTION_TEXT, 
elem_classes="markdown-text") + + with gr.Tabs(elem_classes="tab-buttons") as tabs: + with gr.TabItem("🏅 EVALITA-LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0): + #leaderboard = init_leaderboard(LEADERBOARD_DF) + + leaderboard = init_leaderboard2( + LEADERBOARD_DF, + default_selection=['T', 'Model', "Average ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"], + hidden_columns=[col for col in LEADERBOARD_DF.columns if + col not in ['T', 'Model', "Average ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL" ]] + ) + + + with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2): + gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") + + with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3): + with gr.Column(): + with gr.Row(): + gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") + + with gr.Column(): + with gr.Accordion( + f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", + open=False, + ): + with gr.Row(): + finished_eval_table = gr.components.Dataframe( + value=finished_eval_queue_df, + headers=EVAL_COLS, + datatype=EVAL_TYPES, + row_count=5, + ) + with gr.Accordion( + f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", + open=False, + ): + with gr.Row(): + running_eval_table = gr.components.Dataframe( + value=running_eval_queue_df, + headers=EVAL_COLS, + datatype=EVAL_TYPES, + row_count=5, + ) + + with gr.Accordion( + f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})", + open=False, + ): + with gr.Row(): + pending_eval_table = gr.components.Dataframe( + value=pending_eval_queue_df, + headers=EVAL_COLS, + datatype=EVAL_TYPES, + row_count=5, + ) + with gr.Row(): + gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text") + + with gr.Row(): + with gr.Column(): + model_name_textbox = gr.Textbox(label="Model name") + revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main") + model_type = gr.Dropdown( + choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown], + label="Model type", + multiselect=False, + value=None, + interactive=True, + ) + + with gr.Column(): + precision = gr.Dropdown( + choices=[i.value.name for i in Precision if i != Precision.Unknown], + label="Precision", + multiselect=False, + value="float16", + interactive=True, + ) + weight_type = gr.Dropdown( + choices=[i.value.name for i in WeightType], + label="Weights type", + multiselect=False, + value="Original", + interactive=True, + ) + base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)") + + submit_button = gr.Button("Submit Eval") + submission_result = gr.Markdown() + submit_button.click( + add_new_eval, + [ + model_name_textbox, + base_model_name_textbox, + revision_name_textbox, + precision, + weight_type, + model_type, + ], + submission_result, + ) + + + with gr.TabItem("TE", elem_id="llm-benchmark-tab-table", id=4): + gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text") + #leaderboard = init_leaderboard(LEADERBOARD_DF) + + LEADERBOARD_DF_TE = LEADERBOARD_DF.rename(columns={"TE Prompt Average": "Prompt Average", + "TE Best Prompt": "Best Prompt", + "TE Best Prompt Id": "Best Prompt Id", + "TE": "Combined Performance"}) + + leaderboard = init_leaderboard2( + LEADERBOARD_DF_TE, + default_selection=['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'], + hidden_columns=[col for col in LEADERBOARD_DF.columns if + col not in ['T', 'Model', 'Combined Performance', 'Prompt 
Average', 'Best Prompt', 'Best Prompt Id']] + ) + + + with gr.TabItem("SA", elem_id="llm-benchmark-tab-table", id=5): + gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text") + + LEADERBOARD_DF_SA = LEADERBOARD_DF.rename(columns={"SA Prompt Average": "Prompt Average", + "SA Best Prompt": "Best Prompt", + "SA Best Prompt Id": "Best Prompt Id", + "SA": "Combined Performance"}) + + leaderboard = init_leaderboard2( + LEADERBOARD_DF_SA, + default_selection=['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', + 'Best Prompt Id'], + hidden_columns=[col for col in LEADERBOARD_DF.columns if + col not in ['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', + 'Best Prompt Id']] + ) + + + + + with gr.TabItem("HS", elem_id="llm-benchmark-tab-table", id=6): + gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text") + + LEADERBOARD_DF_HS = LEADERBOARD_DF.rename(columns={"HS Prompt Average": "Prompt Average", + "HS Best Prompt": "Best Prompt", + "HS Best Prompt Id": "Best Prompt Id", + "HS": "Combined Performance"}) + + leaderboard = init_leaderboard2( + LEADERBOARD_DF_HS, + default_selection=['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', + 'Best Prompt Id'], + hidden_columns=[col for col in LEADERBOARD_DF.columns if + col not in ['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', + 'Best Prompt Id']] + ) + + + + with gr.TabItem("AT", elem_id="llm-benchmark-tab-table", id=7): + gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text") + + with gr.TabItem("WIC", elem_id="llm-benchmark-tab-table", id=8): + gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text") + + with gr.TabItem("FAQ", elem_id="llm-benchmark-tab-table", id=9): + gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text") + + with gr.TabItem("LS", elem_id="llm-benchmark-tab-table", id=10): + gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text") + + with gr.TabItem("SU", elem_id="llm-benchmark-tab-table", id=11): + gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text") + + with gr.TabItem("NER", elem_id="llm-benchmark-tab-table", id=12): + gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text") + + with gr.TabItem("REL", elem_id="llm-benchmark-tab-table", id=13): + gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text") + + + with gr.Row(): + with gr.Accordion("📙 Citation", open=False): + citation_button = gr.Textbox( + value=CITATION_BUTTON_TEXT, + label=CITATION_BUTTON_LABEL, + lines=20, + elem_id="citation-button", + show_copy_button=True, + ) + +scheduler = BackgroundScheduler() +scheduler.add_job(restart_space, "interval", seconds=1800) +scheduler.start() +demo.queue(default_concurrency_limit=40).launch() \ No newline at end of file diff --git a/example_app2.py b/example_app2.py new file mode 100644 index 0000000000000000000000000000000000000000..9268e66807d66f4d99c6c97a748691f46972e4e8 --- /dev/null +++ b/example_app2.py @@ -0,0 +1,216 @@ +import gradio as gr +from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns +import pandas as pd +from apscheduler.schedulers.background import BackgroundScheduler +from huggingface_hub import snapshot_download + +from src.about import ( + CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, + INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE +) +from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION +from src.display.css_html_js import custom_css +from src.display.utils import ( + BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, + ModelType, fields, 
WeightType, Precision +) +from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN +from src.populate import get_evaluation_queue_df, get_leaderboard_df +from src.submission.submit import add_new_eval + + + + +# Define the task icons and names +TASK_ICONS = { + "TE": "📊", # Textual Entailment + "SA": "😃", # Sentiment Analysis + "HS": "⚠️", # Hate Speech + "AT": "🏥", # Admission Test + "WIC": "🔤", # Word in Context + "FAQ": "❓", # Frequently Asked Questions + "LS": "🔄", # Lexical Substitution + "SU": "📝", # Summarization + "NER": "🏷️", # Named Entity Recognition + "REL": "🔗", # Relation Extraction +} + +TASK_NAMES = { + "TE": "Textual Entailment", + "SA": "Sentiment Analysis", + "HS": "Hate Speech", + "AT": "Admission Test", + "WIC": "Word in Context", + "FAQ": "Frequently Asked Questions", + "LS": "Lexical Substitution", + "SU": "Summarization", + "NER": "Named Entity Recognition", + "REL": "Relation Extraction", +} + + +# Tooltip descriptions for each task +TASK_TOOLTIPS = { + "TE": "Identify logical relationships between two text segments.", + "SA": "Classify the sentiment (positive, negative, neutral) of a text.", + "HS": "Detect hate speech in a text.", + "AT": "Classify whether a clinical statement pertains to an admission test.", + "WIC": "Identify words in context and their meaning.", + "FAQ": "Answer frequently asked questions based on given text.", + "LS": "Identify alternative words in a given context.", + "SU": "Summarize long text into a shorter version.", + "NER": "Identify named entities (e.g., persons, locations, organizations) in text.", + "REL": "Extract and link laboratory test results to the respective tests in clinical narratives.", +} + + + + +def restart_space(): + """Restart the Hugging Face space.""" + API.restart_space(repo_id=REPO_ID) + + +def download_snapshot(repo, local_dir): + """Try to download a snapshot from the Hugging Face Hub, restarting space on failure.""" + try: + print(f"Downloading from {repo} to {local_dir}...") + snapshot_download(repo_id=repo, local_dir=local_dir, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN) + except Exception as e: + print(f"Error downloading {repo}: {e}") + restart_space() + + +# Space initialization +download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH) +download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH) + +# Load leaderboard and evaluation queue data +LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS) +finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS) + + +def init_leaderboard(dataframe, default_selection=None, hidden_columns=None): + """Initialize a leaderboard with specific columns.""" + if dataframe is None or dataframe.empty: + raise ValueError("Leaderboard DataFrame is empty or None.") + + return Leaderboard( + value=dataframe, + datatype=[c.type for c in fields(AutoEvalColumn)], + select_columns=SelectColumns( + default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default], + cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden], + label="Select Columns to Display:", + ), + search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name], + hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden], + filter_columns=[ + #ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"), + 
ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="Few-Shot Learning (FS)"), + #ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"), + ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0.01, max=150, label="Select the number of parameters (B)"), + #ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True), + ], + bool_checkboxgroup_label="Hide models", + interactive=False, + ) + + +def prepare_leaderboard_df(df, task_prefix): + """Rename columns for a specific task to a standard format.""" + return df.rename(columns={ + f"{task_prefix} Prompt Average": "Prompt Average", + f"{task_prefix} Best Prompt": "Best Prompt", + f"{task_prefix} Best Prompt Id": "Best Prompt Id", + task_prefix: "Combined Performance" + }) + + +demo = gr.Blocks(css=custom_css) +with demo: + gr.HTML(TITLE) + gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") + + with gr.Tabs(elem_classes="tab-buttons") as tabs: + # Main leaderboard tab + with gr.TabItem("🏅 EVALITA-LLM Benchmark", elem_id="llm-benchmark-tab-table"): + leaderboard = init_leaderboard( + LEADERBOARD_DF, + default_selection=['FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"], + hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in + ['FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]] + ) + + # About tab + with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table"): + gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") + + ''' + # Submission tab + with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table"): + gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") + + for queue_name, queue_df in [ + ("✅ Finished Evaluations", finished_eval_queue_df), + ("🔄 Running Evaluation Queue", running_eval_queue_df), + ("⏳ Pending Evaluation Queue", pending_eval_queue_df) + ]: + with gr.Accordion(f"{queue_name} ({len(queue_df)})", open=False): + gr.components.Dataframe(value=queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5) + + gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text") + with gr.Row(): + model_name_textbox = gr.Textbox(label="Model name") + revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main") + model_type = gr.Dropdown(choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown], + label="Model type", multiselect=False, interactive=True) + precision = gr.Dropdown(choices=[i.value.name for i in Precision if i != Precision.Unknown], + label="Precision", multiselect=False, value="float16", interactive=True) + weight_type = gr.Dropdown(choices=[i.value.name for i in WeightType], + label="Weights type", multiselect=False, value="Original", interactive=True) + base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)") + + submit_button = gr.Button("Submit Eval") + submission_result = gr.Markdown() + submit_button.click( + add_new_eval, + [model_name_textbox, base_model_name_textbox, revision_name_textbox, precision, weight_type, model_type], + submission_result, + ) + ''' + + # Task-specific leaderboards + for task in ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]: + + with gr.TabItem(f"{TASK_ICONS[task]}{task}", elem_id="llm-benchmark-tab-table"): + + task_description = TASK_DESCRIPTIONS.get(task, "Description not available.") + + + + + 
gr.Markdown(task_description, elem_classes="markdown-text") + + + gr.Markdown(MEASURE_DESCRIPTION, elem_classes="markdown-text") + + + + leaderboard = init_leaderboard( + prepare_leaderboard_df(LEADERBOARD_DF, task), + default_selection=['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'], + hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in + ['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']] + ) + + # Citation section + with gr.Accordion("📙 Citation", open=False): + gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True) + +# Background job to restart space +scheduler = BackgroundScheduler() +scheduler.add_job(restart_space, "interval", seconds=1800) +scheduler.start() + +demo.queue(default_concurrency_limit=40).launch() \ No newline at end of file diff --git a/get_model_info.py b/get_model_info.py new file mode 100644 index 0000000000000000000000000000000000000000..bc5c6dd630dc417dac307eb6530d74cc7867d77a --- /dev/null +++ b/get_model_info.py @@ -0,0 +1,129 @@ +""" +MODEL METADATA EXTRACTOR + +This script processes model evaluation output files (input_folder) from the lm-eval-harness library, +extracts model identifiers, retrieves detailed metadata from HuggingFace +and saves the information as structured JSON files (output_folder). + +Input: Directory containing .out files from lm-eval-harness +Output: Directory with JSON files containing model metadata +""" + +# Example input file format (lm-eval-harness output): +''' +hf (pretrained=swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: 1 +| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr| +|------------------------|------:|------|-----:|--------|---|-----:|---|------| +|evalita-mp | 1|none | |acc |↑ |0.5605|± |0.0052| +... +Job completed +''' + +# Example output JSON format: +''' +{ + "model": "swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA", + "base_model": "LlamaForCausalLM", + "revision": "2b6e46e4c9d341dc8bf8350a167492c880116b66", + "submitted_time": "2024-04-29 09:34:12+00:00", + "num_params_billion": 8.030261248, + "language": "en_it" +} +''' + +import os +import re +import json +from huggingface_hub import HfApi + +# Configures the Hugging Face token (if needed) +# TOKEN = "YOUR_HUGGINGFACE_API_TOKEN" +api = HfApi() + +# Directory paths +# input_folder: Directory containing the output files of the lm-eval-harness library, including model accuracy metrics. +#input_folder = "../evalita_llm_models_output/" +input_folder = "/home/sfarzi/leaderboard/evalita_llm_leaderboard/task_result/" +# output_folder: Directory where JSON files with model characteristics will be saved. 
+output_folder = "/home/sfarzi/leaderboard/evalita_llm_leaderboard/e3c_llm_requests/" + +# Creates the output folder if it doesn't exist +os.makedirs(output_folder, exist_ok=True) + +# Regular expression to find the model name +model_pattern = re.compile(r"pretrained=([\w\-./]+)") + +# Scans files in the input folder +for filename in os.listdir(input_folder): + if filename.endswith('.out'): + file_path = os.path.join(input_folder, filename) + + # Reads the file content + with open(file_path, "r", encoding="utf-8") as f: + content = f.read() + + # Extracts the model name + match = model_pattern.search(content) + if match: + model_name = match.group(1) + print(f"Processing model: {model_name}") + + try: + # Retrieves model information from HuggingFace + model_info = api.model_info(model_name) + + # Calculates the number of parameters in billions, if available + num_params = None + if model_info.safetensors and "BF16" in model_info.safetensors.parameters: + num_params = model_info.safetensors.parameters["BF16"] / 1e9 # Convert to billions + + # Extracts and concatenates languages + language = "_".join(model_info.card_data.get("language", [])) if model_info.card_data else "" + + #print(model_info) + + # Builds the dictionary with required metadata + model_data = { + "model": model_name, + "base_model": model_info.config.get("architectures", [""])[0] if model_info.config else "", + "revision": model_info.sha, + # "precision": "bfloat16", # If available, replace with real value + # "weight_type": "Original", + # "status": "FINISHED", + "submitted_time": str(model_info.created_at), + # "model_type": "pretrained", + # "likes": model_info.likes, + # "params": model_info.safetensors_size_in_bytes / 1e9 if model_info.safetensors_size_in_bytes else None, + # "license": model_info.license, + # "private": model_info.private, + "num_params_billion": num_params, # Number of parameters in billions + "language": language, # Extracted language + } + + # Separates the model_name into two parts: directory name and file name + if "/" in model_name: + dir_name, file_name = model_name.split("/", 1) + else: + dir_name, file_name = model_name, model_name # If no "/", use the same name + + # Creates the folder for saving the produced json files + model_output_folder = os.path.join(output_folder, dir_name) + os.makedirs(model_output_folder, exist_ok=True) + + # Saves the JSON file in the appropriate folder + output_file = os.path.join(model_output_folder, f"{file_name}.json") + + # Check if the file already exists + if os.path.exists(output_file): + print(f"File {output_file} already exists. Skipping...") + continue + + with open(output_file, "w", encoding="utf-8") as f: + json.dump(model_data, f, indent=4) + + print(f"Saved metadata for {model_name} in {output_file}") + + except Exception as e: + print(f"Error retrieving info for {model_name}: {e}") + + print("Process finished!") \ No newline at end of file diff --git a/lb_e3c.zip b/lb_e3c.zip new file mode 100644 index 0000000000000000000000000000000000000000..a41a916bcc77d37e43695fd6e561148cb21c8cb6 Binary files /dev/null and b/lb_e3c.zip differ diff --git a/preprocess_models_output.py b/preprocess_models_output.py new file mode 100644 index 0000000000000000000000000000000000000000..a5d666145eaa39d0fcd6cc411b9264996bd8da57 --- /dev/null +++ b/preprocess_models_output.py @@ -0,0 +1,250 @@ +""" +EVALITA LLM EVALUATION PROCESSOR + +Transforms raw model evaluation outputs into structured performance reports for leaderboard integration. + +DATA PIPELINE OVERVIEW: + +1. 
Inputs: + - Evaluation Results: Raw .out files from lm-eval-harness + - Model Metadata: Pre-collected .json files from HuggingFace + +2. Output: + - Comprehensive evaluation reports in JSON format + - Ready for ingestion into the evaluation leaderboard + +-------------------------------------------------------------------- +INPUT SPECIFICATION + +Evaluation Results (.out format): + hf (pretrained=model-org/model-name), num_fewshot: 5, batch_size: 1 + | Task | Metric | Value | Stderr | + |---------------|--------|--------|--------| + | main-task | acc | 0.5605 | 0.0052 | + | - sub-task | acc | 0.4640 | 0.0088 | + | - prompt-1 | acc | 0.3720 | 0.0216 | + +Model Metadata (.json format): + { + "model": "model-org/model-name", + "base_model": "ModelArchitecture", + "revision": "git_commit_hash", + "parameters": 8.03, + "language": "en_it" + } + +-------------------------------------------------------------------- +OUTPUT SPECIFICATION + +Evaluation Report (.json format): + { + "summary_metrics": { + "average_CPS": 41.74, + "num_tasks": 12 + }, + "model_config": { + "identifier": "model-org/model-name", + "architecture": "ModelArchitecture", + "parameters": 8.03, + "evaluation_settings": { + "fewshot": 5, + "batch_size": 1 + } + }, + "task_results": { + "task-name": { + "average_score": 52.60, + "best_prompt": { + "id": "prompt-6", + "score": 66.57 + }, + "prompt_analysis": [ + { + "prompt_id": "prompt-1", + "score": 37.20, + "stderr": 0.0216 + } + ] + } + } + } +""" + +import json +import os +import re + +def safe_float(value): + """Safely converts a value to float, returning None if the conversion fails.""" + try: + return float(value) + except ValueError: + return None + + +def calculate_task_metrics(task_info): + """Calculates average accuracy, best prompt accuracy, and CPS for a given task.""" + accuracies = [prompt['value'] for prompt in task_info['prompts'] if prompt['value'] is not None] + + if not accuracies: + return None + + task_info['average_accuracy'] = sum(accuracies) / len(accuracies) + best_prompt_data = max(task_info['prompts'], key=lambda x: x['value']) + task_info['best_prompt'] = best_prompt_data['value'] + task_info['prompt_id'] = best_prompt_data['prompt'] + + # Calculate CPS + avg_acc = task_info['average_accuracy'] + best_acc = task_info['best_prompt'] + task_info['CPS'] = (1 - (best_acc - avg_acc) / 100) * best_acc + + +def extract_data_from_file(file_path): + """Extracts task and prompt data from a specified file.""" + with open(file_path, 'r') as file: + lines = file.readlines() + + tasks_data = {} + current_task = None + + for line in lines: + line = line.strip() + + # Skips empty lines + if not line: + continue + + # Skips header lines + if line.startswith("| Tasks"): + continue + + # Extracts model configuration details + if line.startswith("hf (pretrained="): + start = line.find("pretrained=") + len("pretrained=") + end = line.find(",", start) + pretrained_model = line[start:end] + + num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line) + num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None + + batch_size_match = re.search(r"batch_size:\s*(\d+)", line) + batch_size = int(batch_size_match.group(1)) if batch_size_match else None + + continue + + columns = line.split('|') + if len(columns) != 11: + continue + + task_name = columns[1] + metric = columns[5].strip() + value = safe_float(columns[7]) + stderr = safe_float(columns[9]) + print (value) + # Skips normalized accuracy metrics + if metric == "acc_norm": + continue + + # Identifies 
task and prompt sections in the file + if task_name.startswith(" - "): + task_name = task_name[3:].strip() + current_task = task_name + tasks_data.setdefault(current_task, + {'prompts': [], 'average_accuracy': 0, 'best_prompt': None, 'prompt_id': None, + 'CPS': None}) + + elif task_name.startswith(" - ") and current_task: + prompt_name = task_name[4:].strip() + prompt_data = {'prompt': prompt_name, 'metric': metric, 'value': value * 100, + 'stderr': stderr} + tasks_data[current_task]['prompts'].append(prompt_data) + + # Special handling for evalita NER task to calculate weighted prompt averages + if "evalita NER" in tasks_data: + task_info = tasks_data["evalita NER"] + weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521, "FIC prompt-1": 1517, "FIC prompt-2": 1517, + "WN prompt-1": 2088, "WN prompt-2": 2088} + + weighted_values = {"prompt-1": 0, "prompt-2": 0} + total_weights = sum(weight_map.values()) + + for prompt in task_info['prompts']: + if prompt['prompt'] in weight_map: + if "prompt-1" in prompt['prompt']: + weighted_values["prompt-1"] += weight_map[prompt['prompt']] * prompt['value'] + elif "prompt-2" in prompt['prompt']: + weighted_values["prompt-2"] += weight_map[prompt['prompt']] * prompt['value'] + + task_info['prompts'] = [ + {"prompt": "prompt-1", "metric": "acc", "value": weighted_values["prompt-1"] / total_weights, + 'stderr': None}, + {"prompt": "prompt-2", "metric": "acc", "value": weighted_values["prompt-2"] / total_weights, + 'stderr': None}] + + # Calculates task metrics for each task + for task_info in tasks_data.values(): + calculate_task_metrics(task_info) + + # Calculates the average CPS across all tasks + tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None] + average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0 + + config = { + "model_name": pretrained_model, + "num_fewshot": num_fewshot, + "batch_size": batch_size + } + + return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data} + + +""" +MAIN PROCESSING PIPELINE + +This script executes the complete evaluation data processing workflow: + +1. Input Sources: + - Raw evaluation results (.out files) from: ../evalita_llm_models_output/ + - Model metadata JSON files from: ../evalita_llm_requests/ + +2. Processing Steps: + - Parses evaluation metrics from .out files + - Combines with model metadata + - Calculates aggregated performance statistics + +3. 
Output: + - Structured JSON results saved to: ../evalita_llm_results/ + - Organized by model organization/name + - Contains complete evaluation results with metadata +""" +directory_in_path = '/home/sfarzi/leaderboard/evalita_llm_leaderboard/task_result/' +directory_in_requests_path = '/home/sfarzi/leaderboard/evalita_llm_leaderboard/evalita_llm_requests/' +directory_out_results_path = '/home/sfarzi/leaderboard/evalita_llm_leaderboard/evalita_llm_results/' + +for filename in os.listdir(directory_in_path): + if filename.endswith('.out'): + file_path = os.path.join(directory_in_path, filename) + json_output = extract_data_from_file(file_path) + + model_org_name, model_name = json_output['config']['model_name'].split('/') + + + config_file_path = os.path.join(directory_in_requests_path, model_org_name, f"{model_name}.json") + + if os.path.exists(config_file_path): + with open(config_file_path, 'r', encoding='utf-8') as config_file: + additional_config = json.load(config_file) + json_output['config'].update(additional_config) + + + org_folder_path = os.path.join(directory_out_results_path, model_org_name) + os.makedirs(org_folder_path, exist_ok=True) + + file_suffix = f"{json_output['config']['num_fewshot']}" + output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json") + + with open(output_file_path, 'w', newline="\n") as outfile: + json.dump(json_output, outfile, indent=4) + + print(f"File {filename} processed and saved to {output_file_path}") \ No newline at end of file diff --git a/preprocess_models_output_old.py b/preprocess_models_output_old.py new file mode 100644 index 0000000000000000000000000000000000000000..e9b7fe2b199f6da6bff0b380ac99afe0ce9d0314 --- /dev/null +++ b/preprocess_models_output_old.py @@ -0,0 +1,201 @@ +import json +import os +import re + +def safe_float(value): + """Convert a value to float safely. 
Returns None if conversion fails.""" + try: + return float(value) + except ValueError: + return None + + +def calculate_task_metrics(task_info): + """Calculate average accuracy, best prompt, and CPS for a task.""" + accuracies = [prompt['value'] for prompt in task_info['prompts'] if prompt['value'] is not None] + + if not accuracies: + return None + + task_info['average_accuracy'] = sum(accuracies) / len(accuracies) + best_prompt_data = max(task_info['prompts'], key=lambda x: x['value']) + task_info['best_prompt'] = best_prompt_data['value'] + task_info['prompt_id'] = best_prompt_data['prompt'] + + # Calculate CPS + avg_acc = task_info['average_accuracy'] + best_acc = task_info['best_prompt'] + task_info['CPS'] = (1 - (best_acc - avg_acc) / 100) * best_acc + + +def extract_data_from_file(file_path): + """Extract task and prompt data from the given file.""" + with open(file_path, 'r') as file: + lines = file.readlines() + + tasks_data = {} + current_task = None + + for line in lines: + line = line.strip() + + # Skip irrelevant lines + if not line: + continue + + + if line.startswith("| Tasks"): + continue + + if line.startswith("hf (pretrained="): + + # Estrai la parte dopo "pretrained=" + start = line.find("pretrained=") + len("pretrained=") + end = line.find(",", start) # Trova la virgola successiva + # Estrai la stringa desiderata + pretrained_model = line[start:end] + + # Estrarre num_fewshot + num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line) + num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None + + # Estrarre batch_size + batch_size_match = re.search(r"batch_size:\s*(\d+)", line) + batch_size = int(batch_size_match.group(1)) if batch_size_match else None + + continue + + columns = line.split('|') + if len(columns) != 11: + continue + + task_name = columns[1] + metric = columns[5].strip() + value = safe_float(columns[7]) + stderr = safe_float(columns[9]) + + if metric == "acc_norm": + continue + + # Identify task and prompts + if task_name.startswith(" - "): + task_name = task_name[3:].strip() + current_task = task_name + tasks_data.setdefault(current_task, + {'prompts': [], 'average_accuracy': 0, 'best_prompt': None, 'prompt_id': None, + 'CPS': None}) + + elif task_name.startswith(" - ") and current_task: + prompt_name = task_name[4:].strip() + prompt_data = {'prompt': prompt_name, 'metric': metric, 'value': value * 100, + 'stderr': stderr} + tasks_data[current_task]['prompts'].append(prompt_data) + + # Special handling for evalita NER + if "evalita NER" in tasks_data: + task_info = tasks_data["evalita NER"] + weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521, "FIC prompt-1": 1517, "FIC prompt-2": 1517, + "WN prompt-1": 2088, "WN prompt-2": 2088} + + weighted_values = {"prompt-1": 0, "prompt-2": 0} + total_weights = sum(weight_map.values()) + + for prompt in task_info['prompts']: + if prompt['prompt'] in weight_map: + if "prompt-1" in prompt['prompt']: + weighted_values["prompt-1"] += weight_map[prompt['prompt']] * prompt['value'] + elif "prompt-2" in prompt['prompt']: + weighted_values["prompt-2"] += weight_map[prompt['prompt']] * prompt['value'] + + task_info['prompts'] = [ + {"prompt": "prompt-1", "metric": "acc", "value": weighted_values["prompt-1"] / total_weights, + 'stderr': None}, + {"prompt": "prompt-2", "metric": "acc", "value": weighted_values["prompt-2"] / total_weights, + 'stderr': None}] + + # Calculate metrics for each task + for task_info in tasks_data.values(): + calculate_task_metrics(task_info) + + # Calculate average CPS + 
tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None] + average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0 + + config = { + "model_name": pretrained_model, + "num_fewshot": num_fewshot, + "batch_size": batch_size + } + + return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data} + + +# Example usage +#file_path = '../evalita_llm_results/models_output/slurm-7769.out' +#json_output = extract_data_from_file(file_path) +#print(json_output) + + +# Directory da cui leggere i file .out +directory_in_path = '../evalita_llm_models_output/' +directory_out_results_path = '../evalita_llm_results/' +directory_out_requests_path = '../evalita_llm_requests/' + +# Itera sui file nella directory +for filename in os.listdir(directory_in_path): + if filename.endswith('.out'): + # Costruisci il percorso completo del file + file_path = os.path.join(directory_in_path, filename) + + # Esegui la funzione extract_data_from_file + json_output = extract_data_from_file(file_path) + + # Estrai model_org_name e model_name da model_name + model_org_name, model_name = json_output['config']['model_name'].split('/') + + + + + + + # Percorso del file JSON di configurazione in ../evalita_llm_requests2/ + config_file_path = os.path.join(directory_out_requests_path, model_org_name, f"{model_name}.json") + + # Se il file esiste, caricalo e aggiorna il dizionario config + if os.path.exists(config_file_path): + with open(config_file_path, 'r', encoding='utf-8') as config_file: + additional_config = json.load(config_file) + + # Aggiorna la configurazione con i nuovi dati + json_output['config'].update(additional_config) + + + + + # Crea il percorso della cartella per model_org_name + org_folder_path = os.path.join(directory_out_results_path, model_org_name) + os.makedirs(org_folder_path, exist_ok=True) # Crea la cartella se non esiste + + # Crea il percorso completo del file JSON + file_suffix = f"{json_output['config']['num_fewshot']}" + output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json") + + # Salva il JSON in un file con ritorni a capo compatibili con Linux + with open(output_file_path, 'w', newline="\n") as outfile: + json.dump(json_output, outfile, indent=4) + + # Stampa il risultato + print(f"File {filename} elaborato e salvato in {output_file_path}") + + + + + + + + + + + + + diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..3b4737924b5a7d81c962a4e28b66ac6cdcc3b004 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,13 @@ +[tool.ruff] +# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. 
+select = ["E", "F"] +ignore = ["E501"] # line too long (black is taking care of this) +line-length = 119 +fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"] + +[tool.isort] +profile = "black" +line_length = 119 + +[tool.black] +line-length = 119 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..3cacab3e9afab55f2ce3493ac25d7a0ea5c96255 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,16 @@ +APScheduler +black +datasets +gradio +gradio[oauth] +gradio_leaderboard==0.0.13 +gradio_client +huggingface-hub>=0.18.0 +matplotlib +numpy +pandas +python-dateutil +tqdm +transformers +tokenizers>=0.15.0 +sentencepiece \ No newline at end of file diff --git a/run_instructions.txt b/run_instructions.txt new file mode 100644 index 0000000000000000000000000000000000000000..748582a66327f9c2fce6cfd982302bb047097492 --- /dev/null +++ b/run_instructions.txt @@ -0,0 +1,46 @@ +Model Evaluation and Leaderboard + +1) Model Evaluation +Before integrating a model into the leaderboard, it must first be evaluated using the lm-eval-harness library in both zero-shot and 5-shot configurations. + +This can be done with the following command: + +lm_eval --model hf --model_args pretrained=google/gemma-3-12b-it \ + --tasks evalita-mp --device cuda:0 --batch_size 1 --trust_remote_code \ + --output_path model_output --num_fewshot 5 -- + +The output generated by the library will include the model's accuracy scores on the benchmark tasks. +This output is written to the standard output and should be saved in a txt file (e.g., slurm-8368.out), which needs to be placed in the + evalita_llm_models_output LOCAL directory for further processing. Examples of such files can be found in: https://huggingface.co/datasets/evalitahf/evalita_llm_models_output/ + +2) Extracting Model Metadata +To display model details on the leaderboard (e.g., organization/group, model name, and parameter count), metadata must be retrieved from Hugging Face. + +This can be done by running: + +python get_model_info.py + +This script processes the evaluation files from Step 1 and saves each model's metadata in a JSON file within the evalita_llm_requests LOCAL directory. + +3) Generating Leaderboard Submission File +The leaderboard requires a structured file containing each model’s metadata along with its benchmark accuracy scores. + +To generate this file, run: + +python preprocess_model_output.py + +This script combines the accuracy results from Step 1 with the metadata from Step 2 and outputs a JSON file for each kind of model in the evalita_llm_results LOCAL directory. 
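As a rough sketch of a generated file (field names follow preprocess_model_output.py; the values and model name below are hypothetical), each result JSON looks like:

    {
        "average_CPS": 54.3,
        "config": {"model_name": "google/gemma-3-12b-it", "num_fewshot": "5", "batch_size": 1, ...},
        "tasks": {
            "evalita NER": {"prompts": [...], "average_accuracy": 61.2, "best_prompt": 64.8, "prompt_id": "prompt-2", "CPS": 62.5},
            ...
        }
    }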
+Examples of these files are in https://huggingface.co/datasets/evalitahf/evalita_llm_results + +4) Updating the Hugging Face Repository +A commit and push of the following three directories from the local disk to HuggingFace is required, in order to update the evalita_llm_results repository with the newly generated files from Step 3: +evalita_llm_models_output, evalita_llm_requests and evalita_llm_results + +5) Running the Leaderboard Application +To test the leaderboard locally, run the following command in your terminal and open your browser at the indicated address: + +python app.py + +On Hugging Face, the leaderboard can be started or stopped directly from the graphical interface, so running this command is only necessary when working locally. + + diff --git a/src/.ipynb_checkpoints/about-checkpoint.py b/src/.ipynb_checkpoints/about-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..19b8fdf844d40432c276763757558ad919f6ccdd --- /dev/null +++ b/src/.ipynb_checkpoints/about-checkpoint.py @@ -0,0 +1,188 @@ +from dataclasses import dataclass +from enum import Enum + +@dataclass +class Task: + benchmark: str + metric: str + metric_type: str + col_name: str + +# Select your tasks here +# --------------------------------------------------- +class Tasks(Enum): + # task_key in the json file, metric_key in the json file, name to display in the leaderboard + + task1 = Task("text-entailment_1", "acc", "CPS", "TE") + task2 = Task("text-entailment_2", "acc", "average_accuracy", "TE Prompt Average") + task3 = Task("text-entailment_3", "acc", "best_prompt", "TE Best Prompt") + task4 = Task("text-entailment_4", "acc", "prompt_id", "TE Best Prompt Id") + + task5 = Task("sentiment-analysis_1", "acc", "CPS", "SA") + task6 = Task("sentiment-analysis_2", "acc", "average_accuracy", "SA Prompt Average") + task7 = Task("sentiment-analysis_3", "acc", "best_prompt", "SA Best Prompt") + task8 = Task("sentiment-analysis_4", "acc", "prompt_id", "SA Best Prompt Id") + + task9 = Task("hate-speech-detection_1", "acc", "CPS", "HS") + task10 = Task("hate-speech-detection_2", "acc", "average_accuracy", "HS Prompt Average") + task11 = Task("hate-speech-detection_3", "acc", "best_prompt", "HS Best Prompt") + task12 = Task("hate-speech-detection_4", "acc", "prompt_id", "HS Best Prompt Id") + + task13 = Task("admission-test_1", "acc", "CPS", "AT") + task14 = Task("admission-test_2", "acc", "average_accuracy", "AT Prompt Average") + task15 = Task("admission-test_3", "acc", "best_prompt", "AT Best Prompt") + task16 = Task("admission-test_4", "acc", "prompt_id", "AT Best Prompt Id") + + task17 = Task("word-in-context_1", "acc", "CPS", "WIC") + task18 = Task("word-in-context_2", "acc", "average_accuracy", "WIC Prompt Average") + task19 = Task("word-in-context_3", "acc", "best_prompt", "WIC Best Prompt") + task20 = Task("word-in-context_4", "acc", "prompt_id", "WIC Best Prompt Id") + + task21 = Task("faq_1", "acc", "CPS", "FAQ") + task22 = Task("faq_2", "acc", "average_accuracy", "FAQ Prompt Average") + task23 = Task("faq_3", "acc", "best_prompt", "FAQ Best Prompt") + task24 = Task("faq_4", "acc", "prompt_id", "FAQ Best Prompt Id") + + task25 = Task("lexical-substitution_1", "acc", "CPS", "LS") + task26 = Task("lexical-substitution_2", "acc", "average_accuracy", "LS Prompt Average") + task27 = Task("lexical-substitution_3", "acc", "best_prompt", "LS Best Prompt") + task28 = Task("lexical-substitution_4", "acc", "prompt_id", "LS Best Prompt Id") + + task29 = Task("summarization-fanpage_1", "acc", 
"CPS", "SU") + task30 = Task("summarization-fanpage_2", "acc", "average_accuracy", "SU Prompt Average") + task31 = Task("summarization-fanpage_3", "acc", "best_prompt", "SU Best Prompt") + task32 = Task("summarization-fanpage_4", "acc", "prompt_id", "SU Best Prompt Id") + + task33 = Task("evalita NER_1", "acc", "CPS", "NER") + task34 = Task("evalita NER_2", "acc", "average_accuracy", "NER Prompt Average") + task35 = Task("evalita NER_3", "acc", "best_prompt", "NER Best Prompt") + task36 = Task("evalita NER_4", "acc", "prompt_id", "NER Best Prompt Id") + + task37 = Task("relation-extraction_1", "acc", "CPS", "REL") + task38 = Task("relation-extraction_2", "acc", "average_accuracy", "REL Prompt Average") + task39 = Task("relation-extraction_3", "acc", "best_prompt", "REL Best Prompt") + task40 = Task("relation-extraction_4", "acc", "prompt_id", "REL Best Prompt Id") + + ''' + task0 = Task("TextualEntailment", "acc", "Textual Entailment") + task1 = Task("TextualEntailment_best", "acc", "TextualEntailment Best") + task2 = Task("Sentiment Analysis", "acc", "Sentiment Analysis") + task3 = Task("Sentiment Analysis_best", "acc", "Sentiment Analysis_best") + task4 = Task("Hate Speech", "acc", "Hate Speech") + task5 = Task("Hate Speech_best", "acc", "Hate Speech_best") + task6 = Task("Admission Test", "acc", "Admission Test") + task7 = Task("Admission Test_best", "acc", "Admission Test_best") + task8 = Task("Word in Context", "acc", "Word in Context") + task9 = Task("Word in Context_best", "acc", "Word in Context_best") + task10 = Task("FAQ", "acc", "FAQ") + task11 = Task("FAQ_best", "acc", "FAQ_best") + task12 = Task("Lexical Substitution", "acc", "Lexical Substitution") + task13 = Task("Lexical Substitution_best", "acc", "Lexical Substitution_best") + task14 = Task("Summarization", "acc", "Summarization") + task15 = Task("Summarization_best", "acc", "Summarization_best") + task16 = Task("NER", "acc", "NER") + task17 = Task("NER_best", "acc", "NER_best") + task18 = Task("REL", "acc", "REL") + task19 = Task("REL_best", "acc", "REL_best") + ''' + +# Your leaderboard name +TITLE = """
<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>" + + +def styled_warning(warn): + return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>" + + +def styled_message(message): + return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>
" + + +def has_no_nan_values(df, columns): + return df[columns].notna().all(axis=1) + + +def has_nan_values(df, columns): + return df[columns].isna().any(axis=1) diff --git a/src/display/utils.py b/src/display/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2b8ba28ffc154cd5b3b89f3adb27b0a73d6ea1ed --- /dev/null +++ b/src/display/utils.py @@ -0,0 +1,166 @@ +from dataclasses import dataclass, make_dataclass +from enum import Enum + +import pandas as pd + +from src.about import Tasks + +def fields(raw_class): + return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"] + + +# These classes are for user facing column names, +# to avoid having to change them all around the code +# when a modif is needed +@dataclass +class ColumnContent: + name: str + type: str + displayed_by_default: bool + hidden: bool = False + never_hidden: bool = False + +## Leaderboard columns +auto_eval_column_dict = [] +# Init +#auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)]) + +auto_eval_column_dict.append(["fewshot_symbol", ColumnContent, ColumnContent("FS", "str", True, never_hidden=True)]) +auto_eval_column_dict.append(["is_5fewshot", ColumnContent, ColumnContent("IS_FS", "bool", True)]) + + +##### languages ############# +auto_eval_column_dict.append(["LANG", ColumnContent, ColumnContent("LANG", "str", True, never_hidden=True)]) + + + +auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)]) +#auto_eval_column_dict.append(["fewshot", ColumnContent, ColumnContent("Few-Shot", "str", True)]) + +#Scores +auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg. Combined Performance ⬆️", "number", True)]) +for task in Tasks: + auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]) + +# Model information +#auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)]) +auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)]) +auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)]) +#auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)]) +auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)]) +auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)]) +auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)]) +auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)]) +auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)]) + +# We use make dataclass to dynamically fill the scores from Tasks +AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True) + +## For the queue columns in the submission tab +@dataclass(frozen=True) +class EvalQueueColumn: # Queue column + model = ColumnContent("model", "markdown", True) + revision = ColumnContent("revision", "str", True) + private = ColumnContent("private", "bool", True) + #precision = ColumnContent("precision", "str", True) + weight_type = ColumnContent("weight_type", "str", "Original") + status = ColumnContent("status", "str", True) + +## All the model information that we might need 
+@dataclass +class ModelDetails: + name: str + display_name: str = "" + symbol: str = "" # emoji + + +class ModelType(Enum): + PT = ModelDetails(name="pretrained", symbol="🟢") + FT = ModelDetails(name="fine-tuned", symbol="🔶") + IFT = ModelDetails(name="instruction-tuned", symbol="⭕") + RL = ModelDetails(name="RL-tuned", symbol="🟦") + Unknown = ModelDetails(name="", symbol="?") + + def to_str(self, separator=" "): + return f"{self.value.symbol}{separator}{self.value.name}" + + @staticmethod + def from_str(type): + if "fine-tuned" in type or "🔶" in type: + return ModelType.FT + if "pretrained" in type or "🟢" in type: + return ModelType.PT + if "RL-tuned" in type or "🟦" in type: + return ModelType.RL + if "instruction-tuned" in type or "⭕" in type: + return ModelType.IFT + return ModelType.Unknown + +@dataclass +class FewShotDetails: + name: str + symbol: str = "" # emoji + +class FewShotType(Enum): + ZS = FewShotDetails(name="zero-shot", symbol="0️⃣") + FS = FewShotDetails(name="5-few-shot", symbol="5️⃣") + Unknown = FewShotDetails(name="unknown", symbol="❓") + + def to_str(self, separator=" "): + return f"{self.value.symbol}{separator}{self.value.name}" + + @staticmethod + def from_num_fewshot(is_5fewshot): + """Determines FewShotType based on num_fewshot.""" + if is_5fewshot is False: + return FewShotType.ZS + elif is_5fewshot is True: + return FewShotType.FS + return FewShotType.Unknown + +class WeightType(Enum): + Adapter = ModelDetails("Adapter") + Original = ModelDetails("Original") + Delta = ModelDetails("Delta") + +class Precision(Enum): + float16 = ModelDetails("float16") + bfloat16 = ModelDetails("bfloat16") + Unknown = ModelDetails("?") + + def from_str(precision): + if precision in ["torch.float16", "float16"]: + return Precision.float16 + if precision in ["torch.bfloat16", "bfloat16"]: + return Precision.bfloat16 + return Precision.Unknown + +# Column selection +COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden] + +EVAL_COLS = [c.name for c in fields(EvalQueueColumn)] +EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)] + +BENCHMARK_COLS = [t.value.col_name for t in Tasks] + +''' +# Nuovi valori per CPS, AVERAGE, BEST, e ID nella tabella +@dataclass +class NewColumnContent: + name: str + type: str + displayed_by_default: bool + hidden: bool = False + never_hidden: bool = False +''' + +''' +new_column_dict = [] +# Aggiungi CPS, VERAGE, BEST, ID +new_column_dict.append(["CPS", NewColumnContent, NewColumnContent("CPS", "number", True)]) +new_column_dict.append(["AVERAGE", NewColumnContent, NewColumnContent("Average ⬆️", "number", True)]) +new_column_dict.append(["BEST", NewColumnContent, NewColumnContent("Best Performance", "number", True)]) +new_column_dict.append(["ID", NewColumnContent, NewColumnContent("ID", "str", True)]) +NewColumn = make_dataclass("NewColumn", new_column_dict, frozen=True) +NEW_COLS = [c.name for c in fields(NewColumn) if not c.hidden] +''' diff --git a/src/envs.py b/src/envs.py new file mode 100644 index 0000000000000000000000000000000000000000..fa0a1db76d8ddd301bf9cb60c71da793b24a8350 --- /dev/null +++ b/src/envs.py @@ -0,0 +1,36 @@ +import os + +from huggingface_hub import HfApi + +# Info to change for your repository +# ---------------------------------- +TOKEN = os.environ.get("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") # A read/write token for your org + +#OWNER = "giux78" # Change to your org - don't forget to create a results and request dataset, with the correct format! 
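# Note on the TOKEN line above: os.environ.get() expects the *name* of an environment variable,
# so passing a literal "hf_..." token string as the key will normally return None. A more
# conventional pattern (an assumption, not taken from this repository) would be:
#   TOKEN = os.environ.get("HF_TOKEN")  # token exported in the Space/host environment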
+OWNER = "saeedfarzi" +# ---------------------------------- + +#REPO_ID = f"{OWNER}/leaderboard-evalita" +#QUEUE_REPO = f"{OWNER}/evalita-requests" +#RESULTS_REPO = f"{OWNER}/evalita-results" + +REPO_ID = f"{OWNER}/llm_leaderboard" +QUEUE_REPO = f"{OWNER}/e3c_llm_requests" +RESULTS_REPO = f"{OWNER}/e3c_llm_results" + +# If you setup a cache later, just change HF_HOME +#CACHE_PATH=os.getenv("HF_HOME", "/home/sfarzi/leaderboard/") + +# Local caches +#EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue") +#EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results") +#EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk") +#EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk") + +EVAL_REQUESTS_PATH ='/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_requests' #os.path.join(CACHE_PATH, "eval-queue") +EVAL_RESULTS_PATH = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_results'#os.path.join(CACHE_PATH, "eval-results") +EVAL_REQUESTS_PATH_BACKEND = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_requests' #os.path.join(CACHE_PATH, "eval-queue-bk") +EVAL_RESULTS_PATH_BACKEND = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_results' #os.path.join(CACHE_PATH, "eval-results-bk") + + +API = HfApi(token=TOKEN) diff --git a/src/leaderboard/.ipynb_checkpoints/read_evals-checkpoint.py b/src/leaderboard/.ipynb_checkpoints/read_evals-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..e4adeff523abf5c69d7c926ed7aa5e87c7a6f2e5 --- /dev/null +++ b/src/leaderboard/.ipynb_checkpoints/read_evals-checkpoint.py @@ -0,0 +1,214 @@ +import glob +import json +import math +import os +from dataclasses import dataclass + +import dateutil +import numpy as np +from typing import Dict, Union + +#from get_model_info import num_params +from src.display.formatting import make_clickable_model +from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, FewShotType +from src.submission.check_validity import is_model_on_hub + + +@dataclass +class EvalResult: + """Represents one full evaluation. Built from a combination of the result and request file for a given run. + """ + eval_name: str # org_model_precision (uid) + full_model: str # org/model (path on hub) + org: str + model: str + revision: str # commit hash, "" if main + results: Dict[str, Union[float, int]] # float o int + average_CPS: float + is_5fewshot: bool + fewshot_symbol: FewShotType = FewShotType.Unknown + weight_type: WeightType = WeightType.Original # Original or Adapter + architecture: str = "Unknown" + license: str = "?" 
+ likes: int = 0 + num_params: int = 0 + date: str = "" # submission date of request file + still_on_hub: bool = False + + @classmethod + def init_from_json_file(self, json_filepath): + """Inits the result from the specific model result file""" + with open(json_filepath) as fp: + data = json.load(fp) + + config = data.get("config") + + #average_CPS = f"{data.get('average_CPS'):.2f}" + # Get average_CPS + average_CPS = float(data.get('average_CPS', 0.0)) # 0.0 come valore di default + # Get number of fewshot + fewshot = config.get("num_fewshot", False) + + try: + if fewshot == "5": + is_5fewshot = True + else: + is_5fewshot = False + except ValueError: + is_5fewshot = False + # Determine the few-shot type (ZS or FS) based on num_fewshot + fewshot_symbol = FewShotType.from_num_fewshot(is_5fewshot) # Use the new + + # Determine the number of parameters of the models + num_params = int(0) + num_params_billion = config.get("num_params_billion") + if num_params_billion is not None: + num_params = math.ceil(num_params_billion) + + # Get model and org + org_and_model = config.get("model_name", config.get("model_args", None)) + org_and_model = org_and_model.split("/", 1) + + if len(org_and_model) == 1: + org = None + model = org_and_model[0] + #result_key = f"{model}_{precision.value.name}" + result_key = f"{model}_{is_5fewshot}" + else: + org = org_and_model[0] + model = org_and_model[1] + #result_key = f"{org}_{model}_{precision.value.name}" + result_key = f"{org}_{model}_{is_5fewshot}" + full_model = "/".join(org_and_model) + + still_on_hub, _, model_config = is_model_on_hub( + full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False + ) + architecture = "?" + if model_config is not None: + architectures = getattr(model_config, "architectures", None) + if architectures: + architecture = ";".join(architectures) + + # Extract the results of the models + results = {} + for task in Tasks: + task = task.value + + for k, v in data["tasks"].items(): + if task.benchmark[:-2] == k: + if "Best Prompt Id" in task.col_name: + results[task.benchmark] = int(v[task.metric_type][-1:]) + else: + #results[task.benchmark] = f"{v[task.metric_type]:.2f}" # Ensure two decimals for display + results[task.benchmark] = float(v[task.metric_type]) + #value = float(v[task.metric_type]) + #results[task.benchmark] = round(value, 2) # Arrotonda a 2 decimali + + return self( + eval_name=result_key, + full_model=full_model, + org=org, + model=model, + results=results, + average_CPS=average_CPS, + fewshot_symbol=fewshot_symbol, + is_5fewshot=is_5fewshot, + revision= config.get("model_sha", ""), + still_on_hub=still_on_hub, + architecture=architecture, + num_params=num_params + ) + + ''' + def update_with_request_file(self, requests_path): + """Finds the relevant request file for the current model and updates info with it""" + request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name) + + try: + with open(request_file, "r") as f: + request = json.load(f) + self.model_type = ModelType.from_str(request.get("model_type", "")) + self.weight_type = WeightType[request.get("weight_type", "Original")] + self.license = request.get("license", "?") + self.likes = request.get("likes", 0) + self.num_params = request.get("params", 0) + self.date = request.get("submitted_time", "") + except Exception: + print(f"Could not find request file for {self.org}/{self.model} with precision + ''' + + def to_dict(self): + """Converts the Eval Result to a dict compatible with our 
dataframe display""" + average = self.average_CPS + + fewshot_symbol = ( + self.fewshot_symbol.value.symbol if isinstance(self.fewshot_symbol, FewShotType) else "❓" + ) + + data_dict = { + "eval_name": self.eval_name, # not a column, just a save name, + #AutoEvalColumn.precision.name: self.precision.value.name, + #AutoEvalColumn.model_type.name: self.model_type.value.name, + #AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol, + #AutoEvalColumn.model_type.name: self.model_type.value.name if self.model_type else "Unknown", + #AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol if self.model_type else "Unknown", + AutoEvalColumn.fewshot_symbol.name: fewshot_symbol, + AutoEvalColumn.weight_type.name: self.weight_type.value.name, + AutoEvalColumn.architecture.name: self.architecture, + AutoEvalColumn.model.name: make_clickable_model(self.full_model), + AutoEvalColumn.revision.name: self.revision, + AutoEvalColumn.average.name: average, + AutoEvalColumn.is_5fewshot.name: self.is_5fewshot, + AutoEvalColumn.license.name: self.license, + AutoEvalColumn.likes.name: self.likes, + AutoEvalColumn.params.name: self.num_params, + AutoEvalColumn.still_on_hub.name: self.still_on_hub, + } + + for task in Tasks: + data_dict[task.value.col_name] = self.results[task.value.benchmark] + + return data_dict + + +def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]: + """From the path of the results folder root, extract all needed info for results""" + model_result_filepaths = [] + + for root, _, files in os.walk(results_path): + # We should only have json files in model results + if len(files) == 0 or any([not f.endswith(".json") for f in files]): + continue + + # Sort the files by date + try: + files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7]) + except dateutil.parser._parser.ParserError: + files = [files[-1]] + + for file in files: + model_result_filepaths.append(os.path.join(root, file)) + + eval_results = {} + for model_result_filepath in model_result_filepaths: + # Creation of result + eval_result = EvalResult.init_from_json_file(model_result_filepath) + #eval_result.update_with_request_file(requests_path) + + # Store results of same eval together + eval_name = eval_result.eval_name + if eval_name in eval_results.keys(): + eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None}) + else: + eval_results[eval_name] = eval_result + + results = [] + for v in eval_results.values(): + try: + v.to_dict() # we test if the dict version is complete + results.append(v) + except KeyError: # not all eval values present + continue + + return results diff --git a/src/leaderboard/__pycache__/read_evals.cpython-310.pyc b/src/leaderboard/__pycache__/read_evals.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a44cadae990aaeb84ceeec6fa425da6d852ab37 Binary files /dev/null and b/src/leaderboard/__pycache__/read_evals.cpython-310.pyc differ diff --git a/src/leaderboard/read_evals.py b/src/leaderboard/read_evals.py new file mode 100644 index 0000000000000000000000000000000000000000..7afaa23edce75f0bc8e20819a4d7f49bafbb649d --- /dev/null +++ b/src/leaderboard/read_evals.py @@ -0,0 +1,257 @@ +import glob +import json +import math +import os +from dataclasses import dataclass + +import dateutil +import numpy as np +from typing import Dict, Union + +#from get_model_info import num_params +from src.display.formatting import make_clickable_model +from 
src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, FewShotType +from src.submission.check_validity import is_model_on_hub + + +@dataclass +class EvalResult: + """Represents one full evaluation. Built from a combination of the result and request file for a given run. + """ + eval_name: str # org_model_precision (uid) + full_model: str # org/model (path on hub) + org: str + model: str + revision: str # commit hash, "" if main + results: Dict[str, Union[float, int]] # float o int + average_CPS: float + is_5fewshot: bool + Lang:str="EN" + fewshot_symbol: FewShotType = FewShotType.Unknown + weight_type: WeightType = WeightType.Original # Original or Adapter + architecture: str = "Unknown" + license: str = "?" + likes: int = 0 + num_params: int = 0 + date: str = "" # submission date of request file + still_on_hub: bool = False + + @classmethod + def init_from_json_file(self, json_filepath): + """Inits the result from the specific model result file""" + print ( "************ Reading file ****************") + print ("file name :" , json_filepath) + with open(json_filepath) as fp: + data = json.load(fp) + #print(json_filepath,data) + config = data.get("config") + print (config) + #print( data) + #average_CPS = f"{data.get('average_CPS'):.2f}" + # Get average_CPS + average_CPS = float(data.get('average_CPS', 0.0)) # 0.0 come valore di default + # Get number of fewshot + fewshot = config.get("num_fewshot", False) + Lang=config.get("LANG", "EN") + try: + if fewshot == "5": + is_5fewshot = True + else: + is_5fewshot = False + except ValueError: + is_5fewshot = False + # Determine the few-shot type (ZS or FS) based on num_fewshot + fewshot_symbol = FewShotType.from_num_fewshot(is_5fewshot) # Use the new + + # Determine the number of parameters of the models + num_params = int(0) + num_params_billion = config.get("num_params_billion") + if num_params_billion is not None: + num_params = math.ceil(num_params_billion) + + # Get model and org + org_and_model = config.get("model_name", config.get("model_args", None)) + org_and_model = org_and_model.split("/", 1) + + if len(org_and_model) == 1: + org = None + model = org_and_model[0] + #result_key = f"{model}_{precision.value.name}" + result_key = f"{model}_{is_5fewshot}" + else: + org = org_and_model[0] + model = org_and_model[1] + #result_key = f"{org}_{model}_{precision.value.name}" + result_key = f"{org}_{model}_{is_5fewshot}" + full_model = "/".join(org_and_model) + + still_on_hub, _, model_config = is_model_on_hub( + full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False + ) + architecture = "?" 
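# In the task loop further below, task.benchmark[:-2] strips the numeric suffix, so the four
# Task entries "relation-extraction_1" ... "relation-extraction_4" all match the single
# "relation-extraction" key in data["tasks"] (one key per benchmark, four leaderboard columns).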
+ if model_config is not None: + architectures = getattr(model_config, "architectures", None) + if architectures: + architecture = ";".join(architectures) + + # Extract the results of the models + results = {} + for task in Tasks: + #print(task, "data:", data["tasks"].items()) + task = task.value + + for k, v in data["tasks"].items(): + + if task.benchmark[:-2] == k: + if "Best Prompt Id" in task.col_name: + #print ("k:", k,"v:", v) + #print (task.metric_type) + #print(v[task.metric_type]) + results[task.benchmark] = int(v[task.metric_type][-1:]) + else: + #results[task.benchmark] = f"{v[task.metric_type]:.2f}" # Ensure two decimals for display + results[task.benchmark] = float(v[task.metric_type]) + #value = float(v[task.metric_type]) + #results[task.benchmark] = round(value, 2) # Arrotonda a 2 decimali + + print ("Generated Object: ",self( + eval_name=result_key+"_"+Lang, + full_model=full_model, + Lang=Lang, + org=org, + model=model, + results=results, + average_CPS=average_CPS, + fewshot_symbol=fewshot_symbol, + is_5fewshot=is_5fewshot, + revision= config.get("model_sha", ""), + still_on_hub=still_on_hub, + architecture=architecture, + num_params=num_params + )) + print ( "************ End of Reading file ****************") + return self( + eval_name=result_key+"_"+Lang, + full_model=full_model, + Lang=Lang, + org=org, + model=model, + results=results, + average_CPS=average_CPS, + fewshot_symbol=fewshot_symbol, + is_5fewshot=is_5fewshot, + revision= config.get("model_sha", ""), + still_on_hub=still_on_hub, + architecture=architecture, + num_params=num_params + ) + + ''' + def update_with_request_file(self, requests_path): + """Finds the relevant request file for the current model and updates info with it""" + request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name) + + try: + with open(request_file, "r") as f: + request = json.load(f) + self.model_type = ModelType.from_str(request.get("model_type", "")) + self.weight_type = WeightType[request.get("weight_type", "Original")] + self.license = request.get("license", "?") + self.likes = request.get("likes", 0) + self.num_params = request.get("params", 0) + self.date = request.get("submitted_time", "") + except Exception: + print(f"Could not find request file for {self.org}/{self.model} with precision + ''' + + def to_dict(self): + """Converts the Eval Result to a dict compatible with our dataframe display""" + average = self.average_CPS + + + fewshot_symbol = ( + self.fewshot_symbol.value.symbol if isinstance(self.fewshot_symbol, FewShotType) else "❓" + ) + + data_dict = { + "eval_name": self.eval_name, # not a column, just a save name, + #AutoEvalColumn.precision.name: self.precision.value.name, + #AutoEvalColumn.model_type.name: self.model_type.value.name, + #AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol, + #AutoEvalColumn.model_type.name: self.model_type.value.name if self.model_type else "Unknown", + #AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol if self.model_type else "Unknown", + AutoEvalColumn.fewshot_symbol.name: fewshot_symbol, + AutoEvalColumn.weight_type.name: self.weight_type.value.name, + AutoEvalColumn.architecture.name: self.architecture, + AutoEvalColumn.model.name: make_clickable_model(self.full_model), + AutoEvalColumn.revision.name: self.revision, + AutoEvalColumn.average.name: average, + AutoEvalColumn.is_5fewshot.name: self.is_5fewshot, + AutoEvalColumn.license.name: self.license, + AutoEvalColumn.likes.name: self.likes, + 
AutoEvalColumn.params.name: self.num_params, + AutoEvalColumn.still_on_hub.name: self.still_on_hub, + AutoEvalColumn.LANG.name:self.Lang + } + + for task in Tasks: + data_dict[task.value.col_name] = self.results[task.value.benchmark] + + return data_dict + + +def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]: + """From the path of the results folder root, extract all needed info for results""" + model_result_filepaths = [] + + for root, _, files in os.walk(results_path): + # We should only have json files in model results + #print(root,files) + if len(files) == 0 or any([not f.endswith(".json") for f in files]): + continue + #if len(files) == 0 : continue + #json_files = [f for f in files if f.endswith(".json")] + #if not json_files: + #continue + # Sort the files by date + #print(root,files) + try: + files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7]) + except dateutil.parser._parser.ParserError: + files = [files[-1]] + + for file in files: + if not file.endswith(".json"):continue + model_result_filepaths.append(os.path.join(root, file)) + + #print(model_result_filepaths) + eval_results = {} + for model_result_filepath in model_result_filepaths: + # Creation of result + eval_result = EvalResult.init_from_json_file(model_result_filepath) + #eval_result.update_with_request_file(requests_path) + #print ("************************") + #print("path: ", model_result_filepath) + #print('eval_result: ',eval_result) + # Store results of same eval together + eval_name = eval_result.eval_name + print('eval_name: ',eval_name) + print ("lang: ", eval_result.Lang) + + if ( eval_name in eval_results.keys()) : + eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None}) + else: + eval_results[eval_name] = eval_result + + results = [] + #print("eval_results: ",eval_results) + for v in eval_results.values(): + try: + v.to_dict() # we test if the dict version is complete + results.append(v) + except KeyError: # not all eval values present + print ("Except(error) : line 244 file read_evals.py") + continue + print("Final results: ",results) + print ("@@@@@@@@@@@@") + return results diff --git a/src/populate.py b/src/populate.py new file mode 100644 index 0000000000000000000000000000000000000000..c417aa03e83ac3e42b50661a322baae5a67b47c6 --- /dev/null +++ b/src/populate.py @@ -0,0 +1,62 @@ +import json +import os + +import pandas as pd + +from src.display.formatting import has_no_nan_values, make_clickable_model +from src.display.utils import AutoEvalColumn, EvalQueueColumn +from src.leaderboard.read_evals import get_raw_eval_results + + +def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame: + """Creates a dataframe from all the individual experiment results""" + print (results_path, requests_path) + raw_data = get_raw_eval_results(results_path, requests_path) + print(raw_data) + all_data_json = [v.to_dict() for v in raw_data] + + df = pd.DataFrame.from_records(all_data_json) + + print ("all_data_json: ", all_data_json) + df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False) + df = df[cols].round(decimals=2) + + # filter out if any of the benchmarks have not been produced + df = df[has_no_nan_values(df, benchmark_cols)] + return df + + +def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]: + """Creates the different dataframes for the evaluation queues requestes""" + entries = [entry for entry in 
os.listdir(save_path) if not entry.startswith(".")] + all_evals = [] + + for entry in entries: + if ".json" in entry: + file_path = os.path.join(save_path, entry) + with open(file_path) as fp: + data = json.load(fp) + + data[EvalQueueColumn.model.name] = make_clickable_model(data["model"]) + data[EvalQueueColumn.revision.name] = data.get("revision", "main") + + all_evals.append(data) + elif ".md" not in entry: + # this is a folder + sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")] + for sub_entry in sub_entries: + file_path = os.path.join(save_path, entry, sub_entry) + with open(file_path) as fp: + data = json.load(fp) + + data[EvalQueueColumn.model.name] = make_clickable_model(data["model"]) + data[EvalQueueColumn.revision.name] = data.get("revision", "main") + all_evals.append(data) + + pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]] + running_list = [e for e in all_evals if e["status"] == "RUNNING"] + finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"] + df_pending = pd.DataFrame.from_records(pending_list, columns=cols) + df_running = pd.DataFrame.from_records(running_list, columns=cols) + df_finished = pd.DataFrame.from_records(finished_list, columns=cols) + return df_finished[cols], df_running[cols], df_pending[cols] diff --git a/src/submission/__pycache__/check_validity.cpython-310.pyc b/src/submission/__pycache__/check_validity.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..60a00c4e8e2ec697096b23cc8b11d2ce18e42e29 Binary files /dev/null and b/src/submission/__pycache__/check_validity.cpython-310.pyc differ diff --git a/src/submission/__pycache__/submit.cpython-310.pyc b/src/submission/__pycache__/submit.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e3736aef4466c7dd63ecacff03a66e403a54d2d2 Binary files /dev/null and b/src/submission/__pycache__/submit.cpython-310.pyc differ diff --git a/src/submission/check_validity.py b/src/submission/check_validity.py new file mode 100644 index 0000000000000000000000000000000000000000..3c3ce45c4dacd2d600544c87584ee72c81d3b956 --- /dev/null +++ b/src/submission/check_validity.py @@ -0,0 +1,99 @@ +import json +import os +import re +from collections import defaultdict +from datetime import datetime, timedelta, timezone + +import huggingface_hub +from huggingface_hub import ModelCard +from huggingface_hub.hf_api import ModelInfo +from transformers import AutoConfig +from transformers.models.auto.tokenization_auto import AutoTokenizer + +def check_model_card(repo_id: str) -> tuple[bool, str]: + """Checks if the model card and license exist and have been filled""" + try: + card = ModelCard.load(repo_id) + except huggingface_hub.utils.EntryNotFoundError: + return False, "Please add a model card to your model to explain how you trained/fine-tuned it." + + # Enforce license metadata + if card.data.license is None: + if not ("license_name" in card.data and "license_link" in card.data): + return False, ( + "License not found. Please add a license to your model card using the `license` metadata or a" + " `license_name`/`license_link` pair." + ) + + # Enforce card content + if len(card.text) < 200: + return False, "Please add a description to your model card, it is too short." 
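# If none of the checks above fired, the model card is considered complete.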
+ + return True, "" + +def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]: + """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses.""" + try: + config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token) + if test_tokenizer: + try: + tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token) + except ValueError as e: + return ( + False, + f"uses a tokenizer which is not in a transformers release: {e}", + None + ) + except Exception as e: + return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None) + return True, None, config + + except ValueError: + return ( + False, + "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.", + None + ) + + except Exception as e: + return False, "was not found on hub!", None + + +def get_model_size(model_info: ModelInfo, precision: str): + """Gets the model size from the configuration, or the model name if the configuration does not contain the information.""" + try: + model_size = round(model_info.safetensors["total"] / 1e9, 3) + except (AttributeError, TypeError): + return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in example_app.py + + size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1 + model_size = size_factor * model_size + return model_size + +def get_model_arch(model_info: ModelInfo): + """Gets the model architecture from the configuration""" + return model_info.config.get("architectures", "Unknown") + +def already_submitted_models(requested_models_dir: str) -> set[str]: + """Gather a list of already submitted models to avoid duplicates""" + depth = 1 + file_names = [] + users_to_submission_dates = defaultdict(list) + + for root, _, files in os.walk(requested_models_dir): + current_depth = root.count(os.sep) - requested_models_dir.count(os.sep) + if current_depth == depth: + for file in files: + if not file.endswith(".json"): + continue + with open(os.path.join(root, file), "r") as f: + info = json.load(f) + file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}") + + # Select organisation + if info["model"].count("/") == 0 or "submitted_time" not in info: + continue + organisation, _ = info["model"].split("/") + users_to_submission_dates[organisation].append(info["submitted_time"]) + + return set(file_names), users_to_submission_dates diff --git a/src/submission/submit.py b/src/submission/submit.py new file mode 100644 index 0000000000000000000000000000000000000000..cac6ea48e803a0af42dabe5226191c769dbec71d --- /dev/null +++ b/src/submission/submit.py @@ -0,0 +1,119 @@ +import json +import os +from datetime import datetime, timezone + +from src.display.formatting import styled_error, styled_message, styled_warning +from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO +from src.submission.check_validity import ( + already_submitted_models, + check_model_card, + get_model_size, + is_model_on_hub, +) + +REQUESTED_MODELS = None +USERS_TO_SUBMISSION_DATES = None + +def add_new_eval( + model: str, + base_model: str, + revision: str, + precision: str, + weight_type: str, + model_type: str, +): + global REQUESTED_MODELS + global 
USERS_TO_SUBMISSION_DATES + if not REQUESTED_MODELS: + REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH) + + user_name = "" + model_path = model + if "/" in model: + user_name = model.split("/")[0] + model_path = model.split("/")[1] + + precision = precision.split(" ")[0] + current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + if model_type is None or model_type == "": + return styled_error("Please select a model type.") + + # Does the model actually exist? + if revision == "": + revision = "main" + + # Is the model on the hub? + if weight_type in ["Delta", "Adapter"]: + base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True) + if not base_model_on_hub: + return styled_error(f'Base model "{base_model}" {error}') + + if not weight_type == "Adapter": + model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True) + if not model_on_hub: + return styled_error(f'Model "{model}" {error}') + + # Is the model info correctly filled? + try: + model_info = API.model_info(repo_id=model, revision=revision) + except Exception: + return styled_error("Could not get your model information. Please fill it up properly.") + + model_size = get_model_size(model_info=model_info, precision=precision) + + # Were the model card and license filled? + try: + license = model_info.cardData["license"] + except Exception: + return styled_error("Please select a license for your model") + + modelcard_OK, error_msg = check_model_card(model) + if not modelcard_OK: + return styled_error(error_msg) + + # Seems good, creating the eval + print("Adding new eval") + + eval_entry = { + "model": model, + "base_model": base_model, + "revision": revision, + "precision": precision, + "weight_type": weight_type, + "status": "PENDING", + "submitted_time": current_time, + "model_type": model_type, + "likes": model_info.likes, + "params": model_size, + "license": license, + "private": False, + } + + # Check for duplicate submission + if f"{model}_{revision}_{precision}" in REQUESTED_MODELS: + return styled_warning("This model has been already submitted.") + + print("Creating eval file") + OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}" + os.makedirs(OUT_DIR, exist_ok=True) + out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json" + + with open(out_path, "w") as f: + f.write(json.dumps(eval_entry)) + + print("Uploading eval file") + API.upload_file( + path_or_fileobj=out_path, + path_in_repo=out_path.split("eval-queue/")[1], + repo_id=QUEUE_REPO, + repo_type="dataset", + commit_message=f"Add {model} to eval queue", + ) + + # Remove the local file + os.remove(out_path) + + return styled_message( + "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list." + ) diff --git a/src/tasks.py b/src/tasks.py new file mode 100644 index 0000000000000000000000000000000000000000..9859da5c09cf6dbf56462d3f4ac0f0afe1a6c20f --- /dev/null +++ b/src/tasks.py @@ -0,0 +1,183 @@ +from dataclasses import dataclass +from enum import Enum + +@dataclass +class Task: + benchmark: str + # metric: str + accuracy: str + col_name: str + +NUM_FEWSHOT = 0 # Change with your few shot +# --------------------------------------------------- + +# Your leaderboard name +TITLE = """