Spaces:
Runtime error
Runtime error
| import json | |
| import os | |
| import re | |
| import uuid | |
| import random | |
| from pathlib import Path | |
| import pandas as pd | |
| import streamlit as st | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from datasets import load_dataset | |
| from huggingface_hub import CommitScheduler, hf_hub_download | |
| from huggingface_hub.utils import RepositoryNotFoundError | |
| from yaml import safe_load as yaml_load | |
| from src.check_validity import validate_model | |
| from src.task_mappings import professional_mapping, semantic_categories | |
# -----------------------------------------------------------------------------
# Page configuration and global CSS styles for modern look and improved UX
# -----------------------------------------------------------------------------
# Browser-tab title and icon, wide layout, sidebar expanded on first load.
st.set_page_config(
    page_icon="🌍",
    page_title="IberBench",
    initial_sidebar_state="expanded",
    layout="wide",
)

# Stylesheet injected once as raw HTML; it styles the page body, the sidebar,
# the `.main-header` banner used by every page, the tabs and the buttons.
_GLOBAL_CSS = """
<style>
/* General page styling */
body {
background-color: #f7f7f7;
font-family: 'Segoe UI', sans-serif;
}
/* Sidebar styling */
.css-1d391kg {
background-color: #ffffff;
border-right: 2px solid #eaeaea;
}
/* Header styling */
.main-header {
text-align: center;
padding: 2rem 0;
background: linear-gradient(90deg, #007BFF, #00BFFF);
color: white;
border-radius: 10px 10px 10px 10px;
}
/* Tab styling */
.stTabs > .css-1qimj2v {
background: #fff;
}
/* Form styling */
.stButton>button {
background-color: #007BFF;
color: white;
border: none;
border-radius: 5px;
}
</style>
"""
st.markdown(_GLOBAL_CSS, unsafe_allow_html=True)
# -----------------------------------------------------------------------------
# Global variables and helper functions
# -----------------------------------------------------------------------------
# Submissions are appended to a uniquely named JSON-lines file inside the
# folder that the commit scheduler below uploads.
request_folder = Path("user_request/")
request_file = request_folder / f"data_{uuid.uuid4()}.json"

# YAML file describing the supported languages (read by load_languages_set).
LANGUAGES_SETTINGS = Path("etc/languages_settings.yml")

# Metadata fields shown for every dataset, in display order.
dataset_columns = [
    "workshop", "shared_task", "year", "task_type", "language",
    "url", "language_variety", "problem_type", "num_labels", "labels",
]

# Columns of the results table that describe the model rather than a task.
model_columns = ["model_name", "model_type", "num_parameters"]

# Background uploader for the request folder.
# NOTE(review): `every` is documented by huggingface_hub as minutes — confirm.
scheduler = CommitScheduler(
    repo_id="iberbench/user-requests",
    repo_type="dataset",
    folder_path=request_folder,
    path_in_repo="data",
    every=10,
    private=True,
    token=st.secrets["HF_TOKEN"],
)
def log_submission(input_dict: dict) -> None:
    """Append `input_dict` to the request file as one JSON line.

    The write is performed while holding `scheduler.lock`
    (presumably so a scheduled upload never sees a half-written line).
    """
    with scheduler.lock, request_file.open("a") as sink:
        sink.write(json.dumps(input_dict))
        sink.write("\n")
def get_lang_columns(columns: list, lang: str):
    """Select the column names that belong to `lang`.

    For a "Mixed" language only the first word of `lang` (lower-cased) is used
    and a column must *end* with it, which leaves out columns carrying a
    variety suffix. Otherwise `lang` is lower-cased, spaces become underscores,
    and any column *containing* that text is selected.
    """
    if "Mixed" not in lang:
        needle = lang.lower().replace(" ", "_")
        return [name for name in columns if needle in name]

    suffix = lang.lower().split(" ")[0]
    return [name for name in columns if name.endswith(suffix)]
def load_data(lang) -> pd.DataFrame:
    """Load the evaluation results and keep only the columns for `lang`.

    Reads the "train" split of "iberbench/lm-eval-results", multiplies every
    non-model column by 100 (presumably fractions to percentages — confirm)
    and returns the model columns followed by the columns selected by
    `get_lang_columns`. On FileNotFoundError an error is shown and an empty
    DataFrame is returned.
    """
    try:
        results = load_dataset(
            "iberbench/lm-eval-results", token=st.secrets["HF_TOKEN"]
        )
        frame = results["train"].to_pandas()
        score_columns = [
            name for name in frame.columns if name not in model_columns
        ]
        selected = get_lang_columns(score_columns, lang)
        frame[score_columns] = frame[score_columns] * 100
        return frame[model_columns + selected]
    except FileNotFoundError:
        st.error("iberbench/lm-eval-results was not found in the hub 😕")
        return pd.DataFrame()
def load_dataset_card(task) -> list:
    """Return the metadata values of dataset `task`, ordered as `dataset_columns`.

    Downloads "task_metadata.json" from the dataset repo "iberbench/<task>".
    A field missing from the file becomes "-" (or an empty list for "labels").
    When the repository or the metadata file cannot be found, an error is
    shown and a row made only of "-" placeholders is returned.
    """
    # Imported here so the function is self-contained; the module-level import
    # block only brings in RepositoryNotFoundError.
    from huggingface_hub.utils import EntryNotFoundError

    name_repo = "iberbench/" + task
    placeholder_row = ["-"] * len(dataset_columns)
    try:
        info_path = hf_hub_download(
            repo_id=name_repo,
            filename="task_metadata.json",
            repo_type="dataset",
        )
    except RepositoryNotFoundError:
        st.error(task + ": dataset was not found in the hub 🚫")
        return placeholder_row
    except EntryNotFoundError:
        # The repo exists but has no metadata file: degrade to placeholders
        # instead of crashing the whole page.
        st.error(task + ": task_metadata.json was not found in the dataset 🚫")
        return placeholder_row

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(info_path, "r", encoding="utf-8") as f:
        info = json.load(f)

    return [
        info[column] if column in info else ([] if column == "labels" else "-")
        for column in dataset_columns
    ]
def active_data(lang) -> pd.DataFrame:
    """Return a copy of the rows of the `lang` leaderboard whose "Active" is True."""
    board = st.session_state[f"leaderboard_data_{lang}"]
    is_active = board["Active"] == True  # noqa: E712 - element-wise comparison
    return board[is_active].copy()
def get_index(lang, row) -> pd.Series:
    """Return the `.name` of the row at position `row` among the active rows.

    NOTE(review): for an integer `row` this is the row's index label, not a
    Series as the annotation says — confirm and adjust the annotation.
    """
    selected_row = active_data(lang).iloc[row]
    return selected_row.name
def commit(lang) -> None:
    """Write the edits stored in session state back into the `lang` leaderboard.

    `edited_data_<lang>["edited_rows"]` maps a row position (within the active
    rows) to a `{column: new_value}` mapping; each value is stored in
    `leaderboard_data_<lang>` at the matching index label.
    """
    edited_rows = st.session_state[f"edited_data_{lang}"]["edited_rows"]
    for position in edited_rows:
        label = get_index(lang, position)
        for column, new_value in edited_rows[position].items():
            board = st.session_state[f"leaderboard_data_{lang}"]
            board.at[label, column] = new_value
# -----------------------------------------------------------------------------
# Visualization helper functions
# -----------------------------------------------------------------------------
def _medal_ranks(scores: pd.Series) -> list:
    """Return dense-rank labels (highest score first), with medals for the top 3.

    A missing score gets "-" instead of raising on the integer conversion.
    """
    medals = {1: " 🥇", 2: " 🥈", 3: " 🥉"}
    labels = []
    for rank in scores.rank(method="dense", ascending=False):
        if pd.isna(rank):
            labels.append("-")
            continue
        position = int(rank)
        labels.append(f"{position}{medals.get(position, '')}")
    return labels


def _show_ranked_table(df: pd.DataFrame) -> None:
    """Render `df` sorted by "Mean" (descending) with a leading "Rank" column."""
    # drop() returns a new frame, so the caller's frame is never modified and a
    # pre-existing "Rank" column cannot make insert() fail.
    ranked = df.drop(columns="Rank", errors="ignore")
    ranked.insert(0, "Rank", _medal_ranks(ranked["Mean"]))
    st.dataframe(
        ranked.sort_values("Mean", ascending=False),
        hide_index=True,
        use_container_width=True,
        column_config={
            "model_name": st.column_config.TextColumn("Model 🧠"),
            "model_type": st.column_config.TextColumn("Type 📌"),
            "num_parameters": st.column_config.NumberColumn("Model Size 🔢"),
        },
    )


def create_table_results(df_mean: pd.DataFrame):
    """Show the ranked results table for one language.

    `df_mean` must contain a "Mean" column; it is left unmodified.
    """
    _show_ranked_table(df_mean)


def create_table_all_results(aggregated_df: pd.DataFrame):
    """Show the ranked all-language table with one extra column per language.

    `aggregated_df` holds one row per model with its overall "Mean". The
    per-language means come from `create_data_results_per_language()` and are
    matched to rows by "model_name", so the result does not depend on row
    order or on both frames listing the same models. `aggregated_df` is left
    unmodified.
    """
    table = aggregated_df.copy()
    per_language = create_data_results_per_language()
    # The helper may yield nothing when no data is loaded; in that case the
    # table is shown without the per-language columns.
    if per_language is not None and not per_language.empty:
        df_lang = per_language.pivot(
            index="model_name", columns="language", values="Mean"
        )
        for language in df_lang.columns:
            table[language] = table["model_name"].map(df_lang[language])
    _show_ranked_table(table)
def create_scatter_chart(df: pd.DataFrame, id_: str):
    """Plot "Mean" against "num_parameters", one colored marker per model.

    Marker size follows "num_parameters" and "model_type" is shown on hover.
    The element key is `id_` plus a random suffix.
    """
    scatter = px.scatter(
        data_frame=df,
        x="num_parameters",
        y="Mean",
        size="num_parameters",
        color="model_name",
        hover_data=["model_type"],
        labels={"num_parameters": "Num parameters"},
    )
    scatter.update_layout(template="plotly_white")
    element_key = id_ + str(random.random())
    st.plotly_chart(scatter, use_container_width=True, key=element_key)
def create_radar_chart(df: pd.DataFrame, id_: str):
    """Draw a filled polar line chart of the ten highest "Mean" values.

    Each angular position is a model name. The element key is `id_` plus a
    random suffix.
    """
    top_ten = df.sort_values(by="Mean", ascending=False).head(10)
    radar_df = pd.DataFrame(
        {"r": top_ten["Mean"], "theta": top_ten["model_name"]}
    )
    radar = px.line_polar(
        radar_df, r="r", theta="theta", markers=True, line_close=True
    )
    radar.update_traces(fill="toself")
    element_key = id_ + str(random.random())
    st.plotly_chart(radar, use_container_width=True, key=element_key)
def create_pie_chart(df: pd.DataFrame, id_: str):
    """Draw a pie chart with the number of rows per "model_type".

    The element key is `id_` plus a random suffix.
    """
    type_counts = df["model_type"].value_counts().reset_index()
    type_counts.columns = ["model_type", "count"]
    pie = px.pie(
        type_counts,
        names="model_type",
        values="count",
        labels={"model_type": "Model type"},
    )
    element_key = id_ + str(random.random())
    st.plotly_chart(pie, use_container_width=True, key=element_key)
def create_box_plot(df: pd.DataFrame, id_: str):
    """Draw a box plot of "Mean" grouped by "model_type", showing every point.

    The element key is `id_` plus a random suffix.
    """
    axis_labels = {"model_type": "Model type"}
    box = px.box(
        df, x="model_type", y="Mean", points="all", labels=axis_labels
    )
    element_key = id_ + str(random.random())
    st.plotly_chart(box, use_container_width=True, key=element_key)
def get_summary_df(lang: str, task_types: list) -> pd.DataFrame:
    """Summarise the `lang` leaderboard as one score per semantic category.

    Args:
        lang: language whose `leaderboard_data_<lang>` frame is summarised.
        task_types: keys of `semantic_categories` to summarise.

    Returns:
        A frame with the model columns, a "Mean" column at position 3 and one
        column per category that has at least one task column in the
        leaderboard. "Mean" averages *all* category columns (the previous
        slice `3:-1` silently left the last category out); it is NaN when no
        category column exists.
    """
    board = st.session_state[f"leaderboard_data_{lang}"]
    # reindex() tolerates the column-less empty frame that load_data returns
    # when the results dataset could not be loaded.
    df = board.reindex(columns=model_columns).copy()

    category_columns = []
    if not board.empty:
        for t in task_types:
            task_list = semantic_categories[t]
            cols = [
                col for col in board.columns if "iberbench/" + col in task_list
            ]
            if cols:
                df[t] = board[cols].mean(axis=1).round(2)
                if t not in category_columns:
                    category_columns.append(t)

    if category_columns:
        mean = df[category_columns].mean(axis=1).round(2)
    else:
        mean = pd.Series(float("nan"), index=df.index, dtype="float64")
    df.insert(3, "Mean", mean)
    return df
def get_all_languages_summary_df() -> pd.DataFrame:
    """Stack the per-language summaries of every loaded leaderboard.

    Every session-state key starting with "leaderboard_data_" contributes the
    frame produced by `get_summary_df`, tagged with a "language" column.

    NOTE: a function with this same name is defined again further down in
    this module; the later definition is the one in effect at call time.
    """
    prefix = "leaderboard_data_"
    stacked = pd.DataFrame()
    for state_key in st.session_state:
        if not state_key.startswith(prefix):
            continue
        language = state_key.split(prefix)[1]
        categories = select_task_per_language(language)
        language_summary = get_summary_df(language, categories)
        language_summary["language"] = language
        stacked = pd.concat([stacked, language_summary], ignore_index=True)
    return stacked
def create_results_visualization_lang(lang: str):
    """Render the ranking table and the five plot tabs for one language."""
    summary_df = get_summary_df(lang, select_task_per_language(lang))
    # Raw per-task scores, copied so the plots never touch session state.
    tasks_df = st.session_state[f"leaderboard_data_{lang}"].copy()

    # Results table for the selected language.
    create_table_results(summary_df)

    # ---------------------------
    # In-language plots section
    # ---------------------------
    st.markdown("### Language plots 📊")
    # (tab title, drawing function, data, key suffix) in display order.
    tab_specs = [
        ("Top 10 performance 🥇", create_radar_chart, summary_df, "in_radar"),
        ("Performance vs. size 📏", create_scatter_chart, summary_df, "in_scatter"),
        ("Performance per type 💡", create_box_plot, summary_df, "in_box"),
        (
            "Fundamental vs industry ⚖️",
            create_box_plot_per_task_category,
            tasks_df,
            "in_box_task_cat",
        ),
        (
            "Performance per task category 📈",
            create_box_plot_per_semantic_category,
            tasks_df,
            "in_box_sem_cat",
        ),
    ]
    in_lang_tabs = st.tabs([spec[0] for spec in tab_specs])
    for tab, (_, draw, frame, suffix) in zip(in_lang_tabs, tab_specs):
        with tab:
            draw(frame, lang + suffix)
# -----------------------------------------------------------------------------
# Functions for other visualization sections
# -----------------------------------------------------------------------------
def select_task_per_language(lang: str):
    """Return the semantic categories that have a task column for `lang`.

    A category from `semantic_categories` is selected when at least one of its
    task ids, with the "iberbench/" prefix removed, is a column of
    `leaderboard_data_<lang>`. Categories keep the mapping's order.
    """
    # Built once: the column set is the same for every category and task.
    available = set(st.session_state[f"leaderboard_data_{lang}"].columns)
    return [
        category
        for category, tasks in semantic_categories.items()
        if any(task.removeprefix("iberbench/") in available for task in tasks)
    ]
def create_dataset_info_per_language(lang: str):
    """Show a table with the metadata of every task evaluated for `lang`.

    One row is produced per non-model column of `leaderboard_data_<lang>`
    (every task column — the previous `cols[:-1]` slice dropped the last one).
    When the leaderboard is empty a short notice is written instead.
    """
    board = st.session_state[f"leaderboard_data_{lang}"]
    if board.empty:
        st.write("No data found to display on leaderboard 😔.")
        return

    task_columns = [col for col in board.columns if col not in model_columns]
    all_values = [load_dataset_card(task) for task in task_columns]
    df = pd.DataFrame(all_values, columns=dataset_columns)

    # (field, column type, header, tooltip) for each displayed column.
    text, number, listing = (
        st.column_config.TextColumn,
        st.column_config.NumberColumn,
        st.column_config.ListColumn,
    )
    column_specs = [
        ("workshop", text, "Workshop 🏫", "Workshop to belong to the shared task"),
        ("shared_task", text, "Shared Task 📋", "Shared Task name"),
        ("year", text, "Year 📅", "Year of the shared task"),
        ("task_type", text, "Task Type 🔖", "Shared Task type"),
        ("language", text, "Language 🌐", "Shared Task language"),
        ("url", listing, "Task URL 🔗", "Shared Task url"),
        (
            "language_variety",
            text,
            "Language Variety 🗣️",
            "Shared Task language variety",
        ),
        ("problem_type", text, "Problem Type ❓", "Shared Task problem type"),
        (
            "num_labels",
            number,
            "Number of Labels 🔢",
            "Shared Task number of labels",
        ),
        ("labels", listing, "Labels 🏷️", "Shared Task labels"),
    ]
    st.dataframe(
        df,
        column_config={
            field: factory(header, help=tooltip)
            for field, factory, header, tooltip in column_specs
        },
        hide_index=True,
    )
def _box_plot_by_mapping(df: pd.DataFrame, id_: str, mapping: dict) -> None:
    """Draw one box per category of `mapping`, built from the task columns of `df`.

    `mapping` maps a category name to a collection of "iberbench/<task>" ids.
    A category's value for a row is the mean (rounded to 2 decimals) of the
    columns of `df` whose prefixed name is in that collection; categories
    without any matching column are left out. Works on a copy, so `df` is not
    modified, and categories are plotted in the mapping's own order.
    """
    frame = df.copy()
    melt_vars = []
    for category, tasks in mapping.items():
        relevant_cols = [
            col for col in df.columns if "iberbench/" + col in tasks
        ]
        if relevant_cols:
            frame[category] = df[relevant_cols].mean(axis=1).round(2)
            melt_vars.append(category)

    id_vars = model_columns.copy()
    if "language" in frame.columns:
        id_vars.append("language")
    df_melt = frame.melt(
        id_vars=id_vars,
        value_vars=melt_vars,
        var_name="Task Category",
        value_name="Performance",
    )
    fig = px.box(
        df_melt,
        x="Task Category",
        y="Performance",
        points="all",
        labels={"Performance": "Performance (%)"},
    )
    st.plotly_chart(
        fig, use_container_width=True, key=id_ + str(random.random())
    )


def create_box_plot_per_task_category(df: pd.DataFrame, id_: str):
    """Box plot of performance per category of `professional_mapping`."""
    _box_plot_by_mapping(df, id_, professional_mapping)


def create_box_plot_per_semantic_category(df: pd.DataFrame, id_: str):
    """Box plot of performance per category of `semantic_categories`."""
    _box_plot_by_mapping(df, id_, semantic_categories)
def create_histogram(df: pd.DataFrame, id_: str):
    """Draw a 20-bin histogram of "num_parameters".

    The element key is `id_` plus a random suffix.
    """
    axis_labels = {"num_parameters": "Num parameters", "count": "Count"}
    histogram = px.histogram(
        df, x="num_parameters", labels=axis_labels, nbins=20
    )
    histogram.update_layout(template="plotly_white")
    element_key = id_ + str(random.random())
    st.plotly_chart(histogram, use_container_width=True, key=element_key)
def create_data_results_per_language() -> pd.DataFrame:
    """Combine every loaded leaderboard and make sure a "Mean" column exists.

    Each session-state entry whose key starts with "leaderboard_data_" is
    copied and tagged with a "language" column (taken from the key) when it
    has none. If the combined frame has no "Mean" column, it is computed as
    the row mean of the numeric columns other than the model columns,
    "language" and "Active".

    Returns:
        Always a DataFrame, as annotated. When nothing is loaded, a warning is
        shown and an empty frame with the model columns plus "language" and
        "Mean" is returned. When there is no numeric task column, a warning is
        shown and "Mean" is NaN.
    """
    prefix = "leaderboard_data_"
    frames = []
    for key in st.session_state.keys():
        if key.startswith(prefix):
            temp_df = st.session_state[key].copy()
            # Without a "language" column, the key names the language.
            if "language" not in temp_df.columns:
                temp_df["language"] = key.split(prefix)[1]
            frames.append(temp_df)

    combined_df = (
        pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    )
    if combined_df.empty:
        st.warning("No data available for any language ⚠️.")
        return pd.DataFrame(columns=model_columns + ["language", "Mean"])

    if "Mean" not in combined_df.columns:
        # Metadata, language and non-numeric columns are not performance scores.
        excluded = set(model_columns) | {"language", "Active"}
        performance_cols = [
            col
            for col in combined_df.columns
            if col not in excluded
            and pd.api.types.is_numeric_dtype(combined_df[col])
        ]
        if performance_cols:
            combined_df["Mean"] = (
                combined_df[performance_cols].mean(axis=1).round(2)
            )
        else:
            st.warning(
                "No numeric task performance columns available to compute 'Mean' ⚠️."
            )
            combined_df["Mean"] = float("nan")
    return combined_df
def create_box_plot_per_language(id_: str):
    """Draw a box plot of "Mean" performance per language.

    Nothing is drawn when `create_data_results_per_language()` yields no data;
    that helper already shows a warning in that case.
    """
    combined_df = create_data_results_per_language()
    if combined_df is None or combined_df.empty:
        return
    fig = px.box(
        combined_df,
        x="language",
        y="Mean",
        points="all",
        labels={"language": "Language", "Mean": "Performance (%)"},
    )
    st.plotly_chart(
        fig, use_container_width=True, key=id_ + str(random.random())
    )
def get_all_languages_summary_df() -> pd.DataFrame:
    """Stack the per-language summaries of every loaded leaderboard.

    Every session-state key starting with "leaderboard_data_" contributes the
    frame produced by `get_summary_df`, tagged with a "language" column.

    NOTE: this repeats a definition of the same name that appears earlier in
    this module; being the later one, this is the definition in effect.
    """
    key_prefix = "leaderboard_data_"
    result = pd.DataFrame()
    language_keys = (
        name for name in st.session_state if name.startswith(key_prefix)
    )
    for name in language_keys:
        language = name.split(key_prefix)[1]
        language_summary = get_summary_df(
            language, select_task_per_language(language)
        )
        language_summary["language"] = language
        result = pd.concat([result, language_summary], ignore_index=True)
    return result
def get_all_languages_aggregated_summary_df() -> pd.DataFrame:
    """Collapse the all-language summary to one row per model.

    Per "model_name": the first "model_type", the mean "num_parameters" and
    the mean "Mean" (rounded to 2 decimals). The result feeds the radar,
    scatter, pie, box and histogram plots.
    """
    per_language = get_all_languages_summary_df()
    how_to_aggregate = {
        "model_type": "first",
        "num_parameters": "mean",
        "Mean": "mean",
    }
    per_model = per_language.groupby("model_name", as_index=False).agg(
        how_to_aggregate
    )
    per_model["Mean"] = per_model["Mean"].round(2)
    return per_model
def get_all_languages_raw_df() -> pd.DataFrame:
    """Stack the raw leaderboards of all languages, tagging each with "language".

    Used by the plots that need the original task columns (e.g. Fundamental
    vs Professional).
    """
    key_prefix = "leaderboard_data_"
    stacked = pd.DataFrame()
    for state_key in st.session_state:
        if not state_key.startswith(key_prefix):
            continue
        board = st.session_state[state_key].copy()
        board["language"] = state_key.split(key_prefix)[1]
        stacked = pd.concat([stacked, board], ignore_index=True)
    return stacked
# -----------------------------------------------------------------------------
# Sidebar for Navigation and Global Settings
# -----------------------------------------------------------------------------
st.sidebar.markdown(
    "<h2 style='text-align: center;'>IberBench 🌍</h2>", unsafe_allow_html=True
)
# The selected entry decides which page the rest of the script renders.
# The widget gets a real label that is hidden visually, instead of an empty
# string, which Streamlit warns about for accessibility reasons.
menu = st.sidebar.radio(
    "Navigation",
    ["Leaderboard 📊", "Submit Model 🚀", "Datasets 📚", "About ℹ️"],
    label_visibility="collapsed",
)
st.sidebar.markdown("---")
st.sidebar.markdown(
    """
<p style="font-size:0.9rem; text-align:center;">
A leaderboard of LLMs on languages from the Iberian Peninsula and Ibero-America
</p>
""",
    unsafe_allow_html=True,
)
def load_languages_set():
    """Parse the YAML file at `LANGUAGES_SETTINGS` and return its content."""
    with LANGUAGES_SETTINGS.open("r") as settings_file:
        settings = yaml_load(settings_file)
    return settings
lang_set = load_languages_set()
# Fill the session cache once per language. The results are fetched only when
# the entry is missing, so reruns of the script do not reload the dataset for
# languages that are already cached.
for lang in lang_set:
    if f"leaderboard_data_{lang}" not in st.session_state:
        data = load_data(lang)
        st.session_state[f"leaderboard_data_{lang}"] = data
# -----------------------------------------------------------------------------
# Main Content based on Navigation
# -----------------------------------------------------------------------------
def _languages_in_category(category: str) -> list:
    """Return the names in `lang_set` whose "category" equals `category`."""
    return [
        name
        for name, settings in lang_set.items()
        if settings["category"] == category
    ]


def _render_for_language(lang_choice: str, render) -> None:
    """Call `render(language)` for the selected language.

    "Spanish" is expanded into one tab per language of the
    "Spanish Variations languages" category; any other choice is rendered
    directly.
    """
    if lang_choice == "Spanish":
        variations = _languages_in_category("Spanish Variations languages")
        tabs_var = st.tabs(variations)
        for var, tab in zip(variations, tabs_var):
            with tab:
                render(var)
    else:
        render(lang_choice)


if menu == "Leaderboard 📊":
    st.markdown(
        "<div class='main-header'><h1>Leaderboard 📊</h1></div>",
        unsafe_allow_html=True,
    )
    lang_iber = _languages_in_category("Iberian Peninsula languages")
    st.markdown("### General ranking 🏆")
    # ---------------------------
    # All-language plots section
    # ---------------------------
    # Aggregated data: each model appears once with values averaged over languages.
    aggregated_df = get_all_languages_aggregated_summary_df()
    create_table_all_results(aggregated_df)
    st.markdown("### General plots 📊")
    # Raw data keeps the original task columns needed by the category plots.
    raw_all_df = get_all_languages_raw_df()
    all_lang_tabs = st.tabs(
        [
            "Top 10 performance 🥇",
            "Performance vs. size 📏",
            "Type distribution 🎨",
            "Performance per type 💡",
            "Distribution of sizes 📊",
            "Fundamental vs industry ⚖️",
            "Performance per task category 📈",
            "Performance per language 🌐",
        ]
    )
    with all_lang_tabs[0]:
        create_radar_chart(aggregated_df, "all_radar")
    with all_lang_tabs[1]:
        create_scatter_chart(aggregated_df, "all_scatter")
    with all_lang_tabs[2]:
        create_pie_chart(aggregated_df, "all_pie")
    with all_lang_tabs[3]:
        create_box_plot(aggregated_df, "all_box")
    with all_lang_tabs[4]:
        create_histogram(aggregated_df, "all_hist")
    with all_lang_tabs[5]:
        create_box_plot_per_task_category(raw_all_df, "all_box_task_cat")
    with all_lang_tabs[6]:
        create_box_plot_per_semantic_category(raw_all_df, "all_box_sem_cat")
    with all_lang_tabs[7]:
        create_box_plot_per_language("all_box_language")
    # Results per language
    st.markdown("---")
    st.markdown("### Language ranking 🏆")
    lang_choice = st.selectbox(
        "Select a language 🌐:", list(lang_iber), key="lang_leaderboard"
    )
    _render_for_language(lang_choice, create_results_visualization_lang)
elif menu == "Submit Model 🚀":
    st.markdown(
        "<div class='main-header'><h1>Submit Your Model 🚀</h1></div>",
        unsafe_allow_html=True,
    )
    st.markdown("## How to submit a model 📤")
    # CSS for the guide cards
    st.markdown(
        """
<style>
.card-container {
max-width: 300px;
margin: auto;
text-align: left;
font-size: 1rem;
padding: 0.5rem;
box-sizing: border-box;
}
.id-container {
display: flex;
align-items: center;
margin-bottom: 1rem;
}
.id-circle {
width: 32px;
height: 32px;
border-radius: 50%;
display: flex;
align-items: center;
justify-content: center;
border: 1px solid #007BFF;
color: #007BFF;
font-size: 0.875rem;
font-weight: 600;
background-color: transparent;
margin-right: 8px;
}
.guide-content {
word-wrap: break-word;
}
.guide-title {
font-weight: bold;
font-size: 1rem;
margin-left: 8px;
}
</style>
""",
        unsafe_allow_html=True,
    )

    def render_card(content):
        """Wrap `content` (HTML) in the card markup styled above."""
        html = f"""
<div class="card-container">
<div class="guide-content">
{content}
</div>
</div>
"""
        return html

    # Load the guide cards: every file of assets/html, in file-name order.
    guide_info_list = []
    html_path = "assets/html"
    filenames = sorted(os.listdir(html_path))
    for filename in filenames:
        file_path = os.path.join(html_path, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            raw_html = file.read()
        guide_info_list.append(raw_html)
    # Lay the cards out on a 3 x 2 grid, filled row by row.
    num_columns = 3
    num_rows = 2
    for row in range(num_rows):
        cols = st.columns(num_columns)
        for col in range(num_columns):
            index = row * num_columns + col
            if index < len(guide_info_list):
                with cols[col]:
                    st.markdown(
                        render_card(guide_info_list[index]),
                        unsafe_allow_html=True,
                    )
    st.markdown("## Submission form 📝")
    with st.form("submit_model_form", clear_on_submit=True):
        model_name = st.text_input(
            "Model Name (format: user_name/model_name) 🧩",
            help="Your model should be public on the Hub and follow the username/model-id format (e.g. mistralai/Mistral-7B-v0.1).",
        )
        description = st.text_area(
            "Description ✍️",
            help="Add a description of the proposed model for the evaluation to help prioritize its evaluation.",
        )
        user_contact = st.text_input(
            "Your Contact Email 📧",
            help="User e-mail to contact when there are updates.",
        )
        precision_option = st.selectbox(
            "Choose precision format 🔢:",
            help="Size limits vary by precision. Choose carefully as incorrect precision can cause evaluation errors.",
            options=["float16", "bfloat16", "8bit", "4bit", "GPTQ"],
            index=0,
        )
        weight_type_option = st.selectbox(
            "Select weight type ⚖️:",
            help="Original: Complete model weights. Delta: Differences from base model. Adapter: Lightweight fine-tuning layers.",
            options=["Original", "Adapter", "Delta"],
            index=0,
        )
        base_model_name = st.text_input(
            "Base model (if applicable) 🏗️",
            help="Required for delta weights or adapters. This helps calculate total parameter count.",
            value="",
        )
        model_type = st.selectbox(
            "Choose model type 🔍:",
            help="🟢 Pretrained: Base models, 🔶 Fine-tuned: Domain-specific, 💬 Chat: Conversational, 🤝 Merge: Combined weights.",
            options=["🟢 Pretrained", "🔶 Fine-tuned", "💬 Chat", "🤝 Merge"],
        )
        submit_button = st.form_submit_button("Submit Request 🚀")
        if submit_button:
            # Only chat models are validated with a chat template.
            use_chat_template = model_type == "💬 Chat"
            validation_error = validate_model(
                model_name,
                precision_option,
                base_model_name,
                weight_type_option,
                use_chat_template,
            )
            if validation_error is not None:
                st.error(validation_error)
            # fullmatch: the whole input must be an address. A prefix match
            # would accept trailing junk such as "a@b.c@d".
            elif not re.fullmatch(r"[^@]+@[^@]+\.[^@]+", user_contact):
                st.error("Invalid email address ⚠️.")
            else:
                input_dict = {
                    "model_name": model_name,
                    "description": description,
                    "user_contact": user_contact,
                    "precision_option": precision_option,
                    "weight_type_option": weight_type_option,
                    "base_model_name": base_model_name,
                    "model_type": model_type,
                }
                try:
                    log_submission(input_dict)
                    st.success("Your request has been sent successfully 🎉.")
                except Exception as e:
                    # Page boundary: report the failure instead of crashing.
                    st.error(
                        f"Failed to send your request: {e}. Please try again later."
                    )
elif menu == "Datasets 📚":
    st.markdown(
        "<div class='main-header'><h1>Dataset Information 📚</h1></div>",
        unsafe_allow_html=True,
    )
    st.markdown("### Check the datasets 🔍")
    lang_iber = _languages_in_category("Iberian Peninsula languages")
    lang_choice = st.selectbox(
        "Select a language 🌐:", list(lang_iber), key="lang_dataset"
    )
    _render_for_language(lang_choice, create_dataset_info_per_language)
    st.markdown("### Task mappings 🔄")
    st.markdown(
        "For the sake of completeness, here we show the mappings we use in the leaderboard to aggregate tasks."
    )
    tab1, tab2 = st.tabs(
        ["Semantic categories 🗂️", "Fundamental vs. Industry ⚖️"]
    )
    # Both mappings are displayed without the "iberbench/" prefix.
    with tab1:
        st.json(
            {
                category: [task.removeprefix("iberbench/") for task in tasks]
                for category, tasks in semantic_categories.items()
            }
        )
    with tab2:
        st.json(
            {
                category: [task.removeprefix("iberbench/") for task in tasks]
                for category, tasks in professional_mapping.items()
            }
        )
elif menu == "About ℹ️":
    st.markdown(
        "<div class='main-header'><h1>About ℹ️</h1></div>",
        unsafe_allow_html=True,
    )
    # Explicit UTF-8, consistent with how the HTML assets are read.
    with open("./assets/md/about.md", "r", encoding="utf-8") as fr:
        st.markdown(fr.read(), unsafe_allow_html=True)