from functools import reduce
import json
import os
import re

from datasets import load_dataset
import gradio as gr
from huggingface_hub import hf_hub_download
from huggingface_hub.repocard import metadata_load
import pandas as pd
from tqdm.autonotebook import tqdm

from utils.model_size import get_model_parameters_memory
from envs import LEADERBOARD_CONFIG, MODEL_META, REPO_ID, RESULTS_REPO, API

TASKS_CONFIG = LEADERBOARD_CONFIG["tasks"]
BOARDS_CONFIG = LEADERBOARD_CONFIG["boards"]

TASKS = list(TASKS_CONFIG.keys())
PRETTY_NAMES = {
    "InstructionRetrieval": "Retrieval w/Instructions",
    "PairClassification": "Pair Classification",
    "BitextMining": "Bitext Mining",
}

TASK_TO_METRIC = {k: v["metric"] for k, v in TASKS_CONFIG.items()}
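# The actual metric names come from LEADERBOARD_CONFIG; illustratively this maps
# something like {"Classification": "accuracy", "Retrieval": "ndcg_at_10"}.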


def make_clickable_model(model_name, link=None):
    if link is None:
        link = "https://huggingface.co/" + model_name
    return (
        f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name.split("/")[-1]}</a>'
    )
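# For example, make_clickable_model("intfloat/e5-large") renders as
# '<a target="_blank" style="text-decoration: underline" href="https://huggingface.co/intfloat/e5-large">e5-large</a>'.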


EXTERNAL_MODELS = {k for k, v in MODEL_META["model_meta"].items() if v.get("is_external", False)}
EXTERNAL_MODEL_TO_LINK = {k: v["link"] for k, v in MODEL_META["model_meta"].items() if v.get("link", False)}
EXTERNAL_MODEL_TO_DIM = {k: v["dim"] for k, v in MODEL_META["model_meta"].items() if v.get("dim", False)}
EXTERNAL_MODEL_TO_SEQLEN = {k: v["seq_len"] for k, v in MODEL_META["model_meta"].items() if v.get("seq_len", False)}
EXTERNAL_MODEL_TO_SIZE = {k: v["size"] for k, v in MODEL_META["model_meta"].items() if v.get("size", False)}
PROPRIETARY_MODELS = {k for k, v in MODEL_META["model_meta"].items() if v.get("is_proprietary", False)}
TASK_DESCRIPTIONS = {k: v["task_description"] for k, v in TASKS_CONFIG.items()}
TASK_DESCRIPTIONS["Overall"] = "Overall performance across MTEB tasks."
SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = {k for k, v in MODEL_META["model_meta"].items() if v.get("is_sentence_transformers_compatible", False)}
MODELS_TO_SKIP = MODEL_META["models_to_skip"]
CROSS_ENCODERS = MODEL_META["cross_encoders"]
BI_ENCODERS = [k for k, _ in MODEL_META["model_meta"].items() if k not in CROSS_ENCODERS + ["bm25"]]

# Rebuild the model-name sets as clickable links, since the tables use the linked form as row labels.
PROPRIETARY_MODELS = {
    make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))
    for model in PROPRIETARY_MODELS
}
SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = {
    make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))
    for model in SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS
}
CROSS_ENCODERS = {
    make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))
    for model in CROSS_ENCODERS
}
BI_ENCODERS = {
    make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))
    for model in BI_ENCODERS
}


TASK_TO_TASK_TYPE = {task_category: [] for task_category in TASKS}
for board_config in BOARDS_CONFIG.values():
    for task_category, task_list in board_config["tasks"].items():
        TASK_TO_TASK_TYPE[task_category].extend(task_list)


def add_lang(examples):
    if not examples["eval_language"]:
        examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"]
    else:
        examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"] + f' ({examples["eval_language"]})'
    return examples
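# Example: a row {"mteb_dataset_name": "STS22", "eval_language": "en"} gains
# "mteb_dataset_name_with_lang": "STS22 (en)"; rows without an eval_language keep the bare name.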


def norm(names):
    return {name.split(" ")[0] for name in names}
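# Inverse of the suffixing above, e.g. norm(["STS22 (en)", "STS22"]) == {"STS22"}.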


def add_task(examples):
    task_name = examples["mteb_dataset_name"]
    task_type = None
    for task_category, task_list in TASK_TO_TASK_TYPE.items():
        if task_name in norm(task_list):
            task_type = task_category
            break
    if task_type is not None:
        examples["mteb_task"] = task_type
    else:
        print("WARNING: Task not found for dataset", examples["mteb_dataset_name"])
        examples["mteb_task"] = "Unknown"
    return examples


if os.path.exists("EXTERNAL_MODEL_RESULTS.json"):
    with open("EXTERNAL_MODEL_RESULTS.json") as f:
        EXTERNAL_MODEL_RESULTS = json.load(f)
    # Only fetch models that are not cached yet
    models_to_run = []
    for model in EXTERNAL_MODELS:
        if model not in EXTERNAL_MODEL_RESULTS:
            models_to_run.append(model)
            EXTERNAL_MODEL_RESULTS[model] = {k: {v: []} for k, v in TASK_TO_METRIC.items()}
else:
    EXTERNAL_MODEL_RESULTS = {model: {k: {v: []} for k, v in TASK_TO_METRIC.items()} for model in EXTERNAL_MODELS}
    models_to_run = EXTERNAL_MODELS

pbar = tqdm(models_to_run, desc="Fetching external model results")
for model in pbar:
    pbar.set_description(f"Fetching external model results for {model!r}")
    ds = load_dataset(RESULTS_REPO, model, trust_remote_code=True)
    ds = ds.map(add_lang)
    ds = ds.map(add_task)
    base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
    for task, metric in TASK_TO_METRIC.items():
        ds_dict = ds.filter(lambda x: (x["mteb_task"] == task) and (x["metric"] == metric))["test"].to_dict()
        ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
        EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})

# Write the updated cache back to disk
with open("EXTERNAL_MODEL_RESULTS.json", "w") as f:
    json.dump(EXTERNAL_MODEL_RESULTS, f)


def get_dim_seq_size(model):
    filenames = [sib.rfilename for sib in model.siblings]
    dim, seq = "", ""
    for filename in filenames:
        if re.match(r"\d+_Pooling/config\.json", filename):
            st_config_path = hf_hub_download(model.modelId, filename=filename)
            with open(st_config_path) as f:
                dim = json.load(f).get("word_embedding_dimension", "")
            break
    for filename in filenames:
        if re.match(r"\d+_Dense/config\.json", filename):
            st_config_path = hf_hub_download(model.modelId, filename=filename)
            with open(st_config_path) as f:
                dim = json.load(f).get("out_features", dim)
    if "config.json" in filenames:
        config_path = hf_hub_download(model.modelId, filename="config.json")
        with open(config_path) as f:
            config = json.load(f)
        if not dim:
            dim = config.get("hidden_dim", config.get("hidden_size", config.get("d_model", "")))
        seq = config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", ""))))

    parameters, memory = get_model_parameters_memory(model)
    return dim, seq, parameters, memory
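# The lookup order follows sentence-transformers conventions: a *_Pooling/config.json gives the
# embedding dimension, a *_Dense/config.json may override it via out_features, and the root
# config.json supplies hidden size / max position embeddings as fallbacks.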


def make_datasets_clickable(df):
    """Turn the BornholmBitextMining column header into a dataset link (HTML headers may not render in all Gradio versions)."""
    if "BornholmBitextMining" in df.columns:
        link = "https://huggingface.co/datasets/strombergnlp/bornholmsk_parallel"
        df = df.rename(
            columns={"BornholmBitextMining": f'<a target="_blank" style="text-decoration: underline" href="{link}">BornholmBitextMining</a>'}
        )
    return df


def add_rank(df):
    cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens"]]
    if len(cols_to_rank) == 1:
        df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
    else:
        df.insert(len(df.columns) - len(cols_to_rank), "Average", df[cols_to_rank].mean(axis=1, skipna=False))
        df.sort_values("Average", ascending=False, inplace=True)
    df.insert(0, "Rank", list(range(1, len(df) + 1)))
    df = df.round(2)
    # Fill NaN only after averaging: with skipna=False a model missing any dataset gets an empty Average
    df.fillna("", inplace=True)
    return df
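# With several score columns this yields Rank | Model | <metadata> | Average | <dataset columns>,
# sorted by Average descending; with a single score column it sorts by that column alone.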


model_infos_path = "model_infos.json"
MODEL_INFOS = {}
if os.path.exists(model_infos_path):
    with open(model_infos_path) as f:
        MODEL_INFOS = json.load(f)


def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=True, task_to_metric=TASK_TO_METRIC, rank=True, refresh=True):
    global MODEL_INFOS
    api = API
    models = api.list_models(filter="mteb")
    # External models first: their scores come from the cached results fetched above
    df_list = []
    for model in EXTERNAL_MODEL_RESULTS:
        results_list = []
        for task in tasks:
            # Not all models have results for all tasks
            if task not in EXTERNAL_MODEL_RESULTS[model]:
                continue
            results_list += EXTERNAL_MODEL_RESULTS[model][task][task_to_metric[task]]

        if len(datasets) > 0:
            res = {k: v for d in results_list for k, v in d.items() if (k == "Model") or any([x in k for x in datasets])}
        elif langs:
            # Keep keys without a language suffix (then split()[-1] == k, e.g. "Model")
            # plus dataset names whose "(lang)" suffix matches one of the requested languages
            langs_format = [f"({lang})" for lang in langs]
            res = {k: v for d in results_list for k, v in d.items() if any([k.split(" ")[-1] in (k, x) for x in langs_format])}
        else:
            res = {k: v for d in results_list for k, v in d.items()}
        # A model needs at least one score besides the "Model" key to be included
        if len(res) > 1:
            if add_emb_dim:
                res["Model Size (Million Parameters)"] = EXTERNAL_MODEL_TO_SIZE.get(model, "")
                # fp32 memory footprint: parameters * 4 bytes, converted to GiB
                res["Memory Usage (GB, fp32)"] = round(res["Model Size (Million Parameters)"] * 1e6 * 4 / 1024**3, 2) if res["Model Size (Million Parameters)"] != "" else ""
                res["Embedding Dimensions"] = EXTERNAL_MODEL_TO_DIM.get(model, "")
                res["Max Tokens"] = EXTERNAL_MODEL_TO_SEQLEN.get(model, "")
            df_list.append(res)

    for model in models:
        if model.modelId in MODELS_TO_SKIP:
            continue
        print("MODEL", model.modelId)
        if model.modelId not in MODEL_INFOS or refresh:
            readme_path = hf_hub_download(model.modelId, filename="README.md")
            meta = metadata_load(readme_path)
            MODEL_INFOS[model.modelId] = {"metadata": meta}
        meta = MODEL_INFOS[model.modelId]["metadata"]
        if "model-index" not in meta:
            continue
        # meta["model-index"][0]["results"] is a list of entries shaped like (illustrative):
        # {
        #     "task": {"type": "Classification"},
        #     "dataset": {"name": "MTEB MassiveIntentClassification (nb)", "config": "nb", "split": "test"},
        #     "metrics": [{"type": "accuracy", "value": 39.81}, {"type": "f1", "value": 38.80}],
        # }
        if len(datasets) > 0:
            task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks) and any([x in sub_res.get("dataset", {}).get("name", "") for x in datasets])]
        elif langs:
            task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks) and (sub_res.get("dataset", {}).get("config", "default") in ("default", *langs))]
        else:
            task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks)]
        out = [{res["dataset"]["name"].replace("MTEB ", ""): [round(score["value"], 2) for score in res["metrics"] if score["type"] == task_to_metric.get(res["task"]["type"])][0]} for res in task_results]
        out = {k: v for d in out for k, v in d.items()}
        out["Model"] = make_clickable_model(model.modelId)
        # A model needs at least one score to be included
        if len(out) > 1:
            if add_emb_dim:
                try:
                    # get_dim_seq_size may fail (e.g. missing config files); fall back to empty values
                    if "dim_seq_size" not in MODEL_INFOS[model.modelId] or refresh:
                        MODEL_INFOS[model.modelId]["dim_seq_size"] = list(get_dim_seq_size(model))
                    out["Embedding Dimensions"], out["Max Tokens"], out["Model Size (Million Parameters)"], out["Memory Usage (GB, fp32)"] = tuple(MODEL_INFOS[model.modelId]["dim_seq_size"])
                except Exception:
                    MODEL_INFOS[model.modelId]["dim_seq_size"] = "", "", "", ""
            df_list.append(out)
            if model.library_name == "sentence-transformers" or "sentence-transformers" in model.tags or "modules.json" in {file.rfilename for file in model.siblings}:
                SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS.add(out["Model"])

    # Persist the per-model metadata cache
    with open("model_infos.json", "w") as f:
        json.dump(MODEL_INFOS, f)

    df = pd.DataFrame(df_list)
    # Merge duplicate model rows, taking the first non-null value per column
    df = df.groupby("Model", as_index=False).first()

    cols = sorted(list(df.columns))
    base_columns = ["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens"]
    if len(datasets) > 0:
        # Only keep the dataset columns the caller asked for
        cols = [col for col in cols if col in base_columns + datasets]
    # Move the base (metadata) columns to the front, in their canonical order
    i = 0
    for column in base_columns:
        if column in cols:
            cols.insert(i, cols.pop(cols.index(column)))
            i += 1
    df = df[cols]
    if rank:
        df = add_rank(df)
    if fillna:
        df.fillna("", inplace=True)
    return df
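# Illustrative call: get_mteb_data(tasks=["Clustering"], langs=["en"]) yields one row per model
# with per-dataset scores plus, via add_rank, Rank and Average columns when several datasets match.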


def get_mteb_average(task_dict: dict, refresh=True):
    all_tasks = reduce(lambda x, y: x + y, task_dict.values())
    DATA_OVERALL = get_mteb_data(
        tasks=list(task_dict.keys()),
        datasets=all_tasks,
        fillna=False,
        add_emb_dim=True,
        rank=False,
        refresh=refresh,
    )

    DATA_OVERALL.insert(1, f"Average ({len(all_tasks)} datasets)", DATA_OVERALL[all_tasks].mean(axis=1, skipna=False))
    for i, (task_category, task_category_list) in enumerate(task_dict.items()):
        DATA_OVERALL.insert(i+2, f"{task_category} Average ({len(task_category_list)} datasets)", DATA_OVERALL[task_category_list].mean(axis=1, skipna=False))
    DATA_OVERALL.sort_values(f"Average ({len(all_tasks)} datasets)", ascending=False, inplace=True)
    DATA_OVERALL.insert(0, "Rank", list(range(1, len(DATA_OVERALL) + 1)))

    DATA_OVERALL = DATA_OVERALL.round(2)

    DATA_TASKS = {}
    for task_category, task_category_list in task_dict.items():
        DATA_TASKS[task_category] = add_rank(DATA_OVERALL[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + task_category_list])
        DATA_TASKS[task_category] = DATA_TASKS[task_category][DATA_TASKS[task_category].iloc[:, 4:].ne("").any(axis=1)]

    # Fill NaN only after the averages have been computed
    DATA_OVERALL.fillna("", inplace=True)

    data_overall_rows = ["Rank", "Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens", f"Average ({len(all_tasks)} datasets)"]
    for task_category, task_category_list in task_dict.items():
        data_overall_rows.append(f"{task_category} Average ({len(task_category_list)} datasets)")

    DATA_OVERALL = DATA_OVERALL[data_overall_rows]
    # Drop rows that have no scores at all
    DATA_OVERALL = DATA_OVERALL[DATA_OVERALL.iloc[:, 5:].ne("").any(axis=1)]

    return DATA_OVERALL, DATA_TASKS
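# Returns the overall table (one row per model with per-category averages) together with a
# dict of ranked per-category tables; both are consumed when building the board tabs below.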


boards_data = {}
all_data_tasks = []
for board, board_config in BOARDS_CONFIG.items():
    boards_data[board] = {
        "data_overall": None,
        "data_tasks": {}
    }
    if board_config["has_overall"]:
        data_overall, data_tasks = get_mteb_average(board_config["tasks"], refresh=False)
        boards_data[board]["data_overall"] = data_overall
        boards_data[board]["data_tasks"] = data_tasks
        all_data_tasks.extend(data_tasks.values())
    else:
        for task_category, task_category_list in board_config["tasks"].items():
            data_task_category = get_mteb_data(tasks=[task_category], datasets=task_category_list, refresh=False)
            data_task_category.drop(columns=["Embedding Dimensions", "Max Tokens"], inplace=True)
            boards_data[board]["data_tasks"][task_category] = data_task_category
            all_data_tasks.append(data_task_category)

# Aggregate statistics shown at the bottom of the page
NUM_SCORES = 0
DATASETS = []
MODELS = []
for d in all_data_tasks:
    # Skip the leading Rank/Model/metadata columns
    cols_to_ignore = 4 if "Average" in d.columns else 3
    NUM_SCORES += d.iloc[:, cols_to_ignore:].notna().sum().sum()
    # Strip the language suffix, e.g. "STS22 (en)" and "STS22 (de)" count as one dataset
    DATASETS += [i.split(" ")[0] for i in d.columns[cols_to_ignore:]]
    MODELS += d["Model"].tolist()

NUM_DATASETS = len(set(DATASETS))
NUM_MODELS = len(set(MODELS))


css = """
table > thead {
    white-space: normal
}

table {
    --cell-width-1: 250px
}

table > tbody > tr > td:nth-child(2) > div {
    overflow-x: auto
}

.filter-checkbox-group {
    max-width: max-content;
}
"""

"""
Each inner tab can have the following keys:
- language: The language of the leaderboard
- language_long: [optional] The long form of the language
- description: The description of the leaderboard
- credits: [optional] The credits for the leaderboard
- data: The data for the leaderboard
- refresh: The function to refresh the leaderboard
"""


def get_refresh_function(task_category, task_list):
    def _refresh():
        data_task_category = get_mteb_data(tasks=[task_category], datasets=task_list)
        data_task_category.drop(columns=["Embedding Dimensions", "Max Tokens"], inplace=True)
        return data_task_category
    return _refresh
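# A factory is used instead of a bare lambda in the loop below so that each tab's refresh
# callback binds its own task_category/task_list rather than the loop's final values.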


data = {
    "Overall": {"metric": "Various, refer to task tabs", "data": []}
}
for task in TASKS:
    data[task] = {"metric": TASKS_CONFIG[task]["metric_description"], "data": []}

for board, board_config in BOARDS_CONFIG.items():
    init_name = board_config["title"]
    if init_name in PRETTY_NAMES:
        init_name = PRETTY_NAMES[init_name]
    board_pretty_name = f"{init_name} leaderboard"
    acronym = board_config.get("acronym", None)
    board_icon = board_config.get("icon", None)
    if board_icon is None:
        board_icon = ""
    credits = board_config.get("credits", None)

    if board_config["has_overall"]:
        overall_pretty_name = board_pretty_name
        if acronym is not None:
            overall_pretty_name += f" ({acronym})"
        data["Overall"]["data"].append({
            "language": board_config["title"],
            "language_long": board_config["language_long"],
            "description": f"**Overall MTEB {overall_pretty_name}** 🔮{board_icon}",
            "data": boards_data[board]["data_overall"],
            # Bind tasks now: a bare `lambda: ...` would capture the loop variable
            # `board_config` late, so every tab would refresh the last board's tasks.
            "refresh": lambda tasks=board_config["tasks"]: get_mteb_average(tasks)[0],
            "credits": credits,
        })
    for task_category, task_category_list in board_config["tasks"].items():
        task_icon = TASKS_CONFIG[task_category]['icon']
        if "special_icons" in board_config and isinstance(board_config["special_icons"], dict):
            task_icon = board_config["special_icons"].get(task_category, task_icon)
        data[task_category]["data"].append({
            "language": board_config["title"],
            "language_long": board_config["language_long"],
            "description": f"**{task_category} {board_pretty_name}** {task_icon}{board_icon}",
            "data": boards_data[board]["data_tasks"][task_category],
            "refresh": get_refresh_function(task_category, task_category_list),
            "credits": credits,
        })


dataframes = []
full_dataframes = []
tabs = []

set_window_url_params = """
function(goalUrlObject) {
    const params = new URLSearchParams(window.location.search);
    for (const [key, value] of Object.entries(goalUrlObject)) {
        params.set(key, value);
    };
    const queryString = '?' + params.toString();
    console.log(queryString);
    window.history.replaceState({}, '', queryString);
    return [];
}
"""


def update_url_task(event: gr.SelectData, current_task_language: dict, language_per_task: dict):
    current_task_language["task"] = event.target.id
    # Reuse the language last selected for this task, falling back to the tab's first language
    try:
        current_task_language["language"] = language_per_task.get(event.target.id, event.target.children[1].children[0].id)
    except Exception:
        # The component tree is not identical for every tab, so try the other child index
        current_task_language["language"] = language_per_task.get(event.target.id, event.target.children[0].children[0].id)
    return current_task_language, language_per_task


def update_url_language(event: gr.SelectData, current_task_language: dict, language_per_task: dict):
    current_task_language["language"] = event.target.id
    if "task" not in current_task_language:
        current_task_language["task"] = "overall"
    language_per_task[current_task_language["task"]] = event.target.id
    return current_task_language, language_per_task


NUMERIC_INTERVALS = {
    "<100M": pd.Interval(0, 100, closed="right"),
    "100M to 250M": pd.Interval(100, 250, closed="right"),
    "250M to 500M": pd.Interval(250, 500, closed="right"),
    "500M to 1B": pd.Interval(500, 1000, closed="right"),
    ">1B": pd.Interval(1000, 1_000_000, closed="right"),
}
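# Sizes are in millions of parameters, so e.g. `420 in NUMERIC_INTERVALS['250M to 500M']` is True.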


MODEL_TYPES = [
    "Open",
    "Proprietary",
    "Sentence Transformers",
    "Cross-Encoders",
    "Bi-Encoders",
]


def filter_data(search_query, model_types, model_sizes, *full_dataframes):
    output_dataframes = []
    for df in full_dataframes:
        # Apply the search query (plain model name, extracted from the link markup)
        if search_query:
            names = df["Model"].map(lambda x: re.match("<a .+?>(.+)</a>", x).group(1))
            masks = []
            for query in search_query.split(";"):
                masks.append(names.str.contains(query))
            df = df[reduce(lambda a, b: a | b, masks)]

        # Apply the model type filtering
        if set(model_types) != set(MODEL_TYPES):
            masks = []
            for model_type in model_types:
                if model_type == "Open":
                    masks.append(~df["Model"].isin(PROPRIETARY_MODELS))
                elif model_type == "Proprietary":
                    masks.append(df["Model"].isin(PROPRIETARY_MODELS))
                elif model_type == "Sentence Transformers":
                    masks.append(df["Model"].isin(SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS))
                elif model_type == "Cross-Encoders":
                    masks.append(df["Model"].isin(CROSS_ENCODERS))
                elif model_type == "Bi-Encoders":
                    masks.append(df["Model"].isin(BI_ENCODERS))
            if masks:
                df = df[reduce(lambda a, b: a | b, masks)]
            else:
                # No model type selected: show an empty table
                df = pd.DataFrame(columns=df.columns)

        # Apply the model size filtering
        if set(model_sizes) != set(NUMERIC_INTERVALS.keys()):
            numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[model_size] for model_size in model_sizes]))
            sizes = df["Model Size (Million Parameters)"].replace('', 0)
            mask = sizes.apply(lambda size: any(numeric_interval.contains(size)))
            df = df[mask]

        output_dataframes.append(df)
    return output_dataframes
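# Wired to the search bar and both checkbox groups below: the hidden full dataframes are passed
# in unchanged and a filtered copy is returned for each visible table.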


with gr.Blocks(css=css) as block:

    # Hidden state that is mirrored into the URL query parameters by the JS helper above
    current_task_language = gr.JSON(value=dict(), visible=False)
    language_per_task = gr.JSON(value=dict(), visible=False)

    gr.Markdown("""
    Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗 Refer to the [MTEB paper](https://arxiv.org/abs/2210.07316) for details on metrics, tasks and models.
    """)

    with gr.Row():
        search_bar = gr.Textbox(
            label="Search Bar (separate multiple queries with `;`)",
            placeholder=" 🔍 Search for a model and press enter...",
        )
        filter_model_type = gr.CheckboxGroup(
            label="Model types",
            choices=MODEL_TYPES,
            value=MODEL_TYPES,
            interactive=True,
            elem_classes=["filter-checkbox-group"],
        )
        filter_model_sizes = gr.CheckboxGroup(
            label="Model sizes (in number of parameters)",
            choices=list(NUMERIC_INTERVALS.keys()),
            value=list(NUMERIC_INTERVALS.keys()),
            interactive=True,
            elem_classes=["filter-checkbox-group"],
            scale=2,
        )

    with gr.Tabs() as outer_tabs:
        # Store the tabs so they can be selected on load from the URL parameters
        tabs.append(outer_tabs)
        for task, task_values in data.items():
            metric = task_values["metric"]
            task_tab_id = task.lower().replace(" ", "-")

            pretty_task_name = PRETTY_NAMES.get(task, task)
            with gr.Tab(pretty_task_name, id=task_tab_id) as task_tab:
                # Update the 'task' URL parameter on selection
                task_tab.select(update_url_task, [current_task_language, language_per_task], [current_task_language, language_per_task]).then(None, [current_task_language], [], js=set_window_url_params)
                if task != "Overall":
                    gr.Markdown(TASK_DESCRIPTIONS[task])
                with gr.Tabs() as task_tabs:
                    tabs.append(task_tabs)

                    for item in task_values["data"]:
                        item_tab_id = item["language"].lower().replace(" ", "-")

                        with gr.Tab(item["language"], id=item_tab_id) as item_tab:
                            # Update the 'language' URL parameter on selection
                            item_tab.select(update_url_language, [current_task_language, language_per_task], [current_task_language, language_per_task], trigger_mode="always_last").then(None, [current_task_language], [], js=set_window_url_params)

                            with gr.Row():
                                gr.Markdown(f"""
                                {item['description']}

                                - **Metric:** {metric}
                                - **Languages:** {item['language_long'] if 'language_long' in item else item['language']}
                                {"- **Credits:** " + item['credits'] if ("credits" in item and item["credits"] is not None) else ''}
                                """)

                            with gr.Row():
                                datatype = ["number", "markdown"] + ["number"] * len(item["data"])
                                dataframe = gr.Dataframe(item["data"], datatype=datatype, type="pandas", height=500)
                                dataframes.append(dataframe)

                                # Hidden, unfiltered copy used as the source for filter_data
                                full_dataframe = gr.Dataframe(item["data"], datatype=datatype, type="pandas", visible=False)
                                full_dataframes.append(full_dataframe)

                            with gr.Row():
                                refresh_button = gr.Button("Refresh")
                                refresh_button.click(item["refresh"], inputs=None, outputs=dataframe, concurrency_limit=20)

    gr.Markdown(f"""
    - **Total Datasets**: {NUM_DATASETS}
    - **Total Languages**: 113
    - **Total Scores**: {NUM_SCORES}
    - **Total Models**: {NUM_MODELS}
    """ + r"""
    Made with ❤️ for NLP. If this work is useful to you, please consider citing:

    ```bibtex
    @article{muennighoff2022mteb,
        doi = {10.48550/ARXIV.2210.07316},
        url = {https://arxiv.org/abs/2210.07316},
        author = {Muennighoff, Niklas and Tazi, Nouamane and Magne, Lo{\"\i}c and Reimers, Nils},
        title = {MTEB: Massive Text Embedding Benchmark},
        publisher = {arXiv},
        journal={arXiv preprint arXiv:2210.07316},
        year = {2022}
    }
    ```
    """)

    def set_tabs_on_load(request: gr.Request):
        """Set the selected tab based on the URL parameters on load."""
        global tabs
        valid_task_keys = [child.id for child in tabs[0].children]
        return_tabs = [gr.Tabs()] * len(tabs)

        query_params = request.request.query_params
        task_key = query_params.get("task", "overall")
        if task_key not in valid_task_keys:
            task_key = "overall"
        return_tabs[0] = gr.Tabs(selected=task_key)

        # The per-task language tabs follow the outer tabs in `tabs`, in task order
        tabs_idx = valid_task_keys.index(task_key) + 1
        language_key = query_params.get("language", "english")
        return_tabs[tabs_idx] = gr.Tabs(selected=language_key)
        current_task_language = {"task": task_key, "language": language_key}
        language_per_task = {task_key: language_key}
        return return_tabs + [current_task_language, language_per_task]

    block.load(set_tabs_on_load, inputs=[], outputs=tabs + [current_task_language, language_per_task])

    search_bar.submit(filter_data, inputs=[search_bar, filter_model_type, filter_model_sizes] + full_dataframes, outputs=dataframes)
    filter_model_type.change(filter_data, inputs=[search_bar, filter_model_type, filter_model_sizes] + full_dataframes, outputs=dataframes)
    filter_model_sizes.change(filter_data, inputs=[search_bar, filter_model_type, filter_model_sizes] + full_dataframes, outputs=dataframes)

block.queue(max_size=10)
block.launch()