Zaruhi committed
Commit c5f9df5 · 0 Parent(s)

Initial release

Files changed (8)
  1. .gitattributes +35 -0
  2. README.md +15 -0
  3. app.py +371 -0
  4. data_handler.py +318 -0
  5. logo.png +0 -0
  6. model_handler.py +252 -0
  7. model_results.json +756 -0
  8. requirements.txt +3 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,15 @@
+ ---
+ title: ArmBench-TextEmbed
+ emoji: 📊
+ colorFrom: blue
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 5.19.0
+ app_file: app.py
+ pinned: false
+ tags:
+ - embedding
+ - armenian
+ - benchmark
+ - mteb
+ ---
app.py ADDED
@@ -0,0 +1,371 @@
1
+ import gradio as gr
2
+ from model_handler import ModelHandler
3
+ from data_handler import (
4
+ prepare_leaderboard,
5
+ prepare_detailed_leaderboards,
6
+ prepare_translit_leaderboard,
7
+ prepare_translit_detailed,
8
+ )
9
+
10
+ # CSS for styled HTML tables with merged headers (uses Gradio CSS variables)
11
+ TABLE_CSS = """
12
+ <style>
13
+ .detailed-table {
14
+ width: 100%;
15
+ border-collapse: collapse;
16
+ font-size: 14px;
17
+ margin: 10px 0;
18
+ display: table !important;
19
+ visibility: visible !important;
20
+ }
21
+ .detailed-table thead,
22
+ .detailed-table tbody,
23
+ .detailed-table tr {
24
+ display: table-row-group;
25
+ visibility: visible !important;
26
+ }
27
+ .detailed-table tr {
28
+ display: table-row !important;
29
+ }
30
+ .detailed-table thead tr th {
31
+ background-color: var(--background-fill-secondary) !important;
32
+ color: var(--body-text-color) !important;
33
+ font-weight: 600 !important;
34
+ padding: 10px 8px !important;
35
+ border: 1px solid var(--border-color-primary) !important;
36
+ text-align: center !important;
37
+ display: table-cell !important;
38
+ }
39
+ .detailed-table tbody tr td {
40
+ padding: 8px 12px !important;
41
+ text-align: center !important;
42
+ border: 1px solid var(--border-color-primary) !important;
43
+ background-color: var(--background-fill-primary) !important;
44
+ color: var(--body-text-color) !important;
45
+ display: table-cell !important;
46
+ visibility: visible !important;
47
+ }
48
+ .detailed-table tbody tr:hover td {
49
+ background-color: var(--background-fill-secondary) !important;
50
+ }
51
+ .detailed-table tbody td:first-child,
52
+ .detailed-table tbody td:nth-child(2) {
53
+ text-align: left !important;
54
+ }
55
+ /* Bold borders to separate benchmark sections */
56
+ /* MTEB | STS border (column 12: after #, Model, 9 MTEB cols) */
57
+ /* STS | Retrieval border (column 14: after 2 STS cols) */
58
+ /* Retrieval | MS MARCO border (column 19: after 5 Retrieval cols) */
59
+ .detailed-table thead tr th:nth-child(12),
60
+ .detailed-table thead tr th:nth-child(14),
61
+ .detailed-table thead tr th:nth-child(19),
62
+ .detailed-table tbody tr td:nth-child(12),
63
+ .detailed-table tbody tr td:nth-child(14),
64
+ .detailed-table tbody tr td:nth-child(19) {
65
+ border-left: 3px solid var(--body-text-color) !important;
66
+ }
67
+ </style>
68
+ """
69
+
70
+
71
+ def df_to_styled_html(df):
72
+ """Convert DataFrame to styled HTML with CSS."""
73
+ table_html = df.to_html(classes="detailed-table", border=1, index=False, na_rep="-")
74
+ return TABLE_CSS + f'<div style="overflow-x: auto;">{table_html}</div>'
75
+
76
+ # Global state
77
+ global_data = {}
78
+
79
+
80
+ def refresh_data():
81
+ global global_data
82
+ model_handler = ModelHandler()
83
+
84
+ df = model_handler.get_embedding_benchmark_data()
85
+ detailed_results = model_handler.get_detailed_results()
86
+
87
+ # Prepare main leaderboards
88
+ leaderboard = prepare_leaderboard(df)
89
+ translit_summary = prepare_translit_leaderboard(df)
90
+
91
+ # Extract model order from main leaderboard to pass to detailed tables
92
+ model_order = None
93
+ if not leaderboard.empty and "Model" in leaderboard.columns:
94
+ # Get model names, removing markdown link if present
95
+ model_order = []
96
+ for name in leaderboard["Model"]:
97
+ # Handle markdown format [name](url) or plain text
98
+ if isinstance(name, str) and "[" in name and "]" in name:
99
+ clean_name = name.split("]")[0].replace("[", "")
100
+ else:
101
+ clean_name = str(name)
102
+ model_order.append(clean_name)
103
+
104
+ # Extract model order from translit leaderboard
105
+ translit_model_order = None
106
+ if not translit_summary.empty and "Model" in translit_summary.columns:
107
+ # Get model names, removing markdown link if present
108
+ translit_model_order = []
109
+ for name in translit_summary["Model"]:
110
+ # Handle markdown format [name](url) or plain text
111
+ if isinstance(name, str) and "[" in name and "]" in name:
112
+ clean_name = name.split("]")[0].replace("[", "")
113
+ else:
114
+ clean_name = str(name)
115
+ translit_model_order.append(clean_name)
116
+
117
+ global_data = {
118
+ "leaderboard": leaderboard,
119
+ "detailed": prepare_detailed_leaderboards(detailed_results, model_order=model_order),
120
+ "translit_summary": translit_summary,
121
+ "translit_detailed": prepare_translit_detailed(detailed_results, model_order=translit_model_order),
122
+ }
123
+
124
+ return (
125
+ global_data["leaderboard"],
126
+ df_to_styled_html(global_data["detailed"]),
127
+ global_data["translit_summary"],
128
+ df_to_styled_html(global_data["translit_detailed"]),
129
+ )
130
+
131
+
132
+ def main():
133
+ global global_data
134
+
135
+ model_handler = ModelHandler()
136
+ df = model_handler.get_embedding_benchmark_data()
137
+ detailed_results = model_handler.get_detailed_results()
138
+
139
+ # Prepare leaderboards
140
+ leaderboard = prepare_leaderboard(df)
141
+ translit_summary = prepare_translit_leaderboard(df)
142
+
143
+ # Extract model order from main leaderboard
144
+ model_order = None
145
+ if not leaderboard.empty and "Model" in leaderboard.columns:
146
+ model_order = []
147
+ for name in leaderboard["Model"]:
148
+ if isinstance(name, str) and "[" in name and "]" in name:
149
+ clean_name = name.split("]")[0].replace("[", "")
150
+ else:
151
+ clean_name = str(name)
152
+ model_order.append(clean_name)
153
+
154
+ # Extract model order from translit leaderboard
155
+ translit_model_order = None
156
+ if not translit_summary.empty and "Model" in translit_summary.columns:
157
+ translit_model_order = []
158
+ for name in translit_summary["Model"]:
159
+ if isinstance(name, str) and "[" in name and "]" in name:
160
+ clean_name = name.split("]")[0].replace("[", "")
161
+ else:
162
+ clean_name = str(name)
163
+ translit_model_order.append(clean_name)
164
+
165
+ global_data = {
166
+ "leaderboard": leaderboard,
167
+ "detailed": prepare_detailed_leaderboards(detailed_results, model_order=model_order),
168
+ "translit_summary": translit_summary,
169
+ "translit_detailed": prepare_translit_detailed(detailed_results, model_order=translit_model_order),
170
+ }
171
+
172
+ with gr.Blocks(title="ArmBench-TextEmbed", theme=gr.themes.Soft()) as demo:
173
+ gr.Markdown("# ArmBench-TextEmbed: Benchmarking Text Embedding Models on Armenian")
174
+ gr.Markdown(
175
+ """
176
+ Evaluating text embedding models on Armenian language tasks.
177
+ Developed by [Metric](https://metric.am/).
178
+ """
179
+ )
180
+
181
+ with gr.Tabs():
182
+ with gr.TabItem("Leaderboard"):
183
+ gr.Markdown("## Leaderboard")
184
+ gr.Markdown(
185
+ """
186
+ **Metrics:**
187
+ - **MTEB Avg**: Average score across the MTEB task sample for Armenian [hye] (BitextMining, Classification, Clustering, Paraphrase, Retrieval)
188
+ - **STS**: Semantic Textual Similarity (Spearman correlation)
189
+ - **Retrieval**: Armenian document retrieval (Top-20 accuracy)
190
+ - **MS MARCO**: Passage retrieval on MS MARCO Armenian (Top-10 accuracy)
191
+ """
192
+ )
193
+ leaderboard_table = gr.DataFrame(
194
+ value=global_data["leaderboard"],
195
+ label="Embedding Model Leaderboard",
196
+ datatype=["number", "markdown", "str", "number", "number", "number", "number", "number"],
197
+ )
198
+
199
+ with gr.Accordion("Detailed Scores", open=False):
200
+ gr.Markdown(
201
+ """
202
+ **Note:** MTEB subscores represent different datasets, while other columns (STS, Retrieval, MS MARCO)
203
+ represent different evaluation metrics within each benchmark.
204
+ """
205
+ )
206
+ detailed_table = gr.HTML(value=df_to_styled_html(global_data["detailed"]))
207
+
208
+ with gr.TabItem("Translit"):
209
+ gr.Markdown("## Transliterated (Latin Script) Benchmarks")
210
+ gr.Markdown(
211
+ """
212
+ Evaluation on Armenian text transliterated to Latin script.
213
+ Tests model robustness to script variation.
214
+ """
215
+ )
216
+ translit_summary_table = gr.DataFrame(
217
+ value=global_data["translit_summary"],
218
+ label="Translit Leaderboard",
219
+ datatype=["number", "markdown", "str", "number", "number", "number"],
220
+ )
221
+
222
+ with gr.Accordion("Detailed Scores", open=False):
223
+ gr.Markdown(
224
+ """
225
+ **Note:** Subscores represent different evaluation metrics within each benchmark.
226
+ """
227
+ )
228
+ translit_detailed_table = gr.HTML(
229
+ value=df_to_styled_html(global_data["translit_detailed"])
230
+ )
231
+
232
+ with gr.TabItem("About"):
233
+ gr.Markdown("# About ArmBench-TextEmbed")
234
+ gr.Markdown(
235
+ """
236
+ ArmBench-TextEmbed is a benchmark for evaluating text embedding models on Armenian language tasks.
237
+
238
+ ## Benchmarks
239
+
240
+ - **MTEB**: Multilingual Text Embedding Benchmark tasks for Armenian [hye]
241
+ - BitextMining (Flores, NTREX, Tatoeba)
242
+ - Classification (MASSIVE Intent/Scenario, SIB200)
243
+ - Clustering (SIB200)
244
+ - Paraphrase Detection
245
+ - Retrieval (Belebele)
246
+
247
+ - **STS**: Armenian Semantic Textual Similarity (Main score: Spearman correlation)
248
+
249
+ - **Retrieval**: Armenian document retrieval (Main score: Top-20 accuracy)
250
+
251
+ - **MS MARCO**: MS MARCO passage retrieval translated to Armenian (Main score: Top-10 accuracy)
252
+
253
+ ## Submission Guide
254
+
255
+ To submit your embedding model for evaluation:
256
+
257
+ 1. **Evaluate your model** using our evaluation scripts at [GitHub](https://github.com/Metric-AI-Lab/ArmBench-TextEmbed)
258
+
259
+ 2. **Format your results.json** with both summary and detailed metrics:
260
+ ```json
261
+ {
262
+ "mteb_avg": 0.65,
263
+ "mteb_detailed": {
264
+ "FloresBitextMining_devtest": 0.12,
265
+ "NTREXBitextMining_test": 0.95,
266
+ "Tatoeba_test": 0.91,
267
+ "MassiveIntentClassification_test": 0.53,
268
+ "MassiveScenarioClassification_test": 0.58,
269
+ "SIB200Classification_test": 0.66,
270
+ "SIB200ClusteringS2S_test": 0.31,
271
+ "ArmenianParaphrasePC_test": 0.94,
272
+ "BelebeleRetrieval_test": 0.72
273
+ },
274
+ "sts_spearman": 0.70,
275
+ "sts_detailed": {
276
+ "Pearson_correlation": 0.69,
277
+ "Spearman_correlation": 0.70
278
+ },
279
+ "retrieval_top20": 0.75,
280
+ "retrieval_detailed": {
281
+ "top1 within document": 0.50,
282
+ "top3 within document": 0.76,
283
+ "top5 within document": 0.85,
284
+ "top20 group mean macro": 0.93,
285
+ "top20 all": 0.75
286
+ },
287
+ "msmarco_top10": 0.60,
288
+ "msmarco_detailed": {
289
+ "reranking_mrr": 0.56,
290
+ "retrieval_mrr": 0.46,
291
+ "retrieval_top5_accuracy": 0.68,
292
+ "retrieval_top10_accuracy": 0.60
293
+ },
294
+ "retrieval_translit_top20": 0.15,
295
+ "retrieval_translit_detailed": {
296
+ "top1 within document": 0.12,
297
+ "top3 within document": 0.22,
298
+ "top5 within document": 0.31,
299
+ "top20 group mean macro": 0.31,
300
+ "top20 all": 0.15
301
+ },
302
+ "msmarco_translit_top10": 0.15,
303
+ "msmarco_translit_detailed": {
304
+ "reranking_mrr": 0.39,
305
+ "retrieval_mrr": 0.07,
306
+ "retrieval_top5_accuracy": 0.11,
307
+ "retrieval_top10_accuracy": 0.15
308
+ }
309
+ }
310
+ ```
311
+
312
+ **Note:** The `*_detailed` fields are required for the detailed scores tables. Translit fields are optional.
313
+
314
+ 3. **Add the tag and results**:
315
+ - Add the `ArmBench-TextEmbed` tag to your model card
316
+ - Upload `results.json` to your model repository
317
+
318
+ 4. Click "Refresh Data" to see your results on the leaderboard
319
+
320
+ ## Citation
321
+
322
+ If you use this benchmark in your research, please cite:
323
+
324
+ ```bibtex
325
+ @inproceedings{navasardyan2026lessismore,
326
+ title={Less is More: Adapting Text Embeddings for Low-Resource Languages with Small Scale Noisy Synthetic Data},
327
+ author={Navasardyan, Zaruhi and Bughdaryan, Spartak and Minasyan, Bagrat and Davtyan, Hrant},
328
+ booktitle={Proceedings of the Workshop on Language Models for Low-Resource Languages (LoResLM) at EACL 2026},
329
+ year={2026}
330
+ }
331
+ @misc{armbench-textembed,
332
+ title={ArmBench-TextEmbed: A Benchmark for Armenian Text Embedding Models},
333
+ year={2026},
334
+ url={https://github.com/Metric-AI-Lab/ArmBench-TextEmbed}
335
+ }
336
+ ```
337
+
338
+ ## Contributing
339
+
340
+ You can contribute to this benchmark in several ways:
341
+
342
+ - Provide API credits for evaluating additional API-based models
343
+ - Cite our work in your research and publications
344
+ - Contribute to the development of the benchmark itself with data or evaluation results
345
+
346
+ ## About Metric
347
+
348
+ Metric is an AI Research Lab in Yerevan, Armenia. Contact: info@metric.am
349
+
350
+ *This is a non-commercial research project.*
351
+ """
352
+ )
353
+
354
+ gr.Image("logo.png", width=200, show_label=False)
355
+
356
+ refresh_button = gr.Button("Refresh Data")
357
+ refresh_button.click(
358
+ fn=refresh_data,
359
+ outputs=[
360
+ leaderboard_table,
361
+ detailed_table,
362
+ translit_summary_table,
363
+ translit_detailed_table,
364
+ ]
365
+ )
366
+
367
+ demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
368
+
369
+
370
+ if __name__ == "__main__":
371
+ main()
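
Step 3 of the submission guide in the About tab above comes down to two repository operations: tagging the model card and uploading `results.json`. A minimal, hypothetical sketch with `huggingface_hub` — the repo id below is a placeholder, not part of this benchmark:

```python
from huggingface_hub import HfApi, ModelCard

REPO_ID = "your-org/your-embedding-model"  # placeholder: your existing model repo

# Add the ArmBench-TextEmbed tag to the model card so the Space's
# list_models(filter="ArmBench-TextEmbed") query can find it.
card = ModelCard.load(REPO_ID)
tags = card.data.tags or []
if "ArmBench-TextEmbed" not in tags:
    card.data.tags = tags + ["ArmBench-TextEmbed"]
    card.push_to_hub(REPO_ID)

# Upload the results.json produced by the evaluation scripts.
HfApi().upload_file(
    path_or_fileobj="results.json",
    path_in_repo="results.json",
    repo_id=REPO_ID,
    repo_type="model",
)
```

The leaderboard picks up new repos the next time "Refresh Data" is clicked in the Space.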
data_handler.py ADDED
@@ -0,0 +1,318 @@
1
+ import pandas as pd
2
+
3
+ COLUMN_LABELS = {
4
+ "model_name": "Model",
5
+ "model_size": "Size",
6
+ "mteb_avg": "MTEB",
7
+ "sts_spearman": "STS",
8
+ "retrieval_top20": "Retrieval",
9
+ "msmarco_top10": "MS MARCO",
10
+ }
11
+
12
+ TRANSLIT_COLUMN_LABELS = {
13
+ "model_name": "Model",
14
+ "model_size": "Size",
15
+ "retrieval_translit_top20": "Retrieval",
16
+ "msmarco_translit_top10": "MS MARCO",
17
+ }
18
+
19
+ # Metrics used for computing overall average (native script only)
20
+ SCORE_COLS = ["mteb_avg", "sts_spearman", "retrieval_top20", "msmarco_top10"]
21
+
22
+
23
+ def prepare_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
24
+ """Prepare embedding benchmark leaderboard from raw results DataFrame."""
25
+ if df.empty:
26
+ return df
27
+
28
+ df = df.copy()
29
+
30
+ # Format model_name as hyperlink if model_url exists
31
+ if "model_url" in df.columns:
32
+ df["model_name"] = df.apply(
33
+ lambda row: f"[{row['model_name']}]({row['model_url']})"
34
+ if pd.notna(row.get("model_url"))
35
+ else row["model_name"],
36
+ axis=1,
37
+ )
38
+
39
+ # Calculate overall average (only native script metrics, exclude translit)
40
+ available_cols = [c for c in SCORE_COLS if c in df.columns]
41
+ if available_cols:
42
+ df["average"] = df[available_cols].mean(axis=1).round(4)
43
+
44
+ # Sort by average
45
+ if "average" in df.columns:
46
+ df = df.sort_values(by="average", ascending=False).reset_index(drop=True)
47
+
48
+ df.insert(0, "Rank", range(1, len(df) + 1))
49
+
50
+ # Select only main leaderboard columns (exclude translit)
51
+ # Include model_size if available
52
+ size_col = ["model_size"] if "model_size" in df.columns else []
53
+ display_cols = ["Rank", "model_name"] + size_col + available_cols + ["average"]
54
+ df = df[[c for c in display_cols if c in df.columns]]
55
+
56
+ # Replace missing model_size with "-"
57
+ if "model_size" in df.columns:
58
+ df["model_size"] = df["model_size"].fillna("-").replace("", "-")
59
+
60
+ # Round numeric columns
61
+ df = df.round(4)
62
+
63
+ # Rename columns for display
64
+ df = df.rename(columns={**COLUMN_LABELS, "average": "Average"})
65
+
66
+ return df
67
+
68
+
69
+ def prepare_detailed_leaderboards(detailed_results: dict, model_order: list = None, use_multiindex: bool = True) -> pd.DataFrame:
70
+ """Prepare a single combined detailed leaderboard with hierarchical columns.
71
+
72
+ Args:
73
+ detailed_results: Dict with DataFrames from ModelHandler.get_detailed_results()
74
+ model_order: Optional list of model names in desired order. If provided, models will be
75
+ displayed in this order instead of being sorted independently.
76
+ use_multiindex: If True, return DataFrame with MultiIndex columns for proper
77
+ hierarchical display (merged headers in HTML/Gradio).
78
+ If False, use flat "Category | Metric" column names.
79
+
80
+ Returns:
81
+ pd.DataFrame: Combined table with dataset names as hierarchical column headers
82
+ """
83
+ # Dataset configurations: (dataset_key, dataset_label, column_mappings)
84
+ datasets = [
85
+ ("mteb", "MTEB", {
86
+ "FloresBitextMining_devtest": "Flores",
87
+ "NTREXBitextMining_test": "NTREX",
88
+ "Tatoeba_test": "Tatoeba",
89
+ "MassiveIntentClassification_test": "Intent",
90
+ "MassiveScenarioClassification_test": "Scenario",
91
+ "SIB200Classification_test": "SIB200 Cls",
92
+ "SIB200ClusteringS2S_test": "SIB200 Clust",
93
+ "ArmenianParaphrasePC_test": "Paraphrase",
94
+ "BelebeleRetrieval_test": "Belebele",
95
+ }),
96
+ ("sts", "STS", {
97
+ "Pearson_correlation": "Pearson",
98
+ "Spearman_correlation": "Spearman",
99
+ }),
100
+ ("retrieval", "Retrieval", {
101
+ "top1 within document": "Top-1 Doc",
102
+ "top3 within document": "Top-3 Doc",
103
+ "top5 within document": "Top-5 Doc",
104
+ "top20 group mean macro": "Top-20 Type",
105
+ "top20 all": "Top-20 All",
106
+ }),
107
+ ("msmarco", "MS MARCO", {
108
+ "reranking_mrr": "Rerank MRR",
109
+ "retrieval_mrr": "Retr. MRR",
110
+ "retrieval_top5_accuracy": "Top-5",
111
+ "retrieval_top10_accuracy": "Top-10",
112
+ }),
113
+ ]
114
+
115
+ # Collect all models from all datasets
116
+ all_models = set()
117
+ for key, _, _ in datasets:
118
+ df = detailed_results.get(key, pd.DataFrame())
119
+ if not df.empty and "model_name" in df.columns:
120
+ all_models.update(df["model_name"].unique())
121
+
122
+ if not all_models:
123
+ return pd.DataFrame()
124
+
125
+ # Use provided model_order if available, otherwise sort alphabetically
126
+ if model_order:
127
+ # Filter model_order to only include models that exist in detailed_results
128
+ ordered_models = [m for m in model_order if m in all_models]
129
+ # Add any remaining models not in model_order (in case they're new)
130
+ remaining = sorted([m for m in all_models if m not in ordered_models])
131
+ all_models_ordered = ordered_models + remaining
132
+ else:
133
+ all_models_ordered = sorted(all_models)
134
+
135
+ # Build combined dataframe with flat columns first
136
+ combined = pd.DataFrame({"Model": all_models_ordered})
137
+ column_tuples = [("", "Model")] # For MultiIndex: (level1, level2)
138
+
139
+ for key, label, col_map in datasets:
140
+ df = detailed_results.get(key, pd.DataFrame())
141
+ if df.empty:
142
+ continue
143
+ df = df.drop_duplicates(subset=["model_name"], keep="first")
144
+
145
+ for orig_col, new_col in col_map.items():
146
+ if orig_col in df.columns:
147
+ col_name = f"{label} | {new_col}"
148
+ column_tuples.append((label, new_col))
149
+ merged = combined.merge(
150
+ df[["model_name", orig_col]].rename(columns={"model_name": "Model", orig_col: col_name}),
151
+ on="Model",
152
+ how="left"
153
+ )
154
+ combined = merged
155
+
156
+ # Round numeric columns
157
+ combined = combined.round(4)
158
+
159
+ # If no model_order was provided, sort by first numeric column for backward compatibility
160
+ if not model_order:
161
+ numeric_cols = combined.select_dtypes(include=["number"]).columns.tolist()
162
+ if numeric_cols:
163
+ combined = combined.sort_values(by=numeric_cols[0], ascending=False, na_position="last")
164
+
165
+ # Always reset index to ensure proper row ordering
166
+ combined = combined.reset_index(drop=True)
167
+
168
+ combined.insert(0, "#", range(1, len(combined) + 1))
169
+ column_tuples.insert(0, ("", "#"))
170
+
171
+ if use_multiindex:
172
+ # Convert to MultiIndex columns for proper hierarchical display
173
+ combined.columns = pd.MultiIndex.from_tuples(column_tuples)
174
+
175
+ return combined
176
+
177
+
178
+ def prepare_translit_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
179
+ """Prepare translit summary leaderboard from raw results DataFrame."""
180
+ if df.empty:
181
+ return df
182
+
183
+ df = df.copy()
184
+
185
+ # Format model_name as hyperlink if model_url exists
186
+ if "model_url" in df.columns:
187
+ df["model_name"] = df.apply(
188
+ lambda row: f"[{row['model_name']}]({row['model_url']})"
189
+ if pd.notna(row.get("model_url"))
190
+ else row["model_name"],
191
+ axis=1,
192
+ )
193
+
194
+ # Only include translit columns
195
+ translit_cols = ["retrieval_translit_top20", "msmarco_translit_top10"]
196
+ available_cols = [c for c in translit_cols if c in df.columns]
197
+
198
+ if not available_cols:
199
+ return pd.DataFrame()
200
+
201
+ # Filter to models that have translit data
202
+ df = df.dropna(subset=available_cols, how="all")
203
+
204
+ if df.empty:
205
+ return pd.DataFrame()
206
+
207
+ # Calculate average
208
+ df["average"] = df[available_cols].mean(axis=1).round(4)
209
+
210
+ # Sort by average
211
+ df = df.sort_values(by="average", ascending=False).reset_index(drop=True)
212
+ df.insert(0, "Rank", range(1, len(df) + 1))
213
+
214
+ # Select columns - include model_size if available
215
+ size_col = ["model_size"] if "model_size" in df.columns else []
216
+ display_cols = ["Rank", "model_name"] + size_col + available_cols + ["average"]
217
+ df = df[[c for c in display_cols if c in df.columns]].round(4)
218
+
219
+ # Replace missing model_size with "-" if it's in the data
220
+ if "model_size" in df.columns:
221
+ df["model_size"] = df["model_size"].fillna("-").replace("", "-")
222
+
223
+ df = df.rename(columns={**TRANSLIT_COLUMN_LABELS, "average": "Average"})
224
+
225
+ return df
226
+
227
+
228
+ def prepare_translit_detailed(detailed_results: dict, model_order: list = None, use_multiindex: bool = True) -> pd.DataFrame:
229
+ """Prepare a single combined translit detailed leaderboard with hierarchical columns.
230
+
231
+ Args:
232
+ detailed_results: Dict with 'retrieval_translit' and 'msmarco_translit' DataFrames
233
+ model_order: Optional list of model names in desired order. If provided, models will be
234
+ displayed in this order instead of being sorted independently.
235
+ use_multiindex: If True, return DataFrame with MultiIndex columns for proper
236
+ hierarchical display (merged headers in HTML/Gradio).
237
+ If False, use flat "Category | Metric" column names.
238
+
239
+ Returns:
240
+ pd.DataFrame: Combined table with dataset names as hierarchical column headers
241
+ """
242
+ datasets = [
243
+ ("retrieval_translit", "Retrieval", {
244
+ "top1 within document": "Top-1 Doc",
245
+ "top3 within document": "Top-3 Doc",
246
+ "top5 within document": "Top-5 Doc",
247
+ "top20 group mean macro": "Top-20 Type",
248
+ "top20 all": "Top-20 All",
249
+ }),
250
+ ("msmarco_translit", "MS MARCO", {
251
+ "reranking_mrr": "Rerank MRR",
252
+ "retrieval_mrr": "Retr. MRR",
253
+ "retrieval_top5_accuracy": "Top-5",
254
+ "retrieval_top10_accuracy": "Top-10",
255
+ }),
256
+ ]
257
+
258
+ # Collect all models from all datasets
259
+ all_models = set()
260
+ for key, _, _ in datasets:
261
+ df = detailed_results.get(key, pd.DataFrame())
262
+ if not df.empty and "model_name" in df.columns:
263
+ all_models.update(df["model_name"].unique())
264
+
265
+ if not all_models:
266
+ return pd.DataFrame()
267
+
268
+ # Use provided model_order if available, otherwise sort alphabetically
269
+ if model_order:
270
+ # Filter model_order to only include models that exist in detailed_results
271
+ ordered_models = [m for m in model_order if m in all_models]
272
+ # Add any remaining models not in model_order (in case they're new)
273
+ remaining = sorted([m for m in all_models if m not in ordered_models])
274
+ all_models_ordered = ordered_models + remaining
275
+ else:
276
+ all_models_ordered = sorted(all_models)
277
+
278
+ # Build combined dataframe
279
+ combined = pd.DataFrame({"Model": all_models_ordered})
280
+ column_tuples = [("", "Model")] # For MultiIndex: (level1, level2)
281
+
282
+ for key, label, col_map in datasets:
283
+ df = detailed_results.get(key, pd.DataFrame())
284
+ if df.empty:
285
+ continue
286
+ df = df.drop_duplicates(subset=["model_name"], keep="first")
287
+
288
+ for orig_col, new_col in col_map.items():
289
+ if orig_col in df.columns:
290
+ col_name = f"{label} | {new_col}"
291
+ column_tuples.append((label, new_col))
292
+ merged = combined.merge(
293
+ df[["model_name", orig_col]].rename(columns={"model_name": "Model", orig_col: col_name}),
294
+ on="Model",
295
+ how="left"
296
+ )
297
+ combined = merged
298
+
299
+ # Round numeric columns
300
+ combined = combined.round(4)
301
+
302
+ # If no model_order was provided, sort by first numeric column for backward compatibility
303
+ if not model_order:
304
+ numeric_cols = combined.select_dtypes(include=["number"]).columns.tolist()
305
+ if numeric_cols:
306
+ combined = combined.sort_values(by=numeric_cols[0], ascending=False, na_position="last")
307
+
308
+ # Always reset index to ensure proper row ordering
309
+ combined = combined.reset_index(drop=True)
310
+
311
+ combined.insert(0, "#", range(1, len(combined) + 1))
312
+ column_tuples.insert(0, ("", "#"))
313
+
314
+ if use_multiindex:
315
+ # Convert to MultiIndex columns for proper hierarchical display
316
+ combined.columns = pd.MultiIndex.from_tuples(column_tuples)
317
+
318
+ return combined
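
As a worked example of the `Average` column that `prepare_leaderboard` computes above (the mean of the four native-script scores in `SCORE_COLS`), using the Alibaba-NLP/gte-multilingual-base entry from `model_results.json` below:

```python
# Average = mean of mteb_avg, sts_spearman, retrieval_top20, msmarco_top10
scores = [0.7337, 0.6869, 0.8315, 0.7171]
average = round(sum(scores) / len(scores), 4)  # 0.7423
```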
logo.png ADDED
model_handler.py ADDED
@@ -0,0 +1,252 @@
1
+ import json
2
+ import os
3
+ import struct
4
+ from typing import Dict, List
5
+
6
+ import pandas as pd
7
+ import requests
8
+ from huggingface_hub import HfApi, hf_hub_download
9
+
10
+ # Required metrics for embedding evaluation
11
+ REQUIRED_METRICS = [
12
+ "mteb_avg",
13
+ "sts_spearman",
14
+ "retrieval_top20",
15
+ "msmarco_top10",
16
+ ]
17
+
18
+
19
+ def format_params(num_params):
20
+ """Format parameter count as human-readable string."""
21
+ if num_params >= 1e9:
22
+ return f"{num_params / 1e9:.1f}B"
23
+ else:
24
+ return f"{num_params / 1e6:.0f}M"
25
+
26
+
27
+ def get_model_url(model_name):
28
+ """Get the model URL from HuggingFace."""
29
+ return f"https://huggingface.co/{model_name}"
30
+
31
+
32
+ def get_model_size(model_name):
33
+ """Fetch model size from HuggingFace API."""
34
+ try:
35
+ url = f"https://huggingface.co/api/models/{model_name}"
36
+ response = requests.get(url, timeout=10)
37
+ if response.status_code == 200:
38
+ data = response.json()
39
+ # Get safetensors size first, fall back to general parameters
40
+ safetensors = data.get("safetensors")
41
+ if safetensors and "total" in safetensors:
42
+ num_params = safetensors["total"]
43
+ return format_params(num_params)
44
+
45
+ num_params = data.get("num_parameters")
46
+ if num_params:
47
+ return format_params(num_params)
48
+
49
+ # Fallback: read actual param count from safetensors header
50
+ num_params = get_params_from_safetensors(model_name)
51
+ if num_params:
52
+ return format_params(num_params)
53
+
54
+ return None
55
+ except Exception as e:
56
+ print(f"Error fetching size for {model_name}: {e}")
57
+ return None
58
+
59
+
60
+ def get_params_from_safetensors(model_name):
61
+ """Read safetensors header to get actual parameter count."""
62
+ try:
63
+ tree_url = f"https://huggingface.co/api/models/{model_name}/tree/main"
64
+ resp = requests.get(tree_url, timeout=10)
65
+ if resp.status_code != 200:
66
+ return None
67
+
68
+ files = resp.json()
69
+ safetensor_files = [f for f in files if f.get("path", "").endswith(".safetensors")]
70
+ if not safetensor_files:
71
+ return None
72
+
73
+ total_params = 0
74
+
75
+ for sf in safetensor_files:
76
+ file_url = f"https://huggingface.co/{model_name}/resolve/main/{sf['path']}"
77
+
78
+ # Get header size (first 8 bytes)
79
+ headers = {"Range": "bytes=0-7"}
80
+ resp = requests.get(file_url, headers=headers, timeout=10, allow_redirects=True)
81
+ if resp.status_code != 206 or len(resp.content) < 8:
82
+ return None # Likely gated model
83
+
84
+ header_size = struct.unpack("<Q", resp.content[:8])[0]
85
+
86
+ # Get header JSON
87
+ headers = {"Range": f"bytes=8-{8 + header_size - 1}"}
88
+ resp = requests.get(file_url, headers=headers, timeout=10, allow_redirects=True)
89
+ metadata = resp.json()
90
+
91
+ # Calculate params from tensor shapes
92
+ for key, info in metadata.items():
93
+ if key == "__metadata__":
94
+ continue
95
+ shape = info.get("shape", [])
96
+ params = 1
97
+ for dim in shape:
98
+ params *= dim
99
+ total_params += params
100
+
101
+ return total_params
102
+ except Exception:
103
+ return None
104
+
105
+
106
+ class ModelHandler:
107
+ def __init__(self, model_infos_path="model_results.json"):
108
+ self.api = HfApi()
109
+ self.model_infos_path = model_infos_path
110
+ self.model_infos = self._load_model_infos()
111
+
112
+ def _load_model_infos(self) -> List:
113
+ if os.path.exists(self.model_infos_path):
114
+ with open(self.model_infos_path) as f:
115
+ return json.load(f)
116
+ return []
117
+
118
+ def _save_model_infos(self):
119
+ print("Saving model infos")
120
+ with open(self.model_infos_path, "w") as f:
121
+ json.dump(self.model_infos, f, indent=4)
122
+
123
+ def get_embedding_benchmark_data(self) -> pd.DataFrame:
124
+ """Fetch embedding benchmark results from HuggingFace models with ArmBench-TextEmbed tag."""
125
+ # Try to fetch new models from HuggingFace, but gracefully handle network errors
126
+ try:
127
+ models = self.api.list_models(filter="ArmBench-TextEmbed")
128
+ model_names = {model["model_name"] for model in self.model_infos}
129
+ repositories = [model.modelId for model in models]
130
+
131
+ for repo_id in repositories:
132
+ try:
133
+ files = [f for f in self.api.list_repo_files(repo_id) if f == "results.json"]
134
+ if not files:
135
+ continue
136
+
137
+ model_name = repo_id
138
+ if model_name not in model_names:
139
+ result_path = hf_hub_download(repo_id, filename="results.json")
140
+ with open(result_path) as f:
141
+ results = json.load(f)
142
+
143
+ # Build model entry with metadata
144
+ entry = {
145
+ "model_name": model_name,
146
+ "results": results
147
+ }
148
+
149
+ # Add model_url if not in results
150
+ if "model_url" not in results:
151
+ entry["model_url"] = get_model_url(model_name)
152
+
153
+ # Add model_size if not in results
154
+ if "model_size" not in results:
155
+ model_size = get_model_size(model_name)
156
+ if model_size:
157
+ entry["model_size"] = model_size
158
+
159
+ self.model_infos.append(entry)
160
+ except Exception as e:
161
+ print(f"Error loading {repo_id} - {e}")
162
+ continue
163
+
164
+ self._save_model_infos()
165
+ except Exception as e:
166
+ print(f"Failed to fetch from HuggingFace: {e}. Using local data.")
167
+
168
+ # Build dataframe from results
169
+ data = []
170
+ for model in self.model_infos:
171
+ model_name = model["model_name"]
172
+ results = model.get("results", {})
173
+
174
+ row = {"model_name": model_name}
175
+
176
+ # Extract model metadata
177
+ if "model_url" in model:
178
+ row["model_url"] = model["model_url"]
179
+ if "model_size" in model:
180
+ row["model_size"] = model["model_size"]
181
+
182
+ # Extract key metrics
183
+ if "mteb_avg" in results:
184
+ row["mteb_avg"] = results["mteb_avg"]
185
+ if "sts_spearman" in results:
186
+ row["sts_spearman"] = results["sts_spearman"]
187
+ if "retrieval_top20" in results:
188
+ row["retrieval_top20"] = results["retrieval_top20"]
189
+ if "retrieval_translit_top20" in results:
190
+ row["retrieval_translit_top20"] = results["retrieval_translit_top20"]
191
+ if "msmarco_top10" in results:
192
+ row["msmarco_top10"] = results["msmarco_top10"]
193
+ if "msmarco_translit_top10" in results:
194
+ row["msmarco_translit_top10"] = results["msmarco_translit_top10"]
195
+
196
+ # Only add if at least one metric is present
197
+ if len(row) > 1:
198
+ data.append(row)
199
+
200
+ return pd.DataFrame(data)
201
+
202
+ def get_detailed_results(self) -> Dict:
203
+ """Get all detailed results for MTEB, MS MARCO, STS, Retrieval, and translit benchmarks."""
204
+ mteb_data = []
205
+ msmarco_data = []
206
+ sts_data = []
207
+ retrieval_data = []
208
+ retrieval_translit_data = []
209
+ msmarco_translit_data = []
210
+
211
+ for model in self.model_infos:
212
+ model_name = model["model_name"]
213
+ results = model.get("results", {})
214
+
215
+ # MTEB detailed
216
+ if "mteb_detailed" in results:
217
+ row = {"model_name": model_name, **results["mteb_detailed"]}
218
+ mteb_data.append(row)
219
+
220
+ # MS MARCO detailed
221
+ if "msmarco_detailed" in results:
222
+ row = {"model_name": model_name, **results["msmarco_detailed"]}
223
+ msmarco_data.append(row)
224
+
225
+ # STS detailed
226
+ if "sts_detailed" in results:
227
+ row = {"model_name": model_name, **results["sts_detailed"]}
228
+ sts_data.append(row)
229
+
230
+ # Retrieval detailed
231
+ if "retrieval_detailed" in results:
232
+ row = {"model_name": model_name, **results["retrieval_detailed"]}
233
+ retrieval_data.append(row)
234
+
235
+ # Retrieval translit detailed
236
+ if "retrieval_translit_detailed" in results:
237
+ row = {"model_name": model_name, **results["retrieval_translit_detailed"]}
238
+ retrieval_translit_data.append(row)
239
+
240
+ # MS MARCO translit detailed
241
+ if "msmarco_translit_detailed" in results:
242
+ row = {"model_name": model_name, **results["msmarco_translit_detailed"]}
243
+ msmarco_translit_data.append(row)
244
+
245
+ return {
246
+ "mteb": pd.DataFrame(mteb_data) if mteb_data else pd.DataFrame(),
247
+ "msmarco": pd.DataFrame(msmarco_data) if msmarco_data else pd.DataFrame(),
248
+ "sts": pd.DataFrame(sts_data) if sts_data else pd.DataFrame(),
249
+ "retrieval": pd.DataFrame(retrieval_data) if retrieval_data else pd.DataFrame(),
250
+ "retrieval_translit": pd.DataFrame(retrieval_translit_data) if retrieval_translit_data else pd.DataFrame(),
251
+ "msmarco_translit": pd.DataFrame(msmarco_translit_data) if msmarco_translit_data else pd.DataFrame(),
252
+ }
model_results.json ADDED
@@ -0,0 +1,756 @@
1
+ [
2
+ {
3
+ "model_name": "Alibaba-NLP/gte-multilingual-base",
4
+ "model_url": "https://huggingface.co/Alibaba-NLP/gte-multilingual-base",
5
+ "results": {
6
+ "mteb_avg": 0.7337,
7
+ "mteb_detailed": {
8
+ "FloresBitextMining_devtest": 0.8919,
9
+ "NTREXBitextMining_test": 0.9495,
10
+ "Tatoeba_test": 0.8041,
11
+ "MassiveIntentClassification_test": 0.5091,
12
+ "MassiveScenarioClassification_test": 0.5719,
13
+ "SIB200Classification_test": 0.7549,
14
+ "SIB200ClusteringS2S_test": 0.3677,
15
+ "ArmenianParaphrasePC_test": 0.9453,
16
+ "BelebeleRetrieval_test": 0.8093
17
+ },
18
+ "sts_spearman": 0.6869,
19
+ "sts_detailed": {
20
+ "Pearson_correlation": 0.6815,
21
+ "Spearman_correlation": 0.6869
22
+ },
23
+ "retrieval_top20": 0.8315,
24
+ "retrieval_detailed": {
25
+ "top1 within document": 0.49,
26
+ "top3 within document": 0.76,
27
+ "top5 within document": 0.87,
28
+ "top20 group mean macro": 0.923,
29
+ "top20 all": 0.8315
30
+ },
31
+ "msmarco_top10": 0.7171,
32
+ "msmarco_detailed": {
33
+ "reranking_mrr": 0.5337,
34
+ "retrieval_mrr": 0.4098,
35
+ "retrieval_top5_accuracy": 0.6077,
36
+ "retrieval_top10_accuracy": 0.7171
37
+ },
38
+ "retrieval_translit_top20": 0.2772,
39
+ "retrieval_translit_detailed": {
40
+ "top1 within document": 0.15,
41
+ "top3 within document": 0.3,
42
+ "top5 within document": 0.49,
43
+ "top20 group mean macro": 0.5048,
44
+ "top20 all": 0.2772
45
+ },
46
+ "msmarco_translit_top10": 0.2088,
47
+ "msmarco_translit_detailed": {
48
+ "reranking_mrr": 0.3808,
49
+ "retrieval_mrr": 0.1007,
50
+ "retrieval_top5_accuracy": 0.1566,
51
+ "retrieval_top10_accuracy": 0.2088
52
+ }
53
+ },
54
+ "model_size": "305M"
55
+ },
56
+ {
57
+ "model_name": "Qwen/Qwen3-Embedding-0.6B",
58
+ "model_url": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B",
59
+ "results": {
60
+ "mteb_avg": 0.5241,
61
+ "mteb_detailed": {
62
+ "FloresBitextMining_devtest": 0.14,
63
+ "NTREXBitextMining_test": 0.7315,
64
+ "Tatoeba_test": 0.4621,
65
+ "MassiveIntentClassification_test": 0.468,
66
+ "MassiveScenarioClassification_test": 0.5255,
67
+ "SIB200Classification_test": 0.5196,
68
+ "SIB200ClusteringS2S_test": 0.2077,
69
+ "ArmenianParaphrasePC_test": 0.9292,
70
+ "BelebeleRetrieval_test": 0.7332
71
+ },
72
+ "sts_spearman": 0.6532,
73
+ "sts_detailed": {
74
+ "Pearson_correlation": 0.6502,
75
+ "Spearman_correlation": 0.6532
76
+ },
77
+ "retrieval_top20": 0.5163,
78
+ "retrieval_detailed": {
79
+ "top1 within document": 0.26,
80
+ "top3 within document": 0.44,
81
+ "top5 within document": 0.59,
82
+ "top20 group mean macro": 0.704,
83
+ "top20 all": 0.5163
84
+ },
85
+ "msmarco_top10": 0.6929,
86
+ "msmarco_detailed": {
87
+ "reranking_mrr": 0.5007,
88
+ "retrieval_mrr": 0.3783,
89
+ "retrieval_top5_accuracy": 0.5721,
90
+ "retrieval_top10_accuracy": 0.6929
91
+ },
92
+ "retrieval_translit_top20": 0.1957,
93
+ "retrieval_translit_detailed": {
94
+ "top1 within document": 0.14,
95
+ "top3 within document": 0.31,
96
+ "top5 within document": 0.49,
97
+ "top20 group mean macro": 0.4581,
98
+ "top20 all": 0.1957
99
+ },
100
+ "msmarco_translit_top10": 0.2655,
101
+ "msmarco_translit_detailed": {
102
+ "reranking_mrr": 0.4071,
103
+ "retrieval_mrr": 0.1283,
104
+ "retrieval_top5_accuracy": 0.2006,
105
+ "retrieval_top10_accuracy": 0.2655
106
+ }
107
+ },
108
+ "model_size": "596M"
109
+ },
110
+ {
111
+ "model_name": "Qwen/Qwen3-Embedding-8B",
112
+ "model_url": "https://huggingface.co/Qwen/Qwen3-Embedding-8B",
113
+ "results": {
114
+ "mteb_avg": 0.7538,
115
+ "mteb_detailed": {
116
+ "FloresBitextMining_devtest": 0.4617,
117
+ "NTREXBitextMining_test": 0.9633,
118
+ "Tatoeba_test": 0.8796,
119
+ "MassiveIntentClassification_test": 0.6594,
120
+ "MassiveScenarioClassification_test": 0.6922,
121
+ "SIB200Classification_test": 0.774,
122
+ "SIB200ClusteringS2S_test": 0.4455,
123
+ "ArmenianParaphrasePC_test": 0.9556,
124
+ "BelebeleRetrieval_test": 0.953
125
+ },
126
+ "sts_spearman": 0.7338,
127
+ "sts_detailed": {
128
+ "Pearson_correlation": 0.7258,
129
+ "Spearman_correlation": 0.7338
130
+ },
131
+ "retrieval_top20": 0.7011,
132
+ "retrieval_detailed": {
133
+ "top1 within document": 0.36,
134
+ "top3 within document": 0.59,
135
+ "top5 within document": 0.69,
136
+ "top20 group mean macro": 0.8125,
137
+ "top20 all": 0.7011
138
+ },
139
+ "msmarco_top10": 0.838,
140
+ "msmarco_detailed": {
141
+ "reranking_mrr": 0.542,
142
+ "retrieval_mrr": 0.4638,
143
+ "retrieval_top5_accuracy": 0.7051,
144
+ "retrieval_top10_accuracy": 0.838
145
+ },
146
+ "retrieval_translit_top20": 0.2717,
147
+ "retrieval_translit_detailed": {
148
+ "top1 within document": 0.19,
149
+ "top3 within document": 0.38,
150
+ "top5 within document": 0.47,
151
+ "top20 group mean macro": 0.446,
152
+ "top20 all": 0.2717
153
+ },
154
+ "msmarco_translit_top10": 0.3182,
155
+ "msmarco_translit_detailed": {
156
+ "reranking_mrr": 0.3829,
157
+ "retrieval_mrr": 0.1491,
158
+ "retrieval_top5_accuracy": 0.2366,
159
+ "retrieval_top10_accuracy": 0.3182
160
+ }
161
+ },
162
+ "model_size": "7.6B"
163
+ },
164
+ {
165
+ "model_name": "Metric-AI/armenian-text-embeddings-1",
166
+ "model_url": "https://huggingface.co/Metric-AI/armenian-text-embeddings-1",
167
+ "results": {
168
+ "mteb_avg": 0.6923,
169
+ "mteb_detailed": {
170
+ "FloresBitextMining_devtest": 0.0589,
171
+ "NTREXBitextMining_test": 0.9387,
172
+ "Tatoeba_test": 0.904,
173
+ "MassiveIntentClassification_test": 0.612,
174
+ "MassiveScenarioClassification_test": 0.6608,
175
+ "SIB200Classification_test": 0.7971,
176
+ "SIB200ClusteringS2S_test": 0.4593,
177
+ "ArmenianParaphrasePC_test": 0.9552,
178
+ "BelebeleRetrieval_test": 0.8447
179
+ },
180
+ "sts_spearman": 0.7057,
181
+ "sts_detailed": {
182
+ "Pearson_correlation": 0.6882,
183
+ "Spearman_correlation": 0.7057
184
+ },
185
+ "retrieval_top20": 0.8261,
186
+ "retrieval_detailed": {
187
+ "top1 within document": 0.4,
188
+ "top3 within document": 0.78,
189
+ "top5 within document": 0.82,
190
+ "top20 group mean macro": 0.9475,
191
+ "top20 all": 0.8261
192
+ },
193
+ "msmarco_top10": 0.7364,
194
+ "msmarco_detailed": {
195
+ "reranking_mrr": 0.5614,
196
+ "retrieval_mrr": 0.4279,
197
+ "retrieval_top5_accuracy": 0.6251,
198
+ "retrieval_top10_accuracy": 0.7364
199
+ },
200
+ "retrieval_translit_top20": 0.1033,
201
+ "retrieval_translit_detailed": {
202
+ "top1 within document": 0.09,
203
+ "top3 within document": 0.24,
204
+ "top5 within document": 0.36,
205
+ "top20 group mean macro": 0.3149,
206
+ "top20 all": 0.1033
207
+ },
208
+ "msmarco_translit_top10": 0.1053,
209
+ "msmarco_translit_detailed": {
210
+ "reranking_mrr": 0.3532,
211
+ "retrieval_mrr": 0.0516,
212
+ "retrieval_top5_accuracy": 0.0776,
213
+ "retrieval_top10_accuracy": 0.1053
214
+ }
215
+ },
216
+ "model_size": "278M"
217
+ },
218
+ {
219
+ "model_name": "Qwen/Qwen3-Embedding-4B",
220
+ "model_url": "https://huggingface.co/Qwen/Qwen3-Embedding-4B",
221
+ "results": {
222
+ "mteb_avg": 0.7039,
223
+ "mteb_detailed": {
224
+ "FloresBitextMining_devtest": 0.3528,
225
+ "NTREXBitextMining_test": 0.937,
226
+ "Tatoeba_test": 0.8123,
227
+ "MassiveIntentClassification_test": 0.611,
228
+ "MassiveScenarioClassification_test": 0.6534,
229
+ "SIB200Classification_test": 0.7426,
230
+ "SIB200ClusteringS2S_test": 0.395,
231
+ "ArmenianParaphrasePC_test": 0.9487,
232
+ "BelebeleRetrieval_test": 0.8827
233
+ },
234
+ "sts_spearman": 0.7013,
235
+ "sts_detailed": {
236
+ "Pearson_correlation": 0.6939,
237
+ "Spearman_correlation": 0.7013
238
+ },
239
+ "retrieval_top20": 0.6848,
240
+ "retrieval_detailed": {
241
+ "top1 within document": 0.35,
242
+ "top3 within document": 0.63,
243
+ "top5 within document": 0.74,
244
+ "top20 group mean macro": 0.8291,
245
+ "top20 all": 0.6848
246
+ },
247
+ "msmarco_top10": 0.8465,
248
+ "msmarco_detailed": {
249
+ "reranking_mrr": 0.5568,
250
+ "retrieval_mrr": 0.4848,
251
+ "retrieval_top5_accuracy": 0.7258,
252
+ "retrieval_top10_accuracy": 0.8465
253
+ },
254
+ "retrieval_translit_top20": 0.337,
255
+ "retrieval_translit_detailed": {
256
+ "top1 within document": 0.15,
257
+ "top3 within document": 0.44,
258
+ "top5 within document": 0.6,
259
+ "top20 group mean macro": 0.5185,
260
+ "top20 all": 0.337
261
+ },
262
+ "msmarco_translit_top10": 0.3943,
263
+ "msmarco_translit_detailed": {
264
+ "reranking_mrr": 0.4203,
265
+ "retrieval_mrr": 0.1926,
266
+ "retrieval_top5_accuracy": 0.3002,
267
+ "retrieval_top10_accuracy": 0.3943
268
+ }
269
+ },
270
+ "model_size": "4.0B"
271
+ },
272
+ {
273
+ "model_name": "Metric-AI/armenian-text-embeddings-2-base",
274
+ "model_url": "https://huggingface.co/Metric-AI/armenian-text-embeddings-2-base",
275
+ "results": {
276
+ "mteb_avg": 0.6903,
277
+ "mteb_detailed": {
278
+ "FloresBitextMining_devtest": 0.1119,
279
+ "NTREXBitextMining_test": 0.9626,
280
+ "Tatoeba_test": 0.9221,
281
+ "MassiveIntentClassification_test": 0.59,
282
+ "MassiveScenarioClassification_test": 0.6393,
283
+ "SIB200Classification_test": 0.7529,
284
+ "SIB200ClusteringS2S_test": 0.3963,
285
+ "ArmenianParaphrasePC_test": 0.9516,
286
+ "BelebeleRetrieval_test": 0.8857
287
+ },
288
+ "sts_spearman": 0.7055,
289
+ "sts_detailed": {
290
+ "Pearson_correlation": 0.6959,
291
+ "Spearman_correlation": 0.7055
292
+ },
293
+ "retrieval_top20": 0.8587,
294
+ "retrieval_detailed": {
295
+ "top1 within document": 0.51,
296
+ "top3 within document": 0.75,
297
+ "top5 within document": 0.86,
298
+ "top20 group mean macro": 0.9538,
299
+ "top20 all": 0.8587
300
+ },
301
+ "msmarco_top10": 0.8135,
302
+ "msmarco_detailed": {
303
+ "reranking_mrr": 0.565,
304
+ "retrieval_mrr": 0.4732,
305
+ "retrieval_top5_accuracy": 0.7035,
306
+ "retrieval_top10_accuracy": 0.8135
307
+ },
308
+ "retrieval_translit_top20": 0.288,
309
+ "retrieval_translit_detailed": {
310
+ "top1 within document": 0.13,
311
+ "top3 within document": 0.33,
312
+ "top5 within document": 0.45,
313
+ "top20 group mean macro": 0.5038,
314
+ "top20 all": 0.288
315
+ },
316
+ "msmarco_translit_top10": 0.2693,
317
+ "msmarco_translit_detailed": {
318
+ "reranking_mrr": 0.4308,
319
+ "retrieval_mrr": 0.1371,
320
+ "retrieval_top5_accuracy": 0.2082,
321
+ "retrieval_top10_accuracy": 0.2693
322
+ }
323
+ },
324
+ "model_size": "278M"
325
+ },
326
+ {
327
+ "model_name": "intfloat/multilingual-e5-large",
328
+ "model_url": "https://huggingface.co/intfloat/multilingual-e5-large",
329
+ "results": {
330
+ "mteb_avg": 0.6678,
331
+ "mteb_detailed": {
332
+ "FloresBitextMining_devtest": 0.2418,
333
+ "NTREXBitextMining_test": 0.9719,
334
+ "Tatoeba_test": 0.9279,
335
+ "MassiveIntentClassification_test": 0.5499,
336
+ "MassiveScenarioClassification_test": 0.5975,
337
+ "SIB200Classification_test": 0.6676,
338
+ "SIB200ClusteringS2S_test": 0.3292,
339
+ "ArmenianParaphrasePC_test": 0.9541,
340
+ "BelebeleRetrieval_test": 0.7704
341
+ },
342
+ "sts_spearman": 0.6973,
343
+ "sts_detailed": {
344
+ "Pearson_correlation": 0.689,
345
+ "Spearman_correlation": 0.6973
346
+ },
347
+ "retrieval_top20": 0.7663,
348
+ "retrieval_detailed": {
349
+ "top1 within document": 0.52,
350
+ "top3 within document": 0.72,
351
+ "top5 within document": 0.83,
352
+ "top20 group mean macro": 0.8751,
353
+ "top20 all": 0.7663
354
+ },
355
+ "msmarco_top10": 0.7298,
356
+ "msmarco_detailed": {
357
+ "reranking_mrr": 0.5609,
358
+ "retrieval_mrr": 0.4306,
359
+ "retrieval_top5_accuracy": 0.6282,
360
+ "retrieval_top10_accuracy": 0.7298
361
+ },
362
+ "retrieval_translit_top20": 0.125,
363
+ "retrieval_translit_detailed": {
364
+ "top1 within document": 0.09,
365
+ "top3 within document": 0.27,
366
+ "top5 within document": 0.41,
367
+ "top20 group mean macro": 0.3187,
368
+ "top20 all": 0.125
369
+ },
370
+ "msmarco_translit_top10": 0.1202,
371
+ "msmarco_translit_detailed": {
372
+ "reranking_mrr": 0.3551,
373
+ "retrieval_mrr": 0.0608,
374
+ "retrieval_top5_accuracy": 0.0902,
375
+ "retrieval_top10_accuracy": 0.1202
376
+ }
377
+ },
378
+ "model_size": "560M"
379
+ },
380
+ {
381
+ "model_name": "Snowflake/snowflake-arctic-embed-m-v2.0",
382
+ "model_url": "https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v2.0",
383
+ "results": {
384
+ "mteb_avg": 0.594,
385
+ "mteb_detailed": {
386
+ "FloresBitextMining_devtest": 0.0154,
387
+ "NTREXBitextMining_test": 0.8091,
388
+ "Tatoeba_test": 0.6328,
389
+ "MassiveIntentClassification_test": 0.55,
390
+ "MassiveScenarioClassification_test": 0.5947,
391
+ "SIB200Classification_test": 0.6667,
392
+ "SIB200ClusteringS2S_test": 0.3108,
393
+ "ArmenianParaphrasePC_test": 0.9357,
394
+ "BelebeleRetrieval_test": 0.8306
395
+ },
396
+ "sts_spearman": 0.6656,
397
+ "sts_detailed": {
398
+ "Pearson_correlation": 0.656,
399
+ "Spearman_correlation": 0.6656
400
+ },
401
+ "retrieval_top20": 0.8533,
402
+ "retrieval_detailed": {
403
+ "top1 within document": 0.56,
404
+ "top3 within document": 0.79,
405
+ "top5 within document": 0.88,
406
+ "top20 group mean macro": 0.9381,
407
+ "top20 all": 0.8533
408
+ },
409
+ "msmarco_top10": 0.7941,
410
+ "msmarco_detailed": {
411
+ "reranking_mrr": 0.562,
412
+ "retrieval_mrr": 0.4654,
413
+ "retrieval_top5_accuracy": 0.6816,
414
+ "retrieval_top10_accuracy": 0.7941
415
+ },
416
+ "retrieval_translit_top20": 0.1685,
417
+ "retrieval_translit_detailed": {
418
+ "top1 within document": 0.07,
419
+ "top3 within document": 0.28,
420
+ "top5 within document": 0.45,
421
+ "top20 group mean macro": 0.4663,
422
+ "top20 all": 0.1685
423
+ },
424
+ "msmarco_translit_top10": 0.1642,
425
+ "msmarco_translit_detailed": {
426
+ "reranking_mrr": 0.3896,
427
+ "retrieval_mrr": 0.0859,
428
+ "retrieval_top5_accuracy": 0.1316,
429
+ "retrieval_top10_accuracy": 0.1642
430
+ }
431
+ },
432
+ "model_size": "305M"
433
+ },
434
+ {
435
+ "model_name": "Snowflake/snowflake-arctic-embed-l-v2.0",
436
+ "model_url": "https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0",
437
+ "results": {
438
+ "mteb_avg": 0.686,
439
+ "mteb_detailed": {
440
+ "FloresBitextMining_devtest": 0.1367,
441
+ "NTREXBitextMining_test": 0.9489,
442
+ "Tatoeba_test": 0.8401,
443
+ "MassiveIntentClassification_test": 0.6301,
444
+ "MassiveScenarioClassification_test": 0.6703,
445
+ "SIB200Classification_test": 0.7348,
446
+ "SIB200ClusteringS2S_test": 0.3526,
447
+ "ArmenianParaphrasePC_test": 0.9586,
448
+ "BelebeleRetrieval_test": 0.9019
449
+ },
450
+ "sts_spearman": 0.702,
451
+ "sts_detailed": {
452
+ "Pearson_correlation": 0.6835,
453
+ "Spearman_correlation": 0.702
454
+ },
455
+ "retrieval_top20": 0.9239,
456
+ "retrieval_detailed": {
457
+ "top1 within document": 0.61,
458
+ "top3 within document": 0.89,
459
+ "top5 within document": 0.93,
460
+ "top20 group mean macro": 0.9647,
461
+ "top20 all": 0.9239
462
+ },
463
+ "msmarco_top10": 0.8851,
464
+ "msmarco_detailed": {
465
+ "reranking_mrr": 0.6179,
466
+ "retrieval_mrr": 0.5533,
467
+ "retrieval_top5_accuracy": 0.7888,
468
+ "retrieval_top10_accuracy": 0.8851
469
+ },
470
+ "retrieval_translit_top20": 0.2446,
471
+ "retrieval_translit_detailed": {
472
+ "top1 within document": 0.18,
473
+ "top3 within document": 0.33,
474
+ "top5 within document": 0.56,
475
+ "top20 group mean macro": 0.5177,
476
+ "top20 all": 0.2446
477
+ },
478
+ "msmarco_translit_top10": 0.2405,
479
+ "msmarco_translit_detailed": {
480
+ "reranking_mrr": 0.4078,
481
+ "retrieval_mrr": 0.1246,
482
+ "retrieval_top5_accuracy": 0.1878,
483
+ "retrieval_top10_accuracy": 0.2405
484
+ }
485
+ },
486
+ "model_size": "568M"
487
+ },
488
+ {
489
+ "model_name": "intfloat/multilingual-e5-base",
490
+ "model_url": "https://huggingface.co/intfloat/multilingual-e5-base",
491
+ "results": {
492
+ "mteb_avg": 0.6392,
493
+ "mteb_detailed": {
494
+ "FloresBitextMining_devtest": 0.1184,
495
+ "NTREXBitextMining_test": 0.9548,
496
+ "Tatoeba_test": 0.9131,
497
+ "MassiveIntentClassification_test": 0.5407,
498
+ "MassiveScenarioClassification_test": 0.5835,
499
+ "SIB200Classification_test": 0.6652,
500
+ "SIB200ClusteringS2S_test": 0.3035,
501
+ "ArmenianParaphrasePC_test": 0.9424,
502
+ "BelebeleRetrieval_test": 0.731
503
+ },
504
+ "sts_spearman": 0.6726,
505
+ "sts_detailed": {
506
+ "Pearson_correlation": 0.6661,
507
+ "Spearman_correlation": 0.6726
508
+ },
509
+ "retrieval_top20": 0.7446,
510
+ "retrieval_detailed": {
511
+ "top1 within document": 0.48,
512
+ "top3 within document": 0.68,
513
+ "top5 within document": 0.77,
514
+ "top20 group mean macro": 0.8643,
515
+ "top20 all": 0.7446
516
+ },
517
+ "msmarco_top10": 0.606,
518
+ "msmarco_detailed": {
519
+ "reranking_mrr": 0.5435,
520
+ "retrieval_mrr": 0.3474,
521
+ "retrieval_top5_accuracy": 0.5078,
522
+ "retrieval_top10_accuracy": 0.606
523
+ },
524
+ "retrieval_translit_top20": 0.087,
525
+ "retrieval_translit_detailed": {
526
+ "top1 within document": 0.02,
527
+ "top3 within document": 0.19,
528
+ "top5 within document": 0.34,
529
+ "top20 group mean macro": 0.2976,
530
+ "top20 all": 0.087
531
+ },
532
+ "msmarco_translit_top10": 0.0885,
533
+ "msmarco_translit_detailed": {
534
+ "reranking_mrr": 0.3493,
535
+ "retrieval_mrr": 0.0434,
536
+ "retrieval_top5_accuracy": 0.0658,
537
+ "retrieval_top10_accuracy": 0.0885
538
+ }
539
+ },
540
+ "model_size": "278M"
541
+ },
542
+ {
543
+ "model_name": "google/embeddinggemma-300m",
544
+ "model_url": "https://huggingface.co/google/embeddinggemma-300m",
545
+ "results": {
546
+ "mteb_avg": 0.2529,
547
+ "mteb_detailed": {
548
+ "FloresBitextMining_devtest": 0.0665,
549
+ "NTREXBitextMining_test": 0.2256,
550
+ "Tatoeba_test": 0.0727,
551
+ "MassiveIntentClassification_test": 0.2161,
552
+ "MassiveScenarioClassification_test": 0.2879,
553
+ "SIB200Classification_test": 0.3127,
554
+ "SIB200ClusteringS2S_test": 0.0492,
555
+ "ArmenianParaphrasePC_test": 0.9126,
556
+ "BelebeleRetrieval_test": 0.1329
557
+ },
558
+ "sts_spearman": 0.461,
559
+ "sts_detailed": {
560
+ "Pearson_correlation": 0.4555,
561
+ "Spearman_correlation": 0.461
562
+ },
563
+ "retrieval_top20": 0.0326,
564
+ "retrieval_detailed": {
565
+ "top1 within document": 0.07,
566
+ "top3 within document": 0.21,
567
+ "top5 within document": 0.39,
568
+ "top20 group mean macro": 0.1787,
569
+ "top20 all": 0.0326
570
+ },
571
+ "msmarco_top10": 0.0303,
572
+ "msmarco_detailed": {
573
+ "reranking_mrr": 0.3294,
574
+ "retrieval_mrr": 0.0164,
575
+ "retrieval_top5_accuracy": 0.0223,
576
+ "retrieval_top10_accuracy": 0.0303
577
+ },
578
+ "retrieval_translit_top20": 0.0,
579
+ "retrieval_translit_detailed": {
580
+ "top1 within document": 0.01,
581
+ "top3 within document": 0.1,
582
+ "top5 within document": 0.19,
583
+ "top20 group mean macro": 0.0298,
584
+ "top20 all": 0.0
585
+ },
586
+ "msmarco_translit_top10": 0.0051,
587
+ "msmarco_translit_detailed": {
588
+ "reranking_mrr": 0.2847,
589
+ "retrieval_mrr": 0.0029,
590
+ "retrieval_top5_accuracy": 0.0038,
591
+ "retrieval_top10_accuracy": 0.0051
592
+ }
593
+ },
594
+ "model_size": "303M"
595
+ },
596
+ {
597
+ "model_name": "Metric-AI/armenian-text-embeddings-2-large",
598
+ "model_url": "https://huggingface.co/Metric-AI/armenian-text-embeddings-2-large",
599
+ "results": {
600
+ "mteb_avg": 0.7311,
601
+ "mteb_detailed": {
602
+ "FloresBitextMining_devtest": 0.2859,
603
+ "NTREXBitextMining_test": 0.9758,
604
+ "Tatoeba_test": 0.9299,
605
+ "MassiveIntentClassification_test": 0.6314,
606
+ "MassiveScenarioClassification_test": 0.6852,
607
+ "SIB200Classification_test": 0.7706,
608
+ "SIB200ClusteringS2S_test": 0.4315,
609
+ "ArmenianParaphrasePC_test": 0.9605,
610
+ "BelebeleRetrieval_test": 0.9088
611
+ },
612
+ "sts_spearman": 0.7472,
613
+ "sts_detailed": {
614
+ "Pearson_correlation": 0.7401,
615
+ "Spearman_correlation": 0.7472
616
+ },
617
+ "retrieval_top20": 0.8804,
618
+ "retrieval_detailed": {
619
+ "top1 within document": 0.5,
620
+ "top3 within document": 0.83,
621
+ "top5 within document": 0.93,
622
+ "top20 group mean macro": 0.9592,
623
+ "top20 all": 0.8804
624
+ },
625
+ "msmarco_top10": 0.8627,
626
+ "msmarco_detailed": {
627
+ "reranking_mrr": 0.563,
628
+ "retrieval_mrr": 0.4961,
629
+ "retrieval_top5_accuracy": 0.741,
630
+ "retrieval_top10_accuracy": 0.8627
631
+ },
632
+ "retrieval_translit_top20": 0.462,
633
+ "retrieval_translit_detailed": {
634
+ "top1 within document": 0.21,
635
+ "top3 within document": 0.54,
636
+ "top5 within document": 0.69,
637
+ "top20 group mean macro": 0.6623,
638
+ "top20 all": 0.462
639
+ },
640
+ "msmarco_translit_top10": 0.4609,
641
+ "msmarco_translit_detailed": {
642
+ "reranking_mrr": 0.4607,
643
+ "retrieval_mrr": 0.2335,
644
+ "retrieval_top5_accuracy": 0.3606,
645
+ "retrieval_top10_accuracy": 0.4609
646
+ }
647
+ },
648
+ "model_size": "560M"
649
+ },
650
+ {
651
+ "model_name": "gemini/gemini-embedding-001",
652
+ "model_url": "https://ai.google.dev/gemini-api/docs/embeddings",
653
+ "results": {
654
+ "mteb_avg": 0.8204,
655
+ "mteb_detailed": {
656
+ "FloresBitextMining_devtest": 0.7182,
657
+ "NTREXBitextMining_test": 0.9634,
658
+ "Tatoeba_test": 0.9043,
659
+ "MassiveIntentClassification_test": 0.7889,
660
+ "MassiveScenarioClassification_test": 0.8452,
661
+ "SIB200Classification_test": 0.7353,
662
+ "SIB200ClusteringS2S_test": 0.5165,
663
+ "ArmenianParaphrasePC_test": 0.9681,
664
+ "BelebeleRetrieval_test": 0.9434
665
+ },
666
+ "sts_spearman": 0.7455,
667
+ "sts_detailed": {
668
+ "Pearson_correlation": 0.7124,
669
+ "Spearman_correlation": 0.7455
670
+ },
671
+ "retrieval_top20": 0.663,
672
+ "retrieval_detailed": {
673
+ "top1 within document": 0.36,
674
+ "top3 within document": 0.54,
675
+ "top5 within document": 0.63,
676
+ "top20 group mean macro": 0.7533,
677
+ "top20 all": 0.663
678
+ },
679
+ "msmarco_top10": 0.8662,
680
+ "msmarco_detailed": {
681
+ "reranking_mrr": 0.5529,
682
+ "retrieval_mrr": 0.4815,
683
+ "retrieval_top5_accuracy": 0.7384,
684
+ "retrieval_top10_accuracy": 0.8662
685
+ },
686
+ "retrieval_translit_top20": 0.3315,
687
+ "retrieval_translit_detailed": {
688
+ "top1 within document": 0.2,
689
+ "top3 within document": 0.41,
690
+ "top5 within document": 0.54,
691
+ "top20 group mean macro": 0.5542,
692
+ "top20 all": 0.3315
693
+ },
694
+ "msmarco_translit_top10": 0.4139,
695
+ "msmarco_translit_detailed": {
696
+ "reranking_mrr": 0.4335,
697
+ "retrieval_mrr": 0.2017,
698
+ "retrieval_top5_accuracy": 0.3204,
699
+ "retrieval_top10_accuracy": 0.4139
700
+ }
701
+ }
702
+ },
703
+ {
704
+ "model_name": "openai/text-embedding-3-large",
705
+ "model_url": "https://developers.openai.com/api/docs/models/text-embedding-3-large",
706
+ "results": {
707
+ "mteb_avg": 0.2768,
708
+ "mteb_detailed": {
709
+ "FloresBitextMining_devtest": 0.1187,
710
+ "NTREXBitextMining_test": 0.137,
711
+ "Tatoeba_test": 0.0435,
712
+ "MassiveIntentClassification_test": 0.3318,
713
+ "MassiveScenarioClassification_test": 0.3813,
714
+ "SIB200Classification_test": 0.2908,
715
+ "SIB200ClusteringS2S_test": 0.066,
716
+ "ArmenianParaphrasePC_test": 0.9121,
717
+ "BelebeleRetrieval_test": 0.2104
718
+ },
719
+ "sts_spearman": 0.5106,
720
+ "sts_detailed": {
721
+ "Pearson_correlation": 0.5171,
722
+ "Spearman_correlation": 0.5106
723
+ },
724
+ "retrieval_top20": 0.1467,
725
+ "retrieval_detailed": {
726
+ "top1 within document": 0.13,
727
+ "top3 within document": 0.32,
728
+ "top5 within document": 0.45,
729
+ "top20 group mean macro": 0.3745,
730
+ "top20 all": 0.1467
731
+ },
732
+ "msmarco_top10": 0.2518,
733
+ "msmarco_detailed": {
734
+ "reranking_mrr": 0.3848,
735
+ "retrieval_mrr": 0.1223,
736
+ "retrieval_top5_accuracy": 0.1889,
737
+ "retrieval_top10_accuracy": 0.2518
738
+ },
739
+ "retrieval_translit_top20": 0.0435,
740
+ "retrieval_translit_detailed": {
741
+ "top1 within document": 0.06,
742
+ "top3 within document": 0.15,
743
+ "top5 within document": 0.25,
744
+ "top20 group mean macro": 0.2355,
745
+ "top20 all": 0.0435
746
+ },
747
+ "msmarco_translit_top10": 0.1328,
748
+ "msmarco_translit_detailed": {
749
+ "reranking_mrr": 0.343,
750
+ "retrieval_mrr": 0.0592,
751
+ "retrieval_top5_accuracy": 0.0959,
752
+ "retrieval_top10_accuracy": 0.1328
753
+ }
754
+ }
755
+ }
756
+ ]
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ gradio==5.19.0
+ pandas==2.2.3
+ huggingface-hub==0.28.1