# ArmBench-TextEmbed leaderboard — Hugging Face Space app.
| import gradio as gr | |
| from model_handler import ModelHandler | |
| from data_handler import ( | |
| prepare_leaderboard, | |
| prepare_detailed_leaderboards, | |
| prepare_translit_leaderboard, | |
| prepare_translit_detailed, | |
| ) | |
# CSS for styled HTML tables with merged headers (uses Gradio CSS variables).
# Injected ahead of each detailed-scores table by df_to_styled_html(); the
# heavy use of !important and explicit display values guards against Gradio's
# own table resets hiding rows/cells inside gr.HTML output.
TABLE_CSS = """
<style>
.detailed-table {
    width: 100%;
    border-collapse: collapse;
    font-size: 14px;
    margin: 10px 0;
    display: table !important;
    visibility: visible !important;
}
.detailed-table thead,
.detailed-table tbody,
.detailed-table tr {
    display: table-row-group;
    visibility: visible !important;
}
.detailed-table tr {
    display: table-row !important;
}
.detailed-table thead tr th {
    background-color: var(--background-fill-secondary) !important;
    color: var(--body-text-color) !important;
    font-weight: 600 !important;
    padding: 10px 8px !important;
    border: 1px solid var(--border-color-primary) !important;
    text-align: center !important;
    display: table-cell !important;
}
.detailed-table tbody tr td {
    padding: 8px 12px !important;
    text-align: center !important;
    border: 1px solid var(--border-color-primary) !important;
    background-color: var(--background-fill-primary) !important;
    color: var(--body-text-color) !important;
    display: table-cell !important;
    visibility: visible !important;
}
.detailed-table tbody tr:hover td {
    background-color: var(--background-fill-secondary) !important;
}
.detailed-table tbody td:first-child,
.detailed-table tbody td:nth-child(2) {
    text-align: left !important;
}
/* Bold borders to separate benchmark sections */
/* MTEB | STS border (column 12: after #, Model, 9 MTEB cols) */
/* STS | Retrieval border (column 14: after 2 STS cols) */
/* Retrieval | MS MARCO border (column 19: after 5 Retrieval cols) */
.detailed-table thead tr th:nth-child(12),
.detailed-table thead tr th:nth-child(14),
.detailed-table thead tr th:nth-child(19),
.detailed-table tbody tr td:nth-child(12),
.detailed-table tbody tr td:nth-child(14),
.detailed-table tbody tr td:nth-child(19) {
    border-left: 3px solid var(--body-text-color) !important;
}
</style>
"""
def df_to_styled_html(df, css=None):
    """Render a DataFrame as a styled, horizontally scrollable HTML table.

    Args:
        df: pandas DataFrame to render.
        css: Optional CSS ``<style>`` string to prepend. Defaults to the
            module-level TABLE_CSS block, preserving the original behavior.

    Returns:
        str: the CSS block followed by the table wrapped in a scrollable div.
    """
    if css is None:
        css = TABLE_CSS
    # na_rep="-" renders missing scores as a dash instead of "NaN".
    table_html = df.to_html(classes="detailed-table", border=1, index=False, na_rep="-")
    return css + f'<div style="overflow-x: auto;">{table_html}</div>'
# Global state: most recently prepared leaderboard tables, populated by
# refresh_data() / main() and read when building the Gradio components.
global_data = {}
def _clean_model_name(name):
    """Return the bare model name, stripping a markdown link wrapper.

    "[name](url)" -> "name"; anything else is returned via str().
    """
    if isinstance(name, str) and "[" in name and "]" in name:
        return name.split("]")[0].replace("[", "")
    return str(name)


def _model_order(leaderboard_df):
    """Extract ranked model names from a leaderboard's "Model" column.

    Returns None when the leaderboard is empty or lacks a "Model" column so
    that the detailed tables fall back to their own ordering.
    """
    if leaderboard_df.empty or "Model" not in leaderboard_df.columns:
        return None
    return [_clean_model_name(name) for name in leaderboard_df["Model"]]


def refresh_data():
    """Re-fetch benchmark data and rebuild all four leaderboard views.

    Side effect: replaces the module-level ``global_data`` cache.

    Returns:
        Tuple in the order expected by the Gradio refresh callback:
        (leaderboard DataFrame, detailed HTML, translit summary DataFrame,
        translit detailed HTML).
    """
    global global_data
    model_handler = ModelHandler()
    df = model_handler.get_embedding_benchmark_data()
    detailed_results = model_handler.get_detailed_results()

    # Prepare main leaderboards.
    leaderboard = prepare_leaderboard(df)
    translit_summary = prepare_translit_leaderboard(df)

    # Keep each detailed table in the same model order as its summary table.
    global_data = {
        "leaderboard": leaderboard,
        "detailed": prepare_detailed_leaderboards(
            detailed_results, model_order=_model_order(leaderboard)
        ),
        "translit_summary": translit_summary,
        "translit_detailed": prepare_translit_detailed(
            detailed_results, model_order=_model_order(translit_summary)
        ),
    }
    return (
        global_data["leaderboard"],
        df_to_styled_html(global_data["detailed"]),
        global_data["translit_summary"],
        df_to_styled_html(global_data["translit_detailed"]),
    )
def main():
    """Build and launch the ArmBench-TextEmbed Gradio leaderboard app."""
    # The original body duplicated refresh_data() verbatim (data fetch,
    # model-order extraction, table preparation). Delegate instead: it runs
    # the identical computation and populates the global_data cache we read
    # below when constructing the components.
    refresh_data()

    with gr.Blocks(title="ArmBench-TextEmbed", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# ArmBench-TextEmbed: Benchmarking Text Embedding Models on Armenian")
        gr.Markdown(
            """
Evaluating text embedding models on Armenian language tasks.
Developed by [Metric](https://metric.am/).
"""
        )
        with gr.Tabs():
            with gr.TabItem("Leaderboard"):
                gr.Markdown("## Leaderboard")
                gr.Markdown(
                    """
**Metrics:**
- **MTEB Avg**: Average score across MTEB sample for Armenian [hye] (BitextMining, Classification, Clustering, Paraphrase, Retrieval)
- **STS**: Semantic Textual Similarity (Spearman correlation)
- **Retrieval**: Armenian document retrieval (Top-20 accuracy)
- **MS MARCO**: Passage retrieval on MS MARCO Armenian (Top-10 accuracy)
"""
                )
                leaderboard_table = gr.DataFrame(
                    value=global_data["leaderboard"],
                    label="Embedding Model Leaderboard",
                    datatype=["number", "markdown", "str", "number", "number", "number", "number", "number"],
                )
                with gr.Accordion("Detailed Scores", open=False):
                    gr.Markdown(
                        """
**Note:** MTEB subscores represent different datasets, while other columns (STS, Retrieval, MS MARCO)
represent different evaluation metrics within each benchmark.
"""
                    )
                    detailed_table = gr.HTML(value=df_to_styled_html(global_data["detailed"]))
            with gr.TabItem("Translit"):
                gr.Markdown("## Transliterated (Latin Script) Benchmarks")
                gr.Markdown(
                    """
Evaluation on Armenian text transliterated to Latin script.
Tests model robustness to script variation.
"""
                )
                translit_summary_table = gr.DataFrame(
                    value=global_data["translit_summary"],
                    label="Translit Leaderboard",
                    datatype=["number", "markdown", "str", "number", "number", "number"],
                )
                with gr.Accordion("Detailed Scores", open=False):
                    gr.Markdown(
                        """
**Note:** Subscores represent different evaluation metrics within each benchmark.
"""
                    )
                    translit_detailed_table = gr.HTML(
                        value=df_to_styled_html(global_data["translit_detailed"])
                    )
            with gr.TabItem("About"):
                gr.Markdown("# About ArmBench-TextEmbed")
                gr.Markdown(
                    """
ArmBench-TextEmbed is a benchmark for evaluating text embedding models on Armenian language tasks.
## Benchmarks
- **MTEB**: Multilingual Text Embedding Benchmark tasks for Armenian [hye]
  - BitextMining (Flores, NTREX, Tatoeba)
  - Classification (MASSIVE Intent/Scenario, SIB200)
  - Clustering (SIB200)
  - Paraphrase Detection
  - Retrieval (Belebele)
- **STS**: Armenian Semantic Textual Similarity (Main score: Spearman correlation)
- **Retrieval**: Armenian document retrieval (Main score: Top-20 accuracy)
- **MS MARCO**: MS MARCO passage retrieval translated to Armenian (Main score: Top-10 accuracy)
## Submission Guide
To submit your embedding model for evaluation:
1. **Evaluate your model** using our evaluation scripts at [GitHub](https://github.com/Metric-AI-Lab/ArmBench-TextEmbed)
2. **Format your results.json** with both summary and detailed metrics:
```json
{
  "mteb_avg": 0.65,
  "mteb_detailed": {
    "FloresBitextMining_devtest": 0.12,
    "NTREXBitextMining_test": 0.95,
    "Tatoeba_test": 0.91,
    "MassiveIntentClassification_test": 0.53,
    "MassiveScenarioClassification_test": 0.58,
    "SIB200Classification_test": 0.66,
    "SIB200ClusteringS2S_test": 0.31,
    "ArmenianParaphrasePC_test": 0.94,
    "BelebeleRetrieval_test": 0.72
  },
  "sts_spearman": 0.70,
  "sts_detailed": {
    "Pearson_correlation": 0.69,
    "Spearman_correlation": 0.70
  },
  "retrieval_top20": 0.75,
  "retrieval_detailed": {
    "top1 within document": 0.50,
    "top3 within document": 0.76,
    "top5 within document": 0.85,
    "top20 group mean macro": 0.93,
    "top20 all": 0.75
  },
  "msmarco_top10": 0.60,
  "msmarco_detailed": {
    "reranking_mrr": 0.56,
    "retrieval_mrr": 0.46,
    "retrieval_top5_accuracy": 0.68,
    "retrieval_top10_accuracy": 0.60
  },
  "retrieval_translit_top20": 0.15,
  "retrieval_translit_detailed": {
    "top1 within document": 0.12,
    "top3 within document": 0.22,
    "top5 within document": 0.31,
    "top20 group mean macro": 0.31,
    "top20 all": 0.15
  },
  "msmarco_translit_top10": 0.15,
  "msmarco_translit_detailed": {
    "reranking_mrr": 0.39,
    "retrieval_mrr": 0.07,
    "retrieval_top5_accuracy": 0.11,
    "retrieval_top10_accuracy": 0.15
  }
}
```
**Note:** The `*_detailed` fields are required for the detailed scores tables. Translit fields are optional.
3. **Add the tag and results**:
- Add the `ArmBench-TextEmbed` tag to your model card
- Upload `results.json` to your model repository
4. Click "Refresh Data" to see your results on the leaderboard
## Citation
If you use this benchmark in your research, please cite:
```bibtex
@inproceedings{navasardyan2026lessismore,
title={Less is More: Adapting Text Embeddings for Low-Resource Languages with Small Scale Noisy Synthetic Data},
author={Navasardyan, Zaruhi and Bughdaryan, Spartak and Minasyan, Bagrat and Davtyan, Hrant},
booktitle={Proceedings of the Workshop on Language Models for Low-Resource Languages (LoResLM) at EACL 2026},
year={2026}
}
@misc{armbench-textembed,
title={ArmBench-TextEmbed: A Benchmark for Armenian Text Embedding Models},
year={2026},
url={https://github.com/Metric-AI-Lab/ArmBench-TextEmbed}
}
```
## Contributing
You can contribute to this benchmark in several ways:
- Provide API credits for evaluating additional API-based models
- Cite our work in your research and publications
- Contribute to the development of the benchmark itself with data or evaluation results
## About Metric
Metric is an AI Research Lab in Yerevan, Armenia. Contact: info@metric.am
*This is a non-commercial research project.*
"""
                )
                gr.Image("logo.png", width=200, show_label=False)

        # Manual refresh: re-pulls results and updates all four tables.
        refresh_button = gr.Button("Refresh Data")
        refresh_button.click(
            fn=refresh_data,
            outputs=[
                leaderboard_table,
                detailed_table,
                translit_summary_table,
                translit_detailed_table,
            ],
        )

    # ssr_mode=False: serve the classic client-rendered app.
    demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
# Script entry point: build and launch the Gradio app.
if __name__ == "__main__":
    main()