# Zaruhi's picture
# Initial release
# c5f9df5
import gradio as gr
from model_handler import ModelHandler
from data_handler import (
prepare_leaderboard,
prepare_detailed_leaderboards,
prepare_translit_leaderboard,
prepare_translit_detailed,
)
# Stylesheet prepended to every table rendered by df_to_styled_html().
# All selectors target the "detailed-table" class that df_to_styled_html()
# passes to DataFrame.to_html(). Colours and borders use Gradio CSS variables
# (--background-fill-*, --body-text-color, --border-color-primary).
# NOTE(review): the bold section borders are hard-coded to columns 12, 14 and
# 19, which the CSS comments describe in terms of the MTEB/STS/Retrieval/
# MS MARCO layout; the same stylesheet is also applied to the translit
# detailed table -- confirm the borders fall where intended there.
TABLE_CSS = """
<style>
.detailed-table {
width: 100%;
border-collapse: collapse;
font-size: 14px;
margin: 10px 0;
display: table !important;
visibility: visible !important;
}
.detailed-table thead,
.detailed-table tbody,
.detailed-table tr {
display: table-row-group;
visibility: visible !important;
}
.detailed-table tr {
display: table-row !important;
}
.detailed-table thead tr th {
background-color: var(--background-fill-secondary) !important;
color: var(--body-text-color) !important;
font-weight: 600 !important;
padding: 10px 8px !important;
border: 1px solid var(--border-color-primary) !important;
text-align: center !important;
display: table-cell !important;
}
.detailed-table tbody tr td {
padding: 8px 12px !important;
text-align: center !important;
border: 1px solid var(--border-color-primary) !important;
background-color: var(--background-fill-primary) !important;
color: var(--body-text-color) !important;
display: table-cell !important;
visibility: visible !important;
}
.detailed-table tbody tr:hover td {
background-color: var(--background-fill-secondary) !important;
}
.detailed-table tbody td:first-child,
.detailed-table tbody td:nth-child(2) {
text-align: left !important;
}
/* Bold borders to separate benchmark sections */
/* MTEB | STS border (column 12: after #, Model, 9 MTEB cols) */
/* STS | Retrieval border (column 14: after 2 STS cols) */
/* Retrieval | MS MARCO border (column 19: after 5 Retrieval cols) */
.detailed-table thead tr th:nth-child(12),
.detailed-table thead tr th:nth-child(14),
.detailed-table thead tr th:nth-child(19),
.detailed-table tbody tr td:nth-child(12),
.detailed-table tbody tr td:nth-child(14),
.detailed-table tbody tr td:nth-child(19) {
border-left: 3px solid var(--body-text-color) !important;
}
</style>
"""
def df_to_styled_html(df):
    """Render ``df`` as an HTML table string preceded by ``TABLE_CSS``.

    The table is emitted without the index, with missing values shown as
    "-", and carries the "detailed-table" class. It is wrapped in a
    horizontally scrollable ``<div>``.
    """
    rendered = df.to_html(
        index=False,
        na_rep="-",
        border=1,
        classes="detailed-table",
    )
    scrollable = '<div style="overflow-x: auto;">' + rendered + "</div>"
    return TABLE_CSS + scrollable
# Global state: the most recently prepared tables. refresh_data() rebinds this
# to a dict with the keys "leaderboard", "detailed", "translit_summary" and
# "translit_detailed"; it stays empty until the first load.
global_data = {}
def _clean_model_name(name):
    """Return the plain model name held in a leaderboard "Model" cell.

    A string containing both "[" and "]" is treated as a markdown link
    ``[name](url)``: everything from the first "]" onward is dropped and any
    "[" in the remainder is removed. Any other value is converted with
    ``str()``.
    """
    if isinstance(name, str) and "[" in name and "]" in name:
        return name.split("]")[0].replace("[", "")
    return str(name)


def _extract_model_order(table):
    """Return the cleaned "Model" column of ``table`` as a list, in row order.

    Returns ``None`` when the table is empty or has no "Model" column, so the
    caller passes ``model_order=None`` to the detailed-table builders.
    """
    if table.empty or "Model" not in table.columns:
        return None
    return [_clean_model_name(name) for name in table["Model"]]


def refresh_data():
    """Reload benchmark results and rebuild all four leaderboard tables.

    Creates a fresh ``ModelHandler``, prepares the main and translit summary
    leaderboards, then builds each detailed table with ``model_order`` set to
    the model names of the matching summary leaderboard. The prepared tables
    are stored in the module-level ``global_data`` dict.

    Returns:
        A 4-tuple ``(leaderboard, detailed_html, translit_summary,
        translit_detailed_html)``: the two summary tables as returned by the
        ``prepare_*`` functions and the two detailed tables rendered through
        ``df_to_styled_html``.
    """
    global global_data
    model_handler = ModelHandler()
    df = model_handler.get_embedding_benchmark_data()
    detailed_results = model_handler.get_detailed_results()

    # Prepare main leaderboards
    leaderboard = prepare_leaderboard(df)
    translit_summary = prepare_translit_leaderboard(df)

    # Model order of each summary table, passed on to its detailed table.
    model_order = _extract_model_order(leaderboard)
    translit_model_order = _extract_model_order(translit_summary)

    global_data = {
        "leaderboard": leaderboard,
        "detailed": prepare_detailed_leaderboards(
            detailed_results, model_order=model_order
        ),
        "translit_summary": translit_summary,
        "translit_detailed": prepare_translit_detailed(
            detailed_results, model_order=translit_model_order
        ),
    }
    return (
        global_data["leaderboard"],
        df_to_styled_html(global_data["detailed"]),
        global_data["translit_summary"],
        df_to_styled_html(global_data["translit_detailed"]),
    )
def main():
    """Build the ArmBench-TextEmbed Gradio UI and start the server.

    The initial table contents come from ``refresh_data()``, the same function
    the "Refresh Data" button calls, so the first render and every later
    refresh go through one code path. ``refresh_data()`` also fills the
    module-level ``global_data`` dict.

    The app has three tabs (Leaderboard, Translit, About) plus a refresh
    button, and is served on 0.0.0.0:7860 with ``ssr_mode=False``.
    """
    # Initial load: the four values match, in order, the four widgets that the
    # refresh button updates below.
    (
        leaderboard,
        detailed_html,
        translit_summary,
        translit_detailed_html,
    ) = refresh_data()

    # NOTE(review): widget nesting below is reconstructed from the statement
    # order (logo inside the About tab, refresh button below the tabs) --
    # confirm against the intended layout.
    with gr.Blocks(title="ArmBench-TextEmbed", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# ArmBench-TextEmbed: Benchmarking Text Embedding Models on Armenian")
        gr.Markdown(
            """
Evaluating text embedding models on Armenian language tasks.
Developed by [Metric](https://metric.am/).
"""
        )
        with gr.Tabs():
            with gr.TabItem("Leaderboard"):
                gr.Markdown("## Leaderboard")
                gr.Markdown(
                    """
**Metrics:**
- **MTEB Avg**: Average score across MTEB sample for Armenian [hye] (BitextMining, Classification, Clustering, Paraphrase, Retrieval)
- **STS**: Semantic Textual Similarity (Spearman correlation)
- **Retrieval**: Armenian document retrieval (Top-20 accuracy)
- **MS MARCO**: Passage retrieval on MS MARCO Armenian (Top-10 accuracy)
"""
                )
                leaderboard_table = gr.DataFrame(
                    value=leaderboard,
                    label="Embedding Model Leaderboard",
                    datatype=["number", "markdown", "str", "number", "number", "number", "number", "number"],
                )
                with gr.Accordion("Detailed Scores", open=False):
                    gr.Markdown(
                        """
**Note:** MTEB subscores represent different datasets, while other columns (STS, Retrieval, MS MARCO)
represent different evaluation metrics within each benchmark.
"""
                    )
                    detailed_table = gr.HTML(value=detailed_html)
            with gr.TabItem("Translit"):
                gr.Markdown("## Transliterated (Latin Script) Benchmarks")
                gr.Markdown(
                    """
Evaluation on Armenian text transliterated to Latin script.
Tests model robustness to script variation.
"""
                )
                translit_summary_table = gr.DataFrame(
                    value=translit_summary,
                    label="Translit Leaderboard",
                    datatype=["number", "markdown", "str", "number", "number", "number"],
                )
                with gr.Accordion("Detailed Scores", open=False):
                    gr.Markdown(
                        """
**Note:** Subscores represent different evaluation metrics within each benchmark.
"""
                    )
                    translit_detailed_table = gr.HTML(
                        value=translit_detailed_html
                    )
            with gr.TabItem("About"):
                gr.Markdown("# About ArmBench-TextEmbed")
                gr.Markdown(
                    """
ArmBench-TextEmbed is a benchmark for evaluating text embedding models on Armenian language tasks.
## Benchmarks
- **MTEB**: Multilingual Text Embedding Benchmark tasks for Armenian [hye]
- BitextMining (Flores, NTREX, Tatoeba)
- Classification (MASSIVE Intent/Scenario, SIB200)
- Clustering (SIB200)
- Paraphrase Detection
- Retrieval (Belebele)
- **STS**: Armenian Semantic Textual Similarity (Main score: Spearman correlation)
- **Retrieval**: Armenian document retrieval (Main score: Top-20 accuracy)
- **MS MARCO**: MS MARCO passage retrieval translated to Armenian (Main score: Top-10 accuracy)
## Submission Guide
To submit your embedding model for evaluation:
1. **Evaluate your model** using our evaluation scripts at [GitHub](https://github.com/Metric-AI-Lab/ArmBench-TextEmbed)
2. **Format your results.json** with both summary and detailed metrics:
```json
{
"mteb_avg": 0.65,
"mteb_detailed": {
"FloresBitextMining_devtest": 0.12,
"NTREXBitextMining_test": 0.95,
"Tatoeba_test": 0.91,
"MassiveIntentClassification_test": 0.53,
"MassiveScenarioClassification_test": 0.58,
"SIB200Classification_test": 0.66,
"SIB200ClusteringS2S_test": 0.31,
"ArmenianParaphrasePC_test": 0.94,
"BelebeleRetrieval_test": 0.72
},
"sts_spearman": 0.70,
"sts_detailed": {
"Pearson_correlation": 0.69,
"Spearman_correlation": 0.70
},
"retrieval_top20": 0.75,
"retrieval_detailed": {
"top1 within document": 0.50,
"top3 within document": 0.76,
"top5 within document": 0.85,
"top20 group mean macro": 0.93,
"top20 all": 0.75
},
"msmarco_top10": 0.60,
"msmarco_detailed": {
"reranking_mrr": 0.56,
"retrieval_mrr": 0.46,
"retrieval_top5_accuracy": 0.68,
"retrieval_top10_accuracy": 0.60
},
"retrieval_translit_top20": 0.15,
"retrieval_translit_detailed": {
"top1 within document": 0.12,
"top3 within document": 0.22,
"top5 within document": 0.31,
"top20 group mean macro": 0.31,
"top20 all": 0.15
},
"msmarco_translit_top10": 0.15,
"msmarco_translit_detailed": {
"reranking_mrr": 0.39,
"retrieval_mrr": 0.07,
"retrieval_top5_accuracy": 0.11,
"retrieval_top10_accuracy": 0.15
}
}
```
**Note:** The `*_detailed` fields are required for the detailed scores tables. Translit fields are optional.
3. **Add the tag and results**:
- Add the `ArmBench-TextEmbed` tag to your model card
- Upload `results.json` to your model repository
4. Click "Refresh Data" to see your results on the leaderboard
## Citation
If you use this benchmark in your research, please cite:
```bibtex
@inproceedings{navasardyan2026lessismore,
title={Less is More: Adapting Text Embeddings for Low-Resource Languages with Small Scale Noisy Synthetic Data},
author={Navasardyan, Zaruhi and Bughdaryan, Spartak and Minasyan, Bagrat and Davtyan, Hrant},
booktitle={Proceedings of the Workshop on Language Models for Low-Resource Languages (LoResLM) at EACL 2026},
year={2026}
}
@misc{armbench-textembed,
title={ArmBench-TextEmbed: A Benchmark for Armenian Text Embedding Models},
year={2026},
url={https://github.com/Metric-AI-Lab/ArmBench-TextEmbed}
}
```
## Contributing
You can contribute to this benchmark in several ways:
- Provide API credits for evaluating additional API-based models
- Cite our work in your research and publications
- Contribute to the development of the benchmark itself with data or evaluation results
## About Metric
Metric is an AI Research Lab in Yerevan, Armenia. Contact: info@metric.am
*This is a non-commercial research project.*
"""
                )
                gr.Image("logo.png", width=200, show_label=False)
        refresh_button = gr.Button("Refresh Data")
        # refresh_data() returns its values in the same order as `outputs`.
        refresh_button.click(
            fn=refresh_data,
            outputs=[
                leaderboard_table,
                detailed_table,
                translit_summary_table,
                translit_detailed_table,
            ]
        )
    demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()