Zaruhi committed
Commit c5f9df5 · 0 Parent(s)

Initial release

Files changed (8)
  1. .gitattributes +35 -0
  2. README.md +15 -0
  3. app.py +371 -0
  4. data_handler.py +318 -0
  5. logo.png +0 -0
  6. model_handler.py +252 -0
  7. model_results.json +756 -0
  8. requirements.txt +3 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,15 @@
+ ---
+ title: ArmBench-TextEmbed
+ emoji: 📊
+ colorFrom: blue
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 5.19.0
+ app_file: app.py
+ pinned: false
+ tags:
+ - embedding
+ - armenian
+ - benchmark
+ - mteb
+ ---
app.py ADDED
@@ -0,0 +1,371 @@
1
+ import gradio as gr
2
+ from model_handler import ModelHandler
3
+ from data_handler import (
4
+ prepare_leaderboard,
5
+ prepare_detailed_leaderboards,
6
+ prepare_translit_leaderboard,
7
+ prepare_translit_detailed,
8
+ )
9
+
10
+ # CSS for styled HTML tables with merged headers (uses Gradio CSS variables)
11
+ TABLE_CSS = """
12
+ <style>
13
+ .detailed-table {
14
+ width: 100%;
15
+ border-collapse: collapse;
16
+ font-size: 14px;
17
+ margin: 10px 0;
18
+ display: table !important;
19
+ visibility: visible !important;
20
+ }
21
+ .detailed-table thead,
22
+ .detailed-table tbody,
23
+ .detailed-table tr {
24
+ display: table-row-group;
25
+ visibility: visible !important;
26
+ }
27
+ .detailed-table tr {
28
+ display: table-row !important;
29
+ }
30
+ .detailed-table thead tr th {
31
+ background-color: var(--background-fill-secondary) !important;
32
+ color: var(--body-text-color) !important;
33
+ font-weight: 600 !important;
34
+ padding: 10px 8px !important;
35
+ border: 1px solid var(--border-color-primary) !important;
36
+ text-align: center !important;
37
+ display: table-cell !important;
38
+ }
39
+ .detailed-table tbody tr td {
40
+ padding: 8px 12px !important;
41
+ text-align: center !important;
42
+ border: 1px solid var(--border-color-primary) !important;
43
+ background-color: var(--background-fill-primary) !important;
44
+ color: var(--body-text-color) !important;
45
+ display: table-cell !important;
46
+ visibility: visible !important;
47
+ }
48
+ .detailed-table tbody tr:hover td {
49
+ background-color: var(--background-fill-secondary) !important;
50
+ }
51
+ .detailed-table tbody td:first-child,
52
+ .detailed-table tbody td:nth-child(2) {
53
+ text-align: left !important;
54
+ }
55
+ /* Bold borders to separate benchmark sections */
56
+ /* MTEB | STS border (column 12: after #, Model, 9 MTEB cols) */
57
+ /* STS | Retrieval border (column 14: after 2 STS cols) */
58
+ /* Retrieval | MS MARCO border (column 19: after 5 Retrieval cols) */
59
+ .detailed-table thead tr th:nth-child(12),
60
+ .detailed-table thead tr th:nth-child(14),
61
+ .detailed-table thead tr th:nth-child(19),
62
+ .detailed-table tbody tr td:nth-child(12),
63
+ .detailed-table tbody tr td:nth-child(14),
64
+ .detailed-table tbody tr td:nth-child(19) {
65
+ border-left: 3px solid var(--body-text-color) !important;
66
+ }
67
+ </style>
68
+ """
69
+
70
+
71
+ def df_to_styled_html(df):
72
+ """Convert DataFrame to styled HTML with CSS."""
73
+ table_html = df.to_html(classes="detailed-table", border=1, index=False, na_rep="-")
74
+ return TABLE_CSS + f'<div style="overflow-x: auto;">{table_html}</div>'
75
+
76
+ # Global state
77
+ global_data = {}
78
+
79
+
80
+ def refresh_data():
81
+ global global_data
82
+ model_handler = ModelHandler()
83
+
84
+ df = model_handler.get_embedding_benchmark_data()
85
+ detailed_results = model_handler.get_detailed_results()
86
+
87
+ # Prepare main leaderboards
88
+ leaderboard = prepare_leaderboard(df)
89
+ translit_summary = prepare_translit_leaderboard(df)
90
+
91
+ # Extract model order from main leaderboard to pass to detailed tables
92
+ model_order = None
93
+ if not leaderboard.empty and "Model" in leaderboard.columns:
94
+ # Get model names, removing markdown link if present
95
+ model_order = []
96
+ for name in leaderboard["Model"]:
97
+ # Handle markdown format [name](url) or plain text
98
+ if isinstance(name, str) and "[" in name and "]" in name:
99
+ clean_name = name.split("]")[0].replace("[", "")
100
+ else:
101
+ clean_name = str(name)
102
+ model_order.append(clean_name)
103
+
104
+ # Extract model order from translit leaderboard
105
+ translit_model_order = None
106
+ if not translit_summary.empty and "Model" in translit_summary.columns:
107
+ # Get model names, removing markdown link if present
108
+ translit_model_order = []
109
+ for name in translit_summary["Model"]:
110
+ # Handle markdown format [name](url) or plain text
111
+ if isinstance(name, str) and "[" in name and "]" in name:
112
+ clean_name = name.split("]")[0].replace("[", "")
113
+ else:
114
+ clean_name = str(name)
115
+ translit_model_order.append(clean_name)
116
+
117
+ global_data = {
118
+ "leaderboard": leaderboard,
119
+ "detailed": prepare_detailed_leaderboards(detailed_results, model_order=model_order),
120
+ "translit_summary": translit_summary,
121
+ "translit_detailed": prepare_translit_detailed(detailed_results, model_order=translit_model_order),
122
+ }
123
+
124
+ return (
125
+ global_data["leaderboard"],
126
+ df_to_styled_html(global_data["detailed"]),
127
+ global_data["translit_summary"],
128
+ df_to_styled_html(global_data["translit_detailed"]),
129
+ )
130
+
131
+
132
+ def main():
133
+ global global_data
134
+
135
+ model_handler = ModelHandler()
136
+ df = model_handler.get_embedding_benchmark_data()
137
+ detailed_results = model_handler.get_detailed_results()
138
+
139
+ # Prepare leaderboards
140
+ leaderboard = prepare_leaderboard(df)
141
+ translit_summary = prepare_translit_leaderboard(df)
142
+
143
+ # Extract model order from main leaderboard
144
+ model_order = None
145
+ if not leaderboard.empty and "Model" in leaderboard.columns:
146
+ model_order = []
147
+ for name in leaderboard["Model"]:
148
+ if isinstance(name, str) and "[" in name and "]" in name:
149
+ clean_name = name.split("]")[0].replace("[", "")
150
+ else:
151
+ clean_name = str(name)
152
+ model_order.append(clean_name)
153
+
154
+ # Extract model order from translit leaderboard
155
+ translit_model_order = None
156
+ if not translit_summary.empty and "Model" in translit_summary.columns:
157
+ translit_model_order = []
158
+ for name in translit_summary["Model"]:
159
+ if isinstance(name, str) and "[" in name and "]" in name:
160
+ clean_name = name.split("]")[0].replace("[", "")
161
+ else:
162
+ clean_name = str(name)
163
+ translit_model_order.append(clean_name)
164
+
165
+ global_data = {
166
+ "leaderboard": leaderboard,
167
+ "detailed": prepare_detailed_leaderboards(detailed_results, model_order=model_order),
168
+ "translit_summary": translit_summary,
169
+ "translit_detailed": prepare_translit_detailed(detailed_results, model_order=translit_model_order),
170
+ }
171
+
172
+ with gr.Blocks(title="ArmBench-TextEmbed", theme=gr.themes.Soft()) as demo:
173
+ gr.Markdown("# ArmBench-TextEmbed: Benchmarking Text Embedding Models on Armenian")
174
+ gr.Markdown(
175
+ """
176
+ Evaluating text embedding models on Armenian language tasks.
177
+ Developed by [Metric](https://metric.am/).
178
+ """
179
+ )
180
+
181
+ with gr.Tabs():
182
+ with gr.TabItem("Leaderboard"):
183
+ gr.Markdown("## Leaderboard")
184
+ gr.Markdown(
185
+ """
186
+ **Metrics:**
187
+ - **MTEB Avg**: Average score across the MTEB task sample for Armenian [hye] (BitextMining, Classification, Clustering, Paraphrase, Retrieval)
188
+ - **STS**: Semantic Textual Similarity (Spearman correlation)
189
+ - **Retrieval**: Armenian document retrieval (Top-20 accuracy)
190
+ - **MS MARCO**: Passage retrieval on MS MARCO Armenian (Top-10 accuracy)
191
+ """
192
+ )
193
+ leaderboard_table = gr.DataFrame(
194
+ value=global_data["leaderboard"],
195
+ label="Embedding Model Leaderboard",
196
+ datatype=["number", "markdown", "str", "number", "number", "number", "number", "number"],
197
+ )
198
+
199
+ with gr.Accordion("Detailed Scores", open=False):
200
+ gr.Markdown(
201
+ """
202
+ **Note:** MTEB subscores represent different datasets, while other columns (STS, Retrieval, MS MARCO)
203
+ represent different evaluation metrics within each benchmark.
204
+ """
205
+ )
206
+ detailed_table = gr.HTML(value=df_to_styled_html(global_data["detailed"]))
207
+
208
+ with gr.TabItem("Translit"):
209
+ gr.Markdown("## Transliterated (Latin Script) Benchmarks")
210
+ gr.Markdown(
211
+ """
212
+ Evaluation on Armenian text transliterated to Latin script.
213
+ Tests model robustness to script variation.
214
+ """
215
+ )
216
+ translit_summary_table = gr.DataFrame(
217
+ value=global_data["translit_summary"],
218
+ label="Translit Leaderboard",
219
+ datatype=["number", "markdown", "str", "number", "number", "number"],
220
+ )
221
+
222
+ with gr.Accordion("Detailed Scores", open=False):
223
+ gr.Markdown(
224
+ """
225
+ **Note:** Subscores represent different evaluation metrics within each benchmark.
226
+ """
227
+ )
228
+ translit_detailed_table = gr.HTML(
229
+ value=df_to_styled_html(global_data["translit_detailed"])
230
+ )
231
+
232
+ with gr.TabItem("About"):
233
+ gr.Markdown("# About ArmBench-TextEmbed")
234
+ gr.Markdown(
235
+ """
236
+ ArmBench-TextEmbed is a benchmark for evaluating text embedding models on Armenian language tasks.
237
+
238
+ ## Benchmarks
239
+
240
+ - **MTEB**: Multilingual Text Embedding Benchmark tasks for Armenian [hye]
241
+ - BitextMining (Flores, NTREX, Tatoeba)
242
+ - Classification (MASSIVE Intent/Scenario, SIB200)
243
+ - Clustering (SIB200)
244
+ - Paraphrase Detection
245
+ - Retrieval (Belebele)
246
+
247
+ - **STS**: Armenian Semantic Textual Similarity (Main score: Spearman correlation)
248
+
249
+ - **Retrieval**: Armenian document retrieval (Main score: Top-20 accuracy)
250
+
251
+ - **MS MARCO**: MS MARCO passage retrieval translated to Armenian (Main score: Top-10 accuracy)
252
+
253
+ ## Submission Guide
254
+
255
+ To submit your embedding model for evaluation:
256
+
257
+ 1. **Evaluate your model** using our evaluation scripts at [GitHub](https://github.com/Metric-AI-Lab/ArmBench-TextEmbed)
258
+
259
+ 2. **Format your results.json** with both summary and detailed metrics:
260
+ ```json
261
+ {
262
+ "mteb_avg": 0.65,
263
+ "mteb_detailed": {
264
+ "FloresBitextMining_devtest": 0.12,
265
+ "NTREXBitextMining_test": 0.95,
266
+ "Tatoeba_test": 0.91,
267
+ "MassiveIntentClassification_test": 0.53,
268
+ "MassiveScenarioClassification_test": 0.58,
269
+ "SIB200Classification_test": 0.66,
270
+ "SIB200ClusteringS2S_test": 0.31,
271
+ "ArmenianParaphrasePC_test": 0.94,
272
+ "BelebeleRetrieval_test": 0.72
273
+ },
274
+ "sts_spearman": 0.70,
275
+ "sts_detailed": {
276
+ "Pearson_correlation": 0.69,
277
+ "Spearman_correlation": 0.70
278
+ },
279
+ "retrieval_top20": 0.75,
280
+ "retrieval_detailed": {
281
+ "top1 within document": 0.50,
282
+ "top3 within document": 0.76,
283
+ "top5 within document": 0.85,
284
+ "top20 group mean macro": 0.93,
285
+ "top20 all": 0.75
286
+ },
287
+ "msmarco_top10": 0.60,
288
+ "msmarco_detailed": {
289
+ "reranking_mrr": 0.56,
290
+ "retrieval_mrr": 0.46,
291
+ "retrieval_top5_accuracy": 0.68,
292
+ "retrieval_top10_accuracy": 0.60
293
+ },
294
+ "retrieval_translit_top20": 0.15,
295
+ "retrieval_translit_detailed": {
296
+ "top1 within document": 0.12,
297
+ "top3 within document": 0.22,
298
+ "top5 within document": 0.31,
299
+ "top20 group mean macro": 0.31,
300
+ "top20 all": 0.15
301
+ },
302
+ "msmarco_translit_top10": 0.15,
303
+ "msmarco_translit_detailed": {
304
+ "reranking_mrr": 0.39,
305
+ "retrieval_mrr": 0.07,
306
+ "retrieval_top5_accuracy": 0.11,
307
+ "retrieval_top10_accuracy": 0.15
308
+ }
309
+ }
310
+ ```
311
+
312
+ **Note:** The `*_detailed` fields are required for the detailed scores tables. Translit fields are optional.
313
+
314
+ 3. **Add the tag and results**:
315
+ - Add the `ArmBench-TextEmbed` tag to your model card
316
+ - Upload `results.json` to your model repository
317
+
318
+ 4. Click "Refresh Data" to see your results on the leaderboard
319
+
320
+ ## Citation
321
+
322
+ If you use this benchmark in your research, please cite:
323
+
324
+ ```bibtex
325
+ @inproceedings{navasardyan2026lessismore,
326
+ title={Less is More: Adapting Text Embeddings for Low-Resource Languages with Small Scale Noisy Synthetic Data},
327
+ author={Navasardyan, Zaruhi and Bughdaryan, Spartak and Minasyan, Bagrat and Davtyan, Hrant},
328
+ booktitle={Proceedings of the Workshop on Language Models for Low-Resource Languages (LoResLM) at EACL 2026},
329
+ year={2026}
330
+ }
331
+ @misc{armbench-textembed,
332
+ title={ArmBench-TextEmbed: A Benchmark for Armenian Text Embedding Models},
333
+ year={2026},
334
+ url={https://github.com/Metric-AI-Lab/ArmBench-TextEmbed}
335
+ }
336
+ ```
337
+
338
+ ## Contributing
339
+
340
+ You can contribute to this benchmark in several ways:
341
+
342
+ - Provide API credits for evaluating additional API-based models
343
+ - Cite our work in your research and publications
344
+ - Contribute to the development of the benchmark itself with data or evaluation results
345
+
346
+ ## About Metric
347
+
348
+ Metric is an AI Research Lab in Yerevan, Armenia. Contact: info@metric.am
349
+
350
+ *This is a non-commercial research project.*
351
+ """
352
+ )
353
+
354
+ gr.Image("logo.png", width=200, show_label=False)
355
+
356
+ refresh_button = gr.Button("Refresh Data")
357
+ refresh_button.click(
358
+ fn=refresh_data,
359
+ outputs=[
360
+ leaderboard_table,
361
+ detailed_table,
362
+ translit_summary_table,
363
+ translit_detailed_table,
364
+ ]
365
+ )
366
+
367
+ demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
368
+
369
+
370
+ if __name__ == "__main__":
371
+ main()
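
Step 3 of the submission guide in the About tab above comes down to two repository operations: tagging the model card and uploading `results.json`. A minimal, hypothetical sketch with `huggingface_hub` — the repo id below is a placeholder, not part of this benchmark:

```python
from huggingface_hub import HfApi, ModelCard

REPO_ID = "your-org/your-embedding-model"  # placeholder: your existing model repo

# Add the ArmBench-TextEmbed tag to the model card so the Space's
# list_models(filter="ArmBench-TextEmbed") query can find it.
card = ModelCard.load(REPO_ID)
tags = card.data.tags or []
if "ArmBench-TextEmbed" not in tags:
    card.data.tags = tags + ["ArmBench-TextEmbed"]
    card.push_to_hub(REPO_ID)

# Upload the results.json produced by the evaluation scripts.
HfApi().upload_file(
    path_or_fileobj="results.json",
    path_in_repo="results.json",
    repo_id=REPO_ID,
    repo_type="model",
)
```

The leaderboard picks up new repos the next time "Refresh Data" is clicked in the Space.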
data_handler.py ADDED
@@ -0,0 +1,318 @@
1
+ import pandas as pd
2
+
3
+ COLUMN_LABELS = {
4
+ "model_name": "Model",
5
+ "model_size": "Size",
6
+ "mteb_avg": "MTEB",
7
+ "sts_spearman": "STS",
8
+ "retrieval_top20": "Retrieval",
9
+ "msmarco_top10": "MS MARCO",
10
+ }
11
+
12
+ TRANSLIT_COLUMN_LABELS = {
13
+ "model_name": "Model",
14
+ "model_size": "Size",
15
+ "retrieval_translit_top20": "Retrieval",
16
+ "msmarco_translit_top10": "MS MARCO",
17
+ }
18
+
19
+ # Metrics used for computing overall average (native script only)
20
+ SCORE_COLS = ["mteb_avg", "sts_spearman", "retrieval_top20", "msmarco_top10"]
21
+
22
+
23
+ def prepare_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
24
+ """Prepare embedding benchmark leaderboard from raw results DataFrame."""
25
+ if df.empty:
26
+ return df
27
+
28
+ df = df.copy()
29
+
30
+ # Format model_name as hyperlink if model_url exists
31
+ if "model_url" in df.columns:
32
+ df["model_name"] = df.apply(
33
+ lambda row: f"[{row['model_name']}]({row['model_url']})"
34
+ if pd.notna(row.get("model_url"))
35
+ else row["model_name"],
36
+ axis=1,
37
+ )
38
+
39
+ # Calculate overall average (only native script metrics, exclude translit)
40
+ available_cols = [c for c in SCORE_COLS if c in df.columns]
41
+ if available_cols:
42
+ df["average"] = df[available_cols].mean(axis=1).round(4)
43
+
44
+ # Sort by average
45
+ if "average" in df.columns:
46
+ df = df.sort_values(by="average", ascending=False).reset_index(drop=True)
47
+
48
+ df.insert(0, "Rank", range(1, len(df) + 1))
49
+
50
+ # Select only main leaderboard columns (exclude translit)
51
+ # Include model_size if available
52
+ size_col = ["model_size"] if "model_size" in df.columns else []
53
+ display_cols = ["Rank", "model_name"] + size_col + available_cols + ["average"]
54
+ df = df[[c for c in display_cols if c in df.columns]]
55
+
56
+ # Replace missing model_size with "-"
57
+ if "model_size" in df.columns:
58
+ df["model_size"] = df["model_size"].fillna("-").replace("", "-")
59
+
60
+ # Round numeric columns
61
+ df = df.round(4)
62
+
63
+ # Rename columns for display
64
+ df = df.rename(columns={**COLUMN_LABELS, "average": "Average"})
65
+
66
+ return df
67
+
68
+
69
+ def prepare_detailed_leaderboards(detailed_results: dict, model_order: list = None, use_multiindex: bool = True) -> pd.DataFrame:
70
+ """Prepare a single combined detailed leaderboard with hierarchical columns.
71
+
72
+ Args:
73
+ detailed_results: Dict with DataFrames from ModelHandler.get_detailed_results()
74
+ model_order: Optional list of model names in desired order. If provided, models will be
75
+ displayed in this order instead of being sorted independently.
76
+ use_multiindex: If True, return DataFrame with MultiIndex columns for proper
77
+ hierarchical display (merged headers in HTML/Gradio).
78
+ If False, use flat "Category | Metric" column names.
79
+
80
+ Returns:
81
+ pd.DataFrame: Combined table with dataset names as hierarchical column headers
82
+ """
83
+ # Dataset configurations: (dataset_key, dataset_label, column_mappings)
84
+ datasets = [
85
+ ("mteb", "MTEB", {
86
+ "FloresBitextMining_devtest": "Flores",
87
+ "NTREXBitextMining_test": "NTREX",
88
+ "Tatoeba_test": "Tatoeba",
89
+ "MassiveIntentClassification_test": "Intent",
90
+ "MassiveScenarioClassification_test": "Scenario",
91
+ "SIB200Classification_test": "SIB200 Cls",
92
+ "SIB200ClusteringS2S_test": "SIB200 Clust",
93
+ "ArmenianParaphrasePC_test": "Paraphrase",
94
+ "BelebeleRetrieval_test": "Belebele",
95
+ }),
96
+ ("sts", "STS", {
97
+ "Pearson_correlation": "Pearson",
98
+ "Spearman_correlation": "Spearman",
99
+ }),
100
+ ("retrieval", "Retrieval", {
101
+ "top1 within document": "Top-1 Doc",
102
+ "top3 within document": "Top-3 Doc",
103
+ "top5 within document": "Top-5 Doc",
104
+ "top20 group mean macro": "Top-20 Type",
105
+ "top20 all": "Top-20 All",
106
+ }),
107
+ ("msmarco", "MS MARCO", {
108
+ "reranking_mrr": "Rerank MRR",
109
+ "retrieval_mrr": "Retr. MRR",
110
+ "retrieval_top5_accuracy": "Top-5",
111
+ "retrieval_top10_accuracy": "Top-10",
112
+ }),
113
+ ]
114
+
115
+ # Collect all models from all datasets
116
+ all_models = set()
117
+ for key, _, _ in datasets:
118
+ df = detailed_results.get(key, pd.DataFrame())
119
+ if not df.empty and "model_name" in df.columns:
120
+ all_models.update(df["model_name"].unique())
121
+
122
+ if not all_models:
123
+ return pd.DataFrame()
124
+
125
+ # Use provided model_order if available, otherwise sort alphabetically
126
+ if model_order:
127
+ # Filter model_order to only include models that exist in detailed_results
128
+ ordered_models = [m for m in model_order if m in all_models]
129
+ # Add any remaining models not in model_order (in case they're new)
130
+ remaining = sorted([m for m in all_models if m not in ordered_models])
131
+ all_models_ordered = ordered_models + remaining
132
+ else:
133
+ all_models_ordered = sorted(all_models)
134
+
135
+ # Build combined dataframe with flat columns first
136
+ combined = pd.DataFrame({"Model": all_models_ordered})
137
+ column_tuples = [("", "Model")] # For MultiIndex: (level1, level2)
138
+
139
+ for key, label, col_map in datasets:
140
+ df = detailed_results.get(key, pd.DataFrame())
141
+ if df.empty:
142
+ continue
143
+ df = df.drop_duplicates(subset=["model_name"], keep="first")
144
+
145
+ for orig_col, new_col in col_map.items():
146
+ if orig_col in df.columns:
147
+ col_name = f"{label} | {new_col}"
148
+ column_tuples.append((label, new_col))
149
+ merged = combined.merge(
150
+ df[["model_name", orig_col]].rename(columns={"model_name": "Model", orig_col: col_name}),
151
+ on="Model",
152
+ how="left"
153
+ )
154
+ combined = merged
155
+
156
+ # Round numeric columns
157
+ combined = combined.round(4)
158
+
159
+ # If no model_order was provided, sort by first numeric column for backward compatibility
160
+ if not model_order:
161
+ numeric_cols = combined.select_dtypes(include=["number"]).columns.tolist()
162
+ if numeric_cols:
163
+ combined = combined.sort_values(by=numeric_cols[0], ascending=False, na_position="last")
164
+
165
+ # Always reset index to ensure proper row ordering
166
+ combined = combined.reset_index(drop=True)
167
+
168
+ combined.insert(0, "#", range(1, len(combined) + 1))
169
+ column_tuples.insert(0, ("", "#"))
170
+
171
+ if use_multiindex:
172
+ # Convert to MultiIndex columns for proper hierarchical display
173
+ combined.columns = pd.MultiIndex.from_tuples(column_tuples)
174
+
175
+ return combined
176
+
177
+
178
+ def prepare_translit_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
179
+ """Prepare translit summary leaderboard from raw results DataFrame."""
180
+ if df.empty:
181
+ return df
182
+
183
+ df = df.copy()
184
+
185
+ # Format model_name as hyperlink if model_url exists
186
+ if "model_url" in df.columns:
187
+ df["model_name"] = df.apply(
188
+ lambda row: f"[{row['model_name']}]({row['model_url']})"
189
+ if pd.notna(row.get("model_url"))
190
+ else row["model_name"],
191
+ axis=1,
192
+ )
193
+
194
+ # Only include translit columns
195
+ translit_cols = ["retrieval_translit_top20", "msmarco_translit_top10"]
196
+ available_cols = [c for c in translit_cols if c in df.columns]
197
+
198
+ if not available_cols:
199
+ return pd.DataFrame()
200
+
201
+ # Filter to models that have translit data
202
+ df = df.dropna(subset=available_cols, how="all")
203
+
204
+ if df.empty:
205
+ return pd.DataFrame()
206
+
207
+ # Calculate average
208
+ df["average"] = df[available_cols].mean(axis=1).round(4)
209
+
210
+ # Sort by average
211
+ df = df.sort_values(by="average", ascending=False).reset_index(drop=True)
212
+ df.insert(0, "Rank", range(1, len(df) + 1))
213
+
214
+ # Select columns - include model_size if available
215
+ size_col = ["model_size"] if "model_size" in df.columns else []
216
+ display_cols = ["Rank", "model_name"] + size_col + available_cols + ["average"]
217
+ df = df[[c for c in display_cols if c in df.columns]].round(4)
218
+
219
+ # Replace missing model_size with "-" if it's in the data
220
+ if "model_size" in df.columns:
221
+ df["model_size"] = df["model_size"].fillna("-").replace("", "-")
222
+
223
+ df = df.rename(columns={**TRANSLIT_COLUMN_LABELS, "average": "Average"})
224
+
225
+ return df
226
+
227
+
228
+ def prepare_translit_detailed(detailed_results: dict, model_order: list = None, use_multiindex: bool = True) -> pd.DataFrame:
229
+ """Prepare a single combined translit detailed leaderboard with hierarchical columns.
230
+
231
+ Args:
232
+ detailed_results: Dict with 'retrieval_translit' and 'msmarco_translit' DataFrames
233
+ model_order: Optional list of model names in desired order. If provided, models will be
234
+ displayed in this order instead of being sorted independently.
235
+ use_multiindex: If True, return DataFrame with MultiIndex columns for proper
236
+ hierarchical display (merged headers in HTML/Gradio).
237
+ If False, use flat "Category | Metric" column names.
238
+
239
+ Returns:
240
+ pd.DataFrame: Combined table with dataset names as hierarchical column headers
241
+ """
242
+ datasets = [
243
+ ("retrieval_translit", "Retrieval", {
244
+ "top1 within document": "Top-1 Doc",
245
+ "top3 within document": "Top-3 Doc",
246
+ "top5 within document": "Top-5 Doc",
247
+ "top20 group mean macro": "Top-20 Type",
248
+ "top20 all": "Top-20 All",
249
+ }),
250
+ ("msmarco_translit", "MS MARCO", {
251
+ "reranking_mrr": "Rerank MRR",
252
+ "retrieval_mrr": "Retr. MRR",
253
+ "retrieval_top5_accuracy": "Top-5",
254
+ "retrieval_top10_accuracy": "Top-10",
255
+ }),
256
+ ]
257
+
258
+ # Collect all models from all datasets
259
+ all_models = set()
260
+ for key, _, _ in datasets:
261
+ df = detailed_results.get(key, pd.DataFrame())
262
+ if not df.empty and "model_name" in df.columns:
263
+ all_models.update(df["model_name"].unique())
264
+
265
+ if not all_models:
266
+ return pd.DataFrame()
267
+
268
+ # Use provided model_order if available, otherwise sort alphabetically
269
+ if model_order:
270
+ # Filter model_order to only include models that exist in detailed_results
271
+ ordered_models = [m for m in model_order if m in all_models]
272
+ # Add any remaining models not in model_order (in case they're new)
273
+ remaining = sorted([m for m in all_models if m not in ordered_models])
274
+ all_models_ordered = ordered_models + remaining
275
+ else:
276
+ all_models_ordered = sorted(all_models)
277
+
278
+ # Build combined dataframe
279
+ combined = pd.DataFrame({"Model": all_models_ordered})
280
+ column_tuples = [("", "Model")] # For MultiIndex: (level1, level2)
281
+
282
+ for key, label, col_map in datasets:
283
+ df = detailed_results.get(key, pd.DataFrame())
284
+ if df.empty:
285
+ continue
286
+ df = df.drop_duplicates(subset=["model_name"], keep="first")
287
+
288
+ for orig_col, new_col in col_map.items():
289
+ if orig_col in df.columns:
290
+ col_name = f"{label} | {new_col}"
291
+ column_tuples.append((label, new_col))
292
+ merged = combined.merge(
293
+ df[["model_name", orig_col]].rename(columns={"model_name": "Model", orig_col: col_name}),
294
+ on="Model",
295
+ how="left"
296
+ )
297
+ combined = merged
298
+
299
+ # Round numeric columns
300
+ combined = combined.round(4)
301
+
302
+ # If no model_order was provided, sort by first numeric column for backward compatibility
303
+ if not model_order:
304
+ numeric_cols = combined.select_dtypes(include=["number"]).columns.tolist()
305
+ if numeric_cols:
306
+ combined = combined.sort_values(by=numeric_cols[0], ascending=False, na_position="last")
307
+
308
+ # Always reset index to ensure proper row ordering
309
+ combined = combined.reset_index(drop=True)
310
+
311
+ combined.insert(0, "#", range(1, len(combined) + 1))
312
+ column_tuples.insert(0, ("", "#"))
313
+
314
+ if use_multiindex:
315
+ # Convert to MultiIndex columns for proper hierarchical display
316
+ combined.columns = pd.MultiIndex.from_tuples(column_tuples)
317
+
318
+ return combined
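
As a worked example of the `Average` column that `prepare_leaderboard` computes above (the mean of the four native-script scores in `SCORE_COLS`), using the Alibaba-NLP/gte-multilingual-base entry from `model_results.json` below:

```python
# Average = mean of mteb_avg, sts_spearman, retrieval_top20, msmarco_top10
scores = [0.7337, 0.6869, 0.8315, 0.7171]
average = round(sum(scores) / len(scores), 4)  # 0.7423
```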
logo.png ADDED
model_handler.py ADDED
@@ -0,0 +1,252 @@
1
+ import json
2
+ import os
3
+ import struct
4
+ from typing import Dict, List
5
+
6
+ import pandas as pd
7
+ import requests
8
+ from huggingface_hub import HfApi, hf_hub_download
9
+
10
+ # Required metrics for embedding evaluation
11
+ REQUIRED_METRICS = [
12
+ "mteb_avg",
13
+ "sts_spearman",
14
+ "retrieval_top20",
15
+ "msmarco_top10",
16
+ ]
17
+
18
+
19
+ def format_params(num_params):
20
+ """Format parameter count as human-readable string."""
21
+ if num_params >= 1e9:
22
+ return f"{num_params / 1e9:.1f}B"
23
+ else:
24
+ return f"{num_params / 1e6:.0f}M"
25
+
26
+
27
+ def get_model_url(model_name):
28
+ """Get the model URL from HuggingFace."""
29
+ return f"https://huggingface.co/{model_name}"
30
+
31
+
32
+ def get_model_size(model_name):
33
+ """Fetch model size from HuggingFace API."""
34
+ try:
35
+ url = f"https://huggingface.co/api/models/{model_name}"
36
+ response = requests.get(url, timeout=10)
37
+ if response.status_code == 200:
38
+ data = response.json()
39
+ # Get safetensors size first, fall back to general parameters
40
+ safetensors = data.get("safetensors")
41
+ if safetensors and "total" in safetensors:
42
+ num_params = safetensors["total"]
43
+ return format_params(num_params)
44
+
45
+ num_params = data.get("num_parameters")
46
+ if num_params:
47
+ return format_params(num_params)
48
+
49
+ # Fallback: read actual param count from safetensors header
50
+ num_params = get_params_from_safetensors(model_name)
51
+ if num_params:
52
+ return format_params(num_params)
53
+
54
+ return None
55
+ except Exception as e:
56
+ print(f"Error fetching size for {model_name}: {e}")
57
+ return None
58
+
59
+
60
+ def get_params_from_safetensors(model_name):
61
+ """Read safetensors header to get actual parameter count."""
62
+ try:
63
+ tree_url = f"https://huggingface.co/api/models/{model_name}/tree/main"
64
+ resp = requests.get(tree_url, timeout=10)
65
+ if resp.status_code != 200:
66
+ return None
67
+
68
+ files = resp.json()
69
+ safetensor_files = [f for f in files if f.get("path", "").endswith(".safetensors")]
70
+ if not safetensor_files:
71
+ return None
72
+
73
+ total_params = 0
74
+
75
+ for sf in safetensor_files:
76
+ file_url = f"https://huggingface.co/{model_name}/resolve/main/{sf['path']}"
77
+
78
+ # Get header size (first 8 bytes)
79
+ headers = {"Range": "bytes=0-7"}
80
+ resp = requests.get(file_url, headers=headers, timeout=10, allow_redirects=True)
81
+ if resp.status_code != 206 or len(resp.content) < 8:
82
+ return None # Likely gated model
83
+
84
+ header_size = struct.unpack("<Q", resp.content[:8])[0]
85
+
86
+ # Get header JSON
87
+ headers = {"Range": f"bytes=8-{8 + header_size - 1}"}
88
+ resp = requests.get(file_url, headers=headers, timeout=10, allow_redirects=True)
89
+ metadata = resp.json()
90
+
91
+ # Calculate params from tensor shapes
92
+ for key, info in metadata.items():
93
+ if key == "__metadata__":
94
+ continue
95
+ shape = info.get("shape", [])
96
+ params = 1
97
+ for dim in shape:
98
+ params *= dim
99
+ total_params += params
100
+
101
+ return total_params
102
+ except Exception:
103
+ return None
104
+
105
+
106
+ class ModelHandler:
107
+ def __init__(self, model_infos_path="model_results.json"):
108
+ self.api = HfApi()
109
+ self.model_infos_path = model_infos_path
110
+ self.model_infos = self._load_model_infos()
111
+
112
+ def _load_model_infos(self) -> List:
113
+ if os.path.exists(self.model_infos_path):
114
+ with open(self.model_infos_path) as f:
115
+ return json.load(f)
116
+ return []
117
+
118
+ def _save_model_infos(self):
119
+ print("Saving model infos")
120
+ with open(self.model_infos_path, "w") as f:
121
+ json.dump(self.model_infos, f, indent=4)
122
+
123
+ def get_embedding_benchmark_data(self) -> pd.DataFrame:
124
+ """Fetch embedding benchmark results from HuggingFace models with ArmBench-TextEmbed tag."""
125
+ # Try to fetch new models from HuggingFace, but gracefully handle network errors
126
+ try:
127
+ models = self.api.list_models(filter="ArmBench-TextEmbed")
128
+ model_names = {model["model_name"] for model in self.model_infos}
129
+ repositories = [model.modelId for model in models]
130
+
131
+ for repo_id in repositories:
132
+ try:
133
+ files = [f for f in self.api.list_repo_files(repo_id) if f == "results.json"]
134
+ if not files:
135
+ continue
136
+
137
+ model_name = repo_id
138
+ if model_name not in model_names:
139
+ result_path = hf_hub_download(repo_id, filename="results.json")
140
+ with open(result_path) as f:
141
+ results = json.load(f)
142
+
143
+ # Build model entry with metadata
144
+ entry = {
145
+ "model_name": model_name,
146
+ "results": results
147
+ }
148
+
149
+ # Add model_url if not in results
150
+ if "model_url" not in results:
151
+ entry["model_url"] = get_model_url(model_name)
152
+
153
+ # Add model_size if not in results
154
+ if "model_size" not in results:
155
+ model_size = get_model_size(model_name)
156
+ if model_size:
157
+ entry["model_size"] = model_size
158
+
159
+ self.model_infos.append(entry)
160
+ except Exception as e:
161
+ print(f"Error loading {repo_id} - {e}")
162
+ continue
163
+
164
+ self._save_model_infos()
165
+ except Exception as e:
166
+ print(f"Failed to fetch from HuggingFace: {e}. Using local data.")
167
+
168
+ # Build dataframe from results
169
+ data = []
170
+ for model in self.model_infos:
171
+ model_name = model["model_name"]
172
+ results = model.get("results", {})
173
+
174
+ row = {"model_name": model_name}
175
+
176
+ # Extract model metadata
177
+ if "model_url" in model:
178
+ row["model_url"] = model["model_url"]
179
+ if "model_size" in model:
180
+ row["model_size"] = model["model_size"]
181
+
182
+ # Extract key metrics
183
+ if "mteb_avg" in results:
184
+ row["mteb_avg"] = results["mteb_avg"]
185
+ if "sts_spearman" in results:
186
+ row["sts_spearman"] = results["sts_spearman"]
187
+ if "retrieval_top20" in results:
188
+ row["retrieval_top20"] = results["retrieval_top20"]
189
+ if "retrieval_translit_top20" in results:
190
+ row["retrieval_translit_top20"] = results["retrieval_translit_top20"]
191
+ if "msmarco_top10" in results:
192
+ row["msmarco_top10"] = results["msmarco_top10"]
193
+ if "msmarco_translit_top10" in results:
194
+ row["msmarco_translit_top10"] = results["msmarco_translit_top10"]
195
+
196
+ # Only add if at least one metric is present
197
+ if len(row) > 1:
198
+ data.append(row)
199
+
200
+ return pd.DataFrame(data)
201
+
202
+ def get_detailed_results(self) -> Dict:
203
+ """Get all detailed results for MTEB, MS MARCO, STS, Retrieval, and translit benchmarks."""
204
+ mteb_data = []
205
+ msmarco_data = []
206
+ sts_data = []
207
+ retrieval_data = []
208
+ retrieval_translit_data = []
209
+ msmarco_translit_data = []
210
+
211
+ for model in self.model_infos:
212
+ model_name = model["model_name"]
213
+ results = model.get("results", {})
214
+
215
+ # MTEB detailed
216
+ if "mteb_detailed" in results:
217
+ row = {"model_name": model_name, **results["mteb_detailed"]}
218
+ mteb_data.append(row)
219
+
220
+ # MS MARCO detailed
221
+ if "msmarco_detailed" in results:
222
+ row = {"model_name": model_name, **results["msmarco_detailed"]}
223
+ msmarco_data.append(row)
224
+
225
+ # STS detailed
226
+ if "sts_detailed" in results:
227
+ row = {"model_name": model_name, **results["sts_detailed"]}
228
+ sts_data.append(row)
229
+
230
+ # Retrieval detailed
231
+ if "retrieval_detailed" in results:
232
+ row = {"model_name": model_name, **results["retrieval_detailed"]}
233
+ retrieval_data.append(row)
234
+
235
+ # Retrieval translit detailed
236
+ if "retrieval_translit_detailed" in results:
237
+ row = {"model_name": model_name, **results["retrieval_translit_detailed"]}
238
+ retrieval_translit_data.append(row)
239
+
240
+ # MS MARCO translit detailed
241
+ if "msmarco_translit_detailed" in results:
242
+ row = {"model_name": model_name, **results["msmarco_translit_detailed"]}
243
+ msmarco_translit_data.append(row)
244
+
245
+ return {
246
+ "mteb": pd.DataFrame(mteb_data) if mteb_data else pd.DataFrame(),
247
+ "msmarco": pd.DataFrame(msmarco_data) if msmarco_data else pd.DataFrame(),
248
+ "sts": pd.DataFrame(sts_data) if sts_data else pd.DataFrame(),
249
+ "retrieval": pd.DataFrame(retrieval_data) if retrieval_data else pd.DataFrame(),
250
+ "retrieval_translit": pd.DataFrame(retrieval_translit_data) if retrieval_translit_data else pd.DataFrame(),
251
+ "msmarco_translit": pd.DataFrame(msmarco_translit_data) if msmarco_translit_data else pd.DataFrame(),
252
+ }
model_results.json ADDED
@@ -0,0 +1,756 @@
1
+ [
2
+ {
3
+ "model_name": "Alibaba-NLP/gte-multilingual-base",
4
+ "model_url": "https://huggingface.co/Alibaba-NLP/gte-multilingual-base",
5
+ "results": {
6
+ "mteb_avg": 0.7337,
7
+ "mteb_detailed": {
8
+ "FloresBitextMining_devtest": 0.8919,
9
+ "NTREXBitextMining_test": 0.9495,
10
+ "Tatoeba_test": 0.8041,
11
+ "MassiveIntentClassification_test": 0.5091,
12
+ "MassiveScenarioClassification_test": 0.5719,
13
+ "SIB200Classification_test": 0.7549,
14
+ "SIB200ClusteringS2S_test": 0.3677,
15
+ "ArmenianParaphrasePC_test": 0.9453,
16
+ "BelebeleRetrieval_test": 0.8093
17
+ },
18
+ "sts_spearman": 0.6869,
19
+ "sts_detailed": {
20
+ "Pearson_correlation": 0.6815,
21
+ "Spearman_correlation": 0.6869
22
+ },
23
+ "retrieval_top20": 0.8315,
24
+ "retrieval_detailed": {
25
+ "top1 within document": 0.49,
26
+ "top3 within document": 0.76,
27
+ "top5 within document": 0.87,
28
+ "top20 group mean macro": 0.923,
29
+ "top20 all": 0.8315
30
+ },
31
+ "msmarco_top10": 0.7171,
32
+ "msmarco_detailed": {
33
+ "reranking_mrr": 0.5337,
34
+ "retrieval_mrr": 0.4098,
35
+ "retrieval_top5_accuracy": 0.6077,
36
+ "retrieval_top10_accuracy": 0.7171
37
+ },
38
+ "retrieval_translit_top20": 0.2772,
39
+ "retrieval_translit_detailed": {
40
+ "top1 within document": 0.15,
41
+ "top3 within document": 0.3,
42
+ "top5 within document": 0.49,
43
+ "top20 group mean macro": 0.5048,
44
+ "top20 all": 0.2772
45
+ },
46
+ "msmarco_translit_top10": 0.2088,
47
+ "msmarco_translit_detailed": {
48
+ "reranking_mrr": 0.3808,
49
+ "retrieval_mrr": 0.1007,
50
+ "retrieval_top5_accuracy": 0.1566,
51
+ "retrieval_top10_accuracy": 0.2088
52
+ }
53
+ },
54
+ "model_size": "305M"
55
+ },
56
+ {
57
+ "model_name": "Qwen/Qwen3-Embedding-0.6B",
58
+ "model_url": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B",
59
+ "results": {
60
+ "mteb_avg": 0.5241,
61
+ "mteb_detailed": {
62
+ "FloresBitextMining_devtest": 0.14,
63
+ "NTREXBitextMining_test": 0.7315,
64
+ "Tatoeba_test": 0.4621,
65
+ "MassiveIntentClassification_test": 0.468,
66
+ "MassiveScenarioClassification_test": 0.5255,
67
+ "SIB200Classification_test": 0.5196,
68
+ "SIB200ClusteringS2S_test": 0.2077,
69
+ "ArmenianParaphrasePC_test": 0.9292,
70
+ "BelebeleRetrieval_test": 0.7332
71
+ },
72
+ "sts_spearman": 0.6532,
73
+ "sts_detailed": {
74
+ "Pearson_correlation": 0.6502,
75
+ "Spearman_correlation": 0.6532
76
+ },
77
+ "retrieval_top20": 0.5163,
78
+ "retrieval_detailed": {
79
+ "top1 within document": 0.26,
80
+ "top3 within document": 0.44,
81
+ "top5 within document": 0.59,
82
+ "top20 group mean macro": 0.704,
83
+ "top20 all": 0.5163
84
+ },
85
+ "msmarco_top10": 0.6929,
86
+ "msmarco_detailed": {
87
+ "reranking_mrr": 0.5007,
88
+ "retrieval_mrr": 0.3783,
89
+ "retrieval_top5_accuracy": 0.5721,
90
+ "retrieval_top10_accuracy": 0.6929
91
+ },
92
+ "retrieval_translit_top20": 0.1957,
93
+ "retrieval_translit_detailed": {
94
+ "top1 within document": 0.14,
95
+ "top3 within document": 0.31,
96
+ "top5 within document": 0.49,
97
+ "top20 group mean macro": 0.4581,
98
+ "top20 all": 0.1957
99
+ },
100
+ "msmarco_translit_top10": 0.2655,
101
+ "msmarco_translit_detailed": {
102
+ "reranking_mrr": 0.4071,
103
+ "retrieval_mrr": 0.1283,
104
+ "retrieval_top5_accuracy": 0.2006,
105
+ "retrieval_top10_accuracy": 0.2655
106
+ }
107
+ },
108
+ "model_size": "596M"
109
+ },
110
+ {
111
+ "model_name": "Qwen/Qwen3-Embedding-8B",
112
+ "model_url": "https://huggingface.co/Qwen/Qwen3-Embedding-8B",
113
+ "results": {
114
+ "mteb_avg": 0.7538,
115
+ "mteb_detailed": {
116
+ "FloresBitextMining_devtest": 0.4617,
117
+ "NTREXBitextMining_test": 0.9633,
118
+ "Tatoeba_test": 0.8796,
119
+ "MassiveIntentClassification_test": 0.6594,
120
+ "MassiveScenarioClassification_test": 0.6922,
121
+ "SIB200Classification_test": 0.774,
122
+ "SIB200ClusteringS2S_test": 0.4455,
123
+ "ArmenianParaphrasePC_test": 0.9556,
124
+ "BelebeleRetrieval_test": 0.953
125
+ },
126
+ "sts_spearman": 0.7338,
127
+ "sts_detailed": {
128
+ "Pearson_correlation": 0.7258,
129
+ "Spearman_correlation": 0.7338
130
+ },
131
+ "retrieval_top20": 0.7011,
132
+ "retrieval_detailed": {
133
+ "top1 within document": 0.36,
134
+ "top3 within document": 0.59,
135
+ "top5 within document": 0.69,
136
+ "top20 group mean macro": 0.8125,
137
+ "top20 all": 0.7011
138
+ },
139
+ "msmarco_top10": 0.838,
140
+ "msmarco_detailed": {
141
+ "reranking_mrr": 0.542,
142
+ "retrieval_mrr": 0.4638,
143
+ "retrieval_top5_accuracy": 0.7051,
144
+ "retrieval_top10_accuracy": 0.838
145
+ },
146
+ "retrieval_translit_top20": 0.2717,
147
+ "retrieval_translit_detailed": {
148
+ "top1 within document": 0.19,
149
+ "top3 within document": 0.38,
150
+ "top5 within document": 0.47,
151
+ "top20 group mean macro": 0.446,
152
+ "top20 all": 0.2717
153
+ },
154
+ "msmarco_translit_top10": 0.3182,
155
+ "msmarco_translit_detailed": {
156
+ "reranking_mrr": 0.3829,
157
+ "retrieval_mrr": 0.1491,
158
+ "retrieval_top5_accuracy": 0.2366,
159
+ "retrieval_top10_accuracy": 0.3182
160
+ }
161
+ },
162
+ "model_size": "7.6B"
163
+ },
164
+ {
165
+ "model_name": "Metric-AI/armenian-text-embeddings-1",
166
+ "model_url": "https://huggingface.co/Metric-AI/armenian-text-embeddings-1",
167
+ "results": {
168
+ "mteb_avg": 0.6923,
169
+ "mteb_detailed": {
170
+ "FloresBitextMining_devtest": 0.0589,
171
+ "NTREXBitextMining_test": 0.9387,
172
+ "Tatoeba_test": 0.904,
173
+ "MassiveIntentClassification_test": 0.612,
174
+ "MassiveScenarioClassification_test": 0.6608,
175
+ "SIB200Classification_test": 0.7971,
176
+ "SIB200ClusteringS2S_test": 0.4593,
177
+ "ArmenianParaphrasePC_test": 0.9552,
178
+ "BelebeleRetrieval_test": 0.8447
179
+ },
180
+ "sts_spearman": 0.7057,
181
+ "sts_detailed": {
182
+ "Pearson_correlation": 0.6882,
183
+ "Spearman_correlation": 0.7057
184
+ },
185
+ "retrieval_top20": 0.8261,
186
+ "retrieval_detailed": {
187
+ "top1 within document": 0.4,
188
+ "top3 within document": 0.78,
189
+ "top5 within document": 0.82,
190
+ "top20 group mean macro": 0.9475,
191
+ "top20 all": 0.8261
192
+ },
193
+ "msmarco_top10": 0.7364,
194
+ "msmarco_detailed": {
195
+ "reranking_mrr": 0.5614,
196
+ "retrieval_mrr": 0.4279,
197
+ "retrieval_top5_accuracy": 0.6251,
198
+ "retrieval_top10_accuracy": 0.7364
199
+ },
200
+ "retrieval_translit_top20": 0.1033,
201
+ "retrieval_translit_detailed": {
202
+ "top1 within document": 0.09,
203
+ "top3 within document": 0.24,
204
+ "top5 within document": 0.36,
205
+ "top20 group mean macro": 0.3149,
206
+ "top20 all": 0.1033
207
+ },
208
+ "msmarco_translit_top10": 0.1053,
209
+ "msmarco_translit_detailed": {
210
+ "reranking_mrr": 0.3532,
211
+ "retrieval_mrr": 0.0516,
212
+ "retrieval_top5_accuracy": 0.0776,
213
+ "retrieval_top10_accuracy": 0.1053
214
+ }
215
+ },
216
+ "model_size": "278M"
217
+ },
218
+ {
219
+ "model_name": "Qwen/Qwen3-Embedding-4B",
220
+ "model_url": "https://huggingface.co/Qwen/Qwen3-Embedding-4B",
221
+ "results": {
222
+ "mteb_avg": 0.7039,
223
+ "mteb_detailed": {
224
+ "FloresBitextMining_devtest": 0.3528,
225
+ "NTREXBitextMining_test": 0.937,
226
+ "Tatoeba_test": 0.8123,
227
+ "MassiveIntentClassification_test": 0.611,
228
+ "MassiveScenarioClassification_test": 0.6534,
229
+ "SIB200Classification_test": 0.7426,
230
+ "SIB200ClusteringS2S_test": 0.395,
231
+ "ArmenianParaphrasePC_test": 0.9487,
232
+ "BelebeleRetrieval_test": 0.8827
233
+ },
234
+ "sts_spearman": 0.7013,
235
+ "sts_detailed": {
236
+ "Pearson_correlation": 0.6939,
237
+ "Spearman_correlation": 0.7013
238
+ },
239
+ "retrieval_top20": 0.6848,
240
+ "retrieval_detailed": {
241
+ "top1 within document": 0.35,
242
+ "top3 within document": 0.63,
243
+ "top5 within document": 0.74,
244
+ "top20 group mean macro": 0.8291,
245
+ "top20 all": 0.6848
246
+ },
247
+ "msmarco_top10": 0.8465,
248
+ "msmarco_detailed": {
249
+ "reranking_mrr": 0.5568,
250
+ "retrieval_mrr": 0.4848,
251
+ "retrieval_top5_accuracy": 0.7258,
252
+ "retrieval_top10_accuracy": 0.8465
253
+ },
254
+ "retrieval_translit_top20": 0.337,
255
+ "retrieval_translit_detailed": {
256
+ "top1 within document": 0.15,
257
+ "top3 within document": 0.44,
258
+ "top5 within document": 0.6,
259
+ "top20 group mean macro": 0.5185,
260
+ "top20 all": 0.337
261
+ },
262
+ "msmarco_translit_top10": 0.3943,
263
+ "msmarco_translit_detailed": {
264
+ "reranking_mrr": 0.4203,
265
+ "retrieval_mrr": 0.1926,
266
+ "retrieval_top5_accuracy": 0.3002,
267
+ "retrieval_top10_accuracy": 0.3943
268
+ }
269
+ },
270
+ "model_size": "4.0B"
271
+ },
272
+ {
273
+ "model_name": "Metric-AI/armenian-text-embeddings-2-base",
274
+ "model_url": "https://huggingface.co/Metric-AI/armenian-text-embeddings-2-base",
275
+ "results": {
276
+ "mteb_avg": 0.6903,
277
+ "mteb_detailed": {
278
+ "FloresBitextMining_devtest": 0.1119,
279
+ "NTREXBitextMining_test": 0.9626,
280
+ "Tatoeba_test": 0.9221,
281
+ "MassiveIntentClassification_test": 0.59,
282
+ "MassiveScenarioClassification_test": 0.6393,
283
+ "SIB200Classification_test": 0.7529,
284
+ "SIB200ClusteringS2S_test": 0.3963,
285
+ "ArmenianParaphrasePC_test": 0.9516,
286
+ "BelebeleRetrieval_test": 0.8857
287
+ },
288
+ "sts_spearman": 0.7055,
289
+ "sts_detailed": {
290
+ "Pearson_correlation": 0.6959,
291
+ "Spearman_correlation": 0.7055
292
+ },
293
+ "retrieval_top20": 0.8587,
294
+ "retrieval_detailed": {
295
+ "top1 within document": 0.51,
296
+ "top3 within document": 0.75,
297
+ "top5 within document": 0.86,
298
+ "top20 group mean macro": 0.9538,
299
+ "top20 all": 0.8587
300
+ },
301
+ "msmarco_top10": 0.8135,
302
+ "msmarco_detailed": {
303
+ "reranking_mrr": 0.565,
304
+ "retrieval_mrr": 0.4732,
305
+ "retrieval_top5_accuracy": 0.7035,
306
+ "retrieval_top10_accuracy": 0.8135
307
+ },
308
+ "retrieval_translit_top20": 0.288,
309
+ "retrieval_translit_detailed": {
310
+ "top1 within document": 0.13,
311
+ "top3 within document": 0.33,
312
+ "top5 within document": 0.45,
313
+ "top20 group mean macro": 0.5038,
314
+ "top20 all": 0.288
315
+ },
316
+ "msmarco_translit_top10": 0.2693,
317
+ "msmarco_translit_detailed": {
318
+ "reranking_mrr": 0.4308,
319
+ "retrieval_mrr": 0.1371,
320
+ "retrieval_top5_accuracy": 0.2082,
321
+ "retrieval_top10_accuracy": 0.2693
322
+ }
323
+ },
324
+ "model_size": "278M"
325
+ },
326
+ {
327
+ "model_name": "intfloat/multilingual-e5-large",
328
+ "model_url": "https://huggingface.co/intfloat/multilingual-e5-large",
329
+ "results": {
330
+ "mteb_avg": 0.6678,
331
+ "mteb_detailed": {
332
+ "FloresBitextMining_devtest": 0.2418,
333
+ "NTREXBitextMining_test": 0.9719,
334
+ "Tatoeba_test": 0.9279,
335
+ "MassiveIntentClassification_test": 0.5499,
336
+ "MassiveScenarioClassification_test": 0.5975,
337
+ "SIB200Classification_test": 0.6676,
338
+ "SIB200ClusteringS2S_test": 0.3292,
339
+ "ArmenianParaphrasePC_test": 0.9541,
340
+ "BelebeleRetrieval_test": 0.7704
341
+ },
342
+ "sts_spearman": 0.6973,
343
+ "sts_detailed": {
344
+ "Pearson_correlation": 0.689,
345
+ "Spearman_correlation": 0.6973
346
+ },
347
+ "retrieval_top20": 0.7663,
348
+ "retrieval_detailed": {
349
+ "top1 within document": 0.52,
350
+ "top3 within document": 0.72,
351
+ "top5 within document": 0.83,
352
+ "top20 group mean macro": 0.8751,
353
+ "top20 all": 0.7663
354
+ },
355
+ "msmarco_top10": 0.7298,
356
+ "msmarco_detailed": {
357
+ "reranking_mrr": 0.5609,
358
+ "retrieval_mrr": 0.4306,
359
+ "retrieval_top5_accuracy": 0.6282,
360
+ "retrieval_top10_accuracy": 0.7298
361
+ },
362
+ "retrieval_translit_top20": 0.125,
363
+ "retrieval_translit_detailed": {
364
+ "top1 within document": 0.09,
365
+ "top3 within document": 0.27,
366
+ "top5 within document": 0.41,
367
+ "top20 group mean macro": 0.3187,
368
+ "top20 all": 0.125
369
+ },
370
+ "msmarco_translit_top10": 0.1202,
371
+ "msmarco_translit_detailed": {
372
+ "reranking_mrr": 0.3551,
373
+ "retrieval_mrr": 0.0608,
374
+ "retrieval_top5_accuracy": 0.0902,
375
+ "retrieval_top10_accuracy": 0.1202
376
+ }
377
+ },
378
+ "model_size": "560M"
379
+ },
380
+ {
381
+ "model_name": "Snowflake/snowflake-arctic-embed-m-v2.0",
382
+ "model_url": "https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v2.0",
383
+ "results": {
384
+ "mteb_avg": 0.594,
385
+ "mteb_detailed": {
386
+ "FloresBitextMining_devtest": 0.0154,
387
+ "NTREXBitextMining_test": 0.8091,
388
+ "Tatoeba_test": 0.6328,
389
+ "MassiveIntentClassification_test": 0.55,
390
+ "MassiveScenarioClassification_test": 0.5947,
391
+ "SIB200Classification_test": 0.6667,
392
+ "SIB200ClusteringS2S_test": 0.3108,
393
+ "ArmenianParaphrasePC_test": 0.9357,
394
+ "BelebeleRetrieval_test": 0.8306
395
+ },
396
+ "sts_spearman": 0.6656,
397
+ "sts_detailed": {
398
+ "Pearson_correlation": 0.656,
399
+ "Spearman_correlation": 0.6656
400
+ },
401
+ "retrieval_top20": 0.8533,
402
+ "retrieval_detailed": {
403
+ "top1 within document": 0.56,
404
+ "top3 within document": 0.79,
405
+ "top5 within document": 0.88,
406
+ "top20 group mean macro": 0.9381,
407
+ "top20 all": 0.8533
408
+ },
409
+ "msmarco_top10": 0.7941,
410
+ "msmarco_detailed": {
411
+ "reranking_mrr": 0.562,
412
+ "retrieval_mrr": 0.4654,
413
+ "retrieval_top5_accuracy": 0.6816,
414
+ "retrieval_top10_accuracy": 0.7941
415
+ },
416
+ "retrieval_translit_top20": 0.1685,
417
+ "retrieval_translit_detailed": {
418
+ "top1 within document": 0.07,
419
+ "top3 within document": 0.28,
420
+ "top5 within document": 0.45,
421
+ "top20 group mean macro": 0.4663,
422
+ "top20 all": 0.1685
423
+ },
424
+ "msmarco_translit_top10": 0.1642,
425
+ "msmarco_translit_detailed": {
426
+ "reranking_mrr": 0.3896,
427
+ "retrieval_mrr": 0.0859,
428
+ "retrieval_top5_accuracy": 0.1316,
429
+ "retrieval_top10_accuracy": 0.1642
430
+ }
431
+ },
432
+ "model_size": "305M"
433
+ },
434
+ {
435
+ "model_name": "Snowflake/snowflake-arctic-embed-l-v2.0",
436
+ "model_url": "https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0",
437
+ "results": {
438
+ "mteb_avg": 0.686,
439
+ "mteb_detailed": {
440
+ "FloresBitextMining_devtest": 0.1367,
441
+ "NTREXBitextMining_test": 0.9489,
442
+ "Tatoeba_test": 0.8401,
443
+ "MassiveIntentClassification_test": 0.6301,
444
+ "MassiveScenarioClassification_test": 0.6703,
445
+ "SIB200Classification_test": 0.7348,
446
+ "SIB200ClusteringS2S_test": 0.3526,
447
+ "ArmenianParaphrasePC_test": 0.9586,
448
+ "BelebeleRetrieval_test": 0.9019
449
+ },
450
+ "sts_spearman": 0.702,
451
+ "sts_detailed": {
452
+ "Pearson_correlation": 0.6835,
453
+ "Spearman_correlation": 0.702
454
+ },
455
+ "retrieval_top20": 0.9239,
456
+ "retrieval_detailed": {
457
+ "top1 within document": 0.61,
458
+ "top3 within document": 0.89,
459
+ "top5 within document": 0.93,
460
+ "top20 group mean macro": 0.9647,
461
+ "top20 all": 0.9239
462
+ },
463
+ "msmarco_top10": 0.8851,
464
+ "msmarco_detailed": {
465
+ "reranking_mrr": 0.6179,
466
+ "retrieval_mrr": 0.5533,
467
+ "retrieval_top5_accuracy": 0.7888,
468
+ "retrieval_top10_accuracy": 0.8851
469
+ },
470
+ "retrieval_translit_top20": 0.2446,
471
+ "retrieval_translit_detailed": {
472
+ "top1 within document": 0.18,
473
+ "top3 within document": 0.33,
474
+ "top5 within document": 0.56,
475
+ "top20 group mean macro": 0.5177,
476
+ "top20 all": 0.2446
477
+ },
478
+ "msmarco_translit_top10": 0.2405,
479
+ "msmarco_translit_detailed": {
480
+ "reranking_mrr": 0.4078,
481
+ "retrieval_mrr": 0.1246,
482
+ "retrieval_top5_accuracy": 0.1878,
483
+ "retrieval_top10_accuracy": 0.2405
484
+ }
485
+ },
486
+ "model_size": "568M"
487
+ },
488
+ {
489
+ "model_name": "intfloat/multilingual-e5-base",
490
+ "model_url": "https://huggingface.co/intfloat/multilingual-e5-base",
491
+ "results": {
492
+ "mteb_avg": 0.6392,
493
+ "mteb_detailed": {
494
+ "FloresBitextMining_devtest": 0.1184,
495
+ "NTREXBitextMining_test": 0.9548,
496
+ "Tatoeba_test": 0.9131,
497
+ "MassiveIntentClassification_test": 0.5407,
498
+ "MassiveScenarioClassification_test": 0.5835,
499
+ "SIB200Classification_test": 0.6652,
500
+ "SIB200ClusteringS2S_test": 0.3035,
501
+ "ArmenianParaphrasePC_test": 0.9424,
502
+ "BelebeleRetrieval_test": 0.731
503
+ },
504
+ "sts_spearman": 0.6726,
505
+ "sts_detailed": {
506
+ "Pearson_correlation": 0.6661,
507
+ "Spearman_correlation": 0.6726
508
+ },
509
+ "retrieval_top20": 0.7446,
510
+ "retrieval_detailed": {
511
+ "top1 within document": 0.48,
512
+ "top3 within document": 0.68,
513
+ "top5 within document": 0.77,
514
+ "top20 group mean macro": 0.8643,
515
+ "top20 all": 0.7446
516
+ },
517
+ "msmarco_top10": 0.606,
518
+ "msmarco_detailed": {
519
+ "reranking_mrr": 0.5435,
520
+ "retrieval_mrr": 0.3474,
521
+ "retrieval_top5_accuracy": 0.5078,
522
+ "retrieval_top10_accuracy": 0.606
523
+ },
524
+ "retrieval_translit_top20": 0.087,
525
+ "retrieval_translit_detailed": {
526
+ "top1 within document": 0.02,
527
+ "top3 within document": 0.19,
528
+ "top5 within document": 0.34,
529
+ "top20 group mean macro": 0.2976,
530
+ "top20 all": 0.087
531
+ },
532
+ "msmarco_translit_top10": 0.0885,
533
+ "msmarco_translit_detailed": {
534
+ "reranking_mrr": 0.3493,
535
+ "retrieval_mrr": 0.0434,
536
+ "retrieval_top5_accuracy": 0.0658,
537
+ "retrieval_top10_accuracy": 0.0885
538
+ }
539
+ },
540
+ "model_size": "278M"
541
+ },
542
+ {
543
+ "model_name": "google/embeddinggemma-300m",
544
+ "model_url": "https://huggingface.co/google/embeddinggemma-300m",
545
+ "results": {
546
+ "mteb_avg": 0.2529,
547
+ "mteb_detailed": {
548
+ "FloresBitextMining_devtest": 0.0665,
549
+ "NTREXBitextMining_test": 0.2256,
550
+ "Tatoeba_test": 0.0727,
551
+ "MassiveIntentClassification_test": 0.2161,
552
+ "MassiveScenarioClassification_test": 0.2879,
553
+ "SIB200Classification_test": 0.3127,
554
+ "SIB200ClusteringS2S_test": 0.0492,
555
+ "ArmenianParaphrasePC_test": 0.9126,
556
+ "BelebeleRetrieval_test": 0.1329
557
+ },
558
+ "sts_spearman": 0.461,
559
+ "sts_detailed": {
560
+ "Pearson_correlation": 0.4555,
561
+ "Spearman_correlation": 0.461
562
+ },
563
+ "retrieval_top20": 0.0326,
564
+ "retrieval_detailed": {
565
+ "top1 within document": 0.07,
566
+ "top3 within document": 0.21,
567
+ "top5 within document": 0.39,
568
+ "top20 group mean macro": 0.1787,
569
+ "top20 all": 0.0326
570
+ },
571
+ "msmarco_top10": 0.0303,
572
+ "msmarco_detailed": {
573
+ "reranking_mrr": 0.3294,
574
+ "retrieval_mrr": 0.0164,
575
+ "retrieval_top5_accuracy": 0.0223,
576
+ "retrieval_top10_accuracy": 0.0303
577
+ },
578
+ "retrieval_translit_top20": 0.0,
579
+ "retrieval_translit_detailed": {
580
+ "top1 within document": 0.01,
581
+ "top3 within document": 0.1,
582
+ "top5 within document": 0.19,
583
+ "top20 group mean macro": 0.0298,
584
+ "top20 all": 0.0
585
+ },
586
+ "msmarco_translit_top10": 0.0051,
587
+ "msmarco_translit_detailed": {
588
+ "reranking_mrr": 0.2847,
589
+ "retrieval_mrr": 0.0029,
590
+ "retrieval_top5_accuracy": 0.0038,
591
+ "retrieval_top10_accuracy": 0.0051
592
+ }
593
+ },
594
+ "model_size": "303M"
595
+ },
596
+ {
597
+ "model_name": "Metric-AI/armenian-text-embeddings-2-large",
598
+ "model_url": "https://huggingface.co/Metric-AI/armenian-text-embeddings-2-large",
599
+ "results": {
600
+ "mteb_avg": 0.7311,
601
+ "mteb_detailed": {
602
+ "FloresBitextMining_devtest": 0.2859,
603
+ "NTREXBitextMining_test": 0.9758,
604
+ "Tatoeba_test": 0.9299,
605
+ "MassiveIntentClassification_test": 0.6314,
606
+ "MassiveScenarioClassification_test": 0.6852,
607
+ "SIB200Classification_test": 0.7706,
608
+ "SIB200ClusteringS2S_test": 0.4315,
609
+ "ArmenianParaphrasePC_test": 0.9605,
610
+ "BelebeleRetrieval_test": 0.9088
611
+ },
612
+ "sts_spearman": 0.7472,
613
+ "sts_detailed": {
614
+ "Pearson_correlation": 0.7401,
615
+ "Spearman_correlation": 0.7472
616
+ },
617
+ "retrieval_top20": 0.8804,
618
+ "retrieval_detailed": {
619
+ "top1 within document": 0.5,
620
+ "top3 within document": 0.83,
621
+ "top5 within document": 0.93,
622
+ "top20 group mean macro": 0.9592,
623
+ "top20 all": 0.8804
624
+ },
625
+ "msmarco_top10": 0.8627,
626
+ "msmarco_detailed": {
627
+ "reranking_mrr": 0.563,
628
+ "retrieval_mrr": 0.4961,
629
+ "retrieval_top5_accuracy": 0.741,
630
+ "retrieval_top10_accuracy": 0.8627
631
+ },
632
+ "retrieval_translit_top20": 0.462,
633
+ "retrieval_translit_detailed": {
634
+ "top1 within document": 0.21,
635
+ "top3 within document": 0.54,
636
+ "top5 within document": 0.69,
637
+ "top20 group mean macro": 0.6623,
638
+ "top20 all": 0.462
639
+ },
640
+ "msmarco_translit_top10": 0.4609,
641
+ "msmarco_translit_detailed": {
642
+ "reranking_mrr": 0.4607,
643
+ "retrieval_mrr": 0.2335,
644
+ "retrieval_top5_accuracy": 0.3606,
645
+ "retrieval_top10_accuracy": 0.4609
646
+ }
647
+ },
648
+ "model_size": "560M"
649
+ },
650
+ {
651
+ "model_name": "gemini/gemini-embedding-001",
652
+ "model_url": "https://ai.google.dev/gemini-api/docs/embeddings",
653
+ "results": {
654
+ "mteb_avg": 0.8204,
655
+ "mteb_detailed": {
656
+ "FloresBitextMining_devtest": 0.7182,
657
+ "NTREXBitextMining_test": 0.9634,
658
+ "Tatoeba_test": 0.9043,
659
+ "MassiveIntentClassification_test": 0.7889,
660
+ "MassiveScenarioClassification_test": 0.8452,
661
+ "SIB200Classification_test": 0.7353,
662
+ "SIB200ClusteringS2S_test": 0.5165,
663
+ "ArmenianParaphrasePC_test": 0.9681,
664
+ "BelebeleRetrieval_test": 0.9434
665
+ },
666
+ "sts_spearman": 0.7455,
667
+ "sts_detailed": {
668
+ "Pearson_correlation": 0.7124,
669
+ "Spearman_correlation": 0.7455
670
+ },
671
+ "retrieval_top20": 0.663,
672
+ "retrieval_detailed": {
673
+ "top1 within document": 0.36,
674
+ "top3 within document": 0.54,
675
+ "top5 within document": 0.63,
676
+ "top20 group mean macro": 0.7533,
677
+ "top20 all": 0.663
678
+ },
679
+ "msmarco_top10": 0.8662,
680
+ "msmarco_detailed": {
681
+ "reranking_mrr": 0.5529,
682
+ "retrieval_mrr": 0.4815,
683
+ "retrieval_top5_accuracy": 0.7384,
684
+ "retrieval_top10_accuracy": 0.8662
685
+ },
686
+ "retrieval_translit_top20": 0.3315,
687
+ "retrieval_translit_detailed": {
688
+ "top1 within document": 0.2,
689
+ "top3 within document": 0.41,
690
+ "top5 within document": 0.54,
691
+ "top20 group mean macro": 0.5542,
692
+ "top20 all": 0.3315
693
+ },
694
+ "msmarco_translit_top10": 0.4139,
695
+ "msmarco_translit_detailed": {
696
+ "reranking_mrr": 0.4335,
697
+ "retrieval_mrr": 0.2017,
698
+ "retrieval_top5_accuracy": 0.3204,
699
+ "retrieval_top10_accuracy": 0.4139
700
+ }
701
+ }
702
+ },
703
+ {
704
+ "model_name": "openai/text-embedding-3-large",
705
+ "model_url": "https://developers.openai.com/api/docs/models/text-embedding-3-large",
706
+ "results": {
707
+ "mteb_avg": 0.2768,
708
+ "mteb_detailed": {
709
+ "FloresBitextMining_devtest": 0.1187,
710
+ "NTREXBitextMining_test": 0.137,
711
+ "Tatoeba_test": 0.0435,
712
+ "MassiveIntentClassification_test": 0.3318,
713
+ "MassiveScenarioClassification_test": 0.3813,
714
+ "SIB200Classification_test": 0.2908,
715
+ "SIB200ClusteringS2S_test": 0.066,
716
+ "ArmenianParaphrasePC_test": 0.9121,
717
+ "BelebeleRetrieval_test": 0.2104
718
+ },
719
+ "sts_spearman": 0.5106,
720
+ "sts_detailed": {
721
+ "Pearson_correlation": 0.5171,
722
+ "Spearman_correlation": 0.5106
723
+ },
724
+ "retrieval_top20": 0.1467,
725
+ "retrieval_detailed": {
726
+ "top1 within document": 0.13,
727
+ "top3 within document": 0.32,
728
+ "top5 within document": 0.45,
729
+ "top20 group mean macro": 0.3745,
730
+ "top20 all": 0.1467
731
+ },
732
+ "msmarco_top10": 0.2518,
733
+ "msmarco_detailed": {
734
+ "reranking_mrr": 0.3848,
735
+ "retrieval_mrr": 0.1223,
736
+ "retrieval_top5_accuracy": 0.1889,
737
+ "retrieval_top10_accuracy": 0.2518
738
+ },
739
+ "retrieval_translit_top20": 0.0435,
740
+ "retrieval_translit_detailed": {
741
+ "top1 within document": 0.06,
742
+ "top3 within document": 0.15,
743
+ "top5 within document": 0.25,
744
+ "top20 group mean macro": 0.2355,
745
+ "top20 all": 0.0435
746
+ },
747
+ "msmarco_translit_top10": 0.1328,
748
+ "msmarco_translit_detailed": {
749
+ "reranking_mrr": 0.343,
750
+ "retrieval_mrr": 0.0592,
751
+ "retrieval_top5_accuracy": 0.0959,
752
+ "retrieval_top10_accuracy": 0.1328
753
+ }
754
+ }
755
+ }
756
+ ]
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ gradio==5.19.0
+ pandas==2.2.3
+ huggingface-hub==0.28.1