prateek-jain commited on
Commit
7da0047
·
1 Parent(s): 942e13d

Push Leaderboard

Browse files
README.md CHANGED
@@ -1,13 +1,19 @@
1
  ---
2
- title: Search
3
- emoji: 🐢
4
- colorFrom: green
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 6.3.0
 
8
  app_file: app.py
9
- pinned: false
10
- short_description: DevRev Search Evaluation Leaderboard
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
1
  ---
2
+ title: DevRev Search Evaluation Leaderboard
3
+ emoji: 🏆
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 4.44.1
8
+ python_version: "3.11"
9
  app_file: app.py
10
+ pinned: true
 
11
  ---
12
 
13
+ # 🏆 DevRev Search Evaluation Leaderboard
14
+
15
+ Interactive leaderboard for benchmarking search and retrieval systems on enterprise knowledge bases.
16
+
17
+ ## Features
18
+ - Search performance metrics (Recall@K, Precision@K)
19
+ - Interactive filtering and comparison
app.py ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DevRev Search Evaluation Leaderboard
3
+
4
+ An interactive leaderboard for benchmarking search and retrieval systems
5
+ on enterprise knowledge bases. Built with Gradio and ready for Hugging Face Spaces.
6
+
7
+ Uses MTEB-style standardized JSON format for evaluation results.
8
+ """
9
+
10
+ import base64
11
+ import io
12
+ import json
13
+ import os
14
+ from datetime import datetime
15
+ from pathlib import Path
16
+
17
+ import gradio as gr
18
+ import matplotlib.pyplot as plt
19
+ import pandas as pd
20
+ from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
21
+
22
+
23
def load_results_from_json():
    """Load evaluation results from standardized JSON files.

    Searches candidate directories ("results", "leaderboard/results", ".")
    for ``*.json`` files, skips the schema file, and keeps only payloads
    that look like valid evaluation results (have "model_name" and
    "metrics" keys).

    Returns:
        list[dict]: parsed result payloads; empty list when nothing found.
    """
    results = []

    # First candidate that exists and contains at least one JSON file wins;
    # "." is the last-resort fallback for flat repository layouts.
    results_dir = None
    for dir_path in ("results", "leaderboard/results", "."):
        candidate = Path(dir_path)
        if candidate.is_dir() and any(candidate.glob("*.json")):
            results_dir = candidate
            break

    if results_dir is None:
        print(
            "No results directory found. Please create a 'results' directory with JSON files."
        )
        return []

    # Load all JSON files from the chosen results directory
    for json_file in results_dir.glob("*.json"):
        # The schema file describes the format; it is not a result itself.
        if json_file.name == "RESULT_SCHEMA.json":
            continue

        try:
            # Explicit UTF-8: result files may contain non-ASCII text, and
            # the platform default encoding is not guaranteed to be UTF-8.
            data = json.loads(json_file.read_text(encoding="utf-8"))
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and bad unicode input.
            print(f"Error loading {json_file}: {e}")
            continue

        # Only include dicts that look like valid evaluation results; a JSON
        # file with a non-dict top level (list, number, ...) is ignored.
        if isinstance(data, dict) and "model_name" in data and "metrics" in data:
            results.append(data)
            print(f"Loaded: {json_file.name}")

    return results
61
+
62
+
63
def create_leaderboard_data():
    """Build the ranked leaderboard DataFrame from the JSON result files.

    Loads every result via ``load_results_from_json``, formats the paper
    references as markdown, ranks systems by Recall@10 (ties broken by
    Precision@10), and returns the columns in display order.

    Returns:
        pd.DataFrame: ranked leaderboard table; empty when no results exist.
    """
    results = load_results_from_json()

    if not results:
        print(
            "No evaluation results found. Please add JSON files to the 'results' directory."
        )
        return pd.DataFrame()  # nothing to show

    def _format_references(raw):
        # Multiple references are ";"-separated; bare URLs become markdown links.
        if not raw or raw == "N/A":
            return "N/A"
        formatted = [
            f"[{ref}]({ref})" if ref.startswith("http") else ref
            for ref in (part.strip() for part in raw.split(";"))
        ]
        return " | ".join(formatted)

    rows = []
    for entry in results:
        scores = entry.get("metrics", {})
        rows.append(
            {
                "🏆 Rank": 0,  # placeholder; filled in after sorting
                "🔧 Method": entry.get("model_name", "Unknown"),
                "📝 Paper/Details": _format_references(entry.get("paper", "N/A")),
                "🏷️ Type": entry.get("model_type", "Unknown"),
                "📈 Recall@5": scores.get("recall@5", 0),
                "📈 Recall@10": scores.get("recall@10", 0),
                "📈 Recall@25": scores.get("recall@25", 0),
                "📈 Recall@50": scores.get("recall@50", 0),
                "📉 Precision@5": scores.get("precision@5", 0),
                "📉 Precision@10": scores.get("precision@10", 0),
                "📉 Precision@25": scores.get("precision@25", 0),
                "📉 Precision@50": scores.get("precision@50", 0),
                "🚀 Open Source": "✅" if entry.get("open_source", False) else "❌",
                "📅 Date": entry.get("evaluation_date", "N/A"),
            }
        )

    table = pd.DataFrame(rows)

    # Rank by Recall@10 first, breaking ties with Precision@10.
    table = table.sort_values(["📈 Recall@10", "📉 Precision@10"], ascending=False)
    table["🏆 Rank"] = range(1, len(table) + 1)

    # Fixed display order for the columns.
    display_order = [
        "🏆 Rank",
        "🔧 Method",
        "📝 Paper/Details",
        "🏷️ Type",
        "📈 Recall@5",
        "📈 Recall@10",
        "📈 Recall@25",
        "📈 Recall@50",
        "📉 Precision@5",
        "📉 Precision@10",
        "📉 Precision@25",
        "📉 Precision@50",
        "🚀 Open Source",
        "📅 Date",
    ]
    return table[display_order]
144
+
145
+
146
def create_comparison_plot():
    """Render Recall@50 / Precision@50 bar charts as an inline HTML image.

    Returns:
        str: an ``<img>`` tag with a base64-encoded PNG, or a placeholder
        paragraph when no leaderboard data is available.
    """
    board = create_leaderboard_data()

    if board.empty:
        return "<p style='text-align: center; color: #666;'>No data available for visualization. Please add evaluation results to the 'results' directory.</p>"

    # Sort ascending so the best system ends up at the top of the barh chart.
    ordered = board.sort_values("📈 Recall@50", ascending=True)
    labels = ordered["🔧 Method"].tolist()
    # Highlight DevRev systems in red, everything else in teal.
    bar_colors = ["#ff6b6b" if "DevRev" in name else "#4ecdc4" for name in labels]

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    panels = (
        (axes[0], ordered["📈 Recall@50"].tolist(), "Recall@50"),
        (axes[1], ordered["📉 Precision@50"].tolist(), "Precision@50"),
    )
    for ax, values, metric in panels:
        ax.barh(labels, values, color=bar_colors, alpha=0.8)
        ax.set_xlabel(f"{metric} (%)", fontsize=12)
        ax.set_title(f"{metric} Comparison", fontsize=14, fontweight="bold")
        ax.grid(True, axis="x", alpha=0.3)
        # Annotate each bar with its numeric value.
        for pos, val in enumerate(values):
            ax.text(val + 0.5, pos, f"{val:.1f}%", va="center", fontsize=10)

    plt.tight_layout()

    # Serialize the figure to a base64 PNG so it can be inlined in HTML.
    buffer = io.BytesIO()
    plt.savefig(buffer, format="png", dpi=150, bbox_inches="tight")
    buffer.seek(0)
    encoded = base64.b64encode(buffer.read()).decode()
    plt.close()

    return f'<img src="data:image/png;base64,{encoded}" style="width: 100%; max-width: 1000px; margin: 20px auto; display: block;">'
200
+
201
+
202
def create_interface():
    """Create the Gradio interface with leaderboard and visualizations.

    Returns:
        gr.Blocks: the assembled (unlaunched) Gradio application.
    """

    # Client-side deep-linking: when the URL hash is "#about", switch to the
    # About tab and scroll to its content once Gradio has mounted it.
    deep_link_js = r"""
    () => {
      function openAboutAndScroll() {
        if (window.location.hash !== "#about") return;

        // Switch to the About tab (Gradio tabs are rendered as role="tab" buttons)
        const tabs = Array.from(document.querySelectorAll('button[role="tab"]'));
        const aboutTab = tabs.find((b) => (b.innerText || "").includes("About"));
        if (aboutTab) aboutTab.click();

        // The About content is mounted after tab switch; retry briefly.
        let attempts = 0;
        const timer = setInterval(() => {
          const el = document.getElementById("about");
          if (el) {
            el.scrollIntoView({ behavior: "smooth", block: "start" });
            clearInterval(timer);
          }
          attempts += 1;
          if (attempts > 25) clearInterval(timer);
        }, 200);
      }

      window.addEventListener("hashchange", openAboutAndScroll);
      openAboutAndScroll();
      setTimeout(openAboutAndScroll, 600);
    }
    """

    with gr.Blocks(
        title="DevRev Search Evaluation Leaderboard", js=deep_link_js
    ) as demo:
        # Header
        gr.HTML(
            """
            <div style="text-align: center; margin-bottom: 30px;">
                <h1 style="font-size: 3em; font-weight: bold; margin-bottom: 10px;">
                    🏆 DevRev Search Evaluation Leaderboard
                </h1>
                <p style="font-size: 1.2em; color: #666;">
                    Benchmarking Search and Retrieval Systems for Enterprise Knowledge Bases
                </p>
            </div>
            """
        )

        # Tabs
        with gr.Tabs():
            # Main Leaderboard Tab
            with gr.TabItem("🏆 Main Leaderboard"):
                gr.Markdown(
                    """
                    ### Evaluation Overview
                    This leaderboard displays metrics of search systems on the test queries present in [DevRev Search Dataset](https://huggingface.co/datasets/devrev/search).
                    All methods are evaluated on the same set of agent support queries with consistent evaluation protocols.

                    **Metrics**: Recall@K and Precision@K measure the effectiveness of retrieving relevant articles within the top K retrieved articles.

                    **Leaderboard ranking**: Sorted by **Recall@10** (primary) and **Precision@10** (secondary).

                    **To add your results**: Submission details are available in the [About](#about) section.
                    """
                )

                # Get leaderboard data
                df = create_leaderboard_data()

                if not df.empty:
                    # Configure which columns to display by default
                    default_columns = [
                        "🏆 Rank",
                        "🔧 Method",
                        "🏷️ Type",
                        "📈 Recall@10",
                        "📈 Recall@50",
                        "📉 Precision@10",
                        "📉 Precision@50",
                        "🚀 Open Source",
                    ]

                    # Define column filters
                    type_column = ColumnFilter("🏷️ Type", type="checkboxgroup")
                    open_source_column = ColumnFilter(
                        "🚀 Open Source", type="checkboxgroup"
                    )

                    # Create the interactive leaderboard; datatype order must
                    # match the column order produced by create_leaderboard_data.
                    Leaderboard(
                        value=df,
                        datatype=[
                            "number",
                            "markdown",
                            "markdown",
                            "str",
                            "number",
                            "number",
                            "number",
                            "number",
                            "number",
                            "number",
                            "number",
                            "number",
                            "str",
                            "str",
                        ],
                        select_columns=SelectColumns(
                            default_selection=default_columns,
                            cant_deselect=[
                                "🏆 Rank",
                                "🔧 Method",
                                "📈 Recall@10",
                            ],
                            label="Select Columns to Display",
                        ),
                        search_columns=[
                            "🔧 Method",
                            "📝 Paper/Details",
                            "🏷️ Type",
                        ],
                        hide_columns=["📅 Date"],
                        filter_columns=[type_column, open_source_column],
                        interactive=False,
                    )
                else:
                    gr.HTML(
                        """
                        <div style="text-align: center; padding: 50px; background: #f5f5f5; border-radius: 10px;">
                            <h3>No Results Found</h3>
                            <p>Please add JSON evaluation files to the 'results' directory.</p>
                            <p>See the About tab for the required format.</p>
                        </div>
                        """
                    )

            # About Tab
            with gr.TabItem("ℹ️ About"):
                gr.Markdown(
                    """
                    ## About This Leaderboard

                    This leaderboard tracks the performance of various search and retrieval systems on the [DevRev Search Dataset](https://huggingface.co/datasets/devrev/search).

                    ### 📊 Evaluation Metrics

                    - **Recall@K**: The percentage of relevant article chunks retrieved in the top K article chunks
                    - **Precision@K**: The percentage of retrieved article chunks that are relevant among the top K article chunks

                    ### 📤 How to Submit

                    1. Run your retrieval on the test queries in DevRev Search Dataset
                    2. Submit the results in same format as annotated_queries in the dataset through email to prateek.jain@devrev.ai
                    3. Also include a **one-line system detail/link**, the **system type**, and whether it is **open source**

                    ### 🔗 Resources

                    - [Computer by DevRev](https://devrev.ai/meet-computer)
                    - [DevRev Search Dataset](https://huggingface.co/datasets/devrev/search)

                    ### 🙏 Acknowledgments

                    Inspired by:
                    - [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard)
                    - [Berkeley Function Calling Leaderboard](https://gorilla.cs.berkeley.edu/leaderboard)

                    ### 📚 Citation

                    ```bibtex
                    @misc{devrev_search_leaderboard_2026,
                        title={DevRev Search Leaderboard},
                        author={Research@DevRev},
                        year={2026},
                        url={https://huggingface.co/spaces/devrev/search}
                    }
                    ```
                    """,
                    elem_id="about",
                )

        # Footer. BUGFIX: the label says "UTC", so the timestamp must be
        # taken in UTC — datetime.now() alone returns server-local time.
        gr.HTML(
            f"""
            <div style="text-align: center; margin-top: 50px; padding: 20px; border-top: 1px solid #e0e0e0; color: #666;">
                <p>
                    Last updated: {datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")}
                </p>
            </div>
            """
        )

    return demo
395
+
396
+
397
# Create and launch the app when executed as a script.
if __name__ == "__main__":
    create_interface().launch(
        server_name="0.0.0.0", server_port=7860, share=True, show_api=False
    )
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio==4.44.1
2
+ gradio-leaderboard==0.0.11
3
+ pandas==2.3.3
4
+ numpy==2.4.1
5
+ matplotlib==3.9.2
6
+ huggingface-hub==0.24.7
results/bm25.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "BM25",
3
+ "model_type": "Lexical",
4
+ "organization": "Open Source",
5
+ "description": "Classic lexical search algorithm based on term frequency and inverse document frequency",
6
+ "paper": "Robertson et al., 1994",
7
+ "code": "https://github.com/elastic/elasticsearch",
8
+ "open_source": true,
9
+ "api_available": false,
10
+ "evaluation_date": "2026-01-18",
11
+ "metrics": {
12
+ "recall@5": 9.37,
13
+ "recall@10": 14.77,
14
+ "recall@25": 23.84,
15
+ "recall@50": 30.70,
16
+ "precision@5": 11.96,
17
+ "precision@10": 10.43,
18
+ "precision@25": 7.39,
19
+ "precision@50": 5.50
20
+ }
21
+ }
results/cohere_v3_embeddings.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "cohere.embed-english-v3",
3
+ "model_type": "Dense",
4
+ "organization": "Cohere",
5
+ "description": "Cohere's embedding model for English",
6
+ "paper": "https://docs.cohere.com/docs/cohere-embed",
7
+ "open_source": false,
8
+ "api_available": true,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 11.32,
12
+ "recall@10": 20.14,
13
+ "recall@25": 30.26,
14
+ "recall@50": 39.76,
15
+ "precision@5": 18.91,
16
+ "precision@10": 18.04,
17
+ "precision@25": 14.04,
18
+ "precision@50": 11.46
19
+ }
20
+ }
results/cohere_v4_embeddings.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "cohere.embed-v4:0",
3
+ "model_type": "Dense",
4
+ "organization": "Cohere",
5
+ "description": "Cohere's cohere.embed-v4:0 embedding model",
6
+ "paper": "https://docs.cohere.com/docs/cohere-embed",
7
+ "open_source": false,
8
+ "api_available": true,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 17.71,
12
+ "recall@10": 23.21,
13
+ "recall@25": 37.00,
14
+ "recall@50": 44.74,
15
+ "precision@5": 24.78,
16
+ "precision@10": 21.85,
17
+ "precision@25": 16.56,
18
+ "precision@50": 12.39
19
+ }
20
+ }
results/gemini_bm25_reranker.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "gemini-embedding-001-bm25-zerank-1-small",
3
+ "model_type": "Hybrid",
4
+ "organization": "NA",
5
+ "description": "Hybrid search system combining Google's gemini-embedding-001 embedding model with BM25 using Reranker",
6
+ "paper": "https://ai.google.dev/gemini-api/docs/embeddings; Robertson et al., 1994; https://huggingface.co/zeroentropy/zerank-1-small",
7
+ "open_source": false,
8
+ "api_available": true,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 29.11,
12
+ "recall@10": 36.50,
13
+ "recall@25": 52.09,
14
+ "recall@50": 60.00,
15
+ "precision@5": 35.65,
16
+ "precision@10": 26.85,
17
+ "precision@25": 19.00,
18
+ "precision@50": 13.56
19
+ }
20
+ }
results/gemini_bm25_rrf.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "gemini-embedding-001-bm25-rrf",
3
+ "model_type": "Hybrid",
4
+ "organization": "NA",
5
+ "description": "Hybrid search system combining Google's gemini-embedding-001 embedding model with BM25 using RRF",
6
+ "paper": "https://ai.google.dev/gemini-api/docs/embeddings; Robertson et al., 1994; Cormack et al., 2009",
7
+ "open_source": false,
8
+ "api_available": true,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 23.02,
12
+ "recall@10": 31.48,
13
+ "recall@25": 47.22,
14
+ "recall@50": 54.60,
15
+ "precision@5": 29.56,
16
+ "precision@10": 23.04,
17
+ "precision@25": 17.48,
18
+ "precision@50": 12.78
19
+ },
20
+ "metadata": {
21
+ "rrf": {
22
+ "semantic_retrievals": 50,
23
+ "bm25_retrievals": 50,
24
+ "semantic_weight": 0.9,
25
+ "bm25_weight": 0.1,
26
+ "k": 60
27
+ }
28
+ }
29
+ }
results/gemini_embeddings.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "gemini-embedding-001",
3
+ "model_type": "Dense",
4
+ "organization": "Google",
5
+ "description": "Google's latest text embedding model in Gemini series",
6
+ "paper": "https://ai.google.dev/gemini-api/docs/embeddings",
7
+ "open_source": false,
8
+ "api_available": true,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 23.08,
12
+ "recall@10": 31.04,
13
+ "recall@25": 46.73,
14
+ "recall@50": 54.60,
15
+ "precision@5": 29.56,
16
+ "precision@10": 23.26,
17
+ "precision@25": 17.22,
18
+ "precision@50": 12.78
19
+ }
20
+ }
results/gte_qwen2_bm25_reranker.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GTE-Qwen2-7B-instruct-bm25-zerank-1-small",
3
+ "model_type": "Hybrid",
4
+ "organization": "NA",
5
+ "description": "Hybrid search system combining GTE-Qwen2-7B-instruct embedding model with BM25 using Reranker",
6
+ "paper": "https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct; Robertson et al., 1994; https://huggingface.co/zeroentropy/zerank-1-small",
7
+ "open_source": true,
8
+ "api_available": false,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 28.07,
12
+ "recall@10": 35.08,
13
+ "recall@25": 48.19,
14
+ "recall@50": 57.55,
15
+ "precision@5": 34.56,
16
+ "precision@10": 26.85,
17
+ "precision@25": 19.91,
18
+ "precision@50": 14.76
19
+ }
20
+ }
results/gte_qwen2_bm25_rrf.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GTE-Qwen2-7B-instruct-bm25-rrf",
3
+ "model_type": "Hybrid",
4
+ "organization": "NA",
5
+ "description": "Hybrid search system combining GTE-Qwen2-7B-instruct embedding model with BM25 using RRF",
6
+ "paper": "https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct; Robertson et al., 1994; Cormack et al., 2009",
7
+ "open_source": true,
8
+ "api_available": false,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 16.44,
12
+ "recall@10": 26.14,
13
+ "recall@25": 39.39,
14
+ "recall@50": 52.55,
15
+ "precision@5": 26.30,
16
+ "precision@10": 22.5,
17
+ "precision@25": 16.91,
18
+ "precision@50": 14.20
19
+ },
20
+ "metadata": {
21
+ "rrf": {
22
+ "semantic_retrievals": 50,
23
+ "bm25_retrievals": 50,
24
+ "semantic_weight": 0.9,
25
+ "bm25_weight": 0.1,
26
+ "k": 60
27
+ }
28
+ }
29
+ }
results/gte_qwen2_embeddings.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GTE-Qwen2-7B-instruct",
3
+ "model_type": "Dense",
4
+ "organization": "Alibaba",
5
+ "description": "Alibaba's GTE-Qwen2 embedding model",
6
+ "paper": "https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct",
7
+ "open_source": true,
8
+ "api_available": false,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 15.62,
12
+ "recall@10": 24.46,
13
+ "recall@25": 39.84,
14
+ "recall@50": 52.55,
15
+ "precision@5": 25.22,
16
+ "precision@10": 21.85,
17
+ "precision@25": 16.96,
18
+ "precision@50": 14.20
19
+ }
20
+ }
results/openai_embeddings.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "text-embedding-3-large",
3
+ "model_type": "Dense",
4
+ "organization": "OpenAI",
5
+ "description": "OpenAI's latest text embedding model",
6
+ "paper": "https://openai.com/index/new-embedding-models-and-api-updates/",
7
+ "open_source": false,
8
+ "api_available": true,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 16.06,
12
+ "recall@10": 24.03,
13
+ "recall@25": 35.59,
14
+ "recall@50": 45.10,
15
+ "precision@5": 24.78,
16
+ "precision@10": 20.65,
17
+ "precision@25": 16.74,
18
+ "precision@50": 13.13
19
+ }
20
+ }
results/qwen3_bm25_reranker.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Qwen3-Embedding-8B-bm25-zerank-1-small",
3
+ "model_type": "Hybrid",
4
+ "organization": "NA",
5
+ "description": "Hybrid search system combining Alibaba's Qwen3 embedding model with BM25 using Reranker",
6
+ "paper": "https://huggingface.co/Qwen/Qwen3-Embedding-8B; Robertson et al., 1994; https://huggingface.co/zeroentropy/zerank-1-small",
7
+ "open_source": true,
8
+ "api_available": false,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 27.57,
12
+ "recall@10": 36.09,
13
+ "recall@25": 46.41,
14
+ "recall@50": 51.32,
15
+ "precision@5": 34.56,
16
+ "precision@10": 26.63,
17
+ "precision@25": 17.04,
18
+ "precision@50": 11.63
19
+ }
20
+ }
results/qwen3_bm25_rrf.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Qwen3-Embedding-8B-bm25-rrf",
3
+ "model_type": "Hybrid",
4
+ "organization": "NA",
5
+ "description": "Hybrid search system combining Alibaba's Qwen3 embedding model with BM25 using RRF",
6
+ "paper": "https://huggingface.co/Qwen/Qwen3-Embedding-8B; Robertson et al., 1994; Cormack et al., 2009",
7
+ "open_source": true,
8
+ "api_available": false,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 15.92,
12
+ "recall@10": 24.22,
13
+ "recall@25": 34.08,
14
+ "recall@50": 43.13,
15
+ "precision@5": 22.61,
16
+ "precision@10": 18.37,
17
+ "precision@25": 13.35,
18
+ "precision@50": 11.17
19
+ },
20
+ "metadata": {
21
+ "rrf": {
22
+ "semantic_retrievals": 50,
23
+ "bm25_retrievals": 50,
24
+ "semantic_weight": 0.9,
25
+ "bm25_weight": 0.1,
26
+ "k": 60
27
+ }
28
+ }
29
+ }
results/qwen3_embeddings.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Qwen3-Embedding-8B",
3
+ "model_type": "Dense",
4
+ "organization": "Alibaba",
5
+ "description": "Alibaba's Qwen3 embedding model",
6
+ "paper": "https://huggingface.co/Qwen/Qwen3-Embedding-8B",
7
+ "open_source": true,
8
+ "api_available": false,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 16.42,
12
+ "recall@10": 26.67,
13
+ "recall@25": 33.82,
14
+ "recall@50": 43.13,
15
+ "precision@5": 23.26,
16
+ "precision@10": 18.70,
17
+ "precision@25": 13.48,
18
+ "precision@50": 11.17
19
+ }
20
+ }
results/sfr_embeddings.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "SFR-Embedding-Mistral",
3
+ "model_type": "Dense",
4
+ "organization": "Salesforce",
5
+ "description": "Salesforce's SFR embedding model",
6
+ "paper": "https://huggingface.co/Salesforce/SFR-Embedding-Mistral",
7
+ "open_source": true,
8
+ "api_available": false,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 17.02,
12
+ "recall@10": 26.61,
13
+ "recall@25": 39.82,
14
+ "recall@50": 51.32,
15
+ "precision@5": 23.91,
16
+ "precision@10": 21.30,
17
+ "precision@25": 15.26,
18
+ "precision@50": 11.80
19
+ }
20
+ }
results/snowflake_bm25_reranker.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "snowflake-arctic-embed-l-v2.0-bm25-zerank-1-small",
3
+ "model_type": "Hybrid",
4
+ "organization": "NA",
5
+ "description": "Hybrid search system combining Snowflake's snowflake-arctic-embed-l-v2.0 embedding model with BM25 using Reranker",
6
+ "paper": "https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0; Robertson et al., 1994; https://huggingface.co/zeroentropy/zerank-1-small",
7
+ "open_source": true,
8
+ "api_available": false,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 27.57,
12
+ "recall@10": 36.09,
13
+ "recall@25": 46.41,
14
+ "recall@50": 51.32,
15
+ "precision@5": 34.56,
16
+ "precision@10": 26.63,
17
+ "precision@25": 17.04,
18
+ "precision@50": 11.63
19
+ }
20
+ }
results/snowflake_bm25_rrf.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "snowflake-arctic-embed-l-v2.0-bm25-rrf",
3
+ "model_type": "Hybrid",
4
+ "organization": "NA",
5
+ "description": "Hybrid search system combining Snowflake's snowflake-arctic-embed-l-v2.0 embedding model with BM25 using RRF",
6
+ "paper": "https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0; Robertson et al., 1994; Cormack et al., 2009",
7
+ "open_source": true,
8
+ "api_available": false,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 19.56,
12
+ "recall@10": 25.22,
13
+ "recall@25": 34.34,
14
+ "recall@50": 40.55,
15
+ "precision@5": 23.70,
16
+ "precision@10": 18.91,
17
+ "precision@25": 13.43,
18
+ "precision@50": 9.91
19
+ },
20
+ "metadata": {
21
+ "rrf": {
22
+ "semantic_retrievals": 50,
23
+ "bm25_retrievals": 50,
24
+ "semantic_weight": 0.9,
25
+ "bm25_weight": 0.1,
26
+ "k": 60
27
+ }
28
+ }
29
+ }
results/snowflake_embeddings.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "snowflake-arctic-embed-l-v2.0",
3
+ "model_type": "Dense",
4
+ "organization": "Snowflake",
5
+ "description": "Snowflake's snowflake-arctic-embed-l-v2.0 embedding model",
6
+ "paper": "https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0",
7
+ "open_source": true,
8
+ "api_available": false,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 18.34,
12
+ "recall@10": 25.76,
13
+ "recall@25": 34.16,
14
+ "recall@50": 40.55,
15
+ "precision@5": 23.26,
16
+ "precision@10": 19.67,
17
+ "precision@25": 13.30,
18
+ "precision@50": 9.91
19
+ }
20
+ }