Commit ·
7da0047
1
Parent(s): 942e13d
Push Leaderboard
Browse files- README.md +14 -8
- app.py +400 -0
- requirements.txt +6 -0
- results/bm25.json +21 -0
- results/cohere_v3_embeddings.json +20 -0
- results/cohere_v4_embeddings.json +20 -0
- results/gemini_bm25_reranker.json +20 -0
- results/gemini_bm25_rrf.json +29 -0
- results/gemini_embeddings.json +20 -0
- results/gte_qwen2_bm25_reranker.json +20 -0
- results/gte_qwen2_bm25_rrf.json +29 -0
- results/gte_qwen2_embeddings.json +20 -0
- results/openai_embeddings.json +20 -0
- results/qwen3_bm25_reranker.json +20 -0
- results/qwen3_bm25_rrf.json +29 -0
- results/qwen3_embeddings.json +20 -0
- results/sfr_embeddings.json +20 -0
- results/snowflake_bm25_reranker.json +20 -0
- results/snowflake_bm25_rrf.json +29 -0
- results/snowflake_embeddings.json +20 -0
README.md
CHANGED
|
@@ -1,13 +1,19 @@
|
|
| 1 |
---
|
| 2 |
-
title: Search
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
|
|
|
| 8 |
app_file: app.py
|
| 9 |
-
pinned:
|
| 10 |
-
short_description: DevRev Search Evaluation Leaderboard
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: DevRev Search Evaluation Leaderboard
|
| 3 |
+
emoji: 🏆
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.44.1
|
| 8 |
+
python_version: "3.11"
|
| 9 |
app_file: app.py
|
| 10 |
+
pinned: true
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# 🏆 DevRev Search Evaluation Leaderboard
|
| 14 |
+
|
| 15 |
+
Interactive leaderboard for benchmarking search and retrieval systems on enterprise knowledge bases.
|
| 16 |
+
|
| 17 |
+
## Features
|
| 18 |
+
- Search performance metrics (Recall@K, Precision@K)
|
| 19 |
+
- Interactive filtering and comparison
|
app.py
ADDED
|
@@ -0,0 +1,400 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
DevRev Search Evaluation Leaderboard
|
| 3 |
+
|
| 4 |
+
An interactive leaderboard for benchmarking search and retrieval systems
|
| 5 |
+
on enterprise knowledge bases. Built with Gradio and ready for Hugging Face Spaces.
|
| 6 |
+
|
| 7 |
+
Uses MTEB-style standardized JSON format for evaluation results.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import base64
|
| 11 |
+
import io
|
| 12 |
+
import json
|
| 13 |
+
import os
|
| 14 |
+
from datetime import datetime
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
|
| 17 |
+
import gradio as gr
|
| 18 |
+
import matplotlib.pyplot as plt
|
| 19 |
+
import pandas as pd
|
| 20 |
+
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def load_results_from_json():
    """Load evaluation results from standardized (MTEB-style) JSON files.

    Searches candidate directories in priority order and uses the first one
    that contains at least one ``*.json`` file. A file is accepted only when
    it carries both a ``model_name`` and a ``metrics`` key; the schema file
    and malformed files are skipped (the latter with a logged error).

    Returns:
        list[dict]: One parsed result dict per valid JSON file; empty list
        when no results directory can be found.
    """
    results = []

    # Candidate locations, most specific first; "." is a last-resort fallback.
    results_dirs = ["results", "leaderboard/results", "."]
    results_dir = None

    for dir_path in results_dirs:
        if os.path.exists(dir_path):
            temp_dir = Path(dir_path)
            if any(temp_dir.glob("*.json")):
                results_dir = temp_dir
                break

    if not results_dir:
        print(
            "No results directory found. Please create a 'results' directory with JSON files."
        )
        return []

    # Load all JSON files from the chosen results directory.
    for json_file in results_dir.glob("*.json"):
        # Skip the schema file — it documents the format, it is not a result.
        if json_file.name == "RESULT_SCHEMA.json":
            continue

        try:
            # Explicit UTF-8: JSON is UTF-8 by spec; relying on the platform
            # default codec can fail on non-ASCII content (e.g. on Windows).
            data = json.loads(json_file.read_text(encoding="utf-8"))
            # Only include valid evaluation results.
            if "model_name" in data and "metrics" in data:
                results.append(data)
                print(f"Loaded: {json_file.name}")
        except Exception as e:
            # Best-effort loader: report and continue with remaining files.
            print(f"Error loading {json_file}: {e}")

    return results
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def create_leaderboard_data():
    """Assemble the leaderboard DataFrame from the JSON result files.

    Rows are ranked by Recall@10 with Precision@10 as the tiebreaker (both
    descending). Returns an empty DataFrame when no results are available.
    """
    entries = load_results_from_json()

    if not entries:
        print(
            "No evaluation results found. Please add JSON files to the 'results' directory."
        )
        return pd.DataFrame()  # nothing to display

    rows = []
    for entry in entries:
        metrics = entry.get("metrics", {})

        # The "paper" field may hold several ';'-separated references; bare
        # URLs are rendered as markdown links, other citations as plain text.
        raw_refs = entry.get("paper", "N/A")
        if raw_refs and raw_refs != "N/A":
            rendered = []
            for piece in raw_refs.split(";"):
                ref = piece.strip()
                rendered.append(f"[{ref}]({ref})" if ref.startswith("http") else ref)
            paper_display = " | ".join(rendered)
        else:
            paper_display = "N/A"

        rows.append(
            {
                "🏆 Rank": 0,  # placeholder — assigned after sorting
                "🔧 Method": entry.get("model_name", "Unknown"),
                "📝 Paper/Details": paper_display,
                "🏷️ Type": entry.get("model_type", "Unknown"),
                "📈 Recall@5": metrics.get("recall@5", 0),
                "📈 Recall@10": metrics.get("recall@10", 0),
                "📈 Recall@25": metrics.get("recall@25", 0),
                "📈 Recall@50": metrics.get("recall@50", 0),
                "📉 Precision@5": metrics.get("precision@5", 0),
                "📉 Precision@10": metrics.get("precision@10", 0),
                "📉 Precision@25": metrics.get("precision@25", 0),
                "📉 Precision@50": metrics.get("precision@50", 0),
                "🚀 Open Source": "✅" if entry.get("open_source", False) else "❌",
                "📅 Date": entry.get("evaluation_date", "N/A"),
            }
        )

    board = pd.DataFrame(rows)

    # Rank: Recall@10 first, Precision@10 as tiebreaker, both descending.
    board = board.sort_values(["📈 Recall@10", "📉 Precision@10"], ascending=False)
    board["🏆 Rank"] = range(1, len(board) + 1)

    # Fix the display order of the columns.
    ordered = [
        "🏆 Rank",
        "🔧 Method",
        "📝 Paper/Details",
        "🏷️ Type",
        "📈 Recall@5",
        "📈 Recall@10",
        "📈 Recall@25",
        "📈 Recall@50",
        "📉 Precision@5",
        "📉 Precision@10",
        "📉 Precision@25",
        "📉 Precision@50",
        "🚀 Open Source",
        "📅 Date",
    ]
    return board[ordered]
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def create_comparison_plot():
    """Create performance comparison visualizations.

    Renders side-by-side horizontal bar charts of Recall@50 and Precision@50
    for every method on the leaderboard and returns an HTML ``<img>`` tag with
    the chart inlined as a base64 PNG (no static file hosting required).
    Returns a plain HTML notice when no evaluation results are available.
    """
    df = create_leaderboard_data()

    if df.empty:
        return "<p style='text-align: center; color: #666;'>No data available for visualization. Please add evaluation results to the 'results' directory.</p>"

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Sort by Recall@50 so both charts share a consistent ordering.
    df_sorted = df.sort_values("📈 Recall@50", ascending=True)

    methods = df_sorted["🔧 Method"].tolist()
    recall_50 = df_sorted["📈 Recall@50"].tolist()
    # Highlight DevRev systems in red, everything else in teal.
    colors = ["#ff6b6b" if "DevRev" in m else "#4ecdc4" for m in methods]

    # Recall@50 comparison
    ax1.barh(methods, recall_50, color=colors, alpha=0.8)
    ax1.set_xlabel("Recall@50 (%)", fontsize=12)
    ax1.set_title("Recall@50 Comparison", fontsize=14, fontweight="bold")
    ax1.grid(True, axis="x", alpha=0.3)

    # Value labels; the +0.5 nudges the text just past the bar end.
    # (Fixed: original unpacked an unused `method` from zip() in both loops.)
    for i, recall in enumerate(recall_50):
        ax1.text(recall + 0.5, i, f"{recall:.1f}%", va="center", fontsize=10)

    # Precision@50 comparison
    precision_50 = df_sorted["📉 Precision@50"].tolist()

    ax2.barh(methods, precision_50, color=colors, alpha=0.8)
    ax2.set_xlabel("Precision@50 (%)", fontsize=12)
    ax2.set_title("Precision@50 Comparison", fontsize=14, fontweight="bold")
    ax2.grid(True, axis="x", alpha=0.3)

    for i, precision in enumerate(precision_50):
        ax2.text(precision + 0.5, i, f"{precision:.1f}%", va="center", fontsize=10)

    plt.tight_layout()

    # Serialize the figure to base64 for embedding directly in HTML.
    buf = io.BytesIO()
    fig.savefig(buf, format="png", dpi=150, bbox_inches="tight")
    buf.seek(0)
    img_base64 = base64.b64encode(buf.read()).decode()
    # Close this specific figure (not just pyplot's "current" one) so repeated
    # calls in a long-running web app don't leak figure objects.
    plt.close(fig)

    return f'<img src="data:image/png;base64,{img_base64}" style="width: 100%; max-width: 1000px; margin: 20px auto; display: block;">'
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def create_interface():
    """Build the Gradio Blocks app: header, leaderboard tab, About tab, footer."""

    # Client-side hook: when the URL hash is "#about", switch to the About tab
    # and scroll its content into view (supports deep links like /#about).
    about_nav_js = r"""
    () => {
      function openAboutAndScroll() {
        if (window.location.hash !== "#about") return;

        // Switch to the About tab (Gradio tabs are rendered as role="tab" buttons)
        const tabs = Array.from(document.querySelectorAll('button[role="tab"]'));
        const aboutTab = tabs.find((b) => (b.innerText || "").includes("About"));
        if (aboutTab) aboutTab.click();

        // The About content is mounted after tab switch; retry briefly.
        let attempts = 0;
        const timer = setInterval(() => {
          const el = document.getElementById("about");
          if (el) {
            el.scrollIntoView({ behavior: "smooth", block: "start" });
            clearInterval(timer);
          }
          attempts += 1;
          if (attempts > 25) clearInterval(timer);
        }, 200);
      }

      window.addEventListener("hashchange", openAboutAndScroll);
      openAboutAndScroll();
      setTimeout(openAboutAndScroll, 600);
    }
    """

    with gr.Blocks(
        title="DevRev Search Evaluation Leaderboard", js=about_nav_js
    ) as demo:
        # Page header
        gr.HTML(
            """
            <div style="text-align: center; margin-bottom: 30px;">
                <h1 style="font-size: 3em; font-weight: bold; margin-bottom: 10px;">
                    🏆 DevRev Search Evaluation Leaderboard
                </h1>
                <p style="font-size: 1.2em; color: #666;">
                    Benchmarking Search and Retrieval Systems for Enterprise Knowledge Bases
                </p>
            </div>
            """
        )

        with gr.Tabs():
            # --- Main leaderboard tab -------------------------------------
            with gr.TabItem("🏆 Main Leaderboard"):
                gr.Markdown(
                    """
                    ### Evaluation Overview
                    This leaderboard displays metrics of search systems on the test queries present in [DevRev Search Dataset](https://huggingface.co/datasets/devrev/search).
                    All methods are evaluated on the same set of agent support queries with consistent evaluation protocols.

                    **Metrics**: Recall@K and Precision@K measure the effectiveness of retrieving relevant articles within the top K retrieved articles.

                    **Leaderboard ranking**: Sorted by **Recall@10** (primary) and **Precision@10** (secondary).

                    **To add your results**: Submission details are available in the [About](#about) section.
                    """
                )

                board = create_leaderboard_data()

                if board.empty:
                    gr.HTML(
                        """
                        <div style="text-align: center; padding: 50px; background: #f5f5f5; border-radius: 10px;">
                            <h3>No Results Found</h3>
                            <p>Please add JSON evaluation files to the 'results' directory.</p>
                            <p>See the About tab for the required format.</p>
                        </div>
                        """
                    )
                else:
                    # Columns shown on first load; the rest can be toggled on.
                    visible_by_default = [
                        "🏆 Rank",
                        "🔧 Method",
                        "🏷️ Type",
                        "📈 Recall@10",
                        "📈 Recall@50",
                        "📉 Precision@10",
                        "📉 Precision@50",
                        "🚀 Open Source",
                    ]

                    # Interactive leaderboard: rank/method/paper/type columns,
                    # then the 8 metric columns, then open-source flag + date.
                    Leaderboard(
                        value=board,
                        datatype=(
                            ["number", "markdown", "markdown", "str"]
                            + ["number"] * 8
                            + ["str", "str"]
                        ),
                        select_columns=SelectColumns(
                            default_selection=visible_by_default,
                            cant_deselect=["🏆 Rank", "🔧 Method", "📈 Recall@10"],
                            label="Select Columns to Display",
                        ),
                        search_columns=["🔧 Method", "📝 Paper/Details", "🏷️ Type"],
                        hide_columns=["📅 Date"],
                        filter_columns=[
                            ColumnFilter("🏷️ Type", type="checkboxgroup"),
                            ColumnFilter("🚀 Open Source", type="checkboxgroup"),
                        ],
                        interactive=False,
                    )

            # --- About tab ------------------------------------------------
            with gr.TabItem("ℹ️ About"):
                gr.Markdown(
                    """
                    ## About This Leaderboard

                    This leaderboard tracks the performance of various search and retrieval systems on the [DevRev Search Dataset](https://huggingface.co/datasets/devrev/search).

                    ### 📊 Evaluation Metrics

                    - **Recall@K**: The percentage of relevant article chunks retrieved in the top K article chunks
                    - **Precision@K**: The percentage of retrieved article chunks that are relevant among the top K article chunks

                    ### 📤 How to Submit

                    1. Run your retrieval on the test queries in DevRev Search Dataset
                    2. Submit the results in same format as annotated_queries in the dataset through email to prateek.jain@devrev.ai
                    3. Also include a **one-line system detail/link**, the **system type**, and whether it is **open source**

                    ### 🔗 Resources

                    - [Computer by DevRev](https://devrev.ai/meet-computer)
                    - [DevRev Search Dataset](https://huggingface.co/datasets/devrev/search)

                    ### 🙏 Acknowledgments

                    Inspired by:
                    - [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard)
                    - [Berkeley Function Calling Leaderboard](https://gorilla.cs.berkeley.edu/leaderboard)

                    ### 📚 Citation

                    ```bibtex
                    @misc{devrev_search_leaderboard_2026,
                        title={DevRev Search Leaderboard},
                        author={Research@DevRev},
                        year={2026},
                        url={https://huggingface.co/spaces/devrev/search}
                    }
                    ```
                    """,
                    elem_id="about",
                )

        # Footer with a render-time timestamp.
        gr.HTML(
            f"""
            <div style="text-align: center; margin-top: 50px; padding: 20px; border-top: 1px solid #e0e0e0; color: #666;">
                <p>
                    Last updated: {datetime.now().strftime("%Y-%m-%d %H:%M UTC")}
                </p>
            </div>
            """
        )

    return demo
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
# Application entry point: build the UI and serve it on port 7860.
if __name__ == "__main__":
    leaderboard_app = create_interface()
    leaderboard_app.launch(
        server_name="0.0.0.0", server_port=7860, share=True, show_api=False
    )
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==4.44.1
|
| 2 |
+
gradio-leaderboard==0.0.11
|
| 3 |
+
pandas==2.3.3
|
| 4 |
+
numpy==2.4.1
|
| 5 |
+
matplotlib==3.9.2
|
| 6 |
+
huggingface-hub==0.24.7
|
results/bm25.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "BM25",
|
| 3 |
+
"model_type": "Lexical",
|
| 4 |
+
"organization": "Open Source",
|
| 5 |
+
"description": "Classic lexical search algorithm based on term frequency and inverse document frequency",
|
| 6 |
+
"paper": "Robertson et al., 1994",
|
| 7 |
+
"code": "https://github.com/elastic/elasticsearch",
|
| 8 |
+
"open_source": true,
|
| 9 |
+
"api_available": false,
|
| 10 |
+
"evaluation_date": "2026-01-18",
|
| 11 |
+
"metrics": {
|
| 12 |
+
"recall@5": 9.37,
|
| 13 |
+
"recall@10": 14.77,
|
| 14 |
+
"recall@25": 23.84,
|
| 15 |
+
"recall@50": 30.70,
|
| 16 |
+
"precision@5": 11.96,
|
| 17 |
+
"precision@10": 10.43,
|
| 18 |
+
"precision@25": 7.39,
|
| 19 |
+
"precision@50": 5.50
|
| 20 |
+
}
|
| 21 |
+
}
|
results/cohere_v3_embeddings.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "cohere.embed-english-v3",
|
| 3 |
+
"model_type": "Dense",
|
| 4 |
+
"organization": "Cohere",
|
| 5 |
+
"description": "Cohere's embedding model for English",
|
| 6 |
+
"paper": "https://docs.cohere.com/docs/cohere-embed",
|
| 7 |
+
"open_source": false,
|
| 8 |
+
"api_available": true,
|
| 9 |
+
"evaluation_date": "2026-01-18",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"recall@5": 11.32,
|
| 12 |
+
"recall@10": 20.14,
|
| 13 |
+
"recall@25": 30.26,
|
| 14 |
+
"recall@50": 39.76,
|
| 15 |
+
"precision@5": 18.91,
|
| 16 |
+
"precision@10": 18.04,
|
| 17 |
+
"precision@25": 14.04,
|
| 18 |
+
"precision@50": 11.46
|
| 19 |
+
}
|
| 20 |
+
}
|
results/cohere_v4_embeddings.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "cohere.embed-v4:0",
|
| 3 |
+
"model_type": "Dense",
|
| 4 |
+
"organization": "Cohere",
|
| 5 |
+
"description": "Cohere's cohere.embed-v4:0 embedding model",
|
| 6 |
+
"paper": "https://docs.cohere.com/docs/cohere-embed",
|
| 7 |
+
"open_source": false,
|
| 8 |
+
"api_available": true,
|
| 9 |
+
"evaluation_date": "2026-01-18",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"recall@5": 17.71,
|
| 12 |
+
"recall@10": 23.21,
|
| 13 |
+
"recall@25": 37.00,
|
| 14 |
+
"recall@50": 44.74,
|
| 15 |
+
"precision@5": 24.78,
|
| 16 |
+
"precision@10": 21.85,
|
| 17 |
+
"precision@25": 16.56,
|
| 18 |
+
"precision@50": 12.39
|
| 19 |
+
}
|
| 20 |
+
}
|
results/gemini_bm25_reranker.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "gemini-embedding-001-bm25-zerank-1-small",
|
| 3 |
+
"model_type": "Hybrid",
|
| 4 |
+
"organization": "NA",
|
| 5 |
+
"description": "Hybrid search system combining Google's gemini-embedding-001 embedding model with BM25 using Reranker",
|
| 6 |
+
"paper": "https://ai.google.dev/gemini-api/docs/embeddings; Robertson et al., 1994; https://huggingface.co/zeroentropy/zerank-1-small",
|
| 7 |
+
"open_source": false,
|
| 8 |
+
"api_available": true,
|
| 9 |
+
"evaluation_date": "2026-01-18",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"recall@5": 29.11,
|
| 12 |
+
"recall@10": 36.50,
|
| 13 |
+
"recall@25": 52.09,
|
| 14 |
+
"recall@50": 60.00,
|
| 15 |
+
"precision@5": 35.65,
|
| 16 |
+
"precision@10": 26.85,
|
| 17 |
+
"precision@25": 19.00,
|
| 18 |
+
"precision@50": 13.56
|
| 19 |
+
}
|
| 20 |
+
}
|
results/gemini_bm25_rrf.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "gemini-embedding-001-bm25-rrf",
|
| 3 |
+
"model_type": "Hybrid",
|
| 4 |
+
"organization": "NA",
|
| 5 |
+
"description": "Hybrid search system combining Google's gemini-embedding-001 embedding model with BM25 using RRF",
|
| 6 |
+
"paper": "https://ai.google.dev/gemini-api/docs/embeddings; Robertson et al., 1994; Cormack et al., 2009",
|
| 7 |
+
"open_source": false,
|
| 8 |
+
"api_available": true,
|
| 9 |
+
"evaluation_date": "2026-01-18",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"recall@5": 23.02,
|
| 12 |
+
"recall@10": 31.48,
|
| 13 |
+
"recall@25": 47.22,
|
| 14 |
+
"recall@50": 54.60,
|
| 15 |
+
"precision@5": 29.56,
|
| 16 |
+
"precision@10": 23.04,
|
| 17 |
+
"precision@25": 17.48,
|
| 18 |
+
"precision@50": 12.78
|
| 19 |
+
},
|
| 20 |
+
"metadata": {
|
| 21 |
+
"rrf": {
|
| 22 |
+
"semantic_retrievals": 50,
|
| 23 |
+
"bm25_retrievals": 50,
|
| 24 |
+
"semantic_weight": 0.9,
|
| 25 |
+
"bm25_weight": 0.1,
|
| 26 |
+
"k": 60
|
| 27 |
+
}
|
| 28 |
+
}
|
| 29 |
+
}
|
results/gemini_embeddings.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "gemini-embedding-001",
|
| 3 |
+
"model_type": "Dense",
|
| 4 |
+
"organization": "Google",
|
| 5 |
+
"description": "Google's latest text embedding model in Gemini series",
|
| 6 |
+
"paper": "https://ai.google.dev/gemini-api/docs/embeddings",
|
| 7 |
+
"open_source": false,
|
| 8 |
+
"api_available": true,
|
| 9 |
+
"evaluation_date": "2026-01-18",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"recall@5": 23.08,
|
| 12 |
+
"recall@10": 31.04,
|
| 13 |
+
"recall@25": 46.73,
|
| 14 |
+
"recall@50": 54.60,
|
| 15 |
+
"precision@5": 29.56,
|
| 16 |
+
"precision@10": 23.26,
|
| 17 |
+
"precision@25": 17.22,
|
| 18 |
+
"precision@50": 12.78
|
| 19 |
+
}
|
| 20 |
+
}
|
results/gte_qwen2_bm25_reranker.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "GTE-Qwen2-7B-instruct-bm25-zerank-1-small",
|
| 3 |
+
"model_type": "Hybrid",
|
| 4 |
+
"organization": "NA",
|
| 5 |
+
"description": "Hybrid search system combining GTE-Qwen2-7B-instruct embedding model with BM25 using Reranker",
|
| 6 |
+
"paper": "https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct; Robertson et al., 1994; https://huggingface.co/zeroentropy/zerank-1-small",
|
| 7 |
+
"open_source": true,
|
| 8 |
+
"api_available": false,
|
| 9 |
+
"evaluation_date": "2026-01-18",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"recall@5": 28.07,
|
| 12 |
+
"recall@10": 35.08,
|
| 13 |
+
"recall@25": 48.19,
|
| 14 |
+
"recall@50": 57.55,
|
| 15 |
+
"precision@5": 34.56,
|
| 16 |
+
"precision@10": 26.85,
|
| 17 |
+
"precision@25": 19.91,
|
| 18 |
+
"precision@50": 14.76
|
| 19 |
+
}
|
| 20 |
+
}
|
results/gte_qwen2_bm25_rrf.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "GTE-Qwen2-7B-instruct-bm25-rrf",
|
| 3 |
+
"model_type": "Hybrid",
|
| 4 |
+
"organization": "NA",
|
| 5 |
+
"description": "Hybrid search system combining GTE-Qwen2-7B-instruct embedding model with BM25 using RRF",
|
| 6 |
+
"paper": "https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct; Robertson et al., 1994; Cormack et al., 2009",
|
| 7 |
+
"open_source": true,
|
| 8 |
+
"api_available": false,
|
| 9 |
+
"evaluation_date": "2026-01-18",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"recall@5": 16.44,
|
| 12 |
+
"recall@10": 26.14,
|
| 13 |
+
"recall@25": 39.39,
|
| 14 |
+
"recall@50": 52.55,
|
| 15 |
+
"precision@5": 26.30,
|
| 16 |
+
"precision@10": 22.5,
|
| 17 |
+
"precision@25": 16.91,
|
| 18 |
+
"precision@50": 14.20
|
| 19 |
+
},
|
| 20 |
+
"metadata": {
|
| 21 |
+
"rrf": {
|
| 22 |
+
"semantic_retrievals": 50,
|
| 23 |
+
"bm25_retrievals": 50,
|
| 24 |
+
"semantic_weight": 0.9,
|
| 25 |
+
"bm25_weight": 0.1,
|
| 26 |
+
"k": 60
|
| 27 |
+
}
|
| 28 |
+
}
|
| 29 |
+
}
|
results/gte_qwen2_embeddings.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "GTE-Qwen2-7B-instruct",
|
| 3 |
+
"model_type": "Dense",
|
| 4 |
+
"organization": "Alibaba",
|
| 5 |
+
"description": "Alibaba's GTE-Qwen2 embedding model",
|
| 6 |
+
"paper": "https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct",
|
| 7 |
+
"open_source": true,
|
| 8 |
+
"api_available": false,
|
| 9 |
+
"evaluation_date": "2026-01-18",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"recall@5": 15.62,
|
| 12 |
+
"recall@10": 24.46,
|
| 13 |
+
"recall@25": 39.84,
|
| 14 |
+
"recall@50": 52.55,
|
| 15 |
+
"precision@5": 25.22,
|
| 16 |
+
"precision@10": 21.85,
|
| 17 |
+
"precision@25": 16.96,
|
| 18 |
+
"precision@50": 14.20
|
| 19 |
+
}
|
| 20 |
+
}
|
results/openai_embeddings.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "text-embedding-3-large",
|
| 3 |
+
"model_type": "Dense",
|
| 4 |
+
"organization": "OpenAI",
|
| 5 |
+
"description": "OpenAI's latest text embedding model",
|
| 6 |
+
"paper": "https://openai.com/index/new-embedding-models-and-api-updates/",
|
| 7 |
+
"open_source": false,
|
| 8 |
+
"api_available": true,
|
| 9 |
+
"evaluation_date": "2026-01-18",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"recall@5": 16.06,
|
| 12 |
+
"recall@10": 24.03,
|
| 13 |
+
"recall@25": 35.59,
|
| 14 |
+
"recall@50": 45.10,
|
| 15 |
+
"precision@5": 24.78,
|
| 16 |
+
"precision@10": 20.65,
|
| 17 |
+
"precision@25": 16.74,
|
| 18 |
+
"precision@50": 13.13
|
| 19 |
+
}
|
| 20 |
+
}
|
results/qwen3_bm25_reranker.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Qwen3-Embedding-8B-bm25-zerank-1-small",
|
| 3 |
+
"model_type": "Hybrid",
|
| 4 |
+
"organization": "NA",
|
| 5 |
+
"description": "Hybrid search system combining Alibaba's Qwen3 embedding model with BM25 using Reranker",
|
| 6 |
+
"paper": "https://huggingface.co/Qwen/Qwen3-Embedding-8B; Robertson et al., 1994; https://huggingface.co/zeroentropy/zerank-1-small",
|
| 7 |
+
"open_source": true,
|
| 8 |
+
"api_available": false,
|
| 9 |
+
"evaluation_date": "2026-01-18",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"recall@5": 27.57,
|
| 12 |
+
"recall@10": 36.09,
|
| 13 |
+
"recall@25": 46.41,
|
| 14 |
+
"recall@50": 51.32,
|
| 15 |
+
"precision@5": 34.56,
|
| 16 |
+
"precision@10": 26.63,
|
| 17 |
+
"precision@25": 17.04,
|
| 18 |
+
"precision@50": 11.63
|
| 19 |
+
}
|
| 20 |
+
}
|
results/qwen3_bm25_rrf.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Qwen3-Embedding-8B-bm25-rrf",
|
| 3 |
+
"model_type": "Hybrid",
|
| 4 |
+
"organization": "NA",
|
| 5 |
+
"description": "Hybrid search system combining Alibaba's Qwen3 embedding model with BM25 using RRF",
|
| 6 |
+
"paper": "https://huggingface.co/Qwen/Qwen3-Embedding-8B; Robertson et al., 1994; Cormack et al., 2009",
|
| 7 |
+
"open_source": true,
|
| 8 |
+
"api_available": false,
|
| 9 |
+
"evaluation_date": "2026-01-18",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"recall@5": 15.92,
|
| 12 |
+
"recall@10": 24.22,
|
| 13 |
+
"recall@25": 34.08,
|
| 14 |
+
"recall@50": 43.13,
|
| 15 |
+
"precision@5": 22.61,
|
| 16 |
+
"precision@10": 18.37,
|
| 17 |
+
"precision@25": 13.35,
|
| 18 |
+
"precision@50": 11.17
|
| 19 |
+
},
|
| 20 |
+
"metadata": {
|
| 21 |
+
"rrf": {
|
| 22 |
+
"semantic_retrievals": 50,
|
| 23 |
+
"bm25_retrievals": 50,
|
| 24 |
+
"semantic_weight": 0.9,
|
| 25 |
+
"bm25_weight": 0.1,
|
| 26 |
+
"k": 60
|
| 27 |
+
}
|
| 28 |
+
}
|
| 29 |
+
}
|
results/qwen3_embeddings.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Qwen3-Embedding-8B",
|
| 3 |
+
"model_type": "Dense",
|
| 4 |
+
"organization": "Alibaba",
|
| 5 |
+
"description": "Alibaba's Qwen3 embedding model",
|
| 6 |
+
"paper": "https://huggingface.co/Qwen/Qwen3-Embedding-8B",
|
| 7 |
+
"open_source": true,
|
| 8 |
+
"api_available": false,
|
| 9 |
+
"evaluation_date": "2026-01-18",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"recall@5": 16.42,
|
| 12 |
+
"recall@10": 26.67,
|
| 13 |
+
"recall@25": 33.82,
|
| 14 |
+
"recall@50": 43.13,
|
| 15 |
+
"precision@5": 23.26,
|
| 16 |
+
"precision@10": 18.70,
|
| 17 |
+
"precision@25": 13.48,
|
| 18 |
+
"precision@50": 11.17
|
| 19 |
+
}
|
| 20 |
+
}
|
results/sfr_embeddings.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "SFR-Embedding-Mistral",
|
| 3 |
+
"model_type": "Dense",
|
| 4 |
+
"organization": "Salesforce",
|
| 5 |
+
"description": "Salesforce's SFR embedding model",
|
| 6 |
+
"paper": "https://huggingface.co/Salesforce/SFR-Embedding-Mistral",
|
| 7 |
+
"open_source": true,
|
| 8 |
+
"api_available": false,
|
| 9 |
+
"evaluation_date": "2026-01-18",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"recall@5": 17.02,
|
| 12 |
+
"recall@10": 26.61,
|
| 13 |
+
"recall@25": 39.82,
|
| 14 |
+
"recall@50": 51.32,
|
| 15 |
+
"precision@5": 23.91,
|
| 16 |
+
"precision@10": 21.30,
|
| 17 |
+
"precision@25": 15.26,
|
| 18 |
+
"precision@50": 11.80
|
| 19 |
+
}
|
| 20 |
+
}
|
results/snowflake_bm25_reranker.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "snowflake-arctic-embed-l-v2.0-bm25-zerank-1-small",
|
| 3 |
+
"model_type": "Hybrid",
|
| 4 |
+
"organization": "NA",
|
| 5 |
+
"description": "Hybrid search system combining Snowflake's snowflake-arctic-embed-l-v2.0 embedding model with BM25 using Reranker",
|
| 6 |
+
"paper": "https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0; Robertson et al., 1994; https://huggingface.co/zeroentropy/zerank-1-small",
|
| 7 |
+
"open_source": true,
|
| 8 |
+
"api_available": false,
|
| 9 |
+
"evaluation_date": "2026-01-18",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"recall@5": 27.57,
|
| 12 |
+
"recall@10": 36.09,
|
| 13 |
+
"recall@25": 46.41,
|
| 14 |
+
"recall@50": 51.32,
|
| 15 |
+
"precision@5": 34.56,
|
| 16 |
+
"precision@10": 26.63,
|
| 17 |
+
"precision@25": 17.04,
|
| 18 |
+
"precision@50": 11.63
|
| 19 |
+
}
|
| 20 |
+
}
|
results/snowflake_bm25_rrf.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "snowflake-arctic-embed-l-v2.0-bm25-rrf",
|
| 3 |
+
"model_type": "Hybrid",
|
| 4 |
+
"organization": "NA",
|
| 5 |
+
"description": "Hybrid search system combining Snowflake's snowflake-arctic-embed-l-v2.0 embedding model with BM25 using RRF",
|
| 6 |
+
"paper": "https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0; Robertson et al., 1994; Cormack et al., 2009",
|
| 7 |
+
"open_source": true,
|
| 8 |
+
"api_available": false,
|
| 9 |
+
"evaluation_date": "2026-01-18",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"recall@5": 19.56,
|
| 12 |
+
"recall@10": 25.22,
|
| 13 |
+
"recall@25": 34.34,
|
| 14 |
+
"recall@50": 40.55,
|
| 15 |
+
"precision@5": 23.70,
|
| 16 |
+
"precision@10": 18.91,
|
| 17 |
+
"precision@25": 13.43,
|
| 18 |
+
"precision@50": 9.91
|
| 19 |
+
},
|
| 20 |
+
"metadata": {
|
| 21 |
+
"rrf": {
|
| 22 |
+
"semantic_retrievals": 50,
|
| 23 |
+
"bm25_retrievals": 50,
|
| 24 |
+
"semantic_weight": 0.9,
|
| 25 |
+
"bm25_weight": 0.1,
|
| 26 |
+
"k": 60
|
| 27 |
+
}
|
| 28 |
+
}
|
| 29 |
+
}
|
results/snowflake_embeddings.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "snowflake-arctic-embed-l-v2.0",
|
| 3 |
+
"model_type": "Dense",
|
| 4 |
+
"organization": "Snowflake",
|
| 5 |
+
"description": "Snowflake's snowflake-arctic-embed-l-v2.0 embedding model",
|
| 6 |
+
"paper": "https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0",
|
| 7 |
+
"open_source": true,
|
| 8 |
+
"api_available": false,
|
| 9 |
+
"evaluation_date": "2026-01-18",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"recall@5": 18.34,
|
| 12 |
+
"recall@10": 25.76,
|
| 13 |
+
"recall@25": 34.16,
|
| 14 |
+
"recall@50": 40.55,
|
| 15 |
+
"precision@5": 23.26,
|
| 16 |
+
"precision@10": 19.67,
|
| 17 |
+
"precision@25": 13.30,
|
| 18 |
+
"precision@50": 9.91
|
| 19 |
+
}
|
| 20 |
+
}
|