prateek-jain commited on
Commit
7da0047
·
1 Parent(s): 942e13d

Push Leaderboard

Browse files
README.md CHANGED
@@ -1,13 +1,19 @@
1
  ---
2
- title: Search
3
- emoji: 🐢
4
- colorFrom: green
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 6.3.0
 
8
  app_file: app.py
9
- pinned: false
10
- short_description: DevRev Search Evaluation Leaderboard
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
1
  ---
2
+ title: DevRev Search Evaluation Leaderboard
3
+ emoji: 🏆
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 4.44.1
8
+ python_version: "3.11"
9
  app_file: app.py
10
+ pinned: true
 
11
  ---
12
 
13
+ # 🏆 DevRev Search Evaluation Leaderboard
14
+
15
+ Interactive leaderboard for benchmarking search and retrieval systems on enterprise knowledge bases.
16
+
17
+ ## Features
18
+ - Search performance metrics (Recall@K, Precision@K)
19
+ - Interactive filtering and comparison
app.py ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DevRev Search Evaluation Leaderboard
3
+
4
+ An interactive leaderboard for benchmarking search and retrieval systems
5
+ on enterprise knowledge bases. Built with Gradio and ready for Hugging Face Spaces.
6
+
7
+ Uses MTEB-style standardized JSON format for evaluation results.
8
+ """
9
+
10
+ import base64
11
+ import io
12
+ import json
13
+ import os
14
+ from datetime import datetime
15
+ from pathlib import Path
16
+
17
+ import gradio as gr
18
+ import matplotlib.pyplot as plt
19
+ import pandas as pd
20
+ from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
21
+
22
+
23
def load_results_from_json():
    """Load evaluation results from standardized JSON files.

    Searches candidate directories ("results", "leaderboard/results", ".")
    for ``*.json`` files, skips the schema file, and keeps only payloads
    that look like valid evaluation results (have "model_name" and
    "metrics" keys).

    Returns:
        list[dict]: parsed result payloads; empty list when nothing found.
    """
    results = []

    # First candidate that exists and contains at least one JSON file wins;
    # "." is the last-resort fallback for flat repository layouts.
    results_dir = None
    for dir_path in ("results", "leaderboard/results", "."):
        candidate = Path(dir_path)
        if candidate.is_dir() and any(candidate.glob("*.json")):
            results_dir = candidate
            break

    if results_dir is None:
        print(
            "No results directory found. Please create a 'results' directory with JSON files."
        )
        return []

    # Load all JSON files from the chosen results directory
    for json_file in results_dir.glob("*.json"):
        # The schema file describes the format; it is not a result itself.
        if json_file.name == "RESULT_SCHEMA.json":
            continue

        try:
            # Explicit UTF-8: result files may contain non-ASCII text, and
            # the platform default encoding is not guaranteed to be UTF-8.
            data = json.loads(json_file.read_text(encoding="utf-8"))
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and bad unicode input.
            print(f"Error loading {json_file}: {e}")
            continue

        # Only include dicts that look like valid evaluation results; a JSON
        # file with a non-dict top level (list, number, ...) is ignored.
        if isinstance(data, dict) and "model_name" in data and "metrics" in data:
            results.append(data)
            print(f"Loaded: {json_file.name}")

    return results
61
+
62
+
63
def create_leaderboard_data():
    """Build the ranked leaderboard DataFrame from the JSON result files.

    Loads every result via ``load_results_from_json``, formats the paper
    references as markdown, ranks systems by Recall@10 (ties broken by
    Precision@10), and returns the columns in display order.

    Returns:
        pd.DataFrame: ranked leaderboard table; empty when no results exist.
    """
    results = load_results_from_json()

    if not results:
        print(
            "No evaluation results found. Please add JSON files to the 'results' directory."
        )
        return pd.DataFrame()  # nothing to show

    def _format_references(raw):
        # Multiple references are ";"-separated; bare URLs become markdown links.
        if not raw or raw == "N/A":
            return "N/A"
        formatted = [
            f"[{ref}]({ref})" if ref.startswith("http") else ref
            for ref in (part.strip() for part in raw.split(";"))
        ]
        return " | ".join(formatted)

    rows = []
    for entry in results:
        scores = entry.get("metrics", {})
        rows.append(
            {
                "🏆 Rank": 0,  # placeholder; filled in after sorting
                "🔧 Method": entry.get("model_name", "Unknown"),
                "📝 Paper/Details": _format_references(entry.get("paper", "N/A")),
                "🏷️ Type": entry.get("model_type", "Unknown"),
                "📈 Recall@5": scores.get("recall@5", 0),
                "📈 Recall@10": scores.get("recall@10", 0),
                "📈 Recall@25": scores.get("recall@25", 0),
                "📈 Recall@50": scores.get("recall@50", 0),
                "📉 Precision@5": scores.get("precision@5", 0),
                "📉 Precision@10": scores.get("precision@10", 0),
                "📉 Precision@25": scores.get("precision@25", 0),
                "📉 Precision@50": scores.get("precision@50", 0),
                "🚀 Open Source": "✅" if entry.get("open_source", False) else "❌",
                "📅 Date": entry.get("evaluation_date", "N/A"),
            }
        )

    table = pd.DataFrame(rows)

    # Rank by Recall@10 first, breaking ties with Precision@10.
    table = table.sort_values(["📈 Recall@10", "📉 Precision@10"], ascending=False)
    table["🏆 Rank"] = range(1, len(table) + 1)

    # Fixed display order for the columns.
    display_order = [
        "🏆 Rank",
        "🔧 Method",
        "📝 Paper/Details",
        "🏷️ Type",
        "📈 Recall@5",
        "📈 Recall@10",
        "📈 Recall@25",
        "📈 Recall@50",
        "📉 Precision@5",
        "📉 Precision@10",
        "📉 Precision@25",
        "📉 Precision@50",
        "🚀 Open Source",
        "📅 Date",
    ]
    return table[display_order]
144
+
145
+
146
def create_comparison_plot():
    """Render Recall@50 / Precision@50 bar charts as an inline HTML image.

    Returns:
        str: an ``<img>`` tag with a base64-encoded PNG, or a placeholder
        paragraph when no leaderboard data is available.
    """
    board = create_leaderboard_data()

    if board.empty:
        return "<p style='text-align: center; color: #666;'>No data available for visualization. Please add evaluation results to the 'results' directory.</p>"

    # Sort ascending so the best system ends up at the top of the barh chart.
    ordered = board.sort_values("📈 Recall@50", ascending=True)
    labels = ordered["🔧 Method"].tolist()
    # Highlight DevRev systems in red, everything else in teal.
    bar_colors = ["#ff6b6b" if "DevRev" in name else "#4ecdc4" for name in labels]

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    panels = (
        (axes[0], ordered["📈 Recall@50"].tolist(), "Recall@50"),
        (axes[1], ordered["📉 Precision@50"].tolist(), "Precision@50"),
    )
    for ax, values, metric in panels:
        ax.barh(labels, values, color=bar_colors, alpha=0.8)
        ax.set_xlabel(f"{metric} (%)", fontsize=12)
        ax.set_title(f"{metric} Comparison", fontsize=14, fontweight="bold")
        ax.grid(True, axis="x", alpha=0.3)
        # Annotate each bar with its numeric value.
        for pos, val in enumerate(values):
            ax.text(val + 0.5, pos, f"{val:.1f}%", va="center", fontsize=10)

    plt.tight_layout()

    # Serialize the figure to a base64 PNG so it can be inlined in HTML.
    buffer = io.BytesIO()
    plt.savefig(buffer, format="png", dpi=150, bbox_inches="tight")
    buffer.seek(0)
    encoded = base64.b64encode(buffer.read()).decode()
    plt.close()

    return f'<img src="data:image/png;base64,{encoded}" style="width: 100%; max-width: 1000px; margin: 20px auto; display: block;">'
200
+
201
+
202
def create_interface():
    """Create the Gradio interface with leaderboard and visualizations.

    Returns:
        gr.Blocks: the assembled (unlaunched) Gradio application.
    """

    # Client-side deep-linking: when the URL hash is "#about", switch to the
    # About tab and scroll to its content once Gradio has mounted it.
    deep_link_js = r"""
    () => {
      function openAboutAndScroll() {
        if (window.location.hash !== "#about") return;

        // Switch to the About tab (Gradio tabs are rendered as role="tab" buttons)
        const tabs = Array.from(document.querySelectorAll('button[role="tab"]'));
        const aboutTab = tabs.find((b) => (b.innerText || "").includes("About"));
        if (aboutTab) aboutTab.click();

        // The About content is mounted after tab switch; retry briefly.
        let attempts = 0;
        const timer = setInterval(() => {
          const el = document.getElementById("about");
          if (el) {
            el.scrollIntoView({ behavior: "smooth", block: "start" });
            clearInterval(timer);
          }
          attempts += 1;
          if (attempts > 25) clearInterval(timer);
        }, 200);
      }

      window.addEventListener("hashchange", openAboutAndScroll);
      openAboutAndScroll();
      setTimeout(openAboutAndScroll, 600);
    }
    """

    with gr.Blocks(
        title="DevRev Search Evaluation Leaderboard", js=deep_link_js
    ) as demo:
        # Header
        gr.HTML(
            """
            <div style="text-align: center; margin-bottom: 30px;">
                <h1 style="font-size: 3em; font-weight: bold; margin-bottom: 10px;">
                    🏆 DevRev Search Evaluation Leaderboard
                </h1>
                <p style="font-size: 1.2em; color: #666;">
                    Benchmarking Search and Retrieval Systems for Enterprise Knowledge Bases
                </p>
            </div>
            """
        )

        # Tabs
        with gr.Tabs():
            # Main Leaderboard Tab
            with gr.TabItem("🏆 Main Leaderboard"):
                gr.Markdown(
                    """
                    ### Evaluation Overview
                    This leaderboard displays metrics of search systems on the test queries present in [DevRev Search Dataset](https://huggingface.co/datasets/devrev/search).
                    All methods are evaluated on the same set of agent support queries with consistent evaluation protocols.

                    **Metrics**: Recall@K and Precision@K measure the effectiveness of retrieving relevant articles within the top K retrieved articles.

                    **Leaderboard ranking**: Sorted by **Recall@10** (primary) and **Precision@10** (secondary).

                    **To add your results**: Submission details are available in the [About](#about) section.
                    """
                )

                # Get leaderboard data
                df = create_leaderboard_data()

                if not df.empty:
                    # Configure which columns to display by default
                    default_columns = [
                        "🏆 Rank",
                        "🔧 Method",
                        "🏷️ Type",
                        "📈 Recall@10",
                        "📈 Recall@50",
                        "📉 Precision@10",
                        "📉 Precision@50",
                        "🚀 Open Source",
                    ]

                    # Define column filters
                    type_column = ColumnFilter("🏷️ Type", type="checkboxgroup")
                    open_source_column = ColumnFilter(
                        "🚀 Open Source", type="checkboxgroup"
                    )

                    # Create the interactive leaderboard; datatype order must
                    # match the column order produced by create_leaderboard_data.
                    Leaderboard(
                        value=df,
                        datatype=[
                            "number",
                            "markdown",
                            "markdown",
                            "str",
                            "number",
                            "number",
                            "number",
                            "number",
                            "number",
                            "number",
                            "number",
                            "number",
                            "str",
                            "str",
                        ],
                        select_columns=SelectColumns(
                            default_selection=default_columns,
                            cant_deselect=[
                                "🏆 Rank",
                                "🔧 Method",
                                "📈 Recall@10",
                            ],
                            label="Select Columns to Display",
                        ),
                        search_columns=[
                            "🔧 Method",
                            "📝 Paper/Details",
                            "🏷️ Type",
                        ],
                        hide_columns=["📅 Date"],
                        filter_columns=[type_column, open_source_column],
                        interactive=False,
                    )
                else:
                    gr.HTML(
                        """
                        <div style="text-align: center; padding: 50px; background: #f5f5f5; border-radius: 10px;">
                            <h3>No Results Found</h3>
                            <p>Please add JSON evaluation files to the 'results' directory.</p>
                            <p>See the About tab for the required format.</p>
                        </div>
                        """
                    )

            # About Tab
            with gr.TabItem("ℹ️ About"):
                gr.Markdown(
                    """
                    ## About This Leaderboard

                    This leaderboard tracks the performance of various search and retrieval systems on the [DevRev Search Dataset](https://huggingface.co/datasets/devrev/search).

                    ### 📊 Evaluation Metrics

                    - **Recall@K**: The percentage of relevant article chunks retrieved in the top K article chunks
                    - **Precision@K**: The percentage of retrieved article chunks that are relevant among the top K article chunks

                    ### 📤 How to Submit

                    1. Run your retrieval on the test queries in DevRev Search Dataset
                    2. Submit the results in same format as annotated_queries in the dataset through email to prateek.jain@devrev.ai
                    3. Also include a **one-line system detail/link**, the **system type**, and whether it is **open source**

                    ### 🔗 Resources

                    - [Computer by DevRev](https://devrev.ai/meet-computer)
                    - [DevRev Search Dataset](https://huggingface.co/datasets/devrev/search)

                    ### 🙏 Acknowledgments

                    Inspired by:
                    - [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard)
                    - [Berkeley Function Calling Leaderboard](https://gorilla.cs.berkeley.edu/leaderboard)

                    ### 📚 Citation

                    ```bibtex
                    @misc{devrev_search_leaderboard_2026,
                        title={DevRev Search Leaderboard},
                        author={Research@DevRev},
                        year={2026},
                        url={https://huggingface.co/spaces/devrev/search}
                    }
                    ```
                    """,
                    elem_id="about",
                )

        # Footer. BUGFIX: the label says "UTC", so the timestamp must be
        # taken in UTC — datetime.now() alone returns server-local time.
        gr.HTML(
            f"""
            <div style="text-align: center; margin-top: 50px; padding: 20px; border-top: 1px solid #e0e0e0; color: #666;">
                <p>
                    Last updated: {datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")}
                </p>
            </div>
            """
        )

    return demo
395
+
396
+
397
# Create and launch the app when executed as a script.
if __name__ == "__main__":
    create_interface().launch(
        server_name="0.0.0.0", server_port=7860, share=True, show_api=False
    )
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio==4.44.1
2
+ gradio-leaderboard==0.0.11
3
+ pandas==2.3.3
4
+ numpy==2.4.1
5
+ matplotlib==3.9.2
6
+ huggingface-hub==0.24.7
results/bm25.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "BM25",
3
+ "model_type": "Lexical",
4
+ "organization": "Open Source",
5
+ "description": "Classic lexical search algorithm based on term frequency and inverse document frequency",
6
+ "paper": "Robertson et al., 1994",
7
+ "code": "https://github.com/elastic/elasticsearch",
8
+ "open_source": true,
9
+ "api_available": false,
10
+ "evaluation_date": "2026-01-18",
11
+ "metrics": {
12
+ "recall@5": 9.37,
13
+ "recall@10": 14.77,
14
+ "recall@25": 23.84,
15
+ "recall@50": 30.70,
16
+ "precision@5": 11.96,
17
+ "precision@10": 10.43,
18
+ "precision@25": 7.39,
19
+ "precision@50": 5.50
20
+ }
21
+ }
results/cohere_v3_embeddings.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "cohere.embed-english-v3",
3
+ "model_type": "Dense",
4
+ "organization": "Cohere",
5
+ "description": "Cohere's embedding model for English",
6
+ "paper": "https://docs.cohere.com/docs/cohere-embed",
7
+ "open_source": false,
8
+ "api_available": true,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 11.32,
12
+ "recall@10": 20.14,
13
+ "recall@25": 30.26,
14
+ "recall@50": 39.76,
15
+ "precision@5": 18.91,
16
+ "precision@10": 18.04,
17
+ "precision@25": 14.04,
18
+ "precision@50": 11.46
19
+ }
20
+ }
results/cohere_v4_embeddings.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "cohere.embed-v4:0",
3
+ "model_type": "Dense",
4
+ "organization": "Cohere",
5
+ "description": "Cohere's cohere.embed-v4:0 embedding model",
6
+ "paper": "https://docs.cohere.com/docs/cohere-embed",
7
+ "open_source": false,
8
+ "api_available": true,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 17.71,
12
+ "recall@10": 23.21,
13
+ "recall@25": 37.00,
14
+ "recall@50": 44.74,
15
+ "precision@5": 24.78,
16
+ "precision@10": 21.85,
17
+ "precision@25": 16.56,
18
+ "precision@50": 12.39
19
+ }
20
+ }
results/gemini_bm25_reranker.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "gemini-embedding-001-bm25-zerank-1-small",
3
+ "model_type": "Hybrid",
4
+ "organization": "NA",
5
+ "description": "Hybrid search system combining Google's gemini-embedding-001 embedding model with BM25 using Reranker",
6
+ "paper": "https://ai.google.dev/gemini-api/docs/embeddings; Robertson et al., 1994; https://huggingface.co/zeroentropy/zerank-1-small",
7
+ "open_source": false,
8
+ "api_available": true,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 29.11,
12
+ "recall@10": 36.50,
13
+ "recall@25": 52.09,
14
+ "recall@50": 60.00,
15
+ "precision@5": 35.65,
16
+ "precision@10": 26.85,
17
+ "precision@25": 19.00,
18
+ "precision@50": 13.56
19
+ }
20
+ }
results/gemini_bm25_rrf.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "gemini-embedding-001-bm25-rrf",
3
+ "model_type": "Hybrid",
4
+ "organization": "NA",
5
+ "description": "Hybrid search system combining Google's gemini-embedding-001 embedding model with BM25 using RRF",
6
+ "paper": "https://ai.google.dev/gemini-api/docs/embeddings; Robertson et al., 1994; Cormack et al., 2009",
7
+ "open_source": false,
8
+ "api_available": true,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 23.02,
12
+ "recall@10": 31.48,
13
+ "recall@25": 47.22,
14
+ "recall@50": 54.60,
15
+ "precision@5": 29.56,
16
+ "precision@10": 23.04,
17
+ "precision@25": 17.48,
18
+ "precision@50": 12.78
19
+ },
20
+ "metadata": {
21
+ "rrf": {
22
+ "semantic_retrievals": 50,
23
+ "bm25_retrievals": 50,
24
+ "semantic_weight": 0.9,
25
+ "bm25_weight": 0.1,
26
+ "k": 60
27
+ }
28
+ }
29
+ }
results/gemini_embeddings.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "gemini-embedding-001",
3
+ "model_type": "Dense",
4
+ "organization": "Google",
5
+ "description": "Google's latest text embedding model in Gemini series",
6
+ "paper": "https://ai.google.dev/gemini-api/docs/embeddings",
7
+ "open_source": false,
8
+ "api_available": true,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 23.08,
12
+ "recall@10": 31.04,
13
+ "recall@25": 46.73,
14
+ "recall@50": 54.60,
15
+ "precision@5": 29.56,
16
+ "precision@10": 23.26,
17
+ "precision@25": 17.22,
18
+ "precision@50": 12.78
19
+ }
20
+ }
results/gte_qwen2_bm25_reranker.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GTE-Qwen2-7B-instruct-bm25-zerank-1-small",
3
+ "model_type": "Hybrid",
4
+ "organization": "NA",
5
+ "description": "Hybrid search system combining GTE-Qwen2-7B-instruct embedding model with BM25 using Reranker",
6
+ "paper": "https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct; Robertson et al., 1994; https://huggingface.co/zeroentropy/zerank-1-small",
7
+ "open_source": true,
8
+ "api_available": false,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 28.07,
12
+ "recall@10": 35.08,
13
+ "recall@25": 48.19,
14
+ "recall@50": 57.55,
15
+ "precision@5": 34.56,
16
+ "precision@10": 26.85,
17
+ "precision@25": 19.91,
18
+ "precision@50": 14.76
19
+ }
20
+ }
results/gte_qwen2_bm25_rrf.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GTE-Qwen2-7B-instruct-bm25-rrf",
3
+ "model_type": "Hybrid",
4
+ "organization": "NA",
5
+ "description": "Hybrid search system combining GTE-Qwen2-7B-instruct embedding model with BM25 using RRF",
6
+ "paper": "https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct; Robertson et al., 1994; Cormack et al., 2009",
7
+ "open_source": true,
8
+ "api_available": false,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 16.44,
12
+ "recall@10": 26.14,
13
+ "recall@25": 39.39,
14
+ "recall@50": 52.55,
15
+ "precision@5": 26.30,
16
+ "precision@10": 22.5,
17
+ "precision@25": 16.91,
18
+ "precision@50": 14.20
19
+ },
20
+ "metadata": {
21
+ "rrf": {
22
+ "semantic_retrievals": 50,
23
+ "bm25_retrievals": 50,
24
+ "semantic_weight": 0.9,
25
+ "bm25_weight": 0.1,
26
+ "k": 60
27
+ }
28
+ }
29
+ }
results/gte_qwen2_embeddings.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GTE-Qwen2-7B-instruct",
3
+ "model_type": "Dense",
4
+ "organization": "Alibaba",
5
+ "description": "Alibaba's GTE-Qwen2 embedding model",
6
+ "paper": "https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct",
7
+ "open_source": true,
8
+ "api_available": false,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 15.62,
12
+ "recall@10": 24.46,
13
+ "recall@25": 39.84,
14
+ "recall@50": 52.55,
15
+ "precision@5": 25.22,
16
+ "precision@10": 21.85,
17
+ "precision@25": 16.96,
18
+ "precision@50": 14.20
19
+ }
20
+ }
results/openai_embeddings.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "text-embedding-3-large",
3
+ "model_type": "Dense",
4
+ "organization": "OpenAI",
5
+ "description": "OpenAI's latest text embedding model",
6
+ "paper": "https://openai.com/index/new-embedding-models-and-api-updates/",
7
+ "open_source": false,
8
+ "api_available": true,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 16.06,
12
+ "recall@10": 24.03,
13
+ "recall@25": 35.59,
14
+ "recall@50": 45.10,
15
+ "precision@5": 24.78,
16
+ "precision@10": 20.65,
17
+ "precision@25": 16.74,
18
+ "precision@50": 13.13
19
+ }
20
+ }
results/qwen3_bm25_reranker.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Qwen3-Embedding-8B-bm25-zerank-1-small",
3
+ "model_type": "Hybrid",
4
+ "organization": "NA",
5
+ "description": "Hybrid search system combining Alibaba's Qwen3 embedding model with BM25 using Reranker",
6
+ "paper": "https://huggingface.co/Qwen/Qwen3-Embedding-8B; Robertson et al., 1994; https://huggingface.co/zeroentropy/zerank-1-small",
7
+ "open_source": true,
8
+ "api_available": false,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 27.57,
12
+ "recall@10": 36.09,
13
+ "recall@25": 46.41,
14
+ "recall@50": 51.32,
15
+ "precision@5": 34.56,
16
+ "precision@10": 26.63,
17
+ "precision@25": 17.04,
18
+ "precision@50": 11.63
19
+ }
20
+ }
results/qwen3_bm25_rrf.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Qwen3-Embedding-8B-bm25-rrf",
3
+ "model_type": "Hybrid",
4
+ "organization": "NA",
5
+ "description": "Hybrid search system combining Alibaba's Qwen3 embedding model with BM25 using RRF",
6
+ "paper": "https://huggingface.co/Qwen/Qwen3-Embedding-8B; Robertson et al., 1994; Cormack et al., 2009",
7
+ "open_source": true,
8
+ "api_available": false,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 15.92,
12
+ "recall@10": 24.22,
13
+ "recall@25": 34.08,
14
+ "recall@50": 43.13,
15
+ "precision@5": 22.61,
16
+ "precision@10": 18.37,
17
+ "precision@25": 13.35,
18
+ "precision@50": 11.17
19
+ },
20
+ "metadata": {
21
+ "rrf": {
22
+ "semantic_retrievals": 50,
23
+ "bm25_retrievals": 50,
24
+ "semantic_weight": 0.9,
25
+ "bm25_weight": 0.1,
26
+ "k": 60
27
+ }
28
+ }
29
+ }
results/qwen3_embeddings.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Qwen3-Embedding-8B",
3
+ "model_type": "Dense",
4
+ "organization": "Alibaba",
5
+ "description": "Alibaba's Qwen3 embedding model",
6
+ "paper": "https://huggingface.co/Qwen/Qwen3-Embedding-8B",
7
+ "open_source": true,
8
+ "api_available": false,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 16.42,
12
+ "recall@10": 26.67,
13
+ "recall@25": 33.82,
14
+ "recall@50": 43.13,
15
+ "precision@5": 23.26,
16
+ "precision@10": 18.70,
17
+ "precision@25": 13.48,
18
+ "precision@50": 11.17
19
+ }
20
+ }
results/sfr_embeddings.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "SFR-Embedding-Mistral",
3
+ "model_type": "Dense",
4
+ "organization": "Salesforce",
5
+ "description": "Salesforce's SFR embedding model",
6
+ "paper": "https://huggingface.co/Salesforce/SFR-Embedding-Mistral",
7
+ "open_source": true,
8
+ "api_available": false,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 17.02,
12
+ "recall@10": 26.61,
13
+ "recall@25": 39.82,
14
+ "recall@50": 51.32,
15
+ "precision@5": 23.91,
16
+ "precision@10": 21.30,
17
+ "precision@25": 15.26,
18
+ "precision@50": 11.80
19
+ }
20
+ }
results/snowflake_bm25_reranker.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "snowflake-arctic-embed-l-v2.0-bm25-zerank-1-small",
3
+ "model_type": "Hybrid",
4
+ "organization": "NA",
5
+ "description": "Hybrid search system combining Snowflake's snowflake-arctic-embed-l-v2.0 embedding model with BM25 using Reranker",
6
+ "paper": "https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0; Robertson et al., 1994; https://huggingface.co/zeroentropy/zerank-1-small",
7
+ "open_source": true,
8
+ "api_available": false,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 27.57,
12
+ "recall@10": 36.09,
13
+ "recall@25": 46.41,
14
+ "recall@50": 51.32,
15
+ "precision@5": 34.56,
16
+ "precision@10": 26.63,
17
+ "precision@25": 17.04,
18
+ "precision@50": 11.63
19
+ }
20
+ }
results/snowflake_bm25_rrf.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "snowflake-arctic-embed-l-v2.0-bm25-rrf",
3
+ "model_type": "Hybrid",
4
+ "organization": "NA",
5
+ "description": "Hybrid search system combining Snowflake's snowflake-arctic-embed-l-v2.0 embedding model with BM25 using RRF",
6
+ "paper": "https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0; Robertson et al., 1994; Cormack et al., 2009",
7
+ "open_source": true,
8
+ "api_available": false,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 19.56,
12
+ "recall@10": 25.22,
13
+ "recall@25": 34.34,
14
+ "recall@50": 40.55,
15
+ "precision@5": 23.70,
16
+ "precision@10": 18.91,
17
+ "precision@25": 13.43,
18
+ "precision@50": 9.91
19
+ },
20
+ "metadata": {
21
+ "rrf": {
22
+ "semantic_retrievals": 50,
23
+ "bm25_retrievals": 50,
24
+ "semantic_weight": 0.9,
25
+ "bm25_weight": 0.1,
26
+ "k": 60
27
+ }
28
+ }
29
+ }
results/snowflake_embeddings.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "snowflake-arctic-embed-l-v2.0",
3
+ "model_type": "Dense",
4
+ "organization": "Snowflake",
5
+ "description": "Snowflake's snowflake-arctic-embed-l-v2.0 embedding model",
6
+ "paper": "https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0",
7
+ "open_source": true,
8
+ "api_available": false,
9
+ "evaluation_date": "2026-01-18",
10
+ "metrics": {
11
+ "recall@5": 18.34,
12
+ "recall@10": 25.76,
13
+ "recall@25": 34.16,
14
+ "recall@50": 40.55,
15
+ "precision@5": 23.26,
16
+ "precision@10": 19.67,
17
+ "precision@25": 13.30,
18
+ "precision@50": 9.91
19
+ }
20
+ }