Spaces:

amryassin
/

embedding-bench

Running

App Files Files Community

AmrYassinIsFree committed on Apr 14

Commit

f1c066b

1 Parent(s): 9d71632

enhancing the ui

Browse files

Files changed (2) hide show

.github/workflows/sync-to-hf.yml +19 -0
app.py +284 -80

.github/workflows/sync-to-hf.yml ADDED Viewed

	@@ -0,0 +1,19 @@

+# Pushes this repository's main branch to the amryassin/embedding-bench
+# HuggingFace Space whenever main receives a push.
+name: Sync to HuggingFace Spaces
+on:
+  push:
+    branches: [main]
+jobs:
+  sync:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          # 0 = fetch the complete history instead of a shallow clone
+          # (presumably so the push to the Space remote is accepted — confirm).
+          fetch-depth: 0
+      - name: Push to HuggingFace Spaces
+        env:
+          # Read from the repository secret named HF_TOKEN.
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        # --force overwrites the Space's main branch with this repository's main.
+        run: |
+          git push https://amryassin:$HF_TOKEN@huggingface.co/spaces/amryassin/embedding-bench main --force

app.py CHANGED Viewed

@@ -16,7 +16,7 @@ from models import REGISTRY, ModelConfig
 from wrapper import load_model
 # ---------------------------------------------------------------------------
-# Page config
 # ---------------------------------------------------------------------------
 st.set_page_config(
     page_title="Embedding Bench",
@@ -24,26 +24,101 @@ st.set_page_config(
     layout="wide",
 )
-st.title("📐 Embedding Bench")
-st.caption("Compare text embedding models on quality, speed & memory — all in your browser.")
 # ---------------------------------------------------------------------------
 # Sidebar — configuration
 # ---------------------------------------------------------------------------
-st.sidebar.header("Models")
 available_models = list(REGISTRY.keys())
 selected_models = st.sidebar.multiselect(
     "Select models",
     available_models,
     default=["mpnet", "bge-small"] if len(available_models) >= 2 else available_models[:1],
 )
-st.sidebar.header("Datasets")
 available_datasets = list(DATASET_PRESETS.keys())
 selected_datasets = st.sidebar.multiselect(
     "Select dataset presets",
     available_datasets,
     default=["sts"],
 )
 max_pairs = st.sidebar.number_input(
@@ -55,9 +130,10 @@ max_pairs = st.sidebar.number_input(
     help="Limits the number of pairs evaluated. Keep low for large datasets.",
 )
-st.sidebar.header("Speed & Memory")
-run_speed = st.sidebar.checkbox("Run speed benchmark", value=False)
-run_memory = st.sidebar.checkbox("Run memory benchmark", value=False)
 corpus_size = 500
 num_runs = 3
@@ -68,6 +144,8 @@ if run_speed or run_memory:
 if run_speed:
     num_runs = st.sidebar.number_input("Speed runs", 1, 10, 3)
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
@@ -107,6 +185,38 @@ def results_to_csv(results: list[dict]) -> str:
     return buf.getvalue()
 # ---------------------------------------------------------------------------
 # Run benchmark
 # ---------------------------------------------------------------------------
@@ -131,7 +241,6 @@ if run_btn:
         cfg = REGISTRY[model_key]
         result: dict = {"name": cfg.name, "is_baseline": cfg.is_baseline}
-        # Quality
         model = get_model(model_key)
         quality_results = {}
         for ds_cfg in ds_configs:
@@ -139,22 +248,20 @@ if run_btn:
             step += 1
             progress.progress(
                 step / total_steps,
-                text=f"Evaluating {cfg.name} on {ds_key}...",
             )
             quality_results[ds_key] = evaluate_quality(model, ds_cfg, max_pairs=max_pairs)
         result["quality"] = quality_results
-        # Speed
         if run_speed:
             step += 1
-            progress.progress(step / total_steps, text=f"Speed benchmark: {cfg.name}...")
             corpus = build_corpus(corpus_size, ds_configs[0])
             result["speed"] = evaluate_speed(model, corpus, num_runs=num_runs, batch_size=batch_size)
-        # Memory
         if run_memory:
             step += 1
-            progress.progress(step / total_steps, text=f"Memory benchmark: {cfg.name}...")
             from evals.memory import evaluate_memory
             corpus = build_corpus(corpus_size, ds_configs[0])
             result["memory_mb"] = evaluate_memory(
@@ -167,7 +274,6 @@ if run_btn:
     time.sleep(0.3)
     progress.empty()
-    # Store results in session state
     st.session_state["results"] = results
     st.session_state["selected_datasets"] = selected_datasets
@@ -175,31 +281,21 @@ if run_btn:
 # Display results
 # ---------------------------------------------------------------------------
 if "results" not in st.session_state:
-    st.info("Configure options in the sidebar and hit **Run Benchmark**.")
     st.stop()
 results = st.session_state["results"]
-selected_datasets = st.session_state["selected_datasets"]
-# --- Results table ---
-st.header("Results")
-flat_rows = [flatten_result(r) for r in results]
-st.dataframe(flat_rows, use_container_width=True)
-# --- CSV download ---
-csv_data = results_to_csv(results)
-st.download_button(
-    "📥 Download CSV",
-    data=csv_data,
-    file_name="embedding_bench_results.csv",
-    mime="text/csv",
-)
-# --- Charts ---
-st.header("Charts")
-models = [r["name"] for r in results]
-# Discover datasets
 ds_keys: list[str] = []
 for r in results:
     q = r.get("quality")
@@ -207,6 +303,88 @@ for r in results:
         ds_keys = list(q.keys())
         break
 for ds_key in ds_keys:
     first_metrics = None
     for r in results:
@@ -219,17 +397,18 @@ for ds_key in ds_keys:
     if "spearman" in first_metrics:
         values = [r.get("quality", {}).get(ds_key, {}).get("spearman", 0) for r in results]
-        fig, ax = plt.subplots(figsize=(max(6, len(models) * 1.5), 4))
-        bars = ax.bar(models, values, color="#4C72B0")
-        ax.set_ylabel("Spearman Correlation")
-        ax.set_title(f"Quality — {ds_key}")
-        ax.set_ylim(0, 1)
         for bar, v in zip(bars, values):
             ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
-                    f"{v:.4f}", ha="center", va="bottom", fontsize=9)
         plt.xticks(rotation=30, ha="right")
         plt.tight_layout()
-        st.pyplot(fig)
         plt.close(fig)
     else:
         metric_names = ["mrr", "recall@1", "recall@5", "recall@10"]
@@ -237,52 +416,77 @@ for ds_key in ds_keys:
         width = 0.18
         colors = ["#4C72B0", "#55A868", "#C44E52", "#8172B2"]
-        fig, ax = plt.subplots(figsize=(max(8, len(models) * 2.2), 4.5))
         for i, (metric, color) in enumerate(zip(metric_names, colors)):
             values = [r.get("quality", {}).get(ds_key, {}).get(metric, 0) for r in results]
             offset = (i - 1.5) * width
-            bars = ax.bar(x + offset, values, width, label=metric, color=color)
             for bar, v in zip(bars, values):
                 ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.005,
-                        f"{v:.2f}", ha="center", va="bottom", fontsize=7)
-        ax.set_ylabel("Score")
-        ax.set_title(f"Retrieval Quality — {ds_key}")
         ax.set_ylim(0, 1.15)
         ax.set_xticks(x)
-        ax.set_xticklabels(models, rotation=30, ha="right")
-        ax.legend()
         plt.tight_layout()
-        st.pyplot(fig)
         plt.close(fig)
-# Speed chart
 speed_values = [r.get("speed", {}).get("sentences_per_second", 0) for r in results]
-if any(v > 0 for v in speed_values):
-    fig, ax = plt.subplots(figsize=(max(6, len(models) * 1.5), 4))
-    bars = ax.bar(models, speed_values, color="#55A868")
-    ax.set_ylabel("Sentences / second")
-    ax.set_title("Encoding Speed")
-    for bar, v in zip(bars, speed_values):
-        if v > 0:
-            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5,
-                    str(v), ha="center", va="bottom", fontsize=9)
-    plt.xticks(rotation=30, ha="right")
-    plt.tight_layout()
-    st.pyplot(fig)
-    plt.close(fig)
-# Memory chart
 mem_values = [r.get("memory_mb", 0) for r in results]
-if any(v > 0 for v in mem_values):
-    fig, ax = plt.subplots(figsize=(max(6, len(models) * 1.5), 4))
-    bars = ax.bar(models, mem_values, color="#C44E52")
-    ax.set_ylabel("Peak Memory (MB)")
-    ax.set_title("Memory Usage")
-    for bar, v in zip(bars, mem_values):
-        if v > 0:
-            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5,
-                    str(v), ha="center", va="bottom", fontsize=9)
-    plt.xticks(rotation=30, ha="right")
-    plt.tight_layout()
-    st.pyplot(fig)
-    plt.close(fig)

 from wrapper import load_model
 # ---------------------------------------------------------------------------
+# Page config & custom CSS
 # ---------------------------------------------------------------------------
 st.set_page_config(
     page_title="Embedding Bench",
     layout="wide",
 )
+st.markdown("""
+<style>
+    /* Tighter top padding */
+    .block-container { padding-top: 1.5rem; padding-bottom: 1rem; }
+    /* Metric cards */
+    .metric-card {
+        background: linear-gradient(135deg, #1a1d23 0%, #22262e 100%);
+        border: 1px solid #333;
+        border-radius: 10px;
+        padding: 14px 18px;
+        text-align: center;
+    }
+    .metric-card .label {
+        font-size: 0.72rem;
+        color: #888;
+        text-transform: uppercase;
+        letter-spacing: 0.05em;
+        margin-bottom: 4px;
+    }
+    .metric-card .value {
+        font-size: 1.5rem;
+        font-weight: 700;
+        color: #fafafa;
+    }
+    .metric-card .sub {
+        font-size: 0.7rem;
+        color: #666;
+        margin-top: 2px;
+    }
+    .metric-card.best .value { color: #55A868; }
+    .metric-card.worst .value { color: #C44E52; }
+    /* Section divider */
+    .section-divider {
+        border: none;
+        border-top: 1px solid #2a2d35;
+        margin: 1.2rem 0;
+    }
+    /* Footer */
+    .footer {
+        text-align: center;
+        color: #555;
+        font-size: 0.75rem;
+        padding: 1.5rem 0 0.5rem;
+        border-top: 1px solid #222;
+        margin-top: 2rem;
+    }
+    .footer a { color: #4C72B0; text-decoration: none; }
+</style>
+""", unsafe_allow_html=True)
+# ---------------------------------------------------------------------------
+# Header
+# ---------------------------------------------------------------------------
+col_title, col_badge = st.columns([5, 1])
+with col_title:
+    st.markdown("# 📐 Embedding Bench")
+    st.markdown(
+        "<span style='color:#888; font-size:0.95rem;'>"
+        "Compare text embedding models on quality, speed &amp; memory.</span>",
+        unsafe_allow_html=True,
+    )
+with col_badge:
+    st.markdown(
+        "<div style='text-align:right; padding-top:18px;'>"
+        "<a href='https://github.com/amryassin/embedding-bench' target='_blank'>"
+        "<img src='https://img.shields.io/badge/GitHub-repo-blue?logo=github' /></a></div>",
+        unsafe_allow_html=True,
+    )
+st.markdown("<hr class='section-divider'>", unsafe_allow_html=True)
 # ---------------------------------------------------------------------------
 # Sidebar — configuration
 # ---------------------------------------------------------------------------
+st.sidebar.markdown("### ⚙️ Configuration")
+st.sidebar.markdown("**Models**")
 available_models = list(REGISTRY.keys())
 selected_models = st.sidebar.multiselect(
     "Select models",
     available_models,
     default=["mpnet", "bge-small"] if len(available_models) >= 2 else available_models[:1],
+    label_visibility="collapsed",
 )
+st.sidebar.markdown("**Datasets**")
 available_datasets = list(DATASET_PRESETS.keys())
 selected_datasets = st.sidebar.multiselect(
     "Select dataset presets",
     available_datasets,
     default=["sts"],
+    label_visibility="collapsed",
 )
 max_pairs = st.sidebar.number_input(
     help="Limits the number of pairs evaluated. Keep low for large datasets.",
 )
+st.sidebar.markdown("---")
+st.sidebar.markdown("**Speed & Memory**")
+run_speed = st.sidebar.checkbox("Speed benchmark", value=False)
+run_memory = st.sidebar.checkbox("Memory benchmark", value=False)
 corpus_size = 500
 num_runs = 3
 if run_speed:
     num_runs = st.sidebar.number_input("Speed runs", 1, 10, 3)
+st.sidebar.markdown("---")
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
     return buf.getvalue()
+def render_metric_card(label: str, value: str, sub: str = "", css_class: str = "") -> str:
+    """Return the HTML markup for one summary card.
+
+    Args:
+        label: Caption placed in the card's ``label`` div.
+        value: Main text placed in the card's ``value`` div.
+        sub: Optional secondary line; the ``sub`` div is omitted when empty.
+        css_class: Extra class appended after ``metric-card`` (the page CSS
+            defines ``best`` and ``worst`` variants).
+
+    Returns:
+        A single ``<div>`` string. The arguments are interpolated as-is,
+        without HTML escaping.
+    """
+    # strip() drops the trailing space left behind when css_class is empty.
+    cls = f"metric-card {css_class}".strip()
+    sub_html = f"<div class='sub'>{sub}</div>" if sub else ""
+    return (
+        f"<div class='{cls}'>"
+        f"<div class='label'>{label}</div>"
+        f"<div class='value'>{value}</div>"
+        f"{sub_html}"
+        f"</div>"
+    )
+# ---------------------------------------------------------------------------
+# Chart style helper
+# ---------------------------------------------------------------------------
+# Colours shared by every chart: figure/axes background and tick/label text.
+CHART_BG = "#0E1117"
+CHART_TEXT = "#CCCCCC"
+def style_chart(fig, ax):
+    """Apply dark theme to a matplotlib chart.
+
+    Modifies ``fig`` and ``ax`` in place and returns nothing: both backgrounds
+    are set to ``CHART_BG``, the top and right spines are hidden, the left and
+    bottom spines are drawn in ``#444``, and ticks and axis labels use
+    ``CHART_TEXT``.
+    """
+    fig.patch.set_facecolor(CHART_BG)
+    ax.set_facecolor(CHART_BG)
+    # Keep only the left and bottom axis lines.
+    ax.spines["top"].set_visible(False)
+    ax.spines["right"].set_visible(False)
+    ax.spines["left"].set_color("#444")
+    ax.spines["bottom"].set_color("#444")
+    ax.tick_params(colors=CHART_TEXT, labelsize=7)
+    ax.yaxis.label.set_color(CHART_TEXT)
+    ax.xaxis.label.set_color(CHART_TEXT)
+    # The title uses a brighter shade than the tick and axis-label text.
+    ax.title.set_color("#FAFAFA")
 # ---------------------------------------------------------------------------
 # Run benchmark
 # ---------------------------------------------------------------------------
         cfg = REGISTRY[model_key]
         result: dict = {"name": cfg.name, "is_baseline": cfg.is_baseline}
         model = get_model(model_key)
         quality_results = {}
         for ds_cfg in ds_configs:
             step += 1
             progress.progress(
                 step / total_steps,
+                text=f"Evaluating **{cfg.name}** on *{ds_key}*...",
             )
             quality_results[ds_key] = evaluate_quality(model, ds_cfg, max_pairs=max_pairs)
         result["quality"] = quality_results
         if run_speed:
             step += 1
+            progress.progress(step / total_steps, text=f"Speed benchmark: **{cfg.name}**...")
             corpus = build_corpus(corpus_size, ds_configs[0])
             result["speed"] = evaluate_speed(model, corpus, num_runs=num_runs, batch_size=batch_size)
         if run_memory:
             step += 1
+            progress.progress(step / total_steps, text=f"Memory benchmark: **{cfg.name}**...")
             from evals.memory import evaluate_memory
             corpus = build_corpus(corpus_size, ds_configs[0])
             result["memory_mb"] = evaluate_memory(
     time.sleep(0.3)
     progress.empty()
     st.session_state["results"] = results
     st.session_state["selected_datasets"] = selected_datasets
 # Display results
 # ---------------------------------------------------------------------------
 if "results" not in st.session_state:
+    st.markdown(
+        "<div style='text-align:center; padding:3rem 0; color:#666;'>"
+        "<p style='font-size:2.5rem; margin-bottom:0.5rem;'>📐</p>"
+        "<p style='font-size:1.1rem;'>Configure models &amp; datasets in the sidebar,<br>"
+        "then hit <b>Run Benchmark</b>.</p></div>",
+        unsafe_allow_html=True,
+    )
     st.stop()
 results = st.session_state["results"]
+selected_datasets_display = st.session_state["selected_datasets"]
+# ---------------------------------------------------------------------------
+# Highlight cards
+# ---------------------------------------------------------------------------
 ds_keys: list[str] = []
 for r in results:
     q = r.get("quality")
         ds_keys = list(q.keys())
         break
+# Build a quick summary: best model per first dataset
+if ds_keys:
+    first_ds = ds_keys[0]
+    first_metrics_sample = results[0].get("quality", {}).get(first_ds, {})
+    primary_metric = "spearman" if "spearman" in first_metrics_sample else "mrr"
+    primary_label = "Spearman" if primary_metric == "spearman" else "MRR"
+    scores = [
+        (r["name"], r.get("quality", {}).get(first_ds, {}).get(primary_metric, 0))
+        for r in results
+    ]
+    best = max(scores, key=lambda x: x[1])
+    speed_scores = [
+        (r["name"], r.get("speed", {}).get("sentences_per_second", 0))
+        for r in results
+    ]
+    fastest = max(speed_scores, key=lambda x: x[1]) if any(s[1] > 0 for s in speed_scores) else None
+    mem_scores = [
+        (r["name"], r.get("memory_mb", 0))
+        for r in results
+    ]
+    lightest = min((m for m in mem_scores if m[1] > 0), key=lambda x: x[1], default=None)
+    card_cols = st.columns(3)
+    with card_cols[0]:
+        st.markdown(render_metric_card(
+            f"Best {primary_label} ({first_ds})",
+            f"{best[1]:.4f}",
+            best[0],
+            "best",
+        ), unsafe_allow_html=True)
+    with card_cols[1]:
+        if fastest and fastest[1] > 0:
+            st.markdown(render_metric_card(
+                "Fastest",
+                f"{fastest[1]} sent/s",
+                fastest[0],
+                "best",
+            ), unsafe_allow_html=True)
+        else:
+            st.markdown(render_metric_card("Fastest", "—", "speed not measured"), unsafe_allow_html=True)
+    with card_cols[2]:
+        if lightest:
+            st.markdown(render_metric_card(
+                "Lightest",
+                f"{lightest[1]} MB",
+                lightest[0],
+                "best",
+            ), unsafe_allow_html=True)
+        else:
+            st.markdown(render_metric_card("Lightest", "—", "memory not measured"), unsafe_allow_html=True)
+    st.markdown("")
+# ---------------------------------------------------------------------------
+# Results table
+# ---------------------------------------------------------------------------
+st.markdown("#### 📊 Detailed Results")
+flat_rows = [flatten_result(r) for r in results]
+st.dataframe(flat_rows, use_container_width=True, hide_index=True)
+col_dl, _ = st.columns([1, 4])
+with col_dl:
+    csv_data = results_to_csv(results)
+    st.download_button(
+        "📥 Download CSV",
+        data=csv_data,
+        file_name="embedding_bench_results.csv",
+        mime="text/csv",
+        use_container_width=True,
+    )
+st.markdown("<hr class='section-divider'>", unsafe_allow_html=True)
+# ---------------------------------------------------------------------------
+# Charts
+# ---------------------------------------------------------------------------
+st.markdown("#### 📈 Charts")
+models = [r["name"] for r in results]
 for ds_key in ds_keys:
     first_metrics = None
     for r in results:
     if "spearman" in first_metrics:
         values = [r.get("quality", {}).get(ds_key, {}).get("spearman", 0) for r in results]
+        fig, ax = plt.subplots(figsize=(4, 2.4))
+        style_chart(fig, ax)
+        bars = ax.bar(models, values, color="#4C72B0", edgecolor="#5a82c0", linewidth=0.5)
+        ax.set_ylabel("Spearman", fontsize=8)
+        ax.set_title(f"Quality — {ds_key}", fontsize=9, pad=8)
+        ax.set_ylim(0, 1.08)
         for bar, v in zip(bars, values):
             ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
+                    f"{v:.4f}", ha="center", va="bottom", fontsize=7, color=CHART_TEXT)
         plt.xticks(rotation=30, ha="right")
         plt.tight_layout()
+        st.pyplot(fig, use_container_width=False)
         plt.close(fig)
     else:
         metric_names = ["mrr", "recall@1", "recall@5", "recall@10"]
         width = 0.18
         colors = ["#4C72B0", "#55A868", "#C44E52", "#8172B2"]
+        fig, ax = plt.subplots(figsize=(max(4, len(models) * 1.4), 2.6))
+        style_chart(fig, ax)
         for i, (metric, color) in enumerate(zip(metric_names, colors)):
             values = [r.get("quality", {}).get(ds_key, {}).get(metric, 0) for r in results]
             offset = (i - 1.5) * width
+            bars = ax.bar(x + offset, values, width, label=metric, color=color,
+                          edgecolor=color, linewidth=0.3, alpha=0.9)
             for bar, v in zip(bars, values):
                 ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.005,
+                        f"{v:.2f}", ha="center", va="bottom", fontsize=6, color=CHART_TEXT)
+        ax.set_ylabel("Score", fontsize=8)
+        ax.set_title(f"Retrieval Quality — {ds_key}", fontsize=9, pad=8)
         ax.set_ylim(0, 1.15)
         ax.set_xticks(x)
+        ax.set_xticklabels(models, rotation=30, ha="right", fontsize=7)
+        ax.legend(fontsize=6, ncol=4, loc="upper right",
+                  facecolor=CHART_BG, edgecolor="#444", labelcolor=CHART_TEXT)
         plt.tight_layout()
+        st.pyplot(fig, use_container_width=False)
         plt.close(fig)
+# Speed & Memory side by side
 speed_values = [r.get("speed", {}).get("sentences_per_second", 0) for r in results]
 mem_values = [r.get("memory_mb", 0) for r in results]
+has_speed = any(v > 0 for v in speed_values)
+has_memory = any(v > 0 for v in mem_values)
+if has_speed or has_memory:
+    cols = st.columns(2 if has_speed and has_memory else 1)
+    if has_speed:
+        with cols[0]:
+            fig, ax = plt.subplots(figsize=(3.5, 2.4))
+            style_chart(fig, ax)
+            bars = ax.bar(models, speed_values, color="#55A868", edgecolor="#65b878", linewidth=0.5)
+            ax.set_ylabel("Sent / s", fontsize=8)
+            ax.set_title("Encoding Speed", fontsize=9, pad=8)
+            for bar, v in zip(bars, speed_values):
+                if v > 0:
+                    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5,
+                            str(v), ha="center", va="bottom", fontsize=7, color=CHART_TEXT)
+            plt.xticks(rotation=30, ha="right")
+            plt.tight_layout()
+            st.pyplot(fig, use_container_width=False)
+            plt.close(fig)
+    if has_memory:
+        col_idx = 1 if has_speed else 0
+        with cols[col_idx]:
+            fig, ax = plt.subplots(figsize=(3.5, 2.4))
+            style_chart(fig, ax)
+            bars = ax.bar(models, mem_values, color="#C44E52", edgecolor="#d45e62", linewidth=0.5)
+            ax.set_ylabel("MB", fontsize=8)
+            ax.set_title("Memory Usage", fontsize=9, pad=8)
+            for bar, v in zip(bars, mem_values):
+                if v > 0:
+                    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5,
+                            str(v), ha="center", va="bottom", fontsize=7, color=CHART_TEXT)
+            plt.xticks(rotation=30, ha="right")
+            plt.tight_layout()
+            st.pyplot(fig, use_container_width=False)
+            plt.close(fig)
+# ---------------------------------------------------------------------------
+# Footer
+# ---------------------------------------------------------------------------
+st.markdown(
+    "<div class='footer'>"
+    "Built with <a href='https://streamlit.io'>Streamlit</a> · "
+    "Models via <a href='https://huggingface.co'>HuggingFace</a> · "
+    "<a href='https://github.com/amryassin/embedding-bench'>Source on GitHub</a>"
+    "</div>",
+    unsafe_allow_html=True,
+)