soupstick committed
Commit: 8b74b05
1 Parent(s): 93ec1bb

feat: Add mock UI and data for advanced evaluation demo
Files changed:
- runs/chatgpt_mock.json     +20 -0
- runs/gemini_pro_mock.json  +20 -0
- runs/perplexity_mock.json  +20 -0
- streamlit_app.py           +219 -105
runs/chatgpt_mock.json (ADDED)

@@ -0,0 +1,20 @@
+
+{
+  "name": "chatgpt-mock",
+  "dataset": "examples/beir_fiqa/queries.jsonl",
+  "metrics": {
+    "ndcg@10": 0.72,
+    "recall@100": 0.88,
+    "mrr": 0.68,
+    "p95_latency": 800.0,
+    "cost_per_query": 0.003,
+    "factuality_score": 0.92,
+    "citation_recall": 0.80
+  },
+  "stats": {
+    "total_queries": 3,
+    "total_cost_usd": 0.009,
+    "mean_latency_ms": 650.0
+  },
+  "results": []
+}
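Note: this run file and the two below share one schema: a `name`, a `dataset` path, a `metrics` map, aggregate `stats`, and a `results` list (empty in these mocks). As a quick illustration (not part of the commit), here is a minimal sketch of loading a run file and checking that its cost bookkeeping is self-consistent; the `check_run` helper is hypothetical:

```python
import json
from pathlib import Path

def check_run(path: str) -> dict:
    """Load a run file and verify total cost matches per-query cost."""
    data = json.loads(Path(path).read_text())
    n_queries = data["stats"]["total_queries"]
    per_query = data["metrics"]["cost_per_query"]
    total = data["stats"]["total_cost_usd"]
    # For chatgpt_mock: 3 queries * $0.003 = $0.009, matching total_cost_usd.
    assert abs(n_queries * per_query - total) < 1e-9, f"cost mismatch in {path}"
    return data

run = check_run("runs/chatgpt_mock.json")
print(run["name"], run["metrics"]["ndcg@10"])  # chatgpt-mock 0.72
```

The same identity holds for the other two files (3 × $0.005 = $0.015 and 3 × $0.008 = $0.024).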
runs/gemini_pro_mock.json (ADDED)

@@ -0,0 +1,20 @@
+
+{
+  "name": "gemini-pro-mock",
+  "dataset": "examples/beir_fiqa/queries.jsonl",
+  "metrics": {
+    "ndcg@10": 0.78,
+    "recall@100": 0.92,
+    "mrr": 0.75,
+    "p95_latency": 450.0,
+    "cost_per_query": 0.005,
+    "factuality_score": 0.95,
+    "citation_recall": 0.88
+  },
+  "stats": {
+    "total_queries": 3,
+    "total_cost_usd": 0.015,
+    "mean_latency_ms": 350.0
+  },
+  "results": []
+}
runs/perplexity_mock.json (ADDED)

@@ -0,0 +1,20 @@
+
+{
+  "name": "perplexity-mock",
+  "dataset": "examples/beir_fiqa/queries.jsonl",
+  "metrics": {
+    "ndcg@10": 0.75,
+    "recall@100": 0.90,
+    "mrr": 0.72,
+    "p95_latency": 1500.0,
+    "cost_per_query": 0.008,
+    "factuality_score": 0.98,
+    "citation_recall": 0.95
+  },
+  "stats": {
+    "total_queries": 3,
+    "total_cost_usd": 0.024,
+    "mean_latency_ms": 1200.0
+  },
+  "results": []
+}
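Taken together, the three mock files encode a deliberate trade-off surface for the demo: gemini_pro_mock leads on relevance and speed (ndcg@10 0.78, p95 450 ms), perplexity_mock leads on factuality and citation recall (0.98 and 0.95) while being the slowest and most expensive, and chatgpt_mock is the cheapest per query ($0.003). This gives the comparison and leaderboard tabs in streamlit_app.py below something non-trivial to rank.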
streamlit_app.py (CHANGED)

@@ -1,105 +1,219 @@
- (the previous 105-line version is fully replaced; it is not preserved in this view beyond fragments such as `from pathlib import Path` and `import streamlit as st`)
+import json
+import random
+import time
+from pathlib import Path
+
+import pandas as pd
+import streamlit as st
+
+# --- Configuration ---
+RUNS_DIR = Path("runs")
+RUNS_DIR.mkdir(exist_ok=True)
+
+st.set_page_config(page_title="SearchQual Pro Demo", layout="wide")
+st.title("SearchQual Pro Demo")
+st.markdown("A proof-of-concept demonstrating the future of search evaluation.")
+
+# --- Mock Data and Helpers ---
+MOCK_SYSTEMS = {
+    "Gemini Pro (mock)": "gemini_pro_mock",
+    "ChatGPT (mock)": "chatgpt_mock",
+    "Perplexity AI (mock)": "perplexity_mock",
+    "BM25 Baseline (mock)": "baseline",
+}
+
+def load_run_data(run_file):
+    """Loads, parses, and adds mock data to a run file."""
+    if not run_file:
+        return None
+    try:
+        data = json.loads(Path(run_file).read_text())
+        # Inject mock metrics if they don't exist
+        if "factuality_score" not in data["metrics"]:
+            data["metrics"]["factuality_score"] = round(random.uniform(0.85, 0.99), 2)
+        if "citation_recall" not in data["metrics"]:
+            data["metrics"]["citation_recall"] = round(random.uniform(0.75, 0.95), 2)
+        return data
+    except (FileNotFoundError, json.JSONDecodeError):
+        return None
+
+def display_metrics(metrics):
+    """Displays a dictionary of metrics in columns."""
+    if not metrics:
+        st.info("No metrics available.")
+        return
+
+    # Define the order and help text for metrics
+    METRIC_DEFINITIONS = {
+        "ndcg@10": "Relevance Score",
+        "recall@100": "Completeness",
+        "mrr": "Ranking Quality",
+        "p95_latency": "Latency (ms)",
+        "cost_per_query": "Cost ($)",
+        "factuality_score": "Factuality",
+        "citation_recall": "Citation Recall"
+    }
+
+    # Filter and order metrics based on the definitions. A distinct local name
+    # avoids shadowing the function itself, and iterating over the filtered dict
+    # keeps the column index aligned with the number of columns created.
+    filtered_metrics = {k: metrics.get(k) for k in METRIC_DEFINITIONS if k in metrics}
+    if not filtered_metrics:
+        return
+
+    cols = st.columns(len(filtered_metrics))
+    for i, (key, val) in enumerate(filtered_metrics.items()):
+        if val is not None:
+            label = METRIC_DEFINITIONS[key]
+            cols[i].metric(label, f"{val:.3f}" if isinstance(val, float) else val, help=label)
+
+
+# --- UI Tabs ---
+tab1, tab2, tab3, tab4 = st.tabs(["🚀 Run Evaluation", "📊 Compare Runs", "🏆 Leaderboard", "📚 API & Docs"])
+
+# --- Tab 1: Run Evaluation ---
+with tab1:
+    st.header("Run a New Evaluation")
+
+    col1, col2 = st.columns([1, 2])
+
+    with col1:
+        st.subheader("Configuration")
+        selected_system = st.selectbox("Select System", options=list(MOCK_SYSTEMS.keys()))
+        dataset = st.selectbox("Select Dataset", options=["fiqa_mini", "msmarco_mini", "hotpotqa_mini"])
+
+        if st.button("Run Evaluation", type="primary"):
+            mock_run_name = MOCK_SYSTEMS[selected_system]
+            run_file = RUNS_DIR / f"{mock_run_name}.json"
+
+            with st.spinner(f"Simulating evaluation for **{selected_system}**..."):
+                time.sleep(random.uniform(1.5, 3.0))  # Simulate work
+
+            data = load_run_data(run_file)
+            if data:
+                st.success(f"Evaluation complete for **{selected_system}**!")
+                st.session_state.last_run = data
+                st.session_state.last_run_file = str(run_file)
+            else:
+                st.error(f"Could not load mock data for {selected_system}. File not found: {run_file}")
+
+    with col2:
+        st.subheader("Latest Run Results")
+        if "last_run" in st.session_state:
+            data = st.session_state.last_run
+            st.write(f"Showing results for: **{data['name']}**")
+            display_metrics(data.get("metrics"))
+
+            with st.expander("View Raw Stats and Config"):
+                stats = data.get("stats", {})
+                if stats:
+                    st.write("**Stats**")
+                    st.json(stats)
+                st.write("**Configuration**")
+                st.json({"name": data["name"], "dataset": data["dataset"]})
+        else:
+            st.info("Run an evaluation to see the results here.")
+
+
+# --- Tab 2: Compare Runs ---
+with tab2:
+    st.header("Compare Two Evaluation Runs")
+    run_files = [str(p) for p in RUNS_DIR.glob("*.json")]
+
+    col1, col2 = st.columns(2)
+    with col1:
+        run1_file = st.selectbox("Select Run 1", run_files, format_func=lambda p: Path(p).stem, key="run1")
+    with col2:
+        run2_file = st.selectbox("Select Run 2", run_files, format_func=lambda p: Path(p).stem, key="run2")
+
+    if run1_file and run2_file:
+        data1 = load_run_data(run1_file)
+        data2 = load_run_data(run2_file)
+
+        if data1 and data2:
+            st.subheader("Metrics Comparison")
+
+            metrics1 = data1.get("metrics", {})
+            metrics2 = data2.get("metrics", {})
+
+            all_keys = sorted(list(set(metrics1.keys()) | set(metrics2.keys())))
+
+            df_data = []
+            for key in all_keys:
+                val1 = metrics1.get(key)
+                val2 = metrics2.get(key)
+                delta = None
+                if isinstance(val1, (int, float)) and isinstance(val2, (int, float)):
+                    delta = val2 - val1
+
+                df_data.append({
+                    "Metric": key,
+                    data1['name']: val1,
+                    data2['name']: val2,
+                    "Delta": delta
+                })
+
+            df = pd.DataFrame(df_data).set_index("Metric")
+            st.dataframe(df, use_container_width=True)
+
+# --- Tab 3: Leaderboard ---
+with tab3:
+    st.header("Public Leaderboard")
+    st.write("Ranking of all evaluated systems based on a composite score.")
+
+    all_runs_data = [load_run_data(f) for f in run_files]
+    valid_runs = [r for r in all_runs_data if r and "metrics" in r]
+
+    leaderboard_data = []
+    for run in valid_runs:
+        metrics = run["metrics"]
+        # Simple composite score for demonstration
+        score = (metrics.get("ndcg@10", 0) * 0.4) + \
+                (metrics.get("factuality_score", 0) * 0.3) + \
+                (metrics.get("citation_recall", 0) * 0.2) + \
+                ((1 - metrics.get("cost_per_query", 1)) * 0.1)
+
+        leaderboard_data.append({
+            "System": run["name"],
+            "Composite Score": score,
+            "NDCG@10": metrics.get("ndcg@10"),
+            "Factuality": metrics.get("factuality_score"),
+            "Citations": metrics.get("citation_recall"),
+            "Cost/Query": metrics.get("cost_per_query"),
+        })
+
+    if leaderboard_data:
+        leaderboard_df = pd.DataFrame(leaderboard_data).sort_values("Composite Score", ascending=False)
+        st.dataframe(leaderboard_df.set_index("System"), use_container_width=True)
+    else:
+        st.warning("No valid run data to display on the leaderboard.")
+
+
+# --- Tab 4: API & Docs ---
+with tab4:
+    st.header("API & Documentation")
+    st.markdown(f"""
+    Our public API allows for programmatic evaluation and retrieval of results.
+
+    ### Submit an Evaluation
+
+    To start a new evaluation, `POST` to the `/evaluations` endpoint:
+
+    ```bash
+    curl -X POST https://your-hf-space-url/api/evaluations \\
+      -H "Authorization: Bearer <YOUR_API_KEY>" \\
+      -H "Content-Type: application/json" \\
+      -d '{{
+        "system_id": "your-system-id",
+        "dataset": "fiqa_mini"
+      }}'
+    ```
+
+    ### Get Results
+
+    Retrieve the status and results of an evaluation run:
+
+    ```bash
+    curl https://your-hf-space-url/api/evaluations/<EVALUATION_ID> \\
+      -H "Authorization: Bearer <YOUR_API_KEY>"
+    ```
+
+    *Note: These are mock endpoints for demonstration purposes.*
+    """)
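As a sanity check (again, not part of the commit), the leaderboard's composite score, 0.4*ndcg@10 + 0.3*factuality_score + 0.2*citation_recall + 0.1*(1 - cost_per_query), can be recomputed by hand from the three mock files above:

```python
# Recompute the leaderboard's composite score from the mock metrics above.
MOCKS = {
    "chatgpt-mock":    {"ndcg@10": 0.72, "factuality_score": 0.92, "citation_recall": 0.80, "cost_per_query": 0.003},
    "gemini-pro-mock": {"ndcg@10": 0.78, "factuality_score": 0.95, "citation_recall": 0.88, "cost_per_query": 0.005},
    "perplexity-mock": {"ndcg@10": 0.75, "factuality_score": 0.98, "citation_recall": 0.95, "cost_per_query": 0.008},
}

for name, m in MOCKS.items():
    score = (m["ndcg@10"] * 0.4 + m["factuality_score"] * 0.3
             + m["citation_recall"] * 0.2 + (1 - m["cost_per_query"]) * 0.1)
    print(f"{name}: {score:.4f}")
# chatgpt-mock: 0.8237
# gemini-pro-mock: 0.8725
# perplexity-mock: 0.8832
```

So perplexity-mock tops the mock leaderboard: latency does not enter the composite score, so its high factuality and citation recall outweigh its cost disadvantage.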