# SearchQual Pro Demo — Streamlit application (Hugging Face Space).
import json
import random
import time
from pathlib import Path

import pandas as pd
import streamlit as st

# --- Configuration ---
# All evaluation runs are stored as one JSON file per system in this directory.
RUNS_DIR = Path("runs")
RUNS_DIR.mkdir(exist_ok=True)

st.set_page_config(page_title="SearchQual Pro Demo", layout="wide")
st.title("SearchQual Pro Demo")
st.markdown("A proof-of-concept demonstrating the future of search evaluation.")

# --- Mock Data and Helpers ---
# Maps the display name shown in the UI to the stem of the corresponding
# pre-baked run file in RUNS_DIR (e.g. runs/gemini_pro_mock.json).
MOCK_SYSTEMS = {
    "Gemini Pro (mock)": "gemini_pro_mock",
    "ChatGPT (mock)": "chatgpt_mock",
    "Perplexity AI (mock)": "perplexity_mock",
    "BM25 Baseline (mock)": "baseline",
}
def load_run_data(run_file):
    """Load a run's JSON file and backfill mock LLM-quality metrics.

    Parameters
    ----------
    run_file : str | Path | None
        Path to a JSON run file. Falsy values short-circuit to ``None``.

    Returns
    -------
    dict | None
        Parsed run data whose ``"metrics"`` dict is guaranteed to contain
        ``factuality_score`` and ``citation_recall`` (randomly generated
        when absent), or ``None`` when the file is missing, unreadable,
        malformed JSON, or not a JSON object.
    """
    if not run_file:
        return None
    try:
        data = json.loads(Path(run_file).read_text())
    except (OSError, json.JSONDecodeError):
        # OSError covers FileNotFoundError plus permission/IO failures.
        return None
    # BUG FIX: the original indexed data["metrics"] directly, which raised an
    # uncaught KeyError (no "metrics" key) or TypeError (top-level JSON array).
    if not isinstance(data, dict):
        return None
    metrics = data.setdefault("metrics", {})
    # Inject mock metrics if they don't exist.
    if "factuality_score" not in metrics:
        metrics["factuality_score"] = round(random.uniform(0.85, 0.99), 2)
    if "citation_recall" not in metrics:
        metrics["citation_recall"] = round(random.uniform(0.75, 0.95), 2)
    return data
def display_metrics(metrics):
    """Render a run's metrics as one row of ``st.metric`` widgets.

    Only keys listed in ``METRIC_DEFINITIONS`` are shown, in definition
    order; unknown keys and ``None`` values are ignored. Float values are
    formatted to three decimals; other values are shown verbatim.

    Parameters
    ----------
    metrics : dict | None
        Mapping of metric key (e.g. ``"ndcg@10"``) to value.
    """
    if not metrics:
        st.info("No metrics available.")
        return
    # Define the order and display label/help text for known metrics.
    METRIC_DEFINITIONS = {
        "ndcg@10": "Relevance Score",
        "recall@100": "Completeness",
        "mrr": "Ranking Quality",
        "p95_latency": "Latency (ms)",
        "cost_per_query": "Cost ($)",
        "factuality_score": "Factuality",
        "citation_recall": "Citation Recall",
    }
    # Keep only known, non-None metrics, preserving definition order.
    # (Renamed from `display_metrics`, which shadowed this function's name.)
    shown = {
        key: metrics[key]
        for key in METRIC_DEFINITIONS
        if key in metrics and metrics[key] is not None
    }
    if not shown:
        # BUG FIX: st.columns(0) raises when no displayable metric exists.
        st.info("No metrics available.")
        return
    cols = st.columns(len(shown))
    # BUG FIX: the original enumerated ALL of METRIC_DEFINITIONS while `cols`
    # was sized to the filtered dict, so a missing leading metric made cols[i]
    # raise IndexError. Zipping columns with the filtered dict is always safe.
    for col, (key, val) in zip(cols, shown.items()):
        label = METRIC_DEFINITIONS[key]
        col.metric(label, f"{val:.3f}" if isinstance(val, float) else val, help=label)
# --- UI Tabs ---
tab1, tab2, tab3, tab4 = st.tabs(["🚀 Run Evaluation", "📊 Compare Runs", "🏆 Leaderboard", "📚 API & Docs"])

# --- Tab 1: Run Evaluation ---
with tab1:
    st.header("Run a New Evaluation")
    col1, col2 = st.columns([1, 2])
    with col1:
        st.subheader("Configuration")
        selected_system = st.selectbox("Select System", options=list(MOCK_SYSTEMS.keys()))
        # NOTE(review): `dataset` is collected here but never used when the
        # mock run is loaded below — confirm this is intended for the demo.
        dataset = st.selectbox("Select Dataset", options=["fiqa_mini", "msmarco_mini", "hotpotqa_mini"])
        if st.button("Run Evaluation", type="primary"):
            # Resolve the pre-baked mock run file for the chosen system.
            mock_run_name = MOCK_SYSTEMS[selected_system]
            run_file = RUNS_DIR / f"{mock_run_name}.json"
            with st.spinner(f"Simulating evaluation for **{selected_system}**..."):
                time.sleep(random.uniform(1.5, 3.0))  # Simulate work
                data = load_run_data(run_file)
            if data:
                st.success(f"Evaluation complete for **{selected_system}**!")
                # Persist the result so it survives Streamlit reruns and can
                # be rendered in the results column on the right.
                st.session_state.last_run = data
                st.session_state.last_run_file = str(run_file)
            else:
                st.error(f"Could not load mock data for {selected_system}. File not found: {run_file}")
    with col2:
        st.subheader("Latest Run Results")
        if "last_run" in st.session_state:
            data = st.session_state.last_run
            st.write(f"Showing results for: **{data['name']}**")
            display_metrics(data.get("metrics"))
            with st.expander("View Raw Stats and Config"):
                stats = data.get("stats", {})
                if stats:
                    st.write("**Stats**")
                    st.json(stats)
                st.write("**Configuration**")
                st.json({"name": data["name"], "dataset": data["dataset"]})
        else:
            st.info("Run an evaluation to see the results here.")
# --- Tab 2: Compare Runs ---
with tab2:
    st.header("Compare Two Evaluation Runs")
    # Every *.json under RUNS_DIR is a candidate run (also reused by tab 3).
    run_files = [str(p) for p in RUNS_DIR.glob("*.json")]
    col1, col2 = st.columns(2)
    with col1:
        run1_file = st.selectbox("Select Run 1", run_files, format_func=lambda p: Path(p).stem, key="run1")
    with col2:
        run2_file = st.selectbox("Select Run 2", run_files, format_func=lambda p: Path(p).stem, key="run2")
    if run1_file and run2_file:
        data1 = load_run_data(run1_file)
        data2 = load_run_data(run2_file)
        if data1 and data2:
            st.subheader("Metrics Comparison")
            left = data1.get("metrics", {})
            right = data2.get("metrics", {})
            # One row per metric present in either run, in sorted key order.
            rows = []
            for metric in sorted(set(left) | set(right)):
                a = left.get(metric)
                b = right.get(metric)
                numeric = isinstance(a, (int, float)) and isinstance(b, (int, float))
                rows.append({
                    "Metric": metric,
                    data1["name"]: a,
                    data2["name"]: b,
                    # Delta stays None unless both sides are numeric.
                    "Delta": b - a if numeric else None,
                })
            comparison = pd.DataFrame(rows).set_index("Metric")
            st.dataframe(comparison, use_container_width=True)
# --- Tab 3: Leaderboard ---
with tab3:
    st.header("Public Leaderboard")
    st.write("Ranking of all evaluated systems based on a composite score.")
    # Load every discovered run; skip anything unreadable or without metrics.
    board_rows = []
    for path in run_files:
        run = load_run_data(path)
        if not run or "metrics" not in run:
            continue
        m = run["metrics"]
        # Simple composite score for demonstration: relevance 40%,
        # factuality 30%, citations 20%, inverted cost 10%. A missing
        # metric contributes its neutral default.
        composite = (
            m.get("ndcg@10", 0) * 0.4
            + m.get("factuality_score", 0) * 0.3
            + m.get("citation_recall", 0) * 0.2
            + (1 - m.get("cost_per_query", 1)) * 0.1
        )
        board_rows.append({
            "System": run["name"],
            "Composite Score": composite,
            "NDCG@10": m.get("ndcg@10"),
            "Factuality": m.get("factuality_score"),
            "Citations": m.get("citation_recall"),
            "Cost/Query": m.get("cost_per_query"),
        })
    if board_rows:
        board = pd.DataFrame(board_rows).sort_values("Composite Score", ascending=False)
        st.dataframe(board.set_index("System"), use_container_width=True)
    else:
        st.warning("No valid run data to display on the leaderboard.")
# --- Tab 4: API & Docs ---
with tab4:
    st.header("API & Documentation")
    # NOTE(review): inside this f-string the doubled braces {{ }} render as
    # single braces in the JSON example, and each trailing "\" before a
    # newline is a Python line continuation, so the curl commands render on
    # a single line — confirm that is the intended display.
    st.markdown(f"""
Our public API allows for programmatic evaluation and retrieval of results.

### Submit an Evaluation
To start a new evaluation, `POST` to the `/evaluations` endpoint:
```bash
curl -X POST https://your-hf-space-url/api/evaluations \
-H "Authorization: Bearer <YOUR_API_KEY>" \
-H "Content-Type: application/json" \
-d '{{
"system_id": "your-system-id",
"dataset": "fiqa_mini"
}}'
```

### Get Results
Retrieve the status and results of an evaluation run:
```bash
curl https://your-hf-space-url/api/evaluations/<EVALUATION_ID> \
-H "Authorization: Bearer <YOUR_API_KEY>"
```

*Note: These are mock endpoints for demonstration purposes.*
""")