soupstick committed
Commit 8b74b05 · 1 Parent(s): 93ec1bb

feat: Add mock UI and data for advanced evaluation demo
runs/chatgpt_mock.json ADDED
@@ -0,0 +1,20 @@
+
+{
+  "name": "chatgpt-mock",
+  "dataset": "examples/beir_fiqa/queries.jsonl",
+  "metrics": {
+    "ndcg@10": 0.72,
+    "recall@100": 0.88,
+    "mrr": 0.68,
+    "p95_latency": 800.0,
+    "cost_per_query": 0.003,
+    "factuality_score": 0.92,
+    "citation_recall": 0.80
+  },
+  "stats": {
+    "total_queries": 3,
+    "total_cost_usd": 0.009,
+    "mean_latency_ms": 650.0
+  },
+  "results": []
+}
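Each mock run file follows the same shape the demo app reads back: a `name`, a `dataset` path, a `metrics` dict, a `stats` dict, and an empty `results` list. A minimal sketch of loading one, assuming it is run from the repository root where `runs/` lives:

```python
import json
from pathlib import Path

# Load one of the mock run files added in this commit and read a few fields.
run = json.loads(Path("runs/chatgpt_mock.json").read_text())
print(run["name"])                    # "chatgpt-mock"
print(run["metrics"]["ndcg@10"])      # 0.72
print(run["stats"]["total_queries"])  # 3
```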
runs/gemini_pro_mock.json ADDED
@@ -0,0 +1,20 @@
+
+{
+  "name": "gemini-pro-mock",
+  "dataset": "examples/beir_fiqa/queries.jsonl",
+  "metrics": {
+    "ndcg@10": 0.78,
+    "recall@100": 0.92,
+    "mrr": 0.75,
+    "p95_latency": 450.0,
+    "cost_per_query": 0.005,
+    "factuality_score": 0.95,
+    "citation_recall": 0.88
+  },
+  "stats": {
+    "total_queries": 3,
+    "total_cost_usd": 0.015,
+    "mean_latency_ms": 350.0
+  },
+  "results": []
+}
runs/perplexity_mock.json ADDED
@@ -0,0 +1,20 @@
+
+{
+  "name": "perplexity-mock",
+  "dataset": "examples/beir_fiqa/queries.jsonl",
+  "metrics": {
+    "ndcg@10": 0.75,
+    "recall@100": 0.90,
+    "mrr": 0.72,
+    "p95_latency": 1500.0,
+    "cost_per_query": 0.008,
+    "factuality_score": 0.98,
+    "citation_recall": 0.95
+  },
+  "stats": {
+    "total_queries": 3,
+    "total_cost_usd": 0.024,
+    "mean_latency_ms": 1200.0
+  },
+  "results": []
+}
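Taken together, the three mock files form a small comparison set. A quick sketch that lines their metrics up side by side, assuming pandas is installed (the updated app below already requires it):

```python
import json
from pathlib import Path

import pandas as pd

# Build a metrics-by-system table from the three mock run files above.
columns = {}
for stem in ["chatgpt_mock", "gemini_pro_mock", "perplexity_mock"]:
    data = json.loads((Path("runs") / f"{stem}.json").read_text())
    columns[data["name"]] = data["metrics"]

print(pd.DataFrame(columns))  # rows: metric names, columns: mock systems
```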
streamlit_app.py CHANGED
@@ -1,105 +1,219 @@
-
-import json
-import os
-from pathlib import Path
-from tempfile import NamedTemporaryFile
-
-import streamlit as st
-
-from searchqual.core.config import ConfigLoader
-from searchqual.core.runner import EvaluationRunner
-
-
-def _resolve_runs_dir() -> Path:
-    env_path = os.environ.get('SEARCHQUAL_RUNS')
-    if env_path:
-        return Path(env_path)
-    try:
-        return Path(st.secrets['runs_dir'])
-    except Exception:  # StreamlitSecretNotFoundError or missing key
-        return Path('runs')
-
-
-RUN_DIR = _resolve_runs_dir()
-RUN_DIR.mkdir(parents=True, exist_ok=True)
-
-st.set_page_config(page_title='SearchQual Demo', layout='wide')
-st.title('SearchQual Evaluation Demo')
-
-st.sidebar.header('Run Configuration')
-default_config = Path('configs/run.yaml.example')
-config_text = st.sidebar.text_area(
-    'Paste run.yaml',
-    value=default_config.read_text() if default_config.exists() else '',
-    height=400,
-)
-
-col1, col2 = st.columns(2)
-with col1:
-    st.subheader('Existing Runs')
-    run_files = sorted(RUN_DIR.glob('*.json'))
-    if run_files:
-        selected = st.selectbox('Select run', run_files, format_func=lambda p: p.stem)
-        if st.button('Load run'):
-            data = json.loads(selected.read_text())
-            metrics = data.get('metrics', {})
-            if metrics:
-                st.subheader("Metrics")
-                cols = st.columns(len(metrics))
-                for i, (k, v) in enumerate(metrics.items()):
-                    cols[i].metric(k, v)
-
-            stats = data.get('stats', {})
-            if stats:
-                st.subheader("Stats")
-                st.table(stats)
-
-            results = data.get('results', [])
-            if results:
-                st.subheader("Results")
-                for res in results:
-                    st.write(f"**Query:** {res['query']}")
-                    if res.get('answer'):
-                        st.write(f"**Answer:** {res['answer']}")
-                    st.dataframe(res.get('documents', []))
-    else:
-        st.info('No runs found yet. Execute a run to populate metrics.')
-
-with col2:
-    st.subheader('Execute Run')
-    if st.button('Run evaluation'):
-        try:
-            with NamedTemporaryFile('w+', suffix='.yaml', delete=False) as fh:
-                fh.write(config_text)
-                tmp_path = Path(fh.name)
-            run_cfg = ConfigLoader.load_run_config(tmp_path)
-            dataset_path = Path(run_cfg.dataset)
-            if not dataset_path.is_absolute():
-                dataset_path = (Path.cwd() / dataset_path).resolve()
-            dataset_cfg = ConfigLoader.load_dataset_config(dataset_path)
-
-            system = run_cfg.system
-            if system.type == 'http':
-                from searchqual.connectors.basic import HTTPSearchClient
-                connector = HTTPSearchClient(system)
-            elif system.type == 'local':
-                from searchqual.connectors.basic import LocalSearchClient
-                connector = LocalSearchClient(system)
-            elif system.type == 'weaviate':
-                from searchqual.connectors.weaviate import WeaviateConnector
-                connector = WeaviateConnector(system)
-            else:
-                raise TypeError(f'Unsupported system type: {system.type}')
-
-            runner = EvaluationRunner(run_cfg, dataset_cfg, connector, dataset_path=dataset_path)
-            result = runner.run()
-            output = RUN_DIR / f"demo-{result.name}.json"
-            output.parent.mkdir(parents=True, exist_ok=True)
-            result.dump(output)
-            st.success(f'Run completed and stored at {output}')
-            st.json(result.metrics)
-        except Exception as exc:  # noqa: BLE001
-            st.error(f'Run failed: {exc}')
-        finally:
-            if 'tmp_path' in locals() and tmp_path.exists():
-                tmp_path.unlink()
+import json
+import random
+import time
+from pathlib import Path
+
+import pandas as pd
+import streamlit as st
+
+# --- Configuration ---
+RUNS_DIR = Path("runs")
+RUNS_DIR.mkdir(exist_ok=True)
+
+st.set_page_config(page_title="SearchQual Pro Demo", layout="wide")
+st.title("SearchQual Pro Demo")
+st.markdown("A proof-of-concept demonstrating the future of search evaluation.")
+
+# --- Mock Data and Helpers ---
+MOCK_SYSTEMS = {
+    "Gemini Pro (mock)": "gemini_pro_mock",
+    "ChatGPT (mock)": "chatgpt_mock",
+    "Perplexity AI (mock)": "perplexity_mock",
+    "BM25 Baseline (mock)": "baseline",
+}
+
+def load_run_data(run_file):
+    """Loads, parses, and adds mock data to a run file."""
+    if not run_file:
+        return None
+    try:
+        data = json.loads(Path(run_file).read_text())
+        # Inject mock metrics if they don't exist
+        if "factuality_score" not in data["metrics"]:
+            data["metrics"]["factuality_score"] = round(random.uniform(0.85, 0.99), 2)
+        if "citation_recall" not in data["metrics"]:
+            data["metrics"]["citation_recall"] = round(random.uniform(0.75, 0.95), 2)
+        return data
+    except (FileNotFoundError, json.JSONDecodeError):
+        return None
+
+def display_metrics(metrics):
+    """Displays a dictionary of metrics in columns."""
+    if not metrics:
+        st.info("No metrics available.")
+        return
+
+    # Define the display order and labels for known metrics
+    METRIC_DEFINITIONS = {
+        "ndcg@10": "Relevance Score",
+        "recall@100": "Completeness",
+        "mrr": "Ranking Quality",
+        "p95_latency": "Latency (ms)",
+        "cost_per_query": "Cost ($)",
+        "factuality_score": "Factuality",
+        "citation_recall": "Citation Recall",
+    }
+
+    # Keep only metrics we have definitions for, in definition order
+    visible = {k: metrics[k] for k in METRIC_DEFINITIONS if k in metrics}
+
+    cols = st.columns(len(visible))
+    for i, (key, val) in enumerate(visible.items()):
+        label = METRIC_DEFINITIONS[key]
+        if val is None:
+            continue
+        cols[i].metric(label, f"{val:.3f}" if isinstance(val, float) else val, help=label)
+
+
+# --- UI Tabs ---
+tab1, tab2, tab3, tab4 = st.tabs(["🚀 Run Evaluation", "📊 Compare Runs", "🏆 Leaderboard", "📚 API & Docs"])
+
+# --- Tab 1: Run Evaluation ---
+with tab1:
+    st.header("Run a New Evaluation")
+
+    col1, col2 = st.columns([1, 2])
+
+    with col1:
+        st.subheader("Configuration")
+        selected_system = st.selectbox("Select System", options=list(MOCK_SYSTEMS.keys()))
+        dataset = st.selectbox("Select Dataset", options=["fiqa_mini", "msmarco_mini", "hotpotqa_mini"])
+
+        if st.button("Run Evaluation", type="primary"):
+            mock_run_name = MOCK_SYSTEMS[selected_system]
+            run_file = RUNS_DIR / f"{mock_run_name}.json"
+
+            with st.spinner(f"Simulating evaluation for **{selected_system}**..."):
+                time.sleep(random.uniform(1.5, 3.0))  # Simulate work
+
+            data = load_run_data(run_file)
+            if data:
+                st.success(f"Evaluation complete for **{selected_system}**!")
+                st.session_state.last_run = data
+                st.session_state.last_run_file = str(run_file)
+            else:
+                st.error(f"Could not load mock data for {selected_system}. File not found: {run_file}")
+
+    with col2:
+        st.subheader("Latest Run Results")
+        if "last_run" in st.session_state:
+            data = st.session_state.last_run
+            st.write(f"Showing results for: **{data['name']}**")
+            display_metrics(data.get("metrics"))
+
+            with st.expander("View Raw Stats and Config"):
+                stats = data.get("stats", {})
+                if stats:
+                    st.write("**Stats**")
+                    st.json(stats)
+                st.write("**Configuration**")
+                st.json({"name": data["name"], "dataset": data["dataset"]})
+        else:
+            st.info("Run an evaluation to see the results here.")
+
+
+# --- Tab 2: Compare Runs ---
+with tab2:
+    st.header("Compare Two Evaluation Runs")
+    run_files = [str(p) for p in RUNS_DIR.glob("*.json")]
+
+    col1, col2 = st.columns(2)
+    with col1:
+        run1_file = st.selectbox("Select Run 1", run_files, format_func=lambda p: Path(p).stem, key="run1")
+    with col2:
+        run2_file = st.selectbox("Select Run 2", run_files, format_func=lambda p: Path(p).stem, key="run2")
+
+    if run1_file and run2_file:
+        data1 = load_run_data(run1_file)
+        data2 = load_run_data(run2_file)
+
+        if data1 and data2:
+            st.subheader("Metrics Comparison")
+
+            metrics1 = data1.get("metrics", {})
+            metrics2 = data2.get("metrics", {})
+
+            all_keys = sorted(list(set(metrics1.keys()) | set(metrics2.keys())))
+
+            df_data = []
+            for key in all_keys:
+                val1 = metrics1.get(key)
+                val2 = metrics2.get(key)
+                delta = None
+                if isinstance(val1, (int, float)) and isinstance(val2, (int, float)):
+                    delta = val2 - val1
+
+                df_data.append({
+                    "Metric": key,
+                    data1['name']: val1,
+                    data2['name']: val2,
+                    "Delta": delta
+                })
+
+            df = pd.DataFrame(df_data).set_index("Metric")
+            st.dataframe(df, use_container_width=True)
+
+# --- Tab 3: Leaderboard ---
+with tab3:
+    st.header("Public Leaderboard")
+    st.write("Ranking of all evaluated systems based on a composite score.")
+
+    all_runs_data = [load_run_data(f) for f in run_files]
+    valid_runs = [r for r in all_runs_data if r and "metrics" in r]
+
+    leaderboard_data = []
+    for run in valid_runs:
+        metrics = run["metrics"]
+        # Simple composite score for demonstration
+        score = (metrics.get("ndcg@10", 0) * 0.4) + \
+                (metrics.get("factuality_score", 0) * 0.3) + \
+                (metrics.get("citation_recall", 0) * 0.2) + \
+                ((1 - metrics.get("cost_per_query", 1)) * 0.1)
+
+        leaderboard_data.append({
+            "System": run["name"],
+            "Composite Score": score,
+            "NDCG@10": metrics.get("ndcg@10"),
+            "Factuality": metrics.get("factuality_score"),
+            "Citations": metrics.get("citation_recall"),
+            "Cost/Query": metrics.get("cost_per_query"),
+        })
+
+    if leaderboard_data:
+        leaderboard_df = pd.DataFrame(leaderboard_data).sort_values("Composite Score", ascending=False)
+        st.dataframe(leaderboard_df.set_index("System"), use_container_width=True)
+    else:
+        st.warning("No valid run data to display on the leaderboard.")
+
+
+# --- Tab 4: API & Docs ---
+with tab4:
+    st.header("API & Documentation")
+    st.markdown(f"""
+Our public API allows for programmatic evaluation and retrieval of results.
+
+### Submit an Evaluation
+
+To start a new evaluation, `POST` to the `/evaluations` endpoint:
+
+```bash
+curl -X POST https://your-hf-space-url/api/evaluations \
+  -H "Authorization: Bearer <YOUR_API_KEY>" \
+  -H "Content-Type: application/json" \
+  -d '{{
+    "system_id": "your-system-id",
+    "dataset": "fiqa_mini"
+  }}'
+```
+
+### Get Results
+
+Retrieve the status and results of an evaluation run:
+
+```bash
+curl https://your-hf-space-url/api/evaluations/<EVALUATION_ID> \
+  -H "Authorization: Bearer <YOUR_API_KEY>"
+```
+
+*Note: These are mock endpoints for demonstration purposes.*
+""")