admin-healthelic committed
Commit 3eafae5 · verified · 1 Parent(s): 43ad69c

Upload 5 files

Files changed (5)
  1. README.md +4 -3
  2. app.py +354 -0
  3. postBuild +3 -0
  4. requirements.txt +11 -0
  5. runs.json +1 -0
README.md CHANGED
@@ -1,12 +1,13 @@
  ---
  title: HealthBenchAdvancedDemo
- emoji: 💻
+ emoji: 🏢
  colorFrom: gray
- colorTo: red
+ colorTo: blue
  sdk: gradio
- sdk_version: 5.44.1
+ sdk_version: 5.42.0
  app_file: app.py
  pinned: false
+ license: mit
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
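Applied, this hunk leaves the Space's front matter as:

    ---
    title: HealthBenchAdvancedDemo
    emoji: 🏢
    colorFrom: gray
    colorTo: blue
    sdk: gradio
    sdk_version: 5.42.0
    app_file: app.py
    pinned: false
    license: mit
    ---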
app.py CHANGED
@@ -0,0 +1,354 @@
+ import os
+ import json
+ import re
+ import uuid
+ from datetime import datetime
+ import openai
+ import gradio as gr
+ import numpy as np
+ import google.generativeai as genai
+ import random
+
+ # -------------------------
+ # Config
+ # -------------------------
+ PHYSICIAN_COMPLETION_MODES = {"Group 1": 1, "Group 2": 2, "Group 3": 3}
+
+ DATASET_FILES = {
+     "regular": os.path.join(os.path.dirname(__file__), "data", "oss_eval.jsonl"),
+     "hard": os.path.join(os.path.dirname(__file__), "data", "hard_2025-05-08-21-00-10.jsonl"),
+     "consensus": os.path.join(os.path.dirname(__file__), "data", "consensus_2025-05-09-20-00-46.jsonl"),
+ }
+
+ CANDIDATE_MODELS = [
+     "gpt-4.1",
+     "gpt-4o-mini",
+     "gpt-5-chat-latest"
+ ]
+
+ GRADER_MODEL = "gpt-4o-mini"
+
+ openai.api_key = os.getenv("OPENAI_API_KEY")
+ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
+
+ # Models that only support default temperature (don’t allow setting temperature manually)
+ MODEL_DEFAULT_TEMP = ["o4-mini"]
+
+ # Local JSON file for storing runs
+ RUNS_FILE = "/data/runs2.json"
+
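Note: RUNS_FILE points at /data/runs2.json, which exists on Spaces only when persistent storage is enabled, while the repo itself ships a runs.json. A guarded fallback would be one way to cover both cases (a hypothetical tweak, not part of this commit):

    # Hypothetical: fall back to a repo-local file when /data is unavailable.
    RUNS_FILE = "/data/runs2.json" if os.path.isdir("/data") else "runs.json"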
+ # -------------------------
+ # Helper to read JSONL
+ # -------------------------
+ def read_jsonl(file_path):
+     data = []
+     with open(file_path, "r", encoding="utf-8") as f:
+         for line in f:
+             data.append(json.loads(line))
+     return data
+
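read_jsonl expects one JSON object per line, and __call__ below reads each record's "prompt" field as a chat-style message list. A minimal sketch of one record (contents invented for illustration):

    {"prompt": [{"role": "user", "content": "My father has chest pain. What should we do?"}]}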
+ # -------------------------
+ # Evaluation class
+ # -------------------------
+ class HealthBenchEval:
+     def __init__(self, dataset_file, num_examples=None, seed=None):
+         self.all_data = read_jsonl(dataset_file)
+
+         if seed is not None and num_examples:
+             random.seed(seed)
+             self.indices = random.sample(range(len(self.all_data)), min(num_examples, len(self.all_data)))
+             self.dataset = [self.all_data[i] for i in self.indices]
+         elif num_examples:
+             self.indices = list(range(num_examples))
+             self.dataset = self.all_data[:num_examples]
+         else:
+             self.indices = list(range(len(self.all_data)))
+             self.dataset = self.all_data
+
+         self.scores = []
+         self.htmls = ""
+         self.sample_records = []  # <-- store per-sample rows
+         self.seed = seed
+         self.eval_id = str(uuid.uuid4())  # unique id per evaluation run
+
+     def score_with_grader(self, prompt_text, completion_text, example_index):
+         prompt = f"""
+ You are a grader model. Score the following completion from 0.0 to 1.0.
+ Prompt: {prompt_text}
+ Completion: {completion_text}
+ Return only a number between 0 and 1.
+ """
+         try:
+             resp = openai.chat.completions.create(
+                 model=GRADER_MODEL,
+                 messages=[{"role": "user", "content": prompt}],
+                 max_completion_tokens=50
+             )
+             score_text = resp.choices[0].message.content.strip()
+             match = re.search(r"0(?:\.\d+)?|1(?:\.0+)?", score_text)
+             if match:
+                 score = float(match.group(0))
+             else:
+                 score = 0.0
+             return max(0.0, min(1.0, score))
+         except Exception as e:
+             print(f"Grader error: {e}")
+             return 0.0
+
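The regex pulls the first decimal in [0, 1] out of the grader's reply, so trailing prose is tolerated; for example (hypothetical REPL session):

    >>> import re
    >>> re.search(r"0(?:\.\d+)?|1(?:\.0+)?", "Score: 0.85 overall").group(0)
    '0.85'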
+     def generate_with_candidate(self, candidate_model, system_prompt, prompt_text, example_index, max_tokens=1024):
+         """
+         Generate completion with retry logic and better error logging.
+         """
+         for attempt in range(3):  # retry up to 3 times
+             try:
+                 if candidate_model.startswith("gemini"):
+                     model = genai.GenerativeModel(candidate_model)
+                     full_prompt = ""
+                     if system_prompt:
+                         full_prompt += f"System: {system_prompt}\n"
+                     full_prompt += f"User: {prompt_text}"
+
+                     response = model.generate_content(
+                         full_prompt,
+                         generation_config={"max_output_tokens": max_tokens, "temperature": 0.7}
+                     )
+                     completion = response.text if response.text else "[EMPTY GEMINI OUTPUT]"
+                 else:
+                     messages = []
+                     if system_prompt:
+                         messages.append({"role": "system", "content": system_prompt})
+                     messages.append({"role": "user", "content": prompt_text})
+
+                     # Skip temperature for models that don't support it
+                     if candidate_model in MODEL_DEFAULT_TEMP:
+                         resp = openai.chat.completions.create(
+                             model=candidate_model,
+                             messages=messages,
+                             max_completion_tokens=max_tokens
+                         )
+                     else:
+                         resp = openai.chat.completions.create(
+                             model=candidate_model,
+                             messages=messages,
+                             temperature=0.7,
+                             max_completion_tokens=max_tokens
+                         )
+                     completion = resp.choices[0].message.content
+                     print(resp)
+
+                 return completion.strip() if hasattr(completion, "strip") else completion
+
+             except Exception as e:
+                 print(f"[ERROR] Candidate model {candidate_model} failed at dataset index {example_index} (attempt {attempt+1}/3)")
+                 print(f"Prompt text: {prompt_text[:200]}...")
+                 print(f"Error: {e}")
+                 if attempt == 2:  # after last attempt
+                     return f"[ERROR after 3 retries: {str(e)}]"
+
+     def __call__(self, candidate_model, system_prompt, eval_subset=""):
+         html_lines = ["<h2>Evaluation Report</h2>", "<ul>"]
+
+         cumulative_total = 0.0
+         for i, example in enumerate(self.dataset):
+             dataset_index = self.indices[i]  # actual dataset row index
+             prompt_obj = example.get("prompt", [])
+             prompt_text = " ".join([m.get("content", "") for m in prompt_obj])
+
+             completion_text = self.generate_with_candidate(candidate_model, system_prompt, prompt_text, dataset_index)
+             score = self.score_with_grader(prompt_text, completion_text, dataset_index)
+
+             # update running totals (per eval_id)
+             cumulative_total += score
+             cumulative_avg = cumulative_total / (i + 1)
+
+             self.scores.append(score)
+             html_lines.append(f"<li>Dataset Row {dataset_index}: Score = {score:.3f}</li>")
+
+             # create individual sample record
+             self.sample_records.append({
+                 "eval_id": self.eval_id,
+                 "timestamp": datetime.utcnow().isoformat(),
+                 "candidate_model": candidate_model,
+                 "system_prompt": system_prompt,
+                 "eval_subset": eval_subset,
+                 "seed": self.seed,
+                 "dataset_index": dataset_index,
+                 "prompt_text": prompt_text,
+                 "completion_text": completion_text,
+                 "score": float(score),
+                 "cumulative_total": float(cumulative_total),
+                 "cumulative_avg": float(cumulative_avg)
+             })
+
+         self.htmls = "\n".join(html_lines) + "</ul>"
+         return self
+
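For reference, the class can also be driven headlessly, outside Gradio (a sketch; assumes OPENAI_API_KEY is set and the data files exist):

    from app import HealthBenchEval, DATASET_FILES

    ev = HealthBenchEval(DATASET_FILES["regular"], num_examples=2, seed=42)
    result = ev("gpt-4o-mini", system_prompt="", eval_subset="regular")
    print(result.scores)          # per-sample grader scores
    print(result.sample_records)  # the rows run_eval_ui persists to RUNS_FILE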
+ # -------------------------
+ # Helper to generate HTML table from runs
+ # -------------------------
+ def generate_runs_html():
+     runs = []
+     if os.path.exists(RUNS_FILE):
+         try:
+             with open(RUNS_FILE, "r", encoding="utf-8") as f:
+                 runs = json.load(f)
+             if not isinstance(runs, list):
+                 runs = []
+         except (json.JSONDecodeError, ValueError):
+             runs = []
+
+     if runs:
+         table_rows = ""
+         for r in reversed(runs):
+             table_rows += f"""
+             <tr>
+                 <td>{r.get('eval_id','')}</td>
+                 <td>{r.get('timestamp','')}</td>
+                 <td>{r.get('candidate_model','')}</td>
+                 <td>{r.get('system_prompt','')}</td>
+                 <td>{r.get('eval_subset','')}</td>
+                 <td>{r.get('seed','')}</td>
+                 <td>{r.get('dataset_index','')}</td>
+                 <td>{r.get('prompt_text','')[:80]}...</td>
+                 <td>{(r.get('completion_text') or '').strip()[:80]}...</td>
+                 <td>{r.get('score',0.0):.3f}</td>
+                 <td>{r.get('cumulative_total',0.0):.3f}</td>
+                 <td>{r.get('cumulative_avg',0.0):.3f}</td>
+             </tr>
+             """
+         runs_html = f"""
+         <h3>Evaluation History (Per Sample)</h3>
+         <div style="max-height:300px; overflow:auto;">
+         <table border="1" style="border-collapse: collapse; padding:5px; width:100%; table-layout: fixed; word-wrap: break-word;">
+             <thead>
+                 <tr>
+                     <th>Eval ID</th>
+                     <th>Timestamp</th>
+                     <th>Candidate Model</th>
+                     <th>System Prompt</th>
+                     <th>Eval Subset</th>
+                     <th>Seed</th>
+                     <th>Dataset Row</th>
+                     <th>Prompt Text</th>
+                     <th>Completion Text</th>
+                     <th>Score</th>
+                     <th>Cumulative Total</th>
+                     <th>Cumulative Avg</th>
+                 </tr>
+             </thead>
+             <tbody>
+                 {table_rows}
+             </tbody>
+         </table>
+         </div>
+         """
+     else:
+         runs_html = "<p>No evaluations yet.</p>"
+
+     return runs_html
+
+ # -------------------------
+ # Clear runs file
+ # -------------------------
+ def clear_runs():
+     with open(RUNS_FILE, "w", encoding="utf-8") as f:
+         json.dump([], f, indent=2)
+     return "<p>No evaluations yet.</p>"
+
+ # -------------------------
+ # Gradio UI function
+ # -------------------------
+ def run_eval_ui(candidate_model, system_prompt, eval_subset, num_examples, seed):
+     dataset_file = DATASET_FILES.get(eval_subset)
+     if not dataset_file:
+         return "<p style='color:red'>Invalid dataset</p>", {}, generate_runs_html()
+
+     seed_val = int(seed) if seed else None
+     num_val = int(num_examples) if num_examples else None
+
+     eval_obj = HealthBenchEval(dataset_file, num_examples=num_val, seed=seed_val)
+     result = eval_obj(candidate_model, system_prompt, eval_subset=eval_subset)
+
+     # Load existing runs
+     runs = []
+     if os.path.exists(RUNS_FILE):
+         try:
+             with open(RUNS_FILE, "r", encoding="utf-8") as f:
+                 runs = json.load(f)
+             if not isinstance(runs, list):
+                 runs = []
+         except (json.JSONDecodeError, ValueError):
+             runs = []
+
+     runs.extend(result.sample_records)
+
+     with open(RUNS_FILE, "w", encoding="utf-8") as f:
+         json.dump(runs, f, indent=2)
+
+     runs_html = generate_runs_html()
+
+     metrics = {
+         "eval_id": result.eval_id,
+         "mean_score": float(np.mean(result.scores)) if result.scores else 0.0,
+         "std_score": float(np.std(result.scores)) if result.scores else 0.0,
+         "n_samples": len(result.scores),
+         "seed": seed_val
+     }
+
+     return result.htmls, metrics, runs_html
+
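So the metrics JSON handed to the UI has this shape (values invented for illustration):

    {
        "eval_id": "5b1e7c2a-...",
        "mean_score": 0.75,
        "std_score": 0.083,
        "n_samples": 2,
        "seed": 42
    }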
+ # -------------------------
+ # Gradio UI
+ # -------------------------
+ def ui():
+     with gr.Blocks(title="HealthBench OpenAI + Gemini Evaluation") as demo:
+         gr.Markdown("## HealthBench Evaluation (OpenAI + Gemini API-based)")
+
+         with gr.Row():
+             candidate_model = gr.Dropdown(
+                 label="Candidate model",
+                 choices=CANDIDATE_MODELS,
+                 value="gpt-4o-mini",
+             )
+             eval_subset = gr.Dropdown(
+                 label="Eval subset",
+                 choices=list(DATASET_FILES.keys()),
+                 value="regular"
+             )
+             num_examples = gr.Number(label="# examples (leave blank for all)", value=1, precision=0)
+             seed = gr.Textbox(label="Random Seed (optional)", placeholder="Enter a seed for reproducibility")
+
+         system_prompt = gr.Textbox(
+             label="System Prompt (optional)",
+             placeholder="Enter a system prompt here for the candidate model",
+             lines=3
+         )
+
+         run_btn = gr.Button("Run evaluation")
+
+         output_html = gr.HTML(label="Evaluation Report")
+         output_metrics = gr.JSON(label="Metrics JSON")
+         output_all_runs = gr.HTML(label="Evaluation History", value=generate_runs_html())
+
+         with gr.Row():
+             clear_btn = gr.Button("Clear History")
+
+         # Connect buttons
+         run_btn.click(
+             fn=run_eval_ui,
+             inputs=[candidate_model, system_prompt, eval_subset, num_examples, seed],
+             outputs=[output_html, output_metrics, output_all_runs]
+         )
+
+         clear_btn.click(
+             fn=clear_runs,
+             inputs=[],
+             outputs=[output_all_runs]
+         )
+
+     return demo
+
+ if __name__ == "__main__":
+     demo = ui()
+     demo.queue(max_size=5)
+     demo.launch()
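A minimal local launch, for reference (a sketch; the placeholder keys are hypothetical, and app.py reads both via os.getenv at import time):

    import os
    os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder, not a real key
    os.environ["GEMINI_API_KEY"] = "..."     # placeholder
    from app import ui  # import-time code configures openai/genai
    ui().launch()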
postBuild ADDED
@@ -0,0 +1,3 @@
+ set -e
+ git clone https://github.com/openai/simple-evals.git
+ mv simple-evals/simple_evals ./simple_evals
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gradio>=4.44.0
+ pandas
+ numpy
+ blobfile
+ openai>=1.44.0
+ jinja2
+ tqdm
+ requests
+ google-generativeai
+ pymongo[srv]
+ dnspython
runs.json ADDED
@@ -0,0 +1 @@
+ []