# app.py -- last commit 513cae4 ("Update app.py") by admin-healthelic (verified).
import html
import json
import os
import re
from datetime import datetime, timezone

import google.generativeai as genai
import gradio as gr
import numpy as np
import openai
# -------------------------
# Config
# -------------------------
# Maps each physician completion group label to its integer id.
PHYSICIAN_COMPLETION_MODES = {
    "Group 1": 1,
    "Group 2": 2,
    "Group 3": 3,
}

# Every evaluation subset lives in the "data" directory next to this file.
_DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
DATASET_FILES = {
    subset: os.path.join(_DATA_DIR, filename)
    for subset, filename in (
        ("regular", "oss_eval.jsonl"),
        ("hard", "hard_2025-05-08-21-00-10.jsonl"),
        ("consensus", "consensus_2025-05-09-20-00-46.jsonl"),
    )
}

# Models offered in the UI; names starting with "gemini" are routed to the
# Gemini client, everything else to the OpenAI client.
CANDIDATE_MODELS = [
    "gpt-4.1",
    "gpt-4.1-mini",
    "gpt-4o",
    "gpt-4o-mini",
    "gpt-5",
    "gemini-2.5-pro",
    "gemini-2.5-flash",
    "gemini-2.5-flash-lite",
]

# Model that scores candidate completions.
GRADER_MODEL = "gpt-4o-mini"

# OpenAI models that are called without an explicit temperature.
MODEL_DEFAULT_TEMP = ["gpt-4.1", "gpt-4o", "gpt-5"]

# Local JSON file for storing runs
RUNS_FILE = "runs.json"

# API credentials are read from the environment when the module is imported.
openai.api_key = os.environ.get("OPENAI_API_KEY")
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))
# -------------------------
# Helper to read JSONL
# -------------------------
def read_jsonl(file_path, num_examples=None, seed=None):
    """Load a JSON Lines file, optionally returning a random subset of it.

    Args:
        file_path: Path to a UTF-8 file holding one JSON document per line.
        num_examples: Number of records to sample without replacement. A
            falsy value (``None`` or ``0``) returns every record in file
            order; a value larger than the file is capped at the file size.
        seed: Optional seed that makes the sample reproducible.

    Returns:
        A list with the decoded records.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        ValueError: If ``num_examples`` or ``seed`` is negative.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. stray newlines between records) are not records.
        data = [json.loads(line) for line in f if line.strip()]

    if num_examples and data:
        # A private RandomState draws the same sample the seeded global
        # generator would, without reseeding NumPy for the whole process.
        rng = np.random.RandomState(seed)
        # Asking for more rows than exist would make choice() raise.
        sample_size = min(num_examples, len(data))
        idxs = rng.choice(len(data), size=sample_size, replace=False)
        data = [data[i] for i in idxs]
    return data
# -------------------------
# Evaluation class
# -------------------------
class HealthBenchEval:
    """Run a candidate model over a dataset and score it with a grader model.

    Attributes:
        dataset: Records loaded from the dataset file (optionally sampled).
        metrics: Aggregate results of the most recent run; empty until the
            instance has been called.
        htmls: HTML report of the most recent run.
        convos: Initialised to an empty list; not populated by this class.
    """

    # First standalone number in [0, 1]. The look-arounds keep the pattern
    # from latching onto a digit inside a larger number (the "0" in "2024",
    # the "1" in "10" or "1.5"), which would otherwise be read as the score.
    _SCORE_PATTERN = re.compile(
        r"(?<![\d.])(?:0(?:\.\d+)?|1(?:\.0+)?|\.\d+)(?!\.?\d)"
    )

    def __init__(self, dataset_file, num_examples=None, seed=None):
        """Load the dataset, optionally sampling ``num_examples`` records."""
        self.dataset = read_jsonl(dataset_file, num_examples, seed)
        self.metrics = {}
        self.htmls = ""
        self.convos = []

    @classmethod
    def _parse_score(cls, score_text):
        """Extract a score in [0.0, 1.0] from grader output; 0.0 if absent."""
        match = cls._SCORE_PATTERN.search(score_text or "")
        if match is None:
            return 0.0
        return max(0.0, min(1.0, float(match.group(0))))

    def score_with_grader(self, prompt_text, completion_text, example_index):
        """Ask the grader model to rate a completion.

        Args:
            prompt_text: The prompt that was given to the candidate model.
            completion_text: The candidate model's answer.
            example_index: Position of the example in the run (unused here;
                kept so the signature matches ``generate_with_candidate``).

        Returns:
            A float in [0.0, 1.0]. Any grader failure, or a reply without a
            usable number, yields 0.0.
        """
        prompt = (
            "\n"
            "You are a grader model. Score the following completion from 0.0 to 1.0.\n"
            f"Prompt: {prompt_text}\n"
            f"Completion: {completion_text}\n"
            "Return only a number between 0 and 1.\n"
        )
        try:
            resp = openai.chat.completions.create(
                model=GRADER_MODEL,
                messages=[{"role": "user", "content": prompt}],
                max_completion_tokens=50,
            )
            return self._parse_score(resp.choices[0].message.content)
        except Exception as e:
            # Best effort: one failed grading call must not abort the run.
            print(f"Grader error: {e}")
            return 0.0

    def generate_with_candidate(self, candidate_model, system_prompt, prompt_text, example_index, max_tokens=256):
        """Generate a completion from the candidate model.

        Args:
            candidate_model: Model name; names starting with "gemini" use the
                Gemini client, all others the OpenAI chat completions client.
            system_prompt: Optional system prompt (skipped when falsy).
            prompt_text: The user prompt.
            example_index: Zero-based example position, used in error logs.
            max_tokens: Output token limit passed to the model.

        Returns:
            The stripped completion text, or "" when the call fails or the
            model returns no text.
        """
        try:
            if candidate_model.startswith("gemini"):
                model = genai.GenerativeModel(candidate_model)
                full_prompt = ""
                if system_prompt:
                    full_prompt += f"System: {system_prompt}\n"
                full_prompt += f"User: {prompt_text}"
                response = model.generate_content(
                    full_prompt,
                    generation_config={"max_output_tokens": max_tokens, "temperature": 0.7},
                )
                completion = response.text
            else:
                messages = []
                if system_prompt:
                    messages.append({"role": "system", "content": system_prompt})
                messages.append({"role": "user", "content": prompt_text})
                request = {
                    "model": candidate_model,
                    "messages": messages,
                    "max_completion_tokens": max_tokens,
                }
                # Models listed in MODEL_DEFAULT_TEMP are called without an
                # explicit temperature; every other model uses 0.7.
                if candidate_model not in MODEL_DEFAULT_TEMP:
                    request["temperature"] = 0.7
                resp = openai.chat.completions.create(**request)
                completion = resp.choices[0].message.content
            # A missing completion becomes "" (same as the error path) so the
            # grader prompt never contains the literal text "None".
            return completion.strip() if isinstance(completion, str) else ""
        except Exception as e:
            print(f"Candidate model error for example {example_index+1}: {e}")
            return ""

    def __call__(self, candidate_model, system_prompt, num_examples=None):
        """Evaluate the candidate model and store the metrics and report.

        Args:
            candidate_model: Name of the model to evaluate.
            system_prompt: Optional system prompt for the candidate model.
            num_examples: If truthy, only the first ``num_examples`` loaded
                records are evaluated.

        Returns:
            ``self``, with ``metrics`` and ``htmls`` populated.
        """
        examples = self.dataset[:num_examples] if num_examples else self.dataset
        scores = []
        html_lines = ["<h2>Evaluation Report</h2>", "<ul>"]
        for i, example in enumerate(examples):
            # The prompt is a list of message dicts; their contents are
            # flattened into a single space-separated string.
            prompt_obj = example.get("prompt", [])
            prompt_text = " ".join(m.get("content", "") for m in prompt_obj)
            completion_text = self.generate_with_candidate(candidate_model, system_prompt, prompt_text, i)
            score = self.score_with_grader(prompt_text, completion_text, i)
            scores.append(score)
            html_lines.append(f"<li>Example {i+1}: Score = {score:.3f}</li>")

        self.metrics = {
            "overall_score": float(np.mean(scores)) if scores else 0.0,
            "overall_score:n_samples": len(scores),
            "overall_score:std": float(np.std(scores)) if scores else 0.0,
        }
        self.htmls = "\n".join(html_lines) + "</ul>"
        return self
# -------------------------
# Helper to generate HTML table from runs
# -------------------------
# Record keys rendered as plain text columns, in display order.
_RUN_TEXT_COLUMNS = (
    "timestamp",
    "candidate_model",
    "system_prompt",
    "eval_subset",
    "num_examples",
    "seed_number",
)


def _load_run_records(path):
    """Return the dict records stored at ``path``; empty list if unusable."""
    try:
        with open(path, "r", encoding="utf-8") as f:
            runs = json.load(f)
    except (OSError, ValueError):
        # Missing, unreadable or corrupt history is shown as "no runs yet".
        return []
    if not isinstance(runs, list):
        return []
    return [r for r in runs if isinstance(r, dict)]


def _format_run_cell(value):
    """Render one cell: blank for ``None``, HTML-escaped text otherwise."""
    return "" if value is None else html.escape(str(value))


def _format_run_score(value):
    """Render a score with three decimals, or "n/a" if it is not numeric."""
    try:
        return f"{float(value):.3f}"
    except (TypeError, ValueError):
        return "n/a"


def generate_runs_html(runs_file=None):
    """Render the stored evaluation history as an HTML table, newest first.

    Args:
        runs_file: Path of the JSON history file. Defaults to ``RUNS_FILE``.

    Returns:
        An HTML fragment with one table row per stored run, or
        ``"<p>No evaluations yet.</p>"`` when there is no usable history.
    """
    path = RUNS_FILE if runs_file is None else runs_file
    runs = _load_run_records(path)
    if not runs:
        return "<p>No evaluations yet.</p>"

    rows = []
    for r in reversed(runs):
        # Stored values include the user-supplied system prompt, so every
        # text cell is escaped before being placed into markup.
        cells = [_format_run_cell(r.get(key)) for key in _RUN_TEXT_COLUMNS]
        cells.append(_format_run_score(r.get("overall_score", 0.0)))
        rows.append("<tr>\n" + "\n".join(f"<td>{c}</td>" for c in cells) + "\n</tr>")
    table_rows = "\n".join(rows)

    return f"""
<h3>Evaluation History</h3>
<div style="max-height:300px; overflow:auto;">
<table border="1" style="border-collapse: collapse; padding:5px; width:100%;">
<thead>
<tr>
<th>Timestamp</th>
<th>Candidate Model</th>
<th>System Prompt</th>
<th>Eval Subset</th>
<th>Num Examples</th>
<th>Seed</th>
<th>Overall Score</th>
</tr>
</thead>
<tbody>
{table_rows}
</tbody>
</table>
</div>
"""
# -------------------------
# Gradio UI function
# -------------------------
def run_eval_ui(candidate_model, system_prompt, eval_subset, num_examples, seed_number):
    """Run one evaluation from the UI and append it to the run history.

    Args:
        candidate_model: Name of the model to evaluate.
        system_prompt: Optional system prompt for the candidate model.
        eval_subset: Key into ``DATASET_FILES``.
        num_examples: Number of examples to evaluate; blank or 0 means all.
        seed_number: Optional sampling seed; blank means unseeded.

    Returns:
        A ``(report_html, metrics, history_html)`` tuple. On invalid input
        or a dataset that cannot be loaded, ``report_html`` is a red error
        paragraph and ``metrics`` is empty.
    """

    def _error(message):
        return f"<p style='color:red'>{message}</p>", {}, generate_runs_html()

    dataset_file = DATASET_FILES.get(eval_subset)
    if not dataset_file:
        return _error("Invalid dataset")

    # Zero examples keeps its historical meaning of "use every example", but
    # 0 is a legitimate seed, so the seed is only dropped when left blank.
    n_examples = int(num_examples) if num_examples else None
    seed = int(seed_number) if seed_number not in (None, "") else None
    if (n_examples is not None and n_examples < 0) or (seed is not None and seed < 0):
        return _error("Number of examples and seed must not be negative")

    try:
        eval_obj = HealthBenchEval(dataset_file, num_examples=n_examples, seed=seed)
    except (OSError, ValueError) as e:
        # Covers a missing/unreadable file, malformed JSON and bad sampling
        # arguments; report it in the UI instead of raising.
        print(f"Dataset load error: {e}")
        return _error(f"Could not load dataset: {html.escape(str(e))}")

    result = eval_obj(candidate_model, system_prompt, num_examples=n_examples)

    # Save run to local JSON
    run_record = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "candidate_model": candidate_model,
        "system_prompt": system_prompt,
        "overall_score": float(result.metrics.get("overall_score", 0.0)),
        "num_examples": n_examples,
        "eval_subset": eval_subset,
        "seed_number": seed,
    }
    try:
        with open(RUNS_FILE, "r", encoding="utf-8") as f:
            runs = json.load(f)
    except (OSError, ValueError):
        # No history yet, or it is unreadable/corrupt: start a fresh list.
        runs = []
    if not isinstance(runs, list):
        runs = []
    runs.append(run_record)
    try:
        with open(RUNS_FILE, "w", encoding="utf-8") as f:
            json.dump(runs, f, indent=2)
    except OSError as e:
        # A history write failure must not discard a finished evaluation.
        print(f"Could not save run history: {e}")

    return result.htmls, result.metrics, generate_runs_html()
# -------------------------
# Gradio UI
# -------------------------
def ui():
    """Assemble the Gradio Blocks app for running evaluations.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    # NOTE(review): the file's indentation was lost, so which controls sit
    # inside the Row (assumed: the four selectors) should be confirmed.
    with gr.Blocks(title="HealthBench Week 1 Evaluation") as demo:
        gr.Markdown("## HealthBench Evaluation Week 1")

        with gr.Row():
            model_picker = gr.Dropdown(
                choices=CANDIDATE_MODELS,
                value="gpt-4o-mini",
                label="Candidate model",
            )
            subset_picker = gr.Dropdown(
                choices=list(DATASET_FILES),
                value="regular",
                label="Eval subset",
            )
            examples_box = gr.Number(
                label="# examples (leave blank for all)",
                value=1,
                precision=0,
            )
            seed_box = gr.Number(label="Seed (optional)", value=None, precision=0)

        prompt_box = gr.Textbox(
            label="System Prompt (optional)",
            placeholder="Enter a system prompt here for the candidate model",
            lines=3,
        )
        run_btn = gr.Button("Run evaluation")

        report_view = gr.HTML(label="Evaluation Report")
        metrics_view = gr.JSON(label="Metrics JSON")
        # The history table is rendered once when the UI is built and is
        # refreshed by each run.
        history_view = gr.HTML(label="Evaluation History", value=generate_runs_html())

        # Input order must match the parameter order of run_eval_ui.
        click_inputs = [model_picker, prompt_box, subset_picker, examples_box, seed_box]
        click_outputs = [report_view, metrics_view, history_view]
        run_btn.click(fn=run_eval_ui, inputs=click_inputs, outputs=click_outputs)

    return demo
# Script entry point: build the UI, enable its request queue, start the server.
if __name__ == "__main__":
    demo = ui()
    # The queue is configured with max_size=5 before launching.
    demo.queue(max_size=5)
    demo.launch()