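"""HealthBench evaluation harness with a Gradio UI.

Runs candidate models over HealthBench JSONL subsets (OpenAI Chat Completions,
or Gemini for model names starting with "gemini"), scores each completion with
a model-based grader, and tracks per-sample history with CSV export.
"""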

import csv
import io
import json
import os
import random
import re
import tempfile
import uuid
from datetime import datetime

import google.generativeai as genai
import gradio as gr
import numpy as np
import openai

# Physician grading groups (defined here but not referenced elsewhere in this script).
PHYSICIAN_COMPLETION_MODES = {"Group 1": 1, "Group 2": 2, "Group 3": 3}

# Paths to the HealthBench JSONL subsets, resolved relative to this script.
DATASET_FILES = {
    "regular": os.path.join(os.path.dirname(__file__), "data", "oss_eval.jsonl"),
    "hard": os.path.join(os.path.dirname(__file__), "data", "hard_2025-05-08-21-00-10.jsonl"),
    "consensus": os.path.join(os.path.dirname(__file__), "data", "consensus_2025-05-09-20-00-46.jsonl"),
}

# Candidate models offered in the UI. generate_with_candidate also routes any
# model whose name starts with "gemini" through the google.generativeai client.
CANDIDATE_MODELS = [
    "gpt-4.1",
    "gpt-4o-mini",
    "gpt-5-chat-latest",
]

# Model used to score candidate completions.
GRADER_MODEL = "gpt-4o-mini"

# API keys are read from the environment at import time.
openai.api_key = os.getenv("OPENAI_API_KEY")
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# Models that must run at their default temperature (no `temperature` argument).
MODEL_DEFAULT_TEMP = ["o4-mini"]


def read_jsonl(file_path):
    """Load a JSONL file into a list of dicts, one record per line."""
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line))
    return data


class HealthBenchEval:
    """Runs a candidate model over a HealthBench subset and grades each completion."""

    def __init__(self, dataset_file, num_examples=None, seed=None):
        self.all_data = read_jsonl(dataset_file)

        # Choose which rows to evaluate: a seeded random sample, the first
        # num_examples rows, or the whole dataset.
        if seed is not None and num_examples:
            random.seed(seed)
            self.indices = random.sample(range(len(self.all_data)), min(num_examples, len(self.all_data)))
            self.dataset = [self.all_data[i] for i in self.indices]
        elif num_examples:
            # Clamp so self.indices never references rows beyond the dataset.
            n = min(num_examples, len(self.all_data))
            self.indices = list(range(n))
            self.dataset = self.all_data[:n]
        else:
            self.indices = list(range(len(self.all_data)))
            self.dataset = self.all_data

        self.scores = []
        self.htmls = ""
        self.sample_records = []
        self.seed = seed
        self.eval_id = str(uuid.uuid4())

    def score_with_grader(self, prompt_text, completion_text, example_index):
        """Ask the grader model for a score in [0, 1] and parse it from the reply."""
        prompt = f"""
You are a grader model. Score the following completion from 0.0 to 1.0.
Prompt: {prompt_text}
Completion: {completion_text}
Return only a number between 0 and 1.
"""
        try:
            resp = openai.chat.completions.create(
                model=GRADER_MODEL,
                messages=[{"role": "user", "content": prompt}],
                max_completion_tokens=50,
            )
            score_text = resp.choices[0].message.content.strip()
            # Take the first 0.x / 1.0-style number in the reply; default to 0.0.
            match = re.search(r"0(?:\.\d+)?|1(?:\.0+)?", score_text)
            score = float(match.group(0)) if match else 0.0
            return max(0.0, min(1.0, score))
        except Exception as e:
            print(f"Grader error: {e}")
            return 0.0

    def generate_with_candidate(self, candidate_model, system_prompt, prompt_text, example_index, max_tokens=1024):
        """Generate a completion from the candidate model, retrying up to three times."""
        for attempt in range(3):
            try:
                if candidate_model.startswith("gemini"):
                    # Gemini gets no separate system role here, so fold the
                    # system prompt into a single text prompt.
                    model = genai.GenerativeModel(candidate_model)
                    full_prompt = ""
                    if system_prompt:
                        full_prompt += f"System: {system_prompt}\n"
                    full_prompt += f"User: {prompt_text}"

                    response = model.generate_content(
                        full_prompt,
                        generation_config={"max_output_tokens": max_tokens, "temperature": 0.7},
                    )
                    completion = response.text if response.text else "[EMPTY GEMINI OUTPUT]"
                else:
                    messages = []
                    if system_prompt:
                        messages.append({"role": "system", "content": system_prompt})
                    messages.append({"role": "user", "content": prompt_text})

                    # Some models only run at their default temperature, so
                    # omit the temperature argument for those.
                    if candidate_model in MODEL_DEFAULT_TEMP:
                        resp = openai.chat.completions.create(
                            model=candidate_model,
                            messages=messages,
                            max_completion_tokens=max_tokens,
                        )
                    else:
                        resp = openai.chat.completions.create(
                            model=candidate_model,
                            messages=messages,
                            temperature=0.7,
                            max_completion_tokens=max_tokens,
                        )
                    completion = resp.choices[0].message.content

                return completion.strip() if hasattr(completion, "strip") else completion

            except Exception as e:
                print(f"[ERROR] Candidate model {candidate_model} failed at dataset index {example_index} (attempt {attempt + 1}/3)")
                print(f"Prompt text: {prompt_text[:200]}...")
                print(f"Error: {e}")
                if attempt == 2:
                    return f"[ERROR after 3 retries: {e}]"

    def __call__(self, candidate_model, system_prompt, eval_subset=""):
        """Evaluate every selected example, accumulating scores, HTML, and records."""
        html_lines = ["<h2>Evaluation Report</h2>", "<ul>"]

        cumulative_total = 0.0
        for i, example in enumerate(self.dataset):
            dataset_index = self.indices[i]
            prompt_obj = example.get("prompt", [])
            prompt_text = " ".join([m.get("content", "") for m in prompt_obj])

            completion_text = self.generate_with_candidate(candidate_model, system_prompt, prompt_text, dataset_index)
            score = self.score_with_grader(prompt_text, completion_text, dataset_index)

            cumulative_total += score
            cumulative_avg = cumulative_total / (i + 1)

            self.scores.append(score)
            html_lines.append(f"<li>Dataset Row {dataset_index}: Score = {score:.3f}</li>")

            # One flat record per sample, used for the history table and CSV export.
            self.sample_records.append({
                "eval_id": self.eval_id,
                "timestamp": datetime.utcnow().isoformat(),
                "candidate_model": candidate_model,
                "system_prompt": system_prompt,
                "eval_subset": eval_subset,
                "seed": self.seed,
                "dataset_index": dataset_index,
                "prompt_text": prompt_text,
                "completion_text": completion_text,
                "score": float(score),
                "cumulative_total": float(cumulative_total),
                "cumulative_avg": float(cumulative_avg),
            })

        html_lines.append("</ul>")
        self.htmls = "\n".join(html_lines)
        return self
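
# A minimal headless-usage sketch (not part of the UI flow): it assumes
# OPENAI_API_KEY is set and that the "regular" subset file exists at the
# path configured in DATASET_FILES.
#
#     ev = HealthBenchEval(DATASET_FILES["regular"], num_examples=2, seed=0)
#     result = ev("gpt-4o-mini", system_prompt="", eval_subset="regular")
#     print(result.scores)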


def generate_runs_html(runs):
    """Render the per-sample run history as an HTML table, newest first."""
    if not runs:
        return "<p>No evaluations yet.</p>"

    table_rows = ""
    for r in reversed(runs):
        table_rows += f"""
        <tr>
            <td>{r.get('eval_id', '')}</td>
            <td>{r.get('timestamp', '')}</td>
            <td>{r.get('candidate_model', '')}</td>
            <td>{r.get('system_prompt', '')}</td>
            <td>{r.get('eval_subset', '')}</td>
            <td>{r.get('seed', '')}</td>
            <td>{r.get('dataset_index', '')}</td>
            <td>{r.get('prompt_text', '')[:80]}...</td>
            <td>{(r.get('completion_text') or '').strip()[:80]}...</td>
            <td>{r.get('score', 0.0):.3f}</td>
            <td>{r.get('cumulative_total', 0.0):.3f}</td>
            <td>{r.get('cumulative_avg', 0.0):.3f}</td>
        </tr>
        """
    return f"""
    <h3>Evaluation History (Per Sample)</h3>
    <div style="max-height:300px; overflow:auto;">
        <table border="1" style="border-collapse: collapse; padding:5px; width:100%; table-layout: fixed; word-wrap: break-word;">
            <thead>
                <tr>
                    <th>Eval ID</th>
                    <th>Timestamp</th>
                    <th>Candidate Model</th>
                    <th>System Prompt</th>
                    <th>Eval Subset</th>
                    <th>Seed</th>
                    <th>Dataset Row</th>
                    <th>Prompt Text</th>
                    <th>Completion Text</th>
                    <th>Score</th>
                    <th>Cumulative Total</th>
                    <th>Cumulative Avg</th>
                </tr>
            </thead>
            <tbody>
                {table_rows}
            </tbody>
        </table>
    </div>
    """


def generate_csv(runs):
    """Serialize run records to CSV text; returns None when there is nothing to export."""
    if not runs:
        return None
    output = io.StringIO()
    fieldnames = ["eval_id", "timestamp", "candidate_model", "system_prompt", "eval_subset",
                  "seed", "dataset_index", "prompt_text", "completion_text", "score",
                  "cumulative_total", "cumulative_avg"]
    writer = csv.DictWriter(output, fieldnames=fieldnames)
    writer.writeheader()
    for run in runs:
        writer.writerow(run)
    csv_data = output.getvalue()
    output.close()
    return csv_data


def prepare_download(runs):
    """Write the CSV to a temp file and return its path for the download button."""
    csv_data = generate_csv(runs)
    if not csv_data:
        return None
    filename = f"eval_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    # Use the platform temp directory rather than a hard-coded /tmp.
    filepath = os.path.join(tempfile.gettempdir(), filename)
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(csv_data)
    return filepath


def run_eval_ui(candidate_model, system_prompt, eval_subset, num_examples, seed, runs):
    """Gradio handler: run one evaluation and update report, metrics, and history."""
    dataset_file = DATASET_FILES.get(eval_subset)
    if not dataset_file:
        return "<p style='color:red'>Invalid dataset</p>", {}, generate_runs_html(runs), runs

    # Ignore non-numeric seed input instead of crashing the handler.
    try:
        seed_val = int(seed) if seed else None
    except ValueError:
        seed_val = None
    num_val = int(num_examples) if num_examples else None

    eval_obj = HealthBenchEval(dataset_file, num_examples=num_val, seed=seed_val)
    result = eval_obj(candidate_model, system_prompt, eval_subset=eval_subset)

    runs.extend(result.sample_records)
    runs_html = generate_runs_html(runs)

    metrics = {
        "eval_id": result.eval_id,
        "mean_score": float(np.mean(result.scores)) if result.scores else 0.0,
        "std_score": float(np.std(result.scores)) if result.scores else 0.0,
        "n_samples": len(result.scores),
        "seed": seed_val,
    }

    return result.htmls, metrics, runs_html, runs


def clear_runs():
    """Reset the visible history and the per-session run list."""
    return "<p>No evaluations yet.</p>", []


def ui():
    """Build the Gradio Blocks interface."""
    with gr.Blocks(title="HealthBench OpenAI + Gemini Evaluation") as demo:
        gr.Markdown("## HealthBench Evaluation (OpenAI + Gemini API-based)")

        with gr.Row():
            candidate_model = gr.Dropdown(
                label="Candidate model",
                choices=CANDIDATE_MODELS,
                value="gpt-4o-mini",
            )
            eval_subset = gr.Dropdown(
                label="Eval subset",
                choices=list(DATASET_FILES.keys()),
                value="regular",
            )
            num_examples = gr.Number(label="# examples (leave blank for all)", value=1, precision=0)
            seed = gr.Textbox(label="Random Seed (optional)", placeholder="Enter a seed for reproducibility")

        system_prompt = gr.Textbox(
            label="System Prompt (optional)",
            placeholder="Enter a system prompt here for the candidate model",
            lines=3,
        )

        run_btn = gr.Button("Run evaluation")

        output_html = gr.HTML(label="Evaluation Report")
        output_metrics = gr.JSON(label="Metrics JSON")
        output_all_runs = gr.HTML(label="Evaluation History")
        # Per-session run records; persists across clicks but not page reloads.
        session_runs = gr.State([])

        with gr.Row():
            clear_btn = gr.Button("Clear History")
            download_btn = gr.DownloadButton(
                label="Download CSV",
                variant="secondary",
            )

        run_btn.click(
            fn=run_eval_ui,
            inputs=[candidate_model, system_prompt, eval_subset, num_examples, seed, session_runs],
            outputs=[output_html, output_metrics, output_all_runs, session_runs],
        )

        clear_btn.click(
            fn=clear_runs,
            inputs=[],
            outputs=[output_all_runs, session_runs],
        )

        # Clicking the download button generates the CSV file on demand and
        # feeds its path back into the same button.
        download_btn.click(
            fn=prepare_download,
            inputs=[session_runs],
            outputs=[download_btn],
        )

    return demo


if __name__ == "__main__":
    demo = ui()
    demo.queue(max_size=5)
    demo.launch()