"""HealthBench Week 1 evaluation harness.

Runs candidate LLMs (OpenAI + Gemini) over HealthBench JSONL subsets, grades
each completion with a grader model, persists run summaries to a local JSON
file, and serves a Gradio UI for launching evaluations and browsing history.

NOTE(review): this file was recovered from a whitespace-mangled dump. The body
of ``HealthBenchEval.__call__`` and the header/row-building code of
``generate_runs_html`` were partially lost and have been reconstructed from
the surviving fragments (table column headers, "No evaluations yet." text,
``return runs_html``) — confirm against the original before relying on exact
report formatting.
"""

import html
import json
import os
import re
from datetime import datetime, timezone

import numpy as np

import google.generativeai as genai
import gradio as gr
import openai

# -------------------------
# Config
# -------------------------
PHYSICIAN_COMPLETION_MODES = {"Group 1": 1, "Group 2": 2, "Group 3": 3}

_DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
DATASET_FILES = {
    "regular": os.path.join(_DATA_DIR, "oss_eval.jsonl"),
    "hard": os.path.join(_DATA_DIR, "hard_2025-05-08-21-00-10.jsonl"),
    "consensus": os.path.join(_DATA_DIR, "consensus_2025-05-09-20-00-46.jsonl"),
}

CANDIDATE_MODELS = [
    "gpt-4.1",
    "gpt-4.1-mini",
    "gpt-4o",
    "gpt-4o-mini",
    "gpt-5",
    "gemini-2.5-pro",
    "gemini-2.5-flash",
    "gemini-2.5-flash-lite",
]

GRADER_MODEL = "gpt-4o-mini"

openai.api_key = os.getenv("OPENAI_API_KEY")
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# OpenAI models called WITHOUT an explicit `temperature` argument (presumably
# models that reject a custom temperature — confirm against API errors).
MODEL_DEFAULT_TEMP = ["gpt-4.1", "gpt-4o", "gpt-5"]

# Local JSON file for storing runs
RUNS_FILE = "runs.json"


# -------------------------
# Helper to read JSONL
# -------------------------
def read_jsonl(file_path, num_examples=None, seed=None):
    """Load a JSONL file as a list of dicts.

    If ``num_examples`` is truthy, return that many rows sampled without
    replacement (capped at the dataset size). ``seed`` makes the sample
    reproducible; a local Generator is used so global NumPy RNG state is
    never mutated.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]
    if num_examples:
        rng = np.random.default_rng(seed)  # seed=None -> fresh entropy
        size = min(int(num_examples), len(data))
        idxs = rng.choice(len(data), size=size, replace=False)
        data = [data[i] for i in idxs]
    return data


# -------------------------
# Evaluation class
# -------------------------
class HealthBenchEval:
    """Evaluate one candidate model over a HealthBench dataset subset.

    Calling the instance runs generation + grading and populates
    ``self.metrics`` (dict), ``self.htmls`` (HTML report string), and
    ``self.convos`` (per-example records), then returns ``self``.
    """

    def __init__(self, dataset_file, num_examples=None, seed=None):
        self.dataset = read_jsonl(dataset_file, num_examples, seed)
        self.metrics = {}
        self.htmls = ""
        self.convos = []

    def score_with_grader(self, prompt_text, completion_text, example_index):
        """Ask the grader model for a 0.0–1.0 score; return 0.0 on any failure."""
        prompt = f"""
You are a grader model. Score the following completion from 0.0 to 1.0.

Prompt:
{prompt_text}

Completion:
{completion_text}

Return only a number between 0 and 1.
"""
        try:
            resp = openai.chat.completions.create(
                model=GRADER_MODEL,
                messages=[{"role": "user", "content": prompt}],
                max_completion_tokens=50,
            )
            score_text = resp.choices[0].message.content.strip()
            # Grab the first decimal number in the reply (handles "Score: .9",
            # "0.85", "1.0", ...); clamping below keeps it in range.
            match = re.search(r"\d*\.?\d+", score_text)
            score = float(match.group(0)) if match else 0.0
            return max(0.0, min(1.0, score))
        except Exception as e:
            # Best-effort grading: a failed grade counts as 0 rather than
            # aborting the whole run.
            print(f"Grader error: {e}")
            return 0.0

    def generate_with_candidate(self, candidate_model, system_prompt,
                                prompt_text, example_index, max_tokens=256):
        """Generate a completion with an OpenAI or Gemini candidate model.

        Returns "" on any API failure so the eval loop can continue.
        """
        try:
            if candidate_model.startswith("gemini"):
                model = genai.GenerativeModel(candidate_model)
                # Gemini path flattens system+user into one text prompt.
                full_prompt = ""
                if system_prompt:
                    full_prompt += f"System: {system_prompt}\n"
                full_prompt += f"User: {prompt_text}"
                response = model.generate_content(
                    full_prompt,
                    generation_config={
                        "max_output_tokens": max_tokens,
                        "temperature": 0.7,
                    },
                )
                completion = response.text
            else:
                messages = []
                if system_prompt:
                    messages.append({"role": "system", "content": system_prompt})
                messages.append({"role": "user", "content": prompt_text})
                if candidate_model in MODEL_DEFAULT_TEMP:
                    # These models are called without a temperature override.
                    resp = openai.chat.completions.create(
                        model=candidate_model,
                        messages=messages,
                        max_completion_tokens=max_tokens,
                    )
                else:
                    resp = openai.chat.completions.create(
                        model=candidate_model,
                        messages=messages,
                        temperature=0.7,
                        max_completion_tokens=max_tokens,
                    )
                completion = resp.choices[0].message.content
            if hasattr(completion, "strip"):
                completion = completion.strip()
            return completion
        except Exception as e:
            print(f"Candidate model error for example {example_index + 1}: {e}")
            return ""

    @staticmethod
    def _extract_prompt_text(example):
        """Pull a plain-text prompt out of a dataset row.

        NOTE(review): HealthBench rows appear to store the prompt as a list of
        chat messages under "prompt"; a raw string is also accepted — TODO
        confirm the dataset schema.
        """
        prompt = example.get("prompt", "")
        if isinstance(prompt, list):
            return "\n".join(
                m.get("content", "") for m in prompt if isinstance(m, dict)
            )
        return str(prompt)

    def __call__(self, candidate_model, system_prompt, num_examples=None):
        """Run candidate generation + grading over the dataset.

        NOTE(review): the original body of this method was lost in the
        mangled dump; this reconstruction generates, grades, builds an HTML
        report, and sets ``metrics["overall_score"]`` to the mean grade.
        """
        scores = []
        html_lines = [
            "<table border='1' cellpadding='4'>",
            "<tr><th>#</th><th>Prompt</th><th>Completion</th><th>Score</th></tr>",
        ]
        examples = self.dataset if not num_examples else self.dataset[:num_examples]
        for i, example in enumerate(examples):
            prompt_text = self._extract_prompt_text(example)
            completion = self.generate_with_candidate(
                candidate_model, system_prompt, prompt_text, i
            )
            score = self.score_with_grader(prompt_text, completion, i)
            scores.append(score)
            self.convos.append(
                {"prompt": prompt_text, "completion": completion, "score": score}
            )
            # Escape model/user text so the Gradio HTML pane can't be broken
            # (or scripted) by markup in prompts or completions.
            html_lines.append(
                f"<tr><td>{i + 1}</td>"
                f"<td>{html.escape(prompt_text)}</td>"
                f"<td>{html.escape(completion)}</td>"
                f"<td>{score:.2f}</td></tr>"
            )
        html_lines.append("</table>")
        overall = float(np.mean(scores)) if scores else 0.0
        self.metrics = {"overall_score": overall, "num_examples": len(scores)}
        self.htmls = "\n".join(html_lines)
        return self


def generate_runs_html():
    """Render the saved runs in RUNS_FILE as an HTML history table.

    NOTE(review): the header of this function was lost in the mangled dump;
    the column set and empty-state text below come from surviving fragments.
    """
    runs = []
    if os.path.exists(RUNS_FILE):
        try:
            with open(RUNS_FILE, "r", encoding="utf-8") as f:
                runs = json.load(f)
            if not isinstance(runs, list):
                runs = []
        except (json.JSONDecodeError, ValueError):
            runs = []
    if not runs:
        return "<p>No evaluations yet.</p>"
    rows = [
        "<table border='1' cellpadding='4'>",
        "<tr><th>Timestamp</th><th>Candidate Model</th><th>System Prompt</th>"
        "<th>Eval Subset</th><th>Num Examples</th><th>Seed</th>"
        "<th>Overall Score</th></tr>",
    ]
    for r in runs:
        rows.append(
            "<tr>"
            f"<td>{html.escape(str(r.get('timestamp', '')))}</td>"
            f"<td>{html.escape(str(r.get('candidate_model', '')))}</td>"
            f"<td>{html.escape(str(r.get('system_prompt') or ''))}</td>"
            f"<td>{html.escape(str(r.get('eval_subset', '')))}</td>"
            f"<td>{'' if r.get('num_examples') is None else r['num_examples']}</td>"
            f"<td>{'' if r.get('seed_number') is None else r['seed_number']}</td>"
            f"<td>{float(r.get('overall_score', 0.0)):.3f}</td>"
            "</tr>"
        )
    rows.append("</table>")
    runs_html = "\n".join(rows)
    return runs_html


# -------------------------
# Gradio UI function
# -------------------------
def run_eval_ui(candidate_model, system_prompt, eval_subset, num_examples, seed_number):
    """Gradio callback: run one evaluation, append it to RUNS_FILE, and
    return (report HTML, metrics dict, refreshed history HTML)."""
    dataset_file = DATASET_FILES.get(eval_subset)
    if not dataset_file:
        return "Invalid dataset", {}, generate_runs_html()
    seed = int(seed_number) if seed_number else None
    n = int(num_examples) if num_examples else None
    eval_obj = HealthBenchEval(dataset_file, num_examples=n, seed=seed)
    result = eval_obj(candidate_model, system_prompt, num_examples=n)

    # Save run to local JSON (timezone-aware UTC; datetime.utcnow() is
    # deprecated since Python 3.12).
    run_record = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "candidate_model": candidate_model,
        "system_prompt": system_prompt,
        "overall_score": float(result.metrics.get("overall_score", 0.0)),
        "num_examples": n,
        "eval_subset": eval_subset,
        "seed_number": seed,
    }
    runs = []
    if os.path.exists(RUNS_FILE):
        try:
            with open(RUNS_FILE, "r", encoding="utf-8") as f:
                runs = json.load(f)
            if not isinstance(runs, list):
                runs = []
        except (json.JSONDecodeError, ValueError):
            # Corrupt history file: start fresh rather than crash the UI.
            runs = []
    runs.append(run_record)
    with open(RUNS_FILE, "w", encoding="utf-8") as f:
        json.dump(runs, f, indent=2)

    return result.htmls, result.metrics, generate_runs_html()


# -------------------------
# Gradio UI
# -------------------------
def ui():
    """Build and return the Gradio Blocks app."""
    with gr.Blocks(title="HealthBench Week 1 Evaluation") as demo:
        gr.Markdown("## HealthBench Evaluation Week 1")
        with gr.Row():
            candidate_model = gr.Dropdown(
                label="Candidate model",
                choices=CANDIDATE_MODELS,
                value="gpt-4o-mini",
            )
            eval_subset = gr.Dropdown(
                label="Eval subset",
                choices=list(DATASET_FILES.keys()),
                value="regular",
            )
            num_examples = gr.Number(
                label="# examples (leave blank for all)", value=1, precision=0
            )
            seed_number = gr.Number(label="Seed (optional)", value=None, precision=0)
        system_prompt = gr.Textbox(
            label="System Prompt (optional)",
            placeholder="Enter a system prompt here for the candidate model",
            lines=3,
        )
        run_btn = gr.Button("Run evaluation")
        output_html = gr.HTML(label="Evaluation Report")
        output_metrics = gr.JSON(label="Metrics JSON")
        output_all_runs = gr.HTML(label="Evaluation History", value=generate_runs_html())
        run_btn.click(
            fn=run_eval_ui,
            inputs=[candidate_model, system_prompt, eval_subset, num_examples, seed_number],
            outputs=[output_html, output_metrics, output_all_runs],
        )
    return demo


if __name__ == "__main__":
    demo = ui()
    demo.queue(max_size=5)
    demo.launch()