import os
import json
import re
import html
from datetime import datetime, timezone

import openai
import gradio as gr
import numpy as np
import google.generativeai as genai

# -------------------------
# Config
# -------------------------
PHYSICIAN_COMPLETION_MODES = {"Group 1": 1, "Group 2": 2, "Group 3": 3}

DATASET_FILES = {
    "regular": os.path.join(os.path.dirname(__file__), "data", "oss_eval.jsonl"),
    "hard": os.path.join(os.path.dirname(__file__), "data", "hard_2025-05-08-21-00-10.jsonl"),
    "consensus": os.path.join(os.path.dirname(__file__), "data", "consensus_2025-05-09-20-00-46.jsonl"),
}
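# Each JSONL record in these files is expected to carry a "prompt" field:
# a list of {"role": ..., "content": ...} chat messages (see HealthBenchEval.__call__).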

CANDIDATE_MODELS = [
    "gpt-4.1",
    "gpt-4.1-mini",
    "gpt-4o",
    "gpt-4o-mini",
    "gpt-5",
    "gemini-2.5-pro",
    "gemini-2.5-flash",
    "gemini-2.5-flash-lite",
]

GRADER_MODEL = "gpt-4o-mini"

openai.api_key = os.getenv("OPENAI_API_KEY")
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
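
# Models that are called at their default sampling temperature
# (no explicit temperature argument is passed for these).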
MODEL_DEFAULT_TEMP = ["gpt-4.1", "gpt-4o", "gpt-5"]

# Local JSON file for storing runs
RUNS_FILE = "runs.json"
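# Each saved run record holds: timestamp, candidate_model, system_prompt,
# eval_subset, num_examples, seed_number, and overall_score.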


# -------------------------
# Helper to read JSONL
# -------------------------
def read_jsonl(file_path, num_examples=None, seed=None):
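    """Load a JSONL file into a list of dicts, optionally subsampling.

    When num_examples is set, that many records are drawn without replacement;
    pass seed for a reproducible draw.
    """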
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():  # skip blank lines
                data.append(json.loads(line))
    if num_examples:
        rng = np.random.default_rng(seed)
        idxs = rng.choice(len(data), size=min(num_examples, len(data)), replace=False)
        data = [data[i] for i in idxs]
    return data


# -------------------------
# Evaluation class
# -------------------------
class HealthBenchEval:
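    """Run a candidate model over a HealthBench subset and grade each completion.

    Scores come from GRADER_MODEL as scalars in [0, 1]; metrics aggregate the
    mean and standard deviation across examples.
    """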

    def __init__(self, dataset_file, num_examples=None, seed=None):
        self.dataset = read_jsonl(dataset_file, num_examples, seed)
        self.metrics = {}
        self.htmls = ""
        self.convos = []

    def score_with_grader(self, prompt_text, completion_text, example_index):
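        """Ask the grader model for a 0.0-1.0 score of a single completion."""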
        prompt = f"""
You are a grader model. Score the following completion from 0.0 to 1.0.
Prompt: {prompt_text}
Completion: {completion_text}
Return only a number between 0 and 1.
"""
        try:
            resp = openai.chat.completions.create(
                model=GRADER_MODEL,
                messages=[{"role": "user", "content": prompt}],
                max_completion_tokens=50,
            )
            score_text = resp.choices[0].message.content.strip()
            # Pull the first numeric token from the reply and clamp it to [0, 1].
            match = re.search(r"\d*\.?\d+", score_text)
            score = float(match.group(0)) if match else 0.0
            return max(0.0, min(1.0, score))
        except Exception as e:
            print(f"Grader error on example {example_index + 1}: {e}")
            return 0.0

    def generate_with_candidate(self, candidate_model, system_prompt, prompt_text, example_index, max_tokens=256):
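        """Generate a completion from the candidate model (OpenAI chat or Gemini)."""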
        try:
            if candidate_model.startswith("gemini"):
                # Gemini takes a single string, so fold the system prompt in.
                model = genai.GenerativeModel(candidate_model)
                full_prompt = ""
                if system_prompt:
                    full_prompt += f"System: {system_prompt}\n"
                full_prompt += f"User: {prompt_text}"
                response = model.generate_content(
                    full_prompt,
                    generation_config={"max_output_tokens": max_tokens, "temperature": 0.7},
                )
                completion = response.text
            else:
                messages = []
                if system_prompt:
                    messages.append({"role": "system", "content": system_prompt})
                messages.append({"role": "user", "content": prompt_text})
                if candidate_model in MODEL_DEFAULT_TEMP:
                    # Call these models at their default temperature.
                    resp = openai.chat.completions.create(
                        model=candidate_model,
                        messages=messages,
                        max_completion_tokens=max_tokens,
                    )
                else:
                    resp = openai.chat.completions.create(
                        model=candidate_model,
                        messages=messages,
                        temperature=0.7,
                        max_completion_tokens=max_tokens,
                    )
                completion = resp.choices[0].message.content
            return completion.strip() if isinstance(completion, str) else ""
        except Exception as e:
            print(f"Candidate model error for example {example_index + 1}: {e}")
            return ""

    def __call__(self, candidate_model, system_prompt, num_examples=None):
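        """Run the candidate over (a slice of) the dataset, grade each completion, and build the HTML report."""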
        scores = []
        html_lines = ["<h2>Evaluation Report</h2>", "<ul>"]
        examples = self.dataset[:num_examples] if num_examples else self.dataset
        for i, example in enumerate(examples):
            prompt_obj = example.get("prompt", [])
            prompt_text = " ".join(m.get("content", "") for m in prompt_obj)
            completion_text = self.generate_with_candidate(candidate_model, system_prompt, prompt_text, i)
            score = self.score_with_grader(prompt_text, completion_text, i)
            scores.append(score)
            html_lines.append(f"<li>Example {i + 1}: Score = {score:.3f}</li>")
        mean_score = float(np.mean(scores)) if scores else 0.0
        std_score = float(np.std(scores)) if scores else 0.0
        self.metrics = {
            "overall_score": mean_score,
            "overall_score:n_samples": len(scores),
            "overall_score:std": std_score,
        }
        self.htmls = "\n".join(html_lines) + "</ul>"
        return self


# -------------------------
# Helpers for run history
# -------------------------
def load_runs():
    """Read saved run records from RUNS_FILE, tolerating a missing or corrupt file."""
    runs = []
    if os.path.exists(RUNS_FILE):
        try:
            with open(RUNS_FILE, "r", encoding="utf-8") as f:
                runs = json.load(f)
            if not isinstance(runs, list):
                runs = []
        except (json.JSONDecodeError, ValueError):
            runs = []
    return runs


def generate_runs_html():
    """Render the saved runs as an HTML table, newest first."""
    runs = load_runs()
    if not runs:
        return "<p>No evaluations yet.</p>"
    table_rows = ""
    for r in reversed(runs):
        table_rows += f"""
        <tr>
            <td>{r.get('timestamp', '')}</td>
            <td>{r.get('candidate_model', '')}</td>
            <td>{html.escape(str(r.get('system_prompt', '') or ''))}</td>
            <td>{r.get('eval_subset', '')}</td>
            <td>{r.get('num_examples', '')}</td>
            <td>{r.get('seed_number', '')}</td>
            <td>{r.get('overall_score', 0.0):.3f}</td>
        </tr>
        """
    return f"""
    <h3>Evaluation History</h3>
    <div style="max-height:300px; overflow:auto;">
      <table border="1" style="border-collapse: collapse; padding:5px; width:100%;">
        <thead>
          <tr>
            <th>Timestamp</th>
            <th>Candidate Model</th>
            <th>System Prompt</th>
            <th>Eval Subset</th>
            <th>Num Examples</th>
            <th>Seed</th>
            <th>Overall Score</th>
          </tr>
        </thead>
        <tbody>
          {table_rows}
        </tbody>
      </table>
    </div>
    """


# -------------------------
# Gradio UI function
# -------------------------
def run_eval_ui(candidate_model, system_prompt, eval_subset, num_examples, seed_number):
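    """Gradio callback: run one evaluation, persist the run record, and refresh the history table."""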
    dataset_file = DATASET_FILES.get(eval_subset)
    if not dataset_file:
        return "<p style='color:red'>Invalid dataset</p>", {}, generate_runs_html()
    seed = int(seed_number) if seed_number else None
    n = int(num_examples) if num_examples else None
    eval_obj = HealthBenchEval(dataset_file, num_examples=n, seed=seed)
    result = eval_obj(candidate_model, system_prompt, num_examples=n)

    # Append this run to the local JSON history
    run_record = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "candidate_model": candidate_model,
        "system_prompt": system_prompt,
        "overall_score": float(result.metrics.get("overall_score", 0.0)),
        "num_examples": n,
        "eval_subset": eval_subset,
        "seed_number": seed,
    }
    runs = load_runs()
    runs.append(run_record)
    with open(RUNS_FILE, "w", encoding="utf-8") as f:
        json.dump(runs, f, indent=2)

    return result.htmls, result.metrics, generate_runs_html()


# -------------------------
# Gradio UI
# -------------------------
def ui():
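    """Build the Gradio Blocks interface."""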
    with gr.Blocks(title="HealthBench Week 1 Evaluation") as demo:
        gr.Markdown("## HealthBench Evaluation Week 1")
        with gr.Row():
            candidate_model = gr.Dropdown(
                label="Candidate model",
                choices=CANDIDATE_MODELS,
                value="gpt-4o-mini",
            )
            eval_subset = gr.Dropdown(
                label="Eval subset",
                choices=list(DATASET_FILES.keys()),
                value="regular",
            )
            num_examples = gr.Number(label="# examples (leave blank for all)", value=1, precision=0)
            seed_number = gr.Number(label="Seed (optional)", value=None, precision=0)
        system_prompt = gr.Textbox(
            label="System Prompt (optional)",
            placeholder="Enter a system prompt here for the candidate model",
            lines=3,
        )
        run_btn = gr.Button("Run evaluation")
        output_html = gr.HTML(label="Evaluation Report")
        output_metrics = gr.JSON(label="Metrics JSON")
        output_all_runs = gr.HTML(label="Evaluation History", value=generate_runs_html())
        run_btn.click(
            fn=run_eval_ui,
            inputs=[candidate_model, system_prompt, eval_subset, num_examples, seed_number],
            outputs=[output_html, output_metrics, output_all_runs],
        )
    return demo


if __name__ == "__main__":
    demo = ui()
    demo.queue(max_size=5)
    demo.launch()