import os
import json
import re
import uuid
import time
import tempfile
from datetime import datetime
import random
import csv
import io
from openai import OpenAI
import gradio as gr
import numpy as np
import google.generativeai as genai
# -------------------------
# Config
# -------------------------
PHYSICIAN_COMPLETION_MODES = {"Group 1": 1, "Group 2": 2, "Group 3": 3}  # currently unused in this app

DATASET_FILES = {
    "regular": os.path.join(os.path.dirname(__file__), "data", "oss_eval.jsonl"),
    "hard": os.path.join(os.path.dirname(__file__), "data", "hard_2025-05-08-21-00-10.jsonl"),
    "consensus": os.path.join(os.path.dirname(__file__), "data", "consensus_2025-05-09-20-00-46.jsonl"),
}

CANDIDATE_MODELS = [
    "gpt-4.1",
    "gpt-4o-mini",
    "gpt-5-chat-latest",
    "o4-mini",
]

GRADER_MODEL = "gpt-4o-mini"

openai_api_key = os.getenv("OPENAI_API_KEY")
gemini_api_key = os.getenv("GEMINI_API_KEY")
openai_client = OpenAI(api_key=openai_api_key)
genai.configure(api_key=gemini_api_key)

# Models that only accept their default temperature (no explicit temperature arg).
MODEL_DEFAULT_TEMP = ["o4-mini"]
# -------------------------
# Helper to read JSONL
# -------------------------
def read_jsonl(file_path):
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():  # skip blank lines defensively
                data.append(json.loads(line))
    return data
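# Each dataset row is assumed (illustrative shape, not verified against the
# actual data files) to carry an OpenAI-style message list under "prompt":
#   {"prompt": [{"role": "user", "content": "I have a headache..."}]}
# HealthBenchEval.__call__ below only reads the "content" fields of that list.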
# -------------------------
# Evaluation class
# -------------------------
class HealthBenchEval:
    def __init__(self, dataset_file, num_examples=None, seed=None):
        self.all_data = read_jsonl(dataset_file)
        if seed is not None and num_examples:
            # Seeded random subsample for reproducible runs.
            random.seed(seed)
            self.indices = random.sample(range(len(self.all_data)), min(num_examples, len(self.all_data)))
            self.dataset = [self.all_data[i] for i in self.indices]
        elif num_examples:
            # No seed: take the first num_examples rows (capped at dataset size
            # so indices and dataset stay the same length).
            self.dataset = self.all_data[:num_examples]
            self.indices = list(range(len(self.dataset)))
        else:
            self.indices = list(range(len(self.all_data)))
            self.dataset = self.all_data
        self.scores = []
        self.htmls = ""
        self.sample_records = []
        self.seed = seed
        self.eval_id = str(uuid.uuid4())
    def score_with_grader(self, prompt_text, completion_text, example_index):
        prompt = f"""
You are a grader model. Score the following completion from 0.0 to 1.0.
Prompt: {prompt_text}
Completion: {completion_text}
Return only a number between 0 and 1.
"""
        try:
            resp = openai_client.chat.completions.create(
                model=GRADER_MODEL,
                messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
                temperature=0,
            )
            score_text = resp.choices[0].message.content.strip()
            # Pull the first number in [0, 1] out of the grader's reply, then clamp.
            match = re.search(r"0(?:\.\d+)?|1(?:\.0+)?", score_text)
            score = float(match.group(0)) if match else 0.0
            return max(0.0, min(1.0, score))
        except Exception as e:
            print(f"Grader error: {e}")
            return 0.0
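    # Example of the parsing above: grader replies such as "0.85" or
    # "Score: 0.85" both yield 0.85 via the regex; a reply with no parseable
    # number in [0, 1] falls back to a score of 0.0.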
    def generate_with_candidate(self, candidate_model, system_prompt, prompt_text, example_index, max_tokens=1024):
        for attempt in range(3):
            try:
                if candidate_model.startswith("gemini"):
                    # Gemini: flatten system + user into a single text prompt.
                    model = genai.GenerativeModel(candidate_model)
                    full_prompt = ""
                    if system_prompt:
                        full_prompt += f"System: {system_prompt}\n"
                    full_prompt += f"User: {prompt_text}"
                    response = model.generate_content(
                        full_prompt,
                        generation_config={"max_output_tokens": max_tokens, "temperature": 0.7},
                    )
                    completion = response.text if response.text else "[EMPTY GEMINI OUTPUT]"
                elif candidate_model.startswith("o"):
                    # OpenAI o-series reasoning models: pass reasoning_effort,
                    # and skip temperature for models pinned to their default.
                    messages = []
                    if system_prompt:
                        messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
                    messages.append({"role": "user", "content": [{"type": "text", "text": prompt_text}]})
                    kwargs = {
                        "model": candidate_model,
                        "messages": messages,
                        "reasoning_effort": "medium",
                    }
                    if candidate_model not in MODEL_DEFAULT_TEMP:
                        kwargs["temperature"] = 0.7
                    resp = openai_client.chat.completions.create(**kwargs)
                    completion = resp.choices[0].message.content
                else:
                    # Standard OpenAI chat models.
                    messages = []
                    if system_prompt:
                        messages.append({"role": "system", "content": system_prompt})
                    messages.append({"role": "user", "content": prompt_text})
                    if candidate_model in MODEL_DEFAULT_TEMP:
                        resp = openai_client.chat.completions.create(
                            model=candidate_model,
                            messages=messages,
                        )
                    else:
                        resp = openai_client.chat.completions.create(
                            model=candidate_model,
                            messages=messages,
                            temperature=0.7,
                        )
                    completion = resp.choices[0].message.content
                return completion.strip() if hasattr(completion, "strip") else completion
            except Exception as e:
                print(f"[ERROR] Candidate model {candidate_model} failed at dataset index {example_index} (attempt {attempt+1}/3)")
                print(f"Prompt text: {prompt_text[:200]}...")
                print(f"Error: {e}")
                if attempt == 2:
                    # Final attempt failed: give up without sleeping again.
                    return f"[ERROR after 3 retries: {str(e)}]"
                time.sleep(2 ** attempt)  # exponential backoff: 1s, then 2s
    def __call__(self, candidate_model, system_prompt, eval_subset=""):
        html_lines = ["<h2>Evaluation Report</h2>", "<ul>"]
        cumulative_total = 0.0
        for i, example in enumerate(self.dataset):
            dataset_index = self.indices[i]
            prompt_obj = example.get("prompt", [])
            prompt_text = " ".join([m.get("content", "") for m in prompt_obj])
            completion_text = self.generate_with_candidate(candidate_model, system_prompt, prompt_text, dataset_index)
            score = self.score_with_grader(prompt_text, completion_text, dataset_index)
            cumulative_total += score
            cumulative_avg = cumulative_total / (i + 1)
            self.scores.append(score)
            html_lines.append(f"<li>Dataset Row {dataset_index}: Score = {score:.3f}</li>")
            self.sample_records.append({
                "eval_id": self.eval_id,
                "timestamp": datetime.utcnow().isoformat(),
                "candidate_model": candidate_model,
                "system_prompt": system_prompt,
                "eval_subset": eval_subset,
                "seed": self.seed,
                "dataset_index": dataset_index,
                "prompt_text": prompt_text,
                "completion_text": completion_text,
                "score": float(score),
                "cumulative_total": float(cumulative_total),
                "cumulative_avg": float(cumulative_avg),
            })
        self.htmls = "\n".join(html_lines) + "</ul>"
        return self
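# Minimal usage sketch (illustrative values; assumes the data files exist and
# both API keys are set):
#   ev = HealthBenchEval(DATASET_FILES["regular"], num_examples=2, seed=42)
#   ev("o4-mini", system_prompt="You are a careful clinician.", eval_subset="regular")
#   print(ev.scores, ev.sample_records[-1]["cumulative_avg"])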
# -------------------------
# Helpers: HTML / JSON
# -------------------------
def generate_runs_html(session_runs):
    if session_runs:
        table_rows = ""
        for r in reversed(session_runs):  # newest samples first
            table_rows += f"""
            <tr>
                <td>{r.get('eval_id','')}</td>
                <td>{r.get('timestamp','')}</td>
                <td>{r.get('candidate_model','')}</td>
                <td>{r.get('system_prompt','')}</td>
                <td>{r.get('eval_subset','')}</td>
                <td>{r.get('seed','')}</td>
                <td>{r.get('dataset_index','')}</td>
                <td>{r.get('prompt_text','')[:80]}...</td>
                <td>{(r.get('completion_text') or '').strip()[:80]}...</td>
                <td>{r.get('score',0.0):.3f}</td>
                <td>{r.get('cumulative_total',0.0):.3f}</td>
                <td>{r.get('cumulative_avg',0.0):.3f}</td>
            </tr>
            """
        runs_html = f"""
        <h3>Evaluation History (Per Sample)</h3>
        <div style="max-height:300px; overflow:auto;">
        <table border="1" style="border-collapse: collapse; padding:5px; width:100%; table-layout: fixed; word-wrap: break-word;">
            <thead>
                <tr>
                    <th>Eval ID</th>
                    <th>Timestamp</th>
                    <th>Candidate Model</th>
                    <th>System Prompt</th>
                    <th>Eval Subset</th>
                    <th>Seed</th>
                    <th>Dataset Row</th>
                    <th>Prompt Text</th>
                    <th>Completion Text</th>
                    <th>Score</th>
                    <th>Cumulative Total</th>
                    <th>Cumulative Avg</th>
                </tr>
            </thead>
            <tbody>
                {table_rows}
            </tbody>
        </table>
        </div>
        """
    else:
        runs_html = "<p>No evaluations yet.</p>"
    return runs_html
def clear_runs():
    return [], "<p>No evaluations yet.</p>"

def generate_csv(session_runs):
    if not session_runs:
        return None
    output = io.StringIO()
    fieldnames = ['eval_id', 'timestamp', 'candidate_model', 'system_prompt', 'eval_subset',
                  'seed', 'dataset_index', 'prompt_text', 'completion_text', 'score',
                  'cumulative_total', 'cumulative_avg']
    writer = csv.DictWriter(output, fieldnames=fieldnames)
    writer.writeheader()
    for run in session_runs:
        writer.writerow(run)
    csv_data = output.getvalue()
    output.close()
    return csv_data
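# Sketch of the helper's output on a populated history:
#   csv_text = generate_csv(session_runs)
#   csv_text.splitlines()[0]
#   # -> 'eval_id,timestamp,candidate_model,...,cumulative_avg'
# Note: csv.DictWriter raises ValueError on keys outside fieldnames; the
# records built in HealthBenchEval.__call__ match the list above exactly.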
# -------------------------
# Gradio UI
# -------------------------
def run_eval_ui(candidate_model, system_prompt, eval_subset, num_examples, seed, session_runs):
    dataset_file = DATASET_FILES.get(eval_subset)
    if not dataset_file:
        return "<p style='color:red'>Invalid dataset</p>", {}, generate_runs_html(session_runs), session_runs
    seed_val = int(seed) if seed else None
    num_val = int(num_examples) if num_examples else None
    eval_obj = HealthBenchEval(dataset_file, num_examples=num_val, seed=seed_val)
    result = eval_obj(candidate_model, system_prompt, eval_subset=eval_subset)
    session_runs.extend(result.sample_records)
    runs_html = generate_runs_html(session_runs)
    metrics = {
        "eval_id": result.eval_id,
        "mean_score": float(np.mean(result.scores)) if result.scores else 0.0,
        "std_score": float(np.std(result.scores)) if result.scores else 0.0,
        "n_samples": len(result.scores),
        "seed": seed_val,
    }
    return result.htmls, metrics, runs_html, session_runs
def ui():
    with gr.Blocks(title="HealthBench OpenAI + Gemini Evaluation") as demo:
        gr.Markdown("## HealthBench Evaluation (OpenAI + Gemini API-based)")
        session_runs = gr.State([])
        with gr.Row():
            candidate_model = gr.Dropdown(
                label="Candidate model",
                choices=CANDIDATE_MODELS,
                value="o4-mini",  # default
                interactive=False,  # read-only
            )
            eval_subset = gr.Dropdown(
                label="Eval subset",
                choices=list(DATASET_FILES.keys()),
                value="regular",
            )
            num_examples = gr.Number(label="# examples (leave blank for all)", value=1, precision=0)
            seed = gr.Textbox(label="Random Seed (optional)", placeholder="Enter a seed for reproducibility")
        system_prompt = gr.Textbox(
            label="System Prompt (optional)",
            placeholder="Enter a system prompt here for the candidate model",
            lines=3,
        )
        run_btn = gr.Button("Run evaluation")
        output_html = gr.HTML(label="Evaluation Report")
        output_metrics = gr.JSON(label="Metrics JSON")
        output_all_runs = gr.HTML(label="Evaluation History", value="<p>No evaluations yet.</p>")
        with gr.Row():
            clear_btn = gr.Button("Clear History")
            download_btn = gr.DownloadButton(
                label="Download CSV",
                variant="secondary",
            )

        run_btn.click(
            fn=run_eval_ui,
            inputs=[candidate_model, system_prompt, eval_subset, num_examples, seed, session_runs],
            outputs=[output_html, output_metrics, output_all_runs, session_runs],
        )

        def clear_and_update(session_runs):
            new_runs, html = clear_runs()
            return new_runs, html

        clear_btn.click(
            fn=clear_and_update,
            inputs=[session_runs],
            outputs=[session_runs, output_all_runs],
        )

        # CSV download with a timestamped filename: write the CSV to a temp
        # file and return its path as the DownloadButton value.
        def prepare_download(session_runs):
            csv_data = generate_csv(session_runs)
            if not csv_data:
                return None
            filename = f"eval_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
            # tempfile.gettempdir() rather than a hard-coded /tmp, so the
            # download also works outside Linux.
            filepath = os.path.join(tempfile.gettempdir(), filename)
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(csv_data)
            return filepath

        download_btn.click(
            fn=prepare_download,
            inputs=[session_runs],
            outputs=[download_btn],
        )
    return demo

if __name__ == "__main__":
    demo = ui()
    demo.queue(max_size=5)
    demo.launch()
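# To run locally (filename assumed; HF Spaces typically serve this as app.py):
#   export OPENAI_API_KEY=sk-...
#   export GEMINI_API_KEY=...
#   python app.py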