import json
import os
import random
import re
import uuid
from io import BytesIO

import google.generativeai as genai
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import openai
from PIL import Image
from scipy.stats import ttest_ind
# -------------------------
# Config
# -------------------------
DATASET_FILES = {
    "regular": os.path.join(os.path.dirname(__file__), "data", "oss_eval.jsonl"),
    "hard": os.path.join(os.path.dirname(__file__), "data", "hard_2025-05-08-21-00-10.jsonl"),
    "consensus": os.path.join(os.path.dirname(__file__), "data", "consensus_2025-05-09-20-00-46.jsonl"),
}

CANDIDATE_MODELS = [
    "gpt-4.1",
    "gpt-4o-mini",
    "gpt-5-chat-latest",
]

GRADER_MODEL = "gpt-4o-mini"

openai.api_key = os.getenv("OPENAI_API_KEY")
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# Models that only accept their default sampling temperature; for these we
# omit the `temperature` argument when calling the API.
MODEL_DEFAULT_TEMP = ["o4-mini"]
# -------------------------
# Helper to read JSONL
# -------------------------
def read_jsonl(file_path):
    """Read a JSONL file into a list of dicts, skipping blank lines."""
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():  # tolerate trailing/blank lines
                data.append(json.loads(line))
    return data
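
# Example usage (a sketch -- assumes each JSONL row carries a "prompt" field
# holding a list of chat messages, as consumed in HealthBenchEval.__call__):
#   rows = read_jsonl(DATASET_FILES["regular"])
#   print(rows[0]["prompt"][0]["content"])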
# -------------------------
# Evaluation class
# -------------------------
class HealthBenchEval:
    def __init__(self, dataset_file, num_examples=None, indices=None):
        self.all_data = read_jsonl(dataset_file)
        if indices is not None:
            # Evaluate a caller-specified subset of rows.
            self.indices = indices
            self.dataset = [self.all_data[i] for i in self.indices]
        elif num_examples:
            # Draw a random sample without replacement.
            self.indices = random.sample(range(len(self.all_data)), min(num_examples, len(self.all_data)))
            self.dataset = [self.all_data[i] for i in self.indices]
        else:
            # Use the full dataset.
            self.indices = list(range(len(self.all_data)))
            self.dataset = self.all_data
        self.scores = []
        self.htmls = ""
        self.eval_id = str(uuid.uuid4())
    def score_with_grader(self, prompt_text, completion_text, example_index):
        """Ask the grader model for a 0.0-1.0 score and parse it from the reply."""
        prompt = f"""
You are a grader model. Score the following completion from 0.0 to 1.0.
Prompt: {prompt_text}
Completion: {completion_text}
Return only a number between 0 and 1.
"""
        try:
            resp = openai.chat.completions.create(
                model=GRADER_MODEL,
                messages=[{"role": "user", "content": prompt}],
                max_completion_tokens=50,
            )
            score_text = resp.choices[0].message.content.strip()
            # Extract the first number in [0, 1]; default to 0.0 if none parses.
            match = re.search(r"0(?:\.\d+)?|1(?:\.0+)?", score_text)
            score = float(match.group(0)) if match else 0.0
            return max(0.0, min(1.0, score))
        except Exception as e:
            print(f"Grader error: {e}")
            return 0.0
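
    # A quick check of the score-extraction regex used above (illustrative only):
    #   >>> re.search(r"0(?:\.\d+)?|1(?:\.0+)?", "Score: 0.85").group(0)
    #   '0.85'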
    def generate_with_candidate(self, candidate_model, system_prompt, prompt_text, example_index, max_tokens=1024):
        """Generate a completion from the candidate model, retrying up to 3 times."""
        for attempt in range(3):
            try:
                if candidate_model.startswith("gemini"):
                    model = genai.GenerativeModel(candidate_model)
                    # Gemini takes a single prompt string, so fold the system
                    # prompt into it.
                    full_prompt = f"System: {system_prompt}\nUser: {prompt_text}" if system_prompt else f"User: {prompt_text}"
                    response = model.generate_content(
                        full_prompt,
                        generation_config={"max_output_tokens": max_tokens, "temperature": 0.7},
                    )
                    completion = response.text if response.text else "[EMPTY GEMINI OUTPUT]"
                else:
                    messages = [{"role": "system", "content": system_prompt}] if system_prompt else []
                    messages.append({"role": "user", "content": prompt_text})
                    if candidate_model in MODEL_DEFAULT_TEMP:
                        # These models reject a custom temperature, so omit it.
                        resp = openai.chat.completions.create(
                            model=candidate_model,
                            messages=messages,
                            max_completion_tokens=max_tokens,
                        )
                    else:
                        resp = openai.chat.completions.create(
                            model=candidate_model,
                            messages=messages,
                            temperature=0.7,
                            max_completion_tokens=max_tokens,
                        )
                    completion = resp.choices[0].message.content
                return completion.strip() if hasattr(completion, "strip") else completion
            except Exception as e:
                print(f"[ERROR] Candidate model {candidate_model} failed at index {example_index} (attempt {attempt+1}/3)")
                print(f"Prompt: {prompt_text[:200]}...\nError: {e}")
                if attempt == 2:
                    return f"[ERROR after 3 retries: {str(e)}]"
    def __call__(self, candidate_model, system_prompt, eval_subset=""):
        """Run generation + grading over the selected rows; returns self."""
        html_lines = ["<ul>"]
        for i, example in enumerate(self.dataset):
            dataset_index = self.indices[i]
            prompt_obj = example.get("prompt", [])
            # Flatten the chat-message list into a single prompt string.
            prompt_text = " ".join([m.get("content", "") for m in prompt_obj])
            completion_text = self.generate_with_candidate(candidate_model, system_prompt, prompt_text, dataset_index)
            score = self.score_with_grader(prompt_text, completion_text, dataset_index)
            self.scores.append(score)
            html_lines.append(f"<li>Dataset Row {dataset_index}: Score = {score:.3f}</li>")
        self.htmls = "\n".join(html_lines) + "</ul>"
        return self
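
# Example end-to-end run outside the UI (a sketch, not part of the app flow):
#   ev = HealthBenchEval(DATASET_FILES["regular"], num_examples=3)
#   ev("gpt-4o-mini", "You are a careful clinician.")
#   print(ev.scores)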
# -------------------------
# Helper to plot distributions
# -------------------------
def plot_score_distributions(scores1, scores2):
    """Overlay histograms of two score samples and return them as a PIL image."""
    plt.figure(figsize=(6, 4))
    plt.hist(scores1, bins=10, alpha=0.6, label="Sample 1")
    plt.hist(scores2, bins=10, alpha=0.6, label="Sample 2")
    plt.xlabel("Score")
    plt.ylabel("Frequency")
    plt.title("Score Distributions")
    plt.legend()
    buf = BytesIO()
    plt.savefig(buf, format="png")
    plt.close()
    buf.seek(0)
    # Convert BytesIO to PIL Image so Gradio can display it directly.
    img = Image.open(buf)
    return img
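
# Quick visual check (a sketch using synthetic scores):
#   img = plot_score_distributions(np.random.rand(20).tolist(), np.random.rand(20).tolist())
#   img.save("dist.png")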
# -------------------------
# Gradio evaluation function
# -------------------------
def run_eval_ui(candidate_model, system_prompt, eval_subset, num_examples):
    """Evaluate two independent random samples and compare them with a t-test."""
    dataset_file = DATASET_FILES.get(eval_subset)
    if not dataset_file:
        return "<p style='color:red'>Invalid dataset</p>", {}, None
    num_val = int(num_examples) if num_examples else None

    eval_obj1 = HealthBenchEval(dataset_file, num_examples=num_val)
    result1 = eval_obj1(candidate_model, system_prompt, eval_subset=eval_subset)
    eval_obj2 = HealthBenchEval(dataset_file, num_examples=num_val)
    result2 = eval_obj2(candidate_model, system_prompt, eval_subset=eval_subset)

    # Welch's t-test (unequal variances) on the two score samples.
    if result1.scores and result2.scores:
        t_stat, p_val = ttest_ind(result1.scores, result2.scores, equal_var=False)
    else:
        p_val = None

    html_report = f"""
<h2>Evaluation Report (Two Random Samples)</h2>
<h3>Sample 1</h3>
{result1.htmls}
<h3>Sample 2</h3>
{result2.htmls}
"""
    metrics = {
        "eval_id_1": result1.eval_id,
        "eval_id_2": result2.eval_id,
        "mean_score_sample1": float(np.mean(result1.scores)) if result1.scores else 0.0,
        "mean_score_sample2": float(np.mean(result2.scores)) if result2.scores else 0.0,
        "std_score_sample1": float(np.std(result1.scores)) if result1.scores else 0.0,
        "std_score_sample2": float(np.std(result2.scores)) if result2.scores else 0.0,
        # Report the actual sample size (num_val is None when all rows are used).
        "n_samples_each": len(result1.scores),
        "p_value": float(p_val) if p_val is not None else None,
    }
    # Generate the distribution plot.
    plot_img = plot_score_distributions(result1.scores, result2.scores)
    return html_report, metrics, plot_img
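
# Note on interpretation: the two samples are drawn independently from the same
# dataset with the same model, so the Welch t-test p-value is best read as a
# stability check -- a small p-value means the two runs' mean scores differ by
# more than sampling noise alone would suggest.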
# -------------------------
# Gradio UI
# -------------------------
def ui():
    with gr.Blocks(title="HealthBench Evaluation with T-Test & Plot") as demo:
        gr.Markdown("## HealthBench Evaluation (Two Random Samples + T-Test + Plot)")
        with gr.Row():
            candidate_model = gr.Dropdown(
                label="Candidate model",
                choices=CANDIDATE_MODELS,
                value="gpt-4o-mini",
            )
            eval_subset = gr.Dropdown(
                label="Eval subset",
                choices=list(DATASET_FILES.keys()),
                value="regular",
            )
            num_examples = gr.Number(label="# examples per sample (leave blank for all)", value=5, precision=0)
        system_prompt = gr.Textbox(
            label="System Prompt (optional)",
            placeholder="Enter a system prompt here for the candidate model",
            lines=3,
        )
        run_btn = gr.Button("Run evaluation")
        output_html = gr.HTML(label="Evaluation Report")
        output_metrics = gr.JSON(label="Metrics JSON (with p-value)")
        output_plot = gr.Image(label="Score Distributions")
        run_btn.click(
            fn=run_eval_ui,
            inputs=[candidate_model, system_prompt, eval_subset, num_examples],
            outputs=[output_html, output_metrics, output_plot],
        )
    return demo
if __name__ == "__main__":
    demo = ui()
    demo.queue(max_size=5)
    demo.launch()
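
# To run locally (assuming this file is saved as app.py, with the data/
# directory alongside it and both API keys exported):
#   OPENAI_API_KEY=... GEMINI_API_KEY=... python app.py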