import os
import json
import re
import uuid
import random
from datetime import datetime
from io import BytesIO

import openai
import gradio as gr
import numpy as np
import google.generativeai as genai
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from PIL import Image

# -------------------------
# Config
# -------------------------
DATASET_FILES = {
    "regular": os.path.join(os.path.dirname(__file__), "data", "oss_eval.jsonl"),
    "hard": os.path.join(os.path.dirname(__file__), "data", "hard_2025-05-08-21-00-10.jsonl"),
    "consensus": os.path.join(os.path.dirname(__file__), "data", "consensus_2025-05-09-20-00-46.jsonl"),
}

CANDIDATE_MODELS = [
    "gpt-4.1",
    "gpt-4o-mini",
    "gpt-5-chat-latest",
]

GRADER_MODEL = "gpt-4o-mini"

openai.api_key = os.getenv("OPENAI_API_KEY")
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# Models that only support their default temperature; no temperature argument is sent to these.
MODEL_DEFAULT_TEMP = ["o4-mini"]


# -------------------------
# Helper to read JSONL
# -------------------------
def read_jsonl(file_path):
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line))
    return data


# -------------------------
# Evaluation class
# -------------------------
class HealthBenchEval:
    def __init__(self, dataset_file, num_examples=None, indices=None):
        self.all_data = read_jsonl(dataset_file)
        if indices is not None:
            # Reuse an explicit set of example indices (e.g. to repeat a previous run).
            self.indices = indices
            self.dataset = [self.all_data[i] for i in self.indices]
        elif num_examples:
            # Randomly sample a subset of the dataset.
            self.indices = random.sample(range(len(self.all_data)), min(num_examples, len(self.all_data)))
            self.dataset = [self.all_data[i] for i in self.indices]
        else:
            # Use the full dataset.
            self.indices = list(range(len(self.all_data)))
            self.dataset = self.all_data
        self.scores = []
        self.htmls = ""
        self.eval_id = str(uuid.uuid4())

    def score_with_grader(self, prompt_text, completion_text, example_index):
        prompt = f"""You are a grader model. Score the following completion from 0.0 to 1.0.

Prompt: {prompt_text}

Completion: {completion_text}

Return only a number between 0 and 1.
"""
        try:
            resp = openai.chat.completions.create(
                model=GRADER_MODEL,
                messages=[{"role": "user", "content": prompt}],
                max_completion_tokens=50,
            )
            score_text = resp.choices[0].message.content.strip()
            # Extract the first number between 0 and 1 from the grader's reply.
            match = re.search(r"0(?:\.\d+)?|1(?:\.0+)?", score_text)
            score = float(match.group(0)) if match else 0.0
            return max(0.0, min(1.0, score))
        except Exception as e:
            print(f"Grader error: {e}")
            return 0.0

    def generate_with_candidate(self, candidate_model, system_prompt, prompt_text, example_index, max_tokens=1024):
        # Up to three attempts per example before giving up.
        for attempt in range(3):
            try:
                if candidate_model.startswith("gemini"):
                    model = genai.GenerativeModel(candidate_model)
                    full_prompt = (
                        f"System: {system_prompt}\nUser: {prompt_text}"
                        if system_prompt
                        else f"User: {prompt_text}"
                    )
                    response = model.generate_content(
                        full_prompt,
                        generation_config={"max_output_tokens": max_tokens, "temperature": 0.7},
                    )
                    completion = response.text if response.text else "[EMPTY GEMINI OUTPUT]"
                else:
                    messages = [{"role": "system", "content": system_prompt}] if system_prompt else []
                    messages.append({"role": "user", "content": prompt_text})
                    if candidate_model in MODEL_DEFAULT_TEMP:
                        # This model only accepts its default temperature, so omit the parameter.
                        resp = openai.chat.completions.create(
                            model=candidate_model,
                            messages=messages,
                            max_completion_tokens=max_tokens,
                        )
                    else:
                        resp = openai.chat.completions.create(
                            model=candidate_model,
                            messages=messages,
                            temperature=0.7,
                            max_completion_tokens=max_tokens,
                        )
                    completion = resp.choices[0].message.content
                return completion.strip() if hasattr(completion, "strip") else completion
            except Exception as e:
                print(f"[ERROR] Candidate model {candidate_model} failed at index {example_index} (attempt {attempt+1}/3)")
                print(f"Prompt: {prompt_text[:200]}...\nError: {e}")
                if attempt == 2:
                    return f"[ERROR after 3 retries: {str(e)}]"

    def __call__(self, candidate_model, system_prompt, eval_subset=""):
        html_lines = ["
Invalid dataset", {}, None

    num_val = int(num_examples) if num_examples else None

    # Run the evaluation twice on independently sampled subsets.
    eval_obj1 = HealthBenchEval(dataset_file, num_examples=num_val)
    result1 = eval_obj1(candidate_model, system_prompt, eval_subset=eval_subset)
    eval_obj2 = HealthBenchEval(dataset_file, num_examples=num_val)
    result2 = eval_obj2(candidate_model, system_prompt, eval_subset=eval_subset)

    # t-test (Welch's, unequal variances) between the two runs' scores
    if result1.scores and result2.scores:
        t_stat, p_val = ttest_ind(result1.scores, result2.scores, equal_var=False)
    else:
        p_val = None

    html_report = f"""