admin-healthelic committed
Commit 3eafae5 · verified · 1 Parent(s): 43ad69c

Upload 5 files

Files changed (5)
  1. README.md +4 -3
  2. app.py +354 -0
  3. postBuild +3 -0
  4. requirements.txt +11 -0
  5. runs.json +1 -0
README.md CHANGED
@@ -1,12 +1,13 @@
  ---
  title: HealthBenchAdvancedDemo
- emoji: 💻
+ emoji: 🏢
  colorFrom: gray
- colorTo: red
+ colorTo: blue
  sdk: gradio
- sdk_version: 5.44.1
+ sdk_version: 5.42.0
  app_file: app.py
  pinned: false
+ license: mit
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
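Applied, this hunk leaves the Space's front matter as:

    ---
    title: HealthBenchAdvancedDemo
    emoji: 🏢
    colorFrom: gray
    colorTo: blue
    sdk: gradio
    sdk_version: 5.42.0
    app_file: app.py
    pinned: false
    license: mit
    ---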
app.py CHANGED
@@ -0,0 +1,354 @@
+ import os
+ import json
+ import re
+ import uuid
+ from datetime import datetime
+ import openai
+ import gradio as gr
+ import numpy as np
+ import google.generativeai as genai
+ import random
+
+ # -------------------------
+ # Config
+ # -------------------------
+ PHYSICIAN_COMPLETION_MODES = {"Group 1": 1, "Group 2": 2, "Group 3": 3}
+
+ DATASET_FILES = {
+     "regular": os.path.join(os.path.dirname(__file__), "data", "oss_eval.jsonl"),
+     "hard": os.path.join(os.path.dirname(__file__), "data", "hard_2025-05-08-21-00-10.jsonl"),
+     "consensus": os.path.join(os.path.dirname(__file__), "data", "consensus_2025-05-09-20-00-46.jsonl"),
+ }
+
+ CANDIDATE_MODELS = [
+     "gpt-4.1",
+     "gpt-4o-mini",
+     "gpt-5-chat-latest"
+ ]
+
+ GRADER_MODEL = "gpt-4o-mini"
+
+ openai.api_key = os.getenv("OPENAI_API_KEY")
+ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
+
+ # Models that only support default temperature (don’t allow setting temperature manually)
+ MODEL_DEFAULT_TEMP = ["o4-mini"]
+
+ # Local JSON file for storing runs
+ RUNS_FILE = "/data/runs2.json"
+
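Note: RUNS_FILE points at /data/runs2.json, which exists on Spaces only when persistent storage is enabled, while the repo itself ships a runs.json. A guarded fallback would be one way to cover both cases (a hypothetical tweak, not part of this commit):

    # Hypothetical: fall back to a repo-local file when /data is unavailable.
    RUNS_FILE = "/data/runs2.json" if os.path.isdir("/data") else "runs.json"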
+ # -------------------------
+ # Helper to read JSONL
+ # -------------------------
+ def read_jsonl(file_path):
+     data = []
+     with open(file_path, "r", encoding="utf-8") as f:
+         for line in f:
+             data.append(json.loads(line))
+     return data
+
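read_jsonl expects one JSON object per line, and __call__ below reads each record's "prompt" field as a chat-style message list. A minimal sketch of one record (contents invented for illustration):

    {"prompt": [{"role": "user", "content": "My father has chest pain. What should we do?"}]}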
+ # -------------------------
+ # Evaluation class
+ # -------------------------
+ class HealthBenchEval:
+     def __init__(self, dataset_file, num_examples=None, seed=None):
+         self.all_data = read_jsonl(dataset_file)
+
+         if seed is not None and num_examples:
+             random.seed(seed)
+             self.indices = random.sample(range(len(self.all_data)), min(num_examples, len(self.all_data)))
+             self.dataset = [self.all_data[i] for i in self.indices]
+         elif num_examples:
+             self.indices = list(range(num_examples))
+             self.dataset = self.all_data[:num_examples]
+         else:
+             self.indices = list(range(len(self.all_data)))
+             self.dataset = self.all_data
+
+         self.scores = []
+         self.htmls = ""
+         self.sample_records = []  # <-- store per-sample rows
+         self.seed = seed
+         self.eval_id = str(uuid.uuid4())  # unique id per evaluation run
+
+     def score_with_grader(self, prompt_text, completion_text, example_index):
+         prompt = f"""
+ You are a grader model. Score the following completion from 0.0 to 1.0.
+ Prompt: {prompt_text}
+ Completion: {completion_text}
+ Return only a number between 0 and 1.
+ """
+         try:
+             resp = openai.chat.completions.create(
+                 model=GRADER_MODEL,
+                 messages=[{"role": "user", "content": prompt}],
+                 max_completion_tokens=50
+             )
+             score_text = resp.choices[0].message.content.strip()
+             match = re.search(r"0(?:\.\d+)?|1(?:\.0+)?", score_text)
+             if match:
+                 score = float(match.group(0))
+             else:
+                 score = 0.0
+             return max(0.0, min(1.0, score))
+         except Exception as e:
+             print(f"Grader error: {e}")
+             return 0.0
+
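The regex pulls the first decimal in [0, 1] out of the grader's reply, so trailing prose is tolerated; for example (hypothetical REPL session):

    >>> import re
    >>> re.search(r"0(?:\.\d+)?|1(?:\.0+)?", "Score: 0.85 overall").group(0)
    '0.85'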
+     def generate_with_candidate(self, candidate_model, system_prompt, prompt_text, example_index, max_tokens=1024):
+         """
+         Generate completion with retry logic and better error logging.
+         """
+         for attempt in range(3):  # retry up to 3 times
+             try:
+                 if candidate_model.startswith("gemini"):
+                     model = genai.GenerativeModel(candidate_model)
+                     full_prompt = ""
+                     if system_prompt:
+                         full_prompt += f"System: {system_prompt}\n"
+                     full_prompt += f"User: {prompt_text}"
+
+                     response = model.generate_content(
+                         full_prompt,
+                         generation_config={"max_output_tokens": max_tokens, "temperature": 0.7}
+                     )
+                     completion = response.text if response.text else "[EMPTY GEMINI OUTPUT]"
+                 else:
+                     messages = []
+                     if system_prompt:
+                         messages.append({"role": "system", "content": system_prompt})
+                     messages.append({"role": "user", "content": prompt_text})
+
+                     # Skip temperature for models that don't support it
+                     if candidate_model in MODEL_DEFAULT_TEMP:
+                         resp = openai.chat.completions.create(
+                             model=candidate_model,
+                             messages=messages,
+                             max_completion_tokens=max_tokens
+                         )
+                     else:
+                         resp = openai.chat.completions.create(
+                             model=candidate_model,
+                             messages=messages,
+                             temperature=0.7,
+                             max_completion_tokens=max_tokens
+                         )
+                     completion = resp.choices[0].message.content
+                     print(resp)
+
+                 return completion.strip() if hasattr(completion, "strip") else completion
+
+             except Exception as e:
+                 print(f"[ERROR] Candidate model {candidate_model} failed at dataset index {example_index} (attempt {attempt+1}/3)")
+                 print(f"Prompt text: {prompt_text[:200]}...")
+                 print(f"Error: {e}")
+                 if attempt == 2:  # after last attempt
+                     return f"[ERROR after 3 retries: {str(e)}]"
+
+     def __call__(self, candidate_model, system_prompt, eval_subset=""):
+         html_lines = ["<h2>Evaluation Report</h2>", "<ul>"]
+
+         cumulative_total = 0.0
+         for i, example in enumerate(self.dataset):
+             dataset_index = self.indices[i]  # actual dataset row index
+             prompt_obj = example.get("prompt", [])
+             prompt_text = " ".join([m.get("content", "") for m in prompt_obj])
+
+             completion_text = self.generate_with_candidate(candidate_model, system_prompt, prompt_text, dataset_index)
+             score = self.score_with_grader(prompt_text, completion_text, dataset_index)
+
+             # update running totals (per eval_id)
+             cumulative_total += score
+             cumulative_avg = cumulative_total / (i + 1)
+
+             self.scores.append(score)
+             html_lines.append(f"<li>Dataset Row {dataset_index}: Score = {score:.3f}</li>")
+
+             # create individual sample record
+             self.sample_records.append({
+                 "eval_id": self.eval_id,
+                 "timestamp": datetime.utcnow().isoformat(),
+                 "candidate_model": candidate_model,
+                 "system_prompt": system_prompt,
+                 "eval_subset": eval_subset,
+                 "seed": self.seed,
+                 "dataset_index": dataset_index,
+                 "prompt_text": prompt_text,
+                 "completion_text": completion_text,
+                 "score": float(score),
+                 "cumulative_total": float(cumulative_total),
+                 "cumulative_avg": float(cumulative_avg)
+             })
+
+         self.htmls = "\n".join(html_lines) + "</ul>"
+         return self
+
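For reference, the class can also be driven headlessly, outside Gradio (a sketch; assumes OPENAI_API_KEY is set and the data files exist):

    from app import HealthBenchEval, DATASET_FILES

    ev = HealthBenchEval(DATASET_FILES["regular"], num_examples=2, seed=42)
    result = ev("gpt-4o-mini", system_prompt="", eval_subset="regular")
    print(result.scores)          # per-sample grader scores
    print(result.sample_records)  # the rows run_eval_ui persists to RUNS_FILE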
+ # -------------------------
+ # Helper to generate HTML table from runs
+ # -------------------------
+ def generate_runs_html():
+     runs = []
+     if os.path.exists(RUNS_FILE):
+         try:
+             with open(RUNS_FILE, "r", encoding="utf-8") as f:
+                 runs = json.load(f)
+             if not isinstance(runs, list):
+                 runs = []
+         except (json.JSONDecodeError, ValueError):
+             runs = []
+
+     if runs:
+         table_rows = ""
+         for r in reversed(runs):
+             table_rows += f"""
+             <tr>
+                 <td>{r.get('eval_id','')}</td>
+                 <td>{r.get('timestamp','')}</td>
+                 <td>{r.get('candidate_model','')}</td>
+                 <td>{r.get('system_prompt','')}</td>
+                 <td>{r.get('eval_subset','')}</td>
+                 <td>{r.get('seed','')}</td>
+                 <td>{r.get('dataset_index','')}</td>
+                 <td>{r.get('prompt_text','')[:80]}...</td>
+                 <td>{(r.get('completion_text') or '').strip()[:80]}...</td>
+                 <td>{r.get('score',0.0):.3f}</td>
+                 <td>{r.get('cumulative_total',0.0):.3f}</td>
+                 <td>{r.get('cumulative_avg',0.0):.3f}</td>
+             </tr>
+             """
+         runs_html = f"""
+         <h3>Evaluation History (Per Sample)</h3>
+         <div style="max-height:300px; overflow:auto;">
+         <table border="1" style="border-collapse: collapse; padding:5px; width:100%; table-layout: fixed; word-wrap: break-word;">
+             <thead>
+                 <tr>
+                     <th>Eval ID</th>
+                     <th>Timestamp</th>
+                     <th>Candidate Model</th>
+                     <th>System Prompt</th>
+                     <th>Eval Subset</th>
+                     <th>Seed</th>
+                     <th>Dataset Row</th>
+                     <th>Prompt Text</th>
+                     <th>Completion Text</th>
+                     <th>Score</th>
+                     <th>Cumulative Total</th>
+                     <th>Cumulative Avg</th>
+                 </tr>
+             </thead>
+             <tbody>
+                 {table_rows}
+             </tbody>
+         </table>
+         </div>
+         """
+     else:
+         runs_html = "<p>No evaluations yet.</p>"
+
+     return runs_html
+
+ # -------------------------
+ # Clear runs file
+ # -------------------------
+ def clear_runs():
+     with open(RUNS_FILE, "w", encoding="utf-8") as f:
+         json.dump([], f, indent=2)
+     return "<p>No evaluations yet.</p>"
+
+ # -------------------------
+ # Gradio UI function
+ # -------------------------
+ def run_eval_ui(candidate_model, system_prompt, eval_subset, num_examples, seed):
+     dataset_file = DATASET_FILES.get(eval_subset)
+     if not dataset_file:
+         return "<p style='color:red'>Invalid dataset</p>", {}, generate_runs_html()
+
+     seed_val = int(seed) if seed else None
+     num_val = int(num_examples) if num_examples else None
+
+     eval_obj = HealthBenchEval(dataset_file, num_examples=num_val, seed=seed_val)
+     result = eval_obj(candidate_model, system_prompt, eval_subset=eval_subset)
+
+     # Load existing runs
+     runs = []
+     if os.path.exists(RUNS_FILE):
+         try:
+             with open(RUNS_FILE, "r", encoding="utf-8") as f:
+                 runs = json.load(f)
+             if not isinstance(runs, list):
+                 runs = []
+         except (json.JSONDecodeError, ValueError):
+             runs = []
+
+     runs.extend(result.sample_records)
+
+     with open(RUNS_FILE, "w", encoding="utf-8") as f:
+         json.dump(runs, f, indent=2)
+
+     runs_html = generate_runs_html()
+
+     metrics = {
+         "eval_id": result.eval_id,
+         "mean_score": float(np.mean(result.scores)) if result.scores else 0.0,
+         "std_score": float(np.std(result.scores)) if result.scores else 0.0,
+         "n_samples": len(result.scores),
+         "seed": seed_val
+     }
+
+     return result.htmls, metrics, runs_html
+
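So the metrics JSON handed to the UI has this shape (values invented for illustration):

    {
        "eval_id": "5b1e7c2a-...",
        "mean_score": 0.75,
        "std_score": 0.083,
        "n_samples": 2,
        "seed": 42
    }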
+ # -------------------------
+ # Gradio UI
+ # -------------------------
+ def ui():
+     with gr.Blocks(title="HealthBench OpenAI + Gemini Evaluation") as demo:
+         gr.Markdown("## HealthBench Evaluation (OpenAI + Gemini API-based)")
+
+         with gr.Row():
+             candidate_model = gr.Dropdown(
+                 label="Candidate model",
+                 choices=CANDIDATE_MODELS,
+                 value="gpt-4o-mini",
+             )
+             eval_subset = gr.Dropdown(
+                 label="Eval subset",
+                 choices=list(DATASET_FILES.keys()),
+                 value="regular"
+             )
+             num_examples = gr.Number(label="# examples (leave blank for all)", value=1, precision=0)
+             seed = gr.Textbox(label="Random Seed (optional)", placeholder="Enter a seed for reproducibility")
+
+         system_prompt = gr.Textbox(
+             label="System Prompt (optional)",
+             placeholder="Enter a system prompt here for the candidate model",
+             lines=3
+         )
+
+         run_btn = gr.Button("Run evaluation")
+
+         output_html = gr.HTML(label="Evaluation Report")
+         output_metrics = gr.JSON(label="Metrics JSON")
+         output_all_runs = gr.HTML(label="Evaluation History", value=generate_runs_html())
+
+         with gr.Row():
+             clear_btn = gr.Button("Clear History")
+
+         # Connect buttons
+         run_btn.click(
+             fn=run_eval_ui,
+             inputs=[candidate_model, system_prompt, eval_subset, num_examples, seed],
+             outputs=[output_html, output_metrics, output_all_runs]
+         )
+
+         clear_btn.click(
+             fn=clear_runs,
+             inputs=[],
+             outputs=[output_all_runs]
+         )
+
+     return demo
+
+ if __name__ == "__main__":
+     demo = ui()
+     demo.queue(max_size=5)
+     demo.launch()
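A minimal local launch, for reference (a sketch; the placeholder keys are hypothetical, and app.py reads both via os.getenv at import time):

    import os
    os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder, not a real key
    os.environ["GEMINI_API_KEY"] = "..."     # placeholder
    from app import ui  # import-time code configures openai/genai
    ui().launch()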
postBuild ADDED
@@ -0,0 +1,3 @@
+ set -e
+ git clone https://github.com/openai/simple-evals.git
+ mv simple-evals/simple_evals ./simple_evals
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gradio>=4.44.0
+ pandas
+ numpy
+ blobfile
+ openai>=1.44.0
+ jinja2
+ tqdm
+ requests
+ google-generativeai
+ pymongo[srv]
+ dnspython
runs.json ADDED
@@ -0,0 +1 @@
+ []