# Spaces: Sleeping
# (status banner captured from the hosting page along with this file; not part of the program)
import json
import os
import subprocess
import sys

import gradio as gr
| def run_benchmark(): | |
| # spin up benchmark.py as a subprocess so it doesn't block the UI | |
| # -u flag = unbuffered output, which is what lets us stream line by line | |
| process = subprocess.Popen( | |
| ["python", "-u", "benchmark.py"], | |
| stdout=subprocess.PIPE, | |
| stderr=subprocess.DEVNULL, # hide model loading noise β it's distracting | |
| text=True, | |
| bufsize=1, | |
| cwd=os.path.dirname(os.path.abspath(__file__)) | |
| ) | |
| # stream each line to the UI as it comes in | |
| # this way the user sees progress instead of staring at a blank box for 10 minutes | |
| output = "" | |
| for line in process.stdout: | |
| output += line | |
| yield output | |
| process.wait() | |
| # once benchmark.py finishes, try to load the saved JSON and show a clean summary | |
| # benchmark.py writes this file β if it's missing, something went wrong upstream | |
| if os.path.exists("benchmark_results.json"): | |
| with open("benchmark_results.json", "r") as f: | |
| data = json.load(f) | |
| summary = data.get("summary", {}) | |
| output += f"\n\nπ SUMMARY:\n" | |
| output += f" Single MedGemma Accuracy : {summary.get('single_accuracy', 'N/A'):.1f}%\n" | |
| output += f" MedPanel Accuracy : {summary.get('medpanel_accuracy', 'N/A'):.1f}%\n" | |
| output += f" Improvement : +{summary.get('improvement', 'N/A'):.1f} pts\n" | |
| output += f" Devil's Advocate Saves : {summary.get('devils_advocate_saves', 'N/A')}\n" | |
| yield output | |
def load_results():
    """Return the last saved benchmark results as pretty-printed JSON text.

    Lets the user re-display the previous run without re-running the full
    benchmark.

    Returns:
        str: The contents of benchmark_results.json re-serialised with a
        2-space indent, or a short human-readable message when the file is
        missing or is not valid JSON.
    """
    # Look next to this file rather than in the process's cwd: that is the
    # directory the benchmark subprocess is launched in.
    results_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "benchmark_results.json"
    )
    try:
        with open(results_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return "No results yet. Run the benchmark first."
    except json.JSONDecodeError as err:
        # e.g. a run that was interrupted while writing the file
        return f"Saved results could not be parsed as JSON: {err}"
    return json.dumps(data, indent=2)
# -- UI ----------------------------------------------------------------
# Layout: a heading, one row holding the two action buttons, and a single
# output box that both buttons write into.
# NOTE(review): the leading glyphs in the heading and button labels look like
# mis-encoded emoji; they are kept exactly as found.
with gr.Blocks(title="MedPanel Benchmark") as demo:
    gr.Markdown(value="# π₯ MedPanel Benchmark Runner")
    gr.Markdown(value="Compares **Single MedGemma** vs **MedPanel** across test cases.")

    with gr.Row():
        # Primary action: starts the benchmark and streams its output live.
        run_btn = gr.Button(value="βΆοΈ Run Benchmark", variant="primary")
        # Secondary action: shows the last saved JSON without a rerun.
        results_btn = gr.Button(value="π Load Saved Results")

    # Tall by default so most of a run is visible without scrolling.
    output_box = gr.Textbox(label="Benchmark Output", lines=40, max_lines=80)

    # Both handlers take no inputs and write to the same textbox.
    run_btn.click(run_benchmark, outputs=output_box)
    results_btn.click(load_results, outputs=output_box)

demo.launch()