llm_compare / app.py
crossingk's picture
Update app.py
8ee870f verified
import re
import tempfile
import gradio as gr
from db import init_db, save_evaluation, export_to_excel
from providers import (
MODEL_NAMES,
call_model,
call_custom_endpoint,
MODEL_REGISTRY,
get_model_defaults,
)
# ---------------------------------------------------------------------------
# Initialise database on import
# ---------------------------------------------------------------------------
init_db()
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
URL_RE = re.compile(r"^https?://\S+$")
def _sanitize_nickname(nick: str) -> str:
return nick.strip()[:50]
def _validate_url(url: str) -> bool:
return bool(URL_RE.match(url.strip()))
def on_model_select(display_name: str):
"""When user picks a model from dropdown, populate base_url and model_id."""
base_url, model_id = get_model_defaults(display_name)
return base_url, model_id
# ---------------------------------------------------------------------------
# Event handlers
# ---------------------------------------------------------------------------
def send_to_both(
prompt: str,
left_url: str,
left_model: str,
left_key: str,
right_name: str,
right_base_url: str,
right_model_id: str,
right_key: str,
):
"""Call both models and return their responses."""
if not prompt or not prompt.strip():
raise gr.Error("Please enter a prompt.")
# Left — Dify endpoint
left_response = ""
left_err = ""
if left_url and left_url.strip():
if not _validate_url(left_url):
left_err = "⚠️ Invalid URL format. Use http:// or https://."
else:
try:
left_response = call_custom_endpoint(
left_url.strip(), left_model.strip() or "default", prompt, left_key
)
except Exception as e:
left_err = f"⚠️ Left model error: {e}"
# Right — registry model (with optional user overrides)
right_response = ""
right_err = ""
try:
right_response = call_model(
right_name, prompt, right_key, right_base_url, right_model_id
)
except Exception as e:
right_err = f"⚠️ Right model error: {e}"
return (
left_response if not left_err else left_err,
right_response if not right_err else right_err,
)
def submit_evaluation(
nickname: str,
prompt: str,
left_url: str,
left_model: str,
left_response: str,
left_comment: str,
left_grade: int,
right_name: str,
right_model_id: str,
right_response: str,
right_comment: str,
right_grade: int,
):
"""Validate and persist an evaluation."""
nickname = _sanitize_nickname(nickname)
if not nickname:
raise gr.Error("Nickname is required.")
if not prompt or not prompt.strip():
raise gr.Error("Prompt is empty — send a prompt first.")
if not left_response.strip() and not right_response.strip():
raise gr.Error("No responses to evaluate — send a prompt first.")
if left_grade < 1 or left_grade > 10:
raise gr.Error("Left grade must be between 1 and 10.")
if right_grade < 1 or right_grade > 10:
raise gr.Error("Right grade must be between 1 and 10.")
entry = MODEL_REGISTRY.get(right_name, {})
right_provider = entry.get("provider", "unknown")
save_evaluation(
nickname=nickname,
prompt=prompt,
left_model_name=left_model.strip() or "custom",
left_model_endpoint=left_url.strip(),
left_response=left_response,
left_comment=left_comment,
left_grade=int(left_grade),
right_model_name=right_model_id.strip() or right_name,
right_provider=right_provider,
right_response=right_response,
right_comment=right_comment,
right_grade=int(right_grade),
)
gr.Info("✅ Evaluation saved!")
def download_report():
"""Export all evaluations to a temp .xlsx and return as a downloadable file."""
tmp = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False)
export_to_excel(tmp.name)
return tmp.name
# ---------------------------------------------------------------------------
# Gradio Blocks UI
# ---------------------------------------------------------------------------
# Pre-compute initial defaults for first model
_init_base_url, _init_model_id = get_model_defaults(MODEL_NAMES[0])
with gr.Blocks(title="LLM Compare") as demo:
gr.Markdown("# 🔍 LLM Compare\nSide-by-side comparison: your Dify app vs reference models.")
# ---- Top bar: nickname ---------------------------------------------------
with gr.Row():
nickname = gr.Textbox(
label="Your Nickname",
placeholder="Enter a nickname (required)",
scale=2,
)
# ---- Prompt area ---------------------------------------------------------
with gr.Row():
prompt = gr.Textbox(
label="Prompt",
placeholder="Type your prompt here…",
lines=4,
scale=4,
)
send_btn = gr.Button("🚀 Send to Both", variant="primary", scale=1)
# ---- Two-column layout ---------------------------------------------------
with gr.Row(equal_height=True):
# ---- LEFT: Dify model ------------------------------------------------
with gr.Column():
gr.Markdown("### 🧪 Your Model (Dify Endpoint)")
left_url = gr.Textbox(
label="Dify API Base URL",
placeholder="https://api.dify.ai/v1",
)
left_model = gr.Textbox(
label="App Name (for display only)",
placeholder="e.g. my-dify-app",
)
left_key = gr.Textbox(
label="Dify Secret Key",
placeholder="app-xxxxxxxxxxxx",
type="password",
)
left_response = gr.Textbox(
label="Response",
lines=12,
interactive=False,
)
left_comment = gr.Textbox(
label="Comment",
placeholder="Your thoughts on this response…",
lines=2,
)
left_grade = gr.Slider(
minimum=1,
maximum=10,
step=1,
value=5,
label="Grade (1–10)",
)
# ---- RIGHT: reference model ------------------------------------------
with gr.Column():
gr.Markdown("### 📚 Reference Model")
right_name = gr.Dropdown(
choices=MODEL_NAMES,
value=MODEL_NAMES[0],
label="Select Model",
)
right_base_url = gr.Textbox(
label="Base URL (auto-filled, editable)",
value=_init_base_url,
placeholder="e.g. https://api.openai.com/v1",
)
right_model_id = gr.Textbox(
label="Model ID (auto-filled, editable)",
value=_init_model_id,
placeholder="e.g. gpt-4o",
)
right_key = gr.Textbox(
label="API Key (optional — uses env default)",
placeholder="Leave blank to use default key",
type="password",
)
right_response = gr.Textbox(
label="Response",
lines=12,
interactive=False,
)
right_comment = gr.Textbox(
label="Comment",
placeholder="Your thoughts on this response…",
lines=2,
)
right_grade = gr.Slider(
minimum=1,
maximum=10,
step=1,
value=5,
label="Grade (1–10)",
)
# ---- Action buttons ------------------------------------------------------
with gr.Row():
submit_btn = gr.Button("💾 Submit Evaluation", variant="primary")
download_btn = gr.Button("📥 Download Report (.xlsx)")
report_file = gr.File(label="Report", visible=False)
# ---- Wiring --------------------------------------------------------------
# Auto-fill base_url and model_id when dropdown changes
right_name.change(
fn=on_model_select,
inputs=[right_name],
outputs=[right_base_url, right_model_id],
)
send_btn.click(
fn=send_to_both,
inputs=[
prompt, left_url, left_model, left_key,
right_name, right_base_url, right_model_id, right_key,
],
outputs=[left_response, right_response],
)
submit_btn.click(
fn=submit_evaluation,
inputs=[
nickname,
prompt,
left_url,
left_model,
left_response,
left_comment,
left_grade,
right_name,
right_model_id,
right_response,
right_comment,
right_grade,
],
outputs=[],
)
download_btn.click(
fn=download_report,
inputs=[],
outputs=[report_file],
).then(lambda: gr.update(visible=True), outputs=[report_file])
if __name__ == "__main__":
demo.launch(theme=gr.themes.Soft())