# LLM-Judge / app.py
# Author: workbykait — commit 67dfd7b ("Update app.py", verified)
import gradio as gr
import pandas as pd
from huggingface_hub import InferenceClient
from prompts import JUDGE_PROMPT
from inference import generate_response
from leaderboard import load_leaderboard, save_vote
from models_config import MODELS, JUDGE_MODEL
# import utils # uncomment when you have parsing + bertscore functions
# Upper bound on side-by-side model slots; the Arena tab pre-allocates this
# many label/response/vote columns and toggles their visibility at runtime.
MAX_MODELS = 6
def main():
    """Build and launch the LLM Judge Arena Gradio app.

    Layout: an "Arena" tab with multi-select model dropdown, prompt entry,
    MAX_MODELS pre-allocated response columns (visibility-toggled so the
    Gradio event graph never changes shape), judge/vote controls, and a
    "Leaderboard" tab backed by leaderboard.load_leaderboard / save_vote.
    """
    with gr.Blocks(title="LLM Judge Arena") as demo:
        gr.Markdown(
            "# 🏟️ LLM Judge Arena\n"
            "Side-by-side comparison + LLM-as-a-Judge + Human votes + Live leaderboard"
        )

        with gr.Tab("⚔️ Arena"):
            with gr.Row():
                model_select = gr.Dropdown(
                    choices=[m["name"] for m in MODELS],
                    value=[m["name"] for m in MODELS[:min(4, len(MODELS))]],
                    multiselect=True,
                    label="Select 2–6 models to compare",
                    max_choices=MAX_MODELS,  # keep in sync with pre-allocated slots
                    interactive=True,
                )

            prompt = gr.Textbox(
                label="Your prompt",
                lines=4,
                placeholder="Type or paste your question / instruction here...",
            )
            ref_answer = gr.Textbox(
                label="Optional reference answer (used for BERTScore etc.)",
                lines=3,
                placeholder="(leave empty if no reference)",
            )
            generate_btn = gr.Button("Generate Responses", variant="primary")

            # UI components — fixed number for stability: slots are created
            # once, then shown/hidden, so event wiring never has to change.
            model_labels = []
            response_boxes = []
            vote_btns = []
            with gr.Row():
                for i in range(MAX_MODELS):
                    with gr.Column():
                        label = gr.Markdown("**Model**", visible=False)
                        response = gr.Textbox(
                            label=" ",
                            lines=12,
                            interactive=False,
                            visible=False,
                            show_label=False,
                        )
                        vote = gr.Button(f"Vote #{i+1}", visible=False, interactive=False)
                        model_labels.append(label)
                        response_boxes.append(response)
                        vote_btns.append(vote)

            with gr.Row():
                judge_btn = gr.Button("Run LLM-as-a-Judge", visible=False)
                tie_btn = gr.Button("It's a tie / both good", visible=False)

            judge_output = gr.JSON(label="LLM Judge Evaluation", visible=False)
            auto_metrics = gr.JSON(label="Automatic Metrics (if reference provided)", visible=False)
            error_msg = gr.Markdown(visible=False)

        with gr.Tab("🏆 Leaderboard"):
            leaderboard_df = gr.DataFrame(
                value=load_leaderboard(),
                interactive=False,
                label="Live Leaderboard",
            )
            refresh_btn = gr.Button("Refresh Leaderboard")

        # ------------------------------------------------------------------
        # Helpers
        # ------------------------------------------------------------------
        def update_visibility(selected_names):
            """Show/hide per-model slots to match the current selection.

            Returns updates for, in order: MAX_MODELS labels, MAX_MODELS
            response boxes, MAX_MODELS vote buttons, judge_btn, tie_btn,
            judge_output, auto_metrics.
            """
            if not isinstance(selected_names, list):
                selected_names = []
            n = len(selected_names)
            if n == 0:
                # 3 lists of MAX_MODELS components + 4 trailing controls
                # (was a magic 22, which would silently break if MAX_MODELS changed).
                return tuple([gr.update(visible=False)] * (3 * MAX_MODELS + 4))
            labels = []
            boxes = []
            votes = []
            for i in range(n):
                labels.append(gr.update(visible=True, value=f"**{selected_names[i]}**"))
                boxes.append(gr.update(visible=True))
                # Voting stays disabled until responses are generated.
                votes.append(gr.update(visible=True, interactive=False))
            for i in range(n, MAX_MODELS):
                labels.append(gr.update(visible=False, value=""))
                boxes.append(gr.update(visible=False))
                votes.append(gr.update(visible=False, interactive=False))
            controls_visible = n >= 2
            return (
                *labels,                               # MAX_MODELS
                *boxes,                                # MAX_MODELS
                *votes,                                # MAX_MODELS
                gr.update(visible=controls_visible),   # judge_btn
                gr.update(visible=controls_visible),   # tie_btn
                gr.update(visible=False),              # judge_output
                gr.update(visible=False),              # auto_metrics
            )

        def generate_responses(selected_names, user_prompt):
            """Call each selected model on the prompt; fill the response boxes.

            Returns updates for: MAX_MODELS response values, MAX_MODELS label
            updates (unchanged), MAX_MODELS vote-button updates, judge_btn,
            tie_btn, judge_output, auto_metrics, error_msg.
            """
            if not selected_names or not user_prompt.strip():
                return (
                    *[""] * MAX_MODELS,
                    *[gr.update() for _ in range(MAX_MODELS)],  # labels unchanged
                    *[gr.update(interactive=False) for _ in range(MAX_MODELS)],
                    gr.update(visible=False),  # judge_btn
                    gr.update(visible=False),  # tie_btn
                    gr.update(visible=False),  # judge_output
                    gr.update(visible=False),  # auto_metrics
                    gr.update(value="Please select models and write a prompt.", visible=True),
                )
            selected_models = [m for m in MODELS if m["name"] in selected_names]
            responses = []
            for m in selected_models:
                try:
                    responses.append(generate_response(m, user_prompt.strip()))
                except Exception as e:
                    # Surface per-model failures inline instead of aborting the batch.
                    responses.append(f"**Generation failed:** {str(e)}")
            padded_responses = responses + [""] * (MAX_MODELS - len(responses))
            vote_updates = [
                gr.update(interactive=True) if i < len(responses) else gr.update(interactive=False)
                for i in range(MAX_MODELS)
            ]
            return (
                *padded_responses,
                *[gr.update() for _ in range(MAX_MODELS)],  # labels unchanged
                *vote_updates,
                gr.update(visible=len(responses) >= 2),  # judge_btn
                gr.update(visible=len(responses) >= 2),  # tie_btn
                gr.update(visible=False),                # judge_output
                gr.update(visible=False),                # auto_metrics
                gr.update(visible=False),                # error cleared
            )

        def run_judge(selected_names, prompt_text, *responses):
            """Ask JUDGE_MODEL to evaluate the active responses.

            Returns (judge_output update, auto_metrics update).
            """
            active_responses = []
            active_names = []
            for name, resp in zip(selected_names, responses):
                # resp.strip() already rejects whitespace-only strings, so the
                # old extra `resp != " "` check was redundant and is dropped.
                if resp and resp.strip():
                    active_names.append(name)
                    active_responses.append(resp)
            if len(active_responses) < 2 or not prompt_text.strip():
                return (
                    gr.update(value={"error": "Need at least 2 valid responses and a prompt"}, visible=True),
                    gr.update(visible=False),
                )
            try:
                # Format prompt (you can improve this)
                formatted_prompt = JUDGE_PROMPT.format(
                    prompt=prompt_text,
                    responses="\n\n".join(
                        f"[{name}]\n{resp}"
                        for name, resp in zip(active_names, active_responses)
                    ),
                )
                client = InferenceClient(model=JUDGE_MODEL)
                raw_output = client.text_generation(
                    formatted_prompt,
                    max_new_tokens=1200,
                    temperature=0.7,
                )
                # Placeholder parsing — replace with real utils.parse_json(raw_output)
                parsed = {"raw": raw_output[:500] + "..."}
                return (
                    gr.update(value=parsed, visible=True),
                    gr.update(visible=False),  # auto_metrics — implement when ready
                )
            except Exception as e:
                return (
                    gr.update(value={"error": str(e)}, visible=True),
                    gr.update(visible=False),
                )

        def record_human_vote(winner_index, selected_names, prompt_text, *responses):
            """Persist a human vote (winner_index < 0 means tie) and refresh
            the leaderboard DataFrame.
            """
            if winner_index is None or not selected_names:
                # FIX: the old code returned `leaderboard_df.value`, which is the
                # component's *initial* value and would reset the live table.
                # gr.update() leaves the DataFrame untouched instead.
                return gr.update()
            active_models = []
            active_responses = []
            for name, resp in zip(selected_names, responses):
                if resp and resp.strip():
                    active_models.append(name)
                    active_responses.append(resp)
            if not active_models:
                return gr.update()
            winner_name = "tie" if winner_index < 0 else active_models[winner_index]
            vote_data = {
                "timestamp": pd.Timestamp.now().isoformat(),
                "prompt": prompt_text,
                "models": active_models,
                "responses": active_responses,
                "human_winner_idx": winner_index,
                "human_winner_name": winner_name,
            }
            try:
                save_vote(vote_data)
                return load_leaderboard()
            except Exception as e:
                return pd.DataFrame({"error": [str(e)]})

        # ------------------------------------------------------------------
        # Event bindings
        # ------------------------------------------------------------------
        model_select.change(
            update_visibility,
            inputs=model_select,
            outputs=[
                *model_labels,
                *response_boxes,
                *vote_btns,
                judge_btn,
                tie_btn,
                judge_output,
                auto_metrics,
            ],
        )
        generate_btn.click(
            generate_responses,
            inputs=[model_select, prompt],
            outputs=[
                *response_boxes,
                *model_labels,
                *vote_btns,
                judge_btn,
                tie_btn,
                judge_output,
                auto_metrics,
                error_msg,
            ],
        )
        judge_btn.click(
            run_judge,
            inputs=[model_select, prompt, *response_boxes],
            outputs=[judge_output, auto_metrics],
        )
        # Vote buttons: gr.State(idx) binds the slot index eagerly, so there is
        # no late-binding closure problem in this loop.
        for idx, btn in enumerate(vote_btns):
            btn.click(
                record_human_vote,
                inputs=[gr.State(idx), model_select, prompt, *response_boxes],
                outputs=leaderboard_df,
            )
        tie_btn.click(
            record_human_vote,
            inputs=[gr.State(-1), model_select, prompt, *response_boxes],
            outputs=leaderboard_df,
        )
        refresh_btn.click(
            load_leaderboard,
            outputs=leaderboard_df,
        )
        # Initial load: sync slot visibility with the dropdown's default value.
        demo.load(
            update_visibility,
            inputs=model_select,
            outputs=[
                *model_labels,
                *response_boxes,
                *vote_btns,
                judge_btn,
                tie_btn,
                judge_output,
                auto_metrics,
            ],
        )
    demo.launch()
# Standard script entry-point guard: build and launch the app only when run
# directly, not when imported.
if __name__ == "__main__":
    main()