import gradio as gr
import pandas as pd
from huggingface_hub import InferenceClient

from prompts import JUDGE_PROMPT
from inference import generate_response
from leaderboard import load_leaderboard, save_vote
from models_config import MODELS, JUDGE_MODEL

# import utils  # uncomment when you have parsing + bertscore functions

# Fixed number of response slots rendered in the UI. Slots beyond the current
# selection are hidden rather than destroyed, which keeps Gradio's component
# graph stable across re-renders.
MAX_MODELS = 6


def main():
    """Build and launch the LLM Judge Arena Gradio app.

    The app shows side-by-side model responses, an LLM-as-a-Judge evaluation,
    human voting buttons, and a live leaderboard backed by ``leaderboard.py``.
    """
    with gr.Blocks(title="LLM Judge Arena") as demo:
        gr.Markdown(
            "# 🏟️ LLM Judge Arena\n"
            "Side-by-side comparison + LLM-as-a-Judge + Human votes + Live leaderboard"
        )

        with gr.Tab("⚔️ Arena"):
            with gr.Row():
                model_select = gr.Dropdown(
                    choices=[m["name"] for m in MODELS],
                    value=[m["name"] for m in MODELS[:min(4, len(MODELS))]],
                    multiselect=True,
                    label="Select 2–6 models to compare",
                    max_choices=6,
                    interactive=True
                )

            prompt = gr.Textbox(
                label="Your prompt",
                lines=4,
                placeholder="Type or paste your question / instruction here...",
            )
            ref_answer = gr.Textbox(
                label="Optional reference answer (used for BERTScore etc.)",
                lines=3,
                placeholder="(leave empty if no reference)",
            )
            generate_btn = gr.Button("Generate Responses", variant="primary")

            # UI components — fixed number for stability
            model_labels = []
            response_boxes = []
            vote_btns = []

            with gr.Row():
                for i in range(MAX_MODELS):
                    with gr.Column():
                        label = gr.Markdown("**Model**", visible=False)
                        response = gr.Textbox(
                            label=" ",
                            lines=12,
                            interactive=False,
                            visible=False,
                            show_label=False
                        )
                        vote = gr.Button(f"Vote #{i+1}", visible=False, interactive=False)
                        model_labels.append(label)
                        response_boxes.append(response)
                        vote_btns.append(vote)

            with gr.Row():
                judge_btn = gr.Button("Run LLM-as-a-Judge", visible=False)
                tie_btn = gr.Button("It's a tie / both good", visible=False)

            judge_output = gr.JSON(label="LLM Judge Evaluation", visible=False)
            auto_metrics = gr.JSON(label="Automatic Metrics (if reference provided)", visible=False)
            error_msg = gr.Markdown(visible=False)

        with gr.Tab("🏆 Leaderboard"):
            leaderboard_df = gr.DataFrame(
                value=load_leaderboard(),
                interactive=False,
                label="Live Leaderboard"
            )
            refresh_btn = gr.Button("Refresh Leaderboard")

        # ────────────────────────────────────────────────
        # Helpers
        # ────────────────────────────────────────────────
        def update_visibility(selected_names):
            """Show one label/response/vote column per selected model.

            Returns 22 ``gr.update`` objects in the fixed order
            ``labels(6) + boxes(6) + votes(6) + judge_btn + tie_btn +
            judge_output + auto_metrics`` — this order must match the
            ``outputs=`` list in the event bindings below.
            """
            if not isinstance(selected_names, list):
                selected_names = []
            n = len(selected_names)

            if n == 0:
                return tuple([gr.update(visible=False)] * 22)

            labels, boxes, votes = [], [], []
            # Columns are labelled in *selection* order; generate_responses
            # must therefore fill responses in the same order (see below).
            for i in range(n):
                labels.append(gr.update(visible=True, value=f"**{selected_names[i]}**"))
                boxes.append(gr.update(visible=True))
                votes.append(gr.update(visible=True, interactive=False))
            for i in range(n, MAX_MODELS):
                labels.append(gr.update(visible=False, value=""))
                boxes.append(gr.update(visible=False))
                votes.append(gr.update(visible=False, interactive=False))

            # Judging / voting only makes sense with at least two contenders.
            controls_visible = n >= 2
            return (
                *labels,                                # 6
                *boxes,                                 # 6
                *votes,                                 # 6
                gr.update(visible=controls_visible),    # judge_btn
                gr.update(visible=controls_visible),    # tie_btn
                gr.update(visible=False),               # judge_output
                gr.update(visible=False),               # auto_metrics
            )

        def generate_responses(selected_names, user_prompt):
            """Query every selected model and fill the response slots.

            Returns 23 values matching the ``generate_btn.click`` outputs:
            ``boxes(6) + labels(6) + votes(6) + judge_btn + tie_btn +
            judge_output + auto_metrics + error_msg``.
            """
            if not selected_names or not user_prompt.strip():
                return (
                    *[""] * MAX_MODELS,
                    *[gr.update() for _ in range(MAX_MODELS)],  # labels unchanged
                    *[gr.update(interactive=False) for _ in range(MAX_MODELS)],
                    gr.update(visible=False),  # judge
                    gr.update(visible=False),  # tie
                    gr.update(visible=False),  # judge_output
                    gr.update(visible=False),  # auto_metrics
                    gr.update(value="Please select models and write a prompt.", visible=True)
                )

            # BUGFIX: resolve models in *selection* order, not MODELS order.
            # update_visibility labels the columns by selected_names[i], so the
            # i-th response must come from selected_names[i] or every response
            # would be displayed (and voted on) under the wrong model name.
            name_to_model = {m["name"]: m for m in MODELS}
            selected_models = [name_to_model[n] for n in selected_names if n in name_to_model]

            responses = []
            for m in selected_models:
                try:
                    resp = generate_response(m, user_prompt.strip())
                    responses.append(resp)
                except Exception as e:
                    # Show the failure inline instead of breaking the whole row.
                    responses.append(f"**Generation failed:** {str(e)}")

            padded_responses = responses + [""] * (MAX_MODELS - len(responses))
            vote_updates = [
                gr.update(interactive=True) if i < len(responses)
                else gr.update(interactive=False)
                for i in range(MAX_MODELS)
            ]

            return (
                *padded_responses,
                *[gr.update() for _ in range(MAX_MODELS)],  # labels
                *vote_updates,
                gr.update(visible=len(responses) >= 2),
                gr.update(visible=len(responses) >= 2),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False)  # error
            )

        def run_judge(selected_names, prompt_text, *responses):
            """Send all non-empty responses to the judge model.

            Returns two ``gr.update``s for ``(judge_output, auto_metrics)``.
            """
            active_responses = []
            active_names = []
            # Responses arrive positionally from the 6 boxes; they were filled
            # in selected_names order, so zipping re-pairs name <-> response.
            for name, resp in zip(selected_names, responses):
                if resp and resp.strip() and resp != " ":
                    active_names.append(name)
                    active_responses.append(resp)

            if len(active_responses) < 2 or not prompt_text.strip():
                return (
                    gr.update(value={"error": "Need at least 2 valid responses and a prompt"}, visible=True),
                    gr.update(visible=False)
                )

            try:
                # Format prompt (you can improve this)
                formatted_prompt = JUDGE_PROMPT.format(
                    prompt=prompt_text,
                    responses="\n\n".join(
                        f"[{name}]\n{resp}"
                        for name, resp in zip(active_names, active_responses)
                    )
                )

                client = InferenceClient(model=JUDGE_MODEL)
                raw_output = client.text_generation(
                    formatted_prompt,
                    max_new_tokens=1200,
                    temperature=0.7
                )

                # Placeholder parsing — replace with real utils.parse_json(raw_output)
                parsed = {"raw": raw_output[:500] + "..."}

                return (
                    gr.update(value=parsed, visible=True),
                    gr.update(visible=False)  # auto_metrics – implement when ready
                )
            except Exception as e:
                return (
                    gr.update(value={"error": str(e)}, visible=True),
                    gr.update(visible=False)
                )

        def record_human_vote(winner_index, selected_names, prompt_text, *responses):
            """Persist a human vote and return the refreshed leaderboard.

            ``winner_index`` is the clicked slot index (0-based) or ``-1`` for
            a tie. Responses fill slots 0..n-1 contiguously, so the slot index
            lines up with the compacted ``active_models`` list.
            """
            # BUGFIX: early returns previously returned leaderboard_df.value,
            # which is the component's *initial* value (captured at build time)
            # and would silently revert the displayed table to stale data.
            if winner_index is None or not selected_names:
                return load_leaderboard()

            active_models = []
            active_responses = []
            for name, resp in zip(selected_names, responses):
                if resp and resp.strip():
                    active_models.append(name)
                    active_responses.append(resp)

            if not active_models:
                return load_leaderboard()

            winner_name = "tie" if winner_index < 0 else active_models[winner_index]

            vote_data = {
                "timestamp": pd.Timestamp.now().isoformat(),
                "prompt": prompt_text,
                "models": active_models,
                "responses": active_responses,
                "human_winner_idx": winner_index,
                "human_winner_name": winner_name,
            }

            try:
                save_vote(vote_data)
                return load_leaderboard()
            except Exception as e:
                # Surface the persistence failure in the table itself.
                return pd.DataFrame({"error": [str(e)]})

        # ────────────────────────────────────────────────
        # Event bindings
        # ────────────────────────────────────────────────
        visibility_outputs = [
            *model_labels, *response_boxes, *vote_btns,
            judge_btn, tie_btn, judge_output, auto_metrics,
        ]

        model_select.change(
            update_visibility,
            inputs=model_select,
            outputs=visibility_outputs
        )

        generate_btn.click(
            generate_responses,
            inputs=[model_select, prompt],
            outputs=[
                *response_boxes, *model_labels, *vote_btns,
                judge_btn, tie_btn, judge_output, auto_metrics, error_msg
            ]
        )

        judge_btn.click(
            run_judge,
            inputs=[model_select, prompt, *response_boxes],
            outputs=[judge_output, auto_metrics]
        )

        # Vote buttons: each carries its slot index as constant state.
        for idx, btn in enumerate(vote_btns):
            btn.click(
                record_human_vote,
                inputs=[gr.State(idx), model_select, prompt, *response_boxes],
                outputs=leaderboard_df
            )

        tie_btn.click(
            record_human_vote,
            inputs=[gr.State(-1), model_select, prompt, *response_boxes],
            outputs=leaderboard_df
        )

        refresh_btn.click(
            load_leaderboard,
            outputs=leaderboard_df
        )

        # Initial load: sync visibility with the dropdown's default selection.
        demo.load(
            update_visibility,
            inputs=model_select,
            outputs=visibility_outputs
        )

    demo.launch()


if __name__ == "__main__":
    main()