# LLM Judge Arena — Gradio app (Hugging Face Space)
| import gradio as gr | |
| import pandas as pd | |
| from huggingface_hub import InferenceClient | |
| from prompts import JUDGE_PROMPT | |
| from inference import generate_response | |
| from leaderboard import load_leaderboard, save_vote | |
| from models_config import MODELS, JUDGE_MODEL | |
| # import utils # uncomment when you have parsing + bertscore functions | |
MAX_MODELS = 6  # number of response slots pre-built in the UI


def main():
    """Build and launch the LLM Judge Arena Gradio app.

    Layout: an "Arena" tab (model picker, prompt, per-model response panes,
    vote buttons, judge controls) and a "Leaderboard" tab backed by
    ``load_leaderboard``.
    """
    with gr.Blocks(title="LLM Judge Arena") as demo:
        gr.Markdown(
            "# 🏟️ LLM Judge Arena\n"
            "Side-by-side comparison + LLM-as-a-Judge + Human votes + Live leaderboard"
        )

        with gr.Tab("⚔️ Arena"):
            with gr.Row():
                model_select = gr.Dropdown(
                    choices=[m["name"] for m in MODELS],
                    value=[m["name"] for m in MODELS[:min(4, len(MODELS))]],
                    multiselect=True,
                    label="Select 2–6 models to compare",
                    # Was a hard-coded 6 — keep in sync with the slot count.
                    max_choices=MAX_MODELS,
                    interactive=True,
                )

            prompt = gr.Textbox(
                label="Your prompt",
                lines=4,
                placeholder="Type or paste your question / instruction here...",
            )
            ref_answer = gr.Textbox(
                label="Optional reference answer (used for BERTScore etc.)",
                lines=3,
                placeholder="(leave empty if no reference)",
            )
            generate_btn = gr.Button("Generate Responses", variant="primary")

            # UI components — a fixed number of slots for stability;
            # update_visibility() shows only as many as are selected.
            model_labels = []
            response_boxes = []
            vote_btns = []
            with gr.Row():
                for i in range(MAX_MODELS):
                    with gr.Column():
                        label = gr.Markdown("**Model**", visible=False)
                        response = gr.Textbox(
                            label=" ",
                            lines=12,
                            interactive=False,
                            visible=False,
                            show_label=False,
                        )
                        vote = gr.Button(f"Vote #{i+1}", visible=False, interactive=False)
                        model_labels.append(label)
                        response_boxes.append(response)
                        vote_btns.append(vote)

            with gr.Row():
                judge_btn = gr.Button("Run LLM-as-a-Judge", visible=False)
                tie_btn = gr.Button("It's a tie / both good", visible=False)
            judge_output = gr.JSON(label="LLM Judge Evaluation", visible=False)
            auto_metrics = gr.JSON(label="Automatic Metrics (if reference provided)", visible=False)
            error_msg = gr.Markdown(visible=False)

        with gr.Tab("📊 Leaderboard"):
            leaderboard_df = gr.DataFrame(
                value=load_leaderboard(),
                interactive=False,
                label="Live Leaderboard",
            )
            refresh_btn = gr.Button("Refresh Leaderboard")

        # ------------------------------------------------
        # Helpers
        # ------------------------------------------------
| def update_visibility(selected_names): | |
| if not isinstance(selected_names, list): | |
| selected_names = [] | |
| n = len(selected_names) | |
| if n == 0: | |
| return tuple([gr.update(visible=False)] * 22) | |
| labels = [] | |
| boxes = [] | |
| votes = [] | |
| for i in range(n): | |
| labels.append(gr.update(visible=True, value=f"**{selected_names[i]}**")) | |
| boxes.append(gr.update(visible=True)) | |
| votes.append(gr.update(visible=True, interactive=False)) | |
| for i in range(n, MAX_MODELS): | |
| labels.append(gr.update(visible=False, value="")) | |
| boxes.append(gr.update(visible=False)) | |
| votes.append(gr.update(visible=False, interactive=False)) | |
| controls_visible = n >= 2 | |
| return ( | |
| *labels, # 6 | |
| *boxes, # 6 | |
| *votes, # 6 | |
| gr.update(visible=controls_visible), # judge_btn | |
| gr.update(visible=controls_visible), # tie_btn | |
| gr.update(visible=False), # judge_output | |
| gr.update(visible=False), # auto_metrics | |
| ) | |
| def generate_responses(selected_names, user_prompt): | |
| if not selected_names or not user_prompt.strip(): | |
| return ( | |
| *[""] * MAX_MODELS, | |
| *[gr.update() for _ in range(MAX_MODELS)], # labels unchanged | |
| *[gr.update(interactive=False) for _ in range(MAX_MODELS)], | |
| gr.update(visible=False), # judge | |
| gr.update(visible=False), # tie | |
| gr.update(visible=False), # judge_output | |
| gr.update(visible=False), # auto_metrics | |
| gr.update(value="Please select models and write a prompt.", visible=True) | |
| ) | |
| selected_models = [m for m in MODELS if m["name"] in selected_names] | |
| responses = [] | |
| for m in selected_models: | |
| try: | |
| resp = generate_response(m, user_prompt.strip()) | |
| responses.append(resp) | |
| except Exception as e: | |
| responses.append(f"**Generation failed:** {str(e)}") | |
| padded_responses = responses + [""] * (MAX_MODELS - len(responses)) | |
| vote_updates = [ | |
| gr.update(interactive=True) if i < len(responses) else gr.update(interactive=False) | |
| for i in range(MAX_MODELS) | |
| ] | |
| return ( | |
| *padded_responses, | |
| *[gr.update() for _ in range(MAX_MODELS)], # labels | |
| *vote_updates, | |
| gr.update(visible=len(responses) >= 2), | |
| gr.update(visible=len(responses) >= 2), | |
| gr.update(visible=False), | |
| gr.update(visible=False), | |
| gr.update(visible=False) # error | |
| ) | |
| def run_judge(selected_names, prompt_text, *responses): | |
| active_responses = [] | |
| active_names = [] | |
| for name, resp in zip(selected_names, responses): | |
| if resp and resp.strip() and resp != " ": | |
| active_names.append(name) | |
| active_responses.append(resp) | |
| if len(active_responses) < 2 or not prompt_text.strip(): | |
| return ( | |
| gr.update(value={"error": "Need at least 2 valid responses and a prompt"}, visible=True), | |
| gr.update(visible=False) | |
| ) | |
| try: | |
| # Format prompt (you can improve this) | |
| formatted_prompt = JUDGE_PROMPT.format( | |
| prompt=prompt_text, | |
| responses="\n\n".join(f"[{name}]\n{resp}" for name, resp in zip(active_names, active_responses)) | |
| ) | |
| client = InferenceClient(model=JUDGE_MODEL) | |
| raw_output = client.text_generation( | |
| formatted_prompt, | |
| max_new_tokens=1200, | |
| temperature=0.7 | |
| ) | |
| # Placeholder parsing β replace with real utils.parse_json(raw_output) | |
| parsed = {"raw": raw_output[:500] + "..."} | |
| return ( | |
| gr.update(value=parsed, visible=True), | |
| gr.update(visible=False) # auto_metrics β implement when ready | |
| ) | |
| except Exception as e: | |
| return ( | |
| gr.update(value={"error": str(e)}, visible=True), | |
| gr.update(visible=False) | |
| ) | |
| def record_human_vote(winner_index, selected_names, prompt_text, *responses): | |
| if winner_index is None or not selected_names: | |
| return leaderboard_df.value | |
| active_models = [] | |
| active_responses = [] | |
| for i, (name, resp) in enumerate(zip(selected_names, responses)): | |
| if resp and resp.strip(): | |
| active_models.append(name) | |
| active_responses.append(resp) | |
| if not active_models: | |
| return leaderboard_df.value | |
| winner_name = "tie" if winner_index < 0 else active_models[winner_index] | |
| vote_data = { | |
| "timestamp": pd.Timestamp.now().isoformat(), | |
| "prompt": prompt_text, | |
| "models": active_models, | |
| "responses": active_responses, | |
| "human_winner_idx": winner_index, | |
| "human_winner_name": winner_name, | |
| } | |
| try: | |
| save_vote(vote_data) | |
| return load_leaderboard() | |
| except Exception as e: | |
| return pd.DataFrame({"error": [str(e)]}) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Event bindings | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββ | |
| model_select.change( | |
| update_visibility, | |
| inputs=model_select, | |
| outputs=[ | |
| *model_labels, | |
| *response_boxes, | |
| *vote_btns, | |
| judge_btn, | |
| tie_btn, | |
| judge_output, | |
| auto_metrics, | |
| ] | |
| ) | |
| generate_btn.click( | |
| generate_responses, | |
| inputs=[model_select, prompt], | |
| outputs=[ | |
| *response_boxes, | |
| *model_labels, | |
| *vote_btns, | |
| judge_btn, | |
| tie_btn, | |
| judge_output, | |
| auto_metrics, | |
| error_msg | |
| ] | |
| ) | |
| judge_btn.click( | |
| run_judge, | |
| inputs=[model_select, prompt, *response_boxes], | |
| outputs=[judge_output, auto_metrics] | |
| ) | |
| # Vote buttons | |
| for idx, btn in enumerate(vote_btns): | |
| btn.click( | |
| record_human_vote, | |
| inputs=[gr.State(idx), model_select, prompt, *response_boxes], | |
| outputs=leaderboard_df | |
| ) | |
| tie_btn.click( | |
| record_human_vote, | |
| inputs=[gr.State(-1), model_select, prompt, *response_boxes], | |
| outputs=leaderboard_df | |
| ) | |
| refresh_btn.click( | |
| load_leaderboard, | |
| outputs=leaderboard_df | |
| ) | |
| # Initial load | |
| demo.load( | |
| update_visibility, | |
| inputs=model_select, | |
| outputs=[ | |
| *model_labels, | |
| *response_boxes, | |
| *vote_btns, | |
| judge_btn, | |
| tie_btn, | |
| judge_output, | |
| auto_metrics, | |
| ] | |
| ) | |
| demo.launch() | |
| if __name__ == "__main__": | |
| main() |