workbykait committed on
Commit
67dfd7b
·
verified ·
1 Parent(s): fdb29e0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +292 -227
app.py CHANGED
@@ -1,245 +1,310 @@
1
- # app.py (corrected)
2
-
3
  import gradio as gr
 
 
4
  from prompts import JUDGE_PROMPT
5
  from inference import generate_response
6
  from leaderboard import load_leaderboard, save_vote
7
  from models_config import MODELS, JUDGE_MODEL
8
- import utils # your ELO, BERTScore, JSON parsing helpers
9
- import pandas as pd
10
 
11
- # We keep 6 fixed output boxes β†’ easier dynamic slicing + consistent layout
12
  MAX_MODELS = 6
13
 
14
- with gr.Blocks(title="LLM Judge Arena") as demo:
15
- gr.Markdown("# 🏟️ LLM Judge Arena\nSide-by-side + LLM Judge + Human Votes + Live HF Leaderboard")
16
-
17
- with gr.Tab("βš”οΈ Arena"):
18
- with gr.Row():
19
- model_select = gr.Dropdown(
20
- choices=[m["name"] for m in MODELS],
21
- value=[m["name"] for m in MODELS[:4]],
22
- multiselect=True,
23
- label="Select 4–6 Models",
24
- max_choices=6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  )
26
-
27
- prompt = gr.Textbox(label="Your prompt", lines=4, placeholder="Write or paste your question here...")
28
- ref_answer = gr.Textbox(label="Optional reference answer (enables BERTScore)", lines=3)
29
-
30
- generate_btn = gr.Button("Generate Responses", variant="primary")
31
-
32
- # Fixed: always show MAX_MODELS columns (hide inactive ones via visibility)
33
- response_boxes = []
34
- model_labels = []
35
- with gr.Row():
36
- for i in range(MAX_MODELS):
37
- with gr.Column():
38
- lbl = gr.Markdown(f"**Model {i+1}**", visible=False)
39
- resp = gr.Textbox(label=" ", lines=12, interactive=False, visible=False)
40
- model_labels.append(lbl)
41
- response_boxes.append(resp)
42
-
43
- with gr.Row():
44
- judge_btn = gr.Button("Run LLM-as-a-Judge", variant="secondary")
45
  vote_btns = []
46
- for i in range(MAX_MODELS):
47
- btn = gr.Button(f"Vote #{i+1}", visible=False, interactive=False)
48
- vote_btns.append(btn)
49
- tie_btn = gr.Button("It's a tie", visible=False)
50
-
51
- judge_output = gr.JSON(label="LLM Judge Evaluation", visible=False)
52
- auto_metrics = gr.JSON(label="Automatic Metrics", visible=False)
53
-
54
- with gr.Tab("πŸ† Leaderboard"):
55
- leaderboard_df = gr.DataFrame(
56
- value=load_leaderboard(),
57
- label="Live Leaderboard (ELO + wins)",
58
- interactive=False
59
- )
60
- refresh_btn = gr.Button("Refresh Leaderboard")
61
-
62
- # ────────────────────────────────────────────────
63
- # Helper to update visible components
64
- # ────────────────────────────────────────────────
65
- def update_visible_components(selected_names):
66
- # selected_names is list of str, e.g. ["Llama-3.1-8B", "Gemma-2-9B"]
67
- # We assume order in selected_names matches desired display order
68
-
69
- n_selected = len(selected_names)
70
- if n_selected == 0:
71
- return tuple([gr.update(visible=False)] * 22)
72
-
73
- # Prepare updates for each group
74
- label_updates = []
75
- box_updates = []
76
- vote_updates = []
77
-
78
- # For the first n_selected models: show + set name
79
- for i in range(n_selected):
80
- label_updates.append(gr.update(visible=True, value=f"**{selected_names[i]}**"))
81
- box_updates.append(gr.update(visible=True))
82
- vote_updates.append(gr.update(visible=True, interactive=False)) # enable after generation
83
-
84
- # Remaining slots: hide everything
85
- for i in range(n_selected, MAX_MODELS):
86
- label_updates.append(gr.update(visible=False, value=""))
87
- box_updates.append(gr.update(visible=False))
88
- vote_updates.append(gr.update(visible=False, interactive=False))
89
-
90
- # Global controls β€” show judge/tie only if β‰₯ 2 models selected
91
- judge_visible = n_selected >= 2
92
- common_json = gr.update(visible=False)
93
-
94
- return (
95
- *label_updates, # 6 Markdown labels
96
- *box_updates, # 6 Textbox responses
97
- *vote_updates, # 6 Vote buttons
98
- gr.update(visible=judge_visible), # judge_btn
99
- gr.update(visible=judge_visible), # tie_btn
100
- common_json, # judge_output JSON
101
- common_json # auto_metrics JSON
102
- )
103
-
104
- # Run once on load + every time selection changes
105
- model_select.change(
106
- update_visible_components,
107
- inputs=model_select,
108
- outputs=[
109
- *model_labels,
110
- *response_boxes,
111
- *vote_btns,
112
- judge_btn,
113
- tie_btn,
114
- judge_output,
115
- auto_metrics
116
- ]
117
- )
118
-
119
- # ────────────────────────────────────────────────
120
- # Generate responses
121
- # ────────────────────────────────────────────────
122
- def on_generate(selected_names, user_prompt):
123
- if not selected_names:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  return (
125
- *[""] * MAX_MODELS, # responses
126
- *[""] * MAX_MODELS, # labels (already set)
127
- *[gr.update(interactive=False) for _ in range(MAX_MODELS)], # vote buttons
128
- gr.update(visible=False), judge_output,
129
- gr.update(visible=False), auto_metrics
 
 
 
130
  )
131
-
132
- selected = [m for m in MODELS if m["name"] in selected_names]
133
- responses = []
134
- for m in selected:
 
 
 
 
 
 
 
 
 
 
 
 
135
  try:
136
- resp = generate_response(m, user_prompt.strip())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  except Exception as e:
138
- resp = f"Error: {str(e)}"
139
- responses.append(resp)
140
-
141
- # Pad to MAX_MODELS
142
- padded = responses + [""] * (MAX_MODELS - len(responses))
143
-
144
- vote_updates = [gr.update(interactive=True) for _ in range(len(selected))] + \
145
- [gr.update(interactive=False) for _ in range(MAX_MODELS - len(selected))]
146
-
147
- return (
148
- *padded, # response boxes
149
- *[gr.update() for _ in range(MAX_MODELS)], # labels unchanged
150
- *vote_updates, # vote buttons
151
- gr.update(visible=True), # judge btn
152
- gr.update(visible=True), # tie
153
- gr.update(visible=False), # judge_output reset
154
- gr.update(visible=False), # auto_metrics reset
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  )
156
-
157
- generate_btn.click(
158
- on_generate,
159
- inputs=[model_select, prompt],
160
- outputs=[
161
- *response_boxes,
162
- *model_labels, # usually no change here
163
- *vote_btns,
164
- judge_btn,
165
- tie_btn,
166
- judge_output,
167
- auto_metrics
168
- ]
169
- )
170
-
171
- # ────────────────────────────────────────────────
172
- # Run judge (simplified – expand as needed)
173
- # ────────────────────────────────────────────────
174
- def run_judge(selected_names, prompt_text, *responses):
175
- active_responses = [r for r, n in zip(responses, selected_names) if n]
176
- active_names = selected_names
177
-
178
- if len(active_responses) < 2 or not prompt_text.strip():
179
- return gr.update(value={"error": "Not enough responses or empty prompt"}), \
180
- gr.update(visible=False)
181
-
182
- try:
183
- formatted = JUDGE_PROMPT.format(
184
- prompt=prompt_text,
185
- responses="\n\n".join([f"[{n}]\n{r}" for n, r in zip(active_names, active_responses)])
186
  )
187
- client = InferenceClient(model=JUDGE_MODEL)
188
- raw = client.text_generation(formatted, max_new_tokens=1200, temperature=0.7)
189
- scores = utils.parse_json(raw) # your safe json parser
190
- bert = utils.compute_bertscore(active_responses, ref_answer.value) if ref_answer.value else {}
191
- result = {"judge": scores, "auto": bert}
192
- return gr.update(value=result, visible=True), gr.update(value=bert, visible=True)
193
- except Exception as e:
194
- return gr.update(value={"error": str(e)}, visible=True), gr.update(visible=False)
195
-
196
- judge_btn.click(
197
- run_judge,
198
- inputs=[model_select, prompt, *response_boxes],
199
- outputs=[judge_output, auto_metrics]
200
- )
201
-
202
- # ────────────────────────────────────────────────
203
- # Voting (example – one function for all buttons)
204
- # ────────────────────────────────────────────────
205
- def record_vote(winner_idx, selected_names, prompt_text, *responses):
206
- if winner_idx is None or not selected_names:
207
- return leaderboard_df.value
208
-
209
- active_models = selected_names
210
- active_responses = [r for r, n in zip(responses, selected_names) if n]
211
-
212
- vote_data = {
213
- "timestamp": pd.Timestamp.now().isoformat(),
214
- "prompt": prompt_text,
215
- "models": active_models,
216
- "responses": active_responses,
217
- "human_winner_idx": winner_idx,
218
- "human_winner_name": active_models[winner_idx] if winner_idx < len(active_models) else "tie",
219
- # add judge scores later if you want
220
- }
221
- save_vote(vote_data)
222
- return load_leaderboard()
223
-
224
- # Bind each vote button
225
- for i, btn in enumerate(vote_btns):
226
- btn.click(
227
- record_vote,
228
- inputs=[gr.State(i), model_select, prompt, *response_boxes],
229
  outputs=leaderboard_df
230
  )
231
-
232
- tie_btn.click(
233
- record_vote,
234
- inputs=[gr.State(-1), model_select, prompt, *response_boxes], # -1 = tie
235
- outputs=leaderboard_df
236
- )
237
-
238
- refresh_btn.click(load_leaderboard, outputs=leaderboard_df)
239
-
240
- # Trigger initial visibility update
241
- demo.load(update_visible_components, inputs=model_select, outputs=[
242
- *model_labels, *response_boxes, *vote_btns, judge_btn, tie_btn, judge_output, auto_metrics
243
- ])
244
-
245
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import pandas as pd
3
+ from huggingface_hub import InferenceClient
4
  from prompts import JUDGE_PROMPT
5
  from inference import generate_response
6
  from leaderboard import load_leaderboard, save_vote
7
  from models_config import MODELS, JUDGE_MODEL
8
+ # import utils # uncomment when you have parsing + bertscore functions
 
9
 
 
10
  MAX_MODELS = 6
11
 
12
def main():
    """Build and launch the LLM Judge Arena Gradio app.

    Two tabs:
      * Arena — pick 2–6 models, generate side-by-side responses, run an
        LLM-as-a-Judge comparison, and cast human votes.
      * Leaderboard — live table backed by ``load_leaderboard``.

    A fixed pool of MAX_MODELS label/response/vote widgets is created up
    front and selection changes only toggle visibility, so the output
    wiring of every event handler stays stable.
    """
    with gr.Blocks(title="LLM Judge Arena") as demo:
        gr.Markdown(
            "# 🏟️ LLM Judge Arena\n"
            "Side-by-side comparison + LLM-as-a-Judge + Human votes + Live leaderboard"
        )

        with gr.Tab("⚔️ Arena"):
            with gr.Row():
                model_select = gr.Dropdown(
                    choices=[m["name"] for m in MODELS],
                    value=[m["name"] for m in MODELS[:min(4, len(MODELS))]],
                    multiselect=True,
                    label="Select 2–6 models to compare",
                    max_choices=6,
                    interactive=True,
                )

            prompt = gr.Textbox(
                label="Your prompt",
                lines=4,
                placeholder="Type or paste your question / instruction here...",
            )

            # Currently unused by the callbacks — reserved for BERTScore
            # once utils.compute_bertscore is wired in.
            ref_answer = gr.Textbox(
                label="Optional reference answer (used for BERTScore etc.)",
                lines=3,
                placeholder="(leave empty if no reference)",
            )

            generate_btn = gr.Button("Generate Responses", variant="primary")

            # Fixed number of slots; unused ones stay hidden.
            model_labels = []
            response_boxes = []
            vote_btns = []

            with gr.Row():
                for i in range(MAX_MODELS):
                    with gr.Column():
                        label = gr.Markdown("**Model**", visible=False)
                        response = gr.Textbox(
                            label=" ",
                            lines=12,
                            interactive=False,
                            visible=False,
                            show_label=False,
                        )
                        vote = gr.Button(f"Vote #{i+1}", visible=False, interactive=False)
                        model_labels.append(label)
                        response_boxes.append(response)
                        vote_btns.append(vote)

            with gr.Row():
                judge_btn = gr.Button("Run LLM-as-a-Judge", visible=False)
                tie_btn = gr.Button("It's a tie / both good", visible=False)

            judge_output = gr.JSON(label="LLM Judge Evaluation", visible=False)
            auto_metrics = gr.JSON(label="Automatic Metrics (if reference provided)", visible=False)

            error_msg = gr.Markdown(visible=False)

        with gr.Tab("🏆 Leaderboard"):
            leaderboard_df = gr.DataFrame(
                value=load_leaderboard(),
                interactive=False,
                label="Live Leaderboard",
            )
            refresh_btn = gr.Button("Refresh Leaderboard")

        # ────────────────────────────────────────────────
        # Helpers
        # ────────────────────────────────────────────────
        def update_visibility(selected_names):
            """Toggle widget visibility to match the current selection.

            Returns 22 updates in wiring order: 6 labels, 6 response
            boxes, 6 vote buttons, judge_btn, tie_btn, judge_output,
            auto_metrics.
            """
            if not isinstance(selected_names, list):
                selected_names = []

            n = len(selected_names)
            if n == 0:
                return tuple([gr.update(visible=False)] * 22)

            labels, boxes, votes = [], [], []
            for i in range(MAX_MODELS):
                if i < n:
                    labels.append(gr.update(visible=True, value=f"**{selected_names[i]}**"))
                    boxes.append(gr.update(visible=True))
                    # Vote buttons become interactive only after generation.
                    votes.append(gr.update(visible=True, interactive=False))
                else:
                    labels.append(gr.update(visible=False, value=""))
                    boxes.append(gr.update(visible=False))
                    votes.append(gr.update(visible=False, interactive=False))

            controls_visible = n >= 2  # judging/tie need a comparison
            return (
                *labels,
                *boxes,
                *votes,
                gr.update(visible=controls_visible),  # judge_btn
                gr.update(visible=controls_visible),  # tie_btn
                gr.update(visible=False),             # judge_output
                gr.update(visible=False),             # auto_metrics
            )

        def generate_responses(selected_names, user_prompt):
            """Call every selected model and fill the response boxes.

            Returns 23 updates: 6 responses, 6 labels (unchanged), 6 vote
            buttons, judge_btn, tie_btn, judge_output, auto_metrics,
            error_msg.
            """
            # Guard: a cleared Textbox can deliver None, and .strip()
            # would raise on it.
            user_prompt = (user_prompt or "").strip()
            if not selected_names or not user_prompt:
                return (
                    *[""] * MAX_MODELS,
                    *[gr.update() for _ in range(MAX_MODELS)],  # labels unchanged
                    *[gr.update(interactive=False) for _ in range(MAX_MODELS)],
                    gr.update(visible=False),  # judge_btn
                    gr.update(visible=False),  # tie_btn
                    gr.update(visible=False),  # judge_output
                    gr.update(visible=False),  # auto_metrics
                    gr.update(value="Please select models and write a prompt.", visible=True),
                )

            selected_models = [m for m in MODELS if m["name"] in selected_names]
            responses = []
            for m in selected_models:
                try:
                    responses.append(generate_response(m, user_prompt))
                except Exception as e:
                    # Surface the failure in that model's box; keep going.
                    responses.append(f"**Generation failed:** {str(e)}")

            padded_responses = responses + [""] * (MAX_MODELS - len(responses))
            vote_updates = [
                gr.update(interactive=(i < len(responses)))
                for i in range(MAX_MODELS)
            ]
            can_compare = len(responses) >= 2
            return (
                *padded_responses,
                *[gr.update() for _ in range(MAX_MODELS)],  # labels unchanged
                *vote_updates,
                gr.update(visible=can_compare),  # judge_btn
                gr.update(visible=can_compare),  # tie_btn
                gr.update(visible=False),        # reset judge_output
                gr.update(visible=False),        # reset auto_metrics
                gr.update(visible=False),        # clear error banner
            )

        def run_judge(selected_names, prompt_text, *responses):
            """Ask JUDGE_MODEL to compare the visible responses.

            Returns updates for (judge_output, auto_metrics).
            """
            active_names, active_responses = [], []
            for name, resp in zip(selected_names, responses):
                # strip() already rejects whitespace-only responses.
                if resp and resp.strip():
                    active_names.append(name)
                    active_responses.append(resp)

            if len(active_responses) < 2 or not (prompt_text or "").strip():
                return (
                    gr.update(value={"error": "Need at least 2 valid responses and a prompt"}, visible=True),
                    gr.update(visible=False),
                )

            try:
                formatted_prompt = JUDGE_PROMPT.format(
                    prompt=prompt_text,
                    responses="\n\n".join(
                        f"[{name}]\n{resp}" for name, resp in zip(active_names, active_responses)
                    ),
                )
                client = InferenceClient(model=JUDGE_MODEL)
                raw_output = client.text_generation(
                    formatted_prompt,
                    max_new_tokens=1200,
                    temperature=0.7,
                )
                # Placeholder parsing — swap in utils.parse_json(raw_output)
                # when available. Only mark truncation when it happened.
                if len(raw_output) > 500:
                    preview = raw_output[:500] + "..."
                else:
                    preview = raw_output
                parsed = {"raw": preview}
                return (
                    gr.update(value=parsed, visible=True),
                    gr.update(visible=False),  # auto_metrics — implement when ready
                )
            except Exception as e:
                return (
                    gr.update(value={"error": str(e)}, visible=True),
                    gr.update(visible=False),
                )

        def record_human_vote(winner_index, selected_names, prompt_text, *responses):
            """Persist a human vote and return the refreshed leaderboard.

            ``winner_index`` is the ORIGINAL slot index of the clicked
            vote button (-1 for a tie). Empty slots are filtered out
            below, so the slot index must be remapped onto the filtered
            list — indexing the filtered list directly with the slot
            index could name the wrong model or raise IndexError.
            """
            if winner_index is None or not selected_names:
                return leaderboard_df.value

            # Keep each entry's original slot so votes can be remapped.
            active = [
                (slot, name, resp)
                for slot, (name, resp) in enumerate(zip(selected_names, responses))
                if resp and resp.strip()
            ]
            if not active:
                return leaderboard_df.value

            active_models = [name for _, name, _ in active]
            active_responses = [resp for _, _, resp in active]

            if winner_index < 0:
                winner_name = "tie"
            else:
                slot_to_pos = {slot: pos for pos, (slot, _, _) in enumerate(active)}
                if winner_index not in slot_to_pos:
                    # Vote arrived for an empty/failed slot — ignore it.
                    return leaderboard_df.value
                winner_index = slot_to_pos[winner_index]
                winner_name = active_models[winner_index]

            vote_data = {
                "timestamp": pd.Timestamp.now().isoformat(),
                "prompt": prompt_text,
                "models": active_models,
                "responses": active_responses,
                "human_winner_idx": winner_index,
                "human_winner_name": winner_name,
            }

            try:
                save_vote(vote_data)
                return load_leaderboard()
            except Exception as e:
                return pd.DataFrame({"error": [str(e)]})

        # ────────────────────────────────────────────────
        # Event bindings
        # ────────────────────────────────────────────────
        model_select.change(
            update_visibility,
            inputs=model_select,
            outputs=[
                *model_labels,
                *response_boxes,
                *vote_btns,
                judge_btn,
                tie_btn,
                judge_output,
                auto_metrics,
            ],
        )

        generate_btn.click(
            generate_responses,
            inputs=[model_select, prompt],
            outputs=[
                *response_boxes,
                *model_labels,
                *vote_btns,
                judge_btn,
                tie_btn,
                judge_output,
                auto_metrics,
                error_msg,
            ],
        )

        judge_btn.click(
            run_judge,
            inputs=[model_select, prompt, *response_boxes],
            outputs=[judge_output, auto_metrics],
        )

        # One handler for all vote buttons; gr.State carries the slot index.
        for idx, btn in enumerate(vote_btns):
            btn.click(
                record_human_vote,
                inputs=[gr.State(idx), model_select, prompt, *response_boxes],
                outputs=leaderboard_df,
            )

        tie_btn.click(
            record_human_vote,
            inputs=[gr.State(-1), model_select, prompt, *response_boxes],
            outputs=leaderboard_df,
        )

        refresh_btn.click(
            load_leaderboard,
            outputs=leaderboard_df,
        )

        # Sync widget visibility with the default selection on first load.
        demo.load(
            update_visibility,
            inputs=model_select,
            outputs=[
                *model_labels,
                *response_boxes,
                *vote_btns,
                judge_btn,
                tie_btn,
                judge_output,
                auto_metrics,
            ],
        )

    demo.launch()


if __name__ == "__main__":
    main()