workbykait committed on
Commit
67dfd7b
·
verified ·
1 Parent(s): fdb29e0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +292 -227
app.py CHANGED
@@ -1,245 +1,310 @@
1
- # app.py (corrected)
2
-
3
  import gradio as gr
 
 
4
  from prompts import JUDGE_PROMPT
5
  from inference import generate_response
6
  from leaderboard import load_leaderboard, save_vote
7
  from models_config import MODELS, JUDGE_MODEL
8
- import utils # your ELO, BERTScore, JSON parsing helpers
9
- import pandas as pd
10
 
11
- # We keep 6 fixed output boxes β†’ easier dynamic slicing + consistent layout
12
  MAX_MODELS = 6
13
 
14
- with gr.Blocks(title="LLM Judge Arena") as demo:
15
- gr.Markdown("# 🏟️ LLM Judge Arena\nSide-by-side + LLM Judge + Human Votes + Live HF Leaderboard")
16
-
17
- with gr.Tab("βš”οΈ Arena"):
18
- with gr.Row():
19
- model_select = gr.Dropdown(
20
- choices=[m["name"] for m in MODELS],
21
- value=[m["name"] for m in MODELS[:4]],
22
- multiselect=True,
23
- label="Select 4–6 Models",
24
- max_choices=6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  )
26
-
27
- prompt = gr.Textbox(label="Your prompt", lines=4, placeholder="Write or paste your question here...")
28
- ref_answer = gr.Textbox(label="Optional reference answer (enables BERTScore)", lines=3)
29
-
30
- generate_btn = gr.Button("Generate Responses", variant="primary")
31
-
32
- # Fixed: always show MAX_MODELS columns (hide inactive ones via visibility)
33
- response_boxes = []
34
- model_labels = []
35
- with gr.Row():
36
- for i in range(MAX_MODELS):
37
- with gr.Column():
38
- lbl = gr.Markdown(f"**Model {i+1}**", visible=False)
39
- resp = gr.Textbox(label=" ", lines=12, interactive=False, visible=False)
40
- model_labels.append(lbl)
41
- response_boxes.append(resp)
42
-
43
- with gr.Row():
44
- judge_btn = gr.Button("Run LLM-as-a-Judge", variant="secondary")
45
  vote_btns = []
46
- for i in range(MAX_MODELS):
47
- btn = gr.Button(f"Vote #{i+1}", visible=False, interactive=False)
48
- vote_btns.append(btn)
49
- tie_btn = gr.Button("It's a tie", visible=False)
50
-
51
- judge_output = gr.JSON(label="LLM Judge Evaluation", visible=False)
52
- auto_metrics = gr.JSON(label="Automatic Metrics", visible=False)
53
-
54
- with gr.Tab("πŸ† Leaderboard"):
55
- leaderboard_df = gr.DataFrame(
56
- value=load_leaderboard(),
57
- label="Live Leaderboard (ELO + wins)",
58
- interactive=False
59
- )
60
- refresh_btn = gr.Button("Refresh Leaderboard")
61
-
62
- # ────────────────────────────────────────────────
63
- # Helper to update visible components
64
- # ────────────────────────────────────────────────
65
- def update_visible_components(selected_names):
66
- # selected_names is list of str, e.g. ["Llama-3.1-8B", "Gemma-2-9B"]
67
- # We assume order in selected_names matches desired display order
68
-
69
- n_selected = len(selected_names)
70
- if n_selected == 0:
71
- return tuple([gr.update(visible=False)] * 22)
72
-
73
- # Prepare updates for each group
74
- label_updates = []
75
- box_updates = []
76
- vote_updates = []
77
-
78
- # For the first n_selected models: show + set name
79
- for i in range(n_selected):
80
- label_updates.append(gr.update(visible=True, value=f"**{selected_names[i]}**"))
81
- box_updates.append(gr.update(visible=True))
82
- vote_updates.append(gr.update(visible=True, interactive=False)) # enable after generation
83
-
84
- # Remaining slots: hide everything
85
- for i in range(n_selected, MAX_MODELS):
86
- label_updates.append(gr.update(visible=False, value=""))
87
- box_updates.append(gr.update(visible=False))
88
- vote_updates.append(gr.update(visible=False, interactive=False))
89
-
90
- # Global controls β€” show judge/tie only if β‰₯ 2 models selected
91
- judge_visible = n_selected >= 2
92
- common_json = gr.update(visible=False)
93
-
94
- return (
95
- *label_updates, # 6 Markdown labels
96
- *box_updates, # 6 Textbox responses
97
- *vote_updates, # 6 Vote buttons
98
- gr.update(visible=judge_visible), # judge_btn
99
- gr.update(visible=judge_visible), # tie_btn
100
- common_json, # judge_output JSON
101
- common_json # auto_metrics JSON
102
- )
103
-
104
- # Run once on load + every time selection changes
105
- model_select.change(
106
- update_visible_components,
107
- inputs=model_select,
108
- outputs=[
109
- *model_labels,
110
- *response_boxes,
111
- *vote_btns,
112
- judge_btn,
113
- tie_btn,
114
- judge_output,
115
- auto_metrics
116
- ]
117
- )
118
-
119
- # ────────────────────────────────────────────────
120
- # Generate responses
121
- # ────────────────────────────────────────────────
122
- def on_generate(selected_names, user_prompt):
123
- if not selected_names:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  return (
125
- *[""] * MAX_MODELS, # responses
126
- *[""] * MAX_MODELS, # labels (already set)
127
- *[gr.update(interactive=False) for _ in range(MAX_MODELS)], # vote buttons
128
- gr.update(visible=False), judge_output,
129
- gr.update(visible=False), auto_metrics
 
 
 
130
  )
131
-
132
- selected = [m for m in MODELS if m["name"] in selected_names]
133
- responses = []
134
- for m in selected:
 
 
 
 
 
 
 
 
 
 
 
 
135
  try:
136
- resp = generate_response(m, user_prompt.strip())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  except Exception as e:
138
- resp = f"Error: {str(e)}"
139
- responses.append(resp)
140
-
141
- # Pad to MAX_MODELS
142
- padded = responses + [""] * (MAX_MODELS - len(responses))
143
-
144
- vote_updates = [gr.update(interactive=True) for _ in range(len(selected))] + \
145
- [gr.update(interactive=False) for _ in range(MAX_MODELS - len(selected))]
146
-
147
- return (
148
- *padded, # response boxes
149
- *[gr.update() for _ in range(MAX_MODELS)], # labels unchanged
150
- *vote_updates, # vote buttons
151
- gr.update(visible=True), # judge btn
152
- gr.update(visible=True), # tie
153
- gr.update(visible=False), # judge_output reset
154
- gr.update(visible=False), # auto_metrics reset
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  )
156
-
157
- generate_btn.click(
158
- on_generate,
159
- inputs=[model_select, prompt],
160
- outputs=[
161
- *response_boxes,
162
- *model_labels, # usually no change here
163
- *vote_btns,
164
- judge_btn,
165
- tie_btn,
166
- judge_output,
167
- auto_metrics
168
- ]
169
- )
170
-
171
- # ────────────────────────────────────────────────
172
- # Run judge (simplified – expand as needed)
173
- # ────────────────────────────────────────────────
174
- def run_judge(selected_names, prompt_text, *responses):
175
- active_responses = [r for r, n in zip(responses, selected_names) if n]
176
- active_names = selected_names
177
-
178
- if len(active_responses) < 2 or not prompt_text.strip():
179
- return gr.update(value={"error": "Not enough responses or empty prompt"}), \
180
- gr.update(visible=False)
181
-
182
- try:
183
- formatted = JUDGE_PROMPT.format(
184
- prompt=prompt_text,
185
- responses="\n\n".join([f"[{n}]\n{r}" for n, r in zip(active_names, active_responses)])
186
  )
187
- client = InferenceClient(model=JUDGE_MODEL)
188
- raw = client.text_generation(formatted, max_new_tokens=1200, temperature=0.7)
189
- scores = utils.parse_json(raw) # your safe json parser
190
- bert = utils.compute_bertscore(active_responses, ref_answer.value) if ref_answer.value else {}
191
- result = {"judge": scores, "auto": bert}
192
- return gr.update(value=result, visible=True), gr.update(value=bert, visible=True)
193
- except Exception as e:
194
- return gr.update(value={"error": str(e)}, visible=True), gr.update(visible=False)
195
-
196
- judge_btn.click(
197
- run_judge,
198
- inputs=[model_select, prompt, *response_boxes],
199
- outputs=[judge_output, auto_metrics]
200
- )
201
-
202
- # ────────────────────────────────────────────────
203
- # Voting (example – one function for all buttons)
204
- # ────────────────────────────────────────────────
205
- def record_vote(winner_idx, selected_names, prompt_text, *responses):
206
- if winner_idx is None or not selected_names:
207
- return leaderboard_df.value
208
-
209
- active_models = selected_names
210
- active_responses = [r for r, n in zip(responses, selected_names) if n]
211
-
212
- vote_data = {
213
- "timestamp": pd.Timestamp.now().isoformat(),
214
- "prompt": prompt_text,
215
- "models": active_models,
216
- "responses": active_responses,
217
- "human_winner_idx": winner_idx,
218
- "human_winner_name": active_models[winner_idx] if winner_idx < len(active_models) else "tie",
219
- # add judge scores later if you want
220
- }
221
- save_vote(vote_data)
222
- return load_leaderboard()
223
-
224
- # Bind each vote button
225
- for i, btn in enumerate(vote_btns):
226
- btn.click(
227
- record_vote,
228
- inputs=[gr.State(i), model_select, prompt, *response_boxes],
229
  outputs=leaderboard_df
230
  )
231
-
232
- tie_btn.click(
233
- record_vote,
234
- inputs=[gr.State(-1), model_select, prompt, *response_boxes], # -1 = tie
235
- outputs=leaderboard_df
236
- )
237
-
238
- refresh_btn.click(load_leaderboard, outputs=leaderboard_df)
239
-
240
- # Trigger initial visibility update
241
- demo.load(update_visible_components, inputs=model_select, outputs=[
242
- *model_labels, *response_boxes, *vote_btns, judge_btn, tie_btn, judge_output, auto_metrics
243
- ])
244
-
245
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import pandas as pd
3
+ from huggingface_hub import InferenceClient
4
  from prompts import JUDGE_PROMPT
5
  from inference import generate_response
6
  from leaderboard import load_leaderboard, save_vote
7
  from models_config import MODELS, JUDGE_MODEL
8
+ # import utils # uncomment when you have parsing + bertscore functions
 
9
 
 
10
  MAX_MODELS = 6
11
 
12
def main():
    """Build and launch the LLM Judge Arena Gradio app.

    Two tabs:
      * Arena — pick 2–6 models, generate side-by-side responses, run an
        LLM-as-a-Judge comparison, and cast human votes.
      * Leaderboard — live table backed by ``load_leaderboard``.

    A fixed pool of MAX_MODELS label/response/vote widgets is created up
    front and selection changes only toggle visibility, so the output
    wiring of every event handler stays stable.
    """
    with gr.Blocks(title="LLM Judge Arena") as demo:
        gr.Markdown(
            "# 🏟️ LLM Judge Arena\n"
            "Side-by-side comparison + LLM-as-a-Judge + Human votes + Live leaderboard"
        )

        with gr.Tab("⚔️ Arena"):
            with gr.Row():
                model_select = gr.Dropdown(
                    choices=[m["name"] for m in MODELS],
                    value=[m["name"] for m in MODELS[:min(4, len(MODELS))]],
                    multiselect=True,
                    label="Select 2–6 models to compare",
                    max_choices=6,
                    interactive=True,
                )

            prompt = gr.Textbox(
                label="Your prompt",
                lines=4,
                placeholder="Type or paste your question / instruction here...",
            )

            # Currently unused by the callbacks — reserved for BERTScore
            # once utils.compute_bertscore is wired in.
            ref_answer = gr.Textbox(
                label="Optional reference answer (used for BERTScore etc.)",
                lines=3,
                placeholder="(leave empty if no reference)",
            )

            generate_btn = gr.Button("Generate Responses", variant="primary")

            # Fixed number of slots; unused ones stay hidden.
            model_labels = []
            response_boxes = []
            vote_btns = []

            with gr.Row():
                for i in range(MAX_MODELS):
                    with gr.Column():
                        label = gr.Markdown("**Model**", visible=False)
                        response = gr.Textbox(
                            label=" ",
                            lines=12,
                            interactive=False,
                            visible=False,
                            show_label=False,
                        )
                        vote = gr.Button(f"Vote #{i+1}", visible=False, interactive=False)
                        model_labels.append(label)
                        response_boxes.append(response)
                        vote_btns.append(vote)

            with gr.Row():
                judge_btn = gr.Button("Run LLM-as-a-Judge", visible=False)
                tie_btn = gr.Button("It's a tie / both good", visible=False)

            judge_output = gr.JSON(label="LLM Judge Evaluation", visible=False)
            auto_metrics = gr.JSON(label="Automatic Metrics (if reference provided)", visible=False)

            error_msg = gr.Markdown(visible=False)

        with gr.Tab("🏆 Leaderboard"):
            leaderboard_df = gr.DataFrame(
                value=load_leaderboard(),
                interactive=False,
                label="Live Leaderboard",
            )
            refresh_btn = gr.Button("Refresh Leaderboard")

        # ────────────────────────────────────────────────
        # Helpers
        # ────────────────────────────────────────────────
        def update_visibility(selected_names):
            """Toggle widget visibility to match the current selection.

            Returns 22 updates in wiring order: 6 labels, 6 response
            boxes, 6 vote buttons, judge_btn, tie_btn, judge_output,
            auto_metrics.
            """
            if not isinstance(selected_names, list):
                selected_names = []

            n = len(selected_names)
            if n == 0:
                return tuple([gr.update(visible=False)] * 22)

            labels, boxes, votes = [], [], []
            for i in range(MAX_MODELS):
                if i < n:
                    labels.append(gr.update(visible=True, value=f"**{selected_names[i]}**"))
                    boxes.append(gr.update(visible=True))
                    # Vote buttons become interactive only after generation.
                    votes.append(gr.update(visible=True, interactive=False))
                else:
                    labels.append(gr.update(visible=False, value=""))
                    boxes.append(gr.update(visible=False))
                    votes.append(gr.update(visible=False, interactive=False))

            controls_visible = n >= 2  # judging/tie need a comparison
            return (
                *labels,
                *boxes,
                *votes,
                gr.update(visible=controls_visible),  # judge_btn
                gr.update(visible=controls_visible),  # tie_btn
                gr.update(visible=False),             # judge_output
                gr.update(visible=False),             # auto_metrics
            )

        def generate_responses(selected_names, user_prompt):
            """Call every selected model and fill the response boxes.

            Returns 23 updates: 6 responses, 6 labels (unchanged), 6 vote
            buttons, judge_btn, tie_btn, judge_output, auto_metrics,
            error_msg.
            """
            # Guard: a cleared Textbox can deliver None, and .strip()
            # would raise on it.
            user_prompt = (user_prompt or "").strip()
            if not selected_names or not user_prompt:
                return (
                    *[""] * MAX_MODELS,
                    *[gr.update() for _ in range(MAX_MODELS)],  # labels unchanged
                    *[gr.update(interactive=False) for _ in range(MAX_MODELS)],
                    gr.update(visible=False),  # judge_btn
                    gr.update(visible=False),  # tie_btn
                    gr.update(visible=False),  # judge_output
                    gr.update(visible=False),  # auto_metrics
                    gr.update(value="Please select models and write a prompt.", visible=True),
                )

            selected_models = [m for m in MODELS if m["name"] in selected_names]
            responses = []
            for m in selected_models:
                try:
                    responses.append(generate_response(m, user_prompt))
                except Exception as e:
                    # Surface the failure in that model's box; keep going.
                    responses.append(f"**Generation failed:** {str(e)}")

            padded_responses = responses + [""] * (MAX_MODELS - len(responses))
            vote_updates = [
                gr.update(interactive=(i < len(responses)))
                for i in range(MAX_MODELS)
            ]
            can_compare = len(responses) >= 2
            return (
                *padded_responses,
                *[gr.update() for _ in range(MAX_MODELS)],  # labels unchanged
                *vote_updates,
                gr.update(visible=can_compare),  # judge_btn
                gr.update(visible=can_compare),  # tie_btn
                gr.update(visible=False),        # reset judge_output
                gr.update(visible=False),        # reset auto_metrics
                gr.update(visible=False),        # clear error banner
            )

        def run_judge(selected_names, prompt_text, *responses):
            """Ask JUDGE_MODEL to compare the visible responses.

            Returns updates for (judge_output, auto_metrics).
            """
            active_names, active_responses = [], []
            for name, resp in zip(selected_names, responses):
                # strip() already rejects whitespace-only responses.
                if resp and resp.strip():
                    active_names.append(name)
                    active_responses.append(resp)

            if len(active_responses) < 2 or not (prompt_text or "").strip():
                return (
                    gr.update(value={"error": "Need at least 2 valid responses and a prompt"}, visible=True),
                    gr.update(visible=False),
                )

            try:
                formatted_prompt = JUDGE_PROMPT.format(
                    prompt=prompt_text,
                    responses="\n\n".join(
                        f"[{name}]\n{resp}" for name, resp in zip(active_names, active_responses)
                    ),
                )
                client = InferenceClient(model=JUDGE_MODEL)
                raw_output = client.text_generation(
                    formatted_prompt,
                    max_new_tokens=1200,
                    temperature=0.7,
                )
                # Placeholder parsing — swap in utils.parse_json(raw_output)
                # when available. Only mark truncation when it happened.
                if len(raw_output) > 500:
                    preview = raw_output[:500] + "..."
                else:
                    preview = raw_output
                parsed = {"raw": preview}
                return (
                    gr.update(value=parsed, visible=True),
                    gr.update(visible=False),  # auto_metrics — implement when ready
                )
            except Exception as e:
                return (
                    gr.update(value={"error": str(e)}, visible=True),
                    gr.update(visible=False),
                )

        def record_human_vote(winner_index, selected_names, prompt_text, *responses):
            """Persist a human vote and return the refreshed leaderboard.

            ``winner_index`` is the ORIGINAL slot index of the clicked
            vote button (-1 for a tie). Empty slots are filtered out
            below, so the slot index must be remapped onto the filtered
            list — indexing the filtered list directly with the slot
            index could name the wrong model or raise IndexError.
            """
            if winner_index is None or not selected_names:
                return leaderboard_df.value

            # Keep each entry's original slot so votes can be remapped.
            active = [
                (slot, name, resp)
                for slot, (name, resp) in enumerate(zip(selected_names, responses))
                if resp and resp.strip()
            ]
            if not active:
                return leaderboard_df.value

            active_models = [name for _, name, _ in active]
            active_responses = [resp for _, _, resp in active]

            if winner_index < 0:
                winner_name = "tie"
            else:
                slot_to_pos = {slot: pos for pos, (slot, _, _) in enumerate(active)}
                if winner_index not in slot_to_pos:
                    # Vote arrived for an empty/failed slot — ignore it.
                    return leaderboard_df.value
                winner_index = slot_to_pos[winner_index]
                winner_name = active_models[winner_index]

            vote_data = {
                "timestamp": pd.Timestamp.now().isoformat(),
                "prompt": prompt_text,
                "models": active_models,
                "responses": active_responses,
                "human_winner_idx": winner_index,
                "human_winner_name": winner_name,
            }

            try:
                save_vote(vote_data)
                return load_leaderboard()
            except Exception as e:
                return pd.DataFrame({"error": [str(e)]})

        # ────────────────────────────────────────────────
        # Event bindings
        # ────────────────────────────────────────────────
        model_select.change(
            update_visibility,
            inputs=model_select,
            outputs=[
                *model_labels,
                *response_boxes,
                *vote_btns,
                judge_btn,
                tie_btn,
                judge_output,
                auto_metrics,
            ],
        )

        generate_btn.click(
            generate_responses,
            inputs=[model_select, prompt],
            outputs=[
                *response_boxes,
                *model_labels,
                *vote_btns,
                judge_btn,
                tie_btn,
                judge_output,
                auto_metrics,
                error_msg,
            ],
        )

        judge_btn.click(
            run_judge,
            inputs=[model_select, prompt, *response_boxes],
            outputs=[judge_output, auto_metrics],
        )

        # One handler for all vote buttons; gr.State carries the slot index.
        for idx, btn in enumerate(vote_btns):
            btn.click(
                record_human_vote,
                inputs=[gr.State(idx), model_select, prompt, *response_boxes],
                outputs=leaderboard_df,
            )

        tie_btn.click(
            record_human_vote,
            inputs=[gr.State(-1), model_select, prompt, *response_boxes],
            outputs=leaderboard_df,
        )

        refresh_btn.click(
            load_leaderboard,
            outputs=leaderboard_df,
        )

        # Sync widget visibility with the default selection on first load.
        demo.load(
            update_visibility,
            inputs=model_select,
            outputs=[
                *model_labels,
                *response_boxes,
                *vote_btns,
                judge_btn,
                tie_btn,
                judge_output,
                auto_metrics,
            ],
        )

    demo.launch()


if __name__ == "__main__":
    main()