celikn committed on
Commit
3396062
·
verified ·
1 Parent(s): f591de0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -99
app.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  #!/usr/bin/env python3
3
  # -*- coding: utf-8 -*-
4
  """
@@ -172,101 +171,3 @@ def hf_judge(model_name, prompt, candidate, reference=None, rubric=None, max_new
172
  return json.loads(m.group(0)) if m else {"raw": text}
173
  except Exception as e:
174
  return {"error": str(e)}
175
-
176
-
177
# ---------------- Gradio UI ---------------- #
with gr.Blocks() as demo:
    gr.Markdown("# LLM Benchmarking App (Hugging Face)")
    gr.Markdown("Upload config.yaml and dataset.jsonl, select task, and run benchmark.")

    with gr.Row():
        config_file = gr.File(label="Upload Config (YAML)", type="filepath")
        dataset_file = gr.File(label="Upload Dataset (JSONL)", type="filepath")

    task = gr.Textbox(label="Task (e.g., qa, summarization, classification)")
    use_judge = gr.Checkbox(label="Use Judge Model?", value=False)
    judge_model = gr.Textbox(label="Judge Model (HF repo id)", value="mistralai/Mistral-7B-Instruct")
    rubric = gr.Textbox(label="Judge Rubric", value="Evaluate relevance, factuality, clarity. Return JSON.")

    run_btn = gr.Button("Run Benchmark")

    results_table = gr.Dataframe(headers=[
        "model", "id", "task", "prediction", "reference", "latency_seconds",
        "exact_match", "f1", "rougeL_f", "bleu",
        "judge_relevance", "judge_factuality", "judge_clarity", "judge_overall"
    ], label="Results")

    summary_box = gr.Textbox(label="Summary", lines=10)
    download_csv = gr.File(label="Download CSV")

    def run_benchmark(config_path, dataset_path, task, use_judge=False, judge_model=None, rubric=None):
        """Run every configured model over the uploaded dataset and score it.

        Parameters
        ----------
        config_path : str
            Path to the uploaded YAML config (expects ``models`` and
            ``prompt_templates`` keys).
        dataset_path : str
            Path to the uploaded JSONL dataset, one example per line.
        task : str
            Task name used to pick a prompt template and metric set.
        use_judge : bool
            When true and ``judge_model`` is set, also score each prediction
            with an LLM judge via ``hf_judge``.
        judge_model : str | None
            HF repo id of the judge model.
        rubric : str | None
            Free-text rubric passed to the judge.

        Returns
        -------
        (pandas.DataFrame | None, str, str | None)
            Results table, human-readable summary text, and the path of the
            written CSV — the shapes the three Gradio outputs expect.
        """
        if not config_path or not dataset_path:
            return None, "Error: Please upload both files", None

        # Use context managers so the uploaded file handles are closed
        # promptly (the previous version leaked them via bare open().read()).
        with open(config_path, "r", encoding="utf-8") as fh:
            cfg = yaml.safe_load(fh)
        with open(dataset_path, "r", encoding="utf-8") as fh:
            data = [json.loads(line) for line in fh if line.strip()]

        # Fall back to echoing the raw text if no template exists for `task`.
        template = cfg.get("prompt_templates", {}).get(task, "{{text}}")

        results = []
        for m in cfg.get("models", []):
            model_name = m["name"]
            params = m.get("params", {})
            max_tokens = params.get("max_tokens", 256)
            temperature = params.get("temperature", 0.2)

            for ex in data:
                # Fill {{placeholder}} slots in the template from the example.
                prompt = template
                for key in ("question", "context", "text", "labels"):
                    prompt = prompt.replace(f"{{{{{key}}}}}", str(ex.get(key, "")))

                prediction, latency = hf_generate(
                    model_name, prompt,
                    max_new_tokens=max_tokens, temperature=temperature,
                )
                metrics = compute_metrics(task, prediction, ex.get("reference", ""))

                row = {
                    "model": model_name,
                    "id": ex.get("id", ""),
                    "task": task,
                    "prompt": prompt,
                    "prediction": prediction,
                    "reference": ex.get("reference", ""),
                    "latency_seconds": latency,
                    **metrics,
                }

                if use_judge and judge_model:
                    # hf_judge may return a non-dict error payload; only merge dicts.
                    scores = hf_judge(judge_model, prompt, prediction, ex.get("reference", ""), rubric)
                    if isinstance(scores, dict):
                        for k, v in scores.items():
                            row[f"judge_{k}"] = v

                results.append(row)

        df = pd.DataFrame(results)
        csv_path = "results.csv"
        df.to_csv(csv_path, index=False)

        # Guard the empty case: indexing df["model"] on an empty frame raises.
        if df.empty:
            return df, "No results produced (empty dataset or no models configured).", csv_path

        # Per-model summary. unique() preserves first-seen order, so the
        # summary is deterministic (iterating a set reordered it every run).
        summary = []
        for model_name in df["model"].unique():
            sub = df[df["model"] == model_name]
            summary.append(f"## {model_name}")
            summary.append(f"Samples: {len(sub)}")
            for metric in ["exact_match", "f1", "rougeL_f", "bleu", "judge_overall"]:
                if metric in sub.columns:
                    vals = [v for v in sub[metric] if isinstance(v, (int, float))]
                    if vals:
                        summary.append(f"{metric}: mean={sum(vals)/len(vals):.4f}")
            summary.append(f"Latency mean: {sum(sub['latency_seconds'])/len(sub):.3f}s\n")

        return df, "\n".join(summary), csv_path

    run_btn.click(
        run_benchmark,
        inputs=[config_file, dataset_file, task, use_judge, judge_model, rubric],
        outputs=[results_table, summary_box, download_csv]
    )

demo.launch()
 
 
1
  #!/usr/bin/env python3
2
  # -*- coding: utf-8 -*-
3
  """
 
171
  return json.loads(m.group(0)) if m else {"raw": text}
172
  except Exception as e:
173
  return {"error": str(e)}