celikn committed on
Commit
394176d
·
verified ·
1 Parent(s): 3396062

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -173
app.py CHANGED
@@ -1,173 +1,88 @@
1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Hugging Face Space: LLM Benchmarking App using Gradio
- Upload config.yaml and dataset.jsonl
- Select task
- Run benchmarking across multiple models
- Compute metrics: Exact Match, F1, ROUGE-L, BLEU
- Display results and allow CSV download
"""

import os
import time
import json
import yaml
import gradio as gr
import pandas as pd
from tqdm import tqdm
from huggingface_hub import login

# Hub token used for gated/private model access; empty string when unset.
HF_TOKEN = (os.environ.get("HUGGINGFACE_HUB_TOKEN", "") or "").strip()

if HF_TOKEN:
    login(token=HF_TOKEN)
else:
    # Warning (Turkish): "HF_TOKEN / HUGGINGFACE_HUB_TOKEN not found,
    # gated models may be inaccessible."
    print("UYARI: HF_TOKEN / HUGGINGFACE_HUB_TOKEN bulunamadı, gated modellere erişilemeyebilir.")

# Optional metrics: bind the name to None on ImportError so the metric
# helpers below can degrade gracefully instead of crashing at import time.
try:
    from rouge_score import rouge_scorer
except ImportError:
    rouge_scorer = None

try:
    import sacrebleu
except ImportError:
    sacrebleu = None
38
-
39
# ---------------- Metrics ---------------- #
def exact_match(pred, ref):
    """Return 1.0 when pred and ref match after strip+lowercase, else 0.0."""
    a = pred.strip().lower()
    b = ref.strip().lower()
    return 1.0 if a == b else 0.0
42
-
43
def token_f1(pred, ref):
    """Whitespace-token F1 between prediction and reference (case-insensitive).

    Both empty -> 1.0; exactly one empty -> 0.0.
    """
    p_toks = pred.lower().split()
    r_toks = ref.lower().split()
    if not p_toks and not r_toks:
        return 1.0
    if not p_toks or not r_toks:
        return 0.0
    # Multiset overlap: each token counts up to its minimum frequency.
    overlap = 0
    for tok in set(p_toks):
        overlap += min(p_toks.count(tok), r_toks.count(tok))
    precision = overlap / len(p_toks)
    recall = overlap / len(r_toks)
    denom = precision + recall
    if not denom:
        return 0.0
    return 2 * precision * recall / denom
54
-
55
def rouge_l(pred, ref):
    """ROUGE-L F-measure of pred against ref; 0.0 if rouge_score is missing."""
    if rouge_scorer is None:
        return 0.0
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    scores = scorer.score(ref, pred)
    return scores["rougeL"].fmeasure
60
-
61
def bleu(pred, ref):
    """Corpus BLEU of a single pred/ref pair; 0.0 if sacrebleu is missing."""
    if sacrebleu is None:
        return 0.0
    return sacrebleu.corpus_bleu([pred], [[ref]]).score
65
-
66
def compute_metrics(task, prediction, reference):
    """Score *prediction* against *reference* with the metric set for *task*.

    qa/classification -> exact_match + f1; summarization/translation ->
    rougeL_f + bleu; anything else -> f1 only.
    """
    if task in ("qa", "classification"):
        return {
            "exact_match": exact_match(prediction, reference),
            "f1": token_f1(prediction, reference),
        }
    if task in ("summarization", "translation"):
        return {
            "rougeL_f": rouge_l(prediction, reference),
            "bleu": bleu(prediction, reference),
        }
    return {"f1": token_f1(prediction, reference)}
77
-
78
# ---------------- Hugging Face Inference ---------------- #
def hf_generate(model_name, prompt, max_new_tokens=256, temperature=0.2):
    """Run one generation on the HF Inference API.

    Returns (text, latency_seconds). On failure returns an in-band
    ("ERROR: ...", latency) pair instead of raising, so one bad call
    does not abort a whole benchmark run.
    """
    from huggingface_hub import InferenceClient
    client = InferenceClient(model=model_name, token=HF_TOKEN)
    start = time.time()
    try:
        # Lowercase before matching so e.g. "google/FLAN-T5-xl" also routes
        # to the text2text endpoint (the original check was case-sensitive).
        lowered = model_name.lower()
        if "flan" in lowered or "t5" in lowered:
            output = client.text2text_generation(prompt, max_new_tokens=max_new_tokens)
        else:
            output = client.text_generation(prompt, max_new_tokens=max_new_tokens, temperature=temperature)
        latency = time.time() - start
        return output.strip(), latency
    except Exception as e:
        return f"ERROR: {e}", time.time() - start
92
-
93
-
94
# ---------------- Benchmark Function ---------------- #
def benchmark(config_text, dataset_text, task):
    """Run every model from the YAML config over the JSONL dataset for *task*.

    config_text: YAML with "models" (list of {name, params}) and
        "prompt_templates" (task -> template with {{var}} placeholders).
    dataset_text: JSONL, one example per line.
    Returns (results_dataframe, markdown_summary_string).
    """
    cfg = yaml.safe_load(config_text)
    data = [json.loads(line) for line in dataset_text.splitlines() if line.strip()]

    models = cfg.get("models", [])
    templates = cfg.get("prompt_templates", {})
    template = templates.get(task, "{{text}}")

    results = []
    for m in models:
        model_name = m["name"]
        params = m.get("params", {})
        max_tokens = params.get("max_tokens", 256)
        temperature = params.get("temperature", 0.2)
        for ex in tqdm(data, desc=model_name):
            # Substitute {{question}}/{{context}}/{{text}}/{{labels}} placeholders.
            variables = {k: ex.get(k, "") for k in ("question", "context", "text", "labels")}
            prompt = template
            for k, v in variables.items():
                prompt = prompt.replace(f"{{{{{k}}}}}", str(v))
            prediction, latency = hf_generate(model_name, prompt, max_new_tokens=max_tokens, temperature=temperature)
            metrics = compute_metrics(task, prediction, ex.get("reference", ""))
            results.append({
                "model": model_name,
                "id": ex.get("id", ""),
                "task": task,
                "prompt": prompt,
                "prediction": prediction,
                "reference": ex.get("reference", ""),
                "latency_seconds": latency,
                **metrics,
            })

    df = pd.DataFrame(results)
    # Guard: with no models or an empty dataset there is no "model" column,
    # and df["model"] below would raise KeyError.
    if df.empty:
        return df, ""

    summary = []
    # dict.fromkeys preserves first-seen order (set() did not), making the
    # summary deterministic across runs.
    for model_name in dict.fromkeys(df["model"]):
        sub = df[df["model"] == model_name]
        summary.append(f"## {model_name}")
        summary.append(f"Samples: {len(sub)}")
        for metric in ["exact_match", "f1", "rougeL_f", "bleu"]:
            if metric in sub.columns:
                vals = [v for v in sub[metric] if isinstance(v, (int, float))]
                if vals:
                    summary.append(f"{metric}: mean={sum(vals)/len(vals):.4f}")
        summary.append(f"Latency mean: {sum(sub['latency_seconds'])/len(sub):.3f}s\n")

    return df, "\n".join(summary)
141
-
142
-
143
def hf_judge(model_name, prompt, candidate, reference=None, rubric=None, max_new_tokens=256):
    """
    Calls a judge model to score candidate output. Returns a dict of scores.

    Rubric should instruct JSON output, e.g.:
    {"relevance": int, "factuality": int, "clarity": int, "overall": float}

    On failure returns {"error": "..."}; when no JSON object is found in
    the judge's reply, returns {"raw": <full reply text>}.
    """
    from huggingface_hub import InferenceClient
    # Authenticate with the module-level HF_TOKEN (sourced from
    # HUGGINGFACE_HUB_TOKEN), consistent with hf_generate — the original
    # read a separate "HF_TOKEN" env var that may be unset.
    client = InferenceClient(model=model_name, token=HF_TOKEN)

    rubric = rubric or (
        "Evaluate the candidate answer. Score 1–5 for:\n"
        "- Relevance: addresses the prompt\n"
        "- Factuality: correct and supported\n"
        "- Clarity: clear and well-structured\n"
        "Return JSON: {\"relevance\": int, \"factuality\": int, \"clarity\": int, \"overall\": float}"
    )

    judge_prompt = (
        f"{rubric}\n\nPrompt:\n{prompt}\n\n"
        f"Candidate:\n{candidate}\n\n"
        f"Reference (if available):\n{reference if reference is not None else 'N/A'}\n"
    )

    try:
        # temperature=0.0 keeps the judging deterministic.
        text = client.text_generation(judge_prompt, max_new_tokens=max_new_tokens, temperature=0.0)
        # Attempt to parse a JSON object anywhere in the response:
        import re, json
        m = re.search(r'\{.*\}', text, re.S)
        return json.loads(m.group(0)) if m else {"raw": text}
    except Exception as e:
        return {"error": str(e)}
 
1
+
2
# ---------------- Gradio UI ---------------- #
with gr.Blocks() as demo:
    gr.Markdown("# LLM Benchmarking App (Hugging Face)")
    gr.Markdown("Upload config.yaml and dataset.jsonl, select task, and run benchmark.")

    with gr.Row():
        config_file = gr.File(label="Upload Config (YAML)", type="filepath")
        dataset_file = gr.File(label="Upload Dataset (JSONL)", type="filepath")

    task = gr.Dropdown(choices=["qa", "summarization", "classification", "conversation"], label="Select Task")
    use_judge = gr.Checkbox(label="Enable Judge Scoring?", value=False)
    run_btn = gr.Button("Run Benchmark")

    results_table = gr.Dataframe(headers=[
        "model","id","task","prompt","prediction","reference","latency_seconds",
        "exact_match","f1","rougeL_f","bleu","judge_overall"
    ], label="Results")

    summary_box = gr.Textbox(label="Summary", lines=10)
    download_csv = gr.File(label="Download CSV")

    def run_benchmark(config_path, dataset_path, task, use_judge):
        """Gradio callback: load config + dataset, run every configured model,
        optionally judge-score each row, and return (dataframe, summary, csv_path)."""
        if not config_path or not dataset_path:
            return None, "Error: Please upload both files", None

        # Context managers so the uploaded temp files are closed promptly
        # (the original left both file handles open).
        with open(config_path, "r", encoding="utf-8") as f:
            config_text = f.read()
        with open(dataset_path, "r", encoding="utf-8") as f:
            dataset_text = f.read()
        cfg = yaml.safe_load(config_text)
        data = [json.loads(line) for line in dataset_text.splitlines() if line.strip()]
        template = cfg.get("prompt_templates", {}).get(task, "{{text}}")
        judge_cfg = cfg.get("judge", {})

        results = []
        for m in cfg.get("models", []):
            model_name = m["name"]
            max_tokens = m.get("params", {}).get("max_tokens", 256)
            temperature = m.get("params", {}).get("temperature", 0.2)

            for ex in data:
                # Substitute {{question}}/{{context}}/{{text}}/{{labels}} placeholders.
                variables = {k: ex.get(k, "") for k in ("question", "context", "text", "labels")}
                prompt = template
                for k, v in variables.items():
                    prompt = prompt.replace(f"{{{{{k}}}}}", str(v))

                prediction, latency = hf_generate(model_name, prompt, max_new_tokens=max_tokens, temperature=temperature)
                metrics = compute_metrics(task, prediction, ex.get("reference", ""))

                row = {
                    "model": model_name,
                    "id": ex.get("id", ""),
                    "task": task,
                    "prompt": prompt,
                    "prediction": prediction,
                    "reference": ex.get("reference", ""),
                    "latency_seconds": latency,
                    **metrics,
                }

                # Judge scoring only when both the UI checkbox and the
                # config's judge.enabled flag are on.
                if use_judge and judge_cfg.get("enabled"):
                    scores = hf_judge(judge_cfg.get("model"), prompt, prediction, ex.get("reference", ""), judge_cfg.get("rubric"))
                    if isinstance(scores, dict):
                        for k, v in scores.items():
                            row[f"judge_{k}"] = v

                results.append(row)

        df = pd.DataFrame(results)
        # Guard: an empty result set has no "model" column, and the
        # grouping below would raise KeyError.
        if df.empty:
            return df, "No results: config has no models or dataset is empty.", None

        csv_path = "results.csv"
        df.to_csv(csv_path, index=False)

        # Per-model summary in deterministic, first-seen order
        # (set() gave an arbitrary order).
        summary = []
        for model_name in dict.fromkeys(df["model"]):
            sub = df[df["model"] == model_name]
            summary.append(f"## {model_name}")
            summary.append(f"Samples: {len(sub)}")
            for metric in ["exact_match", "f1", "rougeL_f", "bleu", "judge_overall"]:
                if metric in sub.columns:
                    vals = [v for v in sub[metric] if isinstance(v, (int, float))]
                    if vals:
                        summary.append(f"{metric}: mean={sum(vals)/len(vals):.4f}")
            summary.append(f"Latency mean: {sum(sub['latency_seconds'])/len(sub):.3f}s\n")

        return df, "\n".join(summary), csv_path

    run_btn.click(run_benchmark, inputs=[config_file, dataset_file, task, use_judge], outputs=[results_table, summary_box, download_csv])

demo.launch()