Spaces:

MINC01
/

LLMs_Benchmarker

Sleeping

App Files Files Community

Mattimax committed 11 days ago

Commit

19dd1f8

verified ·

1 Parent(s): 8d47b06

Create app.py

Browse files

Files changed (1) hide show

app.py +332 -0

app.py ADDED Viewed

	@@ -0,0 +1,332 @@

+import time
+import torch
+import gradio as gr
+from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import pandas as pd
# =========================
# Benchmark configuration
# =========================
# Upper bound on the number of model-name fields the UI can reveal.
MAX_MODELS = 5
DEFAULT_NUM_SAMPLES = 50  # number of examples to use for the benchmark
def get_device():
    """Return "cuda" when torch reports a usable CUDA device, otherwise "cpu"."""
    return "cuda" if torch.cuda.is_available() else "cpu"
def load_boolq_dataset(num_samples=DEFAULT_NUM_SAMPLES):
    """
    Load the validation split of BoolQ, optionally cut down to a prefix.

    BoolQ pairs a short passage with a yes/no question.
    When num_samples is None, or is not smaller than the split, the whole
    split is returned; otherwise only the first num_samples rows are kept.
    """
    dataset = load_dataset("boolq", split="validation")
    wants_subset = num_samples is not None and num_samples < len(dataset)
    return dataset.select(range(num_samples)) if wants_subset else dataset
def build_boolq_prompt(passage, question):
    """
    Build a generic LLM prompt for one BoolQ example.

    The instruction asks the model to reply with only 'yes' or 'no'; the
    prompt ends with "Answer:" so the completion starts at the answer itself.
    """
    instruction = (
        "You are a question answering system. "
        "Answer strictly with 'yes' or 'no'."
    )
    sections = [
        instruction,
        "",  # blank line between the instruction and the example
        f"Passage: {passage}",
        f"Question: {question}",
        "Answer:",
    ]
    return "\n".join(sections)
def parse_yes_no(output_text):
    """
    Map a model completion to a boolean BoolQ answer.

    Only the first whitespace-separated token of the completion is inspected.
    Leading punctuation or quotes are skipped, and the answer must be the
    whole word "yes" or "no" (case-insensitive): words that merely start
    with those letters, such as "not" or "yesterday", are not answers.

    Returns:
        True for "yes", False for "no", and None when the completion is
        empty, None, or does not start with a clear yes/no answer.
    """
    import re  # function-scope import keeps this block self-contained

    if not output_text:
        return None
    tokens = output_text.strip().lower().split()
    if not tokens:
        return None
    # \W* skips opening quotes/markup; (?!\w) rejects "not", "yesterday", ...
    match = re.match(r"\W*(yes|no)(?!\w)", tokens[0])
    if match is None:
        return None
    return match.group(1) == "yes"
def evaluate_model_on_boolq(model_name, num_samples=DEFAULT_NUM_SAMPLES, max_new_tokens=5):
    """
    Run the BoolQ benchmark for a single causal language model.

    Args:
        model_name: model identifier handed to the Hugging Face
            ``from_pretrained`` loaders.
        num_samples: how many BoolQ validation examples to request.
        max_new_tokens: generation budget per example.

    Returns:
        A dict with the keys:
          - "model_name": the name that was passed in
          - "num_samples": number of examples whose answer could be parsed
          - "accuracy": correct / parsed (0.0 when nothing could be parsed)
          - "avg_time_per_sample_sec": mean generation time over the parsed
            examples, or None when nothing could be parsed
          - "total_time_sec": elapsed time including model loading

        Completions that cannot be read as yes/no are skipped entirely: they
        count neither towards "num_samples" nor towards the accuracy.

    Raises:
        RuntimeError: if the tokenizer or the model cannot be loaded; the
            original exception is attached as the cause.
    """
    device = get_device()
    # perf_counter is monotonic, so durations cannot be skewed by clock changes.
    start_total = time.perf_counter()

    # Load model and tokenizer
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)
    except Exception as e:
        raise RuntimeError(
            f"Errore nel caricamento del modello '{model_name}': {e}"
        ) from e

    model.to(device)
    model.eval()
    ds = load_boolq_dataset(num_samples=num_samples)

    correct = 0
    times = []  # one entry per example with a parseable answer
    for example in ds:
        label = example["answer"]  # True/False
        prompt = build_boolq_prompt(example["passage"], example["question"])
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_length = inputs["input_ids"].shape[-1]

        t0 = time.perf_counter()
        with torch.no_grad():
            # Greedy decoding; no temperature is passed because it has no
            # effect when do_sample=False and only triggers a warning.
            output_ids = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
            )
        t1 = time.perf_counter()

        # Decode only the newly generated tokens, not the echoed prompt.
        gen_text = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)
        pred = parse_yes_no(gen_text)
        if pred is None:
            continue
        if pred == label:
            correct += 1
        times.append(t1 - t0)

    total = len(times)
    if total == 0:
        accuracy = 0.0
        avg_time = None
    else:
        accuracy = correct / total
        avg_time = sum(times) / total
    total_time = time.perf_counter() - start_total
    return {
        "model_name": model_name,
        "num_samples": total,
        "accuracy": accuracy,
        "avg_time_per_sample_sec": avg_time,
        "total_time_sec": total_time,
    }
+# =========================
+# Funzioni per la UI
+# =========================
def add_model_field(current_count):
    """
    Return the number of visible model fields after one more is requested.

    The count grows by one per call and is left untouched once it is no
    longer below MAX_MODELS.
    """
    if current_count >= MAX_MODELS:
        return current_count
    return current_count + 1
def get_visible_textboxes(model_count):
    """
    Build one Gradio visibility update per model field.

    Fields are numbered from 1 to MAX_MODELS; a field is visible when its
    number does not exceed model_count.
    """
    return [
        gr.update(visible=(slot <= model_count))
        for slot in range(1, MAX_MODELS + 1)
    ]
def run_benchmark_ui(
    model_1,
    model_2,
    model_3,
    model_4,
    model_5,
    model_count,
    num_samples,
):
    """
    Handler for the 'Esegui benchmark' button.

    Collects the non-empty model names among the first model_count fields,
    benchmarks each of them on BoolQ and reports the outcome.

    Args:
        model_1 .. model_5: contents of the five model-name fields
            (None or blank entries are ignored).
        model_count: how many of those fields are currently active.
        num_samples: number of BoolQ examples to request per model.

    Returns:
        A (DataFrame, str) pair: the results table sorted by accuracy in
        descending order (empty when no model could be evaluated or fewer
        than two names were given), and the textual execution log.
    """
    # Collect the active models; slicing tolerates a model_count larger
    # than the number of fields.
    all_models = [model_1, model_2, model_3, model_4, model_5]
    stripped_names = [(raw or "").strip() for raw in all_models[:model_count]]
    model_names = [name for name in stripped_names if name]
    if len(model_names) < 2:
        return (
            pd.DataFrame(),
            "Devi specificare almeno due modelli validi."
        )

    results = []
    logs = [
        f"Avvio benchmark su BoolQ con {num_samples} esempi...",
        f"Modelli: {', '.join(model_names)}",
        "Device: " + get_device(),
        "====================================",
    ]
    for name in model_names:
        logs.append(f"\n[MODELLO] {name}")
        # A failing model must not abort the whole run: log it and move on.
        try:
            res = evaluate_model_on_boolq(name, num_samples=num_samples)
        except Exception as e:
            logs.append(f"  ERRORE: {e}")
            continue
        results.append(res)
        # Format the optional average on its own so that a missing value only
        # replaces that one field instead of the whole log entry.
        avg_time = res["avg_time_per_sample_sec"]
        avg_time_text = f"{avg_time:.3f}" if avg_time is not None else "N/A"
        logs.append(
            f"  - Esempi valutati: {res['num_samples']}\n"
            f"  - Accuracy: {res['accuracy']:.3f}\n"
            f"  - Tempo medio per esempio (s): {avg_time_text}"
        )

    if results:
        # Sort by decreasing accuracy
        df = pd.DataFrame(results).sort_values(by="accuracy", ascending=False)
    else:
        df = pd.DataFrame()
    log_text = "\n".join(logs)
    return df, log_text
+# =========================
+# Costruzione interfaccia Gradio
+# =========================
# Build the Gradio interface: inputs in the left column, results in the right.
with gr.Blocks(title="LLM Benchmark Space - BoolQ") as demo:
    gr.Markdown(
        """
        # 🔍 LLM Benchmark Space (BoolQ)
        Inserisci i nomi dei modelli Hugging Face (es. `meta-llama/Meta-Llama-3-8B-Instruct`)
        e confrontali su un subset del dataset **BoolQ** (domande sì/no).
        - Minimo **2 modelli**
        - Puoi aggiungere fino a **5 modelli** con il pulsante **"+ Aggiungi modello"**
        - Output: tabella con **accuracy**, numero di esempi e tempi
        """
    )
    with gr.Row():
        with gr.Column():
            # Number of model fields currently shown; starts with two.
            model_count_state = gr.State(value=2)
            model_1 = gr.Textbox(
                label="Modello 1",
                placeholder="es. meta-llama/Meta-Llama-3-8B-Instruct",
                value="",
                visible=True,
            )
            model_2 = gr.Textbox(
                label="Modello 2",
                placeholder="es. mistralai/Mistral-7B-Instruct-v0.3",
                value="",
                visible=True,
            )
            # Fields 3-5 start hidden and are revealed by the add button.
            model_3 = gr.Textbox(
                label="Modello 3",
                placeholder="Modello opzionale",
                value="",
                visible=False,
            )
            model_4 = gr.Textbox(
                label="Modello 4",
                placeholder="Modello opzionale",
                value="",
                visible=False,
            )
            model_5 = gr.Textbox(
                label="Modello 5",
                placeholder="Modello opzionale",
                value="",
                visible=False,
            )
            add_button = gr.Button("+ Aggiungi modello")
            num_samples = gr.Slider(
                minimum=10,
                maximum=200,
                step=10,
                value=DEFAULT_NUM_SAMPLES,
                label="Numero di esempi BoolQ da usare",
            )
            run_button = gr.Button("🚀 Esegui benchmark", variant="primary")
        with gr.Column():
            # Column headers mirror the keys of the per-model result dicts.
            results_df = gr.Dataframe(
                headers=[
                    "model_name",
                    "num_samples",
                    "accuracy",
                    "avg_time_per_sample_sec",
                    "total_time_sec",
                ],
                label="Risultati benchmark",
                interactive=False,
            )
            logs_box = gr.Textbox(
                label="Log esecuzione",
                lines=20,
                interactive=False,
            )
    # "+ Aggiungi modello" button logic
    def on_add_model(model_count):
        """Bump the field count and return it followed by one visibility update per textbox."""
        new_count = add_model_field(model_count)
        visibility_updates = get_visible_textboxes(new_count)
        # Order must match the `outputs` list of add_button.click below.
        return [new_count] + visibility_updates
    add_button.click(
        fn=on_add_model,
        inputs=[model_count_state],
        outputs=[model_count_state, model_1, model_2, model_3, model_4, model_5],
    )
    # "Esegui benchmark" button logic
    run_button.click(
        fn=run_benchmark_ui,
        inputs=[
            model_1,
            model_2,
            model_3,
            model_4,
            model_5,
            model_count_state,
            num_samples,
        ],
        outputs=[results_df, logs_box],
    )
# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()