Spaces:

MINC01
/

LLMs_Benchmarker

Sleeping

App Files Files Community

Mattimax committed on Feb 3

Commit

8398ebb

verified ·

1 Parent(s): 19dd1f8

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -23

app.py CHANGED Viewed

@@ -33,31 +33,42 @@ def load_boolq_dataset(num_samples=DEFAULT_NUM_SAMPLES):
 def build_boolq_prompt(passage, question):
     """
-    Costruisce un prompt generico per LLM per BoolQ.
-    Il modello deve rispondere solo 'yes' o 'no'.
     """
     prompt = (
-        "You are a question answering system. "
-        "Answer strictly with 'yes' or 'no'.\n\n"
-        f"Passage: {passage}\n"
-        f"Question: {question}\n"
-        "Answer:"
     )
     return prompt
 def parse_yes_no(output_text):
     """
-    Estrae 'yes' o 'no' dall'output del modello.
-    Se non è chiaro, restituisce None.
     """
     text = output_text.strip().lower()
-    # prendi solo la prima parola
-    first = text.split()[0] if text else ""
     if first.startswith("yes"):
         return True
     if first.startswith("no"):
         return False
     return None
@@ -105,14 +116,19 @@ def evaluate_model_on_boolq(model_name, num_samples=DEFAULT_NUM_SAMPLES, max_new
                 temperature=0.0,
             )
         t1 = time.time()
-        gen_text = tokenizer.decode(output_ids[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
         pred = parse_yes_no(gen_text)
-        if pred is not None:
-            if pred == label:
-                correct += 1
-            total += 1
-            times.append(t1 - t0)
     if total == 0:
         accuracy = 0.0
@@ -197,11 +213,18 @@ def run_benchmark_ui(
         try:
             res = evaluate_model_on_boolq(name, num_samples=num_samples)
             results.append(res)
             logs.append(
                 f"  - Esempi valutati: {res['num_samples']}\n"
                 f"  - Accuracy: {res['accuracy']:.3f}\n"
-                f"  - Tempo medio per esempio (s): "
-                f"{res['avg_time_per_sample_sec']:.3f}" if res['avg_time_per_sample_sec'] is not None else "N/A"
             )
         except Exception as e:
             logs.append(f"  ERRORE: {e}")
@@ -221,10 +244,10 @@ def run_benchmark_ui(
 # Costruzione interfaccia Gradio
 # =========================
-with gr.Blocks(title="LLM Benchmark Space - BoolQ") as demo:
     gr.Markdown(
         """
-        # 🔍 LLM Benchmark Space (BoolQ)
         Inserisci i nomi dei modelli Hugging Face (es. `meta-llama/Meta-Llama-3-8B-Instruct`)
         e confrontali su un subset del dataset **BoolQ** (domande sì/no).
@@ -232,6 +255,8 @@ with gr.Blocks(title="LLM Benchmark Space - BoolQ") as demo:
         - Minimo **2 modelli**
         - Puoi aggiungere fino a **5 modelli** con il pulsante **"+ Aggiungi modello"**
         - Output: tabella con **accuracy**, numero di esempi e tempi
         """
     )
@@ -241,13 +266,13 @@ with gr.Blocks(title="LLM Benchmark Space - BoolQ") as demo:
             model_1 = gr.Textbox(
                 label="Modello 1",
-                placeholder="es. meta-llama/Meta-Llama-3-8B-Instruct",
                 value="",
                 visible=True,
             )
             model_2 = gr.Textbox(
                 label="Modello 2",
-                placeholder="es. mistralai/Mistral-7B-Instruct-v0.3",
                 value="",
                 visible=True,
             )

 def build_boolq_prompt(passage, question):
     """
     Build the Italian BoolQ prompt for one passage/question pair.

     The instruction text asks the model to answer with only 'sì' or 'no';
     the passage and the question are interpolated below it and the prompt
     ends with the "Risposta:" cue so the model's continuation is the answer.
     """
     # Fixed instruction header, followed by a blank line.
     instructions = (
         "Sei un sistema di question answering. "
         "Rispondi strettamente solo con 'sì' o 'no'.\n\n"
     )
     # Per-example section: passage, question, then the answer cue.
     example_lines = [
         f"Testo: {passage}\n",
         f"Domanda: {question}\n",
         "Risposta:",
     ]
     return instructions + "".join(example_lines)
 def parse_yes_no(output_text):
     """
     Extract a yes/no verdict from the model output.

     Only the first word of the output is considered. Italian answers
     ('sì', also written 'si' or 'sí') and the English 'yes' map to True;
     'no' (same spelling in both languages) maps to False.

     Args:
         output_text: Raw text generated by the model.

     Returns:
         True for an affirmative answer, False for a negative one, and
         None when the output is empty or its first word is not a
         recognised answer.
     """
     # Function-scope imports keep this block self-contained.
     import re
     import unicodedata

     if not output_text:
         return None

     # NFC turns a decomposed "i" + combining grave accent into "ì", so both
     # encodings of "sì" compare equal to the literals below.
     text = unicodedata.normalize("NFC", output_text).strip().lower()

     # Take the first run of letters. This skips leading quotes, markdown
     # markers or digits and drops trailing punctuation ("sì,", "no.", "'sì'").
     match = re.search(r"[^\W\d_]+", text)
     if match is None:
         return None
     first = match.group(0)

     # Whole-word comparison: a prefix test would also accept unrelated words
     # such as "since"/"simile" (yes) or "non"/"not"/"nothing" (no).
     # NOTE: a bare "si" is still accepted as an unaccented "sì".
     if first in ("sì", "si", "sí", "yes"):
         return True
     if first == "no":
         return False
     return None
                 temperature=0.0,
             )
         t1 = time.time()
+        gen_text = tokenizer.decode(
+            output_ids[0][inputs["input_ids"].shape[-1]:],
+            skip_special_tokens=True,
+        )
         pred = parse_yes_no(gen_text)
+        # Contiamo sempre l'esempio, anche se il modello non risponde in modo valido
+        total += 1
+        times.append(t1 - t0)
+        if pred is not None and pred == label:
+            correct += 1
     if total == 0:
         accuracy = 0.0
         try:
             res = evaluate_model_on_boolq(name, num_samples=num_samples)
             results.append(res)
+            avg_time_str = (
+                f"{res['avg_time_per_sample_sec']:.3f}"
+                if res['avg_time_per_sample_sec'] is not None
+                else "N/A"
+            )
             logs.append(
                 f"  - Esempi valutati: {res['num_samples']}\n"
                 f"  - Accuracy: {res['accuracy']:.3f}\n"
+                f"  - Tempo medio per esempio (s): {avg_time_str}\n"
+                f"  - Tempo totale (s): {res['total_time_sec']:.3f}"
             )
         except Exception as e:
             logs.append(f"  ERRORE: {e}")
 # Costruzione interfaccia Gradio
 # =========================
+with gr.Blocks(title="LLM Benchmark Space - BoolQ (IT)") as demo:
     gr.Markdown(
         """
+        # 🔍 LLM Benchmark Space (BoolQ, IT)
         Inserisci i nomi dei modelli Hugging Face (es. `meta-llama/Meta-Llama-3-8B-Instruct`)
         e confrontali su un subset del dataset **BoolQ** (domande sì/no).
         - Minimo **2 modelli**
         - Puoi aggiungere fino a **5 modelli** con il pulsante **"+ Aggiungi modello"**
         - Output: tabella con **accuracy**, numero di esempi e tempi
+        I prompt sono in **italiano** e il modello deve rispondere solo con **"sì"** o **"no"**.
         """
     )
             model_1 = gr.Textbox(
                 label="Modello 1",
+                placeholder="es. Mattimax/DACMini-IT",
                 value="",
                 visible=True,
             )
             model_2 = gr.Textbox(
                 label="Modello 2",
+                placeholder="es. Mattimax/DAC60M",
                 value="",
                 visible=True,
             )