Final_Assignment_Template

Sleeping

App Files Files Community

emanuelediluzio committed on Apr 7

Commit

fb5be90

verified ·

1 Parent(s): 67bb288

Update app.py

Browse files

Files changed (1) hide show

app.py +234 -318

app.py CHANGED Viewed

@@ -2,24 +2,26 @@ import os
 import re
 import io
 import json
 import traceback
 import gradio as gr
 import requests
 import pandas as pd
 from bs4 import BeautifulSoup
-from smolagents import CodeAgent, DuckDuckGoSearchTool, InferenceClientModel, tool
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
-# Modelli in ordine di preferenza (tutti gratuiti su HF Inference API)
-MODEL_CANDIDATES = [
-    "Qwen/Qwen2.5-Coder-32B-Instruct",
-    "Qwen/Qwen2.5-72B-Instruct",
-    "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "mistralai/Mixtral-8x7B-Instruct-v0.1",
-    "HuggingFaceH4/zephyr-7b-beta",
-]
 # ==========================================
@@ -41,9 +43,7 @@ def visit_webpage(url: str) -> str:
         soup = BeautifulSoup(response.text, "html.parser")
         for el in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
             el.extract()
-        text = soup.get_text(separator="\n", strip=True)
-        # Pulizia extra
-        lines = [l.strip() for l in text.splitlines() if l.strip()]
         return "\n".join(lines)[:15000]
     except Exception as e:
         return f"Error fetching {url}: {str(e)}"
@@ -87,8 +87,8 @@ def get_youtube_transcript(video_url: str) -> str:
 @tool
 def download_task_file(task_id: str) -> str:
     """Downloads and reads the file attached to a GAIA task.
-    Handles text, CSV, JSON, PDF, Excel (.xlsx/.xls), Python, and audio files.
-    Always try this if the question might reference an attached document, table, or file.
     Args:
         task_id: The task_id string from the GAIA question.
     """
@@ -101,7 +101,6 @@ def download_task_file(task_id: str) -> str:
         ct = response.headers.get("Content-Type", "")
         cd = response.headers.get("Content-Disposition", "")
-        # Detect filename from Content-Disposition
         filename = ""
         if "filename=" in cd:
             filename = cd.split("filename=")[-1].strip('" ')
@@ -109,13 +108,13 @@ def download_task_file(task_id: str) -> str:
         print(f"  [FILE] type={ct}, name={filename}, ext={ext}, size={len(response.content)}")
-        # --- TEXT / CSV / JSON ---
         if any(t in ct for t in ["text", "json", "csv"]) or ext in ["txt", "csv", "json", "py", "md"]:
             text = response.text
             if ext == "csv" or "csv" in ct:
                 try:
                     df = pd.read_csv(io.StringIO(text))
-                    return f"CSV file with {len(df)} rows, columns: {list(df.columns)}\n\n{df.to_string()}"[:12000]
                 except Exception:
                     pass
             return text[:12000]
@@ -124,10 +123,7 @@ def download_task_file(task_id: str) -> str:
         if "spreadsheet" in ct or "excel" in ct or ext in ["xlsx", "xls"]:
             try:
                 df = pd.read_excel(io.BytesIO(response.content), engine="openpyxl")
-                summary = f"Excel file with {len(df)} rows, columns: {list(df.columns)}\n"
-                summary += f"Data types: {dict(df.dtypes)}\n\n"
-                summary += df.to_string()
-                return summary[:12000]
             except Exception as e:
                 return f"Excel file but read error: {e}"
@@ -136,60 +132,32 @@ def download_task_file(task_id: str) -> str:
             try:
                 import PyPDF2
                 reader = PyPDF2.PdfReader(io.BytesIO(response.content))
-                pages_text = []
                 for i, page in enumerate(reader.pages):
                     t = page.extract_text() or ""
-                    pages_text.append(f"[Page {i+1}]\n{t}")
-                return "\n".join(pages_text)[:12000]
             except Exception as e:
                 return f"PDF attached but read error: {e}"
-        # --- AUDIO (mp3, wav) ---
         if "audio" in ct or ext in ["mp3", "wav", "m4a", "ogg"]:
-            return f"Audio file attached ({ct}, {len(response.content)} bytes). Cannot transcribe directly."
         # --- IMAGE ---
         if "image" in ct or ext in ["png", "jpg", "jpeg", "gif", "webp"]:
             return f"Image file attached ({ct}, {len(response.content)} bytes)."
-        # --- Fallback: try decode as text ---
         try:
-            decoded = response.content.decode("utf-8")
-            return decoded[:12000]
         except Exception:
-            return f"Binary file ({ct}, {len(response.content)} bytes). Cannot parse."
     except Exception as e:
         return f"File download error: {str(e)}"
-# ==========================================
-# 🧮 TOOL 4: PYTHON EVAL SICURO
-# ==========================================
-@tool
-def python_compute(code: str) -> str:
-    """Executes a Python expression or short script and returns the result.
-    Use for math calculations, string manipulation, date computations, etc.
-    Args:
-        code: A Python expression or short script. Use print() for output.
-    """
-    try:
-        # Prova prima come espressione
-        result = eval(code)
-        return str(result)
-    except SyntaxError:
-        # Se è uno statement, eseguilo e cattura stdout
-        import contextlib
-        import sys
-        f = io.StringIO()
-        with contextlib.redirect_stdout(f):
-            exec(code)
-        output = f.getvalue().strip()
-        return output if output else "Code executed (no output)"
-    except Exception as e:
-        return f"Error: {str(e)}"
 # ==========================================
 # 🔍 PRE-PROCESSING
 # ==========================================
@@ -197,79 +165,15 @@ def preprocess_question(question: str) -> str:
     """Detect reversed text and fix it."""
     stripped = question.strip()
     reversed_q = stripped[::-1]
-    keywords_en = ["answer", "what", "who", "how", "find", "list", "which", "where", "when", "the"]
-    keywords_present_original = sum(1 for w in keywords_en if w in stripped.lower())
-    keywords_present_reversed = sum(1 for w in keywords_en if w in reversed_q.lower())
-    if keywords_present_reversed > keywords_present_original and len(stripped) > 20:
-        print(f"  [PRE-PROCESS] Reversed text detected! Using reversed version.")
         return reversed_q
     return question
-# ==========================================
-# 🔄 CHIAMATA DIRETTA HF INFERENCE API
-# ==========================================
-def call_hf_direct(question: str, task_context: str = "") -> str:
-    """Fallback: chiama HF Inference API direttamente senza smolagents."""
-    prompt = f"""You are answering a question from the GAIA benchmark.
-Give ONLY the final answer — no explanation, no preamble, no "The answer is".
-Rules:
-- For numbers: just digits (e.g., 42)
-- For names: just the name (e.g., Einstein)
-- For lists: comma-separated (e.g., apple, banana, cherry)
-- No period at the end unless part of the answer
-- If text seems reversed, reverse it first
-{task_context}
-Question: {question}
-Answer:"""
-    hf_token = os.getenv("HF_TOKEN", "")
-    headers = {"Content-Type": "application/json"}
-    if hf_token:
-        headers["Authorization"] = f"Bearer {hf_token}"
-    for model in MODEL_CANDIDATES:
-        try:
-            api_url = f"https://api-inference.huggingface.co/models/{model}"
-            payload = {
-                "inputs": prompt,
-                "parameters": {
-                    "max_new_tokens": 150,
-                    "temperature": 0.1,
-                    "return_full_text": False,
-                },
-            }
-            resp = requests.post(api_url, headers=headers, json=payload, timeout=45)
-            if resp.status_code == 200:
-                data = resp.json()
-                if isinstance(data, list) and len(data) > 0:
-                    raw = data[0].get("generated_text", "").strip()
-                    if raw:
-                        answer = clean_answer(raw)
-                        if answer and answer.lower() not in [
-                            "i don't know", "unknown", "n/a", "none", "error", "",
-                        ]:
-                            print(f"  [FALLBACK OK via {model}]: {answer[:100]}")
-                            return answer
-            else:
-                print(f"  [FALLBACK {model}] HTTP {resp.status_code}")
-        except Exception as e:
-            print(f"  [FALLBACK {model} ERROR]: {e}")
-            continue
-    return "I don't know"
 # ==========================================
 # 🧹 PULIZIA RISPOSTA
 # ==========================================
@@ -277,12 +181,12 @@ def clean_answer(raw: str) -> str:
     """Pulisci la risposta grezza dall'agente."""
     answer = str(raw).strip()
-    # Se multilinea, prendi la prima riga non vuota significativa
     lines = [l.strip() for l in answer.split("\n") if l.strip()]
     if lines:
         answer = lines[0]
-    # Rimuovi prefissi comuni
     prefixes = [
         "the answer is:", "the answer is", "final answer:", "final answer is:",
         "final answer is", "answer:", "answer is:", "answer is",
@@ -298,19 +202,17 @@ def clean_answer(raw: str) -> str:
         if lower.startswith(prefix):
             answer = answer[len(prefix):].strip()
             lower = answer.lower()
-            # Rimuovi anche eventuali virgolette dopo il prefisso
-            if answer.startswith('"') or answer.startswith("'"):
                 answer = answer[1:]
             break
-    # Rimuovi punto finale (ma non se è un decimale tipo "3.14")
     if answer.endswith(".") and not re.search(r"\d\.$", answer):
         answer = answer[:-1].strip()
-    # Rimuovi markdown bold, virgolette
     answer = answer.replace("**", "").strip('"').strip("'").strip("`").strip()
-    # Se la risposta inizia con "is " (residuo), rimuovilo
     if answer.lower().startswith("is "):
         answer = answer[3:].strip()
@@ -318,220 +220,234 @@ def clean_answer(raw: str) -> str:
 # ==========================================
-# 🧠 AGENTE PRINCIPALE
 # ==========================================
-class SuperAgent:
-    def __init__(self):
-        print("=" * 60)
-        print("🚀 Inizializzazione SuperAgent...")
-        print("=" * 60)
-        hf_token = os.getenv("HF_TOKEN", "")
-        print(f"  HF_TOKEN presente: {bool(hf_token)}")
-        # Prova a inizializzare il modello per smolagents
-        self.agent = None
-        for model_id in MODEL_CANDIDATES[:3]:  # Prova i primi 3
-            try:
-                print(f"  Trying model: {model_id}")
-                model = InferenceClientModel(
-                    model_id=model_id,
-                    token=hf_token if hf_token else None,
-                )
-                self.agent = CodeAgent(
-                    tools=[
-                        DuckDuckGoSearchTool(),
-                        visit_webpage,
-                        get_youtube_transcript,
-                        download_task_file,
-                        python_compute,
-                    ],
-                    model=model,
-                    max_steps=6,
-                    additional_authorized_imports=[
-                        "requests", "bs4", "json", "time", "math", "datetime",
-                        "pandas", "numpy", "re", "csv", "urllib", "collections",
-                        "itertools", "string", "unicodedata", "statistics",
-                    ],
-                )
-                print(f"  ✅ Agent inizializzato con {model_id}")
-                break
-            except Exception as e:
-                print(f"  ❌ {model_id} fallito: {e}")
-                continue
-        if self.agent is None:
-            print("  ⚠️ Nessun modello disponibile per l'agente — solo fallback diretto.")
-    def _build_prompt(self, question: str, task_id: str, file_context: str = "") -> str:
-        """Costruisci il prompt per l'agente."""
-        file_hint = ""
-        if task_id:
-            file_hint = f'\nThis question has task_id="{task_id}". Call download_task_file("{task_id}") to check for attached files.'
-        extra_context = ""
-        if file_context:
-            extra_context = f"\n\nFILE CONTENT:\n{file_context}\n"
-        return f"""You are an expert AI assistant solving GAIA benchmark questions.
-Your goal: find the EXACT correct answer.
-STRATEGY (follow in this order):
-1. If the question has a YouTube URL → call get_youtube_transcript(url)
-2. If the question has any URL → call visit_webpage(url)
-3. If there might be an attached file → call download_task_file(task_id)
-4. For factual questions → use DuckDuckGoSearchTool, then visit_webpage to verify
-5. For calculations → use python_compute() or write Python directly
-6. If text looks reversed/scrambled → reverse it with Python: text[::-1]
-ANSWER FORMAT (CRITICAL):
-- Output ONLY the final answer. No explanation. No prefix.
-- Numbers: just digits (e.g., 3)
-- Names: just the name (e.g., Einstein)
-- Lists: comma-separated (e.g., cat, dog, bird)
-- NEVER say "The answer is..." or "FINAL ANSWER:" or any preamble
-{file_hint}{extra_context}
-Question: {question}"""
     def __call__(self, question: str, task_id: str = "") -> str:
         print(f"\n{'─'*60}")
-        print(f"[Q]: {question[:150]}...")
         print(f"[TASK]: {task_id}")
-        # 1. Pre-process (reversed text detection)
         processed = preprocess_question(question)
-        # 2. Se c'è un task_id, prova a scaricare il file subito per avere contesto
-        file_context = ""
-        if task_id:
-            try:
-                fc = download_task_file.__wrapped__(task_id) if hasattr(download_task_file, '__wrapped__') else ""
-                if fc and "No file" not in fc and "error" not in fc.lower():
-                    file_context = fc
-                    print(f"  [FILE PRE-FETCH]: {len(file_context)} chars")
-            except Exception:
-                # Smolagents tool wrapper, proviamo direttamente
-                try:
-                    file_url = f"https://agents-course-unit4-scoring.hf.space/files/{task_id}"
-                    resp = requests.get(file_url, timeout=15)
-                    if resp.status_code == 200:
-                        ct = resp.headers.get("Content-Type", "")
-                        cd = resp.headers.get("Content-Disposition", "")
-                        filename = ""
-                        if "filename=" in cd:
-                            filename = cd.split("filename=")[-1].strip('" ')
-                        ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else ""
-                        if any(t in ct for t in ["text", "json", "csv"]) or ext in ["txt", "csv", "json", "py"]:
-                            file_context = resp.text[:8000]
-                        elif "spreadsheet" in ct or "excel" in ct or ext in ["xlsx", "xls"]:
-                            try:
-                                df = pd.read_excel(io.BytesIO(resp.content), engine="openpyxl")
-                                file_context = f"Excel: {len(df)} rows, cols={list(df.columns)}\n{df.to_string()}"[:8000]
-                            except Exception:
-                                pass
-                        elif "pdf" in ct or ext == "pdf":
-                            try:
-                                import PyPDF2
-                                reader = PyPDF2.PdfReader(io.BytesIO(resp.content))
-                                file_context = "\n".join(
-                                    [p.extract_text() or "" for p in reader.pages]
-                                )[:8000]
-                            except Exception:
-                                pass
-                        print(f"  [FILE PRE-FETCH direct]: {len(file_context)} chars")
-                except Exception as e:
-                    print(f"  [FILE PRE-FETCH failed]: {e}")
-        # 3. Detect special question types and handle directly
-        answer = self._handle_special_cases(processed, task_id, file_context)
-        if answer:
-            print(f"  [SPECIAL CASE]: {answer}")
-            return answer
-        # 4. Tentativo con agente smolagents
-        if self.agent:
             try:
-                prompt = self._build_prompt(processed, task_id, file_context)
                 raw = self.agent.run(prompt)
                 answer = clean_answer(str(raw))
-                if self._is_valid_answer(answer):
-                    print(f"  [✅ AGENT]: {answer}")
                     return answer
-                print(f"  [⚠️ AGENT invalid: '{answer}']")
             except Exception as e:
-                print(f"  [⚠️ AGENT ERROR]: {e}")
-                traceback.print_exc()
-        # 5. Fallback: HF API diretta
-        print("  [→ FALLBACK HF DIRECT]")
-        context_for_fallback = ""
         if file_context:
-            context_for_fallback = f"\nAttached file content:\n{file_context[:3000]}\n"
-        answer = call_hf_direct(processed, context_for_fallback)
-        print(f"  [FINAL]: {answer}")
-        return answer
-    def _is_valid_answer(self, answer: str) -> bool:
-        """Controlla se una risposta è valida (non vuota e non un errore generico)."""
-        if not answer:
             return False
-        invalid = [
             "i don't know", "unknown", "n/a", "none", "error",
             "i cannot", "i can't", "not available", "no answer",
-            "could not", "unable to", "i'm not sure",
-        ]
         return answer.lower().strip() not in invalid
-    def _handle_special_cases(self, question: str, task_id: str, file_context: str) -> str:
-        """Gestisci direttamente casi speciali che non richiedono l'agente."""
-        q_lower = question.lower()
-        # --- EXCEL con domanda su totali/somme ---
-        if file_context and ("total" in q_lower or "sum" in q_lower or "sales" in q_lower):
             try:
-                # Prova a parsare il contesto come DataFrame
-                if file_context.startswith("Excel:") or file_context.startswith("CSV"):
-                    # Ri-scarica il file e calcola
-                    file_url = f"https://agents-course-unit4-scoring.hf.space/files/{task_id}"
-                    resp = requests.get(file_url, timeout=15)
-                    ct = resp.headers.get("Content-Type", "")
-                    cd = resp.headers.get("Content-Disposition", "")
-                    filename = ""
-                    if "filename=" in cd:
-                        filename = cd.split("filename=")[-1].strip('" ')
-                    ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else ""
-                    if "spreadsheet" in ct or "excel" in ct or ext in ["xlsx", "xls"]:
-                        df = pd.read_excel(io.BytesIO(resp.content), engine="openpyxl")
-                    elif ext == "csv" or "csv" in ct:
-                        df = pd.read_csv(io.BytesIO(resp.content))
-                    else:
-                        return ""
-                    # Trova colonne numeriche e calcola totali
-                    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
-                    if numeric_cols:
-                        totals = {col: df[col].sum() for col in numeric_cols}
-                        # Se chiede "total sales", cerca colonna "sales"
-                        for col in numeric_cols:
-                            if "sale" in col.lower() or "total" in col.lower() or "amount" in col.lower():
-                                val = df[col].sum()
-                                # Formatta come numero intero se è un intero
-                                if val == int(val):
-                                    return str(int(val))
-                                return f"${val:,.2f}" if val > 100 else str(val)
-                        # Altrimenti somma la prima colonna numerica
-                        val = list(totals.values())[0]
-                        if val == int(val):
-                            return str(int(val))
-                        return str(val)
             except Exception as e:
-                print(f"  [SPECIAL CASE Excel error]: {e}")
-        return ""
 # ==========================================
@@ -549,7 +465,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     print(f"{'='*60}")
     try:
-        agent = SuperAgent()
     except Exception as e:
         traceback.print_exc()
         return f"Errore inizializzazione agente: {e}", None
@@ -575,7 +491,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         if not task_id or question_text is None:
             continue
-        print(f"\n[{i+1}/{len(questions_data)}] ────────────────────────")
         try:
             answer = agent(question_text, task_id=task_id)
         except Exception as e:
@@ -622,13 +538,13 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
 # ==========================================
-# 🖥️ INTERFACCIA GRADIO
 # ==========================================
 with gr.Blocks() as demo:
-    gr.Markdown("# 🚀 Super Agente - Final Assignment Runner")
     gr.Markdown(
-        "Login con HF, poi clicca il bottone. "
-        "L'agente proverà più modelli e strategie per rispondere al GAIA benchmark."
     )
     gr.LoginButton()
     run_button = gr.Button("🔥 Avvia Valutazione & Invia Risposte", variant="primary")

 import re
 import io
 import json
+import time
 import traceback
 import gradio as gr
 import requests
 import pandas as pd
 from bs4 import BeautifulSoup
+from smolagents import (
+    CodeAgent,
+    DuckDuckGoSearchTool,
+    LiteLLMModel,
+    tool,
+)
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
+# Groq is FREE, very fast, and offers 70B models
+# Sign up at console.groq.com and put GROQ_API_KEY in the secrets
+GROQ_MODEL = "groq/llama-3.3-70b-versatile"
+# NOTE(review): GROQ_SMALL is not referenced in the visible part of this file — confirm it is used.
+GROQ_SMALL = "groq/llama-3.1-8b-instant"
 # ==========================================
         soup = BeautifulSoup(response.text, "html.parser")
         for el in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
             el.extract()
+        lines = [l.strip() for l in soup.get_text(separator="\n", strip=True).splitlines() if l.strip()]
         return "\n".join(lines)[:15000]
     except Exception as e:
         return f"Error fetching {url}: {str(e)}"
 @tool
 def download_task_file(task_id: str) -> str:
     """Downloads and reads the file attached to a GAIA task.
+    Handles text, CSV, JSON, PDF, Excel (.xlsx/.xls), and Python files.
+    Always try this tool first if the question might reference an attached file.
     Args:
         task_id: The task_id string from the GAIA question.
     """
         ct = response.headers.get("Content-Type", "")
         cd = response.headers.get("Content-Disposition", "")
         filename = ""
         if "filename=" in cd:
             filename = cd.split("filename=")[-1].strip('" ')
         print(f"  [FILE] type={ct}, name={filename}, ext={ext}, size={len(response.content)}")
+        # --- TEXT / CSV / JSON / PY ---
         if any(t in ct for t in ["text", "json", "csv"]) or ext in ["txt", "csv", "json", "py", "md"]:
             text = response.text
             if ext == "csv" or "csv" in ct:
                 try:
                     df = pd.read_csv(io.StringIO(text))
+                    return f"CSV with {len(df)} rows, columns: {list(df.columns)}\n\n{df.to_string()}"[:12000]
                 except Exception:
                     pass
             return text[:12000]
         if "spreadsheet" in ct or "excel" in ct or ext in ["xlsx", "xls"]:
             try:
                 df = pd.read_excel(io.BytesIO(response.content), engine="openpyxl")
+                return f"Excel with {len(df)} rows, columns: {list(df.columns)}\n\n{df.to_string()}"[:12000]
             except Exception as e:
                 return f"Excel file but read error: {e}"
             try:
                 import PyPDF2
                 reader = PyPDF2.PdfReader(io.BytesIO(response.content))
+                pages = []
                 for i, page in enumerate(reader.pages):
                     t = page.extract_text() or ""
+                    pages.append(f"[Page {i+1}] {t}")
+                return "\n".join(pages)[:12000]
             except Exception as e:
                 return f"PDF attached but read error: {e}"
+        # --- AUDIO ---
         if "audio" in ct or ext in ["mp3", "wav", "m4a", "ogg"]:
+            return f"Audio file attached ({ct}, {len(response.content)} bytes). Cannot transcribe in this environment."
         # --- IMAGE ---
         if "image" in ct or ext in ["png", "jpg", "jpeg", "gif", "webp"]:
             return f"Image file attached ({ct}, {len(response.content)} bytes)."
+        # --- Fallback ---
         try:
+            return response.content.decode("utf-8")[:12000]
         except Exception:
+            return f"Binary file ({ct}, {len(response.content)} bytes)."
     except Exception as e:
         return f"File download error: {str(e)}"
 # ==========================================
 # 🔍 PRE-PROCESSING
 # ==========================================
     """Detect reversed text and fix it."""
     stripped = question.strip()
     reversed_q = stripped[::-1]
+    keywords = ["answer", "what", "who", "how", "find", "list", "which", "where", "when", "the"]
+    score_orig = sum(1 for w in keywords if w in stripped.lower())
+    score_rev = sum(1 for w in keywords if w in reversed_q.lower())
+    if score_rev > score_orig and len(stripped) > 20:
+        print(f"  [PRE-PROCESS] Reversed text detected!")
         return reversed_q
     return question
 # ==========================================
 # 🧹 PULIZIA RISPOSTA
 # ==========================================
     """Pulisci la risposta grezza dall'agente."""
     answer = str(raw).strip()
+    # Prima riga non vuota
     lines = [l.strip() for l in answer.split("\n") if l.strip()]
     if lines:
         answer = lines[0]
+    # Rimuovi prefissi
     prefixes = [
         "the answer is:", "the answer is", "final answer:", "final answer is:",
         "final answer is", "answer:", "answer is:", "answer is",
         if lower.startswith(prefix):
             answer = answer[len(prefix):].strip()
             lower = answer.lower()
+            if answer and answer[0] in '"\'':
                 answer = answer[1:]
             break
+    # Rimuovi punto finale (non se decimale)
     if answer.endswith(".") and not re.search(r"\d\.$", answer):
         answer = answer[:-1].strip()
+    # Pulizia markdown
     answer = answer.replace("**", "").strip('"').strip("'").strip("`").strip()
     if answer.lower().startswith("is "):
         answer = answer[3:].strip()
 # ==========================================
+# 📁 PRE-FETCH FILE
 # ==========================================
+def prefetch_file(task_id: str) -> str:
+    """Download the file attached to a task and return its text for use as context.
+
+    Fetches ``/files/{task_id}`` from the scoring host and converts the
+    response to text based on its Content-Type header and on the file
+    extension taken from the Content-Disposition header. Every returned
+    string is capped at 8000 characters.
+
+    Args:
+        task_id: The task identifier; a falsy value short-circuits to "".
+
+    Returns:
+        The extracted text, or "" when there is no task_id, the response
+        status is not 200, the payload is none of the recognised kinds
+        (text/CSV/JSON, Excel, PDF), an Excel/PDF payload cannot be parsed,
+        or any exception is raised along the way.
+    """
+    if not task_id:
+        return ""
+    try:
+        # NOTE(review): same host as DEFAULT_API_URL, but hard-coded here.
+        file_url = f"https://agents-course-unit4-scoring.hf.space/files/{task_id}"
+        resp = requests.get(file_url, timeout=15)
+        if resp.status_code != 200:
+            return ""
+        ct = resp.headers.get("Content-Type", "")
+        cd = resp.headers.get("Content-Disposition", "")
+        # Take whatever follows the last "filename=" and drop quotes/spaces around it.
+        filename = ""
+        if "filename=" in cd:
+            filename = cd.split("filename=")[-1].strip('" ')
+        ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else ""
+        if any(t in ct for t in ["text", "json", "csv"]) or ext in ["txt", "csv", "json", "py"]:
+            if ext == "csv" or "csv" in ct:
+                # Prefer a parsed table; if pandas cannot parse it, fall
+                # through to returning the raw text below.
+                try:
+                    df = pd.read_csv(io.StringIO(resp.text))
+                    return f"CSV with {len(df)} rows, columns: {list(df.columns)}\n{df.to_string()}"[:8000]
+                except Exception:
+                    pass
+            return resp.text[:8000]
+        if "spreadsheet" in ct or "excel" in ct or ext in ["xlsx", "xls"]:
+            # NOTE(review): the engine is pinned to openpyxl even for .xls —
+            # confirm legacy workbooks actually load; a failure here is silent.
+            try:
+                df = pd.read_excel(io.BytesIO(resp.content), engine="openpyxl")
+                return f"Excel with {len(df)} rows, columns: {list(df.columns)}\n{df.to_string()}"[:8000]
+            except Exception:
+                pass
+        if "pdf" in ct or ext == "pdf":
+            # PyPDF2 is imported inside the try, so a missing package is
+            # swallowed like any other read error.
+            try:
+                import PyPDF2
+                reader = PyPDF2.PdfReader(io.BytesIO(resp.content))
+                return "\n".join([p.extract_text() or "" for p in reader.pages])[:8000]
+            except Exception:
+                pass
+        return ""
+    except Exception:
+        # Best-effort: any failure (request, decoding, ...) yields no context.
+        return ""
+# ==========================================
+# 🤖 AGENTE PRINCIPALE
+# ==========================================
+class GaiaAgent:
+    def __init__(self):
+        """Build the Groq-backed model and the CodeAgent that uses it.
+
+        Reads GROQ_API_KEY from the environment, then creates a LiteLLMModel
+        for GROQ_MODEL and a CodeAgent equipped with DuckDuckGoSearchTool,
+        visit_webpage, get_youtube_transcript and download_task_file.
+
+        Raises:
+            ValueError: If GROQ_API_KEY is unset or empty.
+        """
+        print("=" * 60)
+        print("🚀 Inizializzazione GaiaAgent con Groq...")
+        print("=" * 60)
+        groq_key = os.getenv("GROQ_API_KEY", "")
+        # Fail fast with setup instructions; the message is user-facing text.
+        if not groq_key:
+            raise ValueError(
+                "❌ GROQ_API_KEY non trovata nei secrets!\n"
+                "1. Vai su console.groq.com\n"
+                "2. Crea account gratis (no carta)\n"
+                "3. Genera API key\n"
+                "4. Mettila in Settings → Secrets del tuo HF Space"
+            )
+        print(f"  GROQ_API_KEY presente: ✅")
+        print(f"  Modello: {GROQ_MODEL}")
+        self.model = LiteLLMModel(
+            model_id=GROQ_MODEL,
+            api_key=groq_key,
+            temperature=0.1,
+            max_tokens=1024,
+        )
+        self.agent = CodeAgent(
+            tools=[
+                DuckDuckGoSearchTool(),
+                visit_webpage,
+                get_youtube_transcript,
+                download_task_file,
+            ],
+            model=self.model,
+            max_steps=8,
+            # Modules the agent's generated code is allowed to import.
+            additional_authorized_imports=[
+                "requests", "bs4", "json", "time", "math", "datetime",
+                "pandas", "numpy", "re", "csv", "urllib", "collections",
+                "itertools", "string", "unicodedata", "statistics",
+            ],
+        )
+        print("  ✅ Agent pronto!")
     def __call__(self, question: str, task_id: str = "") -> str:
+        """Answer one question, retrying the agent before using the direct fallback.
+
+        Args:
+            question: The raw question text; it is passed through
+                preprocess_question before use.
+            task_id: Optional task identifier used to pre-fetch an attached file.
+
+        Returns:
+            The cleaned agent answer if one of the two attempts passes
+            _is_valid, otherwise whatever _direct_fallback returns.
+        """
         print(f"\n{'─'*60}")
+        print(f"[Q]: {question[:150]}")
         print(f"[TASK]: {task_id}")
         processed = preprocess_question(question)
+        file_context = prefetch_file(task_id)
+        if file_context:
+            print(f"  [FILE PRE-FETCH]: {len(file_context)} chars")
+        prompt = self._build_prompt(processed, task_id, file_context)
+        # Run the agent with retry: two attempts, both using the same prompt.
+        for attempt in range(2):
             try:
+                time.sleep(3)  # Rate limit: Groq free = 30 RPM
                 raw = self.agent.run(prompt)
                 answer = clean_answer(str(raw))
+                if self._is_valid(answer):
+                    print(f"  [✅ AGENT attempt {attempt+1}]: {answer}")
                     return answer
+                print(f"  [⚠️ Invalid: '{answer}'] attempt {attempt+1}")
             except Exception as e:
+                err_str = str(e)
+                print(f"  [⚠️ ERROR attempt {attempt+1}]: {err_str[:200]}")
+                # Only errors that look like rate limiting get the extra
+                # wait; any other error moves straight on to the next attempt.
+                if "429" in err_str or "rate" in err_str.lower():
+                    print("  Waiting 15s for rate limit...")
+                    time.sleep(15)
+        # Direct fallback: reached when both attempts failed or were invalid.
+        print("  [→ FALLBACK DIRETTO]")
+        answer = self._direct_fallback(processed, file_context)
+        print(f"  [FINAL]: {answer}")
+        return answer
+    def _build_prompt(self, question: str, task_id: str, file_context: str) -> str:
+        """Assemble the instruction prompt sent to the agent.
+
+        Args:
+            question: The question text, placed at the end of the prompt.
+            task_id: When non-empty, a hint telling the agent to call
+                download_task_file with this id is included.
+            file_context: Pre-fetched file text; when non-empty, its first
+                4000 characters are embedded between file markers.
+
+        Returns:
+            The full prompt string.
+        """
+        file_hint = ""
+        if task_id:
+            file_hint = f'\nThis question has task_id="{task_id}". Call download_task_file("{task_id}") to check for attached files.'
+        extra = ""
         if file_context:
+            extra = f"\n\n--- ATTACHED FILE CONTENT ---\n{file_context[:4000]}\n--- END FILE ---\n"
+        return f"""You are an expert AI assistant solving GAIA benchmark questions.
+Your ONLY goal: find the EXACT correct answer.
+STRATEGY (follow in order):
+1. If question has a YouTube URL → call get_youtube_transcript(url)
+2. If question has any URL → call visit_webpage(url)
+3. If there might be an attached file → call download_task_file(task_id)
+4. For factual/historical questions → DuckDuckGoSearchTool, then visit_webpage for details
+5. For math/calculations → write and execute Python code directly
+6. If text looks reversed → reverse it: text[::-1]
+7. For Excel/CSV data → use pandas to compute the answer from the data
+ANSWER FORMAT (CRITICAL):
+- Output ONLY the bare final answer
+- Numbers: just the number (e.g., 3 or 12.5)
+- Names: just the name (e.g., Einstein)
+- Lists: comma-separated (e.g., cat, dog, bird)
+- NEVER say "The answer is..." or "Based on..." — just the raw answer
+- No periods at the end
+{file_hint}{extra}
+Question: {question}"""
+    def _is_valid(self, answer: str) -> bool:
+        if not answer or len(answer) < 1:
             return False
+        invalid = {
             "i don't know", "unknown", "n/a", "none", "error",
             "i cannot", "i can't", "not available", "no answer",
+            "could not", "unable to", "i'm not sure", "i am not sure",
+        }
         return answer.lower().strip() not in invalid
+    def _direct_fallback(self, question: str, file_context: str = "") -> str:
+        """Chiamata diretta a Groq senza smolagents."""
+        groq_key = os.getenv("GROQ_API_KEY", "")
+        if not groq_key:
+            return "I don't know"
+        extra = ""
+        if file_context:
+            extra = f"\n\nAttached file content:\n{file_context[:3000]}\n"
+        prompt = f"""Answer this question with ONLY the final answer.
+No explanation. No preamble. No "The answer is".
+Just the bare answer.
+- Numbers: just digits
+- Names: just the name
+- Lists: comma-separated
+- No period at the end
+{extra}
+Question: {question}
+Answer:"""
+        for model in ["llama-3.3-70b-versatile", "llama-3.1-8b-instant"]:
             try:
+                time.sleep(3)
+                resp = requests.post(
+                    "https://api.groq.com/openai/v1/chat/completions",
+                    headers={
+                        "Authorization": f"Bearer {groq_key}",
+                        "Content-Type": "application/json",
+                    },
+                    json={
+                        "model": model,
+                        "messages": [{"role": "user", "content": prompt}],
+                        "temperature": 0.1,
+                        "max_tokens": 200,
+                    },
+                    timeout=30,
+                )
+                if resp.status_code == 200:
+                    data = resp.json()
+                    raw = data["choices"][0]["message"]["content"].strip()
+                    answer = clean_answer(raw)
+                    if self._is_valid(answer):
+                        print(f"    [FALLBACK OK via {model}]: {answer}")
+                        return answer
+                elif resp.status_code == 429:
+                    print(f"    [RATE LIMITED {model}] — waiting 15s...")
+                    time.sleep(15)
+                    continue
+                else:
+                    print(f"    [FALLBACK {model}] HTTP {resp.status_code}: {resp.text[:200]}")
             except Exception as e:
+                print(f"    [FALLBACK {model} ERROR]: {e}")
+        return "I don't know"
 # ==========================================
     print(f"{'='*60}")
     try:
+        agent = GaiaAgent()
     except Exception as e:
         traceback.print_exc()
         return f"Errore inizializzazione agente: {e}", None
         if not task_id or question_text is None:
             continue
+        print(f"\n[{i+1}/{len(questions_data)}] ════════════════════════")
         try:
             answer = agent(question_text, task_id=task_id)
         except Exception as e:
 # ==========================================
+# 🖥️ INTERFACCIA
 # ==========================================
 with gr.Blocks() as demo:
+    gr.Markdown("# 🚀 GAIA Agent — Final Assignment")
     gr.Markdown(
+        "Powered by **Groq** (Llama 3.3 70B) — free & fast.\n\n"
+        "Login con HF, poi clicca il bottone."
     )
     gr.LoginButton()
     run_button = gr.Button("🔥 Avvia Valutazione & Invia Risposte", variant="primary")