Final_Assignment_Template

Sleeping

App Files Files Community

GilbertoEwaldFilho committed on Nov 26, 2025

Commit

c15943d

verified ·

1 Parent(s): e96252b

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -313

app.py CHANGED Viewed

@@ -22,13 +22,6 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 # ================================
 def clean_answer(text: str) -> str:
-    """
-    Limpa a resposta do modelo para bater em EXACT MATCH:
-    - remove prefixos tipo 'Final answer', 'Answer:'
-    - remove quebras de linha
-    - remove aspas externas
-    - normaliza espaços e ponto final solto
-    """
     if not text:
         return ""
@@ -39,7 +32,6 @@ def clean_answer(text: str) -> str:
         r"(?i)^answer[:\- ]*",
         r"(?i)^the answer is[:\- ]*",
         r"(?i)^my answer is[:\- ]*",
-        r"(?i)^resposta[:\- ]*",
     ]
     for p in patterns_to_remove:
         text = re.sub(p, "", text).strip()
@@ -48,134 +40,43 @@ def clean_answer(text: str) -> str:
     text = re.sub(r"\s+", " ", text).strip()
     if len(text) > 2 and text.startswith(("'", '"')) and text.endswith(("'", '"')):
-        text = text[1:-1].strip()
     if text.endswith(".") and not re.search(r"[0-9A-Za-z][.!?]$", text[:-1]):
-        text = text[:-1].strip()
-    return text
 def enforce_numeric_format(question: str, answer: str) -> str:
-    """
-    Para questões que pedem número / duas casas / USD:
-    tenta extrair só o número principal e formatar certo.
-    """
     q = question.lower()
-    a = answer
-    # USD com duas casas decimais
-    if ("usd" in q or "$" in q) and (
-        "two decimal places" in q or "2 decimal places" in q
-    ):
-        match = re.search(r"[-+]?\d+(?:[.,]\d+)?", a)
-        if match:
-            try:
-                value = float(match.group(0).replace(",", ""))
-                return f"{value:.2f}"
-            except Exception:
-                pass
-    # Duas casas decimais sem necessariamente USD
     if "two decimal places" in q or "2 decimal places" in q:
-        match = re.search(r"[-+]?\d+(?:[.,]\d+)?", a)
         if match:
             try:
                 value = float(match.group(0).replace(",", ""))
                 return f"{value:.2f}"
-            except Exception:
                 pass
-    # Contagens / anos etc.
-    if any(
-        kw in q
-        for kw in [
-            "how many",
-            "number of",
-            "at bats",
-            "population",
-            "what year",
-            "in which year",
-        ]
-    ):
-        match = re.search(r"-?\d+", a.replace(",", ""))
         if match:
             return match.group(0)
-    return a
-def postprocess_vegetable_question(question: str, answer: str) -> str:
-    """
-    Ajuste especial para a questão dos VEGETAIS com definição BOTÂNICA.
-    - Remove claramente fruits/herbs da lista
-    - Ordena alfabeticamente
-    - Garante formato 'item, item, item'
-    """
-    q = question.lower()
-    if "vegetables" not in q:
-        return answer
-    if "botany" not in q and "botanical" not in q:
-        return answer
-    # Tenta quebrar a resposta em itens separados por vírgula
-    items_raw = [x.strip() for x in answer.split(",") if x.strip()]
-    if not items_raw:
-        return answer
-    # normalização pra comparar
-    normalized_map = {item.lower(): item for item in items_raw}
-    # lista de itens que NÃO devem entrar como vegetable: fruits, herbs, seeds etc.
-    # baseado especificamente na lista dessa questão
-    blacklist = {
-        "plums",
-        "green beans",
-        "rice",
-        "corn",
-        "bell pepper",
-        "whole bean coffee",
-        "whole allspice",
-        "acorns",
-        "peanuts",
-        "fresh basil",
-        "basil",
-        "oreos",
-        "milk",
-        "eggs",
-        "flour",
-    }
-    filtered = []
-    for low, original in normalized_map.items():
-        if low in blacklist:
-            continue
-        filtered.append(original)
-    if not filtered:
-        # se por algum motivo removemos tudo, volta original
-        filtered = list(normalized_map.values())
-    # ordena alfabeticamente ignorando maiúsculas/minúsculas
-    filtered_sorted = sorted(filtered, key=lambda x: x.lower())
-    return ", ".join(filtered_sorted)
 def web_search(question: str, max_results: int = 5) -> str:
-    """
-    Usa DuckDuckGo (ddgs) pra buscar contexto web.
-    """
-    snippets: List[str] = []
     try:
         with DDGS() as ddgs:
             for r in ddgs.text(question, max_results=max_results, safesearch="moderate"):
-                title = r.get("title") or ""
-                body = r.get("body") or ""
-                url = r.get("href") or ""
-                snippet = f"{title}\n{body}\nURL: {url}"
-                snippets.append(snippet)
     except Exception as e:
         print("[WEB SEARCH ERROR]", e)
         return ""
@@ -187,9 +88,6 @@ def web_search(question: str, max_results: int = 5) -> str:
 def get_file_context(api_url: str, task_id: str, item: dict) -> str:
-    """
-    Baixa arquivo em /files/{task_id} se existir e extrai texto/tab.
-    """
     file_name = (
         item.get("file_name")
         or item.get("filename")
@@ -217,7 +115,7 @@ def get_file_context(api_url: str, task_id: str, item: dict) -> str:
         if any(name_lower.endswith(ext) for ext in [".txt", ".csv", ".tsv"]):
             try:
                 text = data.decode("utf-8", errors="replace")
-            except Exception:
                 text = data.decode("latin-1", errors="replace")
             return f"[FILE TXT]\n{text[:8000]}"
@@ -231,8 +129,7 @@ def get_file_context(api_url: str, task_id: str, item: dict) -> str:
                 print("[EXCEL PARSE ERROR]", e)
                 return "[FILE] Spreadsheet exists but cannot parse."
-        # Outros tipos
-        return f"[FILE BINARY: {file_name}] {len(data)} bytes (type: {content_type})"
     except Exception as e:
         print("[FILE ERROR]", e)
@@ -245,17 +142,12 @@ def get_file_context(api_url: str, task_id: str, item: dict) -> str:
 SYSTEM_INSTRUCTIONS = """
 You are a highly accurate GAIA benchmark agent.
-Your answers are evaluated with EXACT MATCH.
-Core rules:
-- Think step by step INTERNALLY, but NEVER show your reasoning.
-- Output ONLY the final answer string, no explanations, no extra words.
-- Do NOT write prefixes like "Final answer:", "Answer is:", etc.
-- If the answer is a number, output only the number (no units) unless the format explicitly requires otherwise.
-- If the answer must have two decimal places (e.g. USD values), ensure exactly two decimal places.
-- If the answer is a list, output it exactly in the requested format (e.g. comma-separated, alphabetical order).
-- Carefully use both the provided file content (if any) and the web search snippets.
-- If external context is noisy or contradictory, prefer sources that match the question's constraints (dates, names, etc.).
 """
@@ -264,13 +156,6 @@ Core rules:
 # ================================
 class GaiaAgent:
-    """
-    Agente tunado:
-    - Qwen3-Next-80B-A3B-Thinking via chat_completion
-    - web search (ddgs)
-    - file context (txt/csv/excel)
-    - pós-processamento de número / USD / vegetais-botânica
-    """
     def __init__(self):
         print("Initializing GAIA Agent with Qwen 80B...")
@@ -283,67 +168,22 @@ class GaiaAgent:
             token=token,
         )
-    def build_prompt(self, question: str, search_ctx: str, file_ctx: str) -> str:
-        q = question.lower()
-        extra_guidance = []
-        # DICAS ESPECÍFICAS POR TIPO DE QUESTÃO
-        # Questões de USD / duas casas decimais
-        if "usd" in q or "dollars" in q or "two decimal places" in q:
-            extra_guidance.append(
-                "- If the answer is a monetary value, output only the numeric value with exactly two decimal places "
-                "(no currency symbol)."
-            )
-        # Questão de vegetais com definição botânica
-        if "vegetables" in q and ("botany" in q or "botanical" in q):
-            extra_guidance.append(
-                "- Use strict botanical definitions: fruits are seed-bearing structures (e.g., plums, bell peppers, "
-                "corn kernels, acorns, peanuts, beans, grains). Vegetables are other edible plant parts such as leaves, "
-                "stems, flowers, or roots (e.g., lettuce, celery, broccoli, sweet potatoes)."
-            )
-            extra_guidance.append(
-                "- Do NOT include any botanical fruits or seeds in the vegetable list, even if they are commonly "
-                "treated as vegetables in cooking."
-            )
-        # Questões de contagem/quantidade
-        if "how many" in q or "number of" in q or "at bats" in q:
-            extra_guidance.append(
-                "- Carefully count the exact quantity requested and output only that integer number."
-            )
-        # Questões de lista (ex: nomes separados por vírgula)
-        if "comma separated" in q or "comma-delimited" in q or "comma separated list" in q:
-            extra_guidance.append(
-                "- Output a single line with items separated by a comma and a space (e.g., 'item1, item2, item3')."
-            )
-        guidance_block = ""
-        if extra_guidance:
-            guidance_block = "\nAdditional question-specific rules:\n" + "\n".join(extra_guidance)
-        prompt = (
-            f"{SYSTEM_INSTRUCTIONS.strip()}\n"
-            f"{guidance_block}\n\n"
             f"QUESTION:\n{question}\n\n"
-            f"FILE CONTEXT (may be partial or noisy):\n{file_ctx or 'No file content.'}\n\n"
-            f"WEB SEARCH CONTEXT (may be partial or noisy):\n{search_ctx or 'No web search results.'}\n\n"
-            "Using ALL the reliable information above, deduce the correct answer.\n"
-            "Remember: DO NOT show your reasoning, only output the final answer string.\n"
-            "Answer:"
         )
-        return prompt
     def __call__(self, question: str, file_context: str = "") -> str:
-        print("\n" + "=" * 60)
         print("NEW QUESTION:")
         print(question)
-        print("=" * 60 + "\n")
-        search_ctx = web_search(question, max_results=5)
         print(f"[SEARCH LEN] {len(search_ctx)} | [FILE LEN] {len(file_context)}")
         prompt = self.build_prompt(question, search_ctx, file_context)
@@ -354,9 +194,8 @@ class GaiaAgent:
                     {"role": "system", "content": SYSTEM_INSTRUCTIONS},
                     {"role": "user", "content": prompt},
                 ],
-                max_tokens=220,
                 temperature=0.0,
-                top_p=1.0,
             )
             raw = response.choices[0].message["content"]
             print("[RAW OUTPUT]", raw)
@@ -366,7 +205,6 @@ class GaiaAgent:
         answer = clean_answer(raw)
         answer = enforce_numeric_format(question, answer)
-        answer = postprocess_vegetable_question(question, answer)
         print("[FINAL ANSWER]", answer)
         return answer
@@ -377,136 +215,70 @@ class GaiaAgent:
 # ================================
 def run_and_submit_all(profile: Optional[gr.OAuthProfile]):
-    """
-    Pipeline completo:
-    - busca questões
-    - tenta baixar arquivo (/files/{task_id})
-    - faz web search
-    - responde com GaiaAgent
-    - submete respostas ao /submit
-    """
     if not profile:
-        return "Please Login to Hugging Face with the button.", None
     username = profile.username
-    print(f"User logged in: {username}")
     api_url = DEFAULT_API_URL
     questions_url = f"{api_url}/questions"
     submit_url = f"{api_url}/submit"
     space_id = os.getenv("SPACE_ID")
-    agent_code = (
-        f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else ""
-    )
     print(f"Agent code URL: {agent_code}")
-    # Instanciar agente
     try:
         agent = GaiaAgent()
     except Exception as e:
-        print("Error instantiating agent:", e)
         return f"Error initializing agent: {e}", None
-    # Buscar questões
-    print(f"Fetching questions from: {questions_url}")
     try:
         resp = requests.get(questions_url, timeout=120)
         resp.raise_for_status()
-        questions_data = resp.json()
-        if not questions_data:
-            return "Fetched questions list is empty or invalid format.", None
-        print(f"Fetched {len(questions_data)} questions.")
     except Exception as e:
-        print("Error fetching questions:", e)
         return f"Error fetching questions: {e}", None
-    # Rodar agente em cada questão
-    results_log = []
-    answers_payload = []
-    print(f"Running agent on {len(questions_data)} questions...")
-    for item in questions_data:
-        task_id = item.get("task_id")
-        question_text = item.get("question")
-        if not task_id or question_text is None:
-            print("Skipping item with missing task_id or question:", item)
-            continue
-        file_context = get_file_context(api_url, task_id, item)
-        try:
-            submitted_answer = agent(question_text, file_context)
-        except Exception as e:
-            print(f"Error running agent on task {task_id}:", e)
-            submitted_answer = ""
-        answers_payload.append(
-            {"task_id": task_id, "submitted_answer": submitted_answer}
-        )
-        results_log.append(
-            {
-                "Task ID": task_id,
-                "Question": question_text,
-                "Submitted Answer": submitted_answer,
-            }
-        )
-    if not answers_payload:
-        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
-    submission_data = {
-        "username": username.strip(),
         "agent_code": agent_code,
         "answers": answers_payload,
     }
-    print(
-        f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
-    )
-    print(f"Submitting to: {submit_url}")
     try:
-        resp = requests.post(submit_url, json=submission_data)
         resp.raise_for_status()
-        result_data = resp.json()
-        final_status = (
             f"Submission Successful!\n"
-            f"User: {result_data.get('username')}\n"
-            f"Overall Score: {result_data.get('score', 'N/A')}% "
-            f"({result_data.get('correct_count', '?')}/"
-            f"{result_data.get('total_attempted', '?')} correct)\n"
-            f"Message: {result_data.get('message', 'No message received.')}"
         )
-        print("Submission successful.")
-        results_df = pd.DataFrame(results_log)
-        return final_status, results_df
-    except requests.exceptions.HTTPError as e:
-        error_detail = f"Server responded with status {e.response.status_code}."
-        try:
-            error_json = e.response.json()
-            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
-        except Exception:
-            error_detail += f" Response: {e.response.text[:500]}"
-        status_message = f"Submission Failed: {error_detail}"
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
-    except requests.exceptions.RequestException as e:
-        status_message = f"Submission Failed: Network error - {e}"
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
     except Exception as e:
-        status_message = f"An unexpected error occurred during submission: {e}"
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
 # ================================
@@ -514,37 +286,17 @@ def run_and_submit_all(profile: Optional[gr.OAuthProfile]):
 # ================================
 with gr.Blocks() as demo:
-    gr.Markdown("# GAIA Agent Evaluation Runner (Qwen 80B – Tuned Version)")
-    gr.Markdown(
-        """
-        **How to use**
-        1. Log in with your Hugging Face account.
-        2. Make sure this Space is public and has a Secret `HF_TOKEN`
-           with Inference permissions.
-        3. Click **"Run Evaluation & Submit All Answers"** and wait.
-        The agent will:
-        - fetch all questions,
-        - optionally download attached files (if any),
-        - perform web search,
-        - answer each question with ONLY the final answer (EXACT MATCH friendly),
-        - submit to the scoring API.
-        """
-    )
     gr.LoginButton()
     run_button = gr.Button("Run Evaluation & Submit All Answers")
-    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5)
-    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
-    run_button.click(
-        fn=run_and_submit_all,
-        outputs=[status_output, results_table],
-    )
 if __name__ == "__main__":
-    print("\n" + "-" * 30 + " App Starting " + "-" * 30)
-    demo.launch(debug=True, share=False)

 # ================================
 def clean_answer(text: str) -> str:
     if not text:
         return ""
         r"(?i)^answer[:\- ]*",
         r"(?i)^the answer is[:\- ]*",
         r"(?i)^my answer is[:\- ]*",
     ]
     for p in patterns_to_remove:
         text = re.sub(p, "", text).strip()
     text = re.sub(r"\s+", " ", text).strip()
     if len(text) > 2 and text.startswith(("'", '"')) and text.endswith(("'", '"')):
+        text = text[1:-1]
     if text.endswith(".") and not re.search(r"[0-9A-Za-z][.!?]$", text[:-1]):
+        text = text[:-1]
+    return text.strip()
 def enforce_numeric_format(question: str, answer: str) -> str:
     """Coerce *answer* into the numeric shape the question asks for.

     Two rules are applied, in this order:

     1. If the question mentions "two decimal places" / "2 decimal places",
        the first number found in *answer* is returned formatted with exactly
        two decimals (no currency symbol, no thousands separators).
     2. If the question contains one of the counting/year keywords
        ("how many", "number of", "what year", "in which year"), the first
        integer found in *answer* (after removing commas) is returned.

     If neither rule produces a match, *answer* is returned unchanged.

     Args:
         question: The original question text; only used for keyword checks.
         answer: The already-cleaned model answer.

     Returns:
         The reformatted numeric string, or *answer* itself when no rule fires.
     """
     q = question.lower()

     if "two decimal places" in q or "2 decimal places" in q:
         # Two alternatives, tried left to right at each position:
         #   grouped -> 1,234 / 1,234,567.89  (commas are thousands separators)
         #   plain   -> 1234 / 1234.5 / 3,5   (a lone comma is a decimal mark)
         # Distinguishing them avoids truncating "1,234.56" to "1,234" and
         # avoids turning "3,5" into 35.
         match = re.search(
             r"[-+]?(?:(?P<grouped>\d{1,3}(?:,\d{3})+(?:\.\d+)?)"
             r"|(?P<plain>\d+(?:[.,]\d+)?))",
             answer,
         )
         if match:
             token = match.group(0)
             if match.group("grouped"):
                 token = token.replace(",", "")
             else:
                 token = token.replace(",", ".")
             # The regex guarantees `token` is a valid float literal, so no
             # blanket exception handler is needed here.
             return f"{float(token):.2f}"

     if any(kw in q for kw in ("how many", "number of", "what year", "in which year")):
         # Drop thousands separators so "1,250" is read as 1250.
         match = re.search(r"-?\d+", answer.replace(",", ""))
         if match:
             return match.group(0)

     return answer
 def web_search(question: str, max_results: int = 5) -> str:
+    snippets = []
     try:
         with DDGS() as ddgs:
             for r in ddgs.text(question, max_results=max_results, safesearch="moderate"):
+                title = r.get("title", "")
+                body = r.get("body", "")
+                url = r.get("href", "")
+                snippets.append(f"{title}\n{body}\nURL: {url}")
     except Exception as e:
         print("[WEB SEARCH ERROR]", e)
         return ""
 def get_file_context(api_url: str, task_id: str, item: dict) -> str:
     file_name = (
         item.get("file_name")
         or item.get("filename")
         if any(name_lower.endswith(ext) for ext in [".txt", ".csv", ".tsv"]):
             try:
                 text = data.decode("utf-8", errors="replace")
+            except:
                 text = data.decode("latin-1", errors="replace")
             return f"[FILE TXT]\n{text[:8000]}"
                 print("[EXCEL PARSE ERROR]", e)
                 return "[FILE] Spreadsheet exists but cannot parse."
+        return f"[FILE BINARY: {file_name}] {len(data)} bytes"
     except Exception as e:
         print("[FILE ERROR]", e)
 SYSTEM_INSTRUCTIONS = """
 You are a highly accurate GAIA benchmark agent.
+Always output ONLY the final answer (EXACT MATCH).
+No explanations. No reasoning. No extra words.
+Rules:
+- If the answer is a number → only the number.
+- If format requires 2 decimal places → enforce it.
+- If a list is required → output in exact requested form.
 """
 # ================================
 class GaiaAgent:
     def __init__(self):
         print("Initializing GAIA Agent with Qwen 80B...")
             token=token,
         )
+    def build_prompt(self, question, search_ctx, file_ctx):
+        return (
+            f"{SYSTEM_INSTRUCTIONS}\n\n"
             f"QUESTION:\n{question}\n\n"
+            f"FILE CONTEXT:\n{file_ctx or 'No file provided.'}\n\n"
+            f"WEB SEARCH CONTEXT:\n{search_ctx or 'No search results.'}\n\n"
+            "Now output ONLY the final answer:\n"
         )
     def __call__(self, question: str, file_context: str = "") -> str:
+        print("\n====================================================")
         print("NEW QUESTION:")
         print(question)
+        print("====================================================\n")
+        search_ctx = web_search(question)
         print(f"[SEARCH LEN] {len(search_ctx)} | [FILE LEN] {len(file_context)}")
         prompt = self.build_prompt(question, search_ctx, file_context)
                     {"role": "system", "content": SYSTEM_INSTRUCTIONS},
                     {"role": "user", "content": prompt},
                 ],
+                max_tokens=200,
                 temperature=0.0,
             )
             raw = response.choices[0].message["content"]
             print("[RAW OUTPUT]", raw)
         answer = clean_answer(raw)
         answer = enforce_numeric_format(question, answer)
         print("[FINAL ANSWER]", answer)
         return answer
 # ================================
 def run_and_submit_all(profile: Optional[gr.OAuthProfile]):
     """Run the agent over every question and submit the answers for scoring.

     Steps: check that a user is logged in, instantiate ``GaiaAgent``, fetch
     the question list from ``{DEFAULT_API_URL}/questions``, answer each
     question (with any file context from ``get_file_context``), and POST the
     answers to ``{DEFAULT_API_URL}/submit``.

     Args:
         profile: The logged-in user's OAuth profile, or a falsy value when
             nobody is logged in.

     Returns:
         A ``(status_message, results)`` tuple. ``results`` is a
         ``pandas.DataFrame`` of task id / question / submitted answer once
         the agent loop has run, and ``None`` when the run stops earlier.
     """
     if not profile:
         return "Please log in first.", None

     username = profile.username
     api_url = DEFAULT_API_URL
     questions_url = f"{api_url}/questions"
     submit_url = f"{api_url}/submit"

     # SPACE_ID may be unset (e.g. when running locally); avoid emitting a
     # bogus ".../spaces/None/tree/main" link in that case.
     space_id = os.getenv("SPACE_ID")
     agent_code = (
         f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else ""
     )
     print(f"User logged in: {username}")
     print(f"Agent code URL: {agent_code}")

     try:
         agent = GaiaAgent()
     except Exception as e:
         return f"Error initializing agent: {e}", None

     print("Fetching questions...")
     try:
         resp = requests.get(questions_url, timeout=120)
         resp.raise_for_status()
         questions = resp.json()
     except Exception as e:
         return f"Error fetching questions: {e}", None

     if not isinstance(questions, list) or not questions:
         return "Fetched questions list is empty or invalid format.", None
     print(f"Fetched {len(questions)} questions.")

     answers_payload = []
     results_log = []
     for item in questions:
         # Skip malformed entries instead of aborting the whole run.
         if not isinstance(item, dict):
             print("Skipping malformed item:", item)
             continue
         qid = item.get("task_id")
         qtext = item.get("question")
         if not qid or qtext is None:
             print("Skipping item with missing task_id or question:", item)
             continue

         file_context = get_file_context(api_url, qid, item)
         try:
             answer = agent(qtext, file_context)
         except Exception as e:
             # One failing question must not discard the answers already
             # collected; submit an empty answer for it and keep going.
             print(f"Error running agent on task {qid}:", e)
             answer = ""

         answers_payload.append({"task_id": qid, "submitted_answer": answer})
         results_log.append({"Task ID": qid, "Question": qtext, "Submitted Answer": answer})

     if not answers_payload:
         return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

     submission = {
         "username": username,
         "agent_code": agent_code,
         "answers": answers_payload,
     }

     print("Submitting answers...")
     try:
         # Explicit timeout: without one, requests can block indefinitely.
         resp = requests.post(submit_url, json=submission, timeout=60)
         resp.raise_for_status()
         result = resp.json()
         status = (
             f"Submission Successful!\n"
             f"Score: {result.get('score')}% "
             f"({result.get('correct_count')}/{result.get('total_attempted')})\n"
             f"{result.get('message')}"
         )
         return status, pd.DataFrame(results_log)
     except Exception as e:
         return f"Submission failed: {e}", pd.DataFrame(results_log)
 # ================================
 # ================================
 with gr.Blocks() as demo:
+    gr.Markdown("## GAIA Agent Runner – Qwen 80B Enhanced Version")
     gr.LoginButton()
     run_button = gr.Button("Run Evaluation & Submit All Answers")
+    out_status = gr.Textbox(label="Status", lines=4)
+    out_table = gr.DataFrame(label="Answers")
+    run_button.click(run_and_submit_all, outputs=[out_status, out_table])
 if __name__ == "__main__":
+    demo.launch(debug=True, share=False)