MainStreet123 committed on
Commit
2543503
·
verified ·
1 Parent(s): 547bdb7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +235 -101
app.py CHANGED
@@ -1,121 +1,253 @@
1
  import os
 
 
2
  import sys
3
  import gradio as gr
4
  import requests
5
  import pandas as pd
6
-
7
- # Allow importing smolagents from hf-smolagents venv when running from project root
8
- _root = os.path.dirname(os.path.abspath(__file__))
9
- _venv_lib = os.path.join(_root, "hf-smolagents", ".venv", "lib")
10
- if os.path.isdir(_venv_lib):
11
- for _name in os.listdir(_venv_lib):
12
- if _name.startswith("python"):
13
- _sp = os.path.join(_venv_lib, _name, "site-packages")
14
- if os.path.isdir(_sp):
15
- sys.path.insert(0, _sp)
16
- break
17
- else:
18
- _sp = None
19
- else:
20
- _sp = None
21
-
22
- from smolagents import CodeAgent, InferenceClientModel
23
- from smolagents.default_tools import (
24
- DuckDuckGoSearchTool,
25
- FinalAnswerTool,
26
- PythonInterpreterTool,
27
- UserInputTool,
28
- )
29
 
30
  # (Keep Constants as is)
31
  # --- Constants ---
32
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
 
 
 
33
 
34
- # --- Multi-Agent System (smolagents) ---
35
- def _create_model():
36
- token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
37
- return InferenceClientModel(
38
- model_id="Qwen/Qwen2.5-Coder-7B-Instruct",
39
- token=token,
40
- )
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
- def _create_code_agent(model):
44
- """Code agent: Python interpreter + final answer. For math, calculations, data."""
45
- return CodeAgent(
46
- tools=[PythonInterpreterTool(), UserInputTool()],
47
- model=model,
48
- name="code_agent",
49
- description="Use for math, calculations, data processing, or when the task can be solved by writing and running Python code. Call with a single clear task.",
50
- max_steps=15,
51
- )
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
- def _create_web_agent(model):
55
- """Web agent: DuckDuckGo search + final answer. For factual/search tasks."""
56
- return CodeAgent(
57
- tools=[DuckDuckGoSearchTool(), UserInputTool()],
58
- model=model,
59
- name="web_agent",
60
- description="Use for factual questions, current events, or when you need to search the web. Give one search task per call; you can call me multiple times with different queries.",
61
- max_steps=15,
62
- )
 
 
 
 
 
 
63
 
64
 
65
- def _create_evaluator_agent(model):
66
- """Evaluator: picks best answer from multiple candidates, returns via final_answer."""
67
- return CodeAgent(
68
- tools=[FinalAnswerTool(), UserInputTool()],
69
- model=model,
70
- name="evaluator_agent",
71
- description="Use to pick the single best answer from multiple candidate answers. Pass a task containing the original question and the list of candidate answers; return only the chosen best answer (concise, factual).",
72
- max_steps=5,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  )
 
 
 
 
 
 
74
 
75
 
76
- def _create_manager_agent(model):
77
- """Manager: decides code_agent vs web_agent; if web, runs multiple web queries then evaluator then final_answer."""
78
- code_agent = _create_code_agent(model)
79
- web_agent = _create_web_agent(model)
80
- evaluator_agent = _create_evaluator_agent(model)
81
- return CodeAgent(
82
- tools=[UserInputTool()],
83
- model=model,
84
- managed_agents=[code_agent, web_agent, evaluator_agent],
85
- name="manager",
86
- description="Orchestrator: decide whether to use code_agent or web_agent. For web tasks, call web_agent multiple times with different search queries, then call evaluator_agent with the question and all answers to pick the best one, then call final_answer with that result.",
87
- max_steps=25,
88
- planning_interval=3,
89
- )
 
 
 
 
 
90
 
 
 
 
 
 
 
 
 
91
 
92
- class BasicAgent:
93
- """Wrapper: builds manager-led multi-agent system and returns final answer string per question."""
94
 
 
95
  def __init__(self):
96
- print("Multi-agent system: initializing model and manager...")
97
- self._model = _create_model()
98
- self._manager = _create_manager_agent(self._model)
99
- print("Multi-agent system initialized (manager + code_agent, web_agent, evaluator_agent).")
100
 
101
  def __call__(self, question: str) -> str:
102
- print(f"Agent received question (first 80 chars): {question[:80]}...")
103
- try:
104
- result = self._manager.run(
105
- question,
106
- reset=True,
107
- stream=False,
108
- return_full_result=False,
109
- )
110
- if result is None:
111
- out = "No answer produced."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  else:
113
- out = str(result).strip()
114
- print(f"Agent returning answer (first 80 chars): {out[:80]}...")
115
- return out
116
- except Exception as e:
117
- print(f"Agent error: {e}")
118
- return f"AGENT ERROR: {e}"
 
 
 
 
 
 
 
119
 
120
  def run_and_submit_all( profile: gr.OAuthProfile | None):
121
  """
@@ -136,9 +268,9 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
136
  questions_url = f"{api_url}/questions"
137
  submit_url = f"{api_url}/submit"
138
 
139
- # 1. Instantiate Agent ( modify this part to create your agent)
140
  try:
141
- agent = BasicAgent()
142
  except Exception as e:
143
  print(f"Error instantiating agent: {e}")
144
  return f"Error initializing agent: {e}", None
@@ -240,17 +372,20 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
240
 
241
  # --- Build Gradio Interface using Blocks ---
242
  with gr.Blocks() as demo:
243
- gr.Markdown("# Basic Agent Evaluation Runner")
244
  gr.Markdown(
245
  """
 
 
246
  **Instructions:**
247
- 1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
 
248
  2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
249
- 3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
 
250
  ---
251
  **Disclaimers:**
252
- Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
253
- This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
254
  """
255
  )
256
 
@@ -289,5 +424,4 @@ if __name__ == "__main__":
289
  print("-"*(60 + len(" App Starting ")) + "\n")
290
 
291
  print("Launching Gradio Interface for Basic Agent Evaluation...")
292
- demo.launch(debug=True, share=False)
293
-
 
1
  import os
2
+ import re
3
+ import io
4
  import sys
5
  import gradio as gr
6
  import requests
7
  import pandas as pd
8
+ from duckduckgo_search import DDGS
9
+ from bs4 import BeautifulSoup
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
# (Keep Constants as is)
# --- Constants ---
# Scoring endpoint for the HF Agents course (serves questions, accepts submissions).
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
# Base URL of the Hugging Face serverless Inference API.
HF_INFERENCE_URL = "https://api-inference.huggingface.co/models"
# Model used for routing questions and drafting Python code.
ROUTER_MODEL = "HuggingFaceH4/zephyr-7b-beta"
# Model used for answer synthesis and accuracy judging.
EVALUATOR_MODEL = "HuggingFaceH4/zephyr-7b-beta"
# Upper bound on the manager's retry loop (it breaks earlier once both agents were tried).
MAX_MANAGER_ITERATIONS = 5

# --- Tools (used by agents) ---
 
 
 
 
 
 
20
 
21
def python_interpreter_tool(code: str) -> str:
    """Execute a Python snippet and return captured stdout plus its `result`.

    The snippet runs in one shared namespace passed as both globals and
    locals.  (Passing separate dicts — the previous behavior — broke any
    snippet whose top-level helper functions call each other, because name
    resolution inside a function body only consults globals.)

    Returns:
        Captured stdout, with `str(result)` appended when the snippet assigns
        a non-None `result`; "(no output)" when nothing was produced; or an
        "Error: ..." string when execution raised.
    """
    old_stdout = sys.stdout
    sys.stdout = buf = io.StringIO()
    try:
        # SECURITY: exec runs arbitrary (LLM-generated) code with full
        # builtins.  Acceptable only because this Space is a throwaway demo;
        # do not reuse this pattern on untrusted input elsewhere.
        namespace = {"__builtins__": __builtins__}
        # Single dict for globals AND locals so top-level defs are resolved
        # in the same namespace they were defined in.
        exec(code, namespace)
        out = buf.getvalue()
        if namespace.get("result") is not None:
            out = (out + "\n" + str(namespace["result"])).strip()
        return out or "(no output)"
    except Exception as e:
        return f"Error: {e}"
    finally:
        # Always restore stdout, even when exec raised.
        sys.stdout = old_stdout
37
 
 
 
 
 
 
 
 
 
 
38
 
39
def duckduckgo_search_tool(query: str, max_results: int = 5) -> str:
    """Run a DuckDuckGo text search and return markdown-ish snippet blocks.

    Each hit is rendered as "[title](url)" followed by its body snippet;
    blocks are separated by blank lines.  Errors are reported as a string
    rather than raised, so callers can treat the result uniformly as text.
    """
    try:
        with DDGS() as ddgs:
            hits = list(ddgs.text(query, max_results=max_results))
        if not hits:
            return "No search results found."
        formatted = [
            f"[{hit.get('title', '')}]({hit.get('href', '')})\n{hit.get('body', '')}"
            for hit in hits
        ]
        return "\n\n".join(formatted)
    except Exception as e:
        return f"Search error: {e}"
55
 
56
+
57
def visit_web_page_tool(url: str, max_chars: int = 8000) -> str:
    """Download a page and return its visible text, capped at *max_chars*.

    Scripts and style tags are stripped before extracting text; runs of three
    or more newlines are collapsed to two.  Network or parse failures are
    returned as a "Visit error: ..." string instead of raising.
    """
    try:
        resp = requests.get(
            url,
            timeout=15,
            headers={"User-Agent": "Mozilla/5.0 (compatible; GAIA-Agent/1.0)"},
        )
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        # Drop non-content tags before text extraction.
        for removable in soup(["script", "style"]):
            removable.decompose()
        cleaned = re.sub(r"\n{3,}", "\n\n", soup.get_text(separator="\n", strip=True))
        return cleaned[:max_chars]
    except Exception as e:
        return f"Visit error: {e}"
71
 
72
 
73
def _llm_call(prompt: str, model: str, max_new_tokens: int = 150) -> str:
    """Single text-generation call via the Hugging Face Inference API.

    Returns the generated text, or "" on any failure: missing token,
    non-200 response, unexpected payload shape, or a raised exception.
    Callers treat "" as "no LLM available" and fall back to heuristics.
    """
    token = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN")
    if not token:
        return ""
    try:
        resp = requests.post(
            f"{HF_INFERENCE_URL}/{model}",
            headers={"Authorization": f"Bearer {token}"},
            json={
                "inputs": prompt,
                "parameters": {"max_new_tokens": max_new_tokens, "return_full_text": False},
            },
            timeout=30,
        )
        if resp.status_code != 200:
            return ""
        payload = resp.json()
        # The API returns either a list of generations or a single dict.
        if isinstance(payload, list) and payload:
            return (payload[0].get("generated_text") or "").strip()
        if isinstance(payload, dict) and payload.get("generated_text"):
            return str(payload["generated_text"]).strip()
    except Exception:
        # Deliberate best-effort: any transport/parse error degrades to "".
        pass
    return ""
96
+
97
+
98
def manager_route_question(question: str) -> str:
    """Decide whether to use code agent or web search agent. Returns 'code' or 'web'."""
    lowered = question.lower()
    # Cheap keyword screen first; the LLM router is only consulted when no
    # code-ish keyword matches.
    code_keywords = (
        "calculate", "compute", "python", "code", "program", "script", "function",
        "how many", "number of", "formula", "equation", "sum", "multiply", "divide",
        "percentage", "average", "median", "prime", "fibonacci", "factorial",
        "run code", "execute", "output of", "result of"
    )
    for keyword in code_keywords:
        if keyword in lowered:
            return "code"
    prompt = f'Given this question, reply with exactly one word: "code" or "web". Question: {question[:300]}'
    verdict = _llm_call(prompt, ROUTER_MODEL, max_new_tokens=10).lower()
    # Ambiguous or empty router replies default to web search.
    return "code" if "code" in verdict else "web"
116
+
117
+
118
def evaluate_accuracy_tool(question: str, answer: str) -> bool:
    """Use LLM to judge if answer looks mostly accurate. If no LLM, accept non-empty non-error answers."""
    # Reject obviously broken answers up front (empty, or error markers).
    looks_broken = not answer or "Error:" in answer or "error:" in answer[:200]
    if looks_broken:
        return False
    prompt = (
        f'Question: {question}\nProposed answer: {answer[:800]}\n'
        'Does this answer look mostly correct and complete? Reply with exactly "yes" or "no".'
    )
    verdict = _llm_call(prompt, EVALUATOR_MODEL, max_new_tokens=5).lower()
    if "yes" in verdict:
        return True
    if "no" in verdict:
        return False
    # No usable LLM verdict: accept anything non-trivial that isn't a "not found".
    return len(answer.strip()) > 10 and "not found" not in answer.lower()[:100]
132
 
133
 
134
def final_answer_tool(answer: str) -> str:
    """Commit the final answer (manager returns this as the answer)."""
    trimmed = answer.strip()
    return trimmed
137
+
138
+
139
+ # --- Code Agent (has Python interpreter tool) ---
140
+
141
+ def _extract_python_code(text: str) -> str:
142
+ if not text:
143
+ return ""
144
+ text = text.strip()
145
+ for marker in ["```python", "```"]:
146
+ if marker in text:
147
+ parts = text.split(marker, 1)
148
+ if len(parts) > 1:
149
+ rest = parts[1].split("```", 1)[0]
150
+ return rest.strip()
151
+ return text
152
+
153
 
154
+ def _heuristic_code_from_question(question: str) -> str:
155
+ numbers = re.findall(r"\d+(?:\.\d+)?", question)
156
+ q = question.lower()
157
+ if "how many" in q or "number of" in q:
158
+ return "result = ' (code agent could not compute; try web search)'"
159
+ if numbers and ("sum" in q or "total" in q or "+" in question):
160
+ return f"result = {' + '.join(numbers)}"
161
+ return "result = ' (no code generated; try web search)'"
162
 
 
 
163
 
164
class CodeAgent:
    """Agent that answers a question by drafting Python code and running it.

    The LLM is asked for a code block that stores its answer in `result`;
    when no LLM output is available, a simple heuristic generator is used
    instead.  The extracted code is executed via python_interpreter_tool.
    """

    def __init__(self):
        print("CodeAgent initialized.")

    def __call__(self, question: str) -> str:
        print(f"CodeAgent received (first 50 chars): {question[:50]}...")
        prompt = (
            f"Question: {question}\n\n"
            "Write a single Python code block to answer this. Use a variable 'result' for the final answer. "
            "Only output valid Python code, no explanation."
        )
        generated = _llm_call(prompt, ROUTER_MODEL, max_new_tokens=400)
        if not generated:
            # LLM unavailable or empty reply: fall back to the heuristic.
            generated = _heuristic_code_from_question(question)
        code = _extract_python_code(generated)
        if not code:
            return "Could not generate code for this question."
        return python_interpreter_tool(code)
182
+
183
+
184
+ # --- Web Search Agent (DuckDuckGo + visit web page tools) ---
185
+
186
class WebSearchAgent:
    """Agent that answers via DuckDuckGo snippets plus one page visit.

    Collects search snippets, enriches them with the first linked page's
    text when available, and asks an LLM to distill a short answer.  When
    no LLM output comes back, it returns the first substantial snippet.
    """

    def __init__(self):
        print("WebSearchAgent initialized.")

    def __call__(self, question: str) -> str:
        print(f"WebSearchAgent received (first 50 chars): {question[:50]}...")
        snippets = duckduckgo_search_tool(question, max_results=5)
        if not snippets or "No search results" in snippets:
            return "No search results found."

        # Pull the first markdown-style "(http...)" link out of the snippets.
        url_matches = (
            re.search(r"\((https?://[^)]+)\)", line) for line in snippets.split("\n")
        )
        first_url = next((m.group(1) for m in url_matches if m), None)
        if first_url:
            page_text = visit_web_page_tool(first_url, max_chars=4000)
            if "Visit error" not in page_text:
                snippets = snippets + "\n\n--- Page content ---\n" + page_text[:3000]

        prompt = (
            f"Question: {question}\n\nRelevant information:\n{snippets[:6000]}\n\n"
            "Provide a concise, direct answer (string or number). No preamble."
        )
        answer = _llm_call(prompt, EVALUATOR_MODEL, max_new_tokens=200)
        if answer:
            return answer.strip()

        # No LLM: hand back the first reasonably long snippet block instead.
        candidates = [b.strip() for b in snippets.split("\n\n") if len(b.strip()) > 20]
        return candidates[0][:500] if candidates else snippets[:500]
214
+
215
+
216
+ # --- Manager Agent (user input = question; routes code/web; evaluates accuracy; final answer or retry) ---
217
+
218
class ManagerAgent:
    """Orchestrator: routes a question to the code or web agent.

    Each candidate reply is screened for error markers and judged by
    evaluate_accuracy_tool; an accepted reply is returned immediately via
    final_answer_tool.  If the preferred agent's reply is rejected, the
    other agent is tried once; after both have run, the best surviving
    reply (or a failure message) is returned.
    """

    def __init__(self):
        self.code_agent = CodeAgent()
        self.web_agent = WebSearchAgent()
        print("ManagerAgent initialized.")

    def __call__(self, question: str) -> str:
        print(f"Manager received question (first 50 chars): {question[:50]}...")
        # Route once up front: the decision depends only on the question, and
        # manager_route_question may issue a network LLM call — re-evaluating
        # it on every loop iteration (as before) wasted a remote call per try.
        route = manager_route_question(question)
        best_answer = None
        tried_code = False
        tried_web = False
        for _ in range(MAX_MANAGER_ITERATIONS):
            if route == "code" and not tried_code:
                tried_code = True
                reply = self.code_agent(question)
            elif route == "web" and not tried_web:
                tried_web = True
                reply = self.web_agent(question)
            else:
                # Preferred agent already tried: fall back to the other one,
                # or stop once both have been exhausted.
                if not tried_code:
                    tried_code = True
                    reply = self.code_agent(question)
                elif not tried_web:
                    tried_web = True
                    reply = self.web_agent(question)
                else:
                    break
            # Keep the reply only if it doesn't open with an error marker.
            if reply and "Error:" not in reply[:100] and "Could not" not in reply[:100]:
                best_answer = reply
                if evaluate_accuracy_tool(question, reply):
                    return final_answer_tool(reply)
        return final_answer_tool(best_answer) if best_answer else "I could not determine a reliable answer."
251
 
252
  def run_and_submit_all( profile: gr.OAuthProfile | None):
253
  """
 
268
  questions_url = f"{api_url}/questions"
269
  submit_url = f"{api_url}/submit"
270
 
271
+ # 1. Instantiate Agent (multi-agent: Manager with Code + Web Search agents)
272
  try:
273
+ agent = ManagerAgent()
274
  except Exception as e:
275
  print(f"Error instantiating agent: {e}")
276
  return f"Error initializing agent: {e}", None
 
372
 
373
  # --- Build Gradio Interface using Blocks ---
374
  with gr.Blocks() as demo:
375
+ gr.Markdown("# Multi-Agent GAIA Evaluation Runner")
376
  gr.Markdown(
377
  """
378
+ **Architecture:** Manager Agent routes each question to either a **Code Agent** (Python interpreter) or **Web Search Agent** (DuckDuckGo + visit web page). The manager evaluates answer accuracy via an LLM; if mostly accurate it returns the final answer, otherwise it tries the other agent. Goal: score above 30 on GAIA.
379
+
380
  **Instructions:**
381
+
382
+ 1. Clone this space, then modify the code to tune agents, tools, or add an API token (HF_TOKEN or HUGGING_FACE_HUB_TOKEN) for LLM routing and evaluation.
383
  2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
384
+ 3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run the multi-agent system, submit answers, and see the score.
385
+
386
  ---
387
  **Disclaimers:**
388
+ Running the evaluation can take a long time while the agent processes all questions. For better GAIA scores, set HF_TOKEN in Space secrets for LLM-based routing and accuracy checks.
 
389
  """
390
  )
391
 
 
424
  print("-"*(60 + len(" App Starting ")) + "\n")
425
 
426
  print("Launching Gradio Interface for Basic Agent Evaluation...")
427
+ demo.launch(debug=True, share=False)