MainStreet123 committed on
Commit
993aee8
·
verified ·
1 Parent(s): 00d93b9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +170 -301
app.py CHANGED
@@ -1,321 +1,193 @@
1
  import os
2
  import re
3
- import io
4
- import sys
5
  import gradio as gr
6
  import requests
7
  import pandas as pd
8
- from duckduckgo_search import DDGS
9
  from bs4 import BeautifulSoup
 
 
 
10
 
11
  # (Keep Constants as is)
12
  # --- Constants ---
13
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
14
- HF_INFERENCE_URL = "https://api-inference.huggingface.co/models"
15
- ROUTER_MODEL = "HuggingFaceH4/zephyr-7b-beta"
16
- EVALUATOR_MODEL = "HuggingFaceH4/zephyr-7b-beta"
17
- CODE_MODEL = "HuggingFaceH4/zephyr-7b-beta"
18
- EXTRACTOR_MODEL = "HuggingFaceH4/zephyr-7b-beta"
19
- MAX_MANAGER_ITERATIONS = 5
20
- MAX_WEB_PAGES_TO_VISIT = 3
21
- MAX_WEB_SEARCH_ROUNDS = 2
22
-
23
- # --- Tools (used by agents) ---
24
-
25
- def python_interpreter_tool(code: str) -> str:
26
- """Execute Python code and return stdout + result."""
27
- try:
28
- old_stdout = sys.stdout
29
- sys.stdout = buf = io.StringIO()
30
- try:
31
- local = {}
32
- exec(code, {"__builtins__": __builtins__}, local)
33
- out = buf.getvalue()
34
- if local.get("result") is not None:
35
- out = (out + "\n" + str(local["result"])).strip()
36
- return out or "(no output)"
37
- finally:
38
- sys.stdout = old_stdout
39
- except Exception as e:
40
- return f"Error: {e}"
41
 
42
 
43
- def duckduckgo_search_tool(query: str, max_results: int = 5) -> str:
44
- """Search DuckDuckGo and return snippets."""
 
45
  try:
46
- with DDGS() as ddgs:
47
- results = list(ddgs.text(query, max_results=max_results))
48
  if not results:
49
  return "No search results found."
50
- parts = []
51
- for r in results:
52
- title = r.get("title", "")
53
- body = r.get("body", "")
54
- href = r.get("href", "")
55
- parts.append(f"[{title}]({href})\n{body}")
56
- return "\n\n".join(parts)
57
  except Exception as e:
58
- return f"Search error: {e}"
59
 
60
 
61
- def visit_web_page_tool(url: str, max_chars: int = 8000) -> str:
62
- """Fetch a URL and return main text content."""
63
  try:
64
- headers = {"User-Agent": "Mozilla/5.0 (compatible; GAIA-Agent/1.0)"}
65
- resp = requests.get(url, timeout=15, headers=headers)
66
- resp.raise_for_status()
67
- soup = BeautifulSoup(resp.text, "html.parser")
68
- for tag in soup(["script", "style"]):
69
  tag.decompose()
70
  text = soup.get_text(separator="\n", strip=True)
71
- text = re.sub(r"\n{3,}", "\n\n", text)
72
- return text[:max_chars] if len(text) > max_chars else text
73
  except Exception as e:
74
- return f"Visit error: {e}"
75
-
76
-
77
- def _llm_call(prompt: str, model: str, max_new_tokens: int = 150) -> str:
78
- """Single LLM call via Hugging Face Inference API."""
79
- token = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN")
80
- if not token:
81
- return ""
82
- url = f"{HF_INFERENCE_URL}/{model}"
 
 
 
 
 
 
 
 
 
 
 
 
83
  try:
84
- r = requests.post(
85
- url,
86
- headers={"Authorization": f"Bearer {token}"},
87
- json={"inputs": prompt, "parameters": {"max_new_tokens": max_new_tokens, "return_full_text": False}},
88
- timeout=30,
89
- )
90
- if r.status_code != 200:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  return ""
92
- data = r.json()
93
- if isinstance(data, list) and len(data) > 0:
94
- return (data[0].get("generated_text") or "").strip()
95
- if isinstance(data, dict) and data.get("generated_text"):
96
- return str(data["generated_text"]).strip()
97
- return ""
98
- except Exception:
99
- return ""
100
-
101
-
102
- def manager_route_question(question: str) -> str:
103
- """Decide whether to use code agent or web search agent. Returns 'code' or 'web'."""
104
- q = question.lower()
105
- code_keywords = (
106
- "calculate", "compute", "python", "code", "program", "script", "function",
107
- "how many", "number of", "formula", "equation", "sum", "multiply", "divide",
108
- "percentage", "average", "median", "prime", "fibonacci", "factorial",
109
- "run code", "execute", "output of", "result of"
110
- )
111
- if any(k in q for k in code_keywords):
112
- return "code"
113
- prompt = f'Given this question, reply with exactly one word: "code" or "web". Question: {question[:300]}'
114
- out = _llm_call(prompt, ROUTER_MODEL, max_new_tokens=10).lower()
115
- if "code" in out:
116
- return "code"
117
- if "web" in out:
118
- return "web"
119
- return "web"
120
-
121
-
122
- def evaluate_accuracy_tool(question: str, answer: str) -> bool:
123
- """Use LLM to judge if answer looks mostly accurate. If no LLM, accept non-empty non-error answers."""
124
- if not answer or "Error:" in answer or "error:" in answer[:200]:
125
- return False
126
- prompt = (
127
- f'Question: {question}\nProposed answer: {answer[:800]}\n'
128
- 'Does this answer look mostly correct and complete? Reply with exactly "yes" or "no".'
129
- )
130
- out = _llm_call(prompt, EVALUATOR_MODEL, max_new_tokens=5).lower()
131
- if "yes" in out:
132
- return True
133
- if "no" in out:
134
- return False
135
- return len(answer.strip()) > 10 and "not found" not in answer.lower()[:100]
136
-
137
-
138
- def final_answer_tool(answer: str) -> str:
139
- """Commit the final answer (manager returns this as the answer)."""
140
- return answer.strip()
141
-
142
-
143
- def _looks_like_number(s: str) -> bool:
144
- s = s.strip().rstrip("%")
145
- try:
146
- float(s.replace(",", ""))
147
- return True
148
- except ValueError:
149
- return False
150
-
151
-
152
- def normalize_to_gaia_answer(question: str, raw_answer: str) -> str:
153
- """Extract a short, GAIA-style answer: one word, number, or short comma-separated list."""
154
- if not raw_answer or not raw_answer.strip():
155
- return raw_answer.strip() if raw_answer else ""
156
- raw = raw_answer.strip()
157
- lines = [ln.strip() for ln in raw.split("\n") if ln.strip()]
158
- for candidate in reversed(lines):
159
- if 1 <= len(candidate) <= 120 and "Error" not in candidate and "Could not" not in candidate:
160
- if candidate[0].isdigit() or (not candidate.startswith("(") and "http" not in candidate.lower()):
161
- if "," in candidate and len(candidate) < 80:
162
- return candidate
163
- if candidate.isdigit() or _looks_like_number(candidate):
164
- return candidate
165
- if len(candidate.split()) <= 8:
166
- return candidate
167
- numbers = re.findall(r"\b\d+(?:\.\d+)?%?\b", raw)
168
- if numbers:
169
- return numbers[-1]
170
- prompt = (
171
- f"Question: {question}\n\nLong answer or context:\n{raw[:1000]}\n\n"
172
- "Output ONLY the final answer: one word, one number, or a short comma-separated list (no explanation, no period at end). "
173
- "Example: Paris | 42 | apple, banana"
174
- )
175
- out = _llm_call(prompt, EXTRACTOR_MODEL, max_new_tokens=50).strip()
176
- if out:
177
- out = out.rstrip(".")
178
- if len(out) <= 150:
179
- return out
180
- for seg in re.split(r"[\n.!?]", raw):
181
- seg = seg.strip()
182
- if 1 <= len(seg) <= 100 and "Error" not in seg:
183
- return seg
184
- return raw[:200].strip()
185
-
186
-
187
- # --- Code Agent (has Python interpreter tool) ---
188
-
189
- def _extract_python_code(text: str) -> str:
190
- if not text:
191
- return ""
192
- text = text.strip()
193
- for marker in ["```python", "```"]:
194
- if marker in text:
195
- parts = text.split(marker, 1)
196
- if len(parts) > 1:
197
- rest = parts[1].split("```", 1)[0]
198
- return rest.strip()
199
- return text
200
-
201
-
202
- def _heuristic_code_from_question(question: str) -> str:
203
- numbers = re.findall(r"\d+(?:\.\d+)?", question)
204
- q = question.lower()
205
- if "how many" in q or "number of" in q:
206
- return "result = ' (code agent could not compute; try web search)'"
207
- if numbers and ("sum" in q or "total" in q or "+" in question):
208
- return f"result = {' + '.join(numbers)}"
209
- return "result = ' (no code generated; try web search)'"
210
-
211
-
212
- class CodeAgent:
213
- def __init__(self):
214
- print("CodeAgent initialized.")
215
-
216
- def __call__(self, question: str) -> str:
217
- print(f"CodeAgent received (first 50 chars): {question[:50]}...")
218
- prompt = (
219
- f"Question: {question}\n\n"
220
- "Write a single Python code block to answer this. Use a variable 'result' for the final answer. "
221
- "The value of 'result' must be a single number, one word, or a short phrase (GAIA format: no long explanation). "
222
- "Only output valid Python code, no explanation."
223
- )
224
- code = _llm_call(prompt, CODE_MODEL, max_new_tokens=400)
225
- if not code:
226
- code = _heuristic_code_from_question(question)
227
- code = _extract_python_code(code)
228
- if not code:
229
- return "Could not generate code for this question."
230
- return python_interpreter_tool(code)
231
-
232
-
233
- # --- Web Search Agent (DuckDuckGo + visit web page tools) ---
234
-
235
- def _urls_from_snippets(snippets: str, max_urls: int = 5) -> list:
236
- urls = []
237
- for line in snippets.split("\n"):
238
- m = re.search(r"\((https?://[^)]+)\)", line)
239
- if m:
240
- u = m.group(1)
241
- if u not in urls:
242
- urls.append(u)
243
- if len(urls) >= max_urls:
244
- break
245
- return urls
246
-
247
-
248
- class WebSearchAgent:
249
- def __init__(self):
250
- print("WebSearchAgent initialized.")
251
 
252
  def __call__(self, question: str) -> str:
253
- print(f"WebSearchAgent received (first 50 chars): {question[:50]}...")
254
- combined = ""
255
- for round_num in range(MAX_WEB_SEARCH_ROUNDS):
256
- query = question if round_num == 0 else f"{question} answer"
257
- snippets = duckduckgo_search_tool(query, max_results=6)
258
- if not snippets or "No search results" in snippets:
259
- if round_num == 0:
260
- return "No search results found."
261
- break
262
- combined += "\n\n--- Search round {} ---\n{}".format(round_num + 1, snippets)
263
- urls = _urls_from_snippets(snippets, max_urls=MAX_WEB_PAGES_TO_VISIT)
264
- for url in urls:
265
- page_text = visit_web_page_tool(url, max_chars=3500)
266
- if "Visit error" not in page_text:
267
- combined += "\n\n--- Page ---\n" + page_text[:3000]
268
- if round_num == 0 and len(combined) > 500:
269
- break
270
- if not combined:
271
- return "No search results found."
272
- prompt = (
273
- f"Question: {question}\n\nRelevant information:\n{combined[:7000]}\n\n"
274
- "Provide ONLY the final answer in GAIA format: one word, one number, or a short comma-separated list. No preamble, no explanation, no period at end."
275
  )
276
- answer = _llm_call(prompt, EVALUATOR_MODEL, max_new_tokens=200)
277
- if answer:
278
- return answer.strip()
279
- blocks = [b.strip() for b in combined.split("\n\n") if len(b.strip()) > 20]
280
- return blocks[0][:400] if blocks else combined[:400]
281
-
282
-
283
- # --- Manager Agent (user input = question; routes code/web; evaluates accuracy; final answer or retry) ---
284
-
285
- class ManagerAgent:
286
- def __init__(self):
287
- self.code_agent = CodeAgent()
288
- self.web_agent = WebSearchAgent()
289
- print("ManagerAgent initialized.")
290
-
291
- def __call__(self, question: str) -> str:
292
- print(f"Manager received question (first 50 chars): {question[:50]}...")
293
- best_answer = None
294
- tried_code = False
295
- tried_web = False
296
- for _ in range(MAX_MANAGER_ITERATIONS):
297
- route = manager_route_question(question)
298
- if route == "code" and not tried_code:
299
- tried_code = True
300
- reply = self.code_agent(question)
301
- elif route == "web" and not tried_web:
302
- tried_web = True
303
- reply = self.web_agent(question)
304
- else:
305
- if not tried_code:
306
- tried_code = True
307
- reply = self.code_agent(question)
308
- elif not tried_web:
309
- tried_web = True
310
- reply = self.web_agent(question)
311
- else:
312
- break
313
- if reply and "Error:" not in reply[:100] and "Could not" not in reply[:100]:
314
- best_answer = reply
315
- if evaluate_accuracy_tool(question, reply):
316
- return normalize_to_gaia_answer(question, final_answer_tool(reply))
317
- out = final_answer_tool(best_answer) if best_answer else "I could not determine a reliable answer."
318
- return normalize_to_gaia_answer(question, out)
319
 
320
  def run_and_submit_all( profile: gr.OAuthProfile | None):
321
  """
@@ -336,9 +208,9 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
336
  questions_url = f"{api_url}/questions"
337
  submit_url = f"{api_url}/submit"
338
 
339
- # 1. Instantiate Agent (multi-agent: Manager with Code + Web Search agents)
340
  try:
341
- agent = ManagerAgent()
342
  except Exception as e:
343
  print(f"Error instantiating agent: {e}")
344
  return f"Error initializing agent: {e}", None
@@ -440,20 +312,17 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
440
 
441
  # --- Build Gradio Interface using Blocks ---
442
  with gr.Blocks() as demo:
443
- gr.Markdown("# Multi-Agent GAIA Evaluation Runner")
444
  gr.Markdown(
445
  """
446
- **Architecture:** Manager Agent routes each question to either a **Code Agent** (Python interpreter) or **Web Search Agent** (DuckDuckGo + visit web page). The manager evaluates answer accuracy via an LLM; if mostly accurate it returns the final answer, otherwise it tries the other agent. Goal: score above 30 on GAIA.
447
-
448
  **Instructions:**
449
-
450
- 1. Clone this space, then modify the code to tune agents, tools, or add an API token (HF_TOKEN or HUGGING_FACE_HUB_TOKEN) for LLM routing and evaluation.
451
  2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
452
- 3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run the multi-agent system, submit answers, and see the score.
453
-
454
  ---
455
  **Disclaimers:**
456
- Running the evaluation can take a long time while the agent processes all questions. For better GAIA scores, set HF_TOKEN in Space secrets for LLM-based routing and accuracy checks.
 
457
  """
458
  )
459
 
 
1
  import os
2
  import re
3
+ import json
 
4
  import gradio as gr
5
  import requests
6
  import pandas as pd
7
+ from urllib.parse import quote
8
  from bs4 import BeautifulSoup
9
+ from dotenv import load_dotenv
10
+
11
+ load_dotenv()
12
 
13
  # (Keep Constants as is)
14
  # --- Constants ---
15
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
16
+ HF_TOKEN = os.getenv("HF_TOKEN", "")
17
+ REACT_MAX_STEPS = 10
18
+ LLM_MODEL = "Qwen/Qwen2.5-7B-Instruct"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
 
21
+ # --- Tools (DuckDuckGo search, web page view, code agent) ---
22
def tool_web_search(query: str, max_results: int = 5) -> str:
    """Search the web using DuckDuckGo. Input: search query string."""
    try:
        # Imported lazily so the module still loads when the package is absent.
        from duckduckgo_search import DDGS
        hits = list(DDGS().text(query, max_results=max_results))
        if not hits:
            return "No search results found."
        # One numbered entry per hit: title, URL, snippet.
        formatted = [
            f"{idx}. {hit.get('title', '')}\n URL: {hit.get('href', '')}\n {hit.get('body', '')}"
            for idx, hit in enumerate(hits, 1)
        ]
        return "\n\n".join(formatted)
    except Exception as e:
        # Any failure (network, rate limit, missing package) becomes a string
        # observation so the ReAct loop can keep going.
        return f"Web search error: {e}"
35
 
36
 
37
def tool_web_page_view(url: str) -> str:
    """View the main text content of a web page. Input: full URL string."""
    try:
        response = requests.get(
            url,
            timeout=15,
            headers={"User-Agent": "Mozilla/5.0 (compatible; ReActAgent/1.0)"},
        )
        response.raise_for_status()
        parsed = BeautifulSoup(response.text, "html.parser")
        # Drop boilerplate elements before extracting the readable text.
        for element in parsed(["script", "style", "nav", "footer", "header"]):
            element.decompose()
        content = parsed.get_text(separator="\n", strip=True)
        # Cap the observation size so it fits in the LLM context.
        if len(content) > 8000:
            return content[:8000]
        return content or "No text content found."
    except Exception as e:
        # Errors are returned as strings so the agent loop never crashes.
        return f"Web page view error: {e}"
50
+
51
+
52
def tool_code_agent(code: str) -> str:
    """Run Python code to compute an answer. Input: a single Python expression or block (e.g. print(2+2)). No file or network access.

    Returns the captured stdout (stripped), a placeholder message when the
    code produced no output, or an error string starting with "Code error:".

    Fix vs. previous version: the old implementation blindly wrapped any input
    lacking "print(" in print(...), which raised a SyntaxError for
    multi-statement blocks (e.g. "x = 5\nx * 2"). Now a bare expression is
    compiled in 'eval' mode and its value printed; anything else is executed
    as a statement block unchanged.
    """
    import builtins
    import io
    import sys
    # Whitelisted builtins only: no __import__, open, eval, exec, etc.
    safe_builtins = {
        "abs": builtins.abs, "all": builtins.all, "any": builtins.any,
        "bin": builtins.bin, "bool": builtins.bool, "chr": builtins.chr,
        "dict": builtins.dict, "divmod": builtins.divmod, "enumerate": builtins.enumerate,
        "filter": builtins.filter, "float": builtins.float, "format": builtins.format,
        "hash": builtins.hash, "int": builtins.int, "len": builtins.len,
        "list": builtins.list, "map": builtins.map, "max": builtins.max,
        "min": builtins.min, "next": builtins.next, "pow": builtins.pow,
        "print": builtins.print, "range": builtins.range, "repr": builtins.repr,
        "reversed": builtins.reversed, "round": builtins.round, "set": builtins.set,
        "sorted": builtins.sorted, "str": builtins.str, "sum": builtins.sum,
        "tuple": builtins.tuple, "zip": builtins.zip,
    }
    try:
        code = code.strip()
        # Capture everything the snippet prints.
        buf = io.StringIO()
        old_stdout = sys.stdout
        sys.stdout = buf
        try:
            try:
                # If the input is a single expression, evaluate it and print
                # the value so "2+2" answers "4" without an explicit print.
                expr = compile(code, "<agent>", "eval")
            except SyntaxError:
                # Statement block: execute as-is; output comes from print().
                exec(compile(code, "<agent>", "exec"),
                     {"__builtins__": safe_builtins, "print": builtins.print}, {})
            else:
                result = eval(expr, {"__builtins__": safe_builtins}, {})
                if result is not None:
                    print(result)
        finally:
            # Always restore stdout, even when the snippet raises.
            sys.stdout = old_stdout
        return buf.getvalue().strip() or "Code ran (no printed output)."
    except Exception as e:
        return f"Code error: {e}"
84
+
85
+
86
# Registry mapping tool names (exactly as the LLM writes them after
# "Action:") to the callables that implement them.
TOOLS = {
    "web_search": tool_web_search,
    "web_page_view": tool_web_page_view,
    "code_agent": tool_code_agent,
}

# Plain-text tool list injected into the system prompt so the model knows
# which Action names are valid and what input each one expects. The names
# here must stay in sync with the TOOLS keys above.
TOOL_DESCRIPTIONS = """Available tools:
- web_search: search the web with DuckDuckGo. Input: search query (string).
- web_page_view: get main text from a web page. Input: URL (string).
- code_agent: run Python code (math, string ops). Input: code (string)."""
96
+
97
+
98
+ # --- ReAct Agent: Plan -> Act -> Observe -> Reflect ---
99
# --- ReAct Agent: Plan -> Act -> Observe -> Reflect ---
class ReActAgent:
    """ReAct-style agent: the LLM alternates Thought/Action/Action Input
    turns (executed against TOOLS) with Observation feedback, until it emits
    "Final Answer: ..." or max_steps is exhausted.
    """

    def __init__(self, token: str | None = None, model: str = LLM_MODEL, max_steps: int = REACT_MAX_STEPS):
        # Explicit token wins; falls back to the HF_TOKEN env constant.
        self.token = (token or HF_TOKEN or "").strip()
        self.model = model
        self.max_steps = max_steps
        print("ReActAgent initialized (plan -> act -> observe -> reflect).")

    def _llm(self, messages: list[dict]) -> str:
        """Call the HF Inference API with the flattened chat and return the
        generated text, or an error string (never raises)."""
        if not self.token:
            return "Error: HF_TOKEN not set. Add your token in .env to use the LLM."
        url = f"https://api-inference.huggingface.co/models/{self.model}"
        headers = {"Authorization": f"Bearer {self.token}", "Content-Type": "application/json"}
        # Chat messages are flattened to a single prompt string because this
        # endpoint is called with the raw text-generation payload format.
        payload = {"inputs": self._messages_to_prompt(messages), "parameters": {"max_new_tokens": 512, "return_full_text": False}}
        try:
            r = requests.post(url, json=payload, headers=headers, timeout=60)
            r.raise_for_status()
            data = r.json()
            # The API may return either a list of generations or a dict.
            if isinstance(data, list) and len(data) > 0:
                return (data[0].get("generated_text") or "").strip()
            if isinstance(data, dict) and "generated_text" in data:
                return (data["generated_text"] or "").strip()
            return ""
        except Exception as e:
            return f"LLM error: {e}"

    def _messages_to_prompt(self, messages: list[dict]) -> str:
        """Flatten chat-style messages into a "System:/User:/Assistant:"
        transcript, ending with "Assistant:" to cue the model's reply."""
        out = []
        for m in messages:
            role = m.get("role", "user")
            content = m.get("content", "")
            if role == "system":
                out.append(f"System: {content}")
            elif role == "user":
                out.append(f"User: {content}")
            else:
                out.append(f"Assistant: {content}")
        out.append("Assistant:")
        return "\n\n".join(out)

    def _parse_action(self, text: str) -> tuple[str | None, str | None, str | None]:
        """Returns (thought, action, action_input) or (None, None, final_answer)."""
        text = text.strip()
        # A Final Answer takes precedence over any Action in the same reply.
        final_match = re.search(r"Final Answer\s*:\s*(.+?)(?=\n\n|\Z)", text, re.DOTALL | re.IGNORECASE)
        if final_match:
            return None, None, final_match.group(1).strip()
        action_match = re.search(r"Action\s*:\s*(\w+)", text, re.IGNORECASE)
        # Action Input runs until a blank line, the next Thought, or EOF.
        input_match = re.search(r"Action Input\s*:\s*(.+?)(?=\n\n|\nThought:|\Z)", text, re.DOTALL | re.IGNORECASE)
        thought = None
        thought_match = re.search(r"Thought\s*:\s*(.+?)(?=\nAction:|\Z)", text, re.DOTALL | re.IGNORECASE)
        if thought_match:
            thought = thought_match.group(1).strip()
        action = action_match.group(1).strip() if action_match else None
        action_input = input_match.group(1).strip() if input_match else None
        if action_input:
            # Models often quote the input; strip surrounding quotes.
            action_input = action_input.strip().strip('"\'')
        return thought, action, action_input

    def __call__(self, question: str) -> str:
        """Run the ReAct loop on one question and return the final answer
        string (or a fallback message if no answer was reached)."""
        print(f"ReAct agent received question (first 50 chars): {question[:50]}...")
        if not self.token:
            return "HF_TOKEN not set. Add your Hugging Face token in .env to run the ReAct agent."
        system = (
            "You are a ReAct agent. For each turn you must either:\n"
            "1. Output: Thought: <reasoning> then Action: <tool_name> then Action Input: <input>\n"
            "2. Or when you have the answer: Final Answer: <your answer>\n\n"
            + TOOL_DESCRIPTIONS
        )
        messages = [
            {"role": "system", "content": system},
            {"role": "user", "content": f"Question: {question}\n\nFirst, plan which tool(s) to use, then take action, then observe, then reflect. Give your final answer when done."},
        ]
        for step in range(self.max_steps):
            response = self._llm(messages)
            thought, action, action_input = self._parse_action(response)
            # (None, None, text) is the final-answer signature of _parse_action.
            if thought is None and action is None and action_input is not None:
                return action_input  # Final Answer
            if not action or action not in TOOLS:
                # Malformed reply: keep it in history and nudge the model
                # toward the required Thought/Action/Final Answer format.
                messages.append({"role": "assistant", "content": response})
                messages.append({"role": "user", "content": "You must use one of the tools (Action: tool_name, Action Input: input) or give Final Answer: your answer. Try again."})
                continue
            try:
                observation = TOOLS[action](action_input)
            except Exception as e:
                observation = f"Tool error: {e}"
            # Truncate long observations to keep the prompt within budget.
            observation = (observation[:3000] + "...") if len(observation) > 3000 else observation
            messages.append({"role": "assistant", "content": response})
            messages.append({"role": "user", "content": f"Observation: {observation}\n\nReflect: does this answer the question? If yes, reply with Final Answer: <answer>. If not, use another tool (Thought / Action / Action Input)."})
        # Max steps exhausted: salvage a Final Answer from the last reply if present.
        last_assistant = next((m["content"] for m in reversed(messages) if m.get("role") == "assistant"), "")
        final = self._parse_action(last_assistant)
        if final[2] and final[0] is None and final[1] is None:
            return final[2]
        return last_assistant[:500] if last_assistant else "ReAct agent reached max steps without a final answer."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
  def run_and_submit_all( profile: gr.OAuthProfile | None):
193
  """
 
208
  questions_url = f"{api_url}/questions"
209
  submit_url = f"{api_url}/submit"
210
 
211
+ # 1. Instantiate Agent ( modify this part to create your agent)
212
  try:
213
+ agent = ReActAgent(token=os.getenv("HF_TOKEN"), max_steps=REACT_MAX_STEPS)
214
  except Exception as e:
215
  print(f"Error instantiating agent: {e}")
216
  return f"Error initializing agent: {e}", None
 
312
 
313
  # --- Build Gradio Interface using Blocks ---
314
  with gr.Blocks() as demo:
315
+ gr.Markdown("# Basic Agent Evaluation Runner")
316
  gr.Markdown(
317
  """
 
 
318
  **Instructions:**
319
+ 1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
 
320
  2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
321
+ 3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
 
322
  ---
323
  **Disclaimers:**
324
+ Once you click the "Run Evaluation & Submit All Answers" button, it can take quite some time (this is the time for the agent to go through all the questions).
325
+ This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance, to address the delay after clicking submit, you could cache the answers and submit them in a separate action, or answer the questions asynchronously.
326
  """
327
  )
328