Spaces:

schoolkithub
/

multi-agent-gaia-system

Runtime error

App Files Files Community

Omachoko committed on Jun 29, 2025

Commit

83a3deb

1 Parent(s): a9d900f

Final: robust GAIA agent with advanced tool registry, GPT-4.1, web search, strict output, and full multi-modal support

Browse files

Files changed (1) hide show

app.py +90 -39

app.py CHANGED Viewed

@@ -24,6 +24,7 @@ from huggingface_hub import InferenceClient
 import cv2
 import torch
 from bs4 import BeautifulSoup
 logging.basicConfig(filename='gaia_agent.log', level=logging.INFO, format='%(asctime)s %(levelname)s:%(message)s')
 logger = logging.getLogger(__name__)
@@ -198,6 +199,37 @@ def youtube_video_qa(youtube_url, question):
         logging.error(f"YouTube video QA error: {e}")
         return f"Video analysis error: {e}"
 TOOL_REGISTRY = {
     "llama3_chat": llama3_chat,
     "mixtral_chat": mixtral_chat,
@@ -207,6 +239,8 @@ TOOL_REGISTRY = {
     "image_caption": image_caption,
     "code_analysis": code_analysis,
     "youtube_video_qa": youtube_video_qa,
 }
 class ModularGAIAAgent:
@@ -304,63 +338,80 @@ class ModularGAIAAgent:
             self.reasoning_trace.append(f"Unknown file type: {file_name}")
             return None
     def answer_question(self, question_obj):
         self.reasoning_trace = []
         q = question_obj["question"]
         file_name = question_obj.get("file_name", "")
         file_content = None
         file_type = None
-        # YouTube video question detection
-        if "youtube.com" in q or "youtu.be" in q:
-            url = None
-            for word in q.split():
-                if "youtube.com" in word or "youtu.be" in word:
-                    url = word.strip().strip(',')
-                    break
-            if url:
-                answer = self.tools['youtube_video_qa'](url, q)
-                self.reasoning_trace.append(f"YouTube video analyzed: {url}")
-                self.reasoning_trace.append(f"Final answer: {answer}")
-                return self.format_answer(answer), self.reasoning_trace
         if file_name:
             file_id = file_name.split('.')[0]
             local_file = self.download_file(file_id, file_name)
             if local_file:
                 file_type = self.detect_file_type(local_file)
                 file_content = self.analyze_file(local_file, file_type)
-        # Plan: choose tool based on question and file
-        if file_type == 'audio' or file_type == 'text':
-            if file_content:
-                answer = self.tools['extractive_qa'](q, file_content)
-            else:
-                answer = self.tools['llama3_chat'](q)
-        elif file_type == 'excel' or file_type == 'csv':
-            if file_content:
-                answer = self.tools['table_qa'](q, file_content)
             else:
-                answer = self.tools['llama3_chat'](q)
-        elif file_type == 'image':
-            if file_content:
-                answer = self.tools['llama3_chat'](f"{q}\nImage description: {file_content}")
-            else:
-                answer = self.tools['llama3_chat'](q)
-        elif file_type == 'code':
-            answer = file_content
-        else:
-            answer = self.tools['llama3_chat'](q)
         self.reasoning_trace.append(f"Final answer: {answer}")
         return self.format_answer(answer), self.reasoning_trace
     def format_answer(self, answer):
         if isinstance(answer, str):
-            answer = answer.strip().rstrip('.')
-            for prefix in ['answer:', 'result:', 'the answer is', 'final answer:', 'response:']:
-                if answer.lower().startswith(prefix):
-                    answer = answer[len(prefix):].strip()
-            import re
-            answer = re.sub(r'\b(the|a|an)\b ', '', answer, flags=re.IGNORECASE)
-            answer = answer.strip().rstrip('.')
-        return answer
 # --- Basic Agent Definition (now wraps ModularGAIAAgent) ---
 class BasicAgent:

 import cv2
 import torch
 from bs4 import BeautifulSoup
+import openai
 logging.basicConfig(filename='gaia_agent.log', level=logging.INFO, format='%(asctime)s %(levelname)s:%(message)s')
 logger = logging.getLogger(__name__)
         logging.error(f"YouTube video QA error: {e}")
         return f"Video analysis error: {e}"
+def web_search_duckduckgo(query, max_results=5):
+    """Search DuckDuckGo and return the top results as one text blob.
+
+    Args:
+        query: Search string sent to DuckDuckGo.
+        max_results: Upper bound on the number of results requested.
+
+    Returns:
+        One "Title / Snippet / URL" entry per result, joined by a "---"
+        separator line (empty string when there are no results), or a
+        "Web search error: ..." string if the search fails. Never raises.
+    """
+    try:
+        # The duckduckgo_search package exposes its client as ``DDGS`` with
+        # a ``text()`` method; the previously used
+        # ``DuckDuckGoSearch().search()`` is not part of that package, so
+        # every call ended in the except branch below.
+        from duckduckgo_search import DDGS
+        with DDGS() as ddgs:
+            # list() accepts both the generator and the list return styles.
+            results = list(ddgs.text(query, max_results=max_results) or [])
+        # .get() keeps one incomplete result from discarding all the others.
+        snippets = [
+            f"Title: {r.get('title', '')}\nSnippet: {r.get('body', '')}\nURL: {r.get('href', '')}"
+            for r in results
+        ]
+        return '\n---\n'.join(snippets)
+    except Exception as e:
+        logging.error(f"web_search_duckduckgo error: {e}")
+        return f"Web search error: {e}"
+def gpt4_chat(prompt, api_key=None):
+    """Send a single-turn prompt to an OpenAI GPT-4 chat model.
+
+    NOTE(review): the model id below is the GPT-4 Turbo preview, not
+    GPT-4.1 as the commit message states - confirm which one is intended.
+
+    Args:
+        prompt: User message to send.
+        api_key: OpenAI API key; falls back to the OPENAI_API_KEY
+            environment variable when omitted.
+
+    Returns:
+        The stripped reply text, "No OpenAI API key provided." when no key
+        is available, or a "GPT-4 error: ..." string on failure. Never
+        raises.
+    """
+    try:
+        api_key = api_key or os.environ.get("OPENAI_API_KEY", "")
+        if not api_key:
+            return "No OpenAI API key provided."
+        model = "gpt-4-1106-preview"
+        messages = [
+            {"role": "system", "content": "You are a general AI assistant. Answer using as few words as possible, in the required format. Use tools as needed, and only output the answer."},
+            {"role": "user", "content": prompt},
+        ]
+        if hasattr(openai, "OpenAI"):
+            # openai>=1.0 removed openai.ChatCompletion; use the client API.
+            client = openai.OpenAI(api_key=api_key)
+            response = client.chat.completions.create(model=model, messages=messages)
+            content = response.choices[0].message.content
+        else:
+            # Legacy openai<1.0 interface, kept for older installs.
+            response = openai.ChatCompletion.create(
+                model=model,
+                messages=messages,
+                api_key=api_key,
+            )
+            content = response.choices[0].message['content']
+        # The API may return no text content; avoid calling .strip() on None.
+        return (content or "").strip()
+    except Exception as e:
+        logging.error(f"gpt4_chat error: {e}")
+        return f"GPT-4 error: {e}"
 TOOL_REGISTRY = {
     "llama3_chat": llama3_chat,
     "mixtral_chat": mixtral_chat,
     "image_caption": image_caption,
     "code_analysis": code_analysis,
     "youtube_video_qa": youtube_video_qa,
+    "web_search_duckduckgo": web_search_duckduckgo,
+    "gpt4_chat": gpt4_chat,
 }
 class ModularGAIAAgent:
             self.reasoning_trace.append(f"Unknown file type: {file_name}")
             return None
+    def smart_tool_select(self, question, file_type=None):
+        """Select the tool(s) to run for a question, in order of use.
+
+        When OPENAI_API_KEY is set, gpt4_chat is asked to choose from the
+        names in self.tools. If no key is set, or the reply names no
+        registered tool, a file-type / keyword heuristic is used instead.
+
+        Args:
+            question: The question text.
+            file_type: Detected type of the attached file, or None.
+
+        Returns:
+            A non-empty list of tool names.
+        """
+        # Ask the LLM planner for a tool order if a key is available
+        api_key = os.environ.get("OPENAI_API_KEY", "")
+        if api_key:
+            plan_prompt = f"""
+You are an expert AI agent. Given the following question and file type, suggest the best tool(s) to use from this list: {list(self.tools.keys())}.
+Question: {question}
+File type: {file_type}
+Respond with a comma-separated list of tool names only, in order of use. If unsure, start with web_search_duckduckgo.
+"""
+            plan = gpt4_chat(plan_prompt, api_key=api_key)
+            # Models do not always obey "comma-separated": accept one name
+            # per line and strip quotes, backticks, brackets and bullets so
+            # a usable plan is not thrown away over formatting.
+            tool_names = []
+            for raw in str(plan).replace('\n', ',').split(','):
+                name = raw.strip(' \t\r`\'"[]().-*')
+                if name in self.tools and name not in tool_names:
+                    tool_names.append(name)
+            if tool_names:
+                return tool_names
+        # Fallback: heuristic
+        if file_type == 'audio':
+            choice = 'asr_transcribe'
+        elif file_type == 'image':
+            choice = 'image_caption'
+        elif file_type == 'code':
+            choice = 'code_analysis'
+        elif file_type in ['excel', 'csv']:
+            choice = 'table_qa'
+        elif 'youtube.com' in question or 'youtu.be' in question:
+            choice = 'youtube_video_qa'
+        elif any(w in question.lower() for w in ['wikipedia', 'who', 'when', 'where', 'what', 'how', 'find', 'search']):
+            choice = 'web_search_duckduckgo'
+        else:
+            choice = 'llama3_chat'
+        # Never hand the caller a name it cannot look up in self.tools when
+        # the generic chat tool is available as a substitute.
+        if choice not in self.tools and 'llama3_chat' in self.tools:
+            return ['llama3_chat']
+        return [choice]
     def answer_question(self, question_obj):
+        """Answer one question, optionally using its attached file.
+
+        Args:
+            question_obj: Mapping with a "question" string and an optional
+                "file_name" naming an attachment to download and analyze.
+
+        Returns:
+            A (formatted_answer, reasoning_trace) tuple. The trace is also
+            stored on self.reasoning_trace.
+        """
         self.reasoning_trace = []
         q = question_obj["question"]
         file_name = question_obj.get("file_name", "")
         file_content = None
         file_type = None
+        local_file = None
         if file_name:
             file_id = file_name.split('.')[0]
             local_file = self.download_file(file_id, file_name)
             if local_file:
                 file_type = self.detect_file_type(local_file)
                 file_content = self.analyze_file(local_file, file_type)
+        # Smart tool selection
+        tool_names = self.smart_tool_select(q, file_type)
+        answer = None
+        context = None
+        for tool_name in tool_names:
+            tool = self.tools.get(tool_name)
+            if tool is None:
+                # The selector can name a tool that was never registered;
+                # skip it instead of failing the question with KeyError.
+                self.reasoning_trace.append(f"Skipped unregistered tool: {tool_name}")
+                continue
+            if tool_name == 'web_search_duckduckgo':
+                context = tool(q)
+                # Use LLM to synthesize answer from snippets
+                answer = llama3_chat(f"Answer the following question using ONLY the information below.\nQuestion: {q}\nSnippets:\n{context}\nAnswer:")
+            elif tool_name == 'gpt4_chat':
+                answer = tool(q)
+            elif tool_name == 'table_qa':
+                if not file_content:
+                    # Needs table data; calling it with the question alone
+                    # would be a wrong-arity call.
+                    continue
+                answer = tool(q, file_content)
+            elif tool_name in ['asr_transcribe', 'image_caption', 'code_analysis']:
+                if not file_content:
+                    # File tools are meaningless without a downloaded file;
+                    # do not fall through and feed them the question text.
+                    continue
+                # Pass the path download_file returned, not the bare name
+                # from the question object.
+                # NOTE(review): assumes these tools take a local path, as
+                # analyze_file does - confirm against their definitions.
+                answer = tool(local_file)
+            elif tool_name == 'youtube_video_qa':
+                # The tool's signature is (youtube_url, question): pull the
+                # URL out of the question rather than passing the whole
+                # question text as the URL.
+                url = next(
+                    (w.strip('.,;:!?()[]<>"\'') for w in q.split()
+                     if 'youtube.com' in w or 'youtu.be' in w),
+                    None,
+                )
+                if not url:
+                    continue
+                answer = tool(url, q)
             else:
+                answer = tool(q)
+            if answer:
+                break
+        self.reasoning_trace.append(f"Tools used: {tool_names}")
         self.reasoning_trace.append(f"Final answer: {answer}")
         return self.format_answer(answer), self.reasoning_trace
     def format_answer(self, answer):
+        """Reduce a raw tool/LLM answer to the single line that is submitted.
+
+        Args:
+            answer: Raw answer; a string, None, or any other value.
+
+        Returns:
+            For strings, the first line with surrounding whitespace removed
+            ("" if the string is blank); "" for None; str(answer) otherwise.
+        """
+        # Strict GAIA: only the answer, no extra text, no prefix
+        if answer is None:
+            # No tool produced anything: submit an empty answer rather than
+            # the literal text "None".
+            return ""
         if isinstance(answer, str):
+            # splitlines() also handles "\r\n", so the first line does not
+            # keep a stray "\r"; the inner strip drops trailing spaces.
+            lines = answer.strip().splitlines()
+            return lines[0].strip() if lines else ""
+        return str(answer)
 # --- Basic Agent Definition (now wraps ModularGAIAAgent) ---
 class BasicAgent: