gabejavitt commited on
Commit
b81ce26
Β·
verified Β·
1 Parent(s): 2e5ef21

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +700 -41
app.py CHANGED
@@ -510,38 +510,111 @@ class ValidateInput(BaseModel):
510
 
511
  @tool(args_schema=ValidateInput)
512
  def validate_answer(proposed_answer: str, original_question: str) -> str:
513
- """Validate answer before submission"""
 
 
 
 
514
  start_time = time.time()
515
  try:
516
  print(f"βœ“ Validating: '{proposed_answer[:50]}...'")
517
 
518
  issues = []
519
  warnings = []
 
520
 
521
- # Check conversational fluff
522
- fluff = ["the answer is", "based on", "according to", "i found", "here is"]
 
523
  if any(p in proposed_answer.lower() for p in fluff):
524
- issues.append("❌ Remove conversational text")
525
-
526
- # Check code fences
527
  if "```" in proposed_answer:
528
- issues.append("❌ Remove code fences")
529
 
530
- # Check length
 
 
 
 
 
531
  if len(proposed_answer) > 500:
532
- warnings.append("⚠️ Very long answer")
 
533
 
534
- # Check numbers
535
- if any(k in original_question.lower() for k in ["how many", "what number", "count"]):
 
 
536
  if not any(c.isdigit() for c in proposed_answer):
537
- warnings.append("⚠️ Number expected but none found")
538
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
539
  if issues:
540
  result = "🚫 VALIDATION FAILED:\n" + "\n".join(issues)
 
 
 
541
  elif warnings:
542
- result = "⚠️ WARNINGS:\n" + "\n".join(warnings) + "\n\nProceed if confident."
 
 
 
 
 
 
543
  else:
544
- result = "βœ… PASSED! Call final_answer_tool() now."
545
 
546
  telemetry.record_call("validate_answer", time.time() - start_time, True)
547
  return result
@@ -553,6 +626,113 @@ def validate_answer(proposed_answer: str, original_question: str) -> str:
553
  # =============================================================================
554
  # CORE TOOLS
555
  # =============================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
556
  class SearchInput(BaseModel):
557
  query: str = Field(description="Search query (concise)")
558
 
@@ -684,6 +864,185 @@ def code_interpreter(code: str) -> str:
684
  raise ToolError("code_interpreter", e, "Check code syntax")
685
 
686
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
687
  class ReadFileInput(BaseModel):
688
  path: str = Field(description="File path")
689
 
@@ -909,6 +1268,153 @@ def get_youtube_transcript(video_url: str) -> str:
909
  raise ToolError("get_youtube_transcript", e)
910
 
911
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
912
  class ScrapeInput(BaseModel):
913
  url: str = Field(description="URL (http:// or https://)")
914
  query: str = Field(description="Specific info to find")
@@ -1016,20 +1522,34 @@ def final_answer_tool(answer: str) -> str:
1016
  # TOOLS LIST
1017
  # =============================================================================
1018
  defined_tools = [
 
1019
  think_through_logic,
1020
  create_plan,
1021
  reflect_on_progress,
1022
  validate_answer,
 
 
 
1023
  search_tool,
 
 
 
 
1024
  calculator,
1025
  code_interpreter,
 
 
1026
  read_file,
1027
  write_file,
1028
  list_directory,
 
 
 
1029
  audio_transcription_tool,
1030
- analyze_image,
1031
  get_youtube_transcript,
1032
- scrape_and_retrieve,
 
1033
  final_answer_tool
1034
  ]
1035
 
@@ -1195,41 +1715,126 @@ class PlanningReflectionAgent:
1195
  tool_desc_list.append(desc)
1196
  tool_descriptions = "\n".join(tool_desc_list)
1197
 
1198
- self.system_prompt = f"""You are an elite AI agent for GAIA benchmark.
 
 
 
 
 
 
 
 
 
 
1199
 
1200
  ═══════════════════════════════════════════════════════════════
1201
- ⚠️ ABSOLUTE RULES:
1202
  ═══════════════════════════════════════════════════════════════
1203
 
1204
- 1. EVERY TURN MUST CALL EXACTLY ONE TOOL
1205
- 2. NEVER OUTPUT REASONING TEXT WITHOUT TOOL CALL
1206
- 3. IDENTIFY QUESTION TYPE FIRST
1207
- 4. LOGIC: think β†’ calc β†’ validate β†’ final
1208
- 5. FACTUAL: search β†’ scrape β†’ validate β†’ final
1209
- 6. DATA: read β†’ code β†’ validate β†’ final
1210
- 7. ALWAYS VALIDATE before final_answer
1211
- 8. FINAL FORMAT: EXACTLY what asked, NO fluff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1212
 
1213
  ═══════════════════════════════════════════════════════════════
1214
- πŸ“š TOOLS:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1215
  ═══════════════════════════════════════════════════════════════
1216
 
1217
  {tool_descriptions}
1218
 
1219
  ═══════════════════════════════════════════════════════════════
1220
- ⚑ EXECUTION:
1221
  ═══════════════════════════════════════════════════════════════
1222
 
1223
- - Text without tool = FAILURE
1224
- - Unsure? β†’ think_through_logic()
1225
- - After each tool: Have answer? β†’ validate β†’ submit
1226
  - Stuck after 3 turns? β†’ reflect_on_progress()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1227
  ═══════════════════════════════════════════════════════════════
1228
  """
1229
 
1230
- # Initialize LLM
1231
- print("Initializing Groq LLM...")
1232
- self.llm_with_tools = ChatGroq(
 
 
1233
  temperature=0,
1234
  groq_api_key=GROQ_API_KEY,
1235
  model_name="llama-3.3-70b-versatile",
@@ -1237,7 +1842,24 @@ class PlanningReflectionAgent:
1237
  timeout=60
1238
  ).bind_tools(self.tools, tool_choice="auto")
1239
 
1240
- print("βœ… LLM initialized")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1241
 
1242
  # Build agent graph
1243
  def agent_node(state: AgentState):
@@ -1274,7 +1896,7 @@ class PlanningReflectionAgent:
1274
  messages_to_send.append(hint)
1275
  print("πŸ€” Reflection hint")
1276
 
1277
- # Invoke LLM with retries
1278
  ai_message = None
1279
 
1280
  for attempt in range(config.MAX_RETRIES):
@@ -1287,7 +1909,20 @@ class PlanningReflectionAgent:
1287
  print(f"⚠️ No tool calls (attempt {attempt+1})")
1288
 
1289
  except Exception as e:
1290
- print(f"⚠️ LLM error (attempt {attempt+1}): {str(e)[:200]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
1291
 
1292
  if attempt == config.MAX_RETRIES - 1:
1293
  print("🚨 Forcing think_through_logic")
@@ -1424,6 +2059,11 @@ class PlanningReflectionAgent:
1424
  "last_tool_was_thinking": False
1425
  }
1426
 
 
 
 
 
 
1427
  final_answer = "AGENT FAILED"
1428
  all_messages = []
1429
 
@@ -1468,15 +2108,17 @@ class PlanningReflectionAgent:
1468
  break
1469
  break
1470
 
1471
- # Clean answer
1472
  cleaned = str(final_answer).strip()
1473
 
1474
- # Remove prefixes
1475
  prefixes = [
1476
  "the answer is:", "here is the answer:", "based on",
1477
  "final answer:", "answer:", "the final answer is:",
1478
  "my answer is:", "according to", "i found that",
1479
- "the result is:", "result:"
 
 
1480
  ]
1481
  for prefix in prefixes:
1482
  if cleaned.lower().startswith(prefix.lower()):
@@ -1488,16 +2130,33 @@ class PlanningReflectionAgent:
1488
  # Remove code fences
1489
  cleaned = remove_fences_simple(cleaned)
1490
 
 
1491
  while cleaned.startswith("`") and cleaned.endswith("`"):
1492
  cleaned = cleaned[1:-1].strip()
1493
 
 
1494
  if (cleaned.startswith('"') and cleaned.endswith('"')) or \
1495
  (cleaned.startswith("'") and cleaned.endswith("'")):
1496
  cleaned = cleaned[1:-1].strip()
1497
 
 
1498
  if cleaned.endswith('.') and len(cleaned.split()) < 10:
1499
  cleaned = cleaned[:-1]
1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1501
  print(f"\nπŸŽ‰ RETURNING: {cleaned}\n")
1502
 
1503
  return cleaned
 
510
 
511
  @tool(args_schema=ValidateInput)
512
  def validate_answer(proposed_answer: str, original_question: str) -> str:
513
+ """
514
+ ENHANCED: Validate answer before submission with comprehensive checks.
515
+
516
+ ALWAYS use before final_answer_tool.
517
+ """
518
  start_time = time.time()
519
  try:
520
  print(f"βœ“ Validating: '{proposed_answer[:50]}...'")
521
 
522
  issues = []
523
  warnings = []
524
+ suggestions = []
525
 
526
+ # 1. Check conversational fluff
527
+ fluff = ["the answer is", "based on", "according to", "i found", "here is",
528
+ "here's", "after searching", "from my research", "the result is"]
529
  if any(p in proposed_answer.lower() for p in fluff):
530
+ issues.append("❌ Remove conversational text - answer ONLY")
531
+
532
+ # 2. Check code fences
533
  if "```" in proposed_answer:
534
+ issues.append("❌ Remove code fences (```)")
535
 
536
+ # 3. Check markdown formatting
537
+ if proposed_answer.startswith('#') or '**' in proposed_answer:
538
+ issues.append("❌ Remove markdown formatting")
539
+
540
+ # 4. Check length appropriateness
541
+ question_lower = original_question.lower()
542
  if len(proposed_answer) > 500:
543
+ if not any(k in question_lower for k in ['explain', 'describe', 'why', 'how does']):
544
+ warnings.append("⚠️ Answer very long. Question asks for short answer?")
545
 
546
+ # 5. Check for number questions
547
+ number_keywords = ["how many", "what number", "count", "total", "sum",
548
+ "what year", "when did", "what date"]
549
+ if any(k in question_lower for k in number_keywords):
550
  if not any(c.isdigit() for c in proposed_answer):
551
+ issues.append("❌ Question asks for number but answer has no digits")
552
+ else:
553
+ # Extract just the number(s)
554
+ import re
555
+ numbers = re.findall(r'\d+(?:\.\d+)?', proposed_answer)
556
+ if numbers and len(proposed_answer) > 50:
557
+ suggestions.append(f"πŸ’‘ Consider just the number(s): {', '.join(numbers)}")
558
+
559
+ # 6. Check for list questions
560
+ list_keywords = ["list", "what are", "name the", "which"]
561
+ if any(k in question_lower for k in list_keywords):
562
+ if '\n' in proposed_answer or len(proposed_answer.split(',')) > 1:
563
+ # Good, it's formatted as a list
564
+ pass
565
+ else:
566
+ warnings.append("⚠️ Question might ask for multiple items")
567
+
568
+ # 7. Check for yes/no questions
569
+ if question_lower.startswith(('is ', 'does ', 'did ', 'can ', 'will ', 'was ', 'were ', 'are ')):
570
+ if proposed_answer.lower() not in ['yes', 'no', 'true', 'false']:
571
+ if not proposed_answer.lower().startswith(('yes', 'no')):
572
+ warnings.append("⚠️ Question seems yes/no. Answer should start with yes/no?")
573
+
574
+ # 8. Check for excessive punctuation
575
+ if proposed_answer.count('!') > 2 or proposed_answer.count('?') > 1:
576
+ issues.append("❌ Remove excessive punctuation")
577
+
578
+ # 9. Check for quotes around answer
579
+ if (proposed_answer.startswith('"') and proposed_answer.endswith('"')) or \
580
+ (proposed_answer.startswith("'") and proposed_answer.endswith("'")):
581
+ suggestions.append("πŸ’‘ Consider removing quotes around answer")
582
+
583
+ # 10. Check for multiple sentences when one expected
584
+ sentences = [s.strip() for s in proposed_answer.split('.') if s.strip()]
585
+ if len(sentences) > 3:
586
+ if not any(k in question_lower for k in ['explain', 'describe', 'why', 'how']):
587
+ warnings.append("⚠️ Multiple sentences. Question asks for simple answer?")
588
+
589
+ # 11. Sanity check: is it empty?
590
+ if not proposed_answer.strip():
591
+ issues.append("❌ Answer is empty!")
592
+
593
+ # 12. Check for units in measurement questions
594
+ unit_keywords = ['height', 'weight', 'distance', 'speed', 'temperature', 'size']
595
+ if any(k in question_lower for k in unit_keywords):
596
+ has_unit = any(u in proposed_answer.lower() for u in
597
+ ['km', 'miles', 'kg', 'lbs', 'cm', 'inches', 'celsius',
598
+ 'fahrenheit', 'mph', 'kph', 'meters', 'feet'])
599
+ if not has_unit and any(c.isdigit() for c in proposed_answer):
600
+ warnings.append("⚠️ Measurement question but no unit found")
601
+
602
+ # Build response
603
  if issues:
604
  result = "🚫 VALIDATION FAILED:\n" + "\n".join(issues)
605
+ if suggestions:
606
+ result += "\n\nSuggestions:\n" + "\n".join(suggestions)
607
+ result += "\n\nFix issues then retry validation."
608
  elif warnings:
609
+ result = "⚠️ WARNINGS:\n" + "\n".join(warnings)
610
+ if suggestions:
611
+ result += "\n\nSuggestions:\n" + "\n".join(suggestions)
612
+ result += "\n\nProceed if confident, or refine answer."
613
+ elif suggestions:
614
+ result = "βœ… PASSED with suggestions:\n" + "\n".join(suggestions)
615
+ result += "\n\nCall final_answer_tool() when ready."
616
  else:
617
+ result = "βœ… VALIDATION PASSED! Call final_answer_tool() now."
618
 
619
  telemetry.record_call("validate_answer", time.time() - start_time, True)
620
  return result
 
626
  # =============================================================================
627
  # CORE TOOLS
628
  # =============================================================================
629
+ class WikipediaInput(BaseModel):
630
+ query: str = Field(description="Topic to search (e.g., 'Mercedes Sosa', 'Python programming')")
631
+
632
+ @tool(args_schema=WikipediaInput)
633
+ @retry_with_backoff(max_retries=2)
634
+ def wikipedia_search(query: str) -> str:
635
+ """
636
+ Search Wikipedia with automatic page retrieval.
637
+
638
+ Better than search_tool for:
639
+ - Biographical information
640
+ - Historical facts
641
+ - Scientific concepts
642
+ - Counting items in lists (discography, filmography, etc.)
643
+
644
+ Returns full article sections, not just snippets.
645
+ """
646
+ start_time = time.time()
647
+
648
+ try:
649
+ print(f"πŸ“š Wikipedia search: {query}")
650
+
651
+ # Check cache first
652
+ cache_key = f"wiki:{query}"
653
+ cached = search_cache.get(cache_key)
654
+ if cached:
655
+ print(f" (cached)")
656
+ telemetry.record_call("wikipedia_search", time.time() - start_time, True)
657
+ return cached
658
+
659
+ import requests
660
+
661
+ # Step 1: Search for page
662
+ search_url = "https://en.wikipedia.org/w/api.php"
663
+ search_params = {
664
+ 'action': 'opensearch',
665
+ 'search': query,
666
+ 'limit': 1,
667
+ 'namespace': 0,
668
+ 'format': 'json'
669
+ }
670
+
671
+ response = requests.get(search_url, params=search_params, timeout=10)
672
+ response.raise_for_status()
673
+ search_results = response.json()
674
+
675
+ if not search_results[1]: # No results
676
+ result = f"No Wikipedia article found for: '{query}'"
677
+ search_cache.put(cache_key, result)
678
+ telemetry.record_call("wikipedia_search", time.time() - start_time, True)
679
+ return result
680
+
681
+ page_title = search_results[1][0]
682
+ page_url = search_results[3][0]
683
+
684
+ print(f" Found: {page_title}")
685
+ print(f" URL: {page_url}")
686
+
687
+ # Step 2: Get full page content
688
+ content_params = {
689
+ 'action': 'query',
690
+ 'titles': page_title,
691
+ 'prop': 'extracts',
692
+ 'explaintext': True,
693
+ 'format': 'json'
694
+ }
695
+
696
+ response = requests.get(search_url, params=content_params, timeout=10)
697
+ response.raise_for_status()
698
+ data = response.json()
699
+
700
+ pages = data['query']['pages']
701
+ page_id = list(pages.keys())[0]
702
+
703
+ if page_id == '-1':
704
+ result = f"Wikipedia page not found: '{query}'"
705
+ search_cache.put(cache_key, result)
706
+ telemetry.record_call("wikipedia_search", time.time() - start_time, True)
707
+ return result
708
+
709
+ content = pages[page_id].get('extract', '')
710
+
711
+ if not content:
712
+ result = f"Wikipedia page found but content empty: '{page_title}'"
713
+ search_cache.put(cache_key, result)
714
+ telemetry.record_call("wikipedia_search", time.time() - start_time, True)
715
+ return result
716
+
717
+ print(f" Retrieved {len(content)} chars")
718
+
719
+ # Format result
720
+ result = f"Wikipedia: {page_title}\n"
721
+ result += f"URL: {page_url}\n\n"
722
+ result += content
723
+ result = truncate_if_needed(result, max_length=12000) # Allow more for Wikipedia
724
+
725
+ # Cache result
726
+ search_cache.put(cache_key, result)
727
+
728
+ telemetry.record_call("wikipedia_search", time.time() - start_time, True)
729
+ return result
730
+
731
+ except Exception as e:
732
+ telemetry.record_call("wikipedia_search", time.time() - start_time, False)
733
+ raise ToolError("wikipedia_search", e, "Try a more specific search term")
734
+
735
+
736
  class SearchInput(BaseModel):
737
  query: str = Field(description="Search query (concise)")
738
 
 
864
  raise ToolError("code_interpreter", e, "Check code syntax")
865
 
866
 
867
+ class AnalyzeDataInput(BaseModel):
868
+ file_path: str = Field(description="Path to CSV or Excel file")
869
+ question: str = Field(description="What to find (e.g., 'count rows where year > 2000')")
870
+
871
+ @tool(args_schema=AnalyzeDataInput)
872
+ def analyze_data_file(file_path: str, question: str) -> str:
873
+ """
874
+ Analyze CSV/Excel files with automatic data profiling.
875
+
876
+ Generates Python code to answer questions about data files.
877
+ Better than code_interpreter alone because it:
878
+ 1. Profiles the data first (columns, types, sample)
879
+ 2. Generates appropriate pandas code
880
+ 3. Handles common data issues (encoding, missing values)
881
+
882
+ Use for questions like:
883
+ - "How many rows have X?"
884
+ - "What's the sum/average of column Y?"
885
+ - "Count items grouped by Z"
886
+ """
887
+ start_time = time.time()
888
+
889
+ try:
890
+ print(f"πŸ“Š Analyzing data file: {file_path}")
891
+ print(f" Question: {question[:100]}...")
892
+
893
+ # Find file
894
+ data_file = find_file(file_path)
895
+ if not data_file:
896
+ raise FileNotFoundError(f"Data file not found: {file_path}")
897
+
898
+ file_ext = data_file.suffix.lower()
899
+
900
+ if file_ext not in ['.csv', '.xlsx', '.xls', '.tsv']:
901
+ raise ValueError(f"Unsupported file type: {file_ext}. Use .csv, .xlsx, .xls, or .tsv")
902
+
903
+ print(f" File type: {file_ext}")
904
+
905
+ # Generate profiling code
906
+ profiling_code = f"""
907
+ import pandas as pd
908
+ import numpy as np
909
+
910
+ # Load file
911
+ file_path = r"{data_file}"
912
+ """
913
+
914
+ if file_ext == '.csv':
915
+ profiling_code += """
916
+ # Try different encodings
917
+ for encoding in ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252']:
918
+ try:
919
+ df = pd.read_csv(file_path, encoding=encoding)
920
+ break
921
+ except:
922
+ continue
923
+ """
924
+ elif file_ext == '.tsv':
925
+ profiling_code += """
926
+ df = pd.read_csv(file_path, sep='\\t', encoding='utf-8')
927
+ """
928
+ else: # Excel
929
+ profiling_code += """
930
+ df = pd.read_excel(file_path)
931
+ """
932
+
933
+ profiling_code += """
934
+ # Profile data
935
+ print("=" * 60)
936
+ print("DATA PROFILE")
937
+ print("=" * 60)
938
+ print(f"Shape: {df.shape[0]} rows Γ— {df.shape[1]} columns")
939
+ print(f"\\nColumns: {', '.join(df.columns.tolist())}")
940
+ print(f"\\nData types:")
941
+ print(df.dtypes)
942
+ print(f"\\nFirst 3 rows:")
943
+ print(df.head(3))
944
+ print(f"\\nMissing values:")
945
+ print(df.isnull().sum())
946
+ """
947
+
948
+ # Execute profiling
949
+ print(f" Profiling data...")
950
+ output_stream = io.StringIO()
951
+ error_stream = io.StringIO()
952
+
953
+ with contextlib.redirect_stdout(output_stream), contextlib.redirect_stderr(error_stream):
954
+ exec(profiling_code, {"pd": pd, "np": np, "__builtins__": __builtins__})
955
+
956
+ profile_output = output_stream.getvalue()
957
+
958
+ if error_stream.getvalue():
959
+ raise RuntimeError(f"Profiling failed: {error_stream.getvalue()}")
960
+
961
+ print(f" Profiling complete")
962
+ print(profile_output[:500] + "..." if len(profile_output) > 500 else profile_output)
963
+
964
+ # Now generate analysis code based on question
965
+ analysis_code = profiling_code + f"""
966
+
967
+ # Analysis for: {question}
968
+ print("\\n" + "=" * 60)
969
+ print("ANALYSIS RESULT")
970
+ print("=" * 60)
971
+
972
+ """
973
+
974
+ # Add intelligent code based on question keywords
975
+ q_lower = question.lower()
976
+
977
+ if 'count' in q_lower or 'how many' in q_lower:
978
+ if 'where' in q_lower or 'with' in q_lower:
979
+ analysis_code += """
980
+ # Count rows matching condition
981
+ # NOTE: Adjust the filter condition based on your needs
982
+ result = len(df) # Total count
983
+ print(f"Total rows: {result}")
984
+
985
+ # Example filters (uncomment and modify as needed):
986
+ # result = len(df[df['column'] > value])
987
+ # result = len(df[df['column'].str.contains('text', na=False)])
988
+ """
989
+ else:
990
+ analysis_code += """
991
+ result = len(df)
992
+ print(f"Total rows: {result}")
993
+ """
994
+
995
+ elif 'sum' in q_lower or 'total' in q_lower:
996
+ analysis_code += """
997
+ # Sum a numeric column
998
+ # NOTE: Replace 'column_name' with actual column
999
+ # result = df['column_name'].sum()
1000
+ # print(f"Sum: {result}")
1001
+ """
1002
+
1003
+ elif 'average' in q_lower or 'mean' in q_lower:
1004
+ analysis_code += """
1005
+ # Average of a column
1006
+ # result = df['column_name'].mean()
1007
+ # print(f"Average: {result}")
1008
+ """
1009
+
1010
+ elif 'group' in q_lower or 'by' in q_lower:
1011
+ analysis_code += """
1012
+ # Group by and count
1013
+ # result = df.groupby('column_name').size()
1014
+ # print(result)
1015
+ """
1016
+
1017
+ else:
1018
+ # Generic: show summary
1019
+ analysis_code += """
1020
+ # Summary statistics
1021
+ print(df.describe())
1022
+ """
1023
+
1024
+ result = f"""Data Profile:
1025
+ {profile_output}
1026
+
1027
+ Generated Analysis Code:
1028
+ ```python
1029
+ {analysis_code}
1030
+ ```
1031
+
1032
+ **IMPORTANT**: The code above needs column names adjusted.
1033
+ Use code_interpreter() with the corrected code to get the answer.
1034
+
1035
+ Columns available: {", ".join(pd.read_csv(data_file) if file_ext == '.csv' else pd.read_excel(data_file)).columns.tolist()}
1036
+ """
1037
+
1038
+ telemetry.record_call("analyze_data_file", time.time() - start_time, True)
1039
+ return truncate_if_needed(result)
1040
+
1041
+ except Exception as e:
1042
+ telemetry.record_call("analyze_data_file", time.time() - start_time, False)
1043
+ raise ToolError("analyze_data_file", e, "Check file path and format")
1044
+
1045
+
1046
  class ReadFileInput(BaseModel):
1047
  path: str = Field(description="File path")
1048
 
 
1268
  raise ToolError("get_youtube_transcript", e)
1269
 
1270
 
1271
+ class BrowseInput(BaseModel):
1272
+ start_url: str = Field(description="Starting URL (http:// or https://)")
1273
+ goal: str = Field(description="What you're trying to find (e.g., 'Mercedes Sosa albums 2000-2009')")
1274
+ max_steps: int = Field(description="Max pages to visit (1-5)", default=3)
1275
+
1276
+ @tool(args_schema=BrowseInput)
1277
+ @retry_with_backoff(max_retries=2)
1278
+ def iterative_web_browser(start_url: str, goal: str, max_steps: int = 3) -> str:
1279
+ """
1280
+ Multi-turn web browsing - follows links iteratively to find information.
1281
+
1282
+ Use when:
1283
+ - Information requires navigating through multiple pages
1284
+ - Need to follow "Read more" or "Details" links
1285
+ - Example: "Find Mercedes Sosa's discography, then count 2000-2009 albums"
1286
+
1287
+ This tool:
1288
+ 1. Visits start_url
1289
+ 2. Searches content for goal-related info
1290
+ 3. Extracts relevant links
1291
+ 4. Follows most promising link
1292
+ 5. Repeats until info found or max_steps reached
1293
+
1294
+ Better than scrape_and_retrieve when single page doesn't have complete info.
1295
+ """
1296
+ start_time = time.time()
1297
+
1298
+ try:
1299
+ if not rag_manager.is_ready():
1300
+ rag_manager.initialize()
1301
+
1302
+ print(f"🌐 Iterative browsing starting at: {start_url}")
1303
+ print(f" Goal: {goal[:100]}...")
1304
+ print(f" Max steps: {max_steps}")
1305
+
1306
+ visited_urls = set()
1307
+ current_url = start_url
1308
+ all_findings = []
1309
+
1310
+ headers = {
1311
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
1312
+ }
1313
+
1314
+ for step in range(max_steps):
1315
+ if current_url in visited_urls:
1316
+ print(f" Step {step+1}: Already visited, stopping")
1317
+ break
1318
+
1319
+ visited_urls.add(current_url)
1320
+ print(f" Step {step+1}: Visiting {current_url}")
1321
+
1322
+ try:
1323
+ response = requests.get(current_url, headers=headers, timeout=15)
1324
+ response.raise_for_status()
1325
+
1326
+ soup = BeautifulSoup(response.text, 'html.parser')
1327
+
1328
+ # Remove noise
1329
+ for tag in soup(["script", "style", "nav", "footer", "aside", "header", "iframe"]):
1330
+ tag.extract()
1331
+
1332
+ # Extract main content
1333
+ main = soup.find('main') or soup.find('article') or soup.find('div', class_='mw-parser-output') or soup.body
1334
+
1335
+ if not main:
1336
+ print(f" No main content found")
1337
+ continue
1338
+
1339
+ text = main.get_text(separator='\n', strip=True)
1340
+ lines = [l.strip() for l in text.splitlines() if l.strip()]
1341
+ text = '\n'.join(lines)
1342
+
1343
+ print(f" Extracted {len(text)} chars")
1344
+
1345
+ # Search for goal-related content
1346
+ chunks = rag_manager.text_splitter.split_text(text)
1347
+ docs = [Document(page_content=c, metadata={"source": current_url, "step": step+1}) for c in chunks]
1348
+
1349
+ db = FAISS.from_documents(docs, rag_manager.embeddings)
1350
+ retriever = db.as_retriever(search_kwargs={"k": 3})
1351
+ retrieved = retriever.invoke(goal)
1352
+
1353
+ # Clean up
1354
+ del db
1355
+ del retriever
1356
+ import gc
1357
+ gc.collect()
1358
+
1359
+ if retrieved:
1360
+ print(f" Found {len(retrieved)} relevant chunks")
1361
+ for i, doc in enumerate(retrieved):
1362
+ all_findings.append({
1363
+ 'step': step + 1,
1364
+ 'url': current_url,
1365
+ 'content': doc.page_content
1366
+ })
1367
+
1368
+ # Extract links for next step
1369
+ if step < max_steps - 1:
1370
+ links = []
1371
+ for a in main.find_all('a', href=True):
1372
+ href = a.get('href')
1373
+ text = a.get_text(strip=True).lower()
1374
+
1375
+ # Make absolute URL
1376
+ if href.startswith('/'):
1377
+ from urllib.parse import urljoin
1378
+ href = urljoin(current_url, href)
1379
+
1380
+ # Filter relevant links
1381
+ goal_keywords = goal.lower().split()
1382
+ if any(keyword in href.lower() or keyword in text for keyword in goal_keywords):
1383
+ if href.startswith('http') and href not in visited_urls:
1384
+ links.append((href, text))
1385
+
1386
+ if links:
1387
+ # Pick most relevant link
1388
+ current_url = links[0][0]
1389
+ print(f" Found {len(links)} potential links, following: {links[0][1][:50]}")
1390
+ else:
1391
+ print(f" No more relevant links found")
1392
+ break
1393
+ else:
1394
+ print(f" Max steps reached")
1395
+ break
1396
+
1397
+ except Exception as e:
1398
+ print(f" Error on step {step+1}: {e}")
1399
+ break
1400
+
1401
+ # Compile findings
1402
+ if not all_findings:
1403
+ result = f"Browsed {len(visited_urls)} pages but found no relevant information for: '{goal}'"
1404
+ else:
1405
+ result = f"Information gathered from {len(visited_urls)} pages:\n\n"
1406
+ for finding in all_findings:
1407
+ result += f"[Step {finding['step']} - {finding['url']}]\n{finding['content']}\n\n---\n\n"
1408
+ result = truncate_if_needed(result)
1409
+
1410
+ telemetry.record_call("iterative_web_browser", time.time() - start_time, True)
1411
+ return result
1412
+
1413
+ except Exception as e:
1414
+ telemetry.record_call("iterative_web_browser", time.time() - start_time, False)
1415
+ raise ToolError("iterative_web_browser", e, "Try starting from a more specific URL")
1416
+
1417
+
1418
  class ScrapeInput(BaseModel):
1419
  url: str = Field(description="URL (http:// or https://)")
1420
  query: str = Field(description="Specific info to find")
 
1522
  # TOOLS LIST
1523
  # =============================================================================
1524
  defined_tools = [
1525
+ # Planning & Reflection
1526
  think_through_logic,
1527
  create_plan,
1528
  reflect_on_progress,
1529
  validate_answer,
1530
+
1531
+ # Search & Browse
1532
+ wikipedia_search, # NEW: Better for encyclopedic queries
1533
  search_tool,
1534
+ iterative_web_browser, # NEW: Multi-turn web navigation
1535
+ scrape_and_retrieve,
1536
+
1537
+ # Core computation
1538
  calculator,
1539
  code_interpreter,
1540
+
1541
+ # File operations
1542
  read_file,
1543
  write_file,
1544
  list_directory,
1545
+ analyze_data_file, # NEW: Smart CSV/Excel analysis
1546
+
1547
+ # Specialized
1548
  audio_transcription_tool,
1549
+ analyze_image,
1550
  get_youtube_transcript,
1551
+
1552
+ # Final
1553
  final_answer_tool
1554
  ]
1555
 
 
1715
  tool_desc_list.append(desc)
1716
  tool_descriptions = "\n".join(tool_desc_list)
1717
 
1718
+ self.system_prompt = f"""You are an elite AI agent for GAIA benchmark. Your ONLY job: provide the EXACT answer requested.
1719
+
1720
+ ═══════════════════════════════════════════════════════════════
1721
+ ⚠️ ABSOLUTE RULES - VIOLATE THESE AND YOU FAIL:
1722
+ ═══════════════════════════════════════════════════════════════
1723
+
1724
+ 1. **EVERY TURN MUST CALL EXACTLY ONE TOOL** - No exceptions
1725
+ 2. **NEVER OUTPUT REASONING TEXT WITHOUT A TOOL CALL** - You will fail
1726
+ 3. **IDENTIFY QUESTION TYPE FIRST** - Logic? Factual? Data? Math?
1727
+ 4. **ALWAYS VALIDATE**: Call validate_answer() before final_answer_tool()
1728
+ 5. **FINAL ANSWER FORMAT**: EXACTLY what was asked. NO "The answer is..." or explanations
1729
 
1730
  ═══════════════════════════════════════════════════════════════
1731
+ πŸ“‹ QUESTION TYPE β†’ TOOL SEQUENCE:
1732
  ═══════════════════════════════════════════════════════════════
1733
 
1734
+ **LOGIC PUZZLES** (No web search needed):
1735
+ β†’ think_through_logic β†’ calculator (if math) β†’ validate β†’ final_answer
1736
+
1737
+ **FACTUAL/BIOGRAPHICAL** (Need web):
1738
+ β†’ wikipedia_search (if person/place/thing) β†’ validate β†’ final_answer
1739
+ OR search_tool β†’ scrape_and_retrieve β†’ validate β†’ final_answer
1740
+
1741
+ **COUNTING FROM WEB** (Need full page content):
1742
+ β†’ wikipedia_search (if Wikipedia topic) β†’ validate β†’ final_answer
1743
+ OR iterative_web_browser (if needs navigation) β†’ validate β†’ final_answer
1744
+
1745
+ **DATA FILES** (CSV/Excel):
1746
+ β†’ list_directory β†’ analyze_data_file β†’ code_interpreter β†’ validate β†’ final_answer
1747
+
1748
+ **IMAGES** (Chess, diagrams, photos):
1749
+ β†’ analyze_image β†’ validate β†’ final_answer
1750
+
1751
+ **AUDIO FILES**:
1752
+ β†’ audio_transcription_tool β†’ validate β†’ final_answer
1753
+
1754
+ **MATH CALCULATIONS**:
1755
+ β†’ calculator β†’ validate β†’ final_answer
1756
 
1757
  ═══════════════════════════════════════════════════════════════
1758
+ 🎯 CRITICAL TOOL USAGE PATTERNS:
1759
+ ═══════════════════════════════════════════════════════════════
1760
+
1761
+ **For Counting Questions:**
1762
+ BAD: search_tool("Mercedes Sosa albums") β†’ snippets only
1763
+ GOOD: wikipedia_search("Mercedes Sosa") β†’ full discography section
1764
+
1765
+ **For Multi-Step Web Questions:**
1766
+ BAD: scrape_and_retrieve("https://...") β†’ single page only
1767
+ GOOD: iterative_web_browser("https://...", "find X", max_steps=3)
1768
+
1769
+ **For Data Questions:**
1770
+ BAD: read_file("data.csv") β†’ raw text dump
1771
+ GOOD: analyze_data_file("data.csv", "count rows where X > Y")
1772
+
1773
+ **For Validation:**
1774
+ ALWAYS: validate_answer("your answer", "original question")
1775
+ THEN: final_answer_tool("your answer")
1776
+
1777
+ ═══════════════════════════════════════════════════════════════
1778
+ πŸ“š AVAILABLE TOOLS:
1779
  ═══════════════════════════════════════════════════════════════
1780
 
1781
  {tool_descriptions}
1782
 
1783
  ═══════════════════════════════════════════════════════════════
1784
+ ⚑ EXECUTION RULES:
1785
  ═══════════════════════════════════════════════════════════════
1786
 
1787
+ - Text without tool call = FAILURE
1788
+ - Unsure? β†’ think_through_logic() to organize thoughts
1789
+ - After EVERY tool result: "Do I have the answer? β†’ validate β†’ submit"
1790
  - Stuck after 3 turns? β†’ reflect_on_progress()
1791
+ - For Wikipedia topics β†’ ALWAYS use wikipedia_search, NOT search_tool
1792
+ - For counting from web β†’ Use wikipedia_search or iterative_web_browser
1793
+ - For data files β†’ Use analyze_data_file, NOT just read_file
1794
+
1795
+ ═══════════════════════════════════════════════════════════════
1796
+ πŸŽ“ EXAMPLES OF PERFECT EXECUTION:
1797
+ ═══════════════════════════════════════════════════════════════
1798
+
1799
+ Example 1: "How many studio albums did Mercedes Sosa release 2000-2009?"
1800
+ Turn 1: wikipedia_search("Mercedes Sosa")
1801
+ β†’ Gets full discography with all albums and years
1802
+ Turn 2: code_interpreter("count albums 2000-2009 from text")
1803
+ β†’ Result: 3
1804
+ Turn 3: validate_answer("3", "How many studio albums...")
1805
+ β†’ βœ… PASSED
1806
+ Turn 4: final_answer_tool("3")
1807
+
1808
+ Example 2: "What's the population of Einstein's birthplace in 1900?"
1809
+ Turn 1: wikipedia_search("Albert Einstein")
1810
+ β†’ Birthplace: Ulm, Germany
1811
+ Turn 2: search_tool("Ulm Germany population 1900")
1812
+ β†’ Find sources
1813
+ Turn 3: scrape_and_retrieve("url", "population 1900")
1814
+ β†’ ~50,000
1815
+ Turn 4: validate_answer("50000", "population 1900")
1816
+ β†’ βœ… PASSED
1817
+ Turn 5: final_answer_tool("50000")
1818
+
1819
+ Example 3: Logic puzzle
1820
+ Turn 1: think_through_logic("Work through the logic...")
1821
+ β†’ Reasoning recorded
1822
+ Turn 2: calculator("30") [if calculation needed]
1823
+ β†’ 30
1824
+ Turn 3: validate_answer("30", "coin puzzle")
1825
+ β†’ βœ… PASSED
1826
+ Turn 4: final_answer_tool("30")
1827
+
1828
+ ═══════════════════════════════════════════════════════════════
1829
+ REMEMBER: One tool per turn. No reasoning without tools. Exact answer format.
1830
  ═══════════════════════════════════════════════════════════════
1831
  """
1832
 
1833
+ # Initialize LLMs (Groq primary, Claude fallback)
1834
+ print("Initializing LLMs...")
1835
+
1836
+ # Primary: Groq (fast, free)
1837
+ self.groq_llm = ChatGroq(
1838
  temperature=0,
1839
  groq_api_key=GROQ_API_KEY,
1840
  model_name="llama-3.3-70b-versatile",
 
1842
  timeout=60
1843
  ).bind_tools(self.tools, tool_choice="auto")
1844
 
1845
+ # Fallback: Claude (slower, more reliable)
1846
+ ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
1847
+ if ANTHROPIC_API_KEY:
1848
+ from langchain_anthropic import ChatAnthropic
1849
+ self.claude_llm = ChatAnthropic(
1850
+ model="claude-sonnet-4-20250514",
1851
+ anthropic_api_key=ANTHROPIC_API_KEY,
1852
+ temperature=0,
1853
+ max_tokens=4096
1854
+ ).bind_tools(self.tools, tool_choice="auto")
1855
+ print("βœ… Both Groq and Claude initialized")
1856
+ else:
1857
+ self.claude_llm = None
1858
+ print("βœ… Groq initialized (Claude fallback unavailable)")
1859
+
1860
+ # Start with Groq
1861
+ self.llm_with_tools = self.groq_llm
1862
+ self.current_llm = "groq"
1863
 
1864
  # Build agent graph
1865
  def agent_node(state: AgentState):
 
1896
  messages_to_send.append(hint)
1897
  print("πŸ€” Reflection hint")
1898
 
1899
+ # Invoke LLM with retries and fallback
1900
  ai_message = None
1901
 
1902
  for attempt in range(config.MAX_RETRIES):
 
1909
  print(f"⚠️ No tool calls (attempt {attempt+1})")
1910
 
1911
  except Exception as e:
1912
+ error_str = str(e)
1913
+ print(f"⚠️ {self.current_llm.upper()} error (attempt {attempt+1}): {error_str[:200]}")
1914
+
1915
+ # If Groq fails and we have Claude, switch to Claude
1916
+ if self.current_llm == "groq" and self.claude_llm and attempt == config.MAX_RETRIES - 1:
1917
+ print("πŸ”„ Switching from Groq to Claude for this question...")
1918
+ self.llm_with_tools = self.claude_llm
1919
+ self.current_llm = "claude"
1920
+ try:
1921
+ ai_message = self.llm_with_tools.invoke(messages_to_send)
1922
+ if ai_message.tool_calls:
1923
+ break
1924
+ except Exception as e2:
1925
+ print(f"⚠️ Claude also failed: {e2}")
1926
 
1927
  if attempt == config.MAX_RETRIES - 1:
1928
  print("🚨 Forcing think_through_logic")
 
2059
  "last_tool_was_thinking": False
2060
  }
2061
 
2062
+ # Reset to Groq for each question
2063
+ if self.groq_llm:
2064
+ self.llm_with_tools = self.groq_llm
2065
+ self.current_llm = "groq"
2066
+
2067
  final_answer = "AGENT FAILED"
2068
  all_messages = []
2069
 
 
2108
  break
2109
  break
2110
 
2111
+ # Clean answer more aggressively
2112
  cleaned = str(final_answer).strip()
2113
 
2114
+ # Remove common prefixes (case-insensitive)
2115
  prefixes = [
2116
  "the answer is:", "here is the answer:", "based on",
2117
  "final answer:", "answer:", "the final answer is:",
2118
  "my answer is:", "according to", "i found that",
2119
+ "the result is:", "result:", "here's the answer:",
2120
+ "after analysis:", "the correct answer is:",
2121
+ "from the data:", "from the search:",
2122
  ]
2123
  for prefix in prefixes:
2124
  if cleaned.lower().startswith(prefix.lower()):
 
2130
  # Remove code fences
2131
  cleaned = remove_fences_simple(cleaned)
2132
 
2133
+ # Remove backticks
2134
  while cleaned.startswith("`") and cleaned.endswith("`"):
2135
  cleaned = cleaned[1:-1].strip()
2136
 
2137
+ # Remove quotes (but only if they wrap entire answer)
2138
  if (cleaned.startswith('"') and cleaned.endswith('"')) or \
2139
  (cleaned.startswith("'") and cleaned.endswith("'")):
2140
  cleaned = cleaned[1:-1].strip()
2141
 
2142
+ # Remove trailing period for short answers
2143
  if cleaned.endswith('.') and len(cleaned.split()) < 10:
2144
  cleaned = cleaned[:-1]
2145
 
2146
+ # Remove markdown bold/italic
2147
+ cleaned = cleaned.replace('**', '').replace('__', '').replace('*', '').replace('_', '')
2148
+
2149
+ # Remove bullet points
2150
+ if cleaned.startswith(('- ', '* ', 'β€’ ')):
2151
+ cleaned = cleaned[2:].strip()
2152
+
2153
+ # Remove numbered list prefix
2154
+ import re
2155
+ cleaned = re.sub(r'^\d+\.\s+', '', cleaned)
2156
+
2157
+ # Final whitespace cleanup
2158
+ cleaned = ' '.join(cleaned.split())
2159
+
2160
  print(f"\nπŸŽ‰ RETURNING: {cleaned}\n")
2161
 
2162
  return cleaned