gabejavitt commited on
Commit
b81ce26
Β·
verified Β·
1 Parent(s): 2e5ef21

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +700 -41
app.py CHANGED
@@ -510,38 +510,111 @@ class ValidateInput(BaseModel):
510
 
511
  @tool(args_schema=ValidateInput)
512
  def validate_answer(proposed_answer: str, original_question: str) -> str:
513
- """Validate answer before submission"""
 
 
 
 
514
  start_time = time.time()
515
  try:
516
  print(f"βœ“ Validating: '{proposed_answer[:50]}...'")
517
 
518
  issues = []
519
  warnings = []
 
520
 
521
- # Check conversational fluff
522
- fluff = ["the answer is", "based on", "according to", "i found", "here is"]
 
523
  if any(p in proposed_answer.lower() for p in fluff):
524
- issues.append("❌ Remove conversational text")
525
-
526
- # Check code fences
527
  if "```" in proposed_answer:
528
- issues.append("❌ Remove code fences")
529
 
530
- # Check length
 
 
 
 
 
531
  if len(proposed_answer) > 500:
532
- warnings.append("⚠️ Very long answer")
 
533
 
534
- # Check numbers
535
- if any(k in original_question.lower() for k in ["how many", "what number", "count"]):
 
 
536
  if not any(c.isdigit() for c in proposed_answer):
537
- warnings.append("⚠️ Number expected but none found")
538
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
539
  if issues:
540
  result = "🚫 VALIDATION FAILED:\n" + "\n".join(issues)
 
 
 
541
  elif warnings:
542
- result = "⚠️ WARNINGS:\n" + "\n".join(warnings) + "\n\nProceed if confident."
 
 
 
 
 
 
543
  else:
544
- result = "βœ… PASSED! Call final_answer_tool() now."
545
 
546
  telemetry.record_call("validate_answer", time.time() - start_time, True)
547
  return result
@@ -553,6 +626,113 @@ def validate_answer(proposed_answer: str, original_question: str) -> str:
553
  # =============================================================================
554
  # CORE TOOLS
555
  # =============================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
556
  class SearchInput(BaseModel):
557
  query: str = Field(description="Search query (concise)")
558
 
@@ -684,6 +864,185 @@ def code_interpreter(code: str) -> str:
684
  raise ToolError("code_interpreter", e, "Check code syntax")
685
 
686
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
687
  class ReadFileInput(BaseModel):
688
  path: str = Field(description="File path")
689
 
@@ -909,6 +1268,153 @@ def get_youtube_transcript(video_url: str) -> str:
909
  raise ToolError("get_youtube_transcript", e)
910
 
911
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
912
  class ScrapeInput(BaseModel):
913
  url: str = Field(description="URL (http:// or https://)")
914
  query: str = Field(description="Specific info to find")
@@ -1016,20 +1522,34 @@ def final_answer_tool(answer: str) -> str:
1016
  # TOOLS LIST
1017
  # =============================================================================
1018
  defined_tools = [
 
1019
  think_through_logic,
1020
  create_plan,
1021
  reflect_on_progress,
1022
  validate_answer,
 
 
 
1023
  search_tool,
 
 
 
 
1024
  calculator,
1025
  code_interpreter,
 
 
1026
  read_file,
1027
  write_file,
1028
  list_directory,
 
 
 
1029
  audio_transcription_tool,
1030
- analyze_image,
1031
  get_youtube_transcript,
1032
- scrape_and_retrieve,
 
1033
  final_answer_tool
1034
  ]
1035
 
@@ -1195,41 +1715,126 @@ class PlanningReflectionAgent:
1195
  tool_desc_list.append(desc)
1196
  tool_descriptions = "\n".join(tool_desc_list)
1197
 
1198
- self.system_prompt = f"""You are an elite AI agent for GAIA benchmark.
 
 
 
 
 
 
 
 
 
 
1199
 
1200
  ═══════════════════════════════════════════════════════════════
1201
- ⚠️ ABSOLUTE RULES:
1202
  ═══════════════════════════════════════════════════════════════
1203
 
1204
- 1. EVERY TURN MUST CALL EXACTLY ONE TOOL
1205
- 2. NEVER OUTPUT REASONING TEXT WITHOUT TOOL CALL
1206
- 3. IDENTIFY QUESTION TYPE FIRST
1207
- 4. LOGIC: think β†’ calc β†’ validate β†’ final
1208
- 5. FACTUAL: search β†’ scrape β†’ validate β†’ final
1209
- 6. DATA: read β†’ code β†’ validate β†’ final
1210
- 7. ALWAYS VALIDATE before final_answer
1211
- 8. FINAL FORMAT: EXACTLY what asked, NO fluff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1212
 
1213
  ═══════════════════════════════════════════════════════════════
1214
- πŸ“š TOOLS:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1215
  ═══════════════════════════════════════════════════════════════
1216
 
1217
  {tool_descriptions}
1218
 
1219
  ═══════════════════════════════════════════════════════════════
1220
- ⚑ EXECUTION:
1221
  ═══════════════════════════════════════════════════════════════
1222
 
1223
- - Text without tool = FAILURE
1224
- - Unsure? β†’ think_through_logic()
1225
- - After each tool: Have answer? β†’ validate β†’ submit
1226
  - Stuck after 3 turns? β†’ reflect_on_progress()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1227
  ═══════════════════════════════════════════════════════════════
1228
  """
1229
 
1230
- # Initialize LLM
1231
- print("Initializing Groq LLM...")
1232
- self.llm_with_tools = ChatGroq(
 
 
1233
  temperature=0,
1234
  groq_api_key=GROQ_API_KEY,
1235
  model_name="llama-3.3-70b-versatile",
@@ -1237,7 +1842,24 @@ class PlanningReflectionAgent:
1237
  timeout=60
1238
  ).bind_tools(self.tools, tool_choice="auto")
1239
 
1240
- print("βœ… LLM initialized")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1241
 
1242
  # Build agent graph
1243
  def agent_node(state: AgentState):
@@ -1274,7 +1896,7 @@ class PlanningReflectionAgent:
1274
  messages_to_send.append(hint)
1275
  print("πŸ€” Reflection hint")
1276
 
1277
- # Invoke LLM with retries
1278
  ai_message = None
1279
 
1280
  for attempt in range(config.MAX_RETRIES):
@@ -1287,7 +1909,20 @@ class PlanningReflectionAgent:
1287
  print(f"⚠️ No tool calls (attempt {attempt+1})")
1288
 
1289
  except Exception as e:
1290
- print(f"⚠️ LLM error (attempt {attempt+1}): {str(e)[:200]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
1291
 
1292
  if attempt == config.MAX_RETRIES - 1:
1293
  print("🚨 Forcing think_through_logic")
@@ -1424,6 +2059,11 @@ class PlanningReflectionAgent:
1424
  "last_tool_was_thinking": False
1425
  }
1426
 
 
 
 
 
 
1427
  final_answer = "AGENT FAILED"
1428
  all_messages = []
1429
 
@@ -1468,15 +2108,17 @@ class PlanningReflectionAgent:
1468
  break
1469
  break
1470
 
1471
- # Clean answer
1472
  cleaned = str(final_answer).strip()
1473
 
1474
- # Remove prefixes
1475
  prefixes = [
1476
  "the answer is:", "here is the answer:", "based on",
1477
  "final answer:", "answer:", "the final answer is:",
1478
  "my answer is:", "according to", "i found that",
1479
- "the result is:", "result:"
 
 
1480
  ]
1481
  for prefix in prefixes:
1482
  if cleaned.lower().startswith(prefix.lower()):
@@ -1488,16 +2130,33 @@ class PlanningReflectionAgent:
1488
  # Remove code fences
1489
  cleaned = remove_fences_simple(cleaned)
1490
 
 
1491
  while cleaned.startswith("`") and cleaned.endswith("`"):
1492
  cleaned = cleaned[1:-1].strip()
1493
 
 
1494
  if (cleaned.startswith('"') and cleaned.endswith('"')) or \
1495
  (cleaned.startswith("'") and cleaned.endswith("'")):
1496
  cleaned = cleaned[1:-1].strip()
1497
 
 
1498
  if cleaned.endswith('.') and len(cleaned.split()) < 10:
1499
  cleaned = cleaned[:-1]
1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1501
  print(f"\nπŸŽ‰ RETURNING: {cleaned}\n")
1502
 
1503
  return cleaned
 
510
 
511
  @tool(args_schema=ValidateInput)
512
  def validate_answer(proposed_answer: str, original_question: str) -> str:
513
+ """
514
+ ENHANCED: Validate answer before submission with comprehensive checks.
515
+
516
+ ALWAYS use before final_answer_tool.
517
+ """
518
  start_time = time.time()
519
  try:
520
  print(f"βœ“ Validating: '{proposed_answer[:50]}...'")
521
 
522
  issues = []
523
  warnings = []
524
+ suggestions = []
525
 
526
+ # 1. Check conversational fluff
527
+ fluff = ["the answer is", "based on", "according to", "i found", "here is",
528
+ "here's", "after searching", "from my research", "the result is"]
529
  if any(p in proposed_answer.lower() for p in fluff):
530
+ issues.append("❌ Remove conversational text - answer ONLY")
531
+
532
+ # 2. Check code fences
533
  if "```" in proposed_answer:
534
+ issues.append("❌ Remove code fences (```)")
535
 
536
+ # 3. Check markdown formatting
537
+ if proposed_answer.startswith('#') or '**' in proposed_answer:
538
+ issues.append("❌ Remove markdown formatting")
539
+
540
+ # 4. Check length appropriateness
541
+ question_lower = original_question.lower()
542
  if len(proposed_answer) > 500:
543
+ if not any(k in question_lower for k in ['explain', 'describe', 'why', 'how does']):
544
+ warnings.append("⚠️ Answer very long. Question asks for short answer?")
545
 
546
+ # 5. Check for number questions
547
+ number_keywords = ["how many", "what number", "count", "total", "sum",
548
+ "what year", "when did", "what date"]
549
+ if any(k in question_lower for k in number_keywords):
550
  if not any(c.isdigit() for c in proposed_answer):
551
+ issues.append("❌ Question asks for number but answer has no digits")
552
+ else:
553
+ # Extract just the number(s)
554
+ import re
555
+ numbers = re.findall(r'\d+(?:\.\d+)?', proposed_answer)
556
+ if numbers and len(proposed_answer) > 50:
557
+ suggestions.append(f"πŸ’‘ Consider just the number(s): {', '.join(numbers)}")
558
+
559
+ # 6. Check for list questions
560
+ list_keywords = ["list", "what are", "name the", "which"]
561
+ if any(k in question_lower for k in list_keywords):
562
+ if '\n' in proposed_answer or len(proposed_answer.split(',')) > 1:
563
+ # Good, it's formatted as a list
564
+ pass
565
+ else:
566
+ warnings.append("⚠️ Question might ask for multiple items")
567
+
568
+ # 7. Check for yes/no questions
569
+ if question_lower.startswith(('is ', 'does ', 'did ', 'can ', 'will ', 'was ', 'were ', 'are ')):
570
+ if proposed_answer.lower() not in ['yes', 'no', 'true', 'false']:
571
+ if not proposed_answer.lower().startswith(('yes', 'no')):
572
+ warnings.append("⚠️ Question seems yes/no. Answer should start with yes/no?")
573
+
574
+ # 8. Check for excessive punctuation
575
+ if proposed_answer.count('!') > 2 or proposed_answer.count('?') > 1:
576
+ issues.append("❌ Remove excessive punctuation")
577
+
578
+ # 9. Check for quotes around answer
579
+ if (proposed_answer.startswith('"') and proposed_answer.endswith('"')) or \
580
+ (proposed_answer.startswith("'") and proposed_answer.endswith("'")):
581
+ suggestions.append("πŸ’‘ Consider removing quotes around answer")
582
+
583
+ # 10. Check for multiple sentences when one expected
584
+ sentences = [s.strip() for s in proposed_answer.split('.') if s.strip()]
585
+ if len(sentences) > 3:
586
+ if not any(k in question_lower for k in ['explain', 'describe', 'why', 'how']):
587
+ warnings.append("⚠️ Multiple sentences. Question asks for simple answer?")
588
+
589
+ # 11. Sanity check: is it empty?
590
+ if not proposed_answer.strip():
591
+ issues.append("❌ Answer is empty!")
592
+
593
+ # 12. Check for units in measurement questions
594
+ unit_keywords = ['height', 'weight', 'distance', 'speed', 'temperature', 'size']
595
+ if any(k in question_lower for k in unit_keywords):
596
+ has_unit = any(u in proposed_answer.lower() for u in
597
+ ['km', 'miles', 'kg', 'lbs', 'cm', 'inches', 'celsius',
598
+ 'fahrenheit', 'mph', 'kph', 'meters', 'feet'])
599
+ if not has_unit and any(c.isdigit() for c in proposed_answer):
600
+ warnings.append("⚠️ Measurement question but no unit found")
601
+
602
+ # Build response
603
  if issues:
604
  result = "🚫 VALIDATION FAILED:\n" + "\n".join(issues)
605
+ if suggestions:
606
+ result += "\n\nSuggestions:\n" + "\n".join(suggestions)
607
+ result += "\n\nFix issues then retry validation."
608
  elif warnings:
609
+ result = "⚠️ WARNINGS:\n" + "\n".join(warnings)
610
+ if suggestions:
611
+ result += "\n\nSuggestions:\n" + "\n".join(suggestions)
612
+ result += "\n\nProceed if confident, or refine answer."
613
+ elif suggestions:
614
+ result = "βœ… PASSED with suggestions:\n" + "\n".join(suggestions)
615
+ result += "\n\nCall final_answer_tool() when ready."
616
  else:
617
+ result = "βœ… VALIDATION PASSED! Call final_answer_tool() now."
618
 
619
  telemetry.record_call("validate_answer", time.time() - start_time, True)
620
  return result
 
626
  # =============================================================================
627
  # CORE TOOLS
628
  # =============================================================================
629
+ class WikipediaInput(BaseModel):
630
+ query: str = Field(description="Topic to search (e.g., 'Mercedes Sosa', 'Python programming')")
631
+
632
+ @tool(args_schema=WikipediaInput)
633
+ @retry_with_backoff(max_retries=2)
634
+ def wikipedia_search(query: str) -> str:
635
+ """
636
+ Search Wikipedia with automatic page retrieval.
637
+
638
+ Better than search_tool for:
639
+ - Biographical information
640
+ - Historical facts
641
+ - Scientific concepts
642
+ - Counting items in lists (discography, filmography, etc.)
643
+
644
+ Returns full article sections, not just snippets.
645
+ """
646
+ start_time = time.time()
647
+
648
+ try:
649
+ print(f"πŸ“š Wikipedia search: {query}")
650
+
651
+ # Check cache first
652
+ cache_key = f"wiki:{query}"
653
+ cached = search_cache.get(cache_key)
654
+ if cached:
655
+ print(f" (cached)")
656
+ telemetry.record_call("wikipedia_search", time.time() - start_time, True)
657
+ return cached
658
+
659
+ import requests
660
+
661
+ # Step 1: Search for page
662
+ search_url = "https://en.wikipedia.org/w/api.php"
663
+ search_params = {
664
+ 'action': 'opensearch',
665
+ 'search': query,
666
+ 'limit': 1,
667
+ 'namespace': 0,
668
+ 'format': 'json'
669
+ }
670
+
671
+ response = requests.get(search_url, params=search_params, timeout=10)
672
+ response.raise_for_status()
673
+ search_results = response.json()
674
+
675
+ if not search_results[1]: # No results
676
+ result = f"No Wikipedia article found for: '{query}'"
677
+ search_cache.put(cache_key, result)
678
+ telemetry.record_call("wikipedia_search", time.time() - start_time, True)
679
+ return result
680
+
681
+ page_title = search_results[1][0]
682
+ page_url = search_results[3][0]
683
+
684
+ print(f" Found: {page_title}")
685
+ print(f" URL: {page_url}")
686
+
687
+ # Step 2: Get full page content
688
+ content_params = {
689
+ 'action': 'query',
690
+ 'titles': page_title,
691
+ 'prop': 'extracts',
692
+ 'explaintext': True,
693
+ 'format': 'json'
694
+ }
695
+
696
+ response = requests.get(search_url, params=content_params, timeout=10)
697
+ response.raise_for_status()
698
+ data = response.json()
699
+
700
+ pages = data['query']['pages']
701
+ page_id = list(pages.keys())[0]
702
+
703
+ if page_id == '-1':
704
+ result = f"Wikipedia page not found: '{query}'"
705
+ search_cache.put(cache_key, result)
706
+ telemetry.record_call("wikipedia_search", time.time() - start_time, True)
707
+ return result
708
+
709
+ content = pages[page_id].get('extract', '')
710
+
711
+ if not content:
712
+ result = f"Wikipedia page found but content empty: '{page_title}'"
713
+ search_cache.put(cache_key, result)
714
+ telemetry.record_call("wikipedia_search", time.time() - start_time, True)
715
+ return result
716
+
717
+ print(f" Retrieved {len(content)} chars")
718
+
719
+ # Format result
720
+ result = f"Wikipedia: {page_title}\n"
721
+ result += f"URL: {page_url}\n\n"
722
+ result += content
723
+ result = truncate_if_needed(result, max_length=12000) # Allow more for Wikipedia
724
+
725
+ # Cache result
726
+ search_cache.put(cache_key, result)
727
+
728
+ telemetry.record_call("wikipedia_search", time.time() - start_time, True)
729
+ return result
730
+
731
+ except Exception as e:
732
+ telemetry.record_call("wikipedia_search", time.time() - start_time, False)
733
+ raise ToolError("wikipedia_search", e, "Try a more specific search term")
734
+
735
+
736
  class SearchInput(BaseModel):
737
  query: str = Field(description="Search query (concise)")
738
 
 
864
  raise ToolError("code_interpreter", e, "Check code syntax")
865
 
866
 
867
+ class AnalyzeDataInput(BaseModel):
868
+ file_path: str = Field(description="Path to CSV or Excel file")
869
+ question: str = Field(description="What to find (e.g., 'count rows where year > 2000')")
870
+
871
+ @tool(args_schema=AnalyzeDataInput)
872
+ def analyze_data_file(file_path: str, question: str) -> str:
873
+ """
874
+ Analyze CSV/Excel files with automatic data profiling.
875
+
876
+ Generates Python code to answer questions about data files.
877
+ Better than code_interpreter alone because it:
878
+ 1. Profiles the data first (columns, types, sample)
879
+ 2. Generates appropriate pandas code
880
+ 3. Handles common data issues (encoding, missing values)
881
+
882
+ Use for questions like:
883
+ - "How many rows have X?"
884
+ - "What's the sum/average of column Y?"
885
+ - "Count items grouped by Z"
886
+ """
887
+ start_time = time.time()
888
+
889
+ try:
890
+ print(f"πŸ“Š Analyzing data file: {file_path}")
891
+ print(f" Question: {question[:100]}...")
892
+
893
+ # Find file
894
+ data_file = find_file(file_path)
895
+ if not data_file:
896
+ raise FileNotFoundError(f"Data file not found: {file_path}")
897
+
898
+ file_ext = data_file.suffix.lower()
899
+
900
+ if file_ext not in ['.csv', '.xlsx', '.xls', '.tsv']:
901
+ raise ValueError(f"Unsupported file type: {file_ext}. Use .csv, .xlsx, .xls, or .tsv")
902
+
903
+ print(f" File type: {file_ext}")
904
+
905
+ # Generate profiling code
906
+ profiling_code = f"""
907
+ import pandas as pd
908
+ import numpy as np
909
+
910
+ # Load file
911
+ file_path = r"{data_file}"
912
+ """
913
+
914
+ if file_ext == '.csv':
915
+ profiling_code += """
916
+ # Try different encodings
917
+ for encoding in ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252']:
918
+ try:
919
+ df = pd.read_csv(file_path, encoding=encoding)
920
+ break
921
+ except:
922
+ continue
923
+ """
924
+ elif file_ext == '.tsv':
925
+ profiling_code += """
926
+ df = pd.read_csv(file_path, sep='\\t', encoding='utf-8')
927
+ """
928
+ else: # Excel
929
+ profiling_code += """
930
+ df = pd.read_excel(file_path)
931
+ """
932
+
933
+ profiling_code += """
934
+ # Profile data
935
+ print("=" * 60)
936
+ print("DATA PROFILE")
937
+ print("=" * 60)
938
+ print(f"Shape: {df.shape[0]} rows Γ— {df.shape[1]} columns")
939
+ print(f"\\nColumns: {', '.join(df.columns.tolist())}")
940
+ print(f"\\nData types:")
941
+ print(df.dtypes)
942
+ print(f"\\nFirst 3 rows:")
943
+ print(df.head(3))
944
+ print(f"\\nMissing values:")
945
+ print(df.isnull().sum())
946
+ """
947
+
948
+ # Execute profiling
949
+ print(f" Profiling data...")
950
+ output_stream = io.StringIO()
951
+ error_stream = io.StringIO()
952
+
953
+ with contextlib.redirect_stdout(output_stream), contextlib.redirect_stderr(error_stream):
954
+ exec(profiling_code, {"pd": pd, "np": np, "__builtins__": __builtins__})
955
+
956
+ profile_output = output_stream.getvalue()
957
+
958
+ if error_stream.getvalue():
959
+ raise RuntimeError(f"Profiling failed: {error_stream.getvalue()}")
960
+
961
+ print(f" Profiling complete")
962
+ print(profile_output[:500] + "..." if len(profile_output) > 500 else profile_output)
963
+
964
+ # Now generate analysis code based on question
965
+ analysis_code = profiling_code + f"""
966
+
967
+ # Analysis for: {question}
968
+ print("\\n" + "=" * 60)
969
+ print("ANALYSIS RESULT")
970
+ print("=" * 60)
971
+
972
+ """
973
+
974
+ # Add intelligent code based on question keywords
975
+ q_lower = question.lower()
976
+
977
+ if 'count' in q_lower or 'how many' in q_lower:
978
+ if 'where' in q_lower or 'with' in q_lower:
979
+ analysis_code += """
980
+ # Count rows matching condition
981
+ # NOTE: Adjust the filter condition based on your needs
982
+ result = len(df) # Total count
983
+ print(f"Total rows: {result}")
984
+
985
+ # Example filters (uncomment and modify as needed):
986
+ # result = len(df[df['column'] > value])
987
+ # result = len(df[df['column'].str.contains('text', na=False)])
988
+ """
989
+ else:
990
+ analysis_code += """
991
+ result = len(df)
992
+ print(f"Total rows: {result}")
993
+ """
994
+
995
+ elif 'sum' in q_lower or 'total' in q_lower:
996
+ analysis_code += """
997
+ # Sum a numeric column
998
+ # NOTE: Replace 'column_name' with actual column
999
+ # result = df['column_name'].sum()
1000
+ # print(f"Sum: {result}")
1001
+ """
1002
+
1003
+ elif 'average' in q_lower or 'mean' in q_lower:
1004
+ analysis_code += """
1005
+ # Average of a column
1006
+ # result = df['column_name'].mean()
1007
+ # print(f"Average: {result}")
1008
+ """
1009
+
1010
+ elif 'group' in q_lower or 'by' in q_lower:
1011
+ analysis_code += """
1012
+ # Group by and count
1013
+ # result = df.groupby('column_name').size()
1014
+ # print(result)
1015
+ """
1016
+
1017
+ else:
1018
+ # Generic: show summary
1019
+ analysis_code += """
1020
+ # Summary statistics
1021
+ print(df.describe())
1022
+ """
1023
+
1024
+ result = f"""Data Profile:
1025
+ {profile_output}
1026
+
1027
+ Generated Analysis Code:
1028
+ ```python
1029
+ {analysis_code}
1030
+ ```
1031
+
1032
+ **IMPORTANT**: The code above needs column names adjusted.
1033
+ Use code_interpreter() with the corrected code to get the answer.
1034
+
1035
+ Columns available: {", ".join(pd.read_csv(data_file) if file_ext == '.csv' else pd.read_excel(data_file)).columns.tolist()}
1036
+ """
1037
+
1038
+ telemetry.record_call("analyze_data_file", time.time() - start_time, True)
1039
+ return truncate_if_needed(result)
1040
+
1041
+ except Exception as e:
1042
+ telemetry.record_call("analyze_data_file", time.time() - start_time, False)
1043
+ raise ToolError("analyze_data_file", e, "Check file path and format")
1044
+
1045
+
1046
  class ReadFileInput(BaseModel):
1047
  path: str = Field(description="File path")
1048
 
 
1268
  raise ToolError("get_youtube_transcript", e)
1269
 
1270
 
1271
+ class BrowseInput(BaseModel):
1272
+ start_url: str = Field(description="Starting URL (http:// or https://)")
1273
+ goal: str = Field(description="What you're trying to find (e.g., 'Mercedes Sosa albums 2000-2009')")
1274
+ max_steps: int = Field(description="Max pages to visit (1-5)", default=3)
1275
+
1276
+ @tool(args_schema=BrowseInput)
1277
+ @retry_with_backoff(max_retries=2)
1278
+ def iterative_web_browser(start_url: str, goal: str, max_steps: int = 3) -> str:
1279
+ """
1280
+ Multi-turn web browsing - follows links iteratively to find information.
1281
+
1282
+ Use when:
1283
+ - Information requires navigating through multiple pages
1284
+ - Need to follow "Read more" or "Details" links
1285
+ - Example: "Find Mercedes Sosa's discography, then count 2000-2009 albums"
1286
+
1287
+ This tool:
1288
+ 1. Visits start_url
1289
+ 2. Searches content for goal-related info
1290
+ 3. Extracts relevant links
1291
+ 4. Follows most promising link
1292
+ 5. Repeats until info found or max_steps reached
1293
+
1294
+ Better than scrape_and_retrieve when single page doesn't have complete info.
1295
+ """
1296
+ start_time = time.time()
1297
+
1298
+ try:
1299
+ if not rag_manager.is_ready():
1300
+ rag_manager.initialize()
1301
+
1302
+ print(f"🌐 Iterative browsing starting at: {start_url}")
1303
+ print(f" Goal: {goal[:100]}...")
1304
+ print(f" Max steps: {max_steps}")
1305
+
1306
+ visited_urls = set()
1307
+ current_url = start_url
1308
+ all_findings = []
1309
+
1310
+ headers = {
1311
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
1312
+ }
1313
+
1314
+ for step in range(max_steps):
1315
+ if current_url in visited_urls:
1316
+ print(f" Step {step+1}: Already visited, stopping")
1317
+ break
1318
+
1319
+ visited_urls.add(current_url)
1320
+ print(f" Step {step+1}: Visiting {current_url}")
1321
+
1322
+ try:
1323
+ response = requests.get(current_url, headers=headers, timeout=15)
1324
+ response.raise_for_status()
1325
+
1326
+ soup = BeautifulSoup(response.text, 'html.parser')
1327
+
1328
+ # Remove noise
1329
+ for tag in soup(["script", "style", "nav", "footer", "aside", "header", "iframe"]):
1330
+ tag.extract()
1331
+
1332
+ # Extract main content
1333
+ main = soup.find('main') or soup.find('article') or soup.find('div', class_='mw-parser-output') or soup.body
1334
+
1335
+ if not main:
1336
+ print(f" No main content found")
1337
+ continue
1338
+
1339
+ text = main.get_text(separator='\n', strip=True)
1340
+ lines = [l.strip() for l in text.splitlines() if l.strip()]
1341
+ text = '\n'.join(lines)
1342
+
1343
+ print(f" Extracted {len(text)} chars")
1344
+
1345
+ # Search for goal-related content
1346
+ chunks = rag_manager.text_splitter.split_text(text)
1347
+ docs = [Document(page_content=c, metadata={"source": current_url, "step": step+1}) for c in chunks]
1348
+
1349
+ db = FAISS.from_documents(docs, rag_manager.embeddings)
1350
+ retriever = db.as_retriever(search_kwargs={"k": 3})
1351
+ retrieved = retriever.invoke(goal)
1352
+
1353
+ # Clean up
1354
+ del db
1355
+ del retriever
1356
+ import gc
1357
+ gc.collect()
1358
+
1359
+ if retrieved:
1360
+ print(f" Found {len(retrieved)} relevant chunks")
1361
+ for i, doc in enumerate(retrieved):
1362
+ all_findings.append({
1363
+ 'step': step + 1,
1364
+ 'url': current_url,
1365
+ 'content': doc.page_content
1366
+ })
1367
+
1368
+ # Extract links for next step
1369
+ if step < max_steps - 1:
1370
+ links = []
1371
+ for a in main.find_all('a', href=True):
1372
+ href = a.get('href')
1373
+ text = a.get_text(strip=True).lower()
1374
+
1375
+ # Make absolute URL
1376
+ if href.startswith('/'):
1377
+ from urllib.parse import urljoin
1378
+ href = urljoin(current_url, href)
1379
+
1380
+ # Filter relevant links
1381
+ goal_keywords = goal.lower().split()
1382
+ if any(keyword in href.lower() or keyword in text for keyword in goal_keywords):
1383
+ if href.startswith('http') and href not in visited_urls:
1384
+ links.append((href, text))
1385
+
1386
+ if links:
1387
+ # Pick most relevant link
1388
+ current_url = links[0][0]
1389
+ print(f" Found {len(links)} potential links, following: {links[0][1][:50]}")
1390
+ else:
1391
+ print(f" No more relevant links found")
1392
+ break
1393
+ else:
1394
+ print(f" Max steps reached")
1395
+ break
1396
+
1397
+ except Exception as e:
1398
+ print(f" Error on step {step+1}: {e}")
1399
+ break
1400
+
1401
+ # Compile findings
1402
+ if not all_findings:
1403
+ result = f"Browsed {len(visited_urls)} pages but found no relevant information for: '{goal}'"
1404
+ else:
1405
+ result = f"Information gathered from {len(visited_urls)} pages:\n\n"
1406
+ for finding in all_findings:
1407
+ result += f"[Step {finding['step']} - {finding['url']}]\n{finding['content']}\n\n---\n\n"
1408
+ result = truncate_if_needed(result)
1409
+
1410
+ telemetry.record_call("iterative_web_browser", time.time() - start_time, True)
1411
+ return result
1412
+
1413
+ except Exception as e:
1414
+ telemetry.record_call("iterative_web_browser", time.time() - start_time, False)
1415
+ raise ToolError("iterative_web_browser", e, "Try starting from a more specific URL")
1416
+
1417
+
1418
  class ScrapeInput(BaseModel):
1419
  url: str = Field(description="URL (http:// or https://)")
1420
  query: str = Field(description="Specific info to find")
 
1522
  # TOOLS LIST
1523
  # =============================================================================
1524
  defined_tools = [
1525
+ # Planning & Reflection
1526
  think_through_logic,
1527
  create_plan,
1528
  reflect_on_progress,
1529
  validate_answer,
1530
+
1531
+ # Search & Browse
1532
+ wikipedia_search, # NEW: Better for encyclopedic queries
1533
  search_tool,
1534
+ iterative_web_browser, # NEW: Multi-turn web navigation
1535
+ scrape_and_retrieve,
1536
+
1537
+ # Core computation
1538
  calculator,
1539
  code_interpreter,
1540
+
1541
+ # File operations
1542
  read_file,
1543
  write_file,
1544
  list_directory,
1545
+ analyze_data_file, # NEW: Smart CSV/Excel analysis
1546
+
1547
+ # Specialized
1548
  audio_transcription_tool,
1549
+ analyze_image,
1550
  get_youtube_transcript,
1551
+
1552
+ # Final
1553
  final_answer_tool
1554
  ]
1555
 
 
1715
  tool_desc_list.append(desc)
1716
  tool_descriptions = "\n".join(tool_desc_list)
1717
 
1718
+ self.system_prompt = f"""You are an elite AI agent for GAIA benchmark. Your ONLY job: provide the EXACT answer requested.
1719
+
1720
+ ═══════════════════════════════════════════════════════════════
1721
+ ⚠️ ABSOLUTE RULES - VIOLATE THESE AND YOU FAIL:
1722
+ ═══════════════════════════════════════════════════════════════
1723
+
1724
+ 1. **EVERY TURN MUST CALL EXACTLY ONE TOOL** - No exceptions
1725
+ 2. **NEVER OUTPUT REASONING TEXT WITHOUT A TOOL CALL** - You will fail
1726
+ 3. **IDENTIFY QUESTION TYPE FIRST** - Logic? Factual? Data? Math?
1727
+ 4. **ALWAYS VALIDATE**: Call validate_answer() before final_answer_tool()
1728
+ 5. **FINAL ANSWER FORMAT**: EXACTLY what was asked. NO "The answer is..." or explanations
1729
 
1730
  ═══════════════════════════════════════════════════════════════
1731
+ πŸ“‹ QUESTION TYPE β†’ TOOL SEQUENCE:
1732
  ═══════════════════════════════════════════════════════════════
1733
 
1734
+ **LOGIC PUZZLES** (No web search needed):
1735
+ β†’ think_through_logic β†’ calculator (if math) β†’ validate β†’ final_answer
1736
+
1737
+ **FACTUAL/BIOGRAPHICAL** (Need web):
1738
+ β†’ wikipedia_search (if person/place/thing) β†’ validate β†’ final_answer
1739
+ OR search_tool β†’ scrape_and_retrieve β†’ validate β†’ final_answer
1740
+
1741
+ **COUNTING FROM WEB** (Need full page content):
1742
+ β†’ wikipedia_search (if Wikipedia topic) β†’ validate β†’ final_answer
1743
+ OR iterative_web_browser (if needs navigation) β†’ validate β†’ final_answer
1744
+
1745
+ **DATA FILES** (CSV/Excel):
1746
+ β†’ list_directory β†’ analyze_data_file β†’ code_interpreter β†’ validate β†’ final_answer
1747
+
1748
+ **IMAGES** (Chess, diagrams, photos):
1749
+ β†’ analyze_image β†’ validate β†’ final_answer
1750
+
1751
+ **AUDIO FILES**:
1752
+ β†’ audio_transcription_tool β†’ validate β†’ final_answer
1753
+
1754
+ **MATH CALCULATIONS**:
1755
+ β†’ calculator β†’ validate β†’ final_answer
1756
 
1757
  ═══════════════════════════════════════════════════════════════
1758
+ 🎯 CRITICAL TOOL USAGE PATTERNS:
1759
+ ═══════════════════════════════════════════════════════════════
1760
+
1761
+ **For Counting Questions:**
1762
+ BAD: search_tool("Mercedes Sosa albums") β†’ snippets only
1763
+ GOOD: wikipedia_search("Mercedes Sosa") β†’ full discography section
1764
+
1765
+ **For Multi-Step Web Questions:**
1766
+ BAD: scrape_and_retrieve("https://...") β†’ single page only
1767
+ GOOD: iterative_web_browser("https://...", "find X", max_steps=3)
1768
+
1769
+ **For Data Questions:**
1770
+ BAD: read_file("data.csv") β†’ raw text dump
1771
+ GOOD: analyze_data_file("data.csv", "count rows where X > Y")
1772
+
1773
+ **For Validation:**
1774
+ ALWAYS: validate_answer("your answer", "original question")
1775
+ THEN: final_answer_tool("your answer")
1776
+
1777
+ ═══════════════════════════════════════════════════════════════
1778
+ πŸ“š AVAILABLE TOOLS:
1779
  ═══════════════════════════════════════════════════════════════
1780
 
1781
  {tool_descriptions}
1782
 
1783
  ═══════════════════════════════════════════════════════════════
1784
+ ⚑ EXECUTION RULES:
1785
  ═══════════════════════════════════════════════════════════════
1786
 
1787
+ - Text without tool call = FAILURE
1788
+ - Unsure? β†’ think_through_logic() to organize thoughts
1789
+ - After EVERY tool result: "Do I have the answer? β†’ validate β†’ submit"
1790
  - Stuck after 3 turns? β†’ reflect_on_progress()
1791
+ - For Wikipedia topics β†’ ALWAYS use wikipedia_search, NOT search_tool
1792
+ - For counting from web β†’ Use wikipedia_search or iterative_web_browser
1793
+ - For data files β†’ Use analyze_data_file, NOT just read_file
1794
+
1795
+ ═══════════════════════════════════════════════════════════════
1796
+ πŸŽ“ EXAMPLES OF PERFECT EXECUTION:
1797
+ ═══════════════════════════════════════════════════════════════
1798
+
1799
+ Example 1: "How many studio albums did Mercedes Sosa release 2000-2009?"
1800
+ Turn 1: wikipedia_search("Mercedes Sosa")
1801
+ β†’ Gets full discography with all albums and years
1802
+ Turn 2: code_interpreter("count albums 2000-2009 from text")
1803
+ β†’ Result: 3
1804
+ Turn 3: validate_answer("3", "How many studio albums...")
1805
+ β†’ βœ… PASSED
1806
+ Turn 4: final_answer_tool("3")
1807
+
1808
+ Example 2: "What's the population of Einstein's birthplace in 1900?"
1809
+ Turn 1: wikipedia_search("Albert Einstein")
1810
+ β†’ Birthplace: Ulm, Germany
1811
+ Turn 2: search_tool("Ulm Germany population 1900")
1812
+ β†’ Find sources
1813
+ Turn 3: scrape_and_retrieve("url", "population 1900")
1814
+ β†’ ~50,000
1815
+ Turn 4: validate_answer("50000", "population 1900")
1816
+ β†’ βœ… PASSED
1817
+ Turn 5: final_answer_tool("50000")
1818
+
1819
+ Example 3: Logic puzzle
1820
+ Turn 1: think_through_logic("Work through the logic...")
1821
+ β†’ Reasoning recorded
1822
+ Turn 2: calculator("30") [if calculation needed]
1823
+ β†’ 30
1824
+ Turn 3: validate_answer("30", "coin puzzle")
1825
+ β†’ βœ… PASSED
1826
+ Turn 4: final_answer_tool("30")
1827
+
1828
+ ═══════════════════════════════════════════════════════════════
1829
+ REMEMBER: One tool per turn. No reasoning without tools. Exact answer format.
1830
  ═══════════════════════════════════════════════════════════════
1831
  """
1832
 
1833
+ # Initialize LLMs (Groq primary, Claude fallback)
1834
+ print("Initializing LLMs...")
1835
+
1836
+ # Primary: Groq (fast, free)
1837
+ self.groq_llm = ChatGroq(
1838
  temperature=0,
1839
  groq_api_key=GROQ_API_KEY,
1840
  model_name="llama-3.3-70b-versatile",
 
1842
  timeout=60
1843
  ).bind_tools(self.tools, tool_choice="auto")
1844
 
1845
+ # Fallback: Claude (slower, more reliable)
1846
+ ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
1847
+ if ANTHROPIC_API_KEY:
1848
+ from langchain_anthropic import ChatAnthropic
1849
+ self.claude_llm = ChatAnthropic(
1850
+ model="claude-sonnet-4-20250514",
1851
+ anthropic_api_key=ANTHROPIC_API_KEY,
1852
+ temperature=0,
1853
+ max_tokens=4096
1854
+ ).bind_tools(self.tools, tool_choice="auto")
1855
+ print("βœ… Both Groq and Claude initialized")
1856
+ else:
1857
+ self.claude_llm = None
1858
+ print("βœ… Groq initialized (Claude fallback unavailable)")
1859
+
1860
+ # Start with Groq
1861
+ self.llm_with_tools = self.groq_llm
1862
+ self.current_llm = "groq"
1863
 
1864
  # Build agent graph
1865
  def agent_node(state: AgentState):
 
1896
  messages_to_send.append(hint)
1897
  print("πŸ€” Reflection hint")
1898
 
1899
+ # Invoke LLM with retries and fallback
1900
  ai_message = None
1901
 
1902
  for attempt in range(config.MAX_RETRIES):
 
1909
  print(f"⚠️ No tool calls (attempt {attempt+1})")
1910
 
1911
  except Exception as e:
1912
+ error_str = str(e)
1913
+ print(f"⚠️ {self.current_llm.upper()} error (attempt {attempt+1}): {error_str[:200]}")
1914
+
1915
+ # If Groq fails and we have Claude, switch to Claude
1916
+ if self.current_llm == "groq" and self.claude_llm and attempt == config.MAX_RETRIES - 1:
1917
+ print("πŸ”„ Switching from Groq to Claude for this question...")
1918
+ self.llm_with_tools = self.claude_llm
1919
+ self.current_llm = "claude"
1920
+ try:
1921
+ ai_message = self.llm_with_tools.invoke(messages_to_send)
1922
+ if ai_message.tool_calls:
1923
+ break
1924
+ except Exception as e2:
1925
+ print(f"⚠️ Claude also failed: {e2}")
1926
 
1927
  if attempt == config.MAX_RETRIES - 1:
1928
  print("🚨 Forcing think_through_logic")
 
2059
  "last_tool_was_thinking": False
2060
  }
2061
 
2062
+ # Reset to Groq for each question
2063
+ if self.groq_llm:
2064
+ self.llm_with_tools = self.groq_llm
2065
+ self.current_llm = "groq"
2066
+
2067
  final_answer = "AGENT FAILED"
2068
  all_messages = []
2069
 
 
2108
  break
2109
  break
2110
 
2111
+ # Clean answer more aggressively
2112
  cleaned = str(final_answer).strip()
2113
 
2114
+ # Remove common prefixes (case-insensitive)
2115
  prefixes = [
2116
  "the answer is:", "here is the answer:", "based on",
2117
  "final answer:", "answer:", "the final answer is:",
2118
  "my answer is:", "according to", "i found that",
2119
+ "the result is:", "result:", "here's the answer:",
2120
+ "after analysis:", "the correct answer is:",
2121
+ "from the data:", "from the search:",
2122
  ]
2123
  for prefix in prefixes:
2124
  if cleaned.lower().startswith(prefix.lower()):
 
2130
  # Remove code fences
2131
  cleaned = remove_fences_simple(cleaned)
2132
 
2133
+ # Remove backticks
2134
  while cleaned.startswith("`") and cleaned.endswith("`"):
2135
  cleaned = cleaned[1:-1].strip()
2136
 
2137
+ # Remove quotes (but only if they wrap entire answer)
2138
  if (cleaned.startswith('"') and cleaned.endswith('"')) or \
2139
  (cleaned.startswith("'") and cleaned.endswith("'")):
2140
  cleaned = cleaned[1:-1].strip()
2141
 
2142
+ # Remove trailing period for short answers
2143
  if cleaned.endswith('.') and len(cleaned.split()) < 10:
2144
  cleaned = cleaned[:-1]
2145
 
2146
+ # Remove markdown bold/italic
2147
+ cleaned = cleaned.replace('**', '').replace('__', '').replace('*', '').replace('_', '')
2148
+
2149
+ # Remove bullet points
2150
+ if cleaned.startswith(('- ', '* ', 'β€’ ')):
2151
+ cleaned = cleaned[2:].strip()
2152
+
2153
+ # Remove numbered list prefix
2154
+ import re
2155
+ cleaned = re.sub(r'^\d+\.\s+', '', cleaned)
2156
+
2157
+ # Final whitespace cleanup
2158
+ cleaned = ' '.join(cleaned.split())
2159
+
2160
  print(f"\nπŸŽ‰ RETURNING: {cleaned}\n")
2161
 
2162
  return cleaned