Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -510,38 +510,111 @@ class ValidateInput(BaseModel):
|
|
| 510 |
|
| 511 |
@tool(args_schema=ValidateInput)
|
| 512 |
def validate_answer(proposed_answer: str, original_question: str) -> str:
|
| 513 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 514 |
start_time = time.time()
|
| 515 |
try:
|
| 516 |
print(f"β Validating: '{proposed_answer[:50]}...'")
|
| 517 |
|
| 518 |
issues = []
|
| 519 |
warnings = []
|
|
|
|
| 520 |
|
| 521 |
-
# Check conversational fluff
|
| 522 |
-
fluff = ["the answer is", "based on", "according to", "i found", "here is"
|
|
|
|
| 523 |
if any(p in proposed_answer.lower() for p in fluff):
|
| 524 |
-
issues.append("β Remove conversational text")
|
| 525 |
-
|
| 526 |
-
# Check code fences
|
| 527 |
if "```" in proposed_answer:
|
| 528 |
-
issues.append("β Remove code fences")
|
| 529 |
|
| 530 |
-
# Check
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 531 |
if len(proposed_answer) > 500:
|
| 532 |
-
|
|
|
|
| 533 |
|
| 534 |
-
# Check
|
| 535 |
-
|
|
|
|
|
|
|
| 536 |
if not any(c.isdigit() for c in proposed_answer):
|
| 537 |
-
|
| 538 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 539 |
if issues:
|
| 540 |
result = "π« VALIDATION FAILED:\n" + "\n".join(issues)
|
|
|
|
|
|
|
|
|
|
| 541 |
elif warnings:
|
| 542 |
-
result = "β οΈ WARNINGS:\n" + "\n".join(warnings)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
else:
|
| 544 |
-
result = "β
PASSED! Call final_answer_tool() now."
|
| 545 |
|
| 546 |
telemetry.record_call("validate_answer", time.time() - start_time, True)
|
| 547 |
return result
|
|
@@ -553,6 +626,113 @@ def validate_answer(proposed_answer: str, original_question: str) -> str:
|
|
| 553 |
# =============================================================================
|
| 554 |
# CORE TOOLS
|
| 555 |
# =============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 556 |
class SearchInput(BaseModel):
|
| 557 |
query: str = Field(description="Search query (concise)")
|
| 558 |
|
|
@@ -684,6 +864,185 @@ def code_interpreter(code: str) -> str:
|
|
| 684 |
raise ToolError("code_interpreter", e, "Check code syntax")
|
| 685 |
|
| 686 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 687 |
class ReadFileInput(BaseModel):
|
| 688 |
path: str = Field(description="File path")
|
| 689 |
|
|
@@ -909,6 +1268,153 @@ def get_youtube_transcript(video_url: str) -> str:
|
|
| 909 |
raise ToolError("get_youtube_transcript", e)
|
| 910 |
|
| 911 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 912 |
class ScrapeInput(BaseModel):
|
| 913 |
url: str = Field(description="URL (http:// or https://)")
|
| 914 |
query: str = Field(description="Specific info to find")
|
|
@@ -1016,20 +1522,34 @@ def final_answer_tool(answer: str) -> str:
|
|
| 1016 |
# TOOLS LIST
|
| 1017 |
# =============================================================================
|
| 1018 |
defined_tools = [
|
|
|
|
| 1019 |
think_through_logic,
|
| 1020 |
create_plan,
|
| 1021 |
reflect_on_progress,
|
| 1022 |
validate_answer,
|
|
|
|
|
|
|
|
|
|
| 1023 |
search_tool,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1024 |
calculator,
|
| 1025 |
code_interpreter,
|
|
|
|
|
|
|
| 1026 |
read_file,
|
| 1027 |
write_file,
|
| 1028 |
list_directory,
|
|
|
|
|
|
|
|
|
|
| 1029 |
audio_transcription_tool,
|
| 1030 |
-
analyze_image,
|
| 1031 |
get_youtube_transcript,
|
| 1032 |
-
|
|
|
|
| 1033 |
final_answer_tool
|
| 1034 |
]
|
| 1035 |
|
|
@@ -1195,41 +1715,126 @@ class PlanningReflectionAgent:
|
|
| 1195 |
tool_desc_list.append(desc)
|
| 1196 |
tool_descriptions = "\n".join(tool_desc_list)
|
| 1197 |
|
| 1198 |
-
self.system_prompt = f"""You are an elite AI agent for GAIA benchmark.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1199 |
|
| 1200 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1201 |
-
|
| 1202 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1203 |
|
| 1204 |
-
|
| 1205 |
-
|
| 1206 |
-
|
| 1207 |
-
|
| 1208 |
-
|
| 1209 |
-
|
| 1210 |
-
|
| 1211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1212 |
|
| 1213 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1214 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1215 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1216 |
|
| 1217 |
{tool_descriptions}
|
| 1218 |
|
| 1219 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1220 |
-
β‘ EXECUTION:
|
| 1221 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1222 |
|
| 1223 |
-
- Text without tool = FAILURE
|
| 1224 |
-
- Unsure? β think_through_logic()
|
| 1225 |
-
- After
|
| 1226 |
- Stuck after 3 turns? β reflect_on_progress()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1227 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1228 |
"""
|
| 1229 |
|
| 1230 |
-
# Initialize
|
| 1231 |
-
print("Initializing
|
| 1232 |
-
|
|
|
|
|
|
|
| 1233 |
temperature=0,
|
| 1234 |
groq_api_key=GROQ_API_KEY,
|
| 1235 |
model_name="llama-3.3-70b-versatile",
|
|
@@ -1237,7 +1842,24 @@ class PlanningReflectionAgent:
|
|
| 1237 |
timeout=60
|
| 1238 |
).bind_tools(self.tools, tool_choice="auto")
|
| 1239 |
|
| 1240 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1241 |
|
| 1242 |
# Build agent graph
|
| 1243 |
def agent_node(state: AgentState):
|
|
@@ -1274,7 +1896,7 @@ class PlanningReflectionAgent:
|
|
| 1274 |
messages_to_send.append(hint)
|
| 1275 |
print("π€ Reflection hint")
|
| 1276 |
|
| 1277 |
-
# Invoke LLM with retries
|
| 1278 |
ai_message = None
|
| 1279 |
|
| 1280 |
for attempt in range(config.MAX_RETRIES):
|
|
@@ -1287,7 +1909,20 @@ class PlanningReflectionAgent:
|
|
| 1287 |
print(f"β οΈ No tool calls (attempt {attempt+1})")
|
| 1288 |
|
| 1289 |
except Exception as e:
|
| 1290 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1291 |
|
| 1292 |
if attempt == config.MAX_RETRIES - 1:
|
| 1293 |
print("π¨ Forcing think_through_logic")
|
|
@@ -1424,6 +2059,11 @@ class PlanningReflectionAgent:
|
|
| 1424 |
"last_tool_was_thinking": False
|
| 1425 |
}
|
| 1426 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1427 |
final_answer = "AGENT FAILED"
|
| 1428 |
all_messages = []
|
| 1429 |
|
|
@@ -1468,15 +2108,17 @@ class PlanningReflectionAgent:
|
|
| 1468 |
break
|
| 1469 |
break
|
| 1470 |
|
| 1471 |
-
# Clean answer
|
| 1472 |
cleaned = str(final_answer).strip()
|
| 1473 |
|
| 1474 |
-
# Remove prefixes
|
| 1475 |
prefixes = [
|
| 1476 |
"the answer is:", "here is the answer:", "based on",
|
| 1477 |
"final answer:", "answer:", "the final answer is:",
|
| 1478 |
"my answer is:", "according to", "i found that",
|
| 1479 |
-
"the result is:", "result:"
|
|
|
|
|
|
|
| 1480 |
]
|
| 1481 |
for prefix in prefixes:
|
| 1482 |
if cleaned.lower().startswith(prefix.lower()):
|
|
@@ -1488,16 +2130,33 @@ class PlanningReflectionAgent:
|
|
| 1488 |
# Remove code fences
|
| 1489 |
cleaned = remove_fences_simple(cleaned)
|
| 1490 |
|
|
|
|
| 1491 |
while cleaned.startswith("`") and cleaned.endswith("`"):
|
| 1492 |
cleaned = cleaned[1:-1].strip()
|
| 1493 |
|
|
|
|
| 1494 |
if (cleaned.startswith('"') and cleaned.endswith('"')) or \
|
| 1495 |
(cleaned.startswith("'") and cleaned.endswith("'")):
|
| 1496 |
cleaned = cleaned[1:-1].strip()
|
| 1497 |
|
|
|
|
| 1498 |
if cleaned.endswith('.') and len(cleaned.split()) < 10:
|
| 1499 |
cleaned = cleaned[:-1]
|
| 1500 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1501 |
print(f"\nπ RETURNING: {cleaned}\n")
|
| 1502 |
|
| 1503 |
return cleaned
|
|
|
|
| 510 |
|
| 511 |
@tool(args_schema=ValidateInput)
|
| 512 |
def validate_answer(proposed_answer: str, original_question: str) -> str:
|
| 513 |
+
"""
|
| 514 |
+
ENHANCED: Validate answer before submission with comprehensive checks.
|
| 515 |
+
|
| 516 |
+
ALWAYS use before final_answer_tool.
|
| 517 |
+
"""
|
| 518 |
start_time = time.time()
|
| 519 |
try:
|
| 520 |
print(f"β Validating: '{proposed_answer[:50]}...'")
|
| 521 |
|
| 522 |
issues = []
|
| 523 |
warnings = []
|
| 524 |
+
suggestions = []
|
| 525 |
|
| 526 |
+
# 1. Check conversational fluff
|
| 527 |
+
fluff = ["the answer is", "based on", "according to", "i found", "here is",
|
| 528 |
+
"here's", "after searching", "from my research", "the result is"]
|
| 529 |
if any(p in proposed_answer.lower() for p in fluff):
|
| 530 |
+
issues.append("β Remove conversational text - answer ONLY")
|
| 531 |
+
|
| 532 |
+
# 2. Check code fences
|
| 533 |
if "```" in proposed_answer:
|
| 534 |
+
issues.append("β Remove code fences (```)")
|
| 535 |
|
| 536 |
+
# 3. Check markdown formatting
|
| 537 |
+
if proposed_answer.startswith('#') or '**' in proposed_answer:
|
| 538 |
+
issues.append("β Remove markdown formatting")
|
| 539 |
+
|
| 540 |
+
# 4. Check length appropriateness
|
| 541 |
+
question_lower = original_question.lower()
|
| 542 |
if len(proposed_answer) > 500:
|
| 543 |
+
if not any(k in question_lower for k in ['explain', 'describe', 'why', 'how does']):
|
| 544 |
+
warnings.append("β οΈ Answer very long. Question asks for short answer?")
|
| 545 |
|
| 546 |
+
# 5. Check for number questions
|
| 547 |
+
number_keywords = ["how many", "what number", "count", "total", "sum",
|
| 548 |
+
"what year", "when did", "what date"]
|
| 549 |
+
if any(k in question_lower for k in number_keywords):
|
| 550 |
if not any(c.isdigit() for c in proposed_answer):
|
| 551 |
+
issues.append("β Question asks for number but answer has no digits")
|
| 552 |
+
else:
|
| 553 |
+
# Extract just the number(s)
|
| 554 |
+
import re
|
| 555 |
+
numbers = re.findall(r'\d+(?:\.\d+)?', proposed_answer)
|
| 556 |
+
if numbers and len(proposed_answer) > 50:
|
| 557 |
+
suggestions.append(f"π‘ Consider just the number(s): {', '.join(numbers)}")
|
| 558 |
+
|
| 559 |
+
# 6. Check for list questions
|
| 560 |
+
list_keywords = ["list", "what are", "name the", "which"]
|
| 561 |
+
if any(k in question_lower for k in list_keywords):
|
| 562 |
+
if '\n' in proposed_answer or len(proposed_answer.split(',')) > 1:
|
| 563 |
+
# Good, it's formatted as a list
|
| 564 |
+
pass
|
| 565 |
+
else:
|
| 566 |
+
warnings.append("β οΈ Question might ask for multiple items")
|
| 567 |
+
|
| 568 |
+
# 7. Check for yes/no questions
|
| 569 |
+
if question_lower.startswith(('is ', 'does ', 'did ', 'can ', 'will ', 'was ', 'were ', 'are ')):
|
| 570 |
+
if proposed_answer.lower() not in ['yes', 'no', 'true', 'false']:
|
| 571 |
+
if not proposed_answer.lower().startswith(('yes', 'no')):
|
| 572 |
+
warnings.append("β οΈ Question seems yes/no. Answer should start with yes/no?")
|
| 573 |
+
|
| 574 |
+
# 8. Check for excessive punctuation
|
| 575 |
+
if proposed_answer.count('!') > 2 or proposed_answer.count('?') > 1:
|
| 576 |
+
issues.append("β Remove excessive punctuation")
|
| 577 |
+
|
| 578 |
+
# 9. Check for quotes around answer
|
| 579 |
+
if (proposed_answer.startswith('"') and proposed_answer.endswith('"')) or \
|
| 580 |
+
(proposed_answer.startswith("'") and proposed_answer.endswith("'")):
|
| 581 |
+
suggestions.append("π‘ Consider removing quotes around answer")
|
| 582 |
+
|
| 583 |
+
# 10. Check for multiple sentences when one expected
|
| 584 |
+
sentences = [s.strip() for s in proposed_answer.split('.') if s.strip()]
|
| 585 |
+
if len(sentences) > 3:
|
| 586 |
+
if not any(k in question_lower for k in ['explain', 'describe', 'why', 'how']):
|
| 587 |
+
warnings.append("β οΈ Multiple sentences. Question asks for simple answer?")
|
| 588 |
+
|
| 589 |
+
# 11. Sanity check: is it empty?
|
| 590 |
+
if not proposed_answer.strip():
|
| 591 |
+
issues.append("β Answer is empty!")
|
| 592 |
+
|
| 593 |
+
# 12. Check for units in measurement questions
|
| 594 |
+
unit_keywords = ['height', 'weight', 'distance', 'speed', 'temperature', 'size']
|
| 595 |
+
if any(k in question_lower for k in unit_keywords):
|
| 596 |
+
has_unit = any(u in proposed_answer.lower() for u in
|
| 597 |
+
['km', 'miles', 'kg', 'lbs', 'cm', 'inches', 'celsius',
|
| 598 |
+
'fahrenheit', 'mph', 'kph', 'meters', 'feet'])
|
| 599 |
+
if not has_unit and any(c.isdigit() for c in proposed_answer):
|
| 600 |
+
warnings.append("β οΈ Measurement question but no unit found")
|
| 601 |
+
|
| 602 |
+
# Build response
|
| 603 |
if issues:
|
| 604 |
result = "π« VALIDATION FAILED:\n" + "\n".join(issues)
|
| 605 |
+
if suggestions:
|
| 606 |
+
result += "\n\nSuggestions:\n" + "\n".join(suggestions)
|
| 607 |
+
result += "\n\nFix issues then retry validation."
|
| 608 |
elif warnings:
|
| 609 |
+
result = "β οΈ WARNINGS:\n" + "\n".join(warnings)
|
| 610 |
+
if suggestions:
|
| 611 |
+
result += "\n\nSuggestions:\n" + "\n".join(suggestions)
|
| 612 |
+
result += "\n\nProceed if confident, or refine answer."
|
| 613 |
+
elif suggestions:
|
| 614 |
+
result = "β
PASSED with suggestions:\n" + "\n".join(suggestions)
|
| 615 |
+
result += "\n\nCall final_answer_tool() when ready."
|
| 616 |
else:
|
| 617 |
+
result = "β
VALIDATION PASSED! Call final_answer_tool() now."
|
| 618 |
|
| 619 |
telemetry.record_call("validate_answer", time.time() - start_time, True)
|
| 620 |
return result
|
|
|
|
| 626 |
# =============================================================================
|
| 627 |
# CORE TOOLS
|
| 628 |
# =============================================================================
|
| 629 |
+
class WikipediaInput(BaseModel):
|
| 630 |
+
query: str = Field(description="Topic to search (e.g., 'Mercedes Sosa', 'Python programming')")
|
| 631 |
+
|
| 632 |
+
@tool(args_schema=WikipediaInput)
|
| 633 |
+
@retry_with_backoff(max_retries=2)
|
| 634 |
+
def wikipedia_search(query: str) -> str:
|
| 635 |
+
"""
|
| 636 |
+
Search Wikipedia with automatic page retrieval.
|
| 637 |
+
|
| 638 |
+
Better than search_tool for:
|
| 639 |
+
- Biographical information
|
| 640 |
+
- Historical facts
|
| 641 |
+
- Scientific concepts
|
| 642 |
+
- Counting items in lists (discography, filmography, etc.)
|
| 643 |
+
|
| 644 |
+
Returns full article sections, not just snippets.
|
| 645 |
+
"""
|
| 646 |
+
start_time = time.time()
|
| 647 |
+
|
| 648 |
+
try:
|
| 649 |
+
print(f"π Wikipedia search: {query}")
|
| 650 |
+
|
| 651 |
+
# Check cache first
|
| 652 |
+
cache_key = f"wiki:{query}"
|
| 653 |
+
cached = search_cache.get(cache_key)
|
| 654 |
+
if cached:
|
| 655 |
+
print(f" (cached)")
|
| 656 |
+
telemetry.record_call("wikipedia_search", time.time() - start_time, True)
|
| 657 |
+
return cached
|
| 658 |
+
|
| 659 |
+
import requests
|
| 660 |
+
|
| 661 |
+
# Step 1: Search for page
|
| 662 |
+
search_url = "https://en.wikipedia.org/w/api.php"
|
| 663 |
+
search_params = {
|
| 664 |
+
'action': 'opensearch',
|
| 665 |
+
'search': query,
|
| 666 |
+
'limit': 1,
|
| 667 |
+
'namespace': 0,
|
| 668 |
+
'format': 'json'
|
| 669 |
+
}
|
| 670 |
+
|
| 671 |
+
response = requests.get(search_url, params=search_params, timeout=10)
|
| 672 |
+
response.raise_for_status()
|
| 673 |
+
search_results = response.json()
|
| 674 |
+
|
| 675 |
+
if not search_results[1]: # No results
|
| 676 |
+
result = f"No Wikipedia article found for: '{query}'"
|
| 677 |
+
search_cache.put(cache_key, result)
|
| 678 |
+
telemetry.record_call("wikipedia_search", time.time() - start_time, True)
|
| 679 |
+
return result
|
| 680 |
+
|
| 681 |
+
page_title = search_results[1][0]
|
| 682 |
+
page_url = search_results[3][0]
|
| 683 |
+
|
| 684 |
+
print(f" Found: {page_title}")
|
| 685 |
+
print(f" URL: {page_url}")
|
| 686 |
+
|
| 687 |
+
# Step 2: Get full page content
|
| 688 |
+
content_params = {
|
| 689 |
+
'action': 'query',
|
| 690 |
+
'titles': page_title,
|
| 691 |
+
'prop': 'extracts',
|
| 692 |
+
'explaintext': True,
|
| 693 |
+
'format': 'json'
|
| 694 |
+
}
|
| 695 |
+
|
| 696 |
+
response = requests.get(search_url, params=content_params, timeout=10)
|
| 697 |
+
response.raise_for_status()
|
| 698 |
+
data = response.json()
|
| 699 |
+
|
| 700 |
+
pages = data['query']['pages']
|
| 701 |
+
page_id = list(pages.keys())[0]
|
| 702 |
+
|
| 703 |
+
if page_id == '-1':
|
| 704 |
+
result = f"Wikipedia page not found: '{query}'"
|
| 705 |
+
search_cache.put(cache_key, result)
|
| 706 |
+
telemetry.record_call("wikipedia_search", time.time() - start_time, True)
|
| 707 |
+
return result
|
| 708 |
+
|
| 709 |
+
content = pages[page_id].get('extract', '')
|
| 710 |
+
|
| 711 |
+
if not content:
|
| 712 |
+
result = f"Wikipedia page found but content empty: '{page_title}'"
|
| 713 |
+
search_cache.put(cache_key, result)
|
| 714 |
+
telemetry.record_call("wikipedia_search", time.time() - start_time, True)
|
| 715 |
+
return result
|
| 716 |
+
|
| 717 |
+
print(f" Retrieved {len(content)} chars")
|
| 718 |
+
|
| 719 |
+
# Format result
|
| 720 |
+
result = f"Wikipedia: {page_title}\n"
|
| 721 |
+
result += f"URL: {page_url}\n\n"
|
| 722 |
+
result += content
|
| 723 |
+
result = truncate_if_needed(result, max_length=12000) # Allow more for Wikipedia
|
| 724 |
+
|
| 725 |
+
# Cache result
|
| 726 |
+
search_cache.put(cache_key, result)
|
| 727 |
+
|
| 728 |
+
telemetry.record_call("wikipedia_search", time.time() - start_time, True)
|
| 729 |
+
return result
|
| 730 |
+
|
| 731 |
+
except Exception as e:
|
| 732 |
+
telemetry.record_call("wikipedia_search", time.time() - start_time, False)
|
| 733 |
+
raise ToolError("wikipedia_search", e, "Try a more specific search term")
|
| 734 |
+
|
| 735 |
+
|
| 736 |
class SearchInput(BaseModel):
|
| 737 |
query: str = Field(description="Search query (concise)")
|
| 738 |
|
|
|
|
| 864 |
raise ToolError("code_interpreter", e, "Check code syntax")
|
| 865 |
|
| 866 |
|
| 867 |
+
class AnalyzeDataInput(BaseModel):
|
| 868 |
+
file_path: str = Field(description="Path to CSV or Excel file")
|
| 869 |
+
question: str = Field(description="What to find (e.g., 'count rows where year > 2000')")
|
| 870 |
+
|
| 871 |
+
@tool(args_schema=AnalyzeDataInput)
|
| 872 |
+
def analyze_data_file(file_path: str, question: str) -> str:
|
| 873 |
+
"""
|
| 874 |
+
Analyze CSV/Excel files with automatic data profiling.
|
| 875 |
+
|
| 876 |
+
Generates Python code to answer questions about data files.
|
| 877 |
+
Better than code_interpreter alone because it:
|
| 878 |
+
1. Profiles the data first (columns, types, sample)
|
| 879 |
+
2. Generates appropriate pandas code
|
| 880 |
+
3. Handles common data issues (encoding, missing values)
|
| 881 |
+
|
| 882 |
+
Use for questions like:
|
| 883 |
+
- "How many rows have X?"
|
| 884 |
+
- "What's the sum/average of column Y?"
|
| 885 |
+
- "Count items grouped by Z"
|
| 886 |
+
"""
|
| 887 |
+
start_time = time.time()
|
| 888 |
+
|
| 889 |
+
try:
|
| 890 |
+
print(f"π Analyzing data file: {file_path}")
|
| 891 |
+
print(f" Question: {question[:100]}...")
|
| 892 |
+
|
| 893 |
+
# Find file
|
| 894 |
+
data_file = find_file(file_path)
|
| 895 |
+
if not data_file:
|
| 896 |
+
raise FileNotFoundError(f"Data file not found: {file_path}")
|
| 897 |
+
|
| 898 |
+
file_ext = data_file.suffix.lower()
|
| 899 |
+
|
| 900 |
+
if file_ext not in ['.csv', '.xlsx', '.xls', '.tsv']:
|
| 901 |
+
raise ValueError(f"Unsupported file type: {file_ext}. Use .csv, .xlsx, .xls, or .tsv")
|
| 902 |
+
|
| 903 |
+
print(f" File type: {file_ext}")
|
| 904 |
+
|
| 905 |
+
# Generate profiling code
|
| 906 |
+
profiling_code = f"""
|
| 907 |
+
import pandas as pd
|
| 908 |
+
import numpy as np
|
| 909 |
+
|
| 910 |
+
# Load file
|
| 911 |
+
file_path = r"{data_file}"
|
| 912 |
+
"""
|
| 913 |
+
|
| 914 |
+
if file_ext == '.csv':
|
| 915 |
+
profiling_code += """
|
| 916 |
+
# Try different encodings
|
| 917 |
+
for encoding in ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252']:
|
| 918 |
+
try:
|
| 919 |
+
df = pd.read_csv(file_path, encoding=encoding)
|
| 920 |
+
break
|
| 921 |
+
except:
|
| 922 |
+
continue
|
| 923 |
+
"""
|
| 924 |
+
elif file_ext == '.tsv':
|
| 925 |
+
profiling_code += """
|
| 926 |
+
df = pd.read_csv(file_path, sep='\\t', encoding='utf-8')
|
| 927 |
+
"""
|
| 928 |
+
else: # Excel
|
| 929 |
+
profiling_code += """
|
| 930 |
+
df = pd.read_excel(file_path)
|
| 931 |
+
"""
|
| 932 |
+
|
| 933 |
+
profiling_code += """
|
| 934 |
+
# Profile data
|
| 935 |
+
print("=" * 60)
|
| 936 |
+
print("DATA PROFILE")
|
| 937 |
+
print("=" * 60)
|
| 938 |
+
print(f"Shape: {df.shape[0]} rows Γ {df.shape[1]} columns")
|
| 939 |
+
print(f"\\nColumns: {', '.join(df.columns.tolist())}")
|
| 940 |
+
print(f"\\nData types:")
|
| 941 |
+
print(df.dtypes)
|
| 942 |
+
print(f"\\nFirst 3 rows:")
|
| 943 |
+
print(df.head(3))
|
| 944 |
+
print(f"\\nMissing values:")
|
| 945 |
+
print(df.isnull().sum())
|
| 946 |
+
"""
|
| 947 |
+
|
| 948 |
+
# Execute profiling
|
| 949 |
+
print(f" Profiling data...")
|
| 950 |
+
output_stream = io.StringIO()
|
| 951 |
+
error_stream = io.StringIO()
|
| 952 |
+
|
| 953 |
+
with contextlib.redirect_stdout(output_stream), contextlib.redirect_stderr(error_stream):
|
| 954 |
+
exec(profiling_code, {"pd": pd, "np": np, "__builtins__": __builtins__})
|
| 955 |
+
|
| 956 |
+
profile_output = output_stream.getvalue()
|
| 957 |
+
|
| 958 |
+
if error_stream.getvalue():
|
| 959 |
+
raise RuntimeError(f"Profiling failed: {error_stream.getvalue()}")
|
| 960 |
+
|
| 961 |
+
print(f" Profiling complete")
|
| 962 |
+
print(profile_output[:500] + "..." if len(profile_output) > 500 else profile_output)
|
| 963 |
+
|
| 964 |
+
# Now generate analysis code based on question
|
| 965 |
+
analysis_code = profiling_code + f"""
|
| 966 |
+
|
| 967 |
+
# Analysis for: {question}
|
| 968 |
+
print("\\n" + "=" * 60)
|
| 969 |
+
print("ANALYSIS RESULT")
|
| 970 |
+
print("=" * 60)
|
| 971 |
+
|
| 972 |
+
"""
|
| 973 |
+
|
| 974 |
+
# Add intelligent code based on question keywords
|
| 975 |
+
q_lower = question.lower()
|
| 976 |
+
|
| 977 |
+
if 'count' in q_lower or 'how many' in q_lower:
|
| 978 |
+
if 'where' in q_lower or 'with' in q_lower:
|
| 979 |
+
analysis_code += """
|
| 980 |
+
# Count rows matching condition
|
| 981 |
+
# NOTE: Adjust the filter condition based on your needs
|
| 982 |
+
result = len(df) # Total count
|
| 983 |
+
print(f"Total rows: {result}")
|
| 984 |
+
|
| 985 |
+
# Example filters (uncomment and modify as needed):
|
| 986 |
+
# result = len(df[df['column'] > value])
|
| 987 |
+
# result = len(df[df['column'].str.contains('text', na=False)])
|
| 988 |
+
"""
|
| 989 |
+
else:
|
| 990 |
+
analysis_code += """
|
| 991 |
+
result = len(df)
|
| 992 |
+
print(f"Total rows: {result}")
|
| 993 |
+
"""
|
| 994 |
+
|
| 995 |
+
elif 'sum' in q_lower or 'total' in q_lower:
|
| 996 |
+
analysis_code += """
|
| 997 |
+
# Sum a numeric column
|
| 998 |
+
# NOTE: Replace 'column_name' with actual column
|
| 999 |
+
# result = df['column_name'].sum()
|
| 1000 |
+
# print(f"Sum: {result}")
|
| 1001 |
+
"""
|
| 1002 |
+
|
| 1003 |
+
elif 'average' in q_lower or 'mean' in q_lower:
|
| 1004 |
+
analysis_code += """
|
| 1005 |
+
# Average of a column
|
| 1006 |
+
# result = df['column_name'].mean()
|
| 1007 |
+
# print(f"Average: {result}")
|
| 1008 |
+
"""
|
| 1009 |
+
|
| 1010 |
+
elif 'group' in q_lower or 'by' in q_lower:
|
| 1011 |
+
analysis_code += """
|
| 1012 |
+
# Group by and count
|
| 1013 |
+
# result = df.groupby('column_name').size()
|
| 1014 |
+
# print(result)
|
| 1015 |
+
"""
|
| 1016 |
+
|
| 1017 |
+
else:
|
| 1018 |
+
# Generic: show summary
|
| 1019 |
+
analysis_code += """
|
| 1020 |
+
# Summary statistics
|
| 1021 |
+
print(df.describe())
|
| 1022 |
+
"""
|
| 1023 |
+
|
| 1024 |
+
result = f"""Data Profile:
|
| 1025 |
+
{profile_output}
|
| 1026 |
+
|
| 1027 |
+
Generated Analysis Code:
|
| 1028 |
+
```python
|
| 1029 |
+
{analysis_code}
|
| 1030 |
+
```
|
| 1031 |
+
|
| 1032 |
+
**IMPORTANT**: The code above needs column names adjusted.
|
| 1033 |
+
Use code_interpreter() with the corrected code to get the answer.
|
| 1034 |
+
|
| 1035 |
+
Columns available: {", ".join(pd.read_csv(data_file) if file_ext == '.csv' else pd.read_excel(data_file)).columns.tolist()}
|
| 1036 |
+
"""
|
| 1037 |
+
|
| 1038 |
+
telemetry.record_call("analyze_data_file", time.time() - start_time, True)
|
| 1039 |
+
return truncate_if_needed(result)
|
| 1040 |
+
|
| 1041 |
+
except Exception as e:
|
| 1042 |
+
telemetry.record_call("analyze_data_file", time.time() - start_time, False)
|
| 1043 |
+
raise ToolError("analyze_data_file", e, "Check file path and format")
|
| 1044 |
+
|
| 1045 |
+
|
| 1046 |
class ReadFileInput(BaseModel):
|
| 1047 |
path: str = Field(description="File path")
|
| 1048 |
|
|
|
|
| 1268 |
raise ToolError("get_youtube_transcript", e)
|
| 1269 |
|
| 1270 |
|
| 1271 |
+
class BrowseInput(BaseModel):
|
| 1272 |
+
start_url: str = Field(description="Starting URL (http:// or https://)")
|
| 1273 |
+
goal: str = Field(description="What you're trying to find (e.g., 'Mercedes Sosa albums 2000-2009')")
|
| 1274 |
+
max_steps: int = Field(description="Max pages to visit (1-5)", default=3)
|
| 1275 |
+
|
| 1276 |
+
@tool(args_schema=BrowseInput)
|
| 1277 |
+
@retry_with_backoff(max_retries=2)
|
| 1278 |
+
def iterative_web_browser(start_url: str, goal: str, max_steps: int = 3) -> str:
|
| 1279 |
+
"""
|
| 1280 |
+
Multi-turn web browsing - follows links iteratively to find information.
|
| 1281 |
+
|
| 1282 |
+
Use when:
|
| 1283 |
+
- Information requires navigating through multiple pages
|
| 1284 |
+
- Need to follow "Read more" or "Details" links
|
| 1285 |
+
- Example: "Find Mercedes Sosa's discography, then count 2000-2009 albums"
|
| 1286 |
+
|
| 1287 |
+
This tool:
|
| 1288 |
+
1. Visits start_url
|
| 1289 |
+
2. Searches content for goal-related info
|
| 1290 |
+
3. Extracts relevant links
|
| 1291 |
+
4. Follows most promising link
|
| 1292 |
+
5. Repeats until info found or max_steps reached
|
| 1293 |
+
|
| 1294 |
+
Better than scrape_and_retrieve when single page doesn't have complete info.
|
| 1295 |
+
"""
|
| 1296 |
+
start_time = time.time()
|
| 1297 |
+
|
| 1298 |
+
try:
|
| 1299 |
+
if not rag_manager.is_ready():
|
| 1300 |
+
rag_manager.initialize()
|
| 1301 |
+
|
| 1302 |
+
print(f"π Iterative browsing starting at: {start_url}")
|
| 1303 |
+
print(f" Goal: {goal[:100]}...")
|
| 1304 |
+
print(f" Max steps: {max_steps}")
|
| 1305 |
+
|
| 1306 |
+
visited_urls = set()
|
| 1307 |
+
current_url = start_url
|
| 1308 |
+
all_findings = []
|
| 1309 |
+
|
| 1310 |
+
headers = {
|
| 1311 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
| 1312 |
+
}
|
| 1313 |
+
|
| 1314 |
+
for step in range(max_steps):
|
| 1315 |
+
if current_url in visited_urls:
|
| 1316 |
+
print(f" Step {step+1}: Already visited, stopping")
|
| 1317 |
+
break
|
| 1318 |
+
|
| 1319 |
+
visited_urls.add(current_url)
|
| 1320 |
+
print(f" Step {step+1}: Visiting {current_url}")
|
| 1321 |
+
|
| 1322 |
+
try:
|
| 1323 |
+
response = requests.get(current_url, headers=headers, timeout=15)
|
| 1324 |
+
response.raise_for_status()
|
| 1325 |
+
|
| 1326 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
| 1327 |
+
|
| 1328 |
+
# Remove noise
|
| 1329 |
+
for tag in soup(["script", "style", "nav", "footer", "aside", "header", "iframe"]):
|
| 1330 |
+
tag.extract()
|
| 1331 |
+
|
| 1332 |
+
# Extract main content
|
| 1333 |
+
main = soup.find('main') or soup.find('article') or soup.find('div', class_='mw-parser-output') or soup.body
|
| 1334 |
+
|
| 1335 |
+
if not main:
|
| 1336 |
+
print(f" No main content found")
|
| 1337 |
+
continue
|
| 1338 |
+
|
| 1339 |
+
text = main.get_text(separator='\n', strip=True)
|
| 1340 |
+
lines = [l.strip() for l in text.splitlines() if l.strip()]
|
| 1341 |
+
text = '\n'.join(lines)
|
| 1342 |
+
|
| 1343 |
+
print(f" Extracted {len(text)} chars")
|
| 1344 |
+
|
| 1345 |
+
# Search for goal-related content
|
| 1346 |
+
chunks = rag_manager.text_splitter.split_text(text)
|
| 1347 |
+
docs = [Document(page_content=c, metadata={"source": current_url, "step": step+1}) for c in chunks]
|
| 1348 |
+
|
| 1349 |
+
db = FAISS.from_documents(docs, rag_manager.embeddings)
|
| 1350 |
+
retriever = db.as_retriever(search_kwargs={"k": 3})
|
| 1351 |
+
retrieved = retriever.invoke(goal)
|
| 1352 |
+
|
| 1353 |
+
# Clean up
|
| 1354 |
+
del db
|
| 1355 |
+
del retriever
|
| 1356 |
+
import gc
|
| 1357 |
+
gc.collect()
|
| 1358 |
+
|
| 1359 |
+
if retrieved:
|
| 1360 |
+
print(f" Found {len(retrieved)} relevant chunks")
|
| 1361 |
+
for i, doc in enumerate(retrieved):
|
| 1362 |
+
all_findings.append({
|
| 1363 |
+
'step': step + 1,
|
| 1364 |
+
'url': current_url,
|
| 1365 |
+
'content': doc.page_content
|
| 1366 |
+
})
|
| 1367 |
+
|
| 1368 |
+
# Extract links for next step
|
| 1369 |
+
if step < max_steps - 1:
|
| 1370 |
+
links = []
|
| 1371 |
+
for a in main.find_all('a', href=True):
|
| 1372 |
+
href = a.get('href')
|
| 1373 |
+
text = a.get_text(strip=True).lower()
|
| 1374 |
+
|
| 1375 |
+
# Make absolute URL
|
| 1376 |
+
if href.startswith('/'):
|
| 1377 |
+
from urllib.parse import urljoin
|
| 1378 |
+
href = urljoin(current_url, href)
|
| 1379 |
+
|
| 1380 |
+
# Filter relevant links
|
| 1381 |
+
goal_keywords = goal.lower().split()
|
| 1382 |
+
if any(keyword in href.lower() or keyword in text for keyword in goal_keywords):
|
| 1383 |
+
if href.startswith('http') and href not in visited_urls:
|
| 1384 |
+
links.append((href, text))
|
| 1385 |
+
|
| 1386 |
+
if links:
|
| 1387 |
+
# Pick most relevant link
|
| 1388 |
+
current_url = links[0][0]
|
| 1389 |
+
print(f" Found {len(links)} potential links, following: {links[0][1][:50]}")
|
| 1390 |
+
else:
|
| 1391 |
+
print(f" No more relevant links found")
|
| 1392 |
+
break
|
| 1393 |
+
else:
|
| 1394 |
+
print(f" Max steps reached")
|
| 1395 |
+
break
|
| 1396 |
+
|
| 1397 |
+
except Exception as e:
|
| 1398 |
+
print(f" Error on step {step+1}: {e}")
|
| 1399 |
+
break
|
| 1400 |
+
|
| 1401 |
+
# Compile findings
|
| 1402 |
+
if not all_findings:
|
| 1403 |
+
result = f"Browsed {len(visited_urls)} pages but found no relevant information for: '{goal}'"
|
| 1404 |
+
else:
|
| 1405 |
+
result = f"Information gathered from {len(visited_urls)} pages:\n\n"
|
| 1406 |
+
for finding in all_findings:
|
| 1407 |
+
result += f"[Step {finding['step']} - {finding['url']}]\n{finding['content']}\n\n---\n\n"
|
| 1408 |
+
result = truncate_if_needed(result)
|
| 1409 |
+
|
| 1410 |
+
telemetry.record_call("iterative_web_browser", time.time() - start_time, True)
|
| 1411 |
+
return result
|
| 1412 |
+
|
| 1413 |
+
except Exception as e:
|
| 1414 |
+
telemetry.record_call("iterative_web_browser", time.time() - start_time, False)
|
| 1415 |
+
raise ToolError("iterative_web_browser", e, "Try starting from a more specific URL")
|
| 1416 |
+
|
| 1417 |
+
|
| 1418 |
class ScrapeInput(BaseModel):
    """Argument schema for the scrape-and-retrieve tool.

    Attributes:
        url: Fully-qualified target URL to fetch (must include the scheme).
        query: The specific piece of information to locate within the page.
    """
    url: str = Field(description="URL (http:// or https://)")
    query: str = Field(description="Specific info to find")
|
|
|
|
| 1522 |
# =============================================================================
# TOOLS LIST
# =============================================================================
# Registry of every tool bound to the agent's LLM. Order mirrors the intended
# workflow: plan/reflect first, then gather information, then compute, then
# hand in the result via final_answer_tool.
defined_tools = [
    # Planning & Reflection
    think_through_logic,
    create_plan,
    reflect_on_progress,
    validate_answer,

    # Search & Browse
    wikipedia_search,       # NEW: Better for encyclopedic queries
    search_tool,
    iterative_web_browser,  # NEW: Multi-turn web navigation
    scrape_and_retrieve,

    # Core computation
    calculator,
    code_interpreter,

    # File operations
    read_file,
    write_file,
    list_directory,
    analyze_data_file,      # NEW: Smart CSV/Excel analysis

    # Specialized
    audio_transcription_tool,
    analyze_image,
    get_youtube_transcript,

    # Final
    final_answer_tool,
]
|
| 1555 |
|
|
|
|
| 1715 |
tool_desc_list.append(desc)
|
| 1716 |
tool_descriptions = "\n".join(tool_desc_list)
|
| 1717 |
|
| 1718 |
+
self.system_prompt = f"""You are an elite AI agent for GAIA benchmark. Your ONLY job: provide the EXACT answer requested.
|
| 1719 |
+
|
| 1720 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1721 |
+
β οΈ ABSOLUTE RULES - VIOLATE THESE AND YOU FAIL:
|
| 1722 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1723 |
+
|
| 1724 |
+
1. **EVERY TURN MUST CALL EXACTLY ONE TOOL** - No exceptions
|
| 1725 |
+
2. **NEVER OUTPUT REASONING TEXT WITHOUT A TOOL CALL** - You will fail
|
| 1726 |
+
3. **IDENTIFY QUESTION TYPE FIRST** - Logic? Factual? Data? Math?
|
| 1727 |
+
4. **ALWAYS VALIDATE**: Call validate_answer() before final_answer_tool()
|
| 1728 |
+
5. **FINAL ANSWER FORMAT**: EXACTLY what was asked. NO "The answer is..." or explanations
|
| 1729 |
|
| 1730 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1731 |
+
π QUESTION TYPE β TOOL SEQUENCE:
|
| 1732 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1733 |
|
| 1734 |
+
**LOGIC PUZZLES** (No web search needed):
|
| 1735 |
+
β think_through_logic β calculator (if math) β validate β final_answer
|
| 1736 |
+
|
| 1737 |
+
**FACTUAL/BIOGRAPHICAL** (Need web):
|
| 1738 |
+
β wikipedia_search (if person/place/thing) β validate β final_answer
|
| 1739 |
+
OR search_tool β scrape_and_retrieve β validate β final_answer
|
| 1740 |
+
|
| 1741 |
+
**COUNTING FROM WEB** (Need full page content):
|
| 1742 |
+
β wikipedia_search (if Wikipedia topic) β validate β final_answer
|
| 1743 |
+
OR iterative_web_browser (if needs navigation) β validate β final_answer
|
| 1744 |
+
|
| 1745 |
+
**DATA FILES** (CSV/Excel):
|
| 1746 |
+
β list_directory β analyze_data_file β code_interpreter β validate β final_answer
|
| 1747 |
+
|
| 1748 |
+
**IMAGES** (Chess, diagrams, photos):
|
| 1749 |
+
β analyze_image β validate β final_answer
|
| 1750 |
+
|
| 1751 |
+
**AUDIO FILES**:
|
| 1752 |
+
β audio_transcription_tool β validate β final_answer
|
| 1753 |
+
|
| 1754 |
+
**MATH CALCULATIONS**:
|
| 1755 |
+
β calculator β validate β final_answer
|
| 1756 |
|
| 1757 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1758 |
+
π― CRITICAL TOOL USAGE PATTERNS:
|
| 1759 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1760 |
+
|
| 1761 |
+
**For Counting Questions:**
|
| 1762 |
+
BAD: search_tool("Mercedes Sosa albums") β snippets only
|
| 1763 |
+
GOOD: wikipedia_search("Mercedes Sosa") β full discography section
|
| 1764 |
+
|
| 1765 |
+
**For Multi-Step Web Questions:**
|
| 1766 |
+
BAD: scrape_and_retrieve("https://...") β single page only
|
| 1767 |
+
GOOD: iterative_web_browser("https://...", "find X", max_steps=3)
|
| 1768 |
+
|
| 1769 |
+
**For Data Questions:**
|
| 1770 |
+
BAD: read_file("data.csv") β raw text dump
|
| 1771 |
+
GOOD: analyze_data_file("data.csv", "count rows where X > Y")
|
| 1772 |
+
|
| 1773 |
+
**For Validation:**
|
| 1774 |
+
ALWAYS: validate_answer("your answer", "original question")
|
| 1775 |
+
THEN: final_answer_tool("your answer")
|
| 1776 |
+
|
| 1777 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1778 |
+
π AVAILABLE TOOLS:
|
| 1779 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1780 |
|
| 1781 |
{tool_descriptions}
|
| 1782 |
|
| 1783 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1784 |
+
β‘ EXECUTION RULES:
|
| 1785 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1786 |
|
| 1787 |
+
- Text without tool call = FAILURE
|
| 1788 |
+
- Unsure? β think_through_logic() to organize thoughts
|
| 1789 |
+
- After EVERY tool result: "Do I have the answer? β validate β submit"
|
| 1790 |
- Stuck after 3 turns? β reflect_on_progress()
|
| 1791 |
+
- For Wikipedia topics β ALWAYS use wikipedia_search, NOT search_tool
|
| 1792 |
+
- For counting from web β Use wikipedia_search or iterative_web_browser
|
| 1793 |
+
- For data files β Use analyze_data_file, NOT just read_file
|
| 1794 |
+
|
| 1795 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1796 |
+
π EXAMPLES OF PERFECT EXECUTION:
|
| 1797 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1798 |
+
|
| 1799 |
+
Example 1: "How many studio albums did Mercedes Sosa release 2000-2009?"
|
| 1800 |
+
Turn 1: wikipedia_search("Mercedes Sosa")
|
| 1801 |
+
β Gets full discography with all albums and years
|
| 1802 |
+
Turn 2: code_interpreter("count albums 2000-2009 from text")
|
| 1803 |
+
β Result: 3
|
| 1804 |
+
Turn 3: validate_answer("3", "How many studio albums...")
|
| 1805 |
+
β β
PASSED
|
| 1806 |
+
Turn 4: final_answer_tool("3")
|
| 1807 |
+
|
| 1808 |
+
Example 2: "What's the population of Einstein's birthplace in 1900?"
|
| 1809 |
+
Turn 1: wikipedia_search("Albert Einstein")
|
| 1810 |
+
β Birthplace: Ulm, Germany
|
| 1811 |
+
Turn 2: search_tool("Ulm Germany population 1900")
|
| 1812 |
+
β Find sources
|
| 1813 |
+
Turn 3: scrape_and_retrieve("url", "population 1900")
|
| 1814 |
+
β ~50,000
|
| 1815 |
+
Turn 4: validate_answer("50000", "population 1900")
|
| 1816 |
+
β β
PASSED
|
| 1817 |
+
Turn 5: final_answer_tool("50000")
|
| 1818 |
+
|
| 1819 |
+
Example 3: Logic puzzle
|
| 1820 |
+
Turn 1: think_through_logic("Work through the logic...")
|
| 1821 |
+
β Reasoning recorded
|
| 1822 |
+
Turn 2: calculator("30") [if calculation needed]
|
| 1823 |
+
β 30
|
| 1824 |
+
Turn 3: validate_answer("30", "coin puzzle")
|
| 1825 |
+
β β
PASSED
|
| 1826 |
+
Turn 4: final_answer_tool("30")
|
| 1827 |
+
|
| 1828 |
+
βββββββββββββοΏ½οΏ½βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1829 |
+
REMEMBER: One tool per turn. No reasoning without tools. Exact answer format.
|
| 1830 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1831 |
"""
|
| 1832 |
|
| 1833 |
+
# Initialize LLMs (Groq primary, Claude fallback)
|
| 1834 |
+
print("Initializing LLMs...")
|
| 1835 |
+
|
| 1836 |
+
# Primary: Groq (fast, free)
|
| 1837 |
+
self.groq_llm = ChatGroq(
|
| 1838 |
temperature=0,
|
| 1839 |
groq_api_key=GROQ_API_KEY,
|
| 1840 |
model_name="llama-3.3-70b-versatile",
|
|
|
|
| 1842 |
timeout=60
|
| 1843 |
).bind_tools(self.tools, tool_choice="auto")
|
| 1844 |
|
| 1845 |
+
# Fallback: Claude (slower, more reliable)
|
| 1846 |
+
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
|
| 1847 |
+
if ANTHROPIC_API_KEY:
|
| 1848 |
+
from langchain_anthropic import ChatAnthropic
|
| 1849 |
+
self.claude_llm = ChatAnthropic(
|
| 1850 |
+
model="claude-sonnet-4-20250514",
|
| 1851 |
+
anthropic_api_key=ANTHROPIC_API_KEY,
|
| 1852 |
+
temperature=0,
|
| 1853 |
+
max_tokens=4096
|
| 1854 |
+
).bind_tools(self.tools, tool_choice="auto")
|
| 1855 |
+
print("β
Both Groq and Claude initialized")
|
| 1856 |
+
else:
|
| 1857 |
+
self.claude_llm = None
|
| 1858 |
+
print("β
Groq initialized (Claude fallback unavailable)")
|
| 1859 |
+
|
| 1860 |
+
# Start with Groq
|
| 1861 |
+
self.llm_with_tools = self.groq_llm
|
| 1862 |
+
self.current_llm = "groq"
|
| 1863 |
|
| 1864 |
# Build agent graph
|
| 1865 |
def agent_node(state: AgentState):
|
|
|
|
| 1896 |
messages_to_send.append(hint)
|
| 1897 |
print("π€ Reflection hint")
|
| 1898 |
|
| 1899 |
+
# Invoke LLM with retries and fallback
|
| 1900 |
ai_message = None
|
| 1901 |
|
| 1902 |
for attempt in range(config.MAX_RETRIES):
|
|
|
|
| 1909 |
print(f"β οΈ No tool calls (attempt {attempt+1})")
|
| 1910 |
|
| 1911 |
except Exception as e:
|
| 1912 |
+
error_str = str(e)
|
| 1913 |
+
print(f"β οΈ {self.current_llm.upper()} error (attempt {attempt+1}): {error_str[:200]}")
|
| 1914 |
+
|
| 1915 |
+
# If Groq fails and we have Claude, switch to Claude
|
| 1916 |
+
if self.current_llm == "groq" and self.claude_llm and attempt == config.MAX_RETRIES - 1:
|
| 1917 |
+
print("π Switching from Groq to Claude for this question...")
|
| 1918 |
+
self.llm_with_tools = self.claude_llm
|
| 1919 |
+
self.current_llm = "claude"
|
| 1920 |
+
try:
|
| 1921 |
+
ai_message = self.llm_with_tools.invoke(messages_to_send)
|
| 1922 |
+
if ai_message.tool_calls:
|
| 1923 |
+
break
|
| 1924 |
+
except Exception as e2:
|
| 1925 |
+
print(f"β οΈ Claude also failed: {e2}")
|
| 1926 |
|
| 1927 |
if attempt == config.MAX_RETRIES - 1:
|
| 1928 |
print("π¨ Forcing think_through_logic")
|
|
|
|
| 2059 |
"last_tool_was_thinking": False
|
| 2060 |
}
|
| 2061 |
|
| 2062 |
+
# Reset to Groq for each question
|
| 2063 |
+
if self.groq_llm:
|
| 2064 |
+
self.llm_with_tools = self.groq_llm
|
| 2065 |
+
self.current_llm = "groq"
|
| 2066 |
+
|
| 2067 |
final_answer = "AGENT FAILED"
|
| 2068 |
all_messages = []
|
| 2069 |
|
|
|
|
| 2108 |
break
|
| 2109 |
break
|
| 2110 |
|
| 2111 |
+
# Clean answer more aggressively
|
| 2112 |
cleaned = str(final_answer).strip()
|
| 2113 |
|
| 2114 |
+
# Remove common prefixes (case-insensitive)
|
| 2115 |
prefixes = [
|
| 2116 |
"the answer is:", "here is the answer:", "based on",
|
| 2117 |
"final answer:", "answer:", "the final answer is:",
|
| 2118 |
"my answer is:", "according to", "i found that",
|
| 2119 |
+
"the result is:", "result:", "here's the answer:",
|
| 2120 |
+
"after analysis:", "the correct answer is:",
|
| 2121 |
+
"from the data:", "from the search:",
|
| 2122 |
]
|
| 2123 |
for prefix in prefixes:
|
| 2124 |
if cleaned.lower().startswith(prefix.lower()):
|
|
|
|
| 2130 |
# Remove code fences
|
| 2131 |
cleaned = remove_fences_simple(cleaned)
|
| 2132 |
|
| 2133 |
+
# Remove backticks
|
| 2134 |
while cleaned.startswith("`") and cleaned.endswith("`"):
|
| 2135 |
cleaned = cleaned[1:-1].strip()
|
| 2136 |
|
| 2137 |
+
# Remove quotes (but only if they wrap entire answer)
|
| 2138 |
if (cleaned.startswith('"') and cleaned.endswith('"')) or \
|
| 2139 |
(cleaned.startswith("'") and cleaned.endswith("'")):
|
| 2140 |
cleaned = cleaned[1:-1].strip()
|
| 2141 |
|
| 2142 |
+
# Remove trailing period for short answers
|
| 2143 |
if cleaned.endswith('.') and len(cleaned.split()) < 10:
|
| 2144 |
cleaned = cleaned[:-1]
|
| 2145 |
|
| 2146 |
+
# Remove markdown bold/italic
|
| 2147 |
+
cleaned = cleaned.replace('**', '').replace('__', '').replace('*', '').replace('_', '')
|
| 2148 |
+
|
| 2149 |
+
# Remove bullet points
|
| 2150 |
+
if cleaned.startswith(('- ', '* ', 'β’ ')):
|
| 2151 |
+
cleaned = cleaned[2:].strip()
|
| 2152 |
+
|
| 2153 |
+
# Remove numbered list prefix
|
| 2154 |
+
import re
|
| 2155 |
+
cleaned = re.sub(r'^\d+\.\s+', '', cleaned)
|
| 2156 |
+
|
| 2157 |
+
# Final whitespace cleanup
|
| 2158 |
+
cleaned = ' '.join(cleaned.split())
|
| 2159 |
+
|
| 2160 |
print(f"\nπ RETURNING: {cleaned}\n")
|
| 2161 |
|
| 2162 |
return cleaned
|