Spaces:
Sleeping
Sleeping
Chris
commited on
Commit
·
6c60f72
1
Parent(s):
5ec1e1b
Final 7.1.3
Browse files
src/agents/__pycache__/web_researcher.cpython-310.pyc
CHANGED
|
Binary files a/src/agents/__pycache__/web_researcher.cpython-310.pyc and b/src/agents/__pycache__/web_researcher.cpython-310.pyc differ
|
|
|
src/agents/web_researcher.py
CHANGED
|
@@ -413,90 +413,114 @@ class WebResearchAgent:
|
|
| 413 |
|
| 414 |
return ' '.join(topic_words[:3]) if topic_words else "topic"
|
| 415 |
|
| 416 |
-
def _extract_search_terms(self, question: str) -> str:
|
| 417 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 418 |
|
| 419 |
-
#
|
| 420 |
-
|
|
|
|
| 421 |
|
| 422 |
-
#
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
#
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
# Extract
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
years = re.findall(r'\b(19|20)\d{2}\b', question)
|
| 440 |
-
numbers = re.findall(r'\b\d+\b', question)
|
| 441 |
-
|
| 442 |
-
# Remove very common stop words and question patterns
|
| 443 |
-
stop_patterns = [
|
| 444 |
-
r'\b(?:what|who|when|where|why|how|is|are|was|were|do|does|did|can|could|would|should|will)\b',
|
| 445 |
-
r'\b(?:the|a|an|and|or|but|in|on|at|to|for|of|with|by|from|about)\b',
|
| 446 |
-
r'\b(?:please|could|you|tell|me|find|search|for|give|provide|list|show)\b',
|
| 447 |
-
r'\b(?:information|details|data|facts|answer)\b',
|
| 448 |
-
r'[?.,!]+', # Punctuation
|
| 449 |
-
]
|
| 450 |
|
| 451 |
-
#
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
# Extract remaining meaningful words
|
| 457 |
-
words = clean_question.split()
|
| 458 |
-
meaningful_words = []
|
| 459 |
-
|
| 460 |
-
for word in words:
|
| 461 |
-
word = word.strip()
|
| 462 |
-
if len(word) > 2 and word.isalpha(): # Only alphabetic words longer than 2 chars
|
| 463 |
-
meaningful_words.append(word)
|
| 464 |
|
| 465 |
-
# Build search terms
|
| 466 |
search_terms = []
|
| 467 |
|
| 468 |
-
# Add
|
| 469 |
-
|
| 470 |
-
if len(' '.join(search_terms + [noun])) <= 100: # Conservative length limit
|
| 471 |
-
search_terms.append(noun)
|
| 472 |
|
| 473 |
-
# Add
|
| 474 |
-
|
| 475 |
-
if len(' '.join(search_terms + [year])) <= 100:
|
| 476 |
-
search_terms.append(year)
|
| 477 |
|
| 478 |
-
# Add
|
| 479 |
-
for word in
|
| 480 |
-
|
| 481 |
-
if len(potential_query) <= 100: # Keep well under 250 char limit
|
| 482 |
search_terms.append(word)
|
| 483 |
-
else:
|
| 484 |
-
break
|
| 485 |
|
| 486 |
-
#
|
| 487 |
-
|
| 488 |
-
# Take first few words of the original question
|
| 489 |
-
first_words = question.split()[:5] # First 5 words max
|
| 490 |
-
search_terms = [w for w in first_words if w.isalpha() and len(w) > 2]
|
| 491 |
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 497 |
|
| 498 |
-
logger.info(f"📝
|
| 499 |
-
return
|
| 500 |
|
| 501 |
def _extract_youtube_info(self, question: str) -> str:
|
| 502 |
"""Extract YouTube URL or search terms"""
|
|
@@ -578,53 +602,79 @@ class WebResearchAgent:
|
|
| 578 |
def _analyze_web_search_result(self, state: GAIAAgentState, web_result: ToolResult) -> AgentResult:
|
| 579 |
"""Analyze web search results"""
|
| 580 |
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
# Combine top results for analysis
|
| 584 |
-
combined_content = []
|
| 585 |
-
for i, result in enumerate(search_results[:3], 1):
|
| 586 |
-
combined_content.append(f"Result {i}: {result['title']}")
|
| 587 |
-
combined_content.append(f"URL: {result['url']}")
|
| 588 |
-
combined_content.append(f"Description: {result['snippet']}")
|
| 589 |
-
combined_content.append("")
|
| 590 |
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 617 |
else:
|
| 618 |
-
#
|
| 619 |
-
first_result = search_results[0] if search_results else {}
|
| 620 |
return AgentResult(
|
| 621 |
agent_role=AgentRole.WEB_RESEARCHER,
|
| 622 |
-
success=
|
| 623 |
-
result=
|
| 624 |
-
confidence=0.
|
| 625 |
-
reasoning="
|
| 626 |
tools_used=[web_result],
|
| 627 |
-
model_used="
|
| 628 |
processing_time=web_result.execution_time,
|
| 629 |
cost_estimate=0.0
|
| 630 |
)
|
|
|
|
| 413 |
|
| 414 |
return ' '.join(topic_words[:3]) if topic_words else "topic"
|
| 415 |
|
| 416 |
+
def _extract_search_terms(self, question: str, max_length: int = 100) -> str:
|
| 417 |
+
"""
|
| 418 |
+
Extract optimized search terms from question
|
| 419 |
+
Prioritizes important terms while staying under length limits
|
| 420 |
+
"""
|
| 421 |
|
| 422 |
+
# Clean the question first
|
| 423 |
+
clean_question = re.sub(r'[^\w\s\-]', ' ', question.lower())
|
| 424 |
+
words = clean_question.split()
|
| 425 |
|
| 426 |
+
# Remove common stop words but keep question words
|
| 427 |
+
stop_words = {
|
| 428 |
+
'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
|
| 429 |
+
'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
|
| 430 |
+
'should', 'may', 'might', 'must', 'shall', 'can', 'to', 'of', 'in',
|
| 431 |
+
'on', 'at', 'by', 'for', 'with', 'from', 'as', 'but', 'or', 'and',
|
| 432 |
+
'if', 'then', 'than', 'this', 'that', 'these', 'those', 'i', 'you',
|
| 433 |
+
'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'
|
| 434 |
+
}
|
| 435 |
+
|
| 436 |
+
# Keep important question words
|
| 437 |
+
question_words = {'who', 'what', 'when', 'where', 'why', 'how', 'which'}
|
| 438 |
+
|
| 439 |
+
# Priority terms (always include if present)
|
| 440 |
+
priority_terms = []
|
| 441 |
+
|
| 442 |
+
# Extract quoted phrases first
|
| 443 |
+
quoted_phrases = re.findall(r'"([^"]*)"', question)
|
| 444 |
+
for phrase in quoted_phrases:
|
| 445 |
+
if len(phrase.strip()) > 0:
|
| 446 |
+
priority_terms.append(phrase.strip())
|
| 447 |
+
|
| 448 |
+
# Extract proper nouns (capitalized words)
|
| 449 |
+
proper_nouns = []
|
| 450 |
+
for word in question.split():
|
| 451 |
+
clean_word = re.sub(r'[^\w]', '', word)
|
| 452 |
+
if clean_word and clean_word[0].isupper() and len(clean_word) > 1:
|
| 453 |
+
proper_nouns.append(clean_word)
|
| 454 |
+
|
| 455 |
+
# Extract years (4-digit numbers)
|
| 456 |
years = re.findall(r'\b(19|20)\d{2}\b', question)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 457 |
|
| 458 |
+
# Extract other important numbers (but not random ones)
|
| 459 |
+
important_numbers = re.findall(r'\b\d{1,4}\b', question)
|
| 460 |
+
# Filter out years and common numbers from important numbers to avoid duplication
|
| 461 |
+
common_numbers = {'19', '20', '1', '2', '3', '4', '5', '10'} # Filter out very common numbers
|
| 462 |
+
important_numbers = [num for num in important_numbers if num not in years and num not in common_numbers]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 463 |
|
| 464 |
+
# Build search terms with priority
|
| 465 |
search_terms = []
|
| 466 |
|
| 467 |
+
# Add quoted phrases (highest priority)
|
| 468 |
+
search_terms.extend(priority_terms)
|
|
|
|
|
|
|
| 469 |
|
| 470 |
+
# Add proper nouns (high priority)
|
| 471 |
+
search_terms.extend(proper_nouns[:5]) # Limit to avoid duplication
|
|
|
|
|
|
|
| 472 |
|
| 473 |
+
# Add question words if present
|
| 474 |
+
for word in words:
|
| 475 |
+
if word in question_words and word not in search_terms:
|
|
|
|
| 476 |
search_terms.append(word)
|
|
|
|
|
|
|
| 477 |
|
| 478 |
+
# Add years
|
| 479 |
+
search_terms.extend(years[:2]) # Limit to 2 years max
|
|
|
|
|
|
|
|
|
|
| 480 |
|
| 481 |
+
# Add other important terms
|
| 482 |
+
for word in words:
|
| 483 |
+
if (word not in stop_words and
|
| 484 |
+
word not in search_terms and
|
| 485 |
+
len(word) > 2 and
|
| 486 |
+
not word.isdigit()): # Avoid random numbers
|
| 487 |
+
search_terms.append(word)
|
| 488 |
+
|
| 489 |
+
# Stop if we have enough terms
|
| 490 |
+
if len(' '.join(search_terms)) > max_length - 20:
|
| 491 |
+
break
|
| 492 |
+
|
| 493 |
+
# Add a few important numbers if space allows
|
| 494 |
+
if len(' '.join(search_terms)) < max_length - 10:
|
| 495 |
+
search_terms.extend(important_numbers[:2])
|
| 496 |
+
|
| 497 |
+
# Join and clean up
|
| 498 |
+
search_query = ' '.join(search_terms)
|
| 499 |
+
|
| 500 |
+
# Remove duplicates while preserving order
|
| 501 |
+
seen = set()
|
| 502 |
+
unique_terms = []
|
| 503 |
+
for term in search_terms:
|
| 504 |
+
if term.lower() not in seen:
|
| 505 |
+
seen.add(term.lower())
|
| 506 |
+
unique_terms.append(term)
|
| 507 |
+
|
| 508 |
+
# Final cleanup and length check
|
| 509 |
+
final_query = ' '.join(unique_terms)
|
| 510 |
+
if len(final_query) > max_length:
|
| 511 |
+
# Truncate to fit
|
| 512 |
+
truncated_terms = []
|
| 513 |
+
current_length = 0
|
| 514 |
+
for term in unique_terms:
|
| 515 |
+
if current_length + len(term) + 1 <= max_length:
|
| 516 |
+
truncated_terms.append(term)
|
| 517 |
+
current_length += len(term) + 1
|
| 518 |
+
else:
|
| 519 |
+
break
|
| 520 |
+
final_query = ' '.join(truncated_terms)
|
| 521 |
|
| 522 |
+
logger.info(f"📝 Optimized search terms: '{final_query}' from question: '{question[:50]}...'")
|
| 523 |
+
return final_query
|
| 524 |
|
| 525 |
def _extract_youtube_info(self, question: str) -> str:
|
| 526 |
"""Extract YouTube URL or search terms"""
|
|
|
|
| 602 |
def _analyze_web_search_result(self, state: GAIAAgentState, web_result: ToolResult) -> AgentResult:
|
| 603 |
"""Analyze web search results"""
|
| 604 |
|
| 605 |
+
search_data = web_result.result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 606 |
|
| 607 |
+
# Handle new search result format
|
| 608 |
+
if search_data.get('success') and search_data.get('results'):
|
| 609 |
+
search_results = search_data['results']
|
| 610 |
+
|
| 611 |
+
# Convert WebSearchResult objects to dictionaries if needed
|
| 612 |
+
if search_results and hasattr(search_results[0], 'to_dict'):
|
| 613 |
+
search_results = [r.to_dict() for r in search_results]
|
| 614 |
+
|
| 615 |
+
# Combine top results for analysis
|
| 616 |
+
combined_content = []
|
| 617 |
+
for i, result in enumerate(search_results[:3], 1):
|
| 618 |
+
combined_content.append(f"Result {i}: {result.get('title', 'No title')}")
|
| 619 |
+
combined_content.append(f"URL: {result.get('url', 'No URL')}")
|
| 620 |
+
combined_content.append(f"Description: {result.get('snippet', result.get('content', 'No description'))[:200]}")
|
| 621 |
+
combined_content.append(f"Source: {result.get('source', 'Unknown')}")
|
| 622 |
+
combined_content.append("")
|
| 623 |
+
|
| 624 |
+
analysis_prompt = f"""
|
| 625 |
+
Based on these web search results, please answer the following question:
|
| 626 |
+
|
| 627 |
+
Question: {state.question}
|
| 628 |
+
|
| 629 |
+
Search Query: {search_data.get('query', 'N/A')}
|
| 630 |
+
Search Engine: {search_data.get('source', 'Unknown')}
|
| 631 |
+
Results Found: {search_data.get('count', len(search_results))}
|
| 632 |
+
|
| 633 |
+
Search Results:
|
| 634 |
+
{chr(10).join(combined_content)}
|
| 635 |
+
|
| 636 |
+
Please provide a direct answer based on the most relevant information.
|
| 637 |
+
"""
|
| 638 |
+
|
| 639 |
+
model_tier = ModelTier.COMPLEX # Use 72B model for better analysis
|
| 640 |
+
llm_result = self.llm_client.generate(analysis_prompt, tier=model_tier, max_tokens=400)
|
| 641 |
+
|
| 642 |
+
if llm_result.success:
|
| 643 |
+
return AgentResult(
|
| 644 |
+
agent_role=AgentRole.WEB_RESEARCHER,
|
| 645 |
+
success=True,
|
| 646 |
+
result=llm_result.response,
|
| 647 |
+
confidence=0.80, # Higher confidence with better model
|
| 648 |
+
reasoning=f"Analyzed {len(search_results)} web search results using {search_data.get('source', 'search engine')}",
|
| 649 |
+
tools_used=[web_result],
|
| 650 |
+
model_used=llm_result.model_used,
|
| 651 |
+
processing_time=web_result.execution_time + llm_result.response_time,
|
| 652 |
+
cost_estimate=llm_result.cost_estimate
|
| 653 |
+
)
|
| 654 |
+
else:
|
| 655 |
+
# Fallback to first result description
|
| 656 |
+
first_result = search_results[0] if search_results else {}
|
| 657 |
+
return AgentResult(
|
| 658 |
+
agent_role=AgentRole.WEB_RESEARCHER,
|
| 659 |
+
success=True,
|
| 660 |
+
result=first_result.get('snippet', first_result.get('content', 'Web search completed')),
|
| 661 |
+
confidence=0.50,
|
| 662 |
+
reasoning="Web search completed but analysis failed",
|
| 663 |
+
tools_used=[web_result],
|
| 664 |
+
model_used="fallback",
|
| 665 |
+
processing_time=web_result.execution_time,
|
| 666 |
+
cost_estimate=0.0
|
| 667 |
+
)
|
| 668 |
else:
|
| 669 |
+
# Handle search failure or empty results
|
|
|
|
| 670 |
return AgentResult(
|
| 671 |
agent_role=AgentRole.WEB_RESEARCHER,
|
| 672 |
+
success=False,
|
| 673 |
+
result="Web search returned no useful results",
|
| 674 |
+
confidence=0.20,
|
| 675 |
+
reasoning=f"Search failed or empty: {search_data.get('note', 'Unknown reason')}",
|
| 676 |
tools_used=[web_result],
|
| 677 |
+
model_used="none",
|
| 678 |
processing_time=web_result.execution_time,
|
| 679 |
cost_estimate=0.0
|
| 680 |
)
|
src/tools/__pycache__/final_answer_tool.cpython-310.pyc
CHANGED
|
Binary files a/src/tools/__pycache__/final_answer_tool.cpython-310.pyc and b/src/tools/__pycache__/final_answer_tool.cpython-310.pyc differ
|
|
|
src/tools/__pycache__/web_search_tool.cpython-310.pyc
CHANGED
|
Binary files a/src/tools/__pycache__/web_search_tool.cpython-310.pyc and b/src/tools/__pycache__/web_search_tool.cpython-310.pyc differ
|
|
|
src/tools/final_answer_tool.py
CHANGED
|
@@ -93,7 +93,7 @@ EXTRACTION RULES:
|
|
| 93 |
"""
|
| 94 |
|
| 95 |
# Add type-specific rules
|
| 96 |
-
if "mathematical" in question_type.lower() or any(word in question.lower() for word in ["how many", "count", "number", "
|
| 97 |
base_prompt += """
|
| 98 |
- If asking for a count/number: respond with ONLY the number (e.g., "5", "23", "0")
|
| 99 |
- If asking for calculation: respond with ONLY the result (e.g., "42", "3.14", "100")
|
|
@@ -155,6 +155,9 @@ Extract the precise answer NOW:"""
|
|
| 155 |
"result:",
|
| 156 |
"response:",
|
| 157 |
"conclusion:",
|
|
|
|
|
|
|
|
|
|
| 158 |
]
|
| 159 |
|
| 160 |
for prefix in prefixes_to_remove:
|
|
@@ -167,18 +170,67 @@ Extract the precise answer NOW:"""
|
|
| 167 |
if answer.startswith("'") and answer.endswith("'"):
|
| 168 |
answer = answer[1:-1]
|
| 169 |
|
| 170 |
-
#
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
if
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
-
|
|
|
|
| 178 |
# For reversed text questions, ensure clean output
|
| 179 |
if len(answer.split()) == 1: # Single word answer
|
| 180 |
answer = answer.lower()
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
# Remove any trailing punctuation that's not part of the answer
|
| 183 |
answer = answer.rstrip('.,!?;:')
|
| 184 |
|
|
|
|
| 93 |
"""
|
| 94 |
|
| 95 |
# Add type-specific rules
|
| 96 |
+
if "mathematical" in question_type.lower() or any(word in question.lower() for word in ["how many", "count", "number", "albums"]):
|
| 97 |
base_prompt += """
|
| 98 |
- If asking for a count/number: respond with ONLY the number (e.g., "5", "23", "0")
|
| 99 |
- If asking for calculation: respond with ONLY the result (e.g., "42", "3.14", "100")
|
|
|
|
| 155 |
"result:",
|
| 156 |
"response:",
|
| 157 |
"conclusion:",
|
| 158 |
+
"based on",
|
| 159 |
+
"according to",
|
| 160 |
+
"from the",
|
| 161 |
]
|
| 162 |
|
| 163 |
for prefix in prefixes_to_remove:
|
|
|
|
| 170 |
if answer.startswith("'") and answer.endswith("'"):
|
| 171 |
answer = answer[1:-1]
|
| 172 |
|
| 173 |
+
# AGGRESSIVE LENGTH ENFORCEMENT FOR GAIA
|
| 174 |
+
# If answer is too long, extract the core information
|
| 175 |
+
if len(answer) > 50:
|
| 176 |
+
# For different question types, extract differently
|
| 177 |
+
if "mathematical" in question_type.lower() or any(word in question.lower() for word in ["how many", "count", "number", "albums"]):
|
| 178 |
+
# Extract just the number for mathematical questions
|
| 179 |
+
number_match = re.search(r'-?\d+(?:\.\d+)?', answer)
|
| 180 |
+
if number_match:
|
| 181 |
+
answer = number_match.group()
|
| 182 |
+
elif "name" in question_type.lower() or any(word in question.lower() for word in ["who", "name"]):
|
| 183 |
+
# Extract just the name (first few words)
|
| 184 |
+
words = answer.split()
|
| 185 |
+
if len(words) > 3:
|
| 186 |
+
answer = ' '.join(words[:3]) # Keep only first 3 words for names
|
| 187 |
+
elif "location" in question_type.lower() or any(word in question.lower() for word in ["where", "city", "country"]):
|
| 188 |
+
# Extract just the location name
|
| 189 |
+
words = answer.split()
|
| 190 |
+
if len(words) > 2:
|
| 191 |
+
answer = ' '.join(words[:2]) # Keep only first 2 words for locations
|
| 192 |
+
elif "yes_no" in question_type.lower() or any(word in answer.lower() for word in ["yes", "no", "true", "false"]):
|
| 193 |
+
# Extract yes/no/true/false
|
| 194 |
+
if any(word in answer.lower() for word in ["yes", "no", "true", "false"]):
|
| 195 |
+
for word in answer.lower().split():
|
| 196 |
+
if word in ["yes", "no", "true", "false"]:
|
| 197 |
+
answer = word
|
| 198 |
+
break
|
| 199 |
+
else:
|
| 200 |
+
# For other types, take first sentence or clause
|
| 201 |
+
sentences = re.split(r'[.!?]', answer)
|
| 202 |
+
if sentences:
|
| 203 |
+
answer = sentences[0].strip()
|
| 204 |
+
# If still too long, take first clause
|
| 205 |
+
if len(answer) > 30:
|
| 206 |
+
clauses = re.split(r'[,;:]', answer)
|
| 207 |
+
if clauses:
|
| 208 |
+
answer = clauses[0].strip()
|
| 209 |
|
| 210 |
+
# Handle specific formatting based on question type
|
| 211 |
+
if "text_manipulation" in question_type.lower():
|
| 212 |
# For reversed text questions, ensure clean output
|
| 213 |
if len(answer.split()) == 1: # Single word answer
|
| 214 |
answer = answer.lower()
|
| 215 |
|
| 216 |
+
# Final aggressive truncation if still too long
|
| 217 |
+
if len(answer) > 40:
|
| 218 |
+
# Split into words and take as many as fit
|
| 219 |
+
words = answer.split()
|
| 220 |
+
truncated_words = []
|
| 221 |
+
current_length = 0
|
| 222 |
+
for word in words:
|
| 223 |
+
if current_length + len(word) + 1 <= 40:
|
| 224 |
+
truncated_words.append(word)
|
| 225 |
+
current_length += len(word) + 1
|
| 226 |
+
else:
|
| 227 |
+
break
|
| 228 |
+
if truncated_words:
|
| 229 |
+
answer = ' '.join(truncated_words)
|
| 230 |
+
else:
|
| 231 |
+
# Last resort - take first 40 characters
|
| 232 |
+
answer = answer[:40].strip()
|
| 233 |
+
|
| 234 |
# Remove any trailing punctuation that's not part of the answer
|
| 235 |
answer = answer.rstrip('.,!?;:')
|
| 236 |
|
src/tools/web_search_tool.py
CHANGED
|
@@ -20,18 +20,20 @@ logger = logging.getLogger(__name__)
|
|
| 20 |
class WebSearchResult:
|
| 21 |
"""Container for web search results"""
|
| 22 |
|
| 23 |
-
def __init__(self, title: str, url: str, snippet: str, content: str = ""):
|
| 24 |
self.title = title
|
| 25 |
self.url = url
|
| 26 |
self.snippet = snippet
|
| 27 |
self.content = content
|
|
|
|
| 28 |
|
| 29 |
def to_dict(self) -> Dict[str, str]:
|
| 30 |
return {
|
| 31 |
"title": self.title,
|
| 32 |
"url": self.url,
|
| 33 |
"snippet": self.snippet,
|
| 34 |
-
"content": self.content[:1500] + "..." if len(self.content) > 1500 else self.content
|
|
|
|
| 35 |
}
|
| 36 |
|
| 37 |
class WebSearchTool(BaseTool):
|
|
@@ -246,53 +248,78 @@ class WebSearchTool(BaseTool):
|
|
| 246 |
title=result.get('title', 'No title'),
|
| 247 |
url=result.get('href', ''),
|
| 248 |
snippet=result.get('body', 'No description'),
|
| 249 |
-
|
| 250 |
)
|
| 251 |
-
results.append(web_result
|
| 252 |
-
|
| 253 |
-
# Extract content if requested
|
| 254 |
-
if extract_content and results:
|
| 255 |
-
for result in results[:2]: # Only extract from first 2 results to save time
|
| 256 |
-
try:
|
| 257 |
-
content_result = self._extract_content_from_url(result['url'])
|
| 258 |
-
if content_result.get('found'):
|
| 259 |
-
result['content'] = content_result.get('content', '')[:1000]
|
| 260 |
-
except:
|
| 261 |
-
pass # Skip content extraction errors
|
| 262 |
|
| 263 |
logger.info(f"✅ DuckDuckGo found {len(results)} results")
|
|
|
|
| 264 |
return {
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
"search_engine": "duckduckgo"
|
| 271 |
}
|
| 272 |
|
| 273 |
except Exception as e:
|
| 274 |
-
logger.warning(f"DuckDuckGo search failed: {str(e)
|
| 275 |
-
#
|
| 276 |
return self._search_with_fallback(query, limit)
|
| 277 |
|
| 278 |
-
def _search_with_fallback(self, query: str, limit: int) -> Dict[str, Any]:
|
| 279 |
-
"""
|
| 280 |
-
|
| 281 |
-
|
|
|
|
|
|
|
|
|
|
| 282 |
try:
|
| 283 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
except Exception as e:
|
| 285 |
-
logger.warning(f"Tavily
|
| 286 |
|
| 287 |
-
#
|
| 288 |
-
|
| 289 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
|
|
|
|
|
|
|
| 291 |
return {
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
|
|
|
|
|
|
| 296 |
}
|
| 297 |
|
| 298 |
def _search_with_tavily(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
|
|
@@ -339,17 +366,16 @@ class WebSearchTool(BaseTool):
|
|
| 339 |
snippet=result.get('content', 'No description'),
|
| 340 |
content=result.get('raw_content', '') if extract_content else ''
|
| 341 |
)
|
| 342 |
-
results.append(web_result
|
| 343 |
|
| 344 |
if results:
|
| 345 |
logger.info(f"✅ Tavily found {len(results)} results")
|
| 346 |
return {
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
"search_engine": "tavily"
|
| 353 |
}
|
| 354 |
else:
|
| 355 |
logger.warning("Tavily returned no results")
|
|
@@ -367,10 +393,12 @@ class WebSearchTool(BaseTool):
|
|
| 367 |
return self._search_with_wikipedia(query, limit)
|
| 368 |
|
| 369 |
return {
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
|
|
|
|
|
|
| 374 |
}
|
| 375 |
|
| 376 |
def _search_with_wikipedia(self, query: str, limit: int = 5) -> Dict[str, Any]:
|
|
@@ -390,11 +418,12 @@ class WebSearchTool(BaseTool):
|
|
| 390 |
|
| 391 |
if not wiki_results:
|
| 392 |
return {
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
|
|
|
| 398 |
}
|
| 399 |
|
| 400 |
results = []
|
|
@@ -414,7 +443,7 @@ class WebSearchTool(BaseTool):
|
|
| 414 |
snippet=summary,
|
| 415 |
content=page.summary[:1000] + "..." if len(page.summary) > 1000 else page.summary
|
| 416 |
)
|
| 417 |
-
results.append(web_result
|
| 418 |
processed += 1
|
| 419 |
|
| 420 |
except self.wikipedia.exceptions.DisambiguationError as e:
|
|
@@ -430,7 +459,7 @@ class WebSearchTool(BaseTool):
|
|
| 430 |
snippet=summary,
|
| 431 |
content=page.summary[:1000] + "..." if len(page.summary) > 1000 else page.summary
|
| 432 |
)
|
| 433 |
-
results.append(web_result
|
| 434 |
processed += 1
|
| 435 |
except:
|
| 436 |
continue
|
|
@@ -446,30 +475,31 @@ class WebSearchTool(BaseTool):
|
|
| 446 |
if results:
|
| 447 |
logger.info(f"✅ Wikipedia found {len(results)} results")
|
| 448 |
return {
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
"search_engine": "wikipedia"
|
| 455 |
}
|
| 456 |
else:
|
| 457 |
return {
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
|
|
|
| 463 |
}
|
| 464 |
|
| 465 |
except Exception as e:
|
| 466 |
logger.error(f"Wikipedia search failed: {e}")
|
| 467 |
return {
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
|
|
|
| 473 |
}
|
| 474 |
|
| 475 |
def _extract_content_from_url(self, url: str) -> Dict[str, Any]:
|
|
@@ -603,7 +633,7 @@ def test_web_search_tool():
|
|
| 603 |
|
| 604 |
if result.success:
|
| 605 |
print(f"✅ Success: {result.result.get('message', 'No message')}")
|
| 606 |
-
search_engine = result.result.get('
|
| 607 |
print(f" Search engine: {search_engine}")
|
| 608 |
|
| 609 |
if result.result.get('found'):
|
|
|
|
| 20 |
class WebSearchResult:
|
| 21 |
"""Container for web search results"""
|
| 22 |
|
| 23 |
+
def __init__(self, title: str, url: str, snippet: str, content: str = "", source: str = ""):
|
| 24 |
self.title = title
|
| 25 |
self.url = url
|
| 26 |
self.snippet = snippet
|
| 27 |
self.content = content
|
| 28 |
+
self.source = source
|
| 29 |
|
| 30 |
def to_dict(self) -> Dict[str, str]:
|
| 31 |
return {
|
| 32 |
"title": self.title,
|
| 33 |
"url": self.url,
|
| 34 |
"snippet": self.snippet,
|
| 35 |
+
"content": self.content[:1500] + "..." if len(self.content) > 1500 else self.content,
|
| 36 |
+
"source": self.source
|
| 37 |
}
|
| 38 |
|
| 39 |
class WebSearchTool(BaseTool):
|
|
|
|
| 248 |
title=result.get('title', 'No title'),
|
| 249 |
url=result.get('href', ''),
|
| 250 |
snippet=result.get('body', 'No description'),
|
| 251 |
+
source='DuckDuckGo'
|
| 252 |
)
|
| 253 |
+
results.append(web_result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
|
| 255 |
logger.info(f"✅ DuckDuckGo found {len(results)} results")
|
| 256 |
+
|
| 257 |
return {
|
| 258 |
+
'success': True,
|
| 259 |
+
'results': results,
|
| 260 |
+
'source': 'DuckDuckGo',
|
| 261 |
+
'query': query,
|
| 262 |
+
'count': len(results)
|
|
|
|
| 263 |
}
|
| 264 |
|
| 265 |
except Exception as e:
|
| 266 |
+
logger.warning(f"DuckDuckGo search failed: {str(e)}")
|
| 267 |
+
# Don't log the full exception details to avoid spam
|
| 268 |
return self._search_with_fallback(query, limit)
|
| 269 |
|
| 270 |
+
def _search_with_fallback(self, query: str, limit: int = 5) -> Dict[str, Any]:
|
| 271 |
+
"""Enhanced fallback search when DuckDuckGo fails"""
|
| 272 |
+
|
| 273 |
+
logger.info(f"🔄 Using fallback search engines for: {query}")
|
| 274 |
+
|
| 275 |
+
# Try Tavily API first if available
|
| 276 |
+
if hasattr(self, 'tavily') and self.tavily:
|
| 277 |
try:
|
| 278 |
+
logger.info("📡 Trying Tavily API search")
|
| 279 |
+
tavily_result = self.tavily.search(query, max_results=limit)
|
| 280 |
+
|
| 281 |
+
if tavily_result and 'results' in tavily_result:
|
| 282 |
+
results = []
|
| 283 |
+
for result in tavily_result['results'][:limit]:
|
| 284 |
+
web_result = WebSearchResult(
|
| 285 |
+
title=result.get('title', 'No title'),
|
| 286 |
+
url=result.get('url', ''),
|
| 287 |
+
snippet=result.get('content', 'No description'),
|
| 288 |
+
source='Tavily'
|
| 289 |
+
)
|
| 290 |
+
results.append(web_result)
|
| 291 |
+
|
| 292 |
+
if results:
|
| 293 |
+
logger.info(f"✅ Tavily found {len(results)} results")
|
| 294 |
+
return {
|
| 295 |
+
'success': True,
|
| 296 |
+
'results': results,
|
| 297 |
+
'source': 'Tavily',
|
| 298 |
+
'query': query,
|
| 299 |
+
'count': len(results)
|
| 300 |
+
}
|
| 301 |
except Exception as e:
|
| 302 |
+
logger.warning(f"Tavily search failed: {str(e)}")
|
| 303 |
|
| 304 |
+
# Fall back to Wikipedia search
|
| 305 |
+
logger.info("📚 Wikipedia search for: " + query)
|
| 306 |
+
try:
|
| 307 |
+
wiki_results = self._search_wikipedia(query, limit)
|
| 308 |
+
if wiki_results and wiki_results.get('success'):
|
| 309 |
+
logger.info(f"✅ Wikipedia found {wiki_results.get('count', 0)} results")
|
| 310 |
+
return wiki_results
|
| 311 |
+
except Exception as e:
|
| 312 |
+
logger.warning(f"Wikipedia fallback failed: {str(e)}")
|
| 313 |
|
| 314 |
+
# Final fallback - return empty but successful result to allow processing to continue
|
| 315 |
+
logger.warning("All search engines failed, returning empty results")
|
| 316 |
return {
|
| 317 |
+
'success': True,
|
| 318 |
+
'results': [],
|
| 319 |
+
'source': 'none',
|
| 320 |
+
'query': query,
|
| 321 |
+
'count': 0,
|
| 322 |
+
'note': 'All search engines failed'
|
| 323 |
}
|
| 324 |
|
| 325 |
def _search_with_tavily(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
|
|
|
|
| 366 |
snippet=result.get('content', 'No description'),
|
| 367 |
content=result.get('raw_content', '') if extract_content else ''
|
| 368 |
)
|
| 369 |
+
results.append(web_result)
|
| 370 |
|
| 371 |
if results:
|
| 372 |
logger.info(f"✅ Tavily found {len(results)} results")
|
| 373 |
return {
|
| 374 |
+
'success': True,
|
| 375 |
+
'results': results,
|
| 376 |
+
'source': 'Tavily',
|
| 377 |
+
'query': query,
|
| 378 |
+
'count': len(results)
|
|
|
|
| 379 |
}
|
| 380 |
else:
|
| 381 |
logger.warning("Tavily returned no results")
|
|
|
|
| 393 |
return self._search_with_wikipedia(query, limit)
|
| 394 |
|
| 395 |
return {
|
| 396 |
+
'success': False,
|
| 397 |
+
'results': [],
|
| 398 |
+
'source': 'Tavily',
|
| 399 |
+
'query': query,
|
| 400 |
+
'count': 0,
|
| 401 |
+
'note': 'Tavily search failed and no fallback available'
|
| 402 |
}
|
| 403 |
|
| 404 |
def _search_with_wikipedia(self, query: str, limit: int = 5) -> Dict[str, Any]:
|
|
|
|
| 418 |
|
| 419 |
if not wiki_results:
|
| 420 |
return {
|
| 421 |
+
'success': False,
|
| 422 |
+
'results': [],
|
| 423 |
+
'source': 'Wikipedia',
|
| 424 |
+
'query': query,
|
| 425 |
+
'count': 0,
|
| 426 |
+
'note': 'No Wikipedia articles found for this query'
|
| 427 |
}
|
| 428 |
|
| 429 |
results = []
|
|
|
|
| 443 |
snippet=summary,
|
| 444 |
content=page.summary[:1000] + "..." if len(page.summary) > 1000 else page.summary
|
| 445 |
)
|
| 446 |
+
results.append(web_result)
|
| 447 |
processed += 1
|
| 448 |
|
| 449 |
except self.wikipedia.exceptions.DisambiguationError as e:
|
|
|
|
| 459 |
snippet=summary,
|
| 460 |
content=page.summary[:1000] + "..." if len(page.summary) > 1000 else page.summary
|
| 461 |
)
|
| 462 |
+
results.append(web_result)
|
| 463 |
processed += 1
|
| 464 |
except:
|
| 465 |
continue
|
|
|
|
| 475 |
if results:
|
| 476 |
logger.info(f"✅ Wikipedia found {len(results)} results")
|
| 477 |
return {
|
| 478 |
+
'success': True,
|
| 479 |
+
'results': results,
|
| 480 |
+
'source': 'Wikipedia',
|
| 481 |
+
'query': query,
|
| 482 |
+
'count': len(results)
|
|
|
|
| 483 |
}
|
| 484 |
else:
|
| 485 |
return {
|
| 486 |
+
'success': False,
|
| 487 |
+
'results': [],
|
| 488 |
+
'source': 'Wikipedia',
|
| 489 |
+
'query': query,
|
| 490 |
+
'count': 0,
|
| 491 |
+
'note': 'No accessible Wikipedia articles found for this query'
|
| 492 |
}
|
| 493 |
|
| 494 |
except Exception as e:
|
| 495 |
logger.error(f"Wikipedia search failed: {e}")
|
| 496 |
return {
|
| 497 |
+
'success': False,
|
| 498 |
+
'results': [],
|
| 499 |
+
'source': 'Wikipedia',
|
| 500 |
+
'query': query,
|
| 501 |
+
'count': 0,
|
| 502 |
+
'note': f"Wikipedia search failed: {str(e)}"
|
| 503 |
}
|
| 504 |
|
| 505 |
def _extract_content_from_url(self, url: str) -> Dict[str, Any]:
|
|
|
|
| 633 |
|
| 634 |
if result.success:
|
| 635 |
print(f"✅ Success: {result.result.get('message', 'No message')}")
|
| 636 |
+
search_engine = result.result.get('source', 'unknown')
|
| 637 |
print(f" Search engine: {search_engine}")
|
| 638 |
|
| 639 |
if result.result.get('found'):
|