Spaces:
Sleeping
Sleeping
Chris
committed on
Commit
·
f753656
1
Parent(s):
4048da2
Final 7.10.3
Browse files
src/agents/__pycache__/router.cpython-310.pyc
CHANGED
|
Binary files a/src/agents/__pycache__/router.cpython-310.pyc and b/src/agents/__pycache__/router.cpython-310.pyc differ
|
|
|
src/agents/__pycache__/web_researcher.cpython-310.pyc
CHANGED
|
Binary files a/src/agents/__pycache__/web_researcher.cpython-310.pyc and b/src/agents/__pycache__/web_researcher.cpython-310.pyc differ
|
|
|
src/agents/router.py
CHANGED
|
@@ -57,7 +57,7 @@ class RouterAgent:
|
|
| 57 |
state.add_error(error_msg)
|
| 58 |
|
| 59 |
# Fallback to basic routing
|
| 60 |
-
state.question_type = QuestionType.
|
| 61 |
state.selected_agents = [AgentRole.WEB_RESEARCHER, AgentRole.REASONING_AGENT, AgentRole.SYNTHESIZER]
|
| 62 |
state.routing_decision = f"Enhanced routing failed, using fallback: {error_msg}"
|
| 63 |
|
|
|
|
| 57 |
state.add_error(error_msg)
|
| 58 |
|
| 59 |
# Fallback to basic routing
|
| 60 |
+
state.question_type = QuestionType.UNKNOWN
|
| 61 |
state.selected_agents = [AgentRole.WEB_RESEARCHER, AgentRole.REASONING_AGENT, AgentRole.SYNTHESIZER]
|
| 62 |
state.routing_decision = f"Enhanced routing failed, using fallback: {error_msg}"
|
| 63 |
|
src/agents/web_researcher.py
CHANGED
|
@@ -904,56 +904,167 @@ Provide your analysis and answer:"""
|
|
| 904 |
|
| 905 |
def _extract_search_terms(self, question: str, max_length: int = 180) -> str:
|
| 906 |
"""
|
| 907 |
-
|
| 908 |
-
|
| 909 |
"""
|
| 910 |
-
|
| 911 |
-
question_clean = re.sub(r'\b(what|who|when|where|why|how|is|are|was|were|did|do|does|can|could|should|would|please|tell|me|find|about)\b', '', question.lower())
|
| 912 |
-
|
| 913 |
-
# Extract key patterns first
|
| 914 |
-
entities = []
|
| 915 |
-
|
| 916 |
-
# Extract quoted phrases (highest priority)
|
| 917 |
-
quoted_phrases = re.findall(r'"([^"]+)"', question)
|
| 918 |
-
entities.extend(quoted_phrases)
|
| 919 |
-
|
| 920 |
-
# Extract proper nouns (names, places, organizations)
|
| 921 |
-
proper_nouns = re.findall(r'\b[A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*)*\b', question)
|
| 922 |
-
# Filter out common question words that might be capitalized
|
| 923 |
-
filtered_nouns = [noun for noun in proper_nouns if noun.lower() not in {'you', 'i', 'me', 'my', 'the', 'a', 'an'}]
|
| 924 |
-
entities.extend(filtered_nouns[:4]) # Limit to top 4
|
| 925 |
|
| 926 |
-
#
|
| 927 |
-
|
| 928 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 929 |
|
| 930 |
-
#
|
| 931 |
-
|
| 932 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 933 |
|
| 934 |
-
|
| 935 |
-
|
| 936 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 937 |
else:
|
| 938 |
-
# Fallback:
|
| 939 |
-
|
| 940 |
-
|
| 941 |
-
|
| 942 |
-
|
| 943 |
-
|
| 944 |
-
|
| 945 |
-
#
|
| 946 |
-
|
| 947 |
-
|
| 948 |
-
|
| 949 |
-
|
| 950 |
-
|
| 951 |
-
|
| 952 |
-
|
| 953 |
-
|
| 954 |
-
|
| 955 |
-
|
| 956 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 957 |
|
| 958 |
def _extract_youtube_info(self, question: str) -> str:
|
| 959 |
"""Extract YouTube URL or search terms"""
|
|
@@ -1220,4 +1331,4 @@ Provide your analysis and answer:"""
|
|
| 1220 |
model_used="error",
|
| 1221 |
processing_time=0.0,
|
| 1222 |
cost_estimate=0.0
|
| 1223 |
-
)
|
|
|
|
| 904 |
|
| 905 |
def _extract_search_terms(self, question: str, max_length: int = 180) -> str:
|
| 906 |
"""
|
| 907 |
+
Extract intelligent search terms from a question
|
| 908 |
+
Creates clean, focused queries that search engines can understand
|
| 909 |
"""
|
| 910 |
+
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 911 |
|
| 912 |
+
# Handle backwards text questions - detect and reverse them
|
| 913 |
+
if re.search(r'\.rewsna\b|etirw\b|dnatsrednu\b|ecnetnes\b', question.lower()):
|
| 914 |
+
# This appears to be backwards text - reverse the entire question
|
| 915 |
+
reversed_question = question[::-1]
|
| 916 |
+
logger.info(f"🔄 Detected backwards text, reversed: '{reversed_question[:50]}...'")
|
| 917 |
+
return self._extract_search_terms(reversed_question, max_length)
|
| 918 |
+
|
| 919 |
+
# Clean the question first
|
| 920 |
+
clean_question = question.strip()
|
| 921 |
+
|
| 922 |
+
# Special handling for specific question types
|
| 923 |
+
question_lower = clean_question.lower()
|
| 924 |
+
|
| 925 |
+
# For YouTube video questions, extract the video ID and search for it
|
| 926 |
+
youtube_match = re.search(r'youtube\.com/watch\?v=([a-zA-Z0-9_-]+)', question)
|
| 927 |
+
if youtube_match:
|
| 928 |
+
video_id = youtube_match.group(1)
|
| 929 |
+
return f"youtube video {video_id}"
|
| 930 |
+
|
| 931 |
+
# For file-based questions, don't search the web
|
| 932 |
+
if any(phrase in question_lower for phrase in ['attached file', 'attached python', 'excel file contains', 'attached excel']):
|
| 933 |
+
return "file processing data analysis"
|
| 934 |
+
|
| 935 |
+
# Extract key entities using smart patterns
|
| 936 |
+
search_terms = []
|
| 937 |
+
|
| 938 |
+
# 1. Extract quoted phrases (highest priority)
|
| 939 |
+
quoted_phrases = re.findall(r'"([^"]{3,})"', question)
|
| 940 |
+
search_terms.extend(quoted_phrases[:2]) # Max 2 quoted phrases
|
| 941 |
+
|
| 942 |
+
# 2. Extract proper nouns (names, places, organizations)
|
| 943 |
+
# Look for capitalized sequences
|
| 944 |
+
proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]*)*\b', question)
|
| 945 |
+
# Filter out question starters and common words that should not be included
|
| 946 |
+
excluded_words = {'How', 'What', 'Where', 'When', 'Who', 'Why', 'Which', 'The', 'This', 'That', 'If', 'Please', 'Hi', 'Could', 'Review', 'Provide', 'Give', 'On', 'In', 'At', 'To', 'For', 'Of', 'With', 'By', 'Examine', 'Given'}
|
| 947 |
+
meaningful_nouns = []
|
| 948 |
+
for noun in proper_nouns:
|
| 949 |
+
if noun not in excluded_words and len(noun) > 2:
|
| 950 |
+
meaningful_nouns.append(noun)
|
| 951 |
+
search_terms.extend(meaningful_nouns[:4]) # Max 4 proper nouns
|
| 952 |
+
|
| 953 |
+
# 3. Extract years (but avoid duplicates)
|
| 954 |
+
years = list(set(re.findall(r'\b(19\d{2}|20\d{2})\b', question)))
|
| 955 |
+
search_terms.extend(years[:2]) # Max 2 unique years
|
| 956 |
+
|
| 957 |
+
# 4. Extract important domain-specific keywords
|
| 958 |
+
domain_keywords = []
|
| 959 |
+
|
| 960 |
+
# Music/entertainment
|
| 961 |
+
if any(word in question_lower for word in ['album', 'song', 'artist', 'band', 'music']):
|
| 962 |
+
domain_keywords.extend(['studio albums', 'discography'] if 'album' in question_lower else ['music'])
|
| 963 |
+
|
| 964 |
+
# Wikipedia-specific
|
| 965 |
+
if 'wikipedia' in question_lower:
|
| 966 |
+
domain_keywords.extend(['wikipedia', 'featured article'] if 'featured' in question_lower else ['wikipedia'])
|
| 967 |
+
|
| 968 |
+
# Sports/Olympics
|
| 969 |
+
if any(word in question_lower for word in ['athlete', 'olympics', 'sport', 'team']):
|
| 970 |
+
domain_keywords.append('olympics' if 'olympics' in question_lower else 'sports')
|
| 971 |
+
|
| 972 |
+
# Competition/awards
|
| 973 |
+
if any(word in question_lower for word in ['competition', 'winner', 'recipient', 'award']):
|
| 974 |
+
domain_keywords.append('competition')
|
| 975 |
+
|
| 976 |
+
# Add unique domain keywords
|
| 977 |
+
for keyword in domain_keywords:
|
| 978 |
+
if keyword not in [term.lower() for term in search_terms]:
|
| 979 |
+
search_terms.append(keyword)
|
| 980 |
+
|
| 981 |
+
# 5. Extract specific important terms from the question
|
| 982 |
+
# Be more selective about stop words - keep important descriptive words
|
| 983 |
+
words = re.findall(r'\b\w+\b', clean_question.lower())
|
| 984 |
+
|
| 985 |
+
# Reduced skip words list - keep more meaningful terms
|
| 986 |
+
skip_words = {
|
| 987 |
+
'how', 'many', 'what', 'who', 'when', 'where', 'why', 'which', 'whose',
|
| 988 |
+
'is', 'are', 'was', 'were', 'did', 'does', 'do', 'can', 'could', 'would', 'should',
|
| 989 |
+
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
|
| 990 |
+
'from', 'up', 'about', 'into', 'through', 'during', 'before', 'after', 'above', 'below',
|
| 991 |
+
'among', 'this', 'that', 'these', 'those', 'i', 'me', 'my', 'we', 'our',
|
| 992 |
+
'you', 'your', 'he', 'him', 'his', 'she', 'her', 'it', 'its', 'they', 'them', 'their',
|
| 993 |
+
'be', 'been', 'being', 'have', 'has', 'had', 'will', 'may', 'might', 'must',
|
| 994 |
+
'please', 'tell', 'find', 'here', 'there', 'only', 'just', 'some', 'help', 'give', 'provide', 'review'
|
| 995 |
+
}
|
| 996 |
|
| 997 |
+
# Look for important content words - be more inclusive
|
| 998 |
+
important_words = []
|
| 999 |
+
for word in words:
|
| 1000 |
+
if (len(word) > 3 and
|
| 1001 |
+
word not in skip_words and
|
| 1002 |
+
word not in [term.lower() for term in search_terms] and
|
| 1003 |
+
not word.isdigit()):
|
| 1004 |
+
# Include important descriptive words
|
| 1005 |
+
important_words.append(word)
|
| 1006 |
+
|
| 1007 |
+
# Add more important content words
|
| 1008 |
+
search_terms.extend(important_words[:4]) # Increased from 3 to 4
|
| 1009 |
+
|
| 1010 |
+
# 6. Special inclusion of key terms that are often missed
|
| 1011 |
+
# Look for important terms that might have been filtered out
|
| 1012 |
+
key_terms_patterns = {
|
| 1013 |
+
'image': r'\b(image|picture|photo|visual)\b',
|
| 1014 |
+
'video': r'\b(video|clip|footage)\b',
|
| 1015 |
+
'file': r'\b(file|document|attachment)\b',
|
| 1016 |
+
'chess': r'\b(chess|position|move|game)\b',
|
| 1017 |
+
'move': r'\b(move|next|correct|turn)\b',
|
| 1018 |
+
'dinosaur': r'\b(dinosaur|fossil|extinct)\b',
|
| 1019 |
+
'shopping': r'\b(shopping|grocery|list|market)\b',
|
| 1020 |
+
'list': r'\b(list|shopping|grocery)\b',
|
| 1021 |
+
'black': r'\b(black|white|color|turn)\b',
|
| 1022 |
+
'opposite': r'\b(opposite|reverse|contrary)\b',
|
| 1023 |
+
'nominated': r'\b(nominated|nominated|nomination)\b'
|
| 1024 |
+
}
|
| 1025 |
|
| 1026 |
+
for key_term, pattern in key_terms_patterns.items():
|
| 1027 |
+
if re.search(pattern, question_lower) and key_term not in [term.lower() for term in search_terms]:
|
| 1028 |
+
search_terms.append(key_term)
|
| 1029 |
+
|
| 1030 |
+
# 7. Build the final search query
|
| 1031 |
+
if search_terms:
|
| 1032 |
+
# Remove duplicates while preserving order
|
| 1033 |
+
unique_terms = []
|
| 1034 |
+
seen = set()
|
| 1035 |
+
for term in search_terms:
|
| 1036 |
+
term_lower = term.lower()
|
| 1037 |
+
if term_lower not in seen and len(term.strip()) > 0:
|
| 1038 |
+
seen.add(term_lower)
|
| 1039 |
+
unique_terms.append(term)
|
| 1040 |
+
|
| 1041 |
+
search_query = ' '.join(unique_terms)
|
| 1042 |
else:
|
| 1043 |
+
# Fallback: extract the most important words from the question
|
| 1044 |
+
fallback_words = []
|
| 1045 |
+
for word in words:
|
| 1046 |
+
if len(word) > 3 and word not in skip_words:
|
| 1047 |
+
fallback_words.append(word)
|
| 1048 |
+
search_query = ' '.join(fallback_words[:4])
|
| 1049 |
+
|
| 1050 |
+
# Final cleanup
|
| 1051 |
+
search_query = ' '.join(search_query.split()) # Remove extra whitespace
|
| 1052 |
+
|
| 1053 |
+
# Truncate at word boundary if too long
|
| 1054 |
+
if len(search_query) > max_length:
|
| 1055 |
+
search_query = search_query[:max_length].rsplit(' ', 1)[0]
|
| 1056 |
+
|
| 1057 |
+
# Ensure we have something meaningful
|
| 1058 |
+
if not search_query.strip() or len(search_query.strip()) < 3:
|
| 1059 |
+
# Last resort: use the first few meaningful words from the original question
|
| 1060 |
+
words = question.split()
|
| 1061 |
+
meaningful_words = [w for w in words if len(w) > 2 and not w.lower() in skip_words]
|
| 1062 |
+
search_query = ' '.join(meaningful_words[:4])
|
| 1063 |
+
|
| 1064 |
+
# Log for debugging
|
| 1065 |
+
logger.info(f"📝 Extracted search terms: '{search_query}' from question: '{question[:100]}...'")
|
| 1066 |
+
|
| 1067 |
+
return search_query.strip()
|
| 1068 |
|
| 1069 |
def _extract_youtube_info(self, question: str) -> str:
|
| 1070 |
"""Extract YouTube URL or search terms"""
|
|
|
|
| 1331 |
model_used="error",
|
| 1332 |
processing_time=0.0,
|
| 1333 |
cost_estimate=0.0
|
| 1334 |
+
)
|
src/tools/__pycache__/web_search_tool.cpython-310.pyc
CHANGED
|
Binary files a/src/tools/__pycache__/web_search_tool.cpython-310.pyc and b/src/tools/__pycache__/web_search_tool.cpython-310.pyc differ
|
|
|
src/tools/web_search_tool.py
CHANGED
|
@@ -128,130 +128,162 @@ class WebSearchTool(BaseTool):
|
|
| 128 |
|
| 129 |
def _extract_search_terms(self, question: str, max_length: int = 200) -> str:
|
| 130 |
"""
|
| 131 |
-
Extract
|
| 132 |
-
|
| 133 |
"""
|
| 134 |
import re
|
| 135 |
|
| 136 |
-
#
|
| 137 |
-
if re.search(r'\.rewsna\b|etirw\b|dnatsrednu\b', question.lower()):
|
| 138 |
-
# This
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
reversed_question = ' '.join(reversed_words)
|
| 142 |
return self._extract_search_terms(reversed_question, max_length)
|
| 143 |
|
| 144 |
-
#
|
| 145 |
-
clean_question = question
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
r'\bcould\s+you\s+',
|
| 156 |
-
r'\bplease\s+',
|
| 157 |
-
r'\btell\s+me\s+',
|
| 158 |
-
r'\bfind\s+',
|
| 159 |
-
r'\blist\s+',
|
| 160 |
-
]
|
| 161 |
|
| 162 |
-
|
| 163 |
-
|
|
|
|
| 164 |
|
| 165 |
-
# Extract key
|
| 166 |
-
|
| 167 |
|
| 168 |
# 1. Extract quoted phrases (highest priority)
|
| 169 |
-
quoted_phrases = re.findall(r'"([^"]
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
#
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
#
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
#
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
}
|
| 199 |
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
#
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
words = re.findall(r'\b\w+\b', clean_question.lower())
|
| 229 |
-
content_words = [w for w in words if w not in stop_words and len(w) > 2]
|
| 230 |
-
|
| 231 |
-
# Add important content words not already included
|
| 232 |
-
for word in content_words[:3]:
|
| 233 |
-
if word not in [part.lower() for part in search_parts]:
|
| 234 |
-
search_parts.append(word)
|
| 235 |
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
else:
|
| 240 |
-
# Fallback:
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
|
|
|
|
|
|
|
|
|
| 245 |
search_query = ' '.join(search_query.split()) # Remove extra whitespace
|
| 246 |
|
| 247 |
# Truncate at word boundary if too long
|
| 248 |
if len(search_query) > max_length:
|
| 249 |
search_query = search_query[:max_length].rsplit(' ', 1)[0]
|
| 250 |
|
| 251 |
-
# Ensure we have something
|
| 252 |
-
if not search_query.strip():
|
| 253 |
-
|
| 254 |
-
|
|
|
|
|
|
|
| 255 |
|
| 256 |
# Log for debugging
|
| 257 |
logger.info(f"📝 Extracted search terms: '{search_query}' from question: '{question[:100]}...'")
|
|
@@ -328,30 +360,50 @@ class WebSearchTool(BaseTool):
|
|
| 328 |
|
| 329 |
def _search_with_duckduckgo(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
|
| 330 |
"""
|
| 331 |
-
Search using DuckDuckGo
|
| 332 |
"""
|
| 333 |
try:
|
| 334 |
logger.info(f"🦆 DuckDuckGo search for: {query}")
|
| 335 |
|
| 336 |
-
# Add
|
| 337 |
-
time.sleep(0
|
| 338 |
|
| 339 |
-
# Use DuckDuckGo text search with retry logic
|
| 340 |
-
max_retries =
|
| 341 |
for attempt in range(max_retries):
|
| 342 |
try:
|
| 343 |
-
|
| 344 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
except Exception as retry_error:
|
|
|
|
| 346 |
if attempt < max_retries - 1:
|
| 347 |
-
|
| 348 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
continue
|
| 350 |
else:
|
|
|
|
| 351 |
raise retry_error
|
| 352 |
|
| 353 |
if not ddg_results:
|
| 354 |
-
logger.warning("DuckDuckGo returned no results")
|
| 355 |
return self._search_with_fallback(query, limit)
|
| 356 |
|
| 357 |
# Process DuckDuckGo results
|
|
@@ -376,11 +428,12 @@ class WebSearchTool(BaseTool):
|
|
| 376 |
}
|
| 377 |
|
| 378 |
except Exception as e:
|
| 379 |
-
logger.warning(f"DuckDuckGo search failed: {str(e)}")
|
| 380 |
-
#
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
|
|
|
| 384 |
return self._search_with_fallback(query, limit)
|
| 385 |
|
| 386 |
def _search_with_fallback(self, query: str, limit: int = 5) -> Dict[str, Any]:
|
|
|
|
| 128 |
|
| 129 |
def _extract_search_terms(self, question: str, max_length: int = 200) -> str:
|
| 130 |
"""
|
| 131 |
+
Extract intelligent search terms from a question
|
| 132 |
+
Creates clean, focused queries that search engines can understand
|
| 133 |
"""
|
| 134 |
import re
|
| 135 |
|
| 136 |
+
# Handle backwards text questions - detect and reverse them
|
| 137 |
+
if re.search(r'\.rewsna\b|etirw\b|dnatsrednu\b|ecnetnes\b', question.lower()):
|
| 138 |
+
# This appears to be backwards text - reverse the entire question
|
| 139 |
+
reversed_question = question[::-1]
|
| 140 |
+
logger.info(f"🔄 Detected backwards text, reversed: '{reversed_question[:50]}...'")
|
|
|
|
| 141 |
return self._extract_search_terms(reversed_question, max_length)
|
| 142 |
|
| 143 |
+
# Clean the question first
|
| 144 |
+
clean_question = question.strip()
|
| 145 |
+
|
| 146 |
+
# Special handling for specific question types
|
| 147 |
+
question_lower = clean_question.lower()
|
| 148 |
+
|
| 149 |
+
# For YouTube video questions, extract the video ID and search for it
|
| 150 |
+
youtube_match = re.search(r'youtube\.com/watch\?v=([a-zA-Z0-9_-]+)', question)
|
| 151 |
+
if youtube_match:
|
| 152 |
+
video_id = youtube_match.group(1)
|
| 153 |
+
return f"youtube video {video_id}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
+
# For file-based questions, don't search the web
|
| 156 |
+
if any(phrase in question_lower for phrase in ['attached file', 'attached python', 'excel file contains', 'attached excel']):
|
| 157 |
+
return "file processing data analysis"
|
| 158 |
|
| 159 |
+
# Extract key entities using smart patterns
|
| 160 |
+
search_terms = []
|
| 161 |
|
| 162 |
# 1. Extract quoted phrases (highest priority)
|
| 163 |
+
quoted_phrases = re.findall(r'"([^"]{3,})"', question)
|
| 164 |
+
search_terms.extend(quoted_phrases[:2]) # Max 2 quoted phrases
|
| 165 |
+
|
| 166 |
+
# 2. Extract proper nouns (names, places, organizations)
|
| 167 |
+
# Look for capitalized sequences
|
| 168 |
+
proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]*)*\b', question)
|
| 169 |
+
# Filter out question starters and common words that should not be included
|
| 170 |
+
excluded_words = {'How', 'What', 'Where', 'When', 'Who', 'Why', 'Which', 'The', 'This', 'That', 'If', 'Please', 'Hi', 'Could', 'Review', 'Provide', 'Give', 'On', 'In', 'At', 'To', 'For', 'Of', 'With', 'By', 'Examine', 'Given'}
|
| 171 |
+
meaningful_nouns = []
|
| 172 |
+
for noun in proper_nouns:
|
| 173 |
+
if noun not in excluded_words and len(noun) > 2:
|
| 174 |
+
meaningful_nouns.append(noun)
|
| 175 |
+
search_terms.extend(meaningful_nouns[:4]) # Max 4 proper nouns
|
| 176 |
+
|
| 177 |
+
# 3. Extract years (but avoid duplicates)
|
| 178 |
+
years = list(set(re.findall(r'\b(19\d{2}|20\d{2})\b', question)))
|
| 179 |
+
search_terms.extend(years[:2]) # Max 2 unique years
|
| 180 |
+
|
| 181 |
+
# 4. Extract important domain-specific keywords
|
| 182 |
+
domain_keywords = []
|
| 183 |
+
|
| 184 |
+
# Music/entertainment
|
| 185 |
+
if any(word in question_lower for word in ['album', 'song', 'artist', 'band', 'music']):
|
| 186 |
+
domain_keywords.extend(['studio albums', 'discography'] if 'album' in question_lower else ['music'])
|
| 187 |
+
|
| 188 |
+
# Wikipedia-specific
|
| 189 |
+
if 'wikipedia' in question_lower:
|
| 190 |
+
domain_keywords.extend(['wikipedia', 'featured article'] if 'featured' in question_lower else ['wikipedia'])
|
| 191 |
+
|
| 192 |
+
# Sports/Olympics
|
| 193 |
+
if any(word in question_lower for word in ['athlete', 'olympics', 'sport', 'team']):
|
| 194 |
+
domain_keywords.append('olympics' if 'olympics' in question_lower else 'sports')
|
| 195 |
+
|
| 196 |
+
# Competition/awards
|
| 197 |
+
if any(word in question_lower for word in ['competition', 'winner', 'recipient', 'award']):
|
| 198 |
+
domain_keywords.append('competition')
|
| 199 |
+
|
| 200 |
+
# Add unique domain keywords
|
| 201 |
+
for keyword in domain_keywords:
|
| 202 |
+
if keyword not in [term.lower() for term in search_terms]:
|
| 203 |
+
search_terms.append(keyword)
|
| 204 |
+
|
| 205 |
+
# 5. Extract specific important terms from the question
|
| 206 |
+
# Be more selective about stop words - keep important descriptive words
|
| 207 |
+
words = re.findall(r'\b\w+\b', clean_question.lower())
|
| 208 |
+
|
| 209 |
+
# Reduced skip words list - keep more meaningful terms
|
| 210 |
+
skip_words = {
|
| 211 |
+
'how', 'many', 'what', 'who', 'when', 'where', 'why', 'which', 'whose',
|
| 212 |
+
'is', 'are', 'was', 'were', 'did', 'does', 'do', 'can', 'could', 'would', 'should',
|
| 213 |
+
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
|
| 214 |
+
'from', 'up', 'about', 'into', 'through', 'during', 'before', 'after', 'above', 'below',
|
| 215 |
+
'among', 'this', 'that', 'these', 'those', 'i', 'me', 'my', 'we', 'our',
|
| 216 |
+
'you', 'your', 'he', 'him', 'his', 'she', 'her', 'it', 'its', 'they', 'them', 'their',
|
| 217 |
+
'be', 'been', 'being', 'have', 'has', 'had', 'will', 'may', 'might', 'must',
|
| 218 |
+
'please', 'tell', 'find', 'here', 'there', 'only', 'just', 'some', 'help', 'give', 'provide', 'review'
|
| 219 |
}
|
| 220 |
|
| 221 |
+
# Look for important content words - be more inclusive
|
| 222 |
+
important_words = []
|
| 223 |
+
for word in words:
|
| 224 |
+
if (len(word) > 3 and
|
| 225 |
+
word not in skip_words and
|
| 226 |
+
word not in [term.lower() for term in search_terms] and
|
| 227 |
+
not word.isdigit()):
|
| 228 |
+
# Include important descriptive words
|
| 229 |
+
important_words.append(word)
|
| 230 |
+
|
| 231 |
+
# Add more important content words
|
| 232 |
+
search_terms.extend(important_words[:4]) # Increased from 3 to 4
|
| 233 |
+
|
| 234 |
+
# 6. Special inclusion of key terms that are often missed
|
| 235 |
+
# Look for important terms that might have been filtered out
|
| 236 |
+
key_terms_patterns = {
|
| 237 |
+
'image': r'\b(image|picture|photo|visual)\b',
|
| 238 |
+
'video': r'\b(video|clip|footage)\b',
|
| 239 |
+
'file': r'\b(file|document|attachment)\b',
|
| 240 |
+
'chess': r'\b(chess|position|move|game)\b',
|
| 241 |
+
'move': r'\b(move|next|correct|turn)\b',
|
| 242 |
+
'dinosaur': r'\b(dinosaur|fossil|extinct)\b',
|
| 243 |
+
'shopping': r'\b(shopping|grocery|list|market)\b',
|
| 244 |
+
'list': r'\b(list|shopping|grocery)\b',
|
| 245 |
+
'black': r'\b(black|white|color|turn)\b',
|
| 246 |
+
'opposite': r'\b(opposite|reverse|contrary)\b',
|
| 247 |
+
'nominated': r'\b(nominated|nominated|nomination)\b'
|
| 248 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
|
| 250 |
+
for key_term, pattern in key_terms_patterns.items():
|
| 251 |
+
if re.search(pattern, question_lower) and key_term not in [term.lower() for term in search_terms]:
|
| 252 |
+
search_terms.append(key_term)
|
| 253 |
+
|
| 254 |
+
# 7. Build the final search query
|
| 255 |
+
if search_terms:
|
| 256 |
+
# Remove duplicates while preserving order
|
| 257 |
+
unique_terms = []
|
| 258 |
+
seen = set()
|
| 259 |
+
for term in search_terms:
|
| 260 |
+
term_lower = term.lower()
|
| 261 |
+
if term_lower not in seen and len(term.strip()) > 0:
|
| 262 |
+
seen.add(term_lower)
|
| 263 |
+
unique_terms.append(term)
|
| 264 |
+
|
| 265 |
+
search_query = ' '.join(unique_terms)
|
| 266 |
else:
|
| 267 |
+
# Fallback: extract the most important words from the question
|
| 268 |
+
fallback_words = []
|
| 269 |
+
for word in words:
|
| 270 |
+
if len(word) > 3 and word not in skip_words:
|
| 271 |
+
fallback_words.append(word)
|
| 272 |
+
search_query = ' '.join(fallback_words[:4])
|
| 273 |
+
|
| 274 |
+
# Final cleanup
|
| 275 |
search_query = ' '.join(search_query.split()) # Remove extra whitespace
|
| 276 |
|
| 277 |
# Truncate at word boundary if too long
|
| 278 |
if len(search_query) > max_length:
|
| 279 |
search_query = search_query[:max_length].rsplit(' ', 1)[0]
|
| 280 |
|
| 281 |
+
# Ensure we have something meaningful
|
| 282 |
+
if not search_query.strip() or len(search_query.strip()) < 3:
|
| 283 |
+
# Last resort: use the first few meaningful words from the original question
|
| 284 |
+
words = question.split()
|
| 285 |
+
meaningful_words = [w for w in words if len(w) > 2 and not w.lower() in skip_words]
|
| 286 |
+
search_query = ' '.join(meaningful_words[:4])
|
| 287 |
|
| 288 |
# Log for debugging
|
| 289 |
logger.info(f"📝 Extracted search terms: '{search_query}' from question: '{question[:100]}...'")
|
|
|
|
| 360 |
|
| 361 |
def _search_with_duckduckgo(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
|
| 362 |
"""
|
| 363 |
+
Search using DuckDuckGo with robust rate limiting handling
|
| 364 |
"""
|
| 365 |
try:
|
| 366 |
logger.info(f"🦆 DuckDuckGo search for: {query}")
|
| 367 |
|
| 368 |
+
# Add a fixed base delay before the first attempt to reduce rate limiting
|
| 369 |
+
time.sleep(1.0) # Increased base delay
|
| 370 |
|
| 371 |
+
# Use DuckDuckGo text search with enhanced retry logic
|
| 372 |
+
max_retries = 3 # Increased retries
|
| 373 |
for attempt in range(max_retries):
|
| 374 |
try:
|
| 375 |
+
# Create a fresh DDGS instance for each attempt to avoid session issues
|
| 376 |
+
from duckduckgo_search import DDGS
|
| 377 |
+
ddgs_instance = DDGS()
|
| 378 |
+
|
| 379 |
+
ddg_results = list(ddgs_instance.text(query, max_results=min(limit, 8)))
|
| 380 |
+
|
| 381 |
+
if ddg_results:
|
| 382 |
+
break
|
| 383 |
+
else:
|
| 384 |
+
logger.warning(f"DuckDuckGo returned no results on attempt {attempt + 1}")
|
| 385 |
+
if attempt < max_retries - 1:
|
| 386 |
+
time.sleep(2 * (attempt + 1)) # Progressive delay
|
| 387 |
+
|
| 388 |
except Exception as retry_error:
|
| 389 |
+
error_str = str(retry_error).lower()
|
| 390 |
if attempt < max_retries - 1:
|
| 391 |
+
# Increase delay for rate limiting
|
| 392 |
+
if "ratelimit" in error_str or "202" in error_str or "429" in error_str:
|
| 393 |
+
delay = 3 * (attempt + 1) # 3s, then 6s (no sleep after the final attempt)
|
| 394 |
+
logger.warning(f"DuckDuckGo rate limited on attempt {attempt + 1}, waiting {delay}s: {retry_error}")
|
| 395 |
+
time.sleep(delay)
|
| 396 |
+
else:
|
| 397 |
+
delay = 1 * (attempt + 1) # Linear backoff: 1s, then 2s
|
| 398 |
+
logger.warning(f"DuckDuckGo error on attempt {attempt + 1}, retrying in {delay}s: {retry_error}")
|
| 399 |
+
time.sleep(delay)
|
| 400 |
continue
|
| 401 |
else:
|
| 402 |
+
logger.warning(f"DuckDuckGo failed after {max_retries} attempts: {retry_error}")
|
| 403 |
raise retry_error
|
| 404 |
|
| 405 |
if not ddg_results:
|
| 406 |
+
logger.warning("DuckDuckGo returned no results after all attempts")
|
| 407 |
return self._search_with_fallback(query, limit)
|
| 408 |
|
| 409 |
# Process DuckDuckGo results
|
|
|
|
| 428 |
}
|
| 429 |
|
| 430 |
except Exception as e:
|
| 431 |
+
logger.warning(f"DuckDuckGo search completely failed: {str(e)}")
|
| 432 |
+
# Add delay before fallback for severe rate limiting
|
| 433 |
+
error_str = str(e).lower()
|
| 434 |
+
if "ratelimit" in error_str or "429" in error_str or "202" in error_str:
|
| 435 |
+
logger.warning("Severe rate limiting detected, adding 5s delay before fallback")
|
| 436 |
+
time.sleep(5.0)
|
| 437 |
return self._search_with_fallback(query, limit)
|
| 438 |
|
| 439 |
def _search_with_fallback(self, query: str, limit: int = 5) -> Dict[str, Any]:
|