LuisZermeno committed on
Commit
17befe8
·
verified ·
1 Parent(s): b8febd7

Create search_strategies.py

Browse files
Files changed (1) hide show
  1. search_strategies.py +242 -0
search_strategies.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import time
3
+ import logging
4
+ from typing import List, Dict, Optional, Tuple
5
+ from datetime import datetime
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
class SearchStrategy:
    """Advanced search strategies for GAIA questions.

    The public entry points are :meth:`search_cascade`, which tries several
    query formulations in turn, and :meth:`wikipedia_fallback`, which
    restricts the search to wikipedia.org.  All actual searching is
    delegated to ``tools.web_search_tool``.
    """

    # Substrings that mark a tool response as "nothing found".  Both
    # spellings are checked everywhere so that the cascade and the
    # individual strategies agree on what counts as an empty result.
    # NOTE(review): the exact wording emitted by web_search_tool is not
    # visible from this file -- confirm against the tool implementation.
    _NO_RESULT_MARKERS = ("No results found", "No search results")

    _QUESTION_WORDS = frozenset(
        ['what', 'who', 'where', 'when', 'why', 'how', 'which']
    )
    _FILLER_WORDS = frozenset(
        ['is', 'are', 'was', 'were', 'the', 'a', 'an', 'of', 'in', 'on', 'at']
    )
    _STOP_WORDS = frozenset([
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'about', 'as', 'is', 'was', 'are', 'were',
        'been', 'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
        'should', 'could', 'may', 'might', 'must', 'can', 'this', 'that', 'these',
        'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'them', 'their',
    ])

    # (pattern, replacement) pairs applied in order to the lowercased query.
    # Word boundaries keep a rule from firing inside a longer word, e.g.
    # "who is" must not match the start of "who issued".
    _REPHRASE_RULES = (
        (re.compile(r'\bhow many\b'), 'number of'),
        (re.compile(r'\bwhat is the name of\b'), ''),
        (re.compile(r'\bwho is\b'), ''),
        (re.compile(r'\bwhat year\b'), 'year'),
        (re.compile(r'\bwhich country\b'), 'country'),
        (re.compile(r'\bwhat is\b'), ''),
    )

    def __init__(self):
        # Neither attribute is read or written by the methods of this class;
        # they are kept because external code may rely on them existing.
        self.search_history = []
        self.cache = {}

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _web_search(query: str, num_results: int) -> str:
        """Run ``tools.web_search_tool`` for ``query``.

        The import happens at call time (as in the original code) so this
        module can be imported even when ``tools`` is not importable.
        """
        from tools import web_search_tool
        return web_search_tool(query, num_results=num_results)

    @classmethod
    def _has_results(cls, result) -> bool:
        """Return True when ``result`` is non-empty and not a no-hit message."""
        if not result:
            return False
        return not any(marker in result for marker in cls._NO_RESULT_MARKERS)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def search_cascade(self, query: str, max_attempts: int = 5) -> str:
        """Cascade through different search strategies.

        Strategies are tried in a fixed order until two of them have
        produced a usable result or ``max_attempts`` strategies were tried.

        Args:
            query: The search query / question text.
            max_attempts: Maximum number of strategies to try.

        Returns:
            The usable results joined by a ``---`` separator, or a fixed
            "no results" message when every strategy came back empty.
        """
        strategies = [
            self._direct_search,
            self._site_specific_search,
            self._date_filtered_search,
            self._rephrased_search,
            self._component_search,
        ]

        results = []
        for i, strategy in enumerate(strategies[:max_attempts]):
            try:
                result = strategy(query)
            except Exception as e:
                # Best effort: a failing strategy must not abort the cascade.
                logger.warning("Search strategy %d failed: %s", i, e)
                continue

            if self._has_results(result):
                results.append(result)
                if len(results) >= 2:  # Got enough good results
                    break

        if results:
            return "\n\n---\n\n".join(results)
        return "No search results found after multiple attempts."

    def wikipedia_fallback(self, query: str) -> str:
        """Search wikipedia.org using progressively simplified queries.

        Args:
            query: The search query / question text.

        Returns:
            The first usable search result, or a fixed "not found" message.
        """
        attempts = [
            query,
            self._simplify_query(query),
            self._extract_key_terms(query),
            self._remove_stop_words(query),
        ]

        # dict.fromkeys drops repeated formulations (keeping order) so the
        # same search is never issued twice.
        for attempt in dict.fromkeys(attempts):
            if not attempt:
                continue
            try:
                result = self._web_search(
                    f"site:wikipedia.org {attempt}", num_results=3
                )
            except Exception as e:
                logger.warning("Wikipedia search for %r failed: %s", attempt, e)
                continue
            if self._has_results(result):
                return result

        return "Wikipedia information not found."

    # ------------------------------------------------------------------
    # Individual strategies
    # ------------------------------------------------------------------

    def _direct_search(self, query: str) -> str:
        """Direct search with the original query."""
        return self._web_search(query, num_results=5)

    def _site_specific_search(self, query: str) -> str:
        """Search a fixed list of sites and return the first usable hit.

        Returns an empty string when no site yields a usable result.
        """
        sites = ['wikipedia.org', 'britannica.com', 'sciencedirect.com', 'nature.com']

        for site in sites:
            try:
                result = self._web_search(f"site:{site} {query}", num_results=3)
            except Exception as e:
                logger.warning("Site search on %s failed: %s", site, e)
                continue
            if self._has_results(result):
                return f"Results from {site}:\n{result}"

        return ""

    def _date_filtered_search(self, query: str) -> str:
        """Search with the current year appended to the query."""
        current_year = datetime.now().year
        return self._web_search(f"{query} {current_year}", num_results=5)

    def _rephrased_search(self, query: str) -> str:
        """Search with a rephrased query.

        Returns an empty string, without searching, when rephrasing left
        nothing to search for or changed nothing beyond letter case and
        whitespace (that query is already covered by the direct search).
        """
        rephrased = self._rephrase_query(query)
        normalized_original = " ".join(query.lower().split())
        if not rephrased or rephrased == normalized_original:
            return ""
        return self._web_search(rephrased, num_results=5)

    def _component_search(self, query: str) -> str:
        """Break the query into components and search each separately.

        Only the first three components are searched, to limit the number
        of search calls.  Returns an empty string when nothing was found.
        """
        results = []

        for component in self._extract_components(query)[:3]:
            try:
                result = self._web_search(component, num_results=2)
            except Exception as e:
                logger.warning("Component search for %r failed: %s", component, e)
                continue
            if self._has_results(result):
                results.append(f"Results for '{component}':\n{result}")

        return "\n\n".join(results) if results else ""

    # ------------------------------------------------------------------
    # Query transformations (pure string functions)
    # ------------------------------------------------------------------

    def _simplify_query(self, query: str) -> str:
        """Lowercase the query, drop a leading question word and fillers."""
        words = query.lower().split()

        # Only a question word in first position is removed.
        if words and words[0] in self._QUESTION_WORDS:
            words = words[1:]

        return ' '.join(w for w in words if w not in self._FILLER_WORDS)

    def _extract_key_terms(self, query: str) -> str:
        """Return capitalized words, numbers and quoted phrases of the query.

        Falls back to the unchanged query when none of those are present.
        """
        # Capitalized words (one uppercase letter followed by lowercase ones).
        proper_nouns = re.findall(r'\b[A-Z][a-z]+\b', query)
        # Standalone digit sequences.
        numbers = re.findall(r'\b\d+\b', query)
        # Text inside double quotes.
        quoted = re.findall(r'"([^"]+)"', query)

        key_terms = proper_nouns + numbers + quoted
        return ' '.join(key_terms) if key_terms else query

    def _remove_stop_words(self, query: str) -> str:
        """Lowercase the query and remove stop words."""
        return ' '.join(
            w for w in query.lower().split() if w not in self._STOP_WORDS
        )

    def _rephrase_query(self, query: str) -> str:
        """Lowercase the query and rewrite common question phrasings.

        Rules only match whole words, and whitespace left behind by a
        removed phrase is collapsed to single spaces.
        """
        rephrased = query.lower()
        for pattern, replacement in self._REPHRASE_RULES:
            rephrased = pattern.sub(replacement, rephrased)

        return " ".join(rephrased.split())

    def _extract_components(self, query: str) -> List[str]:
        """Extract component queries from a complex question.

        Components are, in order: the parts between the conjunctions
        and/or/but (longer than three characters), runs of capitalized
        words, and double-quoted phrases.  Duplicates are removed while
        preserving first-seen order.
        """
        components = []

        # Split by conjunctions
        parts = re.split(r'\b(?:and|or|but)\b', query)
        components.extend(p.strip() for p in parts if len(p.strip()) > 3)

        # Runs of capitalized words
        components.extend(
            re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', query)
        )

        # Quoted phrases
        components.extend(re.findall(r'"([^"]+)"', query))

        # dict preserves insertion order, giving an ordered de-duplication.
        return list(dict.fromkeys(components))
203
+
204
class DataAnalysisStrategy:
    """Strategies for analyzing data files."""

    # Lowercase month name -> calendar month number.
    _MONTH_NUMBERS = {
        'january': 1, 'february': 2, 'march': 3, 'april': 4,
        'may': 5, 'june': 6, 'july': 7, 'august': 8,
        'september': 9, 'october': 10, 'november': 11, 'december': 12,
    }

    @staticmethod
    def analyze_for_temporal_data(df, question: str) -> Optional["pd.DataFrame"]:
        """Return the rows of ``df`` that fall in the month named in ``question``.

        Date columns are text columns that ``pandas.to_datetime`` can parse,
        plus columns that already have a datetime dtype.  They are tried in
        column order and the first one with at least one row in the
        requested month decides the result.

        Args:
            df: A pandas DataFrame.  It is not modified; all work happens
                on a copy.
            question: Free text; the first English month name found in it
                (case-insensitive) selects the month.  Years are ignored.

        Returns:
            A DataFrame holding the matching rows, with the chosen date
            column converted to datetime and an extra ``month`` column, or
            ``None`` when the question names no month, no date column
            exists, or no row matches.
        """
        # Function-scope import: pandas is not imported at module level.
        import pandas as pd

        month_match = re.search(
            r'\b(january|february|march|april|may|june|july|august|september|october|november|december)\b',
            question.lower(),
        )
        if not month_match:
            return None
        month_num = DataAnalysisStrategy._MONTH_NUMBERS[month_match.group(1)]

        # Column name -> parsed datetime Series, in column order.
        parsed_columns = {}
        for col in df.columns:
            series = df[col]
            if pd.api.types.is_datetime64_any_dtype(series):
                parsed_columns[col] = series
                continue
            if not (pd.api.types.is_object_dtype(series)
                    or pd.api.types.is_string_dtype(series)):
                continue
            try:
                parsed = pd.to_datetime(series)
            except (ValueError, TypeError, OverflowError):
                # Not parseable as dates -> simply not a date column.
                continue
            # Guard against a non-datetime result on which ``.dt`` would fail.
            if pd.api.types.is_datetime64_any_dtype(parsed):
                parsed_columns[col] = parsed

        if not parsed_columns:
            return None

        # Work on a copy so the caller's frame keeps its original columns.
        working = df.copy()
        for date_col, parsed in parsed_columns.items():
            working[date_col] = parsed
            working['month'] = parsed.dt.month

            # Filter for specific month
            month_data = working[working['month'] == month_num]
            if not month_data.empty:
                return month_data

        return None