LuisZermeno_Final_Assignment_Template

Runtime error

App Files Files Community

LuisZermeno committed on May 22, 2025

Commit

17befe8

verified ·

1 Parent(s): b8febd7

Create search_strategies.py

Browse files

Files changed (1) hide show

search_strategies.py +242 -0

search_strategies.py ADDED Viewed

	@@ -0,0 +1,242 @@

+import re
+import time
+import logging
+from typing import List, Dict, Optional, Tuple
+from datetime import datetime
+logger = logging.getLogger(__name__)
class SearchStrategy:
    """Query-reformulation and fallback strategies for web search on GAIA questions.

    Every search goes through ``tools.web_search_tool``, which is imported
    lazily inside ``_web_search``; that function is not defined in this file.
    NOTE(review): the code assumes it returns a string and reports "nothing
    found" with a message containing one of ``_NO_RESULT_MARKERS`` -- confirm
    against the ``tools`` module.
    """

    # Substrings that mark a reply as "nothing found".  Both spellings were
    # checked in different methods of the original code; they are unified
    # here so every strategy filters empty replies the same way.
    _NO_RESULT_MARKERS = ("No results found", "No search results")

    # Sites tried, in order, by ``_site_specific_search``.
    _AUTHORITATIVE_SITES = (
        'wikipedia.org', 'britannica.com', 'sciencedirect.com', 'nature.com',
    )

    _QUESTION_WORDS = frozenset(
        {'what', 'who', 'where', 'when', 'why', 'how', 'which'}
    )
    _FILLER_WORDS = frozenset(
        {'is', 'are', 'was', 'were', 'the', 'a', 'an', 'of', 'in', 'on', 'at'}
    )
    _STOP_WORDS = frozenset({
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'about', 'as', 'is', 'was', 'are', 'were',
        'been', 'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
        'should', 'could', 'may', 'might', 'must', 'can', 'this', 'that', 'these',
        'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'them', 'their',
    })

    # (pattern, replacement) pairs applied in order to the lowercased query.
    # The ``\b`` anchors stop a rule from firing inside a longer word
    # ("who issued" must not become "sued", "what island" must not become
    # "land").  The longer "what is the name of" rule precedes "what is".
    _REPHRASE_RULES = (
        (re.compile(r'\bhow many\b'), 'number of'),
        (re.compile(r'\bwhat is the name of\b'), ''),
        (re.compile(r'\bwho is\b'), ''),
        (re.compile(r'\bwhat year\b'), 'year'),
        (re.compile(r'\bwhich country\b'), 'country'),
        (re.compile(r'\bwhat is\b'), ''),
    )

    def __init__(self):
        # Neither attribute is read or written by any method of this class;
        # they are kept so external code that touches them keeps working.
        self.search_history = []
        self.cache = {}

    @staticmethod
    def _web_search(query: str, num_results: int) -> str:
        """Run one search through ``tools.web_search_tool``.

        The import stays inside the function, as in the original methods, so
        importing this module does not require ``tools`` to be importable.
        """
        from tools import web_search_tool
        return web_search_tool(query, num_results=num_results)

    @classmethod
    def _is_useful(cls, result) -> bool:
        """Return True when *result* is non-empty and carries no no-result marker."""
        return bool(result) and not any(
            marker in result for marker in cls._NO_RESULT_MARKERS
        )

    def search_cascade(self, query: str, max_attempts: int = 5) -> str:
        """Try the search strategies in order until two give usable results.

        Args:
            query: The question or search phrase.
            max_attempts: How many strategies (from the front of the list) may
                be tried.  Values below zero are treated as zero.

        Returns:
            The usable results joined by a ``---`` separator, or a fixed
            "no results" message when every attempted strategy came back
            empty or raised.
        """
        strategies = [
            self._direct_search,
            self._site_specific_search,
            self._date_filtered_search,
            self._rephrased_search,
            self._component_search,
        ]
        # Clamp so a negative value cannot slice from the end of the list.
        attempts = strategies[:max(0, max_attempts)]
        results = []
        for index, strategy in enumerate(attempts):
            try:
                result = strategy(query)
            except Exception as exc:
                # Best effort: one failing strategy must not stop the cascade.
                logger.warning("Search strategy %d failed: %s", index, exc)
                continue
            if self._is_useful(result):
                results.append(result)
                if len(results) >= 2:  # Got enough good results
                    break
        if results:
            return "\n\n---\n\n".join(results)
        return "No search results found after multiple attempts."

    def wikipedia_fallback(self, query: str) -> str:
        """Search Wikipedia with progressively simplified forms of *query*.

        Returns the first usable result, or ``"Wikipedia information not
        found."`` when every formulation fails.
        """
        candidates = [
            query,
            self._simplify_query(query),
            self._extract_key_terms(query),
            self._remove_stop_words(query),
        ]
        # dict.fromkeys drops repeated formulations while keeping order, so
        # the same search is never issued twice.
        for attempt in dict.fromkeys(candidates):
            if not attempt:
                continue
            try:
                result = self._web_search(
                    f"site:wikipedia.org {attempt}", num_results=3
                )
            except Exception as exc:
                logger.warning("Wikipedia search for %r failed: %s", attempt, exc)
                continue
            if self._is_useful(result):
                return result
        return "Wikipedia information not found."

    def _direct_search(self, query: str) -> str:
        """Search with the query exactly as given (5 results)."""
        return self._web_search(query, num_results=5)

    def _site_specific_search(self, query: str) -> str:
        """Return the first usable ``site:``-restricted result, or ``""``."""
        for site in self._AUTHORITATIVE_SITES:
            try:
                result = self._web_search(f"site:{site} {query}", num_results=3)
            except Exception as exc:
                logger.warning("Search on %s failed: %s", site, exc)
                continue
            if self._is_useful(result):
                return f"Results from {site}:\n{result}"
        return ""

    def _date_filtered_search(self, query: str) -> str:
        """Search with the current calendar year appended to the query."""
        current_year = datetime.now().year
        return self._web_search(f"{query} {current_year}", num_results=5)

    def _rephrased_search(self, query: str) -> str:
        """Search with the rephrased query; return ``""`` if nothing changed.

        ``_rephrase_query`` always lowercases, so the comparison is made
        against the lowercased, whitespace-normalised query.  Otherwise any
        query containing a capital letter would count as "rephrased" and
        trigger a redundant search.
        """
        rephrased = self._rephrase_query(query)
        baseline = ' '.join(query.lower().split())
        if not rephrased or rephrased == baseline:
            return ""
        return self._web_search(rephrased, num_results=5)

    def _component_search(self, query: str) -> str:
        """Search up to three components of the query separately.

        Returns the usable per-component results joined by blank lines, or
        ``""`` when none was usable.
        """
        results = []
        for component in self._extract_components(query)[:3]:  # cap the number of searches
            try:
                result = self._web_search(component, num_results=2)
            except Exception as exc:
                logger.warning("Component search for %r failed: %s", component, exc)
                continue
            if self._is_useful(result):
                results.append(f"Results for '{component}':\n{result}")
        return "\n\n".join(results)

    def _simplify_query(self, query: str) -> str:
        """Lowercase the query, drop a leading question word, then drop filler words."""
        words = query.lower().split()
        if words and words[0] in self._QUESTION_WORDS:
            words = words[1:]
        return ' '.join(w for w in words if w not in self._FILLER_WORDS)

    def _extract_key_terms(self, query: str) -> str:
        """Return capitalised words, numbers and quoted phrases from the query.

        The three groups are concatenated in that order.  When none is found
        the original query is returned unchanged.
        """
        proper_nouns = re.findall(r'\b[A-Z][a-z]+\b', query)
        numbers = re.findall(r'\b\d+\b', query)
        quoted = re.findall(r'"([^"]+)"', query)
        key_terms = proper_nouns + numbers + quoted
        return ' '.join(key_terms) if key_terms else query

    def _remove_stop_words(self, query: str) -> str:
        """Lowercase the query and drop every whitespace-separated stop word."""
        return ' '.join(
            w for w in query.lower().split() if w not in self._STOP_WORDS
        )

    def _rephrase_query(self, query: str) -> str:
        """Lowercase the query and apply ``_REPHRASE_RULES`` in order.

        Runs of whitespace left behind by removed phrases are collapsed to a
        single space.
        """
        rephrased = query.lower()
        for pattern, replacement in self._REPHRASE_RULES:
            rephrased = pattern.sub(replacement, rephrased)
        return ' '.join(rephrased.split())

    def _extract_components(self, query: str) -> List[str]:
        """Break a question into candidate sub-queries.

        Collects, in order: the pieces between the lowercase conjunctions
        ``and``/``or``/``but`` that are longer than three characters, runs of
        capitalised words, and double-quoted phrases.  Duplicates are removed,
        keeping the first occurrence.
        """
        components = [
            part.strip()
            for part in re.split(r'\b(?:and|or|but)\b', query)
            if len(part.strip()) > 3
        ]
        components.extend(
            re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', query)
        )
        components.extend(re.findall(r'"([^"]+)"', query))
        return list(dict.fromkeys(components))
class DataAnalysisStrategy:
    """Strategies for analyzing data files."""

    # English month name (lowercase) -> calendar month number.
    _MONTH_NUMBERS = {
        'january': 1, 'february': 2, 'march': 3, 'april': 4,
        'may': 5, 'june': 6, 'july': 7, 'august': 8,
        'september': 9, 'october': 10, 'november': 11, 'december': 12,
    }

    @staticmethod
    def analyze_for_temporal_data(df, question: str) -> "Optional[pd.DataFrame]":
        """Return the rows of *df* that fall in the month named in *question*.

        Date columns are those already of a datetime dtype plus any
        object/string column that ``pandas.to_datetime`` can parse in full.
        They are tried in column order and the first one that yields rows
        wins.  If the question also contains a four-digit year (19xx/20xx),
        rows must match that year as well.

        Args:
            df: A pandas DataFrame.  It is not modified; work happens on a copy.
            question: Free-text question, matched case-insensitively for a
                full English month name.
                NOTE(review): the word "may" is matched even when used as a
                verb ("what may happen") -- verify whether callers need this
                tightened.

        Returns:
            A DataFrame holding the matching rows, with date columns tried so
            far converted to datetime and a ``month`` column added
            (overwriting any existing column of that name in the copy), or
            ``None`` when no month is named, no date column exists, or no row
            matches.
        """
        # pandas is imported here because the module's import block does not
        # import it; without this every ``pd.`` reference raised NameError.
        import pandas as pd
        from pandas.api import types as pd_types

        month_numbers = DataAnalysisStrategy._MONTH_NUMBERS
        month_match = re.search(
            r'\b(' + '|'.join(month_numbers) + r')\b', question.lower()
        )
        if month_match is None:
            # Nothing to filter on, so skip the column probing entirely.
            return None
        year_match = re.search(r'\b(19\d{2}|20\d{2})\b', question)

        # Column name -> parsed datetime Series, kept so nothing is parsed twice.
        parsed_columns = {}
        for col in df.columns:
            series = df[col]
            if pd_types.is_datetime64_any_dtype(series):
                parsed_columns[col] = series
            elif pd_types.is_object_dtype(series) or pd_types.is_string_dtype(series):
                try:
                    parsed_columns[col] = pd.to_datetime(series)
                except (ValueError, TypeError, OverflowError):
                    # Not a date column; try the next one.
                    continue
        if not parsed_columns:
            return None

        month_num = month_numbers[month_match.group(1)]
        year = int(year_match.group(1)) if year_match else None

        working = df.copy()  # never touch the caller's frame
        for col, parsed in parsed_columns.items():
            working[col] = parsed
            working['month'] = parsed.dt.month
            mask = working['month'] == month_num
            if year is not None:
                mask &= parsed.dt.year == year
            selected = working[mask]
            if not selected.empty:
                return selected
        return None