Spaces:

tensor-boy
/

aiws

Build error

App Files Files Community

fikird committed on Dec 2, 2024

Commit

5e3672b

1 Parent(s): 6c83b94

Improve search reliability with multiple regions and better error handling

Browse files

Files changed (1) hide show

search_engine.py +71 -22

search_engine.py CHANGED Viewed

@@ -13,6 +13,12 @@ import random
 logger = logging.getLogger(__name__)
 class ModelManager:
     """Manages different AI models for specific tasks"""
@@ -67,10 +73,13 @@ class WebSearchEngine:
     def __init__(self):
         self.processor = ContentProcessor()
         self.session = requests.Session()
-        self.request_delay = 2.0  # Increased delay between requests
         self.last_request_time = 0
         self.max_retries = 3
         self.ddgs = None
         self.initialize_search()
     def initialize_search(self):
@@ -83,6 +92,33 @@ class WebSearchEngine:
                 logger.error(f"Error initializing DDGS: {str(e)}")
                 time.sleep(random.uniform(1, 3))
         raise Exception("Failed to initialize DuckDuckGo search after multiple attempts")
     def is_valid_url(self, url: str) -> bool:
         """Check if URL is valid for crawling"""
@@ -109,19 +145,7 @@ class WebSearchEngine:
             return {'error': f"Invalid URL: {url}"}
         try:
-            # Rate limiting with random delay
-            current_time = time.time()
-            time_since_last = current_time - self.last_request_time
-            if time_since_last < self.request_delay:
-                delay = self.request_delay - time_since_last + random.uniform(0.5, 1.5)
-                time.sleep(delay)
-            response = self.session.get(url, timeout=10)
-            self.last_request_time = time.time()
-            if response.status_code != 200:
-                return {'error': f"Failed to fetch URL: {url}, status code: {response.status_code}"}
             soup = BeautifulSoup(response.text, 'lxml')
             # Extract text content
@@ -163,17 +187,40 @@ class WebSearchEngine:
             search_results = []
             retry_count = 0
-            while retry_count < self.max_retries:
                 try:
-                    for result in self.ddgs.text(query, max_results=max_results):
-                        search_results.append(result)
-                        # Add small delay between results
-                        time.sleep(random.uniform(0.2, 0.5))
-                    break
                 except Exception as e:
                     retry_count += 1
                     if retry_count >= self.max_retries:
-                        return {'error': f"Search failed after {self.max_retries} attempts: {str(e)}"}
                     logger.warning(f"Search attempt {retry_count} failed: {str(e)}")
                     time.sleep(random.uniform(2, 5))
                     self.initialize_search()
@@ -187,9 +234,11 @@ class WebSearchEngine:
                     processed = self.process_url(result['link'])
                     if 'error' not in processed:
                         results.append(processed)
-                        # Add delay between processing URLs
                         time.sleep(random.uniform(0.5, 1.0))
             # Generate insights from results
             all_content = " ".join([r['summary'] for r in results if 'summary' in r])

 logger = logging.getLogger(__name__)
+class SearchResult:
+    """Holds the title, link and snippet of one search result."""
+    def __init__(self, title: str, link: str, snippet: str):
+        # Values are stored exactly as given; nothing is validated or normalised.
+        self.title, self.link, self.snippet = title, link, snippet
 class ModelManager:
     """Manages different AI models for specific tasks"""
     def __init__(self):
         self.processor = ContentProcessor()
         self.session = requests.Session()  # session object through which safe_get issues its GET requests
+        self.request_delay = 2.0  # minimum gap, in seconds, that safe_get keeps between requests
         self.last_request_time = 0  # time.time() of the latest response; 0 means nothing has been fetched yet
         self.max_retries = 3  # upper bound on attempts in the search retry loop
         self.ddgs = None  # presumably assigned by initialize_search() - confirm against that method
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }  # sent with every request made by safe_get
         self.initialize_search()  # NOTE(review): appears to raise when DuckDuckGo setup keeps failing, so construction can fail
     def initialize_search(self):
                 logger.error(f"Error initializing DDGS: {str(e)}")
                 time.sleep(random.uniform(1, 3))
         raise Exception("Failed to initialize DuckDuckGo search after multiple attempts")
+    def safe_get(self, url: str, max_retries: int = 3) -> requests.Response:
+        """Fetch ``url`` via GET, throttling requests and retrying transient failures.
+
+        Consecutive requests are kept at least ``self.request_delay`` seconds
+        apart (plus 0.5-1.5 s of random jitter). Connection-level errors,
+        HTTP 429 and HTTP 5xx responses are retried with a growing back-off.
+        Any other non-200 status fails immediately, because repeating the
+        same request would not change the outcome.
+
+        Args:
+            url: Address to fetch.
+            max_retries: Maximum number of attempts.
+
+        Returns:
+            The first response whose status code is 200.
+
+        Raises:
+            requests.RequestException: The final attempt failed at the
+                connection level or ended in a 5xx status, or the server
+                answered with a non-retryable 4xx status.
+            Exception: Every attempt was rate limited (HTTP 429), the server
+                returned a non-200 status below 400, or ``max_retries`` is
+                not positive.
+        """
+        for attempt in range(max_retries):
+            final_attempt = attempt == max_retries - 1
+            # Throttle: wait out the remainder of the minimum gap plus jitter.
+            elapsed = time.time() - self.last_request_time
+            if elapsed < self.request_delay:
+                time.sleep(self.request_delay - elapsed + random.uniform(0.5, 1.5))
+            try:
+                response = self.session.get(url, headers=self.headers, timeout=10)
+            except requests.RequestException:
+                # Network-level failure: retry unless this was the last attempt.
+                if final_attempt:
+                    raise
+                time.sleep((attempt + 1) * 2)
+                continue
+            self.last_request_time = time.time()
+            status = response.status_code
+            if status == 200:
+                return response
+            if status == 429:
+                # Rate limited: back off longer each time, but never sleep
+                # when there is no attempt left to benefit from the wait.
+                if not final_attempt:
+                    time.sleep((attempt + 1) * 5)
+                continue
+            if status < 500 or final_attempt:
+                # 4xx will not fix itself, and a 5xx on the last attempt is
+                # final: surface the HTTP error to the caller.
+                response.raise_for_status()
+                # raise_for_status() is silent below 400 (e.g. 204), yet
+                # callers rely on receiving a 200 response.
+                raise Exception(f"Unexpected status code {status} for URL: {url}")
+            # 5xx with attempts remaining: brief back-off, then retry.
+            time.sleep((attempt + 1) * 2)
+        raise Exception(f"Failed to fetch URL after {max_retries} attempts")
     def is_valid_url(self, url: str) -> bool:
         """Check if URL is valid for crawling"""
             return {'error': f"Invalid URL: {url}"}
         try:
+            response = self.safe_get(url)
             soup = BeautifulSoup(response.text, 'lxml')
             # Extract text content
             search_results = []
             retry_count = 0
+            while retry_count < self.max_retries and len(search_results) < max_results:
                 try:
+                    # Try different regions if search fails
+                    regions = ['wt-wt', 'us-en', 'uk-en']
+                    for region in regions:
+                        if len(search_results) >= max_results:
+                            break
+                        results_gen = self.ddgs.text(
+                            query,
+                            region=region,
+                            max_results=max_results - len(search_results)
+                        )
+                        for result in results_gen:
+                            if len(search_results) >= max_results:
+                                break
+                            if result and isinstance(result, dict) and 'link' in result:
+                                search_results.append(result)
+                                time.sleep(random.uniform(0.2, 0.5))
+                        if search_results:
+                            break
+                    if search_results:
+                        break
                 except Exception as e:
                     retry_count += 1
                     if retry_count >= self.max_retries:
+                        logger.error(f"Search failed after {self.max_retries} attempts: {str(e)}")
+                        if not search_results:
+                            return {'error': f"Search failed after {self.max_retries} attempts: {str(e)}"}
+                        break
                     logger.warning(f"Search attempt {retry_count} failed: {str(e)}")
                     time.sleep(random.uniform(2, 5))
                     self.initialize_search()
                     processed = self.process_url(result['link'])
                     if 'error' not in processed:
                         results.append(processed)
                         time.sleep(random.uniform(0.5, 1.0))
+            if not results:
+                return {'error': 'Failed to process any search results'}
             # Generate insights from results
             all_content = " ".join([r['summary'] for r in results if 'summary' in r])