Guiyom committed on
Commit
37482a6
·
verified ·
1 Parent(s): 1fb4469

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +120 -36
app.py CHANGED
@@ -5,7 +5,8 @@ import json
5
  import httpx
6
  import os
7
  import logging
8
- from typing import Dict, List, Optional, Tuple
 
9
  from datetime import datetime
10
  from bs4 import BeautifulSoup
11
  from googlesearch import search
@@ -25,8 +26,6 @@ logger = logging.getLogger(__name__)
25
 
26
  class RaindropSearchBot:
27
  def __init__(self):
28
- self.min_delay = 2 # Minimum delay in seconds
29
- self.max_delay = 5 # Maximum delay in seconds
30
  self.openai_api_key = os.getenv('openaikey')
31
  self.raindrop_api_token = os.getenv('raindroptoken')
32
  self.newsapi_key = os.getenv('newsapikey')
@@ -45,35 +44,102 @@ class RaindropSearchBot:
45
  )
46
  )
47
  self.newsapi = NewsApiClient(api_key=self.newsapi_key)
48
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  def random_delay(self):
50
- """Add a random delay between requests"""
51
- delay = random.uniform(self.min_delay, self.max_delay)
 
 
52
  time.sleep(delay)
53
 
54
- @retry(wait=wait_exponential(multiplier=1, min=4, max=10),
55
- stop=stop_after_attempt(3))
56
-
57
  def get_google_results(self, query: str, num_results: int = 5) -> List[Dict]:
58
- """Get Google search results using googlesearch-python with retry and delay."""
59
  try:
60
  search_results = []
61
- for result in search(query, num_results=num_results, advanced=True):
62
- search_results.append({
63
- 'title': result.title,
64
- 'link': result.url,
65
- 'snippet': result.description
66
- })
67
- # Add random delay between each search result
68
  self.random_delay()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  return search_results
70
 
71
  except Exception as e:
72
  logger.error(f"Google search error: {e}")
73
- return []
74
-
75
- @retry(wait=wait_exponential(multiplier=1, min=4, max=10),
76
- stop=stop_after_attempt(3))
77
 
78
  def get_news_results(self, query: str, num_results: int = 5) -> List[Dict]:
79
  """Get news articles using NewsAPI with retry and delay."""
@@ -150,16 +216,34 @@ class RaindropSearchBot:
150
  return None
151
 
152
  def get_random_user_agent(self) -> str:
153
- """Return a random user agent string to avoid detection."""
154
- user_agents = [
155
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
156
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
157
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
158
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
159
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59',
160
- 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
161
- ]
162
- return random.choice(user_agents)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
  def get_content_and_summary(self, request: str, item: Dict, source_type: str) -> Dict:
165
  """Get content and generate summary for a single item."""
@@ -266,14 +350,14 @@ class RaindropSearchBot:
266
  logger.error(f"Search error: {e}")
267
  return []
268
 
269
- def process_all_results(self, raindrop_results: List[Dict],
270
  google_results: List[Dict],
271
  news_results: List[Dict]) -> Tuple[List[Dict], List[Dict], List[Dict]]:
272
  """Process and enrich all results with content and summaries."""
273
 
274
  processed_raindrop = []
275
  for item in raindrop_results:
276
- processed_item = self.get_content_and_summary(item, 'raindrop')
277
  if processed_item.get('detailed_summary'):
278
  processed_raindrop.append(processed_item)
279
  # Add delay between processing items
@@ -281,7 +365,7 @@ class RaindropSearchBot:
281
 
282
  processed_google = []
283
  for item in google_results:
284
- processed_item = self.get_content_and_summary(item, 'google')
285
  if processed_item.get('detailed_summary'):
286
  processed_google.append(processed_item)
287
  # Add delay between processing items
@@ -289,7 +373,7 @@ class RaindropSearchBot:
289
 
290
  processed_news = []
291
  for item in news_results:
292
- processed_item = self.get_content_and_summary(item, 'news')
293
  if processed_item.get('detailed_summary'):
294
  processed_news.append(processed_item)
295
  # Add delay between processing items
@@ -446,7 +530,7 @@ class RaindropSearchBot:
446
 
447
  # Process all results to get content and summaries
448
  processed_results = self.process_all_results(
449
- raindrop_results, google_results, news_results
450
  )
451
 
452
  # Generate essay-style analysis
 
5
  import httpx
6
  import os
7
  import logging
8
+ from typing import Optional, List, Dict, Tuple
9
+ from itertools import cycle
10
  from datetime import datetime
11
  from bs4 import BeautifulSoup
12
  from googlesearch import search
 
26
 
27
  class RaindropSearchBot:
28
  def __init__(self):
 
 
29
  self.openai_api_key = os.getenv('openaikey')
30
  self.raindrop_api_token = os.getenv('raindroptoken')
31
  self.newsapi_key = os.getenv('newsapikey')
 
44
  )
45
  )
46
  self.newsapi = NewsApiClient(api_key=self.newsapi_key)
47
+ self.min_delay = 5 # Increased minimum delay
48
+ self.max_delay = 15 # Increased maximum delay
49
+ self.ua = UserAgent()
50
+ self.setup_proxies()
51
+
52
def get_alternative_search_results(self, query: str) -> List[Dict]:
    """Fallback web search via DuckDuckGo when Google search fails.

    Returns a list of {'title', 'link', 'snippet'} dicts (same shape as
    get_google_results) or [] on any failure. Requires the third-party
    `duckduckgo_search` package, imported lazily so the bot still starts
    without it.
    """
    try:
        from duckduckgo_search import ddg

        self.random_delay()
        # ddg() may return None on failure -- normalise to an empty list.
        results = ddg(query, max_results=5) or []

        # BUGFIX: ddg() yields dicts keyed 'title'/'href'/'body', not
        # 'link'/'snippet' -- the old direct indexing raised KeyError.
        # Map defensively so either shape works.
        return [{
            'title': result.get('title', ''),
            'link': result.get('href') or result.get('link', ''),
            'snippet': result.get('body') or result.get('snippet', '')
        } for result in results]

    except Exception as e:
        logger.error(f"Alternative search failed: {e}")
        return []
70
+
71
def search_with_fallback(self, query: str) -> List[Dict]:
    """Run a web search, falling back to an alternative engine on failure."""
    try:
        return self.get_google_results(query)
    except Exception as google_err:
        logger.warning(f"Google search failed: {google_err}")
    # Secondary engine (e.g. DuckDuckGo, Bing) -- last resort before giving up.
    try:
        return self.get_alternative_search_results(query)
    except Exception as fallback_err:
        logger.error(f"All search attempts failed: {fallback_err}")
        return []
84
+
85
def setup_proxies(self):
    """Initialise the rotating proxy pool used when creating sessions."""
    # Placeholder free proxies -- swap in a paid proxy service for
    # better reliability; extend this list as needed.
    proxy_pool = [
        'http://proxy1.example.com:8080',
        'http://proxy2.example.com:8080',
    ]
    self.proxies = proxy_pool
    # Endless round-robin iterator over the pool.
    self.proxy_cycle = cycle(self.proxies)
94
+
95
def random_delay(self):
    """Sleep for a randomised interval between min/max delay, with jitter.

    Jitter of up to +/-1 second is added so request timing is less
    machine-regular; the result is floored at 0 before sleeping.
    """
    jittered = random.uniform(self.min_delay, self.max_delay) + random.uniform(-1, 1)
    # Never hand time.sleep a negative duration.
    time.sleep(max(0, jittered))
101
 
 
 
 
102
def get_google_results(self, query: str, num_results: int = 5) -> List[Dict]:
    """Fetch Google search results in small, rate-limited chunks.

    Returns a list of {'title', 'link', 'snippet'} dicts. Failures inside
    a single chunk are logged and skipped; a failure outside the chunk
    loop is re-raised so search_with_fallback() can try another engine.
    """
    try:
        search_results: List[Dict] = []
        # Small chunks look less bot-like than one large request.
        chunk_size = 3

        for i in range(0, num_results, chunk_size):
            # BUGFIX: the long pause used to run after EVERY chunk,
            # including the last one -- now it runs only between chunks.
            if i:
                time.sleep(random.uniform(8, 15))
            # Add substantial random delay before each chunk.
            self.random_delay()

            try:
                # NOTE: the previously-created (and unused) session was
                # removed -- googlesearch manages its own HTTP requests.
                chunk_results = list(search(
                    query,
                    num_results=min(chunk_size, num_results - i),
                    advanced=True,
                    lang="en",
                    sleep_interval=random.uniform(5, 10),  # library's own inter-request delay
                    timeout=30
                ))

                for result in chunk_results:
                    search_results.append({
                        'title': result.title,
                        'link': result.url,
                        'snippet': result.description
                    })

            except Exception as e:
                logger.warning(f"Error in search chunk {i}: {e}")
                continue

        return search_results

    except Exception as e:
        logger.error(f"Google search error: {e}")
        raise  # propagate so the caller can fall back to another engine
 
 
 
143
 
144
  def get_news_results(self, query: str, num_results: int = 5) -> List[Dict]:
145
  """Get news articles using NewsAPI with retry and delay."""
 
216
  return None
217
 
218
def get_random_user_agent(self) -> str:
    """Return a randomly chosen User-Agent string (via fake-useragent)."""
    ua_source = self.ua
    return ua_source.random
221
+
222
def create_session(self) -> requests.Session:
    """Create an HTTP session with a random User-Agent and a rotated proxy.

    Headers mimic a regular browser navigation to reduce bot detection.
    Relies on setup_proxies() having populated self.proxy_cycle.
    """
    session = requests.Session()
    session.headers.update({
        'User-Agent': self.get_random_user_agent(),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0'
    })
    # BUGFIX: self.get_next_proxy() was never defined anywhere in the
    # file (setup_proxies only builds proxy_cycle), so every call raised
    # AttributeError. Rotate directly from the cycle instead; requests
    # expects a scheme -> proxy-URL mapping.
    proxy_url = next(self.proxy_cycle)
    session.proxies = {'http': proxy_url, 'https': proxy_url}
    return session
241
+
242
+ @retry(
243
+ wait=wait_exponential(multiplier=1, min=4, max=20),
244
+ stop=stop_after_attempt(3),
245
+ reraise=True
246
+ )
247
 
248
  def get_content_and_summary(self, request: str, item: Dict, source_type: str) -> Dict:
249
  """Get content and generate summary for a single item."""
 
350
  logger.error(f"Search error: {e}")
351
  return []
352
 
353
+ def process_all_results(self, request, raindrop_results: List[Dict],
354
  google_results: List[Dict],
355
  news_results: List[Dict]) -> Tuple[List[Dict], List[Dict], List[Dict]]:
356
  """Process and enrich all results with content and summaries."""
357
 
358
  processed_raindrop = []
359
  for item in raindrop_results:
360
+ processed_item = self.get_content_and_summary(request, item, 'raindrop')
361
  if processed_item.get('detailed_summary'):
362
  processed_raindrop.append(processed_item)
363
  # Add delay between processing items
 
365
 
366
  processed_google = []
367
  for item in google_results:
368
+ processed_item = self.get_content_and_summary(request, item, 'google')
369
  if processed_item.get('detailed_summary'):
370
  processed_google.append(processed_item)
371
  # Add delay between processing items
 
373
 
374
  processed_news = []
375
  for item in news_results:
376
+ processed_item = self.get_content_and_summary(request, item, 'news')
377
  if processed_item.get('detailed_summary'):
378
  processed_news.append(processed_item)
379
  # Add delay between processing items
 
530
 
531
  # Process all results to get content and summaries
532
  processed_results = self.process_all_results(
533
+ user_request, raindrop_results, google_results, news_results
534
  )
535
 
536
  # Generate essay-style analysis