Shreyas94 committed · verified
Commit aa70df3 · 1 Parent(s): c051a22

Update app.py

Files changed (1)
  1. app.py +300 -65
app.py CHANGED
@@ -62,22 +62,36 @@ class SearchEngineInterface:
     def __init__(self):
         self.session = None
         self.headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.5',
-            'Accept-Encoding': 'gzip, deflate',
             'Connection': 'keep-alive',
         }

     async def get_session(self):
-        """Get or create aiohttp session"""
-        if self.session is None:
-            connector = aiohttp.TCPConnector(limit=10)
-            timeout = aiohttp.ClientTimeout(total=30)
             self.session = aiohttp.ClientSession(
                 headers=self.headers,
                 connector=connector,
-                timeout=timeout
             )
         return self.session
@@ -209,57 +223,224 @@ class SearchEngineInterface:
         return []

     async def close(self):
-        """Close the session"""
-        if self.session:
             await self.session.close()

 class ContentScraper:
-    """Scrape and parse article content using newspaper3k"""

     def __init__(self):
         self.session = None

     async def get_session(self):
-        """Get or create aiohttp session"""
-        if self.session is None:
-            connector = aiohttp.TCPConnector(limit=20)
-            timeout = aiohttp.ClientTimeout(total=30)
             self.session = aiohttp.ClientSession(
                 connector=connector,
-                timeout=timeout
             )
         return self.session

     async def scrape_article(self, url: str) -> Tuple[str, Optional[str]]:
-        """Scrape article content and publication date"""
         try:
-            # Use newspaper3k for article extraction
             article = Article(url)
-            article.download()
-            article.parse()
-
-            content = article.text
-            pub_date = article.publish_date.isoformat() if article.publish_date else None

-            return content, pub_date
         except Exception as e:
-            print(f"Error scraping {url}: {e}")
             return "", None

-    async def scrape_multiple(self, search_results: List[SearchResult]) -> List[SearchResult]:
-        """Scrape multiple articles in parallel"""
         tasks = []
         for result in search_results:
-            tasks.append(self.scrape_article(result.url))

-        scraped_data = await asyncio.gather(*tasks, return_exceptions=True)

-        for i, (content, pub_date) in enumerate(scraped_data):
-            if not isinstance(content, Exception):
-                search_results[i].content = content
-                search_results[i].publication_date = pub_date

-        return search_results

     async def close(self):
         """Close the session"""
@@ -475,7 +656,7 @@ class AISearchEngine:
                            temperature: float,
                            max_results: int,
                            max_tokens: int) -> Tuple[str, str]:
-        """Main search and summarization pipeline"""

         start_time = time.time()
         status_updates = []
@@ -500,53 +681,90 @@ class AISearchEngine:
             if not search_tasks:
                 return "No search engines selected", "\n".join(status_updates)

-            search_results_lists = await asyncio.gather(*search_tasks)

-            # Combine and deduplicate results
             all_results = []
             seen_urls = set()

             for results_list in search_results_lists:
-                for result in results_list:
-                    if result.url not in seen_urls:
-                        all_results.append(result)
-                        seen_urls.add(result.url)

             status_updates.append(f"Found {len(all_results)} unique results")

             if not all_results:
-                return "No search results found", "\n".join(status_updates)

-            # Step 3: Content Scraping
             status_updates.append("📄 Scraping article content...")
-            scraped_results = await self.content_scraper.scrape_multiple(all_results[:max_results])

-            # Filter results with content
-            results_with_content = [r for r in scraped_results if r.content.strip()]
-            status_updates.append(f"Successfully scraped {len(results_with_content)} articles")

             # Step 4: Optional Embedding-based Filtering
             if use_embeddings and results_with_content:
                 status_updates.append("🧠 Filtering results using embeddings...")
-                filtered_results = self.embedding_filter.filter_by_relevance(query, results_with_content)
-                status_updates.append(f"Filtered to {len(filtered_results)} most relevant results")
-            else:
-                filtered_results = results_with_content

-            if not filtered_results:
                 return "No relevant results found after filtering", "\n".join(status_updates)

             # Step 5: LLM Summarization
             status_updates.append(f"🤖 Generating summary using {model}...")

-            if model.startswith("Groq"):
-                summary = await self.llm_summarizer.summarize_with_groq(
-                    query, filtered_results, temperature, max_tokens
-                )
-            else:  # OpenRouter
-                summary = await self.llm_summarizer.summarize_with_openrouter(
-                    query, filtered_results, temperature, max_tokens
-                )

             # Add metadata
             end_time = time.time()
@@ -556,7 +774,6 @@ class AISearchEngine:
             metadata += f"- Processing time: {processing_time:.2f} seconds\n"
             metadata += f"- Results found: {len(all_results)}\n"
             metadata += f"- Articles scraped: {len(results_with_content)}\n"
-            metadata += f"- Results used for summary: {len(filtered_results)}\n"
             metadata += f"- Search engines: {', '.join(search_engines)}\n"
             metadata += f"- Model: {model}\n"
             metadata += f"- Embeddings used: {use_embeddings}\n"
@@ -572,9 +789,27 @@ class AISearchEngine:
             return error_msg, "\n".join(status_updates)

         finally:
-            # Cleanup
-            await self.search_interface.close()
-            await self.content_scraper.close()

 # Global search engine instance
 search_engine = None
 
@@ -62,22 +62,36 @@ class SearchEngineInterface:
     def __init__(self):
         self.session = None
         self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.9',
+            'Accept-Encoding': 'gzip, deflate, br',
             'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1',
+            'Sec-Fetch-Dest': 'document',
+            'Sec-Fetch-Mode': 'navigate',
+            'Sec-Fetch-Site': 'none',
+            'Sec-Fetch-User': '?1',
+            'Cache-Control': 'max-age=0',
         }

     async def get_session(self):
+        """Get or create aiohttp session with better configuration"""
+        if self.session is None or self.session.closed:
+            connector = aiohttp.TCPConnector(
+                limit=20,
+                limit_per_host=5,
+                ttl_dns_cache=300,
+                use_dns_cache=True,
+                keepalive_timeout=30,
+                enable_cleanup_closed=True
+            )
+            timeout = aiohttp.ClientTimeout(total=45, connect=15, sock_read=30)
             self.session = aiohttp.ClientSession(
                 headers=self.headers,
                 connector=connector,
+                timeout=timeout,
+                trust_env=True
             )
         return self.session
 
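A minimal usage sketch for the lazy, self-healing session above, assuming the imports (aiohttp, asyncio) already present in app.py; the demo URL is a placeholder:

    import asyncio

    async def demo():
        engine = SearchEngineInterface()
        try:
            session = await engine.get_session()
            # Repeated calls reuse the same live session instead of reconnecting
            assert session is await engine.get_session()
            async with session.get('https://example.com') as resp:
                print(resp.status)
        finally:
            await engine.close()

    asyncio.run(demo())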
 
@@ -209,57 +223,224 @@ class SearchEngineInterface:
         return []

     async def close(self):
+        """Close the session safely"""
+        if self.session and not self.session.closed:
             await self.session.close()
+            # Wait a bit for the underlying connections to close
+            await asyncio.sleep(0.1)
 
 class ContentScraper:
+    """Scrape and parse article content using newspaper3k with robust error handling"""

     def __init__(self):
         self.session = None
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.9',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1',
+            'Sec-Fetch-Dest': 'document',
+            'Sec-Fetch-Mode': 'navigate',
+            'Sec-Fetch-Site': 'cross-site',
+            'Sec-Fetch-User': '?1',
+            'Cache-Control': 'no-cache',
+            'Pragma': 'no-cache'
+        }
+        # Domains known to block scrapers - we'll handle these differently
+        self.blocked_domains = {
+            'bloomberg.com', 'wsj.com', 'ft.com', 'nytimes.com',
+            'washingtonpost.com', 'economist.com', 'reuters.com'
+        }
 
     async def get_session(self):
+        """Get or create aiohttp session with robust configuration"""
+        if self.session is None or self.session.closed:
+            connector = aiohttp.TCPConnector(
+                limit=30,
+                limit_per_host=10,
+                ttl_dns_cache=300,
+                use_dns_cache=True,
+                keepalive_timeout=60,
+                enable_cleanup_closed=True,
+                ssl=False  # Disable SSL verification for problematic sites
+            )
+            timeout = aiohttp.ClientTimeout(total=60, connect=20, sock_read=40)
             self.session = aiohttp.ClientSession(
+                headers=self.headers,
                 connector=connector,
+                timeout=timeout,
+                trust_env=True
             )
         return self.session
 
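Note that `ssl=False` on the connector disables certificate verification for every request this scraper makes. A narrower alternative under the same aiohttp API, sketched here with a placeholder URL, is to keep a verifying connector and relax SSL per request:

    import aiohttp
    import asyncio

    async def fetch_lenient(url: str) -> str:
        # Certificates are verified by default; ssl=False relaxes it for this request only
        async with aiohttp.ClientSession() as session:
            async with session.get(url, ssl=False) as resp:
                return await resp.text()

    html = asyncio.run(fetch_lenient('https://example.com'))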
+    def is_blocked_domain(self, url: str) -> bool:
+        """Check if domain is known to block scrapers"""
+        from urllib.parse import urlparse
+        try:
+            domain = urlparse(url).netloc.lower()
+            return any(blocked in domain for blocked in self.blocked_domains)
+        except Exception:
+            return False
+
+    async def scrape_article_fallback(self, url: str) -> Tuple[str, Optional[str]]:
+        """Fallback scraping method using direct HTTP request"""
+        try:
+            session = await self.get_session()
+
+            # Brief delay between requests to avoid rate limiting
+            await asyncio.sleep(0.5)
+
+            async with session.get(url, allow_redirects=True) as response:
+                if response.status == 200:
+                    html = await response.text()
+                    soup = BeautifulSoup(html, 'html.parser')
+
+                    # Remove script and style elements
+                    for script in soup(["script", "style", "nav", "header", "footer", "aside"]):
+                        script.decompose()
+
+                    # Try to find main content
+                    content_selectors = [
+                        'article', '.article-body', '.entry-content', '.post-content',
+                        '.content', '.main-content', '[data-module="ArticleBody"]',
+                        '.story-body', '.article-content', 'main'
+                    ]
+
+                    content = ""
+                    for selector in content_selectors:
+                        elements = soup.select(selector)
+                        if elements:
+                            content = ' '.join(elem.get_text(strip=True) for elem in elements)
+                            if len(content) > 200:  # Minimum content length
+                                break
+
+                    # If no content found, get all paragraph text
+                    if not content or len(content) < 100:
+                        paragraphs = soup.find_all('p')
+                        content = ' '.join(p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 20)
+
+                    # Try to extract publication date
+                    pub_date = None
+                    date_selectors = [
+                        'time[datetime]', '.published-date', '.post-date',
+                        '.article-date', '[data-testid="timestamp"]'
+                    ]
+
+                    for selector in date_selectors:
+                        date_elem = soup.select_one(selector)
+                        if date_elem:
+                            pub_date = date_elem.get('datetime') or date_elem.get_text(strip=True)
+                            break
+
+                    return content[:3000], pub_date  # Limit content length
+                else:
+                    return "", None
+        except Exception as e:
+            print(f"Fallback scraping failed for {url}: {e}")
+            return "", None
+
     async def scrape_article(self, url: str) -> Tuple[str, Optional[str]]:
+        """Scrape article content with multiple fallback strategies"""
         try:
+            # First, try newspaper3k with custom configuration
             article = Article(url)
+            # Article has no set_config() method; tune its default Config object instead
+            article.config.browser_user_agent = self.headers['User-Agent']
+            article.config.request_timeout = 30
+            article.config.number_threads = 1
+            article.config.verbose = False
+            article.config.fetch_images = False
+            article.config.memoize_articles = False
+
+            # Try newspaper3k first
+            try:
+                article.download()
+                article.parse()
+
+                if article.text and len(article.text.strip()) > 100:
+                    content = article.text.strip()
+                    pub_date = article.publish_date.isoformat() if article.publish_date else None
+                    return content[:3000], pub_date
+            except Exception as e:
+                print(f"Newspaper3k failed for {url}: {e}")
+
+            # If newspaper3k fails or domain is blocked, try fallback
+            content, pub_date = await self.scrape_article_fallback(url)
+            if content and len(content.strip()) > 50:
+                return content, pub_date
+
+            return "", None

         except Exception as e:
+            print(f"All scraping methods failed for {url}: {e}")
             return "", None
 
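The same tuning can equivalently be built up front with newspaper3k's documented Config object and passed to Article; a minimal sketch, assuming newspaper3k is installed (the URL and user agent are placeholders):

    from newspaper import Article, Config

    config = Config()
    config.browser_user_agent = 'Mozilla/5.0 (compatible; ExampleBot)'
    config.request_timeout = 30
    config.fetch_images = False
    config.memoize_articles = False

    article = Article('https://example.com/some-story', config=config)
    article.download()
    article.parse()
    print(article.title, article.publish_date)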
+    async def scrape_multiple(self, search_results: List[SearchResult], max_successful: Optional[int] = None) -> List[SearchResult]:
+        """Scrape multiple articles with robust error handling and retry logic"""
+        if not search_results:
+            return search_results
+
+        max_successful = max_successful or len(search_results)
+        successful_scraped = 0
+        semaphore = asyncio.Semaphore(5)  # Limit concurrent requests
+
+        async def scrape_with_semaphore(result: SearchResult) -> SearchResult:
+            nonlocal successful_scraped
+
+            if successful_scraped >= max_successful:
+                return result
+
+            async with semaphore:
+                try:
+                    # Skip if we already have enough successful results
+                    if successful_scraped >= max_successful:
+                        return result
+
+                    content, pub_date = await self.scrape_article(result.url)
+
+                    if content and len(content.strip()) > 50:
+                        result.content = content
+                        result.publication_date = pub_date
+                        successful_scraped += 1
+                        print(f"✅ Successfully scraped: {result.url[:60]}...")
+                    else:
+                        print(f"⚠️ No content extracted from: {result.url[:60]}...")
+
+                except Exception as e:
+                    print(f"❌ Failed to scrape {result.url[:60]}...: {e}")
+
+            return result
+
+        # Process all URLs but stop when we have enough successful results
         tasks = []
         for result in search_results:
+            if successful_scraped < max_successful:
+                tasks.append(scrape_with_semaphore(result))
+            else:
+                break
+
+        if tasks:
+            scraped_results = await asyncio.gather(*tasks, return_exceptions=True)
+
+            # Filter out exceptions and return successful results
+            valid_results = []
+            for result in scraped_results:
+                if not isinstance(result, Exception):
+                    valid_results.append(result)
+        else:
+            valid_results = search_results

+        # Return results with content first, then others
+        results_with_content = [r for r in valid_results if r.content.strip()]
+        results_without_content = [r for r in valid_results if not r.content.strip()]

+        print(f"📊 Scraping summary: {len(results_with_content)} successful, {len(results_without_content)} failed")

+        return results_with_content + results_without_content
 
     async def close(self):
         """Close the session"""
 
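The concurrency pattern used by scrape_multiple, reduced to a self-contained sketch: a Semaphore caps in-flight coroutines while gather(..., return_exceptions=True) keeps one failure from cancelling the rest. All names and URLs here are placeholders:

    import asyncio

    async def fetch(url: str, sem: asyncio.Semaphore) -> str:
        async with sem:  # at most 5 coroutines run this section at once
            await asyncio.sleep(0.1)  # stand-in for the real network call
            return f"done: {url}"

    async def main():
        sem = asyncio.Semaphore(5)
        urls = [f"https://example.com/{i}" for i in range(20)]
        results = await asyncio.gather(*(fetch(u, sem) for u in urls), return_exceptions=True)
        ok = [r for r in results if not isinstance(r, Exception)]
        print(len(ok), "succeeded")

    asyncio.run(main())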
@@ -475,7 +656,7 @@ class AISearchEngine:
                            temperature: float,
                            max_results: int,
                            max_tokens: int) -> Tuple[str, str]:
+        """Main search and summarization pipeline with robust error handling"""

         start_time = time.time()
         status_updates = []
 
@@ -500,53 +681,90 @@ class AISearchEngine:
             if not search_tasks:
                 return "No search engines selected", "\n".join(status_updates)

+            search_results_lists = await asyncio.gather(*search_tasks, return_exceptions=True)

+            # Combine and deduplicate results, handling exceptions
             all_results = []
             seen_urls = set()

             for results_list in search_results_lists:
+                if not isinstance(results_list, Exception) and results_list:
+                    for result in results_list:
+                        if result.url not in seen_urls and result.url.startswith('http'):
+                            all_results.append(result)
+                            seen_urls.add(result.url)

             status_updates.append(f"Found {len(all_results)} unique results")

             if not all_results:
+                return "No search results found. This might be due to rate limiting or network issues. Please try again.", "\n".join(status_updates)

+            # Step 3: Content Scraping with intelligent retry and fallback
             status_updates.append("📄 Scraping article content...")

+            # Prioritize results and scrape intelligently
+            target_successful = min(max_results, len(all_results))
+            scraped_results = await self.content_scraper.scrape_multiple(
+                all_results[:max_results * 2],  # Try more URLs to ensure we get enough content
+                max_successful=target_successful
+            )
+
+            # Filter results with meaningful content
+            results_with_content = [r for r in scraped_results if r.content.strip() and len(r.content.strip()) > 100]
+            status_updates.append(f"Successfully scraped {len(results_with_content)} articles with meaningful content")
+
+            # If we don't have enough content, try to get some from snippets
+            if len(results_with_content) < 3:
+                status_updates.append("Using search snippets as fallback content...")
+                for result in scraped_results:
+                    if not result.content.strip() and result.snippet.strip():
+                        result.content = result.snippet
+                        results_with_content.append(result)
+                        if len(results_with_content) >= 5:  # Reasonable minimum
+                            break
+
+            if not results_with_content:
+                return "No article content could be extracted. This might be due to anti-bot protections. Please try a different query or try again later.", "\n".join(status_updates)

             # Step 4: Optional Embedding-based Filtering
             if use_embeddings and results_with_content:
                 status_updates.append("🧠 Filtering results using embeddings...")
+                try:
+                    filtered_results = self.embedding_filter.filter_by_relevance(query, results_with_content)
+                    if filtered_results:
+                        results_with_content = filtered_results
+                        status_updates.append(f"Filtered to {len(filtered_results)} most relevant results")
+                    else:
+                        status_updates.append("Embedding filter returned no results, using all scraped content")
+                except Exception as e:
+                    status_updates.append(f"Embedding filtering failed, using all results: {str(e)}")

+            if not results_with_content:
                 return "No relevant results found after filtering", "\n".join(status_updates)

             # Step 5: LLM Summarization
             status_updates.append(f"🤖 Generating summary using {model}...")

+            try:
+                if model.startswith("Groq"):
+                    summary = await self.llm_summarizer.summarize_with_groq(
+                        query, results_with_content, temperature, max_tokens
+                    )
+                else:  # OpenRouter
+                    summary = await self.llm_summarizer.summarize_with_openrouter(
+                        query, results_with_content, temperature, max_tokens
+                    )
+
+                # Check if summarization failed
+                if summary.startswith("Error") or summary.startswith("Groq API error") or summary.startswith("OpenRouter API error"):
+                    # Provide a basic summary from the content
+                    basic_summary = self.create_basic_summary(query, results_with_content)
+                    summary = f"AI summarization failed, but here's what I found:\n\n{basic_summary}\n\n---\n⚠️ Original error: {summary}"
+
+            except Exception as e:
+                # Fallback to basic summary
+                basic_summary = self.create_basic_summary(query, results_with_content)
+                summary = f"AI summarization encountered an error, but here's what I found:\n\n{basic_summary}\n\n---\n⚠️ Error: {str(e)}"

             # Add metadata
             end_time = time.time()
 
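The fan-out step above tolerates a failing engine because gather returns the exception object in place of that engine's result list; a self-contained sketch (R is a stand-in for the app's SearchResult, and both engines are placeholders):

    import asyncio
    from dataclasses import dataclass

    @dataclass
    class R:
        url: str

    async def engine_a():
        return [R("https://a.example/1"), R("https://a.example/2")]

    async def engine_b():
        raise RuntimeError("rate limited")  # one engine failing must not sink the rest

    async def main():
        lists = await asyncio.gather(engine_a(), engine_b(), return_exceptions=True)
        seen, merged = set(), []
        for lst in lists:
            if isinstance(lst, Exception) or not lst:
                continue
            for r in lst:
                if r.url not in seen and r.url.startswith('http'):
                    merged.append(r)
                    seen.add(r.url)
        print([r.url for r in merged])

    asyncio.run(main())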
@@ -556,7 +774,6 @@ class AISearchEngine:
             metadata += f"- Processing time: {processing_time:.2f} seconds\n"
             metadata += f"- Results found: {len(all_results)}\n"
             metadata += f"- Articles scraped: {len(results_with_content)}\n"
             metadata += f"- Search engines: {', '.join(search_engines)}\n"
             metadata += f"- Model: {model}\n"
             metadata += f"- Embeddings used: {use_embeddings}\n"
 
@@ -572,9 +789,27 @@ class AISearchEngine:
             return error_msg, "\n".join(status_updates)

         finally:
+            # Cleanup - sessions are left open so they can be reused across searches
+            pass
+
+    def create_basic_summary(self, query: str, results: List[SearchResult]) -> str:
+        """Create a basic summary when AI summarization fails"""
+        summary_parts = [f"Based on search results for: **{query}**\n"]
+
+        for i, result in enumerate(results[:5], 1):
+            content_preview = result.content[:300] + "..." if len(result.content) > 300 else result.content
+            summary_parts.append(f"**{i}. {result.title}**")
+            summary_parts.append(f"Source: {result.url}")
+            if result.publication_date:
+                summary_parts.append(f"Date: {result.publication_date}")
+            summary_parts.append(f"Content: {content_preview}")
+            summary_parts.append("")
+
+        return "\n".join(summary_parts)
 
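A quick check of the fallback formatter above; StubResult is a hypothetical stand-in mirroring the SearchResult fields the method reads (title, url, content, publication_date), and the unbound call works because the method never touches self:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class StubResult:
        title: str
        url: str
        content: str
        publication_date: Optional[str] = None

    stub = [StubResult("Example headline", "https://example.com/a", "Body text. " * 40, "2024-01-01")]
    print(AISearchEngine.create_basic_summary(None, "test query", stub))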
 # Global search engine instance
 search_engine = None