Shreyas94 committed (verified)
Commit 75acc4f · Parent: aa70df3

Update app.py

Files changed (1): app.py (+152 -68)
app.py CHANGED
@@ -285,99 +285,183 @@ class ContentScraper:
Before:

        return False

    async def scrape_article_fallback(self, url: str) -> Tuple[str, Optional[str]]:
-        """Fallback scraping method using direct HTTP request"""
        try:
            session = await self.get_session()

            # Add random delay to avoid rate limiting
-            await asyncio.sleep(0.5)

            async with session.get(url, allow_redirects=True) as response:
-                if response.status == 200:
-                    html = await response.text()
-                    soup = BeautifulSoup(html, 'html.parser')
-
-                    # Remove script and style elements
-                    for script in soup(["script", "style", "nav", "header", "footer", "aside"]):
-                        script.decompose()
-
-                    # Try to find main content
-                    content_selectors = [
-                        'article', '.article-body', '.entry-content', '.post-content',
-                        '.content', '.main-content', '[data-module="ArticleBody"]',
-                        '.story-body', '.article-content', 'main'
-                    ]

-                    content = ""
-                    for selector in content_selectors:
                        elements = soup.select(selector)
                        if elements:
-                            content = ' '.join(elem.get_text(strip=True) for elem in elements)
-                            if len(content) > 200:  # Minimum content length
-                                break
-
-                    # If no content found, get all paragraph text
-                    if not content or len(content) < 100:
-                        paragraphs = soup.find_all('p')
-                        content = ' '.join(p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 20)

-                    # Try to extract publication date
-                    pub_date = None
-                    date_selectors = [
-                        'time[datetime]', '.published-date', '.post-date',
-                        '.article-date', '[data-testid="timestamp"]'
-                    ]

-                    for selector in date_selectors:
                        date_elem = soup.select_one(selector)
                        if date_elem:
-                            pub_date = date_elem.get('datetime') or date_elem.get_text(strip=True)
-                            break
-
-                    return content[:3000], pub_date  # Limit content length
-                else:
-                    return "", None
        except Exception as e:
-            print(f"Fallback scraping failed for {url}: {e}")
            return "", None

    async def scrape_article(self, url: str) -> Tuple[str, Optional[str]]:
        """Scrape article content with multiple fallback strategies"""
        try:
-            # First, try newspaper3k with custom configuration
            article = Article(url)
-            article.set_config({
-                'browser_user_agent': self.headers['User-Agent'],
-                'request_timeout': 30,
-                'number_threads': 1,
-                'verbose': False,
-                'fetch_images': False,
-                'memoize_articles': False,
-                'use_cached_categories': False
-            })
-
-            # Try newspaper3k first
-            try:
-                article.download()
-                article.parse()
-
-                if article.text and len(article.text.strip()) > 100:
-                    content = article.text.strip()
-                    pub_date = article.publish_date.isoformat() if article.publish_date else None
-                    return content[:3000], pub_date
-            except Exception as e:
-                print(f"Newspaper3k failed for {url}: {e}")

-            # If newspaper3k fails or domain is blocked, try fallback
            content, pub_date = await self.scrape_article_fallback(url)
            if content and len(content.strip()) > 50:
                return content, pub_date
-
-            return "", None
-
        except Exception as e:
-            print(f"All scraping methods failed for {url}: {e}")
-            return "", None

    async def scrape_multiple(self, search_results: List[SearchResult], max_successful: int = None) -> List[SearchResult]:
        """Scrape multiple articles with robust error handling and retry logic"""
 
After:

        return False

    async def scrape_article_fallback(self, url: str) -> Tuple[str, Optional[str]]:
+        """Enhanced fallback scraping method using direct HTTP request"""
        try:
            session = await self.get_session()

            # Add random delay to avoid rate limiting
+            await asyncio.sleep(0.2)

            async with session.get(url, allow_redirects=True) as response:
+                if response.status != 200:
+                    return "", None
+
+                html = await response.text()
+                soup = BeautifulSoup(html, 'html.parser')
+
+                # Remove unwanted elements
+                for unwanted in soup(["script", "style", "nav", "header", "footer", "aside", "iframe", "noscript"]):
+                    unwanted.decompose()
+
+                # Try multiple content extraction strategies
+                content = ""
+
+                # Strategy 1: Look for common article content containers
+                content_selectors = [
+                    # Generic selectors
+                    'article', '[role="main"]', 'main', '.main-content', '.content',
+                    # News-specific selectors
+                    '.story-body', '.article-body', '.entry-content', '.post-content',
+                    '.article-content', '.story-content', '.news-content',
+                    # Site-specific selectors
+                    '[data-module="ArticleBody"]', '.RichTextStoryBody', '.InlineVideo',
+                    '.zone-content', '.field-name-body', '.story-text',
+                    # CNN specific
+                    '.zn-body__paragraph', '.zn-body-text',
+                    # Fox News specific
+                    '.article-body', '.article-text',
+                    # NBC specific
+                    '.articleText', '.inline-story-content',
+                    # AP News specific
+                    '.Article', '.RichTextStoryBody',
+                    # BBC specific
+                    '[data-component="text-block"]', '.ssrcss-1q0x1qg-Paragraph',
+                    # Generic fallbacks
+                    '.text', '.body', '[class*="content"]', '[class*="article"]', '[class*="story"]'
+                ]
+
+                for selector in content_selectors:
+                    try:
                        elements = soup.select(selector)
                        if elements:
+                            texts = []
+                            for elem in elements:
+                                text = elem.get_text(separator=' ', strip=True)
+                                if len(text) > 50:  # Only meaningful content
+                                    texts.append(text)
+
+                            if texts:
+                                content = ' '.join(texts)
+                                if len(content) > 200:  # Good content found
+                                    break
+                    except:
+                        continue
+
+                # Strategy 2: If no structured content, get all paragraphs
+                if not content or len(content) < 100:
+                    paragraphs = soup.find_all('p')
+                    p_texts = []
+                    for p in paragraphs:
+                        text = p.get_text(strip=True)
+                        # Filter out short paragraphs, likely navigation/ads
+                        if len(text) > 30 and not any(skip in text.lower() for skip in
+                            ['cookie', 'advertisement', 'subscribe', 'newsletter',
+                             'follow us', 'social media', 'share this']):
+                            p_texts.append(text)

+                    if p_texts:
+                        content = ' '.join(p_texts)
+
+                # Strategy 3: Extract from divs with text content
+                if not content or len(content) < 100:
+                    divs = soup.find_all('div')
+                    div_texts = []
+                    for div in divs:
+                        # Only direct text, not nested
+                        text = div.get_text(separator=' ', strip=True)
+                        if 100 < len(text) < 1000:  # Reasonable paragraph length
+                            # Check if it's likely article content
+                            if any(word in text.lower() for word in ['said', 'according', 'reported', 'stated', 'announced']):
+                                div_texts.append(text)

+                    if div_texts:
+                        content = ' '.join(div_texts[:3])  # Take first 3 relevant divs
+
+                # Try to extract publication date
+                pub_date = None
+                date_selectors = [
+                    'time[datetime]', '[datetime]',
+                    '.published-date', '.post-date', '.article-date',
+                    '.timestamp', '.date', '.publish-date',
+                    '[data-testid="timestamp"]', '.byline-timestamp',
+                    '.story-date', '.news-date'
+                ]
+
+                for selector in date_selectors:
+                    try:
                        date_elem = soup.select_one(selector)
                        if date_elem:
+                            pub_date = (date_elem.get('datetime') or
+                                        date_elem.get('content') or
+                                        date_elem.get_text(strip=True))
+                            if pub_date:
+                                break
+                    except:
+                        continue
+
+                # Clean and limit content
+                if content:
+                    # Remove excessive whitespace
+                    content = ' '.join(content.split())
+                    # Limit length
+                    content = content[:3000]
+
+                return content, pub_date
+
        except Exception as e:
+            print(f"Enhanced fallback scraping failed for {url}: {str(e)[:100]}...")
            return "", None

    async def scrape_article(self, url: str) -> Tuple[str, Optional[str]]:
        """Scrape article content with multiple fallback strategies"""
+        content = ""
+        pub_date = None
+
+        # Method 1: Try newspaper3k first (simple approach)
        try:
            article = Article(url)
+            article.download()
+            article.parse()

+            if article.text and len(article.text.strip()) > 100:
+                content = article.text.strip()[:3000]
+                pub_date = article.publish_date.isoformat() if article.publish_date else None
+                return content, pub_date
+
+        except Exception as e:
+            print(f"Newspaper3k failed for {url}: {str(e)[:100]}...")
+
+        # Method 2: Fallback to direct HTTP scraping
+        try:
            content, pub_date = await self.scrape_article_fallback(url)
            if content and len(content.strip()) > 50:
                return content, pub_date
        except Exception as e:
+            print(f"Fallback scraping failed for {url}: {str(e)[:100]}...")
+
+        # Method 3: Last resort - try to get at least the title/snippet
+        try:
+            session = await self.get_session()
+            async with session.get(url, allow_redirects=True) as response:
+                if response.status == 200:
+                    html = await response.text()
+                    soup = BeautifulSoup(html, 'html.parser')
+
+                    # Get at least the title and meta description
+                    title = soup.find('title')
+                    title_text = title.get_text().strip() if title else ""
+
+                    meta_desc = soup.find('meta', attrs={'name': 'description'})
+                    desc_text = meta_desc.get('content', '').strip() if meta_desc else ""
+
+                    if title_text or desc_text:
+                        content = f"{title_text}. {desc_text}".strip()
+                        return content, None
+
+        except Exception as e:
+            print(f"Last resort scraping failed for {url}: {str(e)[:100]}...")
+
+        return "", None

    async def scrape_multiple(self, search_results: List[SearchResult], max_successful: int = None) -> List[SearchResult]:
        """Scrape multiple articles with robust error handling and retry logic"""