Update app.py
app.py
CHANGED
@@ -285,99 +285,183 @@ class ContentScraper:
             return False
 
     async def scrape_article_fallback(self, url: str) -> Tuple[str, Optional[str]]:
-        """
+        """Enhanced fallback scraping method using direct HTTP request"""
         try:
             session = await self.get_session()
 
             # Add random delay to avoid rate limiting
-            await asyncio.sleep(0.
+            await asyncio.sleep(0.2)
 
             async with session.get(url, allow_redirects=True) as response:
-                if response.status
-
-                soup = BeautifulSoup(html, 'html.parser')
-
-                # Remove script and style elements
-                for script in soup(["script", "style", "nav", "header", "footer", "aside"]):
-                    script.decompose()
-
-                # Try to find main content
-                content_selectors = [
-                    'article', '.article-body', '.entry-content', '.post-content',
-                    '.content', '.main-content', '[data-module="ArticleBody"]',
-                    '.story-body', '.article-content', 'main'
-                ]
 
-
-
+                if response.status != 200:
+                    return "", None
+
+                html = await response.text()
+                soup = BeautifulSoup(html, 'html.parser')
+
+                # Remove unwanted elements
+                for unwanted in soup(["script", "style", "nav", "header", "footer", "aside", "iframe", "noscript"]):
+                    unwanted.decompose()
+
+                # Try multiple content extraction strategies
+                content = ""
+
+                # Strategy 1: Look for common article content containers
+                content_selectors = [
+                    # Generic selectors
+                    'article', '[role="main"]', 'main', '.main-content', '.content',
+                    # News-specific selectors
+                    '.story-body', '.article-body', '.entry-content', '.post-content',
+                    '.article-content', '.story-content', '.news-content',
+                    # Site-specific selectors
+                    '[data-module="ArticleBody"]', '.RichTextStoryBody', '.InlineVideo',
+                    '.zone-content', '.field-name-body', '.story-text',
+                    # CNN specific
+                    '.zn-body__paragraph', '.zn-body-text',
+                    # Fox News specific
+                    '.article-body', '.article-text',
+                    # NBC specific
+                    '.articleText', '.inline-story-content',
+                    # AP News specific
+                    '.Article', '.RichTextStoryBody',
+                    # BBC specific
+                    '[data-component="text-block"]', '.ssrcss-1q0x1qg-Paragraph',
+                    # Generic fallbacks
+                    '.text', '.body', '[class*="content"]', '[class*="article"]', '[class*="story"]'
+                ]
+
+                for selector in content_selectors:
+                    try:
                         elements = soup.select(selector)
                         if elements:
-
-
-
-
-
-
-
-
+                            texts = []
+                            for elem in elements:
+                                text = elem.get_text(separator=' ', strip=True)
+                                if len(text) > 50:  # Only meaningful content
+                                    texts.append(text)
+
+                            if texts:
+                                content = ' '.join(texts)
+                                if len(content) > 200:  # Good content found
+                                    break
+                    except:
+                        continue
 
-
-
-
-
-
-
-
+                # Strategy 2: If no structured content, get all paragraphs
+                if not content or len(content) < 100:
+                    paragraphs = soup.find_all('p')
+                    p_texts = []
+                    for p in paragraphs:
+                        text = p.get_text(strip=True)
+                        # Filter out short paragraphs, likely navigation/ads
+                        if len(text) > 30 and not any(skip in text.lower() for skip in
+                            ['cookie', 'advertisement', 'subscribe', 'newsletter',
+                             'follow us', 'social media', 'share this']):
+                            p_texts.append(text)
+
+                    if p_texts:
+                        content = ' '.join(p_texts)
+
+                # Strategy 3: Extract from divs with text content
+                if not content or len(content) < 100:
+                    divs = soup.find_all('div')
+                    div_texts = []
+                    for div in divs:
+                        # Only direct text, not nested
+                        text = div.get_text(separator=' ', strip=True)
+                        if 100 < len(text) < 1000:  # Reasonable paragraph length
+                            # Check if it's likely article content
+                            if any(word in text.lower() for word in ['said', 'according', 'reported', 'stated', 'announced']):
+                                div_texts.append(text)
+
+                    if div_texts:
+                        content = ' '.join(div_texts[:3])  # Take first 3 relevant divs
+
+                # Try to extract publication date
+                pub_date = None
+                date_selectors = [
+                    'time[datetime]', '[datetime]',
+                    '.published-date', '.post-date', '.article-date',
+                    '.timestamp', '.date', '.publish-date',
+                    '[data-testid="timestamp"]', '.byline-timestamp',
+                    '.story-date', '.news-date'
+                ]
+
+                for selector in date_selectors:
+                    try:
                         date_elem = soup.select_one(selector)
                         if date_elem:
-                            pub_date = date_elem.get('datetime') or
-
-
-
-
-
+                            pub_date = (date_elem.get('datetime') or
+                                        date_elem.get('content') or
+                                        date_elem.get_text(strip=True))
+                            if pub_date:
+                                break
+                    except:
+                        continue
+
+                # Clean and limit content
+                if content:
+                    # Remove excessive whitespace
+                    content = ' '.join(content.split())
+                    # Limit length
+                    content = content[:3000]
+
+                return content, pub_date
+
         except Exception as e:
-            print(f"
+            print(f"Enhanced fallback scraping failed for {url}: {str(e)[:100]}...")
             return "", None
 
     async def scrape_article(self, url: str) -> Tuple[str, Optional[str]]:
         """Scrape article content with multiple fallback strategies"""
+        content = ""
+        pub_date = None
+
+        # Method 1: Try newspaper3k first (simple approach)
         try:
-            # First, try newspaper3k with custom configuration
             article = Article(url)
-            article.
-
-                'request_timeout': 30,
-                'number_threads': 1,
-                'verbose': False,
-                'fetch_images': False,
-                'memoize_articles': False,
-                'use_cached_categories': False
-            })
-
-            # Try newspaper3k first
-            try:
-                article.download()
-                article.parse()
-
-                if article.text and len(article.text.strip()) > 100:
-                    content = article.text.strip()
-                    pub_date = article.publish_date.isoformat() if article.publish_date else None
-                    return content[:3000], pub_date
-            except Exception as e:
-                print(f"Newspaper3k failed for {url}: {e}")
+            article.download()
+            article.parse()
 
-
+            if article.text and len(article.text.strip()) > 100:
+                content = article.text.strip()[:3000]
+                pub_date = article.publish_date.isoformat() if article.publish_date else None
+                return content, pub_date
+
+        except Exception as e:
+            print(f"Newspaper3k failed for {url}: {str(e)[:100]}...")
+
+        # Method 2: Fallback to direct HTTP scraping
+        try:
             content, pub_date = await self.scrape_article_fallback(url)
             if content and len(content.strip()) > 50:
                 return content, pub_date
-
-            return "", None
-
         except Exception as e:
-            print(f"
-
+            print(f"Fallback scraping failed for {url}: {str(e)[:100]}...")
+
+        # Method 3: Last resort - try to get at least the title/snippet
+        try:
+            session = await self.get_session()
+            async with session.get(url, allow_redirects=True) as response:
+                if response.status == 200:
+                    html = await response.text()
+                    soup = BeautifulSoup(html, 'html.parser')
+
+                    # Get at least the title and meta description
+                    title = soup.find('title')
+                    title_text = title.get_text().strip() if title else ""
+
+                    meta_desc = soup.find('meta', attrs={'name': 'description'})
+                    desc_text = meta_desc.get('content', '').strip() if meta_desc else ""
+
+                    if title_text or desc_text:
+                        content = f"{title_text}. {desc_text}".strip()
+                        return content, None
+
+        except Exception as e:
+            print(f"Last resort scraping failed for {url}: {str(e)[:100]}...")
+
+        return "", None
 
     async def scrape_multiple(self, search_results: List[SearchResult], max_successful: int = None) -> List[SearchResult]:
         """Scrape multiple articles with robust error handling and retry logic"""