Spaces:

cwpkd
/

Todlong

Runtime error

App Files Community

cwpkd committed on Nov 3, 2025

Commit

e9f1adf

verified ·

1 Parent(s): 6aab42e

Create utils/scraper.py

Browse files

Files changed (1) hide show

utils/scraper.py +113 -0

utils/scraper.py ADDED Viewed

	@@ -0,0 +1,113 @@

+# utils/scraper.py
+"""
+Yahoo Finance news scraper
+"""
+import requests
+from bs4 import BeautifulSoup
+import yfinance as yf
+from typing import List, Dict
+import time
+from config import USER_AGENT
+class YahooFinanceScraper:
+    """Scrape news articles from Yahoo Finance"""
+    def __init__(self):
+        self.headers = {
+            'User-Agent': USER_AGENT
+        }
+    def get_stock_news(self, symbol: str, max_articles: int = 10) -> List[Dict]:
+        """
+        Get news articles for a specific stock symbol
+        Args:
+            symbol: Stock ticker symbol (e.g., 'AAPL')
+            max_articles: Maximum number of articles to retrieve
+        Returns:
+            List of dictionaries containing article information
+        """
+        try:
+            # Use yfinance to get news
+            ticker = yf.Ticker(symbol)
+            news = ticker.news
+            articles = []
+            for item in news[:max_articles]:
+                article = {
+                    'title': item.get('title', ''),
+                    'publisher': item.get('publisher', 'Unknown'),
+                    'link': item.get('link', ''),
+                    'publish_time': item.get('providerPublishTime', 0),
+                    'type': item.get('type', 'STORY'),
+                    'thumbnail': item.get('thumbnail', {}).get('resolutions', [{}])[0].get('url', '') if item.get('thumbnail') else ''
+                }
+                # Try to get article summary/description
+                if 'summary' in item:
+                    article['summary'] = item['summary']
+                else:
+                    article['summary'] = self._extract_summary(article['link'])
+                articles.append(article)
+                time.sleep(0.5)  # Be polite to the server
+            return articles
+        except Exception as e:
+            print(f"Error fetching news for {symbol}: {str(e)}")
+            return []
+    def _extract_summary(self, url: str) -> str:
+        """
+        Extract article summary from URL
+        Args:
+            url: Article URL
+        Returns:
+            Article summary text
+        """
+        try:
+            response = requests.get(url, headers=self.headers, timeout=10)
+            soup = BeautifulSoup(response.content, 'html.parser')
+            # Try to find meta description
+            meta_desc = soup.find('meta', attrs={'name': 'description'})
+            if meta_desc and meta_desc.get('content'):
+                return meta_desc['content']
+            # Try to find og:description
+            og_desc = soup.find('meta', attrs={'property': 'og:description'})
+            if og_desc and og_desc.get('content'):
+                return og_desc['content']
+            # Fallback to first paragraph
+            paragraphs = soup.find_all('p')
+            if paragraphs:
+                for p in paragraphs:
+                    text = p.get_text().strip()
+                    if len(text) > 50:
+                        return text[:300]
+            return "No summary available"
+        except Exception as e:
+            print(f"Error extracting summary: {str(e)}")
+            return "Could not extract summary"
+    def get_market_news(self, max_articles: int = 10) -> List[Dict]:
+        """
+        Get general market news from Yahoo Finance homepage
+        Args:
+            max_articles: Maximum number of articles to retrieve
+        Returns:
+            List of dictionaries containing article information
+        """
+        # For general market news, use popular index symbols
+        return self.get_stock_news("^GSPC", max_articles)  # S&P 500