# utils/scraper.py
"""
Yahoo Finance news scraper
"""

import requests
from bs4 import BeautifulSoup
import yfinance as yf
from typing import List, Dict
import time
from config import USER_AGENT


class YahooFinanceScraper:
    """Collect news articles for ticker symbols via yfinance.

    Article metadata comes from ``yfinance.Ticker.news``. When a feed item
    carries no ``summary`` key, the article page itself is fetched with
    ``requests`` and a short summary is derived from its HTML.
    """

    # Pause (seconds) after each article page fetched for a summary.
    _REQUEST_DELAY_SECONDS = 0.5

    def __init__(self):
        # Headers sent with every article-page request.
        self.headers = {
            'User-Agent': USER_AGENT
        }

    @staticmethod
    def _thumbnail_url(item: Dict) -> str:
        """Return the URL of the first thumbnail resolution, or '' if absent.

        Tolerates a missing/empty ``thumbnail`` entry and a missing, ``None``
        or empty ``resolutions`` list, so one malformed item cannot abort the
        whole news fetch.
        """
        thumbnail = item.get('thumbnail')
        if not thumbnail:
            return ''
        # `or [{}]` covers both an absent key and an empty/None list.
        resolutions = thumbnail.get('resolutions') or [{}]
        return resolutions[0].get('url', '')

    def get_stock_news(self, symbol: str, max_articles: int = 10) -> List[Dict]:
        """
        Get news articles for a specific stock symbol

        Args:
            symbol: Stock ticker symbol (e.g., 'AAPL')
            max_articles: Maximum number of articles to retrieve; values
                below zero are treated as zero

        Returns:
            List of dictionaries with the keys 'title', 'publisher', 'link',
            'publish_time', 'type', 'thumbnail' and 'summary'. An empty list
            is returned when there is no news or when fetching fails (the
            error is printed, not raised).
        """
        try:
            # Use yfinance to get news
            ticker = yf.Ticker(symbol)
            # Treat a missing/None feed as "no news" rather than an error.
            news = ticker.news or []

            # A negative limit would otherwise slice from the end of the list.
            limit = max_articles
            if limit is not None and limit < 0:
                limit = 0

            articles = []
            for item in news[:limit]:
                article = {
                    'title': item.get('title', ''),
                    'publisher': item.get('publisher', 'Unknown'),
                    'link': item.get('link', ''),
                    'publish_time': item.get('providerPublishTime', 0),
                    'type': item.get('type', 'STORY'),
                    'thumbnail': self._thumbnail_url(item),
                }

                # Prefer the summary shipped with the feed item; otherwise
                # derive one from the article page.
                if 'summary' in item:
                    article['summary'] = item['summary']
                else:
                    article['summary'] = self._extract_summary(article['link'])
                    if article['link']:
                        # Be polite to the server: only pause when a page
                        # request was actually issued.
                        time.sleep(self._REQUEST_DELAY_SECONDS)

                articles.append(article)

            return articles

        except Exception as e:
            # Best-effort API: report the failure and return no articles.
            print(f"Error fetching news for {symbol}: {str(e)}")
            return []

    def _extract_summary(self, url: str) -> str:
        """
        Extract article summary from URL

        Tries, in order: the ``<meta name="description">`` tag, the
        ``og:description`` meta tag, then the first paragraph longer than
        50 characters (truncated to 300 characters).

        Args:
            url: Article URL

        Returns:
            Article summary text, "No summary available" when the page has
            no usable text, or "Could not extract summary" when the URL is
            empty or the page could not be fetched/parsed.
        """
        if not url:
            # Nothing to fetch; avoid a guaranteed request failure.
            return "Could not extract summary"

        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            # Do not mistake an HTTP error page's description for a summary.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Try to find meta description
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            if meta_desc and meta_desc.get('content'):
                return meta_desc['content']

            # Try to find og:description
            og_desc = soup.find('meta', attrs={'property': 'og:description'})
            if og_desc and og_desc.get('content'):
                return og_desc['content']

            # Fallback to first paragraph
            for p in soup.find_all('p'):
                text = p.get_text().strip()
                if len(text) > 50:
                    return text[:300]

            return "No summary available"

        except Exception as e:
            print(f"Error extracting summary: {str(e)}")
            return "Could not extract summary"

    def get_market_news(self, max_articles: int = 10) -> List[Dict]:
        """
        Get general market news

        Implemented by requesting the news feed of the S&P 500 index
        symbol ("^GSPC") through get_stock_news.

        Args:
            max_articles: Maximum number of articles to retrieve

        Returns:
            List of dictionaries containing article information
        """
        # For general market news, use popular index symbols
        return self.get_stock_news("^GSPC", max_articles)  # S&P 500