# Source: Todlong/utils/scraper.py (author: cwpkd, commit e9f1adf, verified)
# utils/scraper.py
"""
Yahoo Finance news scraper
"""
import requests
from bs4 import BeautifulSoup
import yfinance as yf
from typing import List, Dict
import time
from config import USER_AGENT
class YahooFinanceScraper:
    """Scrape news articles from Yahoo Finance.

    Uses ``yfinance`` for per-ticker news metadata and falls back to
    fetching the article page (via ``requests`` + BeautifulSoup) when no
    inline summary is provided.
    """

    def __init__(self):
        # Browser-like User-Agent so Yahoo does not reject bare requests.
        self.headers = {
            'User-Agent': USER_AGENT
        }

    def get_stock_news(self, symbol: str, max_articles: int = 10) -> List[Dict]:
        """
        Get news articles for a specific stock symbol.

        Args:
            symbol: Stock ticker symbol (e.g., 'AAPL')
            max_articles: Maximum number of articles to retrieve

        Returns:
            List of dictionaries with keys: title, publisher, link,
            publish_time, type, thumbnail, summary. Empty list on error.
        """
        try:
            ticker = yf.Ticker(symbol)
            news = ticker.news or []

            articles = []
            for item in news[:max_articles]:
                # Newer yfinance releases nest the payload under a
                # 'content' key; older ones are flat. Support both.
                # NOTE(review): nested-layout field names follow current
                # yfinance docs — confirm against the installed version.
                content = item.get('content', item)

                article = {
                    'title': content.get('title', ''),
                    'publisher': self._get_publisher(content),
                    'link': self._get_link(content),
                    'publish_time': content.get(
                        'providerPublishTime', content.get('pubDate', 0)),
                    'type': content.get(
                        'type', content.get('contentType', 'STORY')),
                    'thumbnail': self._get_thumbnail(content),
                }

                # Prefer an inline summary; only hit the network (and
                # sleep to be polite) when we must scrape the page.
                summary = content.get('summary') or content.get('description')
                if summary:
                    article['summary'] = summary
                else:
                    article['summary'] = self._extract_summary(article['link'])
                    time.sleep(0.5)  # Be polite to the server

                articles.append(article)

            return articles

        except Exception as e:
            # Best-effort API: report and return an empty result set.
            print(f"Error fetching news for {symbol}: {str(e)}")
            return []

    @staticmethod
    def _get_publisher(content: Dict) -> str:
        """Publisher name from either the nested or the legacy layout."""
        provider = content.get('provider')
        if isinstance(provider, dict):
            return provider.get('displayName', 'Unknown')
        return content.get('publisher', 'Unknown')

    @staticmethod
    def _get_link(content: Dict) -> str:
        """Article URL from either the nested or the legacy layout."""
        canonical = content.get('canonicalUrl')
        if isinstance(canonical, dict):
            return canonical.get('url', '')
        return content.get('link', '')

    @staticmethod
    def _get_thumbnail(content: Dict) -> str:
        """First thumbnail resolution URL, or '' when absent/malformed.

        The original chained ``.get(...)[0]`` raised IndexError when
        'resolutions' was present but empty; guard that case explicitly.
        """
        thumb = content.get('thumbnail')
        if not isinstance(thumb, dict):
            return ''
        resolutions = thumb.get('resolutions') or []
        if not resolutions:
            return ''
        return resolutions[0].get('url', '')

    def _extract_summary(self, url: str) -> str:
        """
        Extract article summary from URL.

        Args:
            url: Article URL (empty string tolerated — no request made)

        Returns:
            Meta/OG description, first substantial paragraph (truncated
            to 300 chars), or a fallback message.
        """
        if not url:
            return "No summary available"
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            # Was missing: without this, 4xx/5xx error pages were parsed
            # as if they were the article.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Prefer the standard meta description, then Open Graph.
            for attrs in ({'name': 'description'},
                          {'property': 'og:description'}):
                tag = soup.find('meta', attrs=attrs)
                if tag and tag.get('content'):
                    return tag['content']

            # Fallback: first paragraph long enough to be real prose.
            for p in soup.find_all('p'):
                text = p.get_text().strip()
                if len(text) > 50:
                    return text[:300]

            return "No summary available"
        except Exception as e:
            print(f"Error extracting summary: {str(e)}")
            return "Could not extract summary"

    def get_market_news(self, max_articles: int = 10) -> List[Dict]:
        """
        Get general market news from Yahoo Finance.

        Args:
            max_articles: Maximum number of articles to retrieve

        Returns:
            List of dictionaries containing article information
        """
        # For general market news, use the S&P 500 index symbol.
        return self.get_stock_news("^GSPC", max_articles)