|
|
|
|
|
""" |
|
|
Yahoo Finance news scraper |
|
|
""" |
|
|
|
|
|
import requests |
|
|
from bs4 import BeautifulSoup |
|
|
import yfinance as yf |
|
|
from typing import List, Dict |
|
|
import time |
|
|
from config import USER_AGENT |
|
|
|
|
|
|
|
|
class YahooFinanceScraper:
    """Scrape news articles from Yahoo Finance."""

    def __init__(self):
        # Headers for the direct HTTP requests made by _extract_summary.
        self.headers = {
            'User-Agent': USER_AGENT
        }

    def get_stock_news(self, symbol: str, max_articles: int = 10) -> List[Dict]:
        """
        Get news articles for a specific stock symbol.

        Handles both the legacy flat yfinance news payload and the newer
        (yfinance >= 0.2.50) format where fields are nested under 'content'.

        Args:
            symbol: Stock ticker symbol (e.g., 'AAPL')
            max_articles: Maximum number of articles to retrieve

        Returns:
            List of dictionaries containing article information
            (title, publisher, link, publish_time, type, thumbnail, summary).
            Returns an empty list on any error.
        """
        try:
            ticker = yf.Ticker(symbol)
            news = ticker.news

            articles = []
            for item in news[:max_articles]:
                # Newer yfinance nests the payload under 'content';
                # older versions expose the fields flat on the item.
                content = item.get('content', item)

                link = content.get('link', '')
                if not link:
                    # New-format items carry the URL under canonicalUrl.url.
                    link = (content.get('canonicalUrl') or {}).get('url', '')

                publisher = content.get('publisher')
                if not publisher:
                    # New-format items use provider.displayName instead.
                    publisher = (content.get('provider') or {}).get('displayName', 'Unknown')

                article = {
                    'title': content.get('title', ''),
                    'publisher': publisher,
                    'link': link,
                    # Legacy: epoch seconds under providerPublishTime;
                    # new format: ISO timestamp string under pubDate.
                    'publish_time': content.get('providerPublishTime', content.get('pubDate', 0)),
                    'type': content.get('type', content.get('contentType', 'STORY')),
                    'thumbnail': self._thumbnail_url(content),
                }

                if 'summary' in content:
                    article['summary'] = content['summary']
                else:
                    article['summary'] = self._extract_summary(article['link'])
                    # Throttle only when we actually hit the network above;
                    # feed-provided summaries need no rate limiting.
                    time.sleep(0.5)

                articles.append(article)

            return articles

        except Exception as e:
            # Best-effort scraper: report and return empty rather than raise.
            print(f"Error fetching news for {symbol}: {str(e)}")
            return []

    @staticmethod
    def _thumbnail_url(content: Dict) -> str:
        """
        Return the first thumbnail resolution URL in *content*, or '' if none.

        Guards against a present-but-empty 'resolutions' list, which would
        raise IndexError under a naive [0] access.
        """
        thumbnail = content.get('thumbnail') or {}
        resolutions = thumbnail.get('resolutions') or []
        if resolutions:
            return resolutions[0].get('url', '')
        return ''

    def _extract_summary(self, url: str) -> str:
        """
        Extract an article summary by fetching and parsing its page.

        Tries, in order: the <meta name="description"> tag, the
        <meta property="og:description"> tag, then the first paragraph
        longer than 50 characters (truncated to 300 chars).

        Args:
            url: Article URL

        Returns:
            Article summary text, or a fallback message when nothing
            usable is found or the fetch fails.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')

            # Preferred: the page's own meta description.
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            if meta_desc and meta_desc.get('content'):
                return meta_desc['content']

            # Fallback: Open Graph description.
            og_desc = soup.find('meta', attrs={'property': 'og:description'})
            if og_desc and og_desc.get('content'):
                return og_desc['content']

            # Last resort: first substantive paragraph of body text.
            for p in soup.find_all('p'):
                text = p.get_text().strip()
                if len(text) > 50:
                    return text[:300]

            return "No summary available"

        except Exception as e:
            # Best-effort: a failed fetch/parse yields a placeholder string.
            print(f"Error extracting summary: {str(e)}")
            return "Could not extract summary"

    def get_market_news(self, max_articles: int = 10) -> List[Dict]:
        """
        Get general market news from Yahoo Finance homepage.

        Args:
            max_articles: Maximum number of articles to retrieve

        Returns:
            List of dictionaries containing article information
        """
        # ^GSPC (S&P 500 index) serves as a proxy feed for broad market news.
        return self.get_stock_news("^GSPC", max_articles)