Create utils/scraper.py
Browse files- utils/scraper.py +113 -0
utils/scraper.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# utils/scraper.py
|
| 2 |
+
"""
|
| 3 |
+
Yahoo Finance news scraper
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import requests
|
| 7 |
+
from bs4 import BeautifulSoup
|
| 8 |
+
import yfinance as yf
|
| 9 |
+
from typing import List, Dict
|
| 10 |
+
import time
|
| 11 |
+
from config import USER_AGENT
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class YahooFinanceScraper:
    """Scrape news articles from Yahoo Finance.

    Article metadata is read from ``yfinance``. When an item carries no
    summary of its own, the article page is downloaded and a short
    description is extracted from its HTML.
    """

    def __init__(self):
        # Headers sent with every article-page request in _extract_summary.
        self.headers = {
            'User-Agent': USER_AGENT
        }

    def get_stock_news(self, symbol: str, max_articles: int = 10) -> List[Dict]:
        """
        Get news articles for a specific stock symbol

        Args:
            symbol: Stock ticker symbol (e.g., 'AAPL')
            max_articles: Maximum number of articles to retrieve; values
                below 1 yield an empty list

        Returns:
            List of dictionaries containing article information (keys:
            title, publisher, link, publish_time, type, thumbnail, summary).
            An empty list is returned if fetching the news fails.
        """
        try:
            # Use yfinance to get news
            ticker = yf.Ticker(symbol)
            # Treat a missing/None feed as "no news" instead of failing.
            news = ticker.news or []

            # Clamp so a negative limit cannot slice from the end of the list.
            limit = max(max_articles, 0)
            return [
                self._build_article(item)
                for item in news[:limit]
                if isinstance(item, dict)
            ]

        except Exception as e:
            # Best-effort boundary: report the problem and return no articles.
            print(f"Error fetching news for {symbol}: {str(e)}")
            return []

    def _build_article(self, item: Dict) -> Dict:
        """
        Convert one raw news item into the article dictionary

        Args:
            item: Raw news item dictionary

        Returns:
            Article dictionary including a 'summary' entry
        """
        article = {
            'title': item.get('title', ''),
            'publisher': item.get('publisher', 'Unknown'),
            'link': item.get('link', ''),
            'publish_time': item.get('providerPublishTime', 0),
            'type': item.get('type', 'STORY'),
            'thumbnail': self._thumbnail_url(item),
        }

        # Prefer the summary shipped with the item; no request is needed then.
        if 'summary' in item:
            article['summary'] = item['summary']
            return article

        article['summary'] = self._extract_summary(article['link'])
        if article['link']:
            # Be polite to the server, but only when a page was requested.
            time.sleep(0.5)
        return article

    @staticmethod
    def _thumbnail_url(item: Dict) -> str:
        """
        Return the URL of the first thumbnail resolution

        Args:
            item: Raw news item dictionary

        Returns:
            Thumbnail URL, or '' when the item has no usable thumbnail
        """
        thumbnail = item.get('thumbnail')
        if not isinstance(thumbnail, dict):
            return ''

        # 'resolutions' may be absent, None or an empty list.
        resolutions = thumbnail.get('resolutions') or []
        if not resolutions or not isinstance(resolutions[0], dict):
            return ''
        return resolutions[0].get('url', '')

    def _extract_summary(self, url: str) -> str:
        """
        Extract article summary from URL

        Tries, in order: the ``description`` meta tag, the
        ``og:description`` meta tag, then the first paragraph longer than
        50 characters (truncated to 300 characters).

        Args:
            url: Article URL

        Returns:
            Article summary text, "No summary available" when the page has
            no usable text, or "Could not extract summary" when the page
            cannot be fetched or parsed
        """
        if not url:
            # Nothing to fetch; same result a failed request would produce.
            return "Could not extract summary"

        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            # Do not mistake an HTTP error page for the article itself.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Try to find meta description
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            if meta_desc and meta_desc.get('content'):
                return meta_desc['content']

            # Try to find og:description
            og_desc = soup.find('meta', attrs={'property': 'og:description'})
            if og_desc and og_desc.get('content'):
                return og_desc['content']

            # Fallback to first paragraph
            for p in soup.find_all('p'):
                text = p.get_text().strip()
                if len(text) > 50:
                    return text[:300]

            return "No summary available"

        except Exception as e:
            print(f"Error extracting summary: {str(e)}")
            return "Could not extract summary"

    def get_market_news(self, max_articles: int = 10) -> List[Dict]:
        """
        Get general market news from Yahoo Finance

        Args:
            max_articles: Maximum number of articles to retrieve

        Returns:
            List of dictionaries containing article information
        """
        # For general market news, use popular index symbols
        return self.get_stock_news("^GSPC", max_articles)  # S&P 500
|