cwpkd committed on
Commit
e9f1adf
Β·
verified Β·
1 Parent(s): 6aab42e

Create utils/scraper.py

Browse files
Files changed (1) hide show
  1. utils/scraper.py +113 -0
utils/scraper.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # utils/scraper.py
2
+ """
3
+ Yahoo Finance news scraper
4
+ """
5
+
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+ import yfinance as yf
9
+ from typing import List, Dict
10
+ import time
11
+ from config import USER_AGENT
12
+
13
+
14
class YahooFinanceScraper:
    """Scrape news articles from Yahoo Finance"""

    def __init__(self):
        # Browser-like headers so Yahoo does not reject requests as a bot.
        self.headers = {
            'User-Agent': USER_AGENT
        }

    @staticmethod
    def _normalize_item(item: Dict) -> Dict:
        """
        Map one raw yfinance news item to a flat article dict.

        Handles both feed schemas:
        * legacy flat items ('title', 'link', 'providerPublishTime', ...);
        * newer yfinance releases, which nest the payload under 'content'
          ('title', 'canonicalUrl.url', 'provider.displayName', 'pubDate').

        Args:
            item: Raw dict as returned by ``yf.Ticker(...).news``.

        Returns:
            Dict with keys: title, publisher, link, publish_time, type,
            thumbnail, summary. 'summary' is '' when the feed item did
            not include one.
        """
        content = item.get('content')
        if isinstance(content, dict):
            # New nested schema.
            thumb = content.get('thumbnail') or {}
            resolutions = thumb.get('resolutions') or []
            return {
                'title': content.get('title', ''),
                'publisher': (content.get('provider') or {}).get('displayName', 'Unknown'),
                'link': (content.get('canonicalUrl') or {}).get('url', ''),
                'publish_time': content.get('pubDate', 0),
                'type': content.get('contentType', 'STORY'),
                # Guard against a present-but-empty resolutions list
                # (the old [0] index raised IndexError in that case).
                'thumbnail': resolutions[0].get('url', '') if resolutions else '',
                'summary': content.get('summary', ''),
            }
        # Legacy flat schema.
        thumb = item.get('thumbnail') or {}
        resolutions = thumb.get('resolutions') or []
        return {
            'title': item.get('title', ''),
            'publisher': item.get('publisher', 'Unknown'),
            'link': item.get('link', ''),
            'publish_time': item.get('providerPublishTime', 0),
            'type': item.get('type', 'STORY'),
            'thumbnail': resolutions[0].get('url', '') if resolutions else '',
            'summary': item.get('summary', ''),
        }

    def get_stock_news(self, symbol: str, max_articles: int = 10) -> List[Dict]:
        """
        Get news articles for a specific stock symbol

        Args:
            symbol: Stock ticker symbol (e.g., 'AAPL')
            max_articles: Maximum number of articles to retrieve

        Returns:
            List of dictionaries containing article information
            (title, publisher, link, publish_time, type, thumbnail,
            summary). Returns an empty list on any failure.
        """
        try:
            # Use yfinance to get news
            ticker = yf.Ticker(symbol)
            news = ticker.news or []

            articles = []
            for item in news[:max_articles]:
                article = self._normalize_item(item)

                # Prefer the summary shipped with the feed item; only
                # scrape the article page when the feed had none.
                if not article['summary']:
                    article['summary'] = self._extract_summary(article['link'])
                    # Be polite to the server — throttle only after an
                    # actual HTTP fetch (the old code slept every loop).
                    time.sleep(0.5)

                articles.append(article)

            return articles

        except Exception as e:
            # Top-level boundary: report and degrade to "no news".
            print(f"Error fetching news for {symbol}: {str(e)}")
            return []

    def _extract_summary(self, url: str) -> str:
        """
        Extract article summary from URL

        Args:
            url: Article URL

        Returns:
            Article summary text, or a placeholder string when the page
            cannot be fetched or contains no usable description.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            # Treat HTTP errors (404/403/...) as failures instead of
            # scraping an error page for a bogus description.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Prefer the page's own meta description, then og:description.
            for attrs in ({'name': 'description'}, {'property': 'og:description'}):
                tag = soup.find('meta', attrs=attrs)
                if tag and tag.get('content'):
                    return tag['content']

            # Fallback: first reasonably long paragraph, truncated.
            for p in soup.find_all('p'):
                text = p.get_text().strip()
                if len(text) > 50:
                    return text[:300]

            return "No summary available"

        except Exception as e:
            print(f"Error extracting summary: {str(e)}")
            return "Could not extract summary"

    def get_market_news(self, max_articles: int = 10) -> List[Dict]:
        """
        Get general market news from Yahoo Finance homepage

        Args:
            max_articles: Maximum number of articles to retrieve

        Returns:
            List of dictionaries containing article information
        """
        # For general market news, use popular index symbols
        return self.get_stock_news("^GSPC", max_articles)  # S&P 500