File size: 3,824 Bytes
e9f1adf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# utils/scraper.py
"""
Yahoo Finance news scraper
"""

import requests
from bs4 import BeautifulSoup
import yfinance as yf
from typing import List, Dict
import time
from config import USER_AGENT


class YahooFinanceScraper:
    """Scrape news articles from Yahoo Finance.

    Headlines are read from yfinance's ``Ticker.news``. When a news item
    carries no ``summary`` field, the article page itself is fetched and a
    short description is extracted from its HTML.
    """

    # Seconds to pause after each article-page request (be polite to the server).
    REQUEST_DELAY = 0.5
    # Timeout, in seconds, for each article-page request.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.headers = {
            'User-Agent': USER_AGENT
        }

    def get_stock_news(self, symbol: str, max_articles: int = 10) -> List[Dict]:
        """
        Get news articles for a specific stock symbol

        Args:
            symbol: Stock ticker symbol (e.g., 'AAPL')
            max_articles: Maximum number of articles to retrieve; zero or a
                negative value yields an empty list

        Returns:
            List of dictionaries with the keys 'title', 'publisher', 'link',
            'publish_time', 'type', 'thumbnail' and 'summary'. An empty list
            is returned when the lookup fails (the error is printed).
        """
        # A negative bound would otherwise slice items off the *end* of the
        # list instead of limiting the count. ``None`` keeps meaning "no limit".
        if max_articles is not None and max_articles <= 0:
            return []

        try:
            # Use yfinance to get news; treat a missing/None result as empty.
            news = yf.Ticker(symbol).news or []

            articles = []
            for item in news[:max_articles]:
                # Skip entries that are not dict-like rather than losing
                # every article already collected.
                if not isinstance(item, dict):
                    continue
                articles.append(self._build_article(item))

            return articles

        except Exception as e:
            # Best-effort boundary: callers expect a list, never an exception.
            print(f"Error fetching news for {symbol}: {e}")
            return []

    def _build_article(self, item: Dict) -> Dict:
        """
        Convert one raw news item into the article dictionary

        Args:
            item: Raw news item as returned by yfinance

        Returns:
            Article dictionary including a 'summary' entry
        """
        article = {
            'title': item.get('title', ''),
            'publisher': item.get('publisher', 'Unknown'),
            'link': item.get('link', ''),
            'publish_time': item.get('providerPublishTime', 0),
            'type': item.get('type', 'STORY'),
            'thumbnail': self._thumbnail_url(item),
        }

        # Prefer the summary shipped with the item; scrape only as a fallback.
        if 'summary' in item:
            article['summary'] = item['summary']
        else:
            article['summary'] = self._extract_summary(article['link'])
            # Throttle only when an HTTP request was actually attempted.
            if article['link']:
                time.sleep(self.REQUEST_DELAY)

        return article

    @staticmethod
    def _thumbnail_url(item: Dict) -> str:
        """
        Return the URL of the first thumbnail resolution of a news item

        Args:
            item: Raw news item as returned by yfinance

        Returns:
            Thumbnail URL, or '' when the item has no usable thumbnail
        """
        thumbnail = item.get('thumbnail') or {}
        # An empty or missing 'resolutions' list must not raise IndexError.
        resolutions = thumbnail.get('resolutions') or [{}]
        return resolutions[0].get('url', '')

    def _extract_summary(self, url: str) -> str:
        """
        Extract article summary from URL

        Tries the ``description`` meta tag, then ``og:description``, then the
        first paragraph longer than 50 characters (truncated to 300).

        Args:
            url: Article URL

        Returns:
            Article summary text, "No summary available" when the page has
            nothing usable, or "Could not extract summary" when the page
            could not be fetched or parsed
        """
        # Nothing to fetch; avoid a guaranteed-to-fail request.
        if not url:
            return "Could not extract summary"

        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            # Do not scrape a "summary" out of a 4xx/5xx error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Meta description first, then the Open Graph description.
            for attrs in ({'name': 'description'}, {'property': 'og:description'}):
                tag = soup.find('meta', attrs=attrs)
                if tag and tag.get('content'):
                    return tag['content']

            # Fallback to the first reasonably long paragraph.
            for p in soup.find_all('p'):
                text = p.get_text().strip()
                if len(text) > 50:
                    return text[:300]

            return "No summary available"

        except Exception as e:
            print(f"Error extracting summary: {e}")
            return "Could not extract summary"

    def get_market_news(self, max_articles: int = 10) -> List[Dict]:
        """
        Get general market news

        The news feed of the S&P 500 index symbol ('^GSPC') is used as a
        proxy for general market news.

        Args:
            max_articles: Maximum number of articles to retrieve

        Returns:
            List of dictionaries containing article information
        """
        return self.get_stock_news("^GSPC", max_articles)  # S&P 500