File size: 7,448 Bytes
cec5d18
 
 
 
34efb38
cec5d18
 
 
 
 
 
3ba06b5
cec5d18
 
 
 
 
 
 
34efb38
cec5d18
 
34efb38
 
cec5d18
 
 
 
34efb38
 
cec5d18
3ba06b5
cec5d18
 
 
 
 
 
 
 
3ba06b5
cec5d18
34efb38
cec5d18
 
34efb38
 
 
 
cec5d18
 
34efb38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05830a7
cec5d18
34efb38
 
 
 
 
 
 
 
 
 
 
 
 
 
cec5d18
 
05830a7
34efb38
 
cec5d18
34efb38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05830a7
34efb38
 
 
 
 
 
 
 
 
 
3ba06b5
34efb38
 
 
 
 
 
 
05830a7
34efb38
 
 
 
cec5d18
 
 
 
 
34efb38
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import asyncio
import glob
import os
import re
import ssl
import xml.etree.ElementTree as ET
from datetime import datetime
from email.utils import parsedate_to_datetime
from urllib.parse import quote_plus

import aiohttp

class NewsScraper:
    """Collect recent news headlines for a ticker from Google News RSS feeds.

    Many search queries are derived from the ticker, the resulting RSS feeds
    are fetched concurrently in batches, and the articles are de-duplicated
    by link until ``limit`` unique articles have been gathered.
    """

    # Number of feed requests issued concurrently per batch.
    _BATCH_SIZE = 20
    # Total time budget, in seconds, for a single feed request.
    _REQUEST_TIMEOUT = 8
    # Google News RSS search endpoint; ``{query}`` must already be URL-encoded.
    _SEARCH_URL = "https://news.google.com/rss/search?q={query}&hl=en-US&gl=US&ceid=US:en"

    # General financial news feeds appended after the ticker-specific searches
    # to pad the article count.
    _GENERAL_FEEDS = (
        "https://news.google.com/rss/headlines/section/topic/BUSINESS?hl=en-US&gl=US&ceid=US:en",
        "https://news.google.com/rss/search?q=stock+market&hl=en-US&gl=US&ceid=US:en",
        "https://news.google.com/rss/search?q=wall+street+today&hl=en-US&gl=US&ceid=US:en",
        "https://news.google.com/rss/search?q=stocks+trading&hl=en-US&gl=US&ceid=US:en",
        "https://news.google.com/rss/search?q=financial+markets&hl=en-US&gl=US&ceid=US:en",
    )

    # Phrases appended to the ticker (after the bare ticker itself) to form
    # the search queries. Order is significant: it is the fetch order.
    _QUERY_SUFFIXES = (
        # Basic queries
        "stock", "news", "market", "earnings",
        "analyst", "forecast", "price target",
        "options", "technical", "dividend",
        "industry", "competitor", "share price",
        "hedge fund", "institutional",
        # Financial action queries
        "buy sell hold", "upgrade downgrade", "outperform underperform",
        "bullish bearish", "momentum", "breakout breakdown",
        "rally crash", "surge plunge", "soar tumble",
        "gains losses", "beat miss expectations",
        # Corporate event queries
        "CEO news", "quarterly results", "revenue profit",
        "guidance outlook", "acquisition merger",
        "lawsuit legal SEC", "insider trading",
        "IPO offering", "buyback repurchase",
        "partnership deal", "product launch",
        "layoffs restructuring", "expansion growth",
        # Analyst and research queries
        "wall street", "Goldman Sachs", "Morgan Stanley",
        "JP Morgan", "analyst rating", "price prediction",
        "short interest", "short squeeze",
        "put call ratio", "unusual activity",
        "fund holdings", "13F filing",
        # Sector and macro queries
        "sector outlook", "industry trend", "supply chain",
        "regulation policy", "inflation impact",
        "interest rate", "trade war tariff",
        "innovation technology", "ESG sustainability",
        # Time-sensitive queries
        "today", "this week", "latest",
        "breaking news", "update",
        "premarket", "after hours",
    )

    def __init__(self, limit=600, verify_ssl=False):
        """Create a scraper.

        Args:
            limit: Maximum number of unique articles ``scrape`` returns.
            verify_ssl: When true, keep the default certificate and hostname
                verification. Defaults to ``False`` to preserve the existing
                behaviour of this class.
        """
        self.limit = limit
        self.ssl_context = ssl.create_default_context()
        if not verify_ssl:
            # NOTE(review): disabling verification exposes the requests to
            # man-in-the-middle tampering; pass verify_ssl=True where the
            # environment's CA store allows it.
            # check_hostname must be cleared before verify_mode can be CERT_NONE.
            self.ssl_context.check_hostname = False
            self.ssl_context.verify_mode = ssl.CERT_NONE

    async def fetch_feed(self, session, url):
        """Fetch ``url`` and return the response body as text.

        Best-effort: returns ``""`` on a non-200 status or on a network,
        timeout or decoding failure.

        Args:
            session: An open ``aiohttp.ClientSession``.
            url: Feed URL to request.
        """
        try:
            timeout = aiohttp.ClientTimeout(total=self._REQUEST_TIMEOUT)
            async with session.get(url, timeout=timeout) as response:
                if response.status == 200:
                    return await response.text()
        except (aiohttp.ClientError, asyncio.TimeoutError, OSError, UnicodeError):
            # Deliberately not a bare ``except``: task cancellation
            # (asyncio.CancelledError) and KeyboardInterrupt must propagate.
            pass
        return ""

    def parse_feed(self, xml_text, lookback_date):
        """Extract articles published on or after ``lookback_date`` from RSS XML.

        Args:
            xml_text: RSS document as text; empty or malformed input yields ``[]``.
            lookback_date: ``datetime`` or ``date``; only its calendar date is
                compared against each item's publication date.

        Returns:
            A list of dicts with keys ``title``, ``link``, ``pub_date`` (the raw
            ``pubDate`` string) and ``timestamp`` (ISO 8601). Items missing a
            title, link or parseable ``pubDate`` are skipped.
        """
        if not xml_text:
            return []

        # datetime is a subclass of date, so test for datetime specifically.
        if isinstance(lookback_date, datetime):
            cutoff = lookback_date.date()
        else:
            cutoff = lookback_date

        try:
            root = ET.fromstring(xml_text)
        except (ET.ParseError, ValueError):
            # Malformed document (ValueError covers un-encodable text).
            return []

        articles = []
        for item in root.findall('.//item'):
            title = item.findtext('title')
            link = item.findtext('link')
            pub_date_str = item.findtext('pubDate')
            if not (title and link and pub_date_str):
                continue
            try:
                pub_dt = parsedate_to_datetime(pub_date_str)
            except (TypeError, ValueError):
                # Unparseable date: TypeError on older Pythons, ValueError on newer.
                continue
            if pub_dt.date() >= cutoff:
                articles.append({
                    'title': title, 'link': link,
                    'pub_date': pub_date_str, 'timestamp': pub_dt.isoformat()
                })
        return articles

    def _build_queries(self, ticker):
        """Generate a massive, diverse set of search queries to maximize article yield.

        Returns the bare ticker followed by the ticker combined with every
        phrase in ``_QUERY_SUFFIXES``, in order.
        """
        return [ticker] + [f"{ticker} {suffix}" for suffix in self._QUERY_SUFFIXES]

    def _build_urls(self, ticker):
        """Return every feed URL to fetch: ticker searches, then general feeds."""
        # quote_plus escapes characters such as '&' or '#' in the ticker that
        # would otherwise truncate or corrupt the ``q`` parameter.
        urls = [
            self._SEARCH_URL.format(query=quote_plus(query))
            for query in self._build_queries(ticker)
        ]
        urls.extend(self._GENERAL_FEEDS)
        return urls

    def _add_unique(self, collected, seen_links, articles):
        """Append articles with unseen links to ``collected``, up to ``self.limit``.

        Returns ``True`` once ``collected`` holds at least ``self.limit`` items.
        """
        for article in articles:
            if len(collected) >= self.limit:
                return True
            link = article['link']
            if link in seen_links:
                continue
            seen_links.add(link)
            collected.append(article)
        return len(collected) >= self.limit

    async def scrape(self, ticker, lookback_date, progress_cb=None):
        """Collect up to ``self.limit`` unique articles for ``ticker``.

        Args:
            ticker: Search term the queries are built from.
            lookback_date: ``datetime`` or ``date``; older articles are dropped.
            progress_cb: Optional callable invoked after every batch as
                ``progress_cb(fraction, message)`` with ``fraction`` in [0, 1].

        Returns:
            List of article dicts (see ``parse_feed``), de-duplicated by link.
        """
        urls = self._build_urls(ticker)
        collected = []
        seen_links = set()

        connector = aiohttp.TCPConnector(limit=50, ssl=self.ssl_context)
        async with aiohttp.ClientSession(connector=connector) as session:
            # Requests run concurrently within a batch; batches run in sequence
            # so fetching can stop as soon as the limit is reached.
            for start in range(0, len(urls), self._BATCH_SIZE):
                if len(collected) >= self.limit:
                    break

                batch = urls[start:start + self._BATCH_SIZE]
                results = await asyncio.gather(
                    *(self.fetch_feed(session, url) for url in batch),
                    return_exceptions=True,
                )

                for xml in results:
                    # gather may hand back exception objects (including
                    # BaseException subclasses); only non-empty text is a feed.
                    if not isinstance(xml, str) or not xml:
                        continue
                    parsed = self.parse_feed(xml, lookback_date)
                    if self._add_unique(collected, seen_links, parsed):
                        break

                # Report progress
                if progress_cb:
                    scrape_progress = min(len(collected) / self.limit, 1.0)
                    progress_cb(
                        scrape_progress,
                        f"Collecting headlines: {len(collected)}/{self.limit}"
                    )

                # Small delay between batches to avoid rate limiting
                await asyncio.sleep(0.1)

        print(f"[Scraper] Total unique articles collected: {len(collected)}")
        return collected[:self.limit]

    @staticmethod
    def cleanup():
        """Delete every ``*.csv`` file in the current working directory.

        Files that cannot be removed are skipped.
        """
        # NOTE(review): this removes ALL CSV files in the working directory,
        # not only ones this program created - confirm that is intended.
        for path in glob.glob("*.csv"):
            try:
                os.remove(path)
            except OSError:
                pass