Spaces:
Sleeping
Sleeping
| import asyncio | |
| import aiohttp | |
| import xml.etree.ElementTree as ET | |
| import ssl | |
| import re | |
| from datetime import datetime | |
| from email.utils import parsedate_to_datetime | |
| import glob | |
| import os | |
| class NewsScraper: | |
| def __init__(self, limit=600): | |
| self.limit = limit | |
| self.ssl_context = ssl.create_default_context() | |
| self.ssl_context.check_hostname = False | |
| self.ssl_context.verify_mode = ssl.CERT_NONE | |
| async def fetch_feed(self, session, url): | |
| try: | |
| async with session.get(url, timeout=aiohttp.ClientTimeout(total=8)) as response: | |
| if response.status == 200: | |
| return await response.text() | |
| except: | |
| pass | |
| return "" | |
| def parse_feed(self, xml_text, lookback_date): | |
| articles = [] | |
| if not xml_text: | |
| return articles | |
| try: | |
| lb_date = lookback_date.date() | |
| root = ET.fromstring(xml_text) | |
| for item in root.findall('.//item'): | |
| title = item.findtext('title') | |
| link = item.findtext('link') | |
| pub_date_str = item.findtext('pubDate') | |
| if title and link and pub_date_str: | |
| try: | |
| pub_dt = parsedate_to_datetime(pub_date_str) | |
| if pub_dt.date() >= lb_date: | |
| articles.append({ | |
| 'title': title, 'link': link, | |
| 'pub_date': pub_date_str, 'timestamp': pub_dt.isoformat() | |
| }) | |
| except: | |
| pass | |
| except: | |
| pass | |
| return articles | |
| def _build_queries(self, ticker): | |
| """Generate a massive, diverse set of search queries to maximize article yield.""" | |
| t = ticker | |
| base = [ | |
| t, f"{t} stock", f"{t} news", f"{t} market", f"{t} earnings", | |
| f"{t} analyst", f"{t} forecast", f"{t} price target", | |
| f"{t} options", f"{t} technical", f"{t} dividend", | |
| f"{t} industry", f"{t} competitor", f"{t} share price", | |
| f"{t} hedge fund", f"{t} institutional", | |
| ] | |
| # Financial action queries | |
| actions = [ | |
| f"{t} buy sell hold", f"{t} upgrade downgrade", f"{t} outperform underperform", | |
| f"{t} bullish bearish", f"{t} momentum", f"{t} breakout breakdown", | |
| f"{t} rally crash", f"{t} surge plunge", f"{t} soar tumble", | |
| f"{t} gains losses", f"{t} beat miss expectations", | |
| ] | |
| # Corporate event queries | |
| events = [ | |
| f"{t} CEO news", f"{t} quarterly results", f"{t} revenue profit", | |
| f"{t} guidance outlook", f"{t} acquisition merger", | |
| f"{t} lawsuit legal SEC", f"{t} insider trading", | |
| f"{t} IPO offering", f"{t} buyback repurchase", | |
| f"{t} partnership deal", f"{t} product launch", | |
| f"{t} layoffs restructuring", f"{t} expansion growth", | |
| ] | |
| # Analyst and research queries | |
| research = [ | |
| f"{t} wall street", f"{t} Goldman Sachs", f"{t} Morgan Stanley", | |
| f"{t} JP Morgan", f"{t} analyst rating", f"{t} price prediction", | |
| f"{t} short interest", f"{t} short squeeze", | |
| f"{t} put call ratio", f"{t} unusual activity", | |
| f"{t} fund holdings", f"{t} 13F filing", | |
| ] | |
| # Sector and macro queries | |
| macro = [ | |
| f"{t} sector outlook", f"{t} industry trend", f"{t} supply chain", | |
| f"{t} regulation policy", f"{t} inflation impact", | |
| f"{t} interest rate", f"{t} trade war tariff", | |
| f"{t} innovation technology", f"{t} ESG sustainability", | |
| ] | |
| # Time-sensitive queries | |
| time_q = [ | |
| f"{t} today", f"{t} this week", f"{t} latest", | |
| f"{t} breaking news", f"{t} update", | |
| f"{t} premarket", f"{t} after hours", | |
| ] | |
| all_queries = base + actions + events + research + macro + time_q | |
| return all_queries | |
| async def scrape(self, ticker, lookback_date, progress_cb=None): | |
| queries = self._build_queries(ticker) | |
| total_queries = len(queries) | |
| all_articles = [] | |
| seen = set() | |
| # Batch fetch: fire all requests concurrently for speed | |
| connector = aiohttp.TCPConnector(limit=50, ssl=self.ssl_context) | |
| async with aiohttp.ClientSession(connector=connector) as session: | |
| # Build all URLs | |
| urls = [] | |
| for q in queries: | |
| encoded = q.replace(' ', '+') | |
| urls.append((q, f"https://news.google.com/rss/search?q={encoded}&hl=en-US&gl=US&ceid=US:en")) | |
| # Also add general financial news feeds to pad the count | |
| general_feeds = [ | |
| "https://news.google.com/rss/headlines/section/topic/BUSINESS?hl=en-US&gl=US&ceid=US:en", | |
| "https://news.google.com/rss/search?q=stock+market&hl=en-US&gl=US&ceid=US:en", | |
| "https://news.google.com/rss/search?q=wall+street+today&hl=en-US&gl=US&ceid=US:en", | |
| "https://news.google.com/rss/search?q=stocks+trading&hl=en-US&gl=US&ceid=US:en", | |
| "https://news.google.com/rss/search?q=financial+markets&hl=en-US&gl=US&ceid=US:en", | |
| ] | |
| for gf in general_feeds: | |
| urls.append(("General Market", gf)) | |
| # Fire all requests concurrently in batches of 20 | |
| batch_size = 20 | |
| for batch_start in range(0, len(urls), batch_size): | |
| if len(all_articles) >= self.limit: | |
| break | |
| batch = urls[batch_start:batch_start + batch_size] | |
| tasks = [self.fetch_feed(session, url) for _, url in batch] | |
| results = await asyncio.gather(*tasks, return_exceptions=True) | |
| for (query_name, _), xml in zip(batch, results): | |
| if isinstance(xml, Exception) or not xml: | |
| continue | |
| parsed = self.parse_feed(xml, lookback_date) | |
| for a in parsed: | |
| if a['link'] not in seen: | |
| seen.add(a['link']) | |
| all_articles.append(a) | |
| if len(all_articles) >= self.limit: | |
| break | |
| # Report progress | |
| if progress_cb: | |
| scrape_progress = min(len(all_articles) / self.limit, 1.0) | |
| progress_cb( | |
| scrape_progress, | |
| f"Collecting headlines: {len(all_articles)}/{self.limit}" | |
| ) | |
| # Small delay between batches to avoid rate limiting | |
| await asyncio.sleep(0.1) | |
| print(f"[Scraper] Total unique articles collected: {len(all_articles)}") | |
| return all_articles[:self.limit] | |
| def cleanup(): | |
| for f in glob.glob("*.csv"): | |
| try: | |
| os.remove(f) | |
| except: | |
| pass | |