# scraper.py import asyncio import aiohttp import xml.etree.ElementTree as ET import ssl from email.utils import parsedate_to_datetime class NewsScraper: def __init__(self, limit=1000): self.limit = limit self.ssl_context = ssl.create_default_context() self.ssl_context.check_hostname = False self.ssl_context.verify_mode = ssl.CERT_NONE async def fetch_feed(self, session, url): try: async with session.get( url, timeout=aiohttp.ClientTimeout(total=8) ) as resp: if resp.status == 200: return await resp.text() except: pass return "" def parse_feed(self, xml_text, lookback_date): articles = [] if not xml_text: return articles try: lb = lookback_date.date() root = ET.fromstring(xml_text) for item in root.findall('.//item'): t = item.findtext('title') l = item.findtext('link') p = item.findtext('pubDate') s = item.find('source') s_name = s.text if s is not None else "Unknown" d = item.findtext('description') if t and l and p: try: dt = parsedate_to_datetime(p) if dt.date() >= lb: articles.append({ 'title': t, 'link': l, 'pub_date': p, 'timestamp': dt.isoformat(), 'source': s_name, 'description': d if d else "" }) except: pass except: pass return articles def _build_queries(self, ticker): # Resolve aliases if it's a known ticker to expand search coverage aliases = [ticker] t_low = ticker.lower() if t_low in ["^nsei", "nifty", "nifty 50"]: aliases.extend(["nifty 50", "nifty50", "nifty index", "nse india", "nsei stocks", "indian market", "nifty 100", "nifty next 50"]) elif t_low in ["^bsesn", "sensex"]: aliases.extend(["sensex", "bse sensex", "bombay stock exchange", "bse india", "sensex 30"]) elif t_low in ["^nsebank", "banknifty"]: aliases.extend(["bank nifty", "nifty bank", "banknifty", "banking stocks india", "hdfc bank news", "icici bank news"]) queries = [] for a in aliases[:6]: # Use more aliases for broader coverage queries.extend([ a, f"{a} stock", f"{a} news", f"{a} market", f"{a} forecast", f"{a} predictions", f"{a} today", f"{a} analysis", f"{a} outlook", f"{a} update", f"{a} breakout", f"{a} technicals", f"{a} sentiment" ]) # Add high-yield generic financial terms for context if it's a major index if t_low in ["^nsei", "^bsesn", "nifty", "sensex"]: queries.extend([ "indian stock market news", "dalal street updates", "rbi policy news", "fpi flows india", "nifty earnings season" ]) return list(dict.fromkeys(queries)) # Remove duplicates async def scrape(self, ticker, lookback_date, progress_cb=None): queries = self._build_queries(ticker) all_articles = [] seen = set() conn = aiohttp.TCPConnector(limit=50, ssl=self.ssl_context) async with aiohttp.ClientSession(connector=conn) as session: urls = [] for q in queries: enc = q.replace(' ', '+') urls.append( f"https://news.google.com/rss/search?q={enc}" f"&hl=en-US&gl=US&ceid=US:en" ) for i in range(0, len(urls), 20): if len(all_articles) >= self.limit: break batch = urls[i:i+20] tasks = [self.fetch_feed(session, u) for u in batch] results = await asyncio.gather( *tasks, return_exceptions=True ) for xml in results: if isinstance(xml, Exception) or not xml: continue for a in self.parse_feed(xml, lookback_date): if a['link'] not in seen: seen.add(a['link']) all_articles.append(a) if progress_cb: progress_cb( min(len(all_articles)/self.limit, 1.0), len(all_articles) ) await asyncio.sleep(0.1) print(f"[Scraper] {len(all_articles)} unique articles") return all_articles[:self.limit]