import asyncio
import logging
import ssl
import xml.etree.ElementTree as ET
from email.utils import parsedate_to_datetime
from urllib.parse import quote_plus

import aiohttp


class NewsScraper:
    """Collect recent articles for a ticker from Google News RSS search feeds.

    A ticker is expanded into a list of search phrases, one RSS feed is
    fetched per phrase, and the ``<item>`` entries published on or after a
    cut-off date are returned as plain dictionaries, de-duplicated by link.
    """

    _log = logging.getLogger(__name__)

    # Total per-request budget (connect + read), in seconds.
    _REQUEST_TIMEOUT_SECONDS = 8
    # Number of feeds requested concurrently before results are processed.
    _BATCH_SIZE = 20
    # Upper bound on simultaneously open connections.
    _MAX_CONNECTIONS = 50
    # Only the first N aliases (the ticker itself included) produce queries.
    _MAX_ALIASES = 6
    # Pause between batches, in seconds.
    _BATCH_PAUSE_SECONDS = 0.1

    _FEED_URL = (
        "https://news.google.com/rss/search?q={query}"
        "&hl=en-US&gl=US&ceid=US:en"
    )

    # (lower-cased tickers, extra search aliases) for the known indices.
    _INDEX_ALIASES = (
        (
            ("^nsei", "nifty", "nifty 50"),
            (
                "nifty 50", "nifty50", "nifty index", "nse india",
                "nsei stocks", "indian market", "nifty 100", "nifty next 50",
            ),
        ),
        (
            ("^bsesn", "sensex"),
            (
                "sensex", "bse sensex", "bombay stock exchange",
                "bse india", "sensex 30",
            ),
        ),
        (
            ("^nsebank", "banknifty"),
            (
                "bank nifty", "nifty bank", "banknifty",
                "banking stocks india", "hdfc bank news", "icici bank news",
            ),
        ),
    )

    # Appended to every alias; the empty suffix yields the bare alias.
    _QUERY_SUFFIXES = (
        "", " stock", " news", " market",
        " forecast", " predictions", " today",
        " analysis", " outlook", " update",
        " breakout", " technicals", " sentiment",
    )

    # Tickers that additionally get the broad market queries below.
    _MACRO_TICKERS = ("^nsei", "^bsesn", "nifty", "sensex")
    _MACRO_QUERIES = (
        "indian stock market news", "dalal street updates",
        "rbi policy news", "fpi flows india", "nifty earnings season",
    )

    def __init__(self, limit=1000, *, verify_ssl=False):
        """Create a scraper.

        Args:
            limit: Maximum number of articles ``scrape`` returns.
            verify_ssl: When true, keep the default certificate and hostname
                verification. The default (false) preserves the historical
                behaviour of disabling both checks.
        """
        self.limit = limit
        self.ssl_context = ssl.create_default_context()
        if not verify_ssl:
            # SECURITY: with verification off the connection can be
            # intercepted without detection. Pass verify_ssl=True unless the
            # local trust store makes that impossible.
            # check_hostname must be cleared before verify_mode is relaxed.
            self.ssl_context.check_hostname = False
            self.ssl_context.verify_mode = ssl.CERT_NONE

    async def fetch_feed(self, session, url):
        """Download one feed and return its body, or ``""`` on any failure.

        Args:
            session: An open ``aiohttp.ClientSession``.
            url: Feed URL to request.

        Returns:
            The response text for an HTTP 200 answer, otherwise ``""``.
            Task cancellation is not swallowed and propagates to the caller.
        """
        timeout = aiohttp.ClientTimeout(total=self._REQUEST_TIMEOUT_SECONDS)
        try:
            async with session.get(url, timeout=timeout) as resp:
                if resp.status == 200:
                    return await resp.text()
                self._log.debug("Feed %s answered HTTP %s", url, resp.status)
        except (
            aiohttp.ClientError,
            asyncio.TimeoutError,
            UnicodeDecodeError,
        ) as exc:
            # Best effort: one unreachable or undecodable feed must not
            # abort the whole scrape.
            self._log.debug("Feed %s failed: %r", url, exc)
        return ""

    def parse_feed(self, xml_text, lookback_date):
        """Extract the recent articles from an RSS document.

        Args:
            xml_text: Raw RSS XML. Empty or malformed input yields ``[]``.
            lookback_date: Cut-off as a ``datetime`` (its ``.date()`` is
                used) or a plain ``date``. Items published before it are
                dropped.

        Returns:
            A list of dicts with the keys ``title``, ``link``, ``pub_date``
            (original string), ``timestamp`` (ISO 8601), ``source`` and
            ``description``. Items lacking a title, link or parseable
            ``pubDate`` are skipped.
        """
        if not xml_text:
            return []

        # Accept both datetime-like objects and plain dates.
        as_date = getattr(lookback_date, "date", None)
        cutoff = as_date() if callable(as_date) else lookback_date

        # NOTE(review): ElementTree is not hardened against maliciously
        # constructed XML, and feed bodies are remote input.
        try:
            root = ET.fromstring(xml_text)
        except ET.ParseError as exc:
            self._log.debug("Discarding malformed feed: %s", exc)
            return []

        articles = []
        for item in root.findall(".//item"):
            title = item.findtext("title")
            link = item.findtext("link")
            published = item.findtext("pubDate")
            if not (title and link and published):
                continue

            try:
                parsed = parsedate_to_datetime(published)
                is_recent = parsed.date() >= cutoff
            except (TypeError, ValueError, OverflowError) as exc:
                # Unparseable pubDate, or a cut-off that cannot be compared.
                self._log.debug("Skipping item %r: %s", link, exc)
                continue
            if not is_recent:
                continue

            source = item.find("source")
            has_source = source is not None and source.text
            description = item.findtext("description")
            articles.append({
                'title': title,
                'link': link,
                'pub_date': published,
                'timestamp': parsed.isoformat(),
                'source': source.text if has_source else "Unknown",
                'description': description if description else "",
            })
        return articles

    def _build_queries(self, ticker):
        """Expand *ticker* into an ordered, duplicate-free list of queries.

        The ticker, plus the aliases registered for a known index, is
        truncated to the first ``_MAX_ALIASES`` entries and combined with
        every suffix. Selected index tickers also get broad market queries.
        """
        key = ticker.lower()

        aliases = [ticker]
        for tickers, extra_aliases in self._INDEX_ALIASES:
            if key in tickers:
                aliases.extend(extra_aliases)
                break

        queries = []
        for alias in aliases[:self._MAX_ALIASES]:
            for suffix in self._QUERY_SUFFIXES:
                queries.append(f"{alias}{suffix}")

        if key in self._MACRO_TICKERS:
            queries.extend(self._MACRO_QUERIES)

        # dict preserves insertion order, so this de-duplicates stably.
        return list(dict.fromkeys(queries))

    @classmethod
    def _feed_url(cls, query):
        """Return the search-feed URL for *query*, safely percent-encoded."""
        # quote_plus keeps characters such as '&' or '^' inside the q value
        # instead of letting them terminate or corrupt the parameter.
        return cls._FEED_URL.format(query=quote_plus(query))

    async def scrape(self, ticker, lookback_date, progress_cb=None):
        """Fetch and merge the feeds for every query derived from *ticker*.

        Args:
            ticker: Symbol or name to search for.
            lookback_date: Cut-off forwarded to ``parse_feed``.
            progress_cb: Optional callable invoked after each batch with
                ``(fraction_of_limit, article_count)``.

        Returns:
            At most ``self.limit`` article dicts, unique by ``link``, in the
            order they were discovered.
        """
        urls = [self._feed_url(q) for q in self._build_queries(ticker)]
        all_articles = []
        seen = set()

        conn = aiohttp.TCPConnector(
            limit=self._MAX_CONNECTIONS, ssl=self.ssl_context
        )
        async with aiohttp.ClientSession(connector=conn) as session:
            for start in range(0, len(urls), self._BATCH_SIZE):
                if len(all_articles) >= self.limit:
                    break

                batch = urls[start:start + self._BATCH_SIZE]
                results = await asyncio.gather(
                    *(self.fetch_feed(session, u) for u in batch),
                    return_exceptions=True,
                )

                for xml_text in results:
                    # gather() may hand back exception objects, including
                    # BaseException subclasses, in place of a body.
                    if isinstance(xml_text, BaseException) or not xml_text:
                        continue
                    for article in self.parse_feed(xml_text, lookback_date):
                        if article['link'] not in seen:
                            seen.add(article['link'])
                            all_articles.append(article)

                if progress_cb:
                    progress_cb(
                        min(len(all_articles) / self.limit, 1.0),
                        len(all_articles),
                    )
                # Brief pause between batches.
                await asyncio.sleep(self._BATCH_PAUSE_SECONDS)

        print(f"[Scraper] {len(all_articles)} unique articles")
        return all_articles[:self.limit]