Jitendra12421's picture
Upload 3 files
f17c710 verified
# scraper.py
import asyncio
import ssl
import xml.etree.ElementTree as ET
from email.utils import parsedate_to_datetime
from urllib.parse import quote_plus

import aiohttp
class NewsScraper:
    """Collect recent Google News RSS articles for a ticker or index name.

    The scraper expands the ticker into a set of search queries, fetches the
    matching Google News RSS feeds concurrently in batches, keeps items
    published on or after a cut-off date, and de-duplicates them by link.
    """

    # Total per-request timeout, in seconds, applied to every feed download.
    FEED_TIMEOUT_SECONDS = 8
    # Number of feed URLs requested concurrently in one batch.
    BATCH_SIZE = 20
    # At most this many (distinct) aliases are expanded into queries.
    MAX_ALIASES = 6
    # Words appended to every alias to widen search coverage.
    _QUERY_SUFFIXES = (
        "stock", "news", "market",
        "forecast", "predictions", "today",
        "analysis", "outlook", "update",
        "breakout", "technicals", "sentiment",
    )

    def __init__(self, limit=1000, verify_ssl=False):
        """Create a scraper.

        Args:
            limit: Maximum number of articles returned by ``scrape``.
            verify_ssl: When True, keep the default certificate and hostname
                verification. Defaults to False to preserve the existing
                behaviour of this class.
        """
        self.limit = limit
        self.ssl_context = ssl.create_default_context()
        if not verify_ssl:
            # SECURITY: with verification off, responses can be tampered with
            # by a man-in-the-middle. Pass verify_ssl=True wherever the
            # environment has a working CA bundle.
            # check_hostname must be cleared before verify_mode is relaxed.
            self.ssl_context.check_hostname = False
            self.ssl_context.verify_mode = ssl.CERT_NONE

    async def fetch_feed(self, session, url):
        """Download one feed and return its body text.

        Args:
            session: An open ``aiohttp.ClientSession``.
            url: Feed URL to request.

        Returns:
            The response text for an HTTP 200 reply; ``""`` for any other
            status or when the request fails or times out.
        """
        timeout = aiohttp.ClientTimeout(total=self.FEED_TIMEOUT_SECONDS)
        try:
            async with session.get(url, timeout=timeout) as resp:
                if resp.status == 200:
                    return await resp.text()
        except (
            aiohttp.ClientError,
            asyncio.TimeoutError,
            OSError,
            UnicodeDecodeError,
        ):
            # Best effort: one unreachable or undecodable feed must not abort
            # the whole scrape. Cancellation is deliberately NOT caught here.
            return ""
        return ""

    def parse_feed(self, xml_text, lookback_date):
        """Extract recent articles from an RSS document.

        Args:
            xml_text: RSS XML as text; empty or malformed input yields ``[]``.
            lookback_date: Cut-off as a ``datetime`` or ``date``. Items whose
                publication date is earlier than this date are dropped.

        Returns:
            A list of dicts with the keys ``title``, ``link``, ``pub_date``,
            ``timestamp`` (ISO 8601), ``source`` and ``description``. Items
            missing a title, link or parseable ``pubDate`` are skipped.
        """
        articles = []
        if not xml_text:
            return articles

        # datetime exposes .date(); a plain date is already the cut-off.
        as_date = getattr(lookback_date, "date", None)
        cutoff = as_date() if callable(as_date) else lookback_date

        try:
            root = ET.fromstring(xml_text)
        except (ET.ParseError, ValueError):
            # Malformed or un-encodable XML: treat as an empty feed.
            return articles

        for item in root.findall('.//item'):
            title = item.findtext('title')
            link = item.findtext('link')
            pub_date = item.findtext('pubDate')
            if not (title and link and pub_date):
                continue

            try:
                published = parsedate_to_datetime(pub_date)
                is_recent = published.date() >= cutoff
            except (TypeError, ValueError, IndexError, OverflowError):
                # The exception raised for an unparseable date differs across
                # Python versions; an unusable cut-off also lands here.
                continue
            if not is_recent:
                continue

            articles.append({
                'title': title,
                'link': link,
                'pub_date': pub_date,
                'timestamp': published.isoformat(),
                # Missing or empty <source> both fall back to "Unknown".
                'source': item.findtext('source') or "Unknown",
                'description': item.findtext('description') or "",
            })
        return articles

    def _build_queries(self, ticker):
        """Return the ordered, de-duplicated search queries for ``ticker``.

        Known index tickers are expanded with aliases; every alias is then
        combined with a fixed set of suffixes. A few generic market queries
        are appended for the major indices.
        """
        # Resolve aliases if it's a known ticker to expand search coverage
        aliases = [ticker]
        t_low = ticker.lower()
        if t_low in ["^nsei", "nifty", "nifty 50"]:
            aliases.extend([
                "nifty 50", "nifty50", "nifty index", "nse india",
                "nsei stocks", "indian market", "nifty 100", "nifty next 50",
            ])
        elif t_low in ["^bsesn", "sensex"]:
            aliases.extend([
                "sensex", "bse sensex", "bombay stock exchange",
                "bse india", "sensex 30",
            ])
        elif t_low in ["^nsebank", "banknifty"]:
            aliases.extend([
                "bank nifty", "nifty bank", "banknifty",
                "banking stocks india", "hdfc bank news", "icici bank news",
            ])

        # De-duplicate BEFORE truncating so a ticker that repeats one of its
        # own aliases does not waste one of the limited alias slots.
        aliases = list(dict.fromkeys(aliases))

        queries = []
        for alias in aliases[:self.MAX_ALIASES]:
            queries.append(alias)
            queries.extend(
                f"{alias} {suffix}" for suffix in self._QUERY_SUFFIXES
            )

        # Add high-yield generic financial terms for context if it's a major index
        if t_low in ["^nsei", "^bsesn", "nifty", "sensex"]:
            queries.extend([
                "indian stock market news", "dalal street updates",
                "rbi policy news", "fpi flows india", "nifty earnings season",
            ])
        return list(dict.fromkeys(queries))  # Remove duplicates

    @staticmethod
    def _feed_url(query):
        """Build the Google News RSS search URL for one query string."""
        # quote_plus escapes reserved characters such as '^' and '&' in
        # addition to turning spaces into '+'.
        return (
            f"https://news.google.com/rss/search?q={quote_plus(query)}"
            f"&hl=en-US&gl=US&ceid=US:en"
        )

    async def scrape(self, ticker, lookback_date, progress_cb=None):
        """Fetch, filter and de-duplicate articles for ``ticker``.

        Args:
            ticker: Ticker symbol or index name used to build search queries.
            lookback_date: Cut-off passed to ``parse_feed``.
            progress_cb: Optional callable invoked after each batch as
                ``progress_cb(fraction_of_limit, article_count)``.

        Returns:
            Up to ``self.limit`` article dicts, unique by ``link``, in the
            order they were first seen.
        """
        urls = [self._feed_url(q) for q in self._build_queries(ticker)]
        all_articles = []
        seen = set()
        conn = aiohttp.TCPConnector(limit=50, ssl=self.ssl_context)
        async with aiohttp.ClientSession(connector=conn) as session:
            for start in range(0, len(urls), self.BATCH_SIZE):
                if len(all_articles) >= self.limit:
                    break
                batch = urls[start:start + self.BATCH_SIZE]
                results = await asyncio.gather(
                    *(self.fetch_feed(session, u) for u in batch),
                    return_exceptions=True,
                )
                for feed_xml in results:
                    # gather() may hand back exception objects, including
                    # CancelledError, which is not an Exception subclass.
                    if isinstance(feed_xml, BaseException) or not feed_xml:
                        continue
                    for article in self.parse_feed(feed_xml, lookback_date):
                        if article['link'] not in seen:
                            seen.add(article['link'])
                            all_articles.append(article)
                if progress_cb:
                    progress_cb(
                        min(len(all_articles) / self.limit, 1.0),
                        len(all_articles),
                    )
                # Short pause between batches to avoid hammering the server.
                await asyncio.sleep(0.1)
        print(f"[Scraper] {len(all_articles)} unique articles")
        return all_articles[:self.limit]