Spaces:
Sleeping
Sleeping
File size: 7,448 Bytes
import asyncio
import aiohttp
import xml.etree.ElementTree as ET
import ssl
import re
from datetime import datetime
from email.utils import parsedate_to_datetime
import glob
import os
class NewsScraper:
    """Collect recent news headlines for a ticker from Google News RSS feeds.

    The scraper expands a ticker into many search queries, fetches the
    matching RSS feeds concurrently in batches, keeps the items published on
    or after a look-back date, and de-duplicates them by link until ``limit``
    unique articles have been gathered.
    """

    # Template for a Google News RSS search; ``{query}`` must be URL-encoded.
    _SEARCH_URL = (
        "https://news.google.com/rss/search?q={query}&hl=en-US&gl=US&ceid=US:en"
    )

    # General financial feeds appended after the ticker queries to pad the count.
    _GENERAL_FEEDS = (
        "https://news.google.com/rss/headlines/section/topic/BUSINESS?hl=en-US&gl=US&ceid=US:en",
        "https://news.google.com/rss/search?q=stock+market&hl=en-US&gl=US&ceid=US:en",
        "https://news.google.com/rss/search?q=wall+street+today&hl=en-US&gl=US&ceid=US:en",
        "https://news.google.com/rss/search?q=stocks+trading&hl=en-US&gl=US&ceid=US:en",
        "https://news.google.com/rss/search?q=financial+markets&hl=en-US&gl=US&ceid=US:en",
    )

    # Phrases appended to the ticker ("<ticker> <suffix>"); order is preserved
    # in the generated query list.
    _QUERY_SUFFIXES = (
        # Base queries
        "stock", "news", "market", "earnings",
        "analyst", "forecast", "price target",
        "options", "technical", "dividend",
        "industry", "competitor", "share price",
        "hedge fund", "institutional",
        # Financial action queries
        "buy sell hold", "upgrade downgrade", "outperform underperform",
        "bullish bearish", "momentum", "breakout breakdown",
        "rally crash", "surge plunge", "soar tumble",
        "gains losses", "beat miss expectations",
        # Corporate event queries
        "CEO news", "quarterly results", "revenue profit",
        "guidance outlook", "acquisition merger",
        "lawsuit legal SEC", "insider trading",
        "IPO offering", "buyback repurchase",
        "partnership deal", "product launch",
        "layoffs restructuring", "expansion growth",
        # Analyst and research queries
        "wall street", "Goldman Sachs", "Morgan Stanley",
        "JP Morgan", "analyst rating", "price prediction",
        "short interest", "short squeeze",
        "put call ratio", "unusual activity",
        "fund holdings", "13F filing",
        # Sector and macro queries
        "sector outlook", "industry trend", "supply chain",
        "regulation policy", "inflation impact",
        "interest rate", "trade war tariff",
        "innovation technology", "ESG sustainability",
        # Time-sensitive queries
        "today", "this week", "latest",
        "breaking news", "update",
        "premarket", "after hours",
    )

    # Number of feeds requested concurrently per batch.
    _BATCH_SIZE = 20

    def __init__(self, limit=600, *, verify_ssl=False):
        """Create a scraper.

        Args:
            limit: Maximum number of unique articles ``scrape`` returns.
            verify_ssl: When true, keep the default certificate and hostname
                verification. The default (false) preserves the historical
                behaviour of disabling both.
        """
        self.limit = limit
        self.ssl_context = ssl.create_default_context()
        if not verify_ssl:
            # NOTE(review): disabling TLS verification allows man-in-the-middle
            # tampering with the fetched feeds. Kept as the default only for
            # backward compatibility; prefer verify_ssl=True.
            # check_hostname must be cleared before verify_mode can be CERT_NONE.
            self.ssl_context.check_hostname = False
            self.ssl_context.verify_mode = ssl.CERT_NONE

    async def fetch_feed(self, session, url):
        """Download one feed and return its body, or ``""`` on failure.

        Args:
            session: An open ``aiohttp.ClientSession``.
            url: Feed URL to request (8 second total timeout).

        Returns:
            The response text for an HTTP 200 reply; an empty string for any
            other status, a network/timeout error, or an undecodable body.
        """
        try:
            timeout = aiohttp.ClientTimeout(total=8)
            async with session.get(url, timeout=timeout) as response:
                if response.status == 200:
                    return await response.text()
        except (aiohttp.ClientError, asyncio.TimeoutError, UnicodeDecodeError):
            # Best effort: one unreachable feed must not abort the whole run.
            # asyncio.CancelledError is deliberately NOT caught so that task
            # cancellation keeps working.
            pass
        return ""

    def parse_feed(self, xml_text, lookback_date):
        """Extract recent articles from an RSS document.

        Args:
            xml_text: Raw RSS XML; empty or malformed input yields ``[]``.
            lookback_date: ``datetime`` or ``date``; items whose publication
                date (calendar date of the parsed ``pubDate``) is earlier are
                dropped.

        Returns:
            A list of dicts with keys ``title``, ``link``, ``pub_date`` (the
            original string) and ``timestamp`` (ISO 8601). Items missing a
            title, link or pubDate, or with an unparseable pubDate, are skipped.
        """
        articles = []
        if not xml_text:
            return articles

        # Accept a plain date as well as a datetime.
        if isinstance(lookback_date, datetime):
            cutoff = lookback_date.date()
        else:
            cutoff = lookback_date

        try:
            root = ET.fromstring(xml_text)
        except ET.ParseError:
            # Malformed feed: treat as containing no articles.
            return articles

        for item in root.findall('.//item'):
            title = item.findtext('title')
            link = item.findtext('link')
            pub_date_str = item.findtext('pubDate')
            if not (title and link and pub_date_str):
                continue
            try:
                pub_dt = parsedate_to_datetime(pub_date_str)
                is_recent = pub_dt.date() >= cutoff
            except (TypeError, ValueError):
                # Unparseable date (TypeError on older Pythons, ValueError on
                # newer ones) or an incomparable cutoff: skip this item.
                continue
            if is_recent:
                articles.append({
                    'title': title, 'link': link,
                    'pub_date': pub_date_str, 'timestamp': pub_dt.isoformat()
                })
        return articles

    def _build_queries(self, ticker):
        """Generate a massive, diverse set of search queries to maximize article yield.

        Returns the bare ticker followed by ``"<ticker> <suffix>"`` for every
        entry of ``_QUERY_SUFFIXES``, in order.
        """
        return [ticker] + [f"{ticker} {suffix}" for suffix in self._QUERY_SUFFIXES]

    @classmethod
    def _search_url(cls, query):
        """Return the Google News RSS search URL for ``query``, URL-encoded."""
        from urllib.parse import quote_plus

        # quote_plus turns spaces into '+' and escapes characters such as
        # '&' or '#' that would otherwise corrupt the query string.
        return cls._SEARCH_URL.format(query=quote_plus(query))

    def _add_unique(self, parsed, all_articles, seen):
        """Append not-yet-seen articles to ``all_articles`` up to ``self.limit``.

        Args:
            parsed: Articles from one feed (dicts with a ``link`` key).
            all_articles: Accumulator list, mutated in place.
            seen: Set of links already collected, mutated in place.

        Returns:
            True once ``all_articles`` holds at least ``self.limit`` entries.
        """
        for article in parsed:
            if len(all_articles) >= self.limit:
                return True
            link = article['link']
            if link in seen:
                continue
            seen.add(link)
            all_articles.append(article)
        return len(all_articles) >= self.limit

    async def scrape(self, ticker, lookback_date, progress_cb=None):
        """Fetch and de-duplicate headlines for ``ticker``.

        Args:
            ticker: Symbol used to build the search queries.
            lookback_date: Oldest publication date to keep (see ``parse_feed``).
            progress_cb: Optional callable ``(fraction, message)`` invoked
                after each batch with the fraction of ``limit`` collected.

        Returns:
            Up to ``self.limit`` unique article dicts, in collection order.
        """
        all_articles = []
        seen = set()

        urls = [self._search_url(q) for q in self._build_queries(ticker)]
        # Also add general financial news feeds to pad the count
        urls.extend(self._GENERAL_FEEDS)

        connector = aiohttp.TCPConnector(limit=50, ssl=self.ssl_context)
        async with aiohttp.ClientSession(connector=connector) as session:
            # Fire requests concurrently in batches
            for batch_start in range(0, len(urls), self._BATCH_SIZE):
                if len(all_articles) >= self.limit:
                    break
                batch = urls[batch_start:batch_start + self._BATCH_SIZE]
                tasks = [self.fetch_feed(session, url) for url in batch]
                results = await asyncio.gather(*tasks, return_exceptions=True)

                limit_reached = False
                for xml in results:
                    # gather(return_exceptions=True) hands failures back as
                    # exception objects; skip those and empty bodies.
                    if isinstance(xml, BaseException) or not xml:
                        continue
                    parsed = self.parse_feed(xml, lookback_date)
                    if self._add_unique(parsed, all_articles, seen):
                        # Stop consuming this batch so the count never
                        # overshoots the limit.
                        limit_reached = True
                        break

                # Report progress
                if progress_cb:
                    scrape_progress = min(len(all_articles) / self.limit, 1.0)
                    progress_cb(
                        scrape_progress,
                        f"Collecting headlines: {len(all_articles)}/{self.limit}"
                    )

                if limit_reached:
                    break
                # Small delay between batches to avoid rate limiting
                await asyncio.sleep(0.1)

        print(f"[Scraper] Total unique articles collected: {len(all_articles)}")
        return all_articles[:self.limit]

    @staticmethod
    def cleanup():
        """Delete every ``*.csv`` file in the current working directory.

        Files that cannot be removed are skipped silently.
        """
        for f in glob.glob("*.csv"):
            try:
                os.remove(f)
            except OSError:
                # Best effort: the file may be locked or already gone.
                pass
|