import asyncio
import glob
import os
import re
import ssl
import xml.etree.ElementTree as ET
from datetime import datetime
from email.utils import parsedate_to_datetime
from urllib.parse import quote_plus

import aiohttp

class NewsScraper:
    """Collect recent news headlines for a ticker from Google News RSS feeds.

    The scraper expands a ticker into many search queries, downloads the
    matching RSS feeds concurrently in batches, keeps the items published on
    or after a cutoff date, and de-duplicates them by link until ``limit``
    articles have been gathered.
    """

    # Phrases appended to the ticker to form the search queries. The bare
    # ticker itself is always the first query. Order matters: feeds are
    # fetched in this order and collection stops once ``limit`` is reached.
    _QUERY_SUFFIXES = (
        # General
        "stock", "news", "market", "earnings",
        "analyst", "forecast", "price target",
        "options", "technical", "dividend",
        "industry", "competitor", "share price",
        "hedge fund", "institutional",
        # Financial actions
        "buy sell hold", "upgrade downgrade", "outperform underperform",
        "bullish bearish", "momentum", "breakout breakdown",
        "rally crash", "surge plunge", "soar tumble",
        "gains losses", "beat miss expectations",
        # Corporate events
        "CEO news", "quarterly results", "revenue profit",
        "guidance outlook", "acquisition merger",
        "lawsuit legal SEC", "insider trading",
        "IPO offering", "buyback repurchase",
        "partnership deal", "product launch",
        "layoffs restructuring", "expansion growth",
        # Analysts and research
        "wall street", "Goldman Sachs", "Morgan Stanley",
        "JP Morgan", "analyst rating", "price prediction",
        "short interest", "short squeeze",
        "put call ratio", "unusual activity",
        "fund holdings", "13F filing",
        # Sector and macro
        "sector outlook", "industry trend", "supply chain",
        "regulation policy", "inflation impact",
        "interest rate", "trade war tariff",
        "innovation technology", "ESG sustainability",
        # Time-sensitive
        "today", "this week", "latest",
        "breaking news", "update",
        "premarket", "after hours",
    )

    # General financial feeds fetched after the ticker-specific searches to
    # pad the article count.
    _GENERAL_FEEDS = (
        "https://news.google.com/rss/headlines/section/topic/BUSINESS?hl=en-US&gl=US&ceid=US:en",
        "https://news.google.com/rss/search?q=stock+market&hl=en-US&gl=US&ceid=US:en",
        "https://news.google.com/rss/search?q=wall+street+today&hl=en-US&gl=US&ceid=US:en",
        "https://news.google.com/rss/search?q=stocks+trading&hl=en-US&gl=US&ceid=US:en",
        "https://news.google.com/rss/search?q=financial+markets&hl=en-US&gl=US&ceid=US:en",
    )

    def __init__(self, limit=600, verify_ssl=False):
        """Create a scraper.

        Args:
            limit: Maximum number of unique articles ``scrape`` returns.
            verify_ssl: When true, TLS certificates and host names are
                verified. Defaults to ``False`` to keep the historical
                behavior of this class.
        """
        self.limit = limit
        self.ssl_context = ssl.create_default_context()
        if not verify_ssl:
            # SECURITY NOTE: certificate and host-name verification are turned
            # off, so responses can be tampered with by a man-in-the-middle.
            # Pass verify_ssl=True unless the environment requires otherwise.
            self.ssl_context.check_hostname = False
            self.ssl_context.verify_mode = ssl.CERT_NONE

    async def fetch_feed(self, session, url):
        """Download one feed and return its body, or ``""`` on any failure.

        Args:
            session: An open ``aiohttp.ClientSession``.
            url: Feed URL to request.

        Returns:
            The response text for an HTTP 200 reply; an empty string for any
            other status, a network error, a timeout, or an undecodable body.
        """
        try:
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=8)) as response:
                if response.status == 200:
                    return await response.text()
        except (aiohttp.ClientError, asyncio.TimeoutError, UnicodeDecodeError):
            # Best-effort fetch: one bad feed must not abort the whole scrape.
            # Cancellation is deliberately NOT caught so the task can be
            # cancelled cleanly.
            pass
        return ""

    def parse_feed(self, xml_text, lookback_date):
        """Extract recent articles from an RSS document.

        Args:
            xml_text: RSS XML as text. Empty or malformed input yields ``[]``.
            lookback_date: Earliest publication date to keep. May be a
                ``datetime`` (its date part is used), a ``date``, or ``None``
                to disable date filtering.

        Returns:
            A list of dicts with keys ``title``, ``link``, ``pub_date`` (the
            raw ``pubDate`` text) and ``timestamp`` (ISO 8601 string). Items
            missing any of title/link/pubDate, or whose date cannot be
            parsed, are skipped.
        """
        if not xml_text:
            return []
        try:
            root = ET.fromstring(xml_text)
        except (ET.ParseError, TypeError, ValueError):
            # Malformed or non-text payload: treat as an empty feed.
            return []

        # datetime is a subclass of date, so only datetimes need reducing.
        if isinstance(lookback_date, datetime):
            cutoff = lookback_date.date()
        else:
            cutoff = lookback_date

        articles = []
        for item in root.findall('.//item'):
            title = item.findtext('title')
            link = item.findtext('link')
            pub_date_str = item.findtext('pubDate')
            if not (title and link and pub_date_str):
                continue
            try:
                pub_dt = parsedate_to_datetime(pub_date_str)
            except (TypeError, ValueError, IndexError):
                # Unparseable date (the exception type varies by Python
                # version); skip just this item.
                continue
            if cutoff is None or pub_dt.date() >= cutoff:
                articles.append({
                    'title': title, 'link': link,
                    'pub_date': pub_date_str, 'timestamp': pub_dt.isoformat()
                })
        return articles

    def _build_queries(self, ticker):
        """Return the search queries for ``ticker``: the bare ticker followed
        by the ticker combined with every entry of ``_QUERY_SUFFIXES``."""
        queries = [ticker]
        queries.extend(f"{ticker} {suffix}" for suffix in self._QUERY_SUFFIXES)
        return queries

    @staticmethod
    def _build_search_url(query):
        """Return the Google News RSS search URL for ``query``.

        The query is percent-encoded so characters such as ``&`` or ``#`` in
        a ticker cannot corrupt the URL's query string; spaces become ``+``.
        """
        return f"https://news.google.com/rss/search?q={quote_plus(query)}&hl=en-US&gl=US&ceid=US:en"

    def _add_unique(self, parsed, collected, seen):
        """Append articles with unseen links to ``collected`` (updating
        ``seen``), stopping as soon as ``self.limit`` articles are held."""
        for article in parsed:
            if len(collected) >= self.limit:
                return
            link = article['link']
            if link not in seen:
                seen.add(link)
                collected.append(article)

    async def scrape(self, ticker, lookback_date, progress_cb=None):
        """Fetch and merge headlines for ``ticker``.

        Args:
            ticker: Symbol or name used to build the search queries.
            lookback_date: Cutoff passed to ``parse_feed``.
            progress_cb: Optional callable invoked after each batch as
                ``progress_cb(fraction, message)`` where ``fraction`` is in
                ``[0, 1]``.

        Returns:
            At most ``self.limit`` article dicts, unique by link, in the
            order they were discovered.
        """
        urls = [self._build_search_url(q) for q in self._build_queries(ticker)]
        urls.extend(self._GENERAL_FEEDS)

        all_articles = []
        seen = set()
        batch_size = 20

        connector = aiohttp.TCPConnector(limit=50, ssl=self.ssl_context)
        async with aiohttp.ClientSession(connector=connector) as session:
            # Requests are issued concurrently, one batch at a time.
            for batch_start in range(0, len(urls), batch_size):
                if len(all_articles) >= self.limit:
                    break

                batch = urls[batch_start:batch_start + batch_size]
                results = await asyncio.gather(
                    *(self.fetch_feed(session, url) for url in batch),
                    return_exceptions=True,
                )

                for xml in results:
                    # gather() hands back exceptions (including a child's
                    # CancelledError) as results; skip those and empty bodies.
                    if isinstance(xml, BaseException) or not xml:
                        continue
                    self._add_unique(
                        self.parse_feed(xml, lookback_date), all_articles, seen
                    )
                    if len(all_articles) >= self.limit:
                        # Enough collected: do not parse the remaining feeds.
                        break

                # Report progress
                if progress_cb:
                    scrape_progress = min(len(all_articles) / self.limit, 1.0)
                    progress_cb(
                        scrape_progress,
                        f"Collecting headlines: {len(all_articles)}/{self.limit}"
                    )

                if len(all_articles) >= self.limit:
                    break

                # Small delay between batches to avoid rate limiting
                await asyncio.sleep(0.1)

        print(f"[Scraper] Total unique articles collected: {len(all_articles)}")
        return all_articles[:self.limit]

    @staticmethod
    def cleanup(pattern="*.csv"):
        """Delete every file matching ``pattern`` (a glob expression).

        WARNING: with the default pattern this removes ALL ``.csv`` files in
        the current working directory, not only files created by this
        program. Files that cannot be removed are skipped.
        """
        for f in glob.glob(pattern):
            try:
                os.remove(f)
            except OSError:
                # Best-effort cleanup: ignore files that are locked, already
                # gone, or not removable.
                pass