# extractor.py
import asyncio
import hashlib
import logging
import re
import ssl
from urllib.parse import quote_plus

import aiohttp
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


class ContentExtractor:
    """Enrich article dicts with a refined source name and a placeholder image.

    For each article the extractor fetches the article page, reads the site
    branding from its ``<meta>`` tags, and attaches a stock-image URL built
    from a fixed whitelist of tags (the article host is never scraped for
    images).
    """

    # Whitelist of tags that reliably return business/finance imagery.
    # Tags such as "stock" or "trading" are deliberately excluded because
    # they tend to return unrelated pictures (e.g. cat sculptures/statues).
    _SAFE_TAGS = (
        "finance",
        "corporate",
        "office",
        "business",
        "skyscraper",
        "building",
        "desk",
    )

    # RSS source names considered too generic to keep when the article page
    # advertises its own site name.
    _GENERIC_SOURCES = ("google", "yahoo", "unknown")

    _BATCH_SIZE = 10
    _BATCH_PAUSE_SECONDS = 0.1

    def __init__(self):
        # NOTE(security): certificate and hostname verification are disabled,
        # so fetched pages are not authenticated. Only the advertised site
        # name is read from them; do not reuse this context for anything
        # that handles sensitive data.
        self.ssl_ctx = ssl.create_default_context()
        self.ssl_ctx.check_hostname = False
        self.ssl_ctx.verify_mode = ssl.CERT_NONE

    @staticmethod
    def _stable_hash(text):
        """Return a non-negative integer hash of *text* that is stable across runs.

        The built-in ``hash()`` is salted per process for strings, so it
        cannot be used where the same input must map to the same output on
        every run.
        """
        digest = hashlib.sha256(str(text).encode("utf-8", "surrogatepass")).digest()
        return int.from_bytes(digest[:8], "big")

    def _build_dynamic_image_url(self, title="", source="", ticker="market"):
        """Build a deterministic placeholder image URL for an article.

        Args:
            title: Article headline; selects the tag pair and feeds the seed.
            source: Article source name; combined with the title for the seed.
            ticker: Accepted for interface compatibility; it does not
                influence the generated URL.

        Returns:
            A loremflickr URL with two adjacent whitelist tags and a
            ``random`` seed in ``[0, 10000)``. The same title/source pair
            always yields the same URL.
        """
        title_text = str(title)
        tags = self._SAFE_TAGS

        # Two neighbouring tags chosen from the title give variety between
        # articles while staying inside the safe whitelist.
        first = self._stable_hash(title_text) % len(tags)
        second = (first + 1) % len(tags)
        query = f"{tags[first]},{tags[second]}"

        # The seed makes different articles resolve to different images even
        # when they share the same tag pair.
        seed = self._stable_hash(title_text + str(source)) % 10000
        return f"https://loremflickr.com/1200/675/{query}?random={seed}"

    async def _fetch_one(self, session, url):
        """Fetch *url* and return the parsed site info; best-effort.

        Args:
            session: An open ``aiohttp.ClientSession``.
            url: Article URL; a missing/empty URL is skipped.

        Returns:
            ``{"site_name": <str>}``. The name is empty when the URL is
            missing, the response is not HTTP 200, or the request fails.
        """
        if not url:
            return {"site_name": ""}
        try:
            async with session.get(
                url,
                timeout=aiohttp.ClientTimeout(total=10),
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
                },
                allow_redirects=True,
            ) as resp:
                if resp.status == 200:
                    html = await resp.text()
                    return self._parse_html(html)
        except Exception:
            # Deliberately best-effort: a failed fetch only means the source
            # name is not refined. `Exception` (not a bare except) lets
            # asyncio.CancelledError and KeyboardInterrupt propagate.
            logger.debug("Could not fetch %s", url, exc_info=True)
        return {"site_name": ""}

    def _parse_html(self, html):
        """Extract the site name advertised by an HTML document.

        Looks for ``<meta property="og:site_name">`` first and falls back to
        ``<meta name="application-name">``.

        Returns:
            ``{"site_name": <str>}``; the name is empty when neither tag has
            content or the document cannot be parsed.
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')
            meta_site = (
                soup.find("meta", property="og:site_name")
                or soup.find("meta", attrs={"name": "application-name"})
            )
            site_name = ""
            if meta_site and meta_site.get("content"):
                # Strip so a whitespace-only value is not treated as a name.
                site_name = str(meta_site["content"]).strip()
            return {"site_name": site_name}
        except Exception:
            logger.debug("Could not parse HTML for site name", exc_info=True)
            return {"site_name": ""}

    async def extract_all(self, articles):
        """Refine ``source`` and attach ``image`` for every article, in place.

        Articles are fetched in batches of ``_BATCH_SIZE`` with a short pause
        between batches. When the RSS source is generic (contains "google",
        "yahoo" or "unknown") and the page advertises a site name, that name
        replaces the source. Every article receives an ``image`` URL, even
        when its page could not be fetched.

        Args:
            articles: List of article dicts; ``link``, ``source``, ``title``
                and ``ticker`` keys are read when present.

        Returns:
            The same ``articles`` list, mutated in place.
        """
        if not articles:
            return articles

        conn = aiohttp.TCPConnector(limit=25, ssl=self.ssl_ctx)
        async with aiohttp.ClientSession(connector=conn) as session:
            for start in range(0, len(articles), self._BATCH_SIZE):
                batch = articles[start:start + self._BATCH_SIZE]
                tasks = [
                    self._fetch_one(session, article.get('link'))
                    for article in batch
                ]
                results = await asyncio.gather(*tasks, return_exceptions=True)

                for article, res in zip(batch, results):
                    # With return_exceptions=True a failed task yields the
                    # exception object instead of a dict; skip those.
                    if isinstance(res, dict):
                        rss_source = str(article.get('source') or 'Unknown').lower()
                        extracted_site = res.get('site_name', '')
                        is_generic = any(
                            marker in rss_source
                            for marker in self._GENERIC_SOURCES
                        )
                        if is_generic and extracted_site:
                            article['source'] = extracted_site

                    # Always provide an image link in the payload.
                    article['image'] = self._build_dynamic_image_url(
                        title=article.get('title', ''),
                        source=article.get('source', ''),
                        ticker=article.get('ticker', 'market'),
                    )

                await asyncio.sleep(self._BATCH_PAUSE_SECONDS)
        return articles