# Jitendra12421's picture
# Upload 3 files
# d71d998 verified
# extractor.py
import asyncio
import logging
import re
import ssl
import zlib
from urllib.parse import quote_plus

import aiohttp
from bs4 import BeautifulSoup
class ContentExtractor:
    """Enrich article dicts with a refined source name and an image URL.

    For every article, the linked page is fetched on a best-effort basis to
    discover the publishing site's name, and a placeholder image URL is built
    from the article's title and source. Fetch or parse failures never abort
    the run; they simply leave the source name untouched.
    """

    # Whitelist of "safe" professional keywords that reliably return
    # business/finance images. Tags such as "stock" or "trading" are avoided
    # on purpose because they often return unrelated pictures.
    _SAFE_IMAGE_TAGS = (
        "finance",
        "corporate",
        "office",
        "business",
        "skyscraper",
        "building",
        "desk",
    )

    def __init__(self):
        # NOTE(security): hostname checking and certificate verification are
        # both switched off, so HTTPS responses fetched with this context are
        # not authenticated. Only the advertised site name is read from those
        # responses; do not reuse this context for anything sensitive.
        self.ssl_ctx = ssl.create_default_context()
        self.ssl_ctx.check_hostname = False
        self.ssl_ctx.verify_mode = ssl.CERT_NONE

    @staticmethod
    def _stable_hash(text):
        """Return a non-negative integer hash of *text* that is stable across runs.

        The built-in ``hash()`` is salted per interpreter process for strings,
        so it cannot be used when the same input must map to the same value
        after a restart.
        """
        return zlib.crc32(text.encode("utf-8", "replace"))

    def _build_dynamic_image_url(self, title="", source="", ticker="market"):
        """Build a deterministic placeholder image URL for an article.

        Two adjacent tags from the safe-tag whitelist are selected from a
        stable hash of the title, and a stable hash of title + source is used
        as the ``random`` seed so different articles get different images.

        Args:
            title: Article headline; converted with ``str()`` before hashing.
            source: Article source name; converted with ``str()`` before hashing.
            ticker: Accepted for backward compatibility. It does not influence
                the URL, because only whitelisted tags are sent as the query.

        Returns:
            A ``https://loremflickr.com/1200/675/<tag>,<tag>?random=<seed>`` URL.
        """
        tags = self._SAFE_IMAGE_TAGS
        first = self._stable_hash(str(title)) % len(tags)
        # Wrap around so the last tag pairs with the first one.
        second = (first + 1) % len(tags)
        query = f"{tags[first]},{tags[second]}"
        seed = self._stable_hash(str(title) + str(source)) % 10000
        return f"https://loremflickr.com/1200/675/{query}?random={seed}"

    async def _fetch_one(self, session, url):
        """Fetch *url* and return ``{"site_name": ...}`` parsed from its HTML.

        This is best-effort: a missing URL, a non-200 status, or any error
        raised while fetching yields ``{"site_name": ""}``. Cancellation is
        not swallowed, because only ``Exception`` subclasses are caught.
        """
        if not url:
            return {"site_name": ""}
        try:
            async with session.get(
                url,
                timeout=aiohttp.ClientTimeout(total=10),
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
                },
                allow_redirects=True,
            ) as resp:
                if resp.status == 200:
                    html = await resp.text()
                    return self._parse_html(html)
        except Exception as exc:
            # Deliberately tolerant: one unreachable article must not break
            # the whole batch. Logged at debug level for troubleshooting.
            logging.getLogger(__name__).debug("Fetch failed for %s: %s", url, exc)
        return {"site_name": ""}

    def _parse_html(self, html):
        """Extract the site's branding name from *html*.

        Looks for ``<meta property="og:site_name">`` first and falls back to
        ``<meta name="application-name">``.

        Returns:
            ``{"site_name": <name>}``; the name is ``""`` when no usable meta
            tag is found or the document cannot be parsed.
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')
            meta_site = (
                soup.find("meta", property="og:site_name")
                or soup.find("meta", attrs={"name": "application-name"})
            )
            content = meta_site.get("content") if meta_site is not None else None
            # Strip so a whitespace-only value does not count as a site name.
            site_name = str(content).strip() if content else ""
            return {"site_name": site_name}
        except Exception as exc:
            logging.getLogger(__name__).debug("HTML parse failed: %s", exc)
            return {"site_name": ""}

    async def extract_all(self, articles):
        """Fill in ``source`` (when generic) and ``image`` for every article.

        Articles are processed in batches of 10 with a short pause between
        batches. Each article dict is updated in place.

        Args:
            articles: List of article dicts. ``link`` is fetched when present;
                ``title``, ``source`` and ``ticker`` are read when present.

        Returns:
            The same *articles* list that was passed in.
        """
        if not articles:
            # Nothing to enrich; avoid opening a connector and session.
            return articles

        conn = aiohttp.TCPConnector(limit=25, ssl=self.ssl_ctx)
        async with aiohttp.ClientSession(connector=conn) as session:
            for start in range(0, len(articles), 10):
                batch = articles[start:start + 10]
                results = await asyncio.gather(
                    *(self._fetch_one(session, a.get('link')) for a in batch),
                    return_exceptions=True,
                )
                for article, res in zip(batch, results):
                    if isinstance(res, dict):
                        # A missing, None or empty source counts as unknown.
                        rss_source = str(article.get('source') or 'Unknown').lower()
                        extracted_site = res.get('site_name', '')
                        # If the RSS source is generic (e.g. google news,
                        # yahoo), prefer the site name found on the page.
                        is_generic = any(
                            marker in rss_source
                            for marker in ('google', 'yahoo', 'unknown')
                        )
                        if is_generic and extracted_site:
                            article['source'] = extracted_site

                    # Always provide an image link in the payload, even when
                    # the fetch for this article failed.
                    article['image'] = self._build_dynamic_image_url(
                        title=article.get('title', ''),
                        source=article.get('source', ''),
                        ticker=article.get('ticker', 'market'),
                    )
                # Brief pause between batches to avoid bursts of requests.
                await asyncio.sleep(0.1)
        return articles