Spaces:

Jitendra12421
/

Miscellonoues_model_backend

Sleeping

App Files Files Community

Miscellonoues_model_backend / extractor.py

Jitendra12421

Upload 3 files

d71d998 verified about 2 months ago

raw

history blame contribute delete

4.79 kB

	# extractor.py
	import asyncio
	import logging
	import re
	import ssl
	import zlib
	from urllib.parse import quote_plus

	import aiohttp
	from bs4 import BeautifulSoup

	class ContentExtractor:
	    """Enrich article dicts with a refined source name and a placeholder image URL.

	    Each article is expected to be a dict that may carry the keys ``link``,
	    ``title``, ``source`` and ``ticker``.  ``extract_all`` fetches every
	    article's page on a best-effort basis, reads the site's branding from its
	    ``<meta>`` tags, and writes the ``source`` (when the existing one is
	    generic) and ``image`` keys back into the same dicts.
	    """

	    # Whitelist of keywords used to build the placeholder-image query.
	    # Tags such as "stock" or "trading" are deliberately absent because they
	    # tend to return irrelevant pictures from the image service.
	    _SAFE_TAGS = ("finance", "corporate", "office", "business", "skyscraper", "building", "desk")

	    # Substrings that mark a feed-provided source name as generic/aggregator.
	    _GENERIC_SOURCES = ("google", "yahoo", "unknown")

	    # Number of article pages requested concurrently per batch.
	    _BATCH_SIZE = 10

	    _log = logging.getLogger(__name__)

	    def __init__(self):
	        """Create the SSL context shared by all outgoing requests."""
	        # NOTE(review): hostname and certificate verification are disabled so
	        # that hosts with broken TLS setups can still be read.  Responses are
	        # therefore unauthenticated; only the site-name meta tag is consumed
	        # from them.  Confirm this trade-off is still wanted.
	        self.ssl_ctx = ssl.create_default_context()
	        # check_hostname must be cleared before verify_mode can be CERT_NONE.
	        self.ssl_ctx.check_hostname = False
	        self.ssl_ctx.verify_mode = ssl.CERT_NONE

	    @staticmethod
	    def _stable_hash(text):
	        """Return a non-negative hash of *text* that is identical in every process.

	        The built-in ``hash()`` is salted per interpreter run for strings, so
	        it cannot be used for values that must survive a restart.
	        """
	        return zlib.crc32(str(text).encode("utf-8", "replace"))

	    def _build_dynamic_image_url(self, title="", source="", ticker="market"):
	        """Build a deterministic placeholder-image URL for an article.

	        Args:
	            title: Article headline; selects the pair of query tags.
	            source: Article source name; combined with ``title`` for the seed.
	            ticker: Accepted for interface compatibility; it does not
	                currently influence the generated URL.

	        Returns:
	            A ``loremflickr.com`` URL whose two tags and ``random`` seed depend
	            only on ``title`` and ``source``, so the same article always maps
	            to the same URL, including across process restarts.
	        """
	        tags = self._SAFE_TAGS
	        # Two adjacent whitelist entries keep some variety between articles
	        # while never leaving the set of safe keywords.
	        first = self._stable_hash(title) % len(tags)
	        second = (first + 1) % len(tags)
	        query = f"{tags[first]},{tags[second]}"

	        # The seed differentiates articles that happen to share the same tags.
	        seed = self._stable_hash(str(title) + str(source)) % 10000
	        return f"https://loremflickr.com/1200/675/{query}?random={seed}"

	    async def _fetch_one(self, session, url):
	        """Fetch *url* and return the parsed site info, best effort.

	        Args:
	            session: The aiohttp client session used to issue the request.
	            url: Address of the article page.

	        Returns:
	            ``{"site_name": <str>}``; the name is empty when the request
	            fails, times out, or does not answer with HTTP 200.
	        """
	        fallback = {"site_name": ""}
	        try:
	            async with session.get(
	                url,
	                timeout=aiohttp.ClientTimeout(total=10),
	                headers={
	                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
	                },
	                allow_redirects=True,
	            ) as resp:
	                if resp.status != 200:
	                    return fallback
	                html = await resp.text()
	        except Exception as exc:
	            # Deliberately best effort, but catching ``Exception`` instead of
	            # using a bare ``except`` lets task cancellation and
	            # KeyboardInterrupt propagate.
	            self._log.debug("Fetching %r failed: %r", url, exc)
	            return fallback
	        return self._parse_html(html)

	    def _parse_html(self, html):
	        """Extract the site's brand name from an HTML document.

	        Looks at ``<meta property="og:site_name">`` first and falls back to
	        ``<meta name="application-name">``.

	        Returns:
	            ``{"site_name": <str>}`` with surrounding whitespace removed; the
	            name is empty when neither tag has content or parsing fails.
	        """
	        try:
	            soup = BeautifulSoup(html, 'html.parser')
	            meta_site = (
	                soup.find("meta", property="og:site_name")
	                or soup.find("meta", attrs={"name": "application-name"})
	            )
	            content = meta_site.get("content") if meta_site else None
	        except Exception as exc:
	            self._log.debug("Parsing HTML failed: %r", exc)
	            return {"site_name": ""}
	        site_name = content.strip() if isinstance(content, str) else ""
	        return {"site_name": site_name}

	    async def _fetch_for_article(self, session, article):
	        """Fetch site info for one article, skipping articles without a link."""
	        link = article.get('link')
	        if not link:
	            return {"site_name": ""}
	        return await self._fetch_one(session, link)

	    async def extract_all(self, articles):
	        """Refine ``source`` and attach ``image`` for every article, in place.

	        Pages are requested in batches of ``_BATCH_SIZE`` with a short pause
	        after each batch.  A failed or skipped fetch never prevents the
	        article from receiving an ``image`` URL.

	        Args:
	            articles: Sliceable sequence of article dicts (see class docstring).

	        Returns:
	            The same ``articles`` object, with its dicts updated.
	        """
	        if not articles:
	            # Nothing to do; avoid opening a connector and session.
	            return articles

	        conn = aiohttp.TCPConnector(limit=25, ssl=self.ssl_ctx)
	        async with aiohttp.ClientSession(connector=conn) as session:
	            for start in range(0, len(articles), self._BATCH_SIZE):
	                batch = articles[start:start + self._BATCH_SIZE]
	                results = await asyncio.gather(
	                    *(self._fetch_for_article(session, a) for a in batch),
	                    return_exceptions=True,
	                )
	                for article, res in zip(batch, results):
	                    extracted_site = res.get('site_name', '') if isinstance(res, dict) else ''

	                    # A missing, empty or None source counts as unknown.
	                    rss_source = str(article.get('source') or 'Unknown').lower()

	                    # Replace aggregator-style names (e.g. Google News, Yahoo)
	                    # with the branding found on the article page itself.
	                    if extracted_site and any(g in rss_source for g in self._GENERIC_SOURCES):
	                        article['source'] = extracted_site

	                    # Always provide an image link in the payload, even when
	                    # the fetch for this article failed.
	                    article['image'] = self._build_dynamic_image_url(
	                        title=article.get('title', ''),
	                        source=article.get('source', ''),
	                        ticker=article.get('ticker', 'market'),
	                    )
	                # Brief pause between batches to stay polite to remote hosts.
	                await asyncio.sleep(0.1)
	        return articles