| |
import asyncio
import logging
import re
import ssl
import zlib
from urllib.parse import quote_plus

import aiohttp
from bs4 import BeautifulSoup
|
|
class ContentExtractor:
    """Enrich article dicts with a publisher name and a placeholder image URL.

    For each article the linked page is fetched and its ``og:site_name`` /
    ``application-name`` meta tag is read. That name replaces the article's
    ``source`` when the existing source is an aggregator or missing. Every
    article also receives an ``image`` URL pointing at loremflickr.com.
    """

    # Generic image tags used to build the placeholder image query.
    _IMAGE_TAGS = (
        "finance",
        "corporate",
        "office",
        "business",
        "skyscraper",
        "building",
        "desk",
    )

    # Substrings of an article's source that mark it as "not the real publisher".
    _AGGREGATOR_MARKERS = ("google", "yahoo", "unknown")

    # Number of article pages fetched concurrently per batch.
    _BATCH_SIZE = 10

    def __init__(self):
        # NOTE(security): hostname and certificate verification are disabled
        # for every request made with this context, so fetched pages are not
        # authenticated. NOTE(review): confirm this is acceptable given that
        # the only value taken from the page is a display-only site name.
        self.ssl_ctx = ssl.create_default_context()
        self.ssl_ctx.check_hostname = False
        self.ssl_ctx.verify_mode = ssl.CERT_NONE

    @staticmethod
    def _stable_hash(text):
        """Return a non-negative integer hash of *text* that is stable across runs.

        The built-in ``hash()`` is salted per process for strings, so it
        cannot be used for values that must survive a restart.
        """
        return zlib.crc32(str(text).encode("utf-8", errors="replace"))

    def _build_dynamic_image_url(self, title="", source="", ticker="market"):
        """Return a deterministic loremflickr image URL for an article.

        Args:
            title: Article title; selects the two image tags and feeds the seed.
            source: Article source; combined with ``title`` to derive the seed.
            ticker: Accepted for interface compatibility; it does not
                influence the returned URL.

        Returns:
            A URL of the form
            ``https://loremflickr.com/1200/675/<tag>,<tag>?random=<seed>``.
            The same ``title``/``source`` pair always yields the same URL,
            including across interpreter restarts.
        """
        tags = self._IMAGE_TAGS
        title_text = str(title)

        # Two adjacent tags, chosen by the title, form the image query.
        first = self._stable_hash(title_text) % len(tags)
        second = (first + 1) % len(tags)
        query = f"{tags[first]},{tags[second]}"

        # The seed varies the picture between articles that share the same tags.
        seed = self._stable_hash(title_text + str(source)) % 10000
        return f"https://loremflickr.com/1200/675/{query}?random={seed}"

    async def _fetch_one(self, session, url):
        """Fetch *url* and return the metadata parsed from its HTML.

        Args:
            session: An open ``aiohttp.ClientSession``.
            url: Address of the article page; a falsy value skips the fetch.

        Returns:
            ``{"site_name": <str>}``. The name is empty when the URL is
            missing, the response status is not 200, or the request fails.
        """
        if not url:
            return {"site_name": ""}
        try:
            async with session.get(
                url,
                timeout=aiohttp.ClientTimeout(total=10),
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
                },
                allow_redirects=True,
            ) as resp:
                if resp.status == 200:
                    html = await resp.text()
                    return self._parse_html(html)
        except Exception as exc:
            # Best-effort enrichment: a failed fetch must not abort the batch.
            # ``Exception`` (rather than a bare except) lets task cancellation
            # and KeyboardInterrupt propagate.
            logging.getLogger(__name__).debug(
                "Metadata fetch failed for %s: %s", url, exc
            )
        return {"site_name": ""}

    def _parse_html(self, html):
        """Extract the site name from an HTML document.

        Looks for ``<meta property="og:site_name">`` first and falls back to
        ``<meta name="application-name">``.

        Returns:
            ``{"site_name": <str>}``; the name is empty when neither tag has
            a ``content`` value or the document cannot be parsed.
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')
            meta_site = (
                soup.find("meta", property="og:site_name")
                or soup.find("meta", attrs={"name": "application-name"})
            )
            site_name = ""
            if meta_site and meta_site.get("content"):
                site_name = meta_site["content"]
            return {"site_name": site_name}
        except Exception as exc:
            # Parsing is best-effort; fall back to "no site name".
            logging.getLogger(__name__).debug("HTML parsing failed: %s", exc)
            return {"site_name": ""}

    async def extract_all(self, articles):
        """Enrich every article in place and return the same sequence.

        Pages are fetched in batches of ``_BATCH_SIZE`` with a short pause
        between batches. For each article:

        * ``source`` is replaced by the page's site name when the current
          source is missing/empty or contains "google", "yahoo" or "unknown",
          and a site name was found;
        * ``image`` is set from :meth:`_build_dynamic_image_url`.

        Args:
            articles: Sequence of dicts; ``link``, ``source``, ``title`` and
                ``ticker`` keys are read when present.

        Returns:
            The ``articles`` object that was passed in.
        """
        if not articles:
            return articles

        conn = aiohttp.TCPConnector(limit=25, ssl=self.ssl_ctx)
        async with aiohttp.ClientSession(connector=conn) as session:
            for start in range(0, len(articles), self._BATCH_SIZE):
                batch = articles[start:start + self._BATCH_SIZE]
                tasks = [
                    self._fetch_one(session, article.get('link'))
                    for article in batch
                ]
                results = await asyncio.gather(*tasks, return_exceptions=True)

                for article, res in zip(batch, results):
                    if isinstance(res, dict):
                        # A None or empty source is treated like "Unknown".
                        rss_source = str(article.get('source') or 'Unknown').lower()
                        extracted_site = res.get('site_name', '')
                        is_aggregator = any(
                            marker in rss_source
                            for marker in self._AGGREGATOR_MARKERS
                        )
                        if is_aggregator and extracted_site:
                            article['source'] = extracted_site

                    # Built after the source fix-up so the seed reflects the
                    # final source value.
                    article['image'] = self._build_dynamic_image_url(
                        title=article.get('title', ''),
                        source=article.get('source', ''),
                        ticker=article.get('ticker', 'market'),
                    )

                # Brief pause between batches to avoid hammering remote hosts.
                await asyncio.sleep(0.1)
        return articles
|
|