File size: 4,794 Bytes
d3ce5a6
ec17376
 
 
 
 
 
 
 
 
 
 
 
 
d3ce5a6
ec17376
d3ce5a6
 
 
ec17376
d3ce5a6
 
 
 
 
 
ec17376
 
d3ce5a6
 
ec17376
 
d71d998
 
 
 
 
 
 
 
 
 
 
 
f17c710
 
ec17376
 
 
 
 
d3ce5a6
 
 
ec17376
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d3ce5a6
 
 
ec17376
 
 
 
 
 
 
 
 
 
 
d3ce5a6
 
ec17376
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# extractor.py
import asyncio
import logging
import re
import ssl
import zlib
from urllib.parse import quote_plus

import aiohttp
from bs4 import BeautifulSoup

class ContentExtractor:
    """Enrich article dicts with a refined source name and a placeholder image URL.

    Each article is expected to be a mutable mapping; the keys read here are
    ``link``, ``title``, ``source`` and ``ticker``, and the keys written are
    ``source`` (only when the existing one is generic) and ``image``.
    """

    # Keywords used to build the loremflickr query. Tags such as "stock" or
    # "trading" are deliberately excluded because they tend to return
    # irrelevant pictures (cat sculptures/statues).
    _SAFE_TAGS = (
        "finance", "corporate", "office", "business",
        "skyscraper", "building", "desk",
    )

    # Substrings that mark an RSS source as an aggregator / placeholder whose
    # name should be replaced by the site name found on the article page.
    _GENERIC_SOURCES = ("google", "yahoo", "unknown")

    # Number of article pages fetched concurrently per batch.
    _BATCH_SIZE = 10

    def __init__(self, verify_ssl: bool = False):
        """Create the TLS context used for all outgoing requests.

        Args:
            verify_ssl: When False (the default, matching historical
                behaviour) certificate and hostname checks are disabled.
                Pass True to use the default, verifying context.
        """
        self.ssl_ctx = ssl.create_default_context()
        if not verify_ssl:
            # SECURITY: verification is switched off so that pages behind
            # broken/self-signed certificates can still be scraped. Responses
            # are therefore not authenticated; only use the extracted site
            # name as a display hint.
            self.ssl_ctx.check_hostname = False
            self.ssl_ctx.verify_mode = ssl.CERT_NONE

    def _build_dynamic_image_url(self, title="", source="", ticker="market") -> str:
        """Return a loremflickr URL derived deterministically from the article.

        Two adjacent entries of ``_SAFE_TAGS`` are chosen from a checksum of
        the title, and a checksum of title + source is appended as the
        ``random`` seed so that different articles get different pictures.

        ``zlib.crc32`` is used instead of the built-in ``hash()`` because
        string hashes are salted per process, which would make the URL change
        on every restart.

        Args:
            title: Article headline; drives tag selection and the seed.
            source: Article source name; contributes to the seed only.
            ticker: Accepted for interface compatibility; it does not
                influence the generated URL.

        Returns:
            The image URL as a string.
        """
        tags = self._SAFE_TAGS
        title_text = str(title)

        title_sum = zlib.crc32(title_text.encode("utf-8", "replace"))
        first = title_sum % len(tags)
        second = (first + 1) % len(tags)
        query = f"{tags[first]},{tags[second]}"

        seed_input = (title_text + str(source)).encode("utf-8", "replace")
        seed = zlib.crc32(seed_input) % 10000
        return f"https://loremflickr.com/1200/675/{query}?random={seed}"

    async def _fetch_one(self, session, url) -> dict:
        """Download ``url`` and return the parsed ``{"site_name": ...}`` dict.

        This is best-effort: a missing URL, a non-200 status or any ordinary
        error yields ``{"site_name": ""}``. Task cancellation is *not*
        swallowed, so the caller can still be cancelled promptly.
        """
        if not url:
            return {"site_name": ""}
        try:
            async with session.get(
                url, timeout=aiohttp.ClientTimeout(total=10),
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
                },
                allow_redirects=True
            ) as resp:
                if resp.status == 200:
                    html = await resp.text()
                    return self._parse_html(html)
        except Exception as exc:
            # `except Exception` (rather than a bare except) lets
            # CancelledError / KeyboardInterrupt propagate.
            logging.getLogger(__name__).debug(
                "Could not fetch %s: %s", url, exc
            )
        return {"site_name": ""}

    def _parse_html(self, html) -> dict:
        """Extract the site's brand name from the page's ``<meta>`` tags.

        Looks at ``og:site_name`` first and ``application-name`` second, and
        returns the first non-empty ``content`` value (whitespace-stripped).

        Returns:
            ``{"site_name": <name>}``; the name is ``""`` when nothing usable
            is found or the document cannot be parsed.
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')
            for attrs in ({"property": "og:site_name"},
                          {"name": "application-name"}):
                tag = soup.find("meta", attrs=attrs)
                content = tag.get("content") if tag else None
                if content:
                    site_name = str(content).strip()
                    if site_name:
                        return {"site_name": site_name}
            return {"site_name": ""}
        except Exception:
            return {"site_name": ""}

    async def extract_all(self, articles):
        """Fetch every article page and enrich the article dicts in place.

        Pages are fetched in batches of ``_BATCH_SIZE`` with a short pause
        between batches. For each article:

        * if its ``source`` is missing or generic (contains "google",
          "yahoo" or "unknown") and the page exposes a site name, ``source``
          is replaced with that name;
        * ``image`` is always set, even when the page could not be fetched.

        Args:
            articles: Sequence of article dicts (mutated in place).

        Returns:
            The same ``articles`` object.
        """
        if not articles:
            # Nothing to do; avoid opening a connector/session for no work.
            return articles

        batch_size = self._BATCH_SIZE
        conn = aiohttp.TCPConnector(limit=25, ssl=self.ssl_ctx)
        async with aiohttp.ClientSession(connector=conn) as session:
            for start in range(0, len(articles), batch_size):
                batch = articles[start:start + batch_size]
                results = await asyncio.gather(
                    *(self._fetch_one(session, a.get('link')) for a in batch),
                    return_exceptions=True,
                )
                for article, res in zip(batch, results):
                    if isinstance(res, dict):
                        # Refine source name if RSS was generic or unknown.
                        rss_source = str(article.get('source') or 'Unknown').lower()
                        extracted_site = res.get('site_name', '')
                        if extracted_site and any(
                            marker in rss_source
                            for marker in self._GENERIC_SOURCES
                        ):
                            article['source'] = extracted_site

                    # Always provide an image link in the payload, regardless
                    # of whether the fetch produced a usable result.
                    article['image'] = self._build_dynamic_image_url(
                        title=article.get('title', ''),
                        source=article.get('source', ''),
                        ticker=article.get('ticker', 'market'),
                    )

                # Pause only between batches, not after the final one.
                if start + batch_size < len(articles):
                    await asyncio.sleep(0.1)
        return articles