Dmitry Beresnev
committed on
Commit
·
a975122
1
Parent(s):
acede88
fix news in feed section
Browse files
- app/components/news.py +3 -2
- app/services/news_scraper.py +53 -52
app/components/news.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
| 3 |
import streamlit as st
|
| 4 |
import pandas as pd
|
| 5 |
from datetime import datetime
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
def display_news_card(news_item: dict):
|
|
@@ -80,7 +81,7 @@ def display_news_card(news_item: dict):
|
|
| 80 |
<!-- Title -->
|
| 81 |
<h3 style='color: #f3f4f6; margin: 0 0 12px 0; font-size: 17px;
|
| 82 |
line-height: 1.5; font-weight: 600;'>
|
| 83 |
-
{news_item['summary']}
|
| 84 |
</h3>
|
| 85 |
|
| 86 |
<!-- Meta info -->
|
|
@@ -244,7 +245,7 @@ def display_breaking_news_banner(df: pd.DataFrame):
|
|
| 244 |
BREAKING NEWS • {latest['source'].upper()}
|
| 245 |
</div>
|
| 246 |
<div style='color: white; font-size: 18px; font-weight: 600; line-height: 1.4;'>
|
| 247 |
-
{latest['summary']}
|
| 248 |
</div>
|
| 249 |
</div>
|
| 250 |
<a href='{latest['url']}' target='_blank'
|
|
|
|
| 3 |
import streamlit as st
|
| 4 |
import pandas as pd
|
| 5 |
from datetime import datetime
|
| 6 |
+
import html
|
| 7 |
|
| 8 |
|
| 9 |
def display_news_card(news_item: dict):
|
|
|
|
| 81 |
<!-- Title -->
|
| 82 |
<h3 style='color: #f3f4f6; margin: 0 0 12px 0; font-size: 17px;
|
| 83 |
line-height: 1.5; font-weight: 600;'>
|
| 84 |
+
{html.escape(news_item['summary'])}
|
| 85 |
</h3>
|
| 86 |
|
| 87 |
<!-- Meta info -->
|
|
|
|
| 245 |
BREAKING NEWS • {latest['source'].upper()}
|
| 246 |
</div>
|
| 247 |
<div style='color: white; font-size: 18px; font-weight: 600; line-height: 1.4;'>
|
| 248 |
+
{html.escape(latest['summary'])}
|
| 249 |
</div>
|
| 250 |
</div>
|
| 251 |
<a href='{latest['url']}' target='_blank'
|
app/services/news_scraper.py
CHANGED
|
@@ -27,24 +27,9 @@ class FinanceNewsScraper:
|
|
| 27 |
"""
|
| 28 |
|
| 29 |
# News sources with RSS feeds and web scraping endpoints
|
|
|
|
| 30 |
SOURCES = {
|
| 31 |
-
# ===== TIER 1: Major Financial News
|
| 32 |
-
'reuters_business': {
|
| 33 |
-
'name': 'Reuters Business',
|
| 34 |
-
'rss': 'https://www.reutersagency.com/feed/?taxonomy=best-topics&post_type=best',
|
| 35 |
-
'web': 'https://www.reuters.com/business/',
|
| 36 |
-
'selectors': {'headline': 'h3.text__text__1FZLe', 'link': 'a.text__text__1FZLe'},
|
| 37 |
-
'weight': 1.5,
|
| 38 |
-
'specialization': ['macro', 'markets']
|
| 39 |
-
},
|
| 40 |
-
'reuters_markets': {
|
| 41 |
-
'name': 'Reuters Markets',
|
| 42 |
-
'rss': 'https://www.reutersagency.com/feed/?best-sectors=business-finance&post_type=best',
|
| 43 |
-
'web': 'https://www.reuters.com/markets/',
|
| 44 |
-
'selectors': {'headline': 'h3', 'link': 'a[data-testid="Heading"]'},
|
| 45 |
-
'weight': 1.5,
|
| 46 |
-
'specialization': ['markets']
|
| 47 |
-
},
|
| 48 |
'cnbc': {
|
| 49 |
'name': 'CNBC',
|
| 50 |
'rss': 'https://www.cnbc.com/id/100003114/device/rss/rss.html',
|
|
@@ -53,12 +38,18 @@ class FinanceNewsScraper:
|
|
| 53 |
'weight': 1.2,
|
| 54 |
'specialization': ['markets']
|
| 55 |
},
|
| 56 |
-
'
|
| 57 |
-
'name': '
|
| 58 |
-
'rss': 'https://
|
| 59 |
-
'web':
|
| 60 |
-
'
|
| 61 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
'specialization': ['markets']
|
| 63 |
},
|
| 64 |
'ft_markets': {
|
|
@@ -69,19 +60,10 @@ class FinanceNewsScraper:
|
|
| 69 |
'weight': 1.4,
|
| 70 |
'specialization': ['markets']
|
| 71 |
},
|
| 72 |
-
'wsj_markets': {
|
| 73 |
-
'name': 'WSJ Markets',
|
| 74 |
-
'rss': 'https://feeds.a.dj.com/rss/RSSMarketsMain.xml',
|
| 75 |
-
'web': 'https://www.wsj.com/news/markets',
|
| 76 |
-
'selectors': {'headline': 'h3.WSJTheme--headline', 'link': 'a'},
|
| 77 |
-
'weight': 1.4,
|
| 78 |
-
'specialization': ['markets']
|
| 79 |
-
},
|
| 80 |
'economist': {
|
| 81 |
'name': 'The Economist',
|
| 82 |
'rss': 'https://www.economist.com/finance-and-economics/rss.xml',
|
| 83 |
-
'web':
|
| 84 |
-
'selectors': {'headline': 'span._headline', 'link': 'a'},
|
| 85 |
'weight': 1.3,
|
| 86 |
'specialization': ['macro', 'geopolitical']
|
| 87 |
},
|
|
@@ -95,16 +77,8 @@ class FinanceNewsScraper:
|
|
| 95 |
'weight': 1.4,
|
| 96 |
'specialization': ['geopolitical', 'macro']
|
| 97 |
},
|
| 98 |
-
'bloomberg_markets': {
|
| 99 |
-
'name': 'Bloomberg',
|
| 100 |
-
'rss': 'https://www.bloomberg.com/feed/podcast/etf-report.xml',
|
| 101 |
-
'web': 'https://www.bloomberg.com/markets',
|
| 102 |
-
'selectors': {'headline': 'div.single-story-module__headline', 'link': 'a'},
|
| 103 |
-
'weight': 1.5,
|
| 104 |
-
'specialization': ['markets']
|
| 105 |
-
},
|
| 106 |
|
| 107 |
-
# ===== TIER 3: Central Banks
|
| 108 |
'federal_reserve': {
|
| 109 |
'name': 'Federal Reserve',
|
| 110 |
'rss': 'https://www.federalreserve.gov/feeds/press_all.xml',
|
|
@@ -123,9 +97,8 @@ class FinanceNewsScraper:
|
|
| 123 |
},
|
| 124 |
'imf': {
|
| 125 |
'name': 'IMF',
|
| 126 |
-
'rss': 'https://www.imf.org/en/
|
| 127 |
-
'web':
|
| 128 |
-
'selectors': {'headline': 'h3', 'link': 'a'},
|
| 129 |
'weight': 1.7,
|
| 130 |
'specialization': ['macro', 'geopolitical']
|
| 131 |
}
|
|
@@ -154,8 +127,15 @@ class FinanceNewsScraper:
|
|
| 154 |
self.last_fetch = None
|
| 155 |
self.cache_ttl = 180 # 3 minutes
|
| 156 |
self.session = requests.Session()
|
|
|
|
| 157 |
self.session.headers.update({
|
| 158 |
-
'User-Agent': 'Mozilla/5.0 (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
})
|
| 160 |
|
| 161 |
def _fetch_rss_feed(self, source_name: str, source_info: Dict) -> List[Dict]:
|
|
@@ -245,11 +225,24 @@ class FinanceNewsScraper:
|
|
| 245 |
|
| 246 |
for headline_elem in headlines[:10]: # Limit to 10 most recent
|
| 247 |
try:
|
| 248 |
-
# Extract title text
|
| 249 |
-
title = headline_elem.get_text(strip=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
if not title or len(title) < 10:
|
| 251 |
continue
|
| 252 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
# Find associated link
|
| 254 |
# Try to find link within the headline element or its parent
|
| 255 |
link_elem = headline_elem if headline_elem.name == 'a' else headline_elem.find('a')
|
|
@@ -279,16 +272,22 @@ class FinanceNewsScraper:
|
|
| 279 |
if not url.startswith('http'):
|
| 280 |
continue
|
| 281 |
|
|
|
|
|
|
|
|
|
|
| 282 |
# Categorize and analyze
|
| 283 |
category = self._categorize_text(title, source_info['specialization'])
|
| 284 |
sentiment = self._analyze_sentiment(title)
|
| 285 |
impact = self._assess_impact(source_info['weight'], title)
|
| 286 |
is_breaking = self._detect_breaking_news(title)
|
| 287 |
|
|
|
|
|
|
|
|
|
|
| 288 |
news_items.append({
|
| 289 |
'id': hash(url),
|
| 290 |
'title': title,
|
| 291 |
-
'summary':
|
| 292 |
'source': source_info['name'],
|
| 293 |
'category': category,
|
| 294 |
'timestamp': datetime.now(), # Web scraping doesn't have timestamps
|
|
@@ -329,8 +328,9 @@ class FinanceNewsScraper:
|
|
| 329 |
for name, info in _self.SOURCES.items():
|
| 330 |
# RSS feed task
|
| 331 |
futures.append((executor.submit(_self._fetch_rss_feed, name, info), name, 'RSS'))
|
| 332 |
-
# Web scraping task
|
| 333 |
-
|
|
|
|
| 334 |
|
| 335 |
for future, source_name, method in futures:
|
| 336 |
try:
|
|
@@ -344,7 +344,8 @@ class FinanceNewsScraper:
|
|
| 344 |
unique_items.append(item)
|
| 345 |
|
| 346 |
all_news.extend(unique_items)
|
| 347 |
-
|
|
|
|
| 348 |
except Exception as e:
|
| 349 |
logger.error(f"Error processing {source_name} ({method}): {e}")
|
| 350 |
|
|
|
|
| 27 |
"""
|
| 28 |
|
| 29 |
# News sources with RSS feeds and web scraping endpoints
|
| 30 |
+
# web=None means web scraping is disabled (blocked by anti-bot measures)
|
| 31 |
SOURCES = {
|
| 32 |
+
# ===== TIER 1: Major Financial News =====
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
'cnbc': {
|
| 34 |
'name': 'CNBC',
|
| 35 |
'rss': 'https://www.cnbc.com/id/100003114/device/rss/rss.html',
|
|
|
|
| 38 |
'weight': 1.2,
|
| 39 |
'specialization': ['markets']
|
| 40 |
},
|
| 41 |
+
'wsj_markets': {
|
| 42 |
+
'name': 'WSJ Markets',
|
| 43 |
+
'rss': 'https://feeds.a.dj.com/rss/RSSMarketsMain.xml',
|
| 44 |
+
'web': None, # Blocked by paywall
|
| 45 |
+
'weight': 1.4,
|
| 46 |
+
'specialization': ['markets']
|
| 47 |
+
},
|
| 48 |
+
'bloomberg_markets': {
|
| 49 |
+
'name': 'Bloomberg',
|
| 50 |
+
'rss': 'https://feeds.bloomberg.com/markets/news.rss',
|
| 51 |
+
'web': None, # Blocked by Cloudflare
|
| 52 |
+
'weight': 1.5,
|
| 53 |
'specialization': ['markets']
|
| 54 |
},
|
| 55 |
'ft_markets': {
|
|
|
|
| 60 |
'weight': 1.4,
|
| 61 |
'specialization': ['markets']
|
| 62 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
'economist': {
|
| 64 |
'name': 'The Economist',
|
| 65 |
'rss': 'https://www.economist.com/finance-and-economics/rss.xml',
|
| 66 |
+
'web': None, # Blocked by anti-bot
|
|
|
|
| 67 |
'weight': 1.3,
|
| 68 |
'specialization': ['macro', 'geopolitical']
|
| 69 |
},
|
|
|
|
| 77 |
'weight': 1.4,
|
| 78 |
'specialization': ['geopolitical', 'macro']
|
| 79 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
+
# ===== TIER 3: Central Banks & Institutions =====
|
| 82 |
'federal_reserve': {
|
| 83 |
'name': 'Federal Reserve',
|
| 84 |
'rss': 'https://www.federalreserve.gov/feeds/press_all.xml',
|
|
|
|
| 97 |
},
|
| 98 |
'imf': {
|
| 99 |
'name': 'IMF',
|
| 100 |
+
'rss': 'https://www.imf.org/en/news/rss',
|
| 101 |
+
'web': None, # Timeout issues
|
|
|
|
| 102 |
'weight': 1.7,
|
| 103 |
'specialization': ['macro', 'geopolitical']
|
| 104 |
}
|
|
|
|
| 127 |
self.last_fetch = None
|
| 128 |
self.cache_ttl = 180 # 3 minutes
|
| 129 |
self.session = requests.Session()
|
| 130 |
+
# Enhanced headers to avoid bot detection
|
| 131 |
self.session.headers.update({
|
| 132 |
+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 133 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
| 134 |
+
'Accept-Language': 'en-US,en;q=0.9',
|
| 135 |
+
'Accept-Encoding': 'gzip, deflate, br',
|
| 136 |
+
'DNT': '1',
|
| 137 |
+
'Connection': 'keep-alive',
|
| 138 |
+
'Upgrade-Insecure-Requests': '1'
|
| 139 |
})
|
| 140 |
|
| 141 |
def _fetch_rss_feed(self, source_name: str, source_info: Dict) -> List[Dict]:
|
|
|
|
| 225 |
|
| 226 |
for headline_elem in headlines[:10]: # Limit to 10 most recent
|
| 227 |
try:
|
| 228 |
+
# Extract title text - clean all HTML tags
|
| 229 |
+
title = headline_elem.get_text(separator=' ', strip=True)
|
| 230 |
+
# Remove extra whitespace
|
| 231 |
+
title = re.sub(r'\s+', ' ', title)
|
| 232 |
+
# Remove any HTML tags that might have been missed
|
| 233 |
+
title = re.sub(r'<[^>]+>', '', title)
|
| 234 |
+
# Clean up HTML entities
|
| 235 |
+
from html import unescape
|
| 236 |
+
title = unescape(title)
|
| 237 |
+
|
| 238 |
if not title or len(title) < 10:
|
| 239 |
continue
|
| 240 |
|
| 241 |
+
# Skip if title looks like it contains HTML comments or code
|
| 242 |
+
if '<!--' in title or '-->' in title or 'style=' in title:
|
| 243 |
+
logger.debug(f"Skipping malformed title from {source_name}: {title[:100]}")
|
| 244 |
+
continue
|
| 245 |
+
|
| 246 |
# Find associated link
|
| 247 |
# Try to find link within the headline element or its parent
|
| 248 |
link_elem = headline_elem if headline_elem.name == 'a' else headline_elem.find('a')
|
|
|
|
| 272 |
if not url.startswith('http'):
|
| 273 |
continue
|
| 274 |
|
| 275 |
+
# Clean title from any remaining artifacts
|
| 276 |
+
title = title.replace('\n', ' ').replace('\r', ' ').strip()
|
| 277 |
+
|
| 278 |
# Categorize and analyze
|
| 279 |
category = self._categorize_text(title, source_info['specialization'])
|
| 280 |
sentiment = self._analyze_sentiment(title)
|
| 281 |
impact = self._assess_impact(source_info['weight'], title)
|
| 282 |
is_breaking = self._detect_breaking_news(title)
|
| 283 |
|
| 284 |
+
# Create clean summary
|
| 285 |
+
summary = self._extract_summary(title) if len(title) > 150 else title
|
| 286 |
+
|
| 287 |
news_items.append({
|
| 288 |
'id': hash(url),
|
| 289 |
'title': title,
|
| 290 |
+
'summary': summary,
|
| 291 |
'source': source_info['name'],
|
| 292 |
'category': category,
|
| 293 |
'timestamp': datetime.now(), # Web scraping doesn't have timestamps
|
|
|
|
| 328 |
for name, info in _self.SOURCES.items():
|
| 329 |
# RSS feed task
|
| 330 |
futures.append((executor.submit(_self._fetch_rss_feed, name, info), name, 'RSS'))
|
| 331 |
+
# Web scraping task (only if web URL is configured)
|
| 332 |
+
if info.get('web'):
|
| 333 |
+
futures.append((executor.submit(_self._scrape_web_page, name, info), name, 'Web'))
|
| 334 |
|
| 335 |
for future, source_name, method in futures:
|
| 336 |
try:
|
|
|
|
| 344 |
unique_items.append(item)
|
| 345 |
|
| 346 |
all_news.extend(unique_items)
|
| 347 |
+
if len(unique_items) > 0:
|
| 348 |
+
logger.info(f"Fetched {len(unique_items)} unique items from {source_name} ({method})")
|
| 349 |
except Exception as e:
|
| 350 |
logger.error(f"Error processing {source_name} ({method}): {e}")
|
| 351 |
|