Dmitry Beresnev committed on
Commit
a975122
·
1 Parent(s): acede88

fix news in feed section

Browse files
app/components/news.py CHANGED
@@ -3,6 +3,7 @@
3
  import streamlit as st
4
  import pandas as pd
5
  from datetime import datetime
 
6
 
7
 
8
  def display_news_card(news_item: dict):
@@ -80,7 +81,7 @@ def display_news_card(news_item: dict):
80
  <!-- Title -->
81
  <h3 style='color: #f3f4f6; margin: 0 0 12px 0; font-size: 17px;
82
  line-height: 1.5; font-weight: 600;'>
83
- {news_item['summary']}
84
  </h3>
85
 
86
  <!-- Meta info -->
@@ -244,7 +245,7 @@ def display_breaking_news_banner(df: pd.DataFrame):
244
  BREAKING NEWS • {latest['source'].upper()}
245
  </div>
246
  <div style='color: white; font-size: 18px; font-weight: 600; line-height: 1.4;'>
247
- {latest['summary']}
248
  </div>
249
  </div>
250
  <a href='{latest['url']}' target='_blank'
 
3
  import streamlit as st
4
  import pandas as pd
5
  from datetime import datetime
6
+ import html
7
 
8
 
9
  def display_news_card(news_item: dict):
 
81
  <!-- Title -->
82
  <h3 style='color: #f3f4f6; margin: 0 0 12px 0; font-size: 17px;
83
  line-height: 1.5; font-weight: 600;'>
84
+ {html.escape(news_item['summary'])}
85
  </h3>
86
 
87
  <!-- Meta info -->
 
245
  BREAKING NEWS • {latest['source'].upper()}
246
  </div>
247
  <div style='color: white; font-size: 18px; font-weight: 600; line-height: 1.4;'>
248
+ {html.escape(latest['summary'])}
249
  </div>
250
  </div>
251
  <a href='{latest['url']}' target='_blank'
app/services/news_scraper.py CHANGED
@@ -27,24 +27,9 @@ class FinanceNewsScraper:
27
  """
28
 
29
  # News sources with RSS feeds and web scraping endpoints
 
30
  SOURCES = {
31
- # ===== TIER 1: Major Financial News (RSS + Web Scraping) =====
32
- 'reuters_business': {
33
- 'name': 'Reuters Business',
34
- 'rss': 'https://www.reutersagency.com/feed/?taxonomy=best-topics&post_type=best',
35
- 'web': 'https://www.reuters.com/business/',
36
- 'selectors': {'headline': 'h3.text__text__1FZLe', 'link': 'a.text__text__1FZLe'},
37
- 'weight': 1.5,
38
- 'specialization': ['macro', 'markets']
39
- },
40
- 'reuters_markets': {
41
- 'name': 'Reuters Markets',
42
- 'rss': 'https://www.reutersagency.com/feed/?best-sectors=business-finance&post_type=best',
43
- 'web': 'https://www.reuters.com/markets/',
44
- 'selectors': {'headline': 'h3', 'link': 'a[data-testid="Heading"]'},
45
- 'weight': 1.5,
46
- 'specialization': ['markets']
47
- },
48
  'cnbc': {
49
  'name': 'CNBC',
50
  'rss': 'https://www.cnbc.com/id/100003114/device/rss/rss.html',
@@ -53,12 +38,18 @@ class FinanceNewsScraper:
53
  'weight': 1.2,
54
  'specialization': ['markets']
55
  },
56
- 'marketwatch': {
57
- 'name': 'MarketWatch',
58
- 'rss': 'https://www.marketwatch.com/rss/topstories',
59
- 'web': 'https://www.marketwatch.com/',
60
- 'selectors': {'headline': 'h3.article__headline', 'link': 'a.link'},
61
- 'weight': 1.1,
 
 
 
 
 
 
62
  'specialization': ['markets']
63
  },
64
  'ft_markets': {
@@ -69,19 +60,10 @@ class FinanceNewsScraper:
69
  'weight': 1.4,
70
  'specialization': ['markets']
71
  },
72
- 'wsj_markets': {
73
- 'name': 'WSJ Markets',
74
- 'rss': 'https://feeds.a.dj.com/rss/RSSMarketsMain.xml',
75
- 'web': 'https://www.wsj.com/news/markets',
76
- 'selectors': {'headline': 'h3.WSJTheme--headline', 'link': 'a'},
77
- 'weight': 1.4,
78
- 'specialization': ['markets']
79
- },
80
  'economist': {
81
  'name': 'The Economist',
82
  'rss': 'https://www.economist.com/finance-and-economics/rss.xml',
83
- 'web': 'https://www.economist.com/finance-and-economics',
84
- 'selectors': {'headline': 'span._headline', 'link': 'a'},
85
  'weight': 1.3,
86
  'specialization': ['macro', 'geopolitical']
87
  },
@@ -95,16 +77,8 @@ class FinanceNewsScraper:
95
  'weight': 1.4,
96
  'specialization': ['geopolitical', 'macro']
97
  },
98
- 'bloomberg_markets': {
99
- 'name': 'Bloomberg',
100
- 'rss': 'https://www.bloomberg.com/feed/podcast/etf-report.xml',
101
- 'web': 'https://www.bloomberg.com/markets',
102
- 'selectors': {'headline': 'div.single-story-module__headline', 'link': 'a'},
103
- 'weight': 1.5,
104
- 'specialization': ['markets']
105
- },
106
 
107
- # ===== TIER 3: Central Banks (RSS + Web) =====
108
  'federal_reserve': {
109
  'name': 'Federal Reserve',
110
  'rss': 'https://www.federalreserve.gov/feeds/press_all.xml',
@@ -123,9 +97,8 @@ class FinanceNewsScraper:
123
  },
124
  'imf': {
125
  'name': 'IMF',
126
- 'rss': 'https://www.imf.org/en/News/rss?language_id=1',
127
- 'web': 'https://www.imf.org/en/News',
128
- 'selectors': {'headline': 'h3', 'link': 'a'},
129
  'weight': 1.7,
130
  'specialization': ['macro', 'geopolitical']
131
  }
@@ -154,8 +127,15 @@ class FinanceNewsScraper:
154
  self.last_fetch = None
155
  self.cache_ttl = 180 # 3 minutes
156
  self.session = requests.Session()
 
157
  self.session.headers.update({
158
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
 
 
 
 
 
 
159
  })
160
 
161
  def _fetch_rss_feed(self, source_name: str, source_info: Dict) -> List[Dict]:
@@ -245,11 +225,24 @@ class FinanceNewsScraper:
245
 
246
  for headline_elem in headlines[:10]: # Limit to 10 most recent
247
  try:
248
- # Extract title text
249
- title = headline_elem.get_text(strip=True)
 
 
 
 
 
 
 
 
250
  if not title or len(title) < 10:
251
  continue
252
 
 
 
 
 
 
253
  # Find associated link
254
  # Try to find link within the headline element or its parent
255
  link_elem = headline_elem if headline_elem.name == 'a' else headline_elem.find('a')
@@ -279,16 +272,22 @@ class FinanceNewsScraper:
279
  if not url.startswith('http'):
280
  continue
281
 
 
 
 
282
  # Categorize and analyze
283
  category = self._categorize_text(title, source_info['specialization'])
284
  sentiment = self._analyze_sentiment(title)
285
  impact = self._assess_impact(source_info['weight'], title)
286
  is_breaking = self._detect_breaking_news(title)
287
 
 
 
 
288
  news_items.append({
289
  'id': hash(url),
290
  'title': title,
291
- 'summary': self._extract_summary(title),
292
  'source': source_info['name'],
293
  'category': category,
294
  'timestamp': datetime.now(), # Web scraping doesn't have timestamps
@@ -329,8 +328,9 @@ class FinanceNewsScraper:
329
  for name, info in _self.SOURCES.items():
330
  # RSS feed task
331
  futures.append((executor.submit(_self._fetch_rss_feed, name, info), name, 'RSS'))
332
- # Web scraping task
333
- futures.append((executor.submit(_self._scrape_web_page, name, info), name, 'Web'))
 
334
 
335
  for future, source_name, method in futures:
336
  try:
@@ -344,7 +344,8 @@ class FinanceNewsScraper:
344
  unique_items.append(item)
345
 
346
  all_news.extend(unique_items)
347
- logger.info(f"Fetched {len(unique_items)} unique items from {source_name} ({method})")
 
348
  except Exception as e:
349
  logger.error(f"Error processing {source_name} ({method}): {e}")
350
 
 
27
  """
28
 
29
  # News sources with RSS feeds and web scraping endpoints
30
+ # web=None means web scraping is disabled (blocked by anti-bot measures)
31
  SOURCES = {
32
+ # ===== TIER 1: Major Financial News =====
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  'cnbc': {
34
  'name': 'CNBC',
35
  'rss': 'https://www.cnbc.com/id/100003114/device/rss/rss.html',
 
38
  'weight': 1.2,
39
  'specialization': ['markets']
40
  },
41
+ 'wsj_markets': {
42
+ 'name': 'WSJ Markets',
43
+ 'rss': 'https://feeds.a.dj.com/rss/RSSMarketsMain.xml',
44
+ 'web': None, # Blocked by paywall
45
+ 'weight': 1.4,
46
+ 'specialization': ['markets']
47
+ },
48
+ 'bloomberg_markets': {
49
+ 'name': 'Bloomberg',
50
+ 'rss': 'https://feeds.bloomberg.com/markets/news.rss',
51
+ 'web': None, # Blocked by Cloudflare
52
+ 'weight': 1.5,
53
  'specialization': ['markets']
54
  },
55
  'ft_markets': {
 
60
  'weight': 1.4,
61
  'specialization': ['markets']
62
  },
 
 
 
 
 
 
 
 
63
  'economist': {
64
  'name': 'The Economist',
65
  'rss': 'https://www.economist.com/finance-and-economics/rss.xml',
66
+ 'web': None, # Blocked by anti-bot
 
67
  'weight': 1.3,
68
  'specialization': ['macro', 'geopolitical']
69
  },
 
77
  'weight': 1.4,
78
  'specialization': ['geopolitical', 'macro']
79
  },
 
 
 
 
 
 
 
 
80
 
81
+ # ===== TIER 3: Central Banks & Institutions =====
82
  'federal_reserve': {
83
  'name': 'Federal Reserve',
84
  'rss': 'https://www.federalreserve.gov/feeds/press_all.xml',
 
97
  },
98
  'imf': {
99
  'name': 'IMF',
100
+ 'rss': 'https://www.imf.org/en/news/rss',
101
+ 'web': None, # Timeout issues
 
102
  'weight': 1.7,
103
  'specialization': ['macro', 'geopolitical']
104
  }
 
127
  self.last_fetch = None
128
  self.cache_ttl = 180 # 3 minutes
129
  self.session = requests.Session()
130
+ # Enhanced headers to avoid bot detection
131
  self.session.headers.update({
132
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
133
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
134
+ 'Accept-Language': 'en-US,en;q=0.9',
135
+ 'Accept-Encoding': 'gzip, deflate, br',
136
+ 'DNT': '1',
137
+ 'Connection': 'keep-alive',
138
+ 'Upgrade-Insecure-Requests': '1'
139
  })
140
 
141
  def _fetch_rss_feed(self, source_name: str, source_info: Dict) -> List[Dict]:
 
225
 
226
  for headline_elem in headlines[:10]: # Limit to 10 most recent
227
  try:
228
+ # Extract title text - clean all HTML tags
229
+ title = headline_elem.get_text(separator=' ', strip=True)
230
+ # Remove extra whitespace
231
+ title = re.sub(r'\s+', ' ', title)
232
+ # Remove any HTML tags that might have been missed
233
+ title = re.sub(r'<[^>]+>', '', title)
234
+ # Clean up HTML entities
235
+ from html import unescape
236
+ title = unescape(title)
237
+
238
  if not title or len(title) < 10:
239
  continue
240
 
241
+ # Skip if title looks like it contains HTML comments or code
242
+ if '<!--' in title or '-->' in title or 'style=' in title:
243
+ logger.debug(f"Skipping malformed title from {source_name}: {title[:100]}")
244
+ continue
245
+
246
  # Find associated link
247
  # Try to find link within the headline element or its parent
248
  link_elem = headline_elem if headline_elem.name == 'a' else headline_elem.find('a')
 
272
  if not url.startswith('http'):
273
  continue
274
 
275
+ # Clean title from any remaining artifacts
276
+ title = title.replace('\n', ' ').replace('\r', ' ').strip()
277
+
278
  # Categorize and analyze
279
  category = self._categorize_text(title, source_info['specialization'])
280
  sentiment = self._analyze_sentiment(title)
281
  impact = self._assess_impact(source_info['weight'], title)
282
  is_breaking = self._detect_breaking_news(title)
283
 
284
+ # Create clean summary
285
+ summary = self._extract_summary(title) if len(title) > 150 else title
286
+
287
  news_items.append({
288
  'id': hash(url),
289
  'title': title,
290
+ 'summary': summary,
291
  'source': source_info['name'],
292
  'category': category,
293
  'timestamp': datetime.now(), # Web scraping doesn't have timestamps
 
328
  for name, info in _self.SOURCES.items():
329
  # RSS feed task
330
  futures.append((executor.submit(_self._fetch_rss_feed, name, info), name, 'RSS'))
331
+ # Web scraping task (only if web URL is configured)
332
+ if info.get('web'):
333
+ futures.append((executor.submit(_self._scrape_web_page, name, info), name, 'Web'))
334
 
335
  for future, source_name, method in futures:
336
  try:
 
344
  unique_items.append(item)
345
 
346
  all_news.extend(unique_items)
347
+ if len(unique_items) > 0:
348
+ logger.info(f"Fetched {len(unique_items)} unique items from {source_name} ({method})")
349
  except Exception as e:
350
  logger.error(f"Error processing {source_name} ({method}): {e}")
351