Dmitry Beresnev committed on
Commit
7e1fe44
·
1 Parent(s): 0e6d54c

fix news feed

Browse files
app/pages/05_Dashboard.py CHANGED
@@ -175,10 +175,34 @@ display_category_breakdown(stats)
175
 
176
  st.markdown("---")
177
 
178
- # News feed controls
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  col1, col2, col3 = st.columns([2, 1, 1])
180
  with col1:
181
- st.markdown("## 📰 Latest News Feed")
182
  with col2:
183
  show_count = st.selectbox("Show", [10, 20, 50, 100], index=1, label_visibility="collapsed")
184
  with col3:
 
175
 
176
  st.markdown("---")
177
 
178
+ # ---- MAIN PAGE NEWS (Web-Scraped) ----
179
+ st.markdown("## 🔥 Top Stories from Main Pages")
180
+ st.caption("Latest headlines directly from news source homepages")
181
+
182
+ main_page_df = monitor.get_main_page_news()
183
+ if not main_page_df.empty:
184
+ # Apply filters to main page news
185
+ filtered_main = main_page_df.copy()
186
+ if category_filter != 'all':
187
+ filtered_main = filtered_main[filtered_main['category'] == category_filter]
188
+ if sentiment_filter != 'all':
189
+ filtered_main = filtered_main[filtered_main['sentiment'] == sentiment_filter]
190
+ if impact_filter != 'all':
191
+ filtered_main = filtered_main[filtered_main['impact'] == impact_filter]
192
+
193
+ if not filtered_main.empty:
194
+ display_news_feed(filtered_main, max_items=10)
195
+ else:
196
+ st.info("📭 No main page news matches your filters.")
197
+ else:
198
+ st.info("⏳ Main page news will appear here...")
199
+
200
+ st.markdown("---")
201
+
202
+ # ---- ALL NEWS FEED (RSS + Web) ----
203
  col1, col2, col3 = st.columns([2, 1, 1])
204
  with col1:
205
+ st.markdown("## 📰 All News Feed")
206
  with col2:
207
  show_count = st.selectbox("Show", [10, 20, 50, 100], index=1, label_visibility="collapsed")
208
  with col3:
app/services/news_scraper.py CHANGED
@@ -36,6 +36,7 @@ class FinanceNewsScraper:
36
  'web': 'https://www.cnbc.com/world/',
37
  'selectors': {'headline': 'a.Card-title', 'link': 'a.Card-title'},
38
  'weight': 1.2,
 
39
  'specialization': ['markets']
40
  },
41
  'wsj_markets': {
@@ -58,6 +59,7 @@ class FinanceNewsScraper:
58
  'web': 'https://www.ft.com/markets',
59
  'selectors': {'headline': 'div.o-teaser__heading', 'link': 'a.js-teaser-heading-link'},
60
  'weight': 1.4,
 
61
  'specialization': ['markets']
62
  },
63
  'economist': {
@@ -73,8 +75,9 @@ class FinanceNewsScraper:
73
  'name': 'BBC Business',
74
  'rss': 'http://feeds.bbci.co.uk/news/business/rss.xml',
75
  'web': 'https://www.bbc.com/news/business',
76
- 'selectors': {'headline': 'h3', 'link': 'a[data-testid="internal-link"]'},
77
  'weight': 1.4,
 
78
  'specialization': ['geopolitical', 'macro']
79
  },
80
 
@@ -82,16 +85,14 @@ class FinanceNewsScraper:
82
  'federal_reserve': {
83
  'name': 'Federal Reserve',
84
  'rss': 'https://www.federalreserve.gov/feeds/press_all.xml',
85
- 'web': 'https://www.federalreserve.gov/newsevents/pressreleases.htm',
86
- 'selectors': {'headline': 'div.row', 'link': 'a'},
87
  'weight': 2.0,
88
  'specialization': ['macro']
89
  },
90
  'ecb': {
91
  'name': 'European Central Bank',
92
  'rss': 'https://www.ecb.europa.eu/rss/press.xml',
93
- 'web': 'https://www.ecb.europa.eu/press/pr/date/html/index.en.html',
94
- 'selectors': {'headline': 'dt', 'link': 'a'},
95
  'weight': 2.0,
96
  'specialization': ['macro']
97
  },
@@ -196,7 +197,8 @@ class FinanceNewsScraper:
196
  'likes': 0, # RSS feeds don't have engagement metrics
197
  'retweets': 0,
198
  'is_breaking': is_breaking,
199
- 'source_weight': source_info['weight']
 
200
  })
201
 
202
  return news_items
@@ -302,7 +304,8 @@ class FinanceNewsScraper:
302
  'likes': 0,
303
  'retweets': 0,
304
  'is_breaking': is_breaking,
305
- 'source_weight': source_info['weight']
 
306
  })
307
 
308
  except Exception as e:
@@ -359,15 +362,27 @@ class FinanceNewsScraper:
359
  logger.warning("No news fetched from any source - using mock data")
360
  return _self._get_mock_news()
361
 
362
- # Sort by breaking news, impact, and timestamp
363
  all_news.sort(
364
- key=lambda x: (x['is_breaking'], x['impact'] == 'high', x['timestamp']),
365
  reverse=True
366
  )
367
 
368
- logger.info(f"Total unique news items: {len(all_news)}")
369
  return all_news[:max_items]
370
 
 
 
 
 
 
 
 
 
 
 
 
 
371
  def _categorize_text(self, text: str, source_specialization: List[str]) -> str:
372
  """Categorize news based on keywords and source specialization"""
373
  text_lower = text.lower()
 
36
  'web': 'https://www.cnbc.com/world/',
37
  'selectors': {'headline': 'a.Card-title', 'link': 'a.Card-title'},
38
  'weight': 1.2,
39
+ 'web_priority': True, # Web scraping is higher priority
40
  'specialization': ['markets']
41
  },
42
  'wsj_markets': {
 
59
  'web': 'https://www.ft.com/markets',
60
  'selectors': {'headline': 'div.o-teaser__heading', 'link': 'a.js-teaser-heading-link'},
61
  'weight': 1.4,
62
+ 'web_priority': True,
63
  'specialization': ['markets']
64
  },
65
  'economist': {
 
75
  'name': 'BBC Business',
76
  'rss': 'http://feeds.bbci.co.uk/news/business/rss.xml',
77
  'web': 'https://www.bbc.com/news/business',
78
+ 'selectors': {'headline': 'h2[data-testid="card-headline"]', 'link': 'a[data-testid="internal-link"]'},
79
  'weight': 1.4,
80
+ 'web_priority': True,
81
  'specialization': ['geopolitical', 'macro']
82
  },
83
 
 
85
  'federal_reserve': {
86
  'name': 'Federal Reserve',
87
  'rss': 'https://www.federalreserve.gov/feeds/press_all.xml',
88
+ 'web': None, # Disabled - RSS works well
 
89
  'weight': 2.0,
90
  'specialization': ['macro']
91
  },
92
  'ecb': {
93
  'name': 'European Central Bank',
94
  'rss': 'https://www.ecb.europa.eu/rss/press.xml',
95
+ 'web': None, # Disabled - RSS works well
 
96
  'weight': 2.0,
97
  'specialization': ['macro']
98
  },
 
197
  'likes': 0, # RSS feeds don't have engagement metrics
198
  'retweets': 0,
199
  'is_breaking': is_breaking,
200
+ 'source_weight': source_info['weight'],
201
+ 'from_web': False # Mark as RSS feed
202
  })
203
 
204
  return news_items
 
304
  'likes': 0,
305
  'retweets': 0,
306
  'is_breaking': is_breaking,
307
+ 'source_weight': source_info['weight'],
308
+ 'from_web': True # Mark as web-scraped (main page news)
309
  })
310
 
311
  except Exception as e:
 
362
  logger.warning("No news fetched from any source - using mock data")
363
  return _self._get_mock_news()
364
 
365
+ # Sort by: web-scraped first, then breaking news, then impact, then timestamp
366
  all_news.sort(
367
+ key=lambda x: (x.get('from_web', False), x['is_breaking'], x['impact'] == 'high', x['timestamp']),
368
  reverse=True
369
  )
370
 
371
+ logger.info(f"Total unique news items: {len(all_news)} (Web: {sum(1 for n in all_news if n.get('from_web'))}, RSS: {sum(1 for n in all_news if not n.get('from_web'))})")
372
  return all_news[:max_items]
373
 
374
+ def get_main_page_news(self) -> pd.DataFrame:
375
+ """Get only news from main pages (web-scraped)"""
376
+ if not self.news_cache:
377
+ self.news_cache = self.scrape_news(max_items=100)
378
+ self.last_fetch = datetime.now()
379
+
380
+ main_news = [n for n in self.news_cache if n.get('from_web', False)]
381
+ df = pd.DataFrame(main_news)
382
+ if not df.empty:
383
+ df['timestamp'] = pd.to_datetime(df['timestamp'])
384
+ return df
385
+
386
  def _categorize_text(self, text: str, source_specialization: List[str]) -> str:
387
  """Categorize news based on keywords and source specialization"""
388
  text_lower = text.lower()