Dmitry Beresnev
committed on
Commit
·
7e1fe44
1
Parent(s):
0e6d54c
fix news feed
Browse files
- app/pages/05_Dashboard.py +26 -2
- app/services/news_scraper.py +25 -10
app/pages/05_Dashboard.py
CHANGED
|
@@ -175,10 +175,34 @@ display_category_breakdown(stats)
|
|
| 175 |
|
| 176 |
st.markdown("---")
|
| 177 |
|
| 178 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
col1, col2, col3 = st.columns([2, 1, 1])
|
| 180 |
with col1:
|
| 181 |
-
st.markdown("## 📰
|
| 182 |
with col2:
|
| 183 |
show_count = st.selectbox("Show", [10, 20, 50, 100], index=1, label_visibility="collapsed")
|
| 184 |
with col3:
|
|
|
|
| 175 |
|
| 176 |
st.markdown("---")
|
| 177 |
|
| 178 |
+
# ---- MAIN PAGE NEWS (Web-Scraped) ----
|
| 179 |
+
st.markdown("## 🔥 Top Stories from Main Pages")
|
| 180 |
+
st.caption("Latest headlines directly from news source homepages")
|
| 181 |
+
|
| 182 |
+
main_page_df = monitor.get_main_page_news()
|
| 183 |
+
if not main_page_df.empty:
|
| 184 |
+
# Apply filters to main page news
|
| 185 |
+
filtered_main = main_page_df.copy()
|
| 186 |
+
if category_filter != 'all':
|
| 187 |
+
filtered_main = filtered_main[filtered_main['category'] == category_filter]
|
| 188 |
+
if sentiment_filter != 'all':
|
| 189 |
+
filtered_main = filtered_main[filtered_main['sentiment'] == sentiment_filter]
|
| 190 |
+
if impact_filter != 'all':
|
| 191 |
+
filtered_main = filtered_main[filtered_main['impact'] == impact_filter]
|
| 192 |
+
|
| 193 |
+
if not filtered_main.empty:
|
| 194 |
+
display_news_feed(filtered_main, max_items=10)
|
| 195 |
+
else:
|
| 196 |
+
st.info("📭 No main page news matches your filters.")
|
| 197 |
+
else:
|
| 198 |
+
st.info("⏳ Main page news will appear here...")
|
| 199 |
+
|
| 200 |
+
st.markdown("---")
|
| 201 |
+
|
| 202 |
+
# ---- ALL NEWS FEED (RSS + Web) ----
|
| 203 |
col1, col2, col3 = st.columns([2, 1, 1])
|
| 204 |
with col1:
|
| 205 |
+
st.markdown("## 📰 All News Feed")
|
| 206 |
with col2:
|
| 207 |
show_count = st.selectbox("Show", [10, 20, 50, 100], index=1, label_visibility="collapsed")
|
| 208 |
with col3:
|
app/services/news_scraper.py
CHANGED
|
@@ -36,6 +36,7 @@ class FinanceNewsScraper:
|
|
| 36 |
'web': 'https://www.cnbc.com/world/',
|
| 37 |
'selectors': {'headline': 'a.Card-title', 'link': 'a.Card-title'},
|
| 38 |
'weight': 1.2,
|
|
|
|
| 39 |
'specialization': ['markets']
|
| 40 |
},
|
| 41 |
'wsj_markets': {
|
|
@@ -58,6 +59,7 @@ class FinanceNewsScraper:
|
|
| 58 |
'web': 'https://www.ft.com/markets',
|
| 59 |
'selectors': {'headline': 'div.o-teaser__heading', 'link': 'a.js-teaser-heading-link'},
|
| 60 |
'weight': 1.4,
|
|
|
|
| 61 |
'specialization': ['markets']
|
| 62 |
},
|
| 63 |
'economist': {
|
|
@@ -73,8 +75,9 @@ class FinanceNewsScraper:
|
|
| 73 |
'name': 'BBC Business',
|
| 74 |
'rss': 'http://feeds.bbci.co.uk/news/business/rss.xml',
|
| 75 |
'web': 'https://www.bbc.com/news/business',
|
| 76 |
-
'selectors': {'headline': '
|
| 77 |
'weight': 1.4,
|
|
|
|
| 78 |
'specialization': ['geopolitical', 'macro']
|
| 79 |
},
|
| 80 |
|
|
@@ -82,16 +85,14 @@ class FinanceNewsScraper:
|
|
| 82 |
'federal_reserve': {
|
| 83 |
'name': 'Federal Reserve',
|
| 84 |
'rss': 'https://www.federalreserve.gov/feeds/press_all.xml',
|
| 85 |
-
'web':
|
| 86 |
-
'selectors': {'headline': 'div.row', 'link': 'a'},
|
| 87 |
'weight': 2.0,
|
| 88 |
'specialization': ['macro']
|
| 89 |
},
|
| 90 |
'ecb': {
|
| 91 |
'name': 'European Central Bank',
|
| 92 |
'rss': 'https://www.ecb.europa.eu/rss/press.xml',
|
| 93 |
-
'web':
|
| 94 |
-
'selectors': {'headline': 'dt', 'link': 'a'},
|
| 95 |
'weight': 2.0,
|
| 96 |
'specialization': ['macro']
|
| 97 |
},
|
|
@@ -196,7 +197,8 @@ class FinanceNewsScraper:
|
|
| 196 |
'likes': 0, # RSS feeds don't have engagement metrics
|
| 197 |
'retweets': 0,
|
| 198 |
'is_breaking': is_breaking,
|
| 199 |
-
'source_weight': source_info['weight']
|
|
|
|
| 200 |
})
|
| 201 |
|
| 202 |
return news_items
|
|
@@ -302,7 +304,8 @@ class FinanceNewsScraper:
|
|
| 302 |
'likes': 0,
|
| 303 |
'retweets': 0,
|
| 304 |
'is_breaking': is_breaking,
|
| 305 |
-
'source_weight': source_info['weight']
|
|
|
|
| 306 |
})
|
| 307 |
|
| 308 |
except Exception as e:
|
|
@@ -359,15 +362,27 @@ class FinanceNewsScraper:
|
|
| 359 |
logger.warning("No news fetched from any source - using mock data")
|
| 360 |
return _self._get_mock_news()
|
| 361 |
|
| 362 |
-
# Sort by breaking news, impact,
|
| 363 |
all_news.sort(
|
| 364 |
-
key=lambda x: (x['is_breaking'], x['impact'] == 'high', x['timestamp']),
|
| 365 |
reverse=True
|
| 366 |
)
|
| 367 |
|
| 368 |
-
logger.info(f"Total unique news items: {len(all_news)}")
|
| 369 |
return all_news[:max_items]
|
| 370 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 371 |
def _categorize_text(self, text: str, source_specialization: List[str]) -> str:
|
| 372 |
"""Categorize news based on keywords and source specialization"""
|
| 373 |
text_lower = text.lower()
|
|
|
|
| 36 |
'web': 'https://www.cnbc.com/world/',
|
| 37 |
'selectors': {'headline': 'a.Card-title', 'link': 'a.Card-title'},
|
| 38 |
'weight': 1.2,
|
| 39 |
+
'web_priority': True, # Web scraping is higher priority
|
| 40 |
'specialization': ['markets']
|
| 41 |
},
|
| 42 |
'wsj_markets': {
|
|
|
|
| 59 |
'web': 'https://www.ft.com/markets',
|
| 60 |
'selectors': {'headline': 'div.o-teaser__heading', 'link': 'a.js-teaser-heading-link'},
|
| 61 |
'weight': 1.4,
|
| 62 |
+
'web_priority': True,
|
| 63 |
'specialization': ['markets']
|
| 64 |
},
|
| 65 |
'economist': {
|
|
|
|
| 75 |
'name': 'BBC Business',
|
| 76 |
'rss': 'http://feeds.bbci.co.uk/news/business/rss.xml',
|
| 77 |
'web': 'https://www.bbc.com/news/business',
|
| 78 |
+
'selectors': {'headline': 'h2[data-testid="card-headline"]', 'link': 'a[data-testid="internal-link"]'},
|
| 79 |
'weight': 1.4,
|
| 80 |
+
'web_priority': True,
|
| 81 |
'specialization': ['geopolitical', 'macro']
|
| 82 |
},
|
| 83 |
|
|
|
|
| 85 |
'federal_reserve': {
|
| 86 |
'name': 'Federal Reserve',
|
| 87 |
'rss': 'https://www.federalreserve.gov/feeds/press_all.xml',
|
| 88 |
+
'web': None, # Disabled - RSS works well
|
|
|
|
| 89 |
'weight': 2.0,
|
| 90 |
'specialization': ['macro']
|
| 91 |
},
|
| 92 |
'ecb': {
|
| 93 |
'name': 'European Central Bank',
|
| 94 |
'rss': 'https://www.ecb.europa.eu/rss/press.xml',
|
| 95 |
+
'web': None, # Disabled - RSS works well
|
|
|
|
| 96 |
'weight': 2.0,
|
| 97 |
'specialization': ['macro']
|
| 98 |
},
|
|
|
|
| 197 |
'likes': 0, # RSS feeds don't have engagement metrics
|
| 198 |
'retweets': 0,
|
| 199 |
'is_breaking': is_breaking,
|
| 200 |
+
'source_weight': source_info['weight'],
|
| 201 |
+
'from_web': False # Mark as RSS feed
|
| 202 |
})
|
| 203 |
|
| 204 |
return news_items
|
|
|
|
| 304 |
'likes': 0,
|
| 305 |
'retweets': 0,
|
| 306 |
'is_breaking': is_breaking,
|
| 307 |
+
'source_weight': source_info['weight'],
|
| 308 |
+
'from_web': True # Mark as web-scraped (main page news)
|
| 309 |
})
|
| 310 |
|
| 311 |
except Exception as e:
|
|
|
|
| 362 |
logger.warning("No news fetched from any source - using mock data")
|
| 363 |
return _self._get_mock_news()
|
| 364 |
|
| 365 |
+
# Sort by: web-scraped first, then breaking news, then impact, then timestamp
|
| 366 |
all_news.sort(
|
| 367 |
+
key=lambda x: (x.get('from_web', False), x['is_breaking'], x['impact'] == 'high', x['timestamp']),
|
| 368 |
reverse=True
|
| 369 |
)
|
| 370 |
|
| 371 |
+
logger.info(f"Total unique news items: {len(all_news)} (Web: {sum(1 for n in all_news if n.get('from_web'))}, RSS: {sum(1 for n in all_news if not n.get('from_web'))})")
|
| 372 |
return all_news[:max_items]
|
| 373 |
|
| 374 |
+
def get_main_page_news(self) -> pd.DataFrame:
|
| 375 |
+
"""Get only news from main pages (web-scraped)"""
|
| 376 |
+
if not self.news_cache:
|
| 377 |
+
self.news_cache = self.scrape_news(max_items=100)
|
| 378 |
+
self.last_fetch = datetime.now()
|
| 379 |
+
|
| 380 |
+
main_news = [n for n in self.news_cache if n.get('from_web', False)]
|
| 381 |
+
df = pd.DataFrame(main_news)
|
| 382 |
+
if not df.empty:
|
| 383 |
+
df['timestamp'] = pd.to_datetime(df['timestamp'])
|
| 384 |
+
return df
|
| 385 |
+
|
| 386 |
def _categorize_text(self, text: str, source_specialization: List[str]) -> str:
|
| 387 |
"""Categorize news based on keywords and source specialization"""
|
| 388 |
text_lower = text.lower()
|