Spaces:

ResearchEngineering
/

FinancialPlatform

Paused

App Files Files Community

Dmitry Beresnev committed on Jan 21

Commit

7e1fe44

1 Parent(s): 0e6d54c

fix news feed

Browse files

Files changed (2) hide show

app/pages/05_Dashboard.py +26 -2
app/services/news_scraper.py +25 -10

app/pages/05_Dashboard.py CHANGED Viewed

@@ -175,10 +175,34 @@ display_category_breakdown(stats)
 st.markdown("---")
-# News feed controls
 col1, col2, col3 = st.columns([2, 1, 1])
 with col1:
-    st.markdown("## 📰 Latest News Feed")
 with col2:
     show_count = st.selectbox("Show", [10, 20, 50, 100], index=1, label_visibility="collapsed")
 with col3:

 st.markdown("---")
+# ---- MAIN PAGE NEWS (Web-Scraped) ----
+st.markdown("## 🔥 Top Stories from Main Pages")
+st.caption("Latest headlines directly from news source homepages")
+main_page_df = monitor.get_main_page_news()
+if not main_page_df.empty:
+    # Apply filters to main page news
+    filtered_main = main_page_df.copy()
+    if category_filter != 'all':
+        filtered_main = filtered_main[filtered_main['category'] == category_filter]
+    if sentiment_filter != 'all':
+        filtered_main = filtered_main[filtered_main['sentiment'] == sentiment_filter]
+    if impact_filter != 'all':
+        filtered_main = filtered_main[filtered_main['impact'] == impact_filter]
+    if not filtered_main.empty:
+        display_news_feed(filtered_main, max_items=10)
+    else:
+        st.info("📭 No main page news matches your filters.")
+else:
+    st.info("⏳ Main page news will appear here...")
+st.markdown("---")
+# ---- ALL NEWS FEED (RSS + Web) ----
 col1, col2, col3 = st.columns([2, 1, 1])
 with col1:
+    st.markdown("## 📰 All News Feed")
 with col2:
     show_count = st.selectbox("Show", [10, 20, 50, 100], index=1, label_visibility="collapsed")
 with col3:

app/services/news_scraper.py CHANGED Viewed

@@ -36,6 +36,7 @@ class FinanceNewsScraper:
             'web': 'https://www.cnbc.com/world/',
             'selectors': {'headline': 'a.Card-title', 'link': 'a.Card-title'},
             'weight': 1.2,
             'specialization': ['markets']
         },
         'wsj_markets': {
@@ -58,6 +59,7 @@ class FinanceNewsScraper:
             'web': 'https://www.ft.com/markets',
             'selectors': {'headline': 'div.o-teaser__heading', 'link': 'a.js-teaser-heading-link'},
             'weight': 1.4,
             'specialization': ['markets']
         },
         'economist': {
@@ -73,8 +75,9 @@ class FinanceNewsScraper:
             'name': 'BBC Business',
             'rss': 'http://feeds.bbci.co.uk/news/business/rss.xml',
             'web': 'https://www.bbc.com/news/business',
-            'selectors': {'headline': 'h3', 'link': 'a[data-testid="internal-link"]'},
             'weight': 1.4,
             'specialization': ['geopolitical', 'macro']
         },
@@ -82,16 +85,14 @@ class FinanceNewsScraper:
         'federal_reserve': {
             'name': 'Federal Reserve',
             'rss': 'https://www.federalreserve.gov/feeds/press_all.xml',
-            'web': 'https://www.federalreserve.gov/newsevents/pressreleases.htm',
-            'selectors': {'headline': 'div.row', 'link': 'a'},
             'weight': 2.0,
             'specialization': ['macro']
         },
         'ecb': {
             'name': 'European Central Bank',
             'rss': 'https://www.ecb.europa.eu/rss/press.xml',
-            'web': 'https://www.ecb.europa.eu/press/pr/date/html/index.en.html',
-            'selectors': {'headline': 'dt', 'link': 'a'},
             'weight': 2.0,
             'specialization': ['macro']
         },
@@ -196,7 +197,8 @@ class FinanceNewsScraper:
                     'likes': 0,  # RSS feeds don't have engagement metrics
                     'retweets': 0,
                     'is_breaking': is_breaking,
-                    'source_weight': source_info['weight']
                 })
             return news_items
@@ -302,7 +304,8 @@ class FinanceNewsScraper:
                         'likes': 0,
                         'retweets': 0,
                         'is_breaking': is_breaking,
-                        'source_weight': source_info['weight']
                     })
                 except Exception as e:
@@ -359,15 +362,27 @@ class FinanceNewsScraper:
             logger.warning("No news fetched from any source - using mock data")
             return _self._get_mock_news()
-        # Sort by breaking news, impact, and timestamp
         all_news.sort(
-            key=lambda x: (x['is_breaking'], x['impact'] == 'high', x['timestamp']),
             reverse=True
         )
-        logger.info(f"Total unique news items: {len(all_news)}")
         return all_news[:max_items]
     def _categorize_text(self, text: str, source_specialization: List[str]) -> str:
         """Categorize news based on keywords and source specialization"""
         text_lower = text.lower()

             'web': 'https://www.cnbc.com/world/',
             'selectors': {'headline': 'a.Card-title', 'link': 'a.Card-title'},
             'weight': 1.2,
+            'web_priority': True,  # Web scraping is higher priority
             'specialization': ['markets']
         },
         'wsj_markets': {
             'web': 'https://www.ft.com/markets',
             'selectors': {'headline': 'div.o-teaser__heading', 'link': 'a.js-teaser-heading-link'},
             'weight': 1.4,
+            'web_priority': True,
             'specialization': ['markets']
         },
         'economist': {
             'name': 'BBC Business',
             'rss': 'http://feeds.bbci.co.uk/news/business/rss.xml',
             'web': 'https://www.bbc.com/news/business',
+            'selectors': {'headline': 'h2[data-testid="card-headline"]', 'link': 'a[data-testid="internal-link"]'},
             'weight': 1.4,
+            'web_priority': True,
             'specialization': ['geopolitical', 'macro']
         },
         'federal_reserve': {
             'name': 'Federal Reserve',
             'rss': 'https://www.federalreserve.gov/feeds/press_all.xml',
+            'web': None,  # Disabled - RSS works well
             'weight': 2.0,
             'specialization': ['macro']
         },
         'ecb': {
             'name': 'European Central Bank',
             'rss': 'https://www.ecb.europa.eu/rss/press.xml',
+            'web': None,  # Disabled - RSS works well
             'weight': 2.0,
             'specialization': ['macro']
         },
                     'likes': 0,  # RSS feeds don't have engagement metrics
                     'retweets': 0,
                     'is_breaking': is_breaking,
+                    'source_weight': source_info['weight'],
+                    'from_web': False  # Mark as RSS feed
                 })
             return news_items
                         'likes': 0,
                         'retweets': 0,
                         'is_breaking': is_breaking,
+                        'source_weight': source_info['weight'],
+                        'from_web': True  # Mark as web-scraped (main page news)
                     })
                 except Exception as e:
             logger.warning("No news fetched from any source - using mock data")
             return _self._get_mock_news()
+        # Sort by: web-scraped first, then breaking news, then impact, then timestamp
         all_news.sort(
+            key=lambda x: (x.get('from_web', False), x['is_breaking'], x['impact'] == 'high', x['timestamp']),
             reverse=True
         )
+        logger.info(f"Total unique news items: {len(all_news)} (Web: {sum(1 for n in all_news if n.get('from_web'))}, RSS: {sum(1 for n in all_news if not n.get('from_web'))})")
         return all_news[:max_items]
+    def get_main_page_news(self) -> pd.DataFrame:
+        """Get only news from main pages (web-scraped).
+
+        Fills ``self.news_cache`` via ``scrape_news(max_items=100)`` when the
+        cache is empty, then keeps only the cached items flagged ``from_web``.
+
+        Returns:
+            A DataFrame built from the web-scraped items, with ``timestamp``
+            converted by ``pd.to_datetime``; an empty DataFrame when no
+            cached item is flagged ``from_web``.
+        """
+        # Populate the cache only when it is empty; this method does not check how old it is.
+        if not self.news_cache:
+            self.news_cache = self.scrape_news(max_items=100)
+            self.last_fetch = datetime.now()
+        # Items without a 'from_web' key are treated as not web-scraped.
+        main_news = [n for n in self.news_cache if n.get('from_web', False)]
+        df = pd.DataFrame(main_news)
+        # An empty frame has no 'timestamp' column, so convert only when there are rows.
+        if not df.empty:
+            df['timestamp'] = pd.to_datetime(df['timestamp'])
+        return df
     def _categorize_text(self, text: str, source_specialization: List[str]) -> str:
         """Categorize news based on keywords and source specialization"""
         text_lower = text.lower()