Dmitry Beresnev committed on
Commit
a975122
·
1 Parent(s): acede88

fix news in feed section

Browse files
app/components/news.py CHANGED
@@ -3,6 +3,7 @@
3
  import streamlit as st
4
  import pandas as pd
5
  from datetime import datetime
 
6
 
7
 
8
  def display_news_card(news_item: dict):
@@ -80,7 +81,7 @@ def display_news_card(news_item: dict):
80
  <!-- Title -->
81
  <h3 style='color: #f3f4f6; margin: 0 0 12px 0; font-size: 17px;
82
  line-height: 1.5; font-weight: 600;'>
83
- {news_item['summary']}
84
  </h3>
85
 
86
  <!-- Meta info -->
@@ -244,7 +245,7 @@ def display_breaking_news_banner(df: pd.DataFrame):
244
  BREAKING NEWS • {latest['source'].upper()}
245
  </div>
246
  <div style='color: white; font-size: 18px; font-weight: 600; line-height: 1.4;'>
247
- {latest['summary']}
248
  </div>
249
  </div>
250
  <a href='{latest['url']}' target='_blank'
 
3
  import streamlit as st
4
  import pandas as pd
5
  from datetime import datetime
6
+ import html
7
 
8
 
9
  def display_news_card(news_item: dict):
 
81
  <!-- Title -->
82
  <h3 style='color: #f3f4f6; margin: 0 0 12px 0; font-size: 17px;
83
  line-height: 1.5; font-weight: 600;'>
84
+ {html.escape(news_item['summary'])}
85
  </h3>
86
 
87
  <!-- Meta info -->
 
245
  BREAKING NEWS • {latest['source'].upper()}
246
  </div>
247
  <div style='color: white; font-size: 18px; font-weight: 600; line-height: 1.4;'>
248
+ {html.escape(latest['summary'])}
249
  </div>
250
  </div>
251
  <a href='{latest['url']}' target='_blank'
app/services/news_scraper.py CHANGED
@@ -27,24 +27,9 @@ class FinanceNewsScraper:
27
  """
28
 
29
  # News sources with RSS feeds and web scraping endpoints
 
30
  SOURCES = {
31
- # ===== TIER 1: Major Financial News (RSS + Web Scraping) =====
32
- 'reuters_business': {
33
- 'name': 'Reuters Business',
34
- 'rss': 'https://www.reutersagency.com/feed/?taxonomy=best-topics&post_type=best',
35
- 'web': 'https://www.reuters.com/business/',
36
- 'selectors': {'headline': 'h3.text__text__1FZLe', 'link': 'a.text__text__1FZLe'},
37
- 'weight': 1.5,
38
- 'specialization': ['macro', 'markets']
39
- },
40
- 'reuters_markets': {
41
- 'name': 'Reuters Markets',
42
- 'rss': 'https://www.reutersagency.com/feed/?best-sectors=business-finance&post_type=best',
43
- 'web': 'https://www.reuters.com/markets/',
44
- 'selectors': {'headline': 'h3', 'link': 'a[data-testid="Heading"]'},
45
- 'weight': 1.5,
46
- 'specialization': ['markets']
47
- },
48
  'cnbc': {
49
  'name': 'CNBC',
50
  'rss': 'https://www.cnbc.com/id/100003114/device/rss/rss.html',
@@ -53,12 +38,18 @@ class FinanceNewsScraper:
53
  'weight': 1.2,
54
  'specialization': ['markets']
55
  },
56
- 'marketwatch': {
57
- 'name': 'MarketWatch',
58
- 'rss': 'https://www.marketwatch.com/rss/topstories',
59
- 'web': 'https://www.marketwatch.com/',
60
- 'selectors': {'headline': 'h3.article__headline', 'link': 'a.link'},
61
- 'weight': 1.1,
 
 
 
 
 
 
62
  'specialization': ['markets']
63
  },
64
  'ft_markets': {
@@ -69,19 +60,10 @@ class FinanceNewsScraper:
69
  'weight': 1.4,
70
  'specialization': ['markets']
71
  },
72
- 'wsj_markets': {
73
- 'name': 'WSJ Markets',
74
- 'rss': 'https://feeds.a.dj.com/rss/RSSMarketsMain.xml',
75
- 'web': 'https://www.wsj.com/news/markets',
76
- 'selectors': {'headline': 'h3.WSJTheme--headline', 'link': 'a'},
77
- 'weight': 1.4,
78
- 'specialization': ['markets']
79
- },
80
  'economist': {
81
  'name': 'The Economist',
82
  'rss': 'https://www.economist.com/finance-and-economics/rss.xml',
83
- 'web': 'https://www.economist.com/finance-and-economics',
84
- 'selectors': {'headline': 'span._headline', 'link': 'a'},
85
  'weight': 1.3,
86
  'specialization': ['macro', 'geopolitical']
87
  },
@@ -95,16 +77,8 @@ class FinanceNewsScraper:
95
  'weight': 1.4,
96
  'specialization': ['geopolitical', 'macro']
97
  },
98
- 'bloomberg_markets': {
99
- 'name': 'Bloomberg',
100
- 'rss': 'https://www.bloomberg.com/feed/podcast/etf-report.xml',
101
- 'web': 'https://www.bloomberg.com/markets',
102
- 'selectors': {'headline': 'div.single-story-module__headline', 'link': 'a'},
103
- 'weight': 1.5,
104
- 'specialization': ['markets']
105
- },
106
 
107
- # ===== TIER 3: Central Banks (RSS + Web) =====
108
  'federal_reserve': {
109
  'name': 'Federal Reserve',
110
  'rss': 'https://www.federalreserve.gov/feeds/press_all.xml',
@@ -123,9 +97,8 @@ class FinanceNewsScraper:
123
  },
124
  'imf': {
125
  'name': 'IMF',
126
- 'rss': 'https://www.imf.org/en/News/rss?language_id=1',
127
- 'web': 'https://www.imf.org/en/News',
128
- 'selectors': {'headline': 'h3', 'link': 'a'},
129
  'weight': 1.7,
130
  'specialization': ['macro', 'geopolitical']
131
  }
@@ -154,8 +127,15 @@ class FinanceNewsScraper:
154
  self.last_fetch = None
155
  self.cache_ttl = 180 # 3 minutes
156
  self.session = requests.Session()
 
157
  self.session.headers.update({
158
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
 
 
 
 
 
 
159
  })
160
 
161
  def _fetch_rss_feed(self, source_name: str, source_info: Dict) -> List[Dict]:
@@ -245,11 +225,24 @@ class FinanceNewsScraper:
245
 
246
  for headline_elem in headlines[:10]: # Limit to 10 most recent
247
  try:
248
- # Extract title text
249
- title = headline_elem.get_text(strip=True)
 
 
 
 
 
 
 
 
250
  if not title or len(title) < 10:
251
  continue
252
 
 
 
 
 
 
253
  # Find associated link
254
  # Try to find link within the headline element or its parent
255
  link_elem = headline_elem if headline_elem.name == 'a' else headline_elem.find('a')
@@ -279,16 +272,22 @@ class FinanceNewsScraper:
279
  if not url.startswith('http'):
280
  continue
281
 
 
 
 
282
  # Categorize and analyze
283
  category = self._categorize_text(title, source_info['specialization'])
284
  sentiment = self._analyze_sentiment(title)
285
  impact = self._assess_impact(source_info['weight'], title)
286
  is_breaking = self._detect_breaking_news(title)
287
 
 
 
 
288
  news_items.append({
289
  'id': hash(url),
290
  'title': title,
291
- 'summary': self._extract_summary(title),
292
  'source': source_info['name'],
293
  'category': category,
294
  'timestamp': datetime.now(), # Web scraping doesn't have timestamps
@@ -329,8 +328,9 @@ class FinanceNewsScraper:
329
  for name, info in _self.SOURCES.items():
330
  # RSS feed task
331
  futures.append((executor.submit(_self._fetch_rss_feed, name, info), name, 'RSS'))
332
- # Web scraping task
333
- futures.append((executor.submit(_self._scrape_web_page, name, info), name, 'Web'))
 
334
 
335
  for future, source_name, method in futures:
336
  try:
@@ -344,7 +344,8 @@ class FinanceNewsScraper:
344
  unique_items.append(item)
345
 
346
  all_news.extend(unique_items)
347
- logger.info(f"Fetched {len(unique_items)} unique items from {source_name} ({method})")
 
348
  except Exception as e:
349
  logger.error(f"Error processing {source_name} ({method}): {e}")
350
 
 
27
  """
28
 
29
  # News sources with RSS feeds and web scraping endpoints
30
+ # web=None means web scraping is disabled (blocked by anti-bot measures)
31
  SOURCES = {
32
+ # ===== TIER 1: Major Financial News =====
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  'cnbc': {
34
  'name': 'CNBC',
35
  'rss': 'https://www.cnbc.com/id/100003114/device/rss/rss.html',
 
38
  'weight': 1.2,
39
  'specialization': ['markets']
40
  },
41
+ 'wsj_markets': {
42
+ 'name': 'WSJ Markets',
43
+ 'rss': 'https://feeds.a.dj.com/rss/RSSMarketsMain.xml',
44
+ 'web': None, # Blocked by paywall
45
+ 'weight': 1.4,
46
+ 'specialization': ['markets']
47
+ },
48
+ 'bloomberg_markets': {
49
+ 'name': 'Bloomberg',
50
+ 'rss': 'https://feeds.bloomberg.com/markets/news.rss',
51
+ 'web': None, # Blocked by Cloudflare
52
+ 'weight': 1.5,
53
  'specialization': ['markets']
54
  },
55
  'ft_markets': {
 
60
  'weight': 1.4,
61
  'specialization': ['markets']
62
  },
 
 
 
 
 
 
 
 
63
  'economist': {
64
  'name': 'The Economist',
65
  'rss': 'https://www.economist.com/finance-and-economics/rss.xml',
66
+ 'web': None, # Blocked by anti-bot
 
67
  'weight': 1.3,
68
  'specialization': ['macro', 'geopolitical']
69
  },
 
77
  'weight': 1.4,
78
  'specialization': ['geopolitical', 'macro']
79
  },
 
 
 
 
 
 
 
 
80
 
81
+ # ===== TIER 3: Central Banks & Institutions =====
82
  'federal_reserve': {
83
  'name': 'Federal Reserve',
84
  'rss': 'https://www.federalreserve.gov/feeds/press_all.xml',
 
97
  },
98
  'imf': {
99
  'name': 'IMF',
100
+ 'rss': 'https://www.imf.org/en/news/rss',
101
+ 'web': None, # Timeout issues
 
102
  'weight': 1.7,
103
  'specialization': ['macro', 'geopolitical']
104
  }
 
127
  self.last_fetch = None
128
  self.cache_ttl = 180 # 3 minutes
129
  self.session = requests.Session()
130
+ # Enhanced headers to avoid bot detection
131
  self.session.headers.update({
132
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
133
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
134
+ 'Accept-Language': 'en-US,en;q=0.9',
135
+ 'Accept-Encoding': 'gzip, deflate, br',
136
+ 'DNT': '1',
137
+ 'Connection': 'keep-alive',
138
+ 'Upgrade-Insecure-Requests': '1'
139
  })
140
 
141
  def _fetch_rss_feed(self, source_name: str, source_info: Dict) -> List[Dict]:
 
225
 
226
  for headline_elem in headlines[:10]: # Limit to 10 most recent
227
  try:
228
+ # Extract title text - clean all HTML tags
229
+ title = headline_elem.get_text(separator=' ', strip=True)
230
+ # Remove extra whitespace
231
+ title = re.sub(r'\s+', ' ', title)
232
+ # Remove any HTML tags that might have been missed
233
+ title = re.sub(r'<[^>]+>', '', title)
234
+ # Clean up HTML entities
235
+ from html import unescape
236
+ title = unescape(title)
237
+
238
  if not title or len(title) < 10:
239
  continue
240
 
241
+ # Skip if title looks like it contains HTML comments or code
242
+ if '<!--' in title or '-->' in title or 'style=' in title:
243
+ logger.debug(f"Skipping malformed title from {source_name}: {title[:100]}")
244
+ continue
245
+
246
  # Find associated link
247
  # Try to find link within the headline element or its parent
248
  link_elem = headline_elem if headline_elem.name == 'a' else headline_elem.find('a')
 
272
  if not url.startswith('http'):
273
  continue
274
 
275
+ # Clean title from any remaining artifacts
276
+ title = title.replace('\n', ' ').replace('\r', ' ').strip()
277
+
278
  # Categorize and analyze
279
  category = self._categorize_text(title, source_info['specialization'])
280
  sentiment = self._analyze_sentiment(title)
281
  impact = self._assess_impact(source_info['weight'], title)
282
  is_breaking = self._detect_breaking_news(title)
283
 
284
+ # Create clean summary
285
+ summary = self._extract_summary(title) if len(title) > 150 else title
286
+
287
  news_items.append({
288
  'id': hash(url),
289
  'title': title,
290
+ 'summary': summary,
291
  'source': source_info['name'],
292
  'category': category,
293
  'timestamp': datetime.now(), # Web scraping doesn't have timestamps
 
328
  for name, info in _self.SOURCES.items():
329
  # RSS feed task
330
  futures.append((executor.submit(_self._fetch_rss_feed, name, info), name, 'RSS'))
331
+ # Web scraping task (only if web URL is configured)
332
+ if info.get('web'):
333
+ futures.append((executor.submit(_self._scrape_web_page, name, info), name, 'Web'))
334
 
335
  for future, source_name, method in futures:
336
  try:
 
344
  unique_items.append(item)
345
 
346
  all_news.extend(unique_items)
347
+ if len(unique_items) > 0:
348
+ logger.info(f"Fetched {len(unique_items)} unique items from {source_name} ({method})")
349
  except Exception as e:
350
  logger.error(f"Error processing {source_name} ({method}): {e}")
351