Dmitry Beresnev
committed on
Commit
Β·
64315f0
1
Parent(s):
5fbc88e
add reddit news, cache system (per user), etc
Browse files- .gitignore +3 -1
- app/pages/05_Dashboard.py +157 -52
- app/services/news_scraper.py +6 -10
- app/services/reddit_news.py +316 -0
- app/services/twitter_news_playwright.py +5 -10
- app/utils/news_cache.py +347 -0
.gitignore
CHANGED
|
@@ -33,4 +33,6 @@ tests/__pycache__/
|
|
| 33 |
# Ignore md files
|
| 34 |
*.md
|
| 35 |
#
|
| 36 |
-
docs/
|
|
|
|
|
|
|
|
|
| 33 |
# Ignore md files
|
| 34 |
*.md
|
| 35 |
#
|
| 36 |
+
docs/
|
| 37 |
+
#
|
| 38 |
+
*_example.py
|
app/pages/05_Dashboard.py
CHANGED
|
@@ -31,6 +31,12 @@ try:
|
|
| 31 |
except ImportError:
|
| 32 |
TWITTER_AVAILABLE = False
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
# ---- Page Configuration ----
|
| 36 |
st.set_page_config(
|
|
@@ -50,8 +56,19 @@ if 'rss_monitor' not in st.session_state and RSS_AVAILABLE:
|
|
| 50 |
if 'twitter_monitor' not in st.session_state and TWITTER_AVAILABLE:
|
| 51 |
st.session_state.twitter_monitor = TwitterFinanceMonitor()
|
| 52 |
|
|
|
|
|
|
|
|
|
|
| 53 |
rss_monitor = st.session_state.get('rss_monitor')
|
| 54 |
twitter_monitor = st.session_state.get('twitter_monitor')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
# ---- Header ----
|
| 57 |
st.markdown("# π€ Live Financial News & AI Dashboard")
|
|
@@ -118,6 +135,12 @@ with st.sidebar:
|
|
| 118 |
high_impact_count += twitter_stats['high_impact']
|
| 119 |
breaking_count += twitter_stats['breaking']
|
| 120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
if rss_monitor:
|
| 122 |
rss_stats = rss_monitor.get_statistics()
|
| 123 |
total_stories += rss_stats['total']
|
|
@@ -133,8 +156,9 @@ with st.sidebar:
|
|
| 133 |
|
| 134 |
# Count total sources
|
| 135 |
twitter_sources = len(twitter_monitor.SOURCES) if twitter_monitor else 0
|
|
|
|
| 136 |
rss_sources = len(rss_monitor.SOURCES) if rss_monitor else 0
|
| 137 |
-
total_sources = twitter_sources + rss_sources
|
| 138 |
|
| 139 |
st.markdown(f"""
|
| 140 |
<div style='font-size: 11px; line-height: 1.6;'>
|
|
@@ -145,6 +169,11 @@ with st.sidebar:
|
|
| 145 |
β’ CNBC β’ BBC β’ MarketWatch
|
| 146 |
β’ The Economist β’ AP β’ AFP
|
| 147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
**RSS + Web Scraping ({rss_sources})**
|
| 149 |
β’ CNBC β’ Bloomberg β’ FT β’ WSJ
|
| 150 |
β’ BBC β’ Yahoo Finance β’ The Economist
|
|
@@ -162,57 +191,133 @@ force_refresh = st.session_state.get('force_refresh', False)
|
|
| 162 |
if force_refresh:
|
| 163 |
st.session_state.force_refresh = False
|
| 164 |
|
| 165 |
-
# Fetch news from all sources
|
| 166 |
import pandas as pd
|
|
|
|
| 167 |
|
| 168 |
twitter_df = pd.DataFrame()
|
|
|
|
| 169 |
rss_all_df = pd.DataFrame()
|
| 170 |
rss_main_df = pd.DataFrame()
|
| 171 |
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
if twitter_news:
|
| 178 |
-
|
| 179 |
-
if not
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
if rss_news:
|
| 189 |
-
|
| 190 |
-
if not
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
|
| 214 |
# Combine all for breaking news banner
|
| 215 |
-
all_news_df = pd.concat([twitter_filtered, rss_all_filtered], ignore_index=True) if not twitter_filtered.empty or not rss_all_filtered.empty else pd.DataFrame()
|
| 216 |
|
| 217 |
# Display breaking news banner
|
| 218 |
if not all_news_df.empty:
|
|
@@ -225,20 +330,20 @@ st.markdown("---")
|
|
| 225 |
col1, col2, col3 = st.columns(3)
|
| 226 |
|
| 227 |
with col1:
|
| 228 |
-
# SECTION 1: Twitter/X Breaking News
|
| 229 |
-
if not
|
| 230 |
display_scrollable_news_section(
|
| 231 |
-
|
| 232 |
-
section_title="Twitter/X News",
|
| 233 |
-
section_icon="
|
| 234 |
-
section_subtitle="Real-time
|
| 235 |
-
max_items=
|
| 236 |
height="700px"
|
| 237 |
)
|
| 238 |
-
elif not twitter_df.empty:
|
| 239 |
-
st.info("π No Twitter news matches your current filters.")
|
| 240 |
else:
|
| 241 |
-
st.info("β³ Twitter
|
| 242 |
|
| 243 |
with col2:
|
| 244 |
# SECTION 2: Main Page News (Web-Scraped)
|
|
|
|
| 31 |
except ImportError:
|
| 32 |
TWITTER_AVAILABLE = False
|
| 33 |
|
| 34 |
+
try:
|
| 35 |
+
from services.reddit_news import RedditFinanceMonitor
|
| 36 |
+
REDDIT_AVAILABLE = True
|
| 37 |
+
except ImportError:
|
| 38 |
+
REDDIT_AVAILABLE = False
|
| 39 |
+
|
| 40 |
|
| 41 |
# ---- Page Configuration ----
|
| 42 |
st.set_page_config(
|
|
|
|
| 56 |
if 'twitter_monitor' not in st.session_state and TWITTER_AVAILABLE:
|
| 57 |
st.session_state.twitter_monitor = TwitterFinanceMonitor()
|
| 58 |
|
| 59 |
+
if 'reddit_monitor' not in st.session_state and REDDIT_AVAILABLE:
|
| 60 |
+
st.session_state.reddit_monitor = RedditFinanceMonitor()
|
| 61 |
+
|
| 62 |
rss_monitor = st.session_state.get('rss_monitor')
|
| 63 |
twitter_monitor = st.session_state.get('twitter_monitor')
|
| 64 |
+
reddit_monitor = st.session_state.get('reddit_monitor')
|
| 65 |
+
|
| 66 |
+
# Initialize unified cache manager
|
| 67 |
+
if 'news_cache_manager' not in st.session_state:
|
| 68 |
+
from utils.news_cache import NewsCacheManager
|
| 69 |
+
st.session_state.news_cache_manager = NewsCacheManager(default_ttl=180)
|
| 70 |
+
|
| 71 |
+
cache_manager = st.session_state.news_cache_manager
|
| 72 |
|
| 73 |
# ---- Header ----
|
| 74 |
st.markdown("# π€ Live Financial News & AI Dashboard")
|
|
|
|
| 135 |
high_impact_count += twitter_stats['high_impact']
|
| 136 |
breaking_count += twitter_stats['breaking']
|
| 137 |
|
| 138 |
+
if reddit_monitor:
|
| 139 |
+
reddit_stats = reddit_monitor.get_statistics()
|
| 140 |
+
total_stories += reddit_stats['total']
|
| 141 |
+
high_impact_count += reddit_stats['high_impact']
|
| 142 |
+
breaking_count += reddit_stats['breaking']
|
| 143 |
+
|
| 144 |
if rss_monitor:
|
| 145 |
rss_stats = rss_monitor.get_statistics()
|
| 146 |
total_stories += rss_stats['total']
|
|
|
|
| 156 |
|
| 157 |
# Count total sources
|
| 158 |
twitter_sources = len(twitter_monitor.SOURCES) if twitter_monitor else 0
|
| 159 |
+
reddit_sources = len(reddit_monitor.SUBREDDITS) if reddit_monitor else 0
|
| 160 |
rss_sources = len(rss_monitor.SOURCES) if rss_monitor else 0
|
| 161 |
+
total_sources = twitter_sources + reddit_sources + rss_sources
|
| 162 |
|
| 163 |
st.markdown(f"""
|
| 164 |
<div style='font-size: 11px; line-height: 1.6;'>
|
|
|
|
| 169 |
β’ CNBC β’ BBC β’ MarketWatch
|
| 170 |
β’ The Economist β’ AP β’ AFP
|
| 171 |
|
| 172 |
+
**Reddit Communities ({reddit_sources})**
|
| 173 |
+
β’ r/wallstreetbets β’ r/stocks β’ r/investing
|
| 174 |
+
β’ r/algotrading β’ r/economics β’ r/geopolitics
|
| 175 |
+
β’ r/options β’ r/SecurityAnalysis
|
| 176 |
+
|
| 177 |
**RSS + Web Scraping ({rss_sources})**
|
| 178 |
β’ CNBC β’ Bloomberg β’ FT β’ WSJ
|
| 179 |
β’ BBC β’ Yahoo Finance β’ The Economist
|
|
|
|
| 191 |
if force_refresh:
|
| 192 |
st.session_state.force_refresh = False
|
| 193 |
|
| 194 |
+
# Fetch news from all sources IN PARALLEL for maximum performance
|
| 195 |
import pandas as pd
|
| 196 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 197 |
|
| 198 |
twitter_df = pd.DataFrame()
|
| 199 |
+
reddit_df = pd.DataFrame()
|
| 200 |
rss_all_df = pd.DataFrame()
|
| 201 |
rss_main_df = pd.DataFrame()
|
| 202 |
|
| 203 |
+
def fetch_twitter_news():
|
| 204 |
+
"""Fetch Twitter/X news via cache manager"""
|
| 205 |
+
try:
|
| 206 |
+
if twitter_monitor:
|
| 207 |
+
# Use cache manager for smart caching
|
| 208 |
+
twitter_news = cache_manager.get_news(
|
| 209 |
+
source='twitter',
|
| 210 |
+
fetcher_func=twitter_monitor.scrape_twitter_news,
|
| 211 |
+
force_refresh=force_refresh,
|
| 212 |
+
max_tweets=50
|
| 213 |
+
)
|
| 214 |
if twitter_news:
|
| 215 |
+
df = pd.DataFrame(twitter_news)
|
| 216 |
+
if not df.empty:
|
| 217 |
+
df['timestamp'] = pd.to_datetime(df['timestamp'])
|
| 218 |
+
return df, None
|
| 219 |
+
except Exception as e:
|
| 220 |
+
return pd.DataFrame(), f"Twitter scraping unavailable: {e}"
|
| 221 |
+
return pd.DataFrame(), None
|
| 222 |
+
|
| 223 |
+
def fetch_reddit_news():
|
| 224 |
+
"""Fetch Reddit news via cache manager"""
|
| 225 |
+
try:
|
| 226 |
+
if reddit_monitor:
|
| 227 |
+
# Use cache manager for smart caching
|
| 228 |
+
reddit_news = cache_manager.get_news(
|
| 229 |
+
source='reddit',
|
| 230 |
+
fetcher_func=reddit_monitor.scrape_reddit_news,
|
| 231 |
+
force_refresh=force_refresh,
|
| 232 |
+
max_posts=50,
|
| 233 |
+
hours=12
|
| 234 |
+
)
|
| 235 |
+
if reddit_news:
|
| 236 |
+
df = pd.DataFrame(reddit_news)
|
| 237 |
+
if not df.empty:
|
| 238 |
+
df['timestamp'] = pd.to_datetime(df['timestamp'])
|
| 239 |
+
return df, None
|
| 240 |
+
except Exception as e:
|
| 241 |
+
return pd.DataFrame(), f"Reddit scraping unavailable: {e}"
|
| 242 |
+
return pd.DataFrame(), None
|
| 243 |
+
|
| 244 |
+
def fetch_rss_news():
|
| 245 |
+
"""Fetch RSS + Web scraped news via cache manager"""
|
| 246 |
+
try:
|
| 247 |
+
if rss_monitor:
|
| 248 |
+
# Use cache manager for smart caching
|
| 249 |
+
rss_news = cache_manager.get_news(
|
| 250 |
+
source='rss',
|
| 251 |
+
fetcher_func=rss_monitor.scrape_news,
|
| 252 |
+
force_refresh=force_refresh,
|
| 253 |
+
max_items=100
|
| 254 |
+
)
|
| 255 |
if rss_news:
|
| 256 |
+
df = pd.DataFrame(rss_news)
|
| 257 |
+
if not df.empty:
|
| 258 |
+
df['timestamp'] = pd.to_datetime(df['timestamp'])
|
| 259 |
+
return df, None
|
| 260 |
+
except Exception as e:
|
| 261 |
+
return pd.DataFrame(), f"RSS scraping unavailable: {e}"
|
| 262 |
+
return pd.DataFrame(), None
|
| 263 |
+
|
| 264 |
+
with st.spinner("π Fetching latest financial news in parallel..."):
|
| 265 |
+
# Execute all news fetching operations in parallel using ThreadPoolExecutor
|
| 266 |
+
with ThreadPoolExecutor(max_workers=3) as executor:
|
| 267 |
+
# Submit all tasks
|
| 268 |
+
future_twitter = executor.submit(fetch_twitter_news)
|
| 269 |
+
future_reddit = executor.submit(fetch_reddit_news)
|
| 270 |
+
future_rss = executor.submit(fetch_rss_news)
|
| 271 |
+
|
| 272 |
+
# Collect results as they complete
|
| 273 |
+
futures = {
|
| 274 |
+
'twitter': future_twitter,
|
| 275 |
+
'reddit': future_reddit,
|
| 276 |
+
'rss': future_rss
|
| 277 |
+
}
|
| 278 |
+
|
| 279 |
+
for source_name, future in futures.items():
|
| 280 |
+
try:
|
| 281 |
+
result_df, error = future.result(timeout=90) # 90 second timeout per source
|
| 282 |
+
|
| 283 |
+
if source_name == 'twitter':
|
| 284 |
+
twitter_df = result_df
|
| 285 |
+
if error:
|
| 286 |
+
st.warning(error)
|
| 287 |
+
elif source_name == 'reddit':
|
| 288 |
+
reddit_df = result_df
|
| 289 |
+
if error:
|
| 290 |
+
st.warning(error)
|
| 291 |
+
elif source_name == 'rss':
|
| 292 |
+
rss_all_df = result_df
|
| 293 |
+
if error:
|
| 294 |
+
st.warning(error)
|
| 295 |
+
# Get main page news subset for RSS
|
| 296 |
+
if not rss_all_df.empty and 'from_web' in rss_all_df.columns:
|
| 297 |
+
rss_main_df = rss_all_df[rss_all_df['from_web'] == True].copy()
|
| 298 |
+
|
| 299 |
+
except Exception as e:
|
| 300 |
+
st.warning(f"Error fetching {source_name} news: {e}")
|
| 301 |
+
|
| 302 |
+
# Apply filters using cache manager (with filter result caching)
|
| 303 |
+
filters = {
|
| 304 |
+
'category': category_filter,
|
| 305 |
+
'sentiment': sentiment_filter,
|
| 306 |
+
'impact': impact_filter
|
| 307 |
+
}
|
| 308 |
+
|
| 309 |
+
twitter_filtered = cache_manager.get_filtered_news(twitter_df, filters, 'twitter') if not twitter_df.empty else twitter_df
|
| 310 |
+
reddit_filtered = cache_manager.get_filtered_news(reddit_df, filters, 'reddit') if not reddit_df.empty else reddit_df
|
| 311 |
+
rss_main_filtered = cache_manager.get_filtered_news(rss_main_df, filters, 'rss_main') if not rss_main_df.empty else rss_main_df
|
| 312 |
+
rss_all_filtered = cache_manager.get_filtered_news(rss_all_df, filters, 'rss_all') if not rss_all_df.empty else rss_all_df
|
| 313 |
+
|
| 314 |
+
# Combine Twitter and Reddit for first column
|
| 315 |
+
twitter_reddit_df = pd.concat([twitter_filtered, reddit_filtered], ignore_index=True) if not twitter_filtered.empty or not reddit_filtered.empty else pd.DataFrame()
|
| 316 |
+
if not twitter_reddit_df.empty:
|
| 317 |
+
twitter_reddit_df = twitter_reddit_df.sort_values('timestamp', ascending=False)
|
| 318 |
|
| 319 |
# Combine all for breaking news banner
|
| 320 |
+
all_news_df = pd.concat([twitter_filtered, reddit_filtered, rss_all_filtered], ignore_index=True) if not twitter_filtered.empty or not reddit_filtered.empty or not rss_all_filtered.empty else pd.DataFrame()
|
| 321 |
|
| 322 |
# Display breaking news banner
|
| 323 |
if not all_news_df.empty:
|
|
|
|
| 330 |
col1, col2, col3 = st.columns(3)
|
| 331 |
|
| 332 |
with col1:
|
| 333 |
+
# SECTION 1: Twitter/X & Reddit Breaking News
|
| 334 |
+
if not twitter_reddit_df.empty:
|
| 335 |
display_scrollable_news_section(
|
| 336 |
+
twitter_reddit_df,
|
| 337 |
+
section_title="Twitter/X & Reddit News",
|
| 338 |
+
section_icon="π",
|
| 339 |
+
section_subtitle="Real-time news from premium accounts & communities (last 12h)",
|
| 340 |
+
max_items=100,
|
| 341 |
height="700px"
|
| 342 |
)
|
| 343 |
+
elif not twitter_df.empty or not reddit_df.empty:
|
| 344 |
+
st.info("π No Twitter/Reddit news matches your current filters.")
|
| 345 |
else:
|
| 346 |
+
st.info("β³ Fetching Twitter & Reddit news... This may take 30-60 seconds on first load.")
|
| 347 |
|
| 348 |
with col2:
|
| 349 |
# SECTION 2: Main Page News (Web-Scraped)
|
app/services/news_scraper.py
CHANGED
|
@@ -135,10 +135,7 @@ class FinanceNewsScraper:
|
|
| 135 |
]
|
| 136 |
|
| 137 |
def __init__(self):
|
| 138 |
-
"""Initialize scraper
|
| 139 |
-
self.news_cache = []
|
| 140 |
-
self.last_fetch = None
|
| 141 |
-
self.cache_ttl = 180 # 3 minutes
|
| 142 |
self.session = requests.Session()
|
| 143 |
# Enhanced headers to avoid bot detection
|
| 144 |
self.session.headers.update({
|
|
@@ -331,8 +328,7 @@ class FinanceNewsScraper:
|
|
| 331 |
logger.error(f"Error scraping web page for {source_name}: {e}")
|
| 332 |
return []
|
| 333 |
|
| 334 |
-
|
| 335 |
-
def scrape_news(_self, max_items: int = 100) -> List[Dict]:
|
| 336 |
"""
|
| 337 |
Scrape news from all sources with caching
|
| 338 |
Uses ThreadPoolExecutor for parallel fetching from both RSS and web pages
|
|
@@ -345,12 +341,12 @@ class FinanceNewsScraper:
|
|
| 345 |
futures = []
|
| 346 |
|
| 347 |
# Submit both RSS and web scraping tasks for each source
|
| 348 |
-
for name, info in
|
| 349 |
# RSS feed task
|
| 350 |
-
futures.append((executor.submit(
|
| 351 |
# Web scraping task (only if web URL is configured)
|
| 352 |
if info.get('web'):
|
| 353 |
-
futures.append((executor.submit(
|
| 354 |
|
| 355 |
for future, source_name, method in futures:
|
| 356 |
try:
|
|
@@ -372,7 +368,7 @@ class FinanceNewsScraper:
|
|
| 372 |
# If no news was fetched, use mock data
|
| 373 |
if not all_news:
|
| 374 |
logger.warning("No news fetched from any source - using mock data")
|
| 375 |
-
return
|
| 376 |
|
| 377 |
# Sort by: web-scraped first, then breaking news, then impact, then timestamp
|
| 378 |
all_news.sort(
|
|
|
|
| 135 |
]
|
| 136 |
|
| 137 |
def __init__(self):
|
| 138 |
+
"""Initialize scraper"""
|
|
|
|
|
|
|
|
|
|
| 139 |
self.session = requests.Session()
|
| 140 |
# Enhanced headers to avoid bot detection
|
| 141 |
self.session.headers.update({
|
|
|
|
| 328 |
logger.error(f"Error scraping web page for {source_name}: {e}")
|
| 329 |
return []
|
| 330 |
|
| 331 |
+
def scrape_news(self, max_items: int = 100) -> List[Dict]:
|
|
|
|
| 332 |
"""
|
| 333 |
Scrape news from all sources with caching
|
| 334 |
Uses ThreadPoolExecutor for parallel fetching from both RSS and web pages
|
|
|
|
| 341 |
futures = []
|
| 342 |
|
| 343 |
# Submit both RSS and web scraping tasks for each source
|
| 344 |
+
for name, info in self.SOURCES.items():
|
| 345 |
# RSS feed task
|
| 346 |
+
futures.append((executor.submit(self._fetch_rss_feed, name, info), name, 'RSS'))
|
| 347 |
# Web scraping task (only if web URL is configured)
|
| 348 |
if info.get('web'):
|
| 349 |
+
futures.append((executor.submit(self._scrape_web_page, name, info), name, 'Web'))
|
| 350 |
|
| 351 |
for future, source_name, method in futures:
|
| 352 |
try:
|
|
|
|
| 368 |
# If no news was fetched, use mock data
|
| 369 |
if not all_news:
|
| 370 |
logger.warning("No news fetched from any source - using mock data")
|
| 371 |
+
return self._get_mock_news()
|
| 372 |
|
| 373 |
# Sort by: web-scraped first, then breaking news, then impact, then timestamp
|
| 374 |
all_news.sort(
|
app/services/reddit_news.py
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Reddit Financial News Scraper
|
| 3 |
+
Scrapes financial, trading, quant, and geopolitical news from Reddit
|
| 4 |
+
No authentication required - uses public RSS feeds
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import feedparser
|
| 8 |
+
import logging
|
| 9 |
+
from datetime import datetime, timedelta
|
| 10 |
+
from typing import List, Dict
|
| 11 |
+
import re
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class RedditFinanceMonitor:
    """
    Reddit financial news aggregator using public subreddit RSS feeds.

    No authentication is required: each subreddit's public ``top/.rss``
    feed is fetched with ``feedparser``, then entries are categorized,
    sentiment-tagged, impact-scored and sorted locally.
    """

    # Premium financial subreddits.
    # url            -> public RSS feed (top posts of the day)
    # weight         -> source credibility multiplier used in impact scoring/sorting
    # specialization -> topical tags for the source
    # category       -> default category when title keywords don't override it
    SUBREDDITS = {
        # Financial & Markets
        'wallstreetbets': {
            'url': 'https://www.reddit.com/r/wallstreetbets/top/.rss?t=day',
            'weight': 1.6,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'stocks': {
            'url': 'https://www.reddit.com/r/stocks/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'investing': {
            'url': 'https://www.reddit.com/r/investing/top/.rss?t=day',
            'weight': 1.8,
            'specialization': ['markets', 'macro'],
            'category': 'markets'
        },
        'stockmarket': {
            'url': 'https://www.reddit.com/r/StockMarket/top/.rss?t=day',
            'weight': 1.6,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'options': {
            'url': 'https://www.reddit.com/r/options/top/.rss?t=day',
            'weight': 1.5,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'daytrading': {
            'url': 'https://www.reddit.com/r/Daytrading/top/.rss?t=day',
            'weight': 1.5,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'securityanalysis': {
            'url': 'https://www.reddit.com/r/SecurityAnalysis/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['markets'],
            'category': 'markets'
        },

        # Economics & Macro
        'economics': {
            'url': 'https://www.reddit.com/r/Economics/top/.rss?t=day',
            'weight': 1.8,
            'specialization': ['macro'],
            'category': 'macro'
        },
        'economy': {
            'url': 'https://www.reddit.com/r/economy/top/.rss?t=day',
            'weight': 1.6,
            'specialization': ['macro'],
            'category': 'macro'
        },

        # Quantitative Finance
        'algotrading': {
            'url': 'https://www.reddit.com/r/algotrading/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'quantfinance': {
            'url': 'https://www.reddit.com/r/quant/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['markets'],
            'category': 'markets'
        },

        # Geopolitics
        'geopolitics': {
            'url': 'https://www.reddit.com/r/geopolitics/top/.rss?t=day',
            'weight': 1.8,
            'specialization': ['geopolitical'],
            'category': 'geopolitical'
        },
        'worldnews': {
            'url': 'https://www.reddit.com/r/worldnews/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['geopolitical'],
            'category': 'geopolitical'
        },
        'neutralpolitics': {
            'url': 'https://www.reddit.com/r/NeutralPolitics/top/.rss?t=day',
            'weight': 1.6,
            'specialization': ['geopolitical'],
            'category': 'geopolitical'
        },
    }

    # Keyword lists used by _categorize_post to override a subreddit's
    # default category based on the post title (checked case-insensitively).
    MACRO_KEYWORDS = [
        'Fed', 'ECB', 'BoE', 'BoJ', 'FOMC', 'Powell', 'Lagarde',
        'interest rate', 'inflation', 'CPI', 'PPI', 'GDP',
        'unemployment', 'jobs report', 'NFP', 'central bank',
        'recession', 'QE', 'quantitative easing', 'monetary policy'
    ]

    MARKETS_KEYWORDS = [
        'stock', 'equity', 'bond', 'commodity', 'oil', 'gold',
        'earnings', 'revenue', 'profit', 'IPO', 'merger',
        'acquisition', 'trading', 'options', 'futures', 'forex'
    ]

    GEOPOLITICAL_KEYWORDS = [
        'war', 'conflict', 'sanction', 'trade', 'tariff',
        'election', 'China', 'Russia', 'Ukraine', 'Taiwan',
        'Middle East', 'Iran', 'Israel', 'NATO', 'UN'
    ]

    def __init__(self):
        """Initialize Reddit monitor.

        BUG FIX: ``get_statistics`` reads ``self.news_cache``, but the
        original ``__init__`` body was ``pass`` so the attribute never
        existed and a fresh instance raised AttributeError on the first
        statistics call. Initialize it to an empty list here.
        """
        self.news_cache = []

    def _categorize_post(self, title: str, subreddit_info: Dict) -> str:
        """Categorize a post from its title, falling back to the subreddit default.

        Keyword precedence: macro > geopolitical > markets; if no keyword
        matches, the subreddit's configured 'category' (default 'markets')
        is returned.
        """
        title_lower = title.lower()

        # Use subreddit default category
        default_category = subreddit_info.get('category', 'markets')

        # Check keywords for override (substring match, case-insensitive)
        if any(keyword.lower() in title_lower for keyword in self.MACRO_KEYWORDS):
            return 'macro'
        elif any(keyword.lower() in title_lower for keyword in self.GEOPOLITICAL_KEYWORDS):
            return 'geopolitical'
        elif any(keyword.lower() in title_lower for keyword in self.MARKETS_KEYWORDS):
            return 'markets'

        return default_category

    def _detect_sentiment(self, title: str) -> str:
        """Simple keyword-count sentiment: 'positive', 'negative' or 'neutral'.

        NOTE: matching is by substring, so e.g. 'up' also matches inside
        longer words — intentional, kept for parity with the rest of the app.
        """
        title_lower = title.lower()

        positive_words = ['bullish', 'bull', 'surge', 'gain', 'up', 'rally', 'boom', 'profit', 'growth']
        negative_words = ['bearish', 'bear', 'crash', 'loss', 'down', 'fall', 'decline', 'recession', 'crisis']

        positive_count = sum(1 for word in positive_words if word in title_lower)
        negative_count = sum(1 for word in negative_words if word in title_lower)

        if positive_count > negative_count:
            return 'positive'
        elif negative_count > positive_count:
            return 'negative'
        else:
            return 'neutral'

    def _calculate_impact(self, score: int, num_comments: int, subreddit_weight: float) -> str:
        """Classify impact ('high'/'medium'/'low') from engagement.

        Engagement blends upvote score (70%) and comment count (30%),
        scaled by the subreddit credibility weight; thresholds are 500
        (high) and 100 (medium).
        """
        engagement_score = (score * 0.7) + (num_comments * 0.3)
        weighted_score = engagement_score * subreddit_weight

        if weighted_score > 500:
            return 'high'
        elif weighted_score > 100:
            return 'medium'
        else:
            return 'low'

    def scrape_reddit_news(self, max_posts: int = 100, hours: int = 12) -> List[Dict]:
        """
        Scrape Reddit posts from financial subreddits.

        Args:
            max_posts: Maximum number of posts to return
            hours: Only include posts from the last N hours (default: 12)

        Returns:
            List of news-item dicts with title/url/source/timestamp/
            category/sentiment/impact/is_breaking/engagement/platform keys.
        """
        all_posts = []
        seen_titles = set()
        # NOTE(review): feed timestamps are parsed as naive datetimes and
        # compared against local now(); if the feed times are UTC this can
        # skew the cutoff by the local offset — confirm before tightening.
        cutoff_time = datetime.now() - timedelta(hours=hours)

        logger.info(f"Scraping Reddit posts from last {hours} hours...")

        for subreddit_name, subreddit_info in self.SUBREDDITS.items():
            try:
                logger.info(f"Fetching r/{subreddit_name}...")

                # Parse RSS feed
                feed = feedparser.parse(subreddit_info['url'])

                for entry in feed.entries[:20]:  # Get top 20 per subreddit
                    try:
                        # Parse publication date; feedparser can expose
                        # published_parsed as None when the date is
                        # unparseable, so check the value, not just hasattr.
                        published = getattr(entry, 'published_parsed', None)
                        if published is not None:
                            pub_date = datetime(*published[:6])
                        else:
                            pub_date = datetime.now()

                        # Filter by time (last 12 hours by default)
                        if pub_date < cutoff_time:
                            continue

                        # Extract title and link
                        title = entry.title.strip()
                        link = entry.link

                        # Deduplicate on the first 100 chars of the title
                        title_hash = hash(title[:100])
                        if title_hash in seen_titles:
                            continue
                        seen_titles.add(title_hash)

                        # Extract score and comments from the RSS content blob
                        score = 0
                        num_comments = 0
                        if hasattr(entry, 'content'):
                            content_text = entry.content[0].value if entry.content else ''
                            # Try to extract score from content
                            score_match = re.search(r'(\d+)\s+points?', content_text)
                            if score_match:
                                score = int(score_match.group(1))
                            # Try to extract comments
                            comment_match = re.search(r'(\d+)\s+comments?', content_text)
                            if comment_match:
                                num_comments = int(comment_match.group(1))

                        # Categorize and analyze
                        category = self._categorize_post(title, subreddit_info)
                        sentiment = self._detect_sentiment(title)
                        impact = self._calculate_impact(score, num_comments, subreddit_info['weight'])

                        # Breaking news = high score within the last 3 hours
                        is_breaking = (
                            (datetime.now() - pub_date).total_seconds() < 10800 and  # 3 hours
                            score > 1000
                        )

                        post_data = {
                            'title': title,
                            'summary': title,  # Reddit posts don't have separate summaries
                            'url': link,
                            'source': f"r/{subreddit_name}",
                            'timestamp': pub_date,
                            'category': category,
                            'sentiment': sentiment,
                            'impact': impact,
                            'is_breaking': is_breaking,
                            'engagement': {
                                'score': score,
                                'comments': num_comments
                            },
                            'platform': 'reddit'
                        }

                        all_posts.append(post_data)

                    except Exception as e:
                        logger.error(f"Error processing entry from r/{subreddit_name}: {e}")
                        continue

                logger.info(f"Fetched {len([p for p in all_posts if p['source'] == f'r/{subreddit_name}'])} posts from r/{subreddit_name}")

            except Exception as e:
                logger.error(f"Error fetching r/{subreddit_name}: {e}")
                continue

        # Sort by engagement score weighted by source weight; 'source' stores
        # the SUBREDDITS key (prefixed with 'r/'), so the reverse lookup works.
        all_posts.sort(key=lambda x: x['engagement']['score'] * self.SUBREDDITS.get(
            x['source'].replace('r/', ''), {}
        ).get('weight', 1.0), reverse=True)

        logger.info(f"Total Reddit posts scraped: {len(all_posts)}")

        return all_posts[:max_posts]

    def get_statistics(self) -> Dict:
        """Get statistics about scraped Reddit posts.

        Lazily fills ``self.news_cache`` via ``scrape_reddit_news`` on the
        first call, then aggregates totals, high-impact count, breaking
        count and per-category counts from the cache.
        """
        if not self.news_cache:
            posts = self.scrape_reddit_news()
            self.news_cache = posts

        total = len(self.news_cache)
        high_impact = len([p for p in self.news_cache if p['impact'] == 'high'])
        breaking = len([p for p in self.news_cache if p.get('is_breaking', False)])

        return {
            'total': total,
            'high_impact': high_impact,
            'breaking': breaking,
            'by_category': {
                'macro': len([p for p in self.news_cache if p['category'] == 'macro']),
                'markets': len([p for p in self.news_cache if p['category'] == 'markets']),
                'geopolitical': len([p for p in self.news_cache if p['category'] == 'geopolitical'])
            }
        }
|
app/services/twitter_news_playwright.py
CHANGED
|
@@ -179,10 +179,6 @@ class TwitterFinanceMonitor:
|
|
| 179 |
|
| 180 |
def __init__(self):
|
| 181 |
"""Initialize monitor"""
|
| 182 |
-
self.news_cache = []
|
| 183 |
-
self.last_fetch = None
|
| 184 |
-
self.cache_ttl = 180 # 3 minutes
|
| 185 |
-
|
| 186 |
# Find Chromium executable
|
| 187 |
self.chromium_path = self._find_chromium()
|
| 188 |
|
|
@@ -311,22 +307,21 @@ class TwitterFinanceMonitor:
|
|
| 311 |
logger.error(f"Error scraping {source_name}: {e}")
|
| 312 |
return []
|
| 313 |
|
| 314 |
-
|
| 315 |
-
def scrape_twitter_news(_self, max_tweets: int = 100) -> List[Dict]:
|
| 316 |
"""
|
| 317 |
Scrape latest financial news from Twitter using Playwright
|
| 318 |
Runs in parallel for better performance - 19 sources in ~30-45 seconds
|
| 319 |
"""
|
| 320 |
if not PLAYWRIGHT_AVAILABLE:
|
| 321 |
logger.info("Playwright not available - using mock data")
|
| 322 |
-
return
|
| 323 |
|
| 324 |
all_news = []
|
| 325 |
seen_texts = set()
|
| 326 |
|
| 327 |
# Sort sources by weight (priority) - scrape high-value sources first
|
| 328 |
sorted_sources = sorted(
|
| 329 |
-
|
| 330 |
key=lambda x: x[1]['weight'],
|
| 331 |
reverse=True
|
| 332 |
)
|
|
@@ -337,7 +332,7 @@ class TwitterFinanceMonitor:
|
|
| 337 |
futures = []
|
| 338 |
for name, info in sorted_sources:
|
| 339 |
# Increased timeout for better success rate
|
| 340 |
-
future = executor.submit(
|
| 341 |
futures.append((future, name))
|
| 342 |
|
| 343 |
for future, source_name in futures:
|
|
@@ -365,7 +360,7 @@ class TwitterFinanceMonitor:
|
|
| 365 |
# If no news was fetched, use mock data
|
| 366 |
if not all_news:
|
| 367 |
logger.warning("No tweets fetched - using mock data")
|
| 368 |
-
return
|
| 369 |
|
| 370 |
# Sort by breaking news, then impact, then timestamp
|
| 371 |
all_news.sort(
|
|
|
|
| 179 |
|
| 180 |
def __init__(self):
    """Initialize monitor.

    Note: this class no longer keeps in-memory cache fields here;
    caching appears to be handled externally (presumably by the
    unified NewsCacheManager added in this change — confirm).  The
    only setup performed is locating a Chromium binary for Playwright.
    """
    # Find Chromium executable
    self.chromium_path = self._find_chromium()
|
| 184 |
|
|
|
|
| 307 |
logger.error(f"Error scraping {source_name}: {e}")
|
| 308 |
return []
|
| 309 |
|
| 310 |
+
def scrape_twitter_news(self, max_tweets: int = 100) -> List[Dict]:
|
|
|
|
| 311 |
"""
|
| 312 |
Scrape latest financial news from Twitter using Playwright
|
| 313 |
Runs in parallel for better performance - 19 sources in ~30-45 seconds
|
| 314 |
"""
|
| 315 |
if not PLAYWRIGHT_AVAILABLE:
|
| 316 |
logger.info("Playwright not available - using mock data")
|
| 317 |
+
return self._get_mock_news()
|
| 318 |
|
| 319 |
all_news = []
|
| 320 |
seen_texts = set()
|
| 321 |
|
| 322 |
# Sort sources by weight (priority) - scrape high-value sources first
|
| 323 |
sorted_sources = sorted(
|
| 324 |
+
self.SOURCES.items(),
|
| 325 |
key=lambda x: x[1]['weight'],
|
| 326 |
reverse=True
|
| 327 |
)
|
|
|
|
| 332 |
futures = []
|
| 333 |
for name, info in sorted_sources:
|
| 334 |
# Increased timeout for better success rate
|
| 335 |
+
future = executor.submit(self._scrape_twitter_profile, name, info, timeout=30)
|
| 336 |
futures.append((future, name))
|
| 337 |
|
| 338 |
for future, source_name in futures:
|
|
|
|
| 360 |
# If no news was fetched, use mock data
|
| 361 |
if not all_news:
|
| 362 |
logger.warning("No tweets fetched - using mock data")
|
| 363 |
+
return self._get_mock_news()
|
| 364 |
|
| 365 |
# Sort by breaking news, then impact, then timestamp
|
| 366 |
all_news.sort(
|
app/utils/news_cache.py
ADDED
|
@@ -0,0 +1,347 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unified News Caching System
|
| 3 |
+
Centralized cache manager for Twitter, Reddit, and RSS news feeds
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import hashlib
|
| 7 |
+
import logging
|
| 8 |
+
import re
|
| 9 |
+
import pandas as pd
|
| 10 |
+
from datetime import datetime, timedelta
|
| 11 |
+
from typing import List, Dict, Optional, Callable
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class NewsCacheManager:
    """
    Centralized cache manager for news feeds with:
    - Per-source caching with TTL
    - Cross-service deduplication
    - Filtered results caching
    - Force refresh support
    """

    def __init__(self, default_ttl: int = 180):
        """
        Initialize cache manager

        Args:
            default_ttl: Default time-to-live in seconds (default: 180 = 3 minutes)
        """
        # Remember the configured TTL so clear_cache() restores it instead
        # of silently reverting to a hard-coded 180s (bug in the original).
        self.default_ttl = default_ttl
        self.cache = {
            'twitter': {'raw_news': [], 'last_fetch': None, 'ttl': default_ttl},
            'reddit': {'raw_news': [], 'last_fetch': None, 'ttl': default_ttl},
            'rss': {'raw_news': [], 'last_fetch': None, 'ttl': default_ttl},
            'dedup_index': {},    # content-hash -> {first_seen, sources, canonical_item}
            'filtered_cache': {}  # filter-key -> {results, expires_at}
        }
        logger.info(f"NewsCacheManager initialized with {default_ttl}s TTL")

    def get_news(
        self,
        source: str,
        fetcher_func: Callable,
        force_refresh: bool = False,
        **kwargs
    ) -> List[Dict]:
        """
        Get news from cache or fetch fresh if needed

        Args:
            source: News source ('twitter', 'reddit', 'rss')
            fetcher_func: Function to fetch fresh news
            force_refresh: If True, bypass cache and fetch fresh
            **kwargs: Arguments to pass to fetcher_func

        Returns:
            List of news items.  NOTE: a cache HIT returns the raw cached
            list, while a fresh fetch returns the cross-source
            deduplicated list (original behavior, preserved).
        """
        if source not in ('twitter', 'reddit', 'rss'):
            logger.error(f"Invalid source: {source}")
            return []

        # Force refresh clears this source's dedup entries so refetched
        # stories are not flagged as duplicates of themselves.
        if force_refresh:
            self._clear_source_from_dedup(source)

        # Check if cache is valid
        if not force_refresh and self._is_cache_valid(source):
            logger.info(f"✅ Cache HIT for {source} (age: {self._get_cache_age(source):.1f}s)")
            return self.cache[source]['raw_news']

        # Cache miss or force refresh - fetch fresh news
        logger.info(f"🔄 Cache MISS for {source} - fetching fresh news...")
        try:
            new_items = fetcher_func(**kwargs)

            if not new_items:
                logger.warning(f"No news items fetched for {source}")
                # Serve stale cached data rather than nothing (best-effort).
                return self.cache[source]['raw_news']

            # Update cache with the raw fetch result.
            self._update_cache(source, new_items)

            # Deduplicate across sources for the returned list.
            deduplicated = self._deduplicate(new_items, source)

            logger.info(
                f"✅ Fetched {len(new_items)} items for {source}, "
                f"{len(deduplicated)} unique after dedup"
            )
            return deduplicated

        except Exception as e:
            logger.error(f"Error fetching news for {source}: {e}")
            # Serve stale cached data on fetch failure (best-effort).
            return self.cache[source]['raw_news']

    def _is_cache_valid(self, source: str) -> bool:
        """
        Check if cached data is still fresh

        Args:
            source: News source to check

        Returns:
            True if cache is valid, False otherwise
        """
        source_cache = self.cache[source]
        if not source_cache['last_fetch']:
            return False

        age = (datetime.now() - source_cache['last_fetch']).total_seconds()
        return age < source_cache['ttl']

    def _get_cache_age(self, source: str) -> float:
        """
        Get age of cached data in seconds

        Args:
            source: News source

        Returns:
            Age in seconds, or -1 if never fetched
        """
        source_cache = self.cache[source]
        if not source_cache['last_fetch']:
            return -1

        return (datetime.now() - source_cache['last_fetch']).total_seconds()

    def _normalize_text(self, text: str) -> str:
        """
        Normalize text for deduplication: lowercase, strip punctuation,
        collapse whitespace.

        Args:
            text: Text to normalize

        Returns:
            Normalized text ("" for falsy input)
        """
        if not text:
            return ""

        text = text.lower().strip()
        text = re.sub(r'[^\w\s]', '', text)   # drop punctuation
        text = re.sub(r'\s+', ' ', text)      # collapse whitespace
        return text

    def _compute_hash(self, item: Dict) -> str:
        """
        Compute content hash for deduplication from the normalized title
        plus the first 200 chars of the summary.

        Args:
            item: News item dict

        Returns:
            MD5 hash string (fingerprinting only, not security-sensitive)
        """
        title = self._normalize_text(item.get('title', ''))
        summary = self._normalize_text(item.get('summary', '')[:200])
        combined = f"{title}|{summary}"
        return hashlib.md5(combined.encode()).hexdigest()

    def _deduplicate(self, items: List[Dict], source: str) -> List[Dict]:
        """
        Remove duplicates using the global dedup index; repeated stories
        only gain this source in their 'sources' list.

        Args:
            items: List of news items
            source: Source name

        Returns:
            Deduplicated list of items
        """
        deduplicated = []
        duplicate_count = 0

        for item in items:
            content_hash = self._compute_hash(item)

            if content_hash in self.cache['dedup_index']:
                # Duplicate found - record this source on the existing entry.
                dup_entry = self.cache['dedup_index'][content_hash]
                if source not in dup_entry['sources']:
                    dup_entry['sources'].append(source)
                duplicate_count += 1
            else:
                # New item - add to index and result.
                self.cache['dedup_index'][content_hash] = {
                    'first_seen': datetime.now(),
                    'sources': [source],
                    'canonical_item': item
                }
                deduplicated.append(item)

        if duplicate_count > 0:
            logger.info(f"🔁 Deduplication: Found {duplicate_count} duplicates for {source}")

        return deduplicated

    def _update_cache(self, source: str, items: List[Dict]):
        """
        Update cache with new items and stamp the fetch time.

        Args:
            source: News source
            items: List of news items
        """
        self.cache[source]['raw_news'] = items
        self.cache[source]['last_fetch'] = datetime.now()
        logger.info(f"📦 Updated cache for {source} with {len(items)} items")

    def get_filtered_news(
        self,
        source_df: pd.DataFrame,
        filters: Dict,
        source_name: str = "unknown"
    ) -> pd.DataFrame:
        """
        Get filtered news with caching (filtered results cached 5 minutes).

        Args:
            source_df: Source dataframe
            filters: Filter dict with 'category', 'sentiment', 'impact' keys
                     (missing keys default to 'all' = no filtering)
            source_name: Name of source (for the cache key and logging)

        Returns:
            Filtered dataframe
        """
        if source_df.empty:
            return source_df

        # Create cache key from filters
        category = filters.get('category', 'all')
        sentiment = filters.get('sentiment', 'all')
        impact = filters.get('impact', 'all')
        cache_key = f"{source_name}_{category}_{sentiment}_{impact}"

        # Check filtered cache
        cached_entry = self.cache['filtered_cache'].get(cache_key)
        if cached_entry:
            if datetime.now() < cached_entry['expires_at']:
                logger.debug(f"✅ Filtered cache HIT for {cache_key}")
                return cached_entry['results']
            # Evict expired entries so the filtered cache cannot grow
            # unbounded (the original left them in place forever).
            del self.cache['filtered_cache'][cache_key]

        # Apply filters
        filtered_df = source_df.copy()

        if category != 'all':
            filtered_df = filtered_df[filtered_df['category'] == category]

        if sentiment != 'all':
            filtered_df = filtered_df[filtered_df['sentiment'] == sentiment]

        if impact != 'all':
            filtered_df = filtered_df[filtered_df['impact'] == impact]

        logger.debug(f"🔍 Filtered {source_name}: {len(source_df)} → {len(filtered_df)} items")

        # Cache filtered results (5 minute TTL)
        self.cache['filtered_cache'][cache_key] = {
            'results': filtered_df,
            'expires_at': datetime.now() + timedelta(seconds=300)
        }

        return filtered_df

    def _clear_source_from_dedup(self, source: str):
        """
        Remove this source from all dedup-index entries; entries left with
        no sources are dropped entirely.

        Args:
            source: Source to remove from dedup index
        """
        to_remove = []
        for content_hash, entry in self.cache['dedup_index'].items():
            if source in entry['sources']:
                entry['sources'].remove(source)
                # If no sources remain, mark the entry for removal.
                if not entry['sources']:
                    to_remove.append(content_hash)

        for content_hash in to_remove:
            del self.cache['dedup_index'][content_hash]

        if to_remove:
            logger.info(f"🗑️ Removed {len(to_remove)} entries from dedup index for {source}")

    def clear_cache(self, source: Optional[str] = None):
        """
        Clear cache for specific source or all sources.

        Args:
            source: Source to clear, or None to clear all
        """
        if source:
            # FIX: restore the configured TTL instead of a hard-coded 180,
            # which silently overrode any custom default_ttl.
            self.cache[source] = {'raw_news': [], 'last_fetch': None, 'ttl': self.default_ttl}
            self._clear_source_from_dedup(source)
            logger.info(f"🗑️ Cleared cache for {source}")
        else:
            for src in ('twitter', 'reddit', 'rss'):
                self.cache[src] = {'raw_news': [], 'last_fetch': None, 'ttl': self.default_ttl}
            self.cache['dedup_index'] = {}
            self.cache['filtered_cache'] = {}
            logger.info("🗑️ Cleared ALL caches")

    def get_statistics(self) -> Dict:
        """
        Get cache statistics.

        Returns:
            Dict with per-source item counts / ages / validity plus the
            dedup-index and filtered-cache sizes.
        """
        stats = {
            src: {
                'items': len(self.cache[src]['raw_news']),
                'age_seconds': self._get_cache_age(src),
                'is_valid': self._is_cache_valid(src),
            }
            for src in ('twitter', 'reddit', 'rss')
        }
        stats['dedup_index_size'] = len(self.cache['dedup_index'])
        stats['filtered_cache_size'] = len(self.cache['filtered_cache'])
        return stats
|