Dmitry Beresnev committed on
Commit
f6b2909
·
1 Parent(s): e9c5fb7

add twitter news

Browse files
app/pages/05_Dashboard.py CHANGED
@@ -18,17 +18,18 @@ from components.news import (
18
  display_breaking_news_banner
19
  )
20
 
21
- # Try to import RSS scraper first (most reliable), fall back to Twikit, then old snscrape
22
  try:
23
- from services.news_scraper import FinanceNewsScraper as FinanceNewsMonitor
24
- NEWS_SOURCE = "RSS Feeds"
25
  except ImportError:
26
- try:
27
- from services.news_monitor_twikit import FinanceNewsMonitor
28
- NEWS_SOURCE = "Twikit"
29
- except ImportError:
30
- from services.news_monitor import FinanceNewsMonitor
31
- NEWS_SOURCE = "snscrape"
 
32
 
33
 
34
  # ---- Page Configuration ----
@@ -42,11 +43,15 @@ st.set_page_config(
42
  # ---- Apply Dark Theme ----
43
  st.markdown(DARK_THEME_CSS, unsafe_allow_html=True)
44
 
45
- # Initialize news monitor (with caching)
46
- if 'news_monitor' not in st.session_state:
47
- st.session_state.news_monitor = FinanceNewsMonitor()
48
 
49
- monitor = st.session_state.news_monitor
 
 
 
 
50
 
51
  # ---- Header ----
52
  st.markdown("# 🤖 Live Financial News & AI Dashboard")
@@ -102,38 +107,48 @@ with st.sidebar:
102
  st.markdown("---")
103
  st.markdown("### 📊 Feed Statistics")
104
 
105
- # Get and display stats
106
- stats = monitor.get_statistics()
107
- st.metric("Total Stories", stats['total'])
108
- st.metric("High Impact", stats['high_impact'])
109
- st.metric("Breaking News", stats['breaking'])
110
- st.caption(f"Last update: {stats['last_update']}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
  st.markdown("---")
113
  st.markdown("### ℹ️ Sources")
114
 
115
- # Get actual source count
116
- total_sources = len(monitor.SOURCES)
 
 
117
 
118
  st.markdown(f"""
119
  <div style='font-size: 11px; line-height: 1.6;'>
120
 
121
- **Tier 1: Financial News (8)**
122
- ReutersBloomberg × 2 FT
123
- WSJThe EconomistCNBC
124
- • MarketWatch
125
-
126
- **Tier 2: Geopolitical (5)**
127
- • BBC World • AFP • Al Jazeera
128
- • Politico • DW News
129
-
130
- **Tier 3: Central Banks (7)**
131
- • Fed (2.0x) • ECB (2.0x) • Lagarde
132
- • BoE • IMF • World Bank • Treasury
133
 
134
- **Tier 4: Alpha Accounts (3)**
135
- Zero HedgeFirst Squawk
136
- Live Squawk
 
137
 
138
  **Total: {total_sources} Premium Sources**
139
  </div>
@@ -147,73 +162,108 @@ force_refresh = st.session_state.get('force_refresh', False)
147
  if force_refresh:
148
  st.session_state.force_refresh = False
149
 
150
- # Get filtered news
 
 
 
 
 
 
151
  with st.spinner("🔍 Fetching latest financial news..."):
152
- news_df = monitor.get_news(
153
- category=category_filter,
154
- sentiment=sentiment_filter,
155
- impact=impact_filter,
156
- refresh=force_refresh
157
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
- # Display demo mode notice if using mock data
160
- if len(news_df) > 0 and news_df.iloc[0].get('id', 0) < 100:
161
- st.info("📢 **Demo Mode**: Twitter/X API is currently unavailable. Displaying sample news data to showcase the platform's features. In production, this would show real-time financial news from 23 premium sources.")
162
 
163
- # Display breaking news banner if exists
164
- display_breaking_news_banner(news_df)
165
 
166
- # Statistics overview
167
- st.markdown("## 📊 News Feed Overview")
168
- stats = monitor.get_statistics()
169
- display_news_statistics(stats)
170
 
171
- st.markdown("<br>", unsafe_allow_html=True)
 
 
 
 
172
 
173
- # Category breakdown
174
- display_category_breakdown(stats)
 
 
 
 
175
 
176
  st.markdown("---")
177
 
178
- # ---- MAIN PAGE NEWS (Web-Scraped) ----
179
  st.markdown("## 🔥 Top Stories from Main Pages")
180
- st.caption("Latest headlines directly from news source homepages")
181
 
182
- main_page_df = monitor.get_main_page_news()
183
- if not main_page_df.empty:
184
- # Apply filters to main page news
185
- filtered_main = main_page_df.copy()
186
- if category_filter != 'all':
187
- filtered_main = filtered_main[filtered_main['category'] == category_filter]
188
- if sentiment_filter != 'all':
189
- filtered_main = filtered_main[filtered_main['sentiment'] == sentiment_filter]
190
- if impact_filter != 'all':
191
- filtered_main = filtered_main[filtered_main['impact'] == impact_filter]
192
-
193
- if not filtered_main.empty:
194
- display_news_feed(filtered_main, max_items=10)
195
- else:
196
- st.info("📭 No main page news matches your filters.")
197
  else:
198
  st.info("⏳ Main page news will appear here...")
199
 
200
  st.markdown("---")
201
 
202
- # ---- ALL NEWS FEED (RSS + Web) ----
203
  col1, col2, col3 = st.columns([2, 1, 1])
204
  with col1:
205
- st.markdown("## 📰 All News Feed")
206
  with col2:
207
  show_count = st.selectbox("Show", [10, 20, 50, 100], index=1, label_visibility="collapsed")
208
  with col3:
209
- if not news_df.empty:
210
- st.caption(f"Displaying {min(show_count, len(news_df))} of {len(news_df)} stories")
 
 
211
 
212
- # Display news feed
213
- if not news_df.empty:
214
- display_news_feed(news_df, max_items=show_count)
 
215
  else:
216
- st.info("📭 No news matches your current filters. Try adjusting the filters or refresh the feed.")
217
 
218
  # Auto-refresh logic
219
  if auto_refresh:
 
18
  display_breaking_news_banner
19
  )
20
 
21
+ # Import news scrapers
22
  try:
23
+ from services.news_scraper import FinanceNewsScraper
24
+ RSS_AVAILABLE = True
25
  except ImportError:
26
+ RSS_AVAILABLE = False
27
+
28
+ try:
29
+ from services.twitter_news_playwright import TwitterFinanceMonitor
30
+ TWITTER_AVAILABLE = True
31
+ except ImportError:
32
+ TWITTER_AVAILABLE = False
33
 
34
 
35
  # ---- Page Configuration ----
 
43
  # ---- Apply Dark Theme ----
44
  st.markdown(DARK_THEME_CSS, unsafe_allow_html=True)
45
 
46
+ # Initialize news monitors (with caching)
47
+ if 'rss_monitor' not in st.session_state and RSS_AVAILABLE:
48
+ st.session_state.rss_monitor = FinanceNewsScraper()
49
 
50
+ if 'twitter_monitor' not in st.session_state and TWITTER_AVAILABLE:
51
+ st.session_state.twitter_monitor = TwitterFinanceMonitor()
52
+
53
+ rss_monitor = st.session_state.get('rss_monitor')
54
+ twitter_monitor = st.session_state.get('twitter_monitor')
55
 
56
  # ---- Header ----
57
  st.markdown("# 🤖 Live Financial News & AI Dashboard")
 
107
  st.markdown("---")
108
  st.markdown("### 📊 Feed Statistics")
109
 
110
+ # Calculate combined stats
111
+ total_stories = 0
112
+ high_impact_count = 0
113
+ breaking_count = 0
114
+
115
+ if twitter_monitor:
116
+ twitter_stats = twitter_monitor.get_statistics()
117
+ total_stories += twitter_stats['total']
118
+ high_impact_count += twitter_stats['high_impact']
119
+ breaking_count += twitter_stats['breaking']
120
+
121
+ if rss_monitor:
122
+ rss_stats = rss_monitor.get_statistics()
123
+ total_stories += rss_stats['total']
124
+ high_impact_count += rss_stats['high_impact']
125
+ breaking_count += rss_stats['breaking']
126
+
127
+ st.metric("Total Stories", total_stories)
128
+ st.metric("High Impact", high_impact_count)
129
+ st.metric("Breaking News", breaking_count)
130
 
131
  st.markdown("---")
132
  st.markdown("### ℹ️ Sources")
133
 
134
+ # Count total sources
135
+ twitter_sources = len(twitter_monitor.SOURCES) if twitter_monitor else 0
136
+ rss_sources = len(rss_monitor.SOURCES) if rss_monitor else 0
137
+ total_sources = twitter_sources + rss_sources
138
 
139
  st.markdown(f"""
140
  <div style='font-size: 11px; line-height: 1.6;'>
141
 
142
+ **Twitter/X Accounts ({twitter_sources})**
143
+ • WalterBloomberg • FXHedge • DeItaone
144
+ • Reuters • Bloomberg • FT • WSJ
145
+ CNBC • BBC • MarketWatch
146
+ • The Economist • AP • AFP
 
 
 
 
 
 
 
147
 
148
+ **RSS + Web Scraping ({rss_sources})**
149
+ • CNBC • Bloomberg • FT • WSJ
150
+ • BBC • Yahoo Finance • The Economist
151
+ • Fed (2.0x) • ECB (2.0x) • IMF
152
 
153
  **Total: {total_sources} Premium Sources**
154
  </div>
 
162
  if force_refresh:
163
  st.session_state.force_refresh = False
164
 
165
+ # Fetch news from all sources
166
+ import pandas as pd
167
+
168
+ twitter_df = pd.DataFrame()
169
+ rss_all_df = pd.DataFrame()
170
+ rss_main_df = pd.DataFrame()
171
+
172
  with st.spinner("🔍 Fetching latest financial news..."):
173
+ # Fetch Twitter/X news (highest priority)
174
+ if twitter_monitor:
175
+ try:
176
+ twitter_news = twitter_monitor.scrape_twitter_news(max_tweets=50)
177
+ if twitter_news:
178
+ twitter_df = pd.DataFrame(twitter_news)
179
+ if not twitter_df.empty:
180
+ twitter_df['timestamp'] = pd.to_datetime(twitter_df['timestamp'])
181
+ except Exception as e:
182
+ st.warning(f"Twitter scraping unavailable: {e}")
183
+
184
+ # Fetch RSS + Web scraped news
185
+ if rss_monitor:
186
+ try:
187
+ rss_news = rss_monitor.scrape_news(max_items=100)
188
+ if rss_news:
189
+ rss_all_df = pd.DataFrame(rss_news)
190
+ if not rss_all_df.empty:
191
+ rss_all_df['timestamp'] = pd.to_datetime(rss_all_df['timestamp'])
192
+ # Get main page news subset
193
+ rss_main_df = rss_all_df[rss_all_df['from_web'] == True].copy()
194
+ except Exception as e:
195
+ st.warning(f"RSS scraping unavailable: {e}")
196
+
197
# Apply the sidebar filter selections to each dataset
def apply_filters(df):
    """Return *df* restricted to the sidebar's category/sentiment/impact picks.

    An empty frame is returned untouched; a value of 'all' for any filter
    leaves that dimension unrestricted.
    """
    if df.empty:
        return df
    filtered = df.copy()
    # Each (column, selection) pair narrows the frame unless set to 'all'.
    for column, selected in (
        ('category', category_filter),
        ('sentiment', sentiment_filter),
        ('impact', impact_filter),
    ):
        if selected != 'all':
            filtered = filtered[filtered[column] == selected]
    return filtered
209
 
210
+ twitter_filtered = apply_filters(twitter_df)
211
+ rss_main_filtered = apply_filters(rss_main_df)
212
+ rss_all_filtered = apply_filters(rss_all_df)
213
 
214
+ # Combine all for breaking news banner
215
+ all_news_df = pd.concat([twitter_filtered, rss_all_filtered], ignore_index=True) if not twitter_filtered.empty or not rss_all_filtered.empty else pd.DataFrame()
216
 
217
+ # Display breaking news banner
218
+ if not all_news_df.empty:
219
+ display_breaking_news_banner(all_news_df)
 
220
 
221
+ st.markdown("---")
222
+
223
+ # ---- SECTION 1: Twitter/X Breaking News (Highest Priority) ----
224
+ st.markdown("## 🐦 Twitter/X Financial News Feed")
225
+ st.caption("Real-time breaking news from premium Twitter/X accounts (WalterBloomberg, Reuters, Bloomberg, FT, etc.)")
226
 
227
+ if not twitter_filtered.empty:
228
+ display_news_feed(twitter_filtered, max_items=15)
229
+ elif not twitter_df.empty:
230
+ st.info("📭 No Twitter news matches your current filters.")
231
+ else:
232
+ st.info("⏳ Twitter news scraping in progress... This may take 30-60 seconds on first load.")
233
 
234
  st.markdown("---")
235
 
236
+ # ---- SECTION 2: Main Page News (Web-Scraped) ----
237
  st.markdown("## 🔥 Top Stories from Main Pages")
238
+ st.caption("Latest headlines directly scraped from news source homepages")
239
 
240
+ if not rss_main_filtered.empty:
241
+ display_news_feed(rss_main_filtered, max_items=10)
242
+ elif not rss_main_df.empty:
243
+ st.info("📭 No main page news matches your filters.")
 
 
 
 
 
 
 
 
 
 
 
244
  else:
245
  st.info("⏳ Main page news will appear here...")
246
 
247
  st.markdown("---")
248
 
249
+ # ---- SECTION 3: RSS Feed News (Lowest Priority) ----
250
  col1, col2, col3 = st.columns([2, 1, 1])
251
  with col1:
252
+ st.markdown("## 📰 RSS Feed News")
253
  with col2:
254
  show_count = st.selectbox("Show", [10, 20, 50, 100], index=1, label_visibility="collapsed")
255
  with col3:
256
+ if not rss_all_filtered.empty:
257
+ st.caption(f"Displaying {min(show_count, len(rss_all_filtered))} of {len(rss_all_filtered)} stories")
258
+
259
+ st.caption("Aggregated news from RSS feeds across all sources")
260
 
261
+ if not rss_all_filtered.empty:
262
+ display_news_feed(rss_all_filtered, max_items=show_count)
263
+ elif not rss_all_df.empty:
264
+ st.info("📭 No RSS news matches your current filters.")
265
  else:
266
+ st.info(" RSS feed news will appear here...")
267
 
268
  # Auto-refresh logic
269
  if auto_refresh:
app/services/twitter_news_playwright.py ADDED
@@ -0,0 +1,458 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Professional Finance News Monitor using Playwright
3
+ Real-time Twitter/X scraping without authentication
4
+ Optimized for low-latency trading decisions
5
+ """
6
+
7
+ import pandas as pd
8
+ from datetime import datetime, timedelta
9
+ from typing import List, Dict, Optional
10
+ import streamlit as st
11
+ import re
12
+ import logging
13
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
14
+
15
+ # Configure logging
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
+
19
+ try:
20
+ from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
21
+ PLAYWRIGHT_AVAILABLE = True
22
+ except ImportError:
23
+ PLAYWRIGHT_AVAILABLE = False
24
+ logger.warning("playwright not available. Install with: pip install playwright && playwright install chromium")
25
+
26
+
27
class TwitterFinanceMonitor:
    """
    Professional-grade financial news aggregator using Playwright.

    Scrapes public Twitter/X profiles (no authentication required) in
    parallel, tags each tweet with a category, sentiment and market-impact
    estimate, and keeps the most recent batch in ``self.news_cache`` so
    ``get_statistics()`` can summarize it for the dashboard sidebar.
    """

    # Premium financial Twitter accounts, grouped by tier.
    # 'weight' feeds _assess_impact(); 'specialization' biases _categorize_text().
    SOURCES = {
        # ===== TIER 1: Breaking News Aggregators =====
        'walter_bloomberg': {
            'handle': 'WalterBloomberg',
            'url': 'https://x.com/WalterBloomberg',
            'weight': 1.9,
            'specialization': ['macro', 'markets', 'geopolitical']
        },
        'fxhedge': {
            'handle': 'Fxhedgers',
            'url': 'https://x.com/Fxhedgers',
            'weight': 1.7,
            'specialization': ['macro', 'markets']
        },
        'deitaone': {
            'handle': 'DeItaone',
            'url': 'https://x.com/DeItaone',
            'weight': 1.8,
            'specialization': ['markets', 'macro']
        },
        'firstsquawk': {
            'handle': 'FirstSquawk',
            'url': 'https://x.com/FirstSquawk',
            'weight': 1.7,
            'specialization': ['markets', 'macro']
        },
        'livesquawk': {
            'handle': 'LiveSquawk',
            'url': 'https://x.com/LiveSquawk',
            'weight': 1.7,
            'specialization': ['markets', 'macro']
        },

        # ===== TIER 2: Major News Agencies =====
        'reuters': {
            'handle': 'Reuters',
            'url': 'https://x.com/Reuters',
            'weight': 1.9,
            'specialization': ['geopolitical', 'macro', 'markets']
        },
        'bloomberg': {
            'handle': 'business',
            'url': 'https://x.com/business',
            'weight': 1.9,
            'specialization': ['markets', 'macro']
        },
        'ft': {
            'handle': 'FT',
            'url': 'https://x.com/FT',
            'weight': 1.8,
            'specialization': ['markets', 'macro', 'geopolitical']
        },
        'wsj': {
            'handle': 'WSJ',
            'url': 'https://x.com/WSJ',
            'weight': 1.8,
            'specialization': ['markets', 'macro', 'geopolitical']
        },
        'cnbc': {
            'handle': 'CNBC',
            'url': 'https://x.com/CNBC',
            'weight': 1.6,
            'specialization': ['markets', 'macro']
        },
        'bbcbusiness': {
            'handle': 'BBCBusiness',
            'url': 'https://x.com/BBCBusiness',
            'weight': 1.7,
            'specialization': ['geopolitical', 'macro', 'markets']
        },

        # ===== TIER 3: Specialized Financial Media =====
        'zerohedge': {
            'handle': 'zerohedge',
            'url': 'https://x.com/zerohedge',
            'weight': 1.5,
            'specialization': ['macro', 'geopolitical', 'markets']
        },
        'marketwatch': {
            'handle': 'MarketWatch',
            'url': 'https://x.com/MarketWatch',
            'weight': 1.6,
            'specialization': ['markets', 'macro']
        },
        'unusual_whales': {
            'handle': 'unusual_whales',
            'url': 'https://x.com/unusual_whales',
            'weight': 1.5,
            'specialization': ['markets']
        },
        'financialtimes': {
            'handle': 'FinancialTimes',
            'url': 'https://x.com/FinancialTimes',
            'weight': 1.8,
            'specialization': ['markets', 'macro', 'geopolitical']
        },

        # ===== TIER 4: Economists & Analysis =====
        'economics': {
            'handle': 'economics',
            'url': 'https://x.com/economics',
            'weight': 1.7,
            'specialization': ['macro', 'geopolitical']
        },
        'ap': {
            'handle': 'AP',
            'url': 'https://x.com/AP',
            'weight': 1.7,
            'specialization': ['geopolitical', 'macro']
        },
        'afp': {
            'handle': 'AFP',
            'url': 'https://x.com/AFP',
            'weight': 1.7,
            'specialization': ['geopolitical', 'macro']
        },
        'ajenglish': {
            'handle': 'AJEnglish',
            'url': 'https://x.com/AJEnglish',
            'weight': 1.6,
            'specialization': ['geopolitical', 'macro']
        }
    }

    # Keyword lists for _categorize_text(); matched case-insensitively.
    MACRO_KEYWORDS = [
        'Fed', 'ECB', 'BoE', 'BoJ', 'FOMC', 'Powell', 'Lagarde',
        'interest rate', 'inflation', 'CPI', 'PPI', 'GDP',
        'unemployment', 'jobs report', 'NFP', 'central bank',
        'monetary policy', 'quantitative', 'recession'
    ]

    MARKET_KEYWORDS = [
        'S&P', 'Dow', 'Nasdaq', 'Russell', 'stocks', 'equities',
        'earnings', 'revenue', 'profit', 'shares', 'IPO',
        'merger', 'acquisition', 'crypto', 'Bitcoin', 'Ethereum',
        'oil', 'gold', 'commodities', 'futures', 'options'
    ]

    GEOPOLITICAL_KEYWORDS = [
        'war', 'conflict', 'sanctions', 'trade', 'tariff',
        'China', 'Russia', 'Ukraine', 'Taiwan', 'Middle East',
        'election', 'government', 'military', 'diplomatic',
        'treaty', 'EU', 'Brexit', 'OPEC'
    ]

    def __init__(self):
        """Initialize an empty cache; scrape_twitter_news() populates it."""
        self.news_cache: List[Dict] = []        # latest scraped items
        self.last_fetch: Optional[datetime] = None  # time of last successful scrape
        self.cache_ttl = 180  # seconds; mirrors the st.cache_data ttl below

    def _scrape_twitter_profile(self, source_name: str, source_info: Dict, timeout: int = 15) -> List[Dict]:
        """Scrape up to 20 recent tweets from one public Twitter/X profile.

        Args:
            source_name: key into SOURCES (used for logging only).
            source_info: SOURCES entry (handle, url, weight, specialization).
            timeout: per-navigation and per-selector timeout in seconds.

        Returns:
            List of normalized news-item dicts; empty list on any failure.
        """
        if not PLAYWRIGHT_AVAILABLE:
            logger.warning("Playwright not available")
            return []

        try:
            with sync_playwright() as p:
                # Launch lightweight headless browser
                browser = p.chromium.launch(
                    headless=True,
                    args=['--disable-blink-features=AutomationControlled']
                )
                try:
                    context = browser.new_context(
                        user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                    )
                    page = context.new_page()

                    # Block images, media, fonts and css for speed
                    def route_intercept(route):
                        if route.request.resource_type in ["image", "media", "font", "stylesheet"]:
                            route.abort()
                        else:
                            route.continue_()

                    page.route("**/*", route_intercept)

                    # Navigate to profile
                    logger.info(f"Scraping {source_name} from {source_info['url']}")
                    page.goto(source_info['url'], timeout=timeout * 1000)

                    # Wait for tweets to load
                    try:
                        page.wait_for_selector("article", timeout=timeout * 1000)
                    except PlaywrightTimeoutError:
                        logger.warning(f"Timeout waiting for tweets from {source_name}")
                        return []

                    # Extract tweet texts
                    tweet_elements = page.locator("article div[data-testid='tweetText']").all()

                    news_items = []
                    for idx, element in enumerate(tweet_elements[:20]):  # Limit to 20 most recent
                        try:
                            text = element.text_content()
                            if not text or len(text) < 10:
                                continue

                            # Clean and collapse whitespace
                            text = re.sub(r'\s+', ' ', text.strip())

                            # Skip retweets and replies
                            if text.startswith('RT @') or text.startswith('@'):
                                continue

                            # Categorize and analyze
                            category = self._categorize_text(text, source_info['specialization'])
                            sentiment = self._analyze_sentiment(text)
                            impact = self._assess_impact(source_info['weight'], text)
                            is_breaking = self._detect_breaking_news(text)

                            # Create summary (truncate long tweets)
                            summary = self._extract_summary(text) if len(text) > 150 else text

                            news_items.append({
                                'id': hash(f"{source_name}_{idx}_{datetime.now().isoformat()}"),
                                'title': text,
                                'summary': summary,
                                'source': source_info['handle'],
                                'category': category,
                                # x.com hides exact times without login; approximate by list position
                                'timestamp': datetime.now() - timedelta(minutes=idx),
                                'sentiment': sentiment,
                                'impact': impact,
                                'url': source_info['url'],
                                'likes': 0,
                                'retweets': 0,
                                'is_breaking': is_breaking,
                                'source_weight': source_info['weight'],
                                'from_web': True
                            })

                        except Exception as e:
                            logger.debug(f"Error parsing tweet from {source_name}: {e}")
                            continue

                    logger.info(f"Scraped {len(news_items)} tweets from {source_name}")
                    return news_items
                finally:
                    # BUGFIX: always release the browser — the original leaked it
                    # when page.goto() or tweet extraction raised.
                    browser.close()

        except Exception as e:
            logger.error(f"Error scraping {source_name}: {e}")
            return []

    @st.cache_data(ttl=180)
    def scrape_twitter_news(_self, max_tweets: int = 100) -> List[Dict]:
        """
        Scrape the latest financial news from Twitter using Playwright.

        Profiles are scraped in parallel (3 workers, 20 s hard cap each),
        near-duplicates are dropped, and results are sorted by breaking
        flag, then high impact, then recency.  Falls back to mock data when
        Playwright is missing or nothing could be fetched.

        The leading-underscore ``_self`` excludes the instance from
        Streamlit's cache key (st.cache_data convention).

        Args:
            max_tweets: maximum number of items to return.

        Returns:
            List of news-item dicts, newest/most urgent first.
        """
        if not PLAYWRIGHT_AVAILABLE:
            logger.info("Playwright not available - using mock data")
            return _self._remember(_self._get_mock_news())

        all_news = []
        seen_texts = set()

        # Scrape sources in parallel with a per-source timeout
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = []
            for name, info in _self.SOURCES.items():
                future = executor.submit(_self._scrape_twitter_profile, name, info, timeout=15)
                futures.append((future, name))

            for future, source_name in futures:
                try:
                    # Wait max 20 seconds per source
                    news_items = future.result(timeout=20)

                    # Deduplicate on the first 100 chars of the tweet text
                    unique_items = []
                    for item in news_items:
                        text_hash = hash(item['title'][:100])
                        if text_hash not in seen_texts:
                            seen_texts.add(text_hash)
                            unique_items.append(item)

                    all_news.extend(unique_items)
                    if len(unique_items) > 0:
                        logger.info(f"Fetched {len(unique_items)} unique tweets from {source_name}")

                except FuturesTimeoutError:
                    logger.warning(f"Timeout scraping {source_name}")
                except Exception as e:
                    logger.error(f"Error processing {source_name}: {e}")

        # If no news was fetched, use mock data
        if not all_news:
            logger.warning("No tweets fetched - using mock data")
            return _self._remember(_self._get_mock_news())

        # Sort by breaking news, then impact, then timestamp
        all_news.sort(
            key=lambda x: (x['is_breaking'], x['impact'] == 'high', x['timestamp']),
            reverse=True
        )

        logger.info(f"Total unique tweets: {len(all_news)}")
        return _self._remember(all_news[:max_tweets])

    def _remember(self, items: List[Dict]) -> List[Dict]:
        """Store *items* so get_statistics() sees them, then return them.

        BUGFIX: the original never populated news_cache/last_fetch, so the
        dashboard's Feed Statistics always showed zeros and 'Never'.
        NOTE: on st.cache_data hits the scrape body is skipped, so these
        attributes may lag by up to the cache ttl (180 s).
        """
        self.news_cache = items
        self.last_fetch = datetime.now()
        return items

    def _categorize_text(self, text: str, source_specialization: List[str]) -> str:
        """Return 'macro', 'markets' or 'geopolitical' for *text*.

        Counts case-insensitive keyword hits per category and multiplies by
        1.5 for categories the source specializes in; highest score wins
        (ties resolve in macro/markets/geopolitical order).
        """
        text_lower = text.lower()

        # Count keyword matches
        macro_score = sum(1 for kw in self.MACRO_KEYWORDS if kw.lower() in text_lower)
        market_score = sum(1 for kw in self.MARKET_KEYWORDS if kw.lower() in text_lower)
        geo_score = sum(1 for kw in self.GEOPOLITICAL_KEYWORDS if kw.lower() in text_lower)

        # Boost scores based on source specialization
        if 'macro' in source_specialization:
            macro_score *= 1.5
        if 'markets' in source_specialization:
            market_score *= 1.5
        if 'geopolitical' in source_specialization:
            geo_score *= 1.5

        # Return category with highest score
        scores = {'macro': macro_score, 'markets': market_score, 'geopolitical': geo_score}
        return max(scores, key=scores.get)

    def _analyze_sentiment(self, text: str) -> str:
        """Keyword-vote sentiment: 'positive', 'negative' or 'neutral'."""
        text_lower = text.lower()

        positive_keywords = ['surge', 'rally', 'gain', 'rise', 'up', 'bullish', 'strong', 'beat', 'exceed']
        negative_keywords = ['crash', 'plunge', 'fall', 'down', 'bearish', 'weak', 'miss', 'below', 'loss']

        pos_count = sum(1 for kw in positive_keywords if kw in text_lower)
        neg_count = sum(1 for kw in negative_keywords if kw in text_lower)

        if pos_count > neg_count:
            return 'positive'
        elif neg_count > pos_count:
            return 'negative'
        return 'neutral'

    def _assess_impact(self, source_weight: float, text: str) -> str:
        """Combine source weight and urgency keywords into 'high'/'medium'/'low'.

        Each keyword hit adds 0.3 to the source weight; >= 1.8 is 'high',
        >= 1.4 is 'medium'.  Note sources weighted >= 1.8 are always 'high'.
        """
        text_lower = text.lower()

        high_impact_keywords = ['breaking', 'alert', 'urgent', 'flash', 'fed', 'powell', 'rate', 'war']
        impact_score = sum(1 for kw in high_impact_keywords if kw in text_lower)

        # Combine source weight and keyword impact
        total_impact = source_weight + (impact_score * 0.3)

        if total_impact >= 1.8:
            return 'high'
        elif total_impact >= 1.4:
            return 'medium'
        return 'low'

    def _detect_breaking_news(self, text: str) -> bool:
        """True when *text* contains an urgency marker (case-insensitive).

        '*breaking*' is redundant with the plain 'breaking' substring test
        but kept for clarity of intent.
        """
        text_lower = text.lower()
        breaking_keywords = ['breaking', 'alert', 'urgent', 'flash', '*breaking*', '🚨']
        return any(kw in text_lower for kw in breaking_keywords)

    def _extract_summary(self, text: str) -> str:
        """Return *text* truncated to 150 characters with a '...' suffix."""
        if len(text) <= 150:
            return text
        return text[:147] + "..."

    def _get_mock_news(self) -> List[Dict]:
        """Return static sample items used when scraping is unavailable."""
        mock_news = [
            {
                'id': hash('mock1'),
                'title': 'Fed signals potential rate pause as inflation moderates',
                'summary': 'Fed signals potential rate pause as inflation moderates',
                'source': 'Mock Data',
                'category': 'macro',
                'timestamp': datetime.now() - timedelta(minutes=5),
                'sentiment': 'neutral',
                'impact': 'high',
                'url': 'https://x.com',
                'likes': 0,
                'retweets': 0,
                'is_breaking': False,
                'source_weight': 1.5,
                'from_web': True
            },
            {
                'id': hash('mock2'),
                'title': 'S&P 500 futures rise ahead of key earnings reports',
                'summary': 'S&P 500 futures rise ahead of key earnings reports',
                'source': 'Mock Data',
                'category': 'markets',
                'timestamp': datetime.now() - timedelta(minutes=15),
                'sentiment': 'positive',
                'impact': 'medium',
                'url': 'https://x.com',
                'likes': 0,
                'retweets': 0,
                'is_breaking': False,
                'source_weight': 1.5,
                'from_web': True
            }
        ]
        return mock_news

    def get_statistics(self) -> Dict:
        """Summarize the cached news for the dashboard sidebar.

        Returns a dict with 'total', 'high_impact', 'breaking',
        'last_update' (HH:MM:SS or 'Never') and 'by_category' counts.
        """
        if not self.news_cache:
            return {
                'total': 0,
                'high_impact': 0,
                'breaking': 0,
                'last_update': 'Never',
                'by_category': {}
            }

        df = pd.DataFrame(self.news_cache)
        return {
            'total': len(df),
            'high_impact': len(df[df['impact'] == 'high']),
            'breaking': len(df[df['is_breaking'] == True]),
            'last_update': self.last_fetch.strftime('%H:%M:%S') if self.last_fetch else 'Never',
            'by_category': df['category'].value_counts().to_dict()
        }
requirements.txt CHANGED
@@ -8,3 +8,5 @@ twikit>=2.3.0
8
  feedparser>=6.0.0
9
  beautifulsoup4>=4.12.0
10
  lxml>=5.0.0
 
 
 
8
  feedparser>=6.0.0
9
  beautifulsoup4>=4.12.0
10
  lxml>=5.0.0
11
+ ntscraper
12
+ playwright>=1.40.0