Dmitry Beresnev committed on
Commit
f6b2909
·
1 Parent(s): e9c5fb7

add twitter news

Browse files
app/pages/05_Dashboard.py CHANGED
@@ -18,17 +18,18 @@ from components.news import (
18
  display_breaking_news_banner
19
  )
20
 
21
- # Try to import RSS scraper first (most reliable), fall back to Twikit, then old snscrape
22
  try:
23
- from services.news_scraper import FinanceNewsScraper as FinanceNewsMonitor
24
- NEWS_SOURCE = "RSS Feeds"
25
  except ImportError:
26
- try:
27
- from services.news_monitor_twikit import FinanceNewsMonitor
28
- NEWS_SOURCE = "Twikit"
29
- except ImportError:
30
- from services.news_monitor import FinanceNewsMonitor
31
- NEWS_SOURCE = "snscrape"
 
32
 
33
 
34
  # ---- Page Configuration ----
@@ -42,11 +43,15 @@ st.set_page_config(
42
  # ---- Apply Dark Theme ----
43
  st.markdown(DARK_THEME_CSS, unsafe_allow_html=True)
44
 
45
- # Initialize news monitor (with caching)
46
- if 'news_monitor' not in st.session_state:
47
- st.session_state.news_monitor = FinanceNewsMonitor()
48
 
49
- monitor = st.session_state.news_monitor
 
 
 
 
50
 
51
  # ---- Header ----
52
  st.markdown("# 🤖 Live Financial News & AI Dashboard")
@@ -102,38 +107,48 @@ with st.sidebar:
102
  st.markdown("---")
103
  st.markdown("### 📊 Feed Statistics")
104
 
105
- # Get and display stats
106
- stats = monitor.get_statistics()
107
- st.metric("Total Stories", stats['total'])
108
- st.metric("High Impact", stats['high_impact'])
109
- st.metric("Breaking News", stats['breaking'])
110
- st.caption(f"Last update: {stats['last_update']}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
  st.markdown("---")
113
  st.markdown("### ℹ️ Sources")
114
 
115
- # Get actual source count
116
- total_sources = len(monitor.SOURCES)
 
 
117
 
118
  st.markdown(f"""
119
  <div style='font-size: 11px; line-height: 1.6;'>
120
 
121
- **Tier 1: Financial News (8)**
122
- ReutersBloomberg × 2 FT
123
- WSJThe EconomistCNBC
124
- • MarketWatch
125
-
126
- **Tier 2: Geopolitical (5)**
127
- • BBC World • AFP • Al Jazeera
128
- • Politico • DW News
129
-
130
- **Tier 3: Central Banks (7)**
131
- • Fed (2.0x) • ECB (2.0x) • Lagarde
132
- • BoE • IMF • World Bank • Treasury
133
 
134
- **Tier 4: Alpha Accounts (3)**
135
- Zero HedgeFirst Squawk
136
- Live Squawk
 
137
 
138
  **Total: {total_sources} Premium Sources**
139
  </div>
@@ -147,73 +162,108 @@ force_refresh = st.session_state.get('force_refresh', False)
147
  if force_refresh:
148
  st.session_state.force_refresh = False
149
 
150
- # Get filtered news
 
 
 
 
 
 
151
  with st.spinner("🔍 Fetching latest financial news..."):
152
- news_df = monitor.get_news(
153
- category=category_filter,
154
- sentiment=sentiment_filter,
155
- impact=impact_filter,
156
- refresh=force_refresh
157
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
- # Display demo mode notice if using mock data
160
- if len(news_df) > 0 and news_df.iloc[0].get('id', 0) < 100:
161
- st.info("📢 **Demo Mode**: Twitter/X API is currently unavailable. Displaying sample news data to showcase the platform's features. In production, this would show real-time financial news from 23 premium sources.")
162
 
163
- # Display breaking news banner if exists
164
- display_breaking_news_banner(news_df)
165
 
166
- # Statistics overview
167
- st.markdown("## 📊 News Feed Overview")
168
- stats = monitor.get_statistics()
169
- display_news_statistics(stats)
170
 
171
- st.markdown("<br>", unsafe_allow_html=True)
 
 
 
 
172
 
173
- # Category breakdown
174
- display_category_breakdown(stats)
 
 
 
 
175
 
176
  st.markdown("---")
177
 
178
- # ---- MAIN PAGE NEWS (Web-Scraped) ----
179
  st.markdown("## 🔥 Top Stories from Main Pages")
180
- st.caption("Latest headlines directly from news source homepages")
181
 
182
- main_page_df = monitor.get_main_page_news()
183
- if not main_page_df.empty:
184
- # Apply filters to main page news
185
- filtered_main = main_page_df.copy()
186
- if category_filter != 'all':
187
- filtered_main = filtered_main[filtered_main['category'] == category_filter]
188
- if sentiment_filter != 'all':
189
- filtered_main = filtered_main[filtered_main['sentiment'] == sentiment_filter]
190
- if impact_filter != 'all':
191
- filtered_main = filtered_main[filtered_main['impact'] == impact_filter]
192
-
193
- if not filtered_main.empty:
194
- display_news_feed(filtered_main, max_items=10)
195
- else:
196
- st.info("📭 No main page news matches your filters.")
197
  else:
198
  st.info("⏳ Main page news will appear here...")
199
 
200
  st.markdown("---")
201
 
202
- # ---- ALL NEWS FEED (RSS + Web) ----
203
  col1, col2, col3 = st.columns([2, 1, 1])
204
  with col1:
205
- st.markdown("## 📰 All News Feed")
206
  with col2:
207
  show_count = st.selectbox("Show", [10, 20, 50, 100], index=1, label_visibility="collapsed")
208
  with col3:
209
- if not news_df.empty:
210
- st.caption(f"Displaying {min(show_count, len(news_df))} of {len(news_df)} stories")
 
 
211
 
212
- # Display news feed
213
- if not news_df.empty:
214
- display_news_feed(news_df, max_items=show_count)
 
215
  else:
216
- st.info("📭 No news matches your current filters. Try adjusting the filters or refresh the feed.")
217
 
218
  # Auto-refresh logic
219
  if auto_refresh:
 
18
  display_breaking_news_banner
19
  )
20
 
21
+ # Import news scrapers
22
  try:
23
+ from services.news_scraper import FinanceNewsScraper
24
+ RSS_AVAILABLE = True
25
  except ImportError:
26
+ RSS_AVAILABLE = False
27
+
28
+ try:
29
+ from services.twitter_news_playwright import TwitterFinanceMonitor
30
+ TWITTER_AVAILABLE = True
31
+ except ImportError:
32
+ TWITTER_AVAILABLE = False
33
 
34
 
35
  # ---- Page Configuration ----
 
43
  # ---- Apply Dark Theme ----
44
  st.markdown(DARK_THEME_CSS, unsafe_allow_html=True)
45
 
46
+ # Initialize news monitors (with caching)
47
+ if 'rss_monitor' not in st.session_state and RSS_AVAILABLE:
48
+ st.session_state.rss_monitor = FinanceNewsScraper()
49
 
50
+ if 'twitter_monitor' not in st.session_state and TWITTER_AVAILABLE:
51
+ st.session_state.twitter_monitor = TwitterFinanceMonitor()
52
+
53
+ rss_monitor = st.session_state.get('rss_monitor')
54
+ twitter_monitor = st.session_state.get('twitter_monitor')
55
 
56
  # ---- Header ----
57
  st.markdown("# 🤖 Live Financial News & AI Dashboard")
 
107
  st.markdown("---")
108
  st.markdown("### 📊 Feed Statistics")
109
 
110
+ # Calculate combined stats
111
+ total_stories = 0
112
+ high_impact_count = 0
113
+ breaking_count = 0
114
+
115
+ if twitter_monitor:
116
+ twitter_stats = twitter_monitor.get_statistics()
117
+ total_stories += twitter_stats['total']
118
+ high_impact_count += twitter_stats['high_impact']
119
+ breaking_count += twitter_stats['breaking']
120
+
121
+ if rss_monitor:
122
+ rss_stats = rss_monitor.get_statistics()
123
+ total_stories += rss_stats['total']
124
+ high_impact_count += rss_stats['high_impact']
125
+ breaking_count += rss_stats['breaking']
126
+
127
+ st.metric("Total Stories", total_stories)
128
+ st.metric("High Impact", high_impact_count)
129
+ st.metric("Breaking News", breaking_count)
130
 
131
  st.markdown("---")
132
  st.markdown("### ℹ️ Sources")
133
 
134
+ # Count total sources
135
+ twitter_sources = len(twitter_monitor.SOURCES) if twitter_monitor else 0
136
+ rss_sources = len(rss_monitor.SOURCES) if rss_monitor else 0
137
+ total_sources = twitter_sources + rss_sources
138
 
139
  st.markdown(f"""
140
  <div style='font-size: 11px; line-height: 1.6;'>
141
 
142
+ **Twitter/X Accounts ({twitter_sources})**
143
+ • WalterBloomberg • FXHedge • DeItaone
144
+ • Reuters • Bloomberg • FT • WSJ
145
+ CNBC • BBC • MarketWatch
146
+ • The Economist • AP • AFP
 
 
 
 
 
 
 
147
 
148
+ **RSS + Web Scraping ({rss_sources})**
149
+ • CNBC • Bloomberg • FT • WSJ
150
+ • BBC • Yahoo Finance • The Economist
151
+ • Fed (2.0x) • ECB (2.0x) • IMF
152
 
153
  **Total: {total_sources} Premium Sources**
154
  </div>
 
162
  if force_refresh:
163
  st.session_state.force_refresh = False
164
 
165
+ # Fetch news from all sources
166
+ import pandas as pd
167
+
168
+ twitter_df = pd.DataFrame()
169
+ rss_all_df = pd.DataFrame()
170
+ rss_main_df = pd.DataFrame()
171
+
172
  with st.spinner("🔍 Fetching latest financial news..."):
173
+ # Fetch Twitter/X news (highest priority)
174
+ if twitter_monitor:
175
+ try:
176
+ twitter_news = twitter_monitor.scrape_twitter_news(max_tweets=50)
177
+ if twitter_news:
178
+ twitter_df = pd.DataFrame(twitter_news)
179
+ if not twitter_df.empty:
180
+ twitter_df['timestamp'] = pd.to_datetime(twitter_df['timestamp'])
181
+ except Exception as e:
182
+ st.warning(f"Twitter scraping unavailable: {e}")
183
+
184
+ # Fetch RSS + Web scraped news
185
+ if rss_monitor:
186
+ try:
187
+ rss_news = rss_monitor.scrape_news(max_items=100)
188
+ if rss_news:
189
+ rss_all_df = pd.DataFrame(rss_news)
190
+ if not rss_all_df.empty:
191
+ rss_all_df['timestamp'] = pd.to_datetime(rss_all_df['timestamp'])
192
+ # Get main page news subset
193
+ rss_main_df = rss_all_df[rss_all_df['from_web'] == True].copy()
194
+ except Exception as e:
195
+ st.warning(f"RSS scraping unavailable: {e}")
196
+
197
# Apply the sidebar filter selections to each dataset
def apply_filters(df):
    """Return *df* restricted to the sidebar's category/sentiment/impact picks.

    An empty frame is returned untouched; a value of 'all' for any filter
    leaves that dimension unrestricted.
    """
    if df.empty:
        return df
    filtered = df.copy()
    # Each (column, selection) pair narrows the frame unless set to 'all'.
    for column, selected in (
        ('category', category_filter),
        ('sentiment', sentiment_filter),
        ('impact', impact_filter),
    ):
        if selected != 'all':
            filtered = filtered[filtered[column] == selected]
    return filtered
209
 
210
+ twitter_filtered = apply_filters(twitter_df)
211
+ rss_main_filtered = apply_filters(rss_main_df)
212
+ rss_all_filtered = apply_filters(rss_all_df)
213
 
214
+ # Combine all for breaking news banner
215
+ all_news_df = pd.concat([twitter_filtered, rss_all_filtered], ignore_index=True) if not twitter_filtered.empty or not rss_all_filtered.empty else pd.DataFrame()
216
 
217
+ # Display breaking news banner
218
+ if not all_news_df.empty:
219
+ display_breaking_news_banner(all_news_df)
 
220
 
221
+ st.markdown("---")
222
+
223
+ # ---- SECTION 1: Twitter/X Breaking News (Highest Priority) ----
224
+ st.markdown("## 🐦 Twitter/X Financial News Feed")
225
+ st.caption("Real-time breaking news from premium Twitter/X accounts (WalterBloomberg, Reuters, Bloomberg, FT, etc.)")
226
 
227
+ if not twitter_filtered.empty:
228
+ display_news_feed(twitter_filtered, max_items=15)
229
+ elif not twitter_df.empty:
230
+ st.info("📭 No Twitter news matches your current filters.")
231
+ else:
232
+ st.info("⏳ Twitter news scraping in progress... This may take 30-60 seconds on first load.")
233
 
234
  st.markdown("---")
235
 
236
+ # ---- SECTION 2: Main Page News (Web-Scraped) ----
237
  st.markdown("## 🔥 Top Stories from Main Pages")
238
+ st.caption("Latest headlines directly scraped from news source homepages")
239
 
240
+ if not rss_main_filtered.empty:
241
+ display_news_feed(rss_main_filtered, max_items=10)
242
+ elif not rss_main_df.empty:
243
+ st.info("📭 No main page news matches your filters.")
 
 
 
 
 
 
 
 
 
 
 
244
  else:
245
  st.info("⏳ Main page news will appear here...")
246
 
247
  st.markdown("---")
248
 
249
+ # ---- SECTION 3: RSS Feed News (Lowest Priority) ----
250
  col1, col2, col3 = st.columns([2, 1, 1])
251
  with col1:
252
+ st.markdown("## 📰 RSS Feed News")
253
  with col2:
254
  show_count = st.selectbox("Show", [10, 20, 50, 100], index=1, label_visibility="collapsed")
255
  with col3:
256
+ if not rss_all_filtered.empty:
257
+ st.caption(f"Displaying {min(show_count, len(rss_all_filtered))} of {len(rss_all_filtered)} stories")
258
+
259
+ st.caption("Aggregated news from RSS feeds across all sources")
260
 
261
+ if not rss_all_filtered.empty:
262
+ display_news_feed(rss_all_filtered, max_items=show_count)
263
+ elif not rss_all_df.empty:
264
+ st.info("📭 No RSS news matches your current filters.")
265
  else:
266
+ st.info(" RSS feed news will appear here...")
267
 
268
  # Auto-refresh logic
269
  if auto_refresh:
app/services/twitter_news_playwright.py ADDED
@@ -0,0 +1,458 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Professional Finance News Monitor using Playwright
3
+ Real-time Twitter/X scraping without authentication
4
+ Optimized for low-latency trading decisions
5
+ """
6
+
7
+ import pandas as pd
8
+ from datetime import datetime, timedelta
9
+ from typing import List, Dict, Optional
10
+ import streamlit as st
11
+ import re
12
+ import logging
13
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
14
+
15
+ # Configure logging
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
+
19
+ try:
20
+ from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
21
+ PLAYWRIGHT_AVAILABLE = True
22
+ except ImportError:
23
+ PLAYWRIGHT_AVAILABLE = False
24
+ logger.warning("playwright not available. Install with: pip install playwright && playwright install chromium")
25
+
26
+
27
class TwitterFinanceMonitor:
    """
    Professional-grade financial news aggregator using Playwright.

    Scrapes public Twitter/X profiles (no authentication required) in
    parallel, tags each tweet with a category, sentiment and market-impact
    estimate, and keeps the most recent batch in ``self.news_cache`` so
    ``get_statistics()`` can summarize it for the dashboard sidebar.
    """

    # Premium financial Twitter accounts, grouped by tier.
    # 'weight' feeds _assess_impact(); 'specialization' biases _categorize_text().
    SOURCES = {
        # ===== TIER 1: Breaking News Aggregators =====
        'walter_bloomberg': {
            'handle': 'WalterBloomberg',
            'url': 'https://x.com/WalterBloomberg',
            'weight': 1.9,
            'specialization': ['macro', 'markets', 'geopolitical']
        },
        'fxhedge': {
            'handle': 'Fxhedgers',
            'url': 'https://x.com/Fxhedgers',
            'weight': 1.7,
            'specialization': ['macro', 'markets']
        },
        'deitaone': {
            'handle': 'DeItaone',
            'url': 'https://x.com/DeItaone',
            'weight': 1.8,
            'specialization': ['markets', 'macro']
        },
        'firstsquawk': {
            'handle': 'FirstSquawk',
            'url': 'https://x.com/FirstSquawk',
            'weight': 1.7,
            'specialization': ['markets', 'macro']
        },
        'livesquawk': {
            'handle': 'LiveSquawk',
            'url': 'https://x.com/LiveSquawk',
            'weight': 1.7,
            'specialization': ['markets', 'macro']
        },

        # ===== TIER 2: Major News Agencies =====
        'reuters': {
            'handle': 'Reuters',
            'url': 'https://x.com/Reuters',
            'weight': 1.9,
            'specialization': ['geopolitical', 'macro', 'markets']
        },
        'bloomberg': {
            'handle': 'business',
            'url': 'https://x.com/business',
            'weight': 1.9,
            'specialization': ['markets', 'macro']
        },
        'ft': {
            'handle': 'FT',
            'url': 'https://x.com/FT',
            'weight': 1.8,
            'specialization': ['markets', 'macro', 'geopolitical']
        },
        'wsj': {
            'handle': 'WSJ',
            'url': 'https://x.com/WSJ',
            'weight': 1.8,
            'specialization': ['markets', 'macro', 'geopolitical']
        },
        'cnbc': {
            'handle': 'CNBC',
            'url': 'https://x.com/CNBC',
            'weight': 1.6,
            'specialization': ['markets', 'macro']
        },
        'bbcbusiness': {
            'handle': 'BBCBusiness',
            'url': 'https://x.com/BBCBusiness',
            'weight': 1.7,
            'specialization': ['geopolitical', 'macro', 'markets']
        },

        # ===== TIER 3: Specialized Financial Media =====
        'zerohedge': {
            'handle': 'zerohedge',
            'url': 'https://x.com/zerohedge',
            'weight': 1.5,
            'specialization': ['macro', 'geopolitical', 'markets']
        },
        'marketwatch': {
            'handle': 'MarketWatch',
            'url': 'https://x.com/MarketWatch',
            'weight': 1.6,
            'specialization': ['markets', 'macro']
        },
        'unusual_whales': {
            'handle': 'unusual_whales',
            'url': 'https://x.com/unusual_whales',
            'weight': 1.5,
            'specialization': ['markets']
        },
        'financialtimes': {
            'handle': 'FinancialTimes',
            'url': 'https://x.com/FinancialTimes',
            'weight': 1.8,
            'specialization': ['markets', 'macro', 'geopolitical']
        },

        # ===== TIER 4: Economists & Analysis =====
        'economics': {
            'handle': 'economics',
            'url': 'https://x.com/economics',
            'weight': 1.7,
            'specialization': ['macro', 'geopolitical']
        },
        'ap': {
            'handle': 'AP',
            'url': 'https://x.com/AP',
            'weight': 1.7,
            'specialization': ['geopolitical', 'macro']
        },
        'afp': {
            'handle': 'AFP',
            'url': 'https://x.com/AFP',
            'weight': 1.7,
            'specialization': ['geopolitical', 'macro']
        },
        'ajenglish': {
            'handle': 'AJEnglish',
            'url': 'https://x.com/AJEnglish',
            'weight': 1.6,
            'specialization': ['geopolitical', 'macro']
        }
    }

    # Keyword lists for _categorize_text(); matched case-insensitively.
    MACRO_KEYWORDS = [
        'Fed', 'ECB', 'BoE', 'BoJ', 'FOMC', 'Powell', 'Lagarde',
        'interest rate', 'inflation', 'CPI', 'PPI', 'GDP',
        'unemployment', 'jobs report', 'NFP', 'central bank',
        'monetary policy', 'quantitative', 'recession'
    ]

    MARKET_KEYWORDS = [
        'S&P', 'Dow', 'Nasdaq', 'Russell', 'stocks', 'equities',
        'earnings', 'revenue', 'profit', 'shares', 'IPO',
        'merger', 'acquisition', 'crypto', 'Bitcoin', 'Ethereum',
        'oil', 'gold', 'commodities', 'futures', 'options'
    ]

    GEOPOLITICAL_KEYWORDS = [
        'war', 'conflict', 'sanctions', 'trade', 'tariff',
        'China', 'Russia', 'Ukraine', 'Taiwan', 'Middle East',
        'election', 'government', 'military', 'diplomatic',
        'treaty', 'EU', 'Brexit', 'OPEC'
    ]

    def __init__(self):
        """Initialize an empty cache; scrape_twitter_news() populates it."""
        self.news_cache: List[Dict] = []        # latest scraped items
        self.last_fetch: Optional[datetime] = None  # time of last successful scrape
        self.cache_ttl = 180  # seconds; mirrors the st.cache_data ttl below

    def _scrape_twitter_profile(self, source_name: str, source_info: Dict, timeout: int = 15) -> List[Dict]:
        """Scrape up to 20 recent tweets from one public Twitter/X profile.

        Args:
            source_name: key into SOURCES (used for logging only).
            source_info: SOURCES entry (handle, url, weight, specialization).
            timeout: per-navigation and per-selector timeout in seconds.

        Returns:
            List of normalized news-item dicts; empty list on any failure.
        """
        if not PLAYWRIGHT_AVAILABLE:
            logger.warning("Playwright not available")
            return []

        try:
            with sync_playwright() as p:
                # Launch lightweight headless browser
                browser = p.chromium.launch(
                    headless=True,
                    args=['--disable-blink-features=AutomationControlled']
                )
                try:
                    context = browser.new_context(
                        user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                    )
                    page = context.new_page()

                    # Block images, media, fonts and css for speed
                    def route_intercept(route):
                        if route.request.resource_type in ["image", "media", "font", "stylesheet"]:
                            route.abort()
                        else:
                            route.continue_()

                    page.route("**/*", route_intercept)

                    # Navigate to profile
                    logger.info(f"Scraping {source_name} from {source_info['url']}")
                    page.goto(source_info['url'], timeout=timeout * 1000)

                    # Wait for tweets to load
                    try:
                        page.wait_for_selector("article", timeout=timeout * 1000)
                    except PlaywrightTimeoutError:
                        logger.warning(f"Timeout waiting for tweets from {source_name}")
                        return []

                    # Extract tweet texts
                    tweet_elements = page.locator("article div[data-testid='tweetText']").all()

                    news_items = []
                    for idx, element in enumerate(tweet_elements[:20]):  # Limit to 20 most recent
                        try:
                            text = element.text_content()
                            if not text or len(text) < 10:
                                continue

                            # Clean and collapse whitespace
                            text = re.sub(r'\s+', ' ', text.strip())

                            # Skip retweets and replies
                            if text.startswith('RT @') or text.startswith('@'):
                                continue

                            # Categorize and analyze
                            category = self._categorize_text(text, source_info['specialization'])
                            sentiment = self._analyze_sentiment(text)
                            impact = self._assess_impact(source_info['weight'], text)
                            is_breaking = self._detect_breaking_news(text)

                            # Create summary (truncate long tweets)
                            summary = self._extract_summary(text) if len(text) > 150 else text

                            news_items.append({
                                'id': hash(f"{source_name}_{idx}_{datetime.now().isoformat()}"),
                                'title': text,
                                'summary': summary,
                                'source': source_info['handle'],
                                'category': category,
                                # x.com hides exact times without login; approximate by list position
                                'timestamp': datetime.now() - timedelta(minutes=idx),
                                'sentiment': sentiment,
                                'impact': impact,
                                'url': source_info['url'],
                                'likes': 0,
                                'retweets': 0,
                                'is_breaking': is_breaking,
                                'source_weight': source_info['weight'],
                                'from_web': True
                            })

                        except Exception as e:
                            logger.debug(f"Error parsing tweet from {source_name}: {e}")
                            continue

                    logger.info(f"Scraped {len(news_items)} tweets from {source_name}")
                    return news_items
                finally:
                    # BUGFIX: always release the browser — the original leaked it
                    # when page.goto() or tweet extraction raised.
                    browser.close()

        except Exception as e:
            logger.error(f"Error scraping {source_name}: {e}")
            return []

    @st.cache_data(ttl=180)
    def scrape_twitter_news(_self, max_tweets: int = 100) -> List[Dict]:
        """
        Scrape the latest financial news from Twitter using Playwright.

        Profiles are scraped in parallel (3 workers, 20 s hard cap each),
        near-duplicates are dropped, and results are sorted by breaking
        flag, then high impact, then recency.  Falls back to mock data when
        Playwright is missing or nothing could be fetched.

        The leading-underscore ``_self`` excludes the instance from
        Streamlit's cache key (st.cache_data convention).

        Args:
            max_tweets: maximum number of items to return.

        Returns:
            List of news-item dicts, newest/most urgent first.
        """
        if not PLAYWRIGHT_AVAILABLE:
            logger.info("Playwright not available - using mock data")
            return _self._remember(_self._get_mock_news())

        all_news = []
        seen_texts = set()

        # Scrape sources in parallel with a per-source timeout
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = []
            for name, info in _self.SOURCES.items():
                future = executor.submit(_self._scrape_twitter_profile, name, info, timeout=15)
                futures.append((future, name))

            for future, source_name in futures:
                try:
                    # Wait max 20 seconds per source
                    news_items = future.result(timeout=20)

                    # Deduplicate on the first 100 chars of the tweet text
                    unique_items = []
                    for item in news_items:
                        text_hash = hash(item['title'][:100])
                        if text_hash not in seen_texts:
                            seen_texts.add(text_hash)
                            unique_items.append(item)

                    all_news.extend(unique_items)
                    if len(unique_items) > 0:
                        logger.info(f"Fetched {len(unique_items)} unique tweets from {source_name}")

                except FuturesTimeoutError:
                    logger.warning(f"Timeout scraping {source_name}")
                except Exception as e:
                    logger.error(f"Error processing {source_name}: {e}")

        # If no news was fetched, use mock data
        if not all_news:
            logger.warning("No tweets fetched - using mock data")
            return _self._remember(_self._get_mock_news())

        # Sort by breaking news, then impact, then timestamp
        all_news.sort(
            key=lambda x: (x['is_breaking'], x['impact'] == 'high', x['timestamp']),
            reverse=True
        )

        logger.info(f"Total unique tweets: {len(all_news)}")
        return _self._remember(all_news[:max_tweets])

    def _remember(self, items: List[Dict]) -> List[Dict]:
        """Store *items* so get_statistics() sees them, then return them.

        BUGFIX: the original never populated news_cache/last_fetch, so the
        dashboard's Feed Statistics always showed zeros and 'Never'.
        NOTE: on st.cache_data hits the scrape body is skipped, so these
        attributes may lag by up to the cache ttl (180 s).
        """
        self.news_cache = items
        self.last_fetch = datetime.now()
        return items

    def _categorize_text(self, text: str, source_specialization: List[str]) -> str:
        """Return 'macro', 'markets' or 'geopolitical' for *text*.

        Counts case-insensitive keyword hits per category and multiplies by
        1.5 for categories the source specializes in; highest score wins
        (ties resolve in macro/markets/geopolitical order).
        """
        text_lower = text.lower()

        # Count keyword matches
        macro_score = sum(1 for kw in self.MACRO_KEYWORDS if kw.lower() in text_lower)
        market_score = sum(1 for kw in self.MARKET_KEYWORDS if kw.lower() in text_lower)
        geo_score = sum(1 for kw in self.GEOPOLITICAL_KEYWORDS if kw.lower() in text_lower)

        # Boost scores based on source specialization
        if 'macro' in source_specialization:
            macro_score *= 1.5
        if 'markets' in source_specialization:
            market_score *= 1.5
        if 'geopolitical' in source_specialization:
            geo_score *= 1.5

        # Return category with highest score
        scores = {'macro': macro_score, 'markets': market_score, 'geopolitical': geo_score}
        return max(scores, key=scores.get)

    def _analyze_sentiment(self, text: str) -> str:
        """Keyword-vote sentiment: 'positive', 'negative' or 'neutral'."""
        text_lower = text.lower()

        positive_keywords = ['surge', 'rally', 'gain', 'rise', 'up', 'bullish', 'strong', 'beat', 'exceed']
        negative_keywords = ['crash', 'plunge', 'fall', 'down', 'bearish', 'weak', 'miss', 'below', 'loss']

        pos_count = sum(1 for kw in positive_keywords if kw in text_lower)
        neg_count = sum(1 for kw in negative_keywords if kw in text_lower)

        if pos_count > neg_count:
            return 'positive'
        elif neg_count > pos_count:
            return 'negative'
        return 'neutral'

    def _assess_impact(self, source_weight: float, text: str) -> str:
        """Combine source weight and urgency keywords into 'high'/'medium'/'low'.

        Each keyword hit adds 0.3 to the source weight; >= 1.8 is 'high',
        >= 1.4 is 'medium'.  Note sources weighted >= 1.8 are always 'high'.
        """
        text_lower = text.lower()

        high_impact_keywords = ['breaking', 'alert', 'urgent', 'flash', 'fed', 'powell', 'rate', 'war']
        impact_score = sum(1 for kw in high_impact_keywords if kw in text_lower)

        # Combine source weight and keyword impact
        total_impact = source_weight + (impact_score * 0.3)

        if total_impact >= 1.8:
            return 'high'
        elif total_impact >= 1.4:
            return 'medium'
        return 'low'

    def _detect_breaking_news(self, text: str) -> bool:
        """True when *text* contains an urgency marker (case-insensitive).

        '*breaking*' is redundant with the plain 'breaking' substring test
        but kept for clarity of intent.
        """
        text_lower = text.lower()
        breaking_keywords = ['breaking', 'alert', 'urgent', 'flash', '*breaking*', '🚨']
        return any(kw in text_lower for kw in breaking_keywords)

    def _extract_summary(self, text: str) -> str:
        """Return *text* truncated to 150 characters with a '...' suffix."""
        if len(text) <= 150:
            return text
        return text[:147] + "..."

    def _get_mock_news(self) -> List[Dict]:
        """Return static sample items used when scraping is unavailable."""
        mock_news = [
            {
                'id': hash('mock1'),
                'title': 'Fed signals potential rate pause as inflation moderates',
                'summary': 'Fed signals potential rate pause as inflation moderates',
                'source': 'Mock Data',
                'category': 'macro',
                'timestamp': datetime.now() - timedelta(minutes=5),
                'sentiment': 'neutral',
                'impact': 'high',
                'url': 'https://x.com',
                'likes': 0,
                'retweets': 0,
                'is_breaking': False,
                'source_weight': 1.5,
                'from_web': True
            },
            {
                'id': hash('mock2'),
                'title': 'S&P 500 futures rise ahead of key earnings reports',
                'summary': 'S&P 500 futures rise ahead of key earnings reports',
                'source': 'Mock Data',
                'category': 'markets',
                'timestamp': datetime.now() - timedelta(minutes=15),
                'sentiment': 'positive',
                'impact': 'medium',
                'url': 'https://x.com',
                'likes': 0,
                'retweets': 0,
                'is_breaking': False,
                'source_weight': 1.5,
                'from_web': True
            }
        ]
        return mock_news

    def get_statistics(self) -> Dict:
        """Summarize the cached news for the dashboard sidebar.

        Returns a dict with 'total', 'high_impact', 'breaking',
        'last_update' (HH:MM:SS or 'Never') and 'by_category' counts.
        """
        if not self.news_cache:
            return {
                'total': 0,
                'high_impact': 0,
                'breaking': 0,
                'last_update': 'Never',
                'by_category': {}
            }

        df = pd.DataFrame(self.news_cache)
        return {
            'total': len(df),
            'high_impact': len(df[df['impact'] == 'high']),
            'breaking': len(df[df['is_breaking'] == True]),
            'last_update': self.last_fetch.strftime('%H:%M:%S') if self.last_fetch else 'Never',
            'by_category': df['category'].value_counts().to_dict()
        }
requirements.txt CHANGED
@@ -8,3 +8,5 @@ twikit>=2.3.0
8
  feedparser>=6.0.0
9
  beautifulsoup4>=4.12.0
10
  lxml>=5.0.0
 
 
 
8
  feedparser>=6.0.0
9
  beautifulsoup4>=4.12.0
10
  lxml>=5.0.0
11
+ ntscraper
12
+ playwright>=1.40.0