Dmitry Beresnev committed on
Commit
e918eaf
·
1 Parent(s): 07b3173

add AI news feed

Browse files
app/pages/05_Dashboard.py CHANGED
@@ -38,6 +38,12 @@ try:
38
  except ImportError:
39
  REDDIT_AVAILABLE = False
40
 
 
 
 
 
 
 
41
 
42
  # ---- Page Configuration ----
43
  st.set_page_config(
@@ -60,9 +66,13 @@ if 'twitter_monitor' not in st.session_state and TWITTER_AVAILABLE:
60
  if 'reddit_monitor' not in st.session_state and REDDIT_AVAILABLE:
61
  st.session_state.reddit_monitor = RedditFinanceMonitor()
62
 
 
 
 
63
  rss_monitor = st.session_state.get('rss_monitor')
64
  twitter_monitor = st.session_state.get('twitter_monitor')
65
  reddit_monitor = st.session_state.get('reddit_monitor')
 
66
 
67
  # Initialize unified cache manager
68
  if 'news_cache_manager' not in st.session_state:
@@ -132,7 +142,8 @@ with st.sidebar:
132
  total_stories = (
133
  cache_stats['twitter']['items'] +
134
  cache_stats['reddit']['items'] +
135
- cache_stats['rss']['items']
 
136
  )
137
 
138
  # Display metrics
@@ -153,7 +164,8 @@ with st.sidebar:
153
  twitter_sources = len(twitter_monitor.SOURCES) if twitter_monitor else 0
154
  reddit_sources = len(reddit_monitor.SUBREDDITS) if reddit_monitor else 0
155
  rss_sources = len(rss_monitor.SOURCES) if rss_monitor else 0
156
- total_sources = twitter_sources + reddit_sources + rss_sources
 
157
 
158
  st.markdown(f"""
159
  <div style='font-size: 11px; line-height: 1.6;'>
@@ -174,6 +186,12 @@ with st.sidebar:
174
  • BBC • Yahoo Finance • The Economist
175
  • Fed (2.0x) • ECB (2.0x) • IMF
176
 
 
 
 
 
 
 
177
  **Total: {total_sources} Premium Sources**
178
  </div>
179
  """, unsafe_allow_html=True)
@@ -192,6 +210,7 @@ twitter_df = pd.DataFrame()
192
  reddit_df = pd.DataFrame()
193
  rss_all_df = pd.DataFrame()
194
  rss_main_df = pd.DataFrame()
 
195
 
196
  def fetch_twitter_news():
197
  """Fetch Twitter/X news via cache manager"""
@@ -254,19 +273,42 @@ def fetch_rss_news():
254
  return pd.DataFrame(), f"RSS scraping unavailable: {e}"
255
  return pd.DataFrame(), None
256
 
257
- with st.spinner("🔍 Fetching latest financial news in parallel..."):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  # Execute all news fetching operations in parallel using ThreadPoolExecutor
259
- with ThreadPoolExecutor(max_workers=3) as executor:
260
  # Submit all tasks
261
  future_twitter = executor.submit(fetch_twitter_news)
262
  future_reddit = executor.submit(fetch_reddit_news)
263
  future_rss = executor.submit(fetch_rss_news)
 
264
 
265
  # Collect results as they complete
266
  futures = {
267
  'twitter': future_twitter,
268
  'reddit': future_reddit,
269
- 'rss': future_rss
 
270
  }
271
 
272
  for source_name, future in futures.items():
@@ -285,6 +327,10 @@ with st.spinner("🔍 Fetching latest financial news in parallel..."):
285
  rss_all_df = result_df
286
  if error:
287
  st.warning(error)
 
 
 
 
288
  # Get main page news subset for RSS
289
  if not rss_all_df.empty and 'from_web' in rss_all_df.columns:
290
  rss_main_df = rss_all_df[rss_all_df['from_web'] == True].copy()
@@ -338,9 +384,9 @@ if not all_news_df.empty:
338
 
339
  st.markdown("---")
340
 
341
- # ---- THREE-COLUMN SCROLLABLE NEWS LAYOUT (TradingView Style) ----
342
 
343
- col1, col2, col3 = st.columns(3)
344
 
345
  with col1:
346
  # SECTION 1: Twitter/X & Reddit Breaking News
@@ -447,6 +493,33 @@ with col3:
447
  </style>
448
  """, unsafe_allow_html=True)
449
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
450
  # Auto-refresh logic
451
  if auto_refresh:
452
  import time
 
38
  except ImportError:
39
  REDDIT_AVAILABLE = False
40
 
41
+ try:
42
+ from services.ai_tech_news import AITechNewsScraper
43
+ AI_TECH_AVAILABLE = True
44
+ except ImportError:
45
+ AI_TECH_AVAILABLE = False
46
+
47
 
48
  # ---- Page Configuration ----
49
  st.set_page_config(
 
66
  if 'reddit_monitor' not in st.session_state and REDDIT_AVAILABLE:
67
  st.session_state.reddit_monitor = RedditFinanceMonitor()
68
 
69
+ if 'ai_tech_monitor' not in st.session_state and AI_TECH_AVAILABLE:
70
+ st.session_state.ai_tech_monitor = AITechNewsScraper()
71
+
72
  rss_monitor = st.session_state.get('rss_monitor')
73
  twitter_monitor = st.session_state.get('twitter_monitor')
74
  reddit_monitor = st.session_state.get('reddit_monitor')
75
+ ai_tech_monitor = st.session_state.get('ai_tech_monitor')
76
 
77
  # Initialize unified cache manager
78
  if 'news_cache_manager' not in st.session_state:
 
142
  total_stories = (
143
  cache_stats['twitter']['items'] +
144
  cache_stats['reddit']['items'] +
145
+ cache_stats['rss']['items'] +
146
+ cache_stats.get('ai_tech', {}).get('items', 0)
147
  )
148
 
149
  # Display metrics
 
164
  twitter_sources = len(twitter_monitor.SOURCES) if twitter_monitor else 0
165
  reddit_sources = len(reddit_monitor.SUBREDDITS) if reddit_monitor else 0
166
  rss_sources = len(rss_monitor.SOURCES) if rss_monitor else 0
167
+ ai_tech_sources = len(ai_tech_monitor.SOURCES) if ai_tech_monitor else 0
168
+ total_sources = twitter_sources + reddit_sources + rss_sources + ai_tech_sources
169
 
170
  st.markdown(f"""
171
  <div style='font-size: 11px; line-height: 1.6;'>
 
186
  • BBC • Yahoo Finance • The Economist
187
  • Fed (2.0x) • ECB (2.0x) • IMF
188
 
189
+ **AI & Tech Sources ({ai_tech_sources})**
190
+ • OpenAI • Google AI • Microsoft AI • Meta AI
191
+ • DeepMind • Anthropic • AWS AI • NVIDIA
192
+ • TechCrunch • The Verge • VentureBeat
193
+ • MIT Tech Review • Wired • Ars Technica
194
+
195
  **Total: {total_sources} Premium Sources**
196
  </div>
197
  """, unsafe_allow_html=True)
 
210
  reddit_df = pd.DataFrame()
211
  rss_all_df = pd.DataFrame()
212
  rss_main_df = pd.DataFrame()
213
+ ai_tech_df = pd.DataFrame()
214
 
215
  def fetch_twitter_news():
216
  """Fetch Twitter/X news via cache manager"""
 
273
  return pd.DataFrame(), f"RSS scraping unavailable: {e}"
274
  return pd.DataFrame(), None
275
 
276
def fetch_ai_tech_news():
    """Fetch AI/Tech news through the unified cache manager.

    Returns:
        (DataFrame, error) tuple: a DataFrame of news items (empty on
        failure or when the monitor is unavailable) and an optional
        human-readable error string for the sidebar warning.
    """
    # Guard clause: no monitor means the AI/Tech service failed to import.
    if not ai_tech_monitor:
        return pd.DataFrame(), None
    try:
        # Smart caching: only hits the network when stale or forced.
        items = cache_manager.get_news(
            source='ai_tech',
            fetcher_func=ai_tech_monitor.scrape_ai_tech_news,
            force_refresh=force_refresh,
            max_items=100,
            hours=48
        )
        if items:
            frame = pd.DataFrame(items)
            if not frame.empty:
                # Normalize timestamps so downstream sorting/filtering works.
                frame['timestamp'] = pd.to_datetime(frame['timestamp'])
                return frame, None
    except Exception as e:
        # Best-effort: surface the problem as a warning, never crash the page.
        return pd.DataFrame(), f"AI/Tech news unavailable: {e}"
    return pd.DataFrame(), None
296
+
297
+ with st.spinner("🔍 Fetching latest financial & tech news in parallel..."):
298
  # Execute all news fetching operations in parallel using ThreadPoolExecutor
299
+ with ThreadPoolExecutor(max_workers=4) as executor:
300
  # Submit all tasks
301
  future_twitter = executor.submit(fetch_twitter_news)
302
  future_reddit = executor.submit(fetch_reddit_news)
303
  future_rss = executor.submit(fetch_rss_news)
304
+ future_ai_tech = executor.submit(fetch_ai_tech_news)
305
 
306
  # Collect results as they complete
307
  futures = {
308
  'twitter': future_twitter,
309
  'reddit': future_reddit,
310
+ 'rss': future_rss,
311
+ 'ai_tech': future_ai_tech
312
  }
313
 
314
  for source_name, future in futures.items():
 
327
  rss_all_df = result_df
328
  if error:
329
  st.warning(error)
330
+ elif source_name == 'ai_tech':
331
+ ai_tech_df = result_df
332
+ if error:
333
+ st.warning(error)
334
  # Get main page news subset for RSS
335
  if not rss_all_df.empty and 'from_web' in rss_all_df.columns:
336
  rss_main_df = rss_all_df[rss_all_df['from_web'] == True].copy()
 
384
 
385
  st.markdown("---")
386
 
387
+ # ---- FOUR-COLUMN SCROLLABLE NEWS LAYOUT (TradingView Style) ----
388
 
389
+ col1, col2, col3, col4 = st.columns(4)
390
 
391
  with col1:
392
  # SECTION 1: Twitter/X & Reddit Breaking News
 
493
  </style>
494
  """, unsafe_allow_html=True)
495
 
496
+ with col4:
497
+ # SECTION 4: AI & Tech News
498
+ if not ai_tech_df.empty:
499
+ display_scrollable_news_section(
500
+ ai_tech_df,
501
+ section_title="AI & Tech News",
502
+ section_icon="🤖",
503
+ section_subtitle="Latest from tech giants & AI research",
504
+ max_items=100,
505
+ height="700px"
506
+ )
507
+ else:
508
+ st.markdown("""
509
+ <div style="background: linear-gradient(135deg, #1E222D 0%, #131722 100%); border: 1px solid #2A2E39; border-radius: 8px; padding: 30px; text-align: center;">
510
+ <div style="font-size: 48px; margin-bottom: 16px; animation: pulse 2s ease-in-out infinite;">⏳</div>
511
+ <div style="color: #D1D4DC; font-size: 16px; font-weight: 600; margin-bottom: 8px;">Loading AI & Tech News</div>
512
+ <div style="color: #787B86; font-size: 13px;">Aggregating from tech blogs & research...</div>
513
+ <div style="color: #787B86; font-size: 12px; margin-top: 8px; opacity: 0.7;">OpenAI, Google AI, Microsoft, Meta & more</div>
514
+ </div>
515
+ <style>
516
+ @keyframes pulse {
517
+ 0%, 100% { opacity: 1; transform: scale(1); }
518
+ 50% { opacity: 0.6; transform: scale(1.1); }
519
+ }
520
+ </style>
521
+ """, unsafe_allow_html=True)
522
+
523
  # Auto-refresh logic
524
  if auto_refresh:
525
  import time
app/services/ai_tech_news.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AI & Tech News Scraper
3
+ Fetches news from popular tech resources and big tech company blogs
4
+ """
5
+
6
+ import feedparser
7
+ import requests
8
+ from bs4 import BeautifulSoup
9
+ from datetime import datetime, timedelta
10
+ from typing import List, Dict
11
+ import logging
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class AITechNewsScraper:
    """Scraper for AI and tech news from major sources and company blogs.

    Pulls RSS feeds from tech media outlets, big-tech company blogs, and
    research labs, normalizes each entry into the dashboard's common
    news-item schema, and tags items with heuristic impact / sentiment /
    breaking-news labels based on keyword matching.
    """

    # AI/Tech News Sources (RSS + Web)
    SOURCES = {
        # Major Tech News
        'TechCrunch AI': {
            'url': 'https://techcrunch.com/category/artificial-intelligence/feed/',
            'type': 'rss',
            'category': 'ai'
        },
        'The Verge AI': {
            'url': 'https://www.theverge.com/ai-artificial-intelligence/rss/index.xml',
            'type': 'rss',
            'category': 'ai'
        },
        'VentureBeat AI': {
            'url': 'https://venturebeat.com/category/ai/feed/',
            'type': 'rss',
            'category': 'ai'
        },
        'MIT Technology Review AI': {
            'url': 'https://www.technologyreview.com/topic/artificial-intelligence/feed',
            'type': 'rss',
            'category': 'ai'
        },
        'Ars Technica AI': {
            'url': 'https://feeds.arstechnica.com/arstechnica/technology-lab',
            'type': 'rss',
            'category': 'tech'
        },
        'Wired AI': {
            'url': 'https://www.wired.com/feed/tag/ai/latest/rss',
            'type': 'rss',
            'category': 'ai'
        },

        # Big Tech Company Blogs
        'OpenAI Blog': {
            'url': 'https://openai.com/blog/rss.xml',
            'type': 'rss',
            'category': 'ai'
        },
        'Google AI Blog': {
            'url': 'https://blog.google/technology/ai/rss/',
            'type': 'rss',
            'category': 'ai'
        },
        'Microsoft AI Blog': {
            'url': 'https://blogs.microsoft.com/ai/feed/',
            'type': 'rss',
            'category': 'ai'
        },
        'Meta AI Blog': {
            'url': 'https://ai.meta.com/blog/rss/',
            'type': 'rss',
            'category': 'ai'
        },
        'DeepMind Blog': {
            'url': 'https://deepmind.google/blog/rss.xml',
            'type': 'rss',
            'category': 'ai'
        },
        'Anthropic News': {
            'url': 'https://www.anthropic.com/news/rss.xml',
            'type': 'rss',
            'category': 'ai'
        },
        'AWS AI Blog': {
            'url': 'https://aws.amazon.com/blogs/machine-learning/feed/',
            'type': 'rss',
            'category': 'ai'
        },
        'NVIDIA AI Blog': {
            'url': 'https://blogs.nvidia.com/feed/',
            'type': 'rss',
            'category': 'ai'
        },

        # Research & Academia
        'Stanford HAI': {
            'url': 'https://hai.stanford.edu/news/rss.xml',
            'type': 'rss',
            'category': 'research'
        },
        'Berkeley AI Research': {
            'url': 'https://bair.berkeley.edu/blog/feed.xml',
            'type': 'rss',
            'category': 'research'
        },
    }

    def __init__(self):
        """Initialize the AI/Tech news scraper."""
        # NOTE(review): this session is currently unused — feedparser.parse()
        # fetches feeds with its own HTTP machinery. Kept for potential direct
        # requests; pass agent=... to feedparser.parse if the UA header matters.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        })

    def scrape_ai_tech_news(self, max_items: int = 100, hours: int = 48) -> List[Dict]:
        """
        Scrape AI and tech news from all sources.

        Args:
            max_items: Maximum number of news items to return
            hours: Only include news from the last N hours

        Returns:
            List of news items with standardized format, newest first.
        """
        all_news = []
        # NOTE(review): feedparser's *_parsed fields are UTC struct_times,
        # while datetime.now() is local time — items near the cutoff can be
        # off by the local UTC offset. Confirm whether this matters for a
        # 48-hour window before tightening it.
        cutoff_time = datetime.now() - timedelta(hours=hours)

        for source_name, source_config in self.SOURCES.items():
            try:
                if source_config['type'] == 'rss':
                    news_items = self._scrape_rss_feed(
                        source_name,
                        source_config['url'],
                        source_config['category'],
                        cutoff_time
                    )
                    all_news.extend(news_items)
                    logger.info(f"Scraped {len(news_items)} items from {source_name}")

            except Exception as e:
                # One broken feed must not take down the whole aggregation.
                logger.error(f"Error scraping {source_name}: {e}")
                continue

        # Sort by timestamp (newest first)
        all_news.sort(key=lambda x: x['timestamp'], reverse=True)

        # Limit to max_items
        return all_news[:max_items]

    def _scrape_rss_feed(self, source_name: str, feed_url: str,
                         category: str, cutoff_time: datetime) -> List[Dict]:
        """Scrape a single RSS feed into standardized news-item dicts."""
        news_items = []

        try:
            feed = feedparser.parse(feed_url)

            for entry in feed.entries:
                try:
                    # Parse timestamp; fall back to "now" when the feed
                    # carries no usable date so the item is not dropped.
                    if hasattr(entry, 'published_parsed') and entry.published_parsed:
                        timestamp = datetime(*entry.published_parsed[:6])
                    elif hasattr(entry, 'updated_parsed') and entry.updated_parsed:
                        timestamp = datetime(*entry.updated_parsed[:6])
                    else:
                        timestamp = datetime.now()

                    # Skip old news
                    if timestamp < cutoff_time:
                        continue

                    # Extract title and summary
                    title = entry.get('title', 'No title')
                    summary = entry.get('summary', entry.get('description', ''))

                    # Clean HTML from summary
                    if summary:
                        soup = BeautifulSoup(summary, 'html.parser')
                        summary = soup.get_text().strip()
                        # Limit summary length
                        if len(summary) > 300:
                            summary = summary[:297] + '...'

                    # Determine impact and sentiment based on keywords
                    impact = self._determine_impact(title, summary)
                    sentiment = self._determine_sentiment(title, summary)

                    news_item = {
                        'title': title,
                        'summary': summary or title,
                        'source': source_name,
                        'url': entry.get('link', ''),
                        'timestamp': timestamp,
                        'category': category,
                        'impact': impact,
                        'sentiment': sentiment,
                        'is_breaking': self._is_breaking_news(title, summary),
                        # Zeroed engagement fields keep the schema aligned with
                        # the Twitter/Reddit monitors; RSS has no such data.
                        'likes': 0,  # No engagement data for RSS
                        'retweets': 0,
                        'reddit_score': 0,
                        'reddit_comments': 0
                    }

                    news_items.append(news_item)

                except Exception as e:
                    logger.error(f"Error parsing entry from {source_name}: {e}")
                    continue

        except Exception as e:
            logger.error(f"Error fetching RSS feed {feed_url}: {e}")

        return news_items

    def _determine_impact(self, title: str, summary: str) -> str:
        """Classify an item's impact as 'high', 'medium', or 'low'.

        Plain substring matching: short keywords can hit inside longer
        words (e.g. 'ban' in 'urban') — acceptable for a coarse feed label.
        """
        text = f"{title} {summary}".lower()

        high_impact_keywords = [
            'breakthrough', 'announce', 'launch', 'release', 'new model',
            'gpt', 'claude', 'gemini', 'llama', 'chatgpt',
            'billion', 'trillion', 'acquisition', 'merger',
            'regulation', 'ban', 'lawsuit', 'security breach',
            'major', 'significant', 'revolutionary', 'first-ever'
        ]

        medium_impact_keywords = [
            'update', 'improve', 'enhance', 'study', 'research',
            'partnership', 'collaboration', 'funding', 'investment',
            'expands', 'grows', 'adopts', 'implements'
        ]

        # High tier wins over medium; any() short-circuits on first hit.
        if any(keyword in text for keyword in high_impact_keywords):
            return 'high'
        if any(keyword in text for keyword in medium_impact_keywords):
            return 'medium'
        return 'low'

    def _determine_sentiment(self, title: str, summary: str) -> str:
        """Classify sentiment as 'positive', 'negative', or 'neutral'
        by comparing positive vs. negative keyword hit counts."""
        text = f"{title} {summary}".lower()

        positive_keywords = [
            'breakthrough', 'success', 'achieve', 'improve', 'advance',
            'innovative', 'revolutionary', 'launch', 'release', 'win',
            'growth', 'expand', 'partnership', 'collaboration'
        ]

        negative_keywords = [
            'fail', 'issue', 'problem', 'concern', 'worry', 'risk',
            'ban', 'lawsuit', 'breach', 'hack', 'leak', 'crisis',
            'decline', 'loss', 'shutdown', 'controversy'
        ]

        positive_count = sum(1 for kw in positive_keywords if kw in text)
        negative_count = sum(1 for kw in negative_keywords if kw in text)

        if positive_count > negative_count:
            return 'positive'
        elif negative_count > positive_count:
            return 'negative'
        else:
            return 'neutral'

    def _is_breaking_news(self, title: str, summary: str) -> bool:
        """Return True when the text contains a breaking-news indicator."""
        text = f"{title} {summary}".lower()

        breaking_indicators = [
            'breaking', 'just announced', 'just released', 'just launched',
            'alert', 'urgent', 'developing', 'live', 'now:'
        ]

        return any(indicator in text for indicator in breaking_indicators)

    def get_statistics(self) -> Dict:
        """Get statistics - returns empty for backward compatibility.

        Real counts are tracked by the unified cache manager; this stub
        keeps the monitor interface uniform across news sources.
        """
        return {
            'total': 0,
            'high_impact': 0,
            'breaking': 0,
            'last_update': 'Managed by cache',
            'by_category': {
                'ai': 0,
                'tech': 0,
                'research': 0
            }
        }