Dmitry Beresnev commited on
Commit
acede88
·
1 Parent(s): d34f6ef

add news scraper

Browse files
app/pages/05_Dashboard.py CHANGED
@@ -18,13 +18,17 @@ from components.news import (
18
  display_breaking_news_banner
19
  )
20
 
21
- # Try to import Twikit version first, fall back to old version
22
  try:
23
- from services.news_monitor_twikit import FinanceNewsMonitor
24
- USING_TWIKIT = True
25
  except ImportError:
26
- from services.news_monitor import FinanceNewsMonitor
27
- USING_TWIKIT = False
 
 
 
 
28
 
29
 
30
  # ---- Page Configuration ----
@@ -210,7 +214,7 @@ st.markdown("""
210
  - Breaking news (🔴) indicates urgent market-moving information
211
  - Check engagement metrics (likes + retweets) for news importance
212
 
213
- **Data Source:** Live tweets from premium financial news sources via Twikit
214
  **Update Frequency:** 3-minute cache for low-latency delivery
215
- **Authentication:** Requires Twitter/X account credentials in .env file
216
  """)
 
18
  display_breaking_news_banner
19
  )
20
 
21
+ # Try to import RSS scraper first (most reliable), fall back to Twikit, then old snscrape
22
  try:
23
+ from services.news_scraper import FinanceNewsScraper as FinanceNewsMonitor
24
+ NEWS_SOURCE = "RSS Feeds"
25
  except ImportError:
26
+ try:
27
+ from services.news_monitor_twikit import FinanceNewsMonitor
28
+ NEWS_SOURCE = "Twikit"
29
+ except ImportError:
30
+ from services.news_monitor import FinanceNewsMonitor
31
+ NEWS_SOURCE = "snscrape"
32
 
33
 
34
  # ---- Page Configuration ----
 
214
  - Breaking news (🔴) indicates urgent market-moving information
215
  - Check engagement metrics (likes + retweets) for news importance
216
 
217
+ **Data Source:** Dual-mode scraping - RSS feeds + direct web page parsing from Reuters, Bloomberg, FT, WSJ, CNBC, Fed, ECB and more
218
  **Update Frequency:** 3-minute cache for low-latency delivery
219
+ **No Authentication Required:** Public sources - works out of the box
220
  """)
app/services/news_scraper.py ADDED
@@ -0,0 +1,528 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Professional Finance News Scraper - Direct from Source Websites
3
+ Scrapes: Reuters, Bloomberg, FT, WSJ, CNBC, MarketWatch, etc.
4
+ No Twitter API needed - direct RSS and web scraping
5
+ """
6
+
7
import calendar
import hashlib
import logging
import re
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
from typing import Dict, List, Optional
from urllib.parse import urljoin

import feedparser
import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
17
+
18
+ # Configure logging
19
+ logging.basicConfig(level=logging.INFO)
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class FinanceNewsScraper:
    """
    Professional-grade financial news scraper using RSS feeds and web scraping.

    Aggregates headlines from publicly available sources (no authentication
    required) and annotates each item with a category, sentiment, impact level
    and a breaking-news flag so a dashboard can filter and rank them.
    """

    # News sources with RSS feeds and web-scraping endpoints.
    # 'weight' drives impact assessment (central banks weighted highest);
    # 'specialization' biases the keyword-based categorization.
    SOURCES = {
        # ===== TIER 1: Major Financial News (RSS + Web Scraping) =====
        'reuters_business': {
            'name': 'Reuters Business',
            'rss': 'https://www.reutersagency.com/feed/?taxonomy=best-topics&post_type=best',
            'web': 'https://www.reuters.com/business/',
            'selectors': {'headline': 'h3.text__text__1FZLe', 'link': 'a.text__text__1FZLe'},
            'weight': 1.5,
            'specialization': ['macro', 'markets']
        },
        'reuters_markets': {
            'name': 'Reuters Markets',
            'rss': 'https://www.reutersagency.com/feed/?best-sectors=business-finance&post_type=best',
            'web': 'https://www.reuters.com/markets/',
            'selectors': {'headline': 'h3', 'link': 'a[data-testid="Heading"]'},
            'weight': 1.5,
            'specialization': ['markets']
        },
        'cnbc': {
            'name': 'CNBC',
            'rss': 'https://www.cnbc.com/id/100003114/device/rss/rss.html',
            'web': 'https://www.cnbc.com/world/',
            'selectors': {'headline': 'a.Card-title', 'link': 'a.Card-title'},
            'weight': 1.2,
            'specialization': ['markets']
        },
        'marketwatch': {
            'name': 'MarketWatch',
            'rss': 'https://www.marketwatch.com/rss/topstories',
            'web': 'https://www.marketwatch.com/',
            'selectors': {'headline': 'h3.article__headline', 'link': 'a.link'},
            'weight': 1.1,
            'specialization': ['markets']
        },
        'ft_markets': {
            'name': 'Financial Times',
            'rss': 'https://www.ft.com/markets?format=rss',
            'web': 'https://www.ft.com/markets',
            'selectors': {'headline': 'div.o-teaser__heading', 'link': 'a.js-teaser-heading-link'},
            'weight': 1.4,
            'specialization': ['markets']
        },
        'wsj_markets': {
            'name': 'WSJ Markets',
            'rss': 'https://feeds.a.dj.com/rss/RSSMarketsMain.xml',
            'web': 'https://www.wsj.com/news/markets',
            'selectors': {'headline': 'h3.WSJTheme--headline', 'link': 'a'},
            'weight': 1.4,
            'specialization': ['markets']
        },
        'economist': {
            'name': 'The Economist',
            'rss': 'https://www.economist.com/finance-and-economics/rss.xml',
            'web': 'https://www.economist.com/finance-and-economics',
            'selectors': {'headline': 'span._headline', 'link': 'a'},
            'weight': 1.3,
            'specialization': ['macro', 'geopolitical']
        },

        # ===== TIER 2: Geopolitical & Economic =====
        'bbc_business': {
            'name': 'BBC Business',
            'rss': 'http://feeds.bbci.co.uk/news/business/rss.xml',
            'web': 'https://www.bbc.com/news/business',
            'selectors': {'headline': 'h3', 'link': 'a[data-testid="internal-link"]'},
            'weight': 1.4,
            'specialization': ['geopolitical', 'macro']
        },
        'bloomberg_markets': {
            'name': 'Bloomberg',
            'rss': 'https://www.bloomberg.com/feed/podcast/etf-report.xml',
            'web': 'https://www.bloomberg.com/markets',
            'selectors': {'headline': 'div.single-story-module__headline', 'link': 'a'},
            'weight': 1.5,
            'specialization': ['markets']
        },

        # ===== TIER 3: Central Banks (RSS + Web) =====
        'federal_reserve': {
            'name': 'Federal Reserve',
            'rss': 'https://www.federalreserve.gov/feeds/press_all.xml',
            'web': 'https://www.federalreserve.gov/newsevents/pressreleases.htm',
            'selectors': {'headline': 'div.row', 'link': 'a'},
            'weight': 2.0,
            'specialization': ['macro']
        },
        'ecb': {
            'name': 'European Central Bank',
            'rss': 'https://www.ecb.europa.eu/rss/press.xml',
            'web': 'https://www.ecb.europa.eu/press/pr/date/html/index.en.html',
            'selectors': {'headline': 'dt', 'link': 'a'},
            'weight': 2.0,
            'specialization': ['macro']
        },
        'imf': {
            'name': 'IMF',
            'rss': 'https://www.imf.org/en/News/rss?language_id=1',
            'web': 'https://www.imf.org/en/News',
            'selectors': {'headline': 'h3', 'link': 'a'},
            'weight': 1.7,
            'specialization': ['macro', 'geopolitical']
        }
    }

    # Keyword lists for categorization (matched case-insensitively).
    MACRO_KEYWORDS = [
        'Fed', 'ECB', 'BoE', 'BoJ', 'FOMC', 'Powell', 'Lagarde',
        'interest rate', 'rate cut', 'rate hike', 'inflation', 'CPI',
        'GDP', 'unemployment', 'jobs report', 'NFP', 'monetary policy'
    ]

    MARKET_KEYWORDS = [
        'S&P', 'Dow', 'Nasdaq', 'earnings', 'EPS', 'stock', 'equity',
        'rally', 'selloff', 'correction', 'merger', 'acquisition', 'IPO'
    ]

    GEOPOLITICAL_KEYWORDS = [
        'war', 'conflict', 'sanctions', 'trade', 'tariff', 'crisis',
        'Ukraine', 'Russia', 'China', 'Taiwan', 'Middle East'
    ]

    def __init__(self):
        """Initialize scraper with an empty cache and a shared HTTP session."""
        self.news_cache = []    # most recently fetched, annotated news items
        self.last_fetch = None  # datetime of the last successful fetch
        self.cache_ttl = 180    # seconds (3 minutes)
        self.session = requests.Session()
        # A browser-like User-Agent avoids trivial bot blocking on some sites.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    @staticmethod
    def _url_id(url: str) -> int:
        """Deterministic integer id derived from a URL.

        Uses MD5 instead of the builtin hash() so ids are stable across
        processes and restarts (hash() of str is randomized per process via
        PYTHONHASHSEED, which breaks dedup/keys across Streamlit reruns).
        """
        return int.from_bytes(hashlib.md5(url.encode('utf-8')).digest()[:8], 'big')

    def _fetch_rss_feed(self, source_name: str, source_info: Dict) -> List[Dict]:
        """Fetch and parse the RSS feed of a single source.

        Returns a list of normalized news dicts; an empty list on any failure
        (best-effort: other sources keep working).
        """
        try:
            feed = feedparser.parse(source_info['rss'])

            if not feed.entries:
                logger.warning(f"No entries found for {source_name}")
                return []

            news_items = []
            for entry in feed.entries[:10]:  # limit to 10 most recent
                # feedparser's *_parsed fields are UTC struct_time values.
                # Convert via calendar.timegm -> local naive datetime so the
                # timestamps are comparable with datetime.now() used elsewhere
                # (datetime(*parsed[:6]) would mis-read UTC as local time and
                # skew the 24h freshness filter by the UTC offset).
                try:
                    if getattr(entry, 'published_parsed', None):
                        timestamp = datetime.fromtimestamp(
                            calendar.timegm(entry.published_parsed))
                    elif getattr(entry, 'updated_parsed', None):
                        timestamp = datetime.fromtimestamp(
                            calendar.timegm(entry.updated_parsed))
                    else:
                        timestamp = datetime.now()
                except Exception:  # malformed date -> treat as "just now"
                    timestamp = datetime.now()

                # Skip old news (>24h)
                if (datetime.now() - timestamp).days > 1:
                    continue

                # Extract title and summary
                title = entry.get('title', '')
                summary = entry.get('summary', '') or entry.get('description', '')

                # Strip HTML markup RSS summaries often carry
                if summary:
                    summary = BeautifulSoup(summary, 'html.parser').get_text()
                    summary = self._extract_summary(summary)

                url = entry.get('link', '')

                # Categorize and analyze on title + summary text
                text = f"{title} {summary}"
                category = self._categorize_text(text, source_info['specialization'])
                sentiment = self._analyze_sentiment(text)
                impact = self._assess_impact(source_info['weight'], title)
                is_breaking = self._detect_breaking_news(title)

                news_items.append({
                    'id': self._url_id(url),
                    'title': title,
                    'summary': summary or self._extract_summary(title),
                    'source': source_info['name'],
                    'category': category,
                    'timestamp': timestamp,
                    'sentiment': sentiment,
                    'impact': impact,
                    'url': url,
                    'likes': 0,  # RSS feeds don't have engagement metrics
                    'retweets': 0,
                    'is_breaking': is_breaking,
                    'source_weight': source_info['weight']
                })

            return news_items

        except Exception as e:
            logger.error(f"Error fetching RSS for {source_name}: {e}")
            return []

    def _scrape_web_page(self, source_name: str, source_info: Dict) -> List[Dict]:
        """Scrape news headlines directly from a source's web page.

        Uses the per-source CSS selectors in SOURCES; returns normalized news
        dicts (empty list on failure). Scraped items have no publish time, so
        they are stamped with datetime.now().
        """
        try:
            response = self.session.get(source_info['web'], timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'lxml')

            headline_selector = source_info['selectors']['headline']
            link_selector = source_info['selectors']['link']

            news_items = []
            headlines = soup.select(headline_selector)

            for headline_elem in headlines[:10]:  # limit to 10 most recent
                try:
                    title = headline_elem.get_text(strip=True)
                    # Very short strings are navigation chrome, not headlines
                    if not title or len(title) < 10:
                        continue

                    # Find the associated link: the element itself, a child
                    # <a>, the enclosing <a>, or an <a> under the parent.
                    link_elem = headline_elem if headline_elem.name == 'a' else headline_elem.find('a')
                    if not link_elem:
                        link_elem = headline_elem.find_parent('a')
                    if not link_elem:
                        parent = headline_elem.find_parent()
                        if parent:
                            link_elem = parent.find('a')

                    if not link_elem:
                        continue

                    url = link_elem.get('href', '')
                    if not url:
                        continue

                    # Resolve site-relative links against the page URL
                    if url.startswith('/'):
                        url = urljoin(source_info['web'], url)

                    # Skip javascript:/mailto:/fragment links
                    if not url.startswith('http'):
                        continue

                    category = self._categorize_text(title, source_info['specialization'])
                    sentiment = self._analyze_sentiment(title)
                    impact = self._assess_impact(source_info['weight'], title)
                    is_breaking = self._detect_breaking_news(title)

                    news_items.append({
                        'id': self._url_id(url),
                        'title': title,
                        'summary': self._extract_summary(title),
                        'source': source_info['name'],
                        'category': category,
                        'timestamp': datetime.now(),  # no publish time on page scrape
                        'sentiment': sentiment,
                        'impact': impact,
                        'url': url,
                        'likes': 0,
                        'retweets': 0,
                        'is_breaking': is_breaking,
                        'source_weight': source_info['weight']
                    })

                except Exception as e:
                    # One bad headline must not abort the whole page
                    logger.debug(f"Error parsing headline from {source_name}: {e}")
                    continue

            logger.info(f"Scraped {len(news_items)} items from {source_name} web page")
            return news_items

        except Exception as e:
            logger.error(f"Error scraping web page for {source_name}: {e}")
            return []

    @st.cache_data(ttl=180)
    def scrape_news(_self, max_items: int = 100) -> List[Dict]:
        """
        Scrape news from all sources, in parallel, with Streamlit caching.

        The `_self` name (leading underscore) tells st.cache_data not to hash
        the instance. Fetches both RSS and web pages per source, deduplicates
        by URL, and falls back to mock data when nothing could be fetched.
        """
        all_news = []
        seen_urls = set()

        # Fan out RSS + web-scrape tasks across a thread pool (I/O-bound)
        with ThreadPoolExecutor(max_workers=8) as executor:
            futures = []
            for name, info in _self.SOURCES.items():
                futures.append((executor.submit(_self._fetch_rss_feed, name, info), name, 'RSS'))
                futures.append((executor.submit(_self._scrape_web_page, name, info), name, 'Web'))

            for future, source_name, method in futures:
                try:
                    news_items = future.result()

                    # Deduplicate across sources/methods by URL
                    unique_items = []
                    for item in news_items:
                        if item['url'] not in seen_urls:
                            seen_urls.add(item['url'])
                            unique_items.append(item)

                    all_news.extend(unique_items)
                    logger.info(f"Fetched {len(unique_items)} unique items from {source_name} ({method})")
                except Exception as e:
                    logger.error(f"Error processing {source_name} ({method}): {e}")

        # If no news was fetched at all, fall back to mock data
        if not all_news:
            logger.warning("No news fetched from any source - using mock data")
            return _self._get_mock_news()

        # Breaking first, then high impact, then most recent
        all_news.sort(
            key=lambda x: (x['is_breaking'], x['impact'] == 'high', x['timestamp']),
            reverse=True
        )

        logger.info(f"Total unique news items: {len(all_news)}")
        return all_news[:max_items]

    def _categorize_text(self, text: str, source_specialization: List[str]) -> str:
        """Categorize news ('macro'/'markets'/'geopolitical') from keywords,
        weighted by the source's specialization; defaults to 'markets'."""
        text_lower = text.lower()

        macro_score = sum(1 for kw in self.MACRO_KEYWORDS if kw.lower() in text_lower)
        market_score = sum(1 for kw in self.MARKET_KEYWORDS if kw.lower() in text_lower)
        geo_score = sum(1 for kw in self.GEOPOLITICAL_KEYWORDS if kw.lower() in text_lower)

        # A source specialized in a topic gets a 1.5x boost for that topic
        if 'macro' in source_specialization:
            macro_score *= 1.5
        if 'markets' in source_specialization:
            market_score *= 1.5
        if 'geopolitical' in source_specialization:
            geo_score *= 1.5

        scores = {'macro': macro_score, 'markets': market_score, 'geopolitical': geo_score}
        return max(scores, key=scores.get) if max(scores.values()) > 0 else 'markets'

    def _analyze_sentiment(self, text: str) -> str:
        """Keyword-count sentiment: 'positive', 'negative' or 'neutral'."""
        text_lower = text.lower()

        positive = ['surge', 'soar', 'rally', 'beat', 'upgrade', 'bullish',
                    'gain', 'rise', 'jump', 'boost', 'positive']
        negative = ['plunge', 'crash', 'fall', 'miss', 'downgrade', 'bearish',
                    'loss', 'drop', 'slide', 'concern', 'negative']

        pos_count = sum(1 for word in positive if word in text_lower)
        neg_count = sum(1 for word in negative if word in text_lower)

        if pos_count > neg_count:
            return 'positive'
        elif neg_count > pos_count:
            return 'negative'
        return 'neutral'

    def _assess_impact(self, source_weight: float, title: str) -> str:
        """Assess market impact ('high'/'medium'/'low') from source weight
        and high-impact keywords in the title."""
        # Central banks and official sources (weight >= 1.7) = high impact
        if source_weight >= 1.7:
            return 'high'

        # NOTE(review): substring match, so 'fed' also fires on e.g.
        # 'federal' — presumably intended; confirm before tightening.
        high_impact_words = ['breaking', 'alert', 'emergency', 'crash', 'surge', 'fed']
        if any(word in title.lower() for word in high_impact_words):
            return 'high'

        return 'medium' if source_weight >= 1.3 else 'low'

    def _detect_breaking_news(self, text: str) -> bool:
        """True when the text carries an explicit breaking-news marker."""
        text_upper = text.upper()
        breaking_signals = ['BREAKING', 'ALERT', 'URGENT', 'JUST IN', 'DEVELOPING']
        return any(signal in text_upper for signal in breaking_signals)

    def _extract_summary(self, text: str, max_length: int = 150) -> str:
        """Strip URLs/whitespace and truncate to max_length with an ellipsis."""
        text = re.sub(r'http\S+', '', text)
        text = text.strip()

        if len(text) <= max_length:
            return text
        return text[:max_length] + '...'

    def _get_mock_news(self) -> List[Dict]:
        """Static fallback items shown when every live source fails."""
        return [
            {
                'id': 1,
                'title': 'Federal Reserve holds rates steady, signals caution on inflation outlook',
                'summary': 'Fed maintains current rate policy',
                'source': 'Federal Reserve',
                'category': 'macro',
                'timestamp': datetime.now() - timedelta(minutes=15),
                'sentiment': 'neutral',
                'impact': 'high',
                'url': 'https://www.federalreserve.gov',
                'likes': 0,
                'retweets': 0,
                'is_breaking': False,
                'source_weight': 2.0
            },
            {
                'id': 2,
                'title': 'S&P 500 closes at record high as tech stocks rally on strong earnings',
                'summary': 'S&P 500 hits record on tech rally',
                'source': 'CNBC',
                'category': 'markets',
                'timestamp': datetime.now() - timedelta(minutes=30),
                'sentiment': 'positive',
                'impact': 'high',
                'url': 'https://www.cnbc.com',
                'likes': 0,
                'retweets': 0,
                'is_breaking': False,
                'source_weight': 1.2
            },
            {
                'id': 3,
                'title': 'ECB President Lagarde warns of persistent inflation pressures in eurozone',
                'summary': 'Lagarde warns on eurozone inflation',
                'source': 'European Central Bank',
                'category': 'macro',
                'timestamp': datetime.now() - timedelta(hours=1),
                'sentiment': 'negative',
                'impact': 'high',
                'url': 'https://www.ecb.europa.eu',
                'likes': 0,
                'retweets': 0,
                'is_breaking': False,
                'source_weight': 2.0
            }
        ]

    def get_news(self, category: str = 'all', sentiment: str = 'all',
                 impact: str = 'all', refresh: bool = False) -> pd.DataFrame:
        """Get filtered news as a DataFrame, refreshing the cache when stale.

        Filters accept 'all' or a specific value for category/sentiment/impact.
        """
        # total_seconds(), not .seconds: .seconds ignores the .days component,
        # so a cache older than a day would wrongly look fresh.
        if refresh or not self.last_fetch or \
                (datetime.now() - self.last_fetch).total_seconds() > self.cache_ttl:
            self.news_cache = self.scrape_news(max_items=100)
            self.last_fetch = datetime.now()

        news = self.news_cache.copy()

        # Apply filters
        if category != 'all':
            news = [n for n in news if n['category'] == category]
        if sentiment != 'all':
            news = [n for n in news if n['sentiment'] == sentiment]
        if impact != 'all':
            news = [n for n in news if n['impact'] == impact]

        df = pd.DataFrame(news)
        if not df.empty:
            df['timestamp'] = pd.to_datetime(df['timestamp'])

        return df

    def get_breaking_news(self) -> pd.DataFrame:
        """Get breaking/high-impact news only."""
        return self.get_news(impact='high')

    def get_statistics(self) -> Dict:
        """Summary statistics of the cached feed for dashboard widgets."""
        if not self.news_cache:
            return {
                'total': 0,
                'high_impact': 0,
                'breaking': 0,
                'last_update': 'Never',
                'by_category': {}
            }

        df = pd.DataFrame(self.news_cache)
        return {
            'total': len(df),
            'high_impact': len(df[df['impact'] == 'high']),
            'breaking': int(df['is_breaking'].sum()),  # bool column sums to count
            'last_update': self.last_fetch.strftime('%H:%M:%S') if self.last_fetch else 'Never',
            'by_category': df['category'].value_counts().to_dict()
        }
requirements.txt CHANGED
@@ -5,3 +5,6 @@ openbb>=4.0.0
5
  python-dotenv>=1.0.0
6
  requests>=2.31.0
7
  twikit>=2.3.0
 
 
 
 
5
  python-dotenv>=1.0.0
6
  requests>=2.31.0
7
  twikit>=2.3.0
8
+ feedparser>=6.0.0
9
+ beautifulsoup4>=4.12.0
10
+ lxml>=5.0.0