""" Reddit Financial News Scraper Scrapes financial, trading, quant, and geopolitical news from Reddit No authentication required - uses public RSS feeds """ import feedparser import logging from datetime import datetime, timedelta from typing import List, Dict import re logger = logging.getLogger(__name__) class RedditFinanceMonitor: """ Reddit financial news aggregator using RSS feeds No authentication required - public RSS feeds only """ # Premium financial subreddits SUBREDDITS = { # Financial & Markets 'wallstreetbets': { 'url': 'https://www.reddit.com/r/wallstreetbets/top/.rss?t=day', 'weight': 1.6, 'specialization': ['markets'], 'category': 'markets' }, 'stocks': { 'url': 'https://www.reddit.com/r/stocks/top/.rss?t=day', 'weight': 1.7, 'specialization': ['markets'], 'category': 'markets' }, 'investing': { 'url': 'https://www.reddit.com/r/investing/top/.rss?t=day', 'weight': 1.8, 'specialization': ['markets', 'macro'], 'category': 'markets' }, 'stockmarket': { 'url': 'https://www.reddit.com/r/StockMarket/top/.rss?t=day', 'weight': 1.6, 'specialization': ['markets'], 'category': 'markets' }, 'options': { 'url': 'https://www.reddit.com/r/options/top/.rss?t=day', 'weight': 1.5, 'specialization': ['markets'], 'category': 'markets' }, 'daytrading': { 'url': 'https://www.reddit.com/r/Daytrading/top/.rss?t=day', 'weight': 1.5, 'specialization': ['markets'], 'category': 'markets' }, 'securityanalysis': { 'url': 'https://www.reddit.com/r/SecurityAnalysis/top/.rss?t=day', 'weight': 1.7, 'specialization': ['markets'], 'category': 'markets' }, # Economics & Macro 'economics': { 'url': 'https://www.reddit.com/r/Economics/top/.rss?t=day', 'weight': 1.8, 'specialization': ['macro'], 'category': 'macro' }, 'economy': { 'url': 'https://www.reddit.com/r/economy/top/.rss?t=day', 'weight': 1.6, 'specialization': ['macro'], 'category': 'macro' }, # Quantitative Finance 'algotrading': { 'url': 'https://www.reddit.com/r/algotrading/top/.rss?t=day', 'weight': 1.7, 'specialization': ['markets'], 'category': 'markets' }, 'quantfinance': { 'url': 'https://www.reddit.com/r/quant/top/.rss?t=day', 'weight': 1.7, 'specialization': ['markets'], 'category': 'markets' }, # Geopolitics 'geopolitics': { 'url': 'https://www.reddit.com/r/geopolitics/top/.rss?t=day', 'weight': 1.8, 'specialization': ['geopolitical'], 'category': 'geopolitical' }, 'worldnews': { 'url': 'https://www.reddit.com/r/worldnews/top/.rss?t=day', 'weight': 1.7, 'specialization': ['geopolitical'], 'category': 'geopolitical' }, 'neutralpolitics': { 'url': 'https://www.reddit.com/r/NeutralPolitics/top/.rss?t=day', 'weight': 1.6, 'specialization': ['geopolitical'], 'category': 'geopolitical' }, } # Keyword detection for additional categorization MACRO_KEYWORDS = [ 'Fed', 'ECB', 'BoE', 'BoJ', 'FOMC', 'Powell', 'Lagarde', 'interest rate', 'inflation', 'CPI', 'PPI', 'GDP', 'unemployment', 'jobs report', 'NFP', 'central bank', 'recession', 'QE', 'quantitative easing', 'monetary policy' ] MARKETS_KEYWORDS = [ 'stock', 'equity', 'bond', 'commodity', 'oil', 'gold', 'earnings', 'revenue', 'profit', 'IPO', 'merger', 'acquisition', 'trading', 'options', 'futures', 'forex' ] GEOPOLITICAL_KEYWORDS = [ 'war', 'conflict', 'sanction', 'trade', 'tariff', 'election', 'China', 'Russia', 'Ukraine', 'Taiwan', 'Middle East', 'Iran', 'Israel', 'NATO', 'UN' ] def __init__(self): """Initialize Reddit monitor""" pass def _categorize_post(self, title: str, subreddit_info: Dict) -> str: """Categorize post based on title and subreddit""" title_lower = title.lower() # Use subreddit default category default_category = subreddit_info.get('category', 'markets') # Check keywords for override if any(keyword.lower() in title_lower for keyword in self.MACRO_KEYWORDS): return 'macro' elif any(keyword.lower() in title_lower for keyword in self.GEOPOLITICAL_KEYWORDS): return 'geopolitical' elif any(keyword.lower() in title_lower for keyword in self.MARKETS_KEYWORDS): return 'markets' return default_category def _detect_sentiment(self, title: str) -> str: """Simple sentiment detection based on keywords""" title_lower = title.lower() positive_words = ['bullish', 'bull', 'surge', 'gain', 'up', 'rally', 'boom', 'profit', 'growth'] negative_words = ['bearish', 'bear', 'crash', 'loss', 'down', 'fall', 'decline', 'recession', 'crisis'] positive_count = sum(1 for word in positive_words if word in title_lower) negative_count = sum(1 for word in negative_words if word in title_lower) if positive_count > negative_count: return 'positive' elif negative_count > positive_count: return 'negative' else: return 'neutral' def _calculate_impact(self, score: int, num_comments: int, subreddit_weight: float) -> str: """Calculate impact based on upvotes, comments, and subreddit weight""" # Normalize score (upvotes - downvotes) engagement_score = (score * 0.7) + (num_comments * 0.3) weighted_score = engagement_score * subreddit_weight if weighted_score > 500: return 'high' elif weighted_score > 100: return 'medium' else: return 'low' def scrape_reddit_news(self, max_posts: int = 100, hours: int = 12) -> List[Dict]: """ Scrape Reddit posts from financial subreddits Args: max_posts: Maximum number of posts to return hours: Only include posts from the last N hours (default: 12) Returns: List of news items with metadata """ all_posts = [] seen_titles = set() cutoff_time = datetime.now() - timedelta(hours=hours) logger.info(f"Scraping Reddit posts from last {hours} hours...") for subreddit_name, subreddit_info in self.SUBREDDITS.items(): try: logger.info(f"Fetching r/{subreddit_name}...") # Parse RSS feed feed = feedparser.parse(subreddit_info['url']) for entry in feed.entries[:20]: # Get top 20 per subreddit try: # Parse publication date if hasattr(entry, 'published_parsed'): pub_date = datetime(*entry.published_parsed[:6]) else: pub_date = datetime.now() # Filter by time (last 12 hours by default) if pub_date < cutoff_time: continue # Extract title and link title = entry.title.strip() link = entry.link # Deduplicate title_hash = hash(title[:100]) if title_hash in seen_titles: continue seen_titles.add(title_hash) # Extract score and comments from content score = 0 num_comments = 0 if hasattr(entry, 'content'): content_text = entry.content[0].value if entry.content else '' # Try to extract score from content score_match = re.search(r'(\d+)\s+points?', content_text) if score_match: score = int(score_match.group(1)) # Try to extract comments comment_match = re.search(r'(\d+)\s+comments?', content_text) if comment_match: num_comments = int(comment_match.group(1)) # Categorize and analyze category = self._categorize_post(title, subreddit_info) sentiment = self._detect_sentiment(title) impact = self._calculate_impact(score, num_comments, subreddit_info['weight']) # Check if breaking news (high score in last 3 hours) is_breaking = ( (datetime.now() - pub_date).total_seconds() < 10800 and # 3 hours score > 1000 ) post_data = { 'title': title, 'summary': title, # Reddit posts don't have separate summaries 'url': link, 'source': f"r/{subreddit_name}", 'timestamp': pub_date, 'category': category, 'sentiment': sentiment, 'impact': impact, 'is_breaking': is_breaking, 'engagement': { 'score': score, 'comments': num_comments }, 'platform': 'reddit' } all_posts.append(post_data) except Exception as e: logger.error(f"Error processing entry from r/{subreddit_name}: {e}") continue logger.info(f"Fetched {len([p for p in all_posts if p['source'] == f'r/{subreddit_name}'])} posts from r/{subreddit_name}") except Exception as e: logger.error(f"Error fetching r/{subreddit_name}: {e}") continue # Sort by engagement score (weighted by source weight) all_posts.sort(key=lambda x: x['engagement']['score'] * self.SUBREDDITS.get( x['source'].replace('r/', ''), {} ).get('weight', 1.0), reverse=True) logger.info(f"Total Reddit posts scraped: {len(all_posts)}") return all_posts[:max_posts] def get_statistics(self) -> Dict: """ Get statistics about scraped Reddit posts Note: Statistics are now managed by NewsCacheManager This method returns empty stats for backward compatibility """ return { 'total': 0, 'high_impact': 0, 'breaking': 0, 'by_category': { 'macro': 0, 'markets': 0, 'geopolitical': 0 } }