# FinancialPlatform / app / services / reddit_news.py
# Author: Dmitry Beresnev
# Commit 0e4b579: fix news stats calculation
"""
Reddit Financial News Scraper
Scrapes financial, trading, quant, and geopolitical news from Reddit
No authentication required - uses public RSS feeds
"""
import logging
import re
from datetime import datetime, timedelta, timezone
from typing import Dict, List

import feedparser
logger = logging.getLogger(__name__)
class RedditFinanceMonitor:
    """
    Reddit financial news aggregator using RSS feeds.

    No authentication required - public RSS feeds only.

    All datetimes are naive UTC: feedparser's ``published_parsed`` is a UTC
    struct_time, so the time-window cutoff is computed in UTC as well
    (comparing against local time would skew the filter by the local UTC
    offset).
    """
    # Premium financial subreddits.
    #   url            - public RSS endpoint (top posts of the day)
    #   weight         - multiplier applied to engagement when ranking
    #   specialization - topical tags for the subreddit
    #   category       - default category when no title keyword overrides it
    SUBREDDITS = {
        # Financial & Markets
        'wallstreetbets': {
            'url': 'https://www.reddit.com/r/wallstreetbets/top/.rss?t=day',
            'weight': 1.6,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'stocks': {
            'url': 'https://www.reddit.com/r/stocks/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'investing': {
            'url': 'https://www.reddit.com/r/investing/top/.rss?t=day',
            'weight': 1.8,
            'specialization': ['markets', 'macro'],
            'category': 'markets'
        },
        'stockmarket': {
            'url': 'https://www.reddit.com/r/StockMarket/top/.rss?t=day',
            'weight': 1.6,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'options': {
            'url': 'https://www.reddit.com/r/options/top/.rss?t=day',
            'weight': 1.5,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'daytrading': {
            'url': 'https://www.reddit.com/r/Daytrading/top/.rss?t=day',
            'weight': 1.5,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'securityanalysis': {
            'url': 'https://www.reddit.com/r/SecurityAnalysis/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['markets'],
            'category': 'markets'
        },
        # Economics & Macro
        'economics': {
            'url': 'https://www.reddit.com/r/Economics/top/.rss?t=day',
            'weight': 1.8,
            'specialization': ['macro'],
            'category': 'macro'
        },
        'economy': {
            'url': 'https://www.reddit.com/r/economy/top/.rss?t=day',
            'weight': 1.6,
            'specialization': ['macro'],
            'category': 'macro'
        },
        # Quantitative Finance
        'algotrading': {
            'url': 'https://www.reddit.com/r/algotrading/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'quantfinance': {
            'url': 'https://www.reddit.com/r/quant/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['markets'],
            'category': 'markets'
        },
        # Geopolitics
        'geopolitics': {
            'url': 'https://www.reddit.com/r/geopolitics/top/.rss?t=day',
            'weight': 1.8,
            'specialization': ['geopolitical'],
            'category': 'geopolitical'
        },
        'worldnews': {
            'url': 'https://www.reddit.com/r/worldnews/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['geopolitical'],
            'category': 'geopolitical'
        },
        'neutralpolitics': {
            'url': 'https://www.reddit.com/r/NeutralPolitics/top/.rss?t=day',
            'weight': 1.6,
            'specialization': ['geopolitical'],
            'category': 'geopolitical'
        },
    }
    # Keyword detection for additional categorization
    MACRO_KEYWORDS = [
        'Fed', 'ECB', 'BoE', 'BoJ', 'FOMC', 'Powell', 'Lagarde',
        'interest rate', 'inflation', 'CPI', 'PPI', 'GDP',
        'unemployment', 'jobs report', 'NFP', 'central bank',
        'recession', 'QE', 'quantitative easing', 'monetary policy'
    ]
    MARKETS_KEYWORDS = [
        'stock', 'equity', 'bond', 'commodity', 'oil', 'gold',
        'earnings', 'revenue', 'profit', 'IPO', 'merger',
        'acquisition', 'trading', 'options', 'futures', 'forex'
    ]
    GEOPOLITICAL_KEYWORDS = [
        'war', 'conflict', 'sanction', 'trade', 'tariff',
        'election', 'China', 'Russia', 'Ukraine', 'Taiwan',
        'Middle East', 'Iran', 'Israel', 'NATO', 'UN'
    ]

    def __init__(self):
        """Initialize Reddit monitor (stateless; nothing to set up)."""
        pass

    @staticmethod
    def _matches_any(text_lower: str, keywords: List[str]) -> bool:
        """Return True if any keyword occurs in *text_lower* as a whole word/phrase.

        Whole-word matching keeps short keywords such as 'UN' or 'up' from
        firing inside unrelated words ('fund', 'update').
        """
        return any(
            re.search(r'\b' + re.escape(keyword.lower()) + r'\b', text_lower)
            for keyword in keywords
        )

    def _categorize_post(self, title: str, subreddit_info: Dict) -> str:
        """Categorize a post from its title, falling back to the subreddit default.

        Keyword precedence: macro > geopolitical > markets.
        """
        title_lower = title.lower()
        if self._matches_any(title_lower, self.MACRO_KEYWORDS):
            return 'macro'
        if self._matches_any(title_lower, self.GEOPOLITICAL_KEYWORDS):
            return 'geopolitical'
        if self._matches_any(title_lower, self.MARKETS_KEYWORDS):
            return 'markets'
        return subreddit_info.get('category', 'markets')

    def _detect_sentiment(self, title: str) -> str:
        """Crude lexicon-based sentiment: 'positive', 'negative' or 'neutral'."""
        title_lower = title.lower()
        positive_words = ['bullish', 'bull', 'surge', 'gain', 'up', 'rally', 'boom', 'profit', 'growth']
        negative_words = ['bearish', 'bear', 'crash', 'loss', 'down', 'fall', 'decline', 'recession', 'crisis']
        # Count whole-word hits only, so e.g. 'up' does not match 'update'.
        positive_count = sum(
            1 for word in positive_words
            if re.search(r'\b' + re.escape(word) + r'\b', title_lower)
        )
        negative_count = sum(
            1 for word in negative_words
            if re.search(r'\b' + re.escape(word) + r'\b', title_lower)
        )
        if positive_count > negative_count:
            return 'positive'
        if negative_count > positive_count:
            return 'negative'
        return 'neutral'

    def _calculate_impact(self, score: int, num_comments: int, subreddit_weight: float) -> str:
        """Bucket a post into 'high' / 'medium' / 'low' impact.

        Engagement blends upvote score (70%) and comment count (30%),
        then scales by the subreddit's trust weight.
        """
        engagement_score = (score * 0.7) + (num_comments * 0.3)
        weighted_score = engagement_score * subreddit_weight
        if weighted_score > 500:
            return 'high'
        if weighted_score > 100:
            return 'medium'
        return 'low'

    def scrape_reddit_news(self, max_posts: int = 100, hours: int = 12) -> List[Dict]:
        """
        Scrape Reddit posts from financial subreddits.

        Args:
            max_posts: Maximum number of posts to return
            hours: Only include posts from the last N hours (default: 12)

        Returns:
            List of news item dicts sorted by weighted engagement score
            (descending), truncated to max_posts.
        """
        all_posts = []
        seen_titles = set()
        # Reddit RSS timestamps are UTC, so "now" and the cutoff must be
        # UTC too (naive, to stay comparable with published_parsed).
        now = datetime.now(timezone.utc).replace(tzinfo=None)
        cutoff_time = now - timedelta(hours=hours)
        logger.info(f"Scraping Reddit posts from last {hours} hours...")
        for subreddit_name, subreddit_info in self.SUBREDDITS.items():
            try:
                logger.info(f"Fetching r/{subreddit_name}...")
                # Parse RSS feed
                feed = feedparser.parse(subreddit_info['url'])
                fetched_before = len(all_posts)
                for entry in feed.entries[:20]:  # Get top 20 per subreddit
                    try:
                        # feedparser may omit the attribute or set it to
                        # None; guard both before unpacking the struct_time.
                        parsed = getattr(entry, 'published_parsed', None)
                        pub_date = datetime(*parsed[:6]) if parsed else now
                        # Filter by time (last 12 hours by default)
                        if pub_date < cutoff_time:
                            continue
                        # Extract title and link
                        title = entry.title.strip()
                        link = entry.link
                        # Deduplicate on the first 100 chars of the title
                        title_hash = hash(title[:100])
                        if title_hash in seen_titles:
                            continue
                        seen_titles.add(title_hash)
                        # Reddit RSS has no score fields; best-effort scrape
                        # of "<N> points" / "<N> comments" from the HTML body.
                        score = 0
                        num_comments = 0
                        if hasattr(entry, 'content'):
                            content_text = entry.content[0].value if entry.content else ''
                            score_match = re.search(r'(\d+)\s+points?', content_text)
                            if score_match:
                                score = int(score_match.group(1))
                            comment_match = re.search(r'(\d+)\s+comments?', content_text)
                            if comment_match:
                                num_comments = int(comment_match.group(1))
                        # Categorize and analyze
                        category = self._categorize_post(title, subreddit_info)
                        sentiment = self._detect_sentiment(title)
                        impact = self._calculate_impact(score, num_comments, subreddit_info['weight'])
                        # Breaking news: high score within the last 3 hours
                        is_breaking = (
                            (now - pub_date).total_seconds() < 10800 and  # 3 hours
                            score > 1000
                        )
                        all_posts.append({
                            'title': title,
                            'summary': title,  # Reddit posts don't have separate summaries
                            'url': link,
                            'source': f"r/{subreddit_name}",
                            'timestamp': pub_date,
                            'category': category,
                            'sentiment': sentiment,
                            'impact': impact,
                            'is_breaking': is_breaking,
                            'engagement': {
                                'score': score,
                                'comments': num_comments
                            },
                            'platform': 'reddit'
                        })
                    except Exception as e:
                        logger.error(f"Error processing entry from r/{subreddit_name}: {e}")
                        continue
                # Count via before/after delta instead of rescanning all_posts.
                logger.info(f"Fetched {len(all_posts) - fetched_before} posts from r/{subreddit_name}")
            except Exception as e:
                logger.error(f"Error fetching r/{subreddit_name}: {e}")
                continue
        # Rank by engagement score weighted by the source subreddit's weight
        # (source is always "r/<name>", so strip the 2-char prefix).
        all_posts.sort(
            key=lambda p: p['engagement']['score'] * self.SUBREDDITS.get(
                p['source'][2:], {}
            ).get('weight', 1.0),
            reverse=True
        )
        logger.info(f"Total Reddit posts scraped: {len(all_posts)}")
        return all_posts[:max_posts]

    def get_statistics(self) -> Dict:
        """
        Get statistics about scraped Reddit posts.

        Note: Statistics are now managed by NewsCacheManager.
        This method returns empty stats for backward compatibility.
        """
        return {
            'total': 0,
            'high_impact': 0,
            'breaking': 0,
            'by_category': {
                'macro': 0,
                'markets': 0,
                'geopolitical': 0
            }
        }