|
|
""" |
|
|
Reddit Financial News Scraper |
|
|
Scrapes financial, trading, quant, and geopolitical news from Reddit |
|
|
No authentication required - uses public RSS feeds |
|
|
""" |
|
|
|
|
|
import logging
import re
from datetime import datetime, timedelta, timezone
from typing import Dict, List

import feedparser
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
class RedditFinanceMonitor:
    """
    Reddit financial news aggregator using public RSS feeds.

    No authentication required: each subreddit's daily "top" listing is
    fetched through its public ``.rss`` endpoint and parsed with feedparser.
    """

    # Reddit throttles/blocks feedparser's default User-Agent; a descriptive
    # custom agent makes the public RSS fetches far more reliable.
    USER_AGENT = 'financial-news-monitor/1.0 (public RSS aggregator)'

    # subreddit -> feed metadata. 'weight' scales engagement when ranking;
    # 'category' is the fallback used when no title keyword matches.
    SUBREDDITS = {
        'wallstreetbets': {
            'url': 'https://www.reddit.com/r/wallstreetbets/top/.rss?t=day',
            'weight': 1.6,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'stocks': {
            'url': 'https://www.reddit.com/r/stocks/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'investing': {
            'url': 'https://www.reddit.com/r/investing/top/.rss?t=day',
            'weight': 1.8,
            'specialization': ['markets', 'macro'],
            'category': 'markets'
        },
        'stockmarket': {
            'url': 'https://www.reddit.com/r/StockMarket/top/.rss?t=day',
            'weight': 1.6,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'options': {
            'url': 'https://www.reddit.com/r/options/top/.rss?t=day',
            'weight': 1.5,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'daytrading': {
            'url': 'https://www.reddit.com/r/Daytrading/top/.rss?t=day',
            'weight': 1.5,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'securityanalysis': {
            'url': 'https://www.reddit.com/r/SecurityAnalysis/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'economics': {
            'url': 'https://www.reddit.com/r/Economics/top/.rss?t=day',
            'weight': 1.8,
            'specialization': ['macro'],
            'category': 'macro'
        },
        'economy': {
            'url': 'https://www.reddit.com/r/economy/top/.rss?t=day',
            'weight': 1.6,
            'specialization': ['macro'],
            'category': 'macro'
        },
        'algotrading': {
            'url': 'https://www.reddit.com/r/algotrading/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'quantfinance': {
            'url': 'https://www.reddit.com/r/quant/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'geopolitics': {
            'url': 'https://www.reddit.com/r/geopolitics/top/.rss?t=day',
            'weight': 1.8,
            'specialization': ['geopolitical'],
            'category': 'geopolitical'
        },
        'worldnews': {
            'url': 'https://www.reddit.com/r/worldnews/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['geopolitical'],
            'category': 'geopolitical'
        },
        'neutralpolitics': {
            'url': 'https://www.reddit.com/r/NeutralPolitics/top/.rss?t=day',
            'weight': 1.6,
            'specialization': ['geopolitical'],
            'category': 'geopolitical'
        },
    }

    MACRO_KEYWORDS = [
        'Fed', 'ECB', 'BoE', 'BoJ', 'FOMC', 'Powell', 'Lagarde',
        'interest rate', 'inflation', 'CPI', 'PPI', 'GDP',
        'unemployment', 'jobs report', 'NFP', 'central bank',
        'recession', 'QE', 'quantitative easing', 'monetary policy'
    ]

    MARKETS_KEYWORDS = [
        'stock', 'equity', 'bond', 'commodity', 'oil', 'gold',
        'earnings', 'revenue', 'profit', 'IPO', 'merger',
        'acquisition', 'trading', 'options', 'futures', 'forex'
    ]

    GEOPOLITICAL_KEYWORDS = [
        'war', 'conflict', 'sanction', 'trade', 'tariff',
        'election', 'China', 'Russia', 'Ukraine', 'Taiwan',
        'Middle East', 'Iran', 'Israel', 'NATO', 'UN'
    ]

    def __init__(self):
        """Initialize Reddit monitor (stateless; nothing to set up)."""
        pass

    @staticmethod
    def _match_keywords(text_lower: str, keywords: List[str]) -> int:
        """
        Count how many keywords occur in *text_lower* as whole words.

        Whole-word matching (with an optional plural 's') prevents short
        keywords from firing inside unrelated words: 'UN' no longer matches
        'fund', 'war' no longer matches 'warning', 'up' no longer matches
        'update' — while 'sanction' still matches 'sanctions'.

        Args:
            text_lower: Already-lowercased text to scan.
            keywords: Keyword list (any case; lowered before matching).

        Returns:
            Number of keywords with at least one whole-word hit.
        """
        return sum(
            1 for kw in keywords
            if re.search(r'\b' + re.escape(kw.lower()) + r's?\b', text_lower)
        )

    def _categorize_post(self, title: str, subreddit_info: Dict) -> str:
        """
        Categorize a post as 'macro', 'geopolitical', or 'markets'.

        Title keywords take precedence (macro > geopolitical > markets);
        the subreddit's own default category is the fallback.
        """
        title_lower = title.lower()

        if self._match_keywords(title_lower, self.MACRO_KEYWORDS):
            return 'macro'
        if self._match_keywords(title_lower, self.GEOPOLITICAL_KEYWORDS):
            return 'geopolitical'
        if self._match_keywords(title_lower, self.MARKETS_KEYWORDS):
            return 'markets'

        return subreddit_info.get('category', 'markets')

    def _detect_sentiment(self, title: str) -> str:
        """
        Simple keyword-based sentiment detection.

        Returns:
            'positive', 'negative', or 'neutral' (ties are neutral).
        """
        title_lower = title.lower()

        positive_words = ['bullish', 'bull', 'surge', 'gain', 'up', 'rally', 'boom', 'profit', 'growth']
        negative_words = ['bearish', 'bear', 'crash', 'loss', 'down', 'fall', 'decline', 'recession', 'crisis']

        # Whole-word matching so e.g. 'up' does not fire inside 'update'.
        positive_count = self._match_keywords(title_lower, positive_words)
        negative_count = self._match_keywords(title_lower, negative_words)

        if positive_count > negative_count:
            return 'positive'
        if negative_count > positive_count:
            return 'negative'
        return 'neutral'

    def _calculate_impact(self, score: int, num_comments: int, subreddit_weight: float) -> str:
        """
        Bucket a post's impact from engagement and subreddit weight.

        Upvotes dominate (70%) over comment count (30%); the blended score
        is then scaled by the subreddit's trust weight.

        Returns:
            'high' (>500), 'medium' (>100), or 'low'.
        """
        engagement_score = (score * 0.7) + (num_comments * 0.3)
        weighted_score = engagement_score * subreddit_weight

        if weighted_score > 500:
            return 'high'
        if weighted_score > 100:
            return 'medium'
        return 'low'

    @staticmethod
    def _extract_engagement(entry) -> tuple:
        """
        Best-effort extraction of (score, num_comments) from a feed entry.

        Reddit's RSS does not expose vote counts as structured fields; when
        the embedded HTML happens to mention "N points" / "N comments" we
        pick those up, otherwise both default to 0.
        """
        score = 0
        num_comments = 0

        content = getattr(entry, 'content', None)
        if content:
            content_text = content[0].value or ''

            score_match = re.search(r'(\d+)\s+points?', content_text)
            if score_match:
                score = int(score_match.group(1))

            comment_match = re.search(r'(\d+)\s+comments?', content_text)
            if comment_match:
                num_comments = int(comment_match.group(1))

        return score, num_comments

    def scrape_reddit_news(self, max_posts: int = 100, hours: int = 12) -> List[Dict]:
        """
        Scrape Reddit posts from financial subreddits.

        Args:
            max_posts: Maximum number of posts to return.
            hours: Only include posts from the last N hours (default: 12).

        Returns:
            List of news-item dicts (title, url, source, timestamp in naive
            UTC, category, sentiment, impact, engagement, ...) sorted by
            weighted engagement, truncated to ``max_posts``.
        """
        all_posts: List[Dict] = []
        seen_titles = set()

        # feedparser's published_parsed is a UTC struct_time, so every clock
        # comparison must be done in UTC as well — comparing against local
        # datetime.now() skews the cutoff by the machine's UTC offset.
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        cutoff_time = now_utc - timedelta(hours=hours)

        logger.info(f"Scraping Reddit posts from last {hours} hours...")

        for subreddit_name, subreddit_info in self.SUBREDDITS.items():
            try:
                logger.info(f"Fetching r/{subreddit_name}...")

                # Custom agent: Reddit rate-limits the default feedparser UA.
                feed = feedparser.parse(subreddit_info['url'], agent=self.USER_AGENT)

                fetched_count = 0
                for entry in feed.entries[:20]:
                    try:
                        # The attribute can be missing OR present-but-None;
                        # treat both as "unknown" and fall back to now.
                        parsed = getattr(entry, 'published_parsed', None)
                        pub_date = datetime(*parsed[:6]) if parsed else now_utc

                        if pub_date < cutoff_time:
                            continue

                        title = entry.title.strip()
                        link = entry.link

                        # Cheap dedupe across subreddits (cross-posts).
                        title_hash = hash(title[:100])
                        if title_hash in seen_titles:
                            continue
                        seen_titles.add(title_hash)

                        score, num_comments = self._extract_engagement(entry)

                        category = self._categorize_post(title, subreddit_info)
                        sentiment = self._detect_sentiment(title)
                        impact = self._calculate_impact(score, num_comments, subreddit_info['weight'])

                        # "Breaking": younger than 3 hours AND heavily upvoted.
                        is_breaking = (
                            (now_utc - pub_date).total_seconds() < 10800 and
                            score > 1000
                        )

                        all_posts.append({
                            'title': title,
                            'summary': title,
                            'url': link,
                            'source': f"r/{subreddit_name}",
                            'timestamp': pub_date,
                            'category': category,
                            'sentiment': sentiment,
                            'impact': impact,
                            'is_breaking': is_breaking,
                            'engagement': {
                                'score': score,
                                'comments': num_comments
                            },
                            'platform': 'reddit'
                        })
                        fetched_count += 1

                    except Exception as e:
                        logger.error(f"Error processing entry from r/{subreddit_name}: {e}")
                        continue

                logger.info(f"Fetched {fetched_count} posts from r/{subreddit_name}")

            except Exception as e:
                logger.error(f"Error fetching r/{subreddit_name}: {e}")
                continue

        # Rank by raw score scaled by subreddit trust weight. Every source
        # is "r/<name>", so slicing off the 2-char prefix recovers the key
        # (str.replace would strip 'r/' anywhere in the name).
        all_posts.sort(
            key=lambda p: p['engagement']['score'] *
            self.SUBREDDITS.get(p['source'][2:], {}).get('weight', 1.0),
            reverse=True
        )

        logger.info(f"Total Reddit posts scraped: {len(all_posts)}")

        return all_posts[:max_posts]

    def get_statistics(self) -> Dict:
        """
        Get statistics about scraped Reddit posts.

        Note: Statistics are now managed by NewsCacheManager.
        This method returns empty stats for backward compatibility.
        """
        return {
            'total': 0,
            'high_impact': 0,
            'breaking': 0,
            'by_category': {
                'macro': 0,
                'markets': 0,
                'geopolitical': 0
            }
        }
|
|
|