# UnifiedFinancialPlatform / app / services / sectoral_news.py
# Author: Dmitry Beresnev — "init project" (commit e189a31)
"""
Sectoral News Scraper - 7 Major Market Sectors
Filters and aggregates news by sector: Finance, Tech, Energy, Healthcare, Consumer, Industrials, Real Estate
Leverages existing RSS infrastructure with sector-specific classification
"""
import calendar
import logging
import re
import zlib
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
from typing import List, Dict, Optional

import feedparser
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class SectoralNewsScraper:
"""
Aggregates news by market sector
Uses RSS feeds + keyword classification
"""
# 7 Sector configuration with keywords and RSS feeds
SECTORS = {
'finance': {
'name': 'Finance',
'keywords': [
'bank', 'JPMorgan', 'Goldman Sachs', 'Morgan Stanley', 'Wells Fargo',
'Citigroup', 'Bank of America', 'fintech', 'lending', 'credit',
'financial sector', 'banking', 'insurance', 'asset management'
],
'rss_sources': [
'https://www.cnbc.com/id/10000664/device/rss/rss.html', # CNBC Banking
'https://feeds.bloomberg.com/markets/news.rss'
],
'weight': 1.5
},
'tech': {
'name': 'Technology',
'keywords': [
'Apple', 'Microsoft', 'Google', 'Alphabet', 'Amazon', 'Meta', 'Facebook',
'NVIDIA', 'AMD', 'Intel', 'semiconductor', 'chip', 'software', 'cloud',
'AI', 'artificial intelligence', 'tech sector', 'Silicon Valley', 'Tesla'
],
'rss_sources': [
'https://www.cnbc.com/id/19854910/device/rss/rss.html', # CNBC Technology
'https://techcrunch.com/feed/'
],
'weight': 1.5
},
'energy': {
'name': 'Energy',
'keywords': [
'oil', 'gas', 'crude', 'petroleum', 'OPEC', 'Exxon', 'ExxonMobil', 'Chevron',
'ConocoPhillips', 'renewable', 'solar', 'wind', 'energy sector', 'pipeline',
'natural gas', 'LNG', 'fracking', 'drilling'
],
'rss_sources': [
'https://www.cnbc.com/id/19832390/device/rss/rss.html', # CNBC Energy
],
'weight': 1.6
},
'healthcare': {
'name': 'Healthcare',
'keywords': [
'pharma', 'pharmaceutical', 'biotech', 'FDA', 'drug', 'vaccine', 'clinical trial',
'Pfizer', 'Johnson & Johnson', 'Merck', 'AbbVie', 'Bristol Myers',
'healthcare', 'hospital', 'medical device', 'therapeutics'
],
'rss_sources': [
'https://www.cnbc.com/id/10000108/device/rss/rss.html', # CNBC Health
],
'weight': 1.5
},
'consumer': {
'name': 'Consumer & Retail',
'keywords': [
'retail', 'Amazon', 'Walmart', 'Target', 'Costco', 'Home Depot',
'e-commerce', 'consumer', 'shopping', 'Black Friday', 'sales',
'Nike', 'Starbucks', 'McDonald\'s', 'consumer goods', 'discretionary'
],
'rss_sources': [
'https://www.cnbc.com/id/10001009/device/rss/rss.html', # CNBC Retail
],
'weight': 1.3
},
'industrials': {
'name': 'Industrials',
'keywords': [
'Boeing', 'Airbus', 'Caterpillar', 'Deere', '3M', 'GE', 'General Electric',
'Honeywell', 'Lockheed Martin', 'manufacturing', 'industrial',
'aerospace', 'defense', 'machinery', 'equipment', 'logistics', 'freight'
],
'rss_sources': [
'https://www.reuters.com/rss/businessNews', # Reuters Business
],
'weight': 1.4
},
'real_estate': {
'name': 'Real Estate',
'keywords': [
'housing', 'mortgage', 'REIT', 'real estate', 'property', 'home sales',
'construction', 'residential', 'commercial real estate', 'housing market',
'home prices', 'rent', 'rental', 'builder', 'homebuilder'
],
'rss_sources': [], # Will rely on keyword filtering from general news
'weight': 1.3
}
}
def __init__(self):
"""Initialize scraper"""
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
})
def scrape_sectoral_news(self, max_items: int = 50, hours: int = 24) -> List[Dict]:
"""
Scrape and classify news by sector
Returns aggregated list sorted by sector and timestamp
"""
all_news = []
seen_urls = set()
# Parallel fetch from all sector RSS feeds
with ThreadPoolExecutor(max_workers=7) as executor:
futures = []
for sector_id, sector_info in self.SECTORS.items():
# Submit RSS fetching task for each sector
futures.append((
executor.submit(self._fetch_sector_news, sector_id, sector_info, hours),
sector_id
))
for future, sector_id in futures:
try:
sector_news = future.result(timeout=35)
# Deduplicate by URL
for item in sector_news:
if item['url'] not in seen_urls:
seen_urls.add(item['url'])
all_news.append(item)
logger.info(f"Fetched {len(sector_news)} items for {sector_id}")
except Exception as e:
logger.error(f"Error fetching {sector_id} news: {e}")
# If no news fetched, use mock data
if not all_news:
logger.warning("No sectoral news fetched - using mock data")
return self._get_mock_sectoral_news()
# Sort by sector priority and timestamp
all_news.sort(
key=lambda x: (x['sector'] != 'tech', x['sector'] != 'finance', -x['timestamp'].timestamp()),
)
return all_news[:max_items]
def _fetch_sector_news(self, sector_id: str, sector_info: Dict, hours: int) -> List[Dict]:
"""Fetch news for a specific sector"""
sector_news = []
# Fetch from sector-specific RSS feeds
for rss_url in sector_info['rss_sources']:
try:
feed_news = self._fetch_rss_feed(rss_url, sector_id, sector_info, hours)
sector_news.extend(feed_news)
except Exception as e:
logger.debug(f"Error fetching RSS {rss_url}: {e}")
# If no RSS news, could also filter general news sources by keywords
# (This would require access to FinanceNewsScraper - skipping for now)
return sector_news
def _fetch_rss_feed(self, rss_url: str, sector_id: str, sector_info: Dict, hours: int) -> List[Dict]:
"""Fetch and parse RSS feed for sector"""
try:
feed = feedparser.parse(rss_url)
if not feed.entries:
return []
news_items = []
cutoff_time = datetime.now() - timedelta(hours=hours)
for entry in feed.entries[:15]: # Limit to 15 per feed
try:
# Parse timestamp
if hasattr(entry, 'published_parsed') and entry.published_parsed:
timestamp = datetime(*entry.published_parsed[:6])
elif hasattr(entry, 'updated_parsed') and entry.updated_parsed:
timestamp = datetime(*entry.updated_parsed[:6])
else:
timestamp = datetime.now()
# Skip old news
if timestamp < cutoff_time:
continue
# Extract title and summary
title = entry.get('title', '')
summary = entry.get('summary', '') or entry.get('description', '')
# Clean HTML from summary
if summary:
summary = BeautifulSoup(summary, 'html.parser').get_text()
summary = summary[:200] + '...' if len(summary) > 200 else summary
url = entry.get('link', '')
# Verify sector relevance by keywords
text = f"{title} {summary}".lower()
keyword_matches = sum(1 for kw in sector_info['keywords'] if kw.lower() in text)
# Skip if not relevant enough (unless from sector-specific feed)
if keyword_matches == 0 and len(sector_info['rss_sources']) > 3:
continue
# Categorize and analyze
category = self._categorize_news(text)
sentiment = self._analyze_sentiment(text)
impact = self._assess_impact(sector_info['weight'], keyword_matches)
news_items.append({
'id': hash(url),
'title': title,
'summary': summary or title[:200],
'source': sector_info['name'],
'sector': sector_id, # Add sector field
'category': category,
'timestamp': timestamp,
'sentiment': sentiment,
'impact': impact,
'url': url,
'likes': 0,
'retweets': 0,
'is_breaking': False,
'source_weight': sector_info['weight'],
'from_web': False
})
except Exception as e:
logger.debug(f"Error parsing RSS entry: {e}")
continue
return news_items
except Exception as e:
logger.error(f"Error fetching RSS feed {rss_url}: {e}")
return []
def _categorize_news(self, text: str) -> str:
"""Categorize news (macro, markets, geopolitical)"""
macro_keywords = ['Fed', 'ECB', 'inflation', 'rate', 'GDP', 'economy', 'recession']
markets_keywords = ['stock', 'earnings', 'revenue', 'profit', 'IPO', 'merger', 'acquisition']
geo_keywords = ['China', 'tariff', 'trade war', 'sanctions', 'regulation']
macro_score = sum(1 for kw in macro_keywords if kw.lower() in text)
markets_score = sum(1 for kw in markets_keywords if kw.lower() in text)
geo_score = sum(1 for kw in geo_keywords if kw.lower() in text)
scores = {'macro': macro_score, 'markets': markets_score, 'geopolitical': geo_score}
return max(scores, key=scores.get) if max(scores.values()) > 0 else 'markets'
def _analyze_sentiment(self, text: str) -> str:
"""Analyze sentiment based on keywords"""
positive = ['surge', 'soar', 'rally', 'beat', 'upgrade', 'gain', 'rise', 'bullish', 'positive']
negative = ['plunge', 'crash', 'fall', 'miss', 'downgrade', 'loss', 'drop', 'bearish', 'negative']
pos_count = sum(1 for word in positive if word in text)
neg_count = sum(1 for word in negative if word in text)
if pos_count > neg_count:
return 'positive'
elif neg_count > pos_count:
return 'negative'
return 'neutral'
def _assess_impact(self, sector_weight: float, keyword_matches: int) -> str:
"""Assess impact based on sector weight and keyword relevance"""
if sector_weight >= 1.5 and keyword_matches >= 3:
return 'high'
elif keyword_matches >= 2:
return 'medium'
else:
return 'low'
def _get_mock_sectoral_news(self) -> List[Dict]:
"""Mock sectoral news for development"""
now = datetime.now()
return [
{
'id': 1,
'title': 'Apple announces new iPhone with advanced AI capabilities',
'summary': 'Apple unveils next-generation iPhone featuring on-device AI processing',
'source': 'Technology',
'sector': 'tech',
'category': 'markets',
'timestamp': now - timedelta(minutes=30),
'sentiment': 'positive',
'impact': 'high',
'url': 'https://techcrunch.com',
'likes': 0,
'retweets': 0,
'is_breaking': False,
'source_weight': 1.5,
'from_web': False
},
{
'id': 2,
'title': 'JPMorgan reports strong Q4 earnings beat analyst expectations',
'summary': 'Major investment bank posts record profits amid trading surge',
'source': 'Finance',
'sector': 'finance',
'category': 'markets',
'timestamp': now - timedelta(hours=1),
'sentiment': 'positive',
'impact': 'high',
'url': 'https://cnbc.com',
'likes': 0,
'retweets': 0,
'is_breaking': False,
'source_weight': 1.5,
'from_web': False
},
{
'id': 3,
'title': 'OPEC+ extends oil production cuts through Q2',
'summary': 'Major oil producers agree to maintain supply restrictions',
'source': 'Energy',
'sector': 'energy',
'category': 'geopolitical',
'timestamp': now - timedelta(hours=2),
'sentiment': 'neutral',
'impact': 'high',
'url': 'https://reuters.com',
'likes': 0,
'retweets': 0,
'is_breaking': False,
'source_weight': 1.6,
'from_web': False
},
{
'id': 4,
'title': 'Pfizer receives FDA approval for new cancer treatment',
'summary': 'Breakthrough therapy approved for late-stage lung cancer',
'source': 'Healthcare',
'sector': 'healthcare',
'category': 'markets',
'timestamp': now - timedelta(hours=3),
'sentiment': 'positive',
'impact': 'medium',
'url': 'https://cnbc.com',
'likes': 0,
'retweets': 0,
'is_breaking': False,
'source_weight': 1.5,
'from_web': False
},
{
'id': 5,
'title': 'Amazon expands same-day delivery to 50 new cities',
'summary': 'E-commerce giant accelerates logistics network expansion',
'source': 'Consumer & Retail',
'sector': 'consumer',
'category': 'markets',
'timestamp': now - timedelta(hours=4),
'sentiment': 'positive',
'impact': 'medium',
'url': 'https://techcrunch.com',
'likes': 0,
'retweets': 0,
'is_breaking': False,
'source_weight': 1.3,
'from_web': False
},
{
'id': 6,
'title': 'Boeing wins $10B contract for new military aircraft',
'summary': 'Defense contractor secures major government order',
'source': 'Industrials',
'sector': 'industrials',
'category': 'markets',
'timestamp': now - timedelta(hours=5),
'sentiment': 'positive',
'impact': 'medium',
'url': 'https://reuters.com',
'likes': 0,
'retweets': 0,
'is_breaking': False,
'source_weight': 1.4,
'from_web': False
},
{
'id': 7,
'title': 'US housing starts surge 15% in December',
'summary': 'Construction activity rebounds amid lower mortgage rates',
'source': 'Real Estate',
'sector': 'real_estate',
'category': 'macro',
'timestamp': now - timedelta(hours=6),
'sentiment': 'positive',
'impact': 'medium',
'url': 'https://cnbc.com',
'likes': 0,
'retweets': 0,
'is_breaking': False,
'source_weight': 1.3,
'from_web': False
}
]