Spaces:

ResearchEngineering
/

UnifiedFinancialPlatform

Paused

UnifiedFinancialPlatform / app / services / sectoral_news.py

Dmitry Beresnev

init project

e189a31 about 1 month ago

17.2 kB

	"""
	Sectoral News Scraper - 7 Major Market Sectors
	Filters and aggregates news by sector: Finance, Tech, Energy, Healthcare, Consumer, Industrials, Real Estate
	Leverages existing RSS infrastructure with sector-specific classification
	"""

	import calendar
	import hashlib
	import logging
	import re
	from concurrent.futures import ThreadPoolExecutor
	from datetime import datetime, timedelta
	from typing import List, Dict, Optional

	import requests
	import pandas as pd
	import feedparser
	from bs4 import BeautifulSoup

	# Configure logging
	# NOTE(review): basicConfig() runs at import time and targets the root
	# logger, so importing this module can set up logging for the whole
	# process (it is a no-op if the root logger already has handlers).
	# Confirm this is intended rather than leaving configuration to the app.
	logging.basicConfig(level=logging.INFO)
	# Module-level logger used by every method of SectoralNewsScraper.
	logger = logging.getLogger(__name__)


	class SectoralNewsScraper:
	    """
	    Aggregates news by market sector.

	    Each sector in ``SECTORS`` lists RSS feeds and keywords. Feeds are
	    downloaded in parallel, every entry is scored against the sector
	    keywords and tagged with a category, a sentiment and an impact level,
	    and the combined list is de-duplicated by URL.

	    All timestamps produced by this class are naive datetimes in local time
	    (the same convention as ``datetime.now()``).
	    """

	    # 7 Sector configuration with keywords and RSS feeds
	    SECTORS = {
	        'finance': {
	            'name': 'Finance',
	            'keywords': [
	                'bank', 'JPMorgan', 'Goldman Sachs', 'Morgan Stanley', 'Wells Fargo',
	                'Citigroup', 'Bank of America', 'fintech', 'lending', 'credit',
	                'financial sector', 'banking', 'insurance', 'asset management'
	            ],
	            'rss_sources': [
	                'https://www.cnbc.com/id/10000664/device/rss/rss.html',  # CNBC Banking
	                'https://feeds.bloomberg.com/markets/news.rss'
	            ],
	            'weight': 1.5
	        },
	        'tech': {
	            'name': 'Technology',
	            'keywords': [
	                'Apple', 'Microsoft', 'Google', 'Alphabet', 'Amazon', 'Meta', 'Facebook',
	                'NVIDIA', 'AMD', 'Intel', 'semiconductor', 'chip', 'software', 'cloud',
	                'AI', 'artificial intelligence', 'tech sector', 'Silicon Valley', 'Tesla'
	            ],
	            'rss_sources': [
	                'https://www.cnbc.com/id/19854910/device/rss/rss.html',  # CNBC Technology
	                'https://techcrunch.com/feed/'
	            ],
	            'weight': 1.5
	        },
	        'energy': {
	            'name': 'Energy',
	            'keywords': [
	                'oil', 'gas', 'crude', 'petroleum', 'OPEC', 'Exxon', 'ExxonMobil', 'Chevron',
	                'ConocoPhillips', 'renewable', 'solar', 'wind', 'energy sector', 'pipeline',
	                'natural gas', 'LNG', 'fracking', 'drilling'
	            ],
	            'rss_sources': [
	                'https://www.cnbc.com/id/19832390/device/rss/rss.html',  # CNBC Energy
	            ],
	            'weight': 1.6
	        },
	        'healthcare': {
	            'name': 'Healthcare',
	            'keywords': [
	                'pharma', 'pharmaceutical', 'biotech', 'FDA', 'drug', 'vaccine', 'clinical trial',
	                'Pfizer', 'Johnson & Johnson', 'Merck', 'AbbVie', 'Bristol Myers',
	                'healthcare', 'hospital', 'medical device', 'therapeutics'
	            ],
	            'rss_sources': [
	                'https://www.cnbc.com/id/10000108/device/rss/rss.html',  # CNBC Health
	            ],
	            'weight': 1.5
	        },
	        'consumer': {
	            'name': 'Consumer & Retail',
	            'keywords': [
	                'retail', 'Amazon', 'Walmart', 'Target', 'Costco', 'Home Depot',
	                'e-commerce', 'consumer', 'shopping', 'Black Friday', 'sales',
	                'Nike', 'Starbucks', 'McDonald\'s', 'consumer goods', 'discretionary'
	            ],
	            'rss_sources': [
	                'https://www.cnbc.com/id/10001009/device/rss/rss.html',  # CNBC Retail
	            ],
	            'weight': 1.3
	        },
	        'industrials': {
	            'name': 'Industrials',
	            'keywords': [
	                'Boeing', 'Airbus', 'Caterpillar', 'Deere', '3M', 'GE', 'General Electric',
	                'Honeywell', 'Lockheed Martin', 'manufacturing', 'industrial',
	                'aerospace', 'defense', 'machinery', 'equipment', 'logistics', 'freight'
	            ],
	            'rss_sources': [
	                'https://www.reuters.com/rss/businessNews',  # Reuters Business
	            ],
	            'weight': 1.4
	        },
	        'real_estate': {
	            'name': 'Real Estate',
	            'keywords': [
	                'housing', 'mortgage', 'REIT', 'real estate', 'property', 'home sales',
	                'construction', 'residential', 'commercial real estate', 'housing market',
	                'home prices', 'rent', 'rental', 'builder', 'homebuilder'
	            ],
	            'rss_sources': [],  # Will rely on keyword filtering from general news
	            'weight': 1.3
	        }
	    }

	    # Sectors that are listed first in the aggregated result, in this order;
	    # every other sector shares the lowest priority.
	    _SECTOR_PRIORITY = {'tech': 0, 'finance': 1}

	    # Per-request network timeout in seconds for a single RSS download.
	    _REQUEST_TIMEOUT = 10

	    # Only the first N entries of each feed are inspected.
	    _MAX_ENTRIES_PER_FEED = 15

	    # Summaries longer than this many characters are cut and suffixed with '...'.
	    _SUMMARY_LIMIT = 200

	    def __init__(self):
	        """Initialize scraper with a shared HTTP session and browser-like headers."""
	        self.session = requests.Session()
	        self.session.headers.update({
	            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
	            # The wildcard range must be spelled '*/*'; a bare '/' is not a
	            # valid media range.
	            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	            'Accept-Language': 'en-US,en;q=0.9',
	        })

	    def scrape_sectoral_news(self, max_items: int = 50, hours: int = 24) -> List[Dict]:
	        """
	        Scrape and classify news for every configured sector.

	        Args:
	            max_items: Maximum number of items to return.
	            hours: Only entries newer than this many hours are kept.

	        Returns:
	            News dicts de-duplicated by URL and sorted by sector priority
	            ('tech' first, then 'finance', then all others), newest first
	            within each priority. If nothing could be fetched, the mock data
	            from ``_get_mock_sectoral_news`` is returned instead.

	        Note:
	            Truncation to ``max_items`` happens after sorting, so the
	            high-priority sectors can crowd out the others entirely.
	        """
	        all_news = []
	        seen_urls = set()

	        # Parallel fetch: one worker per sector.
	        with ThreadPoolExecutor(max_workers=max(1, len(self.SECTORS))) as executor:
	            futures = [
	                (executor.submit(self._fetch_sector_news, sector_id, sector_info, hours), sector_id)
	                for sector_id, sector_info in self.SECTORS.items()
	            ]

	            for future, sector_id in futures:
	                try:
	                    sector_news = future.result(timeout=35)
	                except Exception as e:
	                    logger.error(f"Error fetching {sector_id} news: {e}")
	                    continue

	                # Deduplicate by URL (first sector to report a URL wins).
	                for item in sector_news:
	                    if item['url'] not in seen_urls:
	                        seen_urls.add(item['url'])
	                        all_news.append(item)

	                logger.info(f"Fetched {len(sector_news)} items for {sector_id}")

	        # If no news fetched, use mock data
	        if not all_news:
	            logger.warning("No sectoral news fetched - using mock data")
	            return self._get_mock_sectoral_news()

	        # Sort by sector priority, then newest first.
	        lowest_priority = len(self._SECTOR_PRIORITY)
	        all_news.sort(
	            key=lambda item: (
	                self._SECTOR_PRIORITY.get(item['sector'], lowest_priority),
	                -item['timestamp'].timestamp(),
	            )
	        )

	        return all_news[:max_items]

	    def _fetch_sector_news(self, sector_id: str, sector_info: Dict, hours: int) -> List[Dict]:
	        """
	        Fetch news for a specific sector from all of its RSS feeds.

	        A failing feed is logged and skipped; the remaining feeds are still
	        processed. Sectors without feeds yield an empty list.
	        """
	        sector_news = []

	        for rss_url in sector_info['rss_sources']:
	            try:
	                sector_news.extend(self._fetch_rss_feed(rss_url, sector_id, sector_info, hours))
	            except Exception as e:
	                logger.debug(f"Error fetching RSS {rss_url}: {e}")

	        # If no RSS news, could also filter general news sources by keywords
	        # (This would require access to FinanceNewsScraper - skipping for now)

	        return sector_news

	    def _fetch_rss_feed(self, rss_url: str, sector_id: str, sector_info: Dict, hours: int) -> List[Dict]:
	        """
	        Download one RSS feed and convert its recent entries to news dicts.

	        The feed is downloaded through ``self.session`` with an explicit
	        timeout, so a stalled server cannot block a worker thread
	        indefinitely, and the configured headers are actually sent.

	        Returns:
	            A list of news dicts; empty on any download or parse failure
	            (this method never raises).
	        """
	        try:
	            # NOTE(review): the session is shared by the worker threads of
	            # scrape_sectoral_news; confirm concurrent GETs on one
	            # requests.Session are acceptable for this deployment.
	            response = self.session.get(rss_url, timeout=self._REQUEST_TIMEOUT)
	            response.raise_for_status()
	            feed = feedparser.parse(response.content)

	            if not feed.entries:
	                return []

	            news_items = []
	            cutoff_time = datetime.now() - timedelta(hours=hours)

	            for entry in feed.entries[:self._MAX_ENTRIES_PER_FEED]:
	                try:
	                    item = self._build_news_item(entry, sector_id, sector_info, cutoff_time)
	                except Exception as e:
	                    logger.debug(f"Error parsing RSS entry: {e}")
	                    continue
	                if item is not None:
	                    news_items.append(item)

	            return news_items

	        except Exception as e:
	            logger.error(f"Error fetching RSS feed {rss_url}: {e}")
	            return []

	    def _build_news_item(self, entry, sector_id: str, sector_info: Dict,
	                         cutoff_time: datetime) -> Optional[Dict]:
	        """
	        Convert a single feed entry into a news dict.

	        Returns:
	            The news dict, or ``None`` when the entry is older than
	            ``cutoff_time`` or is rejected by the relevance filter.
	        """
	        timestamp = self._entry_timestamp(entry)

	        # Skip old news
	        if timestamp < cutoff_time:
	            return None

	        title = entry.get('title', '')
	        summary = entry.get('summary', '') or entry.get('description', '')

	        # Strip HTML markup from the summary and cap its length.
	        if summary:
	            summary = BeautifulSoup(summary, 'html.parser').get_text()
	            if len(summary) > self._SUMMARY_LIMIT:
	                summary = summary[:self._SUMMARY_LIMIT] + '...'

	        url = entry.get('link', '')

	        # Verify sector relevance by keywords
	        text = f"{title} {summary}".lower()
	        keyword_matches = self._count_keyword_hits(text, sector_info['keywords'])

	        # Skip if not relevant enough (unless from sector-specific feed)
	        # NOTE(review): no sector currently lists more than 3 feeds, so this
	        # filter never triggers with the present configuration - confirm the
	        # intended threshold.
	        if keyword_matches == 0 and len(sector_info['rss_sources']) > 3:
	            return None

	        return {
	            'id': self._stable_id(url),
	            'title': title,
	            'summary': summary or title[:200],
	            'source': sector_info['name'],
	            'sector': sector_id,
	            'category': self._categorize_news(text),
	            'timestamp': timestamp,
	            'sentiment': self._analyze_sentiment(text),
	            'impact': self._assess_impact(sector_info['weight'], keyword_matches),
	            'url': url,
	            'likes': 0,
	            'retweets': 0,
	            'is_breaking': False,
	            'source_weight': sector_info['weight'],
	            'from_web': False
	        }

	    @staticmethod
	    def _entry_timestamp(entry) -> datetime:
	        """
	        Return the entry's publication time as a naive local datetime.

	        feedparser exposes ``published_parsed`` / ``updated_parsed`` as UTC
	        time tuples. They are converted to local time so that they compare
	        correctly with ``datetime.now()``. Entries without a date are
	        treated as published right now.
	        """
	        for field in ('published_parsed', 'updated_parsed'):
	            parsed = getattr(entry, field, None)
	            if parsed:
	                return datetime.fromtimestamp(calendar.timegm(parsed))
	        return datetime.now()

	    @staticmethod
	    def _stable_id(url: str) -> int:
	        """
	        Derive a signed 64-bit integer id from a URL.

	        Unlike the built-in ``hash()`` for strings, the value does not change
	        between interpreter runs.
	        """
	        digest = hashlib.sha256(url.encode('utf-8')).digest()
	        return int.from_bytes(digest[:8], 'big', signed=True)

	    @staticmethod
	    def _count_keyword_hits(text: str, keywords) -> int:
	        """
	        Count how many of ``keywords`` occur in ``text``.

	        ``text`` is expected to be lower-cased already; keywords are
	        lower-cased here. A keyword must start at a word boundary, so 'rate'
	        no longer matches "corporate". Keywords of up to three characters
	        ('AI', 'GE', 'oil', ...) must additionally be a whole word, allowing
	        only a plural 's', so 'AI' no longer matches "said" or "aid". Longer
	        keywords may continue into the word ('bank' matches "banking").

	        Each keyword is counted at most once.
	        """
	        hits = 0
	        for keyword in keywords:
	            needle = re.escape(keyword.lower())
	            tail = r's?(?!\w)' if len(keyword) <= 3 else ''
	            if re.search(rf'(?<!\w){needle}{tail}', text):
	                hits += 1
	        return hits

	    def _categorize_news(self, text: str) -> str:
	        """
	        Categorize news as 'macro', 'markets' or 'geopolitical'.

	        The category with the most keyword hits wins; ties resolve in the
	        order macro, markets, geopolitical. Text without any hit defaults to
	        'markets'. ``text`` is expected to be lower-cased.
	        """
	        macro_keywords = ['Fed', 'ECB', 'inflation', 'rate', 'GDP', 'economy', 'recession']
	        markets_keywords = ['stock', 'earnings', 'revenue', 'profit', 'IPO', 'merger', 'acquisition']
	        geo_keywords = ['China', 'tariff', 'trade war', 'sanctions', 'regulation']

	        scores = {
	            'macro': self._count_keyword_hits(text, macro_keywords),
	            'markets': self._count_keyword_hits(text, markets_keywords),
	            'geopolitical': self._count_keyword_hits(text, geo_keywords),
	        }
	        return max(scores, key=scores.get) if max(scores.values()) > 0 else 'markets'

	    def _analyze_sentiment(self, text: str) -> str:
	        """
	        Analyze sentiment based on keywords.

	        Returns 'positive' or 'negative' depending on which word list has
	        more hits, and 'neutral' on a tie (including no hits at all).
	        ``text`` is expected to be lower-cased.
	        """
	        positive = ['surge', 'soar', 'rally', 'beat', 'upgrade', 'gain', 'rise', 'bullish', 'positive']
	        negative = ['plunge', 'crash', 'fall', 'miss', 'downgrade', 'loss', 'drop', 'bearish', 'negative']

	        pos_count = self._count_keyword_hits(text, positive)
	        neg_count = self._count_keyword_hits(text, negative)

	        if pos_count > neg_count:
	            return 'positive'
	        if neg_count > pos_count:
	            return 'negative'
	        return 'neutral'

	    def _assess_impact(self, sector_weight: float, keyword_matches: int) -> str:
	        """
	        Assess impact based on sector weight and keyword relevance.

	        'high' needs a sector weight of at least 1.5 and three or more keyword
	        matches; two or more matches give 'medium'; everything else is 'low'.
	        """
	        if sector_weight >= 1.5 and keyword_matches >= 3:
	            return 'high'
	        if keyword_matches >= 2:
	            return 'medium'
	        return 'low'

	    def _get_mock_sectoral_news(self) -> List[Dict]:
	        """
	        Mock sectoral news for development.

	        Returns one hand-written item per sector (ids 1-7) with timestamps
	        relative to the current local time. 'source' and 'source_weight' are
	        taken from ``SECTORS`` so they cannot drift from the configuration.
	        """
	        now = datetime.now()

	        # (title, summary, sector, category, age, sentiment, impact, url)
	        rows = [
	            ('Apple announces new iPhone with advanced AI capabilities',
	             'Apple unveils next-generation iPhone featuring on-device AI processing',
	             'tech', 'markets', timedelta(minutes=30), 'positive', 'high',
	             'https://techcrunch.com'),
	            ('JPMorgan reports strong Q4 earnings beat analyst expectations',
	             'Major investment bank posts record profits amid trading surge',
	             'finance', 'markets', timedelta(hours=1), 'positive', 'high',
	             'https://cnbc.com'),
	            ('OPEC+ extends oil production cuts through Q2',
	             'Major oil producers agree to maintain supply restrictions',
	             'energy', 'geopolitical', timedelta(hours=2), 'neutral', 'high',
	             'https://reuters.com'),
	            ('Pfizer receives FDA approval for new cancer treatment',
	             'Breakthrough therapy approved for late-stage lung cancer',
	             'healthcare', 'markets', timedelta(hours=3), 'positive', 'medium',
	             'https://cnbc.com'),
	            ('Amazon expands same-day delivery to 50 new cities',
	             'E-commerce giant accelerates logistics network expansion',
	             'consumer', 'markets', timedelta(hours=4), 'positive', 'medium',
	             'https://techcrunch.com'),
	            ('Boeing wins $10B contract for new military aircraft',
	             'Defense contractor secures major government order',
	             'industrials', 'markets', timedelta(hours=5), 'positive', 'medium',
	             'https://reuters.com'),
	            ('US housing starts surge 15% in December',
	             'Construction activity rebounds amid lower mortgage rates',
	             'real_estate', 'macro', timedelta(hours=6), 'positive', 'medium',
	             'https://cnbc.com'),
	        ]

	        return [
	            {
	                'id': item_id,
	                'title': title,
	                'summary': summary,
	                'source': self.SECTORS[sector]['name'],
	                'sector': sector,
	                'category': category,
	                'timestamp': now - age,
	                'sentiment': sentiment,
	                'impact': impact,
	                'url': url,
	                'likes': 0,
	                'retweets': 0,
	                'is_breaking': False,
	                'source_weight': self.SECTORS[sector]['weight'],
	                'from_web': False
	            }
	            for item_id, (title, summary, sector, category, age, sentiment, impact, url)
	            in enumerate(rows, start=1)
	        ]