"""
Sectoral News Scraper - 7 Major Market Sectors
Filters and aggregates news by sector: Finance, Tech, Energy, Healthcare, Consumer, Industrials, Real Estate
Leverages existing RSS infrastructure with sector-specific classification
"""
|
|
import calendar
import hashlib
import logging
import re
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
from typing import List, Dict, Optional

import feedparser
import pandas as pd
import requests
from bs4 import BeautifulSoup
|
|
| |
# Configure the root logger at import time so INFO-and-above messages are
# emitted by default (basicConfig is a no-op if handlers already exist).
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
class SectoralNewsScraper:
    """
    Aggregates news by market sector.

    Each sector in ``SECTORS`` lists RSS feeds and relevance keywords. Feed
    entries are turned into plain dicts carrying a keyword-based category,
    sentiment and impact rating. When nothing can be fetched, a fixed set of
    mock items is returned instead.
    """

    # Sector registry: display name, relevance keywords, RSS feeds and the
    # weight used by the impact rating. 'real_estate' lists no feeds, so it
    # currently only ever appears in the mock data.
    SECTORS = {
        'finance': {
            'name': 'Finance',
            'keywords': [
                'bank', 'JPMorgan', 'Goldman Sachs', 'Morgan Stanley', 'Wells Fargo',
                'Citigroup', 'Bank of America', 'fintech', 'lending', 'credit',
                'financial sector', 'banking', 'insurance', 'asset management'
            ],
            'rss_sources': [
                'https://www.cnbc.com/id/10000664/device/rss/rss.html',
                'https://feeds.bloomberg.com/markets/news.rss'
            ],
            'weight': 1.5
        },
        'tech': {
            'name': 'Technology',
            'keywords': [
                'Apple', 'Microsoft', 'Google', 'Alphabet', 'Amazon', 'Meta', 'Facebook',
                'NVIDIA', 'AMD', 'Intel', 'semiconductor', 'chip', 'software', 'cloud',
                'AI', 'artificial intelligence', 'tech sector', 'Silicon Valley', 'Tesla'
            ],
            'rss_sources': [
                'https://www.cnbc.com/id/19854910/device/rss/rss.html',
                'https://techcrunch.com/feed/'
            ],
            'weight': 1.5
        },
        'energy': {
            'name': 'Energy',
            'keywords': [
                'oil', 'gas', 'crude', 'petroleum', 'OPEC', 'Exxon', 'ExxonMobil', 'Chevron',
                'ConocoPhillips', 'renewable', 'solar', 'wind', 'energy sector', 'pipeline',
                'natural gas', 'LNG', 'fracking', 'drilling'
            ],
            'rss_sources': [
                'https://www.cnbc.com/id/19832390/device/rss/rss.html',
            ],
            'weight': 1.6
        },
        'healthcare': {
            'name': 'Healthcare',
            'keywords': [
                'pharma', 'pharmaceutical', 'biotech', 'FDA', 'drug', 'vaccine', 'clinical trial',
                'Pfizer', 'Johnson & Johnson', 'Merck', 'AbbVie', 'Bristol Myers',
                'healthcare', 'hospital', 'medical device', 'therapeutics'
            ],
            'rss_sources': [
                'https://www.cnbc.com/id/10000108/device/rss/rss.html',
            ],
            'weight': 1.5
        },
        'consumer': {
            'name': 'Consumer & Retail',
            'keywords': [
                'retail', 'Amazon', 'Walmart', 'Target', 'Costco', 'Home Depot',
                'e-commerce', 'consumer', 'shopping', 'Black Friday', 'sales',
                'Nike', 'Starbucks', 'McDonald\'s', 'consumer goods', 'discretionary'
            ],
            'rss_sources': [
                'https://www.cnbc.com/id/10001009/device/rss/rss.html',
            ],
            'weight': 1.3
        },
        'industrials': {
            'name': 'Industrials',
            'keywords': [
                'Boeing', 'Airbus', 'Caterpillar', 'Deere', '3M', 'GE', 'General Electric',
                'Honeywell', 'Lockheed Martin', 'manufacturing', 'industrial',
                'aerospace', 'defense', 'machinery', 'equipment', 'logistics', 'freight'
            ],
            'rss_sources': [
                'https://www.reuters.com/rss/businessNews',
            ],
            'weight': 1.4
        },
        'real_estate': {
            'name': 'Real Estate',
            'keywords': [
                'housing', 'mortgage', 'REIT', 'real estate', 'property', 'home sales',
                'construction', 'residential', 'commercial real estate', 'housing market',
                'home prices', 'rent', 'rental', 'builder', 'homebuilder'
            ],
            'rss_sources': [],
            'weight': 1.3
        }
    }

    # Keyword tables used by the category / sentiment heuristics.
    MACRO_KEYWORDS = ['Fed', 'ECB', 'inflation', 'rate', 'GDP', 'economy', 'recession']
    MARKETS_KEYWORDS = ['stock', 'earnings', 'revenue', 'profit', 'IPO', 'merger', 'acquisition']
    GEO_KEYWORDS = ['China', 'tariff', 'trade war', 'sanctions', 'regulation']
    POSITIVE_WORDS = ['surge', 'soar', 'rally', 'beat', 'upgrade', 'gain', 'rise', 'bullish', 'positive']
    NEGATIVE_WORDS = ['plunge', 'crash', 'fall', 'miss', 'downgrade', 'loss', 'drop', 'bearish', 'negative']

    # Seconds to wait on a single feed download before giving up.
    REQUEST_TIMEOUT = 10
    # Only the first entries of each feed are inspected.
    MAX_ENTRIES_PER_FEED = 15
    # Summaries longer than this are truncated and suffixed with '...'.
    SUMMARY_MAX_CHARS = 200

    def __init__(self):
        """Create the HTTP session (with browser-like headers) used for feed downloads."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        })

    def scrape_sectoral_news(self, max_items: int = 50, hours: int = 24) -> List[Dict]:
        """
        Scrape and classify news for every sector.

        Args:
            max_items: Maximum number of items to return.
            hours: Only entries published within this many hours are kept.

        Returns:
            News dicts ordered tech first, then finance, then all other
            sectors, newest first within each group. If no feed yields any
            item, mock data is returned instead.
        """
        all_news = []
        seen_keys = set()

        # One worker per sector; the work is network-bound.
        with ThreadPoolExecutor(max_workers=max(1, len(self.SECTORS))) as executor:
            futures = [
                (executor.submit(self._fetch_sector_news, sector_id, sector_info, hours), sector_id)
                for sector_id, sector_info in self.SECTORS.items()
            ]

            for future, sector_id in futures:
                try:
                    sector_news = future.result(timeout=35)
                except Exception as e:
                    # Best effort: one failing sector must not sink the rest.
                    logger.error(f"Error fetching {sector_id} news: {e}")
                    continue

                for item in sector_news:
                    # Fall back to the title so entries without a link are
                    # not all collapsed into a single "empty URL" item.
                    dedupe_key = item['url'] or item['title']
                    if dedupe_key not in seen_keys:
                        seen_keys.add(dedupe_key)
                        all_news.append(item)

                logger.info(f"Fetched {len(sector_news)} items for {sector_id}")

        if not all_news:
            logger.warning("No sectoral news fetched - using mock data")
            return self._get_mock_sectoral_news()[:max_items]

        all_news.sort(key=self._sort_key)
        return all_news[:max_items]

    @staticmethod
    def _sort_key(item: Dict) -> tuple:
        """Sort key: tech first, finance second, then newest first."""
        sector = item['sector']
        return (sector != 'tech', sector != 'finance', -item['timestamp'].timestamp())

    def _fetch_sector_news(self, sector_id: str, sector_info: Dict, hours: int) -> List[Dict]:
        """Fetch news from every RSS feed configured for one sector."""
        sector_news = []

        for rss_url in sector_info['rss_sources']:
            try:
                sector_news.extend(self._fetch_rss_feed(rss_url, sector_id, sector_info, hours))
            except Exception as e:
                logger.debug(f"Error fetching RSS {rss_url}: {e}")

        return sector_news

    def _fetch_rss_feed(self, rss_url: str, sector_id: str, sector_info: Dict, hours: int) -> List[Dict]:
        """
        Download one RSS feed and convert its recent entries to news dicts.

        Returns an empty list when the feed cannot be downloaded or parsed.
        """
        try:
            # Download through the shared session so the configured headers
            # and a timeout apply; a stalled feed would otherwise block the
            # thread pool from ever shutting down.
            response = self.session.get(rss_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            feed = feedparser.parse(response.content)
        except Exception as e:
            logger.error(f"Error fetching RSS feed {rss_url}: {e}")
            return []

        cutoff_time = datetime.now() - timedelta(hours=hours)
        news_items = []

        for entry in feed.entries[:self.MAX_ENTRIES_PER_FEED]:
            try:
                item = self._build_news_item(entry, sector_id, sector_info, cutoff_time)
            except Exception as e:
                logger.debug(f"Error parsing RSS entry: {e}")
                continue
            if item is not None:
                news_items.append(item)

        return news_items

    def _build_news_item(self, entry, sector_id: str, sector_info: Dict,
                         cutoff_time: datetime) -> Optional[Dict]:
        """Convert one feed entry to a news dict, or None if it is filtered out."""
        timestamp = self._entry_timestamp(entry)
        if timestamp < cutoff_time:
            return None

        title = entry.get('title', '')
        summary = entry.get('summary', '') or entry.get('description', '')

        if summary:
            # Feed summaries are often HTML; keep the text only.
            summary = BeautifulSoup(summary, 'html.parser').get_text()
            if len(summary) > self.SUMMARY_MAX_CHARS:
                summary = summary[:self.SUMMARY_MAX_CHARS] + '...'

        url = entry.get('link', '')

        text = f"{title} {summary}".lower()
        keyword_matches = self._count_keyword_matches(sector_info['keywords'], text)

        # NOTE(review): no sector currently lists more than 3 feeds, so this
        # relevance filter never fires. Kept as-is; confirm the intended
        # threshold before tightening it.
        if keyword_matches == 0 and len(sector_info['rss_sources']) > 3:
            return None

        return {
            'id': self._stable_id(url),
            'title': title,
            'summary': summary or title[:200],
            'source': sector_info['name'],
            'sector': sector_id,
            'category': self._categorize_news(text),
            'timestamp': timestamp,
            'sentiment': self._analyze_sentiment(text),
            'impact': self._assess_impact(sector_info['weight'], keyword_matches),
            'url': url,
            'likes': 0,
            'retweets': 0,
            'is_breaking': False,
            'source_weight': sector_info['weight'],
            'from_web': False
        }

    @staticmethod
    def _entry_timestamp(entry) -> datetime:
        """
        Return the entry's publication time as a naive local datetime.

        feedparser reports ``*_parsed`` dates in UTC, while the cutoff and
        the mock data use naive local time, so the value is converted rather
        than used verbatim. Entries without a date are treated as "now".
        """
        parsed = entry.get('published_parsed') or entry.get('updated_parsed')
        if not parsed:
            return datetime.now()
        return datetime.fromtimestamp(calendar.timegm(parsed))

    @staticmethod
    def _stable_id(url: str) -> int:
        """Return an integer id for *url* that is identical across processes.

        The built-in ``hash()`` of a string is salted per interpreter run,
        so it cannot serve as a persistent identifier.
        """
        digest = hashlib.sha256(url.encode('utf-8')).digest()
        return int.from_bytes(digest[:8], 'big')

    @staticmethod
    def _keyword_in_text(keyword: str, text: str) -> bool:
        """
        Case-insensitively test whether *keyword* occurs in *text* on a word boundary.

        All-caps keywords (acronyms such as 'AI' or 'GE') must match as a
        whole word, optionally pluralised, so they do not fire inside words
        like "said" or "change". Other keywords only need to start a word,
        so inflections ("surged", "retailers") still count while mid-word
        hits ("against" for 'gain', "corporate" for 'rate') do not.
        """
        stem = re.escape(keyword)
        if keyword.isupper():
            pattern = rf"(?<!\w){stem}s?(?!\w)"
        else:
            pattern = rf"(?<!\w){stem}"
        return re.search(pattern, text, re.IGNORECASE) is not None

    @classmethod
    def _count_keyword_matches(cls, keywords: List[str], text: str) -> int:
        """Count how many distinct *keywords* occur in *text*."""
        return sum(1 for kw in keywords if cls._keyword_in_text(kw, text))

    def _categorize_news(self, text: str) -> str:
        """
        Categorize news as 'macro', 'markets' or 'geopolitical'.

        The category with the most keyword hits wins (ties resolve in that
        order); text with no hits at all defaults to 'markets'.
        """
        scores = {
            'macro': self._count_keyword_matches(self.MACRO_KEYWORDS, text),
            'markets': self._count_keyword_matches(self.MARKETS_KEYWORDS, text),
            'geopolitical': self._count_keyword_matches(self.GEO_KEYWORDS, text),
        }
        return max(scores, key=scores.get) if max(scores.values()) > 0 else 'markets'

    def _analyze_sentiment(self, text: str) -> str:
        """Return 'positive', 'negative' or 'neutral' from sentiment keyword counts."""
        pos_count = self._count_keyword_matches(self.POSITIVE_WORDS, text)
        neg_count = self._count_keyword_matches(self.NEGATIVE_WORDS, text)

        if pos_count > neg_count:
            return 'positive'
        if neg_count > pos_count:
            return 'negative'
        return 'neutral'

    def _assess_impact(self, sector_weight: float, keyword_matches: int) -> str:
        """
        Assess impact based on sector weight and keyword relevance.

        'high' needs a weight of at least 1.5 and three or more keyword
        matches; two or more matches give 'medium'; anything else is 'low'.
        """
        if sector_weight >= 1.5 and keyword_matches >= 3:
            return 'high'
        if keyword_matches >= 2:
            return 'medium'
        return 'low'

    def _get_mock_sectoral_news(self) -> List[Dict]:
        """Return one mock news item per sector, for development and as a fallback."""
        now = datetime.now()

        # (sector, title, summary, category, age, sentiment, impact, url)
        rows = [
            ('tech',
             'Apple announces new iPhone with advanced AI capabilities',
             'Apple unveils next-generation iPhone featuring on-device AI processing',
             'markets', timedelta(minutes=30), 'positive', 'high', 'https://techcrunch.com'),
            ('finance',
             'JPMorgan reports strong Q4 earnings beat analyst expectations',
             'Major investment bank posts record profits amid trading surge',
             'markets', timedelta(hours=1), 'positive', 'high', 'https://cnbc.com'),
            ('energy',
             'OPEC+ extends oil production cuts through Q2',
             'Major oil producers agree to maintain supply restrictions',
             'geopolitical', timedelta(hours=2), 'neutral', 'high', 'https://reuters.com'),
            ('healthcare',
             'Pfizer receives FDA approval for new cancer treatment',
             'Breakthrough therapy approved for late-stage lung cancer',
             'markets', timedelta(hours=3), 'positive', 'medium', 'https://cnbc.com'),
            ('consumer',
             'Amazon expands same-day delivery to 50 new cities',
             'E-commerce giant accelerates logistics network expansion',
             'markets', timedelta(hours=4), 'positive', 'medium', 'https://techcrunch.com'),
            ('industrials',
             'Boeing wins $10B contract for new military aircraft',
             'Defense contractor secures major government order',
             'markets', timedelta(hours=5), 'positive', 'medium', 'https://reuters.com'),
            ('real_estate',
             'US housing starts surge 15% in December',
             'Construction activity rebounds amid lower mortgage rates',
             'macro', timedelta(hours=6), 'positive', 'medium', 'https://cnbc.com'),
        ]

        mock_items = []
        for item_id, row in enumerate(rows, start=1):
            sector, title, summary, category, age, sentiment, impact, url = row
            # Source name and weight come from the sector registry so the
            # mock items stay consistent with real ones.
            sector_info = self.SECTORS[sector]
            mock_items.append({
                'id': item_id,
                'title': title,
                'summary': summary,
                'source': sector_info['name'],
                'sector': sector,
                'category': category,
                'timestamp': now - age,
                'sentiment': sentiment,
                'impact': impact,
                'url': url,
                'likes': 0,
                'retweets': 0,
                'is_breaking': False,
                'source_weight': sector_info['weight'],
                'from_web': False
            })
        return mock_items
|
|