|
|
"""
AI & Tech News Scraper

Fetches AI and tech news from popular tech outlets, big tech company blogs,
and research lab feeds.
"""
|
|
|
|
|
import logging
import re
from datetime import datetime, timedelta, timezone
from typing import List, Dict

import feedparser
import requests
from bs4 import BeautifulSoup
|
|
|
|
|
# Module-level logger named after this module; the scraper reports per-source
# progress and fetch/parse errors through it.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
class AITechNewsScraper:
    """Scraper for AI and tech news from major sources and company blogs.

    Each entry in ``SOURCES`` is fetched as an RSS/Atom feed, flattened into a
    plain dict and tagged with keyword-based impact, sentiment and
    breaking-news flags. Item timestamps are naive ``datetime`` objects
    expressed in UTC.
    """

    # Feed registry: display name -> feed URL, feed type and item category.
    SOURCES = {
        # Tech news outlets
        'TechCrunch AI': {
            'url': 'https://techcrunch.com/category/artificial-intelligence/feed/',
            'type': 'rss',
            'category': 'ai'
        },
        'The Verge AI': {
            'url': 'https://www.theverge.com/ai-artificial-intelligence/rss/index.xml',
            'type': 'rss',
            'category': 'ai'
        },
        'VentureBeat AI': {
            'url': 'https://venturebeat.com/category/ai/feed/',
            'type': 'rss',
            'category': 'ai'
        },
        'MIT Technology Review AI': {
            'url': 'https://www.technologyreview.com/topic/artificial-intelligence/feed',
            'type': 'rss',
            'category': 'ai'
        },
        'Ars Technica AI': {
            'url': 'https://feeds.arstechnica.com/arstechnica/technology-lab',
            'type': 'rss',
            'category': 'tech'
        },
        'Wired AI': {
            'url': 'https://www.wired.com/feed/tag/ai/latest/rss',
            'type': 'rss',
            'category': 'ai'
        },

        # Company blogs
        'OpenAI Blog': {
            'url': 'https://openai.com/blog/rss.xml',
            'type': 'rss',
            'category': 'ai'
        },
        'Google AI Blog': {
            'url': 'https://blog.google/technology/ai/rss/',
            'type': 'rss',
            'category': 'ai'
        },
        'Microsoft AI Blog': {
            'url': 'https://blogs.microsoft.com/ai/feed/',
            'type': 'rss',
            'category': 'ai'
        },
        'Meta AI Blog': {
            'url': 'https://ai.meta.com/blog/rss/',
            'type': 'rss',
            'category': 'ai'
        },
        'DeepMind Blog': {
            'url': 'https://deepmind.google/blog/rss.xml',
            'type': 'rss',
            'category': 'ai'
        },
        'Anthropic News': {
            'url': 'https://www.anthropic.com/news/rss.xml',
            'type': 'rss',
            'category': 'ai'
        },
        'AWS AI Blog': {
            'url': 'https://aws.amazon.com/blogs/machine-learning/feed/',
            'type': 'rss',
            'category': 'ai'
        },
        'NVIDIA AI Blog': {
            'url': 'https://blogs.nvidia.com/feed/',
            'type': 'rss',
            'category': 'ai'
        },

        # Research labs
        'Stanford HAI': {
            'url': 'https://hai.stanford.edu/news/rss.xml',
            'type': 'rss',
            'category': 'research'
        },
        'Berkeley AI Research': {
            'url': 'https://bair.berkeley.edu/blog/feed.xml',
            'type': 'rss',
            'category': 'research'
        },
    }

    # Seconds to wait on a feed server before giving up on that source.
    REQUEST_TIMEOUT = 15

    # Summaries longer than this are cut and suffixed with '...'.
    MAX_SUMMARY_LENGTH = 300

    # Keyword stems; each is matched case-insensitively at the start of a word
    # so that 'announce' also covers 'announces' / 'announced'.
    HIGH_IMPACT_KEYWORDS = (
        'breakthrough', 'announce', 'launch', 'release', 'new model',
        'gpt', 'claude', 'gemini', 'llama', 'chatgpt',
        'billion', 'trillion', 'acquisition', 'merger',
        'regulation', 'ban', 'lawsuit', 'security breach',
        'major', 'significant', 'revolutionary', 'first-ever',
    )
    MEDIUM_IMPACT_KEYWORDS = (
        'update', 'improve', 'enhance', 'study', 'research',
        'partnership', 'collaboration', 'funding', 'investment',
        'expands', 'grows', 'adopts', 'implements',
    )
    POSITIVE_KEYWORDS = (
        'breakthrough', 'success', 'achieve', 'improve', 'advance',
        'innovative', 'revolutionary', 'launch', 'release', 'win',
        'growth', 'expand', 'partnership', 'collaboration',
    )
    NEGATIVE_KEYWORDS = (
        'fail', 'issue', 'problem', 'concern', 'worry', 'risk',
        'ban', 'lawsuit', 'breach', 'hack', 'leak', 'crisis',
        'decline', 'loss', 'shutdown', 'controversy',
    )
    BREAKING_INDICATORS = (
        'breaking', 'just announced', 'just released', 'just launched',
        'alert', 'urgent', 'developing', 'live', 'now:',
    )

    def __init__(self):
        """Initialize the AI/Tech news scraper with a shared HTTP session."""
        self.session = requests.Session()
        # Browser-like User-Agent, sent with every feed request.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        })

    def scrape_ai_tech_news(self, max_items: int = 100, hours: int = 48) -> List[Dict]:
        """
        Scrape AI and tech news from all sources.

        A failure in one source is logged and does not stop the others.

        Args:
            max_items: Maximum number of news items to return; values below
                zero are treated as zero.
            hours: Only include news from the last N hours.

        Returns:
            List of news items with standardized format, newest first.
        """
        # feedparser reports entry dates in UTC, so the cutoff must be UTC as
        # well; it is kept naive to match the naive item timestamps.
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        cutoff_time = now_utc - timedelta(hours=hours)

        all_news = []
        for source_name, source_config in self.SOURCES.items():
            try:
                if source_config['type'] != 'rss':
                    logger.warning(
                        "Skipping %s: unsupported source type %r",
                        source_name, source_config['type'],
                    )
                    continue
                news_items = self._scrape_rss_feed(
                    source_name,
                    source_config['url'],
                    source_config['category'],
                    cutoff_time,
                )
            except Exception as e:
                logger.error("Error scraping %s: %s", source_name, e)
                continue

            all_news.extend(news_items)
            logger.info("Scraped %d items from %s", len(news_items), source_name)

        all_news.sort(key=lambda item: item['timestamp'], reverse=True)
        # Clamp so a negative limit yields nothing instead of slicing from the end.
        return all_news[:max(max_items, 0)]

    def _scrape_rss_feed(self, source_name: str, feed_url: str,
                         category: str, cutoff_time: datetime) -> List[Dict]:
        """Scrape a single RSS feed.

        Args:
            source_name: Display name stored in each item's ``source`` field.
            feed_url: URL of the RSS/Atom feed.
            category: Category label stored in each item.
            cutoff_time: Entries older than this are dropped. Interpreted as
                UTC; an aware value is converted to naive UTC first.

        Returns:
            Items parsed from the feed. Fetch and parse errors are logged, and
            whatever was collected up to that point is returned.
        """
        news_items = []

        try:
            if cutoff_time.tzinfo is not None:
                cutoff_time = cutoff_time.astimezone(timezone.utc).replace(tzinfo=None)

            # Fetch through the session so the configured User-Agent and a
            # timeout apply; feedparser's own fetcher offers neither here.
            response = self.session.get(feed_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            # content-location gives feedparser a base URL for relative links.
            feed = feedparser.parse(
                response.content,
                response_headers={'content-location': response.url},
            )

            for entry in feed.entries:
                try:
                    timestamp = self._entry_timestamp(entry)
                    if timestamp < cutoff_time:
                        continue
                    news_items.append(
                        self._build_news_item(entry, source_name, category, timestamp)
                    )
                except Exception as e:
                    logger.error("Error parsing entry from %s: %s", source_name, e)
                    continue

        except Exception as e:
            logger.error("Error fetching RSS feed %s: %s", feed_url, e)

        return news_items

    @staticmethod
    def _entry_timestamp(entry) -> datetime:
        """Return the entry's publish (else update) time as naive UTC; now if absent."""
        for field in ('published_parsed', 'updated_parsed'):
            parsed = entry.get(field)
            if parsed:
                return datetime(*parsed[:6])
        return datetime.now(timezone.utc).replace(tzinfo=None)

    def _clean_summary(self, raw_summary: str) -> str:
        """Strip HTML from a feed summary and cap it at MAX_SUMMARY_LENGTH characters."""
        if not raw_summary:
            return raw_summary
        text = BeautifulSoup(raw_summary, 'html.parser').get_text().strip()
        if len(text) > self.MAX_SUMMARY_LENGTH:
            text = text[:self.MAX_SUMMARY_LENGTH - 3] + '...'
        return text

    def _build_news_item(self, entry, source_name: str, category: str,
                         timestamp: datetime) -> Dict:
        """Convert one feed entry into the standardized news-item dict."""
        title = entry.get('title', 'No title')
        summary = self._clean_summary(
            entry.get('summary', entry.get('description', ''))
        )
        return {
            'title': title,
            'summary': summary or title,
            'source': source_name,
            'url': entry.get('link', ''),
            'timestamp': timestamp,
            'category': category,
            'impact': self._determine_impact(title, summary),
            'sentiment': self._determine_sentiment(title, summary),
            'is_breaking': self._is_breaking_news(title, summary),
            # Engagement counters are not available from feeds; always zero.
            'likes': 0,
            'retweets': 0,
            'reddit_score': 0,
            'reddit_comments': 0
        }

    @staticmethod
    def _matched_keywords(text: str, keywords) -> List[str]:
        """Return the keywords that occur in ``text`` at the start of a word.

        Requiring a word start keeps stem matching ('launch' -> 'launches')
        while rejecting mid-word hits such as 'live' inside 'delivers'.
        """
        return [
            keyword for keyword in keywords
            if re.search(r'(?<!\w)' + re.escape(keyword), text)
        ]

    def _determine_impact(self, title: str, summary: str) -> str:
        """Determine impact level based on keywords.

        Returns:
            'high' if any high-impact keyword matches, else 'medium' if any
            medium-impact keyword matches, else 'low'.
        """
        text = f"{title} {summary}".lower()

        if self._matched_keywords(text, self.HIGH_IMPACT_KEYWORDS):
            return 'high'
        if self._matched_keywords(text, self.MEDIUM_IMPACT_KEYWORDS):
            return 'medium'
        return 'low'

    def _determine_sentiment(self, title: str, summary: str) -> str:
        """Determine sentiment based on keywords.

        Returns:
            'positive' or 'negative' according to which keyword list has more
            distinct matches, 'neutral' on a tie (including no matches).
        """
        text = f"{title} {summary}".lower()

        positive_count = len(self._matched_keywords(text, self.POSITIVE_KEYWORDS))
        negative_count = len(self._matched_keywords(text, self.NEGATIVE_KEYWORDS))

        if positive_count > negative_count:
            return 'positive'
        if negative_count > positive_count:
            return 'negative'
        return 'neutral'

    def _is_breaking_news(self, title: str, summary: str) -> bool:
        """Determine if news is breaking (any breaking indicator matches)."""
        text = f"{title} {summary}".lower()
        return bool(self._matched_keywords(text, self.BREAKING_INDICATORS))

    def get_statistics(self) -> Dict:
        """Get statistics - returns empty for backward compatibility.

        Returns:
            A fixed payload with zeroed counters; ``last_update`` is the
            literal string 'Managed by cache'.
        """
        return {
            'total': 0,
            'high_impact': 0,
            'breaking': 0,
            'last_update': 'Managed by cache',
            'by_category': {
                'ai': 0,
                'tech': 0,
                'research': 0
            }
        }
|
|
|