Spaces:
Running
Running
| import logging | |
| import re | |
| from typing import Dict, Any | |
| from bs4 import BeautifulSoup | |
| from .base_scraper import BaseScraper | |
| logger = logging.getLogger(__name__) | |
class NewsScraper(BaseScraper):
    """Scraper for news websites.

    Turns a fetched article page into a structured dict with title,
    publish date, author, summary, and source domain. Each field is
    extracted with a chain of fallback heuristics; a sentinel string
    ("Unknown Title", "Unknown Date", ...) is returned when every
    heuristic fails, so callers never receive None.
    """

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse news content and extract structured data.

        Args:
            html_content: Raw HTML of the article page.
            text_content: Plain-text rendering of the article body.
            url: URL the page was fetched from.

        Returns:
            On success: dict with keys "type", "title", "publish_date",
            "author", "summary", "source".
            On failure: dict with keys "type" and "error_parsing".
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            return {
                "type": "news",
                "title": self._extract_title(soup),
                "publish_date": self._extract_publish_date(soup),
                "author": self._extract_author(soup),
                "summary": self._extract_summary(text_content),
                "source": self._extract_domain(url),
            }
        except Exception as e:
            # Top-level boundary: log and return an error payload rather than
            # raise, so one malformed page cannot abort a batch of scrapes.
            logger.error("Error parsing news content: %s", e)
            return {"type": "news", "error_parsing": str(e)}

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract the article title: <h1>, then og:title, then <title>."""
        # Method 1: first <h1> on the page (find() already returns the first
        # match, so the original find_all()/len() check was redundant).
        h1_tag = soup.find('h1')
        if h1_tag:
            title = h1_tag.get_text().strip()
            if title:
                return title
        # Method 2: Open Graph title meta tag.
        og_title = soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            title = og_title['content'].strip()
            if title:
                return title
        # Method 3: the document <title> element.
        title_tag = soup.find('title')
        if title_tag:
            title = title_tag.get_text().strip()
            if title:
                return title
        return "Unknown Title"

    def _extract_publish_date(self, soup: BeautifulSoup) -> str:
        """Extract the publication date as a raw string.

        The value is whatever the page provides (ISO 8601 when taken from
        metadata, free text otherwise); no normalization is attempted.
        """
        # Method 1: article:published_time meta tag (usually ISO 8601).
        date_meta = soup.find('meta', property='article:published_time')
        if date_meta and date_meta.get('content'):
            return date_meta['content']
        # Method 2: <time> tag, preferring its machine-readable attribute.
        time_tag = soup.find('time')
        if time_tag:
            date = time_tag.get('datetime') or time_tag.get_text().strip()
            if date:
                return date
        # Method 3: elements with date-like class names. Unlike the original,
        # an element whose text is empty no longer ends the search — we keep
        # trying the remaining class names.
        for class_name in ('date', 'article-date', 'publish-date', 'timestamp'):
            date_element = soup.find(class_=re.compile(class_name, re.I))
            if date_element:
                date = date_element.get_text().strip()
                if date:
                    return date
        return "Unknown Date"

    def _extract_author(self, soup: BeautifulSoup) -> str:
        """Extract author information via meta tag, byline classes, rel=author."""
        # Method 1: article:author meta tag.
        author_meta = soup.find('meta', property='article:author')
        if author_meta and author_meta.get('content'):
            return author_meta['content']
        # Method 2: elements with byline-like class names; skip matches whose
        # text is empty instead of giving up (same fix as publish date).
        for class_name in ('author', 'byline', 'writer'):
            author_element = soup.find(class_=re.compile(class_name, re.I))
            if author_element:
                author = author_element.get_text().strip()
                if author:
                    return author
        # Method 3: a rel="author" link.
        author_link = soup.find('a', rel='author')
        if author_link:
            author = author_link.get_text().strip()
            if author:
                return author
        return "Unknown Author"

    def _extract_summary(self, text_content: str) -> str:
        """Build a short summary: the first ~3 sentences, capped at 500 chars."""
        if not text_content:
            return "No summary available"
        # Naive sentence split on '.' — good enough for a preview snippet.
        sentences = text_content.split('.')
        summary = '.'.join(sentences[:3]).strip()
        if len(summary) > 500:
            summary = summary[:497] + "..."
        # Guard against input that was all dots/whitespace collapsing to "".
        return summary or "No summary available"

    def _extract_domain(self, url: str) -> str:
        """Extract the network location (domain) from a URL."""
        from urllib.parse import urlparse
        try:
            # urlparse rarely raises; the common failure mode is a scheme-less
            # URL yielding an empty netloc — treat that as unknown too, rather
            # than returning "" as the original did.
            return urlparse(url).netloc or "Unknown Source"
        except (ValueError, AttributeError):
            return "Unknown Source"
    