Spaces:
Running
Running
| import logging | |
| import re | |
| from typing import Any, Dict, List | |
| from urllib.parse import urlparse | |
| from bs4 import BeautifulSoup | |
| from .base_scraper import BaseScraper | |
| logger = logging.getLogger(__name__) | |
class BlogScraper(BaseScraper):
    """Scraper for blog websites.

    Extracts structured metadata (title, publish date, author, categories,
    tags, summary and source domain) from a blog post's HTML and its
    plain-text rendering.
    """

    # Maximum length of the generated summary, ellipsis included.
    # Subclasses may override to change truncation behavior.
    SUMMARY_MAX_LEN = 300

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse blog content and extract structured data.

        Args:
            html_content: Raw HTML of the blog page.
            text_content: Plain-text rendering of the page body.
            url: Page URL, used only to derive the source domain.

        Returns:
            A dict with keys ``type``, ``title``, ``publish_date``,
            ``author``, ``categories``, ``tags``, ``summary`` and
            ``source``. On failure, a dict with ``type`` and
            ``error_parsing`` instead.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            return {
                "type": "blog",
                "title": self._extract_title(soup),
                "publish_date": self._extract_publish_date(soup),
                "author": self._extract_author(soup),
                "categories": self._extract_categories(soup),
                "tags": self._extract_tags(soup),
                "summary": self._extract_summary(text_content),
                "source": self._extract_domain(url),
            }
        except Exception as e:
            # Boundary handler: a malformed page must never crash the caller.
            # logger.exception records the traceback; %-args defer formatting.
            logger.exception("Error parsing blog content: %s", e)
            return {"type": "blog", "error_parsing": str(e)}

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract the post title.

        Tries, in order: an <h1> inside a post/entry/article container,
        the first <h1> anywhere on the page, then the og:title meta tag.
        """
        # Method 1: <h1> scoped to an article-like container.
        article = soup.find(['article', 'div'], class_=re.compile('(post|entry|article)'))
        if article:
            h1 = article.find('h1')
            if h1:
                title = h1.get_text().strip()
                if title:
                    return title
        # Method 2: first <h1> on the page (no need to collect them all).
        h1 = soup.find('h1')
        if h1:
            title = h1.get_text().strip()
            if title:
                return title
        # Method 3: Open Graph title meta tag.
        og_title = soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            title = og_title['content'].strip()
            if title:
                return title
        return "Unknown Title"

    def _extract_publish_date(self, soup: BeautifulSoup) -> str:
        """Extract the publication date as the raw text/ISO string found."""
        # Method 1: standard article meta tag.
        date_meta = soup.find('meta', property='article:published_time')
        if date_meta and date_meta.get('content'):
            return date_meta['content']
        # Method 2: common blog date class names. Keep trying later class
        # names when a match has empty text (the original stopped at the
        # first matching element even if it carried no date).
        date_classes = ['date', 'post-date', 'entry-date', 'published', 'post-meta']
        for class_name in date_classes:
            date_element = soup.find(class_=re.compile(class_name, re.I))
            if date_element:
                date = date_element.get_text().strip()
                if date:
                    return date
        return "Unknown Date"

    def _extract_author(self, soup: BeautifulSoup) -> str:
        """Extract the author name from meta tags or common byline classes."""
        # Method 1: article author meta tag.
        author_meta = soup.find('meta', property='article:author')
        if author_meta and author_meta.get('content'):
            return author_meta['content']
        # Method 2: common author/byline class names; skip empty matches.
        author_classes = ['author', 'byline', 'entry-author', 'post-author']
        for class_name in author_classes:
            author_element = soup.find(class_=re.compile(class_name, re.I))
            if author_element:
                author = author_element.get_text().strip()
                if author:
                    return author
        return "Unknown Author"

    def _extract_categories(self, soup: BeautifulSoup) -> List[str]:
        """Extract post categories as a de-duplicated, order-preserving list."""
        categories: List[str] = []
        # Method 1: anchor tags whose class mentions "category".
        for element in soup.find_all('a', class_=re.compile('category')):
            cat_text = element.get_text().strip()
            if cat_text and cat_text not in categories:
                categories.append(cat_text)
        # Method 2: fall back to the article:section meta tag.
        if not categories:
            category_meta = soup.find('meta', property='article:section')
            if category_meta and category_meta.get('content'):
                categories.append(category_meta['content'].strip())
        return categories

    def _extract_tags(self, soup: BeautifulSoup) -> List[str]:
        """Extract post tags as a de-duplicated, order-preserving list."""
        tags: List[str] = []
        # Anchor tags whose class mentions "tag".
        for element in soup.find_all('a', class_=re.compile('tag')):
            tag_text = element.get_text().strip()
            if tag_text and tag_text not in tags:
                tags.append(tag_text)
        return tags

    def _extract_summary(self, text_content: str) -> str:
        """Build a summary from the first non-empty paragraph.

        Truncates to ``SUMMARY_MAX_LEN`` characters (ellipsis included).
        Skips leading blank paragraphs — the original returned an empty
        summary when the text started with a blank line.
        """
        if not text_content:
            return "No summary available"
        for paragraph in text_content.split('\n\n'):
            summary = paragraph.strip()
            if summary:
                if len(summary) > self.SUMMARY_MAX_LEN:
                    summary = summary[:self.SUMMARY_MAX_LEN - 3] + "..."
                return summary
        return "No summary available"

    def _extract_domain(self, url: str) -> str:
        """Return the network-location (domain) portion of *url*."""
        try:
            return urlparse(url).netloc
        except Exception:
            # urlparse can raise ValueError on malformed input (e.g. bad
            # IPv6 brackets); report a placeholder rather than propagate.
            return "Unknown Source"