Spaces:
Running
Running
| import logging | |
| import re | |
| from typing import Any, Dict, List | |
| from urllib.parse import urlparse | |
| from bs4 import BeautifulSoup | |
| from .base_scraper import BaseScraper | |
| logger = logging.getLogger(__name__) | |
class BlogScraper(BaseScraper):
    """Scraper for blog websites.

    Extracts structured metadata (title, publish date, author, categories,
    tags, summary and source domain) from a blog post's HTML and its
    plain-text rendering.
    """

    # Maximum length of the generated summary, ellipsis included.
    # Subclasses may override to change truncation behavior.
    SUMMARY_MAX_LEN = 300

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse blog content and extract structured data.

        Args:
            html_content: Raw HTML of the blog page.
            text_content: Plain-text rendering of the page body.
            url: Page URL, used only to derive the source domain.

        Returns:
            A dict with keys ``type``, ``title``, ``publish_date``,
            ``author``, ``categories``, ``tags``, ``summary`` and
            ``source``. On failure, a dict with ``type`` and
            ``error_parsing`` instead.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            return {
                "type": "blog",
                "title": self._extract_title(soup),
                "publish_date": self._extract_publish_date(soup),
                "author": self._extract_author(soup),
                "categories": self._extract_categories(soup),
                "tags": self._extract_tags(soup),
                "summary": self._extract_summary(text_content),
                "source": self._extract_domain(url),
            }
        except Exception as e:
            # Boundary handler: a malformed page must never crash the caller.
            # logger.exception records the traceback; %-args defer formatting.
            logger.exception("Error parsing blog content: %s", e)
            return {"type": "blog", "error_parsing": str(e)}

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract the post title.

        Tries, in order: an <h1> inside a post/entry/article container,
        the first <h1> anywhere on the page, then the og:title meta tag.
        """
        # Method 1: <h1> scoped to an article-like container.
        article = soup.find(['article', 'div'], class_=re.compile('(post|entry|article)'))
        if article:
            h1 = article.find('h1')
            if h1:
                title = h1.get_text().strip()
                if title:
                    return title
        # Method 2: first <h1> on the page (no need to collect them all).
        h1 = soup.find('h1')
        if h1:
            title = h1.get_text().strip()
            if title:
                return title
        # Method 3: Open Graph title meta tag.
        og_title = soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            title = og_title['content'].strip()
            if title:
                return title
        return "Unknown Title"

    def _extract_publish_date(self, soup: BeautifulSoup) -> str:
        """Extract the publication date as the raw text/ISO string found."""
        # Method 1: standard article meta tag.
        date_meta = soup.find('meta', property='article:published_time')
        if date_meta and date_meta.get('content'):
            return date_meta['content']
        # Method 2: common blog date class names. Keep trying later class
        # names when a match has empty text (the original stopped at the
        # first matching element even if it carried no date).
        date_classes = ['date', 'post-date', 'entry-date', 'published', 'post-meta']
        for class_name in date_classes:
            date_element = soup.find(class_=re.compile(class_name, re.I))
            if date_element:
                date = date_element.get_text().strip()
                if date:
                    return date
        return "Unknown Date"

    def _extract_author(self, soup: BeautifulSoup) -> str:
        """Extract the author name from meta tags or common byline classes."""
        # Method 1: article author meta tag.
        author_meta = soup.find('meta', property='article:author')
        if author_meta and author_meta.get('content'):
            return author_meta['content']
        # Method 2: common author/byline class names; skip empty matches.
        author_classes = ['author', 'byline', 'entry-author', 'post-author']
        for class_name in author_classes:
            author_element = soup.find(class_=re.compile(class_name, re.I))
            if author_element:
                author = author_element.get_text().strip()
                if author:
                    return author
        return "Unknown Author"

    def _extract_categories(self, soup: BeautifulSoup) -> List[str]:
        """Extract post categories as a de-duplicated, order-preserving list."""
        categories: List[str] = []
        # Method 1: anchor tags whose class mentions "category".
        for element in soup.find_all('a', class_=re.compile('category')):
            cat_text = element.get_text().strip()
            if cat_text and cat_text not in categories:
                categories.append(cat_text)
        # Method 2: fall back to the article:section meta tag.
        if not categories:
            category_meta = soup.find('meta', property='article:section')
            if category_meta and category_meta.get('content'):
                categories.append(category_meta['content'].strip())
        return categories

    def _extract_tags(self, soup: BeautifulSoup) -> List[str]:
        """Extract post tags as a de-duplicated, order-preserving list."""
        tags: List[str] = []
        # Anchor tags whose class mentions "tag".
        for element in soup.find_all('a', class_=re.compile('tag')):
            tag_text = element.get_text().strip()
            if tag_text and tag_text not in tags:
                tags.append(tag_text)
        return tags

    def _extract_summary(self, text_content: str) -> str:
        """Build a summary from the first non-empty paragraph.

        Truncates to ``SUMMARY_MAX_LEN`` characters (ellipsis included).
        Skips leading blank paragraphs — the original returned an empty
        summary when the text started with a blank line.
        """
        if not text_content:
            return "No summary available"
        for paragraph in text_content.split('\n\n'):
            summary = paragraph.strip()
            if summary:
                if len(summary) > self.SUMMARY_MAX_LEN:
                    summary = summary[:self.SUMMARY_MAX_LEN - 3] + "..."
                return summary
        return "No summary available"

    def _extract_domain(self, url: str) -> str:
        """Return the network-location (domain) portion of *url*."""
        try:
            return urlparse(url).netloc
        except Exception:
            # urlparse can raise ValueError on malformed input (e.g. bad
            # IPv6 brackets); report a placeholder rather than propagate.
            return "Unknown Source"