Spaces:
Running
Running
import logging
import re
from datetime import datetime, date
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

from .base_scraper import BaseScraper

logger = logging.getLogger(__name__)
class HoroscopeScraper(BaseScraper):
    """Scraper for horoscope websites.

    Generic implementation: site-specific scrapers should subclass this and
    override `_format_url`, `_extract_prediction` and `_extract_date` to match
    the target site's URL scheme and markup.
    """

    # Valid zodiac signs (lowercase), in traditional order.
    ZODIAC_SIGNS = [
        "aries", "taurus", "gemini", "cancer",
        "leo", "virgo", "libra", "scorpio",
        "sagittarius", "capricorn", "aquarius", "pisces",
    ]

    def __init__(self, timeout: int = 30):
        """Initialize the scraper.

        Args:
            timeout: HTTP request timeout in seconds, forwarded to BaseScraper.
        """
        super().__init__(timeout)
        # Human-readable source label attached to every scrape result;
        # subclasses should override this with the actual site name.
        self.source_name = "Generic Horoscope Site"

    def scrape_all_signs(self, base_url: str, date_str: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Scrape horoscopes for all zodiac signs.

        Args:
            base_url: Base URL for the horoscope site
            date_str: Optional date string in format YYYY-MM-DD

        Returns:
            List of dictionaries with horoscope data, one per successfully
            scraped sign (failed or unsuccessful signs are omitted).
        """
        results: List[Dict[str, Any]] = []
        for sign in self.ZODIAC_SIGNS:
            try:
                horoscope_data = self.scrape_sign(base_url, sign, date_str)
                if horoscope_data and horoscope_data.get('success', False):
                    results.append(horoscope_data)
            except Exception:
                # One failing sign must not abort the remaining ones; log with
                # traceback (logger.exception) and lazy %-formatting.
                logger.exception("Error scraping %s horoscope", sign)
        return results

    def scrape_sign(self, base_url: str, sign: str, date_str: Optional[str] = None) -> Dict[str, Any]:
        """
        Scrape horoscope for a specific zodiac sign.

        Args:
            base_url: Base URL for the horoscope site
            sign: Zodiac sign (case-insensitive)
            date_str: Optional date string in format YYYY-MM-DD

        Returns:
            Dictionary with horoscope data plus `sign`, `scraped_date` and
            `source_name` metadata; on an invalid sign, a dict with
            `success: False` and an `error` message.
        """
        sign_key = sign.lower()
        if sign_key not in self.ZODIAC_SIGNS:
            return {"success": False, "error": f"Invalid zodiac sign: {sign}"}

        # Build the sign-specific URL and delegate fetching/parsing to the
        # base scraper.
        url = self._format_url(base_url, sign_key, date_str)
        result = self.scrape(url)

        # Attach horoscope-specific metadata to whatever the base scraper
        # returned. Falls back to today's date when no date was requested.
        result["sign"] = sign_key
        result["scraped_date"] = date_str or date.today().isoformat()
        result["source_name"] = self.source_name
        return result

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse horoscope content and extract structured data.

        Generic implementation; specific horoscope sites will need custom
        overrides. Returns a dict with `type`, `prediction`, `date` and
        `source`; on parse failure, `type` plus an `error_parsing` message.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            return {
                "type": "horoscope",
                "prediction": self._extract_prediction(soup, text_content),
                "date": self._extract_date(soup, url),
                "source": self._extract_domain(url),
            }
        except Exception as e:
            # Best-effort contract: parsing problems are reported in the
            # result rather than raised to the caller.
            logger.error("Error parsing horoscope content: %s", e)
            return {"type": "horoscope", "error_parsing": str(e)}

    def _format_url(self, base_url: str, sign: str, date_str: Optional[str] = None) -> str:
        """
        Format the URL for a sign's horoscope page.

        Generic implementation — should be overridden in specific scrapers.
        Default just appends the lowercased sign to the base URL; `date_str`
        is accepted for subclass use but ignored here.
        """
        return f"{base_url.rstrip('/')}/{sign.lower()}"

    def _extract_prediction(self, soup: BeautifulSoup, text_content: str) -> str:
        """
        Extract the horoscope prediction text.

        Generic implementation — should be overridden in specific scrapers.
        Tries common content containers first, then falls back to the first
        paragraph of the plain-text content.
        """
        prediction = ""

        # Common horoscope content containers seen across sites.
        containers = soup.select('.horoscope-content, .prediction, .horoscope-prediction, article p')
        if containers:
            prediction = containers[0].get_text().strip()

        # Fallback: first blank-line-separated paragraph of the text content,
        # or its first 500 characters when no paragraphs exist.
        if not prediction and text_content:
            paragraphs = text_content.split('\n\n')
            prediction = paragraphs[0] if paragraphs else text_content[:500]

        return prediction or "No prediction available"

    def _extract_date(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract the horoscope date as an ISO string (YYYY-MM-DD).

        Generic implementation — should be overridden in specific scrapers.
        Order of preference: an ISO date embedded in the URL, then a parseable
        date element in the page, then today's date.
        """
        # A YYYY-MM-DD date embedded in the URL is the most reliable signal.
        date_match = re.search(r'(\d{4}-\d{2}-\d{2})', url)
        if date_match:
            return date_match.group(1)

        # Otherwise look for date text in common page elements and try a few
        # well-known formats. (The former blanket try/except around this loop
        # was removed: the per-format ValueError handler already covers parse
        # failures, and anything else would be a real bug worth surfacing.)
        date_elements = soup.select('.horoscope-date, .date, time')
        if date_elements:
            date_text = date_elements[0].get_text().strip()
            for fmt in ['%Y-%m-%d', '%B %d, %Y', '%d %B %Y', '%m/%d/%Y', '%d/%m/%Y']:
                try:
                    return datetime.strptime(date_text, fmt).strftime('%Y-%m-%d')
                except ValueError:
                    continue

        # No date found anywhere: default to today.
        return date.today().isoformat()

    def _extract_domain(self, url: str) -> str:
        """Extract the network location (domain) from a URL.

        Returns "Unknown Source" when the URL has no netloc (e.g. a relative
        URL) or cannot be parsed — previously an empty netloc slipped through
        as "".
        """
        try:
            return urlparse(url).netloc or "Unknown Source"
        except (ValueError, AttributeError):
            return "Unknown Source"