Spaces:
Running
Running
import logging
import re
from datetime import datetime, date
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

from .base_scraper import BaseScraper

logger = logging.getLogger(__name__)
class HoroscopeScraper(BaseScraper):
    """Scraper for horoscope websites.

    Generic implementation: site-specific scrapers should subclass this and
    override `_format_url`, `_extract_prediction` and `_extract_date` to match
    the target site's URL scheme and markup.
    """

    # Valid zodiac signs (lowercase), in traditional order.
    ZODIAC_SIGNS = [
        "aries", "taurus", "gemini", "cancer",
        "leo", "virgo", "libra", "scorpio",
        "sagittarius", "capricorn", "aquarius", "pisces",
    ]

    def __init__(self, timeout: int = 30):
        """Initialize the scraper.

        Args:
            timeout: HTTP request timeout in seconds, forwarded to BaseScraper.
        """
        super().__init__(timeout)
        # Human-readable source label attached to every scrape result;
        # subclasses should override this with the actual site name.
        self.source_name = "Generic Horoscope Site"

    def scrape_all_signs(self, base_url: str, date_str: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Scrape horoscopes for all zodiac signs.

        Args:
            base_url: Base URL for the horoscope site
            date_str: Optional date string in format YYYY-MM-DD

        Returns:
            List of dictionaries with horoscope data, one per successfully
            scraped sign (failed or unsuccessful signs are omitted).
        """
        results: List[Dict[str, Any]] = []
        for sign in self.ZODIAC_SIGNS:
            try:
                horoscope_data = self.scrape_sign(base_url, sign, date_str)
                if horoscope_data and horoscope_data.get('success', False):
                    results.append(horoscope_data)
            except Exception:
                # One failing sign must not abort the remaining ones; log with
                # traceback (logger.exception) and lazy %-formatting.
                logger.exception("Error scraping %s horoscope", sign)
        return results

    def scrape_sign(self, base_url: str, sign: str, date_str: Optional[str] = None) -> Dict[str, Any]:
        """
        Scrape horoscope for a specific zodiac sign.

        Args:
            base_url: Base URL for the horoscope site
            sign: Zodiac sign (case-insensitive)
            date_str: Optional date string in format YYYY-MM-DD

        Returns:
            Dictionary with horoscope data plus `sign`, `scraped_date` and
            `source_name` metadata; on an invalid sign, a dict with
            `success: False` and an `error` message.
        """
        sign_key = sign.lower()
        if sign_key not in self.ZODIAC_SIGNS:
            return {"success": False, "error": f"Invalid zodiac sign: {sign}"}

        # Build the sign-specific URL and delegate fetching/parsing to the
        # base scraper.
        url = self._format_url(base_url, sign_key, date_str)
        result = self.scrape(url)

        # Attach horoscope-specific metadata to whatever the base scraper
        # returned. Falls back to today's date when no date was requested.
        result["sign"] = sign_key
        result["scraped_date"] = date_str or date.today().isoformat()
        result["source_name"] = self.source_name
        return result

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse horoscope content and extract structured data.

        Generic implementation; specific horoscope sites will need custom
        overrides. Returns a dict with `type`, `prediction`, `date` and
        `source`; on parse failure, `type` plus an `error_parsing` message.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            return {
                "type": "horoscope",
                "prediction": self._extract_prediction(soup, text_content),
                "date": self._extract_date(soup, url),
                "source": self._extract_domain(url),
            }
        except Exception as e:
            # Best-effort contract: parsing problems are reported in the
            # result rather than raised to the caller.
            logger.error("Error parsing horoscope content: %s", e)
            return {"type": "horoscope", "error_parsing": str(e)}

    def _format_url(self, base_url: str, sign: str, date_str: Optional[str] = None) -> str:
        """
        Format the URL for a sign's horoscope page.

        Generic implementation — should be overridden in specific scrapers.
        Default just appends the lowercased sign to the base URL; `date_str`
        is accepted for subclass use but ignored here.
        """
        return f"{base_url.rstrip('/')}/{sign.lower()}"

    def _extract_prediction(self, soup: BeautifulSoup, text_content: str) -> str:
        """
        Extract the horoscope prediction text.

        Generic implementation — should be overridden in specific scrapers.
        Tries common content containers first, then falls back to the first
        paragraph of the plain-text content.
        """
        prediction = ""

        # Common horoscope content containers seen across sites.
        containers = soup.select('.horoscope-content, .prediction, .horoscope-prediction, article p')
        if containers:
            prediction = containers[0].get_text().strip()

        # Fallback: first blank-line-separated paragraph of the text content,
        # or its first 500 characters when no paragraphs exist.
        if not prediction and text_content:
            paragraphs = text_content.split('\n\n')
            prediction = paragraphs[0] if paragraphs else text_content[:500]

        return prediction or "No prediction available"

    def _extract_date(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract the horoscope date as an ISO string (YYYY-MM-DD).

        Generic implementation — should be overridden in specific scrapers.
        Order of preference: an ISO date embedded in the URL, then a parseable
        date element in the page, then today's date.
        """
        # A YYYY-MM-DD date embedded in the URL is the most reliable signal.
        date_match = re.search(r'(\d{4}-\d{2}-\d{2})', url)
        if date_match:
            return date_match.group(1)

        # Otherwise look for date text in common page elements and try a few
        # well-known formats. (The former blanket try/except around this loop
        # was removed: the per-format ValueError handler already covers parse
        # failures, and anything else would be a real bug worth surfacing.)
        date_elements = soup.select('.horoscope-date, .date, time')
        if date_elements:
            date_text = date_elements[0].get_text().strip()
            for fmt in ['%Y-%m-%d', '%B %d, %Y', '%d %B %Y', '%m/%d/%Y', '%d/%m/%Y']:
                try:
                    return datetime.strptime(date_text, fmt).strftime('%Y-%m-%d')
                except ValueError:
                    continue

        # No date found anywhere: default to today.
        return date.today().isoformat()

    def _extract_domain(self, url: str) -> str:
        """Extract the network location (domain) from a URL.

        Returns "Unknown Source" when the URL has no netloc (e.g. a relative
        URL) or cannot be parsed — previously an empty netloc slipped through
        as "".
        """
        try:
            return urlparse(url).netloc or "Unknown Source"
        except (ValueError, AttributeError):
            return "Unknown Source"