"""
Web Services Base Classes and Interfaces

This module defines the abstract base classes and interfaces for web scraping
services. It provides a foundation for implementing site-specific scrapers
while maintaining a consistent API.

The design separates:
- Generic browser management (lifecycle, navigation, waits)
- Generic table extraction (parsing, data cleaning, validation)
- Site-specific configuration (URLs, selectors, timing, locales)
- Site-specific business logic (filtering, data transformation)

This allows for:
- Reusable generic components across different websites
- Easy testing with mocked configurations
- Clear separation of concerns
- Backward compatibility with existing implementations
"""

from abc import ABC, abstractmethod
from typing import Dict, Any, List, Optional, Union, Tuple
from dataclasses import dataclass, field
from playwright.async_api import Page, BrowserContext
import asyncio


@dataclass
class BrowserConfig:
    """Configuration for browser behavior and settings.

    Every field has a default, so ``BrowserConfig()`` is a valid
    configuration. Mutable defaults are built through ``default_factory`` so
    that instances never share the same dict/list object.
    """

    # Browser settings
    headless: bool = True
    viewport: Dict[str, int] = field(
        default_factory=lambda: {"width": 1920, "height": 1080}
    )
    locale: str = "en-US"
    timezone_id: str = "UTC"
    user_agent: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

    # Timeout settings (in milliseconds)
    default_timeout: int = 30000
    navigation_timeout: int = 60000

    # Rate limiting
    rate_limit_ms: int = 500

    # Custom headers
    extra_headers: Dict[str, str] = field(default_factory=dict)

    # Browser args
    browser_args: List[str] = field(
        default_factory=lambda: [
            '--no-sandbox',
            '--disable-blink-features=AutomationControlled',
            '--disable-features=VizDisplayCompositor',
        ]
    )


@dataclass
class TableConfig:
    """Configuration for table extraction behavior.

    Every field has a default, so ``TableConfig()`` is a valid configuration.
    """

    # Target selectors
    table_selector: str = "table"
    header_selector: str = "thead tr th, tr:first-child th, tr:first-child td"
    row_selector: str = "tbody tr, tr"
    cell_selector: str = "td, th"

    # Table identification
    expected_headers: List[str] = field(default_factory=list)
    table_index: Optional[int] = None

    # Data validation
    required_fields: List[str] = field(default_factory=list)
    skip_empty_rows: bool = True

    # Timing
    wait_after_navigation_ms: int = 2000
    wait_for_content_ms: int = 3000


@dataclass
class SiteConfig:
    """Configuration for a specific website.

    ``base_url``, ``browser_config`` and ``table_config`` are required; the
    remaining mappings default to empty dicts.
    """

    base_url: str
    browser_config: BrowserConfig
    table_config: TableConfig

    # Site-specific URLs
    urls: Dict[str, str] = field(default_factory=dict)

    # Site-specific selectors
    selectors: Dict[str, str] = field(default_factory=dict)

    # Site-specific data mappings
    data_mappings: Dict[str, Any] = field(default_factory=dict)


class IBrowserManager(ABC):
    """Abstract interface for browser management.

    Concrete subclasses must implement every method below before they can be
    instantiated.
    """

    @abstractmethod
    async def get_context(self, context_id: str = "default") -> BrowserContext:
        """Get or create a browser context.

        Args:
            context_id: Identifier of the context; defaults to ``"default"``.

        Returns:
            The Playwright ``BrowserContext`` for ``context_id``.
        """

    @abstractmethod
    async def navigate_with_retry(
        self,
        page: Page,
        url: str,
        max_retries: int = 3
    ) -> bool:
        """Navigate to URL with retry logic.

        Args:
            page: Playwright page to navigate.
            url: Target URL.
            max_retries: Retry budget; defaults to 3.

        Returns:
            A success flag.
            NOTE(review): presumably True when navigation succeeded and False
            once retries are exhausted -- confirm against implementations.
        """

    @abstractmethod
    async def apply_rate_limiting(self, delay_ms: Optional[int] = None):
        """Apply rate limiting delay.

        Args:
            delay_ms: Delay override.
                NOTE(review): presumably milliseconds, falling back to a
                configured default when None -- confirm against
                implementations.
        """

    @abstractmethod
    async def close_context(self, context_id: str = "default"):
        """Close a specific browser context.

        Args:
            context_id: Identifier of the context; defaults to ``"default"``.
        """

    @abstractmethod
    async def close_all(self):
        """Close all browser resources."""


class ITableScraper(ABC):
    """Abstract interface for table scraping."""

    @abstractmethod
    async def extract_table_data(
        self,
        page: Page,
        table_config: Optional[TableConfig] = None
    ) -> List[Dict[str, Any]]:
        """Extract data from HTML tables on the page.

        Args:
            page: Playwright page to read from.
            table_config: Optional extraction settings; None is allowed.

        Returns:
            A list of dicts.
            NOTE(review): presumably one dict per table row keyed by header
            text -- confirm against implementations.
        """

    @abstractmethod
    async def find_table_by_headers(
        self,
        page: Page,
        expected_headers: List[str]
    ) -> Optional[int]:
        """Find table index by matching expected headers.

        Args:
            page: Playwright page to search.
            expected_headers: Header texts used to identify the table.

        Returns:
            The index of the matching table, or None.
            NOTE(review): None presumably means "no table matched" -- confirm
            against implementations.
        """

    @abstractmethod
    async def extract_table_by_index(
        self,
        page: Page,
        table_index: int,
        table_config: Optional[TableConfig] = None
    ) -> List[Dict[str, Any]]:
        """Extract data from a specific table by index.

        Args:
            page: Playwright page to read from.
            table_index: Index of the table to extract.
            table_config: Optional extraction settings; None is allowed.

        Returns:
            A list of dicts extracted from the selected table.
        """


class ISiteAdapter(ABC):
    """Abstract interface for site-specific adapters."""

    @abstractmethod
    async def fetch_data(
        self,
        data_type: str,
        filters: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """Fetch data of a specific type with optional filters.

        Args:
            data_type: Name of the data type to fetch.
            filters: Optional filter mapping; may be omitted or None.

        Returns:
            A dict holding the fetched data; its schema is defined by the
            concrete adapter.
        """

    @abstractmethod
    def get_supported_data_types(self) -> List[str]:
        """Get list of supported data types."""

    @abstractmethod
    def validate_filters(
        self,
        data_type: str,
        filters: Dict[str, Any]
    ) -> Tuple[bool, Dict[str, Any], List[str]]:
        """Validate filters for a data type.

        Args:
            data_type: Name of the data type the filters apply to.
            filters: Filter mapping to validate.

        Returns:
            A ``(bool, dict, list of str)`` tuple.
            NOTE(review): presumably ``(is_valid, normalized_filters,
            error_messages)`` -- confirm against implementations.
        """


class WebScrapingError(Exception):
    """Base exception for web scraping errors."""


class NavigationError(WebScrapingError):
    """Exception for navigation failures."""


class TableExtractionError(WebScrapingError):
    """Exception for table extraction failures."""


class ConfigurationError(WebScrapingError):
    """Exception for configuration errors."""