# NOTE(review): the lines below were page chrome captured from a Hugging Face
# Space listing ("Spaces: Runtime error") — not part of the module itself.
| """ | |
| Web Services Base Classes and Interfaces | |
| This module defines the abstract base classes and interfaces for web scraping | |
| services. It provides a foundation for implementing site-specific scrapers | |
| while maintaining a consistent API. | |
| The design separates: | |
| - Generic browser management (lifecycle, navigation, waits) | |
| - Generic table extraction (parsing, data cleaning, validation) | |
| - Site-specific configuration (URLs, selectors, timing, locales) | |
| - Site-specific business logic (filtering, data transformation) | |
| This allows for: | |
| - Reusable generic components across different websites | |
| - Easy testing with mocked configurations | |
| - Clear separation of concerns | |
| - Backward compatibility with existing implementations | |
| """ | |
from __future__ import annotations

import asyncio
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple, Union

from playwright.async_api import BrowserContext, Page
@dataclass
class BrowserConfig:
    """Configuration for browser behavior and settings.

    Groups everything needed to launch and drive a browser: identity
    (user agent, locale, timezone), viewport, timeouts, rate limiting,
    extra headers, and launch arguments.

    Fix: the class used ``field(default_factory=...)`` without the
    ``@dataclass`` decorator, so the mutable defaults were raw ``Field``
    objects and ``BrowserConfig(...)`` accepted no keyword arguments.
    """

    # Browser settings
    headless: bool = True
    # default_factory keeps each instance's viewport dict independent
    viewport: Dict[str, int] = field(default_factory=lambda: {"width": 1920, "height": 1080})
    locale: str = "en-US"
    timezone_id: str = "UTC"
    user_agent: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

    # Timeout settings (in milliseconds)
    default_timeout: int = 30000
    navigation_timeout: int = 60000

    # Rate limiting delay between requests (milliseconds)
    rate_limit_ms: int = 500

    # Custom headers sent with every request
    extra_headers: Dict[str, str] = field(default_factory=dict)

    # Chromium launch flags; the automation-control flags reduce bot detection
    browser_args: List[str] = field(default_factory=lambda: [
        '--no-sandbox',
        '--disable-blink-features=AutomationControlled',
        '--disable-features=VizDisplayCompositor'
    ])
@dataclass
class TableConfig:
    """Configuration for table extraction behavior.

    Describes how to locate a table on a page (selectors, expected
    headers, index), how to validate extracted rows, and how long to
    wait for content to settle.

    Fix: added the missing ``@dataclass`` decorator — without it the
    ``field(default_factory=...)`` defaults were raw ``Field`` objects
    and the class could not be instantiated with keyword arguments.
    """

    # Target selectors (CSS); fallbacks cover tables without thead/tbody
    table_selector: str = "table"
    header_selector: str = "thead tr th, tr:first-child th, tr:first-child td"
    row_selector: str = "tbody tr, tr"
    cell_selector: str = "td, th"

    # Table identification: match by headers, or pin an explicit index
    expected_headers: List[str] = field(default_factory=list)
    table_index: Optional[int] = None

    # Data validation applied to extracted rows
    required_fields: List[str] = field(default_factory=list)
    skip_empty_rows: bool = True

    # Timing (milliseconds) to let dynamic content render
    wait_after_navigation_ms: int = 2000
    wait_for_content_ms: int = 3000
@dataclass
class SiteConfig:
    """Configuration for a specific website.

    Bundles the generic browser/table configuration with site-specific
    URLs, selectors, and data mappings. ``base_url``, ``browser_config``
    and ``table_config`` are required; the dict fields default to empty.

    Fix: added the missing ``@dataclass`` decorator — the class declared
    bare annotations and ``field(default_factory=...)`` defaults, which
    only work on a dataclass; plain instantiation raised at runtime.
    """

    base_url: str
    browser_config: BrowserConfig
    table_config: TableConfig

    # Site-specific URLs keyed by logical name (e.g. "standings")
    urls: Dict[str, str] = field(default_factory=dict)

    # Site-specific CSS/XPath selectors keyed by logical name
    selectors: Dict[str, str] = field(default_factory=dict)

    # Site-specific data mappings (column renames, value transforms, ...)
    data_mappings: Dict[str, Any] = field(default_factory=dict)
class IBrowserManager(ABC):
    """Abstract interface for browser management.

    Implementations own the browser lifecycle: named contexts,
    navigation with retries, rate limiting, and teardown.

    Fix: the methods were missing ``@abstractmethod`` (the import was
    unused), so this "interface" was concretely instantiable and every
    method silently returned ``None`` instead of forcing an override.
    """

    @abstractmethod
    async def get_context(self, context_id: str = "default") -> BrowserContext:
        """Get or create a browser context identified by *context_id*."""

    @abstractmethod
    async def navigate_with_retry(self, page: Page, url: str, max_retries: int = 3) -> bool:
        """Navigate to *url*, retrying up to *max_retries* times; return success."""

    @abstractmethod
    async def apply_rate_limiting(self, delay_ms: Optional[int] = None):
        """Sleep for *delay_ms* (or the configured default) between requests."""

    @abstractmethod
    async def close_context(self, context_id: str = "default"):
        """Close a specific browser context."""

    @abstractmethod
    async def close_all(self):
        """Close all browser resources."""
class ITableScraper(ABC):
    """Abstract interface for table scraping.

    Implementations locate HTML tables on a page and extract their rows
    as dictionaries keyed by header text.

    Fix: added the missing ``@abstractmethod`` decorators — without
    them the ABC was instantiable and the stub methods returned ``None``
    rather than requiring an implementation.
    """

    @abstractmethod
    async def extract_table_data(
        self,
        page: Page,
        table_config: Optional[TableConfig] = None
    ) -> List[Dict[str, Any]]:
        """Extract data from HTML tables on the page."""

    @abstractmethod
    async def find_table_by_headers(
        self,
        page: Page,
        expected_headers: List[str]
    ) -> Optional[int]:
        """Find a table's index by matching *expected_headers*; None if absent."""

    @abstractmethod
    async def extract_table_by_index(
        self,
        page: Page,
        table_index: int,
        table_config: Optional[TableConfig] = None
    ) -> List[Dict[str, Any]]:
        """Extract data from the table at position *table_index*."""
class ISiteAdapter(ABC):
    """Abstract interface for site-specific adapters.

    An adapter binds the generic scraping machinery to one website: it
    declares the data types the site exposes, validates filters, and
    fetches data.

    Fixes: added the missing ``@abstractmethod`` decorators, and typed
    the nullable ``filters`` parameter as ``Optional`` (it defaulted to
    ``None`` while annotated as a plain ``Dict``).
    """

    @abstractmethod
    async def fetch_data(
        self,
        data_type: str,
        filters: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """Fetch data of a specific type with optional filters."""

    @abstractmethod
    def get_supported_data_types(self) -> List[str]:
        """Get list of supported data types."""

    @abstractmethod
    def validate_filters(
        self,
        data_type: str,
        filters: Dict[str, Any]
    ) -> Tuple[bool, Dict[str, Any], List[str]]:
        """Validate filters for a data type.

        Returns a 3-tuple — presumably (is_valid, normalized_filters,
        error_messages); confirm against concrete implementations.
        """
class WebScrapingError(Exception):
    """Root of the scraping exception hierarchy.

    Catch this to handle any failure raised by the scraping layer.
    """
class NavigationError(WebScrapingError):
    """Raised when navigating to a page fails (e.g. after retries are exhausted)."""
class TableExtractionError(WebScrapingError):
    """Raised when table data cannot be located or parsed from a page."""
class ConfigurationError(WebScrapingError):
    """Raised when a site/browser/table configuration is invalid or incomplete."""