# operations/services/web/base.py
# Commit 36f8fda (jbbove): 🎯 Complete OMIRL web services refactoring with
# workflow enhancement
"""
Web Services Base Classes and Interfaces
This module defines the abstract base classes and interfaces for web scraping
services. It provides a foundation for implementing site-specific scrapers
while maintaining a consistent API.
The design separates:
- Generic browser management (lifecycle, navigation, waits)
- Generic table extraction (parsing, data cleaning, validation)
- Site-specific configuration (URLs, selectors, timing, locales)
- Site-specific business logic (filtering, data transformation)
This allows for:
- Reusable generic components across different websites
- Easy testing with mocked configurations
- Clear separation of concerns
- Backward compatibility with existing implementations
"""
from abc import ABC, abstractmethod
from typing import Dict, Any, List, Optional, Union, Tuple
from dataclasses import dataclass, field
from playwright.async_api import Page, BrowserContext
import asyncio
@dataclass
class BrowserConfig:
    """Browser lifecycle settings: rendering identity, timeouts, pacing,
    headers, and launch flags shared by all scrapers."""

    # --- Rendering / identity ---
    headless: bool = True
    viewport: Dict[str, int] = field(
        default_factory=lambda: dict(width=1920, height=1080)
    )
    locale: str = "en-US"
    timezone_id: str = "UTC"
    user_agent: str = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )

    # --- Timeouts (milliseconds) ---
    default_timeout: int = 30000
    navigation_timeout: int = 60000

    # --- Minimum delay between requests (milliseconds) ---
    rate_limit_ms: int = 500

    # --- Extra HTTP headers to send with requests ---
    extra_headers: Dict[str, str] = field(default_factory=dict)

    # --- Command-line flags passed to the browser process; each instance
    #     receives its own copy of the template list. ---
    browser_args: List[str] = field(
        default_factory=[
            '--no-sandbox',
            '--disable-blink-features=AutomationControlled',
            '--disable-features=VizDisplayCompositor',
        ].copy
    )
@dataclass
class TableConfig:
    """Configuration for table extraction behavior.

    Groups the CSS selectors used to walk a table's structure, the
    criteria for picking the right table on a multi-table page,
    row-level validation rules, and settle delays for dynamically
    rendered content.
    """
    # CSS selectors used to walk the table structure
    # (table -> header cells -> rows -> cells).
    table_selector: str = "table"
    header_selector: str = "thead tr th, tr:first-child th, tr:first-child td"
    row_selector: str = "tbody tr, tr"
    cell_selector: str = "td, th"
    # Table identification: match a table by its header text, or address
    # it directly by position when table_index is not None.
    expected_headers: List[str] = field(default_factory=list)
    table_index: Optional[int] = None
    # Validation applied to extracted rows.
    required_fields: List[str] = field(default_factory=list)  # fields a row must carry
    skip_empty_rows: bool = True
    # Settle delays in milliseconds for dynamically rendered content.
    wait_after_navigation_ms: int = 2000
    wait_for_content_ms: int = 3000
@dataclass
class SiteConfig:
    """Configuration for a specific website.

    Bundles the generic browser and table settings with the site's own
    URLs, selectors, and data mappings so adapters can stay
    configuration-driven.
    """
    base_url: str                   # root URL of the target site
    browser_config: BrowserConfig   # browser lifecycle/timing settings
    table_config: TableConfig       # default table-extraction settings
    # Named site-specific URLs (presumably logical page name -> URL; confirm against adapters).
    urls: Dict[str, str] = field(default_factory=dict)
    # Named site-specific CSS selectors.
    selectors: Dict[str, str] = field(default_factory=dict)
    # Free-form site-specific data mappings; shape is adapter-defined.
    data_mappings: Dict[str, Any] = field(default_factory=dict)
class IBrowserManager(ABC):
    """Abstract interface for browser management.

    Implementations own the browser lifecycle: getting/creating contexts,
    navigating with retries, pacing requests, and releasing resources.
    """
    @abstractmethod
    async def get_context(self, context_id: str = "default") -> BrowserContext:
        """Get or create a browser context.

        Contexts are keyed by ``context_id`` so callers can keep separate
        sessions isolated from one another.
        """
        pass
    @abstractmethod
    async def navigate_with_retry(self, page: Page, url: str, max_retries: int = 3) -> bool:
        """Navigate ``page`` to ``url``, retrying up to ``max_retries`` times.

        Returns a bool — presumably True once navigation succeeds and
        False when all attempts fail; confirm in implementations.
        """
        pass
    @abstractmethod
    async def apply_rate_limiting(self, delay_ms: Optional[int] = None) -> None:
        """Apply a rate-limiting delay.

        ``delay_ms`` overrides the configured default when given
        (see ``BrowserConfig.rate_limit_ms``).
        """
        pass
    @abstractmethod
    async def close_context(self, context_id: str = "default") -> None:
        """Close a specific browser context."""
        pass
    @abstractmethod
    async def close_all(self) -> None:
        """Close all browser resources (all contexts and the browser itself)."""
        pass
class ITableScraper(ABC):
    """Abstract interface for table scraping.

    Implementations extract HTML table contents from a live page as a
    list of row dictionaries (presumably keyed by column header —
    confirm in implementations).
    """
    @abstractmethod
    async def extract_table_data(
        self,
        page: Page,
        table_config: Optional[TableConfig] = None
    ) -> List[Dict[str, Any]]:
        """Extract data from HTML tables on the page.

        Uses ``table_config`` when provided; otherwise implementations
        fall back to their own default configuration.
        """
        pass
    @abstractmethod
    async def find_table_by_headers(
        self,
        page: Page,
        expected_headers: List[str]
    ) -> Optional[int]:
        """Find a table's index by matching ``expected_headers``.

        Returns None when no table on the page matches.
        """
        pass
    @abstractmethod
    async def extract_table_by_index(
        self,
        page: Page,
        table_index: int,
        table_config: Optional[TableConfig] = None
    ) -> List[Dict[str, Any]]:
        """Extract data from the table at position ``table_index`` on the page."""
        pass
class ISiteAdapter(ABC):
    """Abstract interface for site-specific adapters.

    An adapter encapsulates one website's business logic: which data
    types it can serve, how filters are validated, and how raw scraped
    content is turned into result payloads.
    """
    @abstractmethod
    async def fetch_data(
        self,
        data_type: str,
        # Fixed annotation: the default is None, so the type must be Optional.
        filters: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """Fetch data of type ``data_type``, optionally narrowed by ``filters``.

        ``data_type`` should be one of the values reported by
        ``get_supported_data_types``.
        """
        pass
    @abstractmethod
    def get_supported_data_types(self) -> List[str]:
        """Return the list of data types this adapter can fetch."""
        pass
    @abstractmethod
    def validate_filters(
        self,
        data_type: str,
        filters: Dict[str, Any]
    ) -> Tuple[bool, Dict[str, Any], List[str]]:
        """Validate ``filters`` for ``data_type``.

        Returns a 3-tuple — presumably (is_valid, normalized_filters,
        error_messages); confirm against implementations.
        """
        pass
class WebScrapingError(Exception):
    """Root of the scraping exception hierarchy; catch this to handle any scraper failure."""


class NavigationError(WebScrapingError):
    """Raised when navigating to a page ultimately fails."""


class TableExtractionError(WebScrapingError):
    """Raised when table data cannot be located or parsed."""


class ConfigurationError(WebScrapingError):
    """Raised when a scraper configuration is invalid or incomplete."""