# NOTE(review): the lines below were page chrome captured from a Hugging Face
# Space listing ("Spaces: Runtime error") — not part of the module itself.
| """ | |
| Web Services Base Classes and Interfaces | |
| This module defines the abstract base classes and interfaces for web scraping | |
| services. It provides a foundation for implementing site-specific scrapers | |
| while maintaining a consistent API. | |
| The design separates: | |
| - Generic browser management (lifecycle, navigation, waits) | |
| - Generic table extraction (parsing, data cleaning, validation) | |
| - Site-specific configuration (URLs, selectors, timing, locales) | |
| - Site-specific business logic (filtering, data transformation) | |
| This allows for: | |
| - Reusable generic components across different websites | |
| - Easy testing with mocked configurations | |
| - Clear separation of concerns | |
| - Backward compatibility with existing implementations | |
| """ | |
from __future__ import annotations

import asyncio
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple, Union

from playwright.async_api import BrowserContext, Page
@dataclass
class BrowserConfig:
    """Configuration for browser behavior and settings.

    Groups everything needed to launch and drive a browser: identity
    (user agent, locale, timezone), viewport, timeouts, rate limiting,
    extra headers, and launch arguments.

    Fix: the class used ``field(default_factory=...)`` without the
    ``@dataclass`` decorator, so the mutable defaults were raw ``Field``
    objects and ``BrowserConfig(...)`` accepted no keyword arguments.
    """

    # Browser settings
    headless: bool = True
    # default_factory keeps each instance's viewport dict independent
    viewport: Dict[str, int] = field(default_factory=lambda: {"width": 1920, "height": 1080})
    locale: str = "en-US"
    timezone_id: str = "UTC"
    user_agent: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

    # Timeout settings (in milliseconds)
    default_timeout: int = 30000
    navigation_timeout: int = 60000

    # Rate limiting delay between requests (milliseconds)
    rate_limit_ms: int = 500

    # Custom headers sent with every request
    extra_headers: Dict[str, str] = field(default_factory=dict)

    # Chromium launch flags; the automation-control flags reduce bot detection
    browser_args: List[str] = field(default_factory=lambda: [
        '--no-sandbox',
        '--disable-blink-features=AutomationControlled',
        '--disable-features=VizDisplayCompositor'
    ])
@dataclass
class TableConfig:
    """Configuration for table extraction behavior.

    Describes how to locate a table on a page (selectors, expected
    headers, index), how to validate extracted rows, and how long to
    wait for content to settle.

    Fix: added the missing ``@dataclass`` decorator — without it the
    ``field(default_factory=...)`` defaults were raw ``Field`` objects
    and the class could not be instantiated with keyword arguments.
    """

    # Target selectors (CSS); fallbacks cover tables without thead/tbody
    table_selector: str = "table"
    header_selector: str = "thead tr th, tr:first-child th, tr:first-child td"
    row_selector: str = "tbody tr, tr"
    cell_selector: str = "td, th"

    # Table identification: match by headers, or pin an explicit index
    expected_headers: List[str] = field(default_factory=list)
    table_index: Optional[int] = None

    # Data validation applied to extracted rows
    required_fields: List[str] = field(default_factory=list)
    skip_empty_rows: bool = True

    # Timing (milliseconds) to let dynamic content render
    wait_after_navigation_ms: int = 2000
    wait_for_content_ms: int = 3000
@dataclass
class SiteConfig:
    """Configuration for a specific website.

    Bundles the generic browser/table configuration with site-specific
    URLs, selectors, and data mappings. ``base_url``, ``browser_config``
    and ``table_config`` are required; the dict fields default to empty.

    Fix: added the missing ``@dataclass`` decorator — the class declared
    bare annotations and ``field(default_factory=...)`` defaults, which
    only work on a dataclass; plain instantiation raised at runtime.
    """

    base_url: str
    browser_config: BrowserConfig
    table_config: TableConfig

    # Site-specific URLs keyed by logical name (e.g. "standings")
    urls: Dict[str, str] = field(default_factory=dict)

    # Site-specific CSS/XPath selectors keyed by logical name
    selectors: Dict[str, str] = field(default_factory=dict)

    # Site-specific data mappings (column renames, value transforms, ...)
    data_mappings: Dict[str, Any] = field(default_factory=dict)
class IBrowserManager(ABC):
    """Abstract interface for browser management.

    Implementations own the browser lifecycle: named contexts,
    navigation with retries, rate limiting, and teardown.

    Fix: the methods were missing ``@abstractmethod`` (the import was
    unused), so this "interface" was concretely instantiable and every
    method silently returned ``None`` instead of forcing an override.
    """

    @abstractmethod
    async def get_context(self, context_id: str = "default") -> BrowserContext:
        """Get or create a browser context identified by *context_id*."""

    @abstractmethod
    async def navigate_with_retry(self, page: Page, url: str, max_retries: int = 3) -> bool:
        """Navigate to *url*, retrying up to *max_retries* times; return success."""

    @abstractmethod
    async def apply_rate_limiting(self, delay_ms: Optional[int] = None):
        """Sleep for *delay_ms* (or the configured default) between requests."""

    @abstractmethod
    async def close_context(self, context_id: str = "default"):
        """Close a specific browser context."""

    @abstractmethod
    async def close_all(self):
        """Close all browser resources."""
class ITableScraper(ABC):
    """Abstract interface for table scraping.

    Implementations locate HTML tables on a page and extract their rows
    as dictionaries keyed by header text.

    Fix: added the missing ``@abstractmethod`` decorators — without
    them the ABC was instantiable and the stub methods returned ``None``
    rather than requiring an implementation.
    """

    @abstractmethod
    async def extract_table_data(
        self,
        page: Page,
        table_config: Optional[TableConfig] = None
    ) -> List[Dict[str, Any]]:
        """Extract data from HTML tables on the page."""

    @abstractmethod
    async def find_table_by_headers(
        self,
        page: Page,
        expected_headers: List[str]
    ) -> Optional[int]:
        """Find a table's index by matching *expected_headers*; None if absent."""

    @abstractmethod
    async def extract_table_by_index(
        self,
        page: Page,
        table_index: int,
        table_config: Optional[TableConfig] = None
    ) -> List[Dict[str, Any]]:
        """Extract data from the table at position *table_index*."""
class ISiteAdapter(ABC):
    """Abstract interface for site-specific adapters.

    An adapter binds the generic scraping machinery to one website: it
    declares the data types the site exposes, validates filters, and
    fetches data.

    Fixes: added the missing ``@abstractmethod`` decorators, and typed
    the nullable ``filters`` parameter as ``Optional`` (it defaulted to
    ``None`` while annotated as a plain ``Dict``).
    """

    @abstractmethod
    async def fetch_data(
        self,
        data_type: str,
        filters: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """Fetch data of a specific type with optional filters."""

    @abstractmethod
    def get_supported_data_types(self) -> List[str]:
        """Get list of supported data types."""

    @abstractmethod
    def validate_filters(
        self,
        data_type: str,
        filters: Dict[str, Any]
    ) -> Tuple[bool, Dict[str, Any], List[str]]:
        """Validate filters for a data type.

        Returns a 3-tuple — presumably (is_valid, normalized_filters,
        error_messages); confirm against concrete implementations.
        """
class WebScrapingError(Exception):
    """Root of the scraping exception hierarchy.

    Catch this to handle any failure raised by the scraping layer.
    """
class NavigationError(WebScrapingError):
    """Raised when navigating to a page fails (e.g. after retries are exhausted)."""
class TableExtractionError(WebScrapingError):
    """Raised when table data cannot be located or parsed from a page."""
class ConfigurationError(WebScrapingError):
    """Raised when a site/browser/table configuration is invalid or incomplete."""