Spaces:
Sleeping
Sleeping
| """ | |
| ============================================ | |
| Browser Manager | |
| - Manages Playwright browser lifecycle | |
| - Creates isolated browser contexts (like separate browser profiles) | |
| - Implements concurrency control with asyncio.Semaphore | |
| - Applies stealth settings to avoid detection | |
| ============================================ | |
| """ | |
| import asyncio | |
| import logging | |
| from typing import Optional, Dict | |
| from playwright.async_api import ( | |
| async_playwright, | |
| Browser, | |
| BrowserContext, | |
| Page, | |
| Playwright, | |
| Error as PlaywrightError, | |
| ) | |
| from app.config import settings | |
| logger = logging.getLogger(__name__) | |
class BrowserManager:
    """
    Singleton-style manager for one Playwright Chromium browser.

    Key design:
    - ONE browser instance (Chromium), launched by ``initialize()``.
    - MULTIPLE browser contexts, one per novel, each with its own page.
    - A semaphore sized by ``settings.MAX_CONCURRENT_BROWSERS`` is exposed
      through ``semaphore()``. This class never acquires it itself; callers
      are expected to hold it while they scrape.
    - Each context gets a randomly chosen user agent and viewport plus an
      init script that hides common automation markers.
    """

    # Desktop user agents; one is picked at random per context.
    # NOTE(review): the Firefox UA is served by a Chromium build and the init
    # script always reports platform 'Win32', so some combinations are
    # self-inconsistent -- confirm whether that matters for the target sites.
    _USER_AGENTS = (
        # Chrome on Windows
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        # Chrome on Mac
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        # Firefox on Windows
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
        # Edge on Windows
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0",
        # Chrome on Linux
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    )

    # Realistic desktop screen sizes; one is picked at random per context.
    _VIEWPORTS = (
        {"width": 1920, "height": 1080},
        {"width": 1366, "height": 768},
        {"width": 1536, "height": 864},
        {"width": 1440, "height": 900},
        {"width": 1280, "height": 720},
    )

    # JavaScript injected into every page of a context before site scripts run.
    # The permissions override binds the native query() to its receiver:
    # calling the detached function fails with "Illegal invocation".
    _STEALTH_INIT_SCRIPT = """
        // Override webdriver detection
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        });
        // Override chrome detection
        window.chrome = {
            runtime: {},
            loadTimes: function() {},
            csi: function() {},
            app: {}
        };
        // Override permissions query
        const originalQuery = window.navigator.permissions.query.bind(window.navigator.permissions);
        window.navigator.permissions.query = (parameters) => (
            parameters.name === 'notifications' ?
            Promise.resolve({ state: Notification.permission }) :
            originalQuery(parameters)
        );
        // Override plugins length
        Object.defineProperty(navigator, 'plugins', {
            get: () => [1, 2, 3, 4, 5]
        });
        // Override languages
        Object.defineProperty(navigator, 'languages', {
            get: () => ['en-US', 'en']
        });
        // Override platform
        Object.defineProperty(navigator, 'platform', {
            get: () => 'Win32'
        });
        // Remove automation-related properties
        delete navigator.__proto__.webdriver;
    """

    def __init__(self):
        """Set up empty state; nothing is launched until ``initialize()``."""
        self._playwright: Optional[Playwright] = None
        self._browser: Optional[Browser] = None
        # Semaphore controls how many novels scrape simultaneously
        self._semaphore = asyncio.Semaphore(settings.MAX_CONCURRENT_BROWSERS)
        # Track active contexts: {novel_id: BrowserContext}
        self._active_contexts: Dict[int, BrowserContext] = {}
        # Track active pages: {novel_id: Page}
        self._active_pages: Dict[int, Page] = {}
        # Serializes initialize() and shutdown() against each other
        self._lock = asyncio.Lock()
        # Status flag
        self._is_initialized = False

    async def initialize(self):
        """
        Start Playwright and launch the Chromium browser.

        Safe to call repeatedly: once initialized, later calls only log and
        return. If startup fails, anything partially started is torn down
        and the original exception is re-raised.
        """
        async with self._lock:
            if self._is_initialized:
                logger.info("Browser already initialized, skipping.")
                return
            try:
                logger.info("๐ Starting Playwright...")
                self._playwright = await async_playwright().start()
                logger.info("๐ Launching Chromium browser...")
                self._browser = await self._playwright.chromium.launch(
                    headless=True,  # MUST be headless on Hugging Face
                    args=[
                        # --- Performance & Stability ---
                        "--no-sandbox",
                        "--disable-setuid-sandbox",
                        "--disable-dev-shm-usage",  # Prevents /dev/shm issues in Docker
                        "--disable-gpu",  # No GPU in Docker
                        "--disable-software-rasterizer",
                        # --- Memory Optimization ---
                        "--disable-extensions",
                        "--disable-background-networking",
                        "--disable-background-timer-throttling",
                        "--disable-backgrounding-occluded-windows",
                        "--disable-breakpad",
                        "--disable-component-update",
                        "--disable-default-apps",
                        "--disable-hang-monitor",
                        "--disable-popup-blocking",
                        "--disable-prompt-on-repost",
                        "--disable-renderer-backgrounding",
                        "--disable-sync",
                        "--disable-translate",
                        "--metrics-recording-only",
                        "--no-first-run",
                        "--safebrowsing-disable-auto-update",
                        # --- Anti-Detection ---
                        "--disable-blink-features=AutomationControlled",
                        "--disable-infobars",
                        "--window-size=1920,1080",
                    ],
                )
                self._is_initialized = True
                logger.info("โ Browser launched successfully!")
            except Exception as e:
                logger.error(f"โ Failed to launch browser: {e}")
                # self._lock is held here and asyncio.Lock is not reentrant,
                # so calling shutdown() would wait on itself forever. Use the
                # lock-free teardown instead.
                await self._shutdown_unlocked()
                raise

    async def create_context_for_novel(self, novel_id: int) -> tuple:
        """
        Create an isolated browser context and page for one novel.

        Each context is like a fresh browser profile: separate cookies and
        localStorage, its own viewport and user agent.

        If ``novel_id`` already has a tracked context it is closed first so
        it is not orphaned when the registry entry is replaced. If setup
        fails part-way, the half-built context is closed before re-raising.

        Returns:
            (BrowserContext, Page)

        Raises:
            RuntimeError: if ``initialize()`` has not completed successfully.
            Exception: whatever Playwright raised while building the context.
        """
        if not self._is_initialized or self._browser is None:
            raise RuntimeError("Browser not initialized! Call initialize() first.")

        import random

        if novel_id in self._active_contexts or novel_id in self._active_pages:
            logger.warning(
                f"Novel {novel_id} already has an active browser context; "
                f"closing it before creating a new one."
            )
            await self.close_context(novel_id)

        selected_ua = random.choice(self._USER_AGENTS)
        # Copy so the shared class-level dict is never handed out.
        selected_viewport = dict(random.choice(self._VIEWPORTS))

        context = None
        try:
            context = await self._browser.new_context(
                # --- Identity ---
                user_agent=selected_ua,
                viewport=selected_viewport,
                # --- Locale & Timezone (appear as US user) ---
                locale="en-US",
                timezone_id="America/New_York",
                # --- Permissions ---
                permissions=["geolocation"],
                geolocation={"latitude": 40.7128, "longitude": -74.0060},  # New York
                # --- Other Stealth Settings ---
                color_scheme="light",
                java_script_enabled=True,
                has_touch=False,
                is_mobile=False,
                # --- Ignore HTTPS errors (some novel sites have bad certs) ---
                ignore_https_errors=True,
            )
            # --- Inject stealth scripts to hide automation ---
            await context.add_init_script(self._STEALTH_INIT_SCRIPT)
            # --- Create a new page in this context ---
            page = await context.new_page()
            # --- Set default timeouts (Playwright expects milliseconds) ---
            timeout_ms = settings.PAGE_TIMEOUT_SECONDS * 1000
            page.set_default_timeout(timeout_ms)
            page.set_default_navigation_timeout(timeout_ms)
        except Exception as e:
            logger.error(f"โ Failed to create context for Novel {novel_id}: {e}")
            if context is not None:
                # The context is not in the registry yet, so nothing else
                # would ever close it.
                try:
                    await context.close()
                except Exception as close_err:
                    logger.warning(
                        f"Error closing half-created context for Novel {novel_id}: {close_err}"
                    )
            raise

        # --- Store references only once everything succeeded ---
        self._active_contexts[novel_id] = context
        self._active_pages[novel_id] = page
        logger.info(
            f"โ Created browser context for Novel {novel_id} "
            f"(UA: {selected_ua[:50]}..., Viewport: {selected_viewport})"
        )
        return context, page

    def get_page(self, novel_id: int) -> Optional[Page]:
        """Return the tracked page for ``novel_id``, or None if there is none."""
        return self._active_pages.get(novel_id)

    def get_context(self, novel_id: int) -> Optional[BrowserContext]:
        """Return the tracked browser context for ``novel_id``, or None."""
        return self._active_contexts.get(novel_id)

    async def close_context(self, novel_id: int):
        """
        Close the page and browser context tracked for ``novel_id``.

        Best-effort: errors are logged as warnings and never raised. Both
        registry entries are removed before anything is closed, so a failure
        closing the page cannot leave the context open and still tracked.
        """
        page = self._active_pages.pop(novel_id, None)
        context = self._active_contexts.pop(novel_id, None)

        # Close page first
        if page is not None:
            try:
                if not page.is_closed():
                    await page.close()
            except Exception as e:
                logger.warning(f"Error closing context for Novel {novel_id}: {e}")

        # Then close context, even if closing the page failed
        if context is not None:
            try:
                await context.close()
            except Exception as e:
                logger.warning(f"Error closing context for Novel {novel_id}: {e}")
                return

        logger.info(f"๐ Closed browser context for Novel {novel_id}")

    def semaphore(self) -> asyncio.Semaphore:
        """Get the concurrency semaphore."""
        return self._semaphore

    def active_count(self) -> int:
        """Get number of currently tracked browser contexts."""
        return len(self._active_contexts)

    def is_initialized(self) -> bool:
        """Check if the browser has been launched successfully."""
        return self._is_initialized

    def get_active_novel_ids(self) -> list:
        """Get list of novel IDs with tracked browser contexts."""
        return list(self._active_contexts)

    async def take_screenshot(self, novel_id: int, filename: str) -> Optional[str]:
        """
        Save a PNG screenshot of the visible viewport of a novel's page.

        The file is written to ``settings.SCREENSHOTS_DIR / filename``; the
        directory is created if missing.

        Returns:
            Path of the saved screenshot, or None when there is no open page
            for ``novel_id`` or the screenshot fails (the error is logged).
        """
        page = self._active_pages.get(novel_id)
        if page is None or page.is_closed():
            logger.warning(f"Cannot take screenshot: No active page for Novel {novel_id}")
            return None
        try:
            import os

            os.makedirs(settings.SCREENSHOTS_DIR, exist_ok=True)
            # NOTE(review): filename is joined as-is; confirm callers never
            # pass user-controlled path segments such as "../".
            filepath = os.path.join(settings.SCREENSHOTS_DIR, filename)
            await page.screenshot(
                path=filepath,
                full_page=False,  # Only visible viewport
                type="png",
            )
            logger.info(f"๐ธ Screenshot saved: {filepath}")
            return filepath
        except Exception as e:
            logger.error(f"Failed to take screenshot for Novel {novel_id}: {e}")
            return None

    async def click_at_coordinates(self, novel_id: int, x: int, y: int):
        """
        Click at viewport coordinates ``(x, y)`` on a novel's page.

        After the click this waits two seconds before returning.

        Raises:
            RuntimeError: if there is no open page for ``novel_id``.
            Exception: re-raised after logging if the click itself fails.
        """
        page = self._active_pages.get(novel_id)
        if page is None or page.is_closed():
            raise RuntimeError(f"No active page for Novel {novel_id}")
        try:
            await page.mouse.click(x, y)
            logger.info(f"๐ฑ๏ธ Clicked at ({x}, {y}) for Novel {novel_id}")
            # Wait a bit after clicking
            await asyncio.sleep(2)
        except Exception as e:
            logger.error(f"Failed to click at ({x}, {y}) for Novel {novel_id}: {e}")
            raise

    async def type_text(self, novel_id: int, selector: str, text: str):
        """
        Fill the element matching ``selector`` on a novel's page with ``text``.

        Raises:
            RuntimeError: if there is no open page for ``novel_id``.
            Exception: re-raised after logging if filling the field fails.
        """
        page = self._active_pages.get(novel_id)
        if page is None or page.is_closed():
            raise RuntimeError(f"No active page for Novel {novel_id}")
        try:
            await page.fill(selector, text)
            logger.info(f"โจ๏ธ Typed text into '{selector}' for Novel {novel_id}")
        except Exception as e:
            logger.error(f"Failed to type text for Novel {novel_id}: {e}")
            raise

    async def shutdown(self):
        """
        Shut down everything cleanly.

        Closes every tracked context, then the browser, then Playwright.
        Failures along the way are logged and do not stop the teardown.
        """
        async with self._lock:
            await self._shutdown_unlocked()

    async def _shutdown_unlocked(self):
        """Tear everything down; the caller must already hold ``self._lock``."""
        logger.info("๐ Shutting down Browser Manager...")
        # Close all active contexts (snapshot: close_context mutates the dict)
        for novel_id in list(self._active_contexts):
            await self.close_context(novel_id)
        # Close browser
        if self._browser:
            try:
                await self._browser.close()
            except Exception as e:
                logger.warning(f"Error closing browser: {e}")
            self._browser = None
        # Stop Playwright
        if self._playwright:
            try:
                await self._playwright.stop()
            except Exception as e:
                logger.warning(f"Error stopping Playwright: {e}")
            self._playwright = None
        self._is_initialized = False
        logger.info("โ Browser Manager shut down completely.")
# ============================================
# Global Singleton Instance
# ============================================
# Created at import time: BrowserManager.__init__ reads
# settings.MAX_CONCURRENT_BROWSERS and builds the asyncio semaphore and lock,
# but no browser is launched until initialize() is awaited.
browser_manager = BrowserManager()