""" ============================================ Browser Manager - Manages Playwright browser lifecycle - Creates isolated browser contexts (like separate browser profiles) - Implements concurrency control with asyncio.Semaphore - Applies stealth settings to avoid detection ============================================ """ import asyncio import logging from typing import Optional, Dict from playwright.async_api import ( async_playwright, Browser, BrowserContext, Page, Playwright, Error as PlaywrightError, ) from app.config import settings logger = logging.getLogger(__name__) class BrowserManager: """ Singleton-style manager for Playwright browser. Key Design: - ONE browser instance (Chromium) - MULTIPLE browser contexts (each novel gets its own context) - Semaphore limits concurrent contexts to MAX_CONCURRENT_BROWSERS - Each context has its own cookies, storage, fingerprint """ def __init__(self): self._playwright: Optional[Playwright] = None self._browser: Optional[Browser] = None # Semaphore controls how many novels scrape simultaneously self._semaphore = asyncio.Semaphore(settings.MAX_CONCURRENT_BROWSERS) # Track active contexts: {novel_id: BrowserContext} self._active_contexts: Dict[int, BrowserContext] = {} # Track active pages: {novel_id: Page} self._active_pages: Dict[int, Page] = {} # Lock for thread-safe operations self._lock = asyncio.Lock() # Status flag self._is_initialized = False async def initialize(self): """ Start Playwright and launch Chromium browser. Called once when the FastAPI app starts. """ async with self._lock: if self._is_initialized: logger.info("Browser already initialized, skipping.") return try: logger.info("🚀 Starting Playwright...") self._playwright = await async_playwright().start() logger.info("🌐 Launching Chromium browser...") self._browser = await self._playwright.chromium.launch( headless=True, # MUST be headless on Hugging Face args=[ # --- Performance & Stability --- "--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage", # Prevents /dev/shm issues in Docker "--disable-gpu", # No GPU in Docker "--disable-software-rasterizer", # --- Memory Optimization --- "--disable-extensions", "--disable-background-networking", "--disable-background-timer-throttling", "--disable-backgrounding-occluded-windows", "--disable-breakpad", "--disable-component-update", "--disable-default-apps", "--disable-hang-monitor", "--disable-popup-blocking", "--disable-prompt-on-repost", "--disable-renderer-backgrounding", "--disable-sync", "--disable-translate", "--metrics-recording-only", "--no-first-run", "--safebrowsing-disable-auto-update", # --- Anti-Detection --- "--disable-blink-features=AutomationControlled", "--disable-infobars", "--window-size=1920,1080", ], ) self._is_initialized = True logger.info("✅ Browser launched successfully!") except Exception as e: logger.error(f"❌ Failed to launch browser: {e}") await self.shutdown() raise async def create_context_for_novel(self, novel_id: int) -> tuple: """ Create an isolated browser context for a specific novel. Returns: (BrowserContext, Page) Each context is like a fresh browser profile: - Separate cookies (login stays separate per novel site) - Separate localStorage - Own viewport and user agent """ if not self._is_initialized or self._browser is None: raise RuntimeError("Browser not initialized! Call initialize() first.") # --- Stealth User Agents (rotate randomly) --- import random user_agents = [ # Chrome on Windows "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", # Chrome on Mac "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", # Firefox on Windows "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0", # Edge on Windows "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0", # Chrome on Linux "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", ] # --- Viewport sizes (realistic screen sizes) --- viewports = [ {"width": 1920, "height": 1080}, {"width": 1366, "height": 768}, {"width": 1536, "height": 864}, {"width": 1440, "height": 900}, {"width": 1280, "height": 720}, ] selected_ua = random.choice(user_agents) selected_viewport = random.choice(viewports) try: context = await self._browser.new_context( # --- Identity --- user_agent=selected_ua, viewport=selected_viewport, # --- Locale & Timezone (appear as US user) --- locale="en-US", timezone_id="America/New_York", # --- Permissions --- permissions=["geolocation"], geolocation={"latitude": 40.7128, "longitude": -74.0060}, # New York # --- Other Stealth Settings --- color_scheme="light", java_script_enabled=True, has_touch=False, is_mobile=False, # --- Ignore HTTPS errors (some novel sites have bad certs) --- ignore_https_errors=True, ) # --- Inject stealth scripts to hide automation --- await context.add_init_script(""" // Override webdriver detection Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); // Override chrome detection window.chrome = { runtime: {}, loadTimes: function() {}, csi: function() {}, app: {} }; // Override permissions query const originalQuery = window.navigator.permissions.query; window.navigator.permissions.query = (parameters) => ( parameters.name === 'notifications' ? Promise.resolve({ state: Notification.permission }) : originalQuery(parameters) ); // Override plugins length Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] }); // Override languages Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); // Override platform Object.defineProperty(navigator, 'platform', { get: () => 'Win32' }); // Remove automation-related properties delete navigator.__proto__.webdriver; """) # --- Create a new page in this context --- page = await context.new_page() # --- Set default timeouts --- page.set_default_timeout(settings.PAGE_TIMEOUT_SECONDS * 1000) page.set_default_navigation_timeout(settings.PAGE_TIMEOUT_SECONDS * 1000) # --- Store references --- self._active_contexts[novel_id] = context self._active_pages[novel_id] = page logger.info( f"✅ Created browser context for Novel {novel_id} " f"(UA: {selected_ua[:50]}..., Viewport: {selected_viewport})" ) return context, page except Exception as e: logger.error(f"❌ Failed to create context for Novel {novel_id}: {e}") raise def get_page(self, novel_id: int) -> Optional[Page]: """Get the active page for a specific novel.""" return self._active_pages.get(novel_id) def get_context(self, novel_id: int) -> Optional[BrowserContext]: """Get the active browser context for a specific novel.""" return self._active_contexts.get(novel_id) async def close_context(self, novel_id: int): """ Close the browser context for a specific novel. Called when scraping is done or failed. """ try: # Close page first page = self._active_pages.pop(novel_id, None) if page and not page.is_closed(): await page.close() # Then close context context = self._active_contexts.pop(novel_id, None) if context: await context.close() logger.info(f"🔒 Closed browser context for Novel {novel_id}") except Exception as e: logger.warning(f"Error closing context for Novel {novel_id}: {e}") @property def semaphore(self) -> asyncio.Semaphore: """Get the concurrency semaphore.""" return self._semaphore @property def active_count(self) -> int: """Get number of currently active browser contexts.""" return len(self._active_contexts) @property def is_initialized(self) -> bool: """Check if browser is initialized.""" return self._is_initialized def get_active_novel_ids(self) -> list: """Get list of novel IDs with active browser contexts.""" return list(self._active_contexts.keys()) async def take_screenshot(self, novel_id: int, filename: str) -> Optional[str]: """ Take a screenshot of the current page for a novel. Used for captcha detection & manual intervention. Returns: Path to saved screenshot or None """ page = self._active_pages.get(novel_id) if page is None or page.is_closed(): logger.warning(f"Cannot take screenshot: No active page for Novel {novel_id}") return None try: import os os.makedirs(settings.SCREENSHOTS_DIR, exist_ok=True) filepath = os.path.join(settings.SCREENSHOTS_DIR, filename) await page.screenshot( path=filepath, full_page=False, # Only visible viewport type="png", ) logger.info(f"📸 Screenshot saved: {filepath}") return filepath except Exception as e: logger.error(f"Failed to take screenshot for Novel {novel_id}: {e}") return None async def click_at_coordinates(self, novel_id: int, x: int, y: int): """ Click at specific coordinates on the page. Used for manual captcha solving from the UI. """ page = self._active_pages.get(novel_id) if page is None or page.is_closed(): raise RuntimeError(f"No active page for Novel {novel_id}") try: await page.mouse.click(x, y) logger.info(f"🖱️ Clicked at ({x}, {y}) for Novel {novel_id}") # Wait a bit after clicking await asyncio.sleep(2) except Exception as e: logger.error(f"Failed to click at ({x}, {y}) for Novel {novel_id}: {e}") raise async def type_text(self, novel_id: int, selector: str, text: str): """ Type text into an input field. Used for captcha text input from the UI. """ page = self._active_pages.get(novel_id) if page is None or page.is_closed(): raise RuntimeError(f"No active page for Novel {novel_id}") try: await page.fill(selector, text) logger.info(f"⌨️ Typed text into '{selector}' for Novel {novel_id}") except Exception as e: logger.error(f"Failed to type text for Novel {novel_id}: {e}") raise async def shutdown(self): """ Shut down everything cleanly. Called when FastAPI app is stopping. """ async with self._lock: logger.info("🛑 Shutting down Browser Manager...") # Close all active contexts for novel_id in list(self._active_contexts.keys()): await self.close_context(novel_id) # Close browser if self._browser: try: await self._browser.close() except Exception as e: logger.warning(f"Error closing browser: {e}") self._browser = None # Stop Playwright if self._playwright: try: await self._playwright.stop() except Exception as e: logger.warning(f"Error stopping Playwright: {e}") self._playwright = None self._is_initialized = False logger.info("✅ Browser Manager shut down completely.") # ============================================ # Global Singleton Instance # ============================================ browser_manager = BrowserManager()