# Translaterpeed / app / scraper / browser_manager.py
# Hosting-page metadata (not part of the module): Ruhivig65 - "Upload 5 files" - 14345e3 (verified)
"""
============================================
Browser Manager
- Manages Playwright browser lifecycle
- Creates isolated browser contexts (like separate browser profiles)
- Implements concurrency control with asyncio.Semaphore
- Applies stealth settings to avoid detection
============================================
"""
import asyncio
import logging
from typing import Optional, Dict
from playwright.async_api import (
async_playwright,
Browser,
BrowserContext,
Page,
Playwright,
Error as PlaywrightError,
)
from app.config import settings
# Module-level logger, named after this module's import path via __name__.
logger = logging.getLogger(__name__)
class BrowserManager:
    """
    Singleton-style manager for one Playwright Chromium browser.

    Key Design:
    - ONE browser instance (Chromium), launched by initialize()
    - MULTIPLE browser contexts, one per novel id, each with a single page
    - A semaphore sized by settings.MAX_CONCURRENT_BROWSERS is exposed via
      the ``semaphore`` property; this class never acquires it itself, so
      callers are expected to hold it around their scraping work
    - Each context gets a randomly chosen user agent / viewport plus a
      stealth init script

    NOTE(review): the emoji-like prefixes in the log messages look
    mis-encoded in this file; they are reproduced unchanged here.
    """

    # Chromium command-line switches passed to launch().
    _LAUNCH_ARGS = (
        # --- Performance & Stability ---
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-dev-shm-usage",  # Prevents /dev/shm issues in Docker
        "--disable-gpu",  # No GPU in Docker
        "--disable-software-rasterizer",
        # --- Memory Optimization ---
        "--disable-extensions",
        "--disable-background-networking",
        "--disable-background-timer-throttling",
        "--disable-backgrounding-occluded-windows",
        "--disable-breakpad",
        "--disable-component-update",
        "--disable-default-apps",
        "--disable-hang-monitor",
        "--disable-popup-blocking",
        "--disable-prompt-on-repost",
        "--disable-renderer-backgrounding",
        "--disable-sync",
        "--disable-translate",
        "--metrics-recording-only",
        "--no-first-run",
        "--safebrowsing-disable-auto-update",
        # --- Anti-Detection ---
        "--disable-blink-features=AutomationControlled",
        "--disable-infobars",
        "--window-size=1920,1080",
    )

    # Pool of user agents; one is picked at random per context.
    # NOTE(review): the pool contains a Firefox UA and macOS/Linux UAs while
    # the browser is Chromium and the init script forces platform 'Win32' and
    # defines window.chrome -- these combinations are inconsistent; confirm
    # whether that is intended.
    _USER_AGENTS = (
        # Chrome on Windows
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        # Chrome on Mac
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        # Firefox on Windows
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
        # Edge on Windows
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0",
        # Chrome on Linux
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    )

    # Pool of viewport sizes; one is picked at random per context.
    _VIEWPORTS = (
        {"width": 1920, "height": 1080},
        {"width": 1366, "height": 768},
        {"width": 1536, "height": 864},
        {"width": 1440, "height": 900},
        {"width": 1280, "height": 720},
    )

    # JavaScript registered on every new context via add_init_script().
    # The permissions override calls the saved native `query` with its
    # original receiver; calling it detached fails with "Illegal invocation".
    _STEALTH_INIT_SCRIPT = """
        // Override webdriver detection
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        });
        // Override chrome detection
        window.chrome = {
            runtime: {},
            loadTimes: function() {},
            csi: function() {},
            app: {}
        };
        // Override permissions query
        const originalQuery = window.navigator.permissions.query;
        window.navigator.permissions.query = (parameters) => (
            parameters.name === 'notifications' ?
            Promise.resolve({ state: Notification.permission }) :
            originalQuery.call(window.navigator.permissions, parameters)
        );
        // Override plugins length
        Object.defineProperty(navigator, 'plugins', {
            get: () => [1, 2, 3, 4, 5]
        });
        // Override languages
        Object.defineProperty(navigator, 'languages', {
            get: () => ['en-US', 'en']
        });
        // Override platform
        Object.defineProperty(navigator, 'platform', {
            get: () => 'Win32'
        });
        // Remove automation-related properties
        delete navigator.__proto__.webdriver;
    """

    def __init__(self):
        """Set up empty state; the browser is not started until initialize()."""
        self._playwright: Optional[Playwright] = None
        self._browser: Optional[Browser] = None
        # Semaphore callers use to limit how many novels scrape simultaneously.
        # NOTE(review): on Python < 3.10 asyncio primitives bind to the event
        # loop current at construction time -- confirm the target version if
        # this object is built at import time.
        self._semaphore = asyncio.Semaphore(settings.MAX_CONCURRENT_BROWSERS)
        # Track active contexts: {novel_id: BrowserContext}
        self._active_contexts: Dict[int, BrowserContext] = {}
        # Track active pages: {novel_id: Page}
        self._active_pages: Dict[int, Page] = {}
        # Serializes initialize()/shutdown() between coroutines. asyncio.Lock
        # is neither reentrant nor thread-safe.
        self._lock = asyncio.Lock()
        # Status flag
        self._is_initialized = False

    async def initialize(self):
        """
        Start Playwright and launch the headless Chromium browser.

        Does nothing if the manager is already initialized.

        Raises:
            Exception: whatever Playwright raised while starting or
                launching; partially created resources are torn down first.
        """
        async with self._lock:
            if self._is_initialized:
                logger.info("Browser already initialized, skipping.")
                return
            try:
                logger.info("๐Ÿš€ Starting Playwright...")
                self._playwright = await async_playwright().start()
                logger.info("๐ŸŒ Launching Chromium browser...")
                self._browser = await self._playwright.chromium.launch(
                    headless=True,  # MUST be headless on Hugging Face
                    args=list(self._LAUNCH_ARGS),
                )
                self._is_initialized = True
                logger.info("โœ… Browser launched successfully!")
            except Exception as e:
                logger.error(f"โŒ Failed to launch browser: {e}")
                # self._lock is already held here and is not reentrant, so
                # calling shutdown() would wait on it forever. Run the
                # teardown directly instead.
                await self._shutdown_unlocked()
                raise

    async def create_context_for_novel(self, novel_id: int) -> tuple:
        """
        Create an isolated browser context and one page for a novel.

        A user agent and viewport are picked at random, the stealth init
        script is registered, and the page's default timeouts are set from
        settings.PAGE_TIMEOUT_SECONDS. If a context is already tracked for
        ``novel_id`` it is closed first so it is not leaked.

        Args:
            novel_id: Key under which the context and page are tracked.

        Returns:
            (BrowserContext, Page)

        Raises:
            RuntimeError: if initialize() has not completed successfully.
            Exception: any error raised while creating the context or page;
                a half-built context is closed before re-raising.
        """
        if not self._is_initialized or self._browser is None:
            raise RuntimeError("Browser not initialized! Call initialize() first.")

        import random

        if novel_id in self._active_contexts or novel_id in self._active_pages:
            # Overwriting the tracking entries would orphan the old context.
            logger.warning(f"Replacing existing browser context for Novel {novel_id}")
            await self.close_context(novel_id)

        selected_ua = random.choice(self._USER_AGENTS)
        selected_viewport = random.choice(self._VIEWPORTS)

        context: Optional[BrowserContext] = None
        try:
            context = await self._browser.new_context(
                # --- Identity ---
                user_agent=selected_ua,
                viewport=selected_viewport,
                # --- Locale & Timezone (appear as US user) ---
                locale="en-US",
                timezone_id="America/New_York",
                # --- Permissions ---
                permissions=["geolocation"],
                geolocation={"latitude": 40.7128, "longitude": -74.0060},  # New York
                # --- Other Stealth Settings ---
                color_scheme="light",
                java_script_enabled=True,
                has_touch=False,
                is_mobile=False,
                # --- Ignore HTTPS errors (some novel sites have bad certs) ---
                ignore_https_errors=True,
            )
            # --- Inject stealth scripts to hide automation ---
            await context.add_init_script(self._STEALTH_INIT_SCRIPT)
            # --- Create a new page in this context ---
            page = await context.new_page()
            # --- Set default timeouts (Playwright expects milliseconds) ---
            timeout_ms = settings.PAGE_TIMEOUT_SECONDS * 1000
            page.set_default_timeout(timeout_ms)
            page.set_default_navigation_timeout(timeout_ms)
        except Exception as e:
            logger.error(f"โŒ Failed to create context for Novel {novel_id}: {e}")
            if context is not None:
                # The context exists but was never registered; close it here
                # or nothing else ever will.
                try:
                    await context.close()
                except Exception as close_err:
                    logger.warning(
                        f"Error closing context for Novel {novel_id}: {close_err}"
                    )
            raise

        # --- Store references ---
        self._active_contexts[novel_id] = context
        self._active_pages[novel_id] = page
        logger.info(
            f"โœ… Created browser context for Novel {novel_id} "
            f"(UA: {selected_ua[:50]}..., Viewport: {selected_viewport})"
        )
        return context, page

    def get_page(self, novel_id: int) -> Optional[Page]:
        """Return the tracked page for ``novel_id``, or None if there is none."""
        return self._active_pages.get(novel_id)

    def get_context(self, novel_id: int) -> Optional[BrowserContext]:
        """Return the tracked context for ``novel_id``, or None if there is none."""
        return self._active_contexts.get(novel_id)

    async def close_context(self, novel_id: int):
        """
        Close and forget the page and context tracked for ``novel_id``.

        Best effort: errors are logged as warnings and never raised. Both
        tracking entries are removed up front, and the context is closed
        even when closing the page fails.
        """
        page = self._active_pages.pop(novel_id, None)
        context = self._active_contexts.pop(novel_id, None)

        # Close page first
        if page is not None:
            try:
                if not page.is_closed():
                    await page.close()
            except Exception as e:
                logger.warning(f"Error closing page for Novel {novel_id}: {e}")

        # Then close context
        if context is not None:
            try:
                await context.close()
            except Exception as e:
                logger.warning(f"Error closing context for Novel {novel_id}: {e}")
                return

        logger.info(f"๐Ÿ”’ Closed browser context for Novel {novel_id}")

    @property
    def semaphore(self) -> asyncio.Semaphore:
        """Get the concurrency semaphore."""
        return self._semaphore

    @property
    def active_count(self) -> int:
        """Get number of currently tracked browser contexts."""
        return len(self._active_contexts)

    @property
    def is_initialized(self) -> bool:
        """Check if the browser has been launched successfully."""
        return self._is_initialized

    def get_active_novel_ids(self) -> list:
        """Get list of novel IDs with tracked browser contexts."""
        return list(self._active_contexts)

    def _require_page(self, novel_id: int) -> Page:
        """Return the open page for ``novel_id`` or raise RuntimeError."""
        page = self._active_pages.get(novel_id)
        if page is None or page.is_closed():
            raise RuntimeError(f"No active page for Novel {novel_id}")
        return page

    async def take_screenshot(self, novel_id: int, filename: str) -> Optional[str]:
        """
        Save a PNG screenshot of the visible viewport of a novel's page.

        The file is written to settings.SCREENSHOTS_DIR (created if missing)
        under ``filename``.

        Returns:
            Path to the saved screenshot, or None when there is no open page
            or the screenshot fails (the failure is logged, not raised).
        """
        page = self._active_pages.get(novel_id)
        if page is None or page.is_closed():
            logger.warning(f"Cannot take screenshot: No active page for Novel {novel_id}")
            return None
        try:
            import os

            os.makedirs(settings.SCREENSHOTS_DIR, exist_ok=True)
            filepath = os.path.join(settings.SCREENSHOTS_DIR, filename)
            await page.screenshot(
                path=filepath,
                full_page=False,  # Only visible viewport
                type="png",
            )
            logger.info(f"๐Ÿ“ธ Screenshot saved: {filepath}")
            return filepath
        except Exception as e:
            logger.error(f"Failed to take screenshot for Novel {novel_id}: {e}")
            return None

    async def click_at_coordinates(self, novel_id: int, x: int, y: int):
        """
        Click at page coordinates (x, y), then pause for two seconds.

        Raises:
            RuntimeError: if there is no open page for ``novel_id``.
            Exception: any error raised by the click (logged, then re-raised).
        """
        page = self._require_page(novel_id)
        try:
            await page.mouse.click(x, y)
            logger.info(f"๐Ÿ–ฑ๏ธ Clicked at ({x}, {y}) for Novel {novel_id}")
            # Wait a bit after clicking
            await asyncio.sleep(2)
        except Exception as e:
            logger.error(f"Failed to click at ({x}, {y}) for Novel {novel_id}: {e}")
            raise

    async def type_text(self, novel_id: int, selector: str, text: str):
        """
        Fill the element matching ``selector`` with ``text``.

        Raises:
            RuntimeError: if there is no open page for ``novel_id``.
            Exception: any error raised by the fill (logged, then re-raised).
        """
        page = self._require_page(novel_id)
        try:
            await page.fill(selector, text)
            logger.info(f"โŒจ๏ธ Typed text into '{selector}' for Novel {novel_id}")
        except Exception as e:
            logger.error(f"Failed to type text for Novel {novel_id}: {e}")
            raise

    async def shutdown(self):
        """
        Close all contexts, the browser, and Playwright.

        Takes the manager lock; errors during teardown are logged as
        warnings and not raised.
        """
        async with self._lock:
            await self._shutdown_unlocked()

    async def _shutdown_unlocked(self):
        """Tear everything down; the caller must already hold self._lock."""
        logger.info("๐Ÿ›‘ Shutting down Browser Manager...")
        # Close all active contexts (iterate a snapshot: close_context mutates)
        for novel_id in list(self._active_contexts):
            await self.close_context(novel_id)
        # Close browser
        if self._browser:
            try:
                await self._browser.close()
            except Exception as e:
                logger.warning(f"Error closing browser: {e}")
            self._browser = None
        # Stop Playwright
        if self._playwright:
            try:
                await self._playwright.stop()
            except Exception as e:
                logger.warning(f"Error stopping Playwright: {e}")
            self._playwright = None
        self._is_initialized = False
        logger.info("โœ… Browser Manager shut down completely.")
# ============================================
# Global Singleton Instance
# ============================================
# Shared module-level instance, created when this module is imported.
# Construction only sets up in-memory state (it reads
# settings.MAX_CONCURRENT_BROWSERS for the semaphore); the browser itself is
# not launched until `await browser_manager.initialize()` is called.
browser_manager = BrowserManager()