| import asyncio |
| import logging |
| from typing import Optional |
| from playwright.async_api import async_playwright, Browser, Playwright, BrowserContext |
| import random |
| import sys |
|
|
| |
# On Windows, switch asyncio to the proactor event loop policy before any loop
# is created.
# NOTE(review): presumably needed because Playwright drives the browser through
# a subprocess, which the Windows selector loop cannot spawn - confirm.
if sys.platform == "win32":
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Desktop Chrome user-agent strings (Windows, macOS, Linux). One is picked at
# random for each new browser context.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
]
|
|
class BrowserManager:
    """Singleton owner of one shared headless Chromium browser.

    Every ``BrowserManager()`` call returns the same object. Call ``start()``
    once before ``get_content()`` and ``shutdown()`` when finished. At most
    three ``get_content()`` calls run concurrently; the rest wait on
    ``self.semaphore``.
    """

    _instance = None

    def __new__(cls):
        """Return the single shared instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            # __init__ runs on every BrowserManager() call; this flag lets it
            # skip re-initialisation after the first one.
            cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        """Set up empty browser state and the concurrency limiter (once)."""
        if self._initialized:
            return

        self.playwright: Optional[Playwright] = None
        self.browser: Optional[Browser] = None

        # Caps the number of simultaneously open contexts/pages.
        # NOTE(review): this is built at construction time; on Python < 3.10
        # asyncio primitives bind to the loop current at construction -
        # confirm the minimum supported Python version.
        self.semaphore = asyncio.Semaphore(3)
        self._initialized = True
        logger.info("BrowserManager initialized (Semaphore: 3)")

    async def start(self):
        """Start Playwright and launch the shared Chromium browser.

        Does nothing if a browser is already running.

        Raises:
            Exception: whatever Playwright raised while starting or launching.
                The Playwright driver started by this call is stopped first,
                so a later retry begins from a clean state.
        """
        if self.browser:
            return

        try:
            logger.info("Starting Playwright...")
            if self.playwright is None:
                self.playwright = await async_playwright().start()
            self.browser = await self.playwright.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-gpu',
                ],
            )
            logger.info("Global Browser Instance Started successfully")
        except Exception as e:
            logger.error(f"Failed to start Playwright: {e}")
            # Without this, a failed launch leaves the driver running and the
            # next start() would start a second one on top of it.
            if self.playwright is not None:
                try:
                    await self.playwright.stop()
                except Exception as stop_error:
                    logger.warning(
                        f"Failed to stop Playwright after start failure: {stop_error}"
                    )
                self.playwright = None
            raise

    async def shutdown(self):
        """Close the shared browser and stop Playwright.

        Both steps are attempted and both references are cleared even if one
        of them raises; the exception still propagates to the caller.
        """
        logger.info("Shutting down BrowserManager...")
        try:
            if self.browser:
                await self.browser.close()
        finally:
            self.browser = None
            try:
                if self.playwright:
                    await self.playwright.stop()
            finally:
                self.playwright = None

        logger.info("BrowserManager shutdown complete")

    @staticmethod
    async def _close_quietly(resource, label: str) -> None:
        """Best-effort ``await resource.close()``; log and ignore ordinary errors."""
        if resource is None:
            return
        try:
            await resource.close()
        except Exception as close_error:
            # Cleanup must not mask the scrape result. Only Exception is
            # caught, so cancellation and KeyboardInterrupt still propagate.
            logger.debug(f"Ignoring error while closing {label}: {close_error}")

    async def get_content(
        self,
        url: str,
        *,
        timeout_ms: int = 15000,
        settle_ms: int = 2000,
    ) -> Optional[str]:
        """Fetch the rendered HTML of ``url`` in a fresh browser context.

        Concurrency is bounded by ``self.semaphore`` to prevent resource
        exhaustion.

        Args:
            url: Address to navigate to.
            timeout_ms: Navigation timeout in milliseconds passed to
                ``page.goto``.
            settle_ms: Extra wait in milliseconds after ``domcontentloaded``
                before the page content is read; values <= 0 skip the wait.

        Returns:
            The page content as a string, or ``None`` if the browser has not
            been started or any step of the scrape raised.
        """
        if not self.browser:
            logger.error("Browser not initialized! Call start() first.")
            return None

        async with self.semaphore:
            context: Optional[BrowserContext] = None
            page = None
            try:
                # A new context per request, with a randomly chosen user agent.
                context = await self.browser.new_context(
                    user_agent=random.choice(USER_AGENTS),
                    viewport={'width': 1920, 'height': 1080},
                    java_script_enabled=True,
                )
                page = await context.new_page()

                logger.info(f"Navigating to {url}")
                await page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)

                # Fixed pause after DOMContentLoaded before reading the page.
                if settle_ms > 0:
                    await page.wait_for_timeout(settle_ms)

                content = await page.content()
                logger.info(f"Successfully scraped {len(content)} bytes from {url}")
                return content

            except Exception as e:
                logger.error(f"Scraping failed for {url}: {e}")
                return None

            finally:
                # Close the page first, then the context that owns it.
                await self._close_quietly(page, "page")
                await self._close_quietly(context, "context")
|
|
| |
# Shared instance created at import time. BrowserManager.__new__ returns this
# same object for every later BrowserManager() call.
browser_manager = BrowserManager()
|
|