Spaces:
Running
Running
| import asyncio | |
| from playwright.async_api import async_playwright | |
| from typing import Dict, Optional | |
| import time | |
| from settings import settings | |
class HTMLLoader:
    """Async context manager that fetches fully rendered HTML via Playwright.

    Usage:
        async with HTMLLoader() as loader:
            result = await loader.load_page(url)

    Launches a (headless, per settings) Chromium instance on entry and tears
    everything down on exit. Configuration comes from ``settings.scraping``.
    """

    def __init__(self):
        # All Playwright resources are created lazily in __aenter__.
        # Initialise them to None so __aexit__ can run safely even if
        # __aenter__ failed partway through (the original omitted
        # self.playwright here, which made __aexit__ raise AttributeError
        # in that situation).
        self.playwright = None
        self.browser = None
        self.context = None

    async def __aenter__(self):
        """Start the Playwright driver and open a browser context."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=settings.scraping.headless
        )
        self.context = await self.browser.new_context(
            user_agent=settings.scraping.user_agent
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the context, browser, and Playwright driver (in that order)."""
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()

    async def load_page(self, url: str) -> Optional[Dict[str, str]]:
        """Load HTML content from URL handling both static and dynamic sites.

        Retries up to ``settings.scraping.max_retries`` times, sleeping
        ``settings.scraping.delay_between_requests`` seconds between attempts.

        Returns:
            A dict with keys ``"html"``, ``"title"``, ``"url"`` (final URL
            after redirects) and ``"timestamp"`` (int epoch seconds — NOTE:
            not a str, despite the value annotation), or ``None`` only in
            the degenerate case ``max_retries <= 0``.

        Raises:
            Exception: when every attempt fails; the last underlying error
                is attached as the chained cause.
        """
        for attempt in range(settings.scraping.max_retries):
            page = None
            try:
                page = await self.context.new_page()
                await page.goto(url, timeout=settings.scraping.timeout)
                # Wait for the configured selector (e.g. body) to appear.
                await page.wait_for_selector(
                    settings.scraping.wait_for_selector,
                    timeout=10000
                )
                # Additional grace period for late-loading dynamic content.
                await page.wait_for_timeout(2000)
                html_content = await page.content()
                title = await page.title()
                url_final = page.url
                return {
                    "html": html_content,
                    "title": title,
                    "url": url_final,
                    "timestamp": int(time.time()),
                }
            except Exception as e:
                if attempt == settings.scraping.max_retries - 1:
                    # Chain the cause so the original traceback survives.
                    raise Exception(f"Failed to load {url}: {str(e)}") from e
                await asyncio.sleep(settings.scraping.delay_between_requests)
            finally:
                # BUG FIX: the original closed the page only on success,
                # leaking one browser page per failed attempt. Close it
                # best-effort on every path.
                if page is not None:
                    try:
                        await page.close()
                    except Exception:
                        pass  # cleanup failure must not mask the real error
        # Only reachable when max_retries <= 0 (loop body never runs).
        return None