import asyncio

from bs4 import BeautifulSoup
from markdownify import markdownify as md
from patchright.async_api import async_playwright
from readability import Document

# Resource types that are aborted instead of fetched: scrape() only returns
# the page's HTML, so these requests are skipped.
_BLOCKED_RESOURCE_TYPES = frozenset(
    {"image", "media", "font", "stylesheet", "other"}
)

# User-agent header sent by the browser context created in scrape().
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/115.0.0.0 Safari/537.36"
)


async def _abort_blocked_resources(route):
    """Abort requests for blocked resource types; let all others continue."""
    if route.request.resource_type in _BLOCKED_RESOURCE_TYPES:
        await route.abort()
    else:
        await route.continue_()


async def scrape(url: str) -> str:
    """Load *url* in headless Chromium and return the page's HTML.

    A browser context with a 1920x1080 viewport and a fixed desktop
    user agent is created, requests whose resource type is listed in
    ``_BLOCKED_RESOURCE_TYPES`` are aborted, and navigation waits for the
    ``domcontentloaded`` event with ``timeout=30000``.

    Args:
        url: Address of the page to load.

    Returns:
        The value of ``page.content()`` after navigation.

    Raises:
        Any exception raised by the browser API (for example a navigation
        timeout) propagates to the caller after the browser is closed.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # The try starts right after launch so the browser is closed even
        # when context/page creation or route registration fails, not only
        # when navigation does.
        try:
            context = await browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent=_USER_AGENT,
            )
            page = await context.new_page()
            await page.route("**/*", _abort_blocked_resources)
            await page.goto(url, wait_until="domcontentloaded", timeout=30000)
            return await page.content()
        finally:
            await browser.close()


async def sanitize(content: str) -> str:
    """Parse *content* with BeautifulSoup's ``html.parser`` and re-serialize it.

    The coroutine awaits nothing; it remains ``async`` so existing callers
    that ``await`` it keep working.

    Args:
        content: HTML markup to parse.

    Returns:
        The parsed document as produced by ``BeautifulSoup.prettify()``.
    """
    tree = BeautifulSoup(content, "html.parser")
    return tree.prettify()


async def convert(content: str) -> str:
    """Convert HTML *content* to Markdown using ``markdownify``.

    The coroutine awaits nothing; it remains ``async`` so existing callers
    that ``await`` it keep working.

    Args:
        content: HTML markup to convert.

    Returns:
        The result of ``markdownify(content)``.
    """
    return md(content)