File size: 1,225 Bytes
ffce262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import asyncio
from patchright.async_api import async_playwright
from markdownify import markdownify as md
from bs4 import BeautifulSoup
from readability import Document

async def scrape(url):
    """Load *url* in a headless Chromium page and return its HTML.

    A fresh browser, context (1920x1080 viewport, fixed desktop Chrome
    user agent) and page are created for every call. Requests whose
    resource type is image, media, font, stylesheet or "other" are
    aborted; every other request is allowed through.

    Args:
        url: Address passed to ``page.goto``.

    Returns:
        The value of ``page.content()`` taken once ``goto`` returns with
        ``wait_until='domcontentloaded'``.

    Raises:
        Any exception raised by ``page.goto`` or ``page.content``
        propagates to the caller after the browser has been closed.
    """
    skipped_types = ["image", "media", "font", "stylesheet", "other"]

    def _filter_request(route):
        # The awaitable from abort()/continue_() is returned unawaited,
        # just as a one-expression lambda handler would hand it back.
        if route.request.resource_type in skipped_types:
            return route.abort()
        return route.continue_()

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)

        viewport_size = {'width': 1920, 'height': 1080}
        context = await browser.new_context(
            viewport=viewport_size,
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
        )

        page = await context.new_page()

        # Install the filter on every URL before navigating.
        await page.route("**/*", _filter_request)

        try:
            await page.goto(url, wait_until='domcontentloaded', timeout=30000)
            html = await page.content()
        finally:
            # Close the browser whether or not navigation succeeded.
            await browser.close()

        return html

async def sanitize(content):
    """Re-serialise *content* through BeautifulSoup's ``html.parser``.

    Declared ``async`` but contains no ``await``; the parsing runs
    synchronously inside the coroutine.

    Args:
        content: Markup handed to ``BeautifulSoup``.

    Returns:
        The ``prettify()`` output of the parsed tree.
    """
    return BeautifulSoup(content, "html.parser").prettify()

async def convert(content):
    """Run *content* through ``markdownify`` and return the result.

    Declared ``async`` but contains no ``await``; the conversion runs
    synchronously inside the coroutine.

    Args:
        content: Markup passed unchanged to ``md`` (markdownify).

    Returns:
        Whatever ``md(content)`` produces.
    """
    markdown_text = md(content)
    return markdown_text