Hana Celeste committed on
Commit
0d18fcb
·
verified ·
1 Parent(s): d9c02b8

Create fetch.py

Browse files
Files changed (1) hide show
  1. app/fetch.py +59 -0
app/fetch.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from playwright.async_api import async_playwright
3
+
4
class Fetcher:
    """Load pages in one shared headless Chromium context and return their HTML.

    Lifecycle: create the object, ``await start()`` once, call ``fetch()`` as
    often as needed (concurrent calls are limited by ``self.sem``), then
    ``await stop()`` to release the browser and the Playwright driver.
    """

    # Resource types aborted by the routing rule installed in start().
    _BLOCKED_RESOURCE_TYPES = frozenset({"image", "font", "media"})

    def __init__(self, max_concurrency: int = 2):
        """Prepare an unstarted fetcher.

        Args:
            max_concurrency: Maximum number of pages that fetch() may have
                open at the same time. Defaults to 2.
        """
        self.playwright = None
        self.browser = None
        self.context = None
        # With <= 2 CPU cores, do not overdo the number of parallel pages.
        self.sem = asyncio.Semaphore(max_concurrency)

    @classmethod
    async def _route_request(cls, route) -> None:
        """Abort requests for blocked resource types; let the rest through."""
        if route.request.resource_type in cls._BLOCKED_RESOURCE_TYPES:
            await route.abort()
        else:
            await route.continue_()

    async def start(self) -> None:
        """Launch Chromium, open the shared context and install the blocker.

        If any step fails, everything that was already started is shut down
        again before the exception is re-raised, so no browser process or
        driver is left behind.
        """
        self.playwright = await async_playwright().start()
        try:
            self.browser = await self.playwright.chromium.launch(
                headless=True,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--no-sandbox",
                    "--disable-dev-shm-usage",
                ],
            )
            self.context = await self.browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/121.0",
                viewport={"width": 1280, "height": 720},
            )

            # Block heavy resources (images, fonts, media) for every page
            # opened in this context.
            await self.context.route("**/*", self._route_request)
        except BaseException:
            # Also covers cancellation: release whatever was already started.
            await self.stop()
            raise

    async def stop(self) -> None:
        """Close the context, the browser and the Playwright driver.

        Safe to call on an unstarted or already stopped fetcher. The
        attributes are reset to ``None`` first, and each closer runs even if
        an earlier one raises.
        """
        context, browser, playwright = self.context, self.browser, self.playwright
        self.context = self.browser = self.playwright = None
        try:
            if context is not None:
                await context.close()
        finally:
            try:
                if browser is not None:
                    await browser.close()
            finally:
                if playwright is not None:
                    await playwright.stop()

    async def fetch(self, url: str) -> dict:
        """Navigate to *url* and return the page HTML.

        Args:
            url: Address to load.

        Returns:
            ``{"ok": True, "length": <len(html)>, "html": <html>}`` on
            success, or ``{"ok": False, "error": <message>}`` on any failure,
            including calling fetch() while the fetcher is not started.
        """
        if self.context is None:
            # Report through the normal error channel instead of failing
            # with an AttributeError on None.
            return {
                "ok": False,
                "error": "Fetcher is not started; await start() before fetch()",
            }

        async with self.sem:
            page = None
            try:
                # Page creation is inside the try so that its failures are
                # reported as an error dict like any navigation failure.
                page = await self.context.new_page()
                await page.goto(url, timeout=30000, wait_until="domcontentloaded")
                content = await page.content()
                return {
                    "ok": True,
                    "length": len(content),
                    "html": content,
                }
            except Exception as e:
                return {
                    "ok": False,
                    "error": str(e),
                }
            finally:
                if page is not None:
                    await page.close()