Hana Celeste committed on
Commit
e2b6def
·
verified ·
1 Parent(s): 4cd5dc8

Update app/fetch.py

Browse files
Files changed (1) hide show
  1. app/fetch.py +13 -33
app/fetch.py CHANGED
@@ -1,22 +1,21 @@
1
  import random
2
  import json
3
- import asyncio
4
  from fastapi import HTTPException
5
  from playwright.async_api import async_playwright, TimeoutError
6
 
7
-
8
  UAS = [
9
- # Windows Chrome
10
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
11
  "(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
12
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
13
- "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
14
-
15
- # macOS Chrome
16
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
17
  "(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
18
  ]
19
 
 
 
 
 
 
 
20
 
21
  class Fetcher:
22
  def __init__(self):
@@ -27,6 +26,7 @@ class Fetcher:
27
  self.pw = await async_playwright().start()
28
  self.browser = await self.pw.chromium.launch(
29
  headless=True,
 
30
  args=[
31
  "--no-sandbox",
32
  "--disable-dev-shm-usage",
@@ -40,10 +40,8 @@ class Fetcher:
40
  if self.pw:
41
  await self.pw.stop()
42
 
43
- # =============================
44
-
45
- async def _new_context(self):
46
- return await self.browser.new_context(
47
  user_agent=random.choice(UAS),
48
  locale="en-US",
49
  timezone_id="Asia/Ho_Chi_Minh",
@@ -53,45 +51,32 @@ class Fetcher:
53
  },
54
  )
55
 
56
- async def fetch(self, url: str, retry: int = 1):
57
- context = await self._new_context()
58
  page = await context.new_page()
59
 
60
  try:
61
- # ===== CF WARM UP =====
62
  await page.goto(
63
  "https://ihentai.dog/",
64
  wait_until="domcontentloaded",
65
- timeout=15000,
66
  )
67
- await page.wait_for_timeout(1200) # CF settle
68
 
69
- # ===== REAL NAVIGATION =====
70
  resp = await page.goto(
71
  url,
72
  wait_until="domcontentloaded",
73
- timeout=15000,
74
  )
75
 
76
  if not resp:
77
  raise HTTPException(500, "No response")
78
 
79
  status = resp.status
80
- headers = resp.headers
81
  body = await resp.text()
82
 
83
- # ===== STATUS HANDLE =====
84
  if status >= 400:
85
  raise HTTPException(status, "Upstream error")
86
 
87
- if not body:
88
- return {
89
- "status": status,
90
- "headers": headers,
91
- "data": None,
92
- }
93
-
94
- # ===== JSON SAFE PARSE =====
95
  try:
96
  data = json.loads(body)
97
  except Exception:
@@ -99,15 +84,10 @@ class Fetcher:
99
 
100
  return {
101
  "status": status,
102
- "headers": headers,
103
  "data": data,
104
  }
105
 
106
  except TimeoutError:
107
- if retry > 0:
108
- await page.close()
109
- await context.close()
110
- return await self.fetch(url, retry=retry - 1)
111
  raise HTTPException(504, "Timeout")
112
 
113
  finally:
 
1
  import random
2
  import json
 
3
  from fastapi import HTTPException
4
  from playwright.async_api import async_playwright, TimeoutError
5
 
 
6
  UAS = [
 
7
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
8
  "(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
 
 
 
 
9
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
10
  "(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
11
  ]
12
 
13
+ PROXY = {
14
+ "server": "http://proxy.scrapingant.com:8080",
15
+ "username": "scrapingant&proxy_type=residential&proxy_country=VN&browser=false",
16
+ "password": "fdd15bd8d6f042f893d8bb93a8e47722",
17
+ }
18
+
19
 
20
  class Fetcher:
21
  def __init__(self):
 
26
  self.pw = await async_playwright().start()
27
  self.browser = await self.pw.chromium.launch(
28
  headless=True,
29
+ proxy=PROXY,
30
  args=[
31
  "--no-sandbox",
32
  "--disable-dev-shm-usage",
 
40
  if self.pw:
41
  await self.pw.stop()
42
 
43
+ async def fetch(self, url: str):
44
+ context = await self.browser.new_context(
 
 
45
  user_agent=random.choice(UAS),
46
  locale="en-US",
47
  timezone_id="Asia/Ho_Chi_Minh",
 
51
  },
52
  )
53
 
 
 
54
  page = await context.new_page()
55
 
56
  try:
57
+ # warm CF
58
  await page.goto(
59
  "https://ihentai.dog/",
60
  wait_until="domcontentloaded",
61
+ timeout=20000,
62
  )
63
+ await page.wait_for_timeout(1500)
64
 
 
65
  resp = await page.goto(
66
  url,
67
  wait_until="domcontentloaded",
68
+ timeout=20000,
69
  )
70
 
71
  if not resp:
72
  raise HTTPException(500, "No response")
73
 
74
  status = resp.status
 
75
  body = await resp.text()
76
 
 
77
  if status >= 400:
78
  raise HTTPException(status, "Upstream error")
79
 
 
 
 
 
 
 
 
 
80
  try:
81
  data = json.loads(body)
82
  except Exception:
 
84
 
85
  return {
86
  "status": status,
 
87
  "data": data,
88
  }
89
 
90
  except TimeoutError:
 
 
 
 
91
  raise HTTPException(504, "Timeout")
92
 
93
  finally: