Aqso committed on
Commit
bc11e45
·
verified ·
1 Parent(s): 7c74c3d

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +24 -27
scraper.py CHANGED
@@ -1,5 +1,5 @@
1
  from playwright.async_api import async_playwright
2
- from playwright_stealth import stealth_async
3
  from bs4 import BeautifulSoup
4
  from database import db
5
  from firebase_admin import firestore
@@ -9,35 +9,41 @@ import random
9
  async def start_scrape_generator(target_url):
10
  async with async_playwright() as p:
11
  yield "πŸš€ Menjalankan Browser Stealth (Chromium)..."
12
- browser = await p.chromium.launch(headless=True)
13
- # Meniru User Agent asli
 
 
 
 
 
14
  context = await browser.new_context(
15
- user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
16
  )
 
17
  page = await context.new_page()
18
- await stealth_async(page) # Ini kunci tembus Cloudflare
 
 
 
 
 
 
 
19
 
20
  try:
21
- # Handle Mode All Donghua (Input Homepage)
22
  if target_url.rstrip('/') == "https://anichin.cafe":
23
  target_url = "https://anichin.cafe/anime/?order=update"
24
  yield "⛓️ Mode Crawler: Menuju daftar anime..."
25
 
26
  yield f"🌐 Membuka: {target_url}"
27
- # Tunggu Cloudflare challenge (biasanya 5-10 detik)
28
- await page.goto(target_url, wait_until="networkidle", timeout=60000)
29
- await asyncio.sleep(5) # Jeda tambahan buat bypass
 
30
 
31
  content = await page.content()
32
- if "Just a moment" in content or "Enable JavaScript" in content:
33
- yield "❌ Cloudflare masi ngeblokir IP HF ini. Mencoba paksa scroll..."
34
- await page.mouse.wheel(0, 500)
35
- await asyncio.sleep(3)
36
- content = await page.content()
37
-
38
  soup = BeautifulSoup(content, 'html.parser')
39
 
40
- # Cek apakah halaman List Anime
41
  items = soup.select('.listupd .bsx a')
42
  if items:
43
  yield f"πŸ“‚ Ditemukan {len(items)} judul. Mulai sinkronisasi..."
@@ -46,26 +52,17 @@ async def start_scrape_generator(target_url):
46
  title = item.select_one('.tt').text.strip() if item.select_one('.tt') else "Judul"
47
  yield f"🎬 Memproses: {title}"
48
 
49
- # Simpan data dasar (Link & Title)
50
  doc_id = title.replace(' ', '_').replace('/', '-')
51
  db.collection('streaming').document(doc_id).set({
52
  "title": title,
53
  "url": link,
54
  "updated_at": firestore.SERVER_TIMESTAMP
55
- })
56
  yield "βœ… SEMUA JUDUL BERHASIL DISINKRONISASI!"
57
  else:
58
- # Logic Single Anime
59
- title_el = soup.select_one('h1.entry-title')
60
- if title_el:
61
- title = title_el.text.strip()
62
- yield f"βœ… Judul Ketemu: {title}"
63
- # ... (Logic simpan episode sama kayak sebelumnya)
64
- else:
65
- yield "❌ Gagal bypass Cloudflare atau Selector salah. IP HF lo kemungkinan besar di-ban total."
66
 
67
  except Exception as e:
68
  yield f"❌ Error: {str(e)}"
69
  finally:
70
  await browser.close()
71
-
 
1
  from playwright.async_api import async_playwright
2
+ import playwright_stealth # Import utuh biar gak ImportError
3
  from bs4 import BeautifulSoup
4
  from database import db
5
  from firebase_admin import firestore
 
9
  async def start_scrape_generator(target_url):
10
  async with async_playwright() as p:
11
  yield "πŸš€ Menjalankan Browser Stealth (Chromium)..."
12
+
13
+ # Launch browser dengan argumen tambahan buat stabilitas di HF
14
+ browser = await p.chromium.launch(
15
+ headless=True,
16
+ args=["--no-sandbox", "--disable-setuid-sandbox"]
17
+ )
18
+
19
  context = await browser.new_context(
20
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
21
  )
22
+
23
  page = await context.new_page()
24
+
25
+ # FIX DISINI: Gunakan pemanggilan langsung dari library
26
+ try:
27
+ await playwright_stealth.stealth_async(page)
28
+ except AttributeError:
29
+ # Fallback jika fungsi async tidak ditemukan
30
+ from playwright_stealth import stealth_sync
31
+ playwright_stealth.stealth_sync(page)
32
 
33
  try:
 
34
  if target_url.rstrip('/') == "https://anichin.cafe":
35
  target_url = "https://anichin.cafe/anime/?order=update"
36
  yield "⛓️ Mode Crawler: Menuju daftar anime..."
37
 
38
  yield f"🌐 Membuka: {target_url}"
39
+
40
+ # Timeout diperpanjang karena Playwright di HF butuh nafas
41
+ await page.goto(target_url, wait_until="networkidle", timeout=90000)
42
+ await asyncio.sleep(5)
43
 
44
  content = await page.content()
 
 
 
 
 
 
45
  soup = BeautifulSoup(content, 'html.parser')
46
 
 
47
  items = soup.select('.listupd .bsx a')
48
  if items:
49
  yield f"πŸ“‚ Ditemukan {len(items)} judul. Mulai sinkronisasi..."
 
52
  title = item.select_one('.tt').text.strip() if item.select_one('.tt') else "Judul"
53
  yield f"🎬 Memproses: {title}"
54
 
 
55
  doc_id = title.replace(' ', '_').replace('/', '-')
56
  db.collection('streaming').document(doc_id).set({
57
  "title": title,
58
  "url": link,
59
  "updated_at": firestore.SERVER_TIMESTAMP
60
+ }, merge=True)
61
  yield "βœ… SEMUA JUDUL BERHASIL DISINKRONISASI!"
62
  else:
63
+ yield "❌ Gagal bypass Cloudflare atau Selector salah. IP HF lo kemungkinan kena limit."
 
 
 
 
 
 
 
64
 
65
  except Exception as e:
66
  yield f"❌ Error: {str(e)}"
67
  finally:
68
  await browser.close()