Commit: "Update scraper.py" — file scraper.py changed (+24 lines, −27 lines)
|
@@ -1,5 +1,5 @@
|
|
| 1 |
from playwright.async_api import async_playwright
|
| 2 |
-
|
| 3 |
from bs4 import BeautifulSoup
|
| 4 |
from database import db
|
| 5 |
from firebase_admin import firestore
|
|
@@ -9,35 +9,41 @@ import random
|
|
| 9 |
async def start_scrape_generator(target_url):
|
| 10 |
async with async_playwright() as p:
|
| 11 |
yield "π Menjalankan Browser Stealth (Chromium)..."
|
| 12 |
-
|
| 13 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
context = await browser.new_context(
|
| 15 |
-
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/
|
| 16 |
)
|
|
|
|
| 17 |
page = await context.new_page()
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
try:
|
| 21 |
-
# Handle Mode All Donghua (Input Homepage)
|
| 22 |
if target_url.rstrip('/') == "https://anichin.cafe":
|
| 23 |
target_url = "https://anichin.cafe/anime/?order=update"
|
| 24 |
yield "βοΈ Mode Crawler: Menuju daftar anime..."
|
| 25 |
|
| 26 |
yield f"π Membuka: {target_url}"
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
await
|
|
|
|
| 30 |
|
| 31 |
content = await page.content()
|
| 32 |
-
if "Just a moment" in content or "Enable JavaScript" in content:
|
| 33 |
-
yield "β Cloudflare masi ngeblokir IP HF ini. Mencoba paksa scroll..."
|
| 34 |
-
await page.mouse.wheel(0, 500)
|
| 35 |
-
await asyncio.sleep(3)
|
| 36 |
-
content = await page.content()
|
| 37 |
-
|
| 38 |
soup = BeautifulSoup(content, 'html.parser')
|
| 39 |
|
| 40 |
-
# Cek apakah halaman List Anime
|
| 41 |
items = soup.select('.listupd .bsx a')
|
| 42 |
if items:
|
| 43 |
yield f"π Ditemukan {len(items)} judul. Mulai sinkronisasi..."
|
|
@@ -46,26 +52,17 @@ async def start_scrape_generator(target_url):
|
|
| 46 |
title = item.select_one('.tt').text.strip() if item.select_one('.tt') else "Judul"
|
| 47 |
yield f"π¬ Memproses: {title}"
|
| 48 |
|
| 49 |
-
# Simpan data dasar (Link & Title)
|
| 50 |
doc_id = title.replace(' ', '_').replace('/', '-')
|
| 51 |
db.collection('streaming').document(doc_id).set({
|
| 52 |
"title": title,
|
| 53 |
"url": link,
|
| 54 |
"updated_at": firestore.SERVER_TIMESTAMP
|
| 55 |
-
})
|
| 56 |
yield "β
SEMUA JUDUL BERHASIL DISINKRONISASI!"
|
| 57 |
else:
|
| 58 |
-
|
| 59 |
-
title_el = soup.select_one('h1.entry-title')
|
| 60 |
-
if title_el:
|
| 61 |
-
title = title_el.text.strip()
|
| 62 |
-
yield f"β
Judul Ketemu: {title}"
|
| 63 |
-
# ... (Logic simpan episode sama kayak sebelumnya)
|
| 64 |
-
else:
|
| 65 |
-
yield "β Gagal bypass Cloudflare atau Selector salah. IP HF lo kemungkinan besar di-ban total."
|
| 66 |
|
| 67 |
except Exception as e:
|
| 68 |
yield f"β Error: {str(e)}"
|
| 69 |
finally:
|
| 70 |
await browser.close()
|
| 71 |
-
|
|
|
|
import asyncio

from bs4 import BeautifulSoup
from firebase_admin import firestore
from playwright.async_api import async_playwright
import playwright_stealth  # whole-module import so a missing symbol doesn't raise ImportError at import time

from database import db
|
|
|
|
async def start_scrape_generator(target_url):
    """Scrape anichin.cafe with a stealth Chromium browser, yielding progress messages.

    Async generator: each ``yield`` is a human-readable status string (Indonesian,
    with emoji) intended to be streamed to a UI/log. Side effects: writes one
    Firestore document per discovered title into the ``streaming`` collection.

    Args:
        target_url: Page to scrape. If it equals the site homepage
            (``https://anichin.cafe``), it is rewritten to the "all anime,
            order by update" listing and every title found there is synced.

    Yields:
        str: progress / status / error messages.
    """
    async with async_playwright() as p:
        yield "🚀 Menjalankan Browser Stealth (Chromium)..."

        # Extra flags for stability in containerized hosts (e.g. HF Spaces),
        # where Chromium's sandbox cannot be used.
        browser = await p.chromium.launch(
            headless=True,
            args=["--no-sandbox", "--disable-setuid-sandbox"],
        )

        context = await browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            )
        )
        page = await context.new_page()

        # Apply stealth patches to reduce bot detection.
        # BUG FIX: the previous fallback called playwright_stealth.stealth_sync(page)
        # on an *async* Page — stealth_sync expects the sync API and raises, so the
        # "fallback" crashed instead of recovering. Degrade gracefully: if the async
        # entry point is missing (old/new package layout), continue without stealth.
        try:
            await playwright_stealth.stealth_async(page)
        except (AttributeError, ImportError):
            yield "⚠️ stealth_async tidak tersedia; lanjut tanpa stealth patch."

        try:
            # "All donghua" mode: a bare homepage URL means "crawl the update list".
            if target_url.rstrip('/') == "https://anichin.cafe":
                target_url = "https://anichin.cafe/anime/?order=update"
                yield "⚙️ Mode Crawler: Menuju daftar anime..."

            yield f"🌐 Membuka: {target_url}"

            # Generous timeout: headless Chromium on shared hosts is slow, and
            # networkidle waits for Cloudflare's JS challenge to settle.
            await page.goto(target_url, wait_until="networkidle", timeout=90000)
            await asyncio.sleep(5)

            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')

            # Listing-page card links (theme-specific selector).
            items = soup.select('.listupd .bsx a')
            if items:
                yield f"🔍 Ditemukan {len(items)} judul. Mulai sinkronisasi..."

                # NOTE(review): the loop header and the origin of `link` were
                # elided in the diff this was reconstructed from — confirm
                # against the full file that `link` comes from the anchor href.
                for item in items:
                    link = item.get('href', '')  # assumed source of `link` — TODO confirm
                    title = item.select_one('.tt').text.strip() if item.select_one('.tt') else "Judul"
                    yield f"🎬 Memproses: {title}"

                    # Title doubles as the document id; strip path-hostile chars.
                    doc_id = title.replace(' ', '_').replace('/', '-')
                    db.collection('streaming').document(doc_id).set({
                        "title": title,
                        "url": link,
                        "updated_at": firestore.SERVER_TIMESTAMP,
                    }, merge=True)  # merge: don't clobber fields written elsewhere
                yield "✅ SEMUA JUDUL BERHASIL DISINKRONISASI!"
            else:
                yield "❌ Gagal bypass Cloudflare atau Selector salah. IP HF lo kemungkinan kena limit."

        except Exception as e:
            # Surface any failure to the consumer instead of crashing the stream.
            yield f"❌ Error: {str(e)}"
        finally:
            await browser.close()
|
|