Commit: "Update scraper.py" — file scraper.py changed (+24 lines, −27 lines)
|
@@ -1,5 +1,5 @@
|
|
| 1 |
from playwright.async_api import async_playwright
|
| 2 |
-
|
| 3 |
from bs4 import BeautifulSoup
|
| 4 |
from database import db
|
| 5 |
from firebase_admin import firestore
|
|
@@ -9,35 +9,41 @@ import random
|
|
| 9 |
async def start_scrape_generator(target_url):
|
| 10 |
async with async_playwright() as p:
|
| 11 |
yield "π Menjalankan Browser Stealth (Chromium)..."
|
| 12 |
-
|
| 13 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
context = await browser.new_context(
|
| 15 |
-
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/
|
| 16 |
)
|
|
|
|
| 17 |
page = await context.new_page()
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
try:
|
| 21 |
-
# Handle Mode All Donghua (Input Homepage)
|
| 22 |
if target_url.rstrip('/') == "https://anichin.cafe":
|
| 23 |
target_url = "https://anichin.cafe/anime/?order=update"
|
| 24 |
yield "βοΈ Mode Crawler: Menuju daftar anime..."
|
| 25 |
|
| 26 |
yield f"π Membuka: {target_url}"
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
await
|
|
|
|
| 30 |
|
| 31 |
content = await page.content()
|
| 32 |
-
if "Just a moment" in content or "Enable JavaScript" in content:
|
| 33 |
-
yield "β Cloudflare masi ngeblokir IP HF ini. Mencoba paksa scroll..."
|
| 34 |
-
await page.mouse.wheel(0, 500)
|
| 35 |
-
await asyncio.sleep(3)
|
| 36 |
-
content = await page.content()
|
| 37 |
-
|
| 38 |
soup = BeautifulSoup(content, 'html.parser')
|
| 39 |
|
| 40 |
-
# Cek apakah halaman List Anime
|
| 41 |
items = soup.select('.listupd .bsx a')
|
| 42 |
if items:
|
| 43 |
yield f"π Ditemukan {len(items)} judul. Mulai sinkronisasi..."
|
|
@@ -46,26 +52,17 @@ async def start_scrape_generator(target_url):
|
|
| 46 |
title = item.select_one('.tt').text.strip() if item.select_one('.tt') else "Judul"
|
| 47 |
yield f"π¬ Memproses: {title}"
|
| 48 |
|
| 49 |
-
# Simpan data dasar (Link & Title)
|
| 50 |
doc_id = title.replace(' ', '_').replace('/', '-')
|
| 51 |
db.collection('streaming').document(doc_id).set({
|
| 52 |
"title": title,
|
| 53 |
"url": link,
|
| 54 |
"updated_at": firestore.SERVER_TIMESTAMP
|
| 55 |
-
})
|
| 56 |
yield "β
SEMUA JUDUL BERHASIL DISINKRONISASI!"
|
| 57 |
else:
|
| 58 |
-
|
| 59 |
-
title_el = soup.select_one('h1.entry-title')
|
| 60 |
-
if title_el:
|
| 61 |
-
title = title_el.text.strip()
|
| 62 |
-
yield f"β
Judul Ketemu: {title}"
|
| 63 |
-
# ... (Logic simpan episode sama kayak sebelumnya)
|
| 64 |
-
else:
|
| 65 |
-
yield "β Gagal bypass Cloudflare atau Selector salah. IP HF lo kemungkinan besar di-ban total."
|
| 66 |
|
| 67 |
except Exception as e:
|
| 68 |
yield f"β Error: {str(e)}"
|
| 69 |
finally:
|
| 70 |
await browser.close()
|
| 71 |
-
|
|
|
|
import asyncio

from bs4 import BeautifulSoup
from firebase_admin import firestore
from playwright.async_api import async_playwright
import playwright_stealth  # whole-module import so a missing symbol doesn't raise ImportError at import time

from database import db
|
|
|
|
async def start_scrape_generator(target_url):
    """Scrape anichin.cafe with a stealth Chromium browser, yielding progress messages.

    Async generator: each ``yield`` is a human-readable status string (Indonesian,
    with emoji) intended to be streamed to a UI/log. Side effects: writes one
    Firestore document per discovered title into the ``streaming`` collection.

    Args:
        target_url: Page to scrape. If it equals the site homepage
            (``https://anichin.cafe``), it is rewritten to the "all anime,
            order by update" listing and every title found there is synced.

    Yields:
        str: progress / status / error messages.
    """
    async with async_playwright() as p:
        yield "🚀 Menjalankan Browser Stealth (Chromium)..."

        # Extra flags for stability in containerized hosts (e.g. HF Spaces),
        # where Chromium's sandbox cannot be used.
        browser = await p.chromium.launch(
            headless=True,
            args=["--no-sandbox", "--disable-setuid-sandbox"],
        )

        context = await browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            )
        )
        page = await context.new_page()

        # Apply stealth patches to reduce bot detection.
        # BUG FIX: the previous fallback called playwright_stealth.stealth_sync(page)
        # on an *async* Page — stealth_sync expects the sync API and raises, so the
        # "fallback" crashed instead of recovering. Degrade gracefully: if the async
        # entry point is missing (old/new package layout), continue without stealth.
        try:
            await playwright_stealth.stealth_async(page)
        except (AttributeError, ImportError):
            yield "⚠️ stealth_async tidak tersedia; lanjut tanpa stealth patch."

        try:
            # "All donghua" mode: a bare homepage URL means "crawl the update list".
            if target_url.rstrip('/') == "https://anichin.cafe":
                target_url = "https://anichin.cafe/anime/?order=update"
                yield "⚙️ Mode Crawler: Menuju daftar anime..."

            yield f"🌐 Membuka: {target_url}"

            # Generous timeout: headless Chromium on shared hosts is slow, and
            # networkidle waits for Cloudflare's JS challenge to settle.
            await page.goto(target_url, wait_until="networkidle", timeout=90000)
            await asyncio.sleep(5)

            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')

            # Listing-page card links (theme-specific selector).
            items = soup.select('.listupd .bsx a')
            if items:
                yield f"🔍 Ditemukan {len(items)} judul. Mulai sinkronisasi..."

                # NOTE(review): the loop header and the origin of `link` were
                # elided in the diff this was reconstructed from — confirm
                # against the full file that `link` comes from the anchor href.
                for item in items:
                    link = item.get('href', '')  # assumed source of `link` — TODO confirm
                    title = item.select_one('.tt').text.strip() if item.select_one('.tt') else "Judul"
                    yield f"🎬 Memproses: {title}"

                    # Title doubles as the document id; strip path-hostile chars.
                    doc_id = title.replace(' ', '_').replace('/', '-')
                    db.collection('streaming').document(doc_id).set({
                        "title": title,
                        "url": link,
                        "updated_at": firestore.SERVER_TIMESTAMP,
                    }, merge=True)  # merge: don't clobber fields written elsewhere
                yield "✅ SEMUA JUDUL BERHASIL DISINKRONISASI!"
            else:
                yield "❌ Gagal bypass Cloudflare atau Selector salah. IP HF lo kemungkinan kena limit."

        except Exception as e:
            # Surface any failure to the consumer instead of crashing the stream.
            yield f"❌ Error: {str(e)}"
        finally:
            await browser.close()
|
|