# QJMKWB2 / scraper4.py
# QJMKWB's picture
# Update scraper4.py
# 586d343 verified
import asyncio
import urllib.parse
import json
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import re
# Base URL of the scraped site; the search, detail and stream URLs in this
# file are all built by appending a path to it.
DOMENA = "https://mrkaj.me"
class TorBrowserManager:
_instance = None
def __init__(self):
self.playwright = None
self.context = None
self._setup_done = False
self._lock = asyncio.Lock()
@classmethod
def get_instance(cls):
if cls._instance is None:
cls._instance = cls()
return cls._instance
async def reset_identity(self):
"""Reštartuje prehliadač a vyžiada novú IP"""
try:
# 1. Zavrieme starý kontext (vyčistí cache spojení)
if self.context:
await self.context.close()
# 2. Vyžiadame novú identitu od Toru
reader, writer = await asyncio.open_connection('127.0.0.1', 9051)
writer.write(b'AUTHENTICATE ""\r\n')
writer.write(b'SIGNAL NEWNYM\r\n')
writer.write(b'QUIT\r\n')
await writer.drain()
writer.close()
print("[TOR] Vyžiadaná nová IP. Reštartujem prehliadač...")
await asyncio.sleep(8)
# 3. Znova vytvoríme kontext (úplne nové spojenie)
self.context = await self.playwright.chromium.launch_persistent_context(
"/tmp/playwright_tor",
headless=True,
proxy={"server": "socks5://127.0.0.1:9050"},
args=["--no-sandbox", "--disable-setuid-sandbox"]
)
await self.get_current_ip()
except Exception as e:
print(f"[TOR RESET ERROR] {e}")
async def get_current_ip(self):
page = await self.context.new_page()
try:
await page.goto("https://api.ipify.org", timeout=20000)
ip = (await page.inner_text("body")).strip()
print(f"[TOR] Aktuálna IP adresa: {ip}")
return ip
except Exception as e:
print(f"[TOR IP ERROR] {e}")
return "unknown"
finally:
await page.close()
async def start(self):
async with self._lock:
if self._setup_done:
return
print("[TOR] Inicializujem prehliadač cez Tor SOCKS5...")
self.playwright = await async_playwright().start()
self.context = await self.playwright.chromium.launch_persistent_context(
"/tmp/playwright_tor",
headless=True,
proxy={"server": "socks5://127.0.0.1:9050"},
args=["--no-sandbox", "--disable-setuid-sandbox"]
)
await asyncio.sleep(10)
await self.get_current_ip()
self._setup_done = True
async def get_page(self):
if not self._setup_done:
await self.start()
return await self.context.new_page()
async def goto_with_fallback(self, page, url, timeout=60000):
try:
await page.goto(url, wait_until="domcontentloaded", timeout=timeout)
return True
except Exception as e:
print(f"[TOR GOTO ERROR] {e}")
return False
# Shared browser manager used by the scraping functions defined below.
manager = TorBrowserManager.get_instance()
async def search_movies(query):
    """Query the site's JSON search endpoint through Tor.

    Up to four attempts are made.  After every failed attempt (navigation
    error, body of 10 characters or fewer once stripped, or any exception) the
    Tor identity is reset before retrying.

    Args:
        query: Search text; it is URL-quoted before being sent.

    Returns:
        The decoded JSON body of the first successful attempt, or ``[]`` when
        all attempts fail.
    """
    print(f"[SEARCH] Hľadám cez Tor: {query}")
    # Try up to 4 different Tor circuits (identities) if the site blocks us.
    for pokus in range(4):
        # get_page() starts the browser on first use.
        page = await manager.get_page()
        try:
            url = f"{DOMENA}/se/j/json?q={urllib.parse.quote(query)}"
            if await manager.goto_with_fallback(page, url):
                content = await page.inner_text("body")
                if content and len(content.strip()) > 10:
                    print(f"[TOR] ÚSPECH na {pokus+1}. pokus!")
                    return json.loads(content)
            print(f"[TOR] Pokus {pokus+1} vrátil prázdny obsah (blokovaná IP). Mením identitu...")
        except Exception as e:
            print(f"[SEARCH ERROR] Pokus {pokus+1}: {e}")
        finally:
            # Close the page on every path, including the successful return,
            # so pages do not pile up in the shared context.  A failing close
            # must neither mask the result nor abort the retry loop.
            try:
                await page.close()
            except Exception as close_err:
                print(f"[SEARCH ERROR] Pokus {pokus+1}: {close_err}")
        await manager.reset_identity()  # switch to a new IP address
    return []
async def get_details(slug, media_type):
    """Scrape the available languages and season numbers for a title.

    For ``media_type`` "movie" or "film" the ``/p/<slug>`` page is loaded; for
    anything else the ``/tv/<slug>/S01E01`` page is used.

    Returns:
        A dict ``{"languages": [...], "seasons": [...]}``.  Each season entry
        is ``{"season": <int>, "episodes": 0}``.  Both lists stay empty when
        the page cannot be loaded or parsing fails (the error is logged).
    """
    print(f"[DETAILS] Info cez Tor: {slug}")
    page = await manager.get_page()
    if media_type in ("movie", "film"):
        url = f"{DOMENA}/p/{slug}"
    else:
        url = f"{DOMENA}/tv/{slug}/S01E01"
    res = {"languages": [], "seasons": []}
    try:
        loaded = await manager.goto_with_fallback(page, url)
        if loaded:
            # Short pause before the DOM is snapshotted.
            await asyncio.sleep(2)
            html = await page.content()
            soup = BeautifulSoup(html, "html.parser")

            # Languages: text of each direct child of the keyword container.
            kw_div = soup.find("div", class_="movie-keyword")
            if kw_div:
                children = kw_div.find_all(recursive=False)
                res["languages"] = [child.get_text(strip=True) for child in children]

            # Seasons: every distinct "Séria <n>" in the page text, ascending.
            season_numbers = set(re.findall(r'Séria\s+(\d+)', soup.get_text()))
            res["seasons"].extend(
                {"season": int(number), "episodes": 0}
                for number in sorted(season_numbers, key=int)
            )
    except Exception as e:
        print(f"[DETAILS ERROR] {e}")
    finally:
        await page.close()
    return res
async def get_stream_url(slug, media_type, season=None, episode=None, lang=None, source_idx=0):
    """Open a title's player page and capture its stream / subtitle URLs.

    Network requests made by the page are inspected by ``zachyt_request``.  If
    no stream request is seen within 25 seconds, the player iframe or video
    element is clicked and the page is watched for another 15 seconds.

    Args:
        slug: Title identifier used in the page URL.
        media_type: "movie" or "film" selects ``/p/<slug>``; anything else
            selects ``/tv/<slug>/SxxEyy``.
        season: Season number, required for non-movie titles.
        episode: Episode number, required for non-movie titles.
        lang: Optional value for the ``lng`` query parameter.
        source_idx: Value for the ``source`` query parameter.

    Returns:
        A dict ``{"stream": <url or None>, "vtt": <url or None>}``.

    Raises:
        TypeError, ValueError: if ``season`` or ``episode`` cannot be converted
            to ``int`` for a non-movie title.
    """
    print(f"[STREAM] Skenujem stream cez Tor: {slug}")

    # Build the URL before a page is acquired, so that invalid season or
    # episode values raise without leaving an unclosed page behind.
    if media_type in ("movie", "film"):
        url = f"{DOMENA}/p/{slug}"
    else:
        url = f"{DOMENA}/tv/{slug}/S{int(season):02d}E{int(episode):02d}"
    params = [f"source={source_idx}"]
    if lang:
        params.append(f"lng={urllib.parse.quote(lang)}")
    url += "?" + "&".join(params)

    found = {"stream": None, "vtt": None}
    page = await manager.get_page()
    try:
        page.on("request", lambda req: zachyt_request(req, found))
        if not await manager.goto_with_fallback(page, url, timeout=90000):
            return found
        if not await _wait_for_stream(found, 25):
            # Nothing captured yet: click the player and watch a bit longer.
            iframe = page.locator("iframe#frm, video").first
            if await iframe.count() > 0:
                await iframe.click(force=True)
                await _wait_for_stream(found, 15)
    except Exception as e:
        print(f"[STREAM ERROR] {e}")
    finally:
        await page.close()
    return found


async def _wait_for_stream(found, seconds):
    """Poll once per second, up to ``seconds`` times, for ``found["stream"]``.

    Returns True as soon as a stream URL is present, otherwise False.
    """
    for _ in range(seconds):
        if found["stream"]:
            return True
        await asyncio.sleep(1)
    return bool(found["stream"])
def zachyt_request(request, found):
    """Record the first stream (.m3u8) and subtitle (.vtt) URL seen.

    Args:
        request: Object exposing the request URL as ``request.url``.
        found: Dict with keys ``"stream"`` and ``"vtt"``; updated in place.
            A key that already holds a value is never overwritten.
    """
    url = request.url
    if ".m3u8" in url and not found["stream"]:
        found["stream"] = url
    if not found["vtt"]:
        # Also compare against the URL path, so that subtitle URLs carrying a
        # query string (e.g. "sub.vtt?token=...") are recognised, just as the
        # substring check above does for playlists.
        path = urllib.parse.urlsplit(url).path
        if url.endswith(".vtt") or path.endswith(".vtt"):
            found["vtt"] = url