diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..4ad7e60e5a2774546d1fea068f0d87777d31ad9a --- /dev/null +++ b/.gitignore @@ -0,0 +1,17 @@ +venv/ +__pycache__/ +archive/ +*.db +*.log +.env +.vscode/ +.idea/ +bin/ +cache/ +logs/ +*.exe +*.img +dist/ +node_modules/ +.choreo/ +TUNNEL_TOKEN.txt diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..96a085c41ebd1080e30daf8b9cc01f502cdd37d7 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,35 @@ +FROM python:3.10-slim + +# Hugging Face Optimized - Lightweight & Stable +ENV PYTHONUNBUFFERED=1 +ENV HF_SPACE=1 + +# Install minimal system dependencies +RUN apt-get update && apt-get install -y \ + curl \ + ffmpeg \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy requirements and install +COPY requirements.txt . +RUN pip install --no-cache-dir --upgrade -r requirements.txt + +# Create a non-privileged user (Required by Hugging Face) +RUN useradd -m -u 1000 user +RUN chown -R user:user /app +USER user +ENV HOME=/home/user \ + PATH=/home/user/.local/bin:$PATH + +# Copy application code +COPY --chown=user:user . . 
+ +# Hugging Face uses port 7860 +EXPOSE 7860 +ENV PORT=7860 + +# Start the application with optimized settings for limited RAM +# We use 1 worker to keep memory usage low on the free tier +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--timeout-keep-alive", "60"] diff --git a/Dockerfile.hf b/Dockerfile.hf new file mode 100644 index 0000000000000000000000000000000000000000..c3a8e418c209eaae1868b799d8eeeb70ba2200dc --- /dev/null +++ b/Dockerfile.hf @@ -0,0 +1,35 @@ +FROM python:3.10-slim + +# Hugging Face optimized - Lightweight without Chrome +ENV PYTHONUNBUFFERED=1 +ENV SPACE_ID=huggingface +ENV HF_SPACE=1 + +# Install minimal dependencies +RUN apt-get update && apt-get install -y \ + curl \ + git \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy and install Python dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir --upgrade -r requirements.txt + +# Create user for Hugging Face +RUN useradd -m -u 1000 user +RUN chown -R user:user /app +USER user +ENV HOME=/home/user \ + PATH=/home/user/.local/bin:$PATH + +# Copy application +COPY --chown=user:user . . 
+ +# Hugging Face uses port 7860 +EXPOSE 7860 +ENV PORT=7860 + +# Start without FlareSolverr (too heavy for HF) +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"] diff --git a/Procfile b/Procfile new file mode 100644 index 0000000000000000000000000000000000000000..25f27e19e1dcc8ca7e0a456578bdcb0e3aed3fe8 --- /dev/null +++ b/Procfile @@ -0,0 +1 @@ +web: uvicorn main:app --host 0.0.0.0 --port $PORT --log-level info diff --git a/README.hf.md b/README.hf.md new file mode 100644 index 0000000000000000000000000000000000000000..2d34d72d828113241bab2ff95f9d82ee03539955 --- /dev/null +++ b/README.hf.md @@ -0,0 +1,33 @@ +--- +title: MEIH Movies API +emoji: 🎬 +colorFrom: red +colorTo: gray +sdk: docker +app_file: main.py +pinned: false +license: mit +--- + +# MEIH Movies API - Hugging Face Edition + +High-performance movie streaming API optimized for Hugging Face Spaces. + +## Features + +- Fast content scraping with curl-cffi +- Intelligent caching system +- Rate limiting for stability +- Proxy rotation support + +## API Endpoints + +- `GET /latest` - Latest movies and series +- `GET /category/{cat_id}` - Browse by category +- `GET /search?q={query}` - Search content +- `GET /details/{id}` - Get streaming links +- `GET /health` - Health check + +## Usage + +Visit the API at: `https://YOUR-SPACE-NAME.hf.space/` diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..779738920a43d1fa3822f9c01e13d4eecb4ba380 --- /dev/null +++ b/README.md @@ -0,0 +1,30 @@ +--- +title: Meih Movies API +emoji: 🎬 +colorFrom: red +colorTo: gray +sdk: docker +pinned: false +--- + +# MEIH Movies API - Hugging Face Edition + +High-performance movie streaming API optimized for Hugging Face Spaces. + +## Features + +- **Lightweight**: Optimized for 16GB RAM environments. +- **Fast**: Powered by `curl-cffi` for high-speed scraping. +- **Stable**: Automatic proxy rotation and intelligent caching. 
DB_NAME = "netflix_clone.db"

# DDL statements run by init_db(), in order. ``episodes`` comes last because
# it declares a foreign key onto ``series``.
_SCHEMA_STATEMENTS = (
    # Movies Table
    """
        CREATE TABLE IF NOT EXISTS movies (
            id TEXT PRIMARY KEY,
            title TEXT,
            poster TEXT,
            year TEXT,
            rating TEXT,
            description TEXT,
            category TEXT
        )
    """,
    # Series Table
    """
        CREATE TABLE IF NOT EXISTS series (
            id TEXT PRIMARY KEY,
            title TEXT,
            poster TEXT,
            year TEXT,
            rating TEXT,
            description TEXT,
            category TEXT
        )
    """,
    # Episodes Table
    """
        CREATE TABLE IF NOT EXISTS episodes (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            series_id TEXT,
            episode_number INTEGER,
            title TEXT,
            watch_link TEXT,
            FOREIGN KEY(series_id) REFERENCES series(id)
        )
    """,
)


async def init_db():
    """Create the ``movies``, ``series`` and ``episodes`` tables if missing.

    Opens a short-lived connection to ``DB_NAME``, executes every DDL
    statement and commits once at the end.
    """
    async with aiosqlite.connect(DB_NAME) as conn:
        for ddl in _SCHEMA_STATEMENTS:
            await conn.execute(ddl)
        await conn.commit()


async def get_db_connection():
    """Open a connection to ``DB_NAME`` that yields ``aiosqlite.Row`` rows.

    The connection is returned open; closing it is left to the caller.
    """
    conn = await aiosqlite.connect(DB_NAME)
    conn.row_factory = aiosqlite.Row
    return conn
+setup_and_run.bat +*.md +.gemini/ +.agent/ diff --git a/deploy/Dockerfile b/deploy/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..535f403193c571249350df967811a512aa9f92e9 --- /dev/null +++ b/deploy/Dockerfile @@ -0,0 +1,50 @@ +# ========================================== +# Nitro Backend-Only Dockerfile for Hugging Face +# ========================================== +FROM python:3.11-slim + +# Install system dependencies for Scraper (Chrome) and FlareSolverr +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && apt-get install -y \ + ffmpeg \ + curl \ + git \ + wget \ + gnupg \ + xvfb \ + xauth \ + dos2unix \ + libnss3 \ + libatk-bridge2.0-0 \ + libgtk-3-0 \ + && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /usr/share/keyrings/google-chrome.gpg \ + && echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \ + && apt-get update && apt-get install -y google-chrome-stable \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Install Backend Dependencies +COPY backend/requirements.txt ./ +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r requirements.txt + +# Copy Backend Application +COPY backend/ ./ + +# Fix line endings and permissions +RUN dos2unix start.sh && chmod +x start.sh + +# Create local user for Hugging Face Spaces (UID 1000) +RUN useradd -m -u 1000 user +RUN chown -R user:user /app +USER user +ENV HOME=/home/user \ + PATH=/home/user/.local/bin:$PATH \ + PYTHONPATH=/app + +# Expose the mandatory Hugging Face Space port +EXPOSE 7860 + +# Kickstart the engine +CMD ["/bin/bash", "./start.sh"] diff --git a/deploy/cloudflare-worker.js b/deploy/cloudflare-worker.js new file mode 100644 index 0000000000000000000000000000000000000000..a8ad932ae367119f55e35a913af68c27c4808a52 --- /dev/null +++ b/deploy/cloudflare-worker.js @@ -0,0 +1,77 @@ 
/**
 * Cloudflare Worker - Proxy Bypass for Larooza Scraper
 * Deploy this to Cloudflare Workers (100% FREE)
 *
 * This worker acts as a middle-man to bypass IP bans
 */

addEventListener('fetch', event => {
  event.respondWith(handleRequest(event.request))
})

/**
 * Fetch the page named by the `url` query parameter and relay its body.
 *
 * Responses:
 *  - OPTIONS request        -> empty body with the CORS headers (preflight)
 *  - missing `url`          -> 400 JSON error
 *  - malformed / non-http(s) `url` -> 400 JSON error
 *  - upstream fetch throws  -> 500 JSON error
 *  - otherwise              -> upstream body as text/html with the upstream status
 *
 * @param {Request} request incoming worker request
 * @returns {Promise<Response>}
 */
async function handleRequest(request) {
  // Enable CORS
  const corsHeaders = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET, POST, OPTIONS',
    'Access-Control-Allow-Headers': 'Content-Type',
  }

  // Small helper so every error path returns the same JSON + CORS shape.
  const jsonError = (status, payload) =>
    new Response(JSON.stringify(payload), {
      status,
      headers: { ...corsHeaders, 'Content-Type': 'application/json' }
    })

  // Handle CORS preflight
  if (request.method === 'OPTIONS') {
    return new Response(null, { headers: corsHeaders })
  }

  // Get target URL from query parameter
  const targetUrl = new URL(request.url).searchParams.get('url')
  if (!targetUrl) {
    return jsonError(400, { error: 'Missing url parameter' })
  }

  // The target comes straight from the caller: reject anything that is not
  // a well-formed http(s) URL as a client error instead of letting fetch()
  // throw and surface it as a 500.
  let parsedTarget
  try {
    parsedTarget = new URL(targetUrl)
  } catch (err) {
    return jsonError(400, { error: 'Invalid url parameter' })
  }
  if (parsedTarget.protocol !== 'http:' && parsedTarget.protocol !== 'https:') {
    return jsonError(400, { error: 'Only http and https URLs are supported' })
  }

  try {
    // Fetch the target URL with realistic headers
    const response = await fetch(targetUrl, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'ar,en-US;q=0.9,en;q=0.8',
        'Referer': 'https://www.google.com/',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
      },
      cf: {
        // Cloudflare-specific options
        cacheTtl: 300, // Cache for 5 minutes
        cacheEverything: true,
      }
    })

    // Get the HTML content
    const html = await response.text()

    // Return with CORS headers
    return new Response(html, {
      status: response.status,
      headers: {
        ...corsHeaders,
        'Content-Type': 'text/html; charset=utf-8',
        'Cache-Control': 'public, max-age=300',
      }
    })
  } catch (error) {
    return jsonError(500, {
      error: 'Failed to fetch target URL',
      message: error.message
    })
  }
}
logger = logging.getLogger(__name__)

# Substrings that mark a URL as pointing at a Larooza host.
_LAROOZA_MARKERS = ('larozavideo', 'larooza', 'laroza')


class VideoDownloader:
    """Resolve direct download links for a video URL.

    Larooza links (and local ``/watch/`` / ``/details/`` links that wrap
    them) are resolved through the project scraper; every other URL, and
    any Larooza link the scraper could not resolve, goes through yt-dlp.
    """

    def __init__(self):
        # Base yt-dlp options shared by every extraction.
        self.ydl_opts = {
            'quiet': True,
            'no_warnings': True,
            'format': 'best',
            # yt-dlp takes the User-Agent through ``http_headers``; a bare
            # ``user_agent`` key is not an option it reads.
            'http_headers': {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            },
            'geo_bypass': True,
            # The option is spelled ``noplaylist`` (``no_playlist`` is ignored).
            'noplaylist': True,
            'nocheckcertificate': True,
        }

    async def get_info(self, url: str):
        """Return metadata and direct download formats for *url*.

        Returns:
            On success, a dict with ``title``, ``thumbnail``, ``duration``,
            ``uploader``, ``source`` and ``formats``. On failure, a dict
            with a single ``error`` key holding a user-facing message.
        """
        # 1. Handle Local Watch/Details Links or Direct Larooza Links
        if "/watch/" in url or "/details/" in url or self._is_larooza(url):
            try:
                larooza_result = await self._larooza_info(url)
            except Exception as e:
                # Best effort: fall through to the generic yt-dlp path.
                logger.error(f"Larooza-specific extraction failed: {e}")
            else:
                if larooza_result is not None:
                    return larooza_result

        # 2. Universal yt-dlp Path (YouTube, TikTok, etc.)
        try:
            loop = asyncio.get_running_loop()
            # yt-dlp is blocking, so run it in the default executor and cap
            # the wait at 30 seconds.
            try:
                info = await asyncio.wait_for(
                    loop.run_in_executor(None, self._extract, url),
                    timeout=30.0,
                )
            except asyncio.TimeoutError:
                logger.error(f"Timeout extracting info for {url}")
                return {"error": "استغرق استخراج البيانات وقتاً طويلاً. حاول مرة أخرى."}

            if not info:
                return {"error": "فشل في استخراج بيانات الفيديو. تأكد من الرابط."}

            # Live stream check
            if info.get('is_live') or info.get('live_status') == 'is_upcoming':
                return {"error": "هذا الفيديو لم يبدأ عرضه بعد أو هو بث مباشر حالياً."}

            formats = self._collect_formats(info)
            if not formats:
                return {"error": "لم يتم العثور على روابط تحميل مباشرة مدعومة لهذا الفيديو."}

            return {
                'title': info.get('title', 'Video'),
                'thumbnail': info.get('thumbnail', ''),
                'duration': info.get('duration', 0),
                'uploader': info.get('uploader', 'Unknown'),
                'source': info.get('extractor_key', 'Unknown'),
                'formats': formats[::-1],
            }
        except Exception as e:
            logger.error(f"Universal Downloader error for {url}: {e}")
            return {"error": f"حدث خطأ غير متوقع: {str(e)}"}

    @staticmethod
    def _is_larooza(url):
        """Return True when *url* contains one of the Larooza host markers."""
        return any(marker in url for marker in _LAROOZA_MARKERS)

    async def _larooza_info(self, url):
        """Resolve a Larooza (or local watch/details) link through the scraper.

        Returns the result dict, an ``{"error": ...}`` dict when the scraper
        found the page but no download links, or ``None`` when the link is
        not a Larooza link or the scraper returned nothing.
        """
        # Imported lazily so an unavailable scraper is handled by the
        # caller's fallback instead of failing at module import.
        from scraper.engine import scraper
        import base64

        target_url = url
        if "/watch/" in url or "/details/" in url:
            id_part = url.split("/")[-1].split("?")[0]
            if not id_part.startswith("http"):
                # Ids may arrive with their '=' padding stripped; restore it
                # so decoding does not fail on length alone.
                padded = id_part + "=" * (-len(id_part) % 4)
                target_url = base64.urlsafe_b64decode(padded).decode()

        # If it's a Larooza link (direct or decoded), use scraper
        if not self._is_larooza(target_url):
            return None

        logger.info(f"Routing Larooza link to scraper: {target_url}")
        # Normalize: downloader works better with the video.php page
        target_url = target_url.replace('play.php', 'video.php').replace('download.php', 'video.php')

        safe_id = base64.urlsafe_b64encode(target_url.encode()).decode()
        data = await scraper.fetch_details(safe_id)
        if not data:
            return None
        if not data.get('download_links'):
            return {"error": "لم يتم العثور على روابط تحميل لهذا الفيديو (ربما محمي أو غير متاح حالياً)."}

        return {
            'title': data.get('title'),
            'thumbnail': data.get('poster'),
            'duration': 0,
            'uploader': 'Larooza',
            'source': 'Larooza',
            'formats': [
                {
                    'ext': 'mp4',
                    'resolution': dl['quality'],
                    'url': dl['url'],
                    'type': 'video',
                }
                for dl in data['download_links']
            ],
        }

    @staticmethod
    def _collect_formats(info):
        """Build the list of directly downloadable formats from a yt-dlp info dict.

        Entries without a URL and HLS/DASH manifests are skipped, and only
        the first entry per (resolution, has-video) pair is kept.
        """
        raw_formats = info.get('formats') or []
        if not raw_formats and info.get('url'):
            raw_formats = [info]  # For direct links

        formats = []
        seen_resolutions = set()
        for f in raw_formats:
            if not f:
                continue
            # Filter out formats without a direct URL or those that are just manifests
            f_url = f.get('url')
            if not f_url or '.m3u8' in f_url or '.mpd' in f_url:
                continue

            res = f.get('resolution') or f.get('format_note') or f.get('height') or 'Unknown'
            # Clean resolution label
            if isinstance(res, int):
                res = f"{res}p"

            has_video = f.get('vcodec') != 'none'
            # Avoid duplicates and prioritize video formats
            res_key = f"{res}_{has_video}"
            if res_key in seen_resolutions:
                continue
            seen_resolutions.add(res_key)

            formats.append({
                'id': f.get('format_id', 'unknown'),
                'ext': f.get('ext', 'mp4'),
                'resolution': res,
                'filesize': f.get('filesize') or f.get('filesize_approx') or 0,
                'url': f_url,
                'type': 'video' if has_video else 'audio',
            })
        return formats

    def _extract(self, url):
        """Run yt-dlp metadata extraction (no download) for *url*; blocking."""
        opts = self.ydl_opts.copy()
        # Add extra robustness for TikTok and newer sites
        opts.update({
            'nocheckcertificate': True,
            'ignoreerrors': True,
            'socket_timeout': 15,
        })
        with yt_dlp.YoutubeDL(opts) as ydl:
            return ydl.extract_info(url, download=False)


downloader = VideoDownloader()
def error_plugin(callback):
    """
    Bottle plugin to handle exceptions
    https://stackoverflow.com/a/32764250

    Any exception raised by the wrapped callback is logged together with
    its traceback and converted into an ``{"error": <message>}`` payload
    with the response status set to 500.
    """

    def wrapper(*args, **kwargs):
        try:
            return callback(*args, **kwargs)
        except Exception as e:
            # logging.exception records the traceback, which a plain
            # logging.error(str(e)) would discard.
            logging.exception(str(e))
            response.status = 500
            return {"error": str(e)}

    return wrapper


def logger_plugin(callback):
    """
    Bottle plugin to use logging module
    https://bottlepy.org/docs/dev/plugindev.html

    Wrap a Bottle request so that a log line is emitted after it's handled.
    (This decorator can be extended to take the desired logger as a param.)
    Requests whose URL ends with ``/health`` are not logged.
    """

    def wrapper(*args, **kwargs):
        actual_response = callback(*args, **kwargs)
        if not request.url.endswith("/health"):
            # Lazy %-style arguments: the message is only formatted when the
            # INFO level is actually enabled.
            logging.info('%s %s %s %s',
                         request.remote_addr,
                         request.method,
                         request.url,
                         response.status)
        return actual_response

    return wrapper
PROMETHEUS_ENABLED = os.environ.get('PROMETHEUS_ENABLED', 'false').lower() == 'true'
PROMETHEUS_PORT = int(os.environ.get('PROMETHEUS_PORT', 8192))


def setup():
    """Start the metrics HTTP server on ``PROMETHEUS_PORT`` when metrics are enabled."""
    if PROMETHEUS_ENABLED:
        start_metrics_http_server(PROMETHEUS_PORT)


def prometheus_plugin(callback):
    """
    Bottle plugin to expose Prometheus metrics
    https://bottlepy.org/docs/dev/plugindev.html

    The wrapped callback's response is always returned unchanged; metric
    export failures are logged as warnings and never reach the client.
    """
    def wrapper(*args, **kwargs):
        actual_response = callback(*args, **kwargs)

        if PROMETHEUS_ENABLED:
            try:
                export_metrics(actual_response)
            except Exception as e:
                logging.warning("Error exporting metrics: " + str(e))

        return actual_response

    def export_metrics(actual_response):
        """Record request duration and outcome for one handled response."""
        res = V1ResponseBase(actual_response)

        if res.startTimestamp is None or res.endTimestamp is None:
            # skip management and healthcheck endpoints
            return

        domain = "unknown"
        if res.solution and res.solution.url:
            domain = parse_domain_url(res.solution.url)
        else:
            # timeout error
            # request.json may be None (no JSON body); fall back to an
            # empty request instead of failing inside the DTO constructor.
            req = V1RequestBase(request.json or {})
            if req.url:
                domain = parse_domain_url(req.url)

        # Timestamps are in milliseconds; the histogram is observed in seconds.
        run_time = (res.endTimestamp - res.startTimestamp) / 1000
        REQUEST_DURATION.labels(domain=domain).observe(run_time)

        # A response without a message must still be counted, so treat a
        # missing message as an empty string.
        message = res.message or ""
        result = "unknown"
        if message == "Challenge solved!":
            result = "solved"
        elif message == "Challenge not detected!":
            result = "not_detected"
        elif message.startswith("Error"):
            result = "error"
        REQUEST_COUNTER.labels(domain=domain, result=result).inc()

    def parse_domain_url(url):
        """Return the hostname of *url*, or ``"unknown"`` when it has none."""
        return urllib.parse.urlparse(url).hostname or "unknown"

    return wrapper
def _project_path(name):
    """Return the path of *name* inside the parent folder of this script's folder."""
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, name)


def clean_files():
    """Delete the ``build``, ``dist`` and ``dist_chrome`` folders of a previous build.

    Missing folders (and any other removal error) are ignored.
    """
    for folder in ('build', 'dist', 'dist_chrome'):
        shutil.rmtree(_project_path(folder), ignore_errors=True)


def download_chromium():
    """Download a pinned Chromium snapshot and unpack it to ``dist_chrome/chrome``.

    Raises:
        requests.HTTPError: when the snapshot download returns an error status.
        FileExistsError: when ``dist_chrome`` already exists.
    """
    # https://commondatastorage.googleapis.com/chromium-browser-snapshots/index.html?prefix=Linux_x64/
    # The same revision is pinned for Windows and Linux. The most recent one
    # is published at .../chromium-browser-snapshots/{arch}/LAST_CHANGE.
    revision = '1522586'
    is_windows = os.name == 'nt'
    arch = 'Win_x64' if is_windows else 'Linux_x64'
    dl_file = 'chrome-win' if is_windows else 'chrome-linux'
    dl_path = _project_path('dist_chrome')
    dl_path_folder = os.path.join(dl_path, dl_file)
    dl_path_zip = dl_path_folder + '.zip'

    print("Downloading revision: " + revision)

    os.mkdir(dl_path)
    # A timeout keeps the build from hanging forever on a stalled connection.
    with requests.get(
            f'https://commondatastorage.googleapis.com/chromium-browser-snapshots/{arch}/{revision}/{dl_file}.zip',
            stream=True, timeout=30) as r:
        r.raise_for_status()
        with open(dl_path_zip, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    print("File downloaded: " + dl_path_zip)
    with zipfile.ZipFile(dl_path_zip, 'r') as zip_ref:
        zip_ref.extractall(dl_path)
    os.remove(dl_path_zip)

    chrome_path = os.path.join(dl_path, "chrome")
    shutil.move(dl_path_folder, chrome_path)
    print("Extracted in: " + chrome_path)

    if not is_windows:
        # Give executable permissions for *nix
        # file * | grep executable | cut -d: -f1
        print("Giving executable permissions...")
        execs = ['chrome', 'chrome_crashpad_handler', 'chrome_sandbox', 'chrome-wrapper', 'xdg-mime', 'xdg-settings']
        for exec_file in execs:
            os.chmod(os.path.join(chrome_path, exec_file), 0o755)


def run_pyinstaller():
    """Build the executable with PyInstaller, bundling package.json and Chromium.

    Raises:
        Exception: when PyInstaller exits with a non-zero status (its
            stderr is printed first).
    """
    # --add-data uses ';' as source/destination separator on Windows, ':' elsewhere.
    sep = ';' if os.name == 'nt' else ':'
    # NOTE(review): cwd is os.pardir relative to the process working
    # directory, not to this file - presumably the script is launched from
    # its own folder; confirm.
    result = subprocess.run([sys.executable, "-m", "PyInstaller",
                             "--icon", "resources/flaresolverr_logo.ico",
                             "--add-data", f"package.json{sep}.",
                             "--add-data", f"{os.path.join('dist_chrome', 'chrome')}{sep}chrome",
                             os.path.join("src", "flaresolverr.py")],
                            cwd=os.pardir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if result.returncode != 0:
        print(result.stderr.decode('utf-8'))
        raise Exception("Error running pyInstaller")


def compress_package():
    """Move the PyInstaller output into ``dist/package`` and archive it.

    Produces a ``.zip`` on Windows and a ``.tar.gz`` elsewhere; tar entries
    have their owner and group information reset.
    """
    dist_folder = _project_path('dist')
    package_folder = os.path.join(dist_folder, 'package')
    shutil.move(os.path.join(dist_folder, 'flaresolverr'), os.path.join(package_folder, 'flaresolverr'))
    print("Package folder: " + package_folder)

    compr_format = 'zip' if os.name == 'nt' else 'gztar'
    compr_file_name = 'flaresolverr_windows_x64' if os.name == 'nt' else 'flaresolverr_linux_x64'
    compr_file_path = os.path.join(dist_folder, compr_file_name)

    if compr_format == 'zip':
        shutil.make_archive(compr_file_path, compr_format, package_folder)
        print("Compressed file path: " + compr_file_path)
        return

    def _reset_tarinfo(tarinfo):
        # Strip the building user's identity from the archive entries.
        tarinfo.uid = 0
        tarinfo.gid = 0
        tarinfo.uname = ""
        tarinfo.gname = ""
        return tarinfo

    tar_path = compr_file_path + '.tar.gz'
    with tarfile.open(tar_path, 'w:gz') as tar:
        for entry in os.listdir(package_folder):
            fullpath = os.path.join(package_folder, entry)
            tar.add(fullpath, arcname=entry, filter=_reset_tarinfo)
    print("Compressed file path: " + tar_path)
STATUS_OK = "ok"
STATUS_ERROR = "error"


class ChallengeResolutionResultT:
    """Result of a solved (or attempted) challenge, populated from a dict."""

    url: str = None
    status: int = None
    headers: list = None
    response: str = None
    cookies: list = None
    userAgent: str = None
    # Annotated like every other field: a ``str | None`` annotation is
    # evaluated when the class is created and raises TypeError on
    # Python 3.9, which the entry point still accepts.
    screenshot: str = None
    turnstile_token: str = None

    def __init__(self, _dict):
        # Keys of ``_dict`` become instance attributes; keys that are absent
        # fall back to the class-level ``None`` defaults.
        self.__dict__.update(_dict)


class ChallengeResolutionT:
    """Challenge outcome: status, message and an optional nested result."""

    status: str = None
    message: str = None
    result: ChallengeResolutionResultT = None

    def __init__(self, _dict):
        self.__dict__.update(_dict)
        # A nested result is wrapped into its DTO.
        if self.result is not None:
            self.result = ChallengeResolutionResultT(self.result)


class V1RequestBase(object):
    """Body of a ``POST /v1`` request, populated from the decoded JSON dict."""

    # V1RequestBase
    cmd: str = None
    cookies: list = None
    maxTimeout: int = None
    proxy: dict = None
    session: str = None
    session_ttl_minutes: int = None
    headers: list = None  # deprecated v2.0.0, not used
    userAgent: str = None  # deprecated v2.0.0, not used

    # V1Request
    url: str = None
    postData: str = None
    returnOnlyCookies: bool = None
    returnScreenshot: bool = None
    download: bool = None  # deprecated v2.0.0, not used
    returnRawHtml: bool = None  # deprecated v2.0.0, not used
    waitInSeconds: int = None
    # Optional resource blocking flag (blocks images, CSS, and fonts)
    disableMedia: bool = None
    # Optional when you've got a turnstile captcha that needs to be clicked after X number of Tab presses
    tabs_till_verify: int = None

    def __init__(self, _dict):
        self.__dict__.update(_dict)


class V1ResponseBase(object):
    """Body of a ``POST /v1`` response."""

    # V1ResponseBase
    status: str = None
    message: str = None
    session: str = None
    sessions: list[str] = None
    startTimestamp: int = None
    endTimestamp: int = None
    version: str = None

    # V1ResponseSolution
    solution: ChallengeResolutionResultT = None

    # hidden vars
    __error_500__: bool = False

    def __init__(self, _dict):
        self.__dict__.update(_dict)
        # A nested solution is wrapped into its DTO.
        if self.solution is not None:
            self.solution = ChallengeResolutionResultT(self.solution)


class IndexResponse(object):
    """Payload of the ``/`` endpoint."""

    msg: str = None
    version: str = None
    userAgent: str = None

    def __init__(self, _dict):
        self.__dict__.update(_dict)


class HealthResponse(object):
    """Payload of the ``/health`` endpoint."""

    status: str = None

    def __init__(self, _dict):
        self.__dict__.update(_dict)
env_proxy_url = os.environ.get('PROXY_URL', None)
env_proxy_username = os.environ.get('PROXY_USERNAME', None)
env_proxy_password = os.environ.get('PROXY_PASSWORD', None)


class JSONErrorBottle(Bottle):
    """
    Bottle application whose default error page (e.g. 404) is a JSON document
    """
    def default_error_handler(self, res):
        response.content_type = 'application/json'
        payload = dict(error=res.body, status_code=res.status_code)
        return json.dumps(payload)


app = JSONErrorBottle()


@app.route('/')
def index():
    """
    Show welcome message
    """
    return utils.object_to_dict(flaresolverr_service.index_endpoint())


@app.route('/health')
def health():
    """
    Healthcheck endpoint.
    This endpoint is special because it doesn't print traces
    """
    return utils.object_to_dict(flaresolverr_service.health_endpoint())


@app.post('/v1')
def controller_v1():
    """
    Controller v1

    When the request carries no proxy and PROXY_URL is set, the proxy from
    the environment is injected (with credentials if either PROXY_USERNAME
    or PROXY_PASSWORD is set). A failed command is answered with HTTP 500.
    """
    payload = request.json or {}
    if not payload.get('proxy') and env_proxy_url is not None:
        if env_proxy_username is None and env_proxy_password is None:
            logging.info('Using proxy URL ENV')
            payload['proxy'] = {"url": env_proxy_url}
        else:
            logging.info('Using proxy URL, username & password ENVs')
            payload['proxy'] = {
                "url": env_proxy_url,
                "username": env_proxy_username,
                "password": env_proxy_password,
            }
    result = flaresolverr_service.controller_v1_endpoint(V1RequestBase(payload))
    if result.__error_500__:
        response.status = 500
    return utils.object_to_dict(result)


if __name__ == "__main__":
    # check python version
    if sys.version_info < (3, 9):
        raise Exception("The Python version is less than 3.9, a version equal to or higher is required.")

    # fix for HEADLESS=false in Windows binary
    # https://stackoverflow.com/a/27694505
    if os.name == 'nt':
        import multiprocessing
        multiprocessing.freeze_support()

    # fix ssl certificates for compiled binaries
    # https://github.com/pyinstaller/pyinstaller/issues/7229
    # https://stackoverflow.com/q/55736855
    ca_bundle = certifi.where()
    os.environ["REQUESTS_CA_BUNDLE"] = ca_bundle
    os.environ["SSL_CERT_FILE"] = ca_bundle

    # validate configuration
    log_level = os.environ.get('LOG_LEVEL', 'info').upper()
    log_file = os.environ.get('LOG_FILE', None)
    log_html = utils.get_config_log_html()
    headless = utils.get_config_headless()
    server_host = os.environ.get('HOST', '0.0.0.0')
    server_port = int(os.environ.get('PORT', 8191))

    # configure logger: always log to stdout, additionally to LOG_FILE when set
    if log_level == 'DEBUG':
        logger_format = '%(asctime)s %(levelname)-8s ReqId %(thread)s %(message)s'
    else:
        logger_format = '%(asctime)s %(levelname)-8s %(message)s'
    log_handlers = [logging.StreamHandler(sys.stdout)]
    if log_file:
        log_file = os.path.realpath(log_file)
        os.makedirs(os.path.dirname(log_file), exist_ok=True)
        log_handlers.append(logging.FileHandler(log_file))
    logging.basicConfig(
        format=logger_format,
        level=log_level,
        datefmt='%Y-%m-%d %H:%M:%S',
        handlers=log_handlers
    )

    # disable warning traces from urllib3
    logging.getLogger('urllib3').setLevel(logging.ERROR)
    for noisy_logger in ('selenium.webdriver.remote.remote_connection', 'undetected_chromedriver'):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)

    logging.info(f'FlareSolverr {utils.get_flaresolverr_version()}')
    logging.debug('Debug log enabled')

    # Get current OS for global variable
    utils.get_current_platform()

    # test browser installation
    if os.environ.get('SKIP_BROWSER_TEST', 'false').lower() == 'true':
        logging.info("Skipping browser installation test for faster boot.")
    else:
        flaresolverr_service.test_browser_installation()

    # start bottle plugins
    # plugin order is important
    app.install(logger_plugin)
    app.install(error_plugin)
    prometheus_plugin.setup()
    app.install(prometheus_plugin.prometheus_plugin)

    # start webserver
    # default server 'wsgiref' does not support concurrent requests
    # https://github.com/FlareSolverr/FlareSolverr/issues/680
    # https://github.com/Pylons/waitress/issues/31
    class WaitressServerPoll(ServerAdapter):
        def run(self, handler):
            from waitress import serve
            serve(handler, host=self.host, port=self.port, asyncore_use_poll=True)

    run(app, host=server_host, port=server_port, quiet=True, server=WaitressServerPoll)
app.install(error_plugin) + prometheus_plugin.setup() + app.install(prometheus_plugin.prometheus_plugin) + + # start webserver + # default server 'wsgiref' does not support concurrent requests + # https://github.com/FlareSolverr/FlareSolverr/issues/680 + # https://github.com/Pylons/waitress/issues/31 + class WaitressServerPoll(ServerAdapter): + def run(self, handler): + from waitress import serve + serve(handler, host=self.host, port=self.port, asyncore_use_poll=True) + run(app, host=server_host, port=server_port, quiet=True, server=WaitressServerPoll) diff --git a/flaresolverr/flaresolverr_service.py b/flaresolverr/flaresolverr_service.py new file mode 100644 index 0000000000000000000000000000000000000000..814aeff40cf883088dd0ae5c16b6d025cbb9d2cc --- /dev/null +++ b/flaresolverr/flaresolverr_service.py @@ -0,0 +1,519 @@ +import logging +import platform +import sys +import time +from datetime import timedelta +from html import escape +from urllib.parse import unquote, quote + +from func_timeout import FunctionTimedOut, func_timeout +from selenium.common import TimeoutException +from selenium.webdriver.chrome.webdriver import WebDriver +from selenium.webdriver.common.by import By +from selenium.webdriver.common.keys import Keys +from selenium.webdriver.support.expected_conditions import ( + presence_of_element_located, staleness_of, title_is) +from selenium.webdriver.common.action_chains import ActionChains +from selenium.webdriver.support.wait import WebDriverWait + +import utils +from dtos import (STATUS_ERROR, STATUS_OK, ChallengeResolutionResultT, + ChallengeResolutionT, HealthResponse, IndexResponse, + V1RequestBase, V1ResponseBase) +from sessions import SessionsStorage + +ACCESS_DENIED_TITLES = [ + # Cloudflare + 'Access denied', + # Cloudflare http://bitturk.net/ Firefox + 'Attention Required! 
def test_browser_installation():
    """Log the detected browser setup and abort when it is unusable.

    The process exits with status 1 when ``utils`` reports no Chrome /
    Chromium executable path or an empty major version. Otherwise the path,
    the major version and the User-Agent are written to the log.
    """
    logging.info("Testing web browser installation...")
    logging.info("Platform: " + platform.platform())

    # Guard clause: without a browser executable nothing else can work.
    browser_path = utils.get_chrome_exe_path()
    if browser_path is None:
        logging.error("Chrome / Chromium web browser not installed!")
        sys.exit(1)
    logging.info("Chrome / Chromium path: " + browser_path)

    # Guard clause: an empty string means the version could not be detected.
    major_version = utils.get_chrome_major_version()
    if major_version == '':
        logging.error("Chrome / Chromium version not detected!")
        sys.exit(1)
    logging.info("Chrome / Chromium major version: " + major_version)

    logging.info("Launching web browser...")
    # NOTE(review): presumably get_user_agent() starts a browser when called
    # without a driver - confirm in utils.
    agent = utils.get_user_agent()
    logging.info("FlareSolverr User-Agent: " + agent)
    logging.info("Test successful!")
def _build_solution_response(req: V1RequestBase, method: str) -> V1ResponseBase:
    """Resolve the challenge for *req* and wrap the outcome in a v1 response.

    Shared tail of the 'request.get' and 'request.post' commands: warns about
    parameters that were removed in v2, runs the challenge resolution with the
    given HTTP *method* and copies status, message and solution to the response.
    """
    if req.returnRawHtml is not None:
        logging.warning("Request parameter 'returnRawHtml' was removed in FlareSolverr v2.")
    if req.download is not None:
        logging.warning("Request parameter 'download' was removed in FlareSolverr v2.")

    challenge_res = _resolve_challenge(req, method)
    res = V1ResponseBase({})
    res.status = challenge_res.status
    res.message = challenge_res.message
    res.solution = challenge_res.result
    return res


def _cmd_request_get(req: V1RequestBase) -> V1ResponseBase:
    """Handle the 'request.get' command.

    Raises:
        Exception: if 'url' is missing or 'postData' is supplied.
    """
    if req.url is None:
        raise Exception("Request parameter 'url' is mandatory in 'request.get' command.")
    if req.postData is not None:
        # The request field is called 'postData'; name it correctly so the
        # caller knows which parameter to drop.
        raise Exception("Cannot use 'postData' when sending a GET request.")

    return _build_solution_response(req, 'GET')


def _cmd_request_post(req: V1RequestBase) -> V1ResponseBase:
    """Handle the 'request.post' command.

    Raises:
        Exception: if 'postData' or 'url' is missing.
    """
    # 'postData' is checked first so requests lacking both parameters keep
    # producing the same error as before.
    if req.postData is None:
        raise Exception("Request parameter 'postData' is mandatory in 'request.post' command.")
    if req.url is None:
        # Same validation as the GET command: fail early with a clear message
        # instead of submitting a form to an undefined target.
        raise Exception("Request parameter 'url' is mandatory in 'request.post' command.")

    return _build_solution_response(req, 'POST')
def _resolve_challenge(req: V1RequestBase, method: str) -> ChallengeResolutionT:
    """Run the challenge resolution for *req* within its time budget.

    Uses the session's webdriver when ``req.session`` is set, otherwise a
    fresh webdriver that is destroyed before returning. The actual work is
    delegated to ``_evil_logic`` under ``func_timeout``.

    Args:
        req: the v1 request; ``maxTimeout`` is read as milliseconds.
        method: HTTP method forwarded to ``_evil_logic`` ('GET' or 'POST').

    Raises:
        Exception: on timeout or on any error raised while solving; the
            original exception is attached as ``__cause__``.
    """
    timeout = int(req.maxTimeout) / 1000
    driver = None
    try:
        if req.session:
            session_id = req.session
            ttl = timedelta(minutes=req.session_ttl_minutes) if req.session_ttl_minutes else None
            session, fresh = SESSIONS_STORAGE.get(session_id, ttl)

            if fresh:
                logging.debug(f"new session created to perform the request (session_id={session_id})")
            else:
                logging.debug(f"existing session is used to perform the request (session_id={session_id}, "
                              f"lifetime={str(session.lifetime())}, ttl={str(ttl)})")

            driver = session.driver
        else:
            driver = utils.get_webdriver(req.proxy)
            logging.debug('New instance of webdriver has been created to perform the request')
        return func_timeout(timeout, _evil_logic, (req, driver, method))
    except FunctionTimedOut as exc:
        raise Exception(f'Error solving the challenge. Timeout after {timeout} seconds.') from exc
    except Exception as exc:
        # Keep the original exception as the cause so the traceback is not lost.
        raise Exception('Error solving the challenge. ' + str(exc).replace('\n', '\\n')) from exc
    finally:
        # Only one-shot drivers are destroyed here; session drivers stay alive.
        if not req.session and driver is not None:
            try:
                if utils.PLATFORM_VERSION == "nt":
                    driver.close()
            finally:
                # quit() must run even when close() fails, otherwise the
                # browser process is leaked.
                driver.quit()
            logging.debug('A used instance of webdriver has been destroyed')
def _evil_logic(req: V1RequestBase, driver: WebDriver, method: str) -> ChallengeResolutionT:
    """Load the requested page, wait out any challenge and collect the result.

    Steps, in order: optionally block media URLs through CDP, navigate (GET,
    POST form, or the turnstile flow), apply request cookies and reload,
    detect "access denied" pages, detect and wait for challenge pages, then
    gather URL, cookies, user agent and (unless only cookies were requested)
    the page source.

    Args:
        req: the v1 request being served.
        driver: webdriver used for all browser interaction.
        method: "POST" submits through ``_post_request``; anything else is
            handled as a GET.

    Returns:
        A ``ChallengeResolutionT`` with status ``STATUS_OK``, a message telling
        whether a challenge was detected, and the collected result.

    Raises:
        Exception: when the page title or DOM matches an access-denied marker.
    """
    res = ChallengeResolutionT({})
    res.status = STATUS_OK
    res.message = ""

    # optionally block resources like images/css/fonts using CDP
    # the per-request flag, when present, overrides the global configuration
    disable_media = utils.get_config_disable_media()
    if req.disableMedia is not None:
        disable_media = req.disableMedia
    if disable_media:
        # both lower- and upper-case extensions are listed explicitly
        block_urls = [
            # Images
            "*.png", "*.jpg", "*.jpeg", "*.gif", "*.webp", "*.bmp", "*.svg", "*.ico",
            "*.PNG", "*.JPG", "*.JPEG", "*.GIF", "*.WEBP", "*.BMP", "*.SVG", "*.ICO",
            "*.tiff", "*.tif", "*.jpe", "*.apng", "*.avif", "*.heic", "*.heif",
            "*.TIFF", "*.TIF", "*.JPE", "*.APNG", "*.AVIF", "*.HEIC", "*.HEIF",
            # Stylesheets
            "*.css",
            "*.CSS",
            # Fonts
            "*.woff", "*.woff2", "*.ttf", "*.otf", "*.eot",
            "*.WOFF", "*.WOFF2", "*.TTF", "*.OTF", "*.EOT"
        ]
        try:
            logging.debug("Network.setBlockedURLs: %s", block_urls)
            driver.execute_cdp_cmd("Network.enable", {})
            driver.execute_cdp_cmd("Network.setBlockedURLs", {"urls": block_urls})
        except Exception:
            # if CDP commands are not available or fail, ignore and continue
            logging.debug("Network.setBlockedURLs failed or unsupported on this webdriver")

    # navigate to the page
    logging.debug(f"Navigating to... {req.url}")
    turnstile_token = None

    if method == "POST":
        _post_request(req, driver)
    else:
        # tabs_till_verify switches the GET into the turnstile-token flow
        if req.tabs_till_verify is None:
            driver.get(req.url)
        else:
            turnstile_token = _resolve_turnstile_captcha(req, driver)

    # set cookies if required
    if req.cookies is not None and len(req.cookies) > 0:
        logging.debug(f'Setting cookies...')
        for cookie in req.cookies:
            # remove any cookie of the same name before adding the requested one
            driver.delete_cookie(cookie['name'])
            driver.add_cookie(cookie)
        # reload the page
        if method == 'POST':
            _post_request(req, driver)
        else:
            driver.get(req.url)

    # wait for the page
    if utils.get_config_log_html():
        logging.debug(f"Response HTML:\n{driver.page_source}")
    # kept so a later reload can be detected through staleness_of()
    html_element = driver.find_element(By.TAG_NAME, "html")
    page_title = driver.title

    # find access denied titles
    for title in ACCESS_DENIED_TITLES:
        if page_title.startswith(title):
            raise Exception('Cloudflare has blocked this request. '
                            'Probably your IP is banned for this site, check in your web browser.')
    # find access denied selectors
    for selector in ACCESS_DENIED_SELECTORS:
        found_elements = driver.find_elements(By.CSS_SELECTOR, selector)
        if len(found_elements) > 0:
            raise Exception('Cloudflare has blocked this request. '
                            'Probably your IP is banned for this site, check in your web browser.')

    # find challenge by title
    challenge_found = False
    for title in CHALLENGE_TITLES:
        # case-insensitive exact match of the whole title
        if title.lower() == page_title.lower():
            challenge_found = True
            logging.info("Challenge detected. Title found: " + page_title)
            break
    if not challenge_found:
        # find challenge by selectors
        for selector in CHALLENGE_SELECTORS:
            found_elements = driver.find_elements(By.CSS_SELECTOR, selector)
            if len(found_elements) > 0:
                challenge_found = True
                logging.info("Challenge detected. Selector found: " + selector)
                break

    attempt = 0
    if challenge_found:
        # This loop only ends when every title and selector check passes.
        # The caller (_resolve_challenge) runs this function under
        # func_timeout, which bounds the total time.
        while True:
            try:
                attempt = attempt + 1
                # wait until the title changes
                for title in CHALLENGE_TITLES:
                    logging.debug("Waiting for title (attempt " + str(attempt) + "): " + title)
                    WebDriverWait(driver, SHORT_TIMEOUT).until_not(title_is(title))

                # then wait until all the selectors disappear
                for selector in CHALLENGE_SELECTORS:
                    logging.debug("Waiting for selector (attempt " + str(attempt) + "): " + selector)
                    WebDriverWait(driver, SHORT_TIMEOUT).until_not(
                        presence_of_element_located((By.CSS_SELECTOR, selector)))

                # all elements not found
                break

            except TimeoutException:
                logging.debug("Timeout waiting for selector")

                # the challenge is still displayed: try the verify checkbox/button
                click_verify(driver)

                # update the html (cloudflare reloads the page every 5 s)
                html_element = driver.find_element(By.TAG_NAME, "html")

        # waits until cloudflare redirection ends
        logging.debug("Waiting for redirect")
        # noinspection PyBroadException
        try:
            WebDriverWait(driver, SHORT_TIMEOUT).until(staleness_of(html_element))
        except Exception:
            # best effort: no redirect within SHORT_TIMEOUT is not an error
            logging.debug("Timeout waiting for redirect")

        logging.info("Challenge solved!")
        res.message = "Challenge solved!"
    else:
        logging.info("Challenge not detected!")
        res.message = "Challenge not detected!"

    challenge_res = ChallengeResolutionResultT({})
    challenge_res.url = driver.current_url
    challenge_res.status = 200  # todo: fix, selenium does not provide this info
    challenge_res.cookies = driver.get_cookies()
    challenge_res.userAgent = utils.get_user_agent(driver)
    challenge_res.turnstile_token = turnstile_token

    if not req.returnOnlyCookies:
        challenge_res.headers = {}  # todo: fix, selenium does not provide this info

        # NOTE(review): the optional wait is assumed to belong to this branch,
        # i.e. it is skipped when only cookies are requested - confirm.
        if req.waitInSeconds and req.waitInSeconds > 0:
            logging.info("Waiting " + str(req.waitInSeconds) + " seconds before returning the response...")
            time.sleep(req.waitInSeconds)

        challenge_res.response = driver.page_source

    if req.returnScreenshot:
        challenge_res.screenshot = driver.get_screenshot_as_base64()

    res.result = challenge_res
    return res
' + query_string = req.postData if req.postData and req.postData[0] != '?' else req.postData[1:] if req.postData else '' + pairs = query_string.split('&') + for pair in pairs: + parts = pair.split('=', 1) + # noinspection PyBroadException + try: + name = unquote(parts[0]) + except Exception: + name = parts[0] + if name == 'submit': + continue + # noinspection PyBroadException + try: + value = unquote(parts[1]) if len(parts) > 1 else '' + except Exception: + value = parts[1] if len(parts) > 1 else '' + # Protection of " character, for syntax + value=value.replace('"','"') + post_form += f'
' + post_form += '
class SessionsStorage:
    """Create, store and manage the webdriver sessions, keyed by session id."""

    def __init__(self):
        # Maps session_id -> Session.
        self.sessions = {}

    def create(self, session_id: Optional[str] = None, proxy: Optional[dict] = None,
               force_new: Optional[bool] = False) -> Tuple[Session, bool]:
        """Return the session for *session_id*, creating it when necessary.

        The call is idempotent: when *session_id* is already stored, no new
        WebDriver is started and the existing session is returned.

        Args:
            session_id: id to use; a new uuid1 string is generated when falsy.
            proxy: proxy settings forwarded to ``utils.get_webdriver``.
            force_new: destroy any existing session with this id first.

        Returns:
            ``(session, fresh)`` where *fresh* is True if a new session was
            created and False if an existing one was reused.
        """
        session_id = session_id or str(uuid1())

        if force_new:
            self.destroy(session_id)

        if self.exists(session_id):
            return self.sessions[session_id], False

        driver = utils.get_webdriver(proxy)
        created_at = datetime.now()
        session = Session(session_id, driver, created_at)

        self.sessions[session_id] = session

        return session, True

    def exists(self, session_id: str) -> bool:
        """Return True when *session_id* is currently stored."""
        return session_id in self.sessions

    def destroy(self, session_id: str) -> bool:
        """Remove the session from the storage and shut its driver down.

        Returns:
            True if the session was found and destroyed, False if
            *session_id* was not stored (the call is then a no-op).
        """
        session = self.sessions.pop(session_id, None)
        if session is None:
            return False

        try:
            if utils.PLATFORM_VERSION == "nt":
                session.driver.close()
        finally:
            # quit() must run even when close() fails, otherwise the browser
            # process of an already forgotten session is leaked.
            session.driver.quit()
        return True

    def get(self, session_id: str, ttl: Optional[timedelta] = None) -> Tuple[Session, bool]:
        """Return the session for *session_id*, recreating it once expired.

        Args:
            session_id: id of the session to fetch or create.
            ttl: maximum allowed lifetime; None disables the expiry check.

        Returns:
            ``(session, fresh)`` with the same meaning as in ``create``.
        """
        session, fresh = self.create(session_id)

        if ttl is not None and not fresh and session.lifetime() > ttl:
            logging.debug(f'session\'s lifetime has expired, so the session is recreated (session_id={session_id})')
            session, fresh = self.create(session_id, force_new=True)

        return session, fresh

    def session_ids(self) -> list[str]:
        """Return the ids of all stored sessions."""
        return list(self.sessions.keys())
self.assertEqual(res.status_code, 404) + + body = res.json + self.assertEqual("Not found: '/wrong'", body['error']) + self.assertEqual(404, body['status_code']) + + def test_index_endpoint(self): + res = self.app.get('/') + self.assertEqual(res.status_code, 200) + + body = IndexResponse(res.json) + self.assertEqual("FlareSolverr is ready!", body.msg) + self.assertEqual(utils.get_flaresolverr_version(), body.version) + self.assertIn("Chrome/", body.userAgent) + + def test_health_endpoint(self): + res = self.app.get('/health') + self.assertEqual(res.status_code, 200) + + body = HealthResponse(res.json) + self.assertEqual(STATUS_OK, body.status) + + def test_v1_endpoint_wrong_cmd(self): + res = self.app.post_json('/v1', { + "cmd": "request.bad", + "url": self.google_url + }, status=500) + self.assertEqual(res.status_code, 500) + + body = V1ResponseBase(res.json) + self.assertEqual(STATUS_ERROR, body.status) + self.assertEqual("Error: Request parameter 'cmd' = 'request.bad' is invalid.", body.message) + self.assertGreater(body.startTimestamp, 10000) + self.assertGreaterEqual(body.endTimestamp, body.startTimestamp) + self.assertEqual(utils.get_flaresolverr_version(), body.version) + + def test_v1_endpoint_request_get_no_cloudflare(self): + res = self.app.post_json('/v1', { + "cmd": "request.get", + "url": self.google_url + }) + self.assertEqual(res.status_code, 200) + + body = V1ResponseBase(res.json) + self.assertEqual(STATUS_OK, body.status) + self.assertEqual("Challenge not detected!", body.message) + self.assertGreater(body.startTimestamp, 10000) + self.assertGreaterEqual(body.endTimestamp, body.startTimestamp) + self.assertEqual(utils.get_flaresolverr_version(), body.version) + + solution = body.solution + self.assertIn(self.google_url, solution.url) + self.assertEqual(solution.status, 200) + self.assertIs(len(solution.headers), 0) + self.assertIn("Google", solution.response) + self.assertGreater(len(solution.cookies), 0) + self.assertIn("Chrome/", 
solution.userAgent) + + def test_v1_endpoint_request_get_disable_resources(self): + res = self.app.post_json("/v1", { + "cmd": "request.get", + "url": self.google_url, + "disableMedia": True + }) + self.assertEqual(res.status_code, 200) + + body = V1ResponseBase(res.json) + self.assertEqual(STATUS_OK, body.status) + self.assertEqual("Challenge not detected!", body.message) + self.assertGreater(body.startTimestamp, 10000) + self.assertGreaterEqual(body.endTimestamp, body.startTimestamp) + self.assertEqual(utils.get_flaresolverr_version(), body.version) + + solution = body.solution + self.assertIn(self.google_url, solution.url) + self.assertEqual(solution.status, 200) + self.assertIs(len(solution.headers), 0) + self.assertIn("Google", solution.response) + self.assertGreater(len(solution.cookies), 0) + self.assertIn("Chrome/", solution.userAgent) + + def test_v1_endpoint_request_get_cloudflare_js_1(self): + res = self.app.post_json('/v1', { + "cmd": "request.get", + "url": self.cloudflare_url + }) + self.assertEqual(res.status_code, 200) + + body = V1ResponseBase(res.json) + self.assertEqual(STATUS_OK, body.status) + self.assertEqual("Challenge solved!", body.message) + self.assertGreater(body.startTimestamp, 10000) + self.assertGreaterEqual(body.endTimestamp, body.startTimestamp) + self.assertEqual(utils.get_flaresolverr_version(), body.version) + + solution = body.solution + self.assertIn(self.cloudflare_url, solution.url) + self.assertEqual(solution.status, 200) + self.assertIs(len(solution.headers), 0) + self.assertIn("nowSecure", solution.response) + self.assertGreater(len(solution.cookies), 0) + self.assertIn("Chrome/", solution.userAgent) + + cf_cookie = _find_obj_by_key("name", "cf_clearance", solution.cookies) + self.assertIsNotNone(cf_cookie, "Cloudflare cookie not found") + self.assertGreater(len(cf_cookie["value"]), 30) + + def test_v1_endpoint_request_get_cloudflare_js_2(self): + res = self.app.post_json('/v1', { + "cmd": "request.get", + "url": 
self.cloudflare_url_2 + }) + self.assertEqual(res.status_code, 200) + + body = V1ResponseBase(res.json) + self.assertEqual(STATUS_OK, body.status) + self.assertEqual("Challenge solved!", body.message) + self.assertGreater(body.startTimestamp, 10000) + self.assertGreaterEqual(body.endTimestamp, body.startTimestamp) + self.assertEqual(utils.get_flaresolverr_version(), body.version) + + solution = body.solution + self.assertIn(self.cloudflare_url_2, solution.url) + self.assertEqual(solution.status, 200) + self.assertIs(len(solution.headers), 0) + self.assertIn("harry - idope torrent search", solution.response) + self.assertGreater(len(solution.cookies), 0) + self.assertIn("Chrome/", solution.userAgent) + + cf_cookie = _find_obj_by_key("name", "cf_clearance", solution.cookies) + self.assertIsNotNone(cf_cookie, "Cloudflare cookie not found") + self.assertGreater(len(cf_cookie["value"]), 30) + + def test_v1_endpoint_request_get_ddos_guard_js(self): + res = self.app.post_json('/v1', { + "cmd": "request.get", + "url": self.ddos_guard_url + }) + self.assertEqual(res.status_code, 200) + + body = V1ResponseBase(res.json) + self.assertEqual(STATUS_OK, body.status) + self.assertEqual("Challenge solved!", body.message) + self.assertGreater(body.startTimestamp, 10000) + self.assertGreaterEqual(body.endTimestamp, body.startTimestamp) + self.assertEqual(utils.get_flaresolverr_version(), body.version) + + solution = body.solution + self.assertIn(self.ddos_guard_url, solution.url) + self.assertEqual(solution.status, 200) + self.assertIs(len(solution.headers), 0) + self.assertIn("Литрес", solution.response) + self.assertGreater(len(solution.cookies), 0) + self.assertIn("Chrome/", solution.userAgent) + + cf_cookie = _find_obj_by_key("name", "__ddg1_", solution.cookies) + self.assertIsNotNone(cf_cookie, "DDOS-Guard cookie not found") + self.assertGreater(len(cf_cookie["value"]), 10) + + def test_v1_endpoint_request_get_fairlane_js(self): + res = self.app.post_json('/v1', { + "cmd": 
"request.get", + "url": self.fairlane_url + }) + self.assertEqual(res.status_code, 200) + + body = V1ResponseBase(res.json) + self.assertEqual(STATUS_OK, body.status) + self.assertEqual("Challenge solved!", body.message) + self.assertGreater(body.startTimestamp, 10000) + self.assertGreaterEqual(body.endTimestamp, body.startTimestamp) + self.assertEqual(utils.get_flaresolverr_version(), body.version) + + solution = body.solution + self.assertIn(self.fairlane_url, solution.url) + self.assertEqual(solution.status, 200) + self.assertIs(len(solution.headers), 0) + self.assertIn("<title>Rental Apartments Amsterdam", solution.response) + self.assertGreater(len(solution.cookies), 0) + self.assertIn("Chrome/", solution.userAgent) + + cf_cookie = _find_obj_by_key("name", "fl_pass_v2_b", solution.cookies) + self.assertIsNotNone(cf_cookie, "Fairlane cookie not found") + self.assertGreater(len(cf_cookie["value"]), 50) + + def test_v1_endpoint_request_get_custom_cloudflare_js(self): + res = self.app.post_json('/v1', { + "cmd": "request.get", + "url": self.custom_cloudflare_url + }) + self.assertEqual(res.status_code, 200) + + body = V1ResponseBase(res.json) + self.assertEqual(STATUS_OK, body.status) + self.assertEqual("Challenge solved!", body.message) + self.assertGreater(body.startTimestamp, 10000) + self.assertGreaterEqual(body.endTimestamp, body.startTimestamp) + self.assertEqual(utils.get_flaresolverr_version(), body.version) + + solution = body.solution + self.assertIn(self.custom_cloudflare_url, solution.url) + self.assertEqual(solution.status, 200) + self.assertIs(len(solution.headers), 0) + self.assertIn("MuziekFabriek : Aanmelden", solution.response) + self.assertGreater(len(solution.cookies), 0) + self.assertIn("Chrome/", solution.userAgent) + + cf_cookie = _find_obj_by_key("name", "ct_anti_ddos_key", solution.cookies) + self.assertIsNotNone(cf_cookie, "Custom Cloudflare cookie not found") + self.assertGreater(len(cf_cookie["value"]), 10) + + # todo: test Cmd 
'request.get' should return fail with Cloudflare CAPTCHA + + def test_v1_endpoint_request_get_cloudflare_blocked(self): + res = self.app.post_json('/v1', { + "cmd": "request.get", + "url": self.cloudflare_blocked_url + }, status=500) + self.assertEqual(res.status_code, 500) + + body = V1ResponseBase(res.json) + self.assertEqual(STATUS_ERROR, body.status) + self.assertEqual("Error: Error solving the challenge. Cloudflare has blocked this request. " + "Probably your IP is banned for this site, check in your web browser.", body.message) + self.assertGreater(body.startTimestamp, 10000) + self.assertGreaterEqual(body.endTimestamp, body.startTimestamp) + self.assertEqual(utils.get_flaresolverr_version(), body.version) + + def test_v1_endpoint_request_get_cookies_param(self): + res = self.app.post_json('/v1', { + "cmd": "request.get", + "url": self.google_url, + "cookies": [ + { + "name": "testcookie1", + "value": "testvalue1" + }, + { + "name": "testcookie2", + "value": "testvalue2" + } + ] + }) + self.assertEqual(res.status_code, 200) + + body = V1ResponseBase(res.json) + self.assertEqual(STATUS_OK, body.status) + self.assertEqual("Challenge not detected!", body.message) + self.assertGreater(body.startTimestamp, 10000) + self.assertGreaterEqual(body.endTimestamp, body.startTimestamp) + self.assertEqual(utils.get_flaresolverr_version(), body.version) + + solution = body.solution + self.assertIn(self.google_url, solution.url) + self.assertEqual(solution.status, 200) + self.assertIs(len(solution.headers), 0) + self.assertIn("Google", solution.response) + self.assertGreater(len(solution.cookies), 1) + self.assertIn("Chrome/", solution.userAgent) + + user_cookie1 = _find_obj_by_key("name", "testcookie1", solution.cookies) + self.assertIsNotNone(user_cookie1, "User cookie 1 not found") + self.assertEqual("testvalue1", user_cookie1["value"]) + + user_cookie2 = _find_obj_by_key("name", "testcookie2", solution.cookies) + self.assertIsNotNone(user_cookie2, "User cookie 2 not 
found") + self.assertEqual("testvalue2", user_cookie2["value"]) + + def test_v1_endpoint_request_get_returnOnlyCookies_param(self): + res = self.app.post_json('/v1', { + "cmd": "request.get", + "url": self.google_url, + "returnOnlyCookies": True + }) + self.assertEqual(res.status_code, 200) + + body = V1ResponseBase(res.json) + self.assertEqual(STATUS_OK, body.status) + self.assertEqual("Challenge not detected!", body.message) + self.assertGreater(body.startTimestamp, 10000) + self.assertGreaterEqual(body.endTimestamp, body.startTimestamp) + self.assertEqual(utils.get_flaresolverr_version(), body.version) + + solution = body.solution + self.assertIn(self.google_url, solution.url) + self.assertEqual(solution.status, 200) + self.assertIsNone(solution.headers) + self.assertIsNone(solution.response) + self.assertGreater(len(solution.cookies), 0) + self.assertIn("Chrome/", solution.userAgent) + + def test_v1_endpoint_request_get_proxy_http_param(self): + """ + To configure TinyProxy in local: + * sudo vim /etc/tinyproxy/tinyproxy.conf + * edit => LogFile "/tmp/tinyproxy.log" + * edit => Syslog Off + * sudo tinyproxy -d + * sudo tail -f /tmp/tinyproxy.log + """ + res = self.app.post_json('/v1', { + "cmd": "request.get", + "url": self.google_url, + "proxy": { + "url": self.proxy_url + } + }) + self.assertEqual(res.status_code, 200) + + body = V1ResponseBase(res.json) + self.assertEqual(STATUS_OK, body.status) + self.assertEqual("Challenge not detected!", body.message) + self.assertGreater(body.startTimestamp, 10000) + self.assertGreaterEqual(body.endTimestamp, body.startTimestamp) + self.assertEqual(utils.get_flaresolverr_version(), body.version) + + solution = body.solution + self.assertIn(self.google_url, solution.url) + self.assertEqual(solution.status, 200) + self.assertIs(len(solution.headers), 0) + self.assertIn("Google", solution.response) + self.assertGreater(len(solution.cookies), 0) + self.assertIn("Chrome/", solution.userAgent) + + def 
def test_v1_endpoint_request_get_proxy_socks_param(self):
    """
    A ``request.get`` routed through the SOCKS proxy at ``self.proxy_socks_url`` must succeed.

    To configure Dante in local:
      * https://linuxhint.com/set-up-a-socks5-proxy-on-ubuntu-with-dante/
      * sudo vim /etc/sockd.conf
      * sudo systemctl restart sockd.service
      * curl --socks5 socks5://127.0.0.1:1080 https://www.google.com
    """
    res = self.app.post_json('/v1', {
        "cmd": "request.get",
        "url": self.google_url,
        "proxy": {
            "url": self.proxy_socks_url
        }
    })
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge not detected!", body.message)
    self.assertGreater(body.startTimestamp, 10000)
    self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), body.version)

    solution = body.solution
    self.assertIn(self.google_url, solution.url)
    self.assertEqual(solution.status, 200)
    # Compare by value: assertIs on an int only passes thanks to CPython's
    # small-integer caching, which is an implementation detail.
    self.assertEqual(len(solution.headers), 0)
    self.assertIn("Google", solution.response)
    self.assertGreater(len(solution.cookies), 0)
    self.assertIn("Chrome/", solution.userAgent)
def test_v1_endpoint_request_post_no_cloudflare(self):
    """A ``request.post`` to ``self.post_url`` must echo both form fields.

    The page at ``self.post_url`` is expected to answer without a challenge
    and to include the submitted form fields in its response body.
    """
    res = self.app.post_json('/v1', {
        "cmd": "request.post",
        "url": self.post_url,
        # Form-encoded pairs are separated by '&'. The separator had been
        # garbled into a pilcrow, which merged param2 into param1's value
        # and contradicted the "param2" assertion below.
        "postData": "param1=value1&param2=value2"
    })
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge not detected!", body.message)
    self.assertGreater(body.startTimestamp, 10000)
    self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), body.version)

    solution = body.solution
    self.assertIn(self.post_url, solution.url)
    self.assertEqual(solution.status, 200)
    # Compare by value, not identity (assertIs relies on int caching).
    self.assertEqual(len(solution.headers), 0)
    # Compare with all whitespace removed so the check does not depend on
    # how the echo service happens to pretty-print its JSON.
    compact_response = "".join(solution.response.split())
    self.assertIn('"form":{"param1":"value1","param2":"value2"}', compact_response)
    self.assertEqual(len(solution.cookies), 0)
    self.assertIn("Chrome/", solution.userAgent)
def test_v1_endpoint_request_post_deprecated_param(self):
    """A ``request.post`` carrying the removed ``userAgent`` parameter must still succeed."""
    res = self.app.post_json('/v1', {
        "cmd": "request.post",
        "url": self.google_url,
        # Form-encoded pairs are separated by '&'; the separator had been
        # garbled into a pilcrow character.
        "postData": "param1=value1&param2=value2",
        "userAgent": "Test User-Agent"  # was removed in v2, not used
    })
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge not detected!", body.message)
def test_v1_endpoint_sessions_list(self):
    """``sessions.list`` must report a session that was created just before the call."""
    session_name = "test_list_sessions"
    self.app.post_json('/v1', {
        "cmd": "sessions.create",
        "session": session_name
    })
    # Destroy the session once the test finishes (pass or fail) so it is not
    # left behind for the rest of the run. expect_errors keeps a failed
    # destroy from turning into a test error of its own.
    self.addCleanup(
        self.app.post_json,
        '/v1',
        {"cmd": "sessions.destroy", "session": session_name},
        expect_errors=True,
    )

    res = self.app.post_json('/v1', {
        "cmd": "sessions.list"
    })
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("", body.message)
    self.assertGreaterEqual(len(body.sessions), 1)
    self.assertIn(session_name, body.sessions)
"sessions.destroy", + "session": "non_existing_session_name" + }, status=500) + self.assertEqual(res.status_code, 500) + + body = V1ResponseBase(res.json) + self.assertEqual(STATUS_ERROR, body.status) + self.assertEqual("Error: The session doesn't exist.", body.message) + + def test_v1_endpoint_request_get_with_session(self): + self.app.post_json('/v1', { + "cmd": "sessions.create", + "session": "test_request_sessions" + }) + res = self.app.post_json('/v1', { + "cmd": "request.get", + "session": "test_request_sessions", + "url": self.google_url + }) + self.assertEqual(res.status_code, 200) + + body = V1ResponseBase(res.json) + self.assertEqual(STATUS_OK, body.status) + + +if __name__ == '__main__': + unittest.main() diff --git a/flaresolverr/tests_sites.py b/flaresolverr/tests_sites.py new file mode 100644 index 0000000000000000000000000000000000000000..aa7dcbc4f2ed3071fb03606391781f1818fefd4c --- /dev/null +++ b/flaresolverr/tests_sites.py @@ -0,0 +1,102 @@ +import unittest + +from webtest import TestApp + +from dtos import V1ResponseBase, STATUS_OK +import flaresolverr +import utils + + +def _find_obj_by_key(key: str, value: str, _list: list) -> dict | None: + for obj in _list: + if obj[key] == value: + return obj + return None + + +def asset_cloudflare_solution(self, res, site_url, site_text): + self.assertEqual(res.status_code, 200) + + body = V1ResponseBase(res.json) + self.assertEqual(STATUS_OK, body.status) + self.assertEqual("Challenge solved!", body.message) + self.assertGreater(body.startTimestamp, 10000) + self.assertGreaterEqual(body.endTimestamp, body.startTimestamp) + self.assertEqual(utils.get_flaresolverr_version(), body.version) + + solution = body.solution + self.assertIn(site_url, solution.url) + self.assertEqual(solution.status, 200) + self.assertIs(len(solution.headers), 0) + self.assertIn(site_text, solution.response) + self.assertGreater(len(solution.cookies), 0) + self.assertIn("Chrome/", solution.userAgent) + + cf_cookie = 
class TestFlareSolverr(unittest.TestCase):
    """Run the ``/v1`` endpoint against a list of real sites.

    Every site is exercised in its own sub-test; the response is validated
    by ``asset_cloudflare_solution``.
    """

    app = TestApp(flaresolverr.app)
    # wait until the server is ready
    app.get('/')

    def _send_command(self, payload):
        """POST ``payload`` as JSON to the ``/v1`` endpoint and return the response."""
        return self.app.post_json('/v1', payload)

    def test_v1_endpoint_request_get_cloudflare(self):
        """Issue ``request.get`` for every listed site."""
        # (sub-test label, url to request, text expected in the solved page)
        get_targets = (
            ('nowsecure', 'https://nowsecure.nl', 'nowSecure'),
            ('0magnet', 'https://0magnet.com/search?q=2022', 'Torrent Search - ØMagnet'),
            ('1337x', 'https://1337x.unblockit.cat/cat/Movies/time/desc/1/', ''),
            ('avistaz', 'https://avistaz.to/api/v1/jackett/torrents?in=1&type=0&search=',
             'Access denied'),
            ('badasstorrents', 'https://badasstorrents.com/torrents/search/720p/date/desc',
             'Latest Torrents - BadassTorrents'),
            ('bt4g', 'https://bt4g.org/search/2022', 'Download 2022 Torrents - BT4G'),
            ('cinemaz', 'https://cinemaz.to/api/v1/jackett/torrents?in=1&type=0&search=',
             'Access denied'),
            ('epublibre', 'https://epublibre.unblockit.cat/catalogo/index/0/nuevo/todos/sin/todos/--/ajax',
             'epublibre - catálogo'),
            ('ext', 'https://ext.to/latest/?order=age&sort=desc',
             'Download Latest Torrents - EXT Torrents'),
            ('extratorrent', 'https://extratorrent.st/search/?srt=added&order=desc&search=720p&new=1&x=0&y=0',
             'Page 1 - ExtraTorrent'),
            ('idope', 'https://idope.se/browse.html', 'Recent Torrents'),
            ('limetorrents', 'https://limetorrents.unblockninja.com/latest100',
             'Latest 100 torrents - LimeTorrents'),
            ('privatehd', 'https://privatehd.to/api/v1/jackett/torrents?in=1&type=0&search=',
             'Access denied'),
            ('torrentcore', 'https://torrentcore.xyz/index', 'Torrent[CORE] - Torrent community.'),
            ('torrentqq223', 'https://torrentqq223.com/torrent/newest.html', 'https://torrentqq223.com/ads/'),
            ('36dm', 'https://www.36dm.club/1.html', 'https://www.36dm.club/yesterday-1.html'),
            ('erai-raws', 'https://www.erai-raws.info/feed/?type=magnet', '403 Forbidden'),
            ('teamos', 'https://www.teamos.xyz/torrents/?filename=&freeleech=',
             'Log in | Team OS : Your Only Destination To Custom OS !!'),
            ('yts', 'https://yts.unblockninja.com/api/v2/list_movies.json?query_term=&limit=50&sort=date_added',
             '{"movie_count":'),
        )
        for label, url, expected_text in get_targets:
            with self.subTest(msg=label):
                response = self._send_command({
                    "cmd": "request.get",
                    "url": url
                })
                asset_cloudflare_solution(self, response, url, expected_text)

    def test_v1_endpoint_request_post_cloudflare(self):
        """Issue ``request.post`` for every listed site."""
        # (sub-test label, url to request, text expected in the page, form body)
        post_targets = (
            ('nnmclub', 'https://nnmclub.to/forum/tracker.php', 'Трекер :: NNM-Club',
             'prev_sd=0&prev_a=0&prev_my=0&prev_n=0&prev_shc=0&prev_shf=1&prev_sha=1&prev_shs=0&prev_shr=0&prev_sht=0&f%5B%5D=-1&o=1&s=2&tm=-1&shf=1&sha=1&ta=-1&sns=-1&sds=-1&nm=&pn=&submit=%CF%EE%E8%F1%EA'),
        )
        for label, url, expected_text, form_body in post_targets:
            with self.subTest(msg=label):
                response = self._send_command({
                    "cmd": "request.post",
                    "url": url,
                    "postData": form_body
                })
                asset_cloudflare_solution(self, response, url, expected_text)
888 + "Y8888P 888 888 888 "Y88P" 888 888 888 "Y8888 "Y88888 888 888 Y88P "Y8888 888 88888888 + +by UltrafunkAmsterdam (https://github.com/ultrafunkamsterdam) + +""" +from __future__ import annotations + + +__version__ = "3.5.5" + +import json +import logging +import os +import pathlib +import re +import shutil +import subprocess +import sys +import tempfile +import time +from weakref import finalize + +import selenium.webdriver.chrome.service +import selenium.webdriver.chrome.webdriver +from selenium.webdriver.common.by import By +import selenium.webdriver.chromium.service +import selenium.webdriver.remote.command +import selenium.webdriver.remote.webdriver + +from .cdp import CDP +from .dprocess import start_detached +from .options import ChromeOptions +from .patcher import IS_POSIX +from .patcher import Patcher +from .reactor import Reactor +from .webelement import UCWebElement +from .webelement import WebElement + + +__all__ = ( + "Chrome", + "ChromeOptions", + "Patcher", + "Reactor", + "CDP", + "find_chrome_executable", +) + +logger = logging.getLogger("uc") +logger.setLevel(logging.getLogger().getEffectiveLevel()) + + +class Chrome(selenium.webdriver.chrome.webdriver.WebDriver): + """ + + Controls the ChromeDriver and allows you to drive the browser. + + The webdriver file will be downloaded by this module automatically, + you do not need to specify this. however, you may if you wish. + + Attributes + ---------- + + Methods + ------- + + reconnect() + + this can be useful in case of heavy detection methods + -stops the chromedriver service which runs in the background + -starts the chromedriver service which runs in the background + -recreate session + + + start_session(capabilities=None, browser_profile=None) + + differentiates from the regular method in that it does not + require a capabilities argument. The capabilities are automatically + recreated from the options at creation time. 
def __init__(
    self,
    options=None,
    user_data_dir=None,
    driver_executable_path=None,
    browser_executable_path=None,
    port=0,
    enable_cdp_events=False,
    # service_args=None,
    # service_creationflags=None,
    desired_capabilities=None,
    advanced_elements=False,
    # service_log_path=None,
    keep_alive=True,
    log_level=0,
    headless=False,
    version_main=None,
    patcher_force_close=False,
    suppress_welcome=True,
    use_subprocess=False,
    debug=False,
    no_sandbox=True,
    windows_headless=False,
    user_multi_procs: bool = False,
    **kw,
):
    """
    Creates a new instance of the chrome driver.

    Prepares the chromedriver binary through ``Patcher``, completes
    ``options`` (debugger address, profile directory, language, browser
    binary, start-up flags), launches the browser process itself and only
    then initialises the parent Selenium driver with those options.

    Parameters
    ----------

    options: ChromeOptions, optional, default: None - automatic useful defaults
        an instance of ChromeOptions, mainly to customize browser behavior.
        anything other than the default, for example extensions or startup options,
        is not supported in case of failure, and can lower your undetectability.
        an options object that was already used by another driver is rejected.

    user_data_dir: str, optional, default: None (creates temp profile)
        if given, it is passed to chrome as --user-data-dir and
        ``keep_user_data_dir`` is set, marking the profile as not temporary.
        when no profile is given here, in the options arguments or via the
        legacy ``options.user_data_dir``, a temporary directory is created.

    driver_executable_path: str, optional, default: None (=downloads and patches new binary)
        forwarded to the Patcher as its executable path.

    browser_executable_path: str, optional, default: None - use find_chrome_executable
        Path to the browser executable, used when ``options.binary_location``
        is not set. If not specified, make sure the executable's folder is in $PATH

    port: int, optional, default: 0
        remote debugging port handed to the browser when
        ``options.debugger_address`` is not already set.
        the default value of 0 automatically picks an available port.

    enable_cdp_events: bool, default: False
        :: currently for chrome only
        enables the "performance" and "browser" logging capabilities and starts
        a Reactor. when enabled, you can subscribe to CDP events by using:

            driver.add_cdp_listener("Network.dataReceived", yourcallback)
            # yourcallback is a callable which accepts exactly 1 dict as parameter

    desired_capabilities: dict, optional, default: None - auto from config
        Dictionary object with non-browser specific capabilities only, such as "item" or "loggingPref".
        NOTE(review): the value is not forwarded to the parent constructor in this method.

    advanced_elements: bool, optional, default: False
        use UCWebElement instead of WebElement as the element class. its repr is
        meant to make elements easier to recognize in an interactive environment.
        note: when retrieving large amounts of elements ( example: find_elements_by_tag("*") )
        and printing them, it does take a little more time.

    keep_alive: bool, optional, default: True
        Whether to configure ChromeRemoteConnection to use HTTP keep-alive.

    log_level: int, optional, default: 0 (= adapts to python global log level)
        value for chrome's --log-level switch. when left at 0, the effective
        level of the python root logger divided by 10 is used.

    headless: bool, optional, default: False
        can also be specified in the options instance.
        Specify whether you want to use the browser in headless mode.
        warning: this lowers undetectability and is not fully supported.

    version_main: int, optional, default: None (=auto)
        if you, for whatever reason, use an older version of Chrome,
        you can specify its major version number here.
        Example: 87 for all versions of 87

    patcher_force_close: bool, optional, default: False
        instructs the patcher to do whatever it can to access the chromedriver binary
        if the file is locked, it will force shutdown all instances.
        setting it is not recommended, unless you know the implications and think
        you might need it.

    suppress_welcome: bool, optional, default: True
        adds --no-default-browser-check and --no-first-run.
        a "welcome" alert might otherwise show up on *nix-like systems asking whether
        you want to set chrome as your default browser and send usage data to google.
        Note: if you don't handle the nag screen in time, the browser loses its
        connection and throws an Exception.

    use_subprocess: bool, optional, default: False
        False (the default) starts Chrome as a detached process (so not a subprocess
        of chromedriver or python). This fixes a LOT of issues, like multithreaded runs,
        but most importantly shutting down correctly after the program exits or .quit()
        is called.

        there is an edge case in which a script only contains:
        --start script--
        import undetected_chromedriver as uc
        d = uc.Chrome()
        d.get('https://somesite/')
        ---end script --

        and fails because the program exits before chrome has had a chance to launch.
        in that case you can set this to `True`. The browser will start via subprocess,
        and will keep running most of the time.
        ! setting it to True comes with NO support when being detected. !

    debug: bool, optional, default: False
        stored on the instance as ``self.debug``.

    no_sandbox: bool, optional, default: True
        uses the --no-sandbox option, and additionally suppresses the "unsecure option"
        status bar (--test-type). this option defaults to True since many people seem to
        run this as root, and chrome does not start as root without the --no-sandbox flag.

    windows_headless: bool, optional, default: False
        when True the browser is started through subprocess.Popen, and on Windows
        a STARTUPINFO with STARTF_USESHOWWINDOW is passed to it.

    user_multi_procs: bool, optional, default: False
        set to true when you are using multithreads/multiprocessing
        ensures not all processes are trying to modify a binary which is in use by another.
        for this to work, YOU MUST HAVE AT LEAST 1 UNDETECTED_CHROMEDRIVER BINARY IN YOUR
        ROAMING DATA FOLDER. this requirement can be easily satisfied by just running
        this program "normal" and closing/killing it.

    **kw
        accepted for compatibility; not used by this method.

    Raises
    ------
    RuntimeError
        if ``options`` was already used to create another driver.
    FileNotFoundError
        if no existing browser executable could be determined.
    """

    # NOTE(review): `self` is also passed as an argument, so the finalizer
    # holds a strong reference to this instance; per the weakref.finalize
    # docs such an object is never garbage collected and the callback then
    # only runs at interpreter exit. `_ensure_close` is defined outside this
    # method - confirm that this is the intended behaviour.
    finalize(self, self._ensure_close, self)
    self.debug = debug
    self.patcher = Patcher(
        executable_path=driver_executable_path,
        force=patcher_force_close,
        version_main=version_main,
        user_multi_procs=user_multi_procs,
    )
    self.patcher.auto()

    if not options:
        options = ChromeOptions()

    # prevent reuse of options: arguments are only ever appended, never
    # replaced, so a reused object would start chrome with conflicting flags
    if getattr(options, "_session", None) is not None:
        raise RuntimeError("you cannot reuse the ChromeOptions object")
    options._session = self

    if not options.debugger_address:
        debug_host = "127.0.0.1"
        debug_port = (
            port
            if port != 0
            else selenium.webdriver.common.service.utils.free_port()
        )
        options.debugger_address = "%s:%d" % (debug_host, debug_port)
    else:
        # only the last colon separates the port from the host
        debug_host, debug_port = options.debugger_address.rsplit(":", 1)
        debug_port = int(debug_port)

    if enable_cdp_events:
        options.set_capability(
            "goog:loggingPrefs", {"performance": "ALL", "browser": "ALL"}
        )

    options.add_argument("--remote-debugging-host=%s" % debug_host)
    options.add_argument("--remote-debugging-port=%s" % debug_port)

    if user_data_dir:
        options.add_argument("--user-data-dir=%s" % user_data_dir)

    language, keep_user_data_dir = None, bool(user_data_dir)

    # see if headless mode, a language or a custom user profile is specified
    # in options. Iterate over a snapshot: headless flags are removed from
    # the live list below, and removing from a list while iterating it
    # makes the loop skip the argument that follows.
    for arg in list(options.arguments):

        if "headless" in arg:
            options.arguments.remove(arg)
            options.headless = True

        if "lang" in arg:
            # the pattern matches whenever "lang" occurs in arg; an empty
            # value leaves `language` falsy, so the locale fallback applies
            language = re.search("(?:--)?lang(?:[ =])?(.*)", arg)[1]

        if "user-data-dir" in arg:
            # same reasoning: an empty value falls through to the
            # temporary-profile branch below
            user_data_dir = re.search("(?:--)?user-data-dir(?:[ =])?(.*)", arg)[1]
            logger.debug(
                "user-data-dir found in user argument %s => %s" % (arg, user_data_dir)
            )
            keep_user_data_dir = True

    if not user_data_dir:
        # backward compatibility
        # check if an old uc.ChromeOptions is used, and extract the user data dir
        if getattr(options, "user_data_dir", None):
            import warnings

            warnings.warn(
                "using ChromeOptions.user_data_dir might stop working in future versions. "
                "use uc.Chrome(user_data_dir='/xyz/some/data') in case you need existing profile folder"
            )
            # adopt the legacy value so that self.user_data_dir, handle_prefs
            # and the Preferences fix below all see the real profile path
            user_data_dir = options.user_data_dir
            options.add_argument("--user-data-dir=%s" % user_data_dir)
            keep_user_data_dir = True
            logger.debug(
                "user_data_dir property found in options object: %s" % user_data_dir
            )

        else:
            user_data_dir = os.path.normpath(tempfile.mkdtemp())
            keep_user_data_dir = False
            arg = "--user-data-dir=%s" % user_data_dir
            options.add_argument(arg)
            logger.debug(
                "created a temporary folder in which the user-data (profile) will be stored during this\n"
                "session, and added it to chrome startup arguments: %s" % arg
            )

    if not language:
        try:
            import locale

            language = locale.getdefaultlocale()[0].replace("_", "-")
        except Exception:
            # best effort: any failure falls back to en-US below
            pass
        if not language:
            language = "en-US"

    options.add_argument("--lang=%s" % language)

    if not options.binary_location:
        options.binary_location = (
            browser_executable_path or find_chrome_executable()
        )

    if not options.binary_location or not \
            pathlib.Path(options.binary_location).exists():
        raise FileNotFoundError(
            "\n---------------------\n"
            "Could not determine browser executable."
            "\n---------------------\n"
            "Make sure your browser is installed in the default location (path).\n"
            "If you are sure about the browser executable, you can specify it using\n"
            "the `browser_executable_path='{}'` parameter.\n\n"
            .format("/path/to/browser/executable" if IS_POSIX else "c:/path/to/your/browser.exe")
        )

    self._delay = 3

    self.user_data_dir = user_data_dir
    self.keep_user_data_dir = keep_user_data_dir

    if suppress_welcome:
        options.arguments.extend(["--no-default-browser-check", "--no-first-run"])
    if no_sandbox:
        options.arguments.extend(["--no-sandbox", "--test-type"])

    if headless or getattr(options, 'headless', None):
        # workaround until a better checking is found
        try:
            v_main = int(self.patcher.version_main) if self.patcher.version_main else 108
        except (AttributeError, TypeError, ValueError):
            logger.warning(
                "could not detect version_main. "
                "therefore, we are assuming it is chrome 108 or higher"
            )
            v_main = 108
        options.add_argument("--headless=chrome" if v_main < 108 else "--headless=new")

    options.add_argument("--window-size=1920,1080")
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    # fixes "could not connect to chrome" error when running
    # on linux using privileged user like root (which i don't recommend)

    # The parentheses matter: `%` binds tighter than `or`, so without them
    # the fallback to the python log level could never be selected.
    options.add_argument(
        "--log-level=%d"
        % (log_level or divmod(logging.getLogger().getEffectiveLevel(), 10)[0])
    )

    if hasattr(options, "handle_prefs"):
        options.handle_prefs(user_data_dir)

    # fix exit_type flag to prevent tab-restore nag
    try:
        with open(
            os.path.join(user_data_dir, "Default/Preferences"),
            encoding="latin1",
            mode="r+",
        ) as fs:
            config = json.load(fs)
            if config["profile"]["exit_type"] is not None:
                # fixing the restore-tabs-nag
                config["profile"]["exit_type"] = None
                fs.seek(0, 0)
                json.dump(config, fs)
                fs.truncate()  # the file might be shorter
                logger.debug("fixed exit_type flag")
    except Exception:
        # best effort: a fresh profile has no Preferences file yet
        logger.debug("did not find a bad exit_type flag ")

    self.options = options

    if not desired_capabilities:
        desired_capabilities = options.to_capabilities()

    if not use_subprocess and not windows_headless:
        self.browser_pid = start_detached(
            options.binary_location, *options.arguments
        )
    else:
        startupinfo = None
        if os.name == 'nt' and windows_headless:
            # STARTUPINFO() is Windows only
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        browser = subprocess.Popen(
            [options.binary_location, *options.arguments],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            close_fds=IS_POSIX,
            startupinfo=startupinfo
        )
        self.browser_pid = browser.pid

    service = selenium.webdriver.chromium.service.ChromiumService(
        self.patcher.executable_path
    )

    super().__init__(
        service=service,
        options=options,
        keep_alive=keep_alive,
    )

    self.reactor = None

    if enable_cdp_events:
        if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
            # raise selenium's remote-connection logger to INFO (20) while
            # the root logger runs at DEBUG
            logging.getLogger(
                "selenium.webdriver.remote.remote_connection"
            ).setLevel(20)
        reactor = Reactor(self)
        reactor.start()
        self.reactor = reactor

    if advanced_elements:
        self._web_element_cls = UCWebElement
    else:
        self._web_element_cls = WebElement

    if headless or getattr(options, 'headless', None):
        self._configure_headless()
target[key].bind(target) + : target[key], + }), + }); + """ + }, + ) + + logger.info("patch user-agent string") + self.execute_cdp_cmd( + "Network.setUserAgentOverride", + { + "userAgent": self.execute_script( + "return navigator.userAgent" + ).replace("Headless", "") + }, + ) + self.execute_cdp_cmd( + "Page.addScriptToEvaluateOnNewDocument", + { + "source": """ + Object.defineProperty(navigator, 'maxTouchPoints', {get: () => 1}); + Object.defineProperty(navigator.connection, 'rtt', {get: () => 100}); + + // https://github.com/microlinkhq/browserless/blob/master/packages/goto/src/evasions/chrome-runtime.js + window.chrome = { + app: { + isInstalled: false, + InstallState: { + DISABLED: 'disabled', + INSTALLED: 'installed', + NOT_INSTALLED: 'not_installed' + }, + RunningState: { + CANNOT_RUN: 'cannot_run', + READY_TO_RUN: 'ready_to_run', + RUNNING: 'running' + } + }, + runtime: { + OnInstalledReason: { + CHROME_UPDATE: 'chrome_update', + INSTALL: 'install', + SHARED_MODULE_UPDATE: 'shared_module_update', + UPDATE: 'update' + }, + OnRestartRequiredReason: { + APP_UPDATE: 'app_update', + OS_UPDATE: 'os_update', + PERIODIC: 'periodic' + }, + PlatformArch: { + ARM: 'arm', + ARM64: 'arm64', + MIPS: 'mips', + MIPS64: 'mips64', + X86_32: 'x86-32', + X86_64: 'x86-64' + }, + PlatformNaclArch: { + ARM: 'arm', + MIPS: 'mips', + MIPS64: 'mips64', + X86_32: 'x86-32', + X86_64: 'x86-64' + }, + PlatformOs: { + ANDROID: 'android', + CROS: 'cros', + LINUX: 'linux', + MAC: 'mac', + OPENBSD: 'openbsd', + WIN: 'win' + }, + RequestUpdateCheckStatus: { + NO_UPDATE: 'no_update', + THROTTLED: 'throttled', + UPDATE_AVAILABLE: 'update_available' + } + } + } + + // https://github.com/microlinkhq/browserless/blob/master/packages/goto/src/evasions/navigator-permissions.js + if (!window.Notification) { + window.Notification = { + permission: 'denied' + } + } + + const originalQuery = window.navigator.permissions.query + window.navigator.permissions.__proto__.query = parameters => + 
parameters.name === 'notifications' + ? Promise.resolve({ state: window.Notification.permission }) + : originalQuery(parameters) + + const oldCall = Function.prototype.call + function call() { + return oldCall.apply(this, arguments) + } + Function.prototype.call = call + + const nativeToStringFunctionString = Error.toString().replace(/Error/g, 'toString') + const oldToString = Function.prototype.toString + + function functionToString() { + if (this === window.navigator.permissions.query) { + return 'function query() { [native code] }' + } + if (this === functionToString) { + return nativeToStringFunctionString + } + return oldCall.call(oldToString, this) + } + // eslint-disable-next-line + Function.prototype.toString = functionToString + """ + }, + ) + return orig_get(*args, **kwargs) + + self.get = get_wrapped + + # def _get_cdc_props(self): + # return self.execute_script( + # """ + # let objectToInspect = window, + # result = []; + # while(objectToInspect !== null) + # { result = result.concat(Object.getOwnPropertyNames(objectToInspect)); + # objectToInspect = Object.getPrototypeOf(objectToInspect); } + # + # return result.filter(i => i.match(/^([a-zA-Z]){27}(Array|Promise|Symbol)$/ig)) + # """ + # ) + # + # def _hook_remove_cdc_props(self): + # self.execute_cdp_cmd( + # "Page.addScriptToEvaluateOnNewDocument", + # { + # "source": """ + # let objectToInspect = window, + # result = []; + # while(objectToInspect !== null) + # { result = result.concat(Object.getOwnPropertyNames(objectToInspect)); + # objectToInspect = Object.getPrototypeOf(objectToInspect); } + # result.forEach(p => p.match(/^([a-zA-Z]){27}(Array|Promise|Symbol)$/ig) + # &&delete window[p]&&console.log('removed',p)) + # """ + # }, + # ) + + def get(self, url): + # if self._get_cdc_props(): + # self._hook_remove_cdc_props() + return super().get(url) + + def add_cdp_listener(self, event_name, callback): + if ( + self.reactor + and self.reactor is not None + and isinstance(self.reactor, Reactor) + ): + 
def reconnect(self, timeout=0.1):
    """
    Stop the driver service, pause, then start it and open a new session.

    Every step is best effort: an exception raised by one step is logged
    at debug level and the following steps still run.

    Args:
        timeout: seconds to sleep between stopping and starting the service
    """

    def _best_effort(step):
        # run a single step, logging (not raising) whatever goes wrong
        try:
            step()
        except Exception as exc:
            logger.debug(exc)

    _best_effort(lambda: self.service.stop())
    time.sleep(timeout)
    _best_effort(lambda: self.service.start())
    _best_effort(lambda: self.start_session())
def quit(self):
    """
    Shut down chromedriver, the CDP reactor and the browser, then clean up.

    Every stage is best effort so that a failure in one stage never
    prevents the later ones from running:

    1. stop and kill the driver service and close the command executor
    2. signal the reactor thread to stop
    3. send signal 15 to the browser process
    4. remove the temporary profile directory (up to 5 attempts), unless
       ``keep_user_data_dir`` is set
    5. drop the reference to the patcher
    """
    try:
        self.service.stop()
        self.service.process.kill()
        self.command_executor.close()
        self.service.process.wait(5)
        logger.debug("webdriver process ended")
    except (AttributeError, RuntimeError, OSError):
        pass
    try:
        self.reactor.event.set()
        logger.debug("shutting down reactor")
    except AttributeError:
        # no reactor was started for this instance
        pass
    try:
        os.kill(self.browser_pid, 15)
        logger.debug("gracefully closed browser")
    except Exception as e:  # noqa
        # the browser may already be gone; record why instead of hiding it
        logger.debug("could not signal browser process: %s", e)
    if (
        hasattr(self, "keep_user_data_dir")
        and hasattr(self, "user_data_dir")
        and not self.keep_user_data_dir
    ):
        for _ in range(5):
            try:
                shutil.rmtree(self.user_data_dir, ignore_errors=False)
            except FileNotFoundError:
                # already removed: retrying (and sleeping) cannot help
                break
            except (RuntimeError, OSError, PermissionError) as e:
                logger.debug(
                    "When removing the temp profile, a %s occured: %s\nretrying..."
                    % (e.__class__.__name__, e)
                )
            else:
                logger.debug("successfully removed %s" % self.user_data_dir)
                break

            # give the browser a moment to release its file handles
            try:
                time.sleep(0.1)
            except OSError:
                pass

    # dereference patcher, so patcher can start cleaning up as well.
    # this must come last, otherwise it will throw 'in use' errors
    self.patcher = None
"LOCALAPPDATA", "PROGRAMW6432"), + ): + if item is not None: + for subitem in ( + "Google/Chrome/Application", + ): + candidates.add(os.sep.join((item, subitem, "chrome.exe"))) + for candidate in candidates: + logger.debug('checking if %s exists and is executable' % candidate) + if os.path.exists(candidate) and os.access(candidate, os.X_OK): + logger.debug('found! using %s' % candidate) + return os.path.normpath(candidate) diff --git a/flaresolverr/undetected_chromedriver/cdp.py b/flaresolverr/undetected_chromedriver/cdp.py new file mode 100644 index 0000000000000000000000000000000000000000..32a503c73c26f71f7513e891555a72ff14f03bfe --- /dev/null +++ b/flaresolverr/undetected_chromedriver/cdp.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +# this module is part of undetected_chromedriver + +import json +import logging + +import requests +import websockets + + +log = logging.getLogger(__name__) + + +class CDPObject(dict): + def __init__(self, *a, **k): + super().__init__(*a, **k) + self.__dict__ = self + for k in self.__dict__: + if isinstance(self.__dict__[k], dict): + self.__dict__[k] = CDPObject(self.__dict__[k]) + elif isinstance(self.__dict__[k], list): + for i in range(len(self.__dict__[k])): + if isinstance(self.__dict__[k][i], dict): + self.__dict__[k][i] = CDPObject(self) + + def __repr__(self): + tpl = f"{self.__class__.__name__}(\n\t{{}}\n\t)" + return tpl.format("\n ".join(f"{k} = {v}" for k, v in self.items())) + + +class PageElement(CDPObject): + pass + + +class CDP: + log = logging.getLogger("CDP") + + endpoints = CDPObject( + { + "json": "/json", + "protocol": "/json/protocol", + "list": "/json/list", + "new": "/json/new?{url}", + "activate": "/json/activate/{id}", + "close": "/json/close/{id}", + } + ) + + def __init__(self, options: "ChromeOptions"): # noqa + self.server_addr = "http://{0}:{1}".format(*options.debugger_address.split(":")) + + self._reqid = 0 + self._session = requests.Session() + self._last_resp = None + self._last_json = None + + 
def post(self, uri, data: dict = None):
    """
    POST ``data`` as JSON to ``uri`` on the debugger HTTP endpoint.

    The response object is stored in ``self._last_resp``; when its body
    parses as JSON the result is stored in ``self._last_json`` too.

    Args:
        uri: path appended to ``self.server_addr``
        data: JSON-serialisable payload; an empty dict is sent when omitted

    Returns:
        the decoded JSON body, or the raw response object when the body
        is not valid JSON
    """
    if not data:
        data = {}
    resp = self._session.post(self.server_addr + uri, json=data)
    self._last_resp = resp
    try:
        self._last_json = resp.json()
    except Exception:
        # body is not JSON: hand back the raw response instead
        return self._last_resp
    # previously the success path fell through and returned None
    return self._last_json
class Structure(dict):
    """
    This is a dict-like object structure, which you should subclass.

    Keys are exposed as attributes and attribute assignment stores a key.
    Mapping values, and mappings found directly inside sequence values,
    are converted to instances of the same class; sequences (other than
    str/bytes) are stored as lists.
    """

    _store = {}

    def __init__(self, *a, **kw):
        """
        Instantiate a new instance.

        :param a: positional arguments accepted by ``dict``
        :param kw: keyword arguments accepted by ``dict``
        """

        super().__init__()

        # auxiliary dict: lets dict() validate / normalise the arguments
        d = dict(*a, **kw)
        for k, v in d.items():
            if isinstance(v, Mapping):
                self[k] = self.__class__(v)
            elif isinstance(v, Sequence) and not isinstance(v, (str, bytes)):
                # only mappings can be converted; scalars inside the
                # sequence (ints, strings, ...) are kept as they are
                self[k] = [
                    self.__class__(i) if isinstance(i, Mapping) else i
                    for i in v
                ]
            else:
                self[k] = v
        # alias the attribute namespace to the mapping; bypass our own
        # __setattr__, which would store "__dict__" as a key instead
        super().__setattr__("__dict__", self)

    def __getattr__(self, item):
        # only reached when normal lookup failed; raises AttributeError
        return getattr(super(), item)

    def __getitem__(self, item):
        return super().__getitem__(item)

    def __setattr__(self, key, value):
        self.__setitem__(key, value)

    def __setitem__(self, key, value):
        super().__setitem__(key, value)

    def update(self, *a, **kw):
        super().update(*a, **kw)

    def __eq__(self, other):
        # comparing against a non-mapping is "not equal", not an error
        if not isinstance(other, Mapping):
            return NotImplemented
        # plain content comparison also works for unhashable values
        return dict.__eq__(self, dict(other))

    def __hash__(self):
        # requires hashable values; raises TypeError otherwise
        return hash(frozenset(self.items()))

    @classmethod
    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        # every subclass gets its own store
        cls._store = {}

    def _normalize_strings(self):
        """Strip surrounding whitespace from every top-level str value."""
        for k, v in self.copy().items():
            if isinstance(v, (str)):
                self[k] = v.strip()
t.cancel() + + return wrapped + + return wrapper + + +def test(): + import sys, os + + sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) + import undetected_chromedriver as uc + import threading + + def collector( + driver: uc.Chrome, + stop_event: threading.Event, + on_event_coro: Optional[Callable[[List[str]], Awaitable[Any]]] = None, + listen_events: Sequence = ("browser", "network", "performance"), + ): + def threaded(driver, stop_event, on_event_coro): + async def _ensure_service_started(): + while ( + getattr(driver, "service", False) + and getattr(driver.service, "process", False) + and driver.service.process.poll() + ): + print("waiting for driver service to come back on") + await asyncio.sleep(0.05) + # await asyncio.sleep(driver._delay or .25) + + async def get_log_lines(typ): + await _ensure_service_started() + return driver.get_log(typ) + + async def looper(): + while not stop_event.is_set(): + log_lines = [] + try: + for _ in listen_events: + try: + log_lines += await get_log_lines(_) + except: + if logging.getLogger().getEffectiveLevel() <= 10: + traceback.print_exc() + continue + if log_lines and on_event_coro: + await on_event_coro(log_lines) + except Exception as e: + if logging.getLogger().getEffectiveLevel() <= 10: + traceback.print_exc() + + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + loop.run_until_complete(looper()) + + t = threading.Thread(target=threaded, args=(driver, stop_event, on_event_coro)) + t.start() + + async def on_event(data): + print("on_event") + print("data:", data) + + def func_called(fn): + def wrapped(*args, **kwargs): + print( + "func called! %s (args: %s, kwargs: %s)" % (fn.__name__, args, kwargs) + ) + while driver.service.process and driver.service.process.poll() is not None: + time.sleep(0.1) + res = fn(*args, **kwargs) + print("func completed! 
(result: %s)" % res) + return res + + return wrapped + + logging.basicConfig(level=10) + + options = uc.ChromeOptions() + options.set_capability( + "goog:loggingPrefs", {"performance": "ALL", "browser": "ALL", "network": "ALL"} + ) + + driver = uc.Chrome(version_main=96, options=options) + + # driver.command_executor._request = timeout(seconds=1)(driver.command_executor._request) + driver.command_executor._request = func_called(driver.command_executor._request) + collector_stop = threading.Event() + collector(driver, collector_stop, on_event) + + driver.get("https://nowsecure.nl") + + time.sleep(10) + + if os.name == "nt": + driver.close() + driver.quit() diff --git a/flaresolverr/undetected_chromedriver/dprocess.py b/flaresolverr/undetected_chromedriver/dprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..6d053fa9a87d74570dae311e18ae550ac6006da1 --- /dev/null +++ b/flaresolverr/undetected_chromedriver/dprocess.py @@ -0,0 +1,77 @@ +import atexit +import logging +import multiprocessing +import os +import platform +import signal +from subprocess import PIPE +from subprocess import Popen +import sys + + +CREATE_NEW_PROCESS_GROUP = 0x00000200 +DETACHED_PROCESS = 0x00000008 + +REGISTERED = [] + + +def start_detached(executable, *args): + """ + Starts a fully independent subprocess (with no parent) + :param executable: executable + :param args: arguments to the executable, eg: ['--param1_key=param1_val', '-vvv' ...] 
+ :return: pid of the grandchild process + """ + + # create pipe + reader, writer = multiprocessing.Pipe(False) + + # do not keep reference + process = multiprocessing.Process( + target=_start_detached, + args=(executable, *args), + kwargs={"writer": writer}, + daemon=True, + ) + process.start() + process.join() + # receive pid from pipe + pid = reader.recv() + REGISTERED.append(pid) + # close pipes + writer.close() + reader.close() + process.close() + + return pid + + +def _start_detached(executable, *args, writer: multiprocessing.Pipe = None): + # configure launch + kwargs = {} + if platform.system() == "Windows": + kwargs.update(creationflags=DETACHED_PROCESS | CREATE_NEW_PROCESS_GROUP) + elif sys.version_info < (3, 2): + # assume posix + kwargs.update(preexec_fn=os.setsid) + else: # Python 3.2+ and Unix + kwargs.update(start_new_session=True) + + # run + p = Popen([executable, *args], stdin=PIPE, stdout=PIPE, stderr=PIPE, **kwargs) + + # send pid to pipe + writer.send(p.pid) + sys.exit() + + +def _cleanup(): + for pid in REGISTERED: + try: + logging.getLogger(__name__).debug("cleaning up pid %d " % pid) + os.kill(pid, signal.SIGTERM) + except: # noqa + pass + + +atexit.register(_cleanup) diff --git a/flaresolverr/undetected_chromedriver/options.py b/flaresolverr/undetected_chromedriver/options.py new file mode 100644 index 0000000000000000000000000000000000000000..8078ae957d14b43296a094fbe3e87e463069aa4b --- /dev/null +++ b/flaresolverr/undetected_chromedriver/options.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +# this module is part of undetected_chromedriver + + +import json +import os + +from selenium.webdriver.chromium.options import ChromiumOptions as _ChromiumOptions + + +class ChromeOptions(_ChromiumOptions): + _session = None + _user_data_dir = None + + @property + def user_data_dir(self): + return self._user_data_dir + + @user_data_dir.setter + def user_data_dir(self, path: str): + """ + Sets the browser profile folder to use, or creates a new profile 
def handle_prefs(self, user_data_dir):
    """
    Move the experimental "prefs" option into the profile's Preferences file.

    Dotted preference keys are expanded into nested dicts, merged over
    whatever ``<user_data_dir>/Default/Preferences`` already contains and
    written back. The "prefs" entry is then removed from the experimental
    options. Nothing happens when no prefs are set.

    Args:
        user_data_dir: profile directory; falls back to the directory
            stored on this options object when empty
    """
    prefs = self.experimental_options.get("prefs")
    if prefs:
        user_data_dir = user_data_dir or self._user_data_dir
        default_path = os.path.join(user_data_dir, "Default")
        os.makedirs(default_path, exist_ok=True)

        # undot prefs dict keys
        undot_prefs = {}
        for key, value in prefs.items():
            undot_prefs = self._merge_nested(
                undot_prefs, self._undot_key(key, value)
            )

        prefs_file = os.path.join(default_path, "Preferences")
        # The Preferences file is UTF-8 JSON. Decoding it as latin1 turns
        # every non-ASCII character into mojibake, which json.dump then
        # writes back permanently.
        if os.path.exists(prefs_file):
            with open(prefs_file, encoding="utf-8", mode="r") as f:
                undot_prefs = self._merge_nested(json.load(f), undot_prefs)

        with open(prefs_file, encoding="utf-8", mode="w") as f:
            json.dump(undot_prefs, f)

        # remove the experimental_options to avoid an error
        del self._experimental_options["prefs"]
b/flaresolverr/undetected_chromedriver/patcher.py @@ -0,0 +1,473 @@ +#!/usr/bin/env python3 +# this module is part of undetected_chromedriver + +from packaging.version import Version as LooseVersion +import io +import json +import logging +import os +import pathlib +import platform +import random +import re +import shutil +import string +import subprocess +import sys +import time +from urllib.request import urlopen +from urllib.request import urlretrieve +import zipfile +from multiprocessing import Lock + +logger = logging.getLogger(__name__) + +IS_POSIX = sys.platform.startswith(("darwin", "cygwin", "linux", "linux2", "freebsd")) + + +class Patcher(object): + lock = Lock() + exe_name = "chromedriver%s" + + platform = sys.platform + if platform.endswith("win32"): + d = "~/appdata/roaming/undetected_chromedriver" + elif "LAMBDA_TASK_ROOT" in os.environ: + d = "/tmp/undetected_chromedriver" + elif platform.startswith(("linux", "linux2")): + d = "~/.local/share/undetected_chromedriver" + elif platform.endswith("darwin"): + d = "~/Library/Application Support/undetected_chromedriver" + else: + d = "~/.undetected_chromedriver" + data_path = os.path.abspath(os.path.expanduser(d)) + + def __init__( + self, + executable_path=None, + force=False, + version_main: int = 0, + user_multi_procs=False, + ): + """ + Args: + executable_path: None = automatic + a full file path to the chromedriver executable + force: False + terminate processes which are holding lock + version_main: 0 = auto + specify main chrome version (rounded, ex: 82) + """ + self.force = force + self._custom_exe_path = False + prefix = "undetected" + self.user_multi_procs = user_multi_procs + + try: + # Try to convert version_main into an integer + version_main_int = int(version_main) + # check if version_main_int is less than or equal to e.g 114 + self.is_old_chromedriver = version_main and version_main_int <= 114 + except (ValueError,TypeError): + # Check not running inside Docker + if not 
os.path.exists("/app/chromedriver"): + # If the conversion fails, log an error message + logging.info("version_main cannot be converted to an integer") + # Set self.is_old_chromedriver to False if the conversion fails + self.is_old_chromedriver = False + + # Needs to be called before self.exe_name is accessed + self._set_platform_name() + + if not os.path.exists(self.data_path): + os.makedirs(self.data_path, exist_ok=True) + + if not executable_path: + if sys.platform.startswith("freebsd"): + self.executable_path = os.path.join( + self.data_path, self.exe_name + ) + else: + self.executable_path = os.path.join( + self.data_path, "_".join([prefix, self.exe_name]) + ) + + if not IS_POSIX: + if executable_path: + if not executable_path[-4:] == ".exe": + executable_path += ".exe" + + self.zip_path = os.path.join(self.data_path, prefix) + + if not executable_path: + if not self.user_multi_procs: + self.executable_path = os.path.abspath( + os.path.join(".", self.executable_path) + ) + + if executable_path: + self._custom_exe_path = True + self.executable_path = executable_path + + # Set the correct repository to download the Chromedriver from + if self.is_old_chromedriver: + self.url_repo = "https://chromedriver.storage.googleapis.com" + else: + self.url_repo = "https://googlechromelabs.github.io/chrome-for-testing" + + self.version_main = version_main + self.version_full = None + + def _set_platform_name(self): + """ + Set the platform and exe name based on the platform undetected_chromedriver is running on + in order to download the correct chromedriver. 
+ """ + if self.platform.endswith("win32"): + self.platform_name = "win32" + self.exe_name %= ".exe" + if self.platform.endswith(("linux", "linux2")): + self.platform_name = "linux64" + self.exe_name %= "" + if self.platform.endswith("darwin"): + if self.is_old_chromedriver: + self.platform_name = "mac64" + else: + self.platform_name = "mac-x64" + self.exe_name %= "" + if self.platform.startswith("freebsd"): + self.platform_name = "freebsd" + self.exe_name %= "" + + def auto(self, executable_path=None, force=False, version_main=None, _=None): + """ + + Args: + executable_path: + force: + version_main: + + Returns: + + """ + p = pathlib.Path(self.data_path) + if self.user_multi_procs: + with Lock(): + files = list(p.rglob("*chromedriver*")) + most_recent = max(files, key=lambda f: f.stat().st_mtime) + files.remove(most_recent) + list(map(lambda f: f.unlink(), files)) + if self.is_binary_patched(most_recent): + self.executable_path = str(most_recent) + return True + + if executable_path: + self.executable_path = executable_path + self._custom_exe_path = True + + if self._custom_exe_path: + ispatched = self.is_binary_patched(self.executable_path) + if not ispatched: + return self.patch_exe() + else: + return + + if version_main: + self.version_main = version_main + if force is True: + self.force = force + + + if self.platform_name == "freebsd": + chromedriver_path = shutil.which("chromedriver") + + if not os.path.isfile(chromedriver_path) or not os.access(chromedriver_path, os.X_OK): + logging.error("Chromedriver not installed!") + return + + version_path = os.path.join(os.path.dirname(self.executable_path), "version.txt") + + process = os.popen(f'"{chromedriver_path}" --version') + chromedriver_version = process.read().split(' ')[1].split(' ')[0] + process.close() + + current_version = None + if os.path.isfile(version_path) or os.access(version_path, os.X_OK): + with open(version_path, 'r') as f: + current_version = f.read() + + if current_version != 
chromedriver_version: + logging.info("Copying chromedriver executable...") + shutil.copy(chromedriver_path, self.executable_path) + os.chmod(self.executable_path, 0o755) + + with open(version_path, 'w') as f: + f.write(chromedriver_version) + + logging.info("Chromedriver executable copied!") + else: + try: + os.unlink(self.executable_path) + except PermissionError: + if self.force: + self.force_kill_instances(self.executable_path) + return self.auto(force=not self.force) + try: + if self.is_binary_patched(): + # assumes already running AND patched + return True + except PermissionError: + pass + # return False + except FileNotFoundError: + pass + + release = self.fetch_release_number() + self.version_main = release.major + self.version_full = release + self.unzip_package(self.fetch_package()) + + return self.patch() + + def driver_binary_in_use(self, path: str = None) -> bool: + """ + naive test to check if a found chromedriver binary is + currently in use + + Args: + path: a string or PathLike object to the binary to check. 
+ if not specified, we check use this object's executable_path + """ + if not path: + path = self.executable_path + p = pathlib.Path(path) + + if not p.exists(): + raise OSError("file does not exist: %s" % p) + try: + with open(p, mode="a+b") as fs: + exc = [] + try: + + fs.seek(0, 0) + except PermissionError as e: + exc.append(e) # since some systems apprently allow seeking + # we conduct another test + try: + fs.readline() + except PermissionError as e: + exc.append(e) + + if exc: + + return True + return False + # ok safe to assume this is in use + except Exception as e: + # logger.exception("whoops ", e) + pass + + def cleanup_unused_files(self): + p = pathlib.Path(self.data_path) + items = list(p.glob("*undetected*")) + for item in items: + try: + item.unlink() + except: + pass + + def patch(self): + self.patch_exe() + return self.is_binary_patched() + + def fetch_release_number(self): + """ + Gets the latest major version available, or the latest major version of self.target_version if set explicitly. 
+ :return: version string + :rtype: LooseVersion + """ + # Endpoint for old versions of Chromedriver (114 and below) + if self.is_old_chromedriver: + path = f"/latest_release_{self.version_main}" + path = path.upper() + logger.debug("getting release number from %s" % path) + return LooseVersion(urlopen(self.url_repo + path).read().decode()) + + # Endpoint for new versions of Chromedriver (115+) + if not self.version_main: + # Fetch the latest version + path = "/last-known-good-versions-with-downloads.json" + logger.debug("getting release number from %s" % path) + with urlopen(self.url_repo + path) as conn: + response = conn.read().decode() + + last_versions = json.loads(response) + return LooseVersion(last_versions["channels"]["Stable"]["version"]) + + # Fetch the latest minor version of the major version provided + path = "/latest-versions-per-milestone-with-downloads.json" + logger.debug("getting release number from %s" % path) + with urlopen(self.url_repo + path) as conn: + response = conn.read().decode() + + major_versions = json.loads(response) + return LooseVersion(major_versions["milestones"][str(self.version_main)]["version"]) + + def parse_exe_version(self): + with io.open(self.executable_path, "rb") as f: + for line in iter(lambda: f.readline(), b""): + match = re.search(rb"platform_handle\x00content\x00([0-9.]*)", line) + if match: + return LooseVersion(match[1].decode()) + + def fetch_package(self): + """ + Downloads ChromeDriver from source + + :return: path to downloaded file + """ + zip_name = f"chromedriver_{self.platform_name}.zip" + if self.is_old_chromedriver: + download_url = "%s/%s/%s" % (self.url_repo, str(self.version_full), zip_name) + else: + zip_name = zip_name.replace("_", "-", 1) + download_url = "https://storage.googleapis.com/chrome-for-testing-public/%s/%s/%s" + download_url %= (str(self.version_full), self.platform_name, zip_name) + + logger.debug("downloading from %s" % download_url) + return urlretrieve(download_url)[0] + + def 
unzip_package(self, fp): + """ + Does what it says + + :return: path to unpacked executable + """ + exe_path = self.exe_name + if not self.is_old_chromedriver: + # The new chromedriver unzips into its own folder + zip_name = f"chromedriver-{self.platform_name}" + exe_path = os.path.join(zip_name, self.exe_name) + + logger.debug("unzipping %s" % fp) + try: + os.unlink(self.zip_path) + except (FileNotFoundError, OSError): + pass + + os.makedirs(self.zip_path, mode=0o755, exist_ok=True) + with zipfile.ZipFile(fp, mode="r") as zf: + zf.extractall(self.zip_path) + os.rename(os.path.join(self.zip_path, exe_path), self.executable_path) + os.remove(fp) + shutil.rmtree + os.chmod(self.executable_path, 0o755) + return self.executable_path + + @staticmethod + def force_kill_instances(exe_name): + """ + kills running instances. + :param: executable name to kill, may be a path as well + + :return: True on success else False + """ + exe_name = os.path.basename(exe_name) + if IS_POSIX: + # Using shell=True for pidof, consider a more robust pid finding method if issues arise. + # pgrep can be an alternative: ["pgrep", "-f", exe_name] + # Or psutil if adding a dependency is acceptable. + command = f"pidof {exe_name}" + try: + result = subprocess.run(command, shell=True, capture_output=True, text=True, check=True) + pids = result.stdout.strip().split() + if pids: + subprocess.run(["kill", "-9"] + pids, check=False) # Changed from -f -9 to -9 as -f is not standard for kill + return True + return False # No PIDs found + except subprocess.CalledProcessError: # pidof returns 1 if no process found + return False # No process found + except Exception as e: + logger.debug(f"Error killing process on POSIX: {e}") + return False + else: + try: + # TASKKILL /F /IM chromedriver.exe + result = subprocess.run(["taskkill", "/f", "/im", exe_name], check=False, capture_output=True) + # taskkill returns 0 if process was killed, 128 if not found. 
+ return result.returncode == 0 + except Exception as e: + logger.debug(f"Error killing process on Windows: {e}") + return False + + @staticmethod + def gen_random_cdc(): + cdc = random.choices(string.ascii_letters, k=27) + return "".join(cdc).encode() + + def is_binary_patched(self, executable_path=None): + executable_path = executable_path or self.executable_path + try: + with io.open(executable_path, "rb") as fh: + return fh.read().find(b"undetected chromedriver") != -1 + except FileNotFoundError: + return False + + def patch_exe(self): + start = time.perf_counter() + logger.info("patching driver executable %s" % self.executable_path) + with io.open(self.executable_path, "r+b") as fh: + content = fh.read() + # match_injected_codeblock = re.search(rb"{window.*;}", content) + match_injected_codeblock = re.search(rb"\{window\.cdc.*?;\}", content) + if match_injected_codeblock: + target_bytes = match_injected_codeblock[0] + new_target_bytes = ( + b'{console.log("undetected chromedriver 1337!")}'.ljust( + len(target_bytes), b" " + ) + ) + new_content = content.replace(target_bytes, new_target_bytes) + if new_content == content: + logger.warning( + "something went wrong patching the driver binary. 
def __del__(self):
    """
    Remove the patched driver binary when the patcher is garbage collected.

    Nothing is deleted when the binary was supplied by the user
    (``self._custom_exe_path``) or when ``self.user_multi_procs`` is set.
    Otherwise deletion is retried for up to 3 seconds, because the file may
    still be in use for a moment.
    """
    if self._custom_exe_path:
        # if the driver binary is specified by user
        # we assume it is important enough to not delete it
        return
    if self.user_multi_procs:
        # leave the binary in place in multi-process mode
        return

    timeout = 3  # stop trying after this many seconds
    deadline = time.monotonic() + timeout
    # The previous condition (`now() - t > timeout`) was false on the first
    # check, so the cleanup loop never executed at all.
    while time.monotonic() < deadline:
        # we don't want to wait until the end of time
        try:
            os.unlink(self.executable_path)
        except FileNotFoundError:
            # Must be handled before OSError (its parent class); otherwise a
            # missing file would be retried until the timeout expires.
            break
        except (OSError, RuntimeError):
            time.sleep(0.01)
        else:
            logger.debug("successfully unlinked %s" % self.executable_path)
            break
async def listen(self, poll_interval: float = 1):
    """
    Poll the driver's performance log and dispatch entries to handlers.

    Runs until ``self.running`` becomes false. Each round waits for
    ``self._wait_service_started()``, sleeps ``poll_interval`` seconds,
    reads the "performance" log and hands every message to the ``"*"``
    handler if one is registered, otherwise to the handler registered for
    the lower-cased method name. Handlers run in the loop's default
    executor. An exception while reading or dispatching abandons the rest
    of that batch and is logged at debug level ("invalid session id"
    errors are ignored silently).

    Parameters
    ----------
    poll_interval: float
        seconds to sleep between two reads of the performance log
    """
    while self.running:
        await self._wait_service_started()
        await asyncio.sleep(poll_interval)

        try:
            with self.lock:
                log_entries = self.driver.get_log("performance")

            for entry in log_entries:
                obj = json.loads(entry.get("message"))
                message = obj.get("message")
                # entries without a method name can only match "*"
                method = (message.get("method") or "").lower()

                if "*" in self.handlers:
                    await self.loop.run_in_executor(
                        None, self.handlers["*"], message
                    )
                elif method in self.handlers:
                    await self.loop.run_in_executor(
                        None, self.handlers[method], message
                    )

        except Exception as e:
            if "invalid session id" in str(e):
                pass
            else:
                # %-style placeholder is required: passing `e` as an extra
                # argument without one makes the log record fail to format
                logger.debug("exception ignored: %s", e)
class UCWebElement(WebElement):
    """
    Custom WebElement class which makes it easier to view elements when
    working in an interactive environment.

    ``repr(element)`` shows the class name followed by the tag name and the
    element's attributes, for example::

        UCWebElement <a class="nav" href="#">
    """

    def __init__(self, parent, id_):
        super().__init__(parent, id_)
        # attribute name -> value mapping; filled lazily by `attrs`
        self._attrs = None

    @property
    def attrs(self):
        """Return the element's attributes as a dict, fetched once and cached."""
        # `is None` (not truthiness): an element without attributes yields an
        # empty dict, which must count as cached rather than trigger a new
        # round trip to the browser on every access.
        if self._attrs is None:
            # `var index` keeps the loop counter local; an undeclared
            # assignment would create a global variable in the page.
            self._attrs = self._parent.execute_script(
                """
                var items = {};
                for (var index = 0; index < arguments[0].attributes.length; ++index)
                {
                    items[arguments[0].attributes[index].name] = arguments[0].attributes[index].value
                };
                return items;
                """,
                self,
            )
        return self._attrs

    def __repr__(self):
        strattrs = " ".join([f'{k}="{v}"' for k, v in self.attrs.items()])
        if strattrs:
            strattrs = " " + strattrs
        return f"{self.__class__.__name__} <{self.tag_name}{strattrs}>"
def create_proxy_extension(proxy: dict) -> str:
    """
    Write a temporary Chrome extension that routes traffic through *proxy*.

    The extension configures a fixed proxy server and answers proxy
    authentication challenges with the given credentials.

    :param proxy: dict with the keys ``url`` (e.g. ``http://host:3128``),
        ``username`` and ``password``
    :return: path of the newly created directory holding ``manifest.json``
        and ``background.js``; the caller is responsible for removing it
    :raises ValueError: if the URL has no port and none can be derived from
        its scheme
    """
    parsed_url = urllib.parse.urlparse(proxy['url'])
    scheme = parsed_url.scheme
    host = parsed_url.hostname
    port = parsed_url.port
    if port is None:
        # "%d" cannot format None; fall back to the scheme's usual port
        default_ports = {'http': 80, 'https': 443, 'socks4': 1080, 'socks5': 1080}
        if scheme not in default_ports:
            raise ValueError("Proxy URL %r does not specify a port" % proxy['url'])
        port = default_ports[scheme]
    username = proxy['username']
    password = proxy['password']
    manifest_json = """
    {
        "version": "1.0.0",
        "manifest_version": 3,
        "name": "Chrome Proxy",
        "permissions": [
            "proxy",
            "tabs",
            "storage",
            "webRequest",
            "webRequestAuthProvider"
        ],
        "host_permissions": [
            "<all_urls>"
        ],
        "background": {
            "service_worker": "background.js"
        },
        "minimum_chrome_version": "76.0.0"
    }
    """

    # json.dumps() yields quoted, escaped JavaScript string literals, so
    # quotes or backslashes in the credentials cannot break the script.
    background_js = """
    var config = {
        mode: "fixed_servers",
        rules: {
            singleProxy: {
                scheme: %s,
                host: %s,
                port: %d
            },
            bypassList: ["localhost"]
        }
    };

    chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});

    function callbackFn(details) {
        return {
            authCredentials: {
                username: %s,
                password: %s
            }
        };
    }

    chrome.webRequest.onAuthRequired.addListener(
        callbackFn,
        { urls: ["<all_urls>"] },
        ['blocking']
    );
    """ % (
        json.dumps(scheme),
        json.dumps(host),
        port,
        json.dumps(username),
        json.dumps(password)
    )

    proxy_extension_dir = tempfile.mkdtemp()

    with open(os.path.join(proxy_extension_dir, "manifest.json"), "w", encoding="utf-8") as f:
        f.write(manifest_json)

    with open(os.path.join(proxy_extension_dir, "background.js"), "w", encoding="utf-8") as f:
        f.write(background_js)

    return proxy_extension_dir
options.add_argument('--disable-popup-blocking') + options.add_argument('--disable-extensions') + options.add_argument('--disable-blink-features=AutomationControlled') + + # Force headless and invisibility + options.add_argument('--headless=new') + + IS_ARMARCH = platform.machine().startswith(('arm', 'aarch')) + if IS_ARMARCH: + options.add_argument('--disable-gpu-sandbox') + options.add_argument('--ignore-certificate-errors') + options.add_argument('--ignore-ssl-errors') + + language = os.environ.get('LANG', None) + if language is not None: + options.add_argument('--accept-lang=%s' % language) + + # Fix for Chrome 117 | https://github.com/FlareSolverr/FlareSolverr/issues/910 + if USER_AGENT is not None: + options.add_argument('--user-agent=%s' % USER_AGENT) + + proxy_extension_dir = None + if proxy and all(key in proxy for key in ['url', 'username', 'password']): + proxy_extension_dir = create_proxy_extension(proxy) + options.add_argument("--disable-features=DisableLoadExtensionCommandLineSwitch") + options.add_argument("--load-extension=%s" % os.path.abspath(proxy_extension_dir)) + elif proxy and 'url' in proxy: + proxy_url = proxy['url'] + logging.debug("Using webdriver proxy: %s", proxy_url) + options.add_argument('--proxy-server=%s' % proxy_url) + + # note: headless mode is detected (headless = True) + # we launch the browser in head-full mode with the window hidden + windows_headless = True if os.name == 'nt' else False + if get_config_headless(): + if os.name != 'nt': + start_xvfb_display() + + # Override for absolute invisibility on Windows + if os.name == 'nt': + options.add_argument('--hide-scrollbars') + options.add_argument('--disable-logging') + options.add_argument('--log-level=3') + + # if we are inside the Docker container, we avoid downloading the driver + driver_exe_path = None + version_main = None + if os.path.exists("/app/chromedriver"): + # running inside Docker + driver_exe_path = "/app/chromedriver" + else: + version_main = 
get_chrome_major_version() + if PATCHED_DRIVER_PATH is not None: + driver_exe_path = PATCHED_DRIVER_PATH + + # detect chrome path + browser_executable_path = get_chrome_exe_path() + + # CRITICAL: Clean up undetected_chromedriver cache on Windows to avoid WinError 183 + if os.name == 'nt': + try: + uc_path = os.path.join(os.environ.get('APPDATA', ''), 'undetected_chromedriver') + if os.path.exists(uc_path): + # Try to remove the file that usually causes WinError 183 + target_exe = os.path.join(uc_path, 'undetected_chromedriver.exe') + if os.path.exists(target_exe): + try: os.remove(target_exe) + except: pass + except: pass + + # downloads and patches the chromedriver + # if we don't set driver_executable_path it downloads, patches, and deletes the driver each time + try: + driver = uc.Chrome(options=options, browser_executable_path=browser_executable_path, + driver_executable_path=driver_exe_path, version_main=version_main, + windows_headless=windows_headless, headless=get_config_headless()) + except Exception as e: + logging.error("Error starting Chrome: %s" % e) + # No point in continuing if we cannot retrieve the driver + raise e + + # save the patched driver to avoid re-downloads + if driver_exe_path is None: + try: + target_path = os.path.join(driver.patcher.data_path, driver.patcher.exe_name) + if target_path != driver.patcher.executable_path: + # On Windows, we might get WinError 183 if the file is locked or exists + if os.path.exists(target_path): + try: os.remove(target_path) + except: pass + shutil.copy(driver.patcher.executable_path, target_path) + PATCHED_DRIVER_PATH = target_path + except Exception as e: + logging.warning(f"Failed to save patched driver: {e}") + + # clean up proxy extension directory + if proxy_extension_dir is not None: + shutil.rmtree(proxy_extension_dir) + + # selenium vanilla + # options = webdriver.ChromeOptions() + # options.add_argument('--no-sandbox') + # options.add_argument('--window-size=1920,1080') + # 
options.add_argument('--disable-setuid-sandbox') + # options.add_argument('--disable-dev-shm-usage') + # driver = webdriver.Chrome(options=options) + + return driver + + +def get_chrome_exe_path() -> str: + global CHROME_EXE_PATH + if CHROME_EXE_PATH is not None: + return CHROME_EXE_PATH + # linux pyinstaller bundle + chrome_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'chrome', "chrome") + if os.path.exists(chrome_path): + if not os.access(chrome_path, os.X_OK): + raise Exception(f'Chrome binary "{chrome_path}" is not executable. ' + f'Please, extract the archive with "tar xzf ".') + CHROME_EXE_PATH = chrome_path + return CHROME_EXE_PATH + # windows pyinstaller bundle + chrome_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'chrome', "chrome.exe") + if os.path.exists(chrome_path): + CHROME_EXE_PATH = chrome_path + return CHROME_EXE_PATH + # system + CHROME_EXE_PATH = uc.find_chrome_executable() + return CHROME_EXE_PATH + + +def get_chrome_major_version() -> str: + global CHROME_MAJOR_VERSION + if CHROME_MAJOR_VERSION is not None: + return CHROME_MAJOR_VERSION + + if os.name == 'nt': + # Example: '104.0.5112.79' + try: + complete_version = extract_version_nt_executable(get_chrome_exe_path()) + except Exception: + try: + complete_version = extract_version_nt_registry() + except Exception: + # Example: '104.0.5112.79' + complete_version = extract_version_nt_folder() + else: + chrome_path = get_chrome_exe_path() + process = os.popen(f'"{chrome_path}" --version') + # Example 1: 'Chromium 104.0.5112.79 Arch Linux\n' + # Example 2: 'Google Chrome 104.0.5112.79 Arch Linux\n' + complete_version = process.read() + process.close() + + CHROME_MAJOR_VERSION = complete_version.split('.')[0].split(' ')[-1] + return CHROME_MAJOR_VERSION + + +def extract_version_nt_executable(exe_path: str) -> str: + import pefile + pe = pefile.PE(exe_path, fast_load=True) + pe.parse_data_directories( + 
directories=[pefile.DIRECTORY_ENTRY["IMAGE_DIRECTORY_ENTRY_RESOURCE"]] + ) + return pe.FileInfo[0][0].StringTable[0].entries[b"FileVersion"].decode('utf-8') + + +def extract_version_nt_registry() -> str: + stream = os.popen( + 'reg query "HKLM\\SOFTWARE\\Wow6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\Google Chrome"') + output = stream.read() + google_version = '' + for letter in output[output.rindex('DisplayVersion REG_SZ') + 24:]: + if letter != '\n': + google_version += letter + else: + break + return google_version.strip() + + +def extract_version_nt_folder() -> str: + # Check if the Chrome folder exists in the x32 or x64 Program Files folders. + for i in range(2): + path = 'C:\\Program Files' + (' (x86)' if i else '') + '\\Google\\Chrome\\Application' + if os.path.isdir(path): + paths = [f.path for f in os.scandir(path) if f.is_dir()] + for path in paths: + filename = os.path.basename(path) + pattern = r'\d+\.\d+\.\d+\.\d+' + match = re.search(pattern, filename) + if match and match.group(): + # Found a Chrome version. + return match.group(0) + return '' + + +def get_user_agent(driver=None) -> str: + global USER_AGENT + if USER_AGENT is not None: + return USER_AGENT + + try: + if driver is None: + driver = get_webdriver() + USER_AGENT = driver.execute_script("return navigator.userAgent") + # Fix for Chrome 117 | https://github.com/FlareSolverr/FlareSolverr/issues/910 + USER_AGENT = re.sub('HEADLESS', '', USER_AGENT, flags=re.IGNORECASE) + return USER_AGENT + except Exception as e: + raise Exception("Error getting browser User-Agent. 
" + str(e)) + finally: + if driver is not None: + if PLATFORM_VERSION == "nt": + driver.close() + driver.quit() + + +def start_xvfb_display(): + global XVFB_DISPLAY + if XVFB_DISPLAY is None: + from xvfbwrapper import Xvfb + XVFB_DISPLAY = Xvfb() + XVFB_DISPLAY.start() + + +def object_to_dict(_object): + json_dict = json.loads(json.dumps(_object, default=lambda o: o.__dict__)) + # remove hidden fields + return {k: v for k, v in json_dict.items() if not k.startswith('__')} diff --git a/keep_alive.py b/keep_alive.py new file mode 100644 index 0000000000000000000000000000000000000000..00ace6dbf5cc709f6989a1324755af30602d32e1 --- /dev/null +++ b/keep_alive.py @@ -0,0 +1,47 @@ +""" +Keep-Alive Service to prevent Render.com from sleeping +Pings the server every 10 minutes to maintain activity +""" +import asyncio +import httpx +import logging +from datetime import datetime + +logger = logging.getLogger("keep_alive") + +class KeepAliveService: + def __init__(self, base_url: str = "http://localhost:7860"): + self.base_url = base_url + self.running = False + self.ping_interval = 600 # 10 minutes + + async def start(self): + """Start the keep-alive service""" + self.running = True + logger.info("🔄 Keep-Alive service started (pinging every 10 minutes)") + + while self.running: + try: + await asyncio.sleep(self.ping_interval) + await self._ping() + except Exception as e: + logger.error(f"Keep-Alive error: {e}") + + async def _ping(self): + """Send a ping to keep the service alive""" + try: + async with httpx.AsyncClient(timeout=10.0) as client: + response = await client.get(f"{self.base_url}/health") + if response.status_code == 200: + logger.info(f"✅ Keep-Alive ping successful at {datetime.now().strftime('%H:%M:%S')}") + else: + logger.warning(f"⚠️ Keep-Alive ping returned {response.status_code}") + except Exception as e: + logger.warning(f"Keep-Alive ping failed: {e}") + + def stop(self): + """Stop the keep-alive service""" + self.running = False + logger.info("Keep-Alive 
class MemoryCache:
    """
    Small in-process cache with a per-entry time-to-live and a size limit.

    Entries are stored in ``self._cache`` as ``key -> (expire_time, data)``.
    When the cache is full, expired entries are purged first and then the
    entries closest to expiry are dropped, so user-controlled keys (such as
    search queries) cannot make it grow without bound.
    """

    def __init__(self, max_entries: int = 1024):
        """
        :param max_entries: maximum number of entries kept at once (>= 1)
        """
        self._cache = {}
        self._max_entries = max_entries

    def get(self, key: str):
        """Return the cached value for *key*, or None if absent or expired."""
        item = self._cache.get(key)
        if item is None:
            return None
        expire_time, data = item
        if time.time() < expire_time:
            return data
        # expired: forget the entry so it does not linger in memory
        self._cache.pop(key, None)
        return None

    def set(self, key: str, data, ttl_seconds: int = 600):  # Default 10 mins
        """Store *data* under *key* for *ttl_seconds* seconds."""
        now = time.time()
        if key not in self._cache and len(self._cache) >= self._max_entries:
            self._make_room(now)
        self._cache[key] = (now + ttl_seconds, data)

    def _make_room(self, now: float):
        """Free at least one slot: purge expired entries, then the soonest to expire."""
        expired = [k for k, (expire_time, _) in self._cache.items() if expire_time <= now]
        for k in expired:
            del self._cache[k]
        overflow = len(self._cache) - self._max_entries + 1
        if overflow > 0:
            by_expiry = sorted(self._cache, key=lambda k: self._cache[k][0])
            for k in by_expiry[:overflow]:
                del self._cache[k]
+@app.on_event("startup") +async def startup_event(): + await init_db() + logger.info("🚀 Database initialized and ready") + + # Detect if running on Hugging Face + is_hf = os.environ.get("SPACE_ID") is not None or os.environ.get("HF_SPACE") is not None + + if not is_hf: + # Start Keep-Alive service (only for non-HF environments) + asyncio.create_task(keep_alive.start()) + # Start Warm-up service + asyncio.create_task(warm_scraper()) + # Start Nitro Pre-fetch (Populates cache in background) + if hasattr(scraper, '_turbo_prefetch'): + asyncio.create_task(scraper._turbo_prefetch()) + logger.info("🔄 Background services activated") + else: + logger.info("🤗 Running on Hugging Face - Lightweight mode enabled") + # Just warm up the scraper without heavy pre-fetching + asyncio.create_task(warm_scraper()) + + +# Enable CORS for frontend +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) +app.add_middleware(GZipMiddleware, minimum_size=1000) + +@app.get("/") +async def root(): + return { + "status": "online", + "engine": "Nitro-Power Larooza Engine", + "engine_status": "WARM" if scraper._cookies_synced else "COLD", + "cached_keys": list(cache._cache.keys()) + } + +@app.get("/latest") +async def get_latest(page: int = 1): + cache_key = f"latest_{page}" + cached = cache.get(cache_key) + if cached: + return cached + + try: + items = await scraper.fetch_home(page=page) + if items: + cache.set(cache_key, items) + return items + except Exception as e: + logger.error(f"Error fetching latest: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.get("/category/{cat_id}") +async def get_category(cat_id: str, page: int = 1): + cache_key = f"cat_{cat_id}_{page}" + cached = cache.get(cache_key) + if cached: + return cached + + try: + items = await scraper.fetch_category(cat_id, page=page) + if items: + cache.set(cache_key, items) + return items + except Exception as e: + 
@app.get("/proxy/image")
async def proxy_image(url: str):
    """
    Fetch a remote image on behalf of the client, with a disk cache.

    Cached copies younger than one week are served from ``cache/images``
    below ``base_dir``; otherwise the image is downloaded, written to the
    cache and returned.

    :param url: absolute http(s) URL of the image (may be percent-encoded)
    :raises HTTPException: 400 if ``url`` is empty or not an http(s) URL
    :return: the image on success, otherwise a JSON error body (upstream
        status code, 504 on timeout, 500 on any other failure)
    """
    import hashlib
    from urllib.parse import urlsplit

    if not url:
        raise HTTPException(status_code=400, detail="URL is required")

    url = unquote(url)

    # Only http(s) can be fetched; reject anything else as a client error
    # instead of letting it surface as a 500 from the HTTP client.
    target = urlsplit(url)
    if target.scheme not in ("http", "https") or not target.netloc:
        raise HTTPException(status_code=400, detail="Only absolute http(s) URLs are supported")

    # --- Image Disk Cache ---
    cache_dir = os.path.join(base_dir, "cache", "images")
    os.makedirs(cache_dir, exist_ok=True)

    # The hash only derives a filename; it is not used for security
    url_hash = hashlib.md5(url.encode()).hexdigest()
    cache_path = os.path.join(cache_dir, f"{url_hash}.img")

    response_headers = {
        "Cache-Control": "public, max-age=31536000",
        "X-Content-Type-Options": "nosniff",
    }

    # 1. Check if cached (entries older than 1 week are fetched again)
    if os.path.exists(cache_path) and time.time() - os.path.getmtime(cache_path) < 604800:
        return FileResponse(
            cache_path,
            media_type="image/jpeg",  # Approximate, browser will handle
            headers=response_headers,
        )

    try:
        # Using follow_redirects and a longer timeout for images
        async with httpx.AsyncClient(timeout=20.0, follow_redirects=True) as client:
            resp = await client.get(url, headers={"User-Agent": scraper.headers["User-Agent"]})
            if resp.status_code != 200:
                logger.warning(f"Failed to proxy image {url} (Status: {resp.status_code})")
                return JSONResponse(status_code=resp.status_code, content={"error": f"Failed (Status {resp.status_code})"})

            # Save to cache
            content = resp.content
            with open(cache_path, "wb") as f:
                f.write(content)

            # Forward the upstream type only when it is an image type.
            # Anything else (e.g. text/html) is labelled image/jpeg, like the
            # cached path, so proxied markup is never rendered in this origin.
            content_type = resp.headers.get("Content-Type", "image/jpeg")
            if not content_type.lower().startswith("image/"):
                content_type = "image/jpeg"

            return StreamingResponse(
                io.BytesIO(content),
                media_type=content_type,
                headers=response_headers,
            )
    except httpx.TimeoutException:
        logger.warning(f"Timeout proxying image: {url}")
        return JSONResponse(status_code=504, content={"error": "Image timeout"})
    except Exception as e:
        logger.error(f"Proxy image error for {url}: {type(e).__name__} - {str(e)}")
        return JSONResponse(status_code=500, content={"error": str(e)})
(IP-bound or strict hotlink protection) + proxy_domains = [ + "googlevideo.com", + "manifest.googlevideo.com", + "larozavideo.net", + "larooza.site", + "larooza.mom", + "laroza-tv.net", + "youtube.com", + "youtu.be" + ] + + should_proxy = any(domain in url for domain in proxy_domains) + + if should_proxy: + logger.info(f"🛡️ Proxying download: {filename[:50]}...") + + # Clean filename for the ASCII part of Content-Disposition + # Remove non-ASCII characters for the fallback filename + ascii_filename = re.sub(r'[^\x00-\x7F]+', '_', filename) + encoded_filename = quote(filename) + + async def stream_generator(): + async with httpx.AsyncClient(timeout=None, follow_redirects=True) as client: + try: + async with client.stream("GET", url, headers={"User-Agent": scraper.headers["User-Agent"]}) as resp: + if resp.status_code != 200: + logger.error(f"Proxy source returned {resp.status_code}") + return + + # We can't easily set Content-Length here because StreamingResponse + # starts before we have all chunks, but we can set it in the outer response + async for chunk in resp.aiter_bytes(chunk_size=1024*1024): + yield chunk + except Exception as e: + logger.error(f"Streaming error: {e}") + + # Get initial headers to find content length/type if possible + try: + async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client: + head_resp = await client.head(url, headers={"User-Agent": scraper.headers["User-Agent"]}) + content_length = head_resp.headers.get("Content-Length") + content_type = head_resp.headers.get("Content-Type", "video/mp4") + except: + content_length = None + content_type = "video/mp4" + + headers = { + "Content-Disposition": f"attachment; filename=\"{ascii_filename}\"; filename*=UTF-8''{encoded_filename}", + "Access-Control-Expose-Headers": "Content-Disposition" + } + if content_length: + headers["Content-Length"] = content_length + + return StreamingResponse(stream_generator(), media_type=content_type, headers=headers) + + # For other sources, a 
@app.get("/{full_path:path}")
async def serve_frontend(full_path: str):
    """
    Serve the built frontend: a static file if one matches, else index.html.

    Paths that look like backend routes return a JSON 404 instead of the
    single-page app, and only files located inside ``frontend_path`` are
    ever returned.

    :param full_path: request path without the leading slash
    """
    # Prevent infinite recursion for API routes if someone hits a wrong URL
    if full_path.startswith(("api/", "latest", "category/", "search", "details", "proxy", "download", "health")):
        return JSONResponse(status_code=404, content={"error": "Not Found"})

    # Resolve ".." segments and symlinks, then require the result to stay
    # inside the frontend directory. Without this check a path such as
    # "../../etc/passwd" or an absolute path would be read from disk.
    frontend_root = os.path.realpath(frontend_path)
    file_path = os.path.realpath(os.path.join(frontend_root, full_path))
    try:
        inside_root = os.path.commonpath([frontend_root, file_path]) == frontend_root
    except ValueError:
        # e.g. paths on different drives on Windows
        inside_root = False

    if inside_root and os.path.isfile(file_path):
        return FileResponse(file_path)
    # Otherwise, serve the main index.html for React Router to handle
    return FileResponse(os.path.join(frontend_path, "index.html"))
import asyncio
import httpx
import re
import logging
import base64
import random
import os
import time
from typing import List, Dict, Optional
from bs4 import BeautifulSoup
from curl_cffi.requests import AsyncSession
from urllib.parse import urljoin, quote, urlparse
from scraper.proxy_fetcher import proxy_fetcher

# The logger must exist before the optional-import block below, because the
# ImportError branch logs a warning. Configuring it afterwards made the module
# fail with NameError whenever Selenium was not installed.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("scraper")

# Optional dependencies for heavy bypasses
try:
    import undetected_chromedriver as uc
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    HAS_SELENIUM = True
except ImportError:
    HAS_SELENIUM = False
    logger.warning("⚠️ Selenium/Undetected-Chromedriver not installed. Nuclear bypass will be disabled.")
"arabic-series46", + "ramadan-2025": "13-ramadan-2025", + "ramadan-2024": "28-ramadan-2024", + "ramadan-2023": "10-ramadan-2023", + "english-series": "english-series10", + "turkish-series": "turkish-3isk-seriess47", + "indian-series": "11indian-series", + "tv-programs": "tv-programs12", + "plays": "masrh-5", + "anime-series": "6-anime-series", + "asian-series": "6-asya" + } + + def __init__(self): + # Primary fetcher: curl-cffi (Fastest, TLS Impersonation) + # Using chrome120 and disabling SSL verify for maximum compatibility + self.session = AsyncSession(impersonate="chrome120", timeout=30, verify=False) + self._cookies_synced = False + self._last_pw_solve = 0 + self._ua_synced = None + self._chrome_version = None + self._domain_lock = asyncio.Lock() + self._warming_lock = asyncio.Lock() + self._proxy_refresh_interval = 1800 # 30 minutes + self._proxy_refresh_time = 0 + self._semaphore = asyncio.Semaphore(5) # Reduced from 15 for stability + self._optimization_started = False + self._is_prefetching = False + self._domain_detected = False + + + # Hybrid Configuration + self.REMOTE_SOLVER_URL = "https://meih-movies-api.onrender.com/remote-fetch" + self.IS_RENDER = os.environ.get("RENDER") is not None + self.IS_HUGGINGFACE = os.environ.get("SPACE_ID") is not None + + # Free Proxy Pool for Hugging Face (to bypass IP bans) + self._free_proxy_pool = [] + self._proxy_pool_last_refresh = 0 + + self.headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8", + "Accept-Language": "ar,en-US;q=0.9,en;q=0.8", + "Accept-Encoding": "gzip, deflate, br", + "Referer": "https://www.google.com/", + "Connection": "keep-alive", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "cross-site", + } + self._session_initialized = False + self._session_warmed_at = 0 + 
self._httpx_client = None + + # --- Proxy Rotation System --- + proxy_str = os.getenv("PROXY_LIST", "") + self.proxies = [p.strip() for p in proxy_str.split(",") if p.strip()] + self._current_proxy_idx = 0 + if self.proxies: + logger.info(f"✓ Proxy rotation enabled with {len(self.proxies)} endpoints") + self._category_map = {} + self._last_discovery = 0 + self._discovery_lock = asyncio.Lock() + + # --- Mirror & Performance --- + self._cache = {} # {url: (timestamp, data)} + self._cache_ttl = 3600 # 1 hour for data + self._free_proxies = [] + self._optimization_started = False + self._uc_lock = asyncio.Lock() + self._solver_lock = asyncio.Lock() # Guard against multiple solvers + + # We'll start optimization on the first request to avoid "no running loop" error + + async def _optimize_connection(self): + """Find the fastest mirror and warm up the engine""" + # 1. Check if we already have a reasonably fresh fastest mirror + now = time.time() + if hasattr(self, '_fastest_mirror_detected_at') and now - self._fastest_mirror_detected_at < 3600: + return + + logger.info("🔍 Testing mirror speeds (Optimized)...") + + async def test_mirror(mirror): + try: + # very aggressive timeout for discovery + start = time.time() + test_url = f"{mirror}/newvideos1.php" + async with httpx.AsyncClient(timeout=1.5, follow_redirects=True, verify=False) as client: + resp = await client.get(test_url) + if resp.status_code == 200: + return (time.time() - start, mirror) + except: + pass + return (999, mirror) + + results = await asyncio.gather(*(test_mirror(m) for m in self.MIRRORS)) + results.sort() + + min_time, fastest_mirror = results[0] + + if min_time < 999: + logger.info(f"⚡ Fastest mirror: {fastest_mirror} ({min_time:.2f}s)") + self.BASE_URL = fastest_mirror + self.TARGET_URL = f"{fastest_mirror}/newvideos1.php" + self._fastest_mirror_detected_at = now + else: + logger.warning("⚠️ No mirrors responded quickly, using default.") + self._fastest_mirror_detected_at = now - 3300 # Retry 
sooner + + + async def _refresh_free_proxies(self): + """Fetch free proxies from public APIs (for cloud deployments)""" + # Enable on both Hugging Face and Render.com + if not (self.IS_HUGGINGFACE or self.IS_RENDER): + return + + now = time.time() + if now - self._proxy_pool_last_refresh < 300: # Refresh every 5 minutes + return + + logger.info("🔄 Refreshing free proxy pool...") + self._proxy_pool_last_refresh = now + + proxy_sources = [ + "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all", + "https://www.proxy-list.download/api/v1/get?type=http", + ] + + new_proxies = [] + for source in proxy_sources: + try: + async with httpx.AsyncClient(timeout=10.0) as client: + resp = await client.get(source) + if resp.status_code == 200: + proxies = resp.text.strip().split('\n') + for proxy in proxies[:10]: # Take first 10 from each source + proxy = proxy.strip() + if proxy and ':' in proxy: + new_proxies.append(f"http://{proxy}") + except Exception as e: + logger.warning(f"Failed to fetch proxies from {source}: {e}") + + if new_proxies: + self._free_proxy_pool = new_proxies + logger.info(f"✅ Loaded {len(new_proxies)} free proxies") + else: + logger.warning("⚠️ No free proxies available") + + async def _discover_categories(self, force=False): + """Build the category map dynamically from the homepage""" + async with self._discovery_lock: + if not force and time.time() - self._last_discovery < 3600: # Cache for 1 hour + return + + logger.info("Refreshing category mapping...") + html = await self._get_html(self.BASE_URL) + if not html: return + + soup = BeautifulSoup(html, 'html.parser') + new_map = {} + + # Find all category links + for a in soup.find_all('a', href=True): + href = a['href'] + if 'cat=' not in href: continue + + cat_id = href.split('cat=')[-1].split('&')[0] + text = a.get_text(strip=True).lower() + + # Match against keywords + for alias, keywords in self.CATEGORY_KEYWORDS.items(): + if alias not in 
new_map: + if any(k in text for k in keywords): + new_map[alias] = cat_id + + if new_map: + self._category_map = new_map + self._last_discovery = time.time() + logger.info(f"✓ Mapped {len(new_map)} categories: {new_map}") + + async def _resolve_cat_id(self, cat_id: str) -> str: + """Resolves an alias to a real ID, or returns the original if not an alias""" + await self._discover_categories() + # 1. Check dynamic map + if cat_id in self._category_map: + return self._category_map[cat_id] + + # 2. Check hardcoded fallbacks if dynamic failed + if cat_id in self.HARDCODED_FALLBACKS: + return self.HARDCODED_FALLBACKS[cat_id] + + return cat_id + + async def _warm_session(self): + """Warm up session with the detected working mirror""" + if not self._domain_detected: + # We already set defaults in __init__ / class, just confirm + logger.info(f"🚀 Targeting exclusive source: {self.TARGET_URL}") + self._domain_detected = True + + if not self._session_initialized: + self._session_initialized = True # Mark as init even if basic get fails, as PW will solve it + + async def _refresh_free_proxies(self): + """Refresh free proxy list if needed""" + if time.time() - self._proxy_refresh_time > self._proxy_refresh_interval: + logger.info("Refreshing free proxy pool...") + self._free_proxies = await proxy_fetcher.get_working_proxies(max_count=15) + self._proxy_refresh_time = time.time() + logger.info(f"Loaded {len(self._free_proxies)} working free proxies") + + def _get_proxy(self) -> Optional[str]: + # On cloud platforms (HF or Render), prioritize free proxy pool + if (self.IS_HUGGINGFACE or self.IS_RENDER) and self._free_proxy_pool: + proxy = self._free_proxy_pool[self._current_proxy_idx % len(self._free_proxy_pool)] + self._current_proxy_idx += 1 + return proxy + + # Try free proxies first (legacy proxy_fetcher) + if self._free_proxies: + proxy = self._free_proxies[self._current_proxy_idx % len(self._free_proxies)] + self._current_proxy_idx += 1 + return proxy + + # Fallback to 
configured proxies + if not self.proxies: return None + proxy = self.proxies[self._current_proxy_idx % len(self.proxies)] + self._current_proxy_idx += 1 + return proxy + + + async def _get_html_with_undetected_chrome(self, url: str) -> Optional[str]: + """The 'NUCLEAR Option': Undetected-Chromedriver with safety locks for Windows""" + if not HAS_SELENIUM: + logger.error("❌ Cannot use UC: Selenium/Undetected-Chromedriver not installed.") + return None + + async with self._uc_lock: + logger.info(f"💣 Launching Undetected-Chrome NUCLEAR Bypass for {url}...") + + def get_chrome_version(): + try: + import winreg + key = winreg.OpenKey(winreg.HKEY_CURRENT_USER, r'Software\Google\Chrome\BLBeacon') + version, _ = winreg.QueryValueEx(key, 'version') + return int(version.split('.')[0]) + except: + return 120 # Fallback + + if not self._chrome_version: + self._chrome_version = get_chrome_version() + + def chrome_task(): + driver = None + try: + options = uc.ChromeOptions() + options.add_argument('--headless') + options.add_argument('--no-sandbox') + options.add_argument('--disable-dev-shm-usage') + options.add_argument('--disable-gpu') + options.add_argument('--window-size=1280,1024') + options.add_argument('--mute-audio') + options.add_argument('--disable-notifications') + options.add_argument('--disable-popup-blocking') + options.add_argument('--hide-scrollbars') + options.add_argument('--disable-logging') + options.add_argument('--log-level=3') + options.add_argument('--no-first-run') + options.add_argument('--no-default-browser-check') + options.add_argument('--no-pings') + options.add_argument('--disable-blink-features=AutomationControlled') + + # Disable images for maximum speed + prefs = { + 'profile.managed_default_content_settings.images': 2, + 'profile.default_content_settings.images': 2 + } + options.add_experimental_option('prefs', prefs) + + driver = uc.Chrome(options=options, version_main=self._chrome_version) + driver.set_page_load_timeout(60) + + 
logger.info(f"💣 UC Fetching: {url}") + driver.get(url) + + # Wait for either content or challenge + time.sleep(10) # Heavy sleep for UC + + html = driver.page_source + + # Basic sync of UA + ua = driver.execute_script("return navigator.userAgent") + if ua: + self.headers["User-Agent"] = ua + + return html + except Exception as e: + logger.error(f"Undetected-Chrome failure: {e}") + return None + finally: + if driver: + try: driver.quit() + except: pass + + loop = asyncio.get_event_loop() + return await loop.run_in_executor(None, chrome_task) + + async def _get_html_with_flaresolverr(self, url: str) -> Optional[str]: + """FlareSolverr with Singleton Lock to avoid browser bloat""" + async with self._solver_lock: + # Re-check cache inside lock + if url in self._cache: + return self._cache[url][1] + + logger.info(f"✨ Requesting FlareSolverr solve for {url}...") + + flaresolverr_url = "http://localhost:8191/v1" + payload = { + "cmd": "request.get", + "url": url, + "maxTimeout": 60000 + } + + # Connection Retry Loop + max_conn_retries = 5 # Increased retries + for conn_attempt in range(max_conn_retries): + try: + async with httpx.AsyncClient(timeout=90.0) as client: + response = await client.post(flaresolverr_url, json=payload) + if response.status_code == 200: + data = response.json() + if data.get('status') == 'ok': + solution = data.get('solution', {}) + html = solution.get('response', '') + + # SYNCING LOGIC + cookies = solution.get('cookies', []) + ua = solution.get('userAgent', '') + if ua: + self._ua_synced = ua + self.headers["User-Agent"] = ua + + for cookie in cookies: + # Ensure domain is set for proper cookie handling + domain = cookie.get('domain') + if not domain and url: + try: + domain = urlparse(url).netloc + if domain.startswith('www.'): + domain = domain[4:] + except: + pass + + if domain: + self.session.cookies.set( + cookie['name'], + cookie['value'], + domain=domain, + path=cookie.get('path', '/'), + secure=cookie.get('secure', False), + 
expires=cookie.get('expires') + ) + + self._cookies_synced = True + self._last_pw_solve = time.time() + logger.info("✅ Session Synced!") + return html + else: + logger.warning(f"FlareSolverr error: {data.get('message')}") + else: + logger.warning(f"FlareSolverr returned status {response.status_code}") + except Exception as e: + if conn_attempt < max_conn_retries - 1: + logger.warning(f"FlareSolverr comm failed (attempt {conn_attempt+1}/{max_conn_retries}): {e}. Retrying...") + await asyncio.sleep(2) + else: + logger.error(f"FlareSolverr comm failed after {max_conn_retries} attempts: {e}") + return None + + async def _turbo_prefetch(self): + """Pre-fetch all major categories in parallel to populate cache instantly""" + if self._is_prefetching: return + self._is_prefetching = True + logger.info("🚀 NITRO MODE: Starting concurrent background pre-fetch...") + + try: + # List of high-priority tasks + tasks = [self.fetch_home(page=1)] + + # Map of key categories to pre-warm + priority_cats = list(self.CATEGORY_KEYWORDS.keys())[:15] + for cat_id in priority_cats: + tasks.append(self.fetch_category(cat_id, page=1)) + + # Run everything in parallel with semaphore protection + await asyncio.gather(*tasks, return_exceptions=True) + logger.info(f"⚡ NITRO MODE complete! Cache primed with {len(self._cache)} items.") + except Exception as e: + logger.error(f"Nitro pre-fetch failed: {e}") + finally: + self._is_prefetching = False + + async def _get_html(self, url: str, max_retries: int = 1, follow_meta=True) -> Optional[str]: + """Nitro-Speed Fetch with Parallel Safety""" + if not self._optimization_started: + self._optimization_started = True + asyncio.create_task(self._optimize_connection()) + + async with self._semaphore: + now = time.time() + + # 0. 
Cache Check + if url in self._cache: + ts, data = self._cache[url] + if now - ts < self._cache_ttl: + return data + + # Sanitize URL - Skip landing pages + if any(x in url for x in ["/gaza.20", "/gaza.18", "/gaza.22"]): + logger.info(f"Sanitizing landing page URL: {url} -> {self.TARGET_URL}") + url = self.TARGET_URL + + # Refresh free proxies if on cloud platforms + if self.IS_HUGGINGFACE or self.IS_RENDER: + await self._refresh_free_proxies() + + proxy = self._get_proxy() + proxy_dict = {"http": proxy, "https": proxy} if proxy else None + + # 1. Nitro Path (curl-cffi) + logger.info(f"🚀 Nitro Path (curl-cffi) for {url}") + try: + # Increased timeout to 45s to handle extremely slow responses + resp = await self.session.get(url, headers=self.headers, timeout=45, proxies=proxy_dict) + status_code = resp.status_code + logger.info(f"📡 Nitro Path response: {status_code} ({len(resp.content)} bytes)") + + if status_code == 200: + text = resp.text + # Improve Meta Refresh detection (Larooza uses this heavily for domain rotation) + refresh_match = re.search(r'http-equiv=["\']refresh["\'].*?content=["\']\d+;\s*url=(.*?)["\']', text, re.I) + if not refresh_match: + refresh_match = re.search(r'content=["\']\d+;\s*url=(.*?)["\']', text, re.I) + + if refresh_match and follow_meta: + new_url_raw = refresh_match.group(1).strip("'\" ") + new_url = urljoin(url, new_url_raw) + + # Preserve query parameters if the new URL doesn't have them but the old one did + if "?" not in new_url and "?" 
in url: + query = url.split("?")[-1] + new_url = f"{new_url}?{query}" if not new_url.endswith("?") else f"{new_url}{query}" + + # If redirecting to a known landing page or ad-trap, skip it + if any(x in new_url for x in ["gaza.20", "gaza.18", "gaza.22", "gaza.24"]): + logger.info(f"🚫 Skipping ad-trap redirect: {new_url}") + new_url = self.TARGET_URL + + logger.info(f"🔄 Following meta refresh to: {new_url}") + return await self._get_html(new_url, max_retries=max_retries, follow_meta=False) + + # More robust Cloudflare & Landing Page detection + text_lower = text.lower() + cf_markers = ["challenge-running", "cf-ray", "cloudflare-static", "just a moment", "verify you are human", "checking your browser"] + is_cf = any(x in text_lower for x in cf_markers) or "id=\"challenge-form\"" in text_lower + + # Detect landing page even if 200 OK (gaza.20 redirect in JS or Meta) + is_landing = "gaza.20" in text_lower or "gaza.18" in text_lower or "gaza.22" in text_lower + + if is_cf: + logger.warning(f"⚠️ Cloudflare detected in Nitro response for {url}") + elif is_landing and follow_meta: + logger.info(f"🔄 Landing page detected in content for {url}, forcing target...") + return await self._get_html(self.TARGET_URL, max_retries=max_retries, follow_meta=False) + else: + self._cache[url] = (now, text) + return text + elif status_code == 404: + logger.warning(f"⚠️ Nitro Path 404 for {url} on mirror {self.BASE_URL}") + # If this was a mirror, fallback to primary domain + primary_primary = self.MIRRORS[0] + if self.BASE_URL != primary_primary: + fallback_url = url.replace(self.BASE_URL, primary_primary) + logger.info(f"🔁 Falling back to primary domain: {fallback_url}") + return await self._get_html(fallback_url, max_retries=max_retries, follow_meta=True) + elif status_code == 403: + logger.warning(f"🚫 Nitro Path 403 for {url}, falling back to solvers...") + except Exception as e: + logger.error(f"❌ Nitro Path error for {url}: {e}") + + # 2. 
Solver Path + for att in range(max_retries): + # Use a specific lock for solver to prevent multiple concurrent solver requests for the same URL + # but allow different URLs in parallel. For simplicity, we use the existing semaphore and a small delay. + + # Check cache again just in case another task filled it + if url in self._cache: + return self._cache[url][1] + + html = await self._get_html_with_flaresolverr(url) + if html: + self._cache[url] = (now, html) + return html + + # UC Fallback for critical pages + if att == max_retries - 1: + logger.info(f"UC Fallback for: {url}") + res = await self._get_html_with_undetected_chrome(url) + if res: return res + + return None + + def _extract_items(self, soup: BeautifulSoup) -> List[Dict]: + """Ultra-Fast Content Extraction with Deep Image Probing""" + items = [] + if not soup: return [] + + if soup.title: + logger.info(f"Extracting: {soup.title.string}") + if "challenge" in str(soup.title).lower() or "cloudflare" in str(soup.title).lower(): + return [] + + # Ultra-Strong Coverage for all Larooza Variants & Mirrors + containers = soup.select('.thumbnail, .pm-li-video, .pm-video-thumb, .video-block, .movie-item, li.col-xs-6, .box, .video-box, .video-item, .post-item') + if not containers: + # Deep scan for any link that looks like a video + containers = soup.select('a[href*="video.php"], a[href*="watch.php"], .video-listing-content, .card-video') + + seen_urls = set() + for tag in containers: + # 1. Fast Link Detection + link = tag if (tag.name == 'a' and 'video.php' in tag.get('href', '')) else \ + (tag.select_one('a.ellipsis') or tag.find('a', href=lambda x: x and 'video.php' in x)) + + if not link: continue + href = link.get('href') + if not href: continue + + full_link = urljoin(self.BASE_URL, href) + if full_link in seen_urls: continue + seen_urls.add(full_link) + + # 2. 
Extract Title & Clean it + title_node = tag.select_one('h3, h2, .title, .ellipsis, .video-title, p') + title = title_node.get_text(strip=True) if title_node else "" + if not title and link: + title = link.get('title') or link.get_text(strip=True) + + # Clean Title (Remove noisy tags for premium look) + for t_tag in ["مشاهدة", "فيلم", "مسلسل", "كامل", "HDCAM", "HD", "WEB-DL", "Cam", "مترجم", "اون لاين", "مدبلج"]: + title = title.replace(t_tag, "").strip() + title = re.sub(r'\d{4}', '', title).strip("- ").strip() # Remove Year + + # 3. Deep Image Probing + img_node = tag.select_one('img') + img_url = "" + if img_node: + # Try all possible lazy-load attributes, prefer potential real URLs over base64 + candidates = [ + img_node.get('data-src'), + img_node.get('data-lazy-src'), + img_node.get('data-original'), + img_node.get('srcset'), + img_node.get('src') + ] + for c in candidates: + if c and not c.startswith('data:'): + # Ensure it's a real URL + if c.startswith('http') or c.startswith('//') or c.startswith('/'): + img_url = c + break + + # If still no image, try to find ANY attribute that looks like a URL + if not img_url: + for attr, val in img_node.attrs.items(): + if isinstance(val, str) and (val.startswith('http') or '.jpg' in val or '.png' in val) and not val.startswith('data:'): + img_url = val + break + + if img_url and "," in img_url: # Handle srcset + img_url = img_url.split(",")[0].split(" ")[0] + + # Fallback: Check for background-image in style + if not img_url: + style = tag.get('style') or "" + if 'background-image' in style: + m = re.search(r'url\([\'"]?(.*?)[\'"]?\)', style) + if m: + img_url = m.group(1) + + if not img_url or img_url.startswith('data:'): + img_url = "https://placehold.co/600x400/000000/FFFFFF?text=No+Poster" + + # Absolute URL correction + if img_url.startswith('//'): img_url = 'https:' + img_url + elif img_url.startswith('/'): img_url = self.BASE_URL + img_url + + # Proxy through our backend for stability + poster = 
f"/proxy/image?url={quote(img_url)}" + + # 4. Speed-optimized Series Detection + lt = title.lower() + content_type = "series" if any(x in lt for x in ['حلقة', 'مسلسل', 'episode', 'season', 'series']) else "movie" + + items.append({ + "id": base64.urlsafe_b64encode(full_link.encode()).decode(), + "title": title, + "poster": poster, + "type": content_type, + "duration": tag.select_one('.duration, .pm-label-duration, .time').get_text(strip=True) if tag.select_one('.duration, .pm-label-duration, .time') else "" + }) + return items + + async def fetch_home(self, page: int = 1) -> List[Dict]: + target = f"{self.TARGET_URL}?page={page}" + html = await self._get_html(target, max_retries=3) + if not html: + logger.error(f"Failed to fetch home page: {target}") + return [] + + items = self._extract_items(BeautifulSoup(html, 'html.parser')) + logger.info(f"Fetched {len(items)} items from {target}") + return items + + async def fetch_category(self, cat_id: str, page: int = 1) -> List[Dict]: + resolved_id = await self._resolve_cat_id(cat_id) + target = f"{self.BASE_URL}/category.php?cat={resolved_id}&page={page}" + html = await self._get_html(target, max_retries=3) + return self._extract_items(BeautifulSoup(html, 'html.parser')) if html else [] + + def _normalize_number(self, text: str) -> int: + """Extract episode number from Arabic/English text""" + # Arabic number words mapping + arabic_map = { + 'الأولى': 1, 'الاولى': 1, 'الثانية': 2, 'الثالثة': 3, 'الرابعة': 4, + 'الخامسة': 5, 'السادسة': 6, 'السابعة': 7, 'الثامنة': 8, 'التاسعة': 9, + 'العاشرة': 10, 'الحادية': 11, 'الثانية عشر': 12, 'الثالثة عشر': 13, + 'الرابعة عشر': 14, 'الخامسة عشر': 15, 'السادسة عشر': 16, 'السابعة عشر': 17, + 'الثامنة عشر': 18, 'التاسعة عشر': 19, 'العشرون': 20, 'الاخيرة': 999 + } + + # Try to find numeric digits first (most reliable) + match = re.search(r'(\d+)', text) + if match: + return int(match.group(1)) + + # Try Arabic number words + text_lower = text.lower() + for arabic_word, num in 
arabic_map.items(): + if arabic_word in text_lower: + return num + + # Try to extract from patterns like "الحلقة X" or "Episode X" + patterns = [ + r'(?:الحلقة|حلقة|episode|ep)\s*[:\-]?\s*(\d+)', + r'(\d+)\s*(?:الحلقة|حلقة|episode|ep)', + ] + for pattern in patterns: + match = re.search(pattern, text_lower) + if match: + return int(match.group(1)) + + return 0 + + def _safe_get_episode(self, text: str, name_hint: str = None) -> int: + """Smarter episode number extraction with common patterns""" + # Remove common noise + clean = re.sub(r'\(.*?\)', '', text) + clean = re.sub(r'\[.*?\]', '', clean) + + if name_hint: + # Remove the series name from the text to avoid matching numbers in the title (e.g. "2 قهوة") + clean = clean.replace(name_hint, "").strip() + + # 1. Look for number after keywords (Most reliable) + m = re.search(r'(?:الحلقة|حلقة|ep|episode|part|p)\s*(\d+)', clean, re.I) + if m: return int(m.group(1)) + + # 2. Direct digits (Fallback) + m = re.search(r'(\d+)', clean) + if m: return int(m.group(1)) + + # 3. 
Word matches + return self._normalize_number(clean) + + async def search(self, query: str) -> List[Dict]: + url = f"{self.BASE_URL}/search.php?keywords={quote(query)}" + html = await self._get_html(url, max_retries=2) + return self._extract_items(BeautifulSoup(html, 'html.parser')) if html else [] + + async def fetch_details(self, safe_id: str) -> Dict: + try: + url = base64.urlsafe_b64decode(safe_id).decode() + except: return {} + + html = await self._get_html(url) + if not html: return {} + + soup = BeautifulSoup(html, 'html.parser') + + # Follow play.php for watch servers + watch_html = html + watch_soup = soup + play_a = soup.select_one('a[href*="play.php"]') + if play_a: + p_url = urljoin(self.BASE_URL, play_a.get('href')) + p_html = await self._get_html(p_url) + if p_html: + watch_soup = BeautifulSoup(p_html, 'html.parser') + watch_html = p_html + + title = soup.find('h1').get_text(strip=True) if soup.find('h1') else "Unknown" + is_series = bool(soup.select('.episodes-list, .season-episodes, .vid-episodes')) or any(x in title for x in ["حلقة", "مسلسل", "الموسم"]) + + raw_poster = soup.select_one('meta[property="og:image"]')['content'] if soup.select_one('meta[property="og:image"]') else "" + if not raw_poster: + img_tag = soup.select_one('.poster img, .movie-poster img, .pm-video-watch-main img') + if img_tag: + raw_poster = img_tag.get('src') or img_tag.get('data-src') + + poster = "" + if raw_poster: + full_poster_url = urljoin(self.BASE_URL, raw_poster) + poster = f"/proxy/image?url={quote(full_poster_url)}" + + response = { + "id": safe_id, "title": title, + "description": soup.select_one('.story, .desc, .entry-content').get_text(strip=True) if soup.select_one('.story, .desc, .entry-content') else "", + "poster": poster, + "type": "series" if is_series else "movie", + "seasons": [], "episodes": [], "servers": [], "download_links": [] + } + + # --- Episodes --- + if is_series: + unique_eps = {} + + # 1. 
Proactive Search: Look for a "Series Category" link + cat_link = None + + # A. Check Breadcrumbs (Very reliable for series category) + for bc in soup.select('.breadcrumb a, .bread-crumb a, .breadcrumbs a, .pm-breadcrumb a'): + href = bc.get('href') + if href and ('cat=' in href or 'ser=' in href): + # Skip generic high-level categories if possible? + # Actually, we filter by title later, so it's okay. + cat_link = urljoin(self.BASE_URL, href) + if 'ser=' in href: # Prefer ser= over cat= + break + + # Extract clean series name for filtering + clean_title = title.replace("مسلسل", "").strip() + # Try to get name before "الحلقة" or "المواسم" + series_name = re.split(r'الحلقة|الموسم|حلقة|season|episode', clean_title, flags=re.I)[0].strip() + # Arabic numeral support for filtering + series_name_alt = series_name.replace('0','٠').replace('1','١').replace('2','٢').replace('3','٣').replace('4','٤').replace('5','٥').replace('6','٦').replace('7','٧').replace('8','٨').replace('9','٩') + + logger.info(f"Targeting series name: {series_name} (Alt: {series_name_alt})") + + # B. Check if Title itself is a link to the category or series + if not cat_link: + title_link = soup.select_one('h1 a[href*="cat="], h1 a[href*="ser="], h1 a[href*="tag.php"]') + if title_link: + cat_link = urljoin(self.BASE_URL, title_link['href']) + + # C. General search in links with strict patterns + if not cat_link: + for a in soup.find_all('a', href=True): + href = a['href'] + a_text = a.get_text(strip=True) + # High-confidence patterns + if any(x in a_text for x in ["المسلسل:", "جميع الحلقات", "حلقات المسلسل", "كل الحلقات"]): + cat_link = urljoin(self.BASE_URL, href) + logger.info(f"Found cat_link via labels: {cat_link}") + break + + # D. 
Fallback search by title + if not cat_link: + for a in soup.find_all('a', href=True): + href = a['href'] + if any(x in href for x in ['ser=', 'cat=', 'tag.php']): + a_text = a.get_text(strip=True) + if (series_name and series_name in a_text) or (series_name_alt and series_name_alt in a_text): + cat_link = urljoin(self.BASE_URL, href) + logger.info(f"Found cat_link via fallback title search: {cat_link}") + break + + if cat_link: + try: + # Determine type: view-serie.php, category.php, tag.php + is_view_serie = 'view-serie' in cat_link + param_name = 'ser' if is_view_serie else ('t' if 'tag.php' in cat_link else 'cat') + + # Robust ID extraction + match = re.search(f'[?&]{param_name}=([^&]+)', cat_link) + if match: + cat_id = match.group(1) + base_deep_url = f"{self.BASE_URL}/tag.php?t={cat_id}" if param_name == 't' else \ + (f"{self.BASE_URL}/view-serie.php?ser={cat_id}" if is_view_serie else \ + f"{self.BASE_URL}/category.php?cat={cat_id}") + + logger.info(f"Deep scraping episodes from {cat_link} (ID: {cat_id})") + # Fetch first 5 pages + for p in range(1, 6): + target_p = f"{base_deep_url}&page={p}" if p > 1 else base_deep_url + p_html = await self._get_html(target_p) + if not p_html: break + p_items = self._extract_items(BeautifulSoup(p_html, 'html.parser')) + + if not p_items: break + for item in p_items: + # Filter Check: Use a fuzzy name match + i_title = item['title'] + # Must match at least the first 2 words if possible, or the whole name + name_parts = series_name.split() + match_key = " ".join(name_parts[:2]) if len(name_parts) >= 2 else series_name + + if match_key in i_title or series_name in i_title or series_name_alt in i_title: + e_num = self._safe_get_episode(i_title, name_hint=series_name) + if e_num and e_num not in unique_eps: + unique_eps[e_num] = { + "id": item['id'], + "episode": e_num, + "title": i_title + } + if len(p_items) < 10: break + except Exception as e: + logger.error(f"Category episode fetch failed: {e}") + + # 2. 
Local fallback: Scrape episodes from the current page + for ep in soup.select('.episodes-list a, .season-episodes a, .vid-episodes a, ul.episodes li a, div.caption h3 a, .movie-item a, .related-vids a'): + ep_href = ep.get('href') + if not ep_href or 'video.php' not in ep_href: continue + ep_url = urljoin(self.BASE_URL, ep_href) + ep_text = ep.get_text(strip=True) + + # If text is empty, check for nested title + if not ep_text: + inner = ep.find(['h3', 'span', 'strong']) + if inner: ep_text = inner.get_text(strip=True) + + # CRITICAL FILTER: Item must belong to this series + if series_name and series_name not in ep_text: + continue + + ep_num = self._safe_get_episode(ep_text, name_hint=series_name) + if ep_num and ep_num not in unique_eps: + unique_eps[ep_num] = { + "id": base64.urlsafe_b64encode(ep_url.encode()).decode(), + "episode": ep_num, + "title": ep_text + } + + response['episodes'] = sorted(list(unique_eps.values()), key=lambda x: x['episode']) + response['seasons'] = [{"number": 1, "episodes": response['episodes']}] + + # --- WATCH SERVERS --- + watch_urls = set() + + def is_valid_srv(url_str: str) -> bool: + if not url_str or 'javascript' in url_str: return False + if 'larooza' in url_str and 'video.php' in url_str: return False + if any(x in url_str.lower() for x in ['beacon', 'analytics', 'pixel', 'ads.', 'google', 'facebook']): return False + return True + + # 1. 
Primary: WatchList & Source tags + server_selectors = [ + 'ul.WatchList li', '.server-list li', '#servers li', '.watch-servers li', + '.video-servers-list li', 'div.servers a', '.player-servers li' + ] + + for sel in server_selectors: + for li in watch_soup.select(sel): + s_url = li.get('data-embed-url') or li.get('data-link') or li.get('data-embed') or li.get('data-src') or li.get('data-url') + if not s_url: + a_tag = li.find('a', href=True) + if a_tag and not a_tag['href'].startswith('javascript'): + s_url = a_tag['href'] + + if s_url and is_valid_srv(s_url): + if s_url.startswith('//'): s_url = "https:" + s_url + full_s_url = urljoin(self.BASE_URL, s_url) + if full_s_url not in watch_urls: + watch_urls.add(full_s_url) + name = li.get_text(strip=True) or f"سيرفر {len(response['servers']) + 1}" + response['servers'].append({"name": name, "url": full_s_url, "type": "iframe"}) + + # 2. Secondary: Deep Iframe Scan + for ifr in watch_soup.select('iframe[src], embed[src], video source[src]'): + src = ifr.get('src') + if is_valid_srv(src): + if src.startswith('//'): src = "https:" + src + full_s_url = urljoin(self.BASE_URL, src) + if full_s_url not in watch_urls: + watch_urls.add(full_s_url) + response['servers'].append({"name": f"سيرفر سريع {len(response['servers']) + 1}", "url": full_s_url, "type": "iframe"}) + + # 3. 
logger = logging.getLogger("proxy_fetcher")


class FreeProxyFetcher:
    """Fetch free HTTP proxies from public list APIs and probe which ones work.

    ``proxies`` holds the ``http://host:port`` URLs from the latest fetch.
    ``last_fetch`` starts at 0; no method of this class updates it.
    """

    # Cap on entries accepted per source, and on candidates probed per
    # validation round (both used to be inline literals).
    PER_SOURCE_LIMIT = 20
    VALIDATION_BATCH = 30

    def __init__(self):
        self.proxies = []
        self.last_fetch = 0

    @staticmethod
    def _parse_proxy_lines(text):
        """Turn a newline-separated ``host:port`` listing into proxy URLs.

        Blank lines and lines that are not ``host:port`` (for example an HTML
        error page served with status 200) are dropped instead of being
        turned into bogus ``http://...`` entries.
        """
        urls = []
        for raw in text.splitlines():
            entry = raw.strip()
            host, sep, port = entry.rpartition(":")
            if not (sep and host and port.isdigit()):
                continue
            if "/" in host or " " in host:
                continue
            urls.append(f"http://{entry}")
        return urls

    async def fetch_free_proxies(self):
        """Download proxy lists from the public sources and store them.

        Returns the de-duplicated list that is also assigned to
        ``self.proxies``. A source that fails or answers with a non-200
        status is logged and skipped.
        """
        proxy_sources = [
            "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all",
            "https://www.proxy-list.download/api/v1/get?type=http",
        ]

        collected = []
        timeout = aiohttp.ClientTimeout(total=10)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            for source in proxy_sources:
                try:
                    async with session.get(source) as resp:
                        if resp.status != 200:
                            logger.warning(f"Proxy source {source} returned HTTP {resp.status}")
                            continue
                        text = await resp.text()
                except Exception as e:
                    # Best effort: one broken source must not abort the rest.
                    # CancelledError is a BaseException and still propagates.
                    logger.error(f"Failed to fetch from {source}: {e}")
                    continue

                proxies = self._parse_proxy_lines(text)
                collected.extend(proxies[:self.PER_SOURCE_LIMIT])  # Take first 20 from each source
                logger.info(f"Fetched {len(proxies)} proxies from {source}")

        # The sources can overlap; keep first-seen order without duplicates.
        self.proxies = list(dict.fromkeys(collected))
        logger.info(f"Total free proxies loaded: {len(self.proxies)}")
        return self.proxies

    async def validate_proxy(self, proxy, test_url="https://httpbin.org/ip"):
        """Return True if a GET to *test_url* through *proxy* answers 200.

        Any request failure yields False. Task cancellation is not swallowed.
        """
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(
                    test_url, proxy=proxy, timeout=aiohttp.ClientTimeout(total=5)
                ) as resp:
                    return resp.status == 200
        except Exception as e:
            logger.debug(f"Proxy {proxy} failed validation: {e}")
            return False

    async def get_working_proxies(self, max_count=10):
        """Probe the first candidates concurrently and return those that work.

        Fetches the proxy list first when none is loaded. At most
        *max_count* proxies are returned, in their original order.
        """
        if not self.proxies:
            await self.fetch_free_proxies()

        candidates = self.proxies[:self.VALIDATION_BATCH]
        results = await asyncio.gather(
            *(self.validate_proxy(p) for p in candidates),
            return_exceptions=True,
        )

        # gather() may hand back exception objects, which are truthy, so only
        # an explicit True counts as a working proxy.
        working = [p for p, ok in zip(candidates, results) if ok is True]
        working = working[:max(max_count, 0)]

        logger.info(f"Validated {len(working)} working proxies")
        return working


proxy_fetcher = FreeProxyFetcher()
+export PYTHONPATH=$PYTHONPATH:/app/flaresolverr +export PORT=8191 +export LOG_LEVEL=info + +# Run FlareSolverr with its own directory as CWD +(cd /app/flaresolverr && python3 flaresolverr.py) & + +# Step 2: Health Check for FlareSolverr +echo "[2/3] Waiting for FlareSolverr to bind to port 8191..." +MAX_RETRIES=30 +COUNT=0 +while ! curl -s http://localhost:8191/health > /dev/null; do + sleep 1 + COUNT=$((COUNT+1)) + if [ $COUNT -ge $MAX_RETRIES ]; then + echo "⚠️ FlareSolverr failed to start in time, continuing to FastAPI anyway..." + break + fi +done +echo "✅ FlareSolverr is ready!" + +# Step 3: Start FastAPI +echo "[3/3] Launching FastAPI on port 7860..." +uvicorn main:app --host 0.0.0.0 --port 7860 --log-level info diff --git a/start_render.sh b/start_render.sh new file mode 100644 index 0000000000000000000000000000000000000000..31efd2c304b410aa48f932eca9467789c9864628 --- /dev/null +++ b/start_render.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -e + +echo "--- RENDER.COM DEPLOYMENT ---" + +# Step 1: Start FlareSolverr +echo "[1/2] Launching FlareSolverr in background..." +export PYTHONPATH=$PYTHONPATH:/opt/render/project/src/flaresolverr +export PORT_FS=8191 +export LOG_LEVEL=info + +(cd /opt/render/project/src/flaresolverr && python3 flaresolverr.py) & + +# Wait for FlareSolverr +echo "[2/2] Waiting for FlareSolverr..." +sleep 5 + +echo "✅ FlareSolverr ready!" 
def _preview(text, limit=100):
    """Return the first *limit* characters of *text* with newlines flattened.

    Kept outside the f-strings below: a backslash inside an f-string
    expression is a SyntaxError on Python versions before 3.12.
    """
    return text[:limit].replace("\n", " ")


async def check_mirrors():
    """Request every known mirror with curl-cffi and with httpx and print results.

    Each client is tried independently, so a failure of one does not hide
    the outcome of the other. The value printed after ``Title:`` is the
    first 100 characters of the response body.
    """
    mirrors = [
        "https://larooza.mom",
        "https://larooza.site",
        "https://laroza-tv.net",
        "https://larozavideo.net",
        "https://larooza.video",
        "https://q.larozavideo.net",
    ]

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    }

    async def via_curl_cffi(mirror):
        # Try curl-cffi first
        async with AsyncSession(impersonate="chrome110") as s:
            return await s.get(mirror, headers=headers, timeout=10)

    async def via_httpx(mirror):
        async with httpx.AsyncClient(http2=True, timeout=10) as client:
            return await client.get(mirror, headers=headers)

    clients = (("curl-cffi", via_curl_cffi), ("httpx", via_httpx))

    for mirror in mirrors:
        print(f"Checking {mirror}...")
        for label, fetch in clients:
            try:
                resp = await fetch(mirror)
                summary = f" [{label}] {mirror}: {resp.status_code} | Title: {_preview(resp.text)}"
            except Exception as e:
                print(f" [Error] [{label}] {mirror}: {e}")
                continue
            print(summary)
'') + print(f"Title found: {title}") + + if "video.php" in html or ".thumbnail" in html or ".box" in html: + print("FOUND: Movie items are present in HTML!") + else: + print("FAILED: No movie items in HTML.") + print(f"Snippet: {html[:500]}") + else: + print(f"FlareSolverr message: {data.get('message')}") + else: + print(f"Server error: {response.status_code}") + except Exception as e: + print(f"Script error: {e}") + +if __name__ == "__main__": + asyncio.run(test()) diff --git a/tools/debug_mirrors.py b/tools/debug_mirrors.py new file mode 100644 index 0000000000000000000000000000000000000000..73988eb7cee36290bf496c4f97220ce223a50586 --- /dev/null +++ b/tools/debug_mirrors.py @@ -0,0 +1,35 @@ +import asyncio +import httpx +from bs4 import BeautifulSoup + +async def debug_fetch(): + mirrors = ["https://q.larozavideo.net", "https://larooza.mom", "https://larooza.site", "https://m.laroza-tv.net"] + async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client: + for mirror in mirrors: + print(f"\n--- Checking mirror: {mirror} ---") + try: + resp = await client.get(mirror, headers={"User-Agent": "Mozilla/5.0"}) + print(f"Status: {resp.status_code}") + if resp.status_code == 200: + soup = BeautifulSoup(resp.text, 'html.parser') + title = soup.title.string if soup.title else "No title" + print(f"Title: {title}") + + selectors = ['.thumbnail', '.pm-li-video', '.pm-video-thumb', '.video-block', '.movie-item', 'li.col-xs-6', '.box', '.video-box', '.video-item', '.post-item'] + found = False + for sel in selectors: + count = len(soup.select(sel)) + if count > 0: + print(f" Found {count} items with selector {sel}") + found = True + + if not found: + video_links = len(soup.select('a[href*="video.php"], a[href*="watch.php"]')) + print(f" Found {video_links} video/watch links.") + else: + print(f" Snippet: {resp.text[:200]}") + except Exception as e: + print(f" Error: {e}") + +if __name__ == "__main__": + asyncio.run(debug_fetch()) diff --git 
async def test():
    """Fetch page 1 of the home listing and print up to three of its items."""
    engine = LaroozaScraper()
    print("DEBUG: Fetching latest movies...")
    results = await engine.fetch_home(page=1)
    print(f"DEBUG: Found {len(results)} items.")

    # Guard clause: nothing to list when the scraper came back empty.
    if not results:
        print("DEBUG: ❌ fetch_home returned 0 items.")
        return

    for position, entry in enumerate(results[:3]):
        print(f" {position+1}. {entry['title']} - ID: {entry['id'][:20]}...")
async def dump_html():
    """Download the new-videos page and report how many item containers it holds.

    Prints the status code and final URL, then the number of elements
    matching the item-container selectors. When none match, the first
    1000 characters of the response are printed for inspection.
    """
    target = "https://q.larozavideo.net/newvideos1.php"
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    }
    item_selector = '.thumbnail, .pm-li-video, .pm-video-thumb, .video-block, .movie-item, li.col-xs-6, .box, .video-box, .video-item, .post-item'

    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as http:
        print(f"Fetching {target}...")
        reply = await http.get(target, headers=request_headers)
        print(f"Status: {reply.status_code}")
        print(f"Final URL: {reply.url}")

        page = BeautifulSoup(reply.text, 'html.parser')
        matches = page.select(item_selector)
        print(f"Found {len(matches)} item containers.")

        if matches:
            return
        print("Snippet of HTML:")
        print(reply.text[:1000])
@echo off
setlocal enabledelayedexpansion

echo ==========================================
echo MEIH PLATFORM - VPS AUTO RUNNER
echo ==========================================

REM 1. Kill existing processes to avoid port conflicts
REM NOTE: this ends every python.exe on the machine, not only this project's.
echo [*] Cleaning up old processes...
taskkill /F /IM uvicorn.exe /T 2>nul
taskkill /F /IM python.exe /T 2>nul
taskkill /F /IM cloudflared_vps.exe /T 2>nul

REM 2. Start FlareSolverr
echo [*] Starting FlareSolverr (Protection Bypass)...
cd /d "%~dp0backend"
start /B "FlareSolverr" cmd /c "cd flaresolverr && ..\venv\Scripts\python flaresolverr.py"

REM 3. Start FastAPI Backend
echo [*] Starting Backend Server...
start /B "Backend" cmd /c "venv\Scripts\uvicorn main:app --port 8000 --workers 1"

REM Wait for servers to warm up
timeout /t 8 /nobreak >nul

REM 4. Start Cloudflare Tunnel and Log Output
echo [*] Starting Cloudflare Tunnel...
echo [*] WAITING FOR YOUR UNIQUE URL...
echo. > ..\vps_connection.log
start /B "Cloudflare" cmd /c "..\cloudflared_vps.exe tunnel --url http://localhost:8000 --no-autoupdate > ..\vps_connection.log 2>&1"

REM Poll the tunnel log every 2 seconds, giving up after 60 tries so a
REM tunnel that never comes up cannot hang this window forever.
set /a WAIT_TRIES=0

:WAIT_FOR_URL
timeout /t 2 /nobreak >nul
findstr "trycloudflare.com" ..\vps_connection.log >nul
if not errorlevel 1 goto URL_READY
set /a WAIT_TRIES+=1
if !WAIT_TRIES! lss 60 goto WAIT_FOR_URL

echo [X] No trycloudflare.com URL appeared in ..\vps_connection.log after 120 seconds.
pause
exit /b 1

:URL_READY
REM 5. Extract and Display the URL
REM Clean up the URL: remove pipes and spaces. The SET arguments are quoted
REM because an unquoted pipe character is parsed as a pipeline before
REM delayed expansion runs.
set "CLEAN_URL=NOT_FOUND"
for /f "tokens=4" %%a in ('findstr "trycloudflare.com" ..\vps_connection.log') do (
    set "RAW_URL=%%a"
    set "CLEAN_URL=!RAW_URL:|=!"
    set "CLEAN_URL=!CLEAN_URL: =!"
)

echo.
echo ==========================================
echo SUCCESS! YOUR PROJECT IS ONLINE
echo ==========================================
echo.
echo API URL: !CLEAN_URL!
echo.
echo 1. Open the URL above in your browser.
REM The arrow is escaped; a bare ^> would redirect this line into a file.
echo 2. Click 'Advanced' -^> 'Proceed' to trust it.
echo 3. Then open: https://meih-netflix-clone.vercel.app/
echo.
echo KEEP THIS WINDOW OPEN TO STAY ONLINE
echo ==========================================
pause
+ +REM Using a more robust start command +start /B "Cloudflare" cmd /c "\"%ROOT_DIR%cloudflared_vps.exe\" tunnel --url http://localhost:8000 --no-autoupdate > \"!LOG_FILE!\" 2>&1" + +:WAIT_FOR_URL +timeout /t 2 /nobreak >nul +if not exist "!LOG_FILE!" goto WAIT_FOR_URL +findstr "trycloudflare.com" "!LOG_FILE!" >nul +if errorlevel 1 goto WAIT_FOR_URL + +REM Extract the URL +set "CLEAN_URL=NOT_FOUND" +for /f "tokens=4" %%a in ('findstr "trycloudflare.com" "!LOG_FILE!"') do ( + set "RAW_URL=%%a" + set "CLEAN_URL=!RAW_URL:|=!" + set "CLEAN_URL=!CLEAN_URL: =!" +) + +REM 6. Frontend Setup & Run (Local) +echo [*] Initializing Frontend Local Server... +cd /d "%ROOT_DIR%meih-netflix-clone" +if not exist node_modules ( + echo [!] Installing Node modules (First time only)... + call npm install >nul 2>&1 +) +start "Frontend" cmd /k "npm run dev" + +echo. +echo =========================================== +echo SYSTEM DEPLOYED SUCCESSFULLY +echo =========================================== +echo. +echo [LOCAL ACCESS] +echo Frontend: http://localhost:5173 +echo Backend: http://localhost:8000 +echo. +echo [VPS / MOBILE ACCESS] +echo Public API URL: !CLEAN_URL! +echo. +echo [IMPORTANT] +echo Open the Public API URL once in your browser, +echo click Advanced -> Proceed, then use the Frontend. +echo. +echo =========================================== +echo KEEP THIS WINDOW OPEN TO STAY ONLINE +echo =========================================== +pause diff --git a/tools/extra/start_vps.bat b/tools/extra/start_vps.bat new file mode 100644 index 0000000000000000000000000000000000000000..e350f028b9e4cad6cc7cd7c3c632f08d7ecdd932 --- /dev/null +++ b/tools/extra/start_vps.bat @@ -0,0 +1,4 @@ +@echo off +echo [*] Starting Global Link (VPS Mode)... +echo [*] Please wait while we generate your unique URL... 
def count_patterns(content, patterns):
    """Return ``{pattern: occurrences}``, treating each pattern as literal text.

    The patterns are plain substrings such as ``video.php``; matching them
    as regular expressions would let ``.`` match any character and inflate
    the counts.
    """
    return {p: content.count(p) for p in patterns}


def middle_snippet(content, size=1000):
    """Return ``(start, text)`` for a window of *size* characters around the middle.

    For content shorter than *size* the whole content is returned with
    ``start`` 0, so the snippet is never empty for non-empty input.
    """
    start = max((len(content) - size) // 2, 0)
    return start, content[start:start + size]


def main():
    """Report how often known markup markers occur in the saved FlareSolverr HTML."""
    if sys.platform == 'win32':
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

    try:
        with open("flaresolverr_output.html", "r", encoding="utf-8") as f:
            content = f.read()
    except FileNotFoundError:
        print("[X] flaresolverr_output.html not found. Run test_flaresolverr_direct.py first.")
        return

    print(f"Total length: {len(content)}")

    # Search for common patterns
    patterns = ['thumbnail', 'pm-li-video', 'video-block', 'movie-item', 'video.php', 'watch.php']
    for p, count in count_patterns(content, patterns).items():
        print(f"Pattern '{p}': found {count} times")

    # For large documents, show a snippet taken from the actual middle.
    if len(content) > 10000:
        start, snippet = middle_snippet(content)
        print(f"\n--- Snippet from middle ({start}:{start + len(snippet)}) ---")
        print(snippet)


if __name__ == "__main__":
    main()
asyncio.run(test_curl()) diff --git a/tools/test_flaresolverr_direct.py b/tools/test_flaresolverr_direct.py new file mode 100644 index 0000000000000000000000000000000000000000..1bd5f3cfe90e5ad7eea6748613df0893fd1423e7 --- /dev/null +++ b/tools/test_flaresolverr_direct.py @@ -0,0 +1,56 @@ +import httpx +import json +import time +import sys + +# Set encoding for Windows terminal +if sys.platform == 'win32': + import io + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + +def test_flaresolverr(): + url = "http://localhost:8191/v1" + target_url = "https://q.larozavideo.net/newvideos1.php" + + payload = { + "cmd": "request.get", + "url": target_url, + "maxTimeout": 60000 + } + + print(f"Sending request to FlareSolverr for {target_url}...") + start_time = time.time() + try: + with httpx.Client(timeout=90.0) as client: + response = client.post(url, json=payload) + duration = time.time() - start_time + print(f"Status Code: {response.status_code}") + print(f"Duration: {duration:.2f}s") + + if response.status_code == 200: + data = response.json() + print(f"FlareSolverr Status: {data.get('status')}") + if data.get('status') == 'ok': + solution = data.get('solution', {}) + html = solution.get('response', '') + print(f"HTML Length: {len(html)}") + print(f"Cookies: {len(solution.get('cookies', []))}") + print(f"User-Agent: {solution.get('userAgent')}") + + if "challenge-running" in html or "cf-ray" in html: + print("[X] Challenge still present in HTML!") + else: + print("[OK] Challenge solved (or not present)!") + + # Save HTML for inspection + with open("flaresolverr_output.html", "w", encoding="utf-8") as f: + f.write(html) + else: + print(f"[X] FlareSolverr Error: {data.get('message')}") + else: + print(f"[X] HTTP Error: {response.text}") + except Exception as e: + print(f"[X] Exception: {e}") + +if __name__ == "__main__": + test_flaresolverr() diff --git a/tools/test_mom.py b/tools/test_mom.py new file mode 100644 index 
async def main():
    """Point a scraper at the larooza.mom mirror and list the first extracted items."""
    mirror = LaroozaScraper()
    mirror.BASE_URL = "https://larooza.mom"
    mirror.TARGET_URL = "https://larooza.mom/newvideos.php"

    print(f"Fetching {mirror.TARGET_URL}...")
    markup = await mirror._get_html(mirror.TARGET_URL)
    if not markup:
        print("Failed to get HTML")
        return

    print(f"HTML Length: {len(markup)}")
    entries = mirror._extract_items(BeautifulSoup(markup, 'html.parser'))
    print(f"Found {len(entries)} items")

    # Only the first five entries are shown.
    for entry in entries[:5]:
        print(f" - {entry['title']} ({entry['type']})")
async def test_scraper_logic(html_path: str = "flaresolverr_output.html") -> None:
    """Run the scraper's item extraction against a saved HTML snapshot.

    Reads *html_path*, parses it with BeautifulSoup and passes the document
    to ``scraper._extract_items``. The item count and up to five items are
    printed. When nothing is extracted, a list of candidate container
    selectors is probed and the match count for each is printed, to help
    find out which markup the page actually uses.

    Args:
        html_path: Path of the saved HTML file. Defaults to the
            ``flaresolverr_output.html`` snapshot in the working directory.

    Returns:
        None. All results and errors are reported on stdout; this diagnostic
        never raises for ordinary failures.
    """
    try:
        with open(html_path, "r", encoding="utf-8") as f:
            html = f.read()

        print(f"Testing extraction from saved HTML (length: {len(html)})...")
        soup = BeautifulSoup(html, 'html.parser')
        items = scraper._extract_items(soup)

        print(f"Extracted {len(items)} items.")
        for position, item in enumerate(items[:5], start=1):
            print(f"{position}. {item['title']} - {item['type']}")
            # NOTE(review): assumes items are dicts. A missing or None poster
            # is shown as empty instead of raising and cutting the listing
            # short with a generic "[X] Error".
            poster = item.get('poster') or ''
            print(f" Poster: {poster[:50]}...")

        if not items:
            print("[X] No items extracted! Checking container selectors...")
            # Probe the candidate container selectors one by one.
            selectors = [
                '.thumbnail',
                '.pm-li-video',
                '.pm-video-thumb',
                '.video-block',
                '.movie-item',
                'li.col-xs-6',
                '.box',
                '.video-box',
                '.video-item',
                '.post-item',
            ]
            for sel in selectors:
                found = soup.select(sel)
                print(f"Selector '{sel}': found {len(found)} elements")

    except FileNotFoundError:
        print(f"[X] {html_path} not found. Run test_flaresolverr_direct.py first.")
    except Exception as e:
        # Deliberate best-effort: this is a diagnostic script, so any other
        # failure is reported rather than propagated.
        print(f"[X] Error: {e}")
async def _timed_get(client, url):
    """GET *url* with *client* and return ``(response, elapsed_seconds)``."""
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    started = time.perf_counter()
    resp = await client.get(url)
    return resp, time.perf_counter() - started


async def _check_flaresolverr(client) -> None:
    """Probe the FlareSolverr health endpoint on localhost:8191."""
    print("📡 Checking FlareSolverr...")
    try:
        resp = await client.get("http://localhost:8191/health")
        if resp.status_code == 200:
            print("✅ FlareSolverr: ONLINE")
        else:
            print(f"❌ FlareSolverr: ERROR (Status {resp.status_code})")
    except Exception as e:
        print(f"❌ FlareSolverr: OFFLINE ({e})")


async def _check_backend(client) -> None:
    """Probe the backend root on localhost:8000 and print its status fields."""
    print("\n⚙️ Checking FastAPI Backend...")
    try:
        resp = await client.get("http://localhost:8000/")
        if resp.status_code == 200:
            print("✅ Backend: ONLINE")
            data = resp.json()
            print(f" Mirror Active: {data.get('active_mirror')}")
            print(f" Engine Status: {data.get('engine_status')}")
        else:
            print(f"❌ Backend: ERROR (Status {resp.status_code})")
    except Exception as e:
        print(f"❌ Backend: OFFLINE ({e})")


async def _check_latest(client) -> None:
    """Request ``/latest`` from the backend and print item count and timing."""
    print("\n🎬 Testing Movie Scrapper (Live Fetch)...")
    try:
        resp, duration = await _timed_get(client, "http://localhost:8000/latest")
        if resp.status_code == 200:
            items = resp.json()
            print("✅ Scrapper: SUCCESS")
            print(f" Items Found: {len(items)}")
            print(f" Time Taken: {duration:.2f}s")
            if items:
                # .get() avoids printing ERROR right after SUCCESS when the
                # first item merely lacks a 'title' key.
                print(f" Top Item: {items[0].get('title')}")
        else:
            print(f"❌ Scrapper: FAILED (Status {resp.status_code})")
    except Exception as e:
        print(f"❌ Scrapper: ERROR ({e})")


async def _check_category(client) -> None:
    """Request the ``arabic-movies`` category and print item count and timing."""
    print("\n📂 Testing Category (Prefetch Integrity)...")
    try:
        resp, duration = await _timed_get(
            client, "http://localhost:8000/category/arabic-movies"
        )
        if resp.status_code == 200:
            items = resp.json()
            print("✅ Category Path: STABLE")
            print(f" Items: {len(items)}")
            print(f" Time Taken: {duration:.2f}s (Should be < 0.5s if cached)")
        else:
            # Include the status code, as the other three probes do.
            print(f"❌ Category: FAILED (Status {resp.status_code})")
    except Exception as e:
        print(f"❌ Category: ERROR ({e})")


async def test_system() -> None:
    """Run the four local health checks in order and print a report.

    Checks, in sequence: the FlareSolverr health endpoint, the backend root,
    the ``/latest`` listing and the ``arabic-movies`` category. Each check
    catches its own exceptions, so one failing service does not stop the
    remaining checks. Nothing is returned; all output goes to stdout.
    """
    print("\n" + "="*50)
    print("🔍 MEIH SYSTEM HEALTH CHECK")
    print("="*50 + "\n")

    # One shared client with a 30 second timeout for every probe.
    async with httpx.AsyncClient(timeout=30.0) as client:
        await _check_flaresolverr(client)
        await _check_backend(client)
        await _check_latest(client)
        await _check_category(client)

    print("\n" + "="*50)
    print("✨ ALL TESTS COMPLETED")
    print("="*50 + "\n")
print(f"First item ID: {data[0].get('id')}") + else: + print("Response body:") + print(r.text[:500]) + else: + print(f"Error body: {r.text[:500]}") +except Exception as e: + print(f"Request failed: {e}")