diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..4ad7e60e5a2774546d1fea068f0d87777d31ad9a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,17 @@
+venv/
+__pycache__/
+archive/
+*.db
+*.log
+.env
+.vscode/
+.idea/
+bin/
+cache/
+logs/
+*.exe
+*.img
+dist/
+node_modules/
+.choreo/
+TUNNEL_TOKEN.txt
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..96a085c41ebd1080e30daf8b9cc01f502cdd37d7
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,35 @@
+FROM python:3.10-slim
+
+# Hugging Face Optimized - Lightweight & Stable
+# PYTHONUNBUFFERED=1 makes Python write stdout/stderr unbuffered so logs show up immediately.
+# HF_SPACE=1 is presumably a flag read by the application - confirm against main.py.
+ENV PYTHONUNBUFFERED=1
+ENV HF_SPACE=1
+
+# Install minimal system dependencies
+# The apt package lists are deleted in the same layer to keep the image small.
+RUN apt-get update && apt-get install -y \
+ curl \
+ ffmpeg \
+ && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Copy requirements and install
+# requirements.txt is copied on its own, before the application code, so the
+# dependency layer is only rebuilt when requirements.txt changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+# Create a non-privileged user (Required by Hugging Face)
+# Every instruction after USER runs as "user" (UID 1000) instead of root.
+RUN useradd -m -u 1000 user
+RUN chown -R user:user /app
+USER user
+ENV HOME=/home/user \
+ PATH=/home/user/.local/bin:$PATH
+
+# Copy application code
+COPY --chown=user:user . .
+
+# Hugging Face uses port 7860
+# NOTE: the CMD below hard-codes 7860; it does not read the PORT variable set here.
+EXPOSE 7860
+ENV PORT=7860
+
+# Start the application with optimized settings for limited RAM
+# We use 1 worker to keep memory usage low on the free tier
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--timeout-keep-alive", "60"]
diff --git a/Dockerfile.hf b/Dockerfile.hf
new file mode 100644
index 0000000000000000000000000000000000000000..c3a8e418c209eaae1868b799d8eeeb70ba2200dc
--- /dev/null
+++ b/Dockerfile.hf
@@ -0,0 +1,35 @@
+FROM python:3.10-slim
+
+# Hugging Face optimized - Lightweight without Chrome
+# PYTHONUNBUFFERED=1 makes Python write stdout/stderr unbuffered so logs show up immediately.
+# NOTE(review): SPACE_ID and HF_SPACE are presumably read by the application to detect
+# that it runs on Hugging Face - confirm. SPACE_ID is pinned to a placeholder value here.
+ENV PYTHONUNBUFFERED=1
+ENV SPACE_ID=huggingface
+ENV HF_SPACE=1
+
+# Install minimal dependencies
+# Unlike the root Dockerfile this variant installs git and does not install ffmpeg.
+RUN apt-get update && apt-get install -y \
+ curl \
+ git \
+ && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Copy and install Python dependencies
+# requirements.txt is copied before the application code so the dependency layer
+# is only rebuilt when requirements.txt changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+# Create user for Hugging Face
+# Every instruction after USER runs as "user" (UID 1000) instead of root.
+RUN useradd -m -u 1000 user
+RUN chown -R user:user /app
+USER user
+ENV HOME=/home/user \
+ PATH=/home/user/.local/bin:$PATH
+
+# Copy application
+COPY --chown=user:user . .
+
+# Hugging Face uses port 7860
+# NOTE: the CMD below hard-codes 7860; it does not read the PORT variable set here.
+EXPOSE 7860
+ENV PORT=7860
+
+# Start without FlareSolverr (too heavy for HF)
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
diff --git a/Procfile b/Procfile
new file mode 100644
index 0000000000000000000000000000000000000000..25f27e19e1dcc8ca7e0a456578bdcb0e3aed3fe8
--- /dev/null
+++ b/Procfile
@@ -0,0 +1 @@
+web: uvicorn main:app --host 0.0.0.0 --port $PORT --log-level info
diff --git a/README.hf.md b/README.hf.md
new file mode 100644
index 0000000000000000000000000000000000000000..2d34d72d828113241bab2ff95f9d82ee03539955
--- /dev/null
+++ b/README.hf.md
@@ -0,0 +1,33 @@
+---
+title: MEIH Movies API
+emoji: 🎬
+colorFrom: red
+colorTo: gray
+sdk: docker
+app_file: main.py
+pinned: false
+license: mit
+---
+
+# MEIH Movies API - Hugging Face Edition
+
+High-performance movie streaming API optimized for Hugging Face Spaces.
+
+## Features
+
+- Fast content scraping with curl-cffi
+- Intelligent caching system
+- Rate limiting for stability
+- Proxy rotation support
+
+## API Endpoints
+
+- `GET /latest` - Latest movies and series
+- `GET /category/{cat_id}` - Browse by category
+- `GET /search?q={query}` - Search content
+- `GET /details/{id}` - Get streaming links
+- `GET /health` - Health check
+
+## Usage
+
+Visit the API at: `https://YOUR-SPACE-NAME.hf.space/`
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..779738920a43d1fa3822f9c01e13d4eecb4ba380
--- /dev/null
+++ b/README.md
@@ -0,0 +1,30 @@
+---
+title: Meih Movies API
+emoji: 🎬
+colorFrom: red
+colorTo: gray
+sdk: docker
+pinned: false
+---
+
+# MEIH Movies API - Hugging Face Edition
+
+High-performance movie streaming API optimized for Hugging Face Spaces.
+
+## Features
+
+- **Lightweight**: Optimized for 16GB RAM environments.
+- **Fast**: Powered by `curl-cffi` for high-speed scraping.
+- **Stable**: Automatic proxy rotation and intelligent caching.
+- **Universal**: Serves both API and Frontend (if built).
+
+## API Endpoints
+
+- `GET /latest` - Latest movies and series.
+- `GET /search?q={query}` - Search content.
+- `GET /details/{id}` - Get streaming links.
+- `GET /health` - System status.
+
+## Deployment Note
+
+This project is configured to run on port **7860**. Ensure your Space is set to **Docker** SDK.
diff --git a/database.py b/database.py
new file mode 100644
index 0000000000000000000000000000000000000000..af5fd0b55752f9802af3b044462ae6d43bfd77bb
--- /dev/null
+++ b/database.py
@@ -0,0 +1,48 @@
+import aiosqlite
+import logging
+
+DB_NAME = "netflix_clone.db"
+
+# Create the movies, series and episodes tables in DB_NAME if they do not exist yet.
+# Safe to call repeatedly: every statement uses CREATE TABLE IF NOT EXISTS.
+# The connection is opened and closed by the "async with" block.
+async def init_db():
+ async with aiosqlite.connect(DB_NAME) as db:
+ # Movies Table
+ # Keyed by a TEXT id; every other column (including year and rating) is TEXT as well.
+ await db.execute("""
+ CREATE TABLE IF NOT EXISTS movies (
+ id TEXT PRIMARY KEY,
+ title TEXT,
+ poster TEXT,
+ year TEXT,
+ rating TEXT,
+ description TEXT,
+ category TEXT
+ )
+ """)
+ # Series Table
+ # Same column layout as the movies table.
+ await db.execute("""
+ CREATE TABLE IF NOT EXISTS series (
+ id TEXT PRIMARY KEY,
+ title TEXT,
+ poster TEXT,
+ year TEXT,
+ rating TEXT,
+ description TEXT,
+ category TEXT
+ )
+ """)
+ # Episodes Table
+ # Each episode references its parent row through series_id -> series(id).
+ # NOTE(review): SQLite only enforces foreign keys when "PRAGMA foreign_keys" is
+ # enabled on the connection; nothing in this module enables it - confirm intent.
+ await db.execute("""
+ CREATE TABLE IF NOT EXISTS episodes (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ series_id TEXT,
+ episode_number INTEGER,
+ title TEXT,
+ watch_link TEXT,
+ FOREIGN KEY(series_id) REFERENCES series(id)
+ )
+ """)
+ # Persist the schema changes before the connection is closed.
+ await db.commit()
+
+# Open and return a new connection to DB_NAME.
+# Rows come back as aiosqlite.Row objects because row_factory is set below.
+# The connection is NOT closed here: the caller owns it and must close it.
+async def get_db_connection():
+ db = await aiosqlite.connect(DB_NAME)
+ db.row_factory = aiosqlite.Row
+ return db
diff --git a/deploy/.dockerignore b/deploy/.dockerignore
new file mode 100644
index 0000000000000000000000000000000000000000..aea3a11a36192c814e1972b4244e499928be8341
--- /dev/null
+++ b/deploy/.dockerignore
@@ -0,0 +1,30 @@
+# Python ignore
+__pycache__/
+*.py[cod]
+*$py.class
+venv/
+.env
+netflix_clone.db
+archive/
+tools/
+
+# Node ignore
+node_modules/
+dist/
+build/
+.next/
+.vite/
+
+# Git ignore
+.git/
+.gitignore
+
+# OS ignore
+.DS_Store
+Thumbs.db
+
+# Project ignore
+setup_and_run.bat
+*.md
+.gemini/
+.agent/
diff --git a/deploy/Dockerfile b/deploy/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..535f403193c571249350df967811a512aa9f92e9
--- /dev/null
+++ b/deploy/Dockerfile
@@ -0,0 +1,50 @@
+# ==========================================
+# Nitro Backend-Only Dockerfile for Hugging Face
+# ==========================================
+# NOTE(review): the COPY instructions below read from "backend/", so this file assumes
+# the build context is a directory that contains a backend/ folder - confirm how it is built.
+FROM python:3.11-slim
+
+# Install system dependencies for Scraper (Chrome) and FlareSolverr
+# Google's signing key and apt source are registered first, then google-chrome-stable
+# is installed from that source; the apt lists are removed in the same layer.
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get update && apt-get install -y \
+ ffmpeg \
+ curl \
+ git \
+ wget \
+ gnupg \
+ xvfb \
+ xauth \
+ dos2unix \
+ libnss3 \
+ libatk-bridge2.0-0 \
+ libgtk-3-0 \
+ && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /usr/share/keyrings/google-chrome.gpg \
+ && echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \
+ && apt-get update && apt-get install -y google-chrome-stable \
+ && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Install Backend Dependencies
+# requirements.txt is copied before the rest of the backend so the dependency layer
+# is only rebuilt when requirements.txt changes.
+COPY backend/requirements.txt ./
+RUN pip install --no-cache-dir --upgrade pip && \
+ pip install --no-cache-dir -r requirements.txt
+
+# Copy Backend Application
+COPY backend/ ./
+
+# Fix line endings and permissions
+# dos2unix converts CRLF line endings in start.sh; chmod makes it executable.
+RUN dos2unix start.sh && chmod +x start.sh
+
+# Create local user for Hugging Face Spaces (UID 1000)
+# Every instruction after USER, and the container process, runs as "user" instead of root.
+RUN useradd -m -u 1000 user
+RUN chown -R user:user /app
+USER user
+ENV HOME=/home/user \
+ PATH=/home/user/.local/bin:$PATH \
+ PYTHONPATH=/app
+
+# Expose the mandatory Hugging Face Space port
+EXPOSE 7860
+
+# Kickstart the engine
+# start.sh is not part of this file; it is expected to launch the service(s).
+CMD ["/bin/bash", "./start.sh"]
diff --git a/deploy/cloudflare-worker.js b/deploy/cloudflare-worker.js
new file mode 100644
index 0000000000000000000000000000000000000000..a8ad932ae367119f55e35a913af68c27c4808a52
--- /dev/null
+++ b/deploy/cloudflare-worker.js
@@ -0,0 +1,77 @@
+/**
+ * Cloudflare Worker - Proxy Bypass for Larooza Scraper
+ * Deploy this to Cloudflare Workers (100% FREE)
+ *
+ * This worker acts as a middle-man to bypass IP bans
+ */
+
+// Register the Worker's fetch handler: every incoming request is answered with
+// whatever handleRequest() resolves to.
+addEventListener('fetch', event => {
+ event.respondWith(handleRequest(event.request))
+})
+
+/**
+ * Fetch the page named by the `url` query parameter and relay its body to the caller.
+ *
+ * - OPTIONS requests are answered directly with the CORS headers (preflight).
+ * - A missing, malformed or non-http(s) `url` yields a 400 JSON error.
+ * - Otherwise the target is fetched (always with GET, whatever the incoming method)
+ *   using browser-like headers, and its body is returned as text/html with the
+ *   upstream status code. A failed fetch yields a 500 JSON error.
+ *
+ * NOTE(security): this is an open proxy - any caller can make the Worker fetch any
+ * http(s) URL. Consider restricting `url` to an allowlist of hosts before deploying.
+ *
+ * @param {Request} request Incoming request.
+ * @returns {Promise<Response>} Proxied page or a JSON error.
+ */
+async function handleRequest(request) {
+  // Enable CORS
+  const corsHeaders = {
+    'Access-Control-Allow-Origin': '*',
+    'Access-Control-Allow-Methods': 'GET, POST, OPTIONS',
+    'Access-Control-Allow-Headers': 'Content-Type',
+  }
+
+  // Small helper so every error path produces the same JSON + CORS envelope.
+  const jsonError = (body, status) => new Response(JSON.stringify(body), {
+    status,
+    headers: { ...corsHeaders, 'Content-Type': 'application/json' }
+  })
+
+  // Handle CORS preflight
+  if (request.method === 'OPTIONS') {
+    return new Response(null, { headers: corsHeaders })
+  }
+
+  // Get target URL from query parameter
+  const url = new URL(request.url)
+  const targetUrl = url.searchParams.get('url')
+
+  if (!targetUrl) {
+    return jsonError({ error: 'Missing url parameter' }, 400)
+  }
+
+  // Reject malformed or non-http(s) targets up front: these are client errors (400),
+  // not upstream failures (500).
+  let parsedTarget
+  try {
+    parsedTarget = new URL(targetUrl)
+  } catch (parseError) {
+    return jsonError({ error: 'Invalid url parameter' }, 400)
+  }
+  if (parsedTarget.protocol !== 'http:' && parsedTarget.protocol !== 'https:') {
+    return jsonError({ error: 'Invalid url parameter' }, 400)
+  }
+
+  try {
+    // Fetch the target URL with realistic headers
+    const response = await fetch(targetUrl, {
+      headers: {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+        'Accept-Language': 'ar,en-US;q=0.9,en;q=0.8',
+        'Referer': 'https://www.google.com/',
+        'DNT': '1',
+        'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1',
+      },
+      cf: {
+        // Cloudflare-specific options
+        cacheTtl: 300, // Cache for 5 minutes
+        cacheEverything: true,
+      }
+    })
+
+    // Get the HTML content
+    const html = await response.text()
+
+    // Return with CORS headers; the upstream status code is passed through unchanged.
+    return new Response(html, {
+      status: response.status,
+      headers: {
+        ...corsHeaders,
+        'Content-Type': 'text/html; charset=utf-8',
+        'Cache-Control': 'public, max-age=300',
+      }
+    })
+
+  } catch (error) {
+    return jsonError({
+      error: 'Failed to fetch target URL',
+      message: error.message
+    }, 500)
+  }
+}
diff --git a/deploy/render.yaml b/deploy/render.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..214ceee8c012cfef5be73659634d20e59be5e3cd
--- /dev/null
+++ b/deploy/render.yaml
@@ -0,0 +1,18 @@
+# Render.com Deployment Configuration
+# https://render.com
+#
+# Defines one free-plan Docker web service with /health as its health check.
+
+services:
+ - type: web
+ name: meih-movies-api
+ env: docker
+ # NOTE(review): the build context is ./backend while dockerfilePath is ./Dockerfile -
+ # confirm that this Dockerfile's COPY paths resolve inside ./backend.
+ dockerfilePath: ./Dockerfile
+ dockerContext: ./backend
+ plan: free
+ region: oregon
+ envVars:
+ # NOTE(review): both values are unquoted, so YAML parses them as numbers, not strings.
+ - key: PYTHON_VERSION
+ value: 3.11
+ - key: PORT
+ value: 7860
+ healthCheckPath: /health
+ autoDeploy: true
diff --git a/downloader.py b/downloader.py
new file mode 100644
index 0000000000000000000000000000000000000000..e545ef1fd4bb59533f831d54c74af9e67c45db2b
--- /dev/null
+++ b/downloader.py
@@ -0,0 +1,145 @@
+import yt_dlp
+import logging
+import asyncio
+
+logger = logging.getLogger(__name__)
+
+class VideoDownloader:
+    def __init__(self):
+        """Set the base yt-dlp options shared by every extraction."""
+        self.ydl_opts = {
+            'quiet': True,
+            'no_warnings': True,
+            'format': 'best',
+            # YoutubeDL has no 'user_agent' option (it is silently ignored);
+            # request headers are supplied through 'http_headers'.
+            'http_headers': {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+            },
+            'geo_bypass': True,
+            # The YoutubeDL option key is 'noplaylist'; 'no_playlist' is not
+            # recognised, so playlists were not actually being skipped.
+            'noplaylist': True,
+            'nocheckcertificate': True,
+        }
+
+ # Resolve `url` into metadata plus a list of direct download formats.
+ #
+ # Returns either a dict with the keys title, thumbnail, duration, uploader, source
+ # and formats, or a dict with a single "error" key holding a user-facing message.
+ # Local /watch/ and /details/ links and Larooza links are tried through the project
+ # scraper first; everything else (and any scraper failure) goes through yt-dlp.
+ async def get_info(self, url: str):
+ # 1. Handle Local Watch/Details Links or Direct Larooza Links
+ is_larooza = any(x in url for x in ['larozavideo', 'larooza', 'laroza'])
+ if "/watch/" in url or "/details/" in url or is_larooza:
+ try:
+ # Imported lazily, only when this path is taken.
+ from scraper.engine import scraper
+ import base64
+
+ target_url = url
+ if "/watch/" in url or "/details/" in url:
+ # The last path segment (query string stripped) is treated as a
+ # URL-safe base64 encoding of the real target URL.
+ id_part = url.split("/")[-1].split("?")[0]
+ if not id_part.startswith("http"):
+ target_url = base64.urlsafe_b64decode(id_part).decode()
+
+ # If it's a Larooza link (direct or decoded), use scraper
+ if any(x in target_url for x in ['larozavideo', 'larooza', 'laroza']):
+ logger.info(f"Routing Larooza link to scraper: {target_url}")
+ # Normalize: downloader works better with the video.php page
+ target_url = target_url.replace('play.php', 'video.php').replace('download.php', 'video.php')
+
+ # The scraper is called with the base64-encoded form of the target URL.
+ safe_id = base64.urlsafe_b64encode(target_url.encode()).decode()
+ data = await scraper.fetch_details(safe_id)
+
+ if data and data.get('download_links'):
+ # Map each scraper download link onto the format shape returned by this method.
+ formats = []
+ for dl in data['download_links']:
+ formats.append({
+ 'ext': 'mp4',
+ 'resolution': dl['quality'],
+ 'url': dl['url'],
+ 'type': 'video'
+ })
+ return {
+ 'title': data.get('title'),
+ 'thumbnail': data.get('poster'),
+ 'duration': 0,
+ 'uploader': 'Larooza',
+ 'source': 'Larooza',
+ 'formats': formats
+ }
+ elif data:
+ # The scraper returned details but no download links.
+ return {"error": "لم يتم العثور على روابط تحميل لهذا الفيديو (ربما محمي أو غير متاح حالياً)."}
+ except Exception as e:
+ # Best effort: log the failure and fall through to the yt-dlp path below.
+ logger.error(f"Larooza-specific extraction failed: {e}")
+
+ # 2. Universal yt-dlp Path (YouTube, TikTok, etc.)
+ try:
+ loop = asyncio.get_event_loop()
+ # Use a more robust extraction with a timeout
+ # _extract is blocking, so it runs in the default executor; the await is
+ # abandoned after 30 seconds.
+ try:
+ info = await asyncio.wait_for(
+ loop.run_in_executor(None, lambda: self._extract(url)),
+ timeout=30.0
+ )
+ except asyncio.TimeoutError:
+ logger.error(f"Timeout extracting info for {url}")
+ return {"error": "استغرق استخراج البيانات وقتاً طويلاً. حاول مرة أخرى."}
+
+ if not info:
+ return {"error": "فشل في استخراج بيانات الفيديو. تأكد من الرابط."}
+
+ # Live stream check
+ if info.get('is_live') or info.get('live_status') == 'is_upcoming':
+ return {"error": "هذا الفيديو لم يبدأ عرضه بعد أو هو بث مباشر حالياً."}
+
+ formats = []
+ seen_resolutions = set()
+
+ # Extract usable formats
+ raw_formats = info.get('formats', [])
+ if not raw_formats and info.get('url'):
+ raw_formats = [info] # For direct links
+
+ for f in raw_formats:
+ if not f: continue
+ # Filter out formats without a direct URL or those that are just manifests
+ f_url = f.get('url')
+ if not f_url or '.m3u8' in f_url or '.mpd' in f_url:
+ continue
+
+ ext = f.get('ext', 'mp4')
+ # First non-empty of resolution, format_note, height; otherwise 'Unknown'.
+ res = f.get('resolution') or f.get('format_note') or f.get('height') or 'Unknown'
+
+ # Clean resolution label
+ if isinstance(res, int): res = f"{res}p"
+
+ # Avoid duplicates and prioritize video formats
+ # Key = resolution label + whether the format has a video codec, so one
+ # video entry and one audio-only entry can coexist per label.
+ res_key = f"{res}_{f.get('vcodec') != 'none'}"
+ if res_key in seen_resolutions: continue
+ seen_resolutions.add(res_key)
+
+ formats.append({
+ 'id': f.get('format_id', 'unknown'),
+ 'ext': ext,
+ 'resolution': res,
+ 'filesize': f.get('filesize') or f.get('filesize_approx') or 0,
+ 'url': f_url,
+ 'type': 'video' if f.get('vcodec') != 'none' else 'audio'
+ })
+
+ if not formats:
+ return {"error": "لم يتم العثور على روابط تحميل مباشرة مدعومة لهذا الفيديو."}
+
+ return {
+ 'title': info.get('title', 'Video'),
+ 'thumbnail': info.get('thumbnail', ''),
+ 'duration': info.get('duration', 0),
+ 'uploader': info.get('uploader', 'Unknown'),
+ 'source': info.get('extractor_key', 'Unknown'),
+ # The collected formats are returned in reverse order.
+ 'formats': formats[::-1]
+ }
+ except Exception as e:
+ # Any other failure is logged and reported to the caller as an error dict.
+ logger.error(f"Universal Downloader error for {url}: {e}")
+ return {"error": f"حدث خطأ غير متوقع: {str(e)}"}
+
+ # Blocking helper: run yt-dlp metadata extraction for `url` without downloading.
+ # Works on a copy of self.ydl_opts, so the shared options are never mutated.
+ # Returns whatever YoutubeDL.extract_info returns.
+ def _extract(self, url):
+ opts = self.ydl_opts.copy()
+ # Add extra robustness for TikTok and newer sites
+ opts.update({
+ 'nocheckcertificate': True,
+ 'ignoreerrors': True,
+ 'socket_timeout': 15,
+ })
+ with yt_dlp.YoutubeDL(opts) as ydl:
+ return ydl.extract_info(url, download=False)
+
+# Module-level shared instance, created at import time.
+downloader = VideoDownloader()
diff --git a/flaresolverr/bottle_plugins/__init__.py b/flaresolverr/bottle_plugins/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/flaresolverr/bottle_plugins/error_plugin.py b/flaresolverr/bottle_plugins/error_plugin.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d995086d3b122eb05b386c105b0f7282ad4b2c1
--- /dev/null
+++ b/flaresolverr/bottle_plugins/error_plugin.py
@@ -0,0 +1,22 @@
+from bottle import response
+import logging
+
+
+def error_plugin(callback):
+ """
+ Bottle plugin to handle exceptions
+ https://stackoverflow.com/a/32764250
+ """
+
+ # Wrap the route callback: on any exception, log its message, set HTTP status 500
+ # and return {"error": <message>} in place of the callback's result.
+ def wrapper(*args, **kwargs):
+ try:
+ actual_response = callback(*args, **kwargs)
+ except Exception as e:
+ # Only str(e) is logged; the traceback is not included.
+ logging.error(str(e))
+ actual_response = {
+ "error": str(e)
+ }
+ response.status = 500
+ return actual_response
+
+ return wrapper
diff --git a/flaresolverr/bottle_plugins/logger_plugin.py b/flaresolverr/bottle_plugins/logger_plugin.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf840e9840204e03d5dfecf9681a0c1523428c1d
--- /dev/null
+++ b/flaresolverr/bottle_plugins/logger_plugin.py
@@ -0,0 +1,23 @@
+from bottle import request, response
+import logging
+
+
+def logger_plugin(callback):
+ """
+ Bottle plugin to use logging module
+ https://bottlepy.org/docs/dev/plugindev.html
+
+ Wrap a Bottle request so that a log line is emitted after it's handled.
+ (This decorator can be extended to take the desired logger as a param.)
+ """
+
+ # Runs the callback first, then logs "<remote addr> <method> <url> <status>".
+ def wrapper(*args, **kwargs):
+ actual_response = callback(*args, **kwargs)
+ # Requests whose URL ends with /health are not logged.
+ if not request.url.endswith("/health"):
+ logging.info('%s %s %s %s' % (request.remote_addr,
+ request.method,
+ request.url,
+ response.status))
+ return actual_response
+
+ return wrapper
diff --git a/flaresolverr/bottle_plugins/prometheus_plugin.py b/flaresolverr/bottle_plugins/prometheus_plugin.py
new file mode 100644
index 0000000000000000000000000000000000000000..36b767b62c076cbdee8d8a1cfe96e59906dbfe47
--- /dev/null
+++ b/flaresolverr/bottle_plugins/prometheus_plugin.py
@@ -0,0 +1,66 @@
+import logging
+import os
+import urllib.parse
+
+from bottle import request
+from dtos import V1RequestBase, V1ResponseBase
+from metrics import start_metrics_http_server, REQUEST_COUNTER, REQUEST_DURATION
+
+# Metrics are off unless PROMETHEUS_ENABLED is "true" (case-insensitive).
+# PROMETHEUS_PORT defaults to 8192; both values are read once, at import time.
+PROMETHEUS_ENABLED = os.environ.get('PROMETHEUS_ENABLED', 'false').lower() == 'true'
+PROMETHEUS_PORT = int(os.environ.get('PROMETHEUS_PORT', 8192))
+
+
+# Start the metrics HTTP server on PROMETHEUS_PORT; does nothing when metrics are disabled.
+def setup():
+ if PROMETHEUS_ENABLED:
+ start_metrics_http_server(PROMETHEUS_PORT)
+
+
+def prometheus_plugin(callback):
+    """
+    Bottle plugin to expose Prometheus metrics
+    https://bottlepy.org/docs/dev/plugindev.html
+
+    The wrapped callback's return value is passed through unchanged. When
+    PROMETHEUS_ENABLED is set, the response is also recorded in the
+    REQUEST_DURATION and REQUEST_COUNTER metrics; a failure while exporting
+    is logged as a warning and never affects the response.
+    """
+    def wrapper(*args, **kwargs):
+        actual_response = callback(*args, **kwargs)
+
+        if PROMETHEUS_ENABLED:
+            try:
+                export_metrics(actual_response)
+            except Exception as e:
+                logging.warning("Error exporting metrics: " + str(e))
+
+        return actual_response
+
+    def export_metrics(actual_response):
+        """Record duration and outcome metrics for one response dict."""
+        res = V1ResponseBase(actual_response)
+
+        if res.startTimestamp is None or res.endTimestamp is None:
+            # skip management and healthcheck endpoints
+            return
+
+        domain = "unknown"
+        if res.solution and res.solution.url:
+            domain = parse_domain_url(res.solution.url)
+        else:
+            # timeout error
+            # request.json is None when the request carried no JSON body;
+            # fall back to an empty dict instead of raising.
+            req = V1RequestBase(request.json or {})
+            if req.url:
+                domain = parse_domain_url(req.url)
+
+        # Timestamps are divided by 1000 before being observed.
+        run_time = (res.endTimestamp - res.startTimestamp) / 1000
+        REQUEST_DURATION.labels(domain=domain).observe(run_time)
+
+        # message may be None; treat that as an empty string so the counter
+        # is still incremented (with result "unknown") instead of raising.
+        message = res.message or ""
+        result = "unknown"
+        if message == "Challenge solved!":
+            result = "solved"
+        elif message == "Challenge not detected!":
+            result = "not_detected"
+        elif message.startswith("Error"):
+            result = "error"
+        REQUEST_COUNTER.labels(domain=domain, result=result).inc()
+
+    def parse_domain_url(url):
+        """Return the hostname part of url."""
+        parsed_url = urllib.parse.urlparse(url)
+        return parsed_url.hostname
+
+    return wrapper
diff --git a/flaresolverr/build_package.py b/flaresolverr/build_package.py
new file mode 100644
index 0000000000000000000000000000000000000000..748dbbeac1e8d817d65ee2cb2d989f68905c6b6f
--- /dev/null
+++ b/flaresolverr/build_package.py
@@ -0,0 +1,126 @@
+import os
+import platform
+import shutil
+import subprocess
+import sys
+import zipfile
+import tarfile
+
+import requests
+
+
+# Remove the build/, dist/ and dist_chrome/ folders located one level above this
+# script's directory. Best effort: any error (for example a missing folder) is ignored.
+def clean_files():
+ try:
+ shutil.rmtree(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'build'))
+ except Exception:
+ pass
+ try:
+ shutil.rmtree(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'dist'))
+ except Exception:
+ pass
+ try:
+ shutil.rmtree(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'dist_chrome'))
+ except Exception:
+ pass
+
+
+def download_chromium():
+    """Download a pinned Chromium snapshot and unpack it to ../dist_chrome/chrome.
+
+    The archive is streamed to disk, extracted, deleted, and the extracted
+    folder is renamed to "chrome". On non-Windows systems the bundled
+    executables are made executable.
+
+    Raises:
+        requests.HTTPError: if the download returns an error status.
+        requests.Timeout: if the server stops responding for 30 seconds.
+        OSError: if ../dist_chrome already exists or a file operation fails.
+    """
+    # https://commondatastorage.googleapis.com/chromium-browser-snapshots/index.html?prefix=Linux_x64/
+    # The same pinned revision is used on every platform. (Looking up the newest
+    # revision through the LAST_CHANGE endpoint was disabled in favour of the pin.)
+    revision = '1522586'
+    is_windows = os.name == 'nt'
+    arch = 'Win_x64' if is_windows else 'Linux_x64'
+    dl_file = 'chrome-win' if is_windows else 'chrome-linux'
+    dl_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'dist_chrome')
+    dl_path_folder = os.path.join(dl_path, dl_file)
+    dl_path_zip = dl_path_folder + '.zip'
+
+    print("Downloading revision: " + revision)
+
+    os.mkdir(dl_path)
+    # A timeout keeps the build from hanging forever on a stalled connection;
+    # with stream=True it applies to the connect and to every read.
+    with requests.get(
+            f'https://commondatastorage.googleapis.com/chromium-browser-snapshots/{arch}/{revision}/{dl_file}.zip',
+            stream=True, timeout=30) as r:
+        r.raise_for_status()
+        with open(dl_path_zip, 'wb') as f:
+            for chunk in r.iter_content(chunk_size=8192):
+                f.write(chunk)
+    print("File downloaded: " + dl_path_zip)
+    with zipfile.ZipFile(dl_path_zip, 'r') as zip_ref:
+        zip_ref.extractall(dl_path)
+    os.remove(dl_path_zip)
+
+    chrome_path = os.path.join(dl_path, "chrome")
+    shutil.move(dl_path_folder, chrome_path)
+    print("Extracted in: " + chrome_path)
+
+    if not is_windows:
+        # Give executable permissions for *nix
+        # file * | grep executable | cut -d: -f1
+        print("Giving executable permissions...")
+        execs = ['chrome', 'chrome_crashpad_handler', 'chrome_sandbox', 'chrome-wrapper', 'xdg-mime', 'xdg-settings']
+        for exec_file in execs:
+            exec_path = os.path.join(chrome_path, exec_file)
+            os.chmod(exec_path, 0o755)
+
+# Run PyInstaller on src/flaresolverr.py from the parent of the current working directory,
+# bundling package.json and the downloaded dist_chrome/chrome folder as data.
+# Raises Exception (after printing PyInstaller's stderr) when the exit code is non-zero.
+def run_pyinstaller():
+ # PyInstaller's --add-data separator is ';' on Windows and ':' elsewhere.
+ sep = ';' if os.name == 'nt' else ':'
+ result = subprocess.run([sys.executable, "-m", "PyInstaller",
+ "--icon", "resources/flaresolverr_logo.ico",
+ "--add-data", f"package.json{sep}.",
+ "--add-data", f"{os.path.join('dist_chrome', 'chrome')}{sep}chrome",
+ os.path.join("src", "flaresolverr.py")],
+ cwd=os.pardir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ if result.returncode != 0:
+ print(result.stderr.decode('utf-8'))
+ raise Exception("Error running pyInstaller")
+
+
+# Move dist/flaresolverr into dist/package/ and archive the package folder:
+# a .zip on Windows, a .tar.gz elsewhere.
+def compress_package():
+ dist_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'dist')
+ package_folder = os.path.join(dist_folder, 'package')
+ shutil.move(os.path.join(dist_folder, 'flaresolverr'), os.path.join(package_folder, 'flaresolverr'))
+ print("Package folder: " + package_folder)
+
+ compr_format = 'zip' if os.name == 'nt' else 'gztar'
+ compr_file_name = 'flaresolverr_windows_x64' if os.name == 'nt' else 'flaresolverr_linux_x64'
+ compr_file_path = os.path.join(dist_folder, compr_file_name)
+
+ if compr_format == 'zip':
+ shutil.make_archive(compr_file_path, compr_format, package_folder)
+ print("Compressed file path: " + compr_file_path)
+ else:
+ # Tar filter: reset owner ids to 0 and clear owner names, so the archive
+ # does not record the building user's identity.
+ def _reset_tarinfo(tarinfo):
+ tarinfo.uid = 0
+ tarinfo.gid = 0
+ tarinfo.uname = ""
+ tarinfo.gname = ""
+ return tarinfo
+
+ tar_path = compr_file_path + '.tar.gz'
+ with tarfile.open(tar_path, 'w:gz') as tar:
+ # Each top-level entry is added under its own name, so archive paths are
+ # relative to the package folder.
+ for entry in os.listdir(package_folder):
+ fullpath = os.path.join(package_folder, entry)
+ tar.add(fullpath, arcname=entry, filter=_reset_tarinfo)
+ print("Compressed file path: " + tar_path)
+
+# Script entry point. The steps run in a fixed order: clean old output, download
+# Chromium, build the executable, then compress the result.
+if __name__ == "__main__":
+ print("Building package...")
+ print("Platform: " + platform.platform())
+
+ print("Cleaning previous build...")
+ clean_files()
+
+ print("Downloading Chromium...")
+ download_chromium()
+
+ print("Building pyinstaller executable... ")
+ run_pyinstaller()
+
+ print("Compressing package... ")
+ compress_package()
+
+# NOTE: PyInstaller must be installed before running this script: python -m pip install pyinstaller
diff --git a/flaresolverr/dtos.py b/flaresolverr/dtos.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbe7fd89189766c6f6510c60ede7e88a95b29b5e
--- /dev/null
+++ b/flaresolverr/dtos.py
@@ -0,0 +1,94 @@
+
+# String constants for the two status values defined by this module.
+STATUS_OK = "ok"
+STATUS_ERROR = "error"
+
+
+# Attribute bag for the result of a challenge resolution.
+# Every field defaults to None at class level; __init__ copies every key of the
+# given dict onto the instance, so missing keys keep the default and keys that are
+# not declared here are accepted as extra attributes.
+class ChallengeResolutionResultT:
+ url: str = None
+ status: int = None
+ headers: list = None
+ response: str = None
+ cookies: list = None
+ userAgent: str = None
+ screenshot: str | None = None
+ turnstile_token: str = None
+
+ def __init__(self, _dict):
+ self.__dict__.update(_dict)
+
+
+# Attribute bag pairing a status and message with an optional result.
+# __init__ copies every key of the given dict onto the instance; a non-None
+# "result" value is then wrapped in ChallengeResolutionResultT.
+class ChallengeResolutionT:
+ status: str = None
+ message: str = None
+ result: ChallengeResolutionResultT = None
+
+ def __init__(self, _dict):
+ self.__dict__.update(_dict)
+ if self.result is not None:
+ self.result = ChallengeResolutionResultT(self.result)
+
+
+# Attribute bag for a /v1 request body.
+# Every field defaults to None at class level; __init__ copies every key of the
+# given dict onto the instance without validation, so _dict must be a mapping.
+class V1RequestBase(object):
+ # V1RequestBase
+ cmd: str = None
+ cookies: list = None
+ maxTimeout: int = None
+ proxy: dict = None
+ session: str = None
+ session_ttl_minutes: int = None
+ headers: list = None # deprecated v2.0.0, not used
+ userAgent: str = None # deprecated v2.0.0, not used
+
+ # V1Request
+ url: str = None
+ postData: str = None
+ returnOnlyCookies: bool = None
+ returnScreenshot: bool = None
+ download: bool = None # deprecated v2.0.0, not used
+ returnRawHtml: bool = None # deprecated v2.0.0, not used
+ waitInSeconds: int = None
+ # Optional resource blocking flag (blocks images, CSS, and fonts)
+ disableMedia: bool = None
+ # Optional when you've got a turnstile captcha that needs to be clicked after X number of Tab presses
+ tabs_till_verify : int = None
+
+ def __init__(self, _dict):
+ self.__dict__.update(_dict)
+
+
+# Attribute bag for a /v1 response.
+# __init__ copies every key of the given dict onto the instance; a non-None
+# "solution" value is then wrapped in ChallengeResolutionResultT.
+class V1ResponseBase(object):
+ # V1ResponseBase
+ status: str = None
+ message: str = None
+ session: str = None
+ sessions: list[str] = None
+ startTimestamp: int = None
+ endTimestamp: int = None
+ version: str = None
+
+ # V1ResponseSolution
+ solution: ChallengeResolutionResultT = None
+
+ # hidden vars
+ # Defaults to False. The name ends with two underscores, so Python does not
+ # name-mangle it and it can be read from outside the class as res.__error_500__.
+ __error_500__: bool = False
+
+ def __init__(self, _dict):
+ self.__dict__.update(_dict)
+ if self.solution is not None:
+ self.solution = ChallengeResolutionResultT(self.solution)
+
+
+# Attribute bag with msg, version and userAgent fields, all defaulting to None.
+# __init__ copies every key of the given dict onto the instance.
+class IndexResponse(object):
+ msg: str = None
+ version: str = None
+ userAgent: str = None
+
+ def __init__(self, _dict):
+ self.__dict__.update(_dict)
+
+
+# Attribute bag with a single status field, defaulting to None.
+# __init__ copies every key of the given dict onto the instance.
+class HealthResponse(object):
+ status: str = None
+
+ def __init__(self, _dict):
+ self.__dict__.update(_dict)
diff --git a/flaresolverr/flaresolverr.py b/flaresolverr/flaresolverr.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7cf80a26650a9ef30fc03ea46166a2a4c875084
--- /dev/null
+++ b/flaresolverr/flaresolverr.py
@@ -0,0 +1,155 @@
+import json
+import logging
+import os
+import sys
+
+import certifi
+from bottle import run, response, Bottle, request, ServerAdapter
+
+from bottle_plugins.error_plugin import error_plugin
+from bottle_plugins.logger_plugin import logger_plugin
+from bottle_plugins import prometheus_plugin
+from dtos import V1RequestBase
+import flaresolverr_service
+import utils
+
+# Optional default proxy settings, read once from the environment at import time.
+# controller_v1 applies them to requests that carry no "proxy" value of their own.
+env_proxy_url = os.environ.get('PROXY_URL', None)
+env_proxy_username = os.environ.get('PROXY_USERNAME', None)
+env_proxy_password = os.environ.get('PROXY_PASSWORD', None)
+
+
+class JSONErrorBottle(Bottle):
+ """
+ Handle 404 errors
+ """
+ # Overrides Bottle's default error page: the error body and status code are
+ # returned as a JSON document with an application/json content type.
+ def default_error_handler(self, res):
+ response.content_type = 'application/json'
+ return json.dumps(dict(error=res.body, status_code=res.status_code))
+
+
+# Application object on which the routes below are registered.
+app = JSONErrorBottle()
+
+
+@app.route('/')
+def index():
+ """
+ Show welcome message
+ """
+ # The service's response object is converted to a plain dict before being returned.
+ res = flaresolverr_service.index_endpoint()
+ return utils.object_to_dict(res)
+
+
+@app.route('/health')
+def health():
+ """
+ Healthcheck endpoint.
+ This endpoint is special because it doesn't print traces
+ """
+ # The service's response object is converted to a plain dict before being returned.
+ res = flaresolverr_service.health_endpoint()
+ return utils.object_to_dict(res)
+
+
+@app.post('/v1')
+def controller_v1():
+ """
+ Controller v1
+ """
+ # A request without a JSON body is treated as an empty dict.
+ data = request.json or {}
+ # The request has no proxy of its own and only PROXY_URL is configured: use the URL alone.
+ if (('proxy' not in data or not data.get('proxy')) and env_proxy_url is not None and (env_proxy_username is None and env_proxy_password is None)):
+ logging.info('Using proxy URL ENV')
+ data['proxy'] = {"url": env_proxy_url}
+ # The request has no proxy of its own and PROXY_URL plus a username and/or password
+ # are configured: pass all three (an unset credential is passed as None).
+ if (('proxy' not in data or not data.get('proxy')) and env_proxy_url is not None and (env_proxy_username is not None or env_proxy_password is not None)):
+ logging.info('Using proxy URL, username & password ENVs')
+ data['proxy'] = {"url": env_proxy_url, "username": env_proxy_username, "password": env_proxy_password}
+ req = V1RequestBase(data)
+ res = flaresolverr_service.controller_v1_endpoint(req)
+ # The service flags failures on the response object; they are reported as HTTP 500.
+ if res.__error_500__:
+ response.status = 500
+ return utils.object_to_dict(res)
+
+
+# Script entry point: check the Python version, configure certificates and logging,
+# optionally test the browser installation, install the Bottle plugins and start
+# the web server.
+if __name__ == "__main__":
+ # check python version
+ if sys.version_info < (3, 9):
+ raise Exception("The Python version is less than 3.9, a version equal to or higher is required.")
+
+ # fix for HEADLESS=false in Windows binary
+ # https://stackoverflow.com/a/27694505
+ if os.name == 'nt':
+ import multiprocessing
+ multiprocessing.freeze_support()
+
+ # fix ssl certificates for compiled binaries
+ # https://github.com/pyinstaller/pyinstaller/issues/7229
+ # https://stackoverflow.com/q/55736855
+ os.environ["REQUESTS_CA_BUNDLE"] = certifi.where()
+ os.environ["SSL_CERT_FILE"] = certifi.where()
+
+ # validate configuration
+ # Defaults: LOG_LEVEL=info, no log file, HOST=0.0.0.0, PORT=8191.
+ log_level = os.environ.get('LOG_LEVEL', 'info').upper()
+ log_file = os.environ.get('LOG_FILE', None)
+ log_html = utils.get_config_log_html()
+ headless = utils.get_config_headless()
+ server_host = os.environ.get('HOST', '0.0.0.0')
+ server_port = int(os.environ.get('PORT', 8191))
+
+ # configure logger
+ # In DEBUG mode each line also carries the thread id as "ReqId".
+ logger_format = '%(asctime)s %(levelname)-8s %(message)s'
+ if log_level == 'DEBUG':
+ logger_format = '%(asctime)s %(levelname)-8s ReqId %(thread)s %(message)s'
+ if log_file:
+ # Log to stdout and to the file; the file's directory is created if missing.
+ log_file = os.path.realpath(log_file)
+ log_path = os.path.dirname(log_file)
+ os.makedirs(log_path, exist_ok=True)
+ logging.basicConfig(
+ format=logger_format,
+ level=log_level,
+ datefmt='%Y-%m-%d %H:%M:%S',
+ handlers=[
+ logging.StreamHandler(sys.stdout),
+ logging.FileHandler(log_file)
+ ]
+ )
+ else:
+ # Log to stdout only.
+ logging.basicConfig(
+ format=logger_format,
+ level=log_level,
+ datefmt='%Y-%m-%d %H:%M:%S',
+ handlers=[
+ logging.StreamHandler(sys.stdout)
+ ]
+ )
+
+ # disable warning traces from urllib3
+ logging.getLogger('urllib3').setLevel(logging.ERROR)
+ logging.getLogger('selenium.webdriver.remote.remote_connection').setLevel(logging.WARNING)
+ logging.getLogger('undetected_chromedriver').setLevel(logging.WARNING)
+
+ logging.info(f'FlareSolverr {utils.get_flaresolverr_version()}')
+ logging.debug('Debug log enabled')
+
+ # Get current OS for global variable
+ utils.get_current_platform()
+
+ # test browser installation
+ # Skipped only when SKIP_BROWSER_TEST is "true" (case-insensitive).
+ if os.environ.get('SKIP_BROWSER_TEST', 'false').lower() != 'true':
+ flaresolverr_service.test_browser_installation()
+ else:
+ logging.info("Skipping browser installation test for faster boot.")
+
+ # start bottle plugins
+ # plugin order is important
+ app.install(logger_plugin)
+ app.install(error_plugin)
+ prometheus_plugin.setup()
+ app.install(prometheus_plugin.prometheus_plugin)
+
+ # start webserver
+ # default server 'wsgiref' does not support concurrent requests
+ # https://github.com/FlareSolverr/FlareSolverr/issues/680
+ # https://github.com/Pylons/waitress/issues/31
+ # Bottle server adapter that serves the app with waitress, using poll().
+ class WaitressServerPoll(ServerAdapter):
+ def run(self, handler):
+ from waitress import serve
+ serve(handler, host=self.host, port=self.port, asyncore_use_poll=True)
+ run(app, host=server_host, port=server_port, quiet=True, server=WaitressServerPoll)
diff --git a/flaresolverr/flaresolverr_service.py b/flaresolverr/flaresolverr_service.py
new file mode 100644
index 0000000000000000000000000000000000000000..814aeff40cf883088dd0ae5c16b6d025cbb9d2cc
--- /dev/null
+++ b/flaresolverr/flaresolverr_service.py
@@ -0,0 +1,519 @@
+import logging
+import platform
+import sys
+import time
+from datetime import timedelta
+from html import escape
+from urllib.parse import unquote, quote
+
+from func_timeout import FunctionTimedOut, func_timeout
+from selenium.common import TimeoutException
+from selenium.webdriver.chrome.webdriver import WebDriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support.expected_conditions import (
+ presence_of_element_located, staleness_of, title_is)
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.support.wait import WebDriverWait
+
+import utils
+from dtos import (STATUS_ERROR, STATUS_OK, ChallengeResolutionResultT,
+ ChallengeResolutionT, HealthResponse, IndexResponse,
+ V1RequestBase, V1ResponseBase)
+from sessions import SessionsStorage
+
+# Page titles that mean the request was blocked outright.
+# _evil_logic matches them with str.startswith() against the page title.
+ACCESS_DENIED_TITLES = [
+    # Cloudflare
+    'Access denied',
+    # Cloudflare http://bitturk.net/ Firefox
+    'Attention Required! | Cloudflare'
+]
+# CSS selectors whose presence on the page also means the request was blocked.
+ACCESS_DENIED_SELECTORS = [
+    # Cloudflare
+    'div.cf-error-title span.cf-code-label span',
+    # Cloudflare http://bitturk.net/ Firefox
+    '#cf-error-details div.cf-error-overview h1'
+]
+# Page titles of challenge (interstitial) pages.
+# Detection compares them case-insensitively; the wait loop uses title_is().
+CHALLENGE_TITLES = [
+    # Cloudflare
+    'Just a moment...',
+    # DDoS-GUARD
+    'DDoS-Guard'
+]
+# CSS selectors that mark a challenge page; the wait loop polls until none is present.
+CHALLENGE_SELECTORS = [
+    # Cloudflare
+    '#cf-challenge-running', '.ray_id', '.attack-box', '#cf-please-wait', '#challenge-spinner', '#trk_jschal_js', '#turnstile-wrapper', '.lds-ring',
+    # Custom CloudFlare for EbookParadijs, Film-Paleis, MuziekFabriek and Puur-Hollands
+    'td.info #js_info',
+    # Fairlane / pararius.com
+    'div.vc div.text-box h2'
+]
+
+# CSS selectors used to detect a Turnstile widget (its hidden response input).
+TURNSTILE_SELECTORS = [
+    "input[name='cf-turnstile-response']"
+]
+
+# Timeout, in seconds, passed to every WebDriverWait in this module.
+SHORT_TIMEOUT = 1
+# Module-wide session registry shared by the session commands and _resolve_challenge.
+SESSIONS_STORAGE = SessionsStorage()
+
+
+def test_browser_installation():
+    """Log the detected browser setup and abort start-up when it is unusable.
+
+    Logs the platform, the Chrome / Chromium executable path, its major
+    version and the User-Agent reported by ``utils``. Calls ``sys.exit(1)``
+    when the executable path is ``None`` or the major version is empty.
+    """
+    logging.info("Testing web browser installation...")
+    logging.info("Platform: " + platform.platform())
+
+    browser_path = utils.get_chrome_exe_path()
+    if browser_path is None:
+        logging.error("Chrome / Chromium web browser not installed!")
+        sys.exit(1)
+    logging.info("Chrome / Chromium path: " + browser_path)
+
+    major_version = utils.get_chrome_major_version()
+    if major_version == '':
+        logging.error("Chrome / Chromium version not detected!")
+        sys.exit(1)
+    logging.info("Chrome / Chromium major version: " + major_version)
+
+    logging.info("Launching web browser...")
+    logging.info("FlareSolverr User-Agent: " + utils.get_user_agent())
+    logging.info("Test successful!")
+
+
+def index_endpoint() -> IndexResponse:
+    """Build the index response: ready message, service version and User-Agent."""
+    response = IndexResponse({})
+    response.msg = "FlareSolverr is ready!"
+    response.version = utils.get_flaresolverr_version()
+    response.userAgent = utils.get_user_agent()
+    return response
+
+
+def health_endpoint() -> HealthResponse:
+    """Build the health-check response; its status is always ``STATUS_OK``."""
+    response = HealthResponse({})
+    response.status = STATUS_OK
+    return response
+
+
+def controller_v1_endpoint(req: V1RequestBase) -> V1ResponseBase:
+    """Run a ``/v1`` command and always return a response object.
+
+    Any exception raised by the command handler is converted into an error
+    response (``status`` = ``STATUS_ERROR``, ``__error_500__`` = ``True``,
+    ``message`` prefixed with ``"Error: "``). Start/end timestamps in
+    milliseconds and the service version are stamped on every response.
+    """
+    started_at_ms = int(time.time() * 1000)
+    logging.info(f"Incoming request => POST /v1 body: {utils.object_to_dict(req)}")
+    try:
+        response = _controller_v1_handler(req)
+    except Exception as err:
+        response = V1ResponseBase({})
+        response.__error_500__ = True
+        response.status = STATUS_ERROR
+        response.message = "Error: " + str(err)
+        logging.error(response.message)
+
+    response.startTimestamp = started_at_ms
+    response.endTimestamp = int(time.time() * 1000)
+    response.version = utils.get_flaresolverr_version()
+    logging.debug(f"Response => POST /v1 body: {utils.object_to_dict(response)}")
+    elapsed_seconds = (response.endTimestamp - response.startTimestamp) / 1000
+    logging.info(f"Response in {elapsed_seconds} s")
+    return response
+
+
+def _controller_v1_handler(req: V1RequestBase) -> V1ResponseBase:
+    """Validate a ``/v1`` request and dispatch it to the matching command.
+
+    ``req.maxTimeout`` is defaulted to 60000 when missing or below 1.
+
+    Raises:
+        Exception: if ``cmd`` is missing or is not a known command.
+    """
+    # validations
+    if req.cmd is None:
+        raise Exception("Request parameter 'cmd' is mandatory.")
+    if req.headers is not None:
+        logging.warning("Request parameter 'headers' was removed in FlareSolverr v2.")
+    if req.userAgent is not None:
+        logging.warning("Request parameter 'userAgent' was removed in FlareSolverr v2.")
+
+    # defaults
+    if req.maxTimeout is None or int(req.maxTimeout) < 1:
+        req.maxTimeout = 60000
+
+    # command table, checked in order with == so non-string values simply never match
+    commands = (
+        ('sessions.create', _cmd_sessions_create),
+        ('sessions.list', _cmd_sessions_list),
+        ('sessions.destroy', _cmd_sessions_destroy),
+        ('request.get', _cmd_request_get),
+        ('request.post', _cmd_request_post),
+    )
+    for command_name, command_handler in commands:
+        if req.cmd == command_name:
+            return command_handler(req)
+
+    raise Exception(f"Request parameter 'cmd' = '{req.cmd}' is invalid.")
+
+
+def _cmd_request_get(req: V1RequestBase) -> V1ResponseBase:
+    """Handle the 'request.get' command.
+
+    Validates the request, resolves the page with a GET navigation and copies
+    the challenge status, message and result into the response.
+
+    Raises:
+        Exception: if 'url' is missing or 'postData' is supplied.
+    """
+    # do some validations
+    if req.url is None:
+        raise Exception("Request parameter 'url' is mandatory in 'request.get' command.")
+    if req.postData is not None:
+        # the request field is 'postData'; the message used to name a non-existent 'postBody'
+        raise Exception("Cannot use 'postData' when sending a GET request.")
+    if req.returnRawHtml is not None:
+        logging.warning("Request parameter 'returnRawHtml' was removed in FlareSolverr v2.")
+    if req.download is not None:
+        logging.warning("Request parameter 'download' was removed in FlareSolverr v2.")
+
+    challenge_res = _resolve_challenge(req, 'GET')
+    res = V1ResponseBase({})
+    res.status = challenge_res.status
+    res.message = challenge_res.message
+    res.solution = challenge_res.result
+    return res
+
+
+def _cmd_request_post(req: V1RequestBase) -> V1ResponseBase:
+    """Handle the 'request.post' command.
+
+    Validates the request, resolves the page by submitting 'postData' as a
+    form POST and copies the challenge status, message and result into the
+    response.
+
+    Raises:
+        Exception: if 'postData' or 'url' is missing.
+    """
+    # do some validations
+    if req.postData is None:
+        raise Exception("Request parameter 'postData' is mandatory in 'request.post' command.")
+    if req.url is None:
+        # without this check the form would be generated with action="None"
+        raise Exception("Request parameter 'url' is mandatory in 'request.post' command.")
+    if req.returnRawHtml is not None:
+        logging.warning("Request parameter 'returnRawHtml' was removed in FlareSolverr v2.")
+    if req.download is not None:
+        logging.warning("Request parameter 'download' was removed in FlareSolverr v2.")
+
+    challenge_res = _resolve_challenge(req, 'POST')
+    res = V1ResponseBase({})
+    res.status = challenge_res.status
+    res.message = challenge_res.message
+    res.solution = challenge_res.result
+    return res
+
+
+def _cmd_sessions_create(req: V1RequestBase) -> V1ResponseBase:
+    """Handle 'sessions.create': create the session or report that it already exists.
+
+    The response always carries ``STATUS_OK`` and the session id; only the
+    message differs between a new and a pre-existing session.
+    """
+    logging.debug("Creating new session...")
+
+    session, is_new = SESSIONS_STORAGE.create(session_id=req.session, proxy=req.proxy)
+    message = "Session created successfully." if is_new else "Session already exists."
+
+    return V1ResponseBase({
+        "status": STATUS_OK,
+        "message": message,
+        "session": session.session_id
+    })
+
+
+def _cmd_sessions_list(req: V1RequestBase) -> V1ResponseBase:
+    """Handle 'sessions.list': return the ids of all stored sessions.
+
+    ``req`` is not read; it is accepted so every command has the same signature.
+    """
+    payload = {
+        "status": STATUS_OK,
+        "message": "",
+        "sessions": SESSIONS_STORAGE.session_ids()
+    }
+    return V1ResponseBase(payload)
+
+
+def _cmd_sessions_destroy(req: V1RequestBase) -> V1ResponseBase:
+    """Handle 'sessions.destroy': remove the session named by ``req.session``.
+
+    Raises:
+        Exception: if no session with that id is stored.
+    """
+    if not SESSIONS_STORAGE.destroy(req.session):
+        raise Exception("The session doesn't exist.")
+
+    return V1ResponseBase({
+        "status": STATUS_OK,
+        "message": "The session has been removed."
+    })
+
+
+def _resolve_challenge(req: V1RequestBase, method: str) -> ChallengeResolutionT:
+    """Obtain a webdriver and run ``_evil_logic`` under the request timeout.
+
+    With ``req.session`` set, the driver comes from ``SESSIONS_STORAGE`` and
+    is left running afterwards. Otherwise a one-off driver is created and
+    always destroyed in the ``finally`` clause.
+
+    Raises:
+        Exception: on timeout (``req.maxTimeout`` ms) or on any error raised
+            while resolving; the message is prefixed with
+            ``'Error solving the challenge. '``.
+    """
+    timeout = int(req.maxTimeout) / 1000  # milliseconds -> seconds
+    driver = None
+    try:
+        if not req.session:
+            driver = utils.get_webdriver(req.proxy)
+            logging.debug('New instance of webdriver has been created to perform the request')
+        else:
+            session_id = req.session
+            ttl = None
+            if req.session_ttl_minutes:
+                ttl = timedelta(minutes=req.session_ttl_minutes)
+            session, fresh = SESSIONS_STORAGE.get(session_id, ttl)
+
+            if not fresh:
+                logging.debug(f"existing session is used to perform the request (session_id={session_id}, "
+                              f"lifetime={str(session.lifetime())}, ttl={str(ttl)})")
+            else:
+                logging.debug(f"new session created to perform the request (session_id={session_id})")
+
+            driver = session.driver
+        return func_timeout(timeout, _evil_logic, (req, driver, method))
+    except FunctionTimedOut:
+        raise Exception(f'Error solving the challenge. Timeout after {timeout} seconds.')
+    except Exception as e:
+        # newlines are escaped so the message stays on one line
+        raise Exception('Error solving the challenge. ' + str(e).replace('\n', '\\n'))
+    finally:
+        # only one-off drivers are torn down here; session drivers are kept
+        if not req.session and driver is not None:
+            if utils.PLATFORM_VERSION == "nt":
+                driver.close()
+            driver.quit()
+            logging.debug('A used instance of webdriver has been destroyed')
+
+
+def click_verify(driver: WebDriver, num_tabs: int = 1):
+    """Try to activate the Cloudflare verification control on the current page.
+
+    Two best-effort attempts are made and any exception in either is only
+    logged at debug level:
+
+    1. keyboard: wait, press TAB ``num_tabs`` times, then SPACE;
+    2. mouse: locate the 'Verify you are human' input button and click it.
+
+    The function then sleeps 2 seconds before returning.
+
+    Args:
+        driver: webdriver showing the challenge page.
+        num_tabs: number of TAB key presses sent before SPACE.
+    """
+    try:
+        logging.debug("Try to find the Cloudflare verify checkbox...")
+        actions = ActionChains(driver)
+        # queued actions only run when perform() is called below
+        actions.pause(5)
+        for _ in range(num_tabs):
+            actions.send_keys(Keys.TAB).pause(0.1)
+        actions.pause(1)
+        actions.send_keys(Keys.SPACE).perform()
+
+        logging.debug(f"Cloudflare verify checkbox clicked after {num_tabs} tabs!")
+    except Exception:
+        logging.debug("Cloudflare verify checkbox not found on the page.")
+    finally:
+        # always return to the top-level document before the second attempt
+        driver.switch_to.default_content()
+
+    try:
+        logging.debug("Try to find the Cloudflare 'Verify you are human' button...")
+        button = driver.find_element(
+            by=By.XPATH,
+            value="//input[@type='button' and @value='Verify you are human']",
+        )
+        # NOTE(review): find_element raises when nothing matches, so this check
+        # looks redundant; confirm before removing
+        if button:
+            actions = ActionChains(driver)
+            actions.move_to_element_with_offset(button, 5, 7)
+            actions.click(button)
+            actions.perform()
+            logging.debug("The Cloudflare 'Verify you are human' button found and clicked!")
+    except Exception:
+        logging.debug("The Cloudflare 'Verify you are human' button not found on the page.")
+
+    # give the page time to react to the click
+    time.sleep(2)
+
+def _get_turnstile_token(driver: WebDriver, tabs: int):
+    """Click the Turnstile widget until its response input holds a new token.
+
+    Reads the initial value of ``input[name='cf-turnstile-response']`` and
+    repeats ``click_verify`` until the value is non-empty and differs from
+    that initial value.
+
+    The loop has no exit of its own. It ends only when a token appears, an
+    exception is raised, or the ``func_timeout`` wrapper in
+    ``_resolve_challenge`` interrupts the call.
+
+    Args:
+        driver: webdriver showing the page with the Turnstile widget.
+        tabs: number of TAB presses forwarded to ``click_verify``.
+
+    Returns:
+        The new token string.
+    """
+    token_input = driver.find_element(By.CSS_SELECTOR, "input[name='cf-turnstile-response']")
+    current_value = token_input.get_attribute("value")
+    while True:
+        click_verify(driver, num_tabs=tabs)
+        turnstile_token = token_input.get_attribute("value")
+        if turnstile_token:
+            # a value equal to the initial one is treated as "not solved yet"
+            if turnstile_token != current_value:
+                logging.info(f"Turnstile token: {turnstile_token}")
+                return turnstile_token
+        logging.debug(f"Failed to extract token possibly click failed")
+
+        # reset focus: insert a button at the top-left of the page and focus it
+        # (presumably so the next TAB sequence starts from a known element)
+        driver.execute_script("""
+            let el = document.createElement('button');
+            el.style.position='fixed';
+            el.style.top='0';
+            el.style.left='0';
+            document.body.prepend(el);
+            el.focus();
+        """)
+        time.sleep(1)
+
+def _resolve_turnstile_captcha(req: V1RequestBase, driver: WebDriver):
+    """Load ``req.url`` and, if a Turnstile widget is present, obtain its token.
+
+    Nothing is done (and ``None`` is returned) when ``req.tabs_till_verify``
+    is ``None``.
+
+    Returns:
+        The Turnstile token, or ``None`` when no Turnstile selector matches.
+    """
+    if req.tabs_till_verify is None:
+        return None
+
+    logging.debug(f'Navigating to... {req.url} in order to pass the turnstile challenge')
+    driver.get(req.url)
+
+    # the first matching selector is enough to start the token extraction
+    for candidate in TURNSTILE_SELECTORS:
+        if driver.find_elements(By.CSS_SELECTOR, candidate):
+            logging.info("Turnstile challenge detected. Selector found: " + candidate)
+            return _get_turnstile_token(driver=driver, tabs=req.tabs_till_verify)
+
+    logging.debug(f'Turnstile challenge not found')
+    return None
+
+def _evil_logic(req: V1RequestBase, driver: WebDriver, method: str) -> ChallengeResolutionT:
+    """Load ``req.url`` in ``driver``, wait out any challenge and collect the result.
+
+    Steps, in order:
+
+    1. optionally block images / stylesheets / fonts through CDP;
+    2. navigate: POST form, plain GET, or the Turnstile flow when
+       ``req.tabs_till_verify`` is set;
+    3. apply ``req.cookies`` (if any) and load the page again;
+    4. raise if the page matches an access-denied title or selector;
+    5. if a challenge title or selector is detected, poll until all of them
+       are gone, calling ``click_verify`` after each timed-out attempt;
+    6. gather URL, cookies, User-Agent, Turnstile token and, unless
+       ``req.returnOnlyCookies`` is set, the page source; optionally a
+       screenshot.
+
+    The polling loop has no attempt limit of its own; ``_resolve_challenge``
+    runs this function under ``func_timeout``.
+
+    Args:
+        req: the request being served.
+        driver: webdriver used to load the page.
+        method: ``"POST"`` submits ``req.postData``; any other value is a GET.
+
+    Raises:
+        Exception: when the page matches ``ACCESS_DENIED_TITLES`` or
+            ``ACCESS_DENIED_SELECTORS``.
+    """
+    res = ChallengeResolutionT({})
+    res.status = STATUS_OK
+    res.message = ""
+
+    # optionally block resources like images/css/fonts using CDP
+    # the per-request flag, when present, overrides the configured default
+    disable_media = utils.get_config_disable_media()
+    if req.disableMedia is not None:
+        disable_media = req.disableMedia
+    if disable_media:
+        # each extension is listed in lower and upper case
+        block_urls = [
+            # Images
+            "*.png", "*.jpg", "*.jpeg", "*.gif", "*.webp", "*.bmp", "*.svg", "*.ico",
+            "*.PNG", "*.JPG", "*.JPEG", "*.GIF", "*.WEBP", "*.BMP", "*.SVG", "*.ICO",
+            "*.tiff", "*.tif", "*.jpe", "*.apng", "*.avif", "*.heic", "*.heif",
+            "*.TIFF", "*.TIF", "*.JPE", "*.APNG", "*.AVIF", "*.HEIC", "*.HEIF",
+            # Stylesheets
+            "*.css",
+            "*.CSS",
+            # Fonts
+            "*.woff", "*.woff2", "*.ttf", "*.otf", "*.eot",
+            "*.WOFF", "*.WOFF2", "*.TTF", "*.OTF", "*.EOT"
+        ]
+        try:
+            logging.debug("Network.setBlockedURLs: %s", block_urls)
+            driver.execute_cdp_cmd("Network.enable", {})
+            driver.execute_cdp_cmd("Network.setBlockedURLs", {"urls": block_urls})
+        except Exception:
+            # if CDP commands are not available or fail, ignore and continue
+            logging.debug("Network.setBlockedURLs failed or unsupported on this webdriver")
+
+    # navigate to the page
+    logging.debug(f"Navigating to... {req.url}")
+    turnstile_token = None
+
+    if method == "POST":
+        _post_request(req, driver)
+    else:
+        if req.tabs_till_verify is None:
+            driver.get(req.url)
+        else:
+            # the Turnstile flow performs its own driver.get(req.url)
+            turnstile_token = _resolve_turnstile_captcha(req, driver)
+
+    # set cookies if required
+    if req.cookies is not None and len(req.cookies) > 0:
+        logging.debug(f'Setting cookies...')
+        for cookie in req.cookies:
+            # delete first so an existing cookie with the same name is replaced
+            driver.delete_cookie(cookie['name'])
+            driver.add_cookie(cookie)
+        # reload the page
+        if method == 'POST':
+            _post_request(req, driver)
+        else:
+            driver.get(req.url)
+
+    # wait for the page
+    if utils.get_config_log_html():
+        logging.debug(f"Response HTML:\n{driver.page_source}")
+    # keep a handle on the current <html> element; its staleness is used
+    # further down to detect the post-challenge reload
+    html_element = driver.find_element(By.TAG_NAME, "html")
+    page_title = driver.title
+
+    # find access denied titles
+    for title in ACCESS_DENIED_TITLES:
+        if page_title.startswith(title):
+            raise Exception('Cloudflare has blocked this request. '
+                            'Probably your IP is banned for this site, check in your web browser.')
+    # find access denied selectors
+    for selector in ACCESS_DENIED_SELECTORS:
+        found_elements = driver.find_elements(By.CSS_SELECTOR, selector)
+        if len(found_elements) > 0:
+            raise Exception('Cloudflare has blocked this request. '
+                            'Probably your IP is banned for this site, check in your web browser.')
+
+    # find challenge by title
+    challenge_found = False
+    for title in CHALLENGE_TITLES:
+        if title.lower() == page_title.lower():
+            challenge_found = True
+            logging.info("Challenge detected. Title found: " + page_title)
+            break
+    if not challenge_found:
+        # find challenge by selectors
+        for selector in CHALLENGE_SELECTORS:
+            found_elements = driver.find_elements(By.CSS_SELECTOR, selector)
+            if len(found_elements) > 0:
+                challenge_found = True
+                logging.info("Challenge detected. Selector found: " + selector)
+                break
+
+    attempt = 0
+    if challenge_found:
+        # each pass waits up to SHORT_TIMEOUT per title/selector; any timeout
+        # restarts the whole pass after trying to click the verify control
+        while True:
+            try:
+                attempt = attempt + 1
+                # wait until the title changes
+                for title in CHALLENGE_TITLES:
+                    logging.debug("Waiting for title (attempt " + str(attempt) + "): " + title)
+                    WebDriverWait(driver, SHORT_TIMEOUT).until_not(title_is(title))
+
+                # then wait until all the selectors disappear
+                for selector in CHALLENGE_SELECTORS:
+                    logging.debug("Waiting for selector (attempt " + str(attempt) + "): " + selector)
+                    WebDriverWait(driver, SHORT_TIMEOUT).until_not(
+                        presence_of_element_located((By.CSS_SELECTOR, selector)))
+
+                # all elements not found
+                break
+
+            except TimeoutException:
+                logging.debug("Timeout waiting for selector")
+
+                click_verify(driver)
+
+                # update the html (cloudflare reloads the page every 5 s)
+                html_element = driver.find_element(By.TAG_NAME, "html")
+
+        # waits until cloudflare redirection ends
+        logging.debug("Waiting for redirect")
+        # noinspection PyBroadException
+        try:
+            WebDriverWait(driver, SHORT_TIMEOUT).until(staleness_of(html_element))
+        except Exception:
+            logging.debug("Timeout waiting for redirect")
+
+        logging.info("Challenge solved!")
+        res.message = "Challenge solved!"
+    else:
+        logging.info("Challenge not detected!")
+        res.message = "Challenge not detected!"
+
+    challenge_res = ChallengeResolutionResultT({})
+    challenge_res.url = driver.current_url
+    challenge_res.status = 200  # todo: fix, selenium does not provide this info
+    challenge_res.cookies = driver.get_cookies()
+    challenge_res.userAgent = utils.get_user_agent(driver)
+    challenge_res.turnstile_token = turnstile_token
+
+    # headers and page source are only filled in when more than cookies was requested
+    if not req.returnOnlyCookies:
+        challenge_res.headers = {}  # todo: fix, selenium does not provide this info
+
+        if req.waitInSeconds and req.waitInSeconds > 0:
+            logging.info("Waiting " + str(req.waitInSeconds) + " seconds before returning the response...")
+            time.sleep(req.waitInSeconds)
+
+        challenge_res.response = driver.page_source
+
+    if req.returnScreenshot:
+        challenge_res.screenshot = driver.get_screenshot_as_base64()
+
+    res.result = challenge_res
+    return res
+
+
+def _post_request(req: V1RequestBase, driver: WebDriver):
+    """Submit ``req.postData`` to ``req.url`` as an HTML form POST.
+
+    ``req.postData`` is parsed as a URL-encoded query string (an optional
+    leading ``?`` is dropped). Each ``name=value`` pair becomes a text input
+    of a generated form; a field named ``submit`` is skipped. The form is
+    loaded through a ``data:`` URL and submitted by an inline script.
+
+    Args:
+        req: request providing ``url`` and ``postData``.
+        driver: webdriver that loads the generated page.
+    """
+    # NOTE(review): the HTML literals and the form-building loop were missing
+    # from this copy of the file (markup had been stripped). They are restored
+    # here to match the module's otherwise unused escape/quote/unquote imports;
+    # confirm against the original file.
+    post_form = f'<form id="hackForm" action="{req.url}" method="POST">'
+    query_string = req.postData if req.postData and req.postData[0] != '?' else req.postData[1:] if req.postData else ''
+    pairs = query_string.split('&')
+    for pair in pairs:
+        parts = pair.split('=', 1)
+        # noinspection PyBroadException
+        try:
+            name = unquote(parts[0])
+        except Exception:
+            name = parts[0]
+        if name == 'submit':
+            continue
+        # noinspection PyBroadException
+        try:
+            value = unquote(parts[1]) if len(parts) > 1 else ''
+        except Exception:
+            value = parts[1] if len(parts) > 1 else ''
+        # Protection of " character, for syntax
+        value = value.replace('"', '&quot;')
+        post_form += f'<input type="text" name="{escape(quote(name))}" value="{escape(quote(value))}"><br>'
+    post_form += '</form>'
+    html_content = f"""
+        <!DOCTYPE html>
+        <html>
+        <body>
+            {post_form}
+            <script>document.getElementById('hackForm').submit();</script>
+        </body>
+        </html>"""
+    driver.get("data:text/html;charset=utf-8,{html_content}".format(html_content=html_content))
diff --git a/flaresolverr/metrics.py b/flaresolverr/metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..4112dd1e70ab2267c9ef87a71d1c2c1557a4c867
--- /dev/null
+++ b/flaresolverr/metrics.py
@@ -0,0 +1,32 @@
+import logging
+
+from prometheus_client import Counter, Histogram, start_http_server
+import time
+
+# Counter of handled requests, labelled by target domain and result.
+REQUEST_COUNTER = Counter(
+    name='flaresolverr_request',
+    documentation='Total requests with result',
+    labelnames=['domain', 'result']
+)
+# Histogram of request durations in seconds, labelled by target domain.
+REQUEST_DURATION = Histogram(
+    name='flaresolverr_request_duration',
+    documentation='Request duration in seconds',
+    labelnames=['domain'],
+    buckets=[0, 10, 25, 50]
+)
+
+
+def serve(port):
+    """Start the Prometheus HTTP exporter on ``port`` and never return.
+
+    Used as the target of the daemon thread created by
+    ``start_metrics_http_server``; after starting the exporter it sleeps in
+    600-second intervals forever.
+    """
+    start_http_server(port=port)
+    while True:
+        time.sleep(600)
+
+
+def start_metrics_http_server(prometheus_port: int):
+    """Run the Prometheus exporter on ``prometheus_port`` in a daemon thread.
+
+    Returns immediately; the thread runs ``serve``.
+    """
+    logging.info(f"Serving Prometheus exporter on http://0.0.0.0:{prometheus_port}/metrics")
+    from threading import Thread
+    exporter_thread = Thread(target=serve, kwargs={"port": prometheus_port}, daemon=True)
+    exporter_thread.start()
diff --git a/flaresolverr/sessions.py b/flaresolverr/sessions.py
new file mode 100644
index 0000000000000000000000000000000000000000..30bb3c13eab2895676ab2719c61f40f2878471d8
--- /dev/null
+++ b/flaresolverr/sessions.py
@@ -0,0 +1,84 @@
+import logging
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+from typing import Optional, Tuple
+from uuid import uuid1
+
+from selenium.webdriver.chrome.webdriver import WebDriver
+
+import utils
+
+
+@dataclass
+class Session:
+    """A stored browser session: its id, its webdriver and its creation time."""
+    session_id: str
+    driver: WebDriver
+    created_at: datetime
+
+    def lifetime(self) -> timedelta:
+        """Return the time elapsed since ``created_at`` (using ``datetime.now()``)."""
+        now = datetime.now()
+        return now - self.created_at
+
+
+class SessionsStorage:
+    """In-memory registry of browser sessions, keyed by session id."""
+
+    def __init__(self):
+        # session_id -> Session
+        self.sessions = {}
+
+    def create(self, session_id: Optional[str] = None, proxy: Optional[dict] = None,
+               force_new: Optional[bool] = False) -> Tuple[Session, bool]:
+        """Return the session for ``session_id``, creating it when needed.
+
+        A random id (``uuid1``) is generated when ``session_id`` is empty.
+        With ``force_new`` any existing session under that id is destroyed
+        first. A new webdriver is only started when no session is stored
+        under the id.
+
+        Returns:
+            ``(session, True)`` when a new session was created,
+            ``(session, False)`` when an existing one was reused.
+        """
+        sid = session_id or str(uuid1())
+
+        if force_new:
+            self.destroy(sid)
+
+        if self.exists(sid):
+            return self.sessions[sid], False
+
+        # the driver is started before the creation time is taken
+        new_session = Session(sid, utils.get_webdriver(proxy), datetime.now())
+        self.sessions[sid] = new_session
+        return new_session, True
+
+    def exists(self, session_id: str) -> bool:
+        """Tell whether a session is stored under ``session_id``."""
+        return session_id in self.sessions
+
+    def destroy(self, session_id: str) -> bool:
+        """Remove the session and shut its webdriver down.
+
+        Returns:
+            ``True`` when a session was found and destroyed, ``False`` when
+            nothing is stored under ``session_id``.
+        """
+        doomed = self.sessions.pop(session_id, None)
+        if doomed is None:
+            return False
+
+        if utils.PLATFORM_VERSION == "nt":
+            doomed.driver.close()
+        doomed.driver.quit()
+        return True
+
+    def get(self, session_id: str, ttl: Optional[timedelta] = None) -> Tuple[Session, bool]:
+        """Return the session for ``session_id``, recreating it when older than ``ttl``.
+
+        Returns:
+            ``(session, fresh)`` where ``fresh`` is ``True`` when the session
+            was created (or recreated) by this call.
+        """
+        session, fresh = self.create(session_id)
+
+        expired = ttl is not None and not fresh and session.lifetime() > ttl
+        if expired:
+            logging.debug(f"session's lifetime has expired, so the session is recreated (session_id={session_id})")
+            return self.create(session_id, force_new=True)
+
+        return session, fresh
+
+    def session_ids(self) -> list[str]:
+        """Return the ids of all stored sessions."""
+        return list(self.sessions)
diff --git a/flaresolverr/tests.py b/flaresolverr/tests.py
new file mode 100644
index 0000000000000000000000000000000000000000..af49a68254f7fe8eb278d9c48661b2c6e4575212
--- /dev/null
+++ b/flaresolverr/tests.py
@@ -0,0 +1,655 @@
+import unittest
+from typing import Optional
+
+from webtest import TestApp
+
+from dtos import IndexResponse, HealthResponse, V1ResponseBase, STATUS_OK, STATUS_ERROR
+import flaresolverr
+import utils
+
+
+def _find_obj_by_key(key: str, value: str, _list: list) -> Optional[dict]:
+    """Return the first item of ``_list`` whose ``key`` entry equals ``value``, else ``None``."""
+    matches = (item for item in _list if item[key] == value)
+    return next(matches, None)
+
+
+class TestFlareSolverr(unittest.TestCase):
+
+ proxy_url = "http://127.0.0.1:8888"
+ proxy_socks_url = "socks5://127.0.0.1:1080"
+ google_url = "https://www.google.com"
+ post_url = "https://httpbin.org/post"
+ cloudflare_url = "https://nowsecure.nl/"
+ cloudflare_url_2 = "https://idope.se/torrent-list/harry/"
+ ddos_guard_url = "https://www.litres.ru/"
+ fairlane_url = "https://www.pararius.com/apartments/amsterdam"
+ custom_cloudflare_url = "https://www.muziekfabriek.org/"
+ cloudflare_blocked_url = "https://cpasbiens3.fr/index.php?do=search&subaction=search"
+
+ app = TestApp(flaresolverr.app)
+ # wait until the server is ready
+ app.get('/')
+
+ def test_wrong_endpoint(self):
+ res = self.app.get('/wrong', status=404)
+ self.assertEqual(res.status_code, 404)
+
+ body = res.json
+ self.assertEqual("Not found: '/wrong'", body['error'])
+ self.assertEqual(404, body['status_code'])
+
+ def test_index_endpoint(self):
+ res = self.app.get('/')
+ self.assertEqual(res.status_code, 200)
+
+ body = IndexResponse(res.json)
+ self.assertEqual("FlareSolverr is ready!", body.msg)
+ self.assertEqual(utils.get_flaresolverr_version(), body.version)
+ self.assertIn("Chrome/", body.userAgent)
+
+ def test_health_endpoint(self):
+ res = self.app.get('/health')
+ self.assertEqual(res.status_code, 200)
+
+ body = HealthResponse(res.json)
+ self.assertEqual(STATUS_OK, body.status)
+
+ def test_v1_endpoint_wrong_cmd(self):
+ res = self.app.post_json('/v1', {
+ "cmd": "request.bad",
+ "url": self.google_url
+ }, status=500)
+ self.assertEqual(res.status_code, 500)
+
+ body = V1ResponseBase(res.json)
+ self.assertEqual(STATUS_ERROR, body.status)
+ self.assertEqual("Error: Request parameter 'cmd' = 'request.bad' is invalid.", body.message)
+ self.assertGreater(body.startTimestamp, 10000)
+ self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
+ self.assertEqual(utils.get_flaresolverr_version(), body.version)
+
+ def test_v1_endpoint_request_get_no_cloudflare(self):
+ res = self.app.post_json('/v1', {
+ "cmd": "request.get",
+ "url": self.google_url
+ })
+ self.assertEqual(res.status_code, 200)
+
+ body = V1ResponseBase(res.json)
+ self.assertEqual(STATUS_OK, body.status)
+ self.assertEqual("Challenge not detected!", body.message)
+ self.assertGreater(body.startTimestamp, 10000)
+ self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
+ self.assertEqual(utils.get_flaresolverr_version(), body.version)
+
+ solution = body.solution
+ self.assertIn(self.google_url, solution.url)
+ self.assertEqual(solution.status, 200)
+ self.assertIs(len(solution.headers), 0)
+ self.assertIn("Google", solution.response)
+ self.assertGreater(len(solution.cookies), 0)
+ self.assertIn("Chrome/", solution.userAgent)
+
+ def test_v1_endpoint_request_get_disable_resources(self):
+ res = self.app.post_json("/v1", {
+ "cmd": "request.get",
+ "url": self.google_url,
+ "disableMedia": True
+ })
+ self.assertEqual(res.status_code, 200)
+
+ body = V1ResponseBase(res.json)
+ self.assertEqual(STATUS_OK, body.status)
+ self.assertEqual("Challenge not detected!", body.message)
+ self.assertGreater(body.startTimestamp, 10000)
+ self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
+ self.assertEqual(utils.get_flaresolverr_version(), body.version)
+
+ solution = body.solution
+ self.assertIn(self.google_url, solution.url)
+ self.assertEqual(solution.status, 200)
+ self.assertIs(len(solution.headers), 0)
+ self.assertIn("Google", solution.response)
+ self.assertGreater(len(solution.cookies), 0)
+ self.assertIn("Chrome/", solution.userAgent)
+
+ def test_v1_endpoint_request_get_cloudflare_js_1(self):
+ res = self.app.post_json('/v1', {
+ "cmd": "request.get",
+ "url": self.cloudflare_url
+ })
+ self.assertEqual(res.status_code, 200)
+
+ body = V1ResponseBase(res.json)
+ self.assertEqual(STATUS_OK, body.status)
+ self.assertEqual("Challenge solved!", body.message)
+ self.assertGreater(body.startTimestamp, 10000)
+ self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
+ self.assertEqual(utils.get_flaresolverr_version(), body.version)
+
+ solution = body.solution
+ self.assertIn(self.cloudflare_url, solution.url)
+ self.assertEqual(solution.status, 200)
+ self.assertIs(len(solution.headers), 0)
+ self.assertIn("nowSecure", solution.response)
+ self.assertGreater(len(solution.cookies), 0)
+ self.assertIn("Chrome/", solution.userAgent)
+
+ cf_cookie = _find_obj_by_key("name", "cf_clearance", solution.cookies)
+ self.assertIsNotNone(cf_cookie, "Cloudflare cookie not found")
+ self.assertGreater(len(cf_cookie["value"]), 30)
+
+ def test_v1_endpoint_request_get_cloudflare_js_2(self):
+ res = self.app.post_json('/v1', {
+ "cmd": "request.get",
+ "url": self.cloudflare_url_2
+ })
+ self.assertEqual(res.status_code, 200)
+
+ body = V1ResponseBase(res.json)
+ self.assertEqual(STATUS_OK, body.status)
+ self.assertEqual("Challenge solved!", body.message)
+ self.assertGreater(body.startTimestamp, 10000)
+ self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
+ self.assertEqual(utils.get_flaresolverr_version(), body.version)
+
+ solution = body.solution
+ self.assertIn(self.cloudflare_url_2, solution.url)
+ self.assertEqual(solution.status, 200)
+ self.assertIs(len(solution.headers), 0)
+ self.assertIn("harry - idope torrent search", solution.response)
+ self.assertGreater(len(solution.cookies), 0)
+ self.assertIn("Chrome/", solution.userAgent)
+
+ cf_cookie = _find_obj_by_key("name", "cf_clearance", solution.cookies)
+ self.assertIsNotNone(cf_cookie, "Cloudflare cookie not found")
+ self.assertGreater(len(cf_cookie["value"]), 30)
+
+ def test_v1_endpoint_request_get_ddos_guard_js(self):
+ res = self.app.post_json('/v1', {
+ "cmd": "request.get",
+ "url": self.ddos_guard_url
+ })
+ self.assertEqual(res.status_code, 200)
+
+ body = V1ResponseBase(res.json)
+ self.assertEqual(STATUS_OK, body.status)
+ self.assertEqual("Challenge solved!", body.message)
+ self.assertGreater(body.startTimestamp, 10000)
+ self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
+ self.assertEqual(utils.get_flaresolverr_version(), body.version)
+
+ solution = body.solution
+ self.assertIn(self.ddos_guard_url, solution.url)
+ self.assertEqual(solution.status, 200)
+ self.assertIs(len(solution.headers), 0)
+ self.assertIn("Литрес", solution.response)
+ self.assertGreater(len(solution.cookies), 0)
+ self.assertIn("Chrome/", solution.userAgent)
+
+ cf_cookie = _find_obj_by_key("name", "__ddg1_", solution.cookies)
+ self.assertIsNotNone(cf_cookie, "DDOS-Guard cookie not found")
+ self.assertGreater(len(cf_cookie["value"]), 10)
+
+ def test_v1_endpoint_request_get_fairlane_js(self):
+ res = self.app.post_json('/v1', {
+ "cmd": "request.get",
+ "url": self.fairlane_url
+ })
+ self.assertEqual(res.status_code, 200)
+
+ body = V1ResponseBase(res.json)
+ self.assertEqual(STATUS_OK, body.status)
+ self.assertEqual("Challenge solved!", body.message)
+ self.assertGreater(body.startTimestamp, 10000)
+ self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
+ self.assertEqual(utils.get_flaresolverr_version(), body.version)
+
+ solution = body.solution
+ self.assertIn(self.fairlane_url, solution.url)
+ self.assertEqual(solution.status, 200)
+ self.assertIs(len(solution.headers), 0)
+ self.assertIn("Rental Apartments Amsterdam", solution.response)
+ self.assertGreater(len(solution.cookies), 0)
+ self.assertIn("Chrome/", solution.userAgent)
+
+ cf_cookie = _find_obj_by_key("name", "fl_pass_v2_b", solution.cookies)
+ self.assertIsNotNone(cf_cookie, "Fairlane cookie not found")
+ self.assertGreater(len(cf_cookie["value"]), 50)
+
+ def test_v1_endpoint_request_get_custom_cloudflare_js(self):
+ res = self.app.post_json('/v1', {
+ "cmd": "request.get",
+ "url": self.custom_cloudflare_url
+ })
+ self.assertEqual(res.status_code, 200)
+
+ body = V1ResponseBase(res.json)
+ self.assertEqual(STATUS_OK, body.status)
+ self.assertEqual("Challenge solved!", body.message)
+ self.assertGreater(body.startTimestamp, 10000)
+ self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
+ self.assertEqual(utils.get_flaresolverr_version(), body.version)
+
+ solution = body.solution
+ self.assertIn(self.custom_cloudflare_url, solution.url)
+ self.assertEqual(solution.status, 200)
+ self.assertIs(len(solution.headers), 0)
+ self.assertIn("MuziekFabriek : Aanmelden", solution.response)
+ self.assertGreater(len(solution.cookies), 0)
+ self.assertIn("Chrome/", solution.userAgent)
+
+ cf_cookie = _find_obj_by_key("name", "ct_anti_ddos_key", solution.cookies)
+ self.assertIsNotNone(cf_cookie, "Custom Cloudflare cookie not found")
+ self.assertGreater(len(cf_cookie["value"]), 10)
+
+ # todo: test Cmd 'request.get' should return fail with Cloudflare CAPTCHA
+
+ def test_v1_endpoint_request_get_cloudflare_blocked(self):
+ res = self.app.post_json('/v1', {
+ "cmd": "request.get",
+ "url": self.cloudflare_blocked_url
+ }, status=500)
+ self.assertEqual(res.status_code, 500)
+
+ body = V1ResponseBase(res.json)
+ self.assertEqual(STATUS_ERROR, body.status)
+ self.assertEqual("Error: Error solving the challenge. Cloudflare has blocked this request. "
+ "Probably your IP is banned for this site, check in your web browser.", body.message)
+ self.assertGreater(body.startTimestamp, 10000)
+ self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
+ self.assertEqual(utils.get_flaresolverr_version(), body.version)
+
+ def test_v1_endpoint_request_get_cookies_param(self):
+ res = self.app.post_json('/v1', {
+ "cmd": "request.get",
+ "url": self.google_url,
+ "cookies": [
+ {
+ "name": "testcookie1",
+ "value": "testvalue1"
+ },
+ {
+ "name": "testcookie2",
+ "value": "testvalue2"
+ }
+ ]
+ })
+ self.assertEqual(res.status_code, 200)
+
+ body = V1ResponseBase(res.json)
+ self.assertEqual(STATUS_OK, body.status)
+ self.assertEqual("Challenge not detected!", body.message)
+ self.assertGreater(body.startTimestamp, 10000)
+ self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
+ self.assertEqual(utils.get_flaresolverr_version(), body.version)
+
+ solution = body.solution
+ self.assertIn(self.google_url, solution.url)
+ self.assertEqual(solution.status, 200)
+ self.assertIs(len(solution.headers), 0)
+ self.assertIn("Google", solution.response)
+ self.assertGreater(len(solution.cookies), 1)
+ self.assertIn("Chrome/", solution.userAgent)
+
+ user_cookie1 = _find_obj_by_key("name", "testcookie1", solution.cookies)
+ self.assertIsNotNone(user_cookie1, "User cookie 1 not found")
+ self.assertEqual("testvalue1", user_cookie1["value"])
+
+ user_cookie2 = _find_obj_by_key("name", "testcookie2", solution.cookies)
+ self.assertIsNotNone(user_cookie2, "User cookie 2 not found")
+ self.assertEqual("testvalue2", user_cookie2["value"])
+
+    def test_v1_endpoint_request_get_returnOnlyCookies_param(self):
+        """With ``returnOnlyCookies`` set, headers and response are None; cookies remain."""
+        payload = {
+            "cmd": "request.get",
+            "url": self.google_url,
+            "returnOnlyCookies": True
+        }
+        response = self.app.post_json('/v1', payload)
+        self.assertEqual(response.status_code, 200)
+
+        # Envelope checks
+        parsed = V1ResponseBase(response.json)
+        self.assertEqual(STATUS_OK, parsed.status)
+        self.assertEqual("Challenge not detected!", parsed.message)
+        self.assertGreater(parsed.startTimestamp, 10000)
+        self.assertGreaterEqual(parsed.endTimestamp, parsed.startTimestamp)
+        self.assertEqual(utils.get_flaresolverr_version(), parsed.version)
+
+        # Solution checks: only cookies and user agent carry data
+        result = parsed.solution
+        self.assertIn(self.google_url, result.url)
+        self.assertEqual(result.status, 200)
+        self.assertIsNone(result.headers)
+        self.assertIsNone(result.response)
+        self.assertGreater(len(result.cookies), 0)
+        self.assertIn("Chrome/", result.userAgent)
+
+    def test_v1_endpoint_request_get_proxy_http_param(self):
+        """
+        Check ``request.get`` routed through the HTTP proxy at ``self.proxy_url``.
+
+        To configure TinyProxy in local:
+           * sudo vim /etc/tinyproxy/tinyproxy.conf
+              * edit => LogFile "/tmp/tinyproxy.log"
+              * edit => Syslog Off
+           * sudo tinyproxy -d
+           * sudo tail -f /tmp/tinyproxy.log
+        """
+        res = self.app.post_json('/v1', {
+            "cmd": "request.get",
+            "url": self.google_url,
+            "proxy": {
+                "url": self.proxy_url
+            }
+        })
+        self.assertEqual(res.status_code, 200)
+
+        body = V1ResponseBase(res.json)
+        self.assertEqual(STATUS_OK, body.status)
+        self.assertEqual("Challenge not detected!", body.message)
+        self.assertGreater(body.startTimestamp, 10000)
+        self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
+        self.assertEqual(utils.get_flaresolverr_version(), body.version)
+
+        solution = body.solution
+        self.assertIn(self.google_url, solution.url)
+        self.assertEqual(solution.status, 200)
+        # Compare by value: assertIs on an int only passes thanks to the
+        # interpreter's small-integer caching, which is not guaranteed.
+        self.assertEqual(len(solution.headers), 0)
+        self.assertIn("Google", solution.response)
+        self.assertGreater(len(solution.cookies), 0)
+        self.assertIn("Chrome/", solution.userAgent)
+
+    def test_v1_endpoint_request_get_proxy_http_param_with_credentials(self):
+        """
+        Check ``request.get`` through the HTTP proxy using username/password.
+
+        To configure TinyProxy in local:
+           * sudo vim /etc/tinyproxy/tinyproxy.conf
+              * edit => LogFile "/tmp/tinyproxy.log"
+              * edit => Syslog Off
+              * add => BasicAuth testuser testpass
+           * sudo tinyproxy -d
+           * sudo tail -f /tmp/tinyproxy.log
+        """
+        res = self.app.post_json('/v1', {
+            "cmd": "request.get",
+            "url": self.google_url,
+            "proxy": {
+                "url": self.proxy_url,
+                "username": "testuser",
+                "password": "testpass"
+            }
+        })
+        self.assertEqual(res.status_code, 200)
+
+        body = V1ResponseBase(res.json)
+        self.assertEqual(STATUS_OK, body.status)
+        self.assertEqual("Challenge not detected!", body.message)
+        self.assertGreater(body.startTimestamp, 10000)
+        self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
+        self.assertEqual(utils.get_flaresolverr_version(), body.version)
+
+        solution = body.solution
+        self.assertIn(self.google_url, solution.url)
+        self.assertEqual(solution.status, 200)
+        # Compare by value: assertIs on an int only passes thanks to the
+        # interpreter's small-integer caching, which is not guaranteed.
+        self.assertEqual(len(solution.headers), 0)
+        self.assertIn("Google", solution.response)
+        self.assertGreater(len(solution.cookies), 0)
+        self.assertIn("Chrome/", solution.userAgent)
+
+    def test_v1_endpoint_request_get_proxy_socks_param(self):
+        """
+        Check ``request.get`` routed through the proxy at ``self.proxy_socks_url``.
+
+        To configure Dante in local:
+           * https://linuxhint.com/set-up-a-socks5-proxy-on-ubuntu-with-dante/
+           * sudo vim /etc/sockd.conf
+           * sudo systemctl restart sockd.service
+           * curl --socks5 socks5://127.0.0.1:1080 https://www.google.com
+        """
+        res = self.app.post_json('/v1', {
+            "cmd": "request.get",
+            "url": self.google_url,
+            "proxy": {
+                "url": self.proxy_socks_url
+            }
+        })
+        self.assertEqual(res.status_code, 200)
+
+        body = V1ResponseBase(res.json)
+        self.assertEqual(STATUS_OK, body.status)
+        self.assertEqual("Challenge not detected!", body.message)
+        self.assertGreater(body.startTimestamp, 10000)
+        self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
+        self.assertEqual(utils.get_flaresolverr_version(), body.version)
+
+        solution = body.solution
+        self.assertIn(self.google_url, solution.url)
+        self.assertEqual(solution.status, 200)
+        # Compare by value: assertIs on an int only passes thanks to the
+        # interpreter's small-integer caching, which is not guaranteed.
+        self.assertEqual(len(solution.headers), 0)
+        self.assertIn("Google", solution.response)
+        self.assertGreater(len(solution.cookies), 0)
+        self.assertIn("Chrome/", solution.userAgent)
+
+    def test_v1_endpoint_request_get_proxy_wrong_param(self):
+        """A proxy URL nothing listens on must yield HTTP 500 with a proxy connection error."""
+        payload = {
+            "cmd": "request.get",
+            "url": self.google_url,
+            "proxy": {
+                "url": "http://127.0.0.1:43210"
+            }
+        }
+        response = self.app.post_json('/v1', payload, status=500)
+        self.assertEqual(response.status_code, 500)
+
+        parsed = V1ResponseBase(response.json)
+        self.assertEqual(STATUS_ERROR, parsed.status)
+        expected_fragment = ("Error: Error solving the challenge. "
+                             "Message: unknown error: net::ERR_PROXY_CONNECTION_FAILED")
+        self.assertIn(expected_fragment, parsed.message)
+        self.assertGreater(parsed.startTimestamp, 10000)
+        self.assertGreaterEqual(parsed.endTimestamp, parsed.startTimestamp)
+        self.assertEqual(utils.get_flaresolverr_version(), parsed.version)
+
+    def test_v1_endpoint_request_get_fail_timeout(self):
+        """A ``maxTimeout`` of 10 must fail with HTTP 500 and a 0.01 second timeout message."""
+        payload = {
+            "cmd": "request.get",
+            "url": self.google_url,
+            "maxTimeout": 10
+        }
+        response = self.app.post_json('/v1', payload, status=500)
+        self.assertEqual(response.status_code, 500)
+
+        parsed = V1ResponseBase(response.json)
+        self.assertEqual(STATUS_ERROR, parsed.status)
+        expected_message = "Error: Error solving the challenge. Timeout after 0.01 seconds."
+        self.assertEqual(expected_message, parsed.message)
+        self.assertGreater(parsed.startTimestamp, 10000)
+        self.assertGreaterEqual(parsed.endTimestamp, parsed.startTimestamp)
+        self.assertEqual(utils.get_flaresolverr_version(), parsed.version)
+
+    def test_v1_endpoint_request_get_fail_bad_domain(self):
+        """An unresolvable host name must yield HTTP 500 with a name-resolution error."""
+        payload = {
+            "cmd": "request.get",
+            "url": "https://www.google.combad"
+        }
+        response = self.app.post_json('/v1', payload, status=500)
+        self.assertEqual(response.status_code, 500)
+
+        parsed = V1ResponseBase(response.json)
+        self.assertEqual(STATUS_ERROR, parsed.status)
+        self.assertIn("Message: unknown error: net::ERR_NAME_NOT_RESOLVED", parsed.message)
+
+    def test_v1_endpoint_request_get_deprecated_param(self):
+        """``request.get`` must still succeed when the removed ``userAgent`` field is sent."""
+        payload = {
+            "cmd": "request.get",
+            "url": self.google_url,
+            "userAgent": "Test User-Agent"  # was removed in v2, not used
+        }
+        response = self.app.post_json('/v1', payload)
+        self.assertEqual(response.status_code, 200)
+
+        parsed = V1ResponseBase(response.json)
+        self.assertEqual(STATUS_OK, parsed.status)
+        self.assertEqual("Challenge not detected!", parsed.message)
+
+    def test_v1_endpoint_request_post_no_cloudflare(self):
+        """Check ``request.post`` against ``self.post_url`` when no challenge is shown.
+
+        Posts two form fields and asserts that the response body echoes both
+        of them back under a ``"form"`` key.
+        """
+        res = self.app.post_json('/v1', {
+            "cmd": "request.post",
+            "url": self.post_url,
+            # "&param2" had been mangled into "¶m2" (a decoded "&para" HTML
+            # entity); without the separator the second field is never sent.
+            "postData": "param1=value1&param2=value2"
+        })
+        self.assertEqual(res.status_code, 200)
+
+        body = V1ResponseBase(res.json)
+        self.assertEqual(STATUS_OK, body.status)
+        self.assertEqual("Challenge not detected!", body.message)
+        self.assertGreater(body.startTimestamp, 10000)
+        self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
+        self.assertEqual(utils.get_flaresolverr_version(), body.version)
+
+        solution = body.solution
+        self.assertIn(self.post_url, solution.url)
+        self.assertEqual(solution.status, 200)
+        # Compare by value: assertIs on an int only passes thanks to the
+        # interpreter's small-integer caching, which is not guaranteed.
+        self.assertEqual(len(solution.headers), 0)
+        # NOTE(review): the indentation inside this literal was collapsed to
+        # single spaces; restored here assuming the endpoint echoes JSON with
+        # a 2-space indent (httpbin style) - confirm against post_url.
+        self.assertIn('"form": {\n    "param1": "value1", \n    "param2": "value2"\n  }', solution.response)
+        self.assertEqual(len(solution.cookies), 0)
+        self.assertIn("Chrome/", solution.userAgent)
+
+    def test_v1_endpoint_request_post_cloudflare(self):
+        """Check ``request.post`` against ``self.cloudflare_url``.
+
+        The challenge must be reported as solved, the page body must contain
+        "405 Not Allowed", and a ``cf_clearance`` cookie must be returned.
+        """
+        res = self.app.post_json('/v1', {
+            "cmd": "request.post",
+            "url": self.cloudflare_url,
+            # "&param2" had been mangled into "¶m2" (a decoded "&para" HTML
+            # entity); restored so two separate form fields are posted.
+            "postData": "param1=value1&param2=value2"
+        })
+        self.assertEqual(res.status_code, 200)
+
+        body = V1ResponseBase(res.json)
+        self.assertEqual(STATUS_OK, body.status)
+        self.assertEqual("Challenge solved!", body.message)
+        self.assertGreater(body.startTimestamp, 10000)
+        self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
+        self.assertEqual(utils.get_flaresolverr_version(), body.version)
+
+        solution = body.solution
+        self.assertIn(self.cloudflare_url, solution.url)
+        self.assertEqual(solution.status, 200)
+        # Compare by value: assertIs on an int only passes thanks to the
+        # interpreter's small-integer caching, which is not guaranteed.
+        self.assertEqual(len(solution.headers), 0)
+        self.assertIn("405 Not Allowed", solution.response)
+        self.assertGreater(len(solution.cookies), 0)
+        self.assertIn("Chrome/", solution.userAgent)
+
+        cf_cookie = _find_obj_by_key("name", "cf_clearance", solution.cookies)
+        self.assertIsNotNone(cf_cookie, "Cloudflare cookie not found")
+        self.assertGreater(len(cf_cookie["value"]), 30)
+
+    def test_v1_endpoint_request_post_fail_no_post_data(self):
+        """``request.post`` without ``postData`` must fail with HTTP 500 and a clear message."""
+        payload = {
+            "cmd": "request.post",
+            "url": self.google_url
+        }
+        response = self.app.post_json('/v1', payload, status=500)
+        self.assertEqual(response.status_code, 500)
+
+        parsed = V1ResponseBase(response.json)
+        self.assertEqual(STATUS_ERROR, parsed.status)
+        expected_fragment = "Request parameter 'postData' is mandatory in 'request.post' command"
+        self.assertIn(expected_fragment, parsed.message)
+
+    def test_v1_endpoint_request_post_deprecated_param(self):
+        """``request.post`` must still succeed when the removed ``userAgent`` field is sent."""
+        res = self.app.post_json('/v1', {
+            "cmd": "request.post",
+            "url": self.google_url,
+            # "&param2" had been mangled into "¶m2" (a decoded "&para" HTML
+            # entity); restored so two separate form fields are posted.
+            "postData": "param1=value1&param2=value2",
+            "userAgent": "Test User-Agent"  # was removed in v2, not used
+        })
+        self.assertEqual(res.status_code, 200)
+
+        body = V1ResponseBase(res.json)
+        self.assertEqual(STATUS_OK, body.status)
+        self.assertEqual("Challenge not detected!", body.message)
+
+    def test_v1_endpoint_sessions_create_without_session(self):
+        """``sessions.create`` without a name must succeed and return some session id."""
+        payload = {"cmd": "sessions.create"}
+        response = self.app.post_json('/v1', payload)
+        self.assertEqual(response.status_code, 200)
+
+        parsed = V1ResponseBase(response.json)
+        self.assertEqual(STATUS_OK, parsed.status)
+        self.assertEqual("Session created successfully.", parsed.message)
+        self.assertIsNotNone(parsed.session)
+
+    def test_v1_endpoint_sessions_create_with_session(self):
+        """``sessions.create`` with an explicit name must echo that name back."""
+        session_name = "test_create_session"
+        payload = {
+            "cmd": "sessions.create",
+            "session": session_name
+        }
+        response = self.app.post_json('/v1', payload)
+        self.assertEqual(response.status_code, 200)
+
+        parsed = V1ResponseBase(response.json)
+        self.assertEqual(STATUS_OK, parsed.status)
+        self.assertEqual("Session created successfully.", parsed.message)
+        self.assertEqual(parsed.session, session_name)
+
+    def test_v1_endpoint_sessions_create_with_proxy(self):
+        """``sessions.create`` with a proxy setting must succeed and return a session id."""
+        payload = {
+            "cmd": "sessions.create",
+            "proxy": {
+                "url": self.proxy_url
+            }
+        }
+        response = self.app.post_json('/v1', payload)
+        self.assertEqual(response.status_code, 200)
+
+        parsed = V1ResponseBase(response.json)
+        self.assertEqual(STATUS_OK, parsed.status)
+        self.assertEqual("Session created successfully.", parsed.message)
+        self.assertIsNotNone(parsed.session)
+
+    def test_v1_endpoint_sessions_list(self):
+        """``sessions.list`` must include a session created just beforehand."""
+        session_name = "test_list_sessions"
+        # Create a session first so the listing has something to report.
+        self.app.post_json('/v1', {
+            "cmd": "sessions.create",
+            "session": session_name
+        })
+
+        response = self.app.post_json('/v1', {"cmd": "sessions.list"})
+        self.assertEqual(response.status_code, 200)
+
+        parsed = V1ResponseBase(response.json)
+        self.assertEqual(STATUS_OK, parsed.status)
+        self.assertEqual("", parsed.message)
+        self.assertGreaterEqual(len(parsed.sessions), 1)
+        self.assertIn(session_name, parsed.sessions)
+
+    def test_v1_endpoint_sessions_destroy_existing_session(self):
+        """Destroying a session that was just created must report its removal."""
+        session_name = "test_destroy_sessions"
+        # Create the session that is about to be destroyed.
+        self.app.post_json('/v1', {
+            "cmd": "sessions.create",
+            "session": session_name
+        })
+
+        response = self.app.post_json('/v1', {
+            "cmd": "sessions.destroy",
+            "session": session_name
+        })
+        self.assertEqual(response.status_code, 200)
+
+        parsed = V1ResponseBase(response.json)
+        self.assertEqual(STATUS_OK, parsed.status)
+        self.assertEqual("The session has been removed.", parsed.message)
+
+    def test_v1_endpoint_sessions_destroy_non_existing_session(self):
+        """Destroying an unknown session must fail with HTTP 500 and an explanatory message."""
+        payload = {
+            "cmd": "sessions.destroy",
+            "session": "non_existing_session_name"
+        }
+        response = self.app.post_json('/v1', payload, status=500)
+        self.assertEqual(response.status_code, 500)
+
+        parsed = V1ResponseBase(response.json)
+        self.assertEqual(STATUS_ERROR, parsed.status)
+        self.assertEqual("Error: The session doesn't exist.", parsed.message)
+
+    def test_v1_endpoint_request_get_with_session(self):
+        """``request.get`` bound to a previously created session must succeed."""
+        session_name = "test_request_sessions"
+        # Create the session the request will be attached to.
+        self.app.post_json('/v1', {
+            "cmd": "sessions.create",
+            "session": session_name
+        })
+
+        response = self.app.post_json('/v1', {
+            "cmd": "request.get",
+            "session": session_name,
+            "url": self.google_url
+        })
+        self.assertEqual(response.status_code, 200)
+
+        parsed = V1ResponseBase(response.json)
+        self.assertEqual(STATUS_OK, parsed.status)
+
+
+# Run the test cases with unittest's runner when this file is executed directly.
+if __name__ == '__main__':
+ unittest.main()
diff --git a/flaresolverr/tests_sites.py b/flaresolverr/tests_sites.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa7dcbc4f2ed3071fb03606391781f1818fefd4c
--- /dev/null
+++ b/flaresolverr/tests_sites.py
@@ -0,0 +1,102 @@
+import unittest
+
+from webtest import TestApp
+
+from dtos import V1ResponseBase, STATUS_OK
+import flaresolverr
+import utils
+
+
+def _find_obj_by_key(key: str, value: str, _list: list) -> dict | None:
+    """Return the first mapping in ``_list`` whose ``key`` entry equals ``value``.
+
+    Returns ``None`` when no element matches. Every element inspected must
+    contain ``key``; a missing key raises ``KeyError``.
+    """
+    matches = (candidate for candidate in _list if candidate[key] == value)
+    return next(matches, None)
+
+
+def asset_cloudflare_solution(self, res, site_url, site_text):
+    """Assert that ``res`` is a successful, challenge-solved ``/v1`` response.
+
+    :param self: the running ``unittest.TestCase``; its assert methods are used.
+    :param res: response object exposing ``status_code`` and ``json``.
+    :param site_url: URL that must be contained in ``solution.url``.
+    :param site_text: text that must be contained in ``solution.response``.
+
+    Besides the envelope and solution checks, a ``cf_clearance`` cookie with
+    a value longer than 30 characters must be present.
+    """
+    self.assertEqual(res.status_code, 200)
+
+    body = V1ResponseBase(res.json)
+    self.assertEqual(STATUS_OK, body.status)
+    self.assertEqual("Challenge solved!", body.message)
+    self.assertGreater(body.startTimestamp, 10000)
+    self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
+    self.assertEqual(utils.get_flaresolverr_version(), body.version)
+
+    solution = body.solution
+    self.assertIn(site_url, solution.url)
+    self.assertEqual(solution.status, 200)
+    # Compare by value: assertIs on an int only passes thanks to the
+    # interpreter's small-integer caching, which is not guaranteed.
+    self.assertEqual(len(solution.headers), 0)
+    self.assertIn(site_text, solution.response)
+    self.assertGreater(len(solution.cookies), 0)
+    self.assertIn("Chrome/", solution.userAgent)
+
+    cf_cookie = _find_obj_by_key("name", "cf_clearance", solution.cookies)
+    self.assertIsNotNone(cf_cookie, "Cloudflare cookie not found")
+    self.assertGreater(len(cf_cookie["value"]), 30)
+
+
+class TestFlareSolverr(unittest.TestCase):
+    """Per-site checks that send requests through the ``/v1`` endpoint.
+
+    Each table row is exercised in its own ``subTest`` and validated with
+    ``asset_cloudflare_solution``.
+    """
+
+    app = TestApp(flaresolverr.app)
+    # wait until the server is ready
+    app.get('/')
+
+    def test_v1_endpoint_request_get_cloudflare(self):
+        """Issue ``request.get`` for every listed site and validate each solution."""
+        # Rows are (label, url, text expected in the response body).
+        sites_get = [
+            ('nowsecure', 'https://nowsecure.nl', 'nowSecure'),
+            ('0magnet', 'https://0magnet.com/search?q=2022', 'Torrent Search - ØMagnet'),
+            ('1337x', 'https://1337x.unblockit.cat/cat/Movies/time/desc/1/', ''),
+            ('avistaz', 'https://avistaz.to/api/v1/jackett/torrents?in=1&type=0&search=',
+             'Access denied'),
+            ('badasstorrents', 'https://badasstorrents.com/torrents/search/720p/date/desc',
+             'Latest Torrents - BadassTorrents'),
+            ('bt4g', 'https://bt4g.org/search/2022', 'Download 2022 Torrents - BT4G'),
+            ('cinemaz', 'https://cinemaz.to/api/v1/jackett/torrents?in=1&type=0&search=',
+             'Access denied'),
+            ('epublibre', 'https://epublibre.unblockit.cat/catalogo/index/0/nuevo/todos/sin/todos/--/ajax',
+             'epublibre - catálogo'),
+            ('ext', 'https://ext.to/latest/?order=age&sort=desc',
+             'Download Latest Torrents - EXT Torrents'),
+            ('extratorrent', 'https://extratorrent.st/search/?srt=added&order=desc&search=720p&new=1&x=0&y=0',
+             'Page 1 - ExtraTorrent'),
+            ('idope', 'https://idope.se/browse.html', 'Recent Torrents'),
+            ('limetorrents', 'https://limetorrents.unblockninja.com/latest100',
+             'Latest 100 torrents - LimeTorrents'),
+            ('privatehd', 'https://privatehd.to/api/v1/jackett/torrents?in=1&type=0&search=',
+             'Access denied'),
+            ('torrentcore', 'https://torrentcore.xyz/index', 'Torrent[CORE] - Torrent community.'),
+            ('torrentqq223', 'https://torrentqq223.com/torrent/newest.html', 'https://torrentqq223.com/ads/'),
+            ('36dm', 'https://www.36dm.club/1.html', 'https://www.36dm.club/yesterday-1.html'),
+            ('erai-raws', 'https://www.erai-raws.info/feed/?type=magnet', '403 Forbidden'),
+            ('teamos', 'https://www.teamos.xyz/torrents/?filename=&freeleech=',
+             'Log in | Team OS : Your Only Destination To Custom OS !!'),
+            ('yts', 'https://yts.unblockninja.com/api/v2/list_movies.json?query_term=&limit=50&sort=date_added',
+             '{"movie_count":')
+        ]
+        for label, url, expected_text in sites_get:
+            with self.subTest(msg=label):
+                payload = {
+                    "cmd": "request.get",
+                    "url": url
+                }
+                response = self.app.post_json('/v1', payload)
+                asset_cloudflare_solution(self, response, url, expected_text)
+
+    def test_v1_endpoint_request_post_cloudflare(self):
+        """Issue ``request.post`` for every listed site and validate each solution."""
+        # Rows are (label, url, text expected in the response body, form data).
+        sites_post = [
+            ('nnmclub', 'https://nnmclub.to/forum/tracker.php', 'Трекер :: NNM-Club',
+             'prev_sd=0&prev_a=0&prev_my=0&prev_n=0&prev_shc=0&prev_shf=1&prev_sha=1&prev_shs=0&prev_shr=0&prev_sht=0&f%5B%5D=-1&o=1&s=2&tm=-1&shf=1&sha=1&ta=-1&sns=-1&sds=-1&nm=&pn=&submit=%CF%EE%E8%F1%EA')
+        ]
+
+        for label, url, expected_text, form_data in sites_post:
+            with self.subTest(msg=label):
+                payload = {
+                    "cmd": "request.post",
+                    "url": url,
+                    "postData": form_data
+                }
+                response = self.app.post_json('/v1', payload)
+                asset_cloudflare_solution(self, response, url, expected_text)
+
+
+# Run the test cases with unittest's runner when this file is executed directly.
+if __name__ == '__main__':
+ unittest.main()
diff --git a/flaresolverr/undetected_chromedriver/__init__.py b/flaresolverr/undetected_chromedriver/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b78f60e3ebacf394743795c94ed7378158ae8f74
--- /dev/null
+++ b/flaresolverr/undetected_chromedriver/__init__.py
@@ -0,0 +1,910 @@
+#!/usr/bin/env python3
+
+"""
+
+ 888 888 d8b
+ 888 888 Y8P
+ 888 888
+ .d8888b 88888b. 888d888 .d88b. 88888b.d88b. .d88b. .d88888 888d888 888 888 888 .d88b. 888d888
+d88P" 888 "88b 888P" d88""88b 888 "888 "88b d8P Y8b d88" 888 888P" 888 888 888 d8P Y8b 888P"
+888 888 888 888 888 888 888 888 888 88888888 888 888 888 888 Y88 88P 88888888 888
+Y88b. 888 888 888 Y88..88P 888 888 888 Y8b. Y88b 888 888 888 Y8bd8P Y8b. 888
+ "Y8888P 888 888 888 "Y88P" 888 888 888 "Y8888 "Y88888 888 888 Y88P "Y8888 888 88888888
+
+by UltrafunkAmsterdam (https://github.com/ultrafunkamsterdam)
+
+"""
+from __future__ import annotations
+
+
+__version__ = "3.5.5"
+
+import json
+import logging
+import os
+import pathlib
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+import time
+from weakref import finalize
+
+import selenium.webdriver.chrome.service
+import selenium.webdriver.chrome.webdriver
+from selenium.webdriver.common.by import By
+import selenium.webdriver.chromium.service
+import selenium.webdriver.remote.command
+import selenium.webdriver.remote.webdriver
+
+from .cdp import CDP
+from .dprocess import start_detached
+from .options import ChromeOptions
+from .patcher import IS_POSIX
+from .patcher import Patcher
+from .reactor import Reactor
+from .webelement import UCWebElement
+from .webelement import WebElement
+
+
+# Names exported by a star-import of this package.
+__all__ = (
+ "Chrome",
+ "ChromeOptions",
+ "Patcher",
+ "Reactor",
+ "CDP",
+ "find_chrome_executable",
+)
+
+# Package logger named "uc". Its level is copied from the root logger's
+# effective level once, when this module is imported.
+logger = logging.getLogger("uc")
+logger.setLevel(logging.getLogger().getEffectiveLevel())
+
+
+class Chrome(selenium.webdriver.chrome.webdriver.WebDriver):
+ """
+
+ Controls the ChromeDriver and allows you to drive the browser.
+
+ The webdriver file will be downloaded by this module automatically,
+ you do not need to specify this. however, you may if you wish.
+
+ Attributes
+ ----------
+
+ Methods
+ -------
+
+ reconnect()
+
+ this can be useful in case of heavy detection methods
+ -stops the chromedriver service which runs in the background
+ -starts the chromedriver service which runs in the background
+ -recreate session
+
+
+ start_session(capabilities=None, browser_profile=None)
+
+ differentiates from the regular method in that it does not
+ require a capabilities argument. The capabilities are automatically
+ recreated from the options at creation time.
+
+ --------------------------------------------------------------------------
+ NOTE:
+ Chrome has everything included to work out of the box.
+ it does not `need` customizations.
+ any customizations MAY lead to trigger bot mitigation systems.
+
+ --------------------------------------------------------------------------
+ """
+
+ _instances = set()
+ session_id = None
+ debug = False
+
+    def __init__(
+        self,
+        options=None,
+        user_data_dir=None,
+        driver_executable_path=None,
+        browser_executable_path=None,
+        port=0,
+        enable_cdp_events=False,
+        # service_args=None,
+        # service_creationflags=None,
+        desired_capabilities=None,
+        advanced_elements=False,
+        # service_log_path=None,
+        keep_alive=True,
+        log_level=0,
+        headless=False,
+        version_main=None,
+        patcher_force_close=False,
+        suppress_welcome=True,
+        use_subprocess=False,
+        debug=False,
+        no_sandbox=True,
+        windows_headless=False,
+        user_multi_procs: bool = False,
+        **kw,
+    ):
+        """
+        Creates a new instance of the chrome driver.
+
+        Starts the service and then creates new instance of chrome driver.
+
+        Parameters
+        ----------
+
+        options: ChromeOptions, optional, default: None - automatic useful defaults
+            this takes an instance of ChromeOptions, mainly to customize browser behavior.
+            anything other than the default, for example extensions or startup options
+            are not supported in case of failure, and can probably lower your undetectability.
+            An options object cannot be reused for a second Chrome instance (RuntimeError).
+
+        user_data_dir: str , optional, default: None (creates temp profile)
+            if user_data_dir is a path to a valid chrome profile directory, use it,
+            and turn off automatic removal mechanism at exit.
+
+        driver_executable_path: str, optional, default: None(=downloads and patches new binary)
+
+        browser_executable_path: str, optional, default: None - use find_chrome_executable
+            Path to the browser executable.
+            If not specified, make sure the executable's folder is in $PATH
+
+        port: int, optional, default: 0
+            when options.debugger_address is not set, a non-zero value is used as the
+            port of the browser's remote debugging address.
+            the default value of 0 automatically picks an available port.
+
+        enable_cdp_events: bool, default: False
+            :: currently for chrome only
+            this enables the handling of wire messages
+            when enabled, you can subscribe to CDP events by using:
+
+                driver.add_cdp_listener("Network.dataReceived", yourcallback)
+                # yourcallback is a callable which accepts exactly 1 dict as parameter
+
+        service_args: list of str, optional, default: None
+            arguments to pass to the driver service
+            (currently commented out of the signature)
+
+        desired_capabilities: dict, optional, default: None - auto from config
+            Dictionary object with non-browser specific capabilities only, such as "item" or "loggingPref".
+
+        advanced_elements: bool, optional, default: False
+            makes it easier to recognize elements like you know them from html/browser inspection, especially when working
+            in an interactive environment
+
+            default webelement repr:
+            <selenium.webdriver.remote.webelement.WebElement (session="...", element="...")>
+
+            advanced webelement repr
+            <WebElement(<a class="..." href="#" id="...">)>
+
+            note: when retrieving large amounts of elements ( example: find_elements_by_tag("*") ) and print them, it does take a little more time.
+
+        service_log_path: str, optional, default: None
+            path to log information from the driver.
+            (currently commented out of the signature)
+
+        keep_alive: bool, optional, default: True
+            Whether to configure ChromeRemoteConnection to use HTTP keep-alive.
+
+        log_level: int, optional, default: adapts to python global log level
+            value passed to chrome as --log-level. When left at 0, the root
+            logger's effective level divided by 10 is used instead.
+
+        headless: bool, optional, default: False
+            can also be specified in the options instance.
+            Specify whether you want to use the browser in headless mode.
+            warning: this lowers undetectability and not fully supported.
+
+        version_main: int, optional, default: None (=auto)
+            if you, for god knows whatever reason, use
+            an older version of Chrome. You can specify its full rounded version number
+            here. Example: 87 for all versions of 87
+
+        patcher_force_close: bool, optional, default: False
+            instructs the patcher to do whatever it can to access the chromedriver binary
+            if the file is locked, it will force shutdown all instances.
+            setting it is not recommended, unless you know the implications and think
+            you might need it.
+
+        suppress_welcome: bool, optional , default: True
+            a "welcome" alert might show up on *nix-like systems asking whether you want to set
+            chrome as your default browser, and if you want to send even more data to google.
+            now, in case you are nag-fetishist, or a diagnostics data feeder to google, you can set this to False.
+            Note: if you don't handle the nag screen in time, the browser loses its connection and throws an Exception.
+
+        use_subprocess: bool, optional , default: False,
+
+            False (the default) makes sure Chrome will get its own process (so no subprocess of chromedriver.exe or python
+            This fixes a LOT of issues, like multithreaded run, but most importantly, shutting down correctly after
+            program exits or using .quit()
+            you should be knowing what you're doing, and know how python works.
+
+            unfortunately, there is always an edge case in which one would like to write a single script with the only contents being:
+            --start script--
+            import undetected_chromedriver as uc
+            d = uc.Chrome()
+            d.get('https://somesite/')
+            ---end script --
+
+            and will be greeted with an error, since the program exits before chrome has a chance to launch.
+            in that case you can set this to `True`. The browser will start via subprocess, and will keep running most of times.
+            ! setting it to True comes with NO support when being detected. !
+
+        no_sandbox: bool, optional, default=True
+             uses the --no-sandbox option, and additionally does suppress the "unsecure option" status bar
+             this option has a default of True since many people seem to run this as root (....) , and chrome does not start
+             when running as root without using --no-sandbox flag.
+
+        windows_headless: bool, optional, default: False
+            when True the browser is started through subprocess.Popen (like use_subprocess);
+            on Windows a STARTUPINFO with STARTF_USESHOWWINDOW is passed to it.
+
+        user_multi_procs:
+            set to true when you are using multithreads/multiprocessing
+            ensures not all processes are trying to modify a binary which is in use by another.
+            for this to work. YOU MUST HAVE AT LEAST 1 UNDETECTED_CHROMEDRIVER BINARY IN YOUR ROAMING DATA FOLDER.
+            this requirement can be easily satisfied, by just running this program "normal" and close/kill it.
+
+        Raises
+        ------
+        RuntimeError
+            if the given options object is already attached to another instance.
+        FileNotFoundError
+            if no existing browser executable could be determined.
+        """
+
+        finalize(self, self._ensure_close, self)
+        self.debug = debug
+        self.patcher = Patcher(
+            executable_path=driver_executable_path,
+            force=patcher_force_close,
+            version_main=version_main,
+            user_multi_procs=user_multi_procs,
+        )
+        # self.patcher.auto(user_multiprocess = user_multi_num_procs)
+        self.patcher.auto()
+
+        # self.patcher = patcher
+        if not options:
+            options = ChromeOptions()
+
+        try:
+            if hasattr(options, "_session") and options._session is not None:
+                # prevent reuse of options,
+                # as it just appends arguments, not replace them
+                # you'll get conflicts starting chrome
+                raise RuntimeError("you cannot reuse the ChromeOptions object")
+        except AttributeError:
+            pass
+
+        options._session = self
+
+        if not options.debugger_address:
+            debug_port = (
+                port
+                if port != 0
+                else selenium.webdriver.common.service.utils.free_port()
+            )
+            debug_host = "127.0.0.1"
+            options.debugger_address = "%s:%d" % (debug_host, debug_port)
+        else:
+            debug_host, debug_port = options.debugger_address.split(":")
+            debug_port = int(debug_port)
+
+        if enable_cdp_events:
+            options.set_capability(
+                "goog:loggingPrefs", {"performance": "ALL", "browser": "ALL"}
+            )
+
+        options.add_argument("--remote-debugging-host=%s" % debug_host)
+        options.add_argument("--remote-debugging-port=%s" % debug_port)
+
+        if user_data_dir:
+            options.add_argument("--user-data-dir=%s" % user_data_dir)
+
+        language, keep_user_data_dir = None, bool(user_data_dir)
+
+        # see if a custom user profile is specified in options
+        # Iterate over a snapshot: headless flags are removed from the live
+        # list below, and removing while iterating that same list would make
+        # the loop skip the argument that follows the removed one.
+        for arg in list(options.arguments):
+
+            if "headless" in arg:
+                options.arguments.remove(arg)
+                options.headless = True
+
+            if "lang" in arg:
+                m = re.search("(?:--)?lang(?:[ =])?(.*)", arg)
+                try:
+                    language = m[1]
+                except IndexError:
+                    logger.debug("will set the language to en-US,en;q=0.9")
+                    language = "en-US,en;q=0.9"
+
+            if "user-data-dir" in arg:
+                m = re.search("(?:--)?user-data-dir(?:[ =])?(.*)", arg)
+                try:
+                    user_data_dir = m[1]
+                    logger.debug(
+                        "user-data-dir found in user argument %s => %s" % (arg, m[1])
+                    )
+                    keep_user_data_dir = True
+
+                except IndexError:
+                    logger.debug(
+                        "no user data dir could be extracted from supplied argument %s "
+                        % arg
+                    )
+
+        if not user_data_dir:
+            # backward compatibility
+            # check if an old uc.ChromeOptions is used, and extract the user data dir
+
+            if hasattr(options, "user_data_dir") and getattr(
+                options, "user_data_dir", None
+            ):
+                import warnings
+
+                warnings.warn(
+                    "using ChromeOptions.user_data_dir might stop working in future versions."
+                    "use uc.Chrome(user_data_dir='/xyz/some/data') in case you need existing profile folder"
+                )
+                # Adopt the legacy value so the rest of this method (prefs
+                # handling, exit_type fix, self.user_data_dir) sees the
+                # profile directory instead of None.
+                user_data_dir = options.user_data_dir
+                options.add_argument("--user-data-dir=%s" % user_data_dir)
+                keep_user_data_dir = True
+                logger.debug(
+                    "user_data_dir property found in options object: %s" % user_data_dir
+                )
+
+            else:
+                user_data_dir = os.path.normpath(tempfile.mkdtemp())
+                keep_user_data_dir = False
+                arg = "--user-data-dir=%s" % user_data_dir
+                options.add_argument(arg)
+                logger.debug(
+                    "created a temporary folder in which the user-data (profile) will be stored during this\n"
+                    "session, and added it to chrome startup arguments: %s" % arg
+                )
+
+        if not language:
+            try:
+                import locale
+
+                language = locale.getdefaultlocale()[0].replace("_", "-")
+            except Exception:
+                # best effort only: fall through to the en-US default below
+                pass
+            if not language:
+                language = "en-US"
+
+        options.add_argument("--lang=%s" % language)
+
+        if not options.binary_location:
+            options.binary_location = (
+                browser_executable_path or find_chrome_executable()
+            )
+
+        if not options.binary_location or not \
+                pathlib.Path(options.binary_location).exists():
+            raise FileNotFoundError(
+                "\n---------------------\n"
+                "Could not determine browser executable."
+                "\n---------------------\n"
+                "Make sure your browser is installed in the default location (path).\n"
+                "If you are sure about the browser executable, you can specify it using\n"
+                "the `browser_executable_path='{}'` parameter.\n\n"
+                .format("/path/to/browser/executable" if IS_POSIX else "c:/path/to/your/browser.exe")
+            )
+
+        self._delay = 3
+
+        self.user_data_dir = user_data_dir
+        self.keep_user_data_dir = keep_user_data_dir
+
+        if suppress_welcome:
+            options.arguments.extend(["--no-default-browser-check", "--no-first-run"])
+        if no_sandbox:
+            options.arguments.extend(["--no-sandbox", "--test-type"])
+
+        if headless or getattr(options, 'headless', None):
+            # workaround until a better checking is found
+            try:
+                v_main = int(self.patcher.version_main) if self.patcher.version_main else 108
+                if v_main < 108:
+                    options.add_argument("--headless=chrome")
+                else:
+                    options.add_argument("--headless=new")
+            except Exception:
+                logger.warning("could not detect version_main."
+                               "therefore, we are assuming it is chrome 108 or higher")
+                options.add_argument("--headless=new")
+
+        options.add_argument("--window-size=1920,1080")
+        options.add_argument("--start-maximized")
+        options.add_argument("--no-sandbox")
+        # fixes "could not connect to chrome" error when running
+        # on linux using privileged user like root (which i don't recommend)
+
+        # Parenthesised on purpose: "%" binds tighter than "or", so without
+        # the parentheses the formatted string was always truthy and the
+        # fallback to the python log level could never be taken.
+        options.add_argument(
+            "--log-level=%d"
+            % (log_level or divmod(logging.getLogger().getEffectiveLevel(), 10)[0])
+        )
+
+        if hasattr(options, "handle_prefs"):
+            options.handle_prefs(user_data_dir)
+
+        # fix exit_type flag to prevent tab-restore nag
+        try:
+            with open(
+                os.path.join(user_data_dir, "Default/Preferences"),
+                encoding="latin1",
+                mode="r+",
+            ) as fs:
+                config = json.load(fs)
+                if config["profile"]["exit_type"] is not None:
+                    # fixing the restore-tabs-nag
+                    config["profile"]["exit_type"] = None
+                fs.seek(0, 0)
+                json.dump(config, fs)
+                fs.truncate()  # the file might be shorter
+            logger.debug("fixed exit_type flag")
+        except Exception:
+            # deliberate best effort: a missing or unreadable Preferences
+            # file simply means there is nothing to fix
+            logger.debug("did not find a bad exit_type flag ")
+
+        self.options = options
+
+        if not desired_capabilities:
+            desired_capabilities = options.to_capabilities()
+
+        if not use_subprocess and not windows_headless:
+            self.browser_pid = start_detached(
+                options.binary_location, *options.arguments
+            )
+        else:
+            startupinfo = None
+            if os.name == 'nt' and windows_headless:
+                # STARTUPINFO() is Windows only
+                startupinfo = subprocess.STARTUPINFO()
+                startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
+            browser = subprocess.Popen(
+                [options.binary_location, *options.arguments],
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                close_fds=IS_POSIX,
+                startupinfo=startupinfo
+            )
+            self.browser_pid = browser.pid
+
+        service = selenium.webdriver.chromium.service.ChromiumService(
+            self.patcher.executable_path
+        )
+
+        super().__init__(
+            service=service,
+            options=options,
+            keep_alive=keep_alive,
+        )
+
+        self.reactor = None
+
+        if enable_cdp_events:
+            if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
+                logging.getLogger(
+                    "selenium.webdriver.remote.remote_connection"
+                ).setLevel(20)
+            reactor = Reactor(self)
+            reactor.start()
+            self.reactor = reactor
+
+        if advanced_elements:
+            self._web_element_cls = UCWebElement
+        else:
+            self._web_element_cls = WebElement
+
+        if headless or getattr(options, 'headless', None):
+            self._configure_headless()
+
+ def _configure_headless(self):
+ """
+ Replace ``self.get`` with a wrapper that applies headless evasions
+ before delegating to the original ``get``.
+
+ The wrapper checks ``navigator.webdriver`` and issues CDP commands that
+ register scripts for every new document and override the user agent.
+
+ NOTE(review): indentation is flattened in this view, so which of the
+ statements below are guarded by the ``navigator.webdriver`` check
+ cannot be confirmed from here.
+ """
+ # keep a reference to the unwrapped get so the wrapper can delegate to it
+ orig_get = self.get
+ logger.info("setting properties for headless")
+
+ def get_wrapped(*args, **kwargs):
+ if self.execute_script("return navigator.webdriver"):
+ logger.info("patch navigator.webdriver")
+ # script redefines window.navigator as a Proxy that reports
+ # ``webdriver`` as false and forwards everything else
+ self.execute_cdp_cmd(
+ "Page.addScriptToEvaluateOnNewDocument",
+ {
+ "source": """
+ Object.defineProperty(window, "navigator", {
+ value: new Proxy(navigator, {
+ has: (target, key) => (key === "webdriver" ? false : key in target),
+ get: (target, key) =>
+ key === "webdriver"
+ ? false
+ : typeof target[key] === "function"
+ ? target[key].bind(target)
+ : target[key],
+ }),
+ });
+ """
+ },
+ )
+
+ logger.info("patch user-agent string")
+ # current user agent with the substring "Headless" removed
+ self.execute_cdp_cmd(
+ "Network.setUserAgentOverride",
+ {
+ "userAgent": self.execute_script(
+ "return navigator.userAgent"
+ ).replace("Headless", "")
+ },
+ )
+ # script sets maxTouchPoints / connection.rtt, defines window.chrome,
+ # answers the 'notifications' permission query from
+ # Notification.permission and overrides Function.prototype.toString
+ self.execute_cdp_cmd(
+ "Page.addScriptToEvaluateOnNewDocument",
+ {
+ "source": """
+ Object.defineProperty(navigator, 'maxTouchPoints', {get: () => 1});
+ Object.defineProperty(navigator.connection, 'rtt', {get: () => 100});
+
+ // https://github.com/microlinkhq/browserless/blob/master/packages/goto/src/evasions/chrome-runtime.js
+ window.chrome = {
+ app: {
+ isInstalled: false,
+ InstallState: {
+ DISABLED: 'disabled',
+ INSTALLED: 'installed',
+ NOT_INSTALLED: 'not_installed'
+ },
+ RunningState: {
+ CANNOT_RUN: 'cannot_run',
+ READY_TO_RUN: 'ready_to_run',
+ RUNNING: 'running'
+ }
+ },
+ runtime: {
+ OnInstalledReason: {
+ CHROME_UPDATE: 'chrome_update',
+ INSTALL: 'install',
+ SHARED_MODULE_UPDATE: 'shared_module_update',
+ UPDATE: 'update'
+ },
+ OnRestartRequiredReason: {
+ APP_UPDATE: 'app_update',
+ OS_UPDATE: 'os_update',
+ PERIODIC: 'periodic'
+ },
+ PlatformArch: {
+ ARM: 'arm',
+ ARM64: 'arm64',
+ MIPS: 'mips',
+ MIPS64: 'mips64',
+ X86_32: 'x86-32',
+ X86_64: 'x86-64'
+ },
+ PlatformNaclArch: {
+ ARM: 'arm',
+ MIPS: 'mips',
+ MIPS64: 'mips64',
+ X86_32: 'x86-32',
+ X86_64: 'x86-64'
+ },
+ PlatformOs: {
+ ANDROID: 'android',
+ CROS: 'cros',
+ LINUX: 'linux',
+ MAC: 'mac',
+ OPENBSD: 'openbsd',
+ WIN: 'win'
+ },
+ RequestUpdateCheckStatus: {
+ NO_UPDATE: 'no_update',
+ THROTTLED: 'throttled',
+ UPDATE_AVAILABLE: 'update_available'
+ }
+ }
+ }
+
+ // https://github.com/microlinkhq/browserless/blob/master/packages/goto/src/evasions/navigator-permissions.js
+ if (!window.Notification) {
+ window.Notification = {
+ permission: 'denied'
+ }
+ }
+
+ const originalQuery = window.navigator.permissions.query
+ window.navigator.permissions.__proto__.query = parameters =>
+ parameters.name === 'notifications'
+ ? Promise.resolve({ state: window.Notification.permission })
+ : originalQuery(parameters)
+
+ const oldCall = Function.prototype.call
+ function call() {
+ return oldCall.apply(this, arguments)
+ }
+ Function.prototype.call = call
+
+ const nativeToStringFunctionString = Error.toString().replace(/Error/g, 'toString')
+ const oldToString = Function.prototype.toString
+
+ function functionToString() {
+ if (this === window.navigator.permissions.query) {
+ return 'function query() { [native code] }'
+ }
+ if (this === functionToString) {
+ return nativeToStringFunctionString
+ }
+ return oldCall.call(oldToString, this)
+ }
+ // eslint-disable-next-line
+ Function.prototype.toString = functionToString
+ """
+ },
+ )
+ # hand over to the original, unwrapped get
+ return orig_get(*args, **kwargs)
+
+ # the instance attribute shadows the class-level get from now on
+ self.get = get_wrapped
+
+ # def _get_cdc_props(self):
+ # return self.execute_script(
+ # """
+ # let objectToInspect = window,
+ # result = [];
+ # while(objectToInspect !== null)
+ # { result = result.concat(Object.getOwnPropertyNames(objectToInspect));
+ # objectToInspect = Object.getPrototypeOf(objectToInspect); }
+ #
+ # return result.filter(i => i.match(/^([a-zA-Z]){27}(Array|Promise|Symbol)$/ig))
+ # """
+ # )
+ #
+ # def _hook_remove_cdc_props(self):
+ # self.execute_cdp_cmd(
+ # "Page.addScriptToEvaluateOnNewDocument",
+ # {
+ # "source": """
+ # let objectToInspect = window,
+ # result = [];
+ # while(objectToInspect !== null)
+ # { result = result.concat(Object.getOwnPropertyNames(objectToInspect));
+ # objectToInspect = Object.getPrototypeOf(objectToInspect); }
+ # result.forEach(p => p.match(/^([a-zA-Z]){27}(Array|Promise|Symbol)$/ig)
+ # &&delete window[p]&&console.log('removed',p))
+ # """
+ # },
+ # )
+
+ def get(self, url):
+     """
+     Load ``url`` in the current browser session.
+
+     Plain pass-through to the parent driver's ``get``; the result of the
+     parent call is returned unchanged.
+     """
+     # if self._get_cdc_props():
+     #     self._hook_remove_cdc_props()
+     parent_get = super().get
+     return parent_get(url)
+
+ def add_cdp_listener(self, event_name, callback):
+     """
+     Register ``callback`` for the CDP event ``event_name``.
+
+     Returns the reactor's ``handlers`` after registration, or False when no
+     usable Reactor is attached to this driver.
+     """
+     reactor = self.reactor
+     if not reactor or not isinstance(reactor, Reactor):
+         return False
+     reactor.add_event_handler(event_name, callback)
+     return reactor.handlers
+
+ def clear_cdp_listeners(self):
+     """Drop every registered CDP event handler; no-op without a Reactor."""
+     reactor = self.reactor
+     if not (reactor and isinstance(reactor, Reactor)):
+         return
+     reactor.handlers.clear()
+
+ def window_new(self):
+     """Send the WebDriver ``NEW_WINDOW`` command, requesting type "window"."""
+     new_window = selenium.webdriver.remote.command.Command.NEW_WINDOW
+     self.execute(new_window, {"type": "window"})
+
+ def tab_new(self, url: str):
+ """
+ this opens a url in a new tab.
+ apparently, that passes all tests directly!
+
+ The tab is opened through a ``CDP`` helper built from ``self.options``
+ (its ``tab_new`` method), not through a WebDriver command.
+
+ Parameters
+ ----------
+ url
+ address to open in the new tab
+
+ Returns
+ -------
+ None
+
+ """
+ # NOTE(review): indentation is flattened in this view; confirm whether the
+ # CDP construction and the tab_new call sit inside this hasattr guard.
+ if not hasattr(self, "cdp"):
+ from .cdp import CDP
+
+ cdp = CDP(self.options)
+ cdp.tab_new(url)
+
+ def reconnect(self, timeout=0.1):
+     """
+     Restart the driver service and open a fresh session.
+
+     Stops the service, sleeps ``timeout`` seconds, starts the service and
+     calls ``start_session``. Each step is best effort: an exception is
+     logged at debug level and the following steps still run.
+     """
+
+     def attempt(step):
+         # run one step, logging instead of propagating any failure
+         try:
+             step()
+         except Exception as e:
+             logger.debug(e)
+
+     attempt(lambda: self.service.stop())
+     time.sleep(timeout)
+     attempt(lambda: self.service.start())
+     attempt(lambda: self.start_session())
+
+ def start_session(self, capabilities=None, browser_profile=None):
+     """
+     Start a WebDriver session through the parent class.
+
+     ``capabilities`` falls back to ``self.options.to_capabilities()`` when
+     empty. ``browser_profile`` is accepted but not forwarded.
+     """
+     caps = capabilities if capabilities else self.options.to_capabilities()
+     super().start_session(caps)
+     # super(Chrome, self).start_session(capabilities, browser_profile)  # Original explicit call commented out
+
+ def find_elements_recursive(self, by, value):
+     """
+     Yield matching elements from the main document and from each iframe
+     found in the main document.
+
+     This is a generator on purpose: a returned list would hold elements
+     that are stale by the time the caller touches them. When an element is
+     yielded the driver is still switched to the frame it came from.
+
+     Args:
+         by: By
+         value: str
+     Returns: Generator[webelement.WebElement]
+     """
+
+     def search_frame(f=None):
+         if f:
+             self.switch_to.frame(f)
+         else:
+             # ensure we are on main content frame
+             self.switch_to.default_content()
+         yield from self.find_elements(by, value)
+         # switch back to main content, otherwise we will get StaleElementReferenceException
+         self.switch_to.default_content()
+
+     # search root frame
+     yield from search_frame()
+
+     # get iframes
+     frames = self.find_elements('css selector', 'iframe')
+
+     # search per frame
+     for frame in frames:
+         yield from search_frame(frame)
+
+ def quit(self):
+     """
+     Shut down the driver service, the reactor and the browser, then remove
+     the temporary profile.
+
+     Every stage is best effort; failures of the kinds listed in the
+     ``except`` clauses are swallowed so that later stages still run.
+     The profile directory is removed only when ``keep_user_data_dir`` and
+     ``user_data_dir`` exist on the instance and ``keep_user_data_dir`` is
+     falsy; removal is retried up to 5 times, 0.1 s apart.
+     """
+     try:
+         self.service.stop()
+         self.service.process.kill()
+         self.command_executor.close()
+         self.service.process.wait(5)
+         logger.debug("webdriver process ended")
+     except (AttributeError, RuntimeError, OSError):
+         pass
+     try:
+         self.reactor.event.set()
+         logger.debug("shutting down reactor")
+     except AttributeError:
+         pass
+     try:
+         # 15 == SIGTERM
+         os.kill(self.browser_pid, 15)
+         logger.debug("gracefully closed browser")
+     except Exception:  # noqa
+         pass
+     if (
+         hasattr(self, "keep_user_data_dir")
+         and hasattr(self, "user_data_dir")
+         and not self.keep_user_data_dir
+     ):
+         for _ in range(5):
+             try:
+                 shutil.rmtree(self.user_data_dir, ignore_errors=False)
+             except FileNotFoundError:
+                 # The whole profile is already gone: stop instead of
+                 # sleeping through the remaining attempts. If only a file
+                 # vanished mid-walk the directory still exists, so retry.
+                 if not os.path.exists(self.user_data_dir):
+                     break
+             except (RuntimeError, OSError, PermissionError) as e:
+                 logger.debug(
+                     "When removing the temp profile, a %s occured: %s\nretrying..."
+                     % (e.__class__.__name__, e)
+                 )
+             else:
+                 logger.debug("successfully removed %s" % self.user_data_dir)
+                 break
+
+             try:
+                 time.sleep(0.1)
+             except OSError:
+                 pass
+
+     # dereference patcher, so patcher can start cleaning up as well.
+     # this must come last, otherwise it will throw 'in use' errors
+     self.patcher = None
+
+ def __getattribute__(self, item):
+     """
+     Attribute lookup with optional call tracing.
+
+     With ``debug`` falsy this is the normal lookup. Otherwise bound methods
+     are returned wrapped in a function that logs the call (qualified name,
+     args, kwargs) at debug level before invoking the method; every other
+     attribute is returned as is.
+     """
+     lookup = super().__getattribute__
+     if not lookup("debug"):
+         return lookup(item)
+
+     import inspect
+
+     original = lookup(item)
+     if not inspect.ismethod(original) or inspect.isclass(original):
+         return original
+
+     def newfunc(*args, **kwargs):
+         logger.debug(
+             "calling %s with args %s and kwargs %s\n"
+             % (original.__qualname__, args, kwargs)
+         )
+         return original(*args, **kwargs)
+
+     return newfunc
+
+ def __enter__(self):
+     """Enter a ``with`` block; the driver itself is the context value."""
+     driver = self
+     return driver
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+     """
+     Leave a ``with`` block by restarting the service and the session.
+
+     The service is stopped, ``self._delay`` seconds pass, then the service
+     is started again and a new session is opened. The exception arguments
+     are not inspected and nothing is returned.
+     """
+     service = self.service
+     service.stop()
+     time.sleep(self._delay)
+     self.service.start()
+     self.start_session()
+
+ def __hash__(self):
+     """Hash the driver by the debugger address stored in its options."""
+     address = self.options.debugger_address
+     return hash(address)
+
+ def __dir__(self):
+     """Return the attribute listing produced by ``object.__dir__``."""
+     names = object.__dir__(self)
+     return names
+
+ def __del__(self):
+     """
+     Finalizer: kill the driver service process (ignoring any failure),
+     then run ``quit``.
+     """
+     try:
+         process = self.service.process
+         process.kill()
+     except:  # noqa
+         pass
+     self.quit()
+
+ @classmethod
+ def _ensure_close(cls, self):
+     """
+     Kill the service process of the driver instance passed as ``self``.
+
+     Nothing happens unless the instance has ``service``, the service has
+     ``process`` and the process has ``kill``.
+     """
+     # needs to be a classmethod so finalize can find the reference
+     logger.info("ensuring close")
+     if not hasattr(self, "service"):
+         return
+     if not hasattr(self.service, "process"):
+         return
+     if hasattr(self.service.process, "kill"):
+         self.service.process.kill()
+
+
+def find_chrome_executable():
+    """
+    Finds the chrome, chrome beta, chrome canary, chromium executable
+
+    On POSIX every ``PATH`` entry is combined with the known binary names
+    (plus the two standard application bundles on macOS). Elsewhere the
+    Windows program-folder environment variables are combined with
+    ``Google/Chrome/Application/chrome.exe``. Candidates are checked in the
+    order they were generated, so earlier ``PATH`` entries win.
+
+    Returns
+    -------
+    executable_path : str
+        the full file path to found executable, or None when no candidate
+        exists and is executable
+
+    """
+    candidates = []
+    if IS_POSIX:
+        binary_names = (
+            "google-chrome",
+            "chromium",
+            "chromium-browser",
+            "chrome",
+            "google-chrome-stable",
+        )
+        # PATH may be missing from the environment; treat that as empty
+        for item in os.environ.get("PATH", "").split(os.pathsep):
+            if not item:
+                continue
+            candidates.extend(os.sep.join((item, name)) for name in binary_names)
+        if "darwin" in sys.platform:
+            candidates.extend(
+                [
+                    "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
+                    "/Applications/Chromium.app/Contents/MacOS/Chromium",
+                ]
+            )
+    else:
+        for item in map(
+            os.environ.get,
+            ("PROGRAMFILES", "PROGRAMFILES(X86)", "LOCALAPPDATA", "PROGRAMW6432"),
+        ):
+            if item is not None:
+                candidates.append(
+                    os.sep.join((item, "Google/Chrome/Application", "chrome.exe"))
+                )
+    # dict.fromkeys drops duplicates while keeping the generation order
+    for candidate in dict.fromkeys(candidates):
+        logger.debug('checking if %s exists and is executable' % candidate)
+        if os.path.exists(candidate) and os.access(candidate, os.X_OK):
+            logger.debug('found! using %s' % candidate)
+            return os.path.normpath(candidate)
+    return None
diff --git a/flaresolverr/undetected_chromedriver/cdp.py b/flaresolverr/undetected_chromedriver/cdp.py
new file mode 100644
index 0000000000000000000000000000000000000000..32a503c73c26f71f7513e891555a72ff14f03bfe
--- /dev/null
+++ b/flaresolverr/undetected_chromedriver/cdp.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+# this module is part of undetected_chromedriver
+
+import json
+import logging
+
+import requests
+import websockets
+
+
+log = logging.getLogger(__name__)
+
+
+class CDPObject(dict):
+    """
+    dict whose items are also available as attributes.
+
+    The instance ``__dict__`` is the dict itself, so ``obj.key`` and
+    ``obj["key"]`` read and write the same storage. Nested dict values, and
+    dicts that are direct members of list values, are converted to
+    ``CDPObject`` as well.
+    """
+
+    def __init__(self, *a, **k):
+        super().__init__(*a, **k)
+        self.__dict__ = self
+        # only existing keys are reassigned, so the dict size never changes
+        # while it is being iterated
+        for key, value in self.items():
+            if isinstance(value, dict):
+                self[key] = CDPObject(value)
+            elif isinstance(value, list):
+                # wrap each dict element itself (not the parent object)
+                self[key] = [
+                    CDPObject(member) if isinstance(member, dict) else member
+                    for member in value
+                ]
+
+    def __repr__(self):
+        tpl = f"{self.__class__.__name__}(\n\t{{}}\n\t)"
+        return tpl.format("\n  ".join(f"{k} = {v}" for k, v in self.items()))
+
+
+class PageElement(CDPObject):
+    """CDPObject used for the entries returned by ``CDP.tab_list``."""
+
+
+class CDP:
+    """
+    Small client for the browser's DevTools HTTP endpoints (``/json...``)
+    and, through ``send``, its WebSocket debugger URL.
+
+    The server address is taken from ``options.debugger_address``. On
+    construction the ``/json`` endpoint is queried and the first entry's
+    ``id`` and ``webSocketDebuggerUrl`` are stored.
+    """
+
+    log = logging.getLogger("CDP")
+
+    endpoints = CDPObject(
+        {
+            "json": "/json",
+            "protocol": "/json/protocol",
+            "list": "/json/list",
+            "new": "/json/new?{url}",
+            "activate": "/json/activate/{id}",
+            "close": "/json/close/{id}",
+        }
+    )
+
+    def __init__(self, options: "ChromeOptions"):  # noqa
+        self.server_addr = "http://{0}:{1}".format(*options.debugger_address.split(":"))
+
+        self._reqid = 0
+        self._session = requests.Session()
+        self._last_resp = None
+        self._last_json = None
+
+        resp = self.get(self.endpoints.json)  # noqa
+        self.sessionId = resp[0]["id"]
+        self.wsurl = resp[0]["webSocketDebuggerUrl"]
+
+    def tab_activate(self, id=None):
+        """
+        Activate the tab ``id``; without an id the first listed tab is used
+        and its WebSocket URL becomes the target of ``send``.
+        """
+        if not id:
+            active_tab = self.tab_list()[0]
+            id = active_tab.id  # noqa
+            self.wsurl = active_tab.webSocketDebuggerUrl  # noqa
+        return self.post(self.endpoints["activate"].format(id=id))
+
+    def tab_list(self):
+        """Return the ``/json/list`` entries as ``PageElement`` objects."""
+        retval = self.get(self.endpoints["list"])
+        return [PageElement(o) for o in retval]
+
+    def tab_new(self, url):
+        """Request a new tab for ``url`` via the ``/json/new`` endpoint."""
+        return self.post(self.endpoints["new"].format(url=url))
+
+    def tab_close_last_opened(self):
+        """Close the last entry of type "page" reported by ``tab_list``."""
+        sessions = self.tab_list()
+        opentabs = [s for s in sessions if s["type"] == "page"]
+        return self.post(self.endpoints["close"].format(id=opentabs[-1]["id"]))
+
+    async def send(self, method: str, params: dict):
+        """
+        Send one command over a new WebSocket connection to ``self.wsurl``
+        and return the decoded first reply (also kept in ``last_json``).
+        """
+        self._reqid += 1
+        async with websockets.connect(self.wsurl) as ws:
+            await ws.send(
+                json.dumps({"method": method, "params": params, "id": self._reqid})
+            )
+            self._last_resp = await ws.recv()
+            self._last_json = json.loads(self._last_resp)
+            self.log.info(self._last_json)
+        return self._last_json
+
+    def get(self, uri):
+        """GET ``uri``; return the decoded JSON body, or None if it is not JSON."""
+        resp = self._session.get(self.server_addr + uri)
+        try:
+            self._last_resp = resp
+            self._last_json = resp.json()
+        except Exception:
+            return
+        else:
+            return self._last_json
+
+    def post(self, uri, data: dict = None):
+        """
+        POST ``data`` (as JSON) to ``uri``.
+
+        Returns the decoded JSON body, matching ``get``. When the body is not
+        JSON the raw response object is returned instead.
+        """
+        if not data:
+            data = {}
+        resp = self._session.post(self.server_addr + uri, json=data)
+        try:
+            self._last_resp = resp
+            self._last_json = resp.json()
+        except Exception:
+            return self._last_resp
+        else:
+            # previously fell off the end and returned None on success
+            return self._last_json
+
+    @property
+    def last_json(self):
+        """Most recently decoded JSON payload, or None."""
+        return self._last_json
diff --git a/flaresolverr/undetected_chromedriver/devtool.py b/flaresolverr/undetected_chromedriver/devtool.py
new file mode 100644
index 0000000000000000000000000000000000000000..915d4176a5898f8c4caed006866cac6ceccdad3f
--- /dev/null
+++ b/flaresolverr/undetected_chromedriver/devtool.py
@@ -0,0 +1,193 @@
+import asyncio
+from collections.abc import Mapping
+from collections.abc import Sequence
+from functools import wraps
+import os
+import logging
+import threading
+import time
+import traceback
+from typing import Any
+from typing import Awaitable
+from typing import Callable
+from typing import List
+from typing import Optional
+
+
+class Structure(dict):
+    """
+    This is a dict-like object structure, which you should subclass
+    Only properties defined in the class context are used on initialization.
+
+    Items are reachable both as keys and as attributes because the instance
+    ``__dict__`` is the dict itself. Mapping values are converted to the
+    same class; non-string sequences become lists whose mapping members are
+    converted and whose other members are kept unchanged.
+    """
+
+    _store = {}
+
+    def __init__(self, *a, **kw):
+        """
+        Instantiate a new instance.
+
+        :param a: positional arguments accepted by ``dict``
+        :param kw: keyword arguments accepted by ``dict``
+        """
+
+        super().__init__()
+
+        # auxiliar dict
+        d = dict(*a, **kw)
+        for k, v in d.items():
+            if isinstance(v, Mapping):
+                self[k] = self.__class__(v)
+            elif isinstance(v, Sequence) and not isinstance(v, (str, bytes)):
+                # only mappings can be turned into a Structure; scalars
+                # inside the sequence are stored as they are
+                self[k] = [
+                    self.__class__(i) if isinstance(i, Mapping) else i for i in v
+                ]
+            else:
+                self[k] = v
+        super().__setattr__("__dict__", self)
+
+    def __getattr__(self, item):
+        return getattr(super(), item)
+
+    def __getitem__(self, item):
+        return super().__getitem__(item)
+
+    def __setattr__(self, key, value):
+        self.__setitem__(key, value)
+
+    def __setitem__(self, key, value):
+        super().__setitem__(key, value)
+
+    def update(self, *a, **kw):
+        super().update(*a, **kw)
+
+    def __eq__(self, other):
+        # anything that is not a mapping simply compares unequal
+        if not isinstance(other, Mapping):
+            return NotImplemented
+        # plain dict comparison also works when values are unhashable
+        return dict(other.items()) == dict(self.items())
+
+    def __hash__(self):
+        # raises TypeError when a value is unhashable (for example a list)
+        return hash(frozenset(self.items()))
+
+    @classmethod
+    def __init_subclass__(cls, **kwargs):
+        # every subclass gets its own _store
+        cls._store = {}
+
+    def _normalize_strings(self):
+        """Strip surrounding whitespace from every top-level string value."""
+        for k, v in self.copy().items():
+            if isinstance(v, (str)):
+                self[k] = v.strip()
+
+
+def timeout(seconds=3, on_timeout: Optional[Callable[[callable], Any]] = None):
+    """
+    Decorator factory that arms a ``threading.Timer`` around each call.
+
+    The timer starts before the wrapped function runs and is cancelled once
+    the function returns or raises. If it fires first, ``on_timeout`` is
+    called with the wrapped function, or ``TimeoutError`` is raised when no
+    callback was given.
+
+    NOTE(review): the timer callback runs in the timer's own thread, so the
+    ``TimeoutError`` is raised there and does not appear to interrupt the
+    decorated call - confirm this is the intended behaviour.
+    """
+
+    def wrapper(func):
+        def function_reached_timeout():
+            if on_timeout:
+                on_timeout(func)
+            else:
+                raise TimeoutError("function call timed out")
+
+        @wraps(func)
+        def wrapped(*args, **kwargs):
+            timer = threading.Timer(
+                interval=seconds, function=function_reached_timeout
+            )
+            timer.start()
+            try:
+                return func(*args, **kwargs)
+            finally:
+                # runs on success and on error alike
+                timer.cancel()
+
+        return wrapped
+
+    return wrapper
+
+
+def test():
+ """
+ Manual smoke test; it starts a real browser and needs network access.
+
+ Starts ``uc.Chrome(version_main=96)`` with logging preferences enabled,
+ wraps ``driver.command_executor._request`` in a printing wrapper, starts
+ a background thread that polls ``driver.get_log`` for the listened log
+ types, loads https://nowsecure.nl, waits 10 seconds and quits.
+ """
+ import sys, os
+
+ # make the package importable when this file is run from its own directory
+ sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
+ import undetected_chromedriver as uc
+ import threading
+
+ # starts a thread that keeps reading driver logs until stop_event is set
+ def collector(
+ driver: uc.Chrome,
+ stop_event: threading.Event,
+ on_event_coro: Optional[Callable[[List[str]], Awaitable[Any]]] = None,
+ listen_events: Sequence = ("browser", "network", "performance"),
+ ):
+ def threaded(driver, stop_event, on_event_coro):
+ async def _ensure_service_started():
+ # loops while the service process reports a truthy poll() result
+ while (
+ getattr(driver, "service", False)
+ and getattr(driver.service, "process", False)
+ and driver.service.process.poll()
+ ):
+ print("waiting for driver service to come back on")
+ await asyncio.sleep(0.05)
+ # await asyncio.sleep(driver._delay or .25)
+
+ async def get_log_lines(typ):
+ await _ensure_service_started()
+ return driver.get_log(typ)
+
+ async def looper():
+ while not stop_event.is_set():
+ log_lines = []
+ try:
+ for _ in listen_events:
+ try:
+ log_lines += await get_log_lines(_)
+ except:
+ # tracebacks are printed only at DEBUG (10) or lower
+ if logging.getLogger().getEffectiveLevel() <= 10:
+ traceback.print_exc()
+ continue
+ if log_lines and on_event_coro:
+ await on_event_coro(log_lines)
+ except Exception as e:
+ if logging.getLogger().getEffectiveLevel() <= 10:
+ traceback.print_exc()
+
+ # this thread runs its own event loop
+ loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(loop)
+ loop.run_until_complete(looper())
+
+ t = threading.Thread(target=threaded, args=(driver, stop_event, on_event_coro))
+ t.start()
+
+ async def on_event(data):
+ print("on_event")
+ print("data:", data)
+
+ # wrapper that prints each call and its result; it reads ``driver`` from
+ # the enclosing scope, which is assigned further down
+ def func_called(fn):
+ def wrapped(*args, **kwargs):
+ print(
+ "func called! %s (args: %s, kwargs: %s)" % (fn.__name__, args, kwargs)
+ )
+ # wait while the service process has a non-None poll() result
+ while driver.service.process and driver.service.process.poll() is not None:
+ time.sleep(0.1)
+ res = fn(*args, **kwargs)
+ print("func completed! (result: %s)" % res)
+ return res
+
+ return wrapped
+
+ # 10 == logging.DEBUG
+ logging.basicConfig(level=10)
+
+ options = uc.ChromeOptions()
+ options.set_capability(
+ "goog:loggingPrefs", {"performance": "ALL", "browser": "ALL", "network": "ALL"}
+ )
+
+ driver = uc.Chrome(version_main=96, options=options)
+
+ # driver.command_executor._request = timeout(seconds=1)(driver.command_executor._request)
+ driver.command_executor._request = func_called(driver.command_executor._request)
+ collector_stop = threading.Event()
+ collector(driver, collector_stop, on_event)
+
+ driver.get("https://nowsecure.nl")
+
+ time.sleep(10)
+
+ if os.name == "nt":
+ driver.close()
+ driver.quit()
diff --git a/flaresolverr/undetected_chromedriver/dprocess.py b/flaresolverr/undetected_chromedriver/dprocess.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d053fa9a87d74570dae311e18ae550ac6006da1
--- /dev/null
+++ b/flaresolverr/undetected_chromedriver/dprocess.py
@@ -0,0 +1,77 @@
+import atexit
+import logging
+import multiprocessing
+import os
+import platform
+import signal
+from subprocess import PIPE
+from subprocess import Popen
+import sys
+
+
+# flag values OR-ed together and passed as ``creationflags`` to Popen by
+# _start_detached when platform.system() is "Windows"
+CREATE_NEW_PROCESS_GROUP = 0x00000200
+DETACHED_PROCESS = 0x00000008
+
+# pids returned by start_detached; _cleanup sends SIGTERM to each of them
+REGISTERED = []
+
+
+def start_detached(executable, *args):
+    """
+    Starts a fully independent subprocess (with no parent)
+
+    A short-lived helper process launches the executable and sends the new
+    pid back through a pipe; the pid is recorded in ``REGISTERED``.
+
+    :param executable: executable
+    :param args: arguments to the executable, eg: ['--param1_key=param1_val', '-vvv' ...]
+    :return: pid of the grandchild process
+    :raises RuntimeError: when the helper process ended without reporting a pid
+    """
+
+    # create pipe
+    reader, writer = multiprocessing.Pipe(False)
+
+    # do not keep reference
+    process = multiprocessing.Process(
+        target=_start_detached,
+        args=(executable, *args),
+        kwargs={"writer": writer},
+        daemon=True,
+    )
+    try:
+        process.start()
+        process.join()
+        # This process still holds the write end, so recv() would never see
+        # EOF and would block forever if the helper failed before sending.
+        if not reader.poll():
+            raise RuntimeError(
+                "could not start %s: the helper process reported no pid" % executable
+            )
+        # receive pid from pipe
+        pid = reader.recv()
+    finally:
+        # close pipes and the helper's handle on success and on failure
+        writer.close()
+        reader.close()
+        process.close()
+
+    REGISTERED.append(pid)
+    return pid
+
+
+def _start_detached(executable, *args, writer: multiprocessing.Pipe = None):
+    """
+    Helper-process entry point used by ``start_detached``.
+
+    Launches ``executable`` with ``args``, sends the child's pid through
+    ``writer`` and exits the helper process.
+
+    :param executable: executable to launch
+    :param args: arguments to the executable
+    :param writer: sending end of the pipe created by ``start_detached``
+    """
+    # configure launch
+    kwargs = {}
+    if platform.system() == "Windows":
+        kwargs.update(creationflags=DETACHED_PROCESS | CREATE_NEW_PROCESS_GROUP)
+    else:
+        # The former ``sys.version_info < (3, 2)`` fallback was unreachable:
+        # this module's syntax already requires a newer Python 3.
+        kwargs.update(start_new_session=True)
+
+    # run
+    p = Popen([executable, *args], stdin=PIPE, stdout=PIPE, stderr=PIPE, **kwargs)
+
+    # send pid to pipe
+    writer.send(p.pid)
+    sys.exit()
+
+
+def _cleanup():
+    """
+    Send SIGTERM to every pid recorded in ``REGISTERED``.
+
+    Any error for a single pid is ignored so the remaining pids are still
+    handled.
+    """
+    log = logging.getLogger(__name__)
+    for pid in REGISTERED:
+        try:
+            log.debug("cleaning up pid %d " % pid)
+            os.kill(pid, signal.SIGTERM)
+        except:  # noqa
+            pass
+
+
+# run the cleanup when the interpreter exits
+atexit.register(_cleanup)
diff --git a/flaresolverr/undetected_chromedriver/options.py b/flaresolverr/undetected_chromedriver/options.py
new file mode 100644
index 0000000000000000000000000000000000000000..8078ae957d14b43296a094fbe3e87e463069aa4b
--- /dev/null
+++ b/flaresolverr/undetected_chromedriver/options.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+# this module is part of undetected_chromedriver
+
+
+import json
+import os
+
+from selenium.webdriver.chromium.options import ChromiumOptions as _ChromiumOptions
+
+
+class ChromeOptions(_ChromiumOptions):
+    """
+    Chromium options extended with a profile directory property and with
+    handling of the experimental "prefs" option, which is written straight
+    into the profile's ``Default/Preferences`` file.
+    """
+
+    _session = None
+    _user_data_dir = None
+
+    @property
+    def user_data_dir(self):
+        return self._user_data_dir
+
+    @user_data_dir.setter
+    def user_data_dir(self, path: str):
+        """
+        Sets the browser profile folder to use, or creates a new profile
+        at the given location.
+
+        Parameters
+        ----------
+        path: str
+            the path to a chrome profile folder
+            if it does not exist, a new profile will be created at given location
+        """
+        self._user_data_dir = os.path.normpath(os.path.abspath(path))
+
+    @staticmethod
+    def _undot_key(key, value):
+        """turn a (dotted key, value) into a proper nested dict"""
+        # wrap the value from the innermost key outwards
+        nested = value
+        for part in reversed(key.split(".")):
+            nested = {part: nested}
+        return nested
+
+    @staticmethod
+    def _merge_nested(a, b):
+        """
+        merges b into a
+        leaf values in a are overwritten with values from b
+        """
+        for key, incoming in b.items():
+            current = a.get(key) if key in a else None
+            if key in a and isinstance(current, dict) and isinstance(incoming, dict):
+                ChromeOptions._merge_nested(current, incoming)
+            else:
+                a[key] = incoming
+        return a
+
+    def handle_prefs(self, user_data_dir):
+        """
+        Write the experimental "prefs" option into the profile on disk.
+
+        Dotted keys are expanded into nested dicts, merged over an existing
+        ``Default/Preferences`` file when there is one, and saved back.
+        Afterwards "prefs" is removed from the experimental options.
+        Nothing happens when no prefs are set.
+        """
+        prefs = self.experimental_options.get("prefs")
+        if not prefs:
+            return
+
+        user_data_dir = user_data_dir or self._user_data_dir
+        default_path = os.path.join(user_data_dir, "Default")
+        os.makedirs(default_path, exist_ok=True)
+
+        # undot prefs dict keys
+        undot_prefs = {}
+        for dotted, value in prefs.items():
+            self._merge_nested(undot_prefs, self._undot_key(dotted, value))
+
+        prefs_file = os.path.join(default_path, "Preferences")
+        if os.path.exists(prefs_file):
+            with open(prefs_file, encoding="latin1", mode="r") as f:
+                on_disk = json.load(f)
+            undot_prefs = self._merge_nested(on_disk, undot_prefs)
+
+        with open(prefs_file, encoding="latin1", mode="w") as f:
+            json.dump(undot_prefs, f)
+
+        # remove the experimental_options to avoid an error
+        del self._experimental_options["prefs"]
+
+    @classmethod
+    def from_options(cls, options):
+        """Create a new instance and copy the attributes of ``options`` into it."""
+        clone = cls()
+        clone.__dict__.update(options.__dict__)
+        return clone
diff --git a/flaresolverr/undetected_chromedriver/patcher.py b/flaresolverr/undetected_chromedriver/patcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a60f76492ead6df854f61cb90b7ef78386fbf55
--- /dev/null
+++ b/flaresolverr/undetected_chromedriver/patcher.py
@@ -0,0 +1,473 @@
+#!/usr/bin/env python3
+# this module is part of undetected_chromedriver
+
+from packaging.version import Version as LooseVersion
+import io
+import json
+import logging
+import os
+import pathlib
+import platform
+import random
+import re
+import shutil
+import string
+import subprocess
+import sys
+import time
+from urllib.request import urlopen
+from urllib.request import urlretrieve
+import zipfile
+from multiprocessing import Lock
+
+# module-level logger named after this module
+logger = logging.getLogger(__name__)
+
+# True when sys.platform starts with one of the listed platform names
+IS_POSIX = sys.platform.startswith(("darwin", "cygwin", "linux", "linux2", "freebsd"))
+
+
+class Patcher(object):
+ # Manages the chromedriver executable: decides where it is stored,
+ # downloads and unpacks a release and patches it (see ``auto``).
+
+ # class-level multiprocessing lock
+ lock = Lock()
+ # "%s" is filled with the executable suffix in _set_platform_name
+ exe_name = "chromedriver%s"
+
+ platform = sys.platform
+ # pick the directory (before ~ expansion) that holds the driver files
+ if platform.endswith("win32"):
+ d = "~/appdata/roaming/undetected_chromedriver"
+ elif "LAMBDA_TASK_ROOT" in os.environ:
+ d = "/tmp/undetected_chromedriver"
+ elif platform.startswith(("linux", "linux2")):
+ d = "~/.local/share/undetected_chromedriver"
+ elif platform.endswith("darwin"):
+ d = "~/Library/Application Support/undetected_chromedriver"
+ else:
+ d = "~/.undetected_chromedriver"
+ # absolute, user-expanded form of the directory chosen above
+ data_path = os.path.abspath(os.path.expanduser(d))
+
+ def __init__(
+ self,
+ executable_path=None,
+ force=False,
+ version_main: int = 0,
+ user_multi_procs=False,
+ ):
+ """
+ Args:
+ executable_path: None = automatic
+ a full file path to the chromedriver executable
+ force: False
+ terminate processes which are holding lock
+ version_main: 0 = auto
+ specify main chrome version (rounded, ex: 82)
+ user_multi_procs: False
+ stored on the instance; when set, the default executable path
+ is not re-resolved here and ``auto`` first looks for an
+ existing chromedriver in ``data_path``
+ """
+ self.force = force
+ self._custom_exe_path = False
+ prefix = "undetected"
+ self.user_multi_procs = user_multi_procs
+
+ try:
+ # Try to convert version_main into an integer
+ version_main_int = int(version_main)
+ # check if version_main_int is less than or equal to e.g 114
+ self.is_old_chromedriver = version_main and version_main_int <= 114
+ except (ValueError,TypeError):
+ # Check not running inside Docker
+ if not os.path.exists("/app/chromedriver"):
+ # If the conversion fails, log an error message
+ logging.info("version_main cannot be converted to an integer")
+ # Set self.is_old_chromedriver to False if the conversion fails
+ self.is_old_chromedriver = False
+
+ # Needs to be called before self.exe_name is accessed
+ self._set_platform_name()
+
+ if not os.path.exists(self.data_path):
+ os.makedirs(self.data_path, exist_ok=True)
+
+ # default location inside data_path; FreeBSD uses the unprefixed name
+ if not executable_path:
+ if sys.platform.startswith("freebsd"):
+ self.executable_path = os.path.join(
+ self.data_path, self.exe_name
+ )
+ else:
+ self.executable_path = os.path.join(
+ self.data_path, "_".join([prefix, self.exe_name])
+ )
+
+ # on non-POSIX platforms a custom path gets an ".exe" suffix if missing
+ if not IS_POSIX:
+ if executable_path:
+ if not executable_path[-4:] == ".exe":
+ executable_path += ".exe"
+
+ # directory the downloaded archive is extracted into (see unzip_package)
+ self.zip_path = os.path.join(self.data_path, prefix)
+
+ if not executable_path:
+ if not self.user_multi_procs:
+ self.executable_path = os.path.abspath(
+ os.path.join(".", self.executable_path)
+ )
+
+ # a caller-supplied path overrides the default computed above
+ if executable_path:
+ self._custom_exe_path = True
+ self.executable_path = executable_path
+
+ # Set the correct repository to download the Chromedriver from
+ if self.is_old_chromedriver:
+ self.url_repo = "https://chromedriver.storage.googleapis.com"
+ else:
+ self.url_repo = "https://googlechromelabs.github.io/chrome-for-testing"
+
+ self.version_main = version_main
+ self.version_full = None
+
+ def _set_platform_name(self):
+     """
+     Derive ``platform_name`` and the final ``exe_name`` from ``self.platform``.
+
+     ``platform_name`` selects the chromedriver build to download, and the
+     "%s" placeholder in ``exe_name`` is replaced by the executable suffix
+     (".exe" on win32, empty otherwise). macOS uses "mac64" for old
+     chromedriver releases and "mac-x64" for newer ones.
+     """
+     current = self.platform
+     no_suffix = ""
+
+     if current.endswith("win32"):
+         self.platform_name = "win32"
+         self.exe_name = self.exe_name % ".exe"
+     if current.endswith(("linux", "linux2")):
+         self.platform_name = "linux64"
+         self.exe_name = self.exe_name % no_suffix
+     if current.endswith("darwin"):
+         self.platform_name = "mac64" if self.is_old_chromedriver else "mac-x64"
+         self.exe_name = self.exe_name % no_suffix
+     if current.startswith("freebsd"):
+         self.platform_name = "freebsd"
+         self.exe_name = self.exe_name % no_suffix
+
+ def auto(self, executable_path=None, force=False, version_main=None, _=None):
+     """
+     Make sure a patched chromedriver executable is available.
+
+     Args:
+         executable_path: optional path to an existing chromedriver; with a
+             custom path nothing is downloaded, the file is only patched
+             when it is not patched yet
+         force: when True it is stored on the instance; a locked executable
+             is then killed and the procedure is retried
+         version_main: optional major version replacing the stored one
+         _: unused
+
+     Returns:
+         True when an already patched executable is reused, None when a
+         custom executable is already patched or (FreeBSD) no chromedriver
+         is installed, otherwise the result of ``patch_exe()`` / ``patch()``
+
+     """
+     p = pathlib.Path(self.data_path)
+     if self.user_multi_procs:
+         # Use the shared class-level lock: a Lock() created on the spot is
+         # known to nobody else and therefore never excludes anyone.
+         with self.lock:
+             files = list(p.rglob("*chromedriver*"))
+             # nothing stored yet: continue with the regular procedure
+             # instead of failing in max() on an empty list
+             if files:
+                 most_recent = max(files, key=lambda f: f.stat().st_mtime)
+                 files.remove(most_recent)
+                 for stale in files:
+                     stale.unlink()
+                 if self.is_binary_patched(most_recent):
+                     self.executable_path = str(most_recent)
+                     return True
+
+     if executable_path:
+         self.executable_path = executable_path
+         self._custom_exe_path = True
+
+     if self._custom_exe_path:
+         ispatched = self.is_binary_patched(self.executable_path)
+         if not ispatched:
+             return self.patch_exe()
+         else:
+             return
+
+     if version_main:
+         self.version_main = version_main
+     if force is True:
+         self.force = force
+
+     if self.platform_name == "freebsd":
+         chromedriver_path = shutil.which("chromedriver")
+
+         # shutil.which returns None when nothing is found on PATH
+         if (
+             not chromedriver_path
+             or not os.path.isfile(chromedriver_path)
+             or not os.access(chromedriver_path, os.X_OK)
+         ):
+             logging.error("Chromedriver not installed!")
+             return
+
+         version_path = os.path.join(os.path.dirname(self.executable_path), "version.txt")
+
+         process = os.popen(f'"{chromedriver_path}" --version')
+         chromedriver_version = process.read().split(' ')[1].split(' ')[0]
+         process.close()
+
+         current_version = None
+         if os.path.isfile(version_path) or os.access(version_path, os.X_OK):
+             with open(version_path, 'r') as f:
+                 current_version = f.read()
+
+         # copy the system chromedriver only when its version changed
+         if current_version != chromedriver_version:
+             logging.info("Copying chromedriver executable...")
+             shutil.copy(chromedriver_path, self.executable_path)
+             os.chmod(self.executable_path, 0o755)
+
+             with open(version_path, 'w') as f:
+                 f.write(chromedriver_version)
+
+             logging.info("Chromedriver executable copied!")
+     else:
+         try:
+             os.unlink(self.executable_path)
+         except PermissionError:
+             if self.force:
+                 self.force_kill_instances(self.executable_path)
+                 return self.auto(force=not self.force)
+             try:
+                 if self.is_binary_patched():
+                     # assumes already running AND patched
+                     return True
+             except PermissionError:
+                 pass
+             # return False
+         except FileNotFoundError:
+             pass
+
+         release = self.fetch_release_number()
+         self.version_main = release.major
+         self.version_full = release
+         self.unzip_package(self.fetch_package())
+
+     return self.patch()
+
+ def driver_binary_in_use(self, path: str = None) -> bool:
+     """
+     naive test to check if a found chromedriver binary is
+     currently in use
+
+     The file is opened in append/binary mode and probed with a seek and a
+     read; a ``PermissionError`` from either probe counts as "in use".
+
+     Args:
+         path: a string or PathLike object to the binary to check.
+               if not specified, this object's executable_path is checked
+
+     Returns:
+         True when a probe raised PermissionError, False when both probes
+         succeeded, None when anything else went wrong (e.g. the file
+         could not be opened)
+
+     Raises:
+         OSError: when the file does not exist
+     """
+     target = pathlib.Path(path if path else self.executable_path)
+
+     if not target.exists():
+         raise OSError("file does not exist: %s" % target)
+     try:
+         with open(target, mode="a+b") as fs:
+             failures = []
+             # some systems apparently allow seeking, so a read is probed too
+             for probe in (lambda: fs.seek(0, 0), lambda: fs.readline()):
+                 try:
+                     probe()
+                 except PermissionError as e:
+                     failures.append(e)
+             return bool(failures)
+     except Exception:
+         # logger.exception("whoops ", e)
+         # NOTE(review): this path yields None, not True - confirm intent
+         pass
+
+ def cleanup_unused_files(self):
+     """
+     Delete every entry directly inside ``data_path`` whose name contains
+     "undetected". Entries that cannot be deleted are skipped silently.
+     """
+     leftovers = list(pathlib.Path(self.data_path).glob("*undetected*"))
+     for leftover in leftovers:
+         try:
+             leftover.unlink()
+         except:  # noqa
+             pass
+
+ def patch(self):
+     """Run ``patch_exe`` and return what ``is_binary_patched`` reports afterwards."""
+     self.patch_exe()
+     patched = self.is_binary_patched()
+     return patched
+
+ def fetch_release_number(self):
+     """
+     Gets the release to download: the latest release of ``self.version_main``
+     when it is set, otherwise the latest Stable release.
+
+     Old chromedriver versions are looked up on the legacy storage endpoint,
+     newer ones in the chrome-for-testing JSON indexes.
+
+     :return: the release version
+     :rtype: LooseVersion
+     """
+     # Endpoint for old versions of Chromedriver (114 and below)
+     if self.is_old_chromedriver:
+         path = f"/latest_release_{self.version_main}"
+         path = path.upper()
+         logger.debug("getting release number from %s" % path)
+         # the context manager closes the HTTP response, as the branches
+         # below already do
+         with urlopen(self.url_repo + path) as conn:
+             return LooseVersion(conn.read().decode())
+
+     # Endpoint for new versions of Chromedriver (115+)
+     if not self.version_main:
+         # Fetch the latest version
+         path = "/last-known-good-versions-with-downloads.json"
+         logger.debug("getting release number from %s" % path)
+         with urlopen(self.url_repo + path) as conn:
+             response = conn.read().decode()
+
+         last_versions = json.loads(response)
+         return LooseVersion(last_versions["channels"]["Stable"]["version"])
+
+     # Fetch the latest minor version of the major version provided
+     path = "/latest-versions-per-milestone-with-downloads.json"
+     logger.debug("getting release number from %s" % path)
+     with urlopen(self.url_repo + path) as conn:
+         response = conn.read().decode()
+
+     major_versions = json.loads(response)
+     return LooseVersion(major_versions["milestones"][str(self.version_main)]["version"])
+
+ def parse_exe_version(self):
+     """
+     Scan the executable for its embedded version marker.
+
+     Returns the version found after the first
+     ``platform_handle\\0content\\0`` marker as a ``LooseVersion``, or None
+     when no line contains the marker.
+     """
+     marker = re.compile(rb"platform_handle\x00content\x00([0-9.]*)")
+     with io.open(self.executable_path, "rb") as f:
+         for line in f:
+             match = marker.search(line)
+             if match:
+                 return LooseVersion(match[1].decode())
+     return None
+
+ def fetch_package(self):
+     """
+     Downloads ChromeDriver from source
+
+     Old releases are fetched from ``self.url_repo``; newer ones from the
+     chrome-for-testing public storage, whose archive names use a dash.
+
+     :return: path to downloaded file
+     """
+     version = str(self.version_full)
+     if self.is_old_chromedriver:
+         zip_name = f"chromedriver_{self.platform_name}.zip"
+         download_url = "%s/%s/%s" % (self.url_repo, version, zip_name)
+     else:
+         zip_name = f"chromedriver_{self.platform_name}.zip".replace("_", "-", 1)
+         template = "https://storage.googleapis.com/chrome-for-testing-public/%s/%s/%s"
+         download_url = template % (version, self.platform_name, zip_name)
+
+     logger.debug("downloading from %s" % download_url)
+     local_file, _headers = urlretrieve(download_url)
+     return local_file
+
+ def unzip_package(self, fp):
+ """
+ Does what it says
+
+ :return: path to unpacked executable
+ """
+ exe_path = self.exe_name
+ if not self.is_old_chromedriver:
+ # The new chromedriver unzips into its own folder
+ zip_name = f"chromedriver-{self.platform_name}"
+ exe_path = os.path.join(zip_name, self.exe_name)
+
+ logger.debug("unzipping %s" % fp)
+ try:
+ os.unlink(self.zip_path)
+ except (FileNotFoundError, OSError):
+ pass
+
+ os.makedirs(self.zip_path, mode=0o755, exist_ok=True)
+ with zipfile.ZipFile(fp, mode="r") as zf:
+ zf.extractall(self.zip_path)
+ os.rename(os.path.join(self.zip_path, exe_path), self.executable_path)
+ os.remove(fp)
+ shutil.rmtree
+ os.chmod(self.executable_path, 0o755)
+ return self.executable_path
+
+ @staticmethod
+ def force_kill_instances(exe_name):
+ """
+ kills running instances.
+ :param: executable name to kill, may be a path as well
+
+ :return: True on success else False
+ """
+ exe_name = os.path.basename(exe_name)
+ if IS_POSIX:
+ # Using shell=True for pidof, consider a more robust pid finding method if issues arise.
+ # pgrep can be an alternative: ["pgrep", "-f", exe_name]
+ # Or psutil if adding a dependency is acceptable.
+ command = f"pidof {exe_name}"
+ try:
+ result = subprocess.run(command, shell=True, capture_output=True, text=True, check=True)
+ pids = result.stdout.strip().split()
+ if pids:
+ subprocess.run(["kill", "-9"] + pids, check=False) # Changed from -f -9 to -9 as -f is not standard for kill
+ return True
+ return False # No PIDs found
+ except subprocess.CalledProcessError: # pidof returns 1 if no process found
+ return False # No process found
+ except Exception as e:
+ logger.debug(f"Error killing process on POSIX: {e}")
+ return False
+ else:
+ try:
+ # TASKKILL /F /IM chromedriver.exe
+ result = subprocess.run(["taskkill", "/f", "/im", exe_name], check=False, capture_output=True)
+ # taskkill returns 0 if process was killed, 128 if not found.
+ return result.returncode == 0
+ except Exception as e:
+ logger.debug(f"Error killing process on Windows: {e}")
+ return False
+
+ @staticmethod
+ def gen_random_cdc():
+ cdc = random.choices(string.ascii_letters, k=27)
+ return "".join(cdc).encode()
+
+ def is_binary_patched(self, executable_path=None):
+ executable_path = executable_path or self.executable_path
+ try:
+ with io.open(executable_path, "rb") as fh:
+ return fh.read().find(b"undetected chromedriver") != -1
+ except FileNotFoundError:
+ return False
+
+ def patch_exe(self):
+ """
+ Overwrite the injected ``{window.cdc...;}`` code block inside the driver binary.
+
+ The first block matching the regex below is replaced, everywhere it occurs,
+ by a ``console.log`` statement padded with spaces to the length of the
+ matched block. Nothing is written when the regex does not match.
+ The elapsed time is logged at debug level in every case.
+ """
+ start = time.perf_counter()
+ logger.info("patching driver executable %s" % self.executable_path)
+ # "r+b": read the whole binary, then rewrite it in place through the same handle
+ with io.open(self.executable_path, "r+b") as fh:
+ content = fh.read()
+ # match_injected_codeblock = re.search(rb"{window.*;}", content)
+ match_injected_codeblock = re.search(rb"\{window\.cdc.*?;\}", content)
+ if match_injected_codeblock:
+ target_bytes = match_injected_codeblock[0]
+ # ljust only pads: a matched block shorter than the replacement text
+ # would make the new content longer than the original
+ new_target_bytes = (
+ b'{console.log("undetected chromedriver 1337!")}'.ljust(
+ len(target_bytes), b" "
+ )
+ )
+ new_content = content.replace(target_bytes, new_target_bytes)
+ if new_content == content:
+ logger.warning(
+ "something went wrong patching the driver binary. could not find injection code block"
+ )
+ else:
+ logger.debug(
+ "found block:\n%s\nreplacing with:\n%s"
+ % (target_bytes, new_target_bytes)
+ )
+ # rewind before writing so the file is overwritten from its first byte
+ fh.seek(0)
+ fh.write(new_content)
+ logger.debug(
+ "patching took us {:.2f} seconds".format(time.perf_counter() - start)
+ )
+
+ def __repr__(self):
+ return "{0:s}({1:s})".format(
+ self.__class__.__name__,
+ self.executable_path,
+ )
+
+ def __del__(self):
+ if self._custom_exe_path:
+ # if the driver binary is specified by user
+ # we assume it is important enough to not delete it
+ return
+ else:
+ timeout = 3 # stop trying after this many seconds
+ t = time.monotonic()
+ now = lambda: time.monotonic()
+ while now() - t > timeout:
+ # we don't want to wait until the end of time
+ try:
+ if self.user_multi_procs:
+ break
+ os.unlink(self.executable_path)
+ logger.debug("successfully unlinked %s" % self.executable_path)
+ break
+ except (OSError, RuntimeError, PermissionError):
+ time.sleep(0.01)
+ continue
+ except FileNotFoundError:
+ break
diff --git a/flaresolverr/undetected_chromedriver/reactor.py b/flaresolverr/undetected_chromedriver/reactor.py
new file mode 100644
index 0000000000000000000000000000000000000000..d52e312e37dbed8669e43d43c762c0aa343edac7
--- /dev/null
+++ b/flaresolverr/undetected_chromedriver/reactor.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+# this module is part of undetected_chromedriver
+
+import asyncio
+import json
+import logging
+import threading
+
+
+logger = logging.getLogger(__name__)
+
+
+class Reactor(threading.Thread):
+ def __init__(self, driver: "Chrome"):
+ super().__init__()
+
+ self.driver = driver
+ self.loop = asyncio.new_event_loop()
+
+ self.lock = threading.Lock()
+ self.event = threading.Event()
+ self.daemon = True
+ self.handlers = {}
+
+ def add_event_handler(self, method_name, callback: callable):
+ """
+
+ Parameters
+ ----------
+ event_name: str
+ example "Network.responseReceived"
+
+ callback: callable
+ callable which accepts 1 parameter: the message object dictionary
+
+ Returns
+ -------
+
+ """
+ with self.lock:
+ self.handlers[method_name.lower()] = callback
+
+ @property
+ def running(self):
+ return not self.event.is_set()
+
+ def run(self):
+ try:
+ asyncio.set_event_loop(self.loop)
+ self.loop.run_until_complete(self.listen())
+ except Exception as e:
+ logger.warning("Reactor.run() => %s", e)
+
+ async def _wait_service_started(self):
+ while True:
+ with self.lock:
+ if (
+ getattr(self.driver, "service", None)
+ and getattr(self.driver.service, "process", None)
+ and self.driver.service.process.poll()
+ ):
+ await asyncio.sleep(self.driver._delay or 0.25)
+ else:
+ break
+
+ async def listen(self):
+ while self.running:
+ await self._wait_service_started()
+ await asyncio.sleep(1)
+
+ try:
+ with self.lock:
+ log_entries = self.driver.get_log("performance")
+
+ for entry in log_entries:
+ try:
+ obj_serialized: str = entry.get("message")
+ obj = json.loads(obj_serialized)
+ message = obj.get("message")
+ method = message.get("method")
+
+ if "*" in self.handlers:
+ await self.loop.run_in_executor(
+ None, self.handlers["*"], message
+ )
+ elif method.lower() in self.handlers:
+ await self.loop.run_in_executor(
+ None, self.handlers[method.lower()], message
+ )
+
+ # print(type(message), message)
+ except Exception as e:
+ raise e from None
+
+ except Exception as e:
+ if "invalid session id" in str(e):
+ pass
+ else:
+ logging.debug("exception ignored :", e)
diff --git a/flaresolverr/undetected_chromedriver/webelement.py b/flaresolverr/undetected_chromedriver/webelement.py
new file mode 100644
index 0000000000000000000000000000000000000000..03d687890b58751a6910f3014fdb31877779e3d4
--- /dev/null
+++ b/flaresolverr/undetected_chromedriver/webelement.py
@@ -0,0 +1,86 @@
+from typing import List
+
+from selenium.webdriver.common.by import By
+import selenium.webdriver.remote.webelement
+
+
+class WebElement(selenium.webdriver.remote.webelement.WebElement):
+ def click_safe(self):
+ super().click()
+ self._parent.reconnect(0.1)
+
+ def children(
+ self, tag=None, recursive=False
+ ) -> List[selenium.webdriver.remote.webelement.WebElement]:
+ """
+ returns direct child elements of current element
+ :param tag: str, if supplied, returns nodes only
+ """
+ script = "return [... arguments[0].children]"
+ if tag:
+ script += ".filter( node => node.tagName === '%s')" % tag.upper()
+ if recursive:
+ return list(_recursive_children(self, tag))
+ return list(self._parent.execute_script(script, self))
+
+
+class UCWebElement(WebElement):
+ """
+ Custom WebElement class which makes it easier to view elements when
+ working in an interactive environment.
+
+ standard webelement repr:
+ the generic selenium repr, which does not show the tag or its attributes
+
+ using this WebElement class:
+ UCWebElement <tag_name attr1="value1" attr2="value2">
+
+ """
+
+ def __init__(self, parent, id_):
+ super().__init__(parent, id_)
+ # cache for the `attrs` property; filled on first access
+ self._attrs = None
+
+ @property
+ def attrs(self):
+ # Fetch all DOM attributes of the element with one script call and cache
+ # the resulting name -> value mapping. An empty result is falsy, so an
+ # element without attributes is queried again on every access.
+ if not self._attrs:
+ self._attrs = self._parent.execute_script(
+ """
+ var items = {};
+ for (index = 0; index < arguments[0].attributes.length; ++index)
+ {
+ items[arguments[0].attributes[index].name] = arguments[0].attributes[index].value
+ };
+ return items;
+ """,
+ self,
+ )
+ return self._attrs
+
+ def __repr__(self):
+ # Render as: ClassName <tag attr="value" ...>
+ strattrs = " ".join([f'{k}="{v}"' for k, v in self.attrs.items()])
+ if strattrs:
+ strattrs = " " + strattrs
+ return f"{self.__class__.__name__} <{self.tag_name}{strattrs}>"
+
+
+def _recursive_children(element, tag: str = None, _results=None):
+ """
+ returns all children of recursively
+
+ :param element: `WebElement` object.
+ find children below this
+
+ :param tag: str = None.
+ if provided, return only elements. example: 'a', or 'img'
+ :param _results: do not use!
+ """
+ results = _results or set()
+ for element in element.children():
+ if tag:
+ if element.tag_name == tag:
+ results.add(element)
+ else:
+ results.add(element)
+ results |= _recursive_children(element, tag, results)
+ return results
diff --git a/flaresolverr/utils.py b/flaresolverr/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ad558edd0e85f2c5b90e89c4e9a496a09e21fff
--- /dev/null
+++ b/flaresolverr/utils.py
@@ -0,0 +1,376 @@
+import json
+import logging
+import os
+import platform
+import re
+import shutil
+import sys
+import tempfile
+import urllib.parse
+
+from selenium.webdriver.chrome.webdriver import WebDriver
+import undetected_chromedriver as uc
+
+FLARESOLVERR_VERSION = None
+PLATFORM_VERSION = None
+CHROME_EXE_PATH = None
+CHROME_MAJOR_VERSION = None
+USER_AGENT = None
+XVFB_DISPLAY = None
+PATCHED_DRIVER_PATH = None
+
+
+def get_config_log_html() -> bool:
+ return os.environ.get('LOG_HTML', 'false').lower() == 'true'
+
+
+def get_config_headless() -> bool:
+ return os.environ.get('HEADLESS', 'true').lower() == 'true'
+
+
+def get_config_disable_media() -> bool:
+ return os.environ.get('DISABLE_MEDIA', 'false').lower() == 'true'
+
+
+def get_flaresolverr_version() -> str:
+ global FLARESOLVERR_VERSION
+ if FLARESOLVERR_VERSION is not None:
+ return FLARESOLVERR_VERSION
+
+ package_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'package.json')
+ if not os.path.isfile(package_path):
+ package_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'package.json')
+ with open(package_path) as f:
+ FLARESOLVERR_VERSION = json.loads(f.read())['version']
+ return FLARESOLVERR_VERSION
+
+def get_current_platform() -> str:
+ global PLATFORM_VERSION
+ if PLATFORM_VERSION is not None:
+ return PLATFORM_VERSION
+ PLATFORM_VERSION = os.name
+ return PLATFORM_VERSION
+
+
+def create_proxy_extension(proxy: dict) -> str:
+ """
+ Write a throw-away Chrome extension that routes traffic through an
+ authenticated proxy and return the directory that contains it.
+
+ :param proxy: dict with the keys 'url', 'username' and 'password'
+ :return: path of a new temporary directory holding manifest.json and
+ background.js; the caller is responsible for deleting it
+ """
+ parsed_url = urllib.parse.urlparse(proxy['url'])
+ scheme = parsed_url.scheme
+ host = parsed_url.hostname
+ # NOTE(review): `port` is None when the proxy URL carries no explicit port,
+ # and the "%d" placeholder below then raises TypeError.
+ port = parsed_url.port
+ username = proxy['username']
+ password = proxy['password']
+ # NOTE(review): the "host_permissions" entry below (and the `urls` filter in
+ # background.js) are empty strings here; presumably a match pattern is
+ # expected in both places - confirm against the intended manifest.
+ manifest_json = """
+ {
+ "version": "1.0.0",
+ "manifest_version": 3,
+ "name": "Chrome Proxy",
+ "permissions": [
+ "proxy",
+ "tabs",
+ "storage",
+ "webRequest",
+ "webRequestAuthProvider"
+ ],
+ "host_permissions": [
+ ""
+ ],
+ "background": {
+ "service_worker": "background.js"
+ },
+ "minimum_chrome_version": "76.0.0"
+ }
+ """
+
+ # NOTE(review): the values are interpolated into JavaScript string literals
+ # without escaping, so a quote or backslash in the credentials breaks the script.
+ background_js = """
+ var config = {
+ mode: "fixed_servers",
+ rules: {
+ singleProxy: {
+ scheme: "%s",
+ host: "%s",
+ port: %d
+ },
+ bypassList: ["localhost"]
+ }
+ };
+
+ chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});
+
+ function callbackFn(details) {
+ return {
+ authCredentials: {
+ username: "%s",
+ password: "%s"
+ }
+ };
+ }
+
+ chrome.webRequest.onAuthRequired.addListener(
+ callbackFn,
+ { urls: [""] },
+ ['blocking']
+ );
+ """ % (
+ scheme,
+ host,
+ port,
+ username,
+ password
+ )
+
+ # mkdtemp creates a fresh directory that is not removed automatically
+ proxy_extension_dir = tempfile.mkdtemp()
+
+ with open(os.path.join(proxy_extension_dir, "manifest.json"), "w") as f:
+ f.write(manifest_json)
+
+ with open(os.path.join(proxy_extension_dir, "background.js"), "w") as f:
+ f.write(background_js)
+
+ return proxy_extension_dir
+
+
+def get_webdriver(proxy: dict = None) -> WebDriver:
+ """
+ Configure Chrome options, start an undetected_chromedriver instance and return it.
+
+ :param proxy: optional dict. With 'url', 'username' and 'password' a temporary
+ proxy extension is generated and loaded; with only 'url' the
+ ``--proxy-server`` switch is used.
+ :return: the started driver
+ :raises Exception: re-raises whatever ``uc.Chrome`` raised, after logging it
+ """
+ global PATCHED_DRIVER_PATH, USER_AGENT
+ logging.debug('Launching web browser...')
+
+ # undetected_chromedriver
+ options = uc.ChromeOptions()
+ options.add_argument('--no-sandbox')
+ options.add_argument('--window-size=1280,1024') # Smaller window for less overhead
+ options.add_argument('--disable-search-engine-choice-screen')
+ options.add_argument('--disable-setuid-sandbox')
+ options.add_argument('--disable-dev-shm-usage')
+ options.add_argument('--no-zygote')
+ options.add_argument('--disable-gpu') # Disable GPU for faster headless boot
+ options.add_argument('--mute-audio')
+ options.add_argument('--disable-notifications')
+ options.add_argument('--disable-popup-blocking')
+ # NOTE(review): --disable-extensions is always set, yet the proxy branch below
+ # relies on --load-extension; confirm the two switches work together.
+ options.add_argument('--disable-extensions')
+ options.add_argument('--disable-blink-features=AutomationControlled')
+
+ # Force headless and invisibility
+ options.add_argument('--headless=new')
+
+ IS_ARMARCH = platform.machine().startswith(('arm', 'aarch'))
+ if IS_ARMARCH:
+ options.add_argument('--disable-gpu-sandbox')
+ options.add_argument('--ignore-certificate-errors')
+ options.add_argument('--ignore-ssl-errors')
+
+ language = os.environ.get('LANG', None)
+ if language is not None:
+ options.add_argument('--accept-lang=%s' % language)
+
+ # Fix for Chrome 117 | https://github.com/FlareSolverr/FlareSolverr/issues/910
+ if USER_AGENT is not None:
+ options.add_argument('--user-agent=%s' % USER_AGENT)
+
+ # Proxy with credentials -> generated extension; proxy without -> plain switch
+ proxy_extension_dir = None
+ if proxy and all(key in proxy for key in ['url', 'username', 'password']):
+ proxy_extension_dir = create_proxy_extension(proxy)
+ options.add_argument("--disable-features=DisableLoadExtensionCommandLineSwitch")
+ options.add_argument("--load-extension=%s" % os.path.abspath(proxy_extension_dir))
+ elif proxy and 'url' in proxy:
+ proxy_url = proxy['url']
+ logging.debug("Using webdriver proxy: %s", proxy_url)
+ options.add_argument('--proxy-server=%s' % proxy_url)
+
+ # note: headless mode is detected (headless = True)
+ # we launch the browser in head-full mode with the window hidden
+ windows_headless = True if os.name == 'nt' else False
+ if get_config_headless():
+ if os.name != 'nt':
+ start_xvfb_display()
+
+ # Override for absolute invisibility on Windows
+ if os.name == 'nt':
+ options.add_argument('--hide-scrollbars')
+ options.add_argument('--disable-logging')
+ options.add_argument('--log-level=3')
+
+ # if we are inside the Docker container, we avoid downloading the driver
+ driver_exe_path = None
+ version_main = None
+ if os.path.exists("/app/chromedriver"):
+ # running inside Docker
+ driver_exe_path = "/app/chromedriver"
+ else:
+ version_main = get_chrome_major_version()
+ if PATCHED_DRIVER_PATH is not None:
+ driver_exe_path = PATCHED_DRIVER_PATH
+
+ # detect chrome path
+ browser_executable_path = get_chrome_exe_path()
+
+ # CRITICAL: Clean up undetected_chromedriver cache on Windows to avoid WinError 183
+ # NOTE(review): the bare `except:` clauses below swallow every exception,
+ # including KeyboardInterrupt; `except OSError` would cover file-system failures.
+ if os.name == 'nt':
+ try:
+ uc_path = os.path.join(os.environ.get('APPDATA', ''), 'undetected_chromedriver')
+ if os.path.exists(uc_path):
+ # Try to remove the file that usually causes WinError 183
+ target_exe = os.path.join(uc_path, 'undetected_chromedriver.exe')
+ if os.path.exists(target_exe):
+ try: os.remove(target_exe)
+ except: pass
+ except: pass
+
+ # downloads and patches the chromedriver
+ # if we don't set driver_executable_path it downloads, patches, and deletes the driver each time
+ # NOTE(review): when uc.Chrome raises, proxy_extension_dir (a mkdtemp
+ # directory) is never removed because the cleanup further down is skipped.
+ try:
+ driver = uc.Chrome(options=options, browser_executable_path=browser_executable_path,
+ driver_executable_path=driver_exe_path, version_main=version_main,
+ windows_headless=windows_headless, headless=get_config_headless())
+ except Exception as e:
+ logging.error("Error starting Chrome: %s" % e)
+ # No point in continuing if we cannot retrieve the driver
+ raise e
+
+ # save the patched driver to avoid re-downloads
+ if driver_exe_path is None:
+ try:
+ target_path = os.path.join(driver.patcher.data_path, driver.patcher.exe_name)
+ if target_path != driver.patcher.executable_path:
+ # On Windows, we might get WinError 183 if the file is locked or exists
+ if os.path.exists(target_path):
+ try: os.remove(target_path)
+ except: pass
+ shutil.copy(driver.patcher.executable_path, target_path)
+ PATCHED_DRIVER_PATH = target_path
+ except Exception as e:
+ logging.warning(f"Failed to save patched driver: {e}")
+
+ # clean up proxy extension directory
+ if proxy_extension_dir is not None:
+ shutil.rmtree(proxy_extension_dir)
+
+ # selenium vanilla
+ # options = webdriver.ChromeOptions()
+ # options.add_argument('--no-sandbox')
+ # options.add_argument('--window-size=1920,1080')
+ # options.add_argument('--disable-setuid-sandbox')
+ # options.add_argument('--disable-dev-shm-usage')
+ # driver = webdriver.Chrome(options=options)
+
+ return driver
+
+
+def get_chrome_exe_path() -> str:
+ global CHROME_EXE_PATH
+ if CHROME_EXE_PATH is not None:
+ return CHROME_EXE_PATH
+ # linux pyinstaller bundle
+ chrome_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'chrome', "chrome")
+ if os.path.exists(chrome_path):
+ if not os.access(chrome_path, os.X_OK):
+ raise Exception(f'Chrome binary "{chrome_path}" is not executable. '
+ f'Please, extract the archive with "tar xzf ".')
+ CHROME_EXE_PATH = chrome_path
+ return CHROME_EXE_PATH
+ # windows pyinstaller bundle
+ chrome_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'chrome', "chrome.exe")
+ if os.path.exists(chrome_path):
+ CHROME_EXE_PATH = chrome_path
+ return CHROME_EXE_PATH
+ # system
+ CHROME_EXE_PATH = uc.find_chrome_executable()
+ return CHROME_EXE_PATH
+
+
+def get_chrome_major_version() -> str:
+ global CHROME_MAJOR_VERSION
+ if CHROME_MAJOR_VERSION is not None:
+ return CHROME_MAJOR_VERSION
+
+ if os.name == 'nt':
+ # Example: '104.0.5112.79'
+ try:
+ complete_version = extract_version_nt_executable(get_chrome_exe_path())
+ except Exception:
+ try:
+ complete_version = extract_version_nt_registry()
+ except Exception:
+ # Example: '104.0.5112.79'
+ complete_version = extract_version_nt_folder()
+ else:
+ chrome_path = get_chrome_exe_path()
+ process = os.popen(f'"{chrome_path}" --version')
+ # Example 1: 'Chromium 104.0.5112.79 Arch Linux\n'
+ # Example 2: 'Google Chrome 104.0.5112.79 Arch Linux\n'
+ complete_version = process.read()
+ process.close()
+
+ CHROME_MAJOR_VERSION = complete_version.split('.')[0].split(' ')[-1]
+ return CHROME_MAJOR_VERSION
+
+
+def extract_version_nt_executable(exe_path: str) -> str:
+ import pefile
+ pe = pefile.PE(exe_path, fast_load=True)
+ pe.parse_data_directories(
+ directories=[pefile.DIRECTORY_ENTRY["IMAGE_DIRECTORY_ENTRY_RESOURCE"]]
+ )
+ return pe.FileInfo[0][0].StringTable[0].entries[b"FileVersion"].decode('utf-8')
+
+
+def extract_version_nt_registry() -> str:
+ stream = os.popen(
+ 'reg query "HKLM\\SOFTWARE\\Wow6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\Google Chrome"')
+ output = stream.read()
+ google_version = ''
+ for letter in output[output.rindex('DisplayVersion REG_SZ') + 24:]:
+ if letter != '\n':
+ google_version += letter
+ else:
+ break
+ return google_version.strip()
+
+
+def extract_version_nt_folder() -> str:
+ # Check if the Chrome folder exists in the x32 or x64 Program Files folders.
+ for i in range(2):
+ path = 'C:\\Program Files' + (' (x86)' if i else '') + '\\Google\\Chrome\\Application'
+ if os.path.isdir(path):
+ paths = [f.path for f in os.scandir(path) if f.is_dir()]
+ for path in paths:
+ filename = os.path.basename(path)
+ pattern = r'\d+\.\d+\.\d+\.\d+'
+ match = re.search(pattern, filename)
+ if match and match.group():
+ # Found a Chrome version.
+ return match.group(0)
+ return ''
+
+
+def get_user_agent(driver=None) -> str:
+ global USER_AGENT
+ if USER_AGENT is not None:
+ return USER_AGENT
+
+ try:
+ if driver is None:
+ driver = get_webdriver()
+ USER_AGENT = driver.execute_script("return navigator.userAgent")
+ # Fix for Chrome 117 | https://github.com/FlareSolverr/FlareSolverr/issues/910
+ USER_AGENT = re.sub('HEADLESS', '', USER_AGENT, flags=re.IGNORECASE)
+ return USER_AGENT
+ except Exception as e:
+ raise Exception("Error getting browser User-Agent. " + str(e))
+ finally:
+ if driver is not None:
+ if PLATFORM_VERSION == "nt":
+ driver.close()
+ driver.quit()
+
+
+def start_xvfb_display():
+ global XVFB_DISPLAY
+ if XVFB_DISPLAY is None:
+ from xvfbwrapper import Xvfb
+ XVFB_DISPLAY = Xvfb()
+ XVFB_DISPLAY.start()
+
+
+def object_to_dict(_object):
+ json_dict = json.loads(json.dumps(_object, default=lambda o: o.__dict__))
+ # remove hidden fields
+ return {k: v for k, v in json_dict.items() if not k.startswith('__')}
diff --git a/keep_alive.py b/keep_alive.py
new file mode 100644
index 0000000000000000000000000000000000000000..00ace6dbf5cc709f6989a1324755af30602d32e1
--- /dev/null
+++ b/keep_alive.py
@@ -0,0 +1,47 @@
+"""
+Keep-Alive Service to prevent Render.com from sleeping
+Pings the server every 10 minutes to maintain activity
+"""
+import asyncio
+import httpx
+import logging
+from datetime import datetime
+
+logger = logging.getLogger("keep_alive")
+
+class KeepAliveService:
+ def __init__(self, base_url: str = "http://localhost:7860"):
+ self.base_url = base_url
+ self.running = False
+ self.ping_interval = 600 # 10 minutes
+
+ async def start(self):
+ """Start the keep-alive service"""
+ self.running = True
+ logger.info("🔄 Keep-Alive service started (pinging every 10 minutes)")
+
+ while self.running:
+ try:
+ await asyncio.sleep(self.ping_interval)
+ await self._ping()
+ except Exception as e:
+ logger.error(f"Keep-Alive error: {e}")
+
+ async def _ping(self):
+ """Send a ping to keep the service alive"""
+ try:
+ async with httpx.AsyncClient(timeout=10.0) as client:
+ response = await client.get(f"{self.base_url}/health")
+ if response.status_code == 200:
+ logger.info(f"✅ Keep-Alive ping successful at {datetime.now().strftime('%H:%M:%S')}")
+ else:
+ logger.warning(f"⚠️ Keep-Alive ping returned {response.status_code}")
+ except Exception as e:
+ logger.warning(f"Keep-Alive ping failed: {e}")
+
+ def stop(self):
+ """Stop the keep-alive service"""
+ self.running = False
+ logger.info("Keep-Alive service stopped")
+
+keep_alive = KeepAliveService()
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..a62fcc62c44a7db2d9e156a0a8227b78912aca44
--- /dev/null
+++ b/main.py
@@ -0,0 +1,352 @@
+import logging
+import time
+from typing import List, Optional
+from fastapi import FastAPI, Request, HTTPException, Query
+from fastapi.responses import JSONResponse, FileResponse, StreamingResponse, RedirectResponse
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.middleware.gzip import GZipMiddleware
+import httpx
+from scraper.engine import scraper
+from downloader import downloader
+import os
+import re
+from urllib.parse import unquote, quote
+from fastapi.staticfiles import StaticFiles
+from database import init_db
+from keep_alive import keep_alive
+import asyncio
+import io
+
+# Configure logging
+# (root logger at INFO; format: timestamp, level, logger name, message)
+logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+ datefmt="%Y-%m-%d %H:%M:%S",
+)
+# Logger used by the endpoints in this module
+logger = logging.getLogger("backend")
+
+# The FastAPI application; routes and middleware are registered on it below
+app = FastAPI(title="MEIH Movies API", version="2.0.0")
+
+# --- Simple Caching Layer ---
+class MemoryCache:
+ def __init__(self):
+ self._cache = {}
+
+ def get(self, key: str):
+ item = self._cache.get(key)
+ if item:
+ expire_time, data = item
+ if time.time() < expire_time:
+ return data
+ else:
+ del self._cache[key]
+ return None
+
+ def set(self, key: str, data, ttl_seconds: int = 600): # Default 10 mins
+ self._cache[key] = (time.time() + ttl_seconds, data)
+
+cache = MemoryCache()
+
+async def warm_scraper():
+ """Warms up the scraper by making an initial request to sync cookies."""
+ logger.info("🔥 Warming up scraper in background...")
+ try:
+ # Give services a few more seconds to be truly ready
+ await asyncio.sleep(5)
+ await scraper.fetch_home(page=1)
+ logger.info("✅ Scraper warmed up and cookies synced")
+ except Exception as e:
+ logger.warning(f"⚠️ Scraper warmup failed (will retry on first request): {e}")
+
+@app.on_event("startup")
+async def startup_event():
+ await init_db()
+ logger.info("🚀 Database initialized and ready")
+
+ # Detect if running on Hugging Face
+ is_hf = os.environ.get("SPACE_ID") is not None or os.environ.get("HF_SPACE") is not None
+
+ if not is_hf:
+ # Start Keep-Alive service (only for non-HF environments)
+ asyncio.create_task(keep_alive.start())
+ # Start Warm-up service
+ asyncio.create_task(warm_scraper())
+ # Start Nitro Pre-fetch (Populates cache in background)
+ if hasattr(scraper, '_turbo_prefetch'):
+ asyncio.create_task(scraper._turbo_prefetch())
+ logger.info("🔄 Background services activated")
+ else:
+ logger.info("🤗 Running on Hugging Face - Lightweight mode enabled")
+ # Just warm up the scraper without heavy pre-fetching
+ asyncio.create_task(warm_scraper())
+
+
+# Enable CORS for frontend
+# NOTE(review): every origin, method and header is allowed AND credentials are
+# enabled. Confirm this is intended; listing the known frontend origins is the
+# safer configuration when cookies or auth headers are involved.
+app.add_middleware(
+ CORSMiddleware,
+ allow_origins=["*"],
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"],
+)
+# Compress responses of at least 1000 bytes
+app.add_middleware(GZipMiddleware, minimum_size=1000)
+
+@app.get("/")
+async def root():
+ return {
+ "status": "online",
+ "engine": "Nitro-Power Larooza Engine",
+ "engine_status": "WARM" if scraper._cookies_synced else "COLD",
+ "cached_keys": list(cache._cache.keys())
+ }
+
+@app.get("/latest")
+async def get_latest(page: int = 1):
+ cache_key = f"latest_{page}"
+ cached = cache.get(cache_key)
+ if cached:
+ return cached
+
+ try:
+ items = await scraper.fetch_home(page=page)
+ if items:
+ cache.set(cache_key, items)
+ return items
+ except Exception as e:
+ logger.error(f"Error fetching latest: {e}")
+ raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/category/{cat_id}")
+async def get_category(cat_id: str, page: int = 1):
+ cache_key = f"cat_{cat_id}_{page}"
+ cached = cache.get(cache_key)
+ if cached:
+ return cached
+
+ try:
+ items = await scraper.fetch_category(cat_id, page=page)
+ if items:
+ cache.set(cache_key, items)
+ return items
+ except Exception as e:
+ logger.error(f"Error fetching category {cat_id}: {e}")
+ raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/search")
+async def search(q: str):
+ cache_key = f"search_{q}"
+ cached = cache.get(cache_key)
+ if cached:
+ return cached
+
+ try:
+ items = await scraper.search(q)
+ if items:
+ cache.set(cache_key, items, ttl_seconds=3600) # Search results cache longer
+ return items
+ except Exception as e:
+ logger.error(f"Error searching for {q}: {e}")
+ raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/details/{safe_id}")
+async def get_details(safe_id: str):
+ cache_key = f"details_{safe_id}"
+ cached = cache.get(cache_key)
+ if cached:
+ return cached
+
+ try:
+ details = await scraper.fetch_details(safe_id)
+ if not details:
+ return JSONResponse(status_code=404, content={"error": "Content not found"})
+
+ cache.set(cache_key, details, ttl_seconds=86400) # Details cache for 24h
+ return details
+ except Exception as e:
+ logger.error(f"Error fetching details for {safe_id}: {e}")
+ raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/proxy/image")
+async def proxy_image(url: str):
+ if not url:
+ raise HTTPException(status_code=400, detail="URL is required")
+
+ url = unquote(url)
+
+ # --- Image Disk Cache ---
+ cache_dir = os.path.join(base_dir, "cache", "images")
+ os.makedirs(cache_dir, exist_ok=True)
+
+ # Generate simple hash for filename
+ import hashlib
+ url_hash = hashlib.md5(url.encode()).hexdigest()
+ cache_path = os.path.join(cache_dir, f"{url_hash}.img")
+
+ # 1. Check if cached
+ if os.path.exists(cache_path):
+ # Check cache age (optional - 1 week)
+ if time.time() - os.path.getmtime(cache_path) < 604800:
+ return FileResponse(
+ cache_path,
+ media_type="image/jpeg", # Approximate, browser will handle
+ headers={"Cache-Control": "public, max-age=31536000"}
+ )
+
+ try:
+ # Using follow_redirects and a longer timeout for images
+ async with httpx.AsyncClient(timeout=20.0, follow_redirects=True) as client:
+ resp = await client.get(url, headers={"User-Agent": scraper.headers["User-Agent"]})
+ if resp.status_code == 200:
+ # Save to cache
+ content = resp.content
+ with open(cache_path, "wb") as f:
+ f.write(content)
+
+ # Return the image stream directly
+ return StreamingResponse(
+ io.BytesIO(content),
+ media_type=resp.headers.get("Content-Type", "image/jpeg"),
+ headers={"Cache-Control": "public, max-age=31536000"}
+ )
+ else:
+ logger.warning(f"Failed to proxy image {url} (Status: {resp.status_code})")
+ return JSONResponse(status_code=resp.status_code, content={"error": f"Failed (Status {resp.status_code})"})
+ except httpx.TimeoutException:
+ logger.warning(f"Timeout proxying image: {url}")
+ return JSONResponse(status_code=504, content={"error": "Image timeout"})
+ except Exception as e:
+ logger.error(f"Proxy image error for {url}: {type(e).__name__} - {str(e)}")
+ return JSONResponse(status_code=500, content={"error": str(e)})
+
+@app.get("/download/info")
+async def get_download_info(url: str):
+ try:
+ info = await downloader.get_info(url)
+ return info
+ except Exception as e:
+ logger.error(f"Download info error for {url}: {e}")
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+
+@app.get("/download/file")
+async def download_file(url: str, filename: str = "video.mp4"):
+ """Handles file downloads, proxying if necessary to bypass IP blocks or hotlink protection."""
+ if not url:
+ raise HTTPException(status_code=400, detail="URL is required")
+
+ url = unquote(url)
+
+ # Domains that REQUIRE proxying (IP-bound or strict hotlink protection)
+ proxy_domains = [
+ "googlevideo.com",
+ "manifest.googlevideo.com",
+ "larozavideo.net",
+ "larooza.site",
+ "larooza.mom",
+ "laroza-tv.net",
+ "youtube.com",
+ "youtu.be"
+ ]
+
+ should_proxy = any(domain in url for domain in proxy_domains)
+
+ if should_proxy:
+ logger.info(f"🛡️ Proxying download: {filename[:50]}...")
+
+ # Clean filename for the ASCII part of Content-Disposition
+ # Remove non-ASCII characters for the fallback filename
+ ascii_filename = re.sub(r'[^\x00-\x7F]+', '_', filename)
+ encoded_filename = quote(filename)
+
+ async def stream_generator():
+ async with httpx.AsyncClient(timeout=None, follow_redirects=True) as client:
+ try:
+ async with client.stream("GET", url, headers={"User-Agent": scraper.headers["User-Agent"]}) as resp:
+ if resp.status_code != 200:
+ logger.error(f"Proxy source returned {resp.status_code}")
+ return
+
+ # We can't easily set Content-Length here because StreamingResponse
+ # starts before we have all chunks, but we can set it in the outer response
+ async for chunk in resp.aiter_bytes(chunk_size=1024*1024):
+ yield chunk
+ except Exception as e:
+ logger.error(f"Streaming error: {e}")
+
+ # Get initial headers to find content length/type if possible
+ try:
+ async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
+ head_resp = await client.head(url, headers={"User-Agent": scraper.headers["User-Agent"]})
+ content_length = head_resp.headers.get("Content-Length")
+ content_type = head_resp.headers.get("Content-Type", "video/mp4")
+ except:
+ content_length = None
+ content_type = "video/mp4"
+
+ headers = {
+ "Content-Disposition": f"attachment; filename=\"{ascii_filename}\"; filename*=UTF-8''{encoded_filename}",
+ "Access-Control-Expose-Headers": "Content-Disposition"
+ }
+ if content_length:
+ headers["Content-Length"] = content_length
+
+ return StreamingResponse(stream_generator(), media_type=content_type, headers=headers)
+
+ # For other sources, a simple redirect is much faster and saves server bandwidth
+ return RedirectResponse(url=url)
+
+@app.get("/health")
+async def health():
+ # Check FlareSolverr
+ fs_status = "OFFLINE"
+ try:
+ # Increase timeout as solver might be busy
+ async with httpx.AsyncClient(timeout=5.0) as client:
+ resp = await client.get("http://localhost:8191/health")
+ if resp.status_code == 200:
+ fs_status = "ONLINE"
+ except:
+ pass
+
+ return {
+ "backend": "ONLINE",
+ "flaresolverr": fs_status,
+ "scraper_sync": scraper._cookies_synced,
+ "timestamp": time.time()
+ }
+
+# --- Frontend Mounting ---
+# This ensures that our React app is served directly by FastAPI in production
+# Check both relative and same-level structures for Docker/Local compatibility
+base_dir = os.path.dirname(__file__)
+frontend_path = os.path.join(base_dir, "meih-netflix-clone", "dist")
+
+if not os.path.exists(frontend_path):
+ # Try one level up (local dev structure)
+ frontend_path = os.path.join(base_dir, "..", "meih-netflix-clone", "dist")
+
+if os.path.exists(frontend_path):
+ # Assets are usually in dist/assets and referenced as /assets/ in Vite
+ assets_path = os.path.join(frontend_path, "assets")
+ if os.path.exists(assets_path):
+ app.mount("/assets", StaticFiles(directory=assets_path), name="assets")
+
+ @app.get("/{full_path:path}")
+ async def serve_frontend(full_path: str):
+ # Prevent infinite recursion for API routes if someone hits a wrong URL
+ if full_path.startswith(("api/", "latest", "category/", "search", "details", "proxy", "download", "health")):
+ return JSONResponse(status_code=404, content={"error": "Not Found"})
+ # If the path starts with api/ or other backend routes, it should have been caught above
+ # Otherwise, serve the main index.html for React Router to handle
+ file_path = os.path.join(frontend_path, full_path)
+ if os.path.exists(file_path) and os.path.isfile(file_path):
+ return FileResponse(file_path)
+ return FileResponse(os.path.join(frontend_path, "index.html"))
+else:
+ logger.warning(f"Frontend dist folder not found at {frontend_path}. Frontend serving disabled.")
+
+if __name__ == "__main__":
+ import uvicorn
+ # Use port 7860 for Hugging Face Spaces compatibility
+ uvicorn.run(app, host="0.0.0.0", port=7860)
diff --git a/package.json b/package.json
new file mode 100644
index 0000000000000000000000000000000000000000..a6e32ef56497088a4f5a24171819ab08b1b162a5
--- /dev/null
+++ b/package.json
@@ -0,0 +1,12 @@
+{
+ "name": "meih-movies-api",
+ "version": "1.0.0",
+ "description": "Nitro-powered movie scraping API",
+ "main": "main.py",
+ "scripts": {
+ "start": "bash start.sh"
+ },
+ "engines": {
+ "node": ">=18.x"
+ }
+}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..31f359f5266d83b6ec834b1ec2a9abab8b8be5c8
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,14 @@
+fastapi
+uvicorn
+httpx[http2]
+beautifulsoup4
+curl-cffi
+yt-dlp
+pydantic
+python-multipart
+aiohttp
+aiosqlite
+certifi
+websockets
+packaging
+setuptools
diff --git a/scraper/engine.py b/scraper/engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..95586264495a9920bdcc4fe1861fa268a3551e3e
--- /dev/null
+++ b/scraper/engine.py
@@ -0,0 +1,996 @@
+import asyncio
+import base64
+import logging
+import os
+import random
+import re
+import time
+from typing import Dict, List, Optional
+from urllib.parse import quote, urljoin, urlparse
+
+import httpx
+from bs4 import BeautifulSoup
+from curl_cffi.requests import AsyncSession
+
+from scraper.proxy_fetcher import proxy_fetcher
+# Clean, strictly used logger.
+# Configured BEFORE the optional imports so the ImportError branch below can
+# log; referencing `logger` there before this assignment raised NameError and
+# made the whole module unimportable when Selenium was missing.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("scraper")
+
+# Optional dependencies for heavy bypasses
+try:
+    import undetected_chromedriver as uc
+    from selenium.webdriver.common.by import By
+    from selenium.webdriver.support.ui import WebDriverWait
+    from selenium.webdriver.support import expected_conditions as EC
+    HAS_SELENIUM = True
+except ImportError:
+    HAS_SELENIUM = False
+    logger.warning("⚠️ Selenium/Undetected-Chromedriver not installed. Nuclear bypass will be disabled.")
+
+class LaroozaScraper:
+    """Async scraper for the Larooza video site and its mirror domains.
+
+    Pages are fetched with curl-cffi first and fall back to FlareSolverr and
+    undetected-chromedriver; listing pages are parsed into item dictionaries.
+    """
+
+    # Known mirror domains. The first entry is used as the primary domain when
+    # a mirror answers 404.
+    MIRRORS = ["https://q.larozavideo.net", "https://larooza.mom", "https://larooza.site", "https://m.laroza-tv.net"]
+    # Class-level defaults; _optimize_connection overrides both per instance
+    # once a faster mirror is found.
+    BASE_URL = "https://q.larozavideo.net"
+    TARGET_URL = "https://q.larozavideo.net/newvideos1.php"
+    # NOTE(review): mutable class attribute shared by all instances; no use of
+    # it is visible in this part of the class.
+    _blacklisted_mirrors = {}
+
+    # Permanent Aliases -> Keywords search
+    # Each alias maps to substrings looked for in homepage link text when the
+    # category map is discovered; the first alias whose keyword matches wins.
+    # NOTE(review): some keywords (e.g. "أجنبي") appear under more than one
+    # alias, so a link may be claimed by whichever alias is listed first.
+    CATEGORY_KEYWORDS = {
+        "arabic-movies": ["أفلام عربية", "افلام عربية", "افلام عربي", "arabic-movies33"],
+        "english-movies": ["افلام اجنبية", "أفلام أجنبية", "افلام اجنبي", "أجنبي", "all_movies_13"],
+        "indian-movies": ["افلام هندي", "أفلام هندية", "هندي", "indian-movies9"],
+        "anime-movies": ["افلام انمي", "أفلام أنمي", "انمي", "anime-movies-7"],
+        "dubbed-movies": ["افلام مدبلجة", "أفلام مدبلجة", "مدبلج", "7-aflammdblgh"],
+        "turkish-series": ["مسلسلات تركية", "تركي", "turkish-3isk-seriess47"],
+        "arabic-series": ["مسلسلات عربية", "عربي", "arabic-series46"],
+        "english-series": ["مسلسلات اجنبية", "أجنبي", "english-series10"],
+        "ramadan-2025": ["رمضان 2025", "13-ramadan-2025"],
+        "ramadan-2024": ["رمضان 2024", "28-ramadan-2024"],
+        "ramadan-2023": ["رمضان 2023", "10-ramadan-2023"],
+        "asian-movies": ["آسيوي", "اسيوي", "آسيوية", "6-asian-movies"],
+        "asian-series": ["مسلسلات اسياوية", "اسياوية", "6-asya"],
+        "turkish-movies": ["افلام تركية", "أفلام تركية", "8-aflam3isk"],
+        "anime-series": ["مسلسلات انمي", "كرتون", "6-anime-series"],
+        "indian-series": ["مسلسلات هندية", "11indian-series"],
+        "tv-programs": ["برامج تلفزيون", "tv-programs12"],
+        "plays": ["مسرحيات", "masrh-5"]
+    }
+
+    # Manual Fallbacks for reliability
+    # Alias -> site category id, used when dynamic discovery has no entry for
+    # the alias.
+    HARDCODED_FALLBACKS = {
+        "arabic-movies": "arabic-movies33",
+        "english-movies": "all_movies_13",
+        "indian-movies": "indian-movies9",
+        "asian-movies": "6-asian-movies",
+        "anime-movies": "anime-movies-7",
+        "dubbed-movies": "7-aflammdblgh",
+        "turkish-movies": "8-aflam3isk",
+        "arabic-series": "arabic-series46",
+        "ramadan-2025": "13-ramadan-2025",
+        "ramadan-2024": "28-ramadan-2024",
+        "ramadan-2023": "10-ramadan-2023",
+        "english-series": "english-series10",
+        "turkish-series": "turkish-3isk-seriess47",
+        "indian-series": "11indian-series",
+        "tv-programs": "tv-programs12",
+        "plays": "masrh-5",
+        "anime-series": "6-anime-series",
+        "asian-series": "6-asya"
+    }
+
+    def __init__(self):
+        """Set up the HTTP session, locks, proxy pools and caches.
+
+        No network traffic happens here; mirror probing is deferred to the
+        first request because it needs a running event loop.
+        """
+        # --- HTTP session ---
+        # Primary fetcher: curl-cffi (fastest, TLS impersonation), using
+        # chrome120 with SSL verification disabled for maximum compatibility.
+        self.session = AsyncSession(impersonate="chrome120", timeout=30, verify=False)
+        self.headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+            "Accept-Language": "ar,en-US;q=0.9,en;q=0.8",
+            "Accept-Encoding": "gzip, deflate, br",
+            "Referer": "https://www.google.com/",
+            "Connection": "keep-alive",
+            "Sec-Fetch-Dest": "document",
+            "Sec-Fetch-Mode": "navigate",
+            "Sec-Fetch-Site": "cross-site",
+        }
+        self._httpx_client = None
+
+        # --- Session / solver state ---
+        self._cookies_synced = False
+        self._last_pw_solve = 0
+        self._ua_synced = None
+        self._chrome_version = None
+        self._session_initialized = False
+        self._session_warmed_at = 0
+        self._domain_detected = False
+        self._optimization_started = False
+        self._is_prefetching = False
+
+        # --- Concurrency guards ---
+        self._domain_lock = asyncio.Lock()
+        self._warming_lock = asyncio.Lock()
+        self._discovery_lock = asyncio.Lock()
+        self._uc_lock = asyncio.Lock()
+        self._solver_lock = asyncio.Lock()  # Guard against multiple solvers
+        self._semaphore = asyncio.Semaphore(5)  # Reduced from 15 for stability
+
+        # --- Hybrid configuration ---
+        self.REMOTE_SOLVER_URL = "https://meih-movies-api.onrender.com/remote-fetch"
+        self.IS_RENDER = os.environ.get("RENDER") is not None
+        self.IS_HUGGINGFACE = os.environ.get("SPACE_ID") is not None
+
+        # --- Proxy pools ---
+        # Free proxy pool for cloud deployments (to bypass IP bans)
+        self._free_proxy_pool = []
+        self._proxy_pool_last_refresh = 0
+        # Pool filled through proxy_fetcher, refreshed every 30 minutes
+        self._free_proxies = []
+        self._proxy_refresh_interval = 1800
+        self._proxy_refresh_time = 0
+        # Statically configured proxies from the PROXY_LIST env var (comma separated)
+        raw_proxy_list = os.getenv("PROXY_LIST", "")
+        self.proxies = [entry for entry in map(str.strip, raw_proxy_list.split(",")) if entry]
+        self._current_proxy_idx = 0
+        if self.proxies:
+            logger.info(f"✓ Proxy rotation enabled with {len(self.proxies)} endpoints")
+
+        # --- Category discovery ---
+        self._category_map = {}
+        self._last_discovery = 0
+
+        # --- Page cache ---
+        self._cache = {}  # {url: (timestamp, data)}
+        self._cache_ttl = 3600  # 1 hour for data
+
+        # We'll start optimization on the first request to avoid "no running loop" error
+
+    async def _optimize_connection(self):
+        """Probe every mirror concurrently and switch to the fastest one.
+
+        Does nothing when a probe finished less than an hour ago. On success
+        BASE_URL and TARGET_URL are overridden on this instance; when no
+        mirror answers with HTTP 200 the defaults are kept and the timestamp
+        is back-dated so the next probe happens sooner.
+        """
+        # 1. Check if we already have a reasonably fresh fastest mirror
+        now = time.time()
+        last_probe = getattr(self, '_fastest_mirror_detected_at', None)
+        if last_probe is not None and now - last_probe < 3600:
+            return
+
+        logger.info("🔍 Testing mirror speeds (Optimized)...")
+
+        async def test_mirror(mirror):
+            """Return (elapsed_seconds, mirror); 999 marks a failed probe."""
+            try:
+                # very aggressive timeout for discovery
+                start = time.time()
+                test_url = f"{mirror}/newvideos1.php"
+                async with httpx.AsyncClient(timeout=1.5, follow_redirects=True, verify=False) as client:
+                    resp = await client.get(test_url)
+                    if resp.status_code == 200:
+                        return (time.time() - start, mirror)
+            except Exception:
+                # Best-effort probe: any network error just disqualifies the
+                # mirror. `Exception` (not a bare except) lets task
+                # cancellation propagate.
+                pass
+            return (999, mirror)
+
+        results = await asyncio.gather(*(test_mirror(m) for m in self.MIRRORS))
+        min_time, fastest_mirror = min(results)
+
+        if min_time < 999:
+            logger.info(f"⚡ Fastest mirror: {fastest_mirror} ({min_time:.2f}s)")
+            self.BASE_URL = fastest_mirror
+            self.TARGET_URL = f"{fastest_mirror}/newvideos1.php"
+            self._fastest_mirror_detected_at = now
+        else:
+            logger.warning("⚠️ No mirrors responded quickly, using default.")
+            self._fastest_mirror_detected_at = now - 3300  # Retry sooner
+
+
+    async def _refresh_public_proxy_pool(self):
+        """Refill ``_free_proxy_pool`` from public proxy-list APIs.
+
+        Only runs on Hugging Face or Render, and at most once every five
+        minutes. The entries are taken as-is from the lists (they are not
+        checked for liveness here).
+
+        This logic used to live in a first ``_refresh_free_proxies``
+        definition that a later definition of the same name silently
+        replaced, so the pool was never filled.
+        """
+        if not (self.IS_HUGGINGFACE or self.IS_RENDER):
+            return
+
+        now = time.time()
+        if now - self._proxy_pool_last_refresh < 300:  # Refresh every 5 minutes
+            return
+
+        logger.info("🔄 Refreshing free proxy pool...")
+        self._proxy_pool_last_refresh = now
+
+        proxy_sources = [
+            "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all",
+            "https://www.proxy-list.download/api/v1/get?type=http",
+        ]
+
+        new_proxies = []
+        for source in proxy_sources:
+            try:
+                async with httpx.AsyncClient(timeout=10.0) as client:
+                    resp = await client.get(source)
+                    if resp.status_code == 200:
+                        proxies = resp.text.strip().split('\n')
+                        for proxy in proxies[:10]:  # Take first 10 from each source
+                            proxy = proxy.strip()
+                            if proxy and ':' in proxy:
+                                new_proxies.append(f"http://{proxy}")
+            except Exception as e:
+                logger.warning(f"Failed to fetch proxies from {source}: {e}")
+
+        if new_proxies:
+            self._free_proxy_pool = new_proxies
+            logger.info(f"✅ Loaded {len(new_proxies)} free proxies")
+        else:
+            logger.warning("⚠️ No free proxies available")
+
+    async def _refresh_free_proxies(self):
+        """Refresh every free-proxy pool that is due.
+
+        Covers the public-API pool (cloud deployments only) and the
+        ``proxy_fetcher`` pool, which is reloaded once
+        ``_proxy_refresh_interval`` seconds have passed.
+        """
+        await self._refresh_public_proxy_pool()
+
+        if time.time() - self._proxy_refresh_time > self._proxy_refresh_interval:
+            logger.info("Refreshing free proxy pool...")
+            self._free_proxies = await proxy_fetcher.get_working_proxies(max_count=15)
+            self._proxy_refresh_time = time.time()
+            logger.info(f"Loaded {len(self._free_proxies)} working free proxies")
+
+    async def _discover_categories(self, force=False):
+        """Build the alias -> category-id map from links on the homepage.
+
+        Args:
+            force: Rebuild even if the last successful discovery is less than
+                an hour old.
+
+        The map is only replaced (and the timestamp only advanced) when at
+        least one alias was matched; a failed fetch leaves the old map.
+        """
+        async with self._discovery_lock:
+            if not force and time.time() - self._last_discovery < 3600:  # Cache for 1 hour
+                return
+
+            logger.info("Refreshing category mapping...")
+            html = await self._get_html(self.BASE_URL)
+            if not html:
+                return
+
+            soup = BeautifulSoup(html, 'html.parser')
+            new_map = {}
+
+            # Find all category links
+            for a in soup.find_all('a', href=True):
+                href = a['href']
+                if 'cat=' not in href:
+                    continue
+
+                cat_id = href.split('cat=')[-1].split('&')[0]
+                text = a.get_text(strip=True).lower()
+
+                # Match the link text against keywords; the first link that
+                # matches an alias keeps it.
+                for alias, keywords in self.CATEGORY_KEYWORDS.items():
+                    if alias not in new_map and any(k in text for k in keywords):
+                        new_map[alias] = cat_id
+
+            if new_map:
+                self._category_map = new_map
+                self._last_discovery = time.time()
+                logger.info(f"✓ Mapped {len(new_map)} categories: {new_map}")
+
+    async def _resolve_cat_id(self, cat_id: str) -> str:
+        """Resolve an alias to a real category id.
+
+        Lookup order: the dynamically discovered map, then the hardcoded
+        fallbacks; anything else is returned unchanged.
+        """
+        await self._discover_categories()
+        # 1. Check dynamic map
+        if cat_id in self._category_map:
+            return self._category_map[cat_id]
+
+        # 2. Check hardcoded fallbacks if dynamic failed
+        if cat_id in self.HARDCODED_FALLBACKS:
+            return self.HARDCODED_FALLBACKS[cat_id]
+
+        return cat_id
+
+    async def _warm_session(self):
+        """Mark the target domain and session as initialized (no I/O)."""
+        if not self._domain_detected:
+            # We already set defaults in __init__ / class, just confirm
+            logger.info(f"🚀 Targeting exclusive source: {self.TARGET_URL}")
+            self._domain_detected = True
+
+        if not self._session_initialized:
+            self._session_initialized = True  # Mark as init even if basic get fails, as PW will solve it
+
+    def _get_proxy(self) -> Optional[str]:
+        """Return the next proxy URL in round-robin order, or None.
+
+        Pools are consulted in priority order and the first non-empty one is
+        used: the public free-proxy pool (cloud platforms only), the
+        proxy_fetcher pool, then the statically configured proxies. One
+        shared counter drives the rotation for all pools.
+        """
+        on_cloud = self.IS_HUGGINGFACE or self.IS_RENDER
+        pools_by_priority = (
+            self._free_proxy_pool if on_cloud else None,
+            self._free_proxies,
+            self.proxies,
+        )
+        for pool in pools_by_priority:
+            if pool:
+                chosen = pool[self._current_proxy_idx % len(pool)]
+                self._current_proxy_idx += 1
+                return chosen
+        return None
+
+
+    async def _get_html_with_undetected_chrome(self, url: str) -> Optional[str]:
+        """Fetch a page with a headless undetected-chromedriver browser.
+
+        The 'NUCLEAR option': slow last-resort fallback. Only one browser runs
+        at a time (guarded by ``_uc_lock``) and the blocking Selenium work is
+        pushed to the default executor.
+
+        Args:
+            url: Page to load.
+
+        Returns:
+            The page source after a fixed 10 second wait, or None when
+            Selenium is not installed or the browser fails.
+        """
+        if not HAS_SELENIUM:
+            logger.error("❌ Cannot use UC: Selenium/Undetected-Chromedriver not installed.")
+            return None
+
+        async with self._uc_lock:
+            logger.info(f"💣 Launching Undetected-Chrome NUCLEAR Bypass for {url}...")
+
+            def get_chrome_version():
+                """Read Chrome's major version from the Windows registry (120 if unavailable)."""
+                try:
+                    import winreg
+                    key = winreg.OpenKey(winreg.HKEY_CURRENT_USER, r'Software\Google\Chrome\BLBeacon')
+                    version, _ = winreg.QueryValueEx(key, 'version')
+                    return int(version.split('.')[0])
+                except Exception:
+                    # winreg is missing off Windows, or the key/value is absent
+                    return 120  # Fallback
+
+            if not self._chrome_version:
+                self._chrome_version = get_chrome_version()
+
+            def chrome_task():
+                """Blocking browser session; runs in a worker thread."""
+                driver = None
+                try:
+                    options = uc.ChromeOptions()
+                    options.add_argument('--headless')
+                    options.add_argument('--no-sandbox')
+                    options.add_argument('--disable-dev-shm-usage')
+                    options.add_argument('--disable-gpu')
+                    options.add_argument('--window-size=1280,1024')
+                    options.add_argument('--mute-audio')
+                    options.add_argument('--disable-notifications')
+                    options.add_argument('--disable-popup-blocking')
+                    options.add_argument('--hide-scrollbars')
+                    options.add_argument('--disable-logging')
+                    options.add_argument('--log-level=3')
+                    options.add_argument('--no-first-run')
+                    options.add_argument('--no-default-browser-check')
+                    options.add_argument('--no-pings')
+                    options.add_argument('--disable-blink-features=AutomationControlled')
+
+                    # Disable images for maximum speed
+                    prefs = {
+                        'profile.managed_default_content_settings.images': 2,
+                        'profile.default_content_settings.images': 2
+                    }
+                    options.add_experimental_option('prefs', prefs)
+
+                    driver = uc.Chrome(options=options, version_main=self._chrome_version)
+                    driver.set_page_load_timeout(60)
+
+                    logger.info(f"💣 UC Fetching: {url}")
+                    driver.get(url)
+
+                    # Wait for either content or challenge
+                    time.sleep(10)  # Heavy sleep for UC
+
+                    html = driver.page_source
+
+                    # Basic sync of UA so later curl-cffi requests present the same browser
+                    ua = driver.execute_script("return navigator.userAgent")
+                    if ua:
+                        self.headers["User-Agent"] = ua
+
+                    return html
+                except Exception as e:
+                    logger.error(f"Undetected-Chrome failure: {e}")
+                    return None
+                finally:
+                    if driver:
+                        try:
+                            driver.quit()
+                        except Exception:
+                            # Browser may already be gone; nothing left to clean up
+                            pass
+
+            loop = asyncio.get_running_loop()
+            return await loop.run_in_executor(None, chrome_task)
+
+    async def _get_html_with_flaresolverr(self, url: str) -> Optional[str]:
+        """Fetch a page through the local FlareSolverr service.
+
+        A single lock serializes solver requests to avoid browser bloat. On
+        success the solver's user agent and cookies are copied into this
+        scraper's session so later direct requests can reuse the clearance.
+
+        Args:
+            url: Page to solve.
+
+        Returns:
+            The solved HTML, a cached copy if another task stored one while
+            this call waited for the lock, or None after all attempts fail.
+        """
+        async with self._solver_lock:
+            # Re-check cache inside lock (entry is returned regardless of age)
+            if url in self._cache:
+                return self._cache[url][1]
+
+            logger.info(f"✨ Requesting FlareSolverr solve for {url}...")
+
+            flaresolverr_url = "http://localhost:8191/v1"
+            payload = {
+                "cmd": "request.get",
+                "url": url,
+                "maxTimeout": 60000
+            }
+
+            # Connection Retry Loop
+            max_conn_retries = 5  # Increased retries
+            for conn_attempt in range(max_conn_retries):
+                try:
+                    async with httpx.AsyncClient(timeout=90.0) as client:
+                        response = await client.post(flaresolverr_url, json=payload)
+                        if response.status_code == 200:
+                            data = response.json()
+                            if data.get('status') == 'ok':
+                                solution = data.get('solution', {})
+                                html = solution.get('response', '')
+                                self._sync_solver_session(solution, url)
+                                return html
+                            else:
+                                logger.warning(f"FlareSolverr error: {data.get('message')}")
+                        else:
+                            logger.warning(f"FlareSolverr returned status {response.status_code}")
+                except Exception as e:
+                    if conn_attempt < max_conn_retries - 1:
+                        logger.warning(f"FlareSolverr comm failed (attempt {conn_attempt+1}/{max_conn_retries}): {e}. Retrying...")
+                        await asyncio.sleep(2)
+                    else:
+                        logger.error(f"FlareSolverr comm failed after {max_conn_retries} attempts: {e}")
+            return None
+
+    def _sync_solver_session(self, solution: Dict, url: str) -> None:
+        """Copy the solver's user agent and cookies into this scraper's session.
+
+        Cookie problems are logged and skipped so that a page the solver
+        already returned is never thrown away because one cookie was bad.
+        """
+        ua = solution.get('userAgent', '')
+        if ua:
+            self._ua_synced = ua
+            self.headers["User-Agent"] = ua
+
+        # Cookies without a domain fall back to the requested host (minus "www.")
+        fallback_domain = urlparse(url).netloc if url else ''
+        if fallback_domain.startswith('www.'):
+            fallback_domain = fallback_domain[4:]
+
+        for cookie in solution.get('cookies') or []:
+            domain = cookie.get('domain') or fallback_domain
+            if not domain:
+                continue
+            try:
+                # `expires` is deliberately not forwarded: curl_cffi's
+                # Cookies.set() takes name/value/domain/path/secure, and the
+                # extra keyword made every successful solve fail with
+                # TypeError. NOTE(review): confirm against the installed
+                # curl_cffi version.
+                self.session.cookies.set(
+                    cookie['name'],
+                    cookie['value'],
+                    domain=domain,
+                    path=cookie.get('path', '/'),
+                    secure=cookie.get('secure', False),
+                )
+            except Exception as e:
+                logger.warning(f"Skipping unusable solver cookie: {e}")
+
+        self._cookies_synced = True
+        self._last_pw_solve = time.time()
+        logger.info("✅ Session Synced!")
+
+    async def _turbo_prefetch(self):
+        """Warm the page cache by fetching the home page and key categories at once.
+
+        Re-entrant calls return immediately while a prefetch is running.
+        Individual fetch failures are collected by gather rather than raised.
+        """
+        if self._is_prefetching:
+            return
+
+        self._is_prefetching = True
+        logger.info("🚀 NITRO MODE: Starting concurrent background pre-fetch...")
+
+        try:
+            # First 15 category aliases, in declaration order
+            warm_aliases = list(self.CATEGORY_KEYWORDS)[:15]
+            jobs = [self.fetch_home(page=1)]
+            jobs.extend(self.fetch_category(alias, page=1) for alias in warm_aliases)
+
+            # Everything runs in parallel; the fetch semaphore limits concurrency
+            await asyncio.gather(*jobs, return_exceptions=True)
+            logger.info(f"⚡ NITRO MODE complete! Cache primed with {len(self._cache)} items.")
+        except Exception as e:
+            logger.error(f"Nitro pre-fetch failed: {e}")
+        finally:
+            self._is_prefetching = False
+
+    async def _get_html(self, url: str, max_retries: int = 1, follow_meta=True) -> Optional[str]:
+        """Fetch a page, trying the fast curl-cffi path before the solvers.
+
+        Args:
+            url: Page to fetch.
+            max_retries: Number of FlareSolverr attempts; the
+                undetected-chrome fallback runs after the last one.
+            follow_meta: Whether meta-refresh / landing-page redirects in the
+                response are followed (one hop).
+
+        Returns:
+            The page HTML (possibly from the one-hour cache) or None.
+        """
+        if not self._optimization_started:
+            self._optimization_started = True
+            # Keep a reference: the event loop only holds weak references to
+            # tasks, so an unreferenced task can be garbage-collected mid-run.
+            self._optimization_task = asyncio.create_task(self._optimize_connection())
+
+        # The semaphore is taken exactly once per top-level fetch. Redirect and
+        # fallback hops go through _get_html_unthrottled directly: re-entering
+        # this method while holding a permit could deadlock once all permits
+        # were held by tasks waiting for a second one.
+        async with self._semaphore:
+            return await self._get_html_unthrottled(url, max_retries, follow_meta)
+
+    async def _get_html_unthrottled(self, url: str, max_retries: int, follow_meta: bool) -> Optional[str]:
+        """Fetch logic behind _get_html; the caller must already hold the semaphore."""
+        now = time.time()
+
+        # 0. Cache Check
+        if url in self._cache:
+            ts, data = self._cache[url]
+            if now - ts < self._cache_ttl:
+                return data
+
+        # Sanitize URL - Skip landing pages
+        if any(x in url for x in ["/gaza.20", "/gaza.18", "/gaza.22"]):
+            logger.info(f"Sanitizing landing page URL: {url} -> {self.TARGET_URL}")
+            url = self.TARGET_URL
+
+        # Refresh free proxies if on cloud platforms
+        if self.IS_HUGGINGFACE or self.IS_RENDER:
+            await self._refresh_free_proxies()
+
+        proxy = self._get_proxy()
+        proxy_dict = {"http": proxy, "https": proxy} if proxy else None
+
+        # 1. Nitro Path (curl-cffi)
+        logger.info(f"🚀 Nitro Path (curl-cffi) for {url}")
+        try:
+            # Increased timeout to 45s to handle extremely slow responses
+            resp = await self.session.get(url, headers=self.headers, timeout=45, proxies=proxy_dict)
+            status_code = resp.status_code
+            logger.info(f"📡 Nitro Path response: {status_code} ({len(resp.content)} bytes)")
+
+            if status_code == 200:
+                text = resp.text
+                # Improve Meta Refresh detection (Larooza uses this heavily for domain rotation)
+                refresh_match = re.search(r'http-equiv=["\']refresh["\'].*?content=["\']\d+;\s*url=(.*?)["\']', text, re.I)
+                if not refresh_match:
+                    refresh_match = re.search(r'content=["\']\d+;\s*url=(.*?)["\']', text, re.I)
+
+                if refresh_match and follow_meta:
+                    new_url_raw = refresh_match.group(1).strip("'\" ")
+                    new_url = urljoin(url, new_url_raw)
+
+                    # Preserve query parameters if the new URL doesn't have them but the old one did
+                    if "?" not in new_url and "?" in url:
+                        query = url.split("?")[-1]
+                        new_url = f"{new_url}?{query}" if not new_url.endswith("?") else f"{new_url}{query}"
+
+                    # If redirecting to a known landing page or ad-trap, skip it
+                    if any(x in new_url for x in ["gaza.20", "gaza.18", "gaza.22", "gaza.24"]):
+                        logger.info(f"🚫 Skipping ad-trap redirect: {new_url}")
+                        new_url = self.TARGET_URL
+
+                    logger.info(f"🔄 Following meta refresh to: {new_url}")
+                    # follow_meta=False limits redirect chasing to a single hop
+                    return await self._get_html_unthrottled(new_url, max_retries, False)
+
+                # More robust Cloudflare & Landing Page detection
+                text_lower = text.lower()
+                cf_markers = ["challenge-running", "cf-ray", "cloudflare-static", "just a moment", "verify you are human", "checking your browser"]
+                is_cf = any(x in text_lower for x in cf_markers) or "id=\"challenge-form\"" in text_lower
+
+                # Detect landing page even if 200 OK (gaza.20 redirect in JS or Meta)
+                is_landing = "gaza.20" in text_lower or "gaza.18" in text_lower or "gaza.22" in text_lower
+
+                if is_cf:
+                    # Falls through to the solver path below
+                    logger.warning(f"⚠️ Cloudflare detected in Nitro response for {url}")
+                elif is_landing and follow_meta:
+                    logger.info(f"🔄 Landing page detected in content for {url}, forcing target...")
+                    return await self._get_html_unthrottled(self.TARGET_URL, max_retries, False)
+                else:
+                    self._cache[url] = (now, text)
+                    return text
+            elif status_code == 404:
+                logger.warning(f"⚠️ Nitro Path 404 for {url} on mirror {self.BASE_URL}")
+                # If this was a mirror, fallback to primary domain
+                primary_primary = self.MIRRORS[0]
+                if self.BASE_URL != primary_primary:
+                    fallback_url = url.replace(self.BASE_URL, primary_primary)
+                    logger.info(f"🔁 Falling back to primary domain: {fallback_url}")
+                    return await self._get_html_unthrottled(fallback_url, max_retries, True)
+            elif status_code == 403:
+                logger.warning(f"🚫 Nitro Path 403 for {url}, falling back to solvers...")
+        except Exception as e:
+            logger.error(f"❌ Nitro Path error for {url}: {e}")
+
+        # 2. Solver Path
+        for att in range(max_retries):
+            # Check cache again just in case another task filled it
+            # (the entry is returned regardless of its age)
+            if url in self._cache:
+                return self._cache[url][1]
+
+            html = await self._get_html_with_flaresolverr(url)
+            if html:
+                self._cache[url] = (now, html)
+                return html
+
+            # UC Fallback for critical pages (after the last solver attempt only)
+            if att == max_retries - 1:
+                logger.info(f"UC Fallback for: {url}")
+                res = await self._get_html_with_undetected_chrome(url)
+                if res:
+                    return res
+
+        return None
+
+    def _extract_items(self, soup: BeautifulSoup) -> List[Dict]:
+        """Extract video cards from a parsed listing page.
+
+        Args:
+            soup: Parsed listing, category or search page.
+
+        Returns:
+            One dict per unique video link with keys ``id`` (URL-safe base64
+            of the absolute video URL), ``title`` (cleaned), ``poster``
+            (backend image-proxy path), ``type`` ("series" or "movie") and
+            ``duration``. Empty for a missing soup or a challenge page.
+        """
+        items = []
+        if not soup:
+            return []
+
+        if soup.title:
+            logger.info(f"Extracting: {soup.title.string}")
+            if "challenge" in str(soup.title).lower() or "cloudflare" in str(soup.title).lower():
+                return []
+
+        # Ultra-Strong Coverage for all Larooza Variants & Mirrors
+        containers = soup.select('.thumbnail, .pm-li-video, .pm-video-thumb, .video-block, .movie-item, li.col-xs-6, .box, .video-box, .video-item, .post-item')
+        if not containers:
+            # Deep scan for any link that looks like a video
+            containers = soup.select('a[href*="video.php"], a[href*="watch.php"], .video-listing-content, .card-video')
+
+        seen_urls = set()
+        for tag in containers:
+            # 1. Fast Link Detection
+            link = tag if (tag.name == 'a' and 'video.php' in tag.get('href', '')) else \
+                (tag.select_one('a.ellipsis') or tag.find('a', href=lambda x: x and 'video.php' in x))
+
+            if not link:
+                continue
+            href = link.get('href')
+            if not href:
+                continue
+
+            full_link = urljoin(self.BASE_URL, href)
+            if full_link in seen_urls:
+                continue
+            seen_urls.add(full_link)
+
+            # 2. Extract Title & Clean it
+            title_node = tag.select_one('h3, h2, .title, .ellipsis, .video-title, p')
+            title = title_node.get_text(strip=True) if title_node else ""
+            if not title and link:
+                title = link.get('title') or link.get_text(strip=True)
+
+            # Keep the untouched title for type detection: cleaning below
+            # strips "مسلسل", which is one of the series markers.
+            raw_title = title
+
+            # Clean Title (Remove noisy tags for premium look)
+            for t_tag in ["مشاهدة", "فيلم", "مسلسل", "كامل", "HDCAM", "HD", "WEB-DL", "Cam", "مترجم", "اون لاين", "مدبلج"]:
+                title = title.replace(t_tag, "").strip()
+            title = re.sub(r'\d{4}', '', title).strip("- ").strip()  # Remove Year
+
+            # 3. Deep Image Probing
+            img_node = tag.select_one('img')
+            img_url = ""
+            if img_node:
+                # Try all possible lazy-load attributes, prefer potential real URLs over base64
+                candidates = [
+                    img_node.get('data-src'),
+                    img_node.get('data-lazy-src'),
+                    img_node.get('data-original'),
+                    img_node.get('srcset'),
+                    img_node.get('src')
+                ]
+                for c in candidates:
+                    if c and not c.startswith('data:'):
+                        # Ensure it's a real URL
+                        if c.startswith('http') or c.startswith('//') or c.startswith('/'):
+                            img_url = c
+                            break
+
+                # If still no image, try to find ANY attribute that looks like a URL
+                if not img_url:
+                    for attr, val in img_node.attrs.items():
+                        if isinstance(val, str) and (val.startswith('http') or '.jpg' in val or '.png' in val) and not val.startswith('data:'):
+                            img_url = val
+                            break
+
+                if img_url and "," in img_url:  # Handle srcset: keep the first candidate URL
+                    img_url = img_url.split(",")[0].split(" ")[0]
+
+            # Fallback: Check for background-image in style
+            if not img_url:
+                style = tag.get('style') or ""
+                if 'background-image' in style:
+                    m = re.search(r'url\([\'"]?(.*?)[\'"]?\)', style)
+                    if m:
+                        img_url = m.group(1)
+
+            if not img_url or img_url.startswith('data:'):
+                img_url = "https://placehold.co/600x400/000000/FFFFFF?text=No+Poster"
+
+            # Absolute URL correction
+            if img_url.startswith('//'):
+                img_url = 'https:' + img_url
+            elif img_url.startswith('/'):
+                img_url = self.BASE_URL + img_url
+
+            # Proxy through our backend for stability
+            poster = f"/proxy/image?url={quote(img_url)}"
+
+            # 4. Speed-optimized Series Detection (on the raw title, see above)
+            lt = raw_title.lower()
+            content_type = "series" if any(x in lt for x in ['حلقة', 'مسلسل', 'episode', 'season', 'series']) else "movie"
+
+            duration_node = tag.select_one('.duration, .pm-label-duration, .time')
+
+            items.append({
+                "id": base64.urlsafe_b64encode(full_link.encode()).decode(),
+                "title": title,
+                "poster": poster,
+                "type": content_type,
+                "duration": duration_node.get_text(strip=True) if duration_node else ""
+            })
+        return items
+
+    async def fetch_home(self, page: int = 1) -> List[Dict]:
+        """Return the items on the given page of the newest-videos listing.
+
+        An empty list is returned (and an error logged) when the page cannot
+        be fetched.
+        """
+        target = f"{self.TARGET_URL}?page={page}"
+        html = await self._get_html(target, max_retries=3)
+        if html:
+            parsed = BeautifulSoup(html, 'html.parser')
+            items = self._extract_items(parsed)
+            logger.info(f"Fetched {len(items)} items from {target}")
+            return items
+
+        logger.error(f"Failed to fetch home page: {target}")
+        return []
+
+    async def fetch_category(self, cat_id: str, page: int = 1) -> List[Dict]:
+        """Return the items on one page of a category.
+
+        ``cat_id`` may be an alias or a real site category id; an empty list
+        is returned when the page cannot be fetched.
+        """
+        resolved_id = await self._resolve_cat_id(cat_id)
+        target = f"{self.BASE_URL}/category.php?cat={resolved_id}&page={page}"
+        html = await self._get_html(target, max_retries=3)
+        if not html:
+            return []
+        return self._extract_items(BeautifulSoup(html, 'html.parser'))
+
+    def _normalize_number(self, text: str) -> int:
+        """Extract an episode number from Arabic/English text.
+
+        Args:
+            text: Episode label or title.
+
+        Returns:
+            The first run of digits if there is one, otherwise the value of
+            the longest Arabic ordinal word found (999 stands for "the last
+            one"), otherwise 0.
+        """
+        if not text:
+            return 0
+
+        # Arabic number words mapping
+        arabic_map = {
+            'الأولى': 1, 'الاولى': 1, 'الثانية': 2, 'الثالثة': 3, 'الرابعة': 4,
+            'الخامسة': 5, 'السادسة': 6, 'السابعة': 7, 'الثامنة': 8, 'التاسعة': 9,
+            'العاشرة': 10, 'الحادية': 11, 'الثانية عشر': 12, 'الثالثة عشر': 13,
+            'الرابعة عشر': 14, 'الخامسة عشر': 15, 'السادسة عشر': 16, 'السابعة عشر': 17,
+            'الثامنة عشر': 18, 'التاسعة عشر': 19, 'العشرون': 20, 'الاخيرة': 999
+        }
+
+        # Try to find numeric digits first (most reliable). Any keyword+digit
+        # pattern would also be caught here, so no separate patterns are needed.
+        match = re.search(r'(\d+)', text)
+        if match:
+            return int(match.group(1))
+
+        # Try Arabic number words, longest first: compound ordinals such as
+        # "الثانية عشر" (12) contain the simple ordinal "الثانية" (2) and must
+        # be tested before it.
+        text_lower = text.lower()
+        for arabic_word, num in sorted(arabic_map.items(), key=lambda pair: len(pair[0]), reverse=True):
+            if arabic_word in text_lower:
+                return num
+
+        return 0
+
+    def _safe_get_episode(self, text: str, name_hint: str = None) -> int:
+        """Pull an episode number out of a title, ignoring numbers in the series name.
+
+        Bracketed and parenthesised segments are dropped first, then
+        ``name_hint`` (the series name) is removed so digits inside it
+        (e.g. "2 قهوة") are not mistaken for the episode. A number following
+        an episode keyword wins over a bare number; number words are the last
+        resort.
+        """
+        stripped = re.sub(r'\[.*?\]', '', re.sub(r'\(.*?\)', '', text))
+        if name_hint:
+            stripped = stripped.replace(name_hint, "").strip()
+
+        attempts = (
+            # 1. Number right after an episode keyword (most reliable)
+            (r'(?:الحلقة|حلقة|ep|episode|part|p)\s*(\d+)', re.I),
+            # 2. Any run of digits (fallback)
+            (r'(\d+)', 0),
+        )
+        for pattern, flags in attempts:
+            found = re.search(pattern, stripped, flags)
+            if found:
+                return int(found.group(1))
+
+        # 3. Number words
+        return self._normalize_number(stripped)
+
+    async def search(self, query: str) -> List[Dict]:
+        """Run a keyword search on the site and return the matching items.
+
+        The query is URL-quoted; an empty list is returned when the results
+        page cannot be fetched.
+        """
+        url = f"{self.BASE_URL}/search.php?keywords={quote(query)}"
+        html = await self._get_html(url, max_retries=2)
+        if not html:
+            return []
+        return self._extract_items(BeautifulSoup(html, 'html.parser'))
+
+ async def fetch_details(self, safe_id: str) -> Dict:
+ try:
+ url = base64.urlsafe_b64decode(safe_id).decode()
+ except: return {}
+
+ html = await self._get_html(url)
+ if not html: return {}
+
+ soup = BeautifulSoup(html, 'html.parser')
+
+ # Follow play.php for watch servers
+ watch_html = html
+ watch_soup = soup
+ play_a = soup.select_one('a[href*="play.php"]')
+ if play_a:
+ p_url = urljoin(self.BASE_URL, play_a.get('href'))
+ p_html = await self._get_html(p_url)
+ if p_html:
+ watch_soup = BeautifulSoup(p_html, 'html.parser')
+ watch_html = p_html
+
+ title = soup.find('h1').get_text(strip=True) if soup.find('h1') else "Unknown"
+ is_series = bool(soup.select('.episodes-list, .season-episodes, .vid-episodes')) or any(x in title for x in ["حلقة", "مسلسل", "الموسم"])
+
+ raw_poster = soup.select_one('meta[property="og:image"]')['content'] if soup.select_one('meta[property="og:image"]') else ""
+ if not raw_poster:
+ img_tag = soup.select_one('.poster img, .movie-poster img, .pm-video-watch-main img')
+ if img_tag:
+ raw_poster = img_tag.get('src') or img_tag.get('data-src')
+
+ poster = ""
+ if raw_poster:
+ full_poster_url = urljoin(self.BASE_URL, raw_poster)
+ poster = f"/proxy/image?url={quote(full_poster_url)}"
+
+ response = {
+ "id": safe_id, "title": title,
+ "description": soup.select_one('.story, .desc, .entry-content').get_text(strip=True) if soup.select_one('.story, .desc, .entry-content') else "",
+ "poster": poster,
+ "type": "series" if is_series else "movie",
+ "seasons": [], "episodes": [], "servers": [], "download_links": []
+ }
+
+ # --- Episodes ---
+ if is_series:
+ unique_eps = {}
+
+ # 1. Proactive Search: Look for a "Series Category" link
+ cat_link = None
+
+ # A. Check Breadcrumbs (Very reliable for series category)
+ for bc in soup.select('.breadcrumb a, .bread-crumb a, .breadcrumbs a, .pm-breadcrumb a'):
+ href = bc.get('href')
+ if href and ('cat=' in href or 'ser=' in href):
+ # Skip generic high-level categories if possible?
+ # Actually, we filter by title later, so it's okay.
+ cat_link = urljoin(self.BASE_URL, href)
+ if 'ser=' in href: # Prefer ser= over cat=
+ break
+
+ # Extract clean series name for filtering
+ clean_title = title.replace("مسلسل", "").strip()
+ # Try to get name before "الحلقة" or "المواسم"
+ series_name = re.split(r'الحلقة|الموسم|حلقة|season|episode', clean_title, flags=re.I)[0].strip()
+ # Arabic numeral support for filtering
+ series_name_alt = series_name.replace('0','٠').replace('1','١').replace('2','٢').replace('3','٣').replace('4','٤').replace('5','٥').replace('6','٦').replace('7','٧').replace('8','٨').replace('9','٩')
+
+ logger.info(f"Targeting series name: {series_name} (Alt: {series_name_alt})")
+
+ # B. Check if Title itself is a link to the category or series
+ if not cat_link:
+ title_link = soup.select_one('h1 a[href*="cat="], h1 a[href*="ser="], h1 a[href*="tag.php"]')
+ if title_link:
+ cat_link = urljoin(self.BASE_URL, title_link['href'])
+
+ # C. General search in links with strict patterns
+ if not cat_link:
+ for a in soup.find_all('a', href=True):
+ href = a['href']
+ a_text = a.get_text(strip=True)
+ # High-confidence patterns
+ if any(x in a_text for x in ["المسلسل:", "جميع الحلقات", "حلقات المسلسل", "كل الحلقات"]):
+ cat_link = urljoin(self.BASE_URL, href)
+ logger.info(f"Found cat_link via labels: {cat_link}")
+ break
+
+ # D. Fallback search by title
+ if not cat_link:
+ for a in soup.find_all('a', href=True):
+ href = a['href']
+ if any(x in href for x in ['ser=', 'cat=', 'tag.php']):
+ a_text = a.get_text(strip=True)
+ if (series_name and series_name in a_text) or (series_name_alt and series_name_alt in a_text):
+ cat_link = urljoin(self.BASE_URL, href)
+ logger.info(f"Found cat_link via fallback title search: {cat_link}")
+ break
+
+ if cat_link:
+ try:
+ # Determine type: view-serie.php, category.php, tag.php
+ is_view_serie = 'view-serie' in cat_link
+ param_name = 'ser' if is_view_serie else ('t' if 'tag.php' in cat_link else 'cat')
+
+ # Robust ID extraction
+ match = re.search(f'[?&]{param_name}=([^&]+)', cat_link)
+ if match:
+ cat_id = match.group(1)
+ base_deep_url = f"{self.BASE_URL}/tag.php?t={cat_id}" if param_name == 't' else \
+ (f"{self.BASE_URL}/view-serie.php?ser={cat_id}" if is_view_serie else \
+ f"{self.BASE_URL}/category.php?cat={cat_id}")
+
+ logger.info(f"Deep scraping episodes from {cat_link} (ID: {cat_id})")
+ # Fetch first 5 pages
+ for p in range(1, 6):
+ target_p = f"{base_deep_url}&page={p}" if p > 1 else base_deep_url
+ p_html = await self._get_html(target_p)
+ if not p_html: break
+ p_items = self._extract_items(BeautifulSoup(p_html, 'html.parser'))
+
+ if not p_items: break
+ for item in p_items:
+ # Filter Check: Use a fuzzy name match
+ i_title = item['title']
+ # Must match at least the first 2 words if possible, or the whole name
+ name_parts = series_name.split()
+ match_key = " ".join(name_parts[:2]) if len(name_parts) >= 2 else series_name
+
+ if match_key in i_title or series_name in i_title or series_name_alt in i_title:
+ e_num = self._safe_get_episode(i_title, name_hint=series_name)
+ if e_num and e_num not in unique_eps:
+ unique_eps[e_num] = {
+ "id": item['id'],
+ "episode": e_num,
+ "title": i_title
+ }
+ if len(p_items) < 10: break
+ except Exception as e:
+ logger.error(f"Category episode fetch failed: {e}")
+
+ # 2. Local fallback: Scrape episodes from the current page
+ for ep in soup.select('.episodes-list a, .season-episodes a, .vid-episodes a, ul.episodes li a, div.caption h3 a, .movie-item a, .related-vids a'):
+ ep_href = ep.get('href')
+ if not ep_href or 'video.php' not in ep_href: continue
+ ep_url = urljoin(self.BASE_URL, ep_href)
+ ep_text = ep.get_text(strip=True)
+
+ # If text is empty, check for nested title
+ if not ep_text:
+ inner = ep.find(['h3', 'span', 'strong'])
+ if inner: ep_text = inner.get_text(strip=True)
+
+ # CRITICAL FILTER: Item must belong to this series
+ if series_name and series_name not in ep_text:
+ continue
+
+ ep_num = self._safe_get_episode(ep_text, name_hint=series_name)
+ if ep_num and ep_num not in unique_eps:
+ unique_eps[ep_num] = {
+ "id": base64.urlsafe_b64encode(ep_url.encode()).decode(),
+ "episode": ep_num,
+ "title": ep_text
+ }
+
+ response['episodes'] = sorted(list(unique_eps.values()), key=lambda x: x['episode'])
+ response['seasons'] = [{"number": 1, "episodes": response['episodes']}]
+
+ # --- WATCH SERVERS ---
+ watch_urls = set()
+
+ def is_valid_srv(url_str: str) -> bool:
+ if not url_str or 'javascript' in url_str: return False
+ if 'larooza' in url_str and 'video.php' in url_str: return False
+ if any(x in url_str.lower() for x in ['beacon', 'analytics', 'pixel', 'ads.', 'google', 'facebook']): return False
+ return True
+
+ # 1. Primary: WatchList & Source tags
+ server_selectors = [
+ 'ul.WatchList li', '.server-list li', '#servers li', '.watch-servers li',
+ '.video-servers-list li', 'div.servers a', '.player-servers li'
+ ]
+
+ for sel in server_selectors:
+ for li in watch_soup.select(sel):
+ s_url = li.get('data-embed-url') or li.get('data-link') or li.get('data-embed') or li.get('data-src') or li.get('data-url')
+ if not s_url:
+ a_tag = li.find('a', href=True)
+ if a_tag and not a_tag['href'].startswith('javascript'):
+ s_url = a_tag['href']
+
+ if s_url and is_valid_srv(s_url):
+ if s_url.startswith('//'): s_url = "https:" + s_url
+ full_s_url = urljoin(self.BASE_URL, s_url)
+ if full_s_url not in watch_urls:
+ watch_urls.add(full_s_url)
+ name = li.get_text(strip=True) or f"سيرفر {len(response['servers']) + 1}"
+ response['servers'].append({"name": name, "url": full_s_url, "type": "iframe"})
+
+ # 2. Secondary: Deep Iframe Scan
+ for ifr in watch_soup.select('iframe[src], embed[src], video source[src]'):
+ src = ifr.get('src')
+ if is_valid_srv(src):
+ if src.startswith('//'): src = "https:" + src
+ full_s_url = urljoin(self.BASE_URL, src)
+ if full_s_url not in watch_urls:
+ watch_urls.add(full_s_url)
+ response['servers'].append({"name": f"سيرفر سريع {len(response['servers']) + 1}", "url": full_s_url, "type": "iframe"})
+
+ # 3. Regex Fallback (Scripts & Global)
+ patterns = [
+ r'iframe.*?src=["\'](https?://[^"\']+)["\']',
+ r'embedUrl["\']\s*:\s*["\'](https?://[^"\']+)["\']',
+ r'file["\']\s*:\s*["\'](https?://[^"\']+\.m3u8)["\']',
+ r'source\s*src=["\'](https?://[^"\']+)["\']'
+ ]
+ for pattern in patterns:
+ for match in re.findall(pattern, watch_html, re.I):
+ if is_valid_srv(match) and match not in watch_urls:
+ watch_urls.add(match)
+ response['servers'].append({"name": f"سيرفر احتياطي {len(response['servers']) + 1}", "url": match, "type": "iframe"})
+
+ # Clean duplicates and sort by quality/relevance if possible
+ # For now, just ensuring uniqueness
+
+ # --- Downloads ---
+ dl_url = url.replace('video.php', 'download.php').replace('play.php', 'download.php')
+ dl_html = await self._get_html(dl_url)
+ if dl_html:
+ dl_soup = BeautifulSoup(dl_html, 'html.parser')
+ for mirror in dl_soup.select('a[target="_blank"]'):
+ m_url = mirror.get('href')
+ if m_url and 'http' in m_url:
+ if any(x in m_url.lower() for x in ['wa.me', 'facebook.com', 'twitter.com', 'telegram.me', 't.me', 'sharer.php']):
+ continue
+ q_text = mirror.get_text(strip=True).replace("اضغط هنا للتحميل", "").replace("تحميل الملف", "").strip() or "رابط تحميل"
+ response['download_links'].append({"quality": q_text, "url": m_url})
+
+ return response
+
+scraper = LaroozaScraper()
diff --git a/scraper/proxy_fetcher.py b/scraper/proxy_fetcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..bde5e67f947c37c4c888ebe26b774c01261367bf
--- /dev/null
+++ b/scraper/proxy_fetcher.py
@@ -0,0 +1,98 @@
+"""
+Free Proxy Fetcher - Automatically fetches and validates free proxies
+"""
+import aiohttp
+import asyncio
+import logging
+
+logger = logging.getLogger("proxy_fetcher")
+
+class FreeProxyFetcher:
+    """Fetch public HTTP proxy lists and probe which entries respond."""
+
+    def __init__(self):
+        # Proxy URLs ("http://" + one source line) from the most recent fetch.
+        self.proxies = []
+        self.last_fetch = 0
+
+    async def fetch_free_proxies(self):
+        """Fetch free proxies from public APIs.
+
+        The first 20 non-empty lines of each source are kept, each prefixed
+        with "http://". A source that fails is logged and skipped.
+
+        Returns:
+            The refreshed ``self.proxies`` list (possibly empty).
+        """
+        proxy_sources = [
+            "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all",
+            "https://www.proxy-list.download/api/v1/get?type=http",
+        ]
+        # aiohttp expects a ClientTimeout here; passing a bare number is deprecated.
+        timeout = aiohttp.ClientTimeout(total=10)
+
+        all_proxies = []
+        async with aiohttp.ClientSession() as session:
+            for source in proxy_sources:
+                try:
+                    async with session.get(source, timeout=timeout) as resp:
+                        if resp.status == 200:
+                            text = await resp.text()
+                            proxies = [f"http://{line.strip()}" for line in text.split('\n') if line.strip()]
+                            all_proxies.extend(proxies[:20])  # Take first 20 from each source
+                            logger.info(f"Fetched {len(proxies)} proxies from {source}")
+                except Exception as e:
+                    logger.error(f"Failed to fetch from {source}: {e}")
+
+        self.proxies = all_proxies
+        logger.info(f"Total free proxies loaded: {len(self.proxies)}")
+        return self.proxies
+
+    async def validate_proxy(self, proxy, test_url="https://httpbin.org/ip"):
+        """Test if a proxy works.
+
+        Args:
+            proxy: Proxy URL passed to aiohttp's ``proxy`` argument.
+            test_url: URL requested through the proxy.
+
+        Returns:
+            True when the request completes with HTTP 200, otherwise False.
+        """
+        timeout = aiohttp.ClientTimeout(total=5)
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(test_url, proxy=proxy, timeout=timeout) as resp:
+                    return resp.status == 200
+        except Exception:
+            # Dead or slow proxies are the normal case. A bare "except:" would
+            # also swallow asyncio.CancelledError and block task cancellation.
+            return False
+
+    async def get_working_proxies(self, max_count=10):
+        """Get validated working proxies.
+
+        Probes the first 30 loaded proxies concurrently, fetching the list
+        first when none is loaded yet.
+
+        Args:
+            max_count: Maximum number of working proxies to return.
+
+        Returns:
+            Up to ``max_count`` proxies that passed ``validate_proxy``.
+        """
+        if not self.proxies:
+            await self.fetch_free_proxies()
+
+        candidates = self.proxies[:30]
+        tasks = [self.validate_proxy(p) for p in candidates]
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        # With return_exceptions=True a failed probe is returned as an exception
+        # object, which is truthy; only a literal True counts as working.
+        working = [p for p, ok in zip(candidates, results) if ok is True]
+        working = working[:max(max_count, 0)]
+
+        logger.info(f"Validated {len(working)} working proxies")
+        return working
+
+proxy_fetcher = FreeProxyFetcher()
diff --git a/start.sh b/start.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5b802ac38de4fc010cb9fd89edb69cf0e2fb3832
--- /dev/null
+++ b/start.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+# Boots FlareSolverr in the background, waits for its /health endpoint,
+# then runs the FastAPI app in the foreground.
+set -e
+
+echo "--- STARTING MULTI-SERVICE BOOT ---"
+
+# Step 1: Start FlareSolverr
+echo "[1/3] Launching FlareSolverr in background..."
+export PYTHONPATH=$PYTHONPATH:/app/flaresolverr
+export PORT=8191
+export LOG_LEVEL=info
+
+# Run FlareSolverr with its own directory as CWD
+(cd /app/flaresolverr && python3 flaresolverr.py) &
+
+# Step 2: Health Check for FlareSolverr
+echo "[2/3] Waiting for FlareSolverr to bind to port 8191..."
+MAX_RETRIES=30
+COUNT=0
+FS_READY=1
+while ! curl -s http://localhost:8191/health > /dev/null; do
+    sleep 1
+    COUNT=$((COUNT+1))
+    if [ "$COUNT" -ge "$MAX_RETRIES" ]; then
+        echo "⚠️ FlareSolverr failed to start in time, continuing to FastAPI anyway..."
+        FS_READY=0
+        break
+    fi
+done
+# Report success only when the health endpoint actually answered; the loop
+# above also exits via break after MAX_RETRIES failed attempts.
+if [ "$FS_READY" -eq 1 ]; then
+    echo "✅ FlareSolverr is ready!"
+fi
+
+# Step 3: Start FastAPI
+echo "[3/3] Launching FastAPI on port 7860..."
+uvicorn main:app --host 0.0.0.0 --port 7860 --log-level info
diff --git a/start_render.sh b/start_render.sh
new file mode 100644
index 0000000000000000000000000000000000000000..31efd2c304b410aa48f932eca9467789c9864628
--- /dev/null
+++ b/start_render.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Render.com entry point: starts FlareSolverr in the background, waits for
+# its /health endpoint, then runs the FastAPI app on Render's $PORT.
+set -e
+
+echo "--- RENDER.COM DEPLOYMENT ---"
+
+# Step 1: Start FlareSolverr
+echo "[1/2] Launching FlareSolverr in background..."
+export PYTHONPATH=$PYTHONPATH:/opt/render/project/src/flaresolverr
+export PORT_FS=8191
+export LOG_LEVEL=info
+
+# FlareSolverr takes its listen port from PORT (start.sh configures it that
+# way). Render sets PORT for the web service, so override it for this process
+# only; otherwise FlareSolverr and uvicorn would compete for the same port.
+(cd /opt/render/project/src/flaresolverr && PORT="$PORT_FS" python3 flaresolverr.py) &
+
+# Wait for FlareSolverr
+echo "[2/2] Waiting for FlareSolverr..."
+MAX_RETRIES=30
+COUNT=0
+FS_READY=1
+while ! curl -s "http://localhost:${PORT_FS}/health" > /dev/null; do
+    sleep 1
+    COUNT=$((COUNT+1))
+    if [ "$COUNT" -ge "$MAX_RETRIES" ]; then
+        echo "⚠️ FlareSolverr failed to start in time, continuing to FastAPI anyway..."
+        FS_READY=0
+        break
+    fi
+done
+if [ "$FS_READY" -eq 1 ]; then
+    echo "✅ FlareSolverr ready!"
+fi
+
+# Render provides $PORT automatically; fall back to 7860 when it is unset
+APP_PORT="${PORT:-7860}"
+echo "--- Starting FastAPI on port $APP_PORT ---"
+uvicorn main:app --host 0.0.0.0 --port "$APP_PORT" --log-level info
diff --git a/tools/analyze_structure.py b/tools/analyze_structure.py
new file mode 100644
index 0000000000000000000000000000000000000000..3baafd030208a711c6d6c1241d975c65fcef5485
--- /dev/null
+++ b/tools/analyze_structure.py
@@ -0,0 +1,35 @@
+from bs4 import BeautifulSoup
+import sys
+import io
+
+# Set encoding for Windows terminal
+if sys.platform == 'win32':
+    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+
+def analyze_html():
+    """Print content-looking links and container-like CSS classes from flaresolverr_output.html."""
+    with open("flaresolverr_output.html", "r", encoding="utf-8") as f:
+        soup = BeautifulSoup(f.read(), 'html.parser')
+
+    print("--- Analyzing Links ---")
+    link_markers = ('cat=', 'video', 'movie', 'series')
+    # Only the first 100 anchors are inspected; idx is the position among them.
+    for idx, anchor in enumerate(soup.find_all('a', href=True)[:100]):
+        href = anchor['href']
+        if any(marker in href for marker in link_markers):
+            print(f"{idx}: Text: {anchor.get_text(strip=True)} | Href: {href}")
+
+    print("\n--- Analyzing Containers ---")
+    # Collect every distinct class name used in the document
+    classes = set()
+    for tag in soup.find_all(True, class_=True):
+        classes.update(tag['class'])
+
+    print(f"Found {len(classes)} unique classes.")
+    # Print classes that might be containers
+    container_hints = ('item', 'video', 'movie', 'thumb', 'card', 'block', 'col')
+    potential = [name for name in classes if any(hint in name.lower() for hint in container_hints)]
+    print(f"Potential container classes: {potential}")
+
+if __name__ == "__main__":
+    analyze_html()
diff --git a/tools/check_mirrors.py b/tools/check_mirrors.py
new file mode 100644
index 0000000000000000000000000000000000000000..d19f65a91022affb504c6762a2a9ae1dac812bf1
--- /dev/null
+++ b/tools/check_mirrors.py
@@ -0,0 +1,41 @@
+import asyncio
+import httpx
+from curl_cffi.requests import AsyncSession
+
+def _preview(text):
+    """Return the first 100 characters of *text* with newlines turned into spaces."""
+    # Kept out of the f-strings below: before Python 3.12 a backslash inside
+    # an f-string expression is a SyntaxError.
+    return text[:100].replace('\n', ' ')
+
+async def check_mirrors():
+    """Request each known mirror with curl-cffi and then httpx, printing status and a body preview."""
+    mirrors = [
+        "https://larooza.mom",
+        "https://larooza.site",
+        "https://laroza-tv.net",
+        "https://larozavideo.net",
+        "https://larooza.video",
+        "https://q.larozavideo.net"
+    ]
+
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+    }
+
+    for mirror in mirrors:
+        print(f"Checking {mirror}...")
+        try:
+            # Try curl-cffi first
+            async with AsyncSession(impersonate="chrome110") as s:
+                resp = await s.get(mirror, headers=headers, timeout=10)
+                print(f" [curl-cffi] {mirror}: {resp.status_code} | Title: {_preview(resp.text)}")
+
+            async with httpx.AsyncClient(http2=True, timeout=10) as client:
+                resp = await client.get(mirror, headers=headers)
+                print(f" [httpx] {mirror}: {resp.status_code} | Title: {_preview(resp.text)}")
+        except Exception as e:
+            print(f" [Error] {mirror}: {e}")
+
+if __name__ == "__main__":
+    asyncio.run(check_mirrors())
diff --git a/tools/debug_fs.py b/tools/debug_fs.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffc5176ebf1b1d7ee1513f0ffc30b2673b44a589
--- /dev/null
+++ b/tools/debug_fs.py
@@ -0,0 +1,50 @@
+import asyncio
+import httpx
+import json
+import sys
+
+# Set encoding to utf-8 for windows console
+if sys.platform == "win32":
+    import codecs
+    sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
+
+async def test():
+    """Fetch a few site pages through a local FlareSolverr and report whether item markers appear."""
+    urls = [
+        "https://q.larozavideo.net/home.24",
+        "https://q.larozavideo.net/newvideos1.php",
+        "https://q.larozavideo.net/category.php?cat=all_movies_13"
+    ]
+
+    flaresolverr_url = "http://127.0.0.1:8191/v1"
+    item_markers = ("video.php", ".thumbnail", ".box")
+
+    async with httpx.AsyncClient(timeout=90.0) as client:
+        for url in urls:
+            print(f"\n--- Testing {url} ---")
+            payload = {"cmd": "request.get", "url": url, "maxTimeout": 60000}
+            try:
+                response = await client.post(flaresolverr_url, json=payload)
+                if response.status_code != 200:
+                    print(f"Server error: {response.status_code}")
+                    continue
+
+                data = response.json()
+                if data.get('status') != 'ok':
+                    print(f"FlareSolverr message: {data.get('message')}")
+                    continue
+
+                solution = data.get('solution', {})
+                html = solution.get('response', '')
+                print(f"Title found: {solution.get('title', '')}")
+
+                if any(marker in html for marker in item_markers):
+                    print("FOUND: Movie items are present in HTML!")
+                else:
+                    print("FAILED: No movie items in HTML.")
+                    print(f"Snippet: {html[:500]}")
+            except Exception as e:
+                print(f"Script error: {e}")
+
+if __name__ == "__main__":
+    asyncio.run(test())
diff --git a/tools/debug_mirrors.py b/tools/debug_mirrors.py
new file mode 100644
index 0000000000000000000000000000000000000000..73988eb7cee36290bf496c4f97220ce223a50586
--- /dev/null
+++ b/tools/debug_mirrors.py
@@ -0,0 +1,37 @@
+import asyncio
+import httpx
+from bs4 import BeautifulSoup
+
+async def debug_fetch():
+    """Fetch each mirror's home page and report which item selectors match."""
+    mirrors = ["https://q.larozavideo.net", "https://larooza.mom", "https://larooza.site", "https://m.laroza-tv.net"]
+    selectors = ['.thumbnail', '.pm-li-video', '.pm-video-thumb', '.video-block', '.movie-item', 'li.col-xs-6', '.box', '.video-box', '.video-item', '.post-item']
+    async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
+        for mirror in mirrors:
+            print(f"\n--- Checking mirror: {mirror} ---")
+            try:
+                resp = await client.get(mirror, headers={"User-Agent": "Mozilla/5.0"})
+                print(f"Status: {resp.status_code}")
+                if resp.status_code != 200:
+                    print(f" Snippet: {resp.text[:200]}")
+                    continue
+
+                soup = BeautifulSoup(resp.text, 'html.parser')
+                title = soup.title.string if soup.title else "No title"
+                print(f"Title: {title}")
+
+                matched = 0
+                for sel in selectors:
+                    count = len(soup.select(sel))
+                    if count > 0:
+                        print(f" Found {count} items with selector {sel}")
+                        matched += 1
+
+                if matched == 0:
+                    # No known container matched; fall back to counting raw video links.
+                    video_links = len(soup.select('a[href*="video.php"], a[href*="watch.php"]'))
+                    print(f" Found {video_links} video/watch links.")
+            except Exception as e:
+                print(f" Error: {e}")
+
+if __name__ == "__main__":
+    asyncio.run(debug_fetch())
diff --git a/tools/debug_scraper.py b/tools/debug_scraper.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd282e90cb4b1216d17ba7bc26bb17e8395aee28
--- /dev/null
+++ b/tools/debug_scraper.py
@@ -0,0 +1,28 @@
+import asyncio
+import sys
+import os
+
+# Add the current directory to path
+sys.path.append(os.getcwd())
+
+from scraper.engine import LaroozaScraper
+
+# Set encoding to utf-8 for windows console
+if sys.platform == "win32":
+    import codecs
+    sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
+
+async def test():
+    """Fetch page 1 of the home listing and print its first three items."""
+    scraper = LaroozaScraper()
+    print("DEBUG: Fetching latest movies...")
+    items = await scraper.fetch_home(page=1)
+    print(f"DEBUG: Found {len(items)} items.")
+    if not items:
+        print("DEBUG: ❌ fetch_home returned 0 items.")
+        return
+    for position, item in enumerate(items[:3], start=1):
+        print(f" {position}. {item['title']} - ID: {item['id'][:20]}...")
+
+if __name__ == "__main__":
+    asyncio.run(test())
diff --git a/tools/dump_html.py b/tools/dump_html.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad7841315db09ed09c88de092150245de1887509
--- /dev/null
+++ b/tools/dump_html.py
@@ -0,0 +1,26 @@
+import httpx
+import asyncio
+from bs4 import BeautifulSoup
+
+async def dump_html():
+    """Download the mirror home page, save it to dump.html and list its first 20 links."""
+    url = "https://larooza.mom" # Using the one that gave 0 links
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+    }
+    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
+        print(f"Fetching {url}...")
+        resp = await client.get(url, headers=headers)
+        print(f"Status: {resp.status_code}")
+
+    with open("dump.html", "w", encoding="utf-8") as f:
+        f.write(resp.text)
+    print("HTML dumped to dump.html")
+
+    anchors = BeautifulSoup(resp.text, 'html.parser').select('a')
+    print(f"Total links: {len(anchors)}")
+    for anchor in anchors[:20]:
+        print(f"Link: {anchor.get('href')} | Text: {anchor.get_text(strip=True)[:30]}")
+
+if __name__ == "__main__":
+    asyncio.run(dump_html())
diff --git a/tools/dump_html_v2.py b/tools/dump_html_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..14d20010dc00520a4a089812f4651f4ef8b207c9
--- /dev/null
+++ b/tools/dump_html_v2.py
@@ -0,0 +1,26 @@
+import httpx
+import asyncio
+from bs4 import BeautifulSoup
+
+async def dump_html():
+    """Fetch the new-videos page and count item containers, printing an HTML snippet when none match."""
+    url = "https://q.larozavideo.net/newvideos1.php"
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+    }
+    container_selector = '.thumbnail, .pm-li-video, .pm-video-thumb, .video-block, .movie-item, li.col-xs-6, .box, .video-box, .video-item, .post-item'
+    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
+        print(f"Fetching {url}...")
+        resp = await client.get(url, headers=headers)
+        print(f"Status: {resp.status_code}")
+        print(f"Final URL: {resp.url}")
+
+    found = BeautifulSoup(resp.text, 'html.parser').select(container_selector)
+    print(f"Found {len(found)} item containers.")
+
+    if not found:
+        print("Snippet of HTML:")
+        print(resp.text[:1000])
+
+if __name__ == "__main__":
+    asyncio.run(dump_html())
diff --git a/tools/extra/diagnose.py b/tools/extra/diagnose.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fcdffe72ce56a1e81f897a7cc88d9a683ef74a5
--- /dev/null
+++ b/tools/extra/diagnose.py
@@ -0,0 +1,30 @@
+
+import httpx
+import asyncio
+import os
+
+async def check_service(name, url):
+    """GET *url* with a 5 second timeout, print UP or DOWN, and return True when a response arrived."""
+    is_up = False
+    try:
+        async with httpx.AsyncClient(timeout=5.0) as client:
+            resp = await client.get(url)
+            print(f"✅ {name} is UP ({url}) - Status: {resp.status_code}")
+        is_up = True
+    except Exception as e:
+        print(f"❌ {name} is DOWN ({url}) - Error: {e}")
+    return is_up
+
+async def main():
+    """Probe the local backend and FlareSolverr health endpoints one after the other."""
+    print("--- Diagnostics ---")
+    for name, url in (("Backend", "http://localhost:8000/health"), ("FlareSolverr", "http://localhost:8191/health")):
+        await check_service(name, url)
+
+    # Try to find the tunnel URL from local logs if possible
+    print("\n--- Searching for Tunnel URL ---")
+    # This is a bit tricky, but we can try to find recent cloudflared logs
+    # Cloudflared usually doesn't log to a file unless specified, but we'll check common names
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/tools/extra/expose_to_internet.bat b/tools/extra/expose_to_internet.bat
new file mode 100644
index 0000000000000000000000000000000000000000..f06efe4cf91482e563beec9efbbad17cd211af81
--- /dev/null
+++ b/tools/extra/expose_to_internet.bat
@@ -0,0 +1,20 @@
+@echo off
+REM Downloads cloudflared.exe into the current directory when it is missing,
+REM then opens a Cloudflare tunnel to the backend on http://localhost:8000.
+echo ==========================================
+echo CLOUDFLARE TUNNEL - EXPOSE TO INTERNET
+echo ==========================================
+
+REM Download Cloudflared (if not exists)
+if not exist cloudflared.exe (
+ echo Downloading Cloudflare Tunnel...
+ powershell -Command "Invoke-WebRequest -Uri 'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe' -OutFile 'cloudflared.exe'"
+)
+
+REM Start tunnel
+echo Starting Cloudflare Tunnel...
+echo Your backend will be accessible via a public URL in a moment...
+echo.
+cloudflared.exe tunnel --url http://localhost:8000
+
+pause
diff --git a/tools/extra/nulcd b/tools/extra/nulcd
new file mode 100644
index 0000000000000000000000000000000000000000..e54a14ae3638f055863e78839784e4d13b9f0f78
--- /dev/null
+++ b/tools/extra/nulcd
@@ -0,0 +1,2 @@
+ERROR: Invalid argument/option - 'c:\Users\Mina\Desktop\lmina\backend'.
+Type "TASKKILL /?" for usage.
diff --git a/tools/extra/run_vps.bat b/tools/extra/run_vps.bat
new file mode 100644
index 0000000000000000000000000000000000000000..ae7b591271f8b15eb202250f5b5832494bf0d4b2
--- /dev/null
+++ b/tools/extra/run_vps.bat
@@ -0,0 +1,68 @@
+@echo off
+setlocal enabledelayedexpansion
+
+REM Starts FlareSolverr and the FastAPI backend, opens a Cloudflare quick
+REM tunnel to port 8000 and prints the public URL found in the tunnel log.
+
+echo ==========================================
+echo MEIH PLATFORM - VPS AUTO RUNNER
+echo ==========================================
+
+REM 1. Kill existing processes to avoid port conflicts
+REM NOTE(review): taskkill /IM python.exe ends every Python process on the
+REM machine, not only the ones started by this script.
+echo [*] Cleaning up old processes...
+taskkill /F /IM uvicorn.exe /T 2>nul
+taskkill /F /IM python.exe /T 2>nul
+taskkill /F /IM cloudflared_vps.exe /T 2>nul
+
+REM 2. Start FlareSolverr
+echo [*] Starting FlareSolverr (Protection Bypass)...
+cd /d "%~dp0backend"
+start /B "FlareSolverr" cmd /c "cd flaresolverr && ..\venv\Scripts\python flaresolverr.py"
+
+REM 3. Start FastAPI Backend
+echo [*] Starting Backend Server...
+start /B "Backend" cmd /c "venv\Scripts\uvicorn main:app --port 8000 --workers 1"
+
+REM Wait for servers to warm up
+timeout /t 8 /nobreak >nul
+
+REM 4. Start Cloudflare Tunnel and Log Output
+echo [*] Starting Cloudflare Tunnel...
+echo [*] WAITING FOR YOUR UNIQUE URL...
+echo. > ..\vps_connection.log
+start /B "Cloudflare" cmd /c "..\cloudflared_vps.exe tunnel --url http://localhost:8000 --no-autoupdate > ..\vps_connection.log 2>&1"
+
+REM Poll the log every 2 seconds until a trycloudflare.com address appears.
+REM NOTE(review): there is no retry limit, so this loops forever when the
+REM tunnel never comes up.
+:WAIT_FOR_URL
+timeout /t 2 /nobreak >nul
+findstr "trycloudflare.com" ..\vps_connection.log >nul
+if errorlevel 1 goto WAIT_FOR_URL
+
+REM 5. Extract and Display the URL
+REM tokens=4 picks the fourth whitespace-separated field of each matching line;
+REM when several lines match, the value from the last one is kept.
+for /f "tokens=4" %%a in ('findstr "trycloudflare.com" ..\vps_connection.log') do (
+ set RAW_URL=%%a
+ REM Clean up the URL (remove pipes and spaces)
+ set CLEAN_URL=!RAW_URL:|=!
+ set CLEAN_URL=!CLEAN_URL: =!
+)
+
+echo.
+echo ==========================================
+echo SUCCESS! YOUR PROJECT IS ONLINE
+echo ==========================================
+echo.
+echo API URL: !CLEAN_URL!
+echo.
+echo 1. Open the URL above in your browser.
+echo 2. Click 'Advanced' -> 'Proceed' to trust it.
+echo 3. Then open: https://meih-netflix-clone.vercel.app/
+echo.
+echo KEEP THIS WINDOW OPEN TO STAY ONLINE
+echo ==========================================
+pause
diff --git a/tools/extra/setup_and_run.bat b/tools/extra/setup_and_run.bat
new file mode 100644
index 0000000000000000000000000000000000000000..ee15f0c1fc841019d0a4c2adc26ee6e452993e5a
--- /dev/null
+++ b/tools/extra/setup_and_run.bat
@@ -0,0 +1,104 @@
+@echo off
+setlocal enabledelayedexpansion
+
+REM Full local bring-up: backend virtualenv and dependencies, FlareSolverr,
+REM FastAPI backend, Cloudflare quick tunnel, then the frontend dev server.
+
+echo ===========================================
+echo LMINA PLATFORM - NITRO AUTO SYSTEM
+echo ===========================================
+
+REM 1. Kill existing processes to avoid port conflicts
+REM NOTE(review): these taskkill calls end every node, python and chrome
+REM process on the machine, not only the ones started by this script.
+echo [*] Cleaning up old processes...
+taskkill /F /IM node.exe /T 2>nul
+taskkill /F /IM uvicorn.exe /T 2>nul
+taskkill /F /IM python.exe /T 2>nul
+taskkill /F /IM cloudflared_vps.exe /T 2>nul
+taskkill /F /IM chrome.exe /T 2>nul
+
+REM 2. Backend Setup & Run
+echo [*] Initializing Backend...
+set "ROOT_DIR=%~dp0"
+cd /d "%ROOT_DIR%backend"
+
+if not exist venv (
+ echo [!] Creating Virtual Environment...
+ python -m venv venv
+)
+
+echo [*] Installing/Updating dependencies...
+call venv\Scripts\activate
+REM NOTE(review): pip output is discarded, so a failed install goes unnoticed here.
+pip install -r requirements.txt >nul 2>&1
+
+REM 3. Start FlareSolverr
+echo [*] Starting FlareSolverr (Protection Bypass)...
+start /B "FlareSolverr" cmd /c "cd /d \"%ROOT_DIR%backend\flaresolverr\" && ..\venv\Scripts\python flaresolverr.py"
+
+REM 4. Start FastAPI Backend
+echo [*] Starting Backend Server on port 8000...
+start /B "Backend" cmd /c "cd /d \"%ROOT_DIR%backend\" && venv\Scripts\uvicorn main:app --port 8000 --workers 1"
+
+REM Wait for servers to warm up
+echo [*] Warming up servers (8s)...
+timeout /t 8 /nobreak >nul
+
+REM 5. Start Cloudflare Tunnel (VPS Mode)
+echo [*] Starting Cloudflare Tunnel...
+echo [*] WAITING FOR PUBLIC URL (This may take 10-20 seconds)...
+set "LOG_FILE=%ROOT_DIR%vps_connection.log"
+echo. > "!LOG_FILE!"
+
+REM Using a more robust start command
+start /B "Cloudflare" cmd /c "\"%ROOT_DIR%cloudflared_vps.exe\" tunnel --url http://localhost:8000 --no-autoupdate > \"!LOG_FILE!\" 2>&1"
+
+REM Poll the log every 2 seconds until a trycloudflare.com address appears.
+REM NOTE(review): there is no retry limit, so this loops forever when the
+REM tunnel never comes up.
+:WAIT_FOR_URL
+timeout /t 2 /nobreak >nul
+if not exist "!LOG_FILE!" goto WAIT_FOR_URL
+findstr "trycloudflare.com" "!LOG_FILE!" >nul
+if errorlevel 1 goto WAIT_FOR_URL
+
+REM Extract the URL
+REM tokens=4 picks the fourth whitespace-separated field of each matching line;
+REM when several lines match, the value from the last one is kept.
+set "CLEAN_URL=NOT_FOUND"
+for /f "tokens=4" %%a in ('findstr "trycloudflare.com" "!LOG_FILE!"') do (
+ set "RAW_URL=%%a"
+ set "CLEAN_URL=!RAW_URL:|=!"
+ set "CLEAN_URL=!CLEAN_URL: =!"
+)
+
+REM 6. Frontend Setup & Run (Local)
+echo [*] Initializing Frontend Local Server...
+cd /d "%ROOT_DIR%meih-netflix-clone"
+if not exist node_modules (
+ echo [!] Installing Node modules (First time only)...
+ call npm install >nul 2>&1
+)
+start "Frontend" cmd /k "npm run dev"
+
+echo.
+echo ===========================================
+echo SYSTEM DEPLOYED SUCCESSFULLY
+echo ===========================================
+echo.
+echo [LOCAL ACCESS]
+echo Frontend: http://localhost:5173
+echo Backend: http://localhost:8000
+echo.
+echo [VPS / MOBILE ACCESS]
+echo Public API URL: !CLEAN_URL!
+echo.
+echo [IMPORTANT]
+echo Open the Public API URL once in your browser,
+echo click Advanced -> Proceed, then use the Frontend.
+echo.
+echo ===========================================
+echo KEEP THIS WINDOW OPEN TO STAY ONLINE
+echo ===========================================
+pause
diff --git a/tools/extra/start_vps.bat b/tools/extra/start_vps.bat
new file mode 100644
index 0000000000000000000000000000000000000000..e350f028b9e4cad6cc7cd7c3c632f08d7ecdd932
--- /dev/null
+++ b/tools/extra/start_vps.bat
@@ -0,0 +1,6 @@
+@echo off
+REM Opens a Cloudflare tunnel to http://localhost:8000 and logs to tunnel.log in the temp folder.
+REM NOTE(review): the cloudflared path below is absolute and user-specific; adjust it before running on another machine.
+echo [*] Starting Global Link (VPS Mode)...
+echo [*] Please wait while we generate your unique URL...
+c:\Users\Mina\Desktop\lmina\cloudflared.exe tunnel --url http://localhost:8000 --logfile %temp%\tunnel.log 2>&1
diff --git a/tools/find_body.py b/tools/find_body.py
new file mode 100644
index 0000000000000000000000000000000000000000..258f7554194ba7a5f34f77d9be29cedda13d57cd
--- /dev/null
+++ b/tools/find_body.py
@@ -0,0 +1,20 @@
+import sys
+import io
+
+if sys.platform == 'win32':
+    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+
+with open("flaresolverr_output.html", "r", encoding="utf-8") as f:
+    content = f.read()
+
+    # Locate the opening <body tag (it may carry attributes, hence no ">").
+    body_idx = content.find("<body")
+    if body_idx != -1:
+        print(f"Found <body> at index {body_idx}")
+        print(content[body_idx:body_idx+2000])
+    else:
+        print("<body> tag NOT found!")
+        # Check if it's all in scripts
+        print(f"Total length: {len(content)}")
+        print("Last 1000 chars:")
+        print(content[-1000:])
diff --git a/tools/quick_health.py b/tools/quick_health.py
new file mode 100644
index 0000000000000000000000000000000000000000..094671db1dedb8c940771243de412c1361692c02
--- /dev/null
+++ b/tools/quick_health.py
@@ -0,0 +1,9 @@
+import httpx
+
+# One-shot check of the local backend's /health endpoint.
+try:
+    resp = httpx.get("http://localhost:8000/health", timeout=5.0)
+    print(f"Status: {resp.status_code}")
+    print(f"Data: {resp.json()}")
+except Exception as e:
+    print(f"Error: {e}")
diff --git a/tools/read_head.py b/tools/read_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e8ce1d7f999e8cbeaaf9ee644b317122f249f55
--- /dev/null
+++ b/tools/read_head.py
@@ -0,0 +1,10 @@
+import sys
+import io
+
+if sys.platform == 'win32':
+    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+
+# Show the first 2000 characters of the saved FlareSolverr response.
+with open("flaresolverr_output.html", "r", encoding="utf-8") as f:
+    head = f.read(2000)
+print(head)
diff --git a/tools/search_patterns.py b/tools/search_patterns.py
new file mode 100644
index 0000000000000000000000000000000000000000..815b700bd7c6208201f0cfaebe2689569f12b74f
--- /dev/null
+++ b/tools/search_patterns.py
@@ -0,0 +1,24 @@
+import sys
+import io
+import re
+
+if sys.platform == 'win32':
+    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+
+with open("flaresolverr_output.html", "r", encoding="utf-8") as f:
+    content = f.read()
+
+print(f"Total length: {len(content)}")
+
+# Search for common patterns
+patterns = ['thumbnail', 'pm-li-video', 'video-block', 'movie-item', 'video.php', 'watch.php']
+for p in patterns:
+    # Escape the pattern so the "." in "video.php" matches a literal dot only.
+    count = len(re.findall(re.escape(p), content))
+    print(f"Pattern '{p}': found {count} times")
+
+# Show a snippet from the middle, but only when the file actually reaches
+# offset 50000; a shorter file would print the header over an empty slice.
+if len(content) > 50000:
+    print("\n--- Snippet from middle (50000:51000) ---")
+    print(content[50000:51000])
diff --git a/tools/test_comprehensive.py b/tools/test_comprehensive.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tools/test_curl_direct.py b/tools/test_curl_direct.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1870905cb71542bf4479d70a543bc1158a1d869
--- /dev/null
+++ b/tools/test_curl_direct.py
@@ -0,0 +1,31 @@
+from curl_cffi.requests import AsyncSession
+import asyncio
+
+async def test_curl():
+    """Fetch the new-videos page with curl-cffi (Chrome impersonation) and report whether video links appear."""
+    url = "https://q.larozavideo.net/newvideos1.php"
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+    }
+    async with AsyncSession(impersonate="chrome120") as s:
+        print(f"Fetching {url}...")
+        try:
+            resp = await s.get(url, headers=headers, timeout=15)
+            print(f"Status: {resp.status_code}")
+            if resp.status_code == 200:
+                # Prints the offset of the <title> tag, or -1 when the page has none.
+                print(f"Title: {resp.text.find('<title>')}")
+                if "video.php" in resp.text:
+                    print("SUCCESS: Found video items!")
+                else:
+                    print("FAILED: No video items found.")
+                    print(f"Snippet: {resp.text[:500]}")
+            else:
+                print(f"HTTP Error {resp.status_code}")
+                # Print headers to see if it's Cloudflare
+                print(f"Server: {resp.headers.get('Server')}")
+        except Exception as e:
+            print(f"Error: {e}")
+
+if __name__ == "__main__":
+    asyncio.run(test_curl())
diff --git a/tools/test_flaresolverr_direct.py b/tools/test_flaresolverr_direct.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bd5f3cfe90e5ad7eea6748613df0893fd1423e7
--- /dev/null
+++ b/tools/test_flaresolverr_direct.py
@@ -0,0 +1,60 @@
+import httpx
+import json
+import time
+import sys
+
+# Set encoding for Windows terminal
+if sys.platform == 'win32':
+    import io
+    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+
+def test_flaresolverr():
+    """Send one request.get command to the local FlareSolverr, print diagnostics and save the returned HTML."""
+    url = "http://localhost:8191/v1"
+    target_url = "https://q.larozavideo.net/newvideos1.php"
+
+    payload = {
+        "cmd": "request.get",
+        "url": target_url,
+        "maxTimeout": 60000
+    }
+
+    print(f"Sending request to FlareSolverr for {target_url}...")
+    start_time = time.time()
+    try:
+        with httpx.Client(timeout=90.0) as client:
+            response = client.post(url, json=payload)
+            duration = time.time() - start_time
+            print(f"Status Code: {response.status_code}")
+            print(f"Duration: {duration:.2f}s")
+
+            if response.status_code != 200:
+                print(f"[X] HTTP Error: {response.text}")
+                return
+
+            data = response.json()
+            print(f"FlareSolverr Status: {data.get('status')}")
+            if data.get('status') != 'ok':
+                print(f"[X] FlareSolverr Error: {data.get('message')}")
+                return
+
+            solution = data.get('solution', {})
+            html = solution.get('response', '')
+            print(f"HTML Length: {len(html)}")
+            print(f"Cookies: {len(solution.get('cookies', []))}")
+            print(f"User-Agent: {solution.get('userAgent')}")
+
+            challenge_markers = ("challenge-running", "cf-ray")
+            if any(marker in html for marker in challenge_markers):
+                print("[X] Challenge still present in HTML!")
+            else:
+                print("[OK] Challenge solved (or not present)!")
+
+            # Save HTML for inspection
+            with open("flaresolverr_output.html", "w", encoding="utf-8") as f:
+                f.write(html)
+    except Exception as e:
+        print(f"[X] Exception: {e}")
+
+if __name__ == "__main__":
+    test_flaresolverr()
diff --git a/tools/test_mom.py b/tools/test_mom.py
new file mode 100644
index 0000000000000000000000000000000000000000..69c9a7790b3b87fff54c25b77b2a49cb2ca20b96
--- /dev/null
+++ b/tools/test_mom.py
@@ -0,0 +1,33 @@
+import asyncio
+from scraper.engine import LaroozaScraper
+from bs4 import BeautifulSoup
+import sys
+
+
+async def main():
+    """Fetch the larooza.mom new-videos page and list the first extracted items.
+
+    Overrides ``BASE_URL`` and ``TARGET_URL`` on a fresh ``LaroozaScraper``
+    instance, downloads the target page through the scraper's own
+    ``_get_html`` and prints the title and type of up to five items returned
+    by ``_extract_items``.
+    """
+    mirror = LaroozaScraper()
+    mirror.BASE_URL = "https://larooza.mom"
+    mirror.TARGET_URL = "https://larooza.mom/newvideos.php"
+
+    print(f"Fetching {mirror.TARGET_URL}...")
+    html = await mirror._get_html(mirror.TARGET_URL)
+    if html:
+        print(f"HTML Length: {len(html)}")
+        soup = BeautifulSoup(html, 'html.parser')
+        items = mirror._extract_items(soup)
+        print(f"Found {len(items)} items")
+        for entry in items[:5]:
+            print(f" - {entry['title']} ({entry['type']})")
+    else:
+        print("Failed to get HTML")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/tools/test_quick.py b/tools/test_quick.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tools/test_scraper_direct.py b/tools/test_scraper_direct.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f6398128df311bd2b4d8e00a7b5fd505b82e9da
--- /dev/null
+++ b/tools/test_scraper_direct.py
@@ -0,0 +1,30 @@
+import asyncio
+import logging
+from scraper.engine import scraper
+
+
+async def test():
+    """Call ``scraper.fetch_home(1)`` once and print a short summary.
+
+    Enables INFO-level logging first. On success the item count and the
+    title of the first item (if any) are printed; on any exception the
+    message and a full traceback are printed instead.
+    """
+    logging.basicConfig(level=logging.INFO)
+    print("Testing LaroozaScraper.fetch_home(1)...")
+    try:
+        results = await scraper.fetch_home(1)
+        print(f"Success! Found {len(results)} items.")
+        if results:
+            first = results[0]
+            print(f"First item: {first['title']}")
+    except Exception as exc:
+        # Imported lazily: only needed on the failure path
+        import traceback
+
+        print(f"ERROR: {exc}")
+        traceback.print_exc()
+
+
+if __name__ == "__main__":
+    asyncio.run(test())
diff --git a/tools/test_scraper_full.py b/tools/test_scraper_full.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tools/test_scraper_logic.py b/tools/test_scraper_logic.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0fa18675b102753dfe8982725839245726809ec
--- /dev/null
+++ b/tools/test_scraper_logic.py
@@ -0,0 +1,51 @@
+import asyncio
+import sys
+import io
+from scraper.engine import scraper
+from bs4 import BeautifulSoup
+
+# Re-wrap stdout as UTF-8 on Windows so non-ASCII titles print cleanly
+if sys.platform == 'win32':
+    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+
+
+async def test_scraper_logic():
+    """Run ``scraper._extract_items`` on the HTML saved in ``flaresolverr_output.html``.
+
+    Prints up to five extracted items. When nothing is extracted, prints how
+    many elements each candidate container selector matches instead. A
+    missing input file and any other exception are reported, not raised.
+    """
+    # Candidate container selectors probed when extraction comes back empty
+    debug_selectors = (
+        '.thumbnail', '.pm-li-video', '.pm-video-thumb', '.video-block',
+        '.movie-item', 'li.col-xs-6', '.box', '.video-box', '.video-item',
+        '.post-item',
+    )
+    try:
+        with open("flaresolverr_output.html", "r", encoding="utf-8") as saved:
+            html = saved.read()
+
+        print(f"Testing extraction from saved HTML (length: {len(html)})...")
+        soup = BeautifulSoup(html, 'html.parser')
+        items = scraper._extract_items(soup)
+
+        print(f"Extracted {len(items)} items.")
+        for rank, entry in enumerate(items[:5], start=1):
+            print(f"{rank}. {entry['title']} - {entry['type']}")
+            print(f" Poster: {entry['poster'][:50]}...")
+
+        if not items:
+            print("[X] No items extracted! Checking container selectors...")
+            for selector in debug_selectors:
+                matches = soup.select(selector)
+                print(f"Selector '{selector}': found {len(matches)} elements")
+
+    except FileNotFoundError:
+        print("[X] flaresolverr_output.html not found. Run test_flaresolverr_direct.py first.")
+    except Exception as exc:
+        print(f"[X] Error: {exc}")
+
+
+if __name__ == "__main__":
+    asyncio.run(test_scraper_logic())
diff --git a/tools/test_search_details.py b/tools/test_search_details.py
new file mode 100644
index 0000000000000000000000000000000000000000..188ab42d32bfb1bcbdc693f68efe9c276f51e8c2
--- /dev/null
+++ b/tools/test_search_details.py
@@ -0,0 +1,76 @@
+import requests
+import sys
+from urllib.parse import quote
+
+# Ensure UTF-8 output for console
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except AttributeError:
+    pass  # stdout has been replaced by a stream without reconfigure()
+
+
+def test_search(query):
+    """Query the local ``/search`` endpoint and return the first result's ID.
+
+    Args:
+        query: Search text, either plain or already percent-encoded
+            (existing ``%XX`` escapes are passed through unchanged).
+
+    Returns:
+        The ``id`` of the first result, or ``None`` when the request fails,
+        the status is not 200, or the result list is empty.
+    """
+    # Escape reserved characters (&, #, +, spaces, ...) so they cannot alter
+    # the query string; '%' is kept safe so pre-encoded input is not
+    # double-encoded.
+    encoded = quote(query, safe="%")
+    url = f"http://localhost:8000/search?q={encoded}"
+    print(f"Searching for: {query}...")
+    try:
+        r = requests.get(url, timeout=30)
+        print(f"Status: {r.status_code}")
+        if r.status_code == 200:
+            data = r.json()
+            print(f"Results found: {len(data)}")
+            if data:
+                print(f"First result title: {data[0].get('title')}")
+                print(f"First result ID: {data[0].get('id')}")
+                return data[0].get('id')
+        else:
+            print(f"Error: {r.text[:500]}")
+    except Exception as e:
+        print(f"Search failed: {e}")
+    return None
+
+
+def test_details(safe_id):
+    """Fetch ``/details/<safe_id>`` from the local API and print a summary.
+
+    Args:
+        safe_id: Identifier as returned in a search result's ``id`` field.
+
+    Prints the title plus the number of servers and download links on
+    success, the first 500 characters of the body on a non-200 status, and
+    the exception message if the request itself fails.
+    """
+    url = f"http://localhost:8000/details/{safe_id}"
+    print(f"\nFetching details for: {safe_id}...")
+    try:
+        r = requests.get(url, timeout=30)
+        print(f"Status: {r.status_code}")
+        if r.status_code == 200:
+            data = r.json()
+            print(f"Title: {data.get('title')}")
+            print(f"Servers count: {len(data.get('servers', []))}")
+            print(f"Download links count: {len(data.get('download_links', []))}")
+        else:
+            print(f"Error: {r.text[:500]}")
+    except Exception as e:
+        print(f"Details failed: {e}")
+
+
+if __name__ == "__main__":
+    # Test with a likely existing movie title
+    movie_id = test_search("هيبتا")
+    if movie_id:
+        test_details(movie_id)
diff --git a/tools/test_system.py b/tools/test_system.py
new file mode 100644
index 0000000000000000000000000000000000000000..6192187fbb8b62ed725f238c4956954816636d14
--- /dev/null
+++ b/tools/test_system.py
@@ -0,0 +1,92 @@
+import asyncio
+import httpx
+import time
+import sys
+import os
+
+# Set encoding for Windows CLI
+if sys.platform == "win32":
+    import codecs
+    sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
+
+
+async def test_system():
+    """Run four sequential health checks against the local services.
+
+    Checks, in order: FlareSolverr ``/health`` on port 8191, then the backend
+    root, ``/latest`` and ``/category/arabic-movies`` endpoints on port 8000.
+    Each check prints its own result and catches its own exceptions, so a
+    failure in one does not stop the ones after it.
+    """
+    print("\n" + "="*50)
+    print("🔍 MEIH SYSTEM HEALTH CHECK")
+    print("="*50 + "\n")
+
+    async with httpx.AsyncClient(timeout=30.0) as client:
+        # 1. Test FlareSolverr
+        print("📡 Checking FlareSolverr...")
+        try:
+            resp = await client.get("http://localhost:8191/health")
+            if resp.status_code == 200:
+                print("✅ FlareSolverr: ONLINE")
+            else:
+                print(f"❌ FlareSolverr: ERROR (Status {resp.status_code})")
+        except Exception as e:
+            print(f"❌ FlareSolverr: OFFLINE ({e})")
+
+        # 2. Test FastAPI Backend
+        print("\n⚙️ Checking FastAPI Backend...")
+        try:
+            resp = await client.get("http://localhost:8000/")
+            if resp.status_code == 200:
+                print("✅ Backend: ONLINE")
+                data = resp.json()
+                print(f" Mirror Active: {data.get('active_mirror')}")
+                print(f" Engine Status: {data.get('engine_status')}")
+            else:
+                print(f"❌ Backend: ERROR (Status {resp.status_code})")
+        except Exception as e:
+            print(f"❌ Backend: OFFLINE ({e})")
+
+        # 3. Test Scrapper (Latest Movies)
+        print("\n🎬 Testing Movie Scrapper (Live Fetch)...")
+        try:
+            start_time = time.time()
+            resp = await client.get("http://localhost:8000/latest")
+            duration = time.time() - start_time
+            if resp.status_code == 200:
+                items = resp.json()
+                print("✅ Scrapper: SUCCESS")
+                print(f" Items Found: {len(items)}")
+                print(f" Time Taken: {duration:.2f}s")
+                if items:
+                    print(f" Top Item: {items[0]['title']}")
+            else:
+                print(f"❌ Scrapper: FAILED (Status {resp.status_code})")
+        except Exception as e:
+            print(f"❌ Scrapper: ERROR ({e})")
+
+        # 4. Test Category (Fast Path)
+        print("\n📂 Testing Category (Prefetch Integrity)...")
+        try:
+            start_time = time.time()
+            resp = await client.get("http://localhost:8000/category/arabic-movies")
+            duration = time.time() - start_time
+            if resp.status_code == 200:
+                items = resp.json()
+                print("✅ Category Path: STABLE")
+                print(f" Items: {len(items)}")
+                print(f" Time Taken: {duration:.2f}s (Should be < 0.5s if cached)")
+            else:
+                # Report the status code, as the other three checks do
+                print(f"❌ Category: FAILED (Status {resp.status_code})")
+        except Exception as e:
+            print(f"❌ Category: ERROR ({e})")
+
+    print("\n" + "="*50)
+    print("✨ ALL TESTS COMPLETED")
+    print("="*50 + "\n")
+
+
+if __name__ == "__main__":
+    asyncio.run(test_system())
diff --git a/tools/test_uc.py b/tools/test_uc.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b04368664de6f5acecb789fa20a14e12536d094
--- /dev/null
+++ b/tools/test_uc.py
@@ -0,0 +1,34 @@
+import undetected_chromedriver as uc
+import time
+
+
+def test():
+    """Start a headless undetected-chromedriver session and load google.com.
+
+    Prints the page title on success or the exception message on failure.
+    Once the driver has been created it is always quit, even when loading
+    the page or reading the title raises.
+    """
+    print("Starting UC test...")
+    driver = None
+    try:
+        options = uc.ChromeOptions()
+        options.add_argument('--headless')
+        driver = uc.Chrome(options=options)
+        print("Driver started successfully!")
+        driver.get("https://www.google.com")
+        print(f"Title: {driver.title}")
+    except Exception as e:
+        print(f"Error: {e}")
+    finally:
+        # Quit here rather than at the end of the try body so a failed
+        # navigation cannot leave an orphaned Chrome process behind.
+        if driver is not None:
+            try:
+                driver.quit()
+            except Exception as e:
+                print(f"Error: {e}")
+
+
+if __name__ == "__main__":
+    test()
diff --git a/tools/verify_latest.py b/tools/verify_latest.py
new file mode 100644
index 0000000000000000000000000000000000000000..fff94c40714a07613f3a512a510a7aee5074dbd4
--- /dev/null
+++ b/tools/verify_latest.py
@@ -0,0 +1,30 @@
+import requests
+import json
+import sys
+
+# Ensure UTF-8 output for Arabic characters
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except AttributeError:
+    pass  # Not available in all environments
+
+# One-shot check of the local /latest endpoint: print the status, the item
+# count and the first item; dump up to 500 characters of the body when the
+# list is empty or the status is not 200.
+try:
+    response = requests.get('http://localhost:8000/latest', timeout=30)
+    print(f"Status: {response.status_code}")
+    if response.status_code != 200:
+        print(f"Error body: {response.text[:500]}")
+    else:
+        items = response.json()
+        print(f"Items found: {len(items)}")
+        if not items:
+            print("Response body:")
+            print(response.text[:500])
+        else:
+            first = items[0]
+            print(f"First item title: {first.get('title')}")
+            print(f"First item ID: {first.get('id')}")
+except Exception as exc:
+    print(f"Request failed: {exc}")