Mina committed on
Commit ·
25ae7fe
0
Parent(s):
Fresh deploy without large files
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
- .gitignore +17 -0
- Dockerfile +35 -0
- Dockerfile.hf +35 -0
- Procfile +1 -0
- README.hf.md +33 -0
- README.md +30 -0
- database.py +48 -0
- deploy/.dockerignore +30 -0
- deploy/Dockerfile +50 -0
- deploy/cloudflare-worker.js +77 -0
- deploy/render.yaml +18 -0
- downloader.py +145 -0
- flaresolverr/bottle_plugins/__init__.py +0 -0
- flaresolverr/bottle_plugins/error_plugin.py +22 -0
- flaresolverr/bottle_plugins/logger_plugin.py +23 -0
- flaresolverr/bottle_plugins/prometheus_plugin.py +66 -0
- flaresolverr/build_package.py +126 -0
- flaresolverr/dtos.py +94 -0
- flaresolverr/flaresolverr.py +155 -0
- flaresolverr/flaresolverr_service.py +519 -0
- flaresolverr/metrics.py +32 -0
- flaresolverr/sessions.py +84 -0
- flaresolverr/tests.py +655 -0
- flaresolverr/tests_sites.py +102 -0
- flaresolverr/undetected_chromedriver/__init__.py +910 -0
- flaresolverr/undetected_chromedriver/cdp.py +112 -0
- flaresolverr/undetected_chromedriver/devtool.py +193 -0
- flaresolverr/undetected_chromedriver/dprocess.py +77 -0
- flaresolverr/undetected_chromedriver/options.py +85 -0
- flaresolverr/undetected_chromedriver/patcher.py +473 -0
- flaresolverr/undetected_chromedriver/reactor.py +99 -0
- flaresolverr/undetected_chromedriver/webelement.py +86 -0
- flaresolverr/utils.py +376 -0
- keep_alive.py +47 -0
- main.py +352 -0
- package.json +12 -0
- requirements.txt +14 -0
- scraper/engine.py +996 -0
- scraper/proxy_fetcher.py +66 -0
- start.sh +31 -0
- start_render.sh +22 -0
- tools/analyze_structure.py +36 -0
- tools/check_mirrors.py +34 -0
- tools/debug_fs.py +51 -0
- tools/debug_mirrors.py +35 -0
- tools/debug_scraper.py +27 -0
- tools/dump_html.py +25 -0
- tools/dump_html_v2.py +25 -0
- tools/extra/diagnose.py +27 -0
- tools/extra/expose_to_internet.bat +18 -0
.gitignore
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
venv/
|
| 2 |
+
__pycache__/
|
| 3 |
+
archive/
|
| 4 |
+
*.db
|
| 5 |
+
*.log
|
| 6 |
+
.env
|
| 7 |
+
.vscode/
|
| 8 |
+
.idea/
|
| 9 |
+
bin/
|
| 10 |
+
cache/
|
| 11 |
+
logs/
|
| 12 |
+
*.exe
|
| 13 |
+
*.img
|
| 14 |
+
dist/
|
| 15 |
+
node_modules/
|
| 16 |
+
.choreo/
|
| 17 |
+
TUNNEL_TOKEN.txt
|
Dockerfile
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
# Hugging Face Optimized - Lightweight & Stable
|
| 4 |
+
ENV PYTHONUNBUFFERED=1
|
| 5 |
+
ENV HF_SPACE=1
|
| 6 |
+
|
| 7 |
+
# Install minimal system dependencies
|
| 8 |
+
RUN apt-get update && apt-get install -y \
|
| 9 |
+
curl \
|
| 10 |
+
ffmpeg \
|
| 11 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 12 |
+
|
| 13 |
+
WORKDIR /app
|
| 14 |
+
|
| 15 |
+
# Copy requirements and install
|
| 16 |
+
COPY requirements.txt .
|
| 17 |
+
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
| 18 |
+
|
| 19 |
+
# Create a non-privileged user (Required by Hugging Face)
|
| 20 |
+
RUN useradd -m -u 1000 user
|
| 21 |
+
RUN chown -R user:user /app
|
| 22 |
+
USER user
|
| 23 |
+
ENV HOME=/home/user \
|
| 24 |
+
PATH=/home/user/.local/bin:$PATH
|
| 25 |
+
|
| 26 |
+
# Copy application code
|
| 27 |
+
COPY --chown=user:user . .
|
| 28 |
+
|
| 29 |
+
# Hugging Face uses port 7860
|
| 30 |
+
EXPOSE 7860
|
| 31 |
+
ENV PORT=7860
|
| 32 |
+
|
| 33 |
+
# Start the application with optimized settings for limited RAM
|
| 34 |
+
# We use 1 worker to keep memory usage low on the free tier
|
| 35 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--timeout-keep-alive", "60"]
|
Dockerfile.hf
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
# Hugging Face optimized - Lightweight without Chrome
|
| 4 |
+
ENV PYTHONUNBUFFERED=1
|
| 5 |
+
ENV SPACE_ID=huggingface
|
| 6 |
+
ENV HF_SPACE=1
|
| 7 |
+
|
| 8 |
+
# Install minimal dependencies
|
| 9 |
+
RUN apt-get update && apt-get install -y \
|
| 10 |
+
curl \
|
| 11 |
+
git \
|
| 12 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 13 |
+
|
| 14 |
+
WORKDIR /app
|
| 15 |
+
|
| 16 |
+
# Copy and install Python dependencies
|
| 17 |
+
COPY requirements.txt .
|
| 18 |
+
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
| 19 |
+
|
| 20 |
+
# Create user for Hugging Face
|
| 21 |
+
RUN useradd -m -u 1000 user
|
| 22 |
+
RUN chown -R user:user /app
|
| 23 |
+
USER user
|
| 24 |
+
ENV HOME=/home/user \
|
| 25 |
+
PATH=/home/user/.local/bin:$PATH
|
| 26 |
+
|
| 27 |
+
# Copy application
|
| 28 |
+
COPY --chown=user:user . .
|
| 29 |
+
|
| 30 |
+
# Hugging Face uses port 7860
|
| 31 |
+
EXPOSE 7860
|
| 32 |
+
ENV PORT=7860
|
| 33 |
+
|
| 34 |
+
# Start without FlareSolverr (too heavy for HF)
|
| 35 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
|
Procfile
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
web: uvicorn main:app --host 0.0.0.0 --port $PORT --log-level info
|
README.hf.md
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: MEIH Movies API
|
| 3 |
+
emoji: 🎬
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: gray
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_file: main.py
|
| 8 |
+
pinned: false
|
| 9 |
+
license: mit
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# MEIH Movies API - Hugging Face Edition
|
| 13 |
+
|
| 14 |
+
High-performance movie streaming API optimized for Hugging Face Spaces.
|
| 15 |
+
|
| 16 |
+
## Features
|
| 17 |
+
|
| 18 |
+
- Fast content scraping with curl-cffi
|
| 19 |
+
- Intelligent caching system
|
| 20 |
+
- Rate limiting for stability
|
| 21 |
+
- Proxy rotation support
|
| 22 |
+
|
| 23 |
+
## API Endpoints
|
| 24 |
+
|
| 25 |
+
- `GET /latest` - Latest movies and series
|
| 26 |
+
- `GET /category/{cat_id}` - Browse by category
|
| 27 |
+
- `GET /search?q={query}` - Search content
|
| 28 |
+
- `GET /details/{id}` - Get streaming links
|
| 29 |
+
- `GET /health` - Health check
|
| 30 |
+
|
| 31 |
+
## Usage
|
| 32 |
+
|
| 33 |
+
Visit the API at: `https://YOUR-SPACE-NAME.hf.space/`
|
README.md
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Meih Movies API
|
| 3 |
+
emoji: 🎬
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: gray
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
# MEIH Movies API - Hugging Face Edition
|
| 11 |
+
|
| 12 |
+
High-performance movie streaming API optimized for Hugging Face Spaces.
|
| 13 |
+
|
| 14 |
+
## Features
|
| 15 |
+
|
| 16 |
+
- **Lightweight**: Optimized for 16GB RAM environments.
|
| 17 |
+
- **Fast**: Powered by `curl-cffi` for high-speed scraping.
|
| 18 |
+
- **Stable**: Automatic proxy rotation and intelligent caching.
|
| 19 |
+
- **Universal**: Serves both API and Frontend (if built).
|
| 20 |
+
|
| 21 |
+
## API Endpoints
|
| 22 |
+
|
| 23 |
+
- `GET /latest` - Latest movies and series.
|
| 24 |
+
- `GET /search?q={query}` - Search content.
|
| 25 |
+
- `GET /details/{id}` - Get streaming links.
|
| 26 |
+
- `GET /health` - System status.
|
| 27 |
+
|
| 28 |
+
## Deployment Note
|
| 29 |
+
|
| 30 |
+
This project is configured to run on port **7860**. Ensure your Space is set to **Docker** SDK.
|
database.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import aiosqlite
|
| 2 |
+
import logging
|
| 3 |
+
|
| 4 |
+
DB_NAME = "netflix_clone.db"
|
| 5 |
+
|
| 6 |
+
async def init_db():
    """Create the ``movies``, ``series`` and ``episodes`` tables if missing.

    Opens a connection to ``DB_NAME``, executes the three
    ``CREATE TABLE IF NOT EXISTS`` statements in order (``series`` before
    ``episodes``, which references it) and commits once at the end.
    """
    schema_statements = (
        # Movies Table
        """
        CREATE TABLE IF NOT EXISTS movies (
            id TEXT PRIMARY KEY,
            title TEXT,
            poster TEXT,
            year TEXT,
            rating TEXT,
            description TEXT,
            category TEXT
        )
        """,
        # Series Table
        """
        CREATE TABLE IF NOT EXISTS series (
            id TEXT PRIMARY KEY,
            title TEXT,
            poster TEXT,
            year TEXT,
            rating TEXT,
            description TEXT,
            category TEXT
        )
        """,
        # Episodes Table
        """
        CREATE TABLE IF NOT EXISTS episodes (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            series_id TEXT,
            episode_number INTEGER,
            title TEXT,
            watch_link TEXT,
            FOREIGN KEY(series_id) REFERENCES series(id)
        )
        """,
    )

    async with aiosqlite.connect(DB_NAME) as connection:
        for ddl in schema_statements:
            await connection.execute(ddl)
        await connection.commit()
|
| 44 |
+
|
| 45 |
+
async def get_db_connection():
    """Open and return a new connection to ``DB_NAME``.

    The connection's ``row_factory`` is set to ``aiosqlite.Row``. The
    connection is returned open; this function never closes it, so the
    caller has to.
    """
    db = await aiosqlite.connect(DB_NAME)
    db.row_factory = aiosqlite.Row
    return db
|
deploy/.dockerignore
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python ignore
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
venv/
|
| 6 |
+
.env
|
| 7 |
+
netflix_clone.db
|
| 8 |
+
archive/
|
| 9 |
+
tools/
|
| 10 |
+
|
| 11 |
+
# Node ignore
|
| 12 |
+
node_modules/
|
| 13 |
+
dist/
|
| 14 |
+
build/
|
| 15 |
+
.next/
|
| 16 |
+
.vite/
|
| 17 |
+
|
| 18 |
+
# Git ignore
|
| 19 |
+
.git/
|
| 20 |
+
.gitignore
|
| 21 |
+
|
| 22 |
+
# OS ignore
|
| 23 |
+
.DS_Store
|
| 24 |
+
Thumbs.db
|
| 25 |
+
|
| 26 |
+
# Project ignore
|
| 27 |
+
setup_and_run.bat
|
| 28 |
+
*.md
|
| 29 |
+
.gemini/
|
| 30 |
+
.agent/
|
deploy/Dockerfile
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==========================================
|
| 2 |
+
# Nitro Backend-Only Dockerfile for Hugging Face
|
| 3 |
+
# ==========================================
|
| 4 |
+
FROM python:3.11-slim
|
| 5 |
+
|
| 6 |
+
# Install system dependencies for Scraper (Chrome) and FlareSolverr
|
| 7 |
+
ENV DEBIAN_FRONTEND=noninteractive
|
| 8 |
+
RUN apt-get update && apt-get install -y \
|
| 9 |
+
ffmpeg \
|
| 10 |
+
curl \
|
| 11 |
+
git \
|
| 12 |
+
wget \
|
| 13 |
+
gnupg \
|
| 14 |
+
xvfb \
|
| 15 |
+
xauth \
|
| 16 |
+
dos2unix \
|
| 17 |
+
libnss3 \
|
| 18 |
+
libatk-bridge2.0-0 \
|
| 19 |
+
libgtk-3-0 \
|
| 20 |
+
&& wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /usr/share/keyrings/google-chrome.gpg \
|
| 21 |
+
&& echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \
|
| 22 |
+
&& apt-get update && apt-get install -y google-chrome-stable \
|
| 23 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 24 |
+
|
| 25 |
+
WORKDIR /app
|
| 26 |
+
|
| 27 |
+
# Install Backend Dependencies
|
| 28 |
+
COPY backend/requirements.txt ./
|
| 29 |
+
RUN pip install --no-cache-dir --upgrade pip && \
|
| 30 |
+
pip install --no-cache-dir -r requirements.txt
|
| 31 |
+
|
| 32 |
+
# Copy Backend Application
|
| 33 |
+
COPY backend/ ./
|
| 34 |
+
|
| 35 |
+
# Fix line endings and permissions
|
| 36 |
+
RUN dos2unix start.sh && chmod +x start.sh
|
| 37 |
+
|
| 38 |
+
# Create local user for Hugging Face Spaces (UID 1000)
|
| 39 |
+
RUN useradd -m -u 1000 user
|
| 40 |
+
RUN chown -R user:user /app
|
| 41 |
+
USER user
|
| 42 |
+
ENV HOME=/home/user \
|
| 43 |
+
PATH=/home/user/.local/bin:$PATH \
|
| 44 |
+
PYTHONPATH=/app
|
| 45 |
+
|
| 46 |
+
# Expose the mandatory Hugging Face Space port
|
| 47 |
+
EXPOSE 7860
|
| 48 |
+
|
| 49 |
+
# Kickstart the engine
|
| 50 |
+
CMD ["/bin/bash", "./start.sh"]
|
deploy/cloudflare-worker.js
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Cloudflare Worker - Proxy Bypass for Larooza Scraper
|
| 3 |
+
* Deploy this to Cloudflare Workers (100% FREE)
|
| 4 |
+
*
|
| 5 |
+
* This worker acts as a middle-man to bypass IP bans
|
| 6 |
+
*/
|
| 7 |
+
|
| 8 |
+
// Register the fetch handler: every incoming request is answered with
// whatever handleRequest() resolves to.
addEventListener('fetch', event => {
  event.respondWith(handleRequest(event.request))
})
|
| 11 |
+
|
| 12 |
+
/**
 * Proxy a GET for the page named by the `url` query parameter.
 *
 * - OPTIONS requests get an empty response carrying the CORS headers.
 * - A missing, malformed or non-http(s) `url` parameter yields a 400 JSON error.
 * - Otherwise the target is fetched with browser-like headers and its body is
 *   returned as `text/html` with the upstream status code and CORS headers.
 * - A failure while fetching yields a 500 JSON error with the error message.
 *
 * @param {Request} request incoming request received by the Worker
 * @returns {Promise<Response>} the proxied page or a JSON error
 */
async function handleRequest(request) {
  // Enable CORS
  const corsHeaders = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET, POST, OPTIONS',
    'Access-Control-Allow-Headers': 'Content-Type',
  }

  // Single place that builds JSON error responses so every error path
  // carries the same headers.
  const jsonError = (status, payload) =>
    new Response(JSON.stringify(payload), {
      status,
      headers: { ...corsHeaders, 'Content-Type': 'application/json' }
    })

  // Handle CORS preflight
  if (request.method === 'OPTIONS') {
    return new Response(null, { headers: corsHeaders })
  }

  // Get target URL from query parameter
  const targetUrl = new URL(request.url).searchParams.get('url')

  if (!targetUrl) {
    return jsonError(400, { error: 'Missing url parameter' })
  }

  // Reject anything that is not an absolute http(s) URL before fetching:
  // this is a client error, not an upstream failure, so it must not be
  // reported as a 500.
  let parsedTarget
  try {
    parsedTarget = new URL(targetUrl)
  } catch (parseError) {
    return jsonError(400, { error: 'Invalid url parameter' })
  }
  if (parsedTarget.protocol !== 'http:' && parsedTarget.protocol !== 'https:') {
    return jsonError(400, { error: 'Only http and https URLs are supported' })
  }

  try {
    // Fetch the target URL with realistic headers
    const response = await fetch(targetUrl, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'ar,en-US;q=0.9,en;q=0.8',
        'Referer': 'https://www.google.com/',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
      },
      cf: {
        // Cloudflare-specific options
        cacheTtl: 300, // Cache for 5 minutes
        cacheEverything: true,
      }
    })

    // Get the HTML content
    const html = await response.text()

    // Return with CORS headers, keeping the upstream status code
    return new Response(html, {
      status: response.status,
      headers: {
        ...corsHeaders,
        'Content-Type': 'text/html; charset=utf-8',
        'Cache-Control': 'public, max-age=300',
      }
    })
  } catch (error) {
    return jsonError(500, {
      error: 'Failed to fetch target URL',
      message: error.message
    })
  }
}
|
deploy/render.yaml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Render.com Deployment Configuration
|
| 2 |
+
# https://render.com
|
| 3 |
+
|
| 4 |
+
services:
|
| 5 |
+
- type: web
|
| 6 |
+
name: meih-movies-api
|
| 7 |
+
env: docker
|
| 8 |
+
dockerfilePath: ./Dockerfile
|
| 9 |
+
dockerContext: ./backend
|
| 10 |
+
plan: free
|
| 11 |
+
region: oregon
|
| 12 |
+
envVars:
|
| 13 |
+
- key: PYTHON_VERSION
|
| 14 |
+
value: 3.11
|
| 15 |
+
- key: PORT
|
| 16 |
+
value: 7860
|
| 17 |
+
healthCheckPath: /health
|
| 18 |
+
autoDeploy: true
|
downloader.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import yt_dlp
|
| 2 |
+
import logging
|
| 3 |
+
import asyncio
|
| 4 |
+
|
| 5 |
+
logger = logging.getLogger(__name__)
|
| 6 |
+
|
| 7 |
+
class VideoDownloader:
    """Resolve a video page URL into metadata and direct download links.

    Larooza links, and local ``/watch/`` or ``/details/`` links whose last
    path segment is a base64-encoded URL, are routed to the project scraper
    first. Anything the scraper path does not answer goes through yt-dlp.
    """

    # Substrings that identify a Larooza URL.
    _LAROOZA_MARKERS = ('larozavideo', 'larooza', 'laroza')
    # Path fragments that identify a local link wrapping an encoded URL.
    _LOCAL_LINK_MARKERS = ("/watch/", "/details/")
    # Upper bound, in seconds, for one yt-dlp metadata extraction.
    _EXTRACT_TIMEOUT = 30.0

    def __init__(self):
        self.ydl_opts = {
            'quiet': True,
            'no_warnings': True,
            'format': 'best',
            # yt-dlp reads request headers from 'http_headers'; a top-level
            # 'user_agent' key is not a YoutubeDL option and is ignored.
            'http_headers': {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            },
            'geo_bypass': True,
            # The option is spelled 'noplaylist'; 'no_playlist' is ignored.
            'noplaylist': True,
            'nocheckcertificate': True,
        }

    async def get_info(self, url: str):
        """Return metadata and download formats for ``url``.

        Args:
            url: A local watch/details link, a Larooza link, or any URL
                yt-dlp can handle.

        Returns:
            On success, a dict with ``title``, ``thumbnail``, ``duration``,
            ``uploader``, ``source`` and ``formats``. On failure, a dict with
            a single ``error`` key holding a user-facing message.
        """
        # 1. Handle Local Watch/Details Links or Direct Larooza Links
        if self._is_local_link(url) or self._is_larooza(url):
            larooza_result = await self._larooza_info(url)
            if larooza_result is not None:
                return larooza_result

        # 2. Universal yt-dlp Path (YouTube, TikTok, etc.)
        return await self._universal_info(url)

    @classmethod
    def _is_larooza(cls, url):
        """Tell whether ``url`` contains one of the Larooza markers."""
        return any(marker in url for marker in cls._LAROOZA_MARKERS)

    @classmethod
    def _is_local_link(cls, url):
        """Tell whether ``url`` is a local watch/details link."""
        return any(marker in url for marker in cls._LOCAL_LINK_MARKERS)

    async def _larooza_info(self, url):
        """Try the scraper path; return ``None`` to fall back to yt-dlp.

        ``None`` is returned when the (decoded) URL is not a Larooza link,
        when the scraper returns nothing, or when any step raises.
        """
        try:
            # Function-local imports, as in the original code (presumably to
            # avoid an import cycle with the scraper package - unverified).
            from scraper.engine import scraper
            import base64

            target_url = url
            if self._is_local_link(url):
                id_part = url.split("/")[-1].split("?")[0]
                if not id_part.startswith("http"):
                    # Restore any '=' padding stripped from the id so the
                    # decode does not fail on otherwise valid input.
                    padded = id_part + "=" * (-len(id_part) % 4)
                    target_url = base64.urlsafe_b64decode(padded).decode()

            # If it's a Larooza link (direct or decoded), use scraper
            if not self._is_larooza(target_url):
                return None

            logger.info(f"Routing Larooza link to scraper: {target_url}")
            # Normalize: downloader works better with the video.php page
            target_url = target_url.replace('play.php', 'video.php').replace('download.php', 'video.php')

            safe_id = base64.urlsafe_b64encode(target_url.encode()).decode()
            data = await scraper.fetch_details(safe_id)
            if not data:
                return None

            download_links = data.get('download_links')
            if not download_links:
                return {"error": "لم يتم العثور على روابط تحميل لهذا الفيديو (ربما محمي أو غير متاح حالياً)."}

            return {
                'title': data.get('title'),
                'thumbnail': data.get('poster'),
                'duration': 0,
                'uploader': 'Larooza',
                'source': 'Larooza',
                'formats': [
                    {
                        'ext': 'mp4',
                        'resolution': dl['quality'],
                        'url': dl['url'],
                        'type': 'video',
                    }
                    for dl in download_links
                ],
            }
        except Exception as e:
            # Deliberate best-effort: any failure here falls back to yt-dlp.
            logger.error(f"Larooza-specific extraction failed: {e}")
            return None

    async def _universal_info(self, url):
        """Extract metadata with yt-dlp in a worker thread, with a timeout."""
        try:
            loop = asyncio.get_running_loop()
            # Use a more robust extraction with a timeout. The timeout only
            # stops waiting; the worker thread itself is not interrupted.
            try:
                info = await asyncio.wait_for(
                    loop.run_in_executor(None, self._extract, url),
                    timeout=self._EXTRACT_TIMEOUT,
                )
            except asyncio.TimeoutError:
                logger.error(f"Timeout extracting info for {url}")
                return {"error": "استغرق استخراج البيانات وقتاً طويلاً. حاول مرة أخرى."}

            if not info:
                return {"error": "فشل في استخراج بيانات الفيديو. تأكد من الرابط."}

            # Live stream check
            if info.get('is_live') or info.get('live_status') == 'is_upcoming':
                return {"error": "هذا الفيديو لم يبدأ عرضه بعد أو هو بث مباشر حالياً."}

            formats = self._build_formats(info)
            if not formats:
                return {"error": "لم يتم العثور على روابط تحميل مباشرة مدعومة لهذا الفيديو."}

            return {
                'title': info.get('title', 'Video'),
                'thumbnail': info.get('thumbnail', ''),
                'duration': info.get('duration', 0),
                'uploader': info.get('uploader', 'Unknown'),
                'source': info.get('extractor_key', 'Unknown'),
                'formats': formats[::-1]
            }
        except Exception as e:
            logger.error(f"Universal Downloader error for {url}: {e}")
            return {"error": f"حدث خطأ غير متوقع: {str(e)}"}

    @staticmethod
    def _build_formats(info):
        """Turn a yt-dlp info dict into de-duplicated direct-download formats.

        Manifest URLs (``.m3u8`` / ``.mpd``) and entries without a URL are
        skipped. Only the first entry per (resolution, has-video) pair is kept.
        """
        # 'formats' may be missing or None; both are treated as empty.
        raw_formats = info.get('formats') or []
        if not raw_formats and info.get('url'):
            raw_formats = [info]  # For direct links

        formats = []
        seen_resolutions = set()
        for f in raw_formats:
            if not f:
                continue
            # Filter out formats without a direct URL or those that are just manifests
            f_url = f.get('url')
            if not f_url or '.m3u8' in f_url or '.mpd' in f_url:
                continue

            res = f.get('resolution') or f.get('format_note') or f.get('height') or 'Unknown'
            # Clean resolution label
            if isinstance(res, int):
                res = f"{res}p"

            has_video = f.get('vcodec') != 'none'
            # Avoid duplicates and prioritize video formats
            res_key = f"{res}_{has_video}"
            if res_key in seen_resolutions:
                continue
            seen_resolutions.add(res_key)

            formats.append({
                'id': f.get('format_id', 'unknown'),
                'ext': f.get('ext', 'mp4'),
                'resolution': res,
                'filesize': f.get('filesize') or f.get('filesize_approx') or 0,
                'url': f_url,
                'type': 'video' if has_video else 'audio'
            })
        return formats

    def _extract(self, url):
        """Run yt-dlp metadata extraction (no download) for ``url``.

        Blocking; ``get_info`` calls it from an executor thread. With
        ``ignoreerrors`` set, yt-dlp may return ``None`` instead of raising.
        """
        opts = self.ydl_opts.copy()
        # Add extra robustness for TikTok and newer sites
        opts.update({
            'nocheckcertificate': True,
            'ignoreerrors': True,
            'socket_timeout': 15,
        })
        with yt_dlp.YoutubeDL(opts) as ydl:
            return ydl.extract_info(url, download=False)


downloader = VideoDownloader()
|
flaresolverr/bottle_plugins/__init__.py
ADDED
|
File without changes
|
flaresolverr/bottle_plugins/error_plugin.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bottle import response
|
| 2 |
+
import logging
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def error_plugin(callback):
    """
    Bottle plugin that turns an unhandled exception into a JSON error body.
    https://stackoverflow.com/a/32764250

    When the wrapped callback raises, the error text is logged, the response
    status is set to 500 and ``{"error": <text>}`` is returned in its place.
    """

    def wrapper(*args, **kwargs):
        try:
            return callback(*args, **kwargs)
        except Exception as exc:
            error_text = str(exc)
            logging.error(error_text)
            response.status = 500
            return {"error": error_text}

    return wrapper
|
flaresolverr/bottle_plugins/logger_plugin.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bottle import request, response
|
| 2 |
+
import logging
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def logger_plugin(callback):
    """
    Bottle plugin that writes an access-log line through the logging module.
    https://bottlepy.org/docs/dev/plugindev.html

    The line (remote address, method, URL, response status) is emitted after
    the wrapped callback has run. Requests whose URL ends in "/health" are
    not logged.
    (This decorator can be extended to take the desired logger as a param.)
    """

    def wrapper(*args, **kwargs):
        result = callback(*args, **kwargs)
        if request.url.endswith("/health"):
            return result
        logging.info('%s %s %s %s',
                     request.remote_addr,
                     request.method,
                     request.url,
                     response.status)
        return result

    return wrapper
|
flaresolverr/bottle_plugins/prometheus_plugin.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
import urllib.parse
|
| 4 |
+
|
| 5 |
+
from bottle import request
|
| 6 |
+
from dtos import V1RequestBase, V1ResponseBase
|
| 7 |
+
from metrics import start_metrics_http_server, REQUEST_COUNTER, REQUEST_DURATION
|
| 8 |
+
|
| 9 |
+
PROMETHEUS_ENABLED = os.environ.get('PROMETHEUS_ENABLED', 'false').lower() == 'true'
|
| 10 |
+
PROMETHEUS_PORT = int(os.environ.get('PROMETHEUS_PORT', 8192))
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def setup() -> None:
    """Start the metrics HTTP server on ``PROMETHEUS_PORT`` if metrics are enabled.

    Does nothing when ``PROMETHEUS_ENABLED`` is false.
    """
    if PROMETHEUS_ENABLED:
        start_metrics_http_server(PROMETHEUS_PORT)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def prometheus_plugin(callback):
    """
    Bottle plugin to expose Prometheus metrics
    https://bottlepy.org/docs/dev/plugindev.html

    After the wrapped callback returns, and only when ``PROMETHEUS_ENABLED``
    is true, the response is used to record a duration observation and to
    increment a result counter, both labelled by target domain. Failures
    while exporting are logged as warnings and never affect the response.
    """
    def wrapper(*args, **kwargs):
        actual_response = callback(*args, **kwargs)

        if PROMETHEUS_ENABLED:
            try:
                export_metrics(actual_response)
            except Exception as e:
                # Metrics are best-effort: never fail the request over them.
                logging.warning("Error exporting metrics: " + str(e))

        return actual_response

    def export_metrics(actual_response):
        """Record duration and result metrics for one handled response."""
        res = V1ResponseBase(actual_response)

        if res.startTimestamp is None or res.endTimestamp is None:
            # skip management and healthcheck endpoints
            return

        domain = "unknown"
        if res.solution and res.solution.url:
            domain = parse_domain_url(res.solution.url)
        else:
            # timeout error
            req = V1RequestBase(request.json)
            if req.url:
                domain = parse_domain_url(req.url)

        # Timestamps are divided by 1000 before being observed.
        run_time = (res.endTimestamp - res.startTimestamp) / 1000
        REQUEST_DURATION.labels(domain=domain).observe(run_time)

        # A missing message must not raise on .startswith(); otherwise the
        # counter below would be skipped for that response.
        message = res.message or ""
        result = "unknown"
        if message == "Challenge solved!":
            result = "solved"
        elif message == "Challenge not detected!":
            result = "not_detected"
        elif message.startswith("Error"):
            result = "error"
        REQUEST_COUNTER.labels(domain=domain, result=result).inc()

    def parse_domain_url(url):
        """Return the hostname of ``url``, or "unknown" if it has none."""
        parsed_url = urllib.parse.urlparse(url)
        return parsed_url.hostname or "unknown"

    return wrapper
|
flaresolverr/build_package.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import platform
|
| 3 |
+
import shutil
|
| 4 |
+
import subprocess
|
| 5 |
+
import sys
|
| 6 |
+
import zipfile
|
| 7 |
+
import tarfile
|
| 8 |
+
|
| 9 |
+
import requests
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def clean_files():
    """Remove the ``build``, ``dist`` and ``dist_chrome`` output folders.

    The folders are looked up in the parent of this file's directory.
    Removal is best-effort: a folder that is missing or cannot be deleted is
    skipped silently.
    """
    parent_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir)
    for folder_name in ('build', 'dist', 'dist_chrome'):
        try:
            shutil.rmtree(os.path.join(parent_dir, folder_name))
        except Exception:
            # Deliberately ignored: a leftover folder is not an error here.
            pass
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def download_chromium():
    """Download a pinned Chromium snapshot and unpack it into ``dist_chrome/chrome``.

    The archive for the current platform (Windows or Linux x64) is streamed
    to ``dist_chrome``, extracted, the zip is deleted and the extracted
    folder is renamed to ``chrome``. On non-Windows systems the known
    executables are then marked executable.

    Raises:
        FileExistsError: if ``dist_chrome`` already exists.
        requests.RequestException: if the download fails or times out.
    """
    # https://commondatastorage.googleapis.com/chromium-browser-snapshots/index.html?prefix=Linux_x64/
    # The same pinned revision is used on both platforms. To follow the latest
    # snapshot instead, read "{arch}/LAST_CHANGE" from the same bucket.
    revision = '1522586'
    is_windows = os.name == 'nt'
    arch = 'Win_x64' if is_windows else 'Linux_x64'
    dl_file = 'chrome-win' if is_windows else 'chrome-linux'
    dl_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'dist_chrome')
    dl_path_folder = os.path.join(dl_path, dl_file)
    dl_path_zip = dl_path_folder + '.zip'

    print("Downloading revision: " + revision)

    os.mkdir(dl_path)
    # Without a timeout a stalled connection would block the build forever.
    with requests.get(
            f'https://commondatastorage.googleapis.com/chromium-browser-snapshots/{arch}/{revision}/{dl_file}.zip',
            stream=True,
            timeout=30) as r:
        r.raise_for_status()
        with open(dl_path_zip, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    print("File downloaded: " + dl_path_zip)
    with zipfile.ZipFile(dl_path_zip, 'r') as zip_ref:
        zip_ref.extractall(dl_path)
    os.remove(dl_path_zip)

    chrome_path = os.path.join(dl_path, "chrome")
    shutil.move(dl_path_folder, chrome_path)
    print("Extracted in: " + chrome_path)

    if not is_windows:
        # Give executable permissions for *nix
        # file * | grep executable | cut -d: -f1
        print("Giving executable permissions...")
        execs = ['chrome', 'chrome_crashpad_handler', 'chrome_sandbox', 'chrome-wrapper', 'xdg-mime', 'xdg-settings']
        for exec_file in execs:
            exec_path = os.path.join(chrome_path, exec_file)
            os.chmod(exec_path, 0o755)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def run_pyinstaller():
    """Run PyInstaller on ``src/flaresolverr.py`` from the parent directory.

    Bundles ``package.json`` and the ``dist_chrome/chrome`` folder as data
    files. The child's stdout and stderr are captured.

    Raises:
        Exception: if PyInstaller exits with a non-zero status; the captured
            stderr is printed first.
    """
    # PyInstaller's --add-data uses the platform path-list separator
    # (';' on Windows, ':' elsewhere) between source and destination.
    data_sep = os.pathsep
    chrome_source = os.path.join('dist_chrome', 'chrome')
    command = [
        sys.executable, "-m", "PyInstaller",
        "--icon", "resources/flaresolverr_logo.ico",
        "--add-data", f"package.json{data_sep}.",
        "--add-data", f"{chrome_source}{data_sep}chrome",
        os.path.join("src", "flaresolverr.py"),
    ]
    completed = subprocess.run(command, cwd=os.pardir, capture_output=True)
    if completed.returncode == 0:
        return
    print(completed.stderr.decode('utf-8'))
    raise Exception("Error running pyInstaller")
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def compress_package():
    """Move the PyInstaller output into ``dist/package`` and archive it.

    On Windows a ``.zip`` is produced with ``shutil.make_archive``; on other
    platforms a ``.tar.gz`` is written by hand so that owner and group
    metadata can be cleared from every member. The path of the archive that
    was actually written is printed.
    """
    dist_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'dist')
    package_folder = os.path.join(dist_folder, 'package')
    shutil.move(os.path.join(dist_folder, 'flaresolverr'), os.path.join(package_folder, 'flaresolverr'))
    print("Package folder: " + package_folder)

    compr_format = 'zip' if os.name == 'nt' else 'gztar'
    compr_file_name = 'flaresolverr_windows_x64' if os.name == 'nt' else 'flaresolverr_linux_x64'
    compr_file_path = os.path.join(dist_folder, compr_file_name)

    if compr_format == 'zip':
        # make_archive appends the extension itself and returns the final
        # path, so report that instead of the extension-less base name.
        archive_path = shutil.make_archive(compr_file_path, compr_format, package_folder)
        print("Compressed file path: " + archive_path)
    else:
        def _reset_tarinfo(tarinfo):
            # Strip the building user's identity from each archive member.
            tarinfo.uid = 0
            tarinfo.gid = 0
            tarinfo.uname = ""
            tarinfo.gname = ""
            return tarinfo

        tar_path = compr_file_path + '.tar.gz'
        with tarfile.open(tar_path, 'w:gz') as tar:
            # Sort the entries so member order does not depend on the
            # directory listing order returned by the file system.
            for entry in sorted(os.listdir(package_folder)):
                fullpath = os.path.join(package_folder, entry)
                tar.add(fullpath, arcname=entry, filter=_reset_tarinfo)
        print("Compressed file path: " + tar_path)
|
| 109 |
+
|
| 110 |
+
if __name__ == "__main__":
    print("Building package...")
    print("Platform: " + platform.platform())

    # Each build stage is announced and then executed, strictly in this order.
    build_steps = (
        ("Cleaning previous build...", clean_files),
        ("Downloading Chromium...", download_chromium),
        ("Building pyinstaller executable... ", run_pyinstaller),
        ("Compressing package... ", compress_package),
    )
    for banner, step in build_steps:
        print(banner)
        step()
|
| 125 |
+
|
| 126 |
+
# NOTE: python -m pip install pyinstaller
|
flaresolverr/dtos.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
STATUS_OK = "ok"
|
| 3 |
+
STATUS_ERROR = "error"
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class ChallengeResolutionResultT:
    """Attribute bag holding the result fields of a challenge resolution.

    Every field defaults to ``None`` at class level; the constructor copies
    the entries of the given mapping onto the instance.
    """

    url: str = None
    status: int = None
    headers: list = None
    response: str = None
    cookies: list = None
    userAgent: str = None
    # Quoted so the union is not evaluated when the class is created: the
    # bare ``str | None`` expression raises TypeError before Python 3.10.
    screenshot: "str | None" = None
    turnstile_token: str = None

    def __init__(self, _dict):
        """Copy every key/value pair of ``_dict`` onto this instance."""
        self.__dict__.update(_dict)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class ChallengeResolutionT:
    """Outcome of a challenge resolution: status, message and result."""

    status: str = None
    message: str = None
    result: ChallengeResolutionResultT = None

    def __init__(self, _dict):
        """Populate the instance from ``_dict``.

        A non-``None`` ``result`` entry is wrapped in a
        ``ChallengeResolutionResultT``.
        """
        vars(self).update(_dict)
        raw_result = self.result
        if raw_result is None:
            return
        self.result = ChallengeResolutionResultT(raw_result)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class V1RequestBase:
    """Attribute bag for the body of a ``/v1`` request.

    All fields default to ``None``; the constructor overlays whatever keys
    the supplied mapping contains.
    """

    # --- V1RequestBase ---
    cmd: str = None
    cookies: list = None
    maxTimeout: int = None
    proxy: dict = None
    session: str = None
    session_ttl_minutes: int = None
    headers: list = None  # deprecated v2.0.0, not used
    userAgent: str = None  # deprecated v2.0.0, not used

    # --- V1Request ---
    url: str = None
    postData: str = None
    returnOnlyCookies: bool = None
    returnScreenshot: bool = None
    download: bool = None  # deprecated v2.0.0, not used
    returnRawHtml: bool = None  # deprecated v2.0.0, not used
    waitInSeconds: int = None
    # Optional flag to block resources (images, CSS and fonts).
    disableMedia: bool = None
    # Optional: for a turnstile captcha that has to be clicked after a given
    # number of Tab key presses.
    tabs_till_verify: int = None

    def __init__(self, _dict):
        """Copy every key/value pair of ``_dict`` onto this instance."""
        vars(self).update(_dict)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class V1ResponseBase:
    """Attribute bag for the body of a ``/v1`` response."""

    # --- V1ResponseBase ---
    status: str = None
    message: str = None
    session: str = None
    sessions: list[str] = None
    startTimestamp: int = None
    endTimestamp: int = None
    version: str = None

    # --- V1ResponseSolution ---
    solution: ChallengeResolutionResultT = None

    # --- hidden vars ---
    __error_500__: bool = False

    def __init__(self, _dict):
        """Populate the instance from ``_dict``.

        A non-``None`` ``solution`` entry is wrapped in a
        ``ChallengeResolutionResultT``.
        """
        vars(self).update(_dict)
        raw_solution = self.solution
        if raw_solution is None:
            return
        self.solution = ChallengeResolutionResultT(raw_solution)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class IndexResponse:
    """Attribute bag for the index endpoint: message, version, user agent."""

    msg: str = None
    version: str = None
    userAgent: str = None

    def __init__(self, _dict):
        """Copy every key/value pair of ``_dict`` onto this instance."""
        vars(self).update(_dict)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class HealthResponse:
    """Attribute bag for the health endpoint; carries only ``status``."""

    status: str = None

    def __init__(self, _dict):
        """Copy every key/value pair of ``_dict`` onto this instance."""
        vars(self).update(_dict)
|
flaresolverr/flaresolverr.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
|
| 6 |
+
import certifi
|
| 7 |
+
from bottle import run, response, Bottle, request, ServerAdapter
|
| 8 |
+
|
| 9 |
+
from bottle_plugins.error_plugin import error_plugin
|
| 10 |
+
from bottle_plugins.logger_plugin import logger_plugin
|
| 11 |
+
from bottle_plugins import prometheus_plugin
|
| 12 |
+
from dtos import V1RequestBase
|
| 13 |
+
import flaresolverr_service
|
| 14 |
+
import utils
|
| 15 |
+
|
| 16 |
+
env_proxy_url = os.environ.get('PROXY_URL', None)
|
| 17 |
+
env_proxy_username = os.environ.get('PROXY_USERNAME', None)
|
| 18 |
+
env_proxy_password = os.environ.get('PROXY_PASSWORD', None)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class JSONErrorBottle(Bottle):
    """
    Bottle application whose default error handler answers in JSON
    (used to handle 404 errors).
    """

    def default_error_handler(self, res):
        """Return a JSON document with the error body and its status code."""
        response.content_type = 'application/json'
        payload = {"error": res.body, "status_code": res.status_code}
        return json.dumps(payload)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
app = JSONErrorBottle()
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@app.route('/')
def index():
    """
    Show welcome message
    """
    return utils.object_to_dict(flaresolverr_service.index_endpoint())
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@app.route('/health')
def health():
    """
    Healthcheck endpoint.
    This endpoint is special because it doesn't print traces
    """
    return utils.object_to_dict(flaresolverr_service.health_endpoint())
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
@app.post('/v1')
def controller_v1():
    """
    Controller v1

    When the request carries no (or an empty) ``proxy`` entry and PROXY_URL
    is set in the environment, the proxy is taken from the environment, with
    credentials if PROXY_USERNAME or PROXY_PASSWORD is set. The response
    status becomes 500 when the service flags the result as an error.
    """
    data = request.json or {}

    # Environment proxy is only a fallback for requests without their own.
    if not data.get('proxy') and env_proxy_url is not None:
        if env_proxy_username is None and env_proxy_password is None:
            logging.info('Using proxy URL ENV')
            data['proxy'] = {"url": env_proxy_url}
        else:
            logging.info('Using proxy URL, username & password ENVs')
            data['proxy'] = {"url": env_proxy_url, "username": env_proxy_username, "password": env_proxy_password}

    res = flaresolverr_service.controller_v1_endpoint(V1RequestBase(data))
    if res.__error_500__:
        response.status = 500
    return utils.object_to_dict(res)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
if __name__ == "__main__":
    # check python version
    if sys.version_info < (3, 9):
        raise Exception("The Python version is less than 3.9, a version equal to or higher is required.")

    # fix for HEADLESS=false in Windows binary
    # https://stackoverflow.com/a/27694505
    if os.name == 'nt':
        import multiprocessing
        multiprocessing.freeze_support()

    # fix ssl certificates for compiled binaries
    # https://github.com/pyinstaller/pyinstaller/issues/7229
    # https://stackoverflow.com/q/55736855
    os.environ["REQUESTS_CA_BUNDLE"] = certifi.where()
    os.environ["SSL_CERT_FILE"] = certifi.where()

    # validate configuration
    log_level = os.environ.get('LOG_LEVEL', 'info').upper()
    log_file = os.environ.get('LOG_FILE', None)
    log_html = utils.get_config_log_html()
    headless = utils.get_config_headless()
    server_host = os.environ.get('HOST', '0.0.0.0')
    server_port = int(os.environ.get('PORT', 8191))

    # configure logger: debug output additionally carries the thread id
    if log_level == 'DEBUG':
        logger_format = '%(asctime)s %(levelname)-8s ReqId %(thread)s %(message)s'
    else:
        logger_format = '%(asctime)s %(levelname)-8s %(message)s'

    # Always log to stdout; also log to LOG_FILE when it is set, creating
    # its parent directory first.
    log_handlers = [logging.StreamHandler(sys.stdout)]
    if log_file:
        log_file = os.path.realpath(log_file)
        log_path = os.path.dirname(log_file)
        os.makedirs(log_path, exist_ok=True)
        log_handlers.append(logging.FileHandler(log_file))
    logging.basicConfig(
        format=logger_format,
        level=log_level,
        datefmt='%Y-%m-%d %H:%M:%S',
        handlers=log_handlers
    )

    # disable warning traces from urllib3
    logging.getLogger('urllib3').setLevel(logging.ERROR)
    logging.getLogger('selenium.webdriver.remote.remote_connection').setLevel(logging.WARNING)
    logging.getLogger('undetected_chromedriver').setLevel(logging.WARNING)

    logging.info(f'FlareSolverr {utils.get_flaresolverr_version()}')
    logging.debug('Debug log enabled')

    # Get current OS for global variable
    utils.get_current_platform()

    # test browser installation (can be skipped with SKIP_BROWSER_TEST=true)
    if os.environ.get('SKIP_BROWSER_TEST', 'false').lower() == 'true':
        logging.info("Skipping browser installation test for faster boot.")
    else:
        flaresolverr_service.test_browser_installation()

    # start bottle plugins
    # plugin order is important
    app.install(logger_plugin)
    app.install(error_plugin)
    prometheus_plugin.setup()
    app.install(prometheus_plugin.prometheus_plugin)

    # start webserver
    # default server 'wsgiref' does not support concurrent requests
    # https://github.com/FlareSolverr/FlareSolverr/issues/680
    # https://github.com/Pylons/waitress/issues/31
    class WaitressServerPoll(ServerAdapter):
        """Bottle server adapter that serves through waitress with poll enabled."""

        def run(self, handler):
            from waitress import serve
            serve(handler, host=self.host, port=self.port, asyncore_use_poll=True)

    run(app, host=server_host, port=server_port, quiet=True, server=WaitressServerPoll)
|
flaresolverr/flaresolverr_service.py
ADDED
|
@@ -0,0 +1,519 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import platform
|
| 3 |
+
import sys
|
| 4 |
+
import time
|
| 5 |
+
from datetime import timedelta
|
| 6 |
+
from html import escape
|
| 7 |
+
from urllib.parse import unquote, quote
|
| 8 |
+
|
| 9 |
+
from func_timeout import FunctionTimedOut, func_timeout
|
| 10 |
+
from selenium.common import TimeoutException
|
| 11 |
+
from selenium.webdriver.chrome.webdriver import WebDriver
|
| 12 |
+
from selenium.webdriver.common.by import By
|
| 13 |
+
from selenium.webdriver.common.keys import Keys
|
| 14 |
+
from selenium.webdriver.support.expected_conditions import (
|
| 15 |
+
presence_of_element_located, staleness_of, title_is)
|
| 16 |
+
from selenium.webdriver.common.action_chains import ActionChains
|
| 17 |
+
from selenium.webdriver.support.wait import WebDriverWait
|
| 18 |
+
|
| 19 |
+
import utils
|
| 20 |
+
from dtos import (STATUS_ERROR, STATUS_OK, ChallengeResolutionResultT,
|
| 21 |
+
ChallengeResolutionT, HealthResponse, IndexResponse,
|
| 22 |
+
V1RequestBase, V1ResponseBase)
|
| 23 |
+
from sessions import SessionsStorage
|
| 24 |
+
|
| 25 |
+
ACCESS_DENIED_TITLES = [
|
| 26 |
+
# Cloudflare
|
| 27 |
+
'Access denied',
|
| 28 |
+
# Cloudflare http://bitturk.net/ Firefox
|
| 29 |
+
'Attention Required! | Cloudflare'
|
| 30 |
+
]
|
| 31 |
+
ACCESS_DENIED_SELECTORS = [
|
| 32 |
+
# Cloudflare
|
| 33 |
+
'div.cf-error-title span.cf-code-label span',
|
| 34 |
+
# Cloudflare http://bitturk.net/ Firefox
|
| 35 |
+
'#cf-error-details div.cf-error-overview h1'
|
| 36 |
+
]
|
| 37 |
+
CHALLENGE_TITLES = [
|
| 38 |
+
# Cloudflare
|
| 39 |
+
'Just a moment...',
|
| 40 |
+
# DDoS-GUARD
|
| 41 |
+
'DDoS-Guard'
|
| 42 |
+
]
|
| 43 |
+
CHALLENGE_SELECTORS = [
|
| 44 |
+
# Cloudflare
|
| 45 |
+
'#cf-challenge-running', '.ray_id', '.attack-box', '#cf-please-wait', '#challenge-spinner', '#trk_jschal_js', '#turnstile-wrapper', '.lds-ring',
|
| 46 |
+
# Custom CloudFlare for EbookParadijs, Film-Paleis, MuziekFabriek and Puur-Hollands
|
| 47 |
+
'td.info #js_info',
|
| 48 |
+
# Fairlane / pararius.com
|
| 49 |
+
'div.vc div.text-box h2'
|
| 50 |
+
]
|
| 51 |
+
|
| 52 |
+
TURNSTILE_SELECTORS = [
|
| 53 |
+
"input[name='cf-turnstile-response']"
|
| 54 |
+
]
|
| 55 |
+
|
| 56 |
+
SHORT_TIMEOUT = 1
|
| 57 |
+
SESSIONS_STORAGE = SessionsStorage()
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def test_browser_installation():
    """Log the detected browser setup and exit the process if it is unusable.

    Calls ``sys.exit(1)`` when no Chrome / Chromium executable path is found
    or when its major version comes back as an empty string.
    """
    logging.info("Testing web browser installation...")
    logging.info("Platform: " + platform.platform())

    chrome_exe_path = utils.get_chrome_exe_path()
    if chrome_exe_path is None:
        logging.error("Chrome / Chromium web browser not installed!")
        sys.exit(1)
    logging.info("Chrome / Chromium path: " + chrome_exe_path)

    chrome_major_version = utils.get_chrome_major_version()
    if chrome_major_version == '':
        logging.error("Chrome / Chromium version not detected!")
        sys.exit(1)
    logging.info("Chrome / Chromium major version: " + chrome_major_version)

    logging.info("Launching web browser...")
    logging.info("FlareSolverr User-Agent: " + utils.get_user_agent())
    logging.info("Test successful!")
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def index_endpoint() -> IndexResponse:
    """Build the index response: ready message, version and user agent."""
    return IndexResponse({
        "msg": "FlareSolverr is ready!",
        "version": utils.get_flaresolverr_version(),
        "userAgent": utils.get_user_agent(),
    })
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def health_endpoint() -> HealthResponse:
    """Build the health response, whose status is always ``STATUS_OK``."""
    return HealthResponse({"status": STATUS_OK})
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def controller_v1_endpoint(req: V1RequestBase) -> V1ResponseBase:
    """Run a ``/v1`` request and always return a response object.

    Any exception raised by the handler is turned into an error response
    flagged with ``__error_500__``. Start and end timestamps (milliseconds)
    and the FlareSolverr version are stamped onto every response.
    """
    started_ms = int(time.time() * 1000)
    logging.info(f"Incoming request => POST /v1 body: {utils.object_to_dict(req)}")

    try:
        res = _controller_v1_handler(req)
    except Exception as exc:
        res = V1ResponseBase({
            "__error_500__": True,
            "status": STATUS_ERROR,
            "message": "Error: " + str(exc),
        })
        logging.error(res.message)

    res.startTimestamp = started_ms
    res.endTimestamp = int(time.time() * 1000)
    res.version = utils.get_flaresolverr_version()
    logging.debug(f"Response => POST /v1 body: {utils.object_to_dict(res)}")
    logging.info(f"Response in {(res.endTimestamp - res.startTimestamp) / 1000} s")
    return res
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def _controller_v1_handler(req: V1RequestBase) -> V1ResponseBase:
    """Validate a ``/v1`` request, apply defaults and dispatch on ``cmd``.

    Raises:
        Exception: if ``cmd`` is missing or is not a known command.
    """
    # do some validations
    if req.cmd is None:
        raise Exception("Request parameter 'cmd' is mandatory.")
    for removed in ('headers', 'userAgent'):
        if getattr(req, removed) is not None:
            logging.warning(f"Request parameter '{removed}' was removed in FlareSolverr v2.")

    # set default values: a missing or non-positive timeout becomes 60000
    if req.maxTimeout is None or int(req.maxTimeout) < 1:
        req.maxTimeout = 60000

    # execute the command: first matching entry wins
    commands = (
        ('sessions.create', _cmd_sessions_create),
        ('sessions.list', _cmd_sessions_list),
        ('sessions.destroy', _cmd_sessions_destroy),
        ('request.get', _cmd_request_get),
        ('request.post', _cmd_request_post),
    )
    for name, handler in commands:
        if req.cmd == name:
            return handler(req)

    raise Exception(f"Request parameter 'cmd' = '{req.cmd}' is invalid.")
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def _cmd_request_get(req: V1RequestBase) -> V1ResponseBase:
    """Handle the ``request.get`` command.

    Validates the request, resolves the challenge with the GET method and
    copies status, message and result into a new response.

    Raises:
        Exception: if ``url`` is missing or ``postData`` is supplied.
    """
    # do some validations
    if req.url is None:
        raise Exception("Request parameter 'url' is mandatory in 'request.get' command.")
    if req.postData is not None:
        # The request field is called 'postData'; name it correctly so the
        # caller can tell which parameter to drop.
        raise Exception("Cannot use 'postData' when sending a GET request.")
    if req.returnRawHtml is not None:
        logging.warning("Request parameter 'returnRawHtml' was removed in FlareSolverr v2.")
    if req.download is not None:
        logging.warning("Request parameter 'download' was removed in FlareSolverr v2.")

    challenge_res = _resolve_challenge(req, 'GET')
    # Assign the solution after construction: the constructor wraps a
    # non-None 'solution' entry, and the result here is already an object.
    res = V1ResponseBase({})
    res.status = challenge_res.status
    res.message = challenge_res.message
    res.solution = challenge_res.result
    return res
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def _cmd_request_post(req: V1RequestBase) -> V1ResponseBase:
    """Handle the ``request.post`` command.

    Validates the request, resolves the challenge with the POST method and
    copies status, message and result into a new response.

    Raises:
        Exception: if ``url`` or ``postData`` is missing.
    """
    # do some validations
    if req.url is None:
        # Same check as 'request.get': fail before a browser is acquired
        # rather than attempting a POST without a target URL.
        raise Exception("Request parameter 'url' is mandatory in 'request.post' command.")
    if req.postData is None:
        raise Exception("Request parameter 'postData' is mandatory in 'request.post' command.")
    if req.returnRawHtml is not None:
        logging.warning("Request parameter 'returnRawHtml' was removed in FlareSolverr v2.")
    if req.download is not None:
        logging.warning("Request parameter 'download' was removed in FlareSolverr v2.")

    challenge_res = _resolve_challenge(req, 'POST')
    # Assign the solution after construction: the constructor wraps a
    # non-None 'solution' entry, and the result here is already an object.
    res = V1ResponseBase({})
    res.status = challenge_res.status
    res.message = challenge_res.message
    res.solution = challenge_res.result
    return res
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def _cmd_sessions_create(req: V1RequestBase) -> V1ResponseBase:
    """Handle ``sessions.create`` and report whether the session is new."""
    logging.debug("Creating new session...")

    session, fresh = SESSIONS_STORAGE.create(session_id=req.session, proxy=req.proxy)

    if fresh:
        outcome = "Session created successfully."
    else:
        outcome = "Session already exists."

    return V1ResponseBase({
        "status": STATUS_OK,
        "message": outcome,
        "session": session.session_id
    })
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def _cmd_sessions_list(req: V1RequestBase) -> V1ResponseBase:
    """Handle ``sessions.list``: return the ids held by the session storage."""
    return V1ResponseBase({
        "status": STATUS_OK,
        "message": "",
        "sessions": SESSIONS_STORAGE.session_ids()
    })
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def _cmd_sessions_destroy(req: V1RequestBase) -> V1ResponseBase:
    """Handle ``sessions.destroy``.

    Raises:
        Exception: if the storage reports that the session did not exist.
    """
    if not SESSIONS_STORAGE.destroy(req.session):
        raise Exception("The session doesn't exist.")

    return V1ResponseBase({
        "status": STATUS_OK,
        "message": "The session has been removed."
    })
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def _resolve_challenge(req: V1RequestBase, method: str) -> ChallengeResolutionT:
    """Obtain a webdriver and run the resolution logic under a timeout.

    The driver comes from the named session when ``req.session`` is set,
    otherwise a new one is created and destroyed again before returning.
    ``req.maxTimeout`` (milliseconds) is converted to seconds for the timeout.

    Raises:
        Exception: on timeout or on any error, with the original message
            appended and its newlines escaped.
    """
    timeout = int(req.maxTimeout) / 1000
    driver = None
    try:
        if not req.session:
            driver = utils.get_webdriver(req.proxy)
            logging.debug('New instance of webdriver has been created to perform the request')
        else:
            session_id = req.session
            ttl = None
            if req.session_ttl_minutes:
                ttl = timedelta(minutes=req.session_ttl_minutes)
            session, fresh = SESSIONS_STORAGE.get(session_id, ttl)

            if not fresh:
                logging.debug(f"existing session is used to perform the request (session_id={session_id}, "
                              f"lifetime={str(session.lifetime())}, ttl={str(ttl)})")
            else:
                logging.debug(f"new session created to perform the request (session_id={session_id})")

            driver = session.driver
        return func_timeout(timeout, _evil_logic, (req, driver, method))
    except FunctionTimedOut:
        raise Exception(f'Error solving the challenge. Timeout after {timeout} seconds.')
    except Exception as e:
        raise Exception('Error solving the challenge. ' + str(e).replace('\n', '\\n'))
    finally:
        # Session drivers are left alone; only a one-off driver is torn down.
        if not req.session:
            if driver is not None:
                if utils.PLATFORM_VERSION == "nt":
                    driver.close()
                driver.quit()
                logging.debug('A used instance of webdriver has been destroyed')
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
def click_verify(driver: WebDriver, num_tabs: int = 1):
    """Try two ways of triggering the Cloudflare verification, best effort.

    First sends ``num_tabs`` Tab presses followed by Space through the
    keyboard, then looks for a 'Verify you are human' button and clicks it.
    Failures of either attempt are only logged at debug level. Sleeps two
    seconds before returning.
    """
    try:
        logging.debug("Try to find the Cloudflare verify checkbox...")
        keyboard = ActionChains(driver)
        keyboard.pause(5)
        for _ in range(num_tabs):
            keyboard.send_keys(Keys.TAB).pause(0.1)
        keyboard.pause(1)
        keyboard.send_keys(Keys.SPACE).perform()

        logging.debug(f"Cloudflare verify checkbox clicked after {num_tabs} tabs!")
    except Exception:
        logging.debug("Cloudflare verify checkbox not found on the page.")
    finally:
        driver.switch_to.default_content()

    verify_button_xpath = "//input[@type='button' and @value='Verify you are human']"
    try:
        logging.debug("Try to find the Cloudflare 'Verify you are human' button...")
        button = driver.find_element(by=By.XPATH, value=verify_button_xpath)
        if button:
            mouse = ActionChains(driver)
            mouse.move_to_element_with_offset(button, 5, 7)
            mouse.click(button)
            mouse.perform()
            logging.debug("The Cloudflare 'Verify you are human' button found and clicked!")
    except Exception:
        logging.debug("The Cloudflare 'Verify you are human' button not found on the page.")

    time.sleep(2)
|
| 293 |
+
|
| 294 |
+
def _get_turnstile_token(driver: WebDriver, tabs: int):
    """Repeat the verify click until the turnstile input holds a new value.

    Reads the ``cf-turnstile-response`` input before the first attempt and
    returns as soon as its value is non-empty and different from that
    initial value. There is no exit condition inside the loop other than
    success.
    """
    token_input = driver.find_element(By.CSS_SELECTOR, "input[name='cf-turnstile-response']")
    initial_value = token_input.get_attribute("value")
    while True:
        click_verify(driver, num_tabs=tabs)
        turnstile_token = token_input.get_attribute("value")
        if turnstile_token and turnstile_token != initial_value:
            logging.info(f"Turnstile token: {turnstile_token}")
            return turnstile_token
        logging.debug("Failed to extract token possibly click failed")

        # reset focus: insert a button at the top-left corner and focus it
        # before the next attempt
        driver.execute_script("""
            let el = document.createElement('button');
            el.style.position='fixed';
            el.style.top='0';
            el.style.left='0';
            document.body.prepend(el);
            el.focus();
        """)
        time.sleep(1)
|
| 316 |
+
|
| 317 |
+
def _resolve_turnstile_captcha(req: V1RequestBase, driver: WebDriver):
    """Open ``req.url`` and, if a turnstile input is present, fetch its token.

    Returns ``None`` when ``req.tabs_till_verify`` is not set or when none
    of ``TURNSTILE_SELECTORS`` matches an element on the page.
    """
    if req.tabs_till_verify is None:
        return None

    logging.debug(f'Navigating to... {req.url} in order to pass the turnstile challenge')
    driver.get(req.url)

    for selector in TURNSTILE_SELECTORS:
        if len(driver.find_elements(By.CSS_SELECTOR, selector)) > 0:
            logging.info("Turnstile challenge detected. Selector found: " + selector)
            break
    else:
        # The loop finished without a match.
        logging.debug('Turnstile challenge not found')
        return None

    return _get_turnstile_token(driver=driver, tabs=req.tabs_till_verify)
|
| 335 |
+
|
| 336 |
+
def _evil_logic(req: V1RequestBase, driver: WebDriver, method: str) -> ChallengeResolutionT:
    """Open the requested URL, wait for any detected challenge to clear and build the result.

    Steps, in order:
      1. optionally block image/stylesheet/font URLs through CDP;
      2. navigate: form POST when ``method`` is "POST", otherwise a plain GET, or
         the Turnstile flow when ``req.tabs_till_verify`` is set;
      3. apply ``req.cookies`` (if any) and load the page again;
      4. raise if the page matches an access-denied title or selector;
      5. if a challenge title or selector is present, wait until all of them are
         gone, calling ``click_verify`` after every timeout, then wait for the
         ``<html>`` element to go stale;
      6. collect URL, cookies, user agent, Turnstile token and - unless
         ``req.returnOnlyCookies`` is set - the page source.

    :param req: the request being served.
    :param driver: WebDriver used to load the page.
    :param method: "POST" for a form POST; any other value results in a GET.
    :return: resolution object whose ``result`` holds the collected page data.
    :raises Exception: when ACCESS_DENIED_TITLES or ACCESS_DENIED_SELECTORS match.
    """
    blocked_message = ('Cloudflare has blocked this request. '
                       'Probably your IP is banned for this site, check in your web browser.')

    res = ChallengeResolutionT({})
    res.status = STATUS_OK
    res.message = ""

    # optionally block resources like images/css/fonts using CDP;
    # a value set on the request overrides the configured default
    disable_media = utils.get_config_disable_media()
    if req.disableMedia is not None:
        disable_media = req.disableMedia
    if disable_media:
        block_urls = [
            # Images
            "*.png", "*.jpg", "*.jpeg", "*.gif", "*.webp", "*.bmp", "*.svg", "*.ico",
            "*.PNG", "*.JPG", "*.JPEG", "*.GIF", "*.WEBP", "*.BMP", "*.SVG", "*.ICO",
            "*.tiff", "*.tif", "*.jpe", "*.apng", "*.avif", "*.heic", "*.heif",
            "*.TIFF", "*.TIF", "*.JPE", "*.APNG", "*.AVIF", "*.HEIC", "*.HEIF",
            # Stylesheets
            "*.css",
            "*.CSS",
            # Fonts
            "*.woff", "*.woff2", "*.ttf", "*.otf", "*.eot",
            "*.WOFF", "*.WOFF2", "*.TTF", "*.OTF", "*.EOT"
        ]
        try:
            logging.debug("Network.setBlockedURLs: %s", block_urls)
            driver.execute_cdp_cmd("Network.enable", {})
            driver.execute_cdp_cmd("Network.setBlockedURLs", {"urls": block_urls})
        except Exception:
            # if CDP commands are not available or fail, ignore and continue
            logging.debug("Network.setBlockedURLs failed or unsupported on this webdriver")

    # navigate to the page
    logging.debug(f"Navigating to... {req.url}")
    turnstile_token = None
    if method == "POST":
        _post_request(req, driver)
    elif req.tabs_till_verify is None:
        driver.get(req.url)
    else:
        turnstile_token = _resolve_turnstile_captcha(req, driver)

    # set cookies if required
    if req.cookies is not None and len(req.cookies) > 0:
        logging.debug('Setting cookies...')
        for cookie in req.cookies:
            driver.delete_cookie(cookie['name'])
            driver.add_cookie(cookie)
        # reload the page
        if method == 'POST':
            _post_request(req, driver)
        else:
            driver.get(req.url)

    # wait for the page
    if utils.get_config_log_html():
        logging.debug(f"Response HTML:\n{driver.page_source}")
    html_element = driver.find_element(By.TAG_NAME, "html")
    page_title = driver.title

    # find access denied titles
    if any(page_title.startswith(denied_title) for denied_title in ACCESS_DENIED_TITLES):
        raise Exception(blocked_message)
    # find access denied selectors
    if any(len(driver.find_elements(By.CSS_SELECTOR, denied_selector)) > 0
           for denied_selector in ACCESS_DENIED_SELECTORS):
        raise Exception(blocked_message)

    # find challenge by title
    challenge_found = any(known_title.lower() == page_title.lower()
                          for known_title in CHALLENGE_TITLES)
    if challenge_found:
        logging.info("Challenge detected. Title found: " + page_title)
    else:
        # find challenge by selectors
        matched_selector = next(
            (candidate for candidate in CHALLENGE_SELECTORS
             if len(driver.find_elements(By.CSS_SELECTOR, candidate)) > 0),
            None,
        )
        if matched_selector is not None:
            challenge_found = True
            logging.info("Challenge detected. Selector found: " + matched_selector)

    attempt = 0
    if not challenge_found:
        logging.info("Challenge not detected!")
        res.message = "Challenge not detected!"
    else:
        challenge_gone = False
        while not challenge_gone:
            attempt += 1
            try:
                # wait until the title changes
                for title in CHALLENGE_TITLES:
                    logging.debug("Waiting for title (attempt " + str(attempt) + "): " + title)
                    WebDriverWait(driver, SHORT_TIMEOUT).until_not(title_is(title))

                # then wait until all the selectors disappear
                for selector in CHALLENGE_SELECTORS:
                    logging.debug("Waiting for selector (attempt " + str(attempt) + "): " + selector)
                    WebDriverWait(driver, SHORT_TIMEOUT).until_not(
                        presence_of_element_located((By.CSS_SELECTOR, selector)))

                # all elements not found
                challenge_gone = True
            except TimeoutException:
                logging.debug("Timeout waiting for selector")

                click_verify(driver)

                # update the html (cloudflare reloads the page every 5 s)
                html_element = driver.find_element(By.TAG_NAME, "html")

        # waits until cloudflare redirection ends
        logging.debug("Waiting for redirect")
        # noinspection PyBroadException
        try:
            WebDriverWait(driver, SHORT_TIMEOUT).until(staleness_of(html_element))
        except Exception:
            logging.debug("Timeout waiting for redirect")

        logging.info("Challenge solved!")
        res.message = "Challenge solved!"

    challenge_res = ChallengeResolutionResultT({})
    challenge_res.url = driver.current_url
    challenge_res.status = 200  # todo: fix, selenium not provides this info
    challenge_res.cookies = driver.get_cookies()
    challenge_res.userAgent = utils.get_user_agent(driver)
    challenge_res.turnstile_token = turnstile_token

    if not req.returnOnlyCookies:
        challenge_res.headers = {}  # todo: fix, selenium not provides this info

        # the optional delay happens before the page source is captured
        if req.waitInSeconds and req.waitInSeconds > 0:
            logging.info("Waiting " + str(req.waitInSeconds) + " seconds before returning the response...")
            time.sleep(req.waitInSeconds)

        challenge_res.response = driver.page_source

    if req.returnScreenshot:
        challenge_res.screenshot = driver.get_screenshot_as_base64()

    res.result = challenge_res
    return res
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
def _post_request(req: V1RequestBase, driver: WebDriver):
|
| 490 |
+
post_form = f'<form id="hackForm" action="{req.url}" method="POST">'
|
| 491 |
+
query_string = req.postData if req.postData and req.postData[0] != '?' else req.postData[1:] if req.postData else ''
|
| 492 |
+
pairs = query_string.split('&')
|
| 493 |
+
for pair in pairs:
|
| 494 |
+
parts = pair.split('=', 1)
|
| 495 |
+
# noinspection PyBroadException
|
| 496 |
+
try:
|
| 497 |
+
name = unquote(parts[0])
|
| 498 |
+
except Exception:
|
| 499 |
+
name = parts[0]
|
| 500 |
+
if name == 'submit':
|
| 501 |
+
continue
|
| 502 |
+
# noinspection PyBroadException
|
| 503 |
+
try:
|
| 504 |
+
value = unquote(parts[1]) if len(parts) > 1 else ''
|
| 505 |
+
except Exception:
|
| 506 |
+
value = parts[1] if len(parts) > 1 else ''
|
| 507 |
+
# Protection of " character, for syntax
|
| 508 |
+
value=value.replace('"','"')
|
| 509 |
+
post_form += f'<input type="text" name="{escape(quote(name))}" value="{escape(quote(value))}"><br>'
|
| 510 |
+
post_form += '</form>'
|
| 511 |
+
html_content = f"""
|
| 512 |
+
<!DOCTYPE html>
|
| 513 |
+
<html>
|
| 514 |
+
<body>
|
| 515 |
+
{post_form}
|
| 516 |
+
<script>document.getElementById('hackForm').submit();</script>
|
| 517 |
+
</body>
|
| 518 |
+
</html>"""
|
| 519 |
+
driver.get("data:text/html;charset=utf-8,{html_content}".format(html_content=html_content))
|
flaresolverr/metrics.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
|
| 3 |
+
from prometheus_client import Counter, Histogram, start_http_server
|
| 4 |
+
import time
|
| 5 |
+
|
| 6 |
+
# Counter of handled requests, labelled with the target domain and the result.
REQUEST_COUNTER = Counter(
    'flaresolverr_request',
    'Total requests with result',
    labelnames=['domain', 'result'],
)

# Histogram of request durations (seconds, per its documentation string),
# labelled with the target domain; bucket boundaries are 0, 10, 25 and 50.
REQUEST_DURATION = Histogram(
    'flaresolverr_request_duration',
    'Request duration in seconds',
    labelnames=['domain'],
    buckets=[0, 10, 25, 50],
)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def serve(port):
    """Start the Prometheus HTTP exporter on ``port`` and then block forever.

    The function never returns: after starting the exporter it sleeps in
    ten-minute intervals, so it is meant to be run in its own thread.
    """
    start_http_server(port=port)

    idle_seconds = 600
    while True:
        time.sleep(idle_seconds)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def start_metrics_http_server(prometheus_port: int):
    """Run ``serve`` for ``prometheus_port`` in a background daemon thread.

    Logs the exporter address and returns immediately after the thread is started.
    """
    from threading import Thread

    logging.info(f"Serving Prometheus exporter on http://0.0.0.0:{prometheus_port}/metrics")
    exporter_thread = Thread(target=serve, kwargs={"port": prometheus_port}, daemon=True)
    exporter_thread.start()
|
flaresolverr/sessions.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
from datetime import datetime, timedelta
|
| 4 |
+
from typing import Optional, Tuple
|
| 5 |
+
from uuid import uuid1
|
| 6 |
+
|
| 7 |
+
from selenium.webdriver.chrome.webdriver import WebDriver
|
| 8 |
+
|
| 9 |
+
import utils
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass
class Session:
    """A WebDriver instance stored under a session id together with its creation time."""

    session_id: str
    driver: WebDriver
    created_at: datetime

    def lifetime(self) -> timedelta:
        """Return the time elapsed between ``created_at`` and now."""
        now = datetime.now()
        return now - self.created_at
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class SessionsStorage:
    """SessionsStorage creates, stores and processes all the sessions."""

    def __init__(self):
        # maps session_id -> Session
        self.sessions = {}

    def create(self, session_id: Optional[str] = None, proxy: Optional[dict] = None,
               force_new: Optional[bool] = False) -> Tuple[Session, bool]:
        """Return the session stored under session_id, creating it if necessary.

        A new WebDriver is started (via ``utils.get_webdriver(proxy)``) only when
        no session with that id exists; when session_id is omitted a fresh uuid1
        string is used. With force_new set, an existing session with the same id
        is destroyed first and then recreated.

        Note: the function is idempotent, so if session_id already exists in the
        storage (and force_new is not set) no new WebDriver is created and the
        existing session is returned.

        :return: ``(session, created)`` where ``created`` is True if a new
            session has been created and False if an existing one was reused.
        """
        session_id = session_id or str(uuid1())

        if force_new:
            self.destroy(session_id)

        if self.exists(session_id):
            return self.sessions[session_id], False

        driver = utils.get_webdriver(proxy)
        created_at = datetime.now()
        session = Session(session_id, driver, created_at)

        self.sessions[session_id] = session

        return session, True

    def exists(self, session_id: str) -> bool:
        """Return True if a session is stored under session_id."""
        return session_id in self.sessions

    def destroy(self, session_id: str) -> bool:
        """Close the driver instance and remove the session from the storage.

        The function is a no-op if session_id doesn't exist.

        :return: True if the session was found and destroyed, False if
            session_id wasn't found.
        """
        if not self.exists(session_id):
            return False

        session = self.sessions.pop(session_id)
        try:
            if utils.PLATFORM_VERSION == "nt":
                session.driver.close()
        finally:
            # The session is already gone from the storage, so this is the last
            # chance to shut the driver down: quit even if close() raised.
            session.driver.quit()
        return True

    def get(self, session_id: str, ttl: Optional[timedelta] = None) -> Tuple[Session, bool]:
        """Return the session for session_id, recreating it when older than ttl.

        The session is obtained through ``create`` (without a proxy), so it is
        created on the fly when missing. If ttl is given and an already existing
        session has lived longer than ttl, it is destroyed and created again.

        :return: ``(session, fresh)`` where ``fresh`` is True if the returned
            session was created by this call.
        """
        session, fresh = self.create(session_id)

        if ttl is not None and not fresh and session.lifetime() > ttl:
            logging.debug(f'session\'s lifetime has expired, so the session is recreated (session_id={session_id})')
            session, fresh = self.create(session_id, force_new=True)

        return session, fresh

    def session_ids(self) -> list[str]:
        """Return the ids of all stored sessions."""
        return list(self.sessions.keys())
|
flaresolverr/tests.py
ADDED
|
@@ -0,0 +1,655 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
from typing import Optional
|
| 3 |
+
|
| 4 |
+
from webtest import TestApp
|
| 5 |
+
|
| 6 |
+
from dtos import IndexResponse, HealthResponse, V1ResponseBase, STATUS_OK, STATUS_ERROR
|
| 7 |
+
import flaresolverr
|
| 8 |
+
import utils
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _find_obj_by_key(key: str, value: str, _list: list) -> Optional[dict]:
|
| 12 |
+
for obj in _list:
|
| 13 |
+
if obj[key] == value:
|
| 14 |
+
return obj
|
| 15 |
+
return None
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class TestFlareSolverr(unittest.TestCase):
|
| 19 |
+
|
| 20 |
+
proxy_url = "http://127.0.0.1:8888"
|
| 21 |
+
proxy_socks_url = "socks5://127.0.0.1:1080"
|
| 22 |
+
google_url = "https://www.google.com"
|
| 23 |
+
post_url = "https://httpbin.org/post"
|
| 24 |
+
cloudflare_url = "https://nowsecure.nl/"
|
| 25 |
+
cloudflare_url_2 = "https://idope.se/torrent-list/harry/"
|
| 26 |
+
ddos_guard_url = "https://www.litres.ru/"
|
| 27 |
+
fairlane_url = "https://www.pararius.com/apartments/amsterdam"
|
| 28 |
+
custom_cloudflare_url = "https://www.muziekfabriek.org/"
|
| 29 |
+
cloudflare_blocked_url = "https://cpasbiens3.fr/index.php?do=search&subaction=search"
|
| 30 |
+
|
| 31 |
+
app = TestApp(flaresolverr.app)
|
| 32 |
+
# wait until the server is ready
|
| 33 |
+
app.get('/')
|
| 34 |
+
|
| 35 |
+
def test_wrong_endpoint(self):
|
| 36 |
+
res = self.app.get('/wrong', status=404)
|
| 37 |
+
self.assertEqual(res.status_code, 404)
|
| 38 |
+
|
| 39 |
+
body = res.json
|
| 40 |
+
self.assertEqual("Not found: '/wrong'", body['error'])
|
| 41 |
+
self.assertEqual(404, body['status_code'])
|
| 42 |
+
|
| 43 |
+
def test_index_endpoint(self):
|
| 44 |
+
res = self.app.get('/')
|
| 45 |
+
self.assertEqual(res.status_code, 200)
|
| 46 |
+
|
| 47 |
+
body = IndexResponse(res.json)
|
| 48 |
+
self.assertEqual("FlareSolverr is ready!", body.msg)
|
| 49 |
+
self.assertEqual(utils.get_flaresolverr_version(), body.version)
|
| 50 |
+
self.assertIn("Chrome/", body.userAgent)
|
| 51 |
+
|
| 52 |
+
def test_health_endpoint(self):
|
| 53 |
+
res = self.app.get('/health')
|
| 54 |
+
self.assertEqual(res.status_code, 200)
|
| 55 |
+
|
| 56 |
+
body = HealthResponse(res.json)
|
| 57 |
+
self.assertEqual(STATUS_OK, body.status)
|
| 58 |
+
|
| 59 |
+
def test_v1_endpoint_wrong_cmd(self):
|
| 60 |
+
res = self.app.post_json('/v1', {
|
| 61 |
+
"cmd": "request.bad",
|
| 62 |
+
"url": self.google_url
|
| 63 |
+
}, status=500)
|
| 64 |
+
self.assertEqual(res.status_code, 500)
|
| 65 |
+
|
| 66 |
+
body = V1ResponseBase(res.json)
|
| 67 |
+
self.assertEqual(STATUS_ERROR, body.status)
|
| 68 |
+
self.assertEqual("Error: Request parameter 'cmd' = 'request.bad' is invalid.", body.message)
|
| 69 |
+
self.assertGreater(body.startTimestamp, 10000)
|
| 70 |
+
self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
|
| 71 |
+
self.assertEqual(utils.get_flaresolverr_version(), body.version)
|
| 72 |
+
|
| 73 |
+
def test_v1_endpoint_request_get_no_cloudflare(self):
|
| 74 |
+
res = self.app.post_json('/v1', {
|
| 75 |
+
"cmd": "request.get",
|
| 76 |
+
"url": self.google_url
|
| 77 |
+
})
|
| 78 |
+
self.assertEqual(res.status_code, 200)
|
| 79 |
+
|
| 80 |
+
body = V1ResponseBase(res.json)
|
| 81 |
+
self.assertEqual(STATUS_OK, body.status)
|
| 82 |
+
self.assertEqual("Challenge not detected!", body.message)
|
| 83 |
+
self.assertGreater(body.startTimestamp, 10000)
|
| 84 |
+
self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
|
| 85 |
+
self.assertEqual(utils.get_flaresolverr_version(), body.version)
|
| 86 |
+
|
| 87 |
+
solution = body.solution
|
| 88 |
+
self.assertIn(self.google_url, solution.url)
|
| 89 |
+
self.assertEqual(solution.status, 200)
|
| 90 |
+
self.assertIs(len(solution.headers), 0)
|
| 91 |
+
self.assertIn("<title>Google</title>", solution.response)
|
| 92 |
+
self.assertGreater(len(solution.cookies), 0)
|
| 93 |
+
self.assertIn("Chrome/", solution.userAgent)
|
| 94 |
+
|
| 95 |
+
def test_v1_endpoint_request_get_disable_resources(self):
|
| 96 |
+
res = self.app.post_json("/v1", {
|
| 97 |
+
"cmd": "request.get",
|
| 98 |
+
"url": self.google_url,
|
| 99 |
+
"disableMedia": True
|
| 100 |
+
})
|
| 101 |
+
self.assertEqual(res.status_code, 200)
|
| 102 |
+
|
| 103 |
+
body = V1ResponseBase(res.json)
|
| 104 |
+
self.assertEqual(STATUS_OK, body.status)
|
| 105 |
+
self.assertEqual("Challenge not detected!", body.message)
|
| 106 |
+
self.assertGreater(body.startTimestamp, 10000)
|
| 107 |
+
self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
|
| 108 |
+
self.assertEqual(utils.get_flaresolverr_version(), body.version)
|
| 109 |
+
|
| 110 |
+
solution = body.solution
|
| 111 |
+
self.assertIn(self.google_url, solution.url)
|
| 112 |
+
self.assertEqual(solution.status, 200)
|
| 113 |
+
self.assertIs(len(solution.headers), 0)
|
| 114 |
+
self.assertIn("<title>Google</title>", solution.response)
|
| 115 |
+
self.assertGreater(len(solution.cookies), 0)
|
| 116 |
+
self.assertIn("Chrome/", solution.userAgent)
|
| 117 |
+
|
| 118 |
+
def test_v1_endpoint_request_get_cloudflare_js_1(self):
|
| 119 |
+
res = self.app.post_json('/v1', {
|
| 120 |
+
"cmd": "request.get",
|
| 121 |
+
"url": self.cloudflare_url
|
| 122 |
+
})
|
| 123 |
+
self.assertEqual(res.status_code, 200)
|
| 124 |
+
|
| 125 |
+
body = V1ResponseBase(res.json)
|
| 126 |
+
self.assertEqual(STATUS_OK, body.status)
|
| 127 |
+
self.assertEqual("Challenge solved!", body.message)
|
| 128 |
+
self.assertGreater(body.startTimestamp, 10000)
|
| 129 |
+
self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
|
| 130 |
+
self.assertEqual(utils.get_flaresolverr_version(), body.version)
|
| 131 |
+
|
| 132 |
+
solution = body.solution
|
| 133 |
+
self.assertIn(self.cloudflare_url, solution.url)
|
| 134 |
+
self.assertEqual(solution.status, 200)
|
| 135 |
+
self.assertIs(len(solution.headers), 0)
|
| 136 |
+
self.assertIn("<title>nowSecure</title>", solution.response)
|
| 137 |
+
self.assertGreater(len(solution.cookies), 0)
|
| 138 |
+
self.assertIn("Chrome/", solution.userAgent)
|
| 139 |
+
|
| 140 |
+
cf_cookie = _find_obj_by_key("name", "cf_clearance", solution.cookies)
|
| 141 |
+
self.assertIsNotNone(cf_cookie, "Cloudflare cookie not found")
|
| 142 |
+
self.assertGreater(len(cf_cookie["value"]), 30)
|
| 143 |
+
|
| 144 |
+
def test_v1_endpoint_request_get_cloudflare_js_2(self):
|
| 145 |
+
res = self.app.post_json('/v1', {
|
| 146 |
+
"cmd": "request.get",
|
| 147 |
+
"url": self.cloudflare_url_2
|
| 148 |
+
})
|
| 149 |
+
self.assertEqual(res.status_code, 200)
|
| 150 |
+
|
| 151 |
+
body = V1ResponseBase(res.json)
|
| 152 |
+
self.assertEqual(STATUS_OK, body.status)
|
| 153 |
+
self.assertEqual("Challenge solved!", body.message)
|
| 154 |
+
self.assertGreater(body.startTimestamp, 10000)
|
| 155 |
+
self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
|
| 156 |
+
self.assertEqual(utils.get_flaresolverr_version(), body.version)
|
| 157 |
+
|
| 158 |
+
solution = body.solution
|
| 159 |
+
self.assertIn(self.cloudflare_url_2, solution.url)
|
| 160 |
+
self.assertEqual(solution.status, 200)
|
| 161 |
+
self.assertIs(len(solution.headers), 0)
|
| 162 |
+
self.assertIn("<title>harry - idope torrent search</title>", solution.response)
|
| 163 |
+
self.assertGreater(len(solution.cookies), 0)
|
| 164 |
+
self.assertIn("Chrome/", solution.userAgent)
|
| 165 |
+
|
| 166 |
+
cf_cookie = _find_obj_by_key("name", "cf_clearance", solution.cookies)
|
| 167 |
+
self.assertIsNotNone(cf_cookie, "Cloudflare cookie not found")
|
| 168 |
+
self.assertGreater(len(cf_cookie["value"]), 30)
|
| 169 |
+
|
| 170 |
+
def test_v1_endpoint_request_get_ddos_guard_js(self):
|
| 171 |
+
res = self.app.post_json('/v1', {
|
| 172 |
+
"cmd": "request.get",
|
| 173 |
+
"url": self.ddos_guard_url
|
| 174 |
+
})
|
| 175 |
+
self.assertEqual(res.status_code, 200)
|
| 176 |
+
|
| 177 |
+
body = V1ResponseBase(res.json)
|
| 178 |
+
self.assertEqual(STATUS_OK, body.status)
|
| 179 |
+
self.assertEqual("Challenge solved!", body.message)
|
| 180 |
+
self.assertGreater(body.startTimestamp, 10000)
|
| 181 |
+
self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
|
| 182 |
+
self.assertEqual(utils.get_flaresolverr_version(), body.version)
|
| 183 |
+
|
| 184 |
+
solution = body.solution
|
| 185 |
+
self.assertIn(self.ddos_guard_url, solution.url)
|
| 186 |
+
self.assertEqual(solution.status, 200)
|
| 187 |
+
self.assertIs(len(solution.headers), 0)
|
| 188 |
+
self.assertIn("<title>Литрес", solution.response)
|
| 189 |
+
self.assertGreater(len(solution.cookies), 0)
|
| 190 |
+
self.assertIn("Chrome/", solution.userAgent)
|
| 191 |
+
|
| 192 |
+
cf_cookie = _find_obj_by_key("name", "__ddg1_", solution.cookies)
|
| 193 |
+
self.assertIsNotNone(cf_cookie, "DDOS-Guard cookie not found")
|
| 194 |
+
self.assertGreater(len(cf_cookie["value"]), 10)
|
| 195 |
+
|
| 196 |
+
def test_v1_endpoint_request_get_fairlane_js(self):
|
| 197 |
+
res = self.app.post_json('/v1', {
|
| 198 |
+
"cmd": "request.get",
|
| 199 |
+
"url": self.fairlane_url
|
| 200 |
+
})
|
| 201 |
+
self.assertEqual(res.status_code, 200)
|
| 202 |
+
|
| 203 |
+
body = V1ResponseBase(res.json)
|
| 204 |
+
self.assertEqual(STATUS_OK, body.status)
|
| 205 |
+
self.assertEqual("Challenge solved!", body.message)
|
| 206 |
+
self.assertGreater(body.startTimestamp, 10000)
|
| 207 |
+
self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
|
| 208 |
+
self.assertEqual(utils.get_flaresolverr_version(), body.version)
|
| 209 |
+
|
| 210 |
+
solution = body.solution
|
| 211 |
+
self.assertIn(self.fairlane_url, solution.url)
|
| 212 |
+
self.assertEqual(solution.status, 200)
|
| 213 |
+
self.assertIs(len(solution.headers), 0)
|
| 214 |
+
self.assertIn("<title>Rental Apartments Amsterdam</title>", solution.response)
|
| 215 |
+
self.assertGreater(len(solution.cookies), 0)
|
| 216 |
+
self.assertIn("Chrome/", solution.userAgent)
|
| 217 |
+
|
| 218 |
+
cf_cookie = _find_obj_by_key("name", "fl_pass_v2_b", solution.cookies)
|
| 219 |
+
self.assertIsNotNone(cf_cookie, "Fairlane cookie not found")
|
| 220 |
+
self.assertGreater(len(cf_cookie["value"]), 50)
|
| 221 |
+
|
| 222 |
+
def test_v1_endpoint_request_get_custom_cloudflare_js(self):
|
| 223 |
+
res = self.app.post_json('/v1', {
|
| 224 |
+
"cmd": "request.get",
|
| 225 |
+
"url": self.custom_cloudflare_url
|
| 226 |
+
})
|
| 227 |
+
self.assertEqual(res.status_code, 200)
|
| 228 |
+
|
| 229 |
+
body = V1ResponseBase(res.json)
|
| 230 |
+
self.assertEqual(STATUS_OK, body.status)
|
| 231 |
+
self.assertEqual("Challenge solved!", body.message)
|
| 232 |
+
self.assertGreater(body.startTimestamp, 10000)
|
| 233 |
+
self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
|
| 234 |
+
self.assertEqual(utils.get_flaresolverr_version(), body.version)
|
| 235 |
+
|
| 236 |
+
solution = body.solution
|
| 237 |
+
self.assertIn(self.custom_cloudflare_url, solution.url)
|
| 238 |
+
self.assertEqual(solution.status, 200)
|
| 239 |
+
self.assertIs(len(solution.headers), 0)
|
| 240 |
+
self.assertIn("<title>MuziekFabriek : Aanmelden</title>", solution.response)
|
| 241 |
+
self.assertGreater(len(solution.cookies), 0)
|
| 242 |
+
self.assertIn("Chrome/", solution.userAgent)
|
| 243 |
+
|
| 244 |
+
cf_cookie = _find_obj_by_key("name", "ct_anti_ddos_key", solution.cookies)
|
| 245 |
+
self.assertIsNotNone(cf_cookie, "Custom Cloudflare cookie not found")
|
| 246 |
+
self.assertGreater(len(cf_cookie["value"]), 10)
|
| 247 |
+
|
| 248 |
+
# todo: test Cmd 'request.get' should return fail with Cloudflare CAPTCHA
|
| 249 |
+
|
| 250 |
+
def test_v1_endpoint_request_get_cloudflare_blocked(self):
|
| 251 |
+
res = self.app.post_json('/v1', {
|
| 252 |
+
"cmd": "request.get",
|
| 253 |
+
"url": self.cloudflare_blocked_url
|
| 254 |
+
}, status=500)
|
| 255 |
+
self.assertEqual(res.status_code, 500)
|
| 256 |
+
|
| 257 |
+
body = V1ResponseBase(res.json)
|
| 258 |
+
self.assertEqual(STATUS_ERROR, body.status)
|
| 259 |
+
self.assertEqual("Error: Error solving the challenge. Cloudflare has blocked this request. "
|
| 260 |
+
"Probably your IP is banned for this site, check in your web browser.", body.message)
|
| 261 |
+
self.assertGreater(body.startTimestamp, 10000)
|
| 262 |
+
self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
|
| 263 |
+
self.assertEqual(utils.get_flaresolverr_version(), body.version)
|
| 264 |
+
|
| 265 |
+
def test_v1_endpoint_request_get_cookies_param(self):
|
| 266 |
+
res = self.app.post_json('/v1', {
|
| 267 |
+
"cmd": "request.get",
|
| 268 |
+
"url": self.google_url,
|
| 269 |
+
"cookies": [
|
| 270 |
+
{
|
| 271 |
+
"name": "testcookie1",
|
| 272 |
+
"value": "testvalue1"
|
| 273 |
+
},
|
| 274 |
+
{
|
| 275 |
+
"name": "testcookie2",
|
| 276 |
+
"value": "testvalue2"
|
| 277 |
+
}
|
| 278 |
+
]
|
| 279 |
+
})
|
| 280 |
+
self.assertEqual(res.status_code, 200)
|
| 281 |
+
|
| 282 |
+
body = V1ResponseBase(res.json)
|
| 283 |
+
self.assertEqual(STATUS_OK, body.status)
|
| 284 |
+
self.assertEqual("Challenge not detected!", body.message)
|
| 285 |
+
self.assertGreater(body.startTimestamp, 10000)
|
| 286 |
+
self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
|
| 287 |
+
self.assertEqual(utils.get_flaresolverr_version(), body.version)
|
| 288 |
+
|
| 289 |
+
solution = body.solution
|
| 290 |
+
self.assertIn(self.google_url, solution.url)
|
| 291 |
+
self.assertEqual(solution.status, 200)
|
| 292 |
+
self.assertIs(len(solution.headers), 0)
|
| 293 |
+
self.assertIn("<title>Google</title>", solution.response)
|
| 294 |
+
self.assertGreater(len(solution.cookies), 1)
|
| 295 |
+
self.assertIn("Chrome/", solution.userAgent)
|
| 296 |
+
|
| 297 |
+
user_cookie1 = _find_obj_by_key("name", "testcookie1", solution.cookies)
|
| 298 |
+
self.assertIsNotNone(user_cookie1, "User cookie 1 not found")
|
| 299 |
+
self.assertEqual("testvalue1", user_cookie1["value"])
|
| 300 |
+
|
| 301 |
+
user_cookie2 = _find_obj_by_key("name", "testcookie2", solution.cookies)
|
| 302 |
+
self.assertIsNotNone(user_cookie2, "User cookie 2 not found")
|
| 303 |
+
self.assertEqual("testvalue2", user_cookie2["value"])
|
| 304 |
+
|
| 305 |
+
def test_v1_endpoint_request_get_returnOnlyCookies_param(self):
|
| 306 |
+
res = self.app.post_json('/v1', {
|
| 307 |
+
"cmd": "request.get",
|
| 308 |
+
"url": self.google_url,
|
| 309 |
+
"returnOnlyCookies": True
|
| 310 |
+
})
|
| 311 |
+
self.assertEqual(res.status_code, 200)
|
| 312 |
+
|
| 313 |
+
body = V1ResponseBase(res.json)
|
| 314 |
+
self.assertEqual(STATUS_OK, body.status)
|
| 315 |
+
self.assertEqual("Challenge not detected!", body.message)
|
| 316 |
+
self.assertGreater(body.startTimestamp, 10000)
|
| 317 |
+
self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
|
| 318 |
+
self.assertEqual(utils.get_flaresolverr_version(), body.version)
|
| 319 |
+
|
| 320 |
+
solution = body.solution
|
| 321 |
+
self.assertIn(self.google_url, solution.url)
|
| 322 |
+
self.assertEqual(solution.status, 200)
|
| 323 |
+
self.assertIsNone(solution.headers)
|
| 324 |
+
self.assertIsNone(solution.response)
|
| 325 |
+
self.assertGreater(len(solution.cookies), 0)
|
| 326 |
+
self.assertIn("Chrome/", solution.userAgent)
|
| 327 |
+
|
| 328 |
+
def test_v1_endpoint_request_get_proxy_http_param(self):
    """
    request.get routed through the HTTP proxy at ``self.proxy_url``.

    To configure TinyProxy in local:
       * sudo vim /etc/tinyproxy/tinyproxy.conf
          * edit => LogFile "/tmp/tinyproxy.log"
          * edit => Syslog Off
       * sudo tinyproxy -d
       * sudo tail -f /tmp/tinyproxy.log
    """
    res = self.app.post_json('/v1', {
        "cmd": "request.get",
        "url": self.google_url,
        "proxy": {
            "url": self.proxy_url
        }
    })
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge not detected!", body.message)
    self.assertGreater(body.startTimestamp, 10000)
    self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), body.version)

    solution = body.solution
    self.assertIn(self.google_url, solution.url)
    self.assertEqual(solution.status, 200)
    # Compare by value: an identity check (assertIs) on an int only works
    # because of CPython's small-integer cache.
    self.assertEqual(len(solution.headers), 0)
    self.assertIn("<title>Google</title>", solution.response)
    self.assertGreater(len(solution.cookies), 0)
    self.assertIn("Chrome/", solution.userAgent)
|
| 360 |
+
|
| 361 |
+
def test_v1_endpoint_request_get_proxy_http_param_with_credentials(self):
    """
    request.get routed through ``self.proxy_url`` with a proxy username and password.

    To configure TinyProxy in local:
       * sudo vim /etc/tinyproxy/tinyproxy.conf
          * edit => LogFile "/tmp/tinyproxy.log"
          * edit => Syslog Off
          * add => BasicAuth testuser testpass
       * sudo tinyproxy -d
       * sudo tail -f /tmp/tinyproxy.log
    """
    res = self.app.post_json('/v1', {
        "cmd": "request.get",
        "url": self.google_url,
        "proxy": {
            "url": self.proxy_url,
            "username": "testuser",
            "password": "testpass"
        }
    })
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge not detected!", body.message)
    self.assertGreater(body.startTimestamp, 10000)
    self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), body.version)

    solution = body.solution
    self.assertIn(self.google_url, solution.url)
    self.assertEqual(solution.status, 200)
    # Compare by value: an identity check (assertIs) on an int only works
    # because of CPython's small-integer cache.
    self.assertEqual(len(solution.headers), 0)
    self.assertIn("<title>Google</title>", solution.response)
    self.assertGreater(len(solution.cookies), 0)
    self.assertIn("Chrome/", solution.userAgent)
|
| 396 |
+
|
| 397 |
+
def test_v1_endpoint_request_get_proxy_socks_param(self):
    """
    request.get routed through the SOCKS proxy at ``self.proxy_socks_url``.

    To configure Dante in local:
       * https://linuxhint.com/set-up-a-socks5-proxy-on-ubuntu-with-dante/
       * sudo vim /etc/sockd.conf
       * sudo systemctl restart sockd.service
       * curl --socks5 socks5://127.0.0.1:1080 https://www.google.com
    """
    res = self.app.post_json('/v1', {
        "cmd": "request.get",
        "url": self.google_url,
        "proxy": {
            "url": self.proxy_socks_url
        }
    })
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge not detected!", body.message)
    self.assertGreater(body.startTimestamp, 10000)
    self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), body.version)

    solution = body.solution
    self.assertIn(self.google_url, solution.url)
    self.assertEqual(solution.status, 200)
    # Compare by value: an identity check (assertIs) on an int only works
    # because of CPython's small-integer cache.
    self.assertEqual(len(solution.headers), 0)
    self.assertIn("<title>Google</title>", solution.response)
    self.assertGreater(len(solution.cookies), 0)
    self.assertIn("Chrome/", solution.userAgent)
|
| 428 |
+
|
| 429 |
+
def test_v1_endpoint_request_get_proxy_wrong_param(self):
    """request.get through the proxy URL below is expected to yield HTTP 500 and a proxy-connection error."""
    payload = {
        "cmd": "request.get",
        "url": self.google_url,
        "proxy": {
            "url": "http://127.0.0.1:43210"
        }
    }
    response = self.app.post_json('/v1', payload, status=500)
    self.assertEqual(response.status_code, 500)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_ERROR, parsed.status)
    expected_fragment = "Error: Error solving the challenge. Message: unknown error: net::ERR_PROXY_CONNECTION_FAILED"
    self.assertIn(expected_fragment, parsed.message)
    self.assertGreater(parsed.startTimestamp, 10000)
    self.assertGreaterEqual(parsed.endTimestamp, parsed.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), parsed.version)
|
| 446 |
+
|
| 447 |
+
def test_v1_endpoint_request_get_fail_timeout(self):
    """request.get with ``maxTimeout`` of 10 is expected to fail with the 0.01-second timeout message."""
    payload = {
        "cmd": "request.get",
        "url": self.google_url,
        "maxTimeout": 10
    }
    response = self.app.post_json('/v1', payload, status=500)
    self.assertEqual(response.status_code, 500)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_ERROR, parsed.status)
    self.assertEqual("Error: Error solving the challenge. Timeout after 0.01 seconds.", parsed.message)
    self.assertGreater(parsed.startTimestamp, 10000)
    self.assertGreaterEqual(parsed.endTimestamp, parsed.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), parsed.version)
|
| 461 |
+
|
| 462 |
+
def test_v1_endpoint_request_get_fail_bad_domain(self):
    """request.get for the ``.combad`` host below is expected to yield HTTP 500 and a name-resolution error."""
    payload = {
        "cmd": "request.get",
        "url": "https://www.google.combad"
    }
    response = self.app.post_json('/v1', payload, status=500)
    self.assertEqual(response.status_code, 500)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_ERROR, parsed.status)
    self.assertIn("Message: unknown error: net::ERR_NAME_NOT_RESOLVED", parsed.message)
|
| 472 |
+
|
| 473 |
+
def test_v1_endpoint_request_get_deprecated_param(self):
    """request.get carrying the obsolete ``userAgent`` field still returns a successful response."""
    payload = {
        "cmd": "request.get",
        "url": self.google_url,
        "userAgent": "Test User-Agent"  # was removed in v2, not used
    }
    response = self.app.post_json('/v1', payload)
    self.assertEqual(response.status_code, 200)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_OK, parsed.status)
    self.assertEqual("Challenge not detected!", parsed.message)
|
| 484 |
+
|
| 485 |
+
def test_v1_endpoint_request_post_no_cloudflare(self):
    """request.post of two form fields to ``self.post_url``; the echoed form must contain both."""
    res = self.app.post_json('/v1', {
        "cmd": "request.post",
        "url": self.post_url,
        # '&' separator restored: the literal previously read 'value1¶m2',
        # a mangled '&param2', so only one form field was actually sent.
        "postData": "param1=value1&param2=value2"
    })
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge not detected!", body.message)
    self.assertGreater(body.startTimestamp, 10000)
    self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), body.version)

    solution = body.solution
    self.assertIn(self.post_url, solution.url)
    self.assertEqual(solution.status, 200)
    # Compare by value: an identity check (assertIs) on an int only works
    # because of CPython's small-integer cache.
    self.assertEqual(len(solution.headers), 0)
    # NOTE(review): the runs of spaces inside this expected string may have been
    # collapsed in transit - confirm against the actual response body.
    self.assertIn('"form": {\n "param1": "value1", \n "param2": "value2"\n }', solution.response)
    self.assertEqual(len(solution.cookies), 0)
    self.assertIn("Chrome/", solution.userAgent)
|
| 507 |
+
|
| 508 |
+
def test_v1_endpoint_request_post_cloudflare(self):
    """request.post to ``self.cloudflare_url``: challenge solved and a ``cf_clearance`` cookie returned."""
    res = self.app.post_json('/v1', {
        "cmd": "request.post",
        "url": self.cloudflare_url,
        # '&' separator restored: the literal previously read 'value1¶m2',
        # a mangled '&param2'.
        "postData": "param1=value1&param2=value2"
    })
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge solved!", body.message)
    self.assertGreater(body.startTimestamp, 10000)
    self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), body.version)

    solution = body.solution
    self.assertIn(self.cloudflare_url, solution.url)
    self.assertEqual(solution.status, 200)
    # Compare by value: an identity check (assertIs) on an int only works
    # because of CPython's small-integer cache.
    self.assertEqual(len(solution.headers), 0)
    self.assertIn("<title>405 Not Allowed</title>", solution.response)
    self.assertGreater(len(solution.cookies), 0)
    self.assertIn("Chrome/", solution.userAgent)

    # The clearance cookie must be present and carry a value longer than 30 chars.
    cf_cookie = _find_obj_by_key("name", "cf_clearance", solution.cookies)
    self.assertIsNotNone(cf_cookie, "Cloudflare cookie not found")
    self.assertGreater(len(cf_cookie["value"]), 30)
|
| 534 |
+
|
| 535 |
+
def test_v1_endpoint_request_post_fail_no_post_data(self):
    """request.post without ``postData`` is expected to yield HTTP 500 and a 'mandatory' error message."""
    payload = {
        "cmd": "request.post",
        "url": self.google_url
    }
    response = self.app.post_json('/v1', payload, status=500)
    self.assertEqual(response.status_code, 500)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_ERROR, parsed.status)
    self.assertIn("Request parameter 'postData' is mandatory in 'request.post' command", parsed.message)
|
| 545 |
+
|
| 546 |
+
def test_v1_endpoint_request_post_deprecated_param(self):
    """request.post carrying the obsolete ``userAgent`` field still returns a successful response."""
    res = self.app.post_json('/v1', {
        "cmd": "request.post",
        "url": self.google_url,
        # '&' separator restored: the literal previously read 'value1¶m2',
        # a mangled '&param2'.
        "postData": "param1=value1&param2=value2",
        "userAgent": "Test User-Agent"  # was removed in v2, not used
    })
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge not detected!", body.message)
|
| 558 |
+
|
| 559 |
+
def test_v1_endpoint_sessions_create_without_session(self):
    """sessions.create with no ``session`` field succeeds and the reply carries a non-None session."""
    payload = {"cmd": "sessions.create"}
    response = self.app.post_json('/v1', payload)
    self.assertEqual(response.status_code, 200)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_OK, parsed.status)
    self.assertEqual("Session created successfully.", parsed.message)
    self.assertIsNotNone(parsed.session)
|
| 569 |
+
|
| 570 |
+
def test_v1_endpoint_sessions_create_with_session(self):
    """sessions.create with an explicit ``session`` name echoes that name back in the reply."""
    payload = {
        "cmd": "sessions.create",
        "session": "test_create_session"
    }
    response = self.app.post_json('/v1', payload)
    self.assertEqual(response.status_code, 200)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_OK, parsed.status)
    self.assertEqual("Session created successfully.", parsed.message)
    self.assertEqual(parsed.session, "test_create_session")
|
| 581 |
+
|
| 582 |
+
def test_v1_endpoint_sessions_create_with_proxy(self):
    """sessions.create with a ``proxy`` entry pointing at ``self.proxy_url`` succeeds."""
    proxy_settings = {"url": self.proxy_url}
    payload = {
        "cmd": "sessions.create",
        "proxy": proxy_settings
    }
    response = self.app.post_json('/v1', payload)
    self.assertEqual(response.status_code, 200)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_OK, parsed.status)
    self.assertEqual("Session created successfully.", parsed.message)
    self.assertIsNotNone(parsed.session)
|
| 595 |
+
|
| 596 |
+
def test_v1_endpoint_sessions_list(self):
    """A session created first must appear in the ``sessions.list`` reply."""
    # Create a named session so the listing has a known entry; its reply is not inspected.
    create_payload = {
        "cmd": "sessions.create",
        "session": "test_list_sessions"
    }
    self.app.post_json('/v1', create_payload)

    list_payload = {"cmd": "sessions.list"}
    response = self.app.post_json('/v1', list_payload)
    self.assertEqual(response.status_code, 200)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_OK, parsed.status)
    self.assertEqual("", parsed.message)
    self.assertGreaterEqual(len(parsed.sessions), 1)
    self.assertIn("test_list_sessions", parsed.sessions)
|
| 611 |
+
|
| 612 |
+
def test_v1_endpoint_sessions_destroy_existing_session(self):
    """sessions.destroy on a session created just before reports that it was removed."""
    # Create the session to be destroyed; the creation reply is not inspected.
    create_payload = {
        "cmd": "sessions.create",
        "session": "test_destroy_sessions"
    }
    self.app.post_json('/v1', create_payload)

    destroy_payload = {
        "cmd": "sessions.destroy",
        "session": "test_destroy_sessions"
    }
    response = self.app.post_json('/v1', destroy_payload)
    self.assertEqual(response.status_code, 200)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_OK, parsed.status)
    self.assertEqual("The session has been removed.", parsed.message)
|
| 626 |
+
|
| 627 |
+
def test_v1_endpoint_sessions_destroy_non_existing_session(self):
    """sessions.destroy on an unknown session name is expected to yield HTTP 500 and an error message."""
    payload = {
        "cmd": "sessions.destroy",
        "session": "non_existing_session_name"
    }
    response = self.app.post_json('/v1', payload, status=500)
    self.assertEqual(response.status_code, 500)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_ERROR, parsed.status)
    self.assertEqual("Error: The session doesn't exist.", parsed.message)
|
| 637 |
+
|
| 638 |
+
def test_v1_endpoint_request_get_with_session(self):
    """request.get that names a previously created session returns a successful response."""
    # Create the session the request will refer to; the creation reply is not inspected.
    create_payload = {
        "cmd": "sessions.create",
        "session": "test_request_sessions"
    }
    self.app.post_json('/v1', create_payload)

    get_payload = {
        "cmd": "request.get",
        "session": "test_request_sessions",
        "url": self.google_url
    }
    response = self.app.post_json('/v1', get_payload)
    self.assertEqual(response.status_code, 200)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_OK, parsed.status)
|
| 652 |
+
|
| 653 |
+
|
| 654 |
+
# Run the unittest command-line runner when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
|
flaresolverr/tests_sites.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
|
| 3 |
+
from webtest import TestApp
|
| 4 |
+
|
| 5 |
+
from dtos import V1ResponseBase, STATUS_OK
|
| 6 |
+
import flaresolverr
|
| 7 |
+
import utils
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _find_obj_by_key(key: str, value: str, _list: list) -> dict | None:
|
| 11 |
+
for obj in _list:
|
| 12 |
+
if obj[key] == value:
|
| 13 |
+
return obj
|
| 14 |
+
return None
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def asset_cloudflare_solution(self, res, site_url, site_text):
    """Assert that ``res`` is a successful "Challenge solved!" reply for ``site_url``.

    :param self: the running ``unittest.TestCase``, used for its assert methods
    :param res: response object exposing ``status_code`` and ``json``
    :param site_url: URL expected to be contained in the solution URL
    :param site_text: text expected to be contained in the solution response body
    """
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge solved!", body.message)
    self.assertGreater(body.startTimestamp, 10000)
    self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), body.version)

    solution = body.solution
    self.assertIn(site_url, solution.url)
    self.assertEqual(solution.status, 200)
    # Compare by value: an identity check (assertIs) on an int only works
    # because of CPython's small-integer cache.
    self.assertEqual(len(solution.headers), 0)
    self.assertIn(site_text, solution.response)
    self.assertGreater(len(solution.cookies), 0)
    self.assertIn("Chrome/", solution.userAgent)

    # The clearance cookie must be present and carry a value longer than 30 chars.
    cf_cookie = _find_obj_by_key("name", "cf_clearance", solution.cookies)
    self.assertIsNotNone(cf_cookie, "Cloudflare cookie not found")
    self.assertGreater(len(cf_cookie["value"]), 30)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class TestFlareSolverr(unittest.TestCase):
    """Posts request.get / request.post commands for a list of sites and validates each solution."""

    app = TestApp(flaresolverr.app)
    # wait until the server is ready
    app.get('/')

    def test_v1_endpoint_request_get_cloudflare(self):
        """Run request.get against every (name, url, expected text) entry, one sub-test per site."""
        targets = [
            ('nowsecure', 'https://nowsecure.nl', '<title>nowSecure</title>'),
            ('0magnet', 'https://0magnet.com/search?q=2022', 'Torrent Search - ØMagnet'),
            ('1337x', 'https://1337x.unblockit.cat/cat/Movies/time/desc/1/', ''),
            ('avistaz', 'https://avistaz.to/api/v1/jackett/torrents?in=1&type=0&search=',
             '<title>Access denied</title>'),
            ('badasstorrents', 'https://badasstorrents.com/torrents/search/720p/date/desc',
             '<title>Latest Torrents - BadassTorrents</title>'),
            ('bt4g', 'https://bt4g.org/search/2022', '<title>Download 2022 Torrents - BT4G</title>'),
            ('cinemaz', 'https://cinemaz.to/api/v1/jackett/torrents?in=1&type=0&search=',
             '<title>Access denied</title>'),
            ('epublibre', 'https://epublibre.unblockit.cat/catalogo/index/0/nuevo/todos/sin/todos/--/ajax',
             '<title>epublibre - catálogo</title>'),
            ('ext', 'https://ext.to/latest/?order=age&sort=desc',
             '<title>Download Latest Torrents - EXT Torrents</title>'),
            ('extratorrent', 'https://extratorrent.st/search/?srt=added&order=desc&search=720p&new=1&x=0&y=0',
             'Page 1 - ExtraTorrent'),
            ('idope', 'https://idope.se/browse.html', '<title>Recent Torrents</title>'),
            ('limetorrents', 'https://limetorrents.unblockninja.com/latest100',
             '<title>Latest 100 torrents - LimeTorrents</title>'),
            ('privatehd', 'https://privatehd.to/api/v1/jackett/torrents?in=1&type=0&search=',
             '<title>Access denied</title>'),
            ('torrentcore', 'https://torrentcore.xyz/index', '<title>Torrent[CORE] - Torrent community.</title>'),
            ('torrentqq223', 'https://torrentqq223.com/torrent/newest.html', 'https://torrentqq223.com/ads/'),
            ('36dm', 'https://www.36dm.club/1.html', 'https://www.36dm.club/yesterday-1.html'),
            ('erai-raws', 'https://www.erai-raws.info/feed/?type=magnet', '403 Forbidden'),
            ('teamos', 'https://www.teamos.xyz/torrents/?filename=&freeleech=',
             '<title>Log in | Team OS : Your Only Destination To Custom OS !!</title>'),
            ('yts', 'https://yts.unblockninja.com/api/v2/list_movies.json?query_term=&limit=50&sort=date_added',
             '{"movie_count":')
        ]
        for label, url, expected_text in targets:
            with self.subTest(msg=label):
                payload = {
                    "cmd": "request.get",
                    "url": url
                }
                reply = self.app.post_json('/v1', payload)
                asset_cloudflare_solution(self, reply, url, expected_text)

    def test_v1_endpoint_request_post_cloudflare(self):
        """Run request.post against every (name, url, expected text, form data) entry, one sub-test per site."""
        targets = [
            ('nnmclub', 'https://nnmclub.to/forum/tracker.php', '<title>Трекер :: NNM-Club</title>',
             'prev_sd=0&prev_a=0&prev_my=0&prev_n=0&prev_shc=0&prev_shf=1&prev_sha=1&prev_shs=0&prev_shr=0&prev_sht=0&f%5B%5D=-1&o=1&s=2&tm=-1&shf=1&sha=1&ta=-1&sns=-1&sds=-1&nm=&pn=&submit=%CF%EE%E8%F1%EA')
        ]

        for label, url, expected_text, form_data in targets:
            with self.subTest(msg=label):
                payload = {
                    "cmd": "request.post",
                    "url": url,
                    "postData": form_data
                }
                reply = self.app.post_json('/v1', payload)
                asset_cloudflare_solution(self, reply, url, expected_text)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# Run the unittest command-line runner when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
|
flaresolverr/undetected_chromedriver/__init__.py
ADDED
|
@@ -0,0 +1,910 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
888 888 d8b
|
| 6 |
+
888 888 Y8P
|
| 7 |
+
888 888
|
| 8 |
+
.d8888b 88888b. 888d888 .d88b. 88888b.d88b. .d88b. .d88888 888d888 888 888 888 .d88b. 888d888
|
| 9 |
+
d88P" 888 "88b 888P" d88""88b 888 "888 "88b d8P Y8b d88" 888 888P" 888 888 888 d8P Y8b 888P"
|
| 10 |
+
888 888 888 888 888 888 888 888 888 88888888 888 888 888 888 Y88 88P 88888888 888
|
| 11 |
+
Y88b. 888 888 888 Y88..88P 888 888 888 Y8b. Y88b 888 888 888 Y8bd8P Y8b. 888
|
| 12 |
+
"Y8888P 888 888 888 "Y88P" 888 888 888 "Y8888 "Y88888 888 888 Y88P "Y8888 888 88888888
|
| 13 |
+
|
| 14 |
+
by UltrafunkAmsterdam (https://github.com/ultrafunkamsterdam)
|
| 15 |
+
|
| 16 |
+
"""
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
__version__ = "3.5.5"
|
| 21 |
+
|
| 22 |
+
import json
|
| 23 |
+
import logging
|
| 24 |
+
import os
|
| 25 |
+
import pathlib
|
| 26 |
+
import re
|
| 27 |
+
import shutil
|
| 28 |
+
import subprocess
|
| 29 |
+
import sys
|
| 30 |
+
import tempfile
|
| 31 |
+
import time
|
| 32 |
+
from weakref import finalize
|
| 33 |
+
|
| 34 |
+
import selenium.webdriver.chrome.service
|
| 35 |
+
import selenium.webdriver.chrome.webdriver
|
| 36 |
+
from selenium.webdriver.common.by import By
|
| 37 |
+
import selenium.webdriver.chromium.service
|
| 38 |
+
import selenium.webdriver.remote.command
|
| 39 |
+
import selenium.webdriver.remote.webdriver
|
| 40 |
+
|
| 41 |
+
from .cdp import CDP
|
| 42 |
+
from .dprocess import start_detached
|
| 43 |
+
from .options import ChromeOptions
|
| 44 |
+
from .patcher import IS_POSIX
|
| 45 |
+
from .patcher import Patcher
|
| 46 |
+
from .reactor import Reactor
|
| 47 |
+
from .webelement import UCWebElement
|
| 48 |
+
from .webelement import WebElement
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# Names exported by a star-import of this package.
# NOTE(review): "find_chrome_executable" is not among the imports above;
# presumably it is defined further down in this module - confirm.
__all__ = (
    "Chrome",
    "ChromeOptions",
    "Patcher",
    "Reactor",
    "CDP",
    "find_chrome_executable",
)

# Package logger "uc"; its level is set once, at import time, to the root
# logger's effective level at that moment.
logger = logging.getLogger("uc")
logger.setLevel(logging.getLogger().getEffectiveLevel())
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class Chrome(selenium.webdriver.chrome.webdriver.WebDriver):
|
| 65 |
+
"""
|
| 66 |
+
|
| 67 |
+
Controls the ChromeDriver and allows you to drive the browser.
|
| 68 |
+
|
| 69 |
+
The webdriver file will be downloaded by this module automatically,
|
| 70 |
+
you do not need to specify this. however, you may if you wish.
|
| 71 |
+
|
| 72 |
+
Attributes
|
| 73 |
+
----------
|
| 74 |
+
|
| 75 |
+
Methods
|
| 76 |
+
-------
|
| 77 |
+
|
| 78 |
+
reconnect()
|
| 79 |
+
|
| 80 |
+
this can be useful in case of heavy detection methods
|
| 81 |
+
-stops the chromedriver service which runs in the background
|
| 82 |
+
-starts the chromedriver service which runs in the background
|
| 83 |
+
-recreate session
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
start_session(capabilities=None, browser_profile=None)
|
| 87 |
+
|
| 88 |
+
differentiates from the regular method in that it does not
|
| 89 |
+
require a capabilities argument. The capabilities are automatically
|
| 90 |
+
recreated from the options at creation time.
|
| 91 |
+
|
| 92 |
+
--------------------------------------------------------------------------
|
| 93 |
+
NOTE:
|
| 94 |
+
Chrome has everything included to work out of the box.
|
| 95 |
+
it does not `need` customizations.
|
| 96 |
+
any customizations MAY trigger bot mitigation systems.
|
| 97 |
+
|
| 98 |
+
--------------------------------------------------------------------------
|
| 99 |
+
"""
|
| 100 |
+
|
| 101 |
+
_instances = set()
|
| 102 |
+
session_id = None
|
| 103 |
+
debug = False
|
| 104 |
+
|
| 105 |
+
def __init__(
    self,
    options=None,
    user_data_dir=None,
    driver_executable_path=None,
    browser_executable_path=None,
    port=0,
    enable_cdp_events=False,
    # service_args=None,
    # service_creationflags=None,
    desired_capabilities=None,
    advanced_elements=False,
    # service_log_path=None,
    keep_alive=True,
    log_level=0,
    headless=False,
    version_main=None,
    patcher_force_close=False,
    suppress_welcome=True,
    use_subprocess=False,
    debug=False,
    no_sandbox=True,
    windows_headless=False,
    user_multi_procs: bool = False,
    **kw,
):
    """
    Creates a new instance of the chrome driver.

    Patches/locates the chromedriver binary, launches the browser itself
    (detached, or through subprocess), starts the chromedriver service and
    attaches to the already running browser via its debugger address.

    Parameters
    ----------

    options: ChromeOptions, optional, default: None - automatic useful defaults
        this takes an instance of ChromeOptions, mainly to customize browser behavior.
        anything other than the default, for example extensions or startup options
        are not supported in case of failure, and can probably lower your undetectability.
        An options object can be used for one driver only; reusing it raises RuntimeError.

    user_data_dir: str , optional, default: None (creates temp profile)
        if user_data_dir is a path to a valid chrome profile directory, use it,
        and turn off automatic removal mechanism at exit.

    driver_executable_path: str, optional, default: None(=downloads and patches new binary)

    browser_executable_path: str, optional, default: None - use find_chrome_executable
        Path to the browser executable.
        If not specified, make sure the executable's folder is in $PATH

    port: int, optional, default: 0
        when non-zero (and options carries no debugger_address) this value is used
        as the browser's remote debugging port.
        leave it at 0 unless you know what you are doing.
        the default value of 0 automatically picks an available port.

    enable_cdp_events: bool, default: False
        :: currently for chrome only
        this enables the handling of wire messages
        when enabled, you can subscribe to CDP events by using:

            driver.add_cdp_listener("Network.dataReceived", yourcallback)
            # yourcallback is a callable which accepts exactly 1 dict as parameter

    desired_capabilities: dict, optional, default: None - auto from config
        accepted for backwards compatibility; it is not forwarded to the
        parent WebDriver constructor.

    advanced_elements: bool, optional, default: False
        makes it easier to recognize elements like you know them from html/browser inspection, especially when working
        in an interactive environment

        default webelement repr:
        <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")>

        advanced webelement repr
        <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)>

        note: when retrieving large amounts of elements ( example: find_elements_by_tag("*") ) and print them, it does take a little more time.

    keep_alive: bool, optional, default: True
        Whether to configure ChromeRemoteConnection to use HTTP keep-alive.

    log_level: int, optional, default: adapts to python global log level
        value for chrome's --log-level switch. When left at 0, the effective
        level of the root python logger divided by 10 is used instead.

    headless: bool, optional, default: False
        can also be specified in the options instance.
        Specify whether you want to use the browser in headless mode.
        warning: this lowers undetectability and not fully supported.

    version_main: int, optional, default: None (=auto)
        if you, for god knows whatever reason, use
        an older version of Chrome. You can specify its full rounded version number
        here. Example: 87 for all versions of 87

    patcher_force_close: bool, optional, default: False
        instructs the patcher to do whatever it can to access the chromedriver binary
        if the file is locked, it will force shutdown all instances.
        setting it is not recommended, unless you know the implications and think
        you might need it.

    suppress_welcome: bool, optional , default: True
        a "welcome" alert might show up on *nix-like systems asking whether you want to set
        chrome as your default browser, and if you want to send even more data to google.
        now, in case you are nag-fetishist, or a diagnostics data feeder to google, you can set this to False.
        Note: if you don't handle the nag screen in time, the browser loses its connection and throws an Exception.

    use_subprocess: bool, optional , default: False,

        False (the default) makes sure Chrome will get its own process (so no subprocess of chromedriver.exe or python).
        This fixes a LOT of issues, like multithreaded run, but most importantly, shutting down correctly after
        program exits or using .quit()
        you should be knowing what you're doing, and know how python works.

        unfortunately, there is always an edge case in which one would like to write a single script with the only contents being:
        --start script--
        import undetected_chromedriver as uc
        d = uc.Chrome()
        d.get('https://somesite/')
        ---end script --

        and will be greeted with an error, since the program exits before chrome has a chance to launch.
        in that case you can set this to `True`. The browser will start via subprocess, and will keep running most of times.
        ! setting it to True comes with NO support when being detected. !

    debug: bool, optional, default: False
        stored as self.debug; when true, __getattribute__ logs every method call.

    no_sandbox: bool, optional, default=True
        uses the --no-sandbox option, and additionally does suppress the "unsecure option" status bar
        this option has a default of True since many people seem to run this as root (....) , and chrome does not start
        when running as root without using --no-sandbox flag.

    windows_headless: bool, optional, default: False
        when true the browser is always started through subprocess.Popen; on
        Windows (os.name == 'nt') a STARTUPINFO with STARTF_USESHOWWINDOW is
        passed along (presumably to hide the window - confirm).

    user_multi_procs:
        set to true when you are using multithreads/multiprocessing
        ensures not all processes are trying to modify a binary which is in use by another.
        for this to work. YOU MUST HAVE AT LEAST 1 UNDETECTED_CHROMEDRIVER BINARY IN YOUR ROAMING DATA FOLDER.
        this requirement can be easily satisfied, by just running this program "normal" and close/kill it.

    **kw:
        accepted and ignored.

    Raises
    ------
    RuntimeError
        when the given ChromeOptions object is already bound to a driver.
    FileNotFoundError
        when no browser executable could be determined.
    """

    # registers _ensure_close as a finalizer for this instance
    finalize(self, self._ensure_close, self)
    self.debug = debug
    self.patcher = Patcher(
        executable_path=driver_executable_path,
        force=patcher_force_close,
        version_main=version_main,
        user_multi_procs=user_multi_procs,
    )
    self.patcher.auto()

    if not options:
        options = ChromeOptions()

    # prevent reuse of options,
    # as it just appends arguments, not replace them
    # you'll get conflicts starting chrome
    if getattr(options, "_session", None) is not None:
        raise RuntimeError("you cannot reuse the ChromeOptions object")

    options._session = self

    if not options.debugger_address:
        debug_port = (
            port
            if port != 0
            else selenium.webdriver.common.service.utils.free_port()
        )
        debug_host = "127.0.0.1"
        options.debugger_address = "%s:%d" % (debug_host, debug_port)
    else:
        debug_host, debug_port = options.debugger_address.split(":")
        debug_port = int(debug_port)

    if enable_cdp_events:
        options.set_capability(
            "goog:loggingPrefs", {"performance": "ALL", "browser": "ALL"}
        )

    options.add_argument("--remote-debugging-host=%s" % debug_host)
    options.add_argument("--remote-debugging-port=%s" % debug_port)

    if user_data_dir:
        options.add_argument("--user-data-dir=%s" % user_data_dir)

    language, keep_user_data_dir = None, bool(user_data_dir)

    # see if a custom user profile is specified in options.
    # Iterate over a snapshot: headless switches are removed from the live
    # list below, and mutating a list while iterating it skips elements.
    for arg in list(options.arguments):

        # only the headless switch itself ("headless", "--headless",
        # "--headless=new", ...) counts; a substring test would also strip
        # unrelated arguments such as a profile path containing "headless"
        if arg.lstrip("-").split("=", 1)[0] == "headless":
            options.arguments.remove(arg)
            options.headless = True

        if "lang" in arg:
            # the pattern always matches once "lang" is in arg; group 1 is
            # empty when no value was given (handled by the fallback below)
            language = re.search("(?:--)?lang(?:[ =])?(.*)", arg)[1]

        if "user-data-dir" in arg:
            m = re.search("(?:--)?user-data-dir(?:[ =])?(.*)", arg)
            user_data_dir = m[1]
            logger.debug(
                "user-data-dir found in user argument %s => %s" % (arg, m[1])
            )
            keep_user_data_dir = True

    if not user_data_dir:
        # backward compatiblity
        # check if an old uc.ChromeOptions is used, and extract the user data dir

        if getattr(options, "user_data_dir", None):
            import warnings

            warnings.warn(
                "using ChromeOptions.user_data_dir might stop working in future versions."
                "use uc.Chrome(user_data_dir='/xyz/some/data') in case you need existing profile folder"
            )
            # remember the directory so self.user_data_dir and the
            # preference fix-up below refer to the profile actually in use
            user_data_dir = options.user_data_dir
            options.add_argument("--user-data-dir=%s" % user_data_dir)
            keep_user_data_dir = True
            logger.debug(
                "user_data_dir property found in options object: %s" % user_data_dir
            )

        else:
            user_data_dir = os.path.normpath(tempfile.mkdtemp())
            keep_user_data_dir = False
            arg = "--user-data-dir=%s" % user_data_dir
            options.add_argument(arg)
            logger.debug(
                "created a temporary folder in which the user-data (profile) will be stored during this\n"
                "session, and added it to chrome startup arguments: %s" % arg
            )

    if not language:
        try:
            import locale

            language = locale.getdefaultlocale()[0].replace("_", "-")
        except Exception:
            pass
        if not language:
            language = "en-US"

    options.add_argument("--lang=%s" % language)

    if not options.binary_location:
        options.binary_location = (
            browser_executable_path or find_chrome_executable()
        )

    if not options.binary_location or not \
            pathlib.Path(options.binary_location).exists():
        raise FileNotFoundError(
            "\n---------------------\n"
            "Could not determine browser executable."
            "\n---------------------\n"
            "Make sure your browser is installed in the default location (path).\n"
            "If you are sure about the browser executable, you can specify it using\n"
            "the `browser_executable_path='{}` parameter.\n\n"
            .format("/path/to/browser/executable" if IS_POSIX else "c:/path/to/your/browser.exe")
        )

    # seconds __exit__ waits between stopping and restarting the service
    self._delay = 3

    self.user_data_dir = user_data_dir
    self.keep_user_data_dir = keep_user_data_dir

    if suppress_welcome:
        options.arguments.extend(["--no-default-browser-check", "--no-first-run"])
    if no_sandbox:
        options.arguments.extend(["--no-sandbox", "--test-type"])

    if headless or getattr(options, 'headless', None):
        # workaround until a better checking is found
        try:
            v_main = int(self.patcher.version_main) if self.patcher.version_main else 108
            if v_main < 108:
                options.add_argument("--headless=chrome")
            elif v_main >= 108:
                options.add_argument("--headless=new")
        except Exception:
            logger.warning("could not detect version_main."
                           "therefore, we are assuming it is chrome 108 or higher")
            options.add_argument("--headless=new")

    options.add_argument("--window-size=1920,1080")
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    # fixes "could not connect to chrome" error when running
    # on linux using privileged user like root (which i don't recommend)

    # "%" binds tighter than "or": the fallback has to be chosen before
    # formatting, otherwise the (always truthy) formatted string wins
    options.add_argument(
        "--log-level=%d"
        % (log_level or divmod(logging.getLogger().getEffectiveLevel(), 10)[0])
    )

    if hasattr(options, "handle_prefs"):
        options.handle_prefs(user_data_dir)

    # fix exit_type flag to prevent tab-restore nag
    try:
        with open(
            os.path.join(user_data_dir, "Default/Preferences"),
            encoding="latin1",
            mode="r+",
        ) as fs:
            config = json.load(fs)
            if config["profile"]["exit_type"] is not None:
                # fixing the restore-tabs-nag
                config["profile"]["exit_type"] = None
            fs.seek(0, 0)
            json.dump(config, fs)
            fs.truncate()  # the file might be shorter
            logger.debug("fixed exit_type flag")
    except Exception:
        # best effort: a fresh profile has no Preferences file yet
        logger.debug("did not find a bad exit_type flag ")

    self.options = options

    if not desired_capabilities:
        # kept for backwards compatibility; the value is not used further
        desired_capabilities = options.to_capabilities()

    if not use_subprocess and not windows_headless:
        self.browser_pid = start_detached(
            options.binary_location, *options.arguments
        )
    else:
        startupinfo = None
        if os.name == 'nt' and windows_headless:
            # STARTUPINFO() is Windows only
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        browser = subprocess.Popen(
            [options.binary_location, *options.arguments],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            close_fds=IS_POSIX,
            startupinfo=startupinfo
        )
        self.browser_pid = browser.pid

    service = selenium.webdriver.chromium.service.ChromiumService(
        self.patcher.executable_path
    )

    super().__init__(
        service=service,
        options=options,
        keep_alive=keep_alive,
    )

    self.reactor = None

    if enable_cdp_events:
        if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
            logging.getLogger(
                "selenium.webdriver.remote.remote_connection"
            ).setLevel(20)
        reactor = Reactor(self)
        reactor.start()
        self.reactor = reactor

    if advanced_elements:
        self._web_element_cls = UCWebElement
    else:
        self._web_element_cls = WebElement

    if headless or getattr(options, 'headless', None):
        self._configure_headless()
|
| 498 |
+
|
| 499 |
+
def _configure_headless(self):
    """
    Replace ``self.get`` on this instance with a wrapper that applies
    headless evasions before navigating.

    On every call the wrapper evaluates ``navigator.webdriver`` in the
    current page; when it is truthy the wrapper registers two scripts via
    ``Page.addScriptToEvaluateOnNewDocument`` and overrides the user agent
    (with "Headless" removed) before delegating to the original ``get``.
    """
    # keep a reference to the current get() so the wrapper can delegate to it
    orig_get = self.get
    logger.info("setting properties for headless")

    def get_wrapped(*args, **kwargs):
        if self.execute_script("return navigator.webdriver"):
            logger.info("patch navigator.webdriver")
            # script 1: swap window.navigator for a Proxy that reports
            # webdriver as false
            self.execute_cdp_cmd(
                "Page.addScriptToEvaluateOnNewDocument",
                {
                    "source": """
                        Object.defineProperty(window, "navigator", {
                            value: new Proxy(navigator, {
                                has: (target, key) => (key === "webdriver" ? false : key in target),
                                get: (target, key) =>
                                    key === "webdriver"
                                    ? false
                                    : typeof target[key] === "function"
                                    ? target[key].bind(target)
                                    : target[key],
                            }),
                        });
                    """
                },
            )

            logger.info("patch user-agent string")
            # reuse the browser's own user agent, minus the "Headless" marker
            self.execute_cdp_cmd(
                "Network.setUserAgentOverride",
                {
                    "userAgent": self.execute_script(
                        "return navigator.userAgent"
                    ).replace("Headless", "")
                },
            )
            # script 2: touch points, connection rtt, a window.chrome stub,
            # Notification / permissions.query behaviour and Function
            # prototype call/toString overrides
            self.execute_cdp_cmd(
                "Page.addScriptToEvaluateOnNewDocument",
                {
                    "source": """
                        Object.defineProperty(navigator, 'maxTouchPoints', {get: () => 1});
                        Object.defineProperty(navigator.connection, 'rtt', {get: () => 100});

                        // https://github.com/microlinkhq/browserless/blob/master/packages/goto/src/evasions/chrome-runtime.js
                        window.chrome = {
                            app: {
                                isInstalled: false,
                                InstallState: {
                                    DISABLED: 'disabled',
                                    INSTALLED: 'installed',
                                    NOT_INSTALLED: 'not_installed'
                                },
                                RunningState: {
                                    CANNOT_RUN: 'cannot_run',
                                    READY_TO_RUN: 'ready_to_run',
                                    RUNNING: 'running'
                                }
                            },
                            runtime: {
                                OnInstalledReason: {
                                    CHROME_UPDATE: 'chrome_update',
                                    INSTALL: 'install',
                                    SHARED_MODULE_UPDATE: 'shared_module_update',
                                    UPDATE: 'update'
                                },
                                OnRestartRequiredReason: {
                                    APP_UPDATE: 'app_update',
                                    OS_UPDATE: 'os_update',
                                    PERIODIC: 'periodic'
                                },
                                PlatformArch: {
                                    ARM: 'arm',
                                    ARM64: 'arm64',
                                    MIPS: 'mips',
                                    MIPS64: 'mips64',
                                    X86_32: 'x86-32',
                                    X86_64: 'x86-64'
                                },
                                PlatformNaclArch: {
                                    ARM: 'arm',
                                    MIPS: 'mips',
                                    MIPS64: 'mips64',
                                    X86_32: 'x86-32',
                                    X86_64: 'x86-64'
                                },
                                PlatformOs: {
                                    ANDROID: 'android',
                                    CROS: 'cros',
                                    LINUX: 'linux',
                                    MAC: 'mac',
                                    OPENBSD: 'openbsd',
                                    WIN: 'win'
                                },
                                RequestUpdateCheckStatus: {
                                    NO_UPDATE: 'no_update',
                                    THROTTLED: 'throttled',
                                    UPDATE_AVAILABLE: 'update_available'
                                }
                            }
                        }

                        // https://github.com/microlinkhq/browserless/blob/master/packages/goto/src/evasions/navigator-permissions.js
                        if (!window.Notification) {
                            window.Notification = {
                                permission: 'denied'
                            }
                        }

                        const originalQuery = window.navigator.permissions.query
                        window.navigator.permissions.__proto__.query = parameters =>
                            parameters.name === 'notifications'
                            ? Promise.resolve({ state: window.Notification.permission })
                            : originalQuery(parameters)

                        const oldCall = Function.prototype.call
                        function call() {
                            return oldCall.apply(this, arguments)
                        }
                        Function.prototype.call = call

                        const nativeToStringFunctionString = Error.toString().replace(/Error/g, 'toString')
                        const oldToString = Function.prototype.toString

                        function functionToString() {
                            if (this === window.navigator.permissions.query) {
                                return 'function query() { [native code] }'
                            }
                            if (this === functionToString) {
                                return nativeToStringFunctionString
                            }
                            return oldCall.call(oldToString, this)
                        }
                        // eslint-disable-next-line
                        Function.prototype.toString = functionToString
                    """
                },
            )
        return orig_get(*args, **kwargs)

    # instance attribute shadows the class-level get() from now on
    self.get = get_wrapped
|
| 638 |
+
|
| 639 |
+
# def _get_cdc_props(self):
|
| 640 |
+
# return self.execute_script(
|
| 641 |
+
# """
|
| 642 |
+
# let objectToInspect = window,
|
| 643 |
+
# result = [];
|
| 644 |
+
# while(objectToInspect !== null)
|
| 645 |
+
# { result = result.concat(Object.getOwnPropertyNames(objectToInspect));
|
| 646 |
+
# objectToInspect = Object.getPrototypeOf(objectToInspect); }
|
| 647 |
+
#
|
| 648 |
+
# return result.filter(i => i.match(/^([a-zA-Z]){27}(Array|Promise|Symbol)$/ig))
|
| 649 |
+
# """
|
| 650 |
+
# )
|
| 651 |
+
#
|
| 652 |
+
# def _hook_remove_cdc_props(self):
|
| 653 |
+
# self.execute_cdp_cmd(
|
| 654 |
+
# "Page.addScriptToEvaluateOnNewDocument",
|
| 655 |
+
# {
|
| 656 |
+
# "source": """
|
| 657 |
+
# let objectToInspect = window,
|
| 658 |
+
# result = [];
|
| 659 |
+
# while(objectToInspect !== null)
|
| 660 |
+
# { result = result.concat(Object.getOwnPropertyNames(objectToInspect));
|
| 661 |
+
# objectToInspect = Object.getPrototypeOf(objectToInspect); }
|
| 662 |
+
# result.forEach(p => p.match(/^([a-zA-Z]){27}(Array|Promise|Symbol)$/ig)
|
| 663 |
+
# &&delete window[p]&&console.log('removed',p))
|
| 664 |
+
# """
|
| 665 |
+
# },
|
| 666 |
+
# )
|
| 667 |
+
|
| 668 |
+
def get(self, url):
    """
    Navigate to ``url`` by delegating to the parent WebDriver ``get``.

    The cdc-property hook that used to run here is disabled (see the
    commented-out lines below).
    """
    # if self._get_cdc_props():
    #     self._hook_remove_cdc_props()
    return super().get(url)
|
| 672 |
+
|
| 673 |
+
def add_cdp_listener(self, event_name, callback):
    """
    Register ``callback`` for the CDP event ``event_name``.

    Returns the reactor's ``handlers`` collection after registration, or
    ``False`` when no Reactor is attached to this driver (the reactor is
    only created when the driver was built with enable_cdp_events=True).
    """
    reactor = self.reactor
    # a falsy reactor (None) already rules out a usable Reactor instance
    if not (reactor and isinstance(reactor, Reactor)):
        return False
    reactor.add_event_handler(event_name, callback)
    return reactor.handlers
|
| 682 |
+
|
| 683 |
+
def clear_cdp_listeners(self):
    """Remove every registered CDP event handler; no-op without a Reactor."""
    reactor = self.reactor
    if not (reactor and isinstance(reactor, Reactor)):
        return
    reactor.handlers.clear()
|
| 686 |
+
|
| 687 |
+
def window_new(self):
    """Issue the WebDriver NEW_WINDOW command with type "window"; the result is discarded."""
    self.execute(
        selenium.webdriver.remote.command.Command.NEW_WINDOW, {"type": "window"}
    )
|
| 691 |
+
|
| 692 |
+
def tab_new(self, url: str):
    """
    this opens a url in a new tab.
    apparently, that passes all tests directly!

    A ``cdp`` attribute on the driver is used when present; otherwise a
    CDP client is created from ``self.options`` for this call only.

    Parameters
    ----------
    url
        address to load in the new tab

    Returns
    -------
    None
    """
    # previously an existing ``self.cdp`` made this method a silent no-op
    cdp = getattr(self, "cdp", None)
    if cdp is None:
        from .cdp import CDP

        cdp = CDP(self.options)
    cdp.tab_new(url)
|
| 710 |
+
|
| 711 |
+
def reconnect(self, timeout=0.1):
    """
    Restart the chromedriver service and open a fresh session.

    The service is stopped, ``timeout`` seconds are waited, then the
    service is started and ``start_session()`` is called. A failure in
    any single step is logged at debug level and does not prevent the
    following steps from running.
    """

    def _attempt(step):
        # attribute lookups happen inside ``step`` so they are guarded too
        try:
            step()
        except Exception as exc:
            logger.debug(exc)

    _attempt(lambda: self.service.stop())
    time.sleep(timeout)
    _attempt(lambda: self.service.start())
    _attempt(lambda: self.start_session())
|
| 726 |
+
|
| 727 |
+
def start_session(self, capabilities=None, browser_profile=None):
    """
    Start a WebDriver session, deriving capabilities when none are given.

    When ``capabilities`` is falsy they are rebuilt from ``self.options``.
    ``browser_profile`` is accepted but not forwarded to the parent method.
    """
    effective_capabilities = capabilities or self.options.to_capabilities()
    super().start_session(effective_capabilities)
|
| 732 |
+
|
| 733 |
+
def find_elements_recursive(self, by, value):
    """
    find elements in all frames

    This is a generator on purpose: a returned list of elements would be
    stale on arrival, whereas each yielded element is handed out while the
    driver is still switched to the frame that contains it.

    Args:
        by: By
        value: str
    Returns: Generator[webelement.WebElement]
    """

    def _scan(frame=None):
        if frame:
            self.switch_to.frame(frame)
        else:
            # ensure we are on main content frame
            self.switch_to.default_content()
        yield from self.find_elements(by, value)
        # switch back to main content, otherwise we will get StaleElementReferenceException
        self.switch_to.default_content()

    # root document first
    yield from _scan()

    # then every iframe found in the root document
    for frame in self.find_elements('css selector', 'iframe'):
        yield from _scan(frame)
|
| 767 |
+
|
| 768 |
+
def quit(self):
    """
    Shut down chromedriver, the CDP reactor and the browser, then remove
    the temporary profile directory.

    Every step is best-effort: a failure in one step is swallowed so the
    remaining clean-up still runs. The profile directory is only removed
    when ``keep_user_data_dir`` is false; removal is retried up to 5 times
    (0.1 s apart) because the browser may still hold files open.
    """
    try:
        self.service.stop()
        self.service.process.kill()
        self.command_executor.close()
        self.service.process.wait(5)
        logger.debug("webdriver process ended")
    except (AttributeError, RuntimeError, OSError, subprocess.TimeoutExpired):
        # TimeoutExpired: the driver did not exit within 5 s; carry on so
        # the browser and the profile are still cleaned up
        pass
    try:
        self.reactor.event.set()
        logger.debug("shutting down reactor")
    except AttributeError:
        pass
    try:
        # 15 == SIGTERM
        os.kill(self.browser_pid, 15)
        logger.debug("gracefully closed browser")
    except Exception:  # noqa
        pass
    if (
        hasattr(self, "keep_user_data_dir")
        and hasattr(self, "user_data_dir")
        and not self.keep_user_data_dir
    ):
        for _ in range(5):
            try:
                shutil.rmtree(self.user_data_dir, ignore_errors=False)
            except FileNotFoundError:
                # already gone: nothing left to retry
                break
            except (RuntimeError, OSError, PermissionError) as e:
                logger.debug(
                    "When removing the temp profile, a %s occured: %s\nretrying..."
                    % (e.__class__.__name__, e)
                )
            else:
                logger.debug("successfully removed %s" % self.user_data_dir)
                break

            try:
                time.sleep(0.1)
            except OSError:
                pass

    # dereference patcher, so patcher can start cleaning up as well.
    # this must come last, otherwise it will throw 'in use' errors
    self.patcher = None
|
| 814 |
+
|
| 815 |
+
def __getattribute__(self, item):
    """
    Attribute access hook used for call tracing.

    With ``debug`` off this is plain attribute lookup. With ``debug`` on,
    bound methods are returned wrapped in a function that logs the call
    (qualified name, args, kwargs) at debug level before delegating.
    """
    lookup = super().__getattribute__
    if not lookup("debug"):
        return lookup(item)

    import inspect

    original = lookup(item)
    # only bound methods are traced; everything else is returned untouched
    if not inspect.ismethod(original) or inspect.isclass(original):
        return original

    def newfunc(*args, **kwargs):
        logger.debug(
            "calling %s with args %s and kwargs %s\n"
            % (original.__qualname__, args, kwargs)
        )
        return original(*args, **kwargs)

    return newfunc
|
| 833 |
+
|
| 834 |
+
def __enter__(self):
    """Context-manager entry: returns the driver itself."""
    return self
|
| 836 |
+
|
| 837 |
+
def __exit__(self, exc_type, exc_val, exc_tb):
    """
    Context-manager exit: restart the driver service instead of quitting.

    The service is stopped, ``self._delay`` seconds are waited, then the
    service is started again and a new session is opened. Nothing is
    returned, so an exception raised inside the ``with`` body propagates.
    """
    self.service.stop()
    # _delay is set to 3 (seconds) in __init__
    time.sleep(self._delay)
    self.service.start()
    self.start_session()
|
| 842 |
+
|
| 843 |
+
def __hash__(self):
    """Hash the driver by the debugger address ("host:port") stored in its options."""
    return hash(self.options.debugger_address)
|
| 845 |
+
|
| 846 |
+
def __dir__(self):
    """Return the attribute listing produced by ``object.__dir__``."""
    return object.__dir__(self)
|
| 848 |
+
|
| 849 |
+
def __del__(self):
    """
    Destructor: kill the chromedriver process (best effort), then run quit().
    """
    try:
        self.service.process.kill()
    except Exception:  # noqa
        # best effort: the service or its process may not exist (anymore);
        # unlike a bare ``except`` this does not swallow KeyboardInterrupt
        # or SystemExit
        pass
    self.quit()
|
| 855 |
+
|
| 856 |
+
@classmethod
def _ensure_close(cls, self):
    """
    Finalizer callback: kill the chromedriver process of ``self`` if there is one.

    Does nothing (beyond logging) when ``self`` has no ``service``, the
    service has no ``process``, or the process has no ``kill`` attribute.
    """
    # needs to be a classmethod so finalize can find the reference
    logger.info("ensuring close")
    if not hasattr(self, "service"):
        return
    if not hasattr(self.service, "process"):
        return
    if not hasattr(self.service.process, "kill"):
        return
    self.service.process.kill()
|
| 866 |
+
|
| 867 |
+
|
| 868 |
+
def find_chrome_executable():
    """
    Finds the chrome, chrome beta, chrome canary, chromium executable

    Candidates are probed in a fixed order, so the result is stable across
    runs when several browsers are installed: on POSIX every PATH entry is
    tried in PATH order with the known binary names, followed by the
    standard macOS application bundles; elsewhere the Google Chrome
    install folder below the usual Windows program directories is tried.

    Returns
    -------
    executable_path : str or None
        the normalized path of the first candidate that is an executable
        file, or None when no candidate qualifies

    """
    candidates = []
    if IS_POSIX:
        # PATH may be unset; empty entries are skipped
        for item in os.environ.get("PATH", "").split(os.pathsep):
            if not item:
                continue
            for subitem in (
                "google-chrome",
                "chromium",
                "chromium-browser",
                "chrome",
                "google-chrome-stable",
            ):
                candidates.append(os.path.join(item, subitem))
        if "darwin" in sys.platform:
            candidates.extend(
                [
                    "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
                    "/Applications/Chromium.app/Contents/MacOS/Chromium",
                ]
            )
    else:
        for item in map(
            os.environ.get,
            ("PROGRAMFILES", "PROGRAMFILES(X86)", "LOCALAPPDATA", "PROGRAMW6432"),
        ):
            if item is not None:
                for subitem in (
                    "Google/Chrome/Application",
                ):
                    candidates.append(os.path.join(item, subitem, "chrome.exe"))
    for candidate in candidates:
        logger.debug('checking if %s exists and is executable' % candidate)
        # isfile: a directory called e.g. "chrome" on PATH is not a browser
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            logger.debug('found! using %s' % candidate)
            return os.path.normpath(candidate)
    return None
|
flaresolverr/undetected_chromedriver/cdp.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# this module is part of undetected_chromedriver
|
| 3 |
+
|
| 4 |
+
import json
|
| 5 |
+
import logging
|
| 6 |
+
|
| 7 |
+
import requests
|
| 8 |
+
import websockets
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
log = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class CDPObject(dict):
    """Dictionary whose keys can also be read and written as attributes.

    On construction, dictionary values are converted to ``CDPObject`` too,
    both when they are stored directly under a key and when they are items
    of a list stored under a key, so ``obj.a.b`` and ``obj.items[0].id`` work.
    Lists are updated in place.
    """

    def __init__(self, *a, **k):
        super().__init__(*a, **k)
        # Share one storage between attribute access and item access.
        self.__dict__ = self
        for key, value in list(self.items()):
            if isinstance(value, dict):
                self[key] = CDPObject(value)
            elif isinstance(value, list):
                for index, element in enumerate(value):
                    if isinstance(element, dict):
                        # Wrap the element itself. Wrapping `self` here (as
                        # the code used to) recursed without end.
                        value[index] = CDPObject(element)

    def __repr__(self):
        tpl = f"{self.__class__.__name__}(\n\t{{}}\n\t)"
        return tpl.format("\n ".join(f"{k} = {v}" for k, v in self.items()))
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class PageElement(CDPObject):
    """CDPObject subclass with no behaviour of its own.

    ``CDP.tab_list`` wraps every entry of the ``/json/list`` response in it.
    """
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class CDP:
    """Small client for the browser's remote-debugging endpoints.

    HTTP endpoints (``/json...``) are reached through a ``requests`` session;
    protocol commands are sent over the target's WebSocket debugger URL.
    """

    log = logging.getLogger("CDP")

    # URI templates, relative to ``server_addr``.
    endpoints = CDPObject(
        {
            "json": "/json",
            "protocol": "/json/protocol",
            "list": "/json/list",
            "new": "/json/new?{url}",
            "activate": "/json/activate/{id}",
            "close": "/json/close/{id}",
        }
    )

    def __init__(self, options: "ChromeOptions"):  # noqa
        """Connect to the debugger at ``options.debugger_address`` ("host:port").

        The first entry returned by the ``/json`` endpoint supplies the
        initial ``sessionId`` and ``wsurl``.
        """
        self.server_addr = "http://{0}:{1}".format(*options.debugger_address.split(":"))

        self._reqid = 0
        self._session = requests.Session()
        self._last_resp = None
        self._last_json = None

        resp = self.get(self.endpoints.json)  # noqa
        self.sessionId = resp[0]["id"]
        self.wsurl = resp[0]["webSocketDebuggerUrl"]

    def tab_activate(self, id=None):
        """Activate tab ``id``; without an id, use the first listed tab.

        Only the no-id path updates ``self.wsurl``.
        """
        if not id:
            active_tab = self.tab_list()[0]
            id = active_tab.id  # noqa
            self.wsurl = active_tab.webSocketDebuggerUrl  # noqa
        return self.post(self.endpoints["activate"].format(id=id))

    def tab_list(self):
        """Return the ``/json/list`` entries as ``PageElement`` objects."""
        retval = self.get(self.endpoints["list"])
        return [PageElement(o) for o in retval]

    def tab_new(self, url):
        """Request a new tab for ``url`` and return the result of ``post``."""
        return self.post(self.endpoints["new"].format(url=url))

    def tab_close_last_opened(self):
        """Close the last listed target whose ``type`` is ``"page"``."""
        sessions = self.tab_list()
        opentabs = [s for s in sessions if s["type"] == "page"]
        return self.post(self.endpoints["close"].format(id=opentabs[-1]["id"]))

    async def send(self, method: str, params: dict):
        """Send one protocol command over a fresh WebSocket connection.

        The first message received is stored (raw and decoded), logged, and
        returned as decoded JSON.
        """
        self._reqid += 1
        async with websockets.connect(self.wsurl) as ws:
            await ws.send(
                json.dumps({"method": method, "params": params, "id": self._reqid})
            )
            self._last_resp = await ws.recv()
            self._last_json = json.loads(self._last_resp)
            self.log.info(self._last_json)
        return self._last_json

    def get(self, uri):
        """GET ``server_addr + uri``.

        Returns the decoded JSON body, or ``None`` when the body cannot be
        decoded as JSON.
        """
        resp = self._session.get(self.server_addr + uri)
        try:
            self._last_resp = resp
            self._last_json = resp.json()
        except Exception:
            return
        else:
            return self._last_json

    def post(self, uri, data: dict = None):
        """POST ``data`` (default ``{}``) as JSON to ``server_addr + uri``.

        Returns the decoded JSON body, or the raw response object when the
        body cannot be decoded as JSON.
        """
        if not data:
            data = {}
        resp = self._session.post(self.server_addr + uri, json=data)
        try:
            self._last_resp = resp
            self._last_json = resp.json()
        except Exception:
            return self._last_resp
        # Success used to fall through and return None, unlike get().
        return self._last_json

    @property
    def last_json(self):
        """Most recently decoded JSON payload (HTTP or WebSocket)."""
        return self._last_json
|
flaresolverr/undetected_chromedriver/devtool.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
from collections.abc import Mapping
|
| 3 |
+
from collections.abc import Sequence
|
| 4 |
+
from functools import wraps
|
| 5 |
+
import os
|
| 6 |
+
import logging
|
| 7 |
+
import threading
|
| 8 |
+
import time
|
| 9 |
+
import traceback
|
| 10 |
+
from typing import Any
|
| 11 |
+
from typing import Awaitable
|
| 12 |
+
from typing import Callable
|
| 13 |
+
from typing import List
|
| 14 |
+
from typing import Optional
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class Structure(dict):
    """
    This is a dict-like object structure, which you should subclass.

    Keys are readable and writable as attributes. On construction, nested
    mappings are converted to the same class. Non-string sequences become
    lists in which mapping items are converted and all other items are kept
    unchanged.
    """

    _store = {}

    def __init__(self, *a, **kw):
        """
        Instantiate a new instance.

        :param a: positional arguments, as accepted by ``dict``
        :param kw: keyword arguments, as accepted by ``dict``
        """

        super().__init__()

        # auxiliary dict
        d = dict(*a, **kw)
        for k, v in d.items():
            if isinstance(v, Mapping):
                self[k] = self.__class__(v)
            elif isinstance(v, Sequence) and not isinstance(v, (str, bytes)):
                # Only mapping items can be turned into a Structure; wrapping
                # scalars (e.g. [1, 2] or ["a"]) used to raise.
                self[k] = [
                    self.__class__(i) if isinstance(i, Mapping) else i for i in v
                ]
            else:
                self[k] = v
        # Bypass our own __setattr__ so attribute lookups resolve to the items.
        super().__setattr__("__dict__", self)

    def __getattr__(self, item):
        # Only reached when normal lookup failed; defer to dict, which raises
        # AttributeError for unknown names.
        return getattr(super(), item)

    def __getitem__(self, item):
        return super().__getitem__(item)

    def __setattr__(self, key, value):
        self.__setitem__(key, value)

    def __setitem__(self, key, value):
        super().__setitem__(key, value)

    def update(self, *a, **kw):
        super().update(*a, **kw)

    def __eq__(self, other):
        """Compare by items; anything that is not a mapping is not equal."""
        if not isinstance(other, Mapping):
            return NotImplemented
        try:
            return frozenset(other.items()) == frozenset(self.items())
        except TypeError:
            # Unhashable values (e.g. lists): fall back to plain dict equality.
            return dict(other.items()) == dict(self.items())

    def __hash__(self):
        return hash(frozenset(self.items()))

    @classmethod
    def __init_subclass__(cls, **kwargs):
        # Give every subclass its own _store instead of sharing the base one.
        cls._store = {}

    def _normalize_strings(self):
        """Strip surrounding whitespace from every top-level string value."""
        for k, v in self.copy().items():
            if isinstance(v, (str)):
                self[k] = v.strip()
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def timeout(seconds=3, on_timeout: Optional[Callable[[callable], Any]] = None):
    """Decorator factory that arms a ``threading.Timer`` around each call.

    When the timer fires before the call returns, ``on_timeout(func)`` is
    invoked if given, otherwise ``TimeoutError`` is raised. Both happen in
    the timer's thread; the decorated function is called directly and nothing
    here interrupts it. The timer is cancelled once the call returns or
    raises.

    :param seconds: delay before the timer fires
    :param on_timeout: optional callback receiving the decorated function
    """

    def wrapper(func):
        def _expired():
            if on_timeout:
                on_timeout(func)
            else:
                raise TimeoutError("function call timed out")

        @wraps(func)
        def wrapped(*args, **kwargs):
            watchdog = threading.Timer(interval=seconds, function=_expired)
            watchdog.start()
            try:
                return func(*args, **kwargs)
            finally:
                # Runs on both the normal and the exceptional path.
                watchdog.cancel()

        return wrapped

    return wrapper
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def test():
    """Manual smoke test; needs a local Chrome and network access.

    Starts ``uc.Chrome`` with logging capabilities enabled, wraps the command
    executor's ``_request`` so each call is printed, polls the driver logs
    from a background thread, opens a page, waits ten seconds and quits.
    """
    import sys
    import os

    sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
    import undetected_chromedriver as uc
    import threading

    def debug_enabled():
        return logging.getLogger().getEffectiveLevel() <= logging.DEBUG

    def collector(
        driver: uc.Chrome,
        stop_event: threading.Event,
        on_event_coro: Optional[Callable[[List[str]], Awaitable[Any]]] = None,
        listen_events: Sequence = ("browser", "network", "performance"),
    ):
        """Poll ``driver.get_log`` in a thread until ``stop_event`` is set."""

        def threaded(driver, stop_event, on_event_coro):
            async def _ensure_service_started():
                # Wait while the service process reports a truthy exit code.
                while (
                    getattr(driver, "service", False)
                    and getattr(driver.service, "process", False)
                    and driver.service.process.poll()
                ):
                    print("waiting for driver service to come back on")
                    await asyncio.sleep(0.05)

            async def get_log_lines(typ):
                await _ensure_service_started()
                return driver.get_log(typ)

            async def looper():
                while not stop_event.is_set():
                    log_lines = []
                    try:
                        for log_type in listen_events:
                            try:
                                log_lines += await get_log_lines(log_type)
                            except:  # noqa - keep polling whatever happens
                                if debug_enabled():
                                    traceback.print_exc()
                        if log_lines and on_event_coro:
                            await on_event_coro(log_lines)
                    except Exception:
                        if debug_enabled():
                            traceback.print_exc()

            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            loop.run_until_complete(looper())

        t = threading.Thread(target=threaded, args=(driver, stop_event, on_event_coro))
        t.start()

    async def on_event(data):
        print("on_event")
        print("data:", data)

    def func_called(fn):
        def wrapped(*args, **kwargs):
            print(
                "func called! %s (args: %s, kwargs: %s)" % (fn.__name__, args, kwargs)
            )
            # Hold the call back while the service process has exited.
            while driver.service.process and driver.service.process.poll() is not None:
                time.sleep(0.1)
            res = fn(*args, **kwargs)
            print("func completed! (result: %s)" % res)
            return res

        return wrapped

    logging.basicConfig(level=10)

    options = uc.ChromeOptions()
    options.set_capability(
        "goog:loggingPrefs", {"performance": "ALL", "browser": "ALL", "network": "ALL"}
    )

    driver = uc.Chrome(version_main=96, options=options)

    # driver.command_executor._request = timeout(seconds=1)(driver.command_executor._request)
    driver.command_executor._request = func_called(driver.command_executor._request)
    collector_stop = threading.Event()
    collector(driver, collector_stop, on_event)

    try:
        driver.get("https://nowsecure.nl")
        time.sleep(10)
    finally:
        # The stop event was never set before, so the non-daemon polling
        # thread looped forever and kept the interpreter alive.
        collector_stop.set()
        if os.name == "nt":
            driver.close()
        driver.quit()
|
flaresolverr/undetected_chromedriver/dprocess.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import atexit
|
| 2 |
+
import logging
|
| 3 |
+
import multiprocessing
|
| 4 |
+
import os
|
| 5 |
+
import platform
|
| 6 |
+
import signal
|
| 7 |
+
from subprocess import PIPE
|
| 8 |
+
from subprocess import Popen
|
| 9 |
+
import sys
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
CREATE_NEW_PROCESS_GROUP = 0x00000200
|
| 13 |
+
DETACHED_PROCESS = 0x00000008
|
| 14 |
+
|
| 15 |
+
REGISTERED = []
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def start_detached(executable, *args):
    """
    Starts a fully independent subprocess (with no parent)

    A short-lived helper process launches the executable and reports the
    new pid back through a one-way pipe; the pid is also appended to
    ``REGISTERED``.

    :param executable: executable
    :param args: arguments to the executable, eg: ['--param1_key=param1_val', '-vvv' ...]
    :return: pid of the grandchild process
    :raises EOFError: if the helper process exits without reporting a pid
    """

    # create pipe
    reader, writer = multiprocessing.Pipe(False)

    # do not keep reference
    process = multiprocessing.Process(
        target=_start_detached,
        args=(executable, *args),
        kwargs={"writer": writer},
        daemon=True,
    )
    try:
        process.start()
        # The helper has its own handle to the write end now. Dropping ours
        # lets recv() raise EOFError instead of blocking forever when the
        # helper dies before sending the pid.
        writer.close()
        process.join()
        # receive pid from pipe
        pid = reader.recv()
    finally:
        # close pipes (closing an already closed end is a no-op)
        writer.close()
        reader.close()
        process.close()

    REGISTERED.append(pid)
    return pid
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _start_detached(executable, *args, writer: multiprocessing.Pipe = None):
    """Launch ``executable`` detached, send its pid to ``writer``, then exit.

    Intended to run inside the helper process created by ``start_detached``;
    it ends the current process through ``sys.exit()``.
    """
    # Pick the platform-specific way of detaching the new process.
    if platform.system() == "Windows":
        launch_options = {
            "creationflags": DETACHED_PROCESS | CREATE_NEW_PROCESS_GROUP
        }
    elif sys.version_info < (3, 2):
        # assume posix
        launch_options = {"preexec_fn": os.setsid}
    else:  # Python 3.2+ and Unix
        launch_options = {"start_new_session": True}

    child = Popen(
        [executable, *args], stdin=PIPE, stdout=PIPE, stderr=PIPE, **launch_options
    )

    # Report the pid, then terminate this helper process.
    writer.send(child.pid)
    sys.exit()
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def _cleanup():
    """Send SIGTERM to every pid in ``REGISTERED``, ignoring any failure."""
    for registered_pid in REGISTERED:
        try:
            logging.getLogger(__name__).debug(
                "cleaning up pid %d " % registered_pid
            )
            os.kill(registered_pid, signal.SIGTERM)
        except:  # noqa
            pass


# Run the cleanup when the interpreter shuts down.
atexit.register(_cleanup)
|
flaresolverr/undetected_chromedriver/options.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# this module is part of undetected_chromedriver
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
from selenium.webdriver.chromium.options import ChromiumOptions as _ChromiumOptions
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class ChromeOptions(_ChromiumOptions):
    """Chromium options extended with a profile directory and prefs handling."""

    _session = None
    _user_data_dir = None

    @property
    def user_data_dir(self):
        return self._user_data_dir

    @user_data_dir.setter
    def user_data_dir(self, path: str):
        """
        Sets the browser profile folder to use, or creates a new profile
        at given <path>.

        Parameters
        ----------
        path: str
            the path to a chrome profile folder
            if it does not exist, a new profile will be created at given location
        """
        apath = os.path.abspath(path)
        self._user_data_dir = os.path.normpath(apath)

    @staticmethod
    def _undot_key(key, value):
        """turn a (dotted key, value) into a proper nested dict"""
        if "." in key:
            key, rest = key.split(".", 1)
            value = ChromeOptions._undot_key(rest, value)
        return {key: value}

    @staticmethod
    def _merge_nested(a, b):
        """
        merges b into a
        leaf values in a are overwritten with values from b
        """
        for key in b:
            if key in a:
                if isinstance(a[key], dict) and isinstance(b[key], dict):
                    ChromeOptions._merge_nested(a[key], b[key])
                    continue
            a[key] = b[key]
        return a

    def handle_prefs(self, user_data_dir):
        """Write the experimental ``prefs`` option into the profile on disk.

        Does nothing when no ``prefs`` are set. Otherwise the dotted keys are
        expanded into nested dicts, merged over an existing
        ``<user_data_dir>/Default/Preferences`` file (the new values win),
        written back, and ``prefs`` is removed from the experimental options.

        :param user_data_dir: profile directory; falls back to
            ``self._user_data_dir`` when falsy
        """
        prefs = self.experimental_options.get("prefs")
        if prefs:
            user_data_dir = user_data_dir or self._user_data_dir
            default_path = os.path.join(user_data_dir, "Default")
            os.makedirs(default_path, exist_ok=True)

            # undot prefs dict keys
            undot_prefs = {}
            for key, value in prefs.items():
                undot_prefs = self._merge_nested(
                    undot_prefs, self._undot_key(key, value)
                )

            prefs_file = os.path.join(default_path, "Preferences")
            if os.path.exists(prefs_file):
                # JSON text is UTF-8. Decoding it as latin1 turned every
                # non-ASCII character into mojibake, which was then written
                # back into the profile.
                with open(prefs_file, encoding="utf-8", mode="r") as f:
                    undot_prefs = self._merge_nested(json.load(f), undot_prefs)

            with open(prefs_file, encoding="utf-8", mode="w") as f:
                json.dump(undot_prefs, f)

            # remove the experimental_options to avoid an error
            del self._experimental_options["prefs"]

    @classmethod
    def from_options(cls, options):
        """Create an instance carrying a copy of ``options``' attributes."""
        o = cls()
        o.__dict__.update(options.__dict__)
        return o
|
flaresolverr/undetected_chromedriver/patcher.py
ADDED
|
@@ -0,0 +1,473 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# this module is part of undetected_chromedriver
|
| 3 |
+
|
| 4 |
+
from packaging.version import Version as LooseVersion
|
| 5 |
+
import io
|
| 6 |
+
import json
|
| 7 |
+
import logging
|
| 8 |
+
import os
|
| 9 |
+
import pathlib
|
| 10 |
+
import platform
|
| 11 |
+
import random
|
| 12 |
+
import re
|
| 13 |
+
import shutil
|
| 14 |
+
import string
|
| 15 |
+
import subprocess
|
| 16 |
+
import sys
|
| 17 |
+
import time
|
| 18 |
+
from urllib.request import urlopen
|
| 19 |
+
from urllib.request import urlretrieve
|
| 20 |
+
import zipfile
|
| 21 |
+
from multiprocessing import Lock
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
IS_POSIX = sys.platform.startswith(("darwin", "cygwin", "linux", "linux2", "freebsd"))
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class Patcher(object):
|
| 29 |
+
lock = Lock()
|
| 30 |
+
exe_name = "chromedriver%s"
|
| 31 |
+
|
| 32 |
+
platform = sys.platform
|
| 33 |
+
if platform.endswith("win32"):
|
| 34 |
+
d = "~/appdata/roaming/undetected_chromedriver"
|
| 35 |
+
elif "LAMBDA_TASK_ROOT" in os.environ:
|
| 36 |
+
d = "/tmp/undetected_chromedriver"
|
| 37 |
+
elif platform.startswith(("linux", "linux2")):
|
| 38 |
+
d = "~/.local/share/undetected_chromedriver"
|
| 39 |
+
elif platform.endswith("darwin"):
|
| 40 |
+
d = "~/Library/Application Support/undetected_chromedriver"
|
| 41 |
+
else:
|
| 42 |
+
d = "~/.undetected_chromedriver"
|
| 43 |
+
data_path = os.path.abspath(os.path.expanduser(d))
|
| 44 |
+
|
| 45 |
+
def __init__(
    self,
    executable_path=None,
    force=False,
    version_main: int = 0,
    user_multi_procs=False,
):
    """
    Args:
        executable_path: None = automatic
                         a full file path to the chromedriver executable
        force: False
                terminate processes which are holding lock
        version_main: 0 = auto
            specify main chrome version (rounded, ex: 82)
        user_multi_procs: False
            when True, the automatic executable path is left as built from
            the data directory instead of being passed through abspath
    """
    prefix = "undetected"
    self.force = force
    self._custom_exe_path = False
    self.user_multi_procs = user_multi_procs

    try:
        # Versions up to and including 114 use the legacy download layout.
        major = int(version_main)
        self.is_old_chromedriver = version_main and major <= 114
    except (ValueError, TypeError):
        # Only log when /app/chromedriver is absent (i.e. not inside Docker).
        if not os.path.exists("/app/chromedriver"):
            logging.info("version_main cannot be converted to an integer")
        self.is_old_chromedriver = False

    # Needs to be called before self.exe_name is accessed
    self._set_platform_name()

    if not os.path.exists(self.data_path):
        os.makedirs(self.data_path, exist_ok=True)

    self.zip_path = os.path.join(self.data_path, prefix)

    if executable_path:
        # Caller-supplied binary: make sure it carries ".exe" off POSIX.
        if not IS_POSIX and executable_path[-4:] != ".exe":
            executable_path += ".exe"
        self._custom_exe_path = True
        self.executable_path = executable_path
    else:
        if sys.platform.startswith("freebsd"):
            file_name = self.exe_name
        else:
            file_name = "_".join([prefix, self.exe_name])
        automatic_path = os.path.join(self.data_path, file_name)
        if not self.user_multi_procs:
            automatic_path = os.path.abspath(os.path.join(".", automatic_path))
        self.executable_path = automatic_path

    # Set the correct repository to download the Chromedriver from
    self.url_repo = (
        "https://chromedriver.storage.googleapis.com"
        if self.is_old_chromedriver
        else "https://googlechromelabs.github.io/chrome-for-testing"
    )

    self.version_main = version_main
    self.version_full = None
|
| 120 |
+
|
| 121 |
+
def _set_platform_name(self):
    """
    Derive ``platform_name`` and the final ``exe_name`` from ``self.platform``
    so that the matching chromedriver build is downloaded.

    ``exe_name`` is a ``%s`` template that receives the file extension. Each
    platform test is independent of the others; a platform that matches none
    of them leaves both attributes untouched.
    """
    current = self.platform

    if current.endswith("win32"):
        self.platform_name = "win32"
        self.exe_name = self.exe_name % ".exe"

    if current.endswith(("linux", "linux2")):
        self.platform_name = "linux64"
        self.exe_name = self.exe_name % ""

    if current.endswith("darwin"):
        # The legacy repository and the newer one name the mac build differently.
        self.platform_name = "mac64" if self.is_old_chromedriver else "mac-x64"
        self.exe_name = self.exe_name % ""

    if current.startswith("freebsd"):
        self.platform_name = "freebsd"
        self.exe_name = self.exe_name % ""
|
| 141 |
+
|
| 142 |
+
def auto(self, executable_path=None, force=False, version_main=None, _=None):
    """
    Make sure a patched chromedriver binary exists at ``self.executable_path``.

    Args:
        executable_path: use (and patch if needed) this binary instead of
            downloading one
        force: when True, remember it on the instance; processes holding
            the binary are then killed if it cannot be deleted
        version_main: overrides ``self.version_main`` when truthy

    Returns:
        True when an already patched binary is reused, None when a custom
        binary was already patched or no chromedriver is installed (FreeBSD),
        otherwise the result of ``patch_exe()`` / ``patch()``.
    """
    p = pathlib.Path(self.data_path)
    if self.user_multi_procs:
        # NOTE(review): a new Lock object is created on every call, so this
        # presumably does not exclude other processes; confirm whether the
        # class-level ``lock`` was intended.
        with Lock():
            files = list(p.rglob("*chromedriver*"))
            # An empty data dir used to make max() raise ValueError; fall
            # through to the regular flow instead.
            if files:
                most_recent = max(files, key=lambda f: f.stat().st_mtime)
                files.remove(most_recent)
                for stale in files:
                    stale.unlink()
                if self.is_binary_patched(most_recent):
                    self.executable_path = str(most_recent)
                    return True

    if executable_path:
        self.executable_path = executable_path
        self._custom_exe_path = True

    if self._custom_exe_path:
        ispatched = self.is_binary_patched(self.executable_path)
        if not ispatched:
            return self.patch_exe()
        else:
            return

    if version_main:
        self.version_main = version_main
    if force is True:
        self.force = force

    if self.platform_name == "freebsd":
        # Nothing is downloaded here: copy the chromedriver found on PATH.
        chromedriver_path = shutil.which("chromedriver")

        # shutil.which returns None when nothing is found; passing that to
        # os.path.isfile raised TypeError.
        if (
            chromedriver_path is None
            or not os.path.isfile(chromedriver_path)
            or not os.access(chromedriver_path, os.X_OK)
        ):
            logging.error("Chromedriver not installed!")
            return

        version_path = os.path.join(os.path.dirname(self.executable_path), "version.txt")

        with os.popen(f'"{chromedriver_path}" --version') as process:
            chromedriver_version = process.read().split(' ')[1].split(' ')[0]

        current_version = None
        if os.path.isfile(version_path) or os.access(version_path, os.X_OK):
            with open(version_path, 'r') as f:
                current_version = f.read()

        if current_version != chromedriver_version:
            logging.info("Copying chromedriver executable...")
            shutil.copy(chromedriver_path, self.executable_path)
            os.chmod(self.executable_path, 0o755)

            with open(version_path, 'w') as f:
                f.write(chromedriver_version)

            logging.info("Chromedriver executable copied!")
    else:
        try:
            os.unlink(self.executable_path)
        except PermissionError:
            if self.force:
                self.force_kill_instances(self.executable_path)
                return self.auto(force=not self.force)
            try:
                if self.is_binary_patched():
                    # assumes already running AND patched
                    return True
            except PermissionError:
                pass
            # return False
        except FileNotFoundError:
            pass

        release = self.fetch_release_number()
        self.version_main = release.major
        self.version_full = release
        self.unzip_package(self.fetch_package())

    return self.patch()
|
| 231 |
+
|
| 232 |
+
def driver_binary_in_use(self, path: str = None) -> bool:
    """
    naive test to check if a found chromedriver binary is
    currently in use

    Args:
        path: a string or PathLike object to the binary to check.
              if not specified, this object's executable_path is checked

    Returns:
        True when the file cannot be opened for appending, or when seeking
        or reading it raises PermissionError; False otherwise.

    Raises:
        OSError: if the file does not exist
    """
    if not path:
        path = self.executable_path
    p = pathlib.Path(path)

    if not p.exists():
        raise OSError("file does not exist: %s" % p)
    try:
        with open(p, mode="a+b") as fs:
            denied = []
            try:
                fs.seek(0, 0)
            except PermissionError as e:
                denied.append(e)
            # since some systems apparently allow seeking,
            # we conduct another test
            try:
                fs.readline()
            except PermissionError as e:
                denied.append(e)
            return bool(denied)
    except Exception:
        # ok safe to assume this is in use. This path used to return None,
        # which callers read as "not in use".
        return True
|
| 269 |
+
|
| 270 |
+
def cleanup_unused_files(self):
    """Delete every entry in the data directory whose name contains "undetected".

    Best effort: entries that cannot be unlinked (directories, files that are
    locked or already gone) are skipped.
    """
    data_dir = pathlib.Path(self.data_path)
    for item in list(data_dir.glob("*undetected*")):
        try:
            item.unlink()
        except OSError:
            # Skip filesystem failures only; a bare except would also have
            # hidden KeyboardInterrupt and programming errors.
            pass
|
| 278 |
+
|
| 279 |
+
def patch(self):
    """Patch the executable, then report whether it now reads as patched."""
    self.patch_exe()
    patched = self.is_binary_patched()
    return patched
|
| 282 |
+
|
| 283 |
+
def fetch_release_number(self):
    """
    Gets the latest release available, or the latest release of the major
    version in self.version_main if that is set.

    :return: version
    :rtype: LooseVersion
    """
    # Endpoint for old versions of Chromedriver (114 and below)
    if self.is_old_chromedriver:
        path = f"/latest_release_{self.version_main}"
        path = path.upper()
        logger.debug("getting release number from %s" % path)
        # Close the response like the other branches do; it used to be left
        # open.
        with urlopen(self.url_repo + path) as conn:
            return LooseVersion(conn.read().decode())

    # Endpoint for new versions of Chromedriver (115+)
    if not self.version_main:
        # Fetch the latest version
        path = "/last-known-good-versions-with-downloads.json"
        logger.debug("getting release number from %s" % path)
        with urlopen(self.url_repo + path) as conn:
            response = conn.read().decode()

        last_versions = json.loads(response)
        return LooseVersion(last_versions["channels"]["Stable"]["version"])

    # Fetch the latest minor version of the major version provided
    path = "/latest-versions-per-milestone-with-downloads.json"
    logger.debug("getting release number from %s" % path)
    with urlopen(self.url_repo + path) as conn:
        response = conn.read().decode()

    major_versions = json.loads(response)
    return LooseVersion(major_versions["milestones"][str(self.version_main)]["version"])
|
| 315 |
+
|
| 316 |
+
def parse_exe_version(self):
    """
    Scan the driver binary for its embedded version string.

    :return: the version as a LooseVersion, or None when no marker is found
    """
    marker = rb"platform_handle\x00content\x00([0-9.]*)"
    with io.open(self.executable_path, "rb") as binary:
        # Iterating a binary file yields the same b"\n"-terminated chunks
        # as repeated readline() calls.
        for chunk in binary:
            found = re.search(marker, chunk)
            if found:
                return LooseVersion(found.group(1).decode())
    return None
|
| 322 |
+
|
| 323 |
+
def fetch_package(self):
    """
    Download the ChromeDriver archive matching ``self.version_full``.

    :return: path to downloaded file
    """
    version = str(self.version_full)
    if self.is_old_chromedriver:
        archive = f"chromedriver_{self.platform_name}.zip"
        download_url = "%s/%s/%s" % (self.url_repo, version, archive)
    else:
        # Newer archives use a dash after "chromedriver" and live in a
        # per-platform folder.
        archive = f"chromedriver-{self.platform_name}.zip"
        download_url = (
            "https://storage.googleapis.com/chrome-for-testing-public/%s/%s/%s"
            % (version, self.platform_name, archive)
        )

    logger.debug("downloading from %s" % download_url)
    local_path, _headers = urlretrieve(download_url)
    return local_path
|
| 339 |
+
|
| 340 |
+
def unzip_package(self, fp):
    """
    Unpack the downloaded ChromeDriver archive and install its executable.

    The archive is extracted into ``self.zip_path``, the executable is moved
    to ``self.executable_path`` and made executable, then both the archive
    and the extraction directory are removed.

    :param fp: path to the downloaded zip archive (deleted afterwards)
    :return: path to unpacked executable
    """
    exe_path = self.exe_name
    if not self.is_old_chromedriver:
        # The new chromedriver unzips into its own folder
        zip_name = f"chromedriver-{self.platform_name}"
        exe_path = os.path.join(zip_name, self.exe_name)

    logger.debug("unzipping %s" % fp)
    try:
        # Clears a stale *file* at zip_path; fails harmlessly for a directory.
        os.unlink(self.zip_path)
    except OSError:
        pass

    os.makedirs(self.zip_path, mode=0o755, exist_ok=True)
    with zipfile.ZipFile(fp, mode="r") as zf:
        zf.extractall(self.zip_path)
    os.rename(os.path.join(self.zip_path, exe_path), self.executable_path)
    os.remove(fp)
    # The original line was the bare expression ``shutil.rmtree`` (never
    # called), so the extraction directory was left behind on every run.
    # Cleanup stays best-effort: the executable is already in place.
    shutil.rmtree(self.zip_path, ignore_errors=True)
    os.chmod(self.executable_path, 0o755)
    return self.executable_path
|
| 366 |
+
|
| 367 |
+
@staticmethod
def force_kill_instances(exe_name):
    """
    kills running instances.

    :param exe_name: executable name to kill, may be a path as well
                     (only the basename is used)

    :return: True on success else False
    """
    exe_name = os.path.basename(exe_name)
    if IS_POSIX:
        # Run pidof with an argument list instead of a shell string, so a
        # name containing spaces or shell metacharacters is passed verbatim
        # and cannot be interpreted by the shell.
        try:
            result = subprocess.run(
                ["pidof", exe_name], capture_output=True, text=True, check=True
            )
            pids = result.stdout.strip().split()
            if pids:
                # -f is not a standard flag for kill, hence plain -9
                subprocess.run(["kill", "-9"] + pids, check=False)
                return True
            return False  # No PIDs found
        except subprocess.CalledProcessError:  # pidof returns 1 if no process found
            return False  # No process found
        except Exception as e:
            # Includes FileNotFoundError when pidof itself is not installed.
            logger.debug(f"Error killing process on POSIX: {e}")
            return False
    else:
        try:
            # TASKKILL /F /IM chromedriver.exe
            result = subprocess.run(
                ["taskkill", "/f", "/im", exe_name], check=False, capture_output=True
            )
            # taskkill returns 0 if process was killed, 128 if not found.
            return result.returncode == 0
        except Exception as e:
            logger.debug(f"Error killing process on Windows: {e}")
            return False
|
| 402 |
+
|
| 403 |
+
@staticmethod
|
| 404 |
+
def gen_random_cdc():
|
| 405 |
+
cdc = random.choices(string.ascii_letters, k=27)
|
| 406 |
+
return "".join(cdc).encode()
|
| 407 |
+
|
| 408 |
+
def is_binary_patched(self, executable_path=None):
    """
    Tell whether a driver binary already carries the patch marker.

    :param executable_path: binary to inspect; defaults to ``self.executable_path``
    :return: True when the marker bytes are present, False when they are
             absent or the file does not exist
    """
    target = executable_path or self.executable_path
    try:
        with io.open(target, "rb") as binary:
            payload = binary.read()
    except FileNotFoundError:
        return False
    return b"undetected chromedriver" in payload
|
| 415 |
+
|
| 416 |
+
def patch_exe(self):
    """
    Overwrite the injected ``{window.cdc...;}`` code block in the driver binary.

    The block is replaced by a console.log statement padded with spaces to
    the same length, so the file size does not change. When no such block is
    found the file is left untouched.
    """
    started = time.perf_counter()
    logger.info("patching driver executable %s" % self.executable_path)
    with io.open(self.executable_path, "r+b") as fh:
        content = fh.read()
        found = re.search(rb"\{window\.cdc.*?;\}", content)
        if found:
            target_bytes = found.group(0)
            # Pad to the original length to keep every other offset intact.
            replacement = b'{console.log("undetected chromedriver 1337!")}'
            new_target_bytes = replacement.ljust(len(target_bytes), b" ")
            new_content = content.replace(target_bytes, new_target_bytes)
            if new_content != content:
                logger.debug(
                    "found block:\n%s\nreplacing with:\n%s"
                    % (target_bytes, new_target_bytes)
                )
            else:
                logger.warning(
                    "something went wrong patching the driver binary. could not find injection code block"
                )
            fh.seek(0)
            fh.write(new_content)
    elapsed = time.perf_counter() - started
    logger.debug("patching took us {:.2f} seconds".format(elapsed))
|
| 445 |
+
|
| 446 |
+
def __repr__(self):
|
| 447 |
+
return "{0:s}({1:s})".format(
|
| 448 |
+
self.__class__.__name__,
|
| 449 |
+
self.executable_path,
|
| 450 |
+
)
|
| 451 |
+
|
| 452 |
+
def __del__(self):
|
| 453 |
+
if self._custom_exe_path:
|
| 454 |
+
# if the driver binary is specified by user
|
| 455 |
+
# we assume it is important enough to not delete it
|
| 456 |
+
return
|
| 457 |
+
else:
|
| 458 |
+
timeout = 3 # stop trying after this many seconds
|
| 459 |
+
t = time.monotonic()
|
| 460 |
+
now = lambda: time.monotonic()
|
| 461 |
+
while now() - t > timeout:
|
| 462 |
+
# we don't want to wait until the end of time
|
| 463 |
+
try:
|
| 464 |
+
if self.user_multi_procs:
|
| 465 |
+
break
|
| 466 |
+
os.unlink(self.executable_path)
|
| 467 |
+
logger.debug("successfully unlinked %s" % self.executable_path)
|
| 468 |
+
break
|
| 469 |
+
except (OSError, RuntimeError, PermissionError):
|
| 470 |
+
time.sleep(0.01)
|
| 471 |
+
continue
|
| 472 |
+
except FileNotFoundError:
|
| 473 |
+
break
|
flaresolverr/undetected_chromedriver/reactor.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# this module is part of undetected_chromedriver
|
| 3 |
+
|
| 4 |
+
import asyncio
|
| 5 |
+
import json
|
| 6 |
+
import logging
|
| 7 |
+
import threading
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class Reactor(threading.Thread):
    """
    Daemon thread that polls the driver's "performance" log and forwards
    each logged message to the registered handlers.
    """

    def __init__(self, driver: "Chrome"):
        super().__init__()

        self.driver = driver
        self.loop = asyncio.new_event_loop()

        self.lock = threading.Lock()
        # Setting this event makes ``running`` false and ends ``listen()``.
        self.event = threading.Event()
        self.daemon = True
        self.handlers = {}

    def add_event_handler(self, method_name, callback: callable):
        """
        Register ``callback`` for a logged method name.

        Parameters
        ----------
        method_name: str
            example "Network.responseReceived"; stored lower-cased.
            Registering "*" makes that handler receive every message.

        callback: callable
            callable which accepts 1 parameter: the message object dictionary

        Returns
        -------
        None
        """
        with self.lock:
            self.handlers[method_name.lower()] = callback

    @property
    def running(self):
        """True until ``self.event`` has been set."""
        return not self.event.is_set()

    def run(self):
        """Thread entry point: run ``listen()`` on this thread's own loop."""
        try:
            asyncio.set_event_loop(self.loop)
            self.loop.run_until_complete(self.listen())
        except Exception as e:
            logger.warning("Reactor.run() => %s", e)

    async def _wait_service_started(self):
        """Sleep while the driver service's ``process.poll()`` is truthy."""
        while True:
            with self.lock:
                if (
                    getattr(self.driver, "service", None)
                    and getattr(self.driver.service, "process", None)
                    and self.driver.service.process.poll()
                ):
                    await asyncio.sleep(self.driver._delay or 0.25)
                else:
                    break

    async def listen(self):
        """Poll the performance log once per second and dispatch entries."""
        while self.running:
            await self._wait_service_started()
            await asyncio.sleep(1)

            try:
                with self.lock:
                    log_entries = self.driver.get_log("performance")

                for entry in log_entries:
                    obj_serialized: str = entry.get("message")
                    obj = json.loads(obj_serialized)
                    message = obj.get("message")
                    method = message.get("method")

                    # A wildcard handler takes precedence over a specific one.
                    if "*" in self.handlers:
                        await self.loop.run_in_executor(
                            None, self.handlers["*"], message
                        )
                    elif method.lower() in self.handlers:
                        await self.loop.run_in_executor(
                            None, self.handlers[method.lower()], message
                        )

            except Exception as e:
                if "invalid session id" in str(e):
                    pass
                else:
                    # The original passed ``e`` without a placeholder in the
                    # message, which makes logging report a formatting error
                    # instead of the exception.
                    logger.debug("exception ignored : %s", e)
|
flaresolverr/undetected_chromedriver/webelement.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
|
| 3 |
+
from selenium.webdriver.common.by import By
|
| 4 |
+
import selenium.webdriver.remote.webelement
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class WebElement(selenium.webdriver.remote.webelement.WebElement):
    """Selenium WebElement extended with a reconnecting click and child lookup."""

    def click_safe(self):
        """Click the element, then ask the parent driver to reconnect."""
        super().click()
        self._parent.reconnect(0.1)

    def children(
        self, tag=None, recursive=False
    ) -> List[selenium.webdriver.remote.webelement.WebElement]:
        """
        returns child elements of current element

        :param tag: str, if supplied, returns <tag> nodes only
        :param recursive: when true, collect descendants at every depth
        """
        tag_filter = ""
        if tag:
            tag_filter = ".filter( node => node.tagName === '%s')" % tag.upper()
        if recursive:
            descendants = _recursive_children(self, tag)
            return list(descendants)
        script = "return [... arguments[0].children]" + tag_filter
        direct = self._parent.execute_script(script, self)
        return list(direct)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class UCWebElement(WebElement):
    """
    Custom WebElement class which makes it easier to view elements when
    working in an interactive environment.

    standard webelement repr:
    <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")>

    using this WebElement class:
    UCWebElement <a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">

    """

    def __init__(self, parent, id_):
        super().__init__(parent, id_)
        # None means "not fetched yet"; an empty dict is a valid cached value.
        self._attrs = None

    @property
    def attrs(self):
        """Attribute name -> value mapping, fetched from the browser once."""
        # Compare against None so an element without attributes (empty dict)
        # is not re-queried on every access.
        if self._attrs is None:
            self._attrs = self._parent.execute_script(
                """
                var items = {};
                for (index = 0; index < arguments[0].attributes.length; ++index)
                {
                 items[arguments[0].attributes[index].name] = arguments[0].attributes[index].value
                };
                return items;
                """,
                self,
            )
        return self._attrs

    def __repr__(self):
        strattrs = " ".join([f'{k}="{v}"' for k, v in self.attrs.items()])
        if strattrs:
            strattrs = " " + strattrs
        return f"{self.__class__.__name__} <{self.tag_name}{strattrs}>"
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def _recursive_children(element, tag: str = None, _results=None):
|
| 68 |
+
"""
|
| 69 |
+
returns all children of <element> recursively
|
| 70 |
+
|
| 71 |
+
:param element: `WebElement` object.
|
| 72 |
+
find children below this <element>
|
| 73 |
+
|
| 74 |
+
:param tag: str = None.
|
| 75 |
+
if provided, return only <tag> elements. example: 'a', or 'img'
|
| 76 |
+
:param _results: do not use!
|
| 77 |
+
"""
|
| 78 |
+
results = _results or set()
|
| 79 |
+
for element in element.children():
|
| 80 |
+
if tag:
|
| 81 |
+
if element.tag_name == tag:
|
| 82 |
+
results.add(element)
|
| 83 |
+
else:
|
| 84 |
+
results.add(element)
|
| 85 |
+
results |= _recursive_children(element, tag, results)
|
| 86 |
+
return results
|
flaresolverr/utils.py
ADDED
|
@@ -0,0 +1,376 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
import platform
|
| 5 |
+
import re
|
| 6 |
+
import shutil
|
| 7 |
+
import sys
|
| 8 |
+
import tempfile
|
| 9 |
+
import urllib.parse
|
| 10 |
+
|
| 11 |
+
from selenium.webdriver.chrome.webdriver import WebDriver
|
| 12 |
+
import undetected_chromedriver as uc
|
| 13 |
+
|
| 14 |
+
FLARESOLVERR_VERSION = None
|
| 15 |
+
PLATFORM_VERSION = None
|
| 16 |
+
CHROME_EXE_PATH = None
|
| 17 |
+
CHROME_MAJOR_VERSION = None
|
| 18 |
+
USER_AGENT = None
|
| 19 |
+
XVFB_DISPLAY = None
|
| 20 |
+
PATCHED_DRIVER_PATH = None
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def get_config_log_html() -> bool:
    """Return True when the LOG_HTML environment variable is "true" (any case); default off."""
    raw = os.environ.get('LOG_HTML', 'false')
    return raw.lower() == 'true'
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def get_config_headless() -> bool:
    """Return True when the HEADLESS environment variable is "true" (any case); default on."""
    raw = os.environ.get('HEADLESS', 'true')
    return raw.lower() == 'true'
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def get_config_disable_media() -> bool:
    """Return True when the DISABLE_MEDIA environment variable is "true" (any case); default off."""
    raw = os.environ.get('DISABLE_MEDIA', 'false')
    return raw.lower() == 'true'
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def get_flaresolverr_version() -> str:
    """
    Return the "version" field of package.json, cached after the first call.

    package.json is looked up in the parent of this module's directory
    first, then in the module's own directory.
    """
    global FLARESOLVERR_VERSION
    if FLARESOLVERR_VERSION is None:
        module_dir = os.path.dirname(os.path.abspath(__file__))
        candidate = os.path.join(module_dir, os.pardir, 'package.json')
        if not os.path.isfile(candidate):
            candidate = os.path.join(module_dir, 'package.json')
        with open(candidate) as f:
            FLARESOLVERR_VERSION = json.load(f)['version']
    return FLARESOLVERR_VERSION
|
| 46 |
+
|
| 47 |
+
def get_current_platform() -> str:
    """Return ``os.name``, cached in PLATFORM_VERSION after the first call."""
    global PLATFORM_VERSION
    if PLATFORM_VERSION is None:
        PLATFORM_VERSION = os.name
    return PLATFORM_VERSION
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def create_proxy_extension(proxy: dict) -> str:
    """
    Write a temporary Chrome extension that routes traffic through an
    authenticated proxy.

    :param proxy: dict with the keys 'url', 'username' and 'password'
    :return: path of a freshly created temporary directory holding
             manifest.json and background.js; the caller must remove it
    """
    parsed_url = urllib.parse.urlparse(proxy['url'])

    # Values are embedded via json.dumps (a JSON object is a valid JavaScript
    # literal) so quotes or backslashes in the credentials cannot break or
    # alter the generated script.
    single_proxy = {"scheme": parsed_url.scheme, "host": parsed_url.hostname}
    if parsed_url.port is not None:
        # Only emit the port when the URL has one; formatting a missing port
        # with %d used to raise TypeError.
        single_proxy["port"] = parsed_url.port
    credentials = {"username": proxy['username'], "password": proxy['password']}

    manifest_json = """
    {
        "version": "1.0.0",
        "manifest_version": 3,
        "name": "Chrome Proxy",
        "permissions": [
            "proxy",
            "tabs",
            "storage",
            "webRequest",
            "webRequestAuthProvider"
        ],
        "host_permissions": [
            "<all_urls>"
        ],
        "background": {
            "service_worker": "background.js"
        },
        "minimum_chrome_version": "76.0.0"
    }
    """

    background_js = """
    var config = {
        mode: "fixed_servers",
        rules: {
            singleProxy: %s,
            bypassList: ["localhost"]
        }
    };

    chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});

    function callbackFn(details) {
        return {
            authCredentials: %s
        };
    }

    chrome.webRequest.onAuthRequired.addListener(
        callbackFn,
        { urls: ["<all_urls>"] },
        ['blocking']
    );
    """ % (
        json.dumps(single_proxy),
        json.dumps(credentials)
    )

    proxy_extension_dir = tempfile.mkdtemp()

    with open(os.path.join(proxy_extension_dir, "manifest.json"), "w") as f:
        f.write(manifest_json)

    with open(os.path.join(proxy_extension_dir, "background.js"), "w") as f:
        f.write(background_js)

    return proxy_extension_dir
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def get_webdriver(proxy: dict = None) -> WebDriver:
    """
    Launch an undetected_chromedriver Chrome instance.

    :param proxy: optional dict. With 'url', 'username' and 'password' a
                  temporary proxy extension is loaded; with only 'url' the
                  --proxy-server switch is used.
    :return: the started driver
    :raises Exception: whatever ``uc.Chrome`` raises when Chrome cannot start
    """
    global PATCHED_DRIVER_PATH, USER_AGENT
    logging.debug('Launching web browser...')

    # undetected_chromedriver
    options = uc.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--window-size=1280,1024')  # Smaller window for less overhead
    options.add_argument('--disable-search-engine-choice-screen')
    options.add_argument('--disable-setuid-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--no-zygote')
    options.add_argument('--disable-gpu')  # Disable GPU for faster headless boot
    options.add_argument('--mute-audio')
    options.add_argument('--disable-notifications')
    options.add_argument('--disable-popup-blocking')
    # NOTE(review): this switch is combined with --load-extension below when
    # an authenticated proxy is used; confirm the proxy extension still loads.
    options.add_argument('--disable-extensions')
    options.add_argument('--disable-blink-features=AutomationControlled')

    # Force headless and invisibility
    options.add_argument('--headless=new')

    IS_ARMARCH = platform.machine().startswith(('arm', 'aarch'))
    if IS_ARMARCH:
        options.add_argument('--disable-gpu-sandbox')
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--ignore-ssl-errors')

    language = os.environ.get('LANG', None)
    if language is not None:
        options.add_argument('--accept-lang=%s' % language)

    # Fix for Chrome 117 | https://github.com/FlareSolverr/FlareSolverr/issues/910
    if USER_AGENT is not None:
        options.add_argument('--user-agent=%s' % USER_AGENT)

    proxy_extension_dir = None
    try:
        if proxy and all(key in proxy for key in ['url', 'username', 'password']):
            proxy_extension_dir = create_proxy_extension(proxy)
            options.add_argument("--disable-features=DisableLoadExtensionCommandLineSwitch")
            options.add_argument("--load-extension=%s" % os.path.abspath(proxy_extension_dir))
        elif proxy and 'url' in proxy:
            proxy_url = proxy['url']
            logging.debug("Using webdriver proxy: %s", proxy_url)
            options.add_argument('--proxy-server=%s' % proxy_url)

        # note: headless mode is detected (headless = True)
        # we launch the browser in head-full mode with the window hidden
        windows_headless = os.name == 'nt'
        if get_config_headless():
            if os.name != 'nt':
                start_xvfb_display()

        # Override for absolute invisibility on Windows
        if os.name == 'nt':
            options.add_argument('--hide-scrollbars')
            options.add_argument('--disable-logging')
            options.add_argument('--log-level=3')

        # if we are inside the Docker container, we avoid downloading the driver
        driver_exe_path = None
        version_main = None
        if os.path.exists("/app/chromedriver"):
            # running inside Docker
            driver_exe_path = "/app/chromedriver"
        else:
            version_main = get_chrome_major_version()
            if PATCHED_DRIVER_PATH is not None:
                driver_exe_path = PATCHED_DRIVER_PATH

        # detect chrome path
        browser_executable_path = get_chrome_exe_path()

        # CRITICAL: Clean up undetected_chromedriver cache on Windows to avoid WinError 183
        if os.name == 'nt':
            uc_path = os.path.join(os.environ.get('APPDATA', ''), 'undetected_chromedriver')
            # Try to remove the file that usually causes WinError 183
            target_exe = os.path.join(uc_path, 'undetected_chromedriver.exe')
            try:
                if os.path.exists(target_exe):
                    os.remove(target_exe)
            except OSError:
                # Best-effort: a locked or vanished file is not fatal here.
                pass

        # downloads and patches the chromedriver
        # if we don't set driver_executable_path it downloads, patches, and deletes the driver each time
        try:
            driver = uc.Chrome(options=options, browser_executable_path=browser_executable_path,
                               driver_executable_path=driver_exe_path, version_main=version_main,
                               windows_headless=windows_headless, headless=get_config_headless())
        except Exception as e:
            logging.error("Error starting Chrome: %s" % e)
            # No point in continuing if we cannot retrieve the driver
            raise
    finally:
        # clean up proxy extension directory, also when startup failed:
        # it contains the proxy credentials in plain text.
        if proxy_extension_dir is not None:
            shutil.rmtree(proxy_extension_dir, ignore_errors=True)

    # save the patched driver to avoid re-downloads
    if driver_exe_path is None:
        try:
            target_path = os.path.join(driver.patcher.data_path, driver.patcher.exe_name)
            if target_path != driver.patcher.executable_path:
                # On Windows, we might get WinError 183 if the file is locked or exists
                if os.path.exists(target_path):
                    try:
                        os.remove(target_path)
                    except OSError:
                        pass
                shutil.copy(driver.patcher.executable_path, target_path)
            PATCHED_DRIVER_PATH = target_path
        except Exception as e:
            logging.warning(f"Failed to save patched driver: {e}")

    return driver
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
def get_chrome_exe_path() -> str:
    """
    Locate the Chrome executable, caching the answer in CHROME_EXE_PATH.

    A browser bundled in the ``chrome`` folder next to this module wins
    (Linux binary first, then Windows); otherwise the system lookup of
    undetected_chromedriver is used.

    :raises Exception: when the bundled Linux binary exists but is not executable
    """
    global CHROME_EXE_PATH
    if CHROME_EXE_PATH is not None:
        return CHROME_EXE_PATH

    bundle_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'chrome')
    linux_bundle = os.path.join(bundle_dir, "chrome")      # linux pyinstaller bundle
    windows_bundle = os.path.join(bundle_dir, "chrome.exe")  # windows pyinstaller bundle

    if os.path.exists(linux_bundle):
        if not os.access(linux_bundle, os.X_OK):
            raise Exception(f'Chrome binary "{linux_bundle}" is not executable. '
                            f'Please, extract the archive with "tar xzf <file.tar.gz>".')
        CHROME_EXE_PATH = linux_bundle
    elif os.path.exists(windows_bundle):
        CHROME_EXE_PATH = windows_bundle
    else:
        # system
        CHROME_EXE_PATH = uc.find_chrome_executable()
    return CHROME_EXE_PATH
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
def get_chrome_major_version() -> str:
    """
    Return Chrome's major version as a string, cached in CHROME_MAJOR_VERSION.

    On Windows the version is read from the executable, then the registry,
    then the install folder. Elsewhere it is parsed from ``chrome --version``.
    """
    global CHROME_MAJOR_VERSION
    if CHROME_MAJOR_VERSION is not None:
        return CHROME_MAJOR_VERSION

    if os.name != 'nt':
        chrome_path = get_chrome_exe_path()
        # Example 1: 'Chromium 104.0.5112.79 Arch Linux\n'
        # Example 2: 'Google Chrome 104.0.5112.79 Arch Linux\n'
        with os.popen(f'"{chrome_path}" --version') as process:
            complete_version = process.read()
    else:
        # Example: '104.0.5112.79'
        try:
            complete_version = extract_version_nt_executable(get_chrome_exe_path())
        except Exception:
            try:
                complete_version = extract_version_nt_registry()
            except Exception:
                # Example: '104.0.5112.79'
                complete_version = extract_version_nt_folder()

    # Text before the first dot, then its last space-separated word.
    before_first_dot = complete_version.split('.')[0]
    CHROME_MAJOR_VERSION = before_first_dot.split(' ')[-1]
    return CHROME_MAJOR_VERSION
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def extract_version_nt_executable(exe_path: str) -> str:
    """Return the FileVersion string stored in a Windows executable's resources."""
    import pefile

    pe = pefile.PE(exe_path, fast_load=True)
    # fast_load skips the data directories; parse only the resource one.
    resource_dir = pefile.DIRECTORY_ENTRY["IMAGE_DIRECTORY_ENTRY_RESOURCE"]
    pe.parse_data_directories(directories=[resource_dir])
    string_table = pe.FileInfo[0][0].StringTable[0]
    raw_version = string_table.entries[b"FileVersion"]
    return raw_version.decode('utf-8')
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
def extract_version_nt_registry() -> str:
    """
    Read Chrome's DisplayVersion from the Windows uninstall registry key.

    :raises ValueError: when the ``reg query`` output has no DisplayVersion entry
    """
    stream = os.popen(
        'reg query "HKLM\\SOFTWARE\\Wow6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\Google Chrome"')
    output = stream.read()
    # NOTE(review): the offset (24) is larger than the marker below (21
    # characters), so it presumably expects wider column padding in the real
    # `reg query` output - confirm against actual output.
    value_start = output.rindex('DisplayVersion REG_SZ') + 24
    # Keep everything up to the end of that line.
    google_version = output[value_start:].split('\n', 1)[0]
    return google_version.strip()
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
def extract_version_nt_folder() -> str:
    """
    Derive the Chrome version from its Windows install folder.

    :return: the first sub-folder name that looks like ``a.b.c.d`` under the
             64-bit, then 32-bit, Program Files location; '' when none exists
    """
    version_pattern = r'\d+\.\d+\.\d+\.\d+'
    # Check if the Chrome folder exists in the x32 or x64 Program Files folders.
    for suffix in ('', ' (x86)'):
        app_dir = 'C:\\Program Files' + suffix + '\\Google\\Chrome\\Application'
        if not os.path.isdir(app_dir):
            continue
        sub_dirs = [entry.path for entry in os.scandir(app_dir) if entry.is_dir()]
        for sub_dir in sub_dirs:
            found = re.search(version_pattern, os.path.basename(sub_dir))
            if found and found.group():
                # Found a Chrome version.
                return found.group(0)
    return ''
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
def get_user_agent(driver=None) -> str:
    """
    Return the browser's User-Agent with any "headless" marker removed.

    The value is cached in USER_AGENT after the first successful call.

    :param driver: driver to query; when None a new one is started.
                   Unless the value was already cached, the driver used is
                   quit before returning, even when the caller supplied it.
    :raises Exception: when the User-Agent cannot be obtained
    """
    global USER_AGENT
    if USER_AGENT is not None:
        return USER_AGENT

    try:
        if driver is None:
            driver = get_webdriver()
        user_agent = driver.execute_script("return navigator.userAgent")
        # Fix for Chrome 117 | https://github.com/FlareSolverr/FlareSolverr/issues/910
        # Publish to the global only after sanitising succeeded.
        USER_AGENT = re.sub('HEADLESS', '', user_agent, flags=re.IGNORECASE)
        return USER_AGENT
    except Exception as e:
        # Chain the cause so the original traceback is kept.
        raise Exception("Error getting browser User-Agent. " + str(e)) from e
    finally:
        if driver is not None:
            # Ask get_current_platform() instead of reading PLATFORM_VERSION:
            # the cache is None until that function has been called once.
            if get_current_platform() == "nt":
                driver.close()
            driver.quit()
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
def start_xvfb_display():
    """Start an Xvfb display once and keep it in XVFB_DISPLAY."""
    global XVFB_DISPLAY
    if XVFB_DISPLAY is not None:
        return
    # Imported lazily: only needed when a display actually has to be started.
    from xvfbwrapper import Xvfb
    XVFB_DISPLAY = Xvfb()
    XVFB_DISPLAY.start()
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
def object_to_dict(_object):
    """
    Convert an object to a plain dict via a JSON round trip.

    Values that json cannot serialise are replaced by their ``__dict__``.
    Top-level keys starting with a double underscore are dropped.
    """
    serialized = json.dumps(_object, default=lambda o: o.__dict__)
    plain = json.loads(serialized)
    visible = {}
    for key, value in plain.items():
        # remove hidden fields
        if key.startswith('__'):
            continue
        visible[key] = value
    return visible
|
keep_alive.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Keep-Alive Service to prevent Render.com from sleeping
|
| 3 |
+
Pings the server every 10 minutes to maintain activity
|
| 4 |
+
"""
|
| 5 |
+
import asyncio
|
| 6 |
+
import httpx
|
| 7 |
+
import logging
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger("keep_alive")
|
| 11 |
+
|
| 12 |
+
class KeepAliveService:
    """Background loop that periodically requests ``{base_url}/health``."""

    def __init__(self, base_url: str = "http://localhost:7860"):
        """Store the target base URL; the loop is not started here."""
        self.ping_interval = 600  # 10 minutes
        self.running = False
        self.base_url = base_url

    async def start(self):
        """Start the keep-alive service.

        Loops while ``self.running`` is true: sleeps ``ping_interval`` seconds,
        then pings. Any ``Exception`` raised inside an iteration is logged and
        the loop continues.
        """
        self.running = True
        logger.info("🔄 Keep-Alive service started (pinging every 10 minutes)")

        while self.running:
            try:
                await asyncio.sleep(self.ping_interval)
                await self._ping()
            except Exception as err:
                logger.error(f"Keep-Alive error: {err}")

    async def _ping(self):
        """Send a ping to keep the service alive.

        A 200 response is logged at info level, any other status at warning
        level; request failures are logged as warnings and never propagate.
        """
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.get(f"{self.base_url}/health")
                if response.status_code != 200:
                    logger.warning(f"⚠️ Keep-Alive ping returned {response.status_code}")
                else:
                    logger.info(f"✅ Keep-Alive ping successful at {datetime.now().strftime('%H:%M:%S')}")
        except Exception as err:
            logger.warning(f"Keep-Alive ping failed: {err}")

    def stop(self):
        """Stop the keep-alive service by clearing the ``running`` flag."""
        self.running = False
        logger.info("Keep-Alive service stopped")
|
| 46 |
+
|
| 47 |
+
keep_alive = KeepAliveService()
|
main.py
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import time
|
| 3 |
+
from typing import List, Optional
|
| 4 |
+
from fastapi import FastAPI, Request, HTTPException, Query
|
| 5 |
+
from fastapi.responses import JSONResponse, FileResponse, StreamingResponse, RedirectResponse
|
| 6 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 7 |
+
from fastapi.middleware.gzip import GZipMiddleware
|
| 8 |
+
import httpx
|
| 9 |
+
from scraper.engine import scraper
|
| 10 |
+
from downloader import downloader
|
| 11 |
+
import os
|
| 12 |
+
import re
|
| 13 |
+
from urllib.parse import unquote, quote
|
| 14 |
+
from fastapi.staticfiles import StaticFiles
|
| 15 |
+
from database import init_db
|
| 16 |
+
from keep_alive import keep_alive
|
| 17 |
+
import asyncio
|
| 18 |
+
import io
|
| 19 |
+
|
| 20 |
+
# Configure logging
|
| 21 |
+
logging.basicConfig(
|
| 22 |
+
level=logging.INFO,
|
| 23 |
+
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
| 24 |
+
datefmt="%Y-%m-%d %H:%M:%S",
|
| 25 |
+
)
|
| 26 |
+
logger = logging.getLogger("backend")
|
| 27 |
+
|
| 28 |
+
app = FastAPI(title="MEIH Movies API", version="2.0.0")
|
| 29 |
+
|
| 30 |
+
# --- Simple Caching Layer ---
|
| 31 |
+
class MemoryCache:
    """In-process key/value store where each entry carries its own expiry time."""

    def __init__(self):
        # key -> (absolute expiry timestamp, cached data)
        self._cache = {}

    def get(self, key: str):
        """Return the data stored under ``key``, or ``None`` if absent or expired.

        An expired entry is deleted when it is looked up.
        """
        entry = self._cache.get(key)
        if not entry:
            return None

        expires_at, payload = entry
        if time.time() >= expires_at:
            del self._cache[key]
            return None
        return payload

    def set(self, key: str, data, ttl_seconds: int = 600):  # Default 10 mins
        """Store ``data`` under ``key`` for ``ttl_seconds`` seconds from now."""
        expires_at = time.time() + ttl_seconds
        self._cache[key] = (expires_at, data)
|
| 47 |
+
|
| 48 |
+
cache = MemoryCache()
|
| 49 |
+
|
| 50 |
+
async def warm_scraper():
    """Warm up the scraper with one ``fetch_home(page=1)`` call after a 5 s delay.

    Failures are logged as warnings and never propagate.
    """
    logger.info("🔥 Warming up scraper in background...")
    try:
        # Give services a few more seconds to be truly ready
        await asyncio.sleep(5)
        await scraper.fetch_home(page=1)
    except Exception as err:
        logger.warning(f"⚠️ Scraper warmup failed (will retry on first request): {err}")
    else:
        logger.info("✅ Scraper warmed up and cookies synced")
|
| 60 |
+
|
| 61 |
+
# Strong references to fire-and-forget tasks. The event loop only keeps weak
# references to tasks, so a task nobody else references can be
# garbage-collected before it finishes.
_background_tasks = set()


def _spawn_background(coro):
    """Schedule ``coro`` as a task and keep it referenced until it completes."""
    task = asyncio.create_task(coro)
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)
    return task


@app.on_event("startup")
async def startup_event():
    """Initialise the database and launch the background services.

    Outside Hugging Face (neither ``SPACE_ID`` nor ``HF_SPACE`` is set) the
    keep-alive loop, the scraper warm-up and, when the scraper provides it,
    ``_turbo_prefetch`` are started. On Hugging Face only the warm-up runs.
    """
    await init_db()
    logger.info("🚀 Database initialized and ready")

    # Detect if running on Hugging Face
    is_hf = os.environ.get("SPACE_ID") is not None or os.environ.get("HF_SPACE") is not None

    if not is_hf:
        # Start Keep-Alive service (only for non-HF environments)
        _spawn_background(keep_alive.start())
        # Start Warm-up service
        _spawn_background(warm_scraper())
        # Start Nitro Pre-fetch (Populates cache in background)
        if hasattr(scraper, '_turbo_prefetch'):
            _spawn_background(scraper._turbo_prefetch())
        logger.info("🔄 Background services activated")
    else:
        logger.info("🤗 Running on Hugging Face - Lightweight mode enabled")
        # Just warm up the scraper without heavy pre-fetching
        _spawn_background(warm_scraper())
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# Enable CORS for frontend
|
| 85 |
+
app.add_middleware(
|
| 86 |
+
CORSMiddleware,
|
| 87 |
+
allow_origins=["*"],
|
| 88 |
+
allow_credentials=True,
|
| 89 |
+
allow_methods=["*"],
|
| 90 |
+
allow_headers=["*"],
|
| 91 |
+
)
|
| 92 |
+
app.add_middleware(GZipMiddleware, minimum_size=1000)
|
| 93 |
+
|
| 94 |
+
@app.get("/")
async def root():
    """Report service status, scraper warm/cold state and the current cache keys."""
    engine_status = "COLD"
    if scraper._cookies_synced:
        engine_status = "WARM"

    return {
        "status": "online",
        "engine": "Nitro-Power Larooza Engine",
        "engine_status": engine_status,
        "cached_keys": list(cache._cache),
    }
|
| 102 |
+
|
| 103 |
+
@app.get("/latest")
async def get_latest(page: int = 1):
    """Return the home listing for ``page``, served from cache when available.

    Non-empty results are cached with the cache's default TTL. Any scraper
    error is logged and reported as HTTP 500.
    """
    key = f"latest_{page}"
    hit = cache.get(key)
    if hit:
        return hit

    try:
        items = await scraper.fetch_home(page=page)
    except Exception as e:
        logger.error(f"Error fetching latest: {e}")
        raise HTTPException(status_code=500, detail=str(e))

    # Empty results are returned but not cached.
    if items:
        cache.set(key, items)
    return items
|
| 118 |
+
|
| 119 |
+
@app.get("/category/{cat_id}")
async def get_category(cat_id: str, page: int = 1):
    """Return one page of a category listing, served from cache when available.

    Non-empty results are cached with the cache's default TTL. Any scraper
    error is logged and reported as HTTP 500.
    """
    key = f"cat_{cat_id}_{page}"
    hit = cache.get(key)
    if hit:
        return hit

    try:
        items = await scraper.fetch_category(cat_id, page=page)
    except Exception as e:
        logger.error(f"Error fetching category {cat_id}: {e}")
        raise HTTPException(status_code=500, detail=str(e))

    # Empty results are returned but not cached.
    if items:
        cache.set(key, items)
    return items
|
| 134 |
+
|
| 135 |
+
@app.get("/search")
async def search(q: str):
    """Return search results for ``q``, served from cache when available.

    Non-empty results are cached for one hour. Any scraper error is logged
    and reported as HTTP 500.
    """
    key = f"search_{q}"
    hit = cache.get(key)
    if hit:
        return hit

    try:
        items = await scraper.search(q)
    except Exception as e:
        logger.error(f"Error searching for {q}: {e}")
        raise HTTPException(status_code=500, detail=str(e))

    if items:
        cache.set(key, items, ttl_seconds=3600)  # Search results cache longer
    return items
|
| 150 |
+
|
| 151 |
+
@app.get("/details/{safe_id}")
async def get_details(safe_id: str):
    """Return the details for ``safe_id``, served from cache when available.

    Responds with a 404 JSON body when the scraper returns nothing; otherwise
    the details are cached for 24 hours. Any scraper error is logged and
    reported as HTTP 500.
    """
    key = f"details_{safe_id}"
    hit = cache.get(key)
    if hit:
        return hit

    try:
        details = await scraper.fetch_details(safe_id)
    except Exception as e:
        logger.error(f"Error fetching details for {safe_id}: {e}")
        raise HTTPException(status_code=500, detail=str(e))

    if not details:
        return JSONResponse(status_code=404, content={"error": "Content not found"})

    cache.set(key, details, ttl_seconds=86400)  # Details cache for 24h
    return details
|
| 168 |
+
|
| 169 |
+
@app.get("/proxy/image")
async def proxy_image(url: str):
    """Fetch a remote image and relay it, keeping a copy on disk for one week.

    The cache file lives under ``<base_dir>/cache/images`` and is named after
    the MD5 of the unquoted URL. Caching is best effort: if the directory
    cannot be created or the file cannot be written, the image is still
    fetched and returned.

    Responses:
        200 with the image bytes (from cache or upstream),
        the upstream status code with a JSON error when upstream is not 200,
        504 on an httpx timeout, 500 on any other error.

    Raises:
        HTTPException: 400 when ``url`` is empty.
    """
    if not url:
        raise HTTPException(status_code=400, detail="URL is required")

    url = unquote(url)

    # Generate simple hash for filename
    import hashlib
    url_hash = hashlib.md5(url.encode()).hexdigest()

    # --- Image Disk Cache ---
    cache_dir = os.path.join(base_dir, "cache", "images")
    cache_path = os.path.join(cache_dir, f"{url_hash}.img")
    try:
        os.makedirs(cache_dir, exist_ok=True)
    except OSError as e:
        # An unwritable location must not take the whole endpoint down.
        logger.warning(f"Image cache directory unavailable ({cache_dir}): {e}")
        cache_path = None

    # 1. Check if cached (getmtime doubles as the existence check, so a file
    #    removed between two separate calls cannot raise here)
    if cache_path is not None:
        try:
            is_fresh = time.time() - os.path.getmtime(cache_path) < 604800  # 1 week
        except OSError:
            is_fresh = False
        if is_fresh:
            return FileResponse(
                cache_path,
                media_type="image/jpeg",  # Approximate, browser will handle
                headers={"Cache-Control": "public, max-age=31536000"}
            )

    try:
        # Using follow_redirects and a longer timeout for images
        async with httpx.AsyncClient(timeout=20.0, follow_redirects=True) as client:
            resp = await client.get(url, headers={"User-Agent": scraper.headers["User-Agent"]})
            if resp.status_code == 200:
                content = resp.content

                # Save to cache: write to a temp file and rename it into place so
                # another request never reads a half-written file.
                if cache_path is not None:
                    tmp_path = f"{cache_path}.{os.getpid()}.tmp"
                    try:
                        with open(tmp_path, "wb") as f:
                            f.write(content)
                        os.replace(tmp_path, cache_path)
                    except OSError as e:
                        logger.warning(f"Could not cache image {url}: {e}")
                        try:
                            os.remove(tmp_path)
                        except OSError:
                            pass  # nothing was left behind

                # Return the image stream directly
                return StreamingResponse(
                    io.BytesIO(content),
                    media_type=resp.headers.get("Content-Type", "image/jpeg"),
                    headers={"Cache-Control": "public, max-age=31536000"}
                )
            else:
                logger.warning(f"Failed to proxy image {url} (Status: {resp.status_code})")
                return JSONResponse(status_code=resp.status_code, content={"error": f"Failed (Status {resp.status_code})"})
    except httpx.TimeoutException:
        logger.warning(f"Timeout proxying image: {url}")
        return JSONResponse(status_code=504, content={"error": "Image timeout"})
    except Exception as e:
        logger.error(f"Proxy image error for {url}: {type(e).__name__} - {str(e)}")
        return JSONResponse(status_code=500, content={"error": str(e)})
|
| 220 |
+
|
| 221 |
+
@app.get("/download/info")
async def get_download_info(url: str):
    """Return whatever ``downloader.get_info(url)`` yields.

    Any error is logged and reported as HTTP 500.
    """
    try:
        return await downloader.get_info(url)
    except Exception as err:
        logger.error(f"Download info error for {url}: {err}")
        raise HTTPException(status_code=500, detail=str(err))
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
@app.get("/download/file")
async def download_file(url: str, filename: str = "video.mp4"):
    """Handles file downloads, proxying if necessary to bypass IP blocks or hotlink protection.

    URLs whose *hostname* is one of ``proxy_domains`` (or a subdomain of one)
    are streamed through this server as an attachment named ``filename``.
    Every other URL is answered with a redirect to ``url``.

    Raises:
        HTTPException: 400 when ``url`` is empty.
    """
    if not url:
        raise HTTPException(status_code=400, detail="URL is required")

    url = unquote(url)

    # Domains that REQUIRE proxying (IP-bound or strict hotlink protection)
    proxy_domains = [
        "googlevideo.com",
        "manifest.googlevideo.com",
        "larozavideo.net",
        "larooza.site",
        "larooza.mom",
        "laroza-tv.net",
        "youtube.com",
        "youtu.be"
    ]

    # Decide on the parsed hostname, not on a substring of the whole URL:
    # otherwise "https://any-host/?x=youtube.com" would make this server fetch
    # and relay content from an arbitrary host.
    from urllib.parse import urlparse
    try:
        host = (urlparse(url).hostname or "").lower()
    except ValueError:
        host = ""
    should_proxy = any(host == domain or host.endswith("." + domain) for domain in proxy_domains)

    if should_proxy:
        # Control characters (CR/LF in particular) must never reach a header value.
        safe_name = re.sub(r'[\x00-\x1f\x7f]', '', filename) or "video.mp4"
        logger.info(f"🛡️ Proxying download: {safe_name[:50]}...")

        # Clean filename for the ASCII part of Content-Disposition
        # Remove non-ASCII characters for the fallback filename, and the two
        # characters that would terminate or escape the quoted string.
        ascii_filename = re.sub(r'[^\x00-\x7F]+', '_', safe_name)
        ascii_filename = ascii_filename.replace('\\', '_').replace('"', '_')
        encoded_filename = quote(safe_name, safe='')

        ua_headers = {"User-Agent": scraper.headers["User-Agent"]}

        async def stream_generator():
            """Yield the upstream body in 1 MiB chunks; yield nothing on a non-200 reply."""
            async with httpx.AsyncClient(timeout=None, follow_redirects=True) as client:
                try:
                    async with client.stream("GET", url, headers=ua_headers) as resp:
                        if resp.status_code != 200:
                            logger.error(f"Proxy source returned {resp.status_code}")
                            return

                        # We can't easily set Content-Length here because StreamingResponse
                        # starts before we have all chunks, but we can set it in the outer response
                        async for chunk in resp.aiter_bytes(chunk_size=1024*1024):
                            yield chunk
                except Exception as e:
                    logger.error(f"Streaming error: {e}")

        # Get initial headers to find content length/type if possible
        content_length = None
        content_type = "video/mp4"
        try:
            async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
                head_resp = await client.head(url, headers=ua_headers)
            # Headers of an error reply describe the error page, not the file.
            if head_resp.status_code == 200:
                content_type = head_resp.headers.get("Content-Type", "video/mp4")
                # aiter_bytes() yields decoded bytes, so the advertised length of a
                # compressed body would not match what is actually streamed.
                if not head_resp.headers.get("Content-Encoding"):
                    content_length = head_resp.headers.get("Content-Length")
        except Exception as e:
            # Best effort only; fall back to the defaults above.
            logger.warning(f"HEAD probe failed for download: {e}")

        headers = {
            "Content-Disposition": f"attachment; filename=\"{ascii_filename}\"; filename*=UTF-8''{encoded_filename}",
            "Access-Control-Expose-Headers": "Content-Disposition"
        }
        if content_length:
            headers["Content-Length"] = content_length

        return StreamingResponse(stream_generator(), media_type=content_type, headers=headers)

    # For other sources, a simple redirect is much faster and saves server bandwidth
    return RedirectResponse(url=url)
|
| 298 |
+
|
| 299 |
+
@app.get("/health")
async def health():
    """Report backend liveness plus the reachability of the local FlareSolverr.

    FlareSolverr counts as ``ONLINE`` only when
    ``http://localhost:8191/health`` answers 200 within 5 seconds; any failure
    leaves it ``OFFLINE``.
    """
    # Check FlareSolverr
    fs_status = "OFFLINE"
    try:
        # Increase timeout as solver might be busy
        async with httpx.AsyncClient(timeout=5.0) as client:
            resp = await client.get("http://localhost:8191/health")
            if resp.status_code == 200:
                fs_status = "ONLINE"
    except Exception as e:
        # Deliberately best effort. ``Exception`` (not a bare ``except``) so that
        # task cancellation is not swallowed.
        logger.debug(f"FlareSolverr health probe failed: {e}")

    return {
        "backend": "ONLINE",
        "flaresolverr": fs_status,
        "scraper_sync": scraper._cookies_synced,
        "timestamp": time.time()
    }
|
| 318 |
+
|
| 319 |
+
# --- Frontend Mounting ---
|
| 320 |
+
# This ensures that our React app is served directly by FastAPI in production
|
| 321 |
+
# Check both relative and same-level structures for Docker/Local compatibility
|
| 322 |
+
base_dir = os.path.dirname(__file__)
frontend_path = os.path.join(base_dir, "meih-netflix-clone", "dist")

if not os.path.exists(frontend_path):
    # Try one level up (local dev structure)
    frontend_path = os.path.join(base_dir, "..", "meih-netflix-clone", "dist")

if os.path.exists(frontend_path):
    # Assets are usually in dist/assets and referenced as /assets/ in Vite
    assets_path = os.path.join(frontend_path, "assets")
    if os.path.exists(assets_path):
        app.mount("/assets", StaticFiles(directory=assets_path), name="assets")

    # Canonical dist directory; every served file must resolve inside it.
    _frontend_root = os.path.realpath(frontend_path)

    @app.get("/{full_path:path}")
    async def serve_frontend(full_path: str):
        """Serve a file from the frontend build, or ``index.html`` as fallback.

        Paths that look like backend routes get a JSON 404. A request is only
        answered with a file when the resolved path stays inside the dist
        directory; ``..`` segments, absolute paths and anything else that
        escapes it fall back to ``index.html``.
        """
        # Prevent infinite recursion for API routes if someone hits a wrong URL
        if full_path.startswith(("api/", "latest", "category/", "search", "details", "proxy", "download", "health")):
            return JSONResponse(status_code=404, content={"error": "Not Found"})

        # If the path starts with api/ or other backend routes, it should have been caught above
        # Otherwise, serve the main index.html for React Router to handle
        try:
            # lstrip: an absolute second argument would make join() discard the root.
            candidate = os.path.realpath(os.path.join(_frontend_root, full_path.lstrip("/\\")))
            inside_root = os.path.commonpath([_frontend_root, candidate]) == _frontend_root
        except ValueError:
            # e.g. embedded NUL byte, or paths on different drives
            candidate, inside_root = "", False

        if inside_root and os.path.isfile(candidate):
            return FileResponse(candidate)
        return FileResponse(os.path.join(frontend_path, "index.html"))
else:
    logger.warning(f"Frontend dist folder not found at {frontend_path}. Frontend serving disabled.")
|
| 348 |
+
|
| 349 |
+
if __name__ == "__main__":
|
| 350 |
+
import uvicorn
|
| 351 |
+
# Use port 7860 for Hugging Face Spaces compatibility
|
| 352 |
+
uvicorn.run(app, host="0.0.0.0", port=7860)
|
package.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "meih-movies-api",
|
| 3 |
+
"version": "1.0.0",
|
| 4 |
+
"description": "Nitro-powered movie scraping API",
|
| 5 |
+
"main": "main.py",
|
| 6 |
+
"scripts": {
|
| 7 |
+
"start": "bash start.sh"
|
| 8 |
+
},
|
| 9 |
+
"engines": {
|
| 10 |
+
"node": ">=18.x"
|
| 11 |
+
}
|
| 12 |
+
}
|
requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn
|
| 3 |
+
httpx[http2]
|
| 4 |
+
beautifulsoup4
|
| 5 |
+
curl-cffi
|
| 6 |
+
yt-dlp
|
| 7 |
+
pydantic
|
| 8 |
+
python-multipart
|
| 9 |
+
aiohttp
|
| 10 |
+
aiosqlite
|
| 11 |
+
certifi
|
| 12 |
+
websockets
|
| 13 |
+
packaging
|
| 14 |
+
setuptools
|
scraper/engine.py
ADDED
|
@@ -0,0 +1,996 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import httpx
|
| 3 |
+
import re
|
| 4 |
+
import logging
|
| 5 |
+
import base64
|
| 6 |
+
import random
|
| 7 |
+
import os
|
| 8 |
+
import time
|
| 9 |
+
from typing import List, Dict, Optional
|
| 10 |
+
from bs4 import BeautifulSoup
|
| 11 |
+
from curl_cffi.requests import AsyncSession
|
| 12 |
+
from urllib.parse import urljoin, quote
|
| 13 |
+
from scraper.proxy_fetcher import proxy_fetcher
|
| 14 |
+
# Clean, strictly used logger
# Set up before the optional imports below: their ImportError handler logs a
# warning, which would raise NameError if ``logger`` did not exist yet.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("scraper")

# Optional dependencies for heavy bypasses
try:
    import undetected_chromedriver as uc
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    HAS_SELENIUM = True
except ImportError:
    HAS_SELENIUM = False
    logger.warning("⚠️ Selenium/Undetected-Chromedriver not installed. Nuclear bypass will be disabled.")
|
| 28 |
+
|
| 29 |
+
class LaroozaScraper:
|
| 30 |
+
MIRRORS = ["https://q.larozavideo.net", "https://larooza.mom", "https://larooza.site", "https://m.laroza-tv.net"]
|
| 31 |
+
BASE_URL = "https://q.larozavideo.net"
|
| 32 |
+
TARGET_URL = "https://q.larozavideo.net/newvideos1.php"
|
| 33 |
+
_blacklisted_mirrors = {}
|
| 34 |
+
|
| 35 |
+
# Permanent Aliases -> Keywords search
|
| 36 |
+
CATEGORY_KEYWORDS = {
|
| 37 |
+
"arabic-movies": ["أفلام عربية", "افلام عربية", "افلام عربي", "arabic-movies33"],
|
| 38 |
+
"english-movies": ["افلام اجنبية", "أفلام أجنبية", "افلام اجنبي", "أجنبي", "all_movies_13"],
|
| 39 |
+
"indian-movies": ["افلام هندي", "أفلام هندية", "هندي", "indian-movies9"],
|
| 40 |
+
"anime-movies": ["افلام انمي", "أفلام أنمي", "انمي", "anime-movies-7"],
|
| 41 |
+
"dubbed-movies": ["افلام مدبلجة", "أفلام مدبلجة", "مدبلج", "7-aflammdblgh"],
|
| 42 |
+
"turkish-series": ["مسلسلات تركية", "تركي", "turkish-3isk-seriess47"],
|
| 43 |
+
"arabic-series": ["مسلسلات عربية", "عربي", "arabic-series46"],
|
| 44 |
+
"english-series": ["مسلسلات اجنبية", "أجنبي", "english-series10"],
|
| 45 |
+
"ramadan-2025": ["رمضان 2025", "13-ramadan-2025"],
|
| 46 |
+
"ramadan-2024": ["رمضان 2024", "28-ramadan-2024"],
|
| 47 |
+
"ramadan-2023": ["رمضان 2023", "10-ramadan-2023"],
|
| 48 |
+
"asian-movies": ["آسيوي", "اسيوي", "آسيوية", "6-asian-movies"],
|
| 49 |
+
"asian-series": ["مسلسلات اسياوية", "اسياوية", "6-asya"],
|
| 50 |
+
"turkish-movies": ["افلام تركية", "أفلام تركية", "8-aflam3isk"],
|
| 51 |
+
"anime-series": ["مسلسلات انمي", "كرتون", "6-anime-series"],
|
| 52 |
+
"indian-series": ["مسلسلات هندية", "11indian-series"],
|
| 53 |
+
"tv-programs": ["برامج تلفزيون", "tv-programs12"],
|
| 54 |
+
"plays": ["مسرحيات", "masrh-5"]
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
# Manual Fallbacks for reliability
|
| 58 |
+
HARDCODED_FALLBACKS = {
|
| 59 |
+
"arabic-movies": "arabic-movies33",
|
| 60 |
+
"english-movies": "all_movies_13",
|
| 61 |
+
"indian-movies": "indian-movies9",
|
| 62 |
+
"asian-movies": "6-asian-movies",
|
| 63 |
+
"anime-movies": "anime-movies-7",
|
| 64 |
+
"dubbed-movies": "7-aflammdblgh",
|
| 65 |
+
"turkish-movies": "8-aflam3isk",
|
| 66 |
+
"arabic-series": "arabic-series46",
|
| 67 |
+
"ramadan-2025": "13-ramadan-2025",
|
| 68 |
+
"ramadan-2024": "28-ramadan-2024",
|
| 69 |
+
"ramadan-2023": "10-ramadan-2023",
|
| 70 |
+
"english-series": "english-series10",
|
| 71 |
+
"turkish-series": "turkish-3isk-seriess47",
|
| 72 |
+
"indian-series": "11indian-series",
|
| 73 |
+
"tv-programs": "tv-programs12",
|
| 74 |
+
"plays": "masrh-5",
|
| 75 |
+
"anime-series": "6-anime-series",
|
| 76 |
+
"asian-series": "6-asya"
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
def __init__(self):
    """Create the HTTP session and initialise all scraper state.

    No network traffic happens here. Environment variables read: ``RENDER``,
    ``SPACE_ID`` and ``PROXY_LIST`` (comma-separated proxy endpoints).
    """
    # Primary fetcher: curl-cffi (Fastest, TLS Impersonation)
    # Using chrome120 and disabling SSL verify for maximum compatibility
    self.session = AsyncSession(impersonate="chrome120", timeout=30, verify=False)
    self._httpx_client = None
    self._session_initialized = False
    self._session_warmed_at = 0

    # Default request headers
    self.headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "ar,en-US;q=0.9,en;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Referer": "https://www.google.com/",
        "Connection": "keep-alive",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "cross-site",
    }

    # Solver / browser synchronisation state
    self._cookies_synced = False
    self._last_pw_solve = 0
    self._ua_synced = None
    self._chrome_version = None

    # Progress flags
    self._optimization_started = False
    self._is_prefetching = False
    self._domain_detected = False

    # Concurrency primitives
    self._domain_lock = asyncio.Lock()
    self._warming_lock = asyncio.Lock()
    self._discovery_lock = asyncio.Lock()
    self._uc_lock = asyncio.Lock()
    self._solver_lock = asyncio.Lock()  # Guard against multiple solvers
    self._semaphore = asyncio.Semaphore(5)  # Reduced from 15 for stability

    # Hybrid Configuration
    self.REMOTE_SOLVER_URL = "https://meih-movies-api.onrender.com/remote-fetch"
    self.IS_RENDER = os.environ.get("RENDER") is not None
    self.IS_HUGGINGFACE = os.environ.get("SPACE_ID") is not None

    # Free Proxy Pool for Hugging Face (to bypass IP bans)
    self._free_proxy_pool = []
    self._free_proxies = []
    self._proxy_pool_last_refresh = 0
    self._proxy_refresh_interval = 1800  # 30 minutes
    self._proxy_refresh_time = 0

    # --- Proxy Rotation System ---
    configured = []
    for raw_entry in os.getenv("PROXY_LIST", "").split(","):
        entry = raw_entry.strip()
        if entry:
            configured.append(entry)
    self.proxies = configured
    self._current_proxy_idx = 0
    if self.proxies:
        logger.info(f"✓ Proxy rotation enabled with {len(self.proxies)} endpoints")

    # Category discovery
    self._category_map = {}
    self._last_discovery = 0

    # --- Mirror & Performance ---
    self._cache = {}  # {url: (timestamp, data)}
    self._cache_ttl = 3600  # 1 hour for data
|
| 138 |
+
|
| 139 |
+
# We'll start optimization on the first request to avoid "no running loop" error
|
| 140 |
+
|
| 141 |
+
async def _optimize_connection(self):
|
| 142 |
+
"""Find the fastest mirror and warm up the engine"""
|
| 143 |
+
# 1. Check if we already have a reasonably fresh fastest mirror
|
| 144 |
+
now = time.time()
|
| 145 |
+
if hasattr(self, '_fastest_mirror_detected_at') and now - self._fastest_mirror_detected_at < 3600:
|
| 146 |
+
return
|
| 147 |
+
|
| 148 |
+
logger.info("🔍 Testing mirror speeds (Optimized)...")
|
| 149 |
+
|
| 150 |
+
async def test_mirror(mirror):
|
| 151 |
+
try:
|
| 152 |
+
# very aggressive timeout for discovery
|
| 153 |
+
start = time.time()
|
| 154 |
+
test_url = f"{mirror}/newvideos1.php"
|
| 155 |
+
async with httpx.AsyncClient(timeout=1.5, follow_redirects=True, verify=False) as client:
|
| 156 |
+
resp = await client.get(test_url)
|
| 157 |
+
if resp.status_code == 200:
|
| 158 |
+
return (time.time() - start, mirror)
|
| 159 |
+
except:
|
| 160 |
+
pass
|
| 161 |
+
return (999, mirror)
|
| 162 |
+
|
| 163 |
+
results = await asyncio.gather(*(test_mirror(m) for m in self.MIRRORS))
|
| 164 |
+
results.sort()
|
| 165 |
+
|
| 166 |
+
min_time, fastest_mirror = results[0]
|
| 167 |
+
|
| 168 |
+
if min_time < 999:
|
| 169 |
+
logger.info(f"⚡ Fastest mirror: {fastest_mirror} ({min_time:.2f}s)")
|
| 170 |
+
self.BASE_URL = fastest_mirror
|
| 171 |
+
self.TARGET_URL = f"{fastest_mirror}/newvideos1.php"
|
| 172 |
+
self._fastest_mirror_detected_at = now
|
| 173 |
+
else:
|
| 174 |
+
logger.warning("⚠️ No mirrors responded quickly, using default.")
|
| 175 |
+
self._fastest_mirror_detected_at = now - 3300 # Retry sooner
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
async def _refresh_free_proxies(self):
|
| 179 |
+
"""Fetch free proxies from public APIs (for cloud deployments)"""
|
| 180 |
+
# Enable on both Hugging Face and Render.com
|
| 181 |
+
if not (self.IS_HUGGINGFACE or self.IS_RENDER):
|
| 182 |
+
return
|
| 183 |
+
|
| 184 |
+
now = time.time()
|
| 185 |
+
if now - self._proxy_pool_last_refresh < 300: # Refresh every 5 minutes
|
| 186 |
+
return
|
| 187 |
+
|
| 188 |
+
logger.info("🔄 Refreshing free proxy pool...")
|
| 189 |
+
self._proxy_pool_last_refresh = now
|
| 190 |
+
|
| 191 |
+
proxy_sources = [
|
| 192 |
+
"https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all",
|
| 193 |
+
"https://www.proxy-list.download/api/v1/get?type=http",
|
| 194 |
+
]
|
| 195 |
+
|
| 196 |
+
new_proxies = []
|
| 197 |
+
for source in proxy_sources:
|
| 198 |
+
try:
|
| 199 |
+
async with httpx.AsyncClient(timeout=10.0) as client:
|
| 200 |
+
resp = await client.get(source)
|
| 201 |
+
if resp.status_code == 200:
|
| 202 |
+
proxies = resp.text.strip().split('\n')
|
| 203 |
+
for proxy in proxies[:10]: # Take first 10 from each source
|
| 204 |
+
proxy = proxy.strip()
|
| 205 |
+
if proxy and ':' in proxy:
|
| 206 |
+
new_proxies.append(f"http://{proxy}")
|
| 207 |
+
except Exception as e:
|
| 208 |
+
logger.warning(f"Failed to fetch proxies from {source}: {e}")
|
| 209 |
+
|
| 210 |
+
if new_proxies:
|
| 211 |
+
self._free_proxy_pool = new_proxies
|
| 212 |
+
logger.info(f"✅ Loaded {len(new_proxies)} free proxies")
|
| 213 |
+
else:
|
| 214 |
+
logger.warning("⚠️ No free proxies available")
|
| 215 |
+
|
| 216 |
+
async def _discover_categories(self, force=False):
|
| 217 |
+
"""Build the category map dynamically from the homepage"""
|
| 218 |
+
async with self._discovery_lock:
|
| 219 |
+
if not force and time.time() - self._last_discovery < 3600: # Cache for 1 hour
|
| 220 |
+
return
|
| 221 |
+
|
| 222 |
+
logger.info("Refreshing category mapping...")
|
| 223 |
+
html = await self._get_html(self.BASE_URL)
|
| 224 |
+
if not html: return
|
| 225 |
+
|
| 226 |
+
soup = BeautifulSoup(html, 'html.parser')
|
| 227 |
+
new_map = {}
|
| 228 |
+
|
| 229 |
+
# Find all category links
|
| 230 |
+
for a in soup.find_all('a', href=True):
|
| 231 |
+
href = a['href']
|
| 232 |
+
if 'cat=' not in href: continue
|
| 233 |
+
|
| 234 |
+
cat_id = href.split('cat=')[-1].split('&')[0]
|
| 235 |
+
text = a.get_text(strip=True).lower()
|
| 236 |
+
|
| 237 |
+
# Match against keywords
|
| 238 |
+
for alias, keywords in self.CATEGORY_KEYWORDS.items():
|
| 239 |
+
if alias not in new_map:
|
| 240 |
+
if any(k in text for k in keywords):
|
| 241 |
+
new_map[alias] = cat_id
|
| 242 |
+
|
| 243 |
+
if new_map:
|
| 244 |
+
self._category_map = new_map
|
| 245 |
+
self._last_discovery = time.time()
|
| 246 |
+
logger.info(f"✓ Mapped {len(new_map)} categories: {new_map}")
|
| 247 |
+
|
| 248 |
+
async def _resolve_cat_id(self, cat_id: str) -> str:
    """Translate a category alias into the site's real category id.

    Lookup order: the dynamically discovered map, then
    ``self.HARDCODED_FALLBACKS``. A value found in neither is returned
    unchanged, so callers may pass real ids as well as aliases.
    """
    await self._discover_categories()

    # 1. Dynamic map built from the homepage
    discovered = self._category_map
    if cat_id in discovered:
        return discovered[cat_id]

    # 2. Hardcoded fallbacks, else the input itself
    return self.HARDCODED_FALLBACKS.get(cat_id, cat_id)
|
| 260 |
+
|
| 261 |
+
async def _warm_session(self):
|
| 262 |
+
"""Warm up session with the detected working mirror"""
|
| 263 |
+
if not self._domain_detected:
|
| 264 |
+
# We already set defaults in __init__ / class, just confirm
|
| 265 |
+
logger.info(f"🚀 Targeting exclusive source: {self.TARGET_URL}")
|
| 266 |
+
self._domain_detected = True
|
| 267 |
+
|
| 268 |
+
if not self._session_initialized:
|
| 269 |
+
self._session_initialized = True # Mark as init even if basic get fails, as PW will solve it
|
| 270 |
+
|
| 271 |
+
async def _refresh_free_proxies(self):
|
| 272 |
+
"""Refresh free proxy list if needed"""
|
| 273 |
+
if time.time() - self._proxy_refresh_time > self._proxy_refresh_interval:
|
| 274 |
+
logger.info("Refreshing free proxy pool...")
|
| 275 |
+
self._free_proxies = await proxy_fetcher.get_working_proxies(max_count=15)
|
| 276 |
+
self._proxy_refresh_time = time.time()
|
| 277 |
+
logger.info(f"Loaded {len(self._free_proxies)} working free proxies")
|
| 278 |
+
|
| 279 |
+
def _get_proxy(self) -> Optional[str]:
|
| 280 |
+
# On cloud platforms (HF or Render), prioritize free proxy pool
|
| 281 |
+
if (self.IS_HUGGINGFACE or self.IS_RENDER) and self._free_proxy_pool:
|
| 282 |
+
proxy = self._free_proxy_pool[self._current_proxy_idx % len(self._free_proxy_pool)]
|
| 283 |
+
self._current_proxy_idx += 1
|
| 284 |
+
return proxy
|
| 285 |
+
|
| 286 |
+
# Try free proxies first (legacy proxy_fetcher)
|
| 287 |
+
if self._free_proxies:
|
| 288 |
+
proxy = self._free_proxies[self._current_proxy_idx % len(self._free_proxies)]
|
| 289 |
+
self._current_proxy_idx += 1
|
| 290 |
+
return proxy
|
| 291 |
+
|
| 292 |
+
# Fallback to configured proxies
|
| 293 |
+
if not self.proxies: return None
|
| 294 |
+
proxy = self.proxies[self._current_proxy_idx % len(self.proxies)]
|
| 295 |
+
self._current_proxy_idx += 1
|
| 296 |
+
return proxy
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
async def _get_html_with_undetected_chrome(self, url: str) -> Optional[str]:
    """Fetch *url* with a headless undetected-chromedriver browser.

    Only one browser runs at a time (``self._uc_lock``). The blocking
    Selenium work runs in the default thread-pool executor so the event loop
    is not blocked. After a successful load, the browser's user agent is
    copied into ``self.headers`` so later plain HTTP requests present the
    same user agent.

    Args:
        url: Page to load.

    Returns:
        The page source captured after a fixed 10 second wait, or ``None``
        when Selenium/undetected-chromedriver is not installed or the
        browser run fails.
    """
    if not HAS_SELENIUM:
        logger.error("❌ Cannot use UC: Selenium/Undetected-Chromedriver not installed.")
        return None

    async with self._uc_lock:
        logger.info(f"💣 Launching Undetected-Chrome NUCLEAR Bypass for {url}...")

        def get_chrome_version():
            """Read Chrome's major version from the Windows registry; 120 if unavailable."""
            try:
                import winreg
                # The context manager closes the registry handle when done.
                with winreg.OpenKey(winreg.HKEY_CURRENT_USER, r'Software\Google\Chrome\BLBeacon') as key:
                    version, _ = winreg.QueryValueEx(key, 'version')
                return int(version.split('.')[0])
            except Exception:
                # Non-Windows host (no winreg), missing key, or odd version string.
                return 120  # Fallback

        if not self._chrome_version:
            self._chrome_version = get_chrome_version()

        def chrome_task():
            """Blocking browser run; returns ``(html, user_agent)`` or ``(None, None)``."""
            driver = None
            try:
                options = uc.ChromeOptions()
                for flag in (
                    '--headless',
                    '--no-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-gpu',
                    '--window-size=1280,1024',
                    '--mute-audio',
                    '--disable-notifications',
                    '--disable-popup-blocking',
                    '--hide-scrollbars',
                    '--disable-logging',
                    '--log-level=3',
                    '--no-first-run',
                    '--no-default-browser-check',
                    '--no-pings',
                    '--disable-blink-features=AutomationControlled',
                ):
                    options.add_argument(flag)

                # Disable images for maximum speed
                prefs = {
                    'profile.managed_default_content_settings.images': 2,
                    'profile.default_content_settings.images': 2,
                }
                options.add_experimental_option('prefs', prefs)

                driver = uc.Chrome(options=options, version_main=self._chrome_version)
                driver.set_page_load_timeout(60)

                logger.info(f"💣 UC Fetching: {url}")
                driver.get(url)

                # Fixed wait so either the content or a challenge can finish loading.
                time.sleep(10)  # Heavy sleep for UC

                html = driver.page_source
                ua = driver.execute_script("return navigator.userAgent")
                return html, ua
            except Exception as e:
                logger.error(f"Undetected-Chrome failure: {e}")
                return None, None
            finally:
                if driver:
                    try:
                        driver.quit()
                    except Exception as quit_error:
                        # Shutdown is best-effort; the fetch result is already decided.
                        logger.debug(f"Undetected-Chrome quit failed: {quit_error}")

        loop = asyncio.get_running_loop()
        html, ua = await loop.run_in_executor(None, chrome_task)

        # Sync the user agent here, on the event-loop side, rather than
        # mutating shared headers from the worker thread.
        if ua:
            self.headers["User-Agent"] = ua

        return html
|
| 374 |
+
|
| 375 |
+
async def _get_html_with_flaresolverr(self, url: str) -> Optional[str]:
|
| 376 |
+
"""FlareSolverr with Singleton Lock to avoid browser bloat"""
|
| 377 |
+
async with self._solver_lock:
|
| 378 |
+
# Re-check cache inside lock
|
| 379 |
+
if url in self._cache:
|
| 380 |
+
return self._cache[url][1]
|
| 381 |
+
|
| 382 |
+
logger.info(f"✨ Requesting FlareSolverr solve for {url}...")
|
| 383 |
+
|
| 384 |
+
flaresolverr_url = "http://localhost:8191/v1"
|
| 385 |
+
payload = {
|
| 386 |
+
"cmd": "request.get",
|
| 387 |
+
"url": url,
|
| 388 |
+
"maxTimeout": 60000
|
| 389 |
+
}
|
| 390 |
+
|
| 391 |
+
# Connection Retry Loop
|
| 392 |
+
max_conn_retries = 5 # Increased retries
|
| 393 |
+
for conn_attempt in range(max_conn_retries):
|
| 394 |
+
try:
|
| 395 |
+
async with httpx.AsyncClient(timeout=90.0) as client:
|
| 396 |
+
response = await client.post(flaresolverr_url, json=payload)
|
| 397 |
+
if response.status_code == 200:
|
| 398 |
+
data = response.json()
|
| 399 |
+
if data.get('status') == 'ok':
|
| 400 |
+
solution = data.get('solution', {})
|
| 401 |
+
html = solution.get('response', '')
|
| 402 |
+
|
| 403 |
+
# SYNCING LOGIC
|
| 404 |
+
cookies = solution.get('cookies', [])
|
| 405 |
+
ua = solution.get('userAgent', '')
|
| 406 |
+
if ua:
|
| 407 |
+
self._ua_synced = ua
|
| 408 |
+
self.headers["User-Agent"] = ua
|
| 409 |
+
|
| 410 |
+
for cookie in cookies:
|
| 411 |
+
# Ensure domain is set for proper cookie handling
|
| 412 |
+
domain = cookie.get('domain')
|
| 413 |
+
if not domain and url:
|
| 414 |
+
try:
|
| 415 |
+
domain = urlparse(url).netloc
|
| 416 |
+
if domain.startswith('www.'):
|
| 417 |
+
domain = domain[4:]
|
| 418 |
+
except:
|
| 419 |
+
pass
|
| 420 |
+
|
| 421 |
+
if domain:
|
| 422 |
+
self.session.cookies.set(
|
| 423 |
+
cookie['name'],
|
| 424 |
+
cookie['value'],
|
| 425 |
+
domain=domain,
|
| 426 |
+
path=cookie.get('path', '/'),
|
| 427 |
+
secure=cookie.get('secure', False),
|
| 428 |
+
expires=cookie.get('expires')
|
| 429 |
+
)
|
| 430 |
+
|
| 431 |
+
self._cookies_synced = True
|
| 432 |
+
self._last_pw_solve = time.time()
|
| 433 |
+
logger.info("✅ Session Synced!")
|
| 434 |
+
return html
|
| 435 |
+
else:
|
| 436 |
+
logger.warning(f"FlareSolverr error: {data.get('message')}")
|
| 437 |
+
else:
|
| 438 |
+
logger.warning(f"FlareSolverr returned status {response.status_code}")
|
| 439 |
+
except Exception as e:
|
| 440 |
+
if conn_attempt < max_conn_retries - 1:
|
| 441 |
+
logger.warning(f"FlareSolverr comm failed (attempt {conn_attempt+1}/{max_conn_retries}): {e}. Retrying...")
|
| 442 |
+
await asyncio.sleep(2)
|
| 443 |
+
else:
|
| 444 |
+
logger.error(f"FlareSolverr comm failed after {max_conn_retries} attempts: {e}")
|
| 445 |
+
return None
|
| 446 |
+
|
| 447 |
+
async def _turbo_prefetch(self):
|
| 448 |
+
"""Pre-fetch all major categories in parallel to populate cache instantly"""
|
| 449 |
+
if self._is_prefetching: return
|
| 450 |
+
self._is_prefetching = True
|
| 451 |
+
logger.info("🚀 NITRO MODE: Starting concurrent background pre-fetch...")
|
| 452 |
+
|
| 453 |
+
try:
|
| 454 |
+
# List of high-priority tasks
|
| 455 |
+
tasks = [self.fetch_home(page=1)]
|
| 456 |
+
|
| 457 |
+
# Map of key categories to pre-warm
|
| 458 |
+
priority_cats = list(self.CATEGORY_KEYWORDS.keys())[:15]
|
| 459 |
+
for cat_id in priority_cats:
|
| 460 |
+
tasks.append(self.fetch_category(cat_id, page=1))
|
| 461 |
+
|
| 462 |
+
# Run everything in parallel with semaphore protection
|
| 463 |
+
await asyncio.gather(*tasks, return_exceptions=True)
|
| 464 |
+
logger.info(f"⚡ NITRO MODE complete! Cache primed with {len(self._cache)} items.")
|
| 465 |
+
except Exception as e:
|
| 466 |
+
logger.error(f"Nitro pre-fetch failed: {e}")
|
| 467 |
+
finally:
|
| 468 |
+
self._is_prefetching = False
|
| 469 |
+
|
| 470 |
+
async def _get_html(self, url: str, max_retries: int = 1, follow_meta=True) -> Optional[str]:
|
| 471 |
+
"""Nitro-Speed Fetch with Parallel Safety"""
|
| 472 |
+
if not self._optimization_started:
|
| 473 |
+
self._optimization_started = True
|
| 474 |
+
asyncio.create_task(self._optimize_connection())
|
| 475 |
+
|
| 476 |
+
async with self._semaphore:
|
| 477 |
+
now = time.time()
|
| 478 |
+
|
| 479 |
+
# 0. Cache Check
|
| 480 |
+
if url in self._cache:
|
| 481 |
+
ts, data = self._cache[url]
|
| 482 |
+
if now - ts < self._cache_ttl:
|
| 483 |
+
return data
|
| 484 |
+
|
| 485 |
+
# Sanitize URL - Skip landing pages
|
| 486 |
+
if any(x in url for x in ["/gaza.20", "/gaza.18", "/gaza.22"]):
|
| 487 |
+
logger.info(f"Sanitizing landing page URL: {url} -> {self.TARGET_URL}")
|
| 488 |
+
url = self.TARGET_URL
|
| 489 |
+
|
| 490 |
+
# Refresh free proxies if on cloud platforms
|
| 491 |
+
if self.IS_HUGGINGFACE or self.IS_RENDER:
|
| 492 |
+
await self._refresh_free_proxies()
|
| 493 |
+
|
| 494 |
+
proxy = self._get_proxy()
|
| 495 |
+
proxy_dict = {"http": proxy, "https": proxy} if proxy else None
|
| 496 |
+
|
| 497 |
+
# 1. Nitro Path (curl-cffi)
|
| 498 |
+
logger.info(f"🚀 Nitro Path (curl-cffi) for {url}")
|
| 499 |
+
try:
|
| 500 |
+
# Increased timeout to 45s to handle extremely slow responses
|
| 501 |
+
resp = await self.session.get(url, headers=self.headers, timeout=45, proxies=proxy_dict)
|
| 502 |
+
status_code = resp.status_code
|
| 503 |
+
logger.info(f"📡 Nitro Path response: {status_code} ({len(resp.content)} bytes)")
|
| 504 |
+
|
| 505 |
+
if status_code == 200:
|
| 506 |
+
text = resp.text
|
| 507 |
+
# Improve Meta Refresh detection (Larooza uses this heavily for domain rotation)
|
| 508 |
+
refresh_match = re.search(r'http-equiv=["\']refresh["\'].*?content=["\']\d+;\s*url=(.*?)["\']', text, re.I)
|
| 509 |
+
if not refresh_match:
|
| 510 |
+
refresh_match = re.search(r'content=["\']\d+;\s*url=(.*?)["\']', text, re.I)
|
| 511 |
+
|
| 512 |
+
if refresh_match and follow_meta:
|
| 513 |
+
new_url_raw = refresh_match.group(1).strip("'\" ")
|
| 514 |
+
new_url = urljoin(url, new_url_raw)
|
| 515 |
+
|
| 516 |
+
# Preserve query parameters if the new URL doesn't have them but the old one did
|
| 517 |
+
if "?" not in new_url and "?" in url:
|
| 518 |
+
query = url.split("?")[-1]
|
| 519 |
+
new_url = f"{new_url}?{query}" if not new_url.endswith("?") else f"{new_url}{query}"
|
| 520 |
+
|
| 521 |
+
# If redirecting to a known landing page or ad-trap, skip it
|
| 522 |
+
if any(x in new_url for x in ["gaza.20", "gaza.18", "gaza.22", "gaza.24"]):
|
| 523 |
+
logger.info(f"🚫 Skipping ad-trap redirect: {new_url}")
|
| 524 |
+
new_url = self.TARGET_URL
|
| 525 |
+
|
| 526 |
+
logger.info(f"🔄 Following meta refresh to: {new_url}")
|
| 527 |
+
return await self._get_html(new_url, max_retries=max_retries, follow_meta=False)
|
| 528 |
+
|
| 529 |
+
# More robust Cloudflare & Landing Page detection
|
| 530 |
+
text_lower = text.lower()
|
| 531 |
+
cf_markers = ["challenge-running", "cf-ray", "cloudflare-static", "just a moment", "verify you are human", "checking your browser"]
|
| 532 |
+
is_cf = any(x in text_lower for x in cf_markers) or "id=\"challenge-form\"" in text_lower
|
| 533 |
+
|
| 534 |
+
# Detect landing page even if 200 OK (gaza.20 redirect in JS or Meta)
|
| 535 |
+
is_landing = "gaza.20" in text_lower or "gaza.18" in text_lower or "gaza.22" in text_lower
|
| 536 |
+
|
| 537 |
+
if is_cf:
|
| 538 |
+
logger.warning(f"⚠️ Cloudflare detected in Nitro response for {url}")
|
| 539 |
+
elif is_landing and follow_meta:
|
| 540 |
+
logger.info(f"🔄 Landing page detected in content for {url}, forcing target...")
|
| 541 |
+
return await self._get_html(self.TARGET_URL, max_retries=max_retries, follow_meta=False)
|
| 542 |
+
else:
|
| 543 |
+
self._cache[url] = (now, text)
|
| 544 |
+
return text
|
| 545 |
+
elif status_code == 404:
|
| 546 |
+
logger.warning(f"⚠️ Nitro Path 404 for {url} on mirror {self.BASE_URL}")
|
| 547 |
+
# If this was a mirror, fallback to primary domain
|
| 548 |
+
primary_primary = self.MIRRORS[0]
|
| 549 |
+
if self.BASE_URL != primary_primary:
|
| 550 |
+
fallback_url = url.replace(self.BASE_URL, primary_primary)
|
| 551 |
+
logger.info(f"🔁 Falling back to primary domain: {fallback_url}")
|
| 552 |
+
return await self._get_html(fallback_url, max_retries=max_retries, follow_meta=True)
|
| 553 |
+
elif status_code == 403:
|
| 554 |
+
logger.warning(f"🚫 Nitro Path 403 for {url}, falling back to solvers...")
|
| 555 |
+
except Exception as e:
|
| 556 |
+
logger.error(f"❌ Nitro Path error for {url}: {e}")
|
| 557 |
+
|
| 558 |
+
# 2. Solver Path
|
| 559 |
+
for att in range(max_retries):
|
| 560 |
+
# Use a specific lock for solver to prevent multiple concurrent solver requests for the same URL
|
| 561 |
+
# but allow different URLs in parallel. For simplicity, we use the existing semaphore and a small delay.
|
| 562 |
+
|
| 563 |
+
# Check cache again just in case another task filled it
|
| 564 |
+
if url in self._cache:
|
| 565 |
+
return self._cache[url][1]
|
| 566 |
+
|
| 567 |
+
html = await self._get_html_with_flaresolverr(url)
|
| 568 |
+
if html:
|
| 569 |
+
self._cache[url] = (now, html)
|
| 570 |
+
return html
|
| 571 |
+
|
| 572 |
+
# UC Fallback for critical pages
|
| 573 |
+
if att == max_retries - 1:
|
| 574 |
+
logger.info(f"UC Fallback for: {url}")
|
| 575 |
+
res = await self._get_html_with_undetected_chrome(url)
|
| 576 |
+
if res: return res
|
| 577 |
+
|
| 578 |
+
return None
|
| 579 |
+
|
| 580 |
+
def _extract_items(self, soup: BeautifulSoup) -> List[Dict]:
    """Turn a listing page into a list of item dictionaries.

    Each returned dict has the keys ``id`` (URL-safe base64 of the absolute
    video link), ``title`` (with noise words and 4-digit runs removed),
    ``poster`` (a ``/proxy/image?url=...`` path), ``type`` (``"series"`` or
    ``"movie"``) and ``duration`` (text, possibly empty). Duplicate links are
    skipped. An empty list is returned for a missing soup or when the page
    title mentions "challenge" or "cloudflare".
    """
    results = []
    if not soup:
        return []

    page_title = soup.title
    if page_title:
        logger.info(f"Extracting: {soup.title.string}")
        title_markup = str(page_title).lower()
        if "challenge" in title_markup or "cloudflare" in title_markup:
            return []

    # Known card containers across site variants and mirrors
    cards = soup.select('.thumbnail, .pm-li-video, .pm-video-thumb, .video-block, .movie-item, li.col-xs-6, .box, .video-box, .video-item, .post-item')
    if not cards:
        # Deep scan for any link that looks like a video
        cards = soup.select('a[href*="video.php"], a[href*="watch.php"], .video-listing-content, .card-video')

    noise_words = ["مشاهدة", "فيلم", "مسلسل", "كامل", "HDCAM", "HD", "WEB-DL", "Cam", "مترجم", "اون لاين", "مدبلج"]
    series_markers = ['حلقة', 'مسلسل', 'episode', 'season', 'series']
    visited = set()

    for card in cards:
        # 1. Link: the card itself when it is a video anchor, else one inside it
        if card.name == 'a' and 'video.php' in card.get('href', ''):
            anchor = card
        else:
            anchor = card.select_one('a.ellipsis') or card.find('a', href=lambda x: x and 'video.php' in x)

        if not anchor:
            continue
        raw_href = anchor.get('href')
        if not raw_href:
            continue

        absolute = urljoin(self.BASE_URL, raw_href)
        if absolute in visited:
            continue
        visited.add(absolute)

        # 2. Title: heading-like node first, then the link's own title/text
        heading = card.select_one('h3, h2, .title, .ellipsis, .video-title, p')
        name = heading.get_text(strip=True) if heading else ""
        if not name and anchor:
            name = anchor.get('title') or anchor.get_text(strip=True)

        # Strip noise words one after another (order matters), then 4-digit runs
        for word in noise_words:
            name = name.replace(word, "").strip()
        name = re.sub(r'\d{4}', '', name).strip("- ").strip()

        # 3. Image: lazy-load attributes first, real URLs preferred over base64
        picture = card.select_one('img')
        image = ""
        if picture:
            for attr_name in ('data-src', 'data-lazy-src', 'data-original', 'srcset', 'src'):
                candidate = picture.get(attr_name)
                if not candidate or candidate.startswith('data:'):
                    continue
                if candidate.startswith(('http', '//', '/')):
                    image = candidate
                    break

            # Still nothing: accept ANY attribute value that looks like an image URL
            if not image:
                for value in picture.attrs.values():
                    if not isinstance(value, str) or value.startswith('data:'):
                        continue
                    if value.startswith('http') or '.jpg' in value or '.png' in value:
                        image = value
                        break

            if image and "," in image:  # srcset: keep the first candidate's URL
                image = image.split(",")[0].split(" ")[0]

        # Fallback: background-image in the card's inline style
        if not image:
            inline_style = card.get('style') or ""
            if 'background-image' in inline_style:
                found = re.search(r'url\([\'"]?(.*?)[\'"]?\)', inline_style)
                if found:
                    image = found.group(1)

        if not image or image.startswith('data:'):
            image = "https://placehold.co/600x400/000000/FFFFFF?text=No+Poster"

        # Make protocol-relative and root-relative URLs absolute
        if image.startswith('//'):
            image = 'https:' + image
        elif image.startswith('/'):
            image = self.BASE_URL + image

        # Posters are served through the backend image proxy
        poster = f"/proxy/image?url={quote(image)}"

        # 4. Series detection from title keywords
        lowered = name.lower()
        kind = "series" if any(marker in lowered for marker in series_markers) else "movie"

        length_node = card.select_one('.duration, .pm-label-duration, .time')
        results.append({
            "id": base64.urlsafe_b64encode(absolute.encode()).decode(),
            "title": name,
            "poster": poster,
            "type": kind,
            "duration": length_node.get_text(strip=True) if length_node else ""
        })
    return results
|
| 680 |
+
|
| 681 |
+
async def fetch_home(self, page: int = 1) -> List[Dict]:
    """Return the items listed on page *page* of ``self.TARGET_URL``.

    An empty list is returned (and an error logged) when the page could not
    be fetched.
    """
    page_url = f"{self.TARGET_URL}?page={page}"
    markup = await self._get_html(page_url, max_retries=3)
    if markup:
        entries = self._extract_items(BeautifulSoup(markup, 'html.parser'))
        logger.info(f"Fetched {len(entries)} items from {page_url}")
        return entries

    logger.error(f"Failed to fetch home page: {page_url}")
    return []
|
| 691 |
+
|
| 692 |
+
async def fetch_category(self, cat_id: str, page: int = 1) -> List[Dict]:
    """Return the items on page *page* of a category.

    *cat_id* may be an alias or a real id; it is resolved first. An empty
    list is returned when the page could not be fetched.
    """
    real_id = await self._resolve_cat_id(cat_id)
    listing_url = f"{self.BASE_URL}/category.php?cat={real_id}&page={page}"
    markup = await self._get_html(listing_url, max_retries=3)
    if not markup:
        return []
    return self._extract_items(BeautifulSoup(markup, 'html.parser'))
|
| 697 |
+
|
| 698 |
+
def _normalize_number(self, text: str) -> int:
|
| 699 |
+
"""Extract episode number from Arabic/English text"""
|
| 700 |
+
# Arabic number words mapping
|
| 701 |
+
arabic_map = {
|
| 702 |
+
'الأولى': 1, 'الاولى': 1, 'الثانية': 2, 'الثالثة': 3, 'الرابعة': 4,
|
| 703 |
+
'الخامسة': 5, 'السادسة': 6, 'السابعة': 7, 'الثامنة': 8, 'التاسعة': 9,
|
| 704 |
+
'العاشرة': 10, 'الحادية': 11, 'الثانية عشر': 12, 'الثالثة عشر': 13,
|
| 705 |
+
'الرابعة عشر': 14, 'الخامسة عشر': 15, 'السادسة عشر': 16, 'السابعة عشر': 17,
|
| 706 |
+
'الثامنة عشر': 18, 'التاسعة عشر': 19, 'العشرون': 20, 'الاخيرة': 999
|
| 707 |
+
}
|
| 708 |
+
|
| 709 |
+
# Try to find numeric digits first (most reliable)
|
| 710 |
+
match = re.search(r'(\d+)', text)
|
| 711 |
+
if match:
|
| 712 |
+
return int(match.group(1))
|
| 713 |
+
|
| 714 |
+
# Try Arabic number words
|
| 715 |
+
text_lower = text.lower()
|
| 716 |
+
for arabic_word, num in arabic_map.items():
|
| 717 |
+
if arabic_word in text_lower:
|
| 718 |
+
return num
|
| 719 |
+
|
| 720 |
+
# Try to extract from patterns like "الحلقة X" or "Episode X"
|
| 721 |
+
patterns = [
|
| 722 |
+
r'(?:الحلقة|حلقة|episode|ep)\s*[:\-]?\s*(\d+)',
|
| 723 |
+
r'(\d+)\s*(?:الحلقة|حلقة|episode|ep)',
|
| 724 |
+
]
|
| 725 |
+
for pattern in patterns:
|
| 726 |
+
match = re.search(pattern, text_lower)
|
| 727 |
+
if match:
|
| 728 |
+
return int(match.group(1))
|
| 729 |
+
|
| 730 |
+
return 0
|
| 731 |
+
|
| 732 |
+
def _safe_get_episode(self, text: str, name_hint: str = None) -> int:
|
| 733 |
+
"""Smarter episode number extraction with common patterns"""
|
| 734 |
+
# Remove common noise
|
| 735 |
+
clean = re.sub(r'\(.*?\)', '', text)
|
| 736 |
+
clean = re.sub(r'\[.*?\]', '', clean)
|
| 737 |
+
|
| 738 |
+
if name_hint:
|
| 739 |
+
# Remove the series name from the text to avoid matching numbers in the title (e.g. "2 قهوة")
|
| 740 |
+
clean = clean.replace(name_hint, "").strip()
|
| 741 |
+
|
| 742 |
+
# 1. Look for number after keywords (Most reliable)
|
| 743 |
+
m = re.search(r'(?:الحلقة|حلقة|ep|episode|part|p)\s*(\d+)', clean, re.I)
|
| 744 |
+
if m: return int(m.group(1))
|
| 745 |
+
|
| 746 |
+
# 2. Direct digits (Fallback)
|
| 747 |
+
m = re.search(r'(\d+)', clean)
|
| 748 |
+
if m: return int(m.group(1))
|
| 749 |
+
|
| 750 |
+
# 3. Word matches
|
| 751 |
+
return self._normalize_number(clean)
|
| 752 |
+
|
| 753 |
+
async def search(self, query: str) -> List[Dict]:
    """Search the site for *query* and return the matching items.

    The query is URL-quoted into ``search.php?keywords=``. An empty list is
    returned when the results page could not be fetched.
    """
    results_url = f"{self.BASE_URL}/search.php?keywords={quote(query)}"
    markup = await self._get_html(results_url, max_retries=2)
    if not markup:
        return []
    return self._extract_items(BeautifulSoup(markup, 'html.parser'))
|
| 757 |
+
|
| 758 |
+
async def fetch_details(self, safe_id: str) -> Dict:
|
| 759 |
+
try:
|
| 760 |
+
url = base64.urlsafe_b64decode(safe_id).decode()
|
| 761 |
+
except: return {}
|
| 762 |
+
|
| 763 |
+
html = await self._get_html(url)
|
| 764 |
+
if not html: return {}
|
| 765 |
+
|
| 766 |
+
soup = BeautifulSoup(html, 'html.parser')
|
| 767 |
+
|
| 768 |
+
# Follow play.php for watch servers
|
| 769 |
+
watch_html = html
|
| 770 |
+
watch_soup = soup
|
| 771 |
+
play_a = soup.select_one('a[href*="play.php"]')
|
| 772 |
+
if play_a:
|
| 773 |
+
p_url = urljoin(self.BASE_URL, play_a.get('href'))
|
| 774 |
+
p_html = await self._get_html(p_url)
|
| 775 |
+
if p_html:
|
| 776 |
+
watch_soup = BeautifulSoup(p_html, 'html.parser')
|
| 777 |
+
watch_html = p_html
|
| 778 |
+
|
| 779 |
+
title = soup.find('h1').get_text(strip=True) if soup.find('h1') else "Unknown"
|
| 780 |
+
is_series = bool(soup.select('.episodes-list, .season-episodes, .vid-episodes')) or any(x in title for x in ["حلقة", "مسلسل", "الموسم"])
|
| 781 |
+
|
| 782 |
+
raw_poster = soup.select_one('meta[property="og:image"]')['content'] if soup.select_one('meta[property="og:image"]') else ""
|
| 783 |
+
if not raw_poster:
|
| 784 |
+
img_tag = soup.select_one('.poster img, .movie-poster img, .pm-video-watch-main img')
|
| 785 |
+
if img_tag:
|
| 786 |
+
raw_poster = img_tag.get('src') or img_tag.get('data-src')
|
| 787 |
+
|
| 788 |
+
poster = ""
|
| 789 |
+
if raw_poster:
|
| 790 |
+
full_poster_url = urljoin(self.BASE_URL, raw_poster)
|
| 791 |
+
poster = f"/proxy/image?url={quote(full_poster_url)}"
|
| 792 |
+
|
| 793 |
+
response = {
|
| 794 |
+
"id": safe_id, "title": title,
|
| 795 |
+
"description": soup.select_one('.story, .desc, .entry-content').get_text(strip=True) if soup.select_one('.story, .desc, .entry-content') else "",
|
| 796 |
+
"poster": poster,
|
| 797 |
+
"type": "series" if is_series else "movie",
|
| 798 |
+
"seasons": [], "episodes": [], "servers": [], "download_links": []
|
| 799 |
+
}
|
| 800 |
+
|
| 801 |
+
# --- Episodes ---
|
| 802 |
+
if is_series:
|
| 803 |
+
unique_eps = {}
|
| 804 |
+
|
| 805 |
+
# 1. Proactive Search: Look for a "Series Category" link
|
| 806 |
+
cat_link = None
|
| 807 |
+
|
| 808 |
+
# A. Check Breadcrumbs (Very reliable for series category)
|
| 809 |
+
for bc in soup.select('.breadcrumb a, .bread-crumb a, .breadcrumbs a, .pm-breadcrumb a'):
|
| 810 |
+
href = bc.get('href')
|
| 811 |
+
if href and ('cat=' in href or 'ser=' in href):
|
| 812 |
+
# Skip generic high-level categories if possible?
|
| 813 |
+
# Actually, we filter by title later, so it's okay.
|
| 814 |
+
cat_link = urljoin(self.BASE_URL, href)
|
| 815 |
+
if 'ser=' in href: # Prefer ser= over cat=
|
| 816 |
+
break
|
| 817 |
+
|
| 818 |
+
# Extract clean series name for filtering
|
| 819 |
+
clean_title = title.replace("مسلسل", "").strip()
|
| 820 |
+
# Try to get name before "الحلقة" or "المواسم"
|
| 821 |
+
series_name = re.split(r'الحلقة|الموسم|حلقة|season|episode', clean_title, flags=re.I)[0].strip()
|
| 822 |
+
# Arabic numeral support for filtering
|
| 823 |
+
series_name_alt = series_name.replace('0','٠').replace('1','١').replace('2','٢').replace('3','٣').replace('4','٤').replace('5','٥').replace('6','٦').replace('7','٧').replace('8','٨').replace('9','٩')
|
| 824 |
+
|
| 825 |
+
logger.info(f"Targeting series name: {series_name} (Alt: {series_name_alt})")
|
| 826 |
+
|
| 827 |
+
# B. Check if Title itself is a link to the category or series
|
| 828 |
+
if not cat_link:
|
| 829 |
+
title_link = soup.select_one('h1 a[href*="cat="], h1 a[href*="ser="], h1 a[href*="tag.php"]')
|
| 830 |
+
if title_link:
|
| 831 |
+
cat_link = urljoin(self.BASE_URL, title_link['href'])
|
| 832 |
+
|
| 833 |
+
# C. General search in links with strict patterns
|
| 834 |
+
if not cat_link:
|
| 835 |
+
for a in soup.find_all('a', href=True):
|
| 836 |
+
href = a['href']
|
| 837 |
+
a_text = a.get_text(strip=True)
|
| 838 |
+
# High-confidence patterns
|
| 839 |
+
if any(x in a_text for x in ["المسلسل:", "جميع الحلقات", "حلقات المسلسل", "كل الحلقات"]):
|
| 840 |
+
cat_link = urljoin(self.BASE_URL, href)
|
| 841 |
+
logger.info(f"Found cat_link via labels: {cat_link}")
|
| 842 |
+
break
|
| 843 |
+
|
| 844 |
+
# D. Fallback search by title
|
| 845 |
+
if not cat_link:
|
| 846 |
+
for a in soup.find_all('a', href=True):
|
| 847 |
+
href = a['href']
|
| 848 |
+
if any(x in href for x in ['ser=', 'cat=', 'tag.php']):
|
| 849 |
+
a_text = a.get_text(strip=True)
|
| 850 |
+
if (series_name and series_name in a_text) or (series_name_alt and series_name_alt in a_text):
|
| 851 |
+
cat_link = urljoin(self.BASE_URL, href)
|
| 852 |
+
logger.info(f"Found cat_link via fallback title search: {cat_link}")
|
| 853 |
+
break
|
| 854 |
+
|
| 855 |
+
if cat_link:
|
| 856 |
+
try:
|
| 857 |
+
# Determine type: view-serie.php, category.php, tag.php
|
| 858 |
+
is_view_serie = 'view-serie' in cat_link
|
| 859 |
+
param_name = 'ser' if is_view_serie else ('t' if 'tag.php' in cat_link else 'cat')
|
| 860 |
+
|
| 861 |
+
# Robust ID extraction
|
| 862 |
+
match = re.search(f'[?&]{param_name}=([^&]+)', cat_link)
|
| 863 |
+
if match:
|
| 864 |
+
cat_id = match.group(1)
|
| 865 |
+
base_deep_url = f"{self.BASE_URL}/tag.php?t={cat_id}" if param_name == 't' else \
|
| 866 |
+
(f"{self.BASE_URL}/view-serie.php?ser={cat_id}" if is_view_serie else \
|
| 867 |
+
f"{self.BASE_URL}/category.php?cat={cat_id}")
|
| 868 |
+
|
| 869 |
+
logger.info(f"Deep scraping episodes from {cat_link} (ID: {cat_id})")
|
| 870 |
+
# Fetch first 5 pages
|
| 871 |
+
for p in range(1, 6):
|
| 872 |
+
target_p = f"{base_deep_url}&page={p}" if p > 1 else base_deep_url
|
| 873 |
+
p_html = await self._get_html(target_p)
|
| 874 |
+
if not p_html: break
|
| 875 |
+
p_items = self._extract_items(BeautifulSoup(p_html, 'html.parser'))
|
| 876 |
+
|
| 877 |
+
if not p_items: break
|
| 878 |
+
for item in p_items:
|
| 879 |
+
# Filter Check: Use a fuzzy name match
|
| 880 |
+
i_title = item['title']
|
| 881 |
+
# Must match at least the first 2 words if possible, or the whole name
|
| 882 |
+
name_parts = series_name.split()
|
| 883 |
+
match_key = " ".join(name_parts[:2]) if len(name_parts) >= 2 else series_name
|
| 884 |
+
|
| 885 |
+
if match_key in i_title or series_name in i_title or series_name_alt in i_title:
|
| 886 |
+
e_num = self._safe_get_episode(i_title, name_hint=series_name)
|
| 887 |
+
if e_num and e_num not in unique_eps:
|
| 888 |
+
unique_eps[e_num] = {
|
| 889 |
+
"id": item['id'],
|
| 890 |
+
"episode": e_num,
|
| 891 |
+
"title": i_title
|
| 892 |
+
}
|
| 893 |
+
if len(p_items) < 10: break
|
| 894 |
+
except Exception as e:
|
| 895 |
+
logger.error(f"Category episode fetch failed: {e}")
|
| 896 |
+
|
| 897 |
+
# 2. Local fallback: Scrape episodes from the current page
|
| 898 |
+
for ep in soup.select('.episodes-list a, .season-episodes a, .vid-episodes a, ul.episodes li a, div.caption h3 a, .movie-item a, .related-vids a'):
|
| 899 |
+
ep_href = ep.get('href')
|
| 900 |
+
if not ep_href or 'video.php' not in ep_href: continue
|
| 901 |
+
ep_url = urljoin(self.BASE_URL, ep_href)
|
| 902 |
+
ep_text = ep.get_text(strip=True)
|
| 903 |
+
|
| 904 |
+
# If text is empty, check for nested title
|
| 905 |
+
if not ep_text:
|
| 906 |
+
inner = ep.find(['h3', 'span', 'strong'])
|
| 907 |
+
if inner: ep_text = inner.get_text(strip=True)
|
| 908 |
+
|
| 909 |
+
# CRITICAL FILTER: Item must belong to this series
|
| 910 |
+
if series_name and series_name not in ep_text:
|
| 911 |
+
continue
|
| 912 |
+
|
| 913 |
+
ep_num = self._safe_get_episode(ep_text, name_hint=series_name)
|
| 914 |
+
if ep_num and ep_num not in unique_eps:
|
| 915 |
+
unique_eps[ep_num] = {
|
| 916 |
+
"id": base64.urlsafe_b64encode(ep_url.encode()).decode(),
|
| 917 |
+
"episode": ep_num,
|
| 918 |
+
"title": ep_text
|
| 919 |
+
}
|
| 920 |
+
|
| 921 |
+
response['episodes'] = sorted(list(unique_eps.values()), key=lambda x: x['episode'])
|
| 922 |
+
response['seasons'] = [{"number": 1, "episodes": response['episodes']}]
|
| 923 |
+
|
| 924 |
+
# --- WATCH SERVERS ---
|
| 925 |
+
watch_urls = set()
|
| 926 |
+
|
| 927 |
+
def is_valid_srv(url_str: str) -> bool:
|
| 928 |
+
if not url_str or 'javascript' in url_str: return False
|
| 929 |
+
if 'larooza' in url_str and 'video.php' in url_str: return False
|
| 930 |
+
if any(x in url_str.lower() for x in ['beacon', 'analytics', 'pixel', 'ads.', 'google', 'facebook']): return False
|
| 931 |
+
return True
|
| 932 |
+
|
| 933 |
+
# 1. Primary: WatchList & Source tags
|
| 934 |
+
server_selectors = [
|
| 935 |
+
'ul.WatchList li', '.server-list li', '#servers li', '.watch-servers li',
|
| 936 |
+
'.video-servers-list li', 'div.servers a', '.player-servers li'
|
| 937 |
+
]
|
| 938 |
+
|
| 939 |
+
for sel in server_selectors:
|
| 940 |
+
for li in watch_soup.select(sel):
|
| 941 |
+
s_url = li.get('data-embed-url') or li.get('data-link') or li.get('data-embed') or li.get('data-src') or li.get('data-url')
|
| 942 |
+
if not s_url:
|
| 943 |
+
a_tag = li.find('a', href=True)
|
| 944 |
+
if a_tag and not a_tag['href'].startswith('javascript'):
|
| 945 |
+
s_url = a_tag['href']
|
| 946 |
+
|
| 947 |
+
if s_url and is_valid_srv(s_url):
|
| 948 |
+
if s_url.startswith('//'): s_url = "https:" + s_url
|
| 949 |
+
full_s_url = urljoin(self.BASE_URL, s_url)
|
| 950 |
+
if full_s_url not in watch_urls:
|
| 951 |
+
watch_urls.add(full_s_url)
|
| 952 |
+
name = li.get_text(strip=True) or f"سيرفر {len(response['servers']) + 1}"
|
| 953 |
+
response['servers'].append({"name": name, "url": full_s_url, "type": "iframe"})
|
| 954 |
+
|
| 955 |
+
# 2. Secondary: Deep Iframe Scan
|
| 956 |
+
for ifr in watch_soup.select('iframe[src], embed[src], video source[src]'):
|
| 957 |
+
src = ifr.get('src')
|
| 958 |
+
if is_valid_srv(src):
|
| 959 |
+
if src.startswith('//'): src = "https:" + src
|
| 960 |
+
full_s_url = urljoin(self.BASE_URL, src)
|
| 961 |
+
if full_s_url not in watch_urls:
|
| 962 |
+
watch_urls.add(full_s_url)
|
| 963 |
+
response['servers'].append({"name": f"سيرفر سريع {len(response['servers']) + 1}", "url": full_s_url, "type": "iframe"})
|
| 964 |
+
|
| 965 |
+
# 3. Regex Fallback (Scripts & Global)
|
| 966 |
+
patterns = [
|
| 967 |
+
r'iframe.*?src=["\'](https?://[^"\']+)["\']',
|
| 968 |
+
r'embedUrl["\']\s*:\s*["\'](https?://[^"\']+)["\']',
|
| 969 |
+
r'file["\']\s*:\s*["\'](https?://[^"\']+\.m3u8)["\']',
|
| 970 |
+
r'source\s*src=["\'](https?://[^"\']+)["\']'
|
| 971 |
+
]
|
| 972 |
+
for pattern in patterns:
|
| 973 |
+
for match in re.findall(pattern, watch_html, re.I):
|
| 974 |
+
if is_valid_srv(match) and match not in watch_urls:
|
| 975 |
+
watch_urls.add(match)
|
| 976 |
+
response['servers'].append({"name": f"سيرفر احتياطي {len(response['servers']) + 1}", "url": match, "type": "iframe"})
|
| 977 |
+
|
| 978 |
+
# Clean duplicates and sort by quality/relevance if possible
|
| 979 |
+
# For now, just ensuring uniqueness
|
| 980 |
+
|
| 981 |
+
# --- Downloads ---
|
| 982 |
+
dl_url = url.replace('video.php', 'download.php').replace('play.php', 'download.php')
|
| 983 |
+
dl_html = await self._get_html(dl_url)
|
| 984 |
+
if dl_html:
|
| 985 |
+
dl_soup = BeautifulSoup(dl_html, 'html.parser')
|
| 986 |
+
for mirror in dl_soup.select('a[target="_blank"]'):
|
| 987 |
+
m_url = mirror.get('href')
|
| 988 |
+
if m_url and 'http' in m_url:
|
| 989 |
+
if any(x in m_url.lower() for x in ['wa.me', 'facebook.com', 'twitter.com', 'telegram.me', 't.me', 'sharer.php']):
|
| 990 |
+
continue
|
| 991 |
+
q_text = mirror.get_text(strip=True).replace("اضغط هنا للتحميل", "").replace("تحميل الملف", "").strip() or "رابط تحميل"
|
| 992 |
+
response['download_links'].append({"quality": q_text, "url": m_url})
|
| 993 |
+
|
| 994 |
+
return response
|
| 995 |
+
|
| 996 |
+
# Module-level LaroozaScraper instance, created once when this module is imported.
scraper = LaroozaScraper()
|
scraper/proxy_fetcher.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Free Proxy Fetcher - Automatically fetches and validates free proxies
|
| 3 |
+
"""
|
| 4 |
+
import aiohttp
|
| 5 |
+
import asyncio
|
| 6 |
+
import logging
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger("proxy_fetcher")
|
| 9 |
+
|
| 10 |
+
class FreeProxyFetcher:
    """Fetch lists of free HTTP proxies from public APIs and validate them.

    ``proxies`` holds the proxy URLs loaded by the most recent fetch.
    ``last_fetch`` is initialised to 0; no method of this class updates it.
    """

    def __init__(self):
        self.proxies = []
        self.last_fetch = 0

    async def fetch_free_proxies(self):
        """Fetch free proxies from public APIs.

        Every non-blank line of a successful (HTTP 200) response is turned
        into an ``http://<line>`` URL, and the first 20 entries of each
        source are kept. A source that fails is logged and skipped.

        Returns:
            The list stored in ``self.proxies`` (duplicates removed, first
            occurrence order preserved).
        """
        proxy_sources = [
            "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all",
            "https://www.proxy-list.download/api/v1/get?type=http",
        ]

        all_proxies = []
        # aiohttp expects a ClientTimeout object; bare numbers are deprecated.
        timeout = aiohttp.ClientTimeout(total=10)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            for source in proxy_sources:
                try:
                    async with session.get(source) as resp:
                        if resp.status == 200:
                            text = await resp.text()
                            proxies = [
                                f"http://{line.strip()}"
                                for line in text.split('\n')
                                if line.strip()
                            ]
                            all_proxies.extend(proxies[:20])  # Take first 20 from each source
                            logger.info(f"Fetched {len(proxies)} proxies from {source}")
                except Exception as e:
                    logger.error(f"Failed to fetch from {source}: {e}")

        # The same proxy can be listed by several sources; validating it twice
        # would waste one of the limited validation slots.
        self.proxies = list(dict.fromkeys(all_proxies))
        logger.info(f"Total free proxies loaded: {len(self.proxies)}")
        return self.proxies

    async def validate_proxy(self, proxy, test_url="https://httpbin.org/ip"):
        """Test if a proxy works.

        Returns:
            True when a GET of ``test_url`` through ``proxy`` answers with
            HTTP 200 within 5 seconds, False on any other status or error.
        """
        timeout = aiohttp.ClientTimeout(total=5)
        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(test_url, proxy=proxy) as resp:
                    return resp.status == 200
        except asyncio.CancelledError:
            # Never report a cancelled check as "proxy is dead".
            raise
        except Exception as e:
            logger.debug(f"Proxy {proxy} failed validation: {e}")
            return False

    async def get_working_proxies(self, max_count=10, max_candidates=30):
        """Get validated working proxies.

        Args:
            max_count: Maximum number of working proxies to return.
            max_candidates: How many of the loaded proxies are validated
                (concurrently); defaults to the previous fixed value of 30.

        Returns:
            Working proxy URLs in their original order.
        """
        if not self.proxies:
            await self.fetch_free_proxies()

        candidates = self.proxies[:max_candidates]
        results = await asyncio.gather(
            *(self.validate_proxy(p) for p in candidates),
            return_exceptions=True,
        )

        # With return_exceptions=True a failure comes back as an exception
        # object, which is truthy, so compare against True explicitly.
        working = [p for p, ok in zip(candidates, results) if ok is True]
        working = working[:max(max_count, 0)]

        logger.info(f"Validated {len(working)} working proxies")
        return working
|
| 65 |
+
|
| 66 |
+
# Module-level FreeProxyFetcher instance, created once when this module is imported.
proxy_fetcher = FreeProxyFetcher()
|
start.sh
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Boot script: start FlareSolverr in the background, wait for its health
# endpoint, then run the FastAPI app in the foreground.
set -e

echo "--- STARTING MULTI-SERVICE BOOT ---"

# Step 1: Start FlareSolverr
echo "[1/3] Launching FlareSolverr in background..."
# Append without producing a stray leading ':' when PYTHONPATH is unset.
export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}/app/flaresolverr"
export PORT=8191
export LOG_LEVEL=info

# Run FlareSolverr with its own directory as CWD
(cd /app/flaresolverr && python3 flaresolverr.py) &

# Step 2: Health Check for FlareSolverr
echo "[2/3] Waiting for FlareSolverr to bind to port 8191..."
MAX_RETRIES=30
COUNT=0
READY=1
# -f makes HTTP error statuses count as "not ready"; --max-time keeps a
# hung connection from stalling the boot.
while ! curl -sf --max-time 2 http://localhost:8191/health > /dev/null; do
    sleep 1
    COUNT=$((COUNT+1))
    if [ "$COUNT" -ge "$MAX_RETRIES" ]; then
        echo "⚠️ FlareSolverr failed to start in time, continuing to FastAPI anyway..."
        READY=0
        break
    fi
done
# Only claim readiness when the health check actually succeeded.
if [ "$READY" -eq 1 ]; then
    echo "✅ FlareSolverr is ready!"
fi

# Step 3: Start FastAPI
echo "[3/3] Launching FastAPI on port 7860..."
uvicorn main:app --host 0.0.0.0 --port 7860 --log-level info
|
start_render.sh
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Render.com boot script: start FlareSolverr in the background, wait for its
# health endpoint, then run the FastAPI app on the port Render assigns.
set -e

echo "--- RENDER.COM DEPLOYMENT ---"

# Step 1: Start FlareSolverr
echo "[1/2] Launching FlareSolverr in background..."
# Append without producing a stray leading ':' when PYTHONPATH is unset.
export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}/opt/render/project/src/flaresolverr"
export PORT_FS=8191
export LOG_LEVEL=info

# start.sh configures FlareSolverr's listen port through PORT. Here PORT is
# Render's web port, so override it for the FlareSolverr process only;
# otherwise it could try to bind the port uvicorn needs.
(cd /opt/render/project/src/flaresolverr && PORT="$PORT_FS" python3 flaresolverr.py) &

# Wait for FlareSolverr (poll the health endpoint instead of a blind sleep)
echo "[2/2] Waiting for FlareSolverr..."
MAX_RETRIES=30
COUNT=0
READY=1
# python3 is already required to run FlareSolverr, so use it for the probe.
while ! python3 -c "import urllib.request; urllib.request.urlopen('http://localhost:${PORT_FS}/health', timeout=2)" > /dev/null 2>&1; do
    sleep 1
    COUNT=$((COUNT+1))
    if [ "$COUNT" -ge "$MAX_RETRIES" ]; then
        echo "⚠️ FlareSolverr failed to start in time, continuing to FastAPI anyway..."
        READY=0
        break
    fi
done
if [ "$READY" -eq 1 ]; then
    echo "✅ FlareSolverr ready!"
fi
echo "--- Starting FastAPI on port ${PORT:-7860} ---"

# Render provides $PORT automatically
uvicorn main:app --host 0.0.0.0 --port "${PORT:-7860}" --log-level info
|
tools/analyze_structure.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bs4 import BeautifulSoup
|
| 2 |
+
import sys
|
| 3 |
+
import io
|
| 4 |
+
|
| 5 |
+
# Set encoding for Windows terminal
if sys.platform == 'win32' and hasattr(sys.stdout, 'reconfigure'):
    # reconfigure() switches the existing stream to UTF-8 in place instead of
    # stacking a second TextIOWrapper on the same underlying buffer.
    sys.stdout.reconfigure(encoding='utf-8')
|
| 8 |
+
|
| 9 |
+
def analyze_html(path="flaresolverr_output.html", max_links=100):
    """Print a quick structural summary of a saved HTML page.

    Two reports are printed: the anchors (among the first ``max_links``)
    whose href contains one of a few content-related markers, and the CSS
    class names that look like item containers.

    Args:
        path: HTML file to analyse, read as UTF-8.
        max_links: How many anchors (in document order) are inspected.
    """
    with open(path, "r", encoding="utf-8") as f:
        html = f.read()

    soup = BeautifulSoup(html, 'html.parser')

    print("--- Analyzing Links ---")
    link_markers = ('cat=', 'video', 'movie', 'series')
    for i, a in enumerate(soup.find_all('a', href=True)[:max_links]):
        href = a['href']
        if any(marker in href for marker in link_markers):
            print(f"{i}: Text: {a.get_text(strip=True)} | Href: {href}")

    print("\n--- Analyzing Containers ---")
    # Look for common patterns in classes
    classes = {c for tag in soup.find_all(True, class_=True) for c in tag['class']}

    print(f"Found {len(classes)} unique classes.")
    # Print classes that might be containers; sorted so that repeated runs
    # print the same list (set iteration order is not stable across runs).
    container_hints = ('item', 'video', 'movie', 'thumb', 'card', 'block', 'col')
    potential = sorted(
        c for c in classes if any(hint in c.lower() for hint in container_hints)
    )
    print(f"Potential container classes: {potential}")
|
| 34 |
+
|
| 35 |
+
if __name__ == "__main__":
|
| 36 |
+
analyze_html()
|
tools/check_mirrors.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import httpx
|
| 3 |
+
from curl_cffi.requests import AsyncSession
|
| 4 |
+
|
| 5 |
+
async def check_mirrors():
    """Probe every known mirror with curl-cffi and with httpx.

    For each mirror the HTTP status and the first 100 characters of the body
    (newlines flattened) are printed per client. The two clients are probed
    independently, so a failure of one does not hide the result of the other.
    """
    mirrors = [
        "https://larooza.mom",
        "https://larooza.site",
        "https://laroza-tv.net",
        "https://larozavideo.net",
        "https://larooza.video",
        "https://q.larozavideo.net"
    ]

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    }

    def _preview(text):
        # Done outside the f-string: a backslash inside an f-string
        # expression is a SyntaxError before Python 3.12.
        return text[:100].replace('\n', ' ')

    for mirror in mirrors:
        print(f"Checking {mirror}...")

        # Try curl-cffi first
        try:
            async with AsyncSession(impersonate="chrome110") as s:
                resp = await s.get(mirror, headers=headers, timeout=10)
                print(f" [curl-cffi] {mirror}: {resp.status_code} | Title: {_preview(resp.text)}")
        except Exception as e:
            print(f" [Error] [curl-cffi] {mirror}: {e}")

        try:
            async with httpx.AsyncClient(http2=True, timeout=10) as client:
                resp = await client.get(mirror, headers=headers)
                print(f" [httpx] {mirror}: {resp.status_code} | Title: {_preview(resp.text)}")
        except Exception as e:
            print(f" [Error] [httpx] {mirror}: {e}")
|
| 32 |
+
|
| 33 |
+
if __name__ == "__main__":
|
| 34 |
+
asyncio.run(check_mirrors())
|
tools/debug_fs.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import httpx
|
| 3 |
+
import json
|
| 4 |
+
import sys
|
| 5 |
+
|
| 6 |
+
# Set encoding to utf-8 for windows console
if sys.platform == "win32" and hasattr(sys.stdout, "reconfigure"):
    # reconfigure() changes the encoding in place; detaching the buffer would
    # leave the original stream object unusable.
    sys.stdout.reconfigure(encoding="utf-8")
|
| 10 |
+
|
| 11 |
+
async def test():
    """Send each sample URL through a local FlareSolverr and print the outcome.

    For every URL a ``request.get`` command is POSTed to the FlareSolverr
    endpoint; the returned title is printed together with a verdict on
    whether the HTML contains one of the expected marker strings.
    """
    urls = [
        "https://q.larozavideo.net/home.24",
        "https://q.larozavideo.net/newvideos1.php",
        "https://q.larozavideo.net/category.php?cat=all_movies_13"
    ]

    flaresolverr_url = "http://127.0.0.1:8191/v1"
    # NOTE(review): ".thumbnail" and ".box" are CSS-selector spellings; raw
    # HTML would normally carry class="thumbnail" - confirm this is intended.
    markers = ("video.php", ".thumbnail", ".box")

    async with httpx.AsyncClient(timeout=90.0) as client:
        for target in urls:
            print(f"\n--- Testing {target} ---")
            payload = {"cmd": "request.get", "url": target, "maxTimeout": 60000}
            try:
                response = await client.post(flaresolverr_url, json=payload)
                if response.status_code != 200:
                    print(f"Server error: {response.status_code}")
                    continue

                data = response.json()
                if data.get('status') != 'ok':
                    print(f"FlareSolverr message: {data.get('message')}")
                    continue

                solution = data.get('solution', {})
                html = solution.get('response', '')
                title = solution.get('title', '')
                print(f"Title found: {title}")

                if any(marker in html for marker in markers):
                    print("FOUND: Movie items are present in HTML!")
                else:
                    print("FAILED: No movie items in HTML.")
                    print(f"Snippet: {html[:500]}")
            except Exception as e:
                print(f"Script error: {e}")
|
| 49 |
+
|
| 50 |
+
if __name__ == "__main__":
|
| 51 |
+
asyncio.run(test())
|
tools/debug_mirrors.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import httpx
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
|
| 5 |
+
async def debug_fetch():
    """Fetch each mirror's front page and report which item selectors match.

    Prints the HTTP status and page title, then the match count of every
    known container selector. When none matches, the number of direct
    video/watch links is printed instead. Non-200 responses print a short
    body snippet.
    """
    mirrors = ["https://q.larozavideo.net", "https://larooza.mom", "https://larooza.site", "https://m.laroza-tv.net"]
    selectors = ['.thumbnail', '.pm-li-video', '.pm-video-thumb', '.video-block', '.movie-item', 'li.col-xs-6', '.box', '.video-box', '.video-item', '.post-item']

    async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
        for mirror in mirrors:
            print(f"\n--- Checking mirror: {mirror} ---")
            try:
                resp = await client.get(mirror, headers={"User-Agent": "Mozilla/5.0"})
                print(f"Status: {resp.status_code}")
                if resp.status_code != 200:
                    print(f" Snippet: {resp.text[:200]}")
                    continue

                soup = BeautifulSoup(resp.text, 'html.parser')
                if soup.title:
                    title = soup.title.string
                else:
                    title = "No title"
                print(f"Title: {title}")

                matched_selectors = 0
                for sel in selectors:
                    count = len(soup.select(sel))
                    if count > 0:
                        print(f" Found {count} items with selector {sel}")
                        matched_selectors += 1

                # Fall back to counting plain video links when no container matched.
                if matched_selectors == 0:
                    video_links = len(soup.select('a[href*="video.php"], a[href*="watch.php"]'))
                    print(f" Found {video_links} video/watch links.")
            except Exception as e:
                print(f" Error: {e}")
|
| 33 |
+
|
| 34 |
+
if __name__ == "__main__":
|
| 35 |
+
asyncio.run(debug_fetch())
|
tools/debug_scraper.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import sys
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
# Add the current directory to path
|
| 6 |
+
sys.path.append(os.getcwd())
|
| 7 |
+
|
| 8 |
+
from scraper.engine import LaroozaScraper
|
| 9 |
+
|
| 10 |
+
# Set encoding to utf-8 for windows console
if sys.platform == "win32" and hasattr(sys.stdout, "reconfigure"):
    # reconfigure() changes the encoding in place; detaching the buffer would
    # leave the original stream object unusable.
    sys.stdout.reconfigure(encoding="utf-8")
|
| 14 |
+
|
| 15 |
+
async def test():
    """Run ``fetch_home(page=1)`` on a fresh scraper and print a short report.

    Prints the number of returned items and, when there are any, the title
    and the first 20 characters of the id of the first three.
    """
    scraper = LaroozaScraper()
    print("DEBUG: Fetching latest movies...")
    items = await scraper.fetch_home(page=1)
    print(f"DEBUG: Found {len(items)} items.")

    if not items:
        print("DEBUG: ❌ fetch_home returned 0 items.")
        return

    for position, entry in enumerate(items[:3], start=1):
        print(f" {position}. {entry['title']} - ID: {entry['id'][:20]}...")
|
| 25 |
+
|
| 26 |
+
if __name__ == "__main__":
|
| 27 |
+
asyncio.run(test())
|
tools/dump_html.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import httpx
|
| 2 |
+
import asyncio
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
|
| 5 |
+
async def dump_html():
    """Download one mirror's front page, save it to dump.html and list links.

    The response body is written to ``dump.html`` (UTF-8); afterwards the
    total number of anchors and the href/text of the first 20 are printed.
    """
    url = "https://larooza.mom"  # Using the one that gave 0 links
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    }
    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
        print(f"Fetching {url}...")
        resp = await client.get(url, headers=headers)
        print(f"Status: {resp.status_code}")
        body = resp.text
        with open("dump.html", "w", encoding="utf-8") as out:
            out.write(body)
        print("HTML dumped to dump.html")

        anchors = BeautifulSoup(body, 'html.parser').select('a')
        print(f"Total links: {len(anchors)}")
        for anchor in anchors[:20]:
            label = anchor.get_text(strip=True)[:30]
            print(f"Link: {anchor.get('href')} | Text: {label}")
|
| 23 |
+
|
| 24 |
+
if __name__ == "__main__":
|
| 25 |
+
asyncio.run(dump_html())
|
tools/dump_html_v2.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import httpx
|
| 2 |
+
import asyncio
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
|
| 5 |
+
async def dump_html():
    """Fetch the new-videos page and count the item containers it contains.

    Prints the status, the final URL after redirects and the number of
    elements matching the container selectors; when nothing matches, the
    first 1000 characters of the body are printed.
    """
    url = "https://q.larozavideo.net/newvideos1.php"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    }
    container_selector = '.thumbnail, .pm-li-video, .pm-video-thumb, .video-block, .movie-item, li.col-xs-6, .box, .video-box, .video-item, .post-item'
    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
        print(f"Fetching {url}...")
        resp = await client.get(url, headers=headers)
        print(f"Status: {resp.status_code}")
        print(f"Final URL: {resp.url}")

        page = BeautifulSoup(resp.text, 'html.parser')
        containers = page.select(container_selector)
        print(f"Found {len(containers)} item containers.")

        if not containers:
            print("Snippet of HTML:")
            print(resp.text[:1000])
|
| 23 |
+
|
| 24 |
+
if __name__ == "__main__":
|
| 25 |
+
asyncio.run(dump_html())
|
tools/extra/diagnose.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import httpx
|
| 3 |
+
import asyncio
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
async def check_service(name, url):
    """Report whether a GET of ``url`` gets any HTTP response within 5 seconds.

    Args:
        name: Label used in the printed message.
        url: Address to request.

    Returns:
        True when a response arrived (whatever its status code), False when
        the request raised.
    """
    try:
        async with httpx.AsyncClient(timeout=5.0) as client:
            resp = await client.get(url)
            status_line = f"✅ {name} is UP ({url}) - Status: {resp.status_code}"
            print(status_line)
            reachable = True
    except Exception as e:
        print(f"❌ {name} is DOWN ({url}) - Error: {e}")
        reachable = False
    return reachable
|
| 15 |
+
|
| 16 |
+
async def main():
    """Check the local backend and FlareSolverr health endpoints in turn."""
    print("--- Diagnostics ---")
    services = (
        ("Backend", "http://localhost:8000/health"),
        ("FlareSolverr", "http://localhost:8191/health"),
    )
    for label, health_url in services:
        await check_service(label, health_url)

    # Try to find the tunnel URL from local logs if possible
    print("\n--- Searching for Tunnel URL ---")
    # Only the header above is printed; no log lookup is implemented here.
    # Cloudflared usually doesn't log to a file unless specified.
|
| 25 |
+
|
| 26 |
+
if __name__ == "__main__":
|
| 27 |
+
asyncio.run(main())
|
tools/extra/expose_to_internet.bat
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@echo off
REM Expose the local backend (http://localhost:8000) through a Cloudflare tunnel.
echo ==========================================
echo CLOUDFLARE TUNNEL - EXPOSE TO INTERNET
echo ==========================================

REM Download Cloudflared (if not exists)
if not exist cloudflared.exe (
    echo Downloading Cloudflare Tunnel...
    powershell -Command "Invoke-WebRequest -Uri 'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe' -OutFile 'cloudflared.exe'"
)

REM Stop here when the download did not produce the executable, instead of
REM failing later with a confusing "not recognized" error.
if not exist cloudflared.exe (
    echo ERROR: cloudflared.exe could not be downloaded.
    pause
    exit /b 1
)

REM Start tunnel
echo Starting Cloudflare Tunnel...
echo Your backend will be accessible via a public URL in a moment...
echo.
cloudflared.exe tunnel --url http://localhost:8000

pause
|