Mina committed on
Commit
25ae7fe
·
0 Parent(s):

Fresh deploy without large files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +17 -0
  2. Dockerfile +35 -0
  3. Dockerfile.hf +35 -0
  4. Procfile +1 -0
  5. README.hf.md +33 -0
  6. README.md +30 -0
  7. database.py +48 -0
  8. deploy/.dockerignore +30 -0
  9. deploy/Dockerfile +50 -0
  10. deploy/cloudflare-worker.js +77 -0
  11. deploy/render.yaml +18 -0
  12. downloader.py +145 -0
  13. flaresolverr/bottle_plugins/__init__.py +0 -0
  14. flaresolverr/bottle_plugins/error_plugin.py +22 -0
  15. flaresolverr/bottle_plugins/logger_plugin.py +23 -0
  16. flaresolverr/bottle_plugins/prometheus_plugin.py +66 -0
  17. flaresolverr/build_package.py +126 -0
  18. flaresolverr/dtos.py +94 -0
  19. flaresolverr/flaresolverr.py +155 -0
  20. flaresolverr/flaresolverr_service.py +519 -0
  21. flaresolverr/metrics.py +32 -0
  22. flaresolverr/sessions.py +84 -0
  23. flaresolverr/tests.py +655 -0
  24. flaresolverr/tests_sites.py +102 -0
  25. flaresolverr/undetected_chromedriver/__init__.py +910 -0
  26. flaresolverr/undetected_chromedriver/cdp.py +112 -0
  27. flaresolverr/undetected_chromedriver/devtool.py +193 -0
  28. flaresolverr/undetected_chromedriver/dprocess.py +77 -0
  29. flaresolverr/undetected_chromedriver/options.py +85 -0
  30. flaresolverr/undetected_chromedriver/patcher.py +473 -0
  31. flaresolverr/undetected_chromedriver/reactor.py +99 -0
  32. flaresolverr/undetected_chromedriver/webelement.py +86 -0
  33. flaresolverr/utils.py +376 -0
  34. keep_alive.py +47 -0
  35. main.py +352 -0
  36. package.json +12 -0
  37. requirements.txt +14 -0
  38. scraper/engine.py +996 -0
  39. scraper/proxy_fetcher.py +66 -0
  40. start.sh +31 -0
  41. start_render.sh +22 -0
  42. tools/analyze_structure.py +36 -0
  43. tools/check_mirrors.py +34 -0
  44. tools/debug_fs.py +51 -0
  45. tools/debug_mirrors.py +35 -0
  46. tools/debug_scraper.py +27 -0
  47. tools/dump_html.py +25 -0
  48. tools/dump_html_v2.py +25 -0
  49. tools/extra/diagnose.py +27 -0
  50. tools/extra/expose_to_internet.bat +18 -0
.gitignore ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ venv/
2
+ __pycache__/
3
+ archive/
4
+ *.db
5
+ *.log
6
+ .env
7
+ .vscode/
8
+ .idea/
9
+ bin/
10
+ cache/
11
+ logs/
12
+ *.exe
13
+ *.img
14
+ dist/
15
+ node_modules/
16
+ .choreo/
17
+ TUNNEL_TOKEN.txt
Dockerfile ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ # Hugging Face Optimized - Lightweight & Stable
4
+ ENV PYTHONUNBUFFERED=1
5
+ ENV HF_SPACE=1
6
+
7
+ # Install minimal system dependencies
8
+ RUN apt-get update && apt-get install -y \
9
+ curl \
10
+ ffmpeg \
11
+ && rm -rf /var/lib/apt/lists/*
12
+
13
+ WORKDIR /app
14
+
15
+ # Copy requirements and install
16
+ COPY requirements.txt .
17
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
18
+
19
+ # Create a non-privileged user (Required by Hugging Face)
20
+ RUN useradd -m -u 1000 user
21
+ RUN chown -R user:user /app
22
+ USER user
23
+ ENV HOME=/home/user \
24
+ PATH=/home/user/.local/bin:$PATH
25
+
26
+ # Copy application code
27
+ COPY --chown=user:user . .
28
+
29
+ # Hugging Face uses port 7860
30
+ EXPOSE 7860
31
+ ENV PORT=7860
32
+
33
+ # Start the application with optimized settings for limited RAM
34
+ # We use 1 worker to keep memory usage low on the free tier
35
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--timeout-keep-alive", "60"]
Dockerfile.hf ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ # Hugging Face optimized - Lightweight without Chrome
4
+ ENV PYTHONUNBUFFERED=1
5
+ ENV SPACE_ID=huggingface
6
+ ENV HF_SPACE=1
7
+
8
+ # Install minimal dependencies
9
+ RUN apt-get update && apt-get install -y \
10
+ curl \
11
+ git \
12
+ && rm -rf /var/lib/apt/lists/*
13
+
14
+ WORKDIR /app
15
+
16
+ # Copy and install Python dependencies
17
+ COPY requirements.txt .
18
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
19
+
20
+ # Create user for Hugging Face
21
+ RUN useradd -m -u 1000 user
22
+ RUN chown -R user:user /app
23
+ USER user
24
+ ENV HOME=/home/user \
25
+ PATH=/home/user/.local/bin:$PATH
26
+
27
+ # Copy application
28
+ COPY --chown=user:user . .
29
+
30
+ # Hugging Face uses port 7860
31
+ EXPOSE 7860
32
+ ENV PORT=7860
33
+
34
+ # Start without FlareSolverr (too heavy for HF)
35
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
Procfile ADDED
@@ -0,0 +1 @@
 
 
1
+ web: uvicorn main:app --host 0.0.0.0 --port $PORT --log-level info
README.hf.md ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: MEIH Movies API
3
+ emoji: 🎬
4
+ colorFrom: red
5
+ colorTo: gray
6
+ sdk: docker
7
+ app_file: main.py
8
+ pinned: false
9
+ license: mit
10
+ ---
11
+
12
+ # MEIH Movies API - Hugging Face Edition
13
+
14
+ High-performance movie streaming API optimized for Hugging Face Spaces.
15
+
16
+ ## Features
17
+
18
+ - Fast content scraping with curl-cffi
19
+ - Intelligent caching system
20
+ - Rate limiting for stability
21
+ - Proxy rotation support
22
+
23
+ ## API Endpoints
24
+
25
+ - `GET /latest` - Latest movies and series
26
+ - `GET /category/{cat_id}` - Browse by category
27
+ - `GET /search?q={query}` - Search content
28
+ - `GET /details/{id}` - Get streaming links
29
+ - `GET /health` - Health check
30
+
31
+ ## Usage
32
+
33
+ Visit the API at: `https://YOUR-SPACE-NAME.hf.space/`
README.md ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Meih Movies API
3
+ emoji: 🎬
4
+ colorFrom: red
5
+ colorTo: gray
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
+
10
+ # MEIH Movies API - Hugging Face Edition
11
+
12
+ High-performance movie streaming API optimized for Hugging Face Spaces.
13
+
14
+ ## Features
15
+
16
+ - **Lightweight**: Optimized for 16GB RAM environments.
17
+ - **Fast**: Powered by `curl-cffi` for high-speed scraping.
18
+ - **Stable**: Automatic proxy rotation and intelligent caching.
19
+ - **Universal**: Serves both API and Frontend (if built).
20
+
21
+ ## API Endpoints
22
+
23
+ - `GET /latest` - Latest movies and series.
24
+ - `GET /search?q={query}` - Search content.
25
+ - `GET /details/{id}` - Get streaming links.
26
+ - `GET /health` - System status.
27
+
28
+ ## Deployment Note
29
+
30
+ This project is configured to run on port **7860**. Ensure your Space is set to **Docker** SDK.
database.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import aiosqlite
2
+ import logging
3
+
4
+ DB_NAME = "netflix_clone.db"
5
+
6
async def init_db():
    """Create the ``movies``, ``series`` and ``episodes`` tables if missing.

    Opens a connection to ``DB_NAME``, runs one ``CREATE TABLE IF NOT EXISTS``
    statement per table in order, and commits once at the end.
    """
    movies_ddl = """
        CREATE TABLE IF NOT EXISTS movies (
            id TEXT PRIMARY KEY,
            title TEXT,
            poster TEXT,
            year TEXT,
            rating TEXT,
            description TEXT,
            category TEXT
        )
    """
    series_ddl = """
        CREATE TABLE IF NOT EXISTS series (
            id TEXT PRIMARY KEY,
            title TEXT,
            poster TEXT,
            year TEXT,
            rating TEXT,
            description TEXT,
            category TEXT
        )
    """
    # Episodes reference their parent series through series_id.
    episodes_ddl = """
        CREATE TABLE IF NOT EXISTS episodes (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            series_id TEXT,
            episode_number INTEGER,
            title TEXT,
            watch_link TEXT,
            FOREIGN KEY(series_id) REFERENCES series(id)
        )
    """
    async with aiosqlite.connect(DB_NAME) as conn:
        for ddl in (movies_ddl, series_ddl, episodes_ddl):
            await conn.execute(ddl)
        await conn.commit()
44
+
45
async def get_db_connection():
    """Open and return a connection to ``DB_NAME`` that yields ``aiosqlite.Row`` rows.

    The connection is returned open; nothing in this function closes it.
    """
    connection = await aiosqlite.connect(DB_NAME)
    # Row objects allow column access by name as well as by index.
    connection.row_factory = aiosqlite.Row
    return connection
deploy/.dockerignore ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python ignore
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ venv/
6
+ .env
7
+ netflix_clone.db
8
+ archive/
9
+ tools/
10
+
11
+ # Node ignore
12
+ node_modules/
13
+ dist/
14
+ build/
15
+ .next/
16
+ .vite/
17
+
18
+ # Git ignore
19
+ .git/
20
+ .gitignore
21
+
22
+ # OS ignore
23
+ .DS_Store
24
+ Thumbs.db
25
+
26
+ # Project ignore
27
+ setup_and_run.bat
28
+ *.md
29
+ .gemini/
30
+ .agent/
deploy/Dockerfile ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==========================================
2
+ # Nitro Backend-Only Dockerfile for Hugging Face
3
+ # ==========================================
4
+ FROM python:3.11-slim
5
+
6
+ # Install system dependencies for Scraper (Chrome) and FlareSolverr
7
+ ENV DEBIAN_FRONTEND=noninteractive
8
+ RUN apt-get update && apt-get install -y \
9
+ ffmpeg \
10
+ curl \
11
+ git \
12
+ wget \
13
+ gnupg \
14
+ xvfb \
15
+ xauth \
16
+ dos2unix \
17
+ libnss3 \
18
+ libatk-bridge2.0-0 \
19
+ libgtk-3-0 \
20
+ && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /usr/share/keyrings/google-chrome.gpg \
21
+ && echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \
22
+ && apt-get update && apt-get install -y google-chrome-stable \
23
+ && rm -rf /var/lib/apt/lists/*
24
+
25
+ WORKDIR /app
26
+
27
+ # Install Backend Dependencies
28
+ COPY backend/requirements.txt ./
29
+ RUN pip install --no-cache-dir --upgrade pip && \
30
+ pip install --no-cache-dir -r requirements.txt
31
+
32
+ # Copy Backend Application
33
+ COPY backend/ ./
34
+
35
+ # Fix line endings and permissions
36
+ RUN dos2unix start.sh && chmod +x start.sh
37
+
38
+ # Create local user for Hugging Face Spaces (UID 1000)
39
+ RUN useradd -m -u 1000 user
40
+ RUN chown -R user:user /app
41
+ USER user
42
+ ENV HOME=/home/user \
43
+ PATH=/home/user/.local/bin:$PATH \
44
+ PYTHONPATH=/app
45
+
46
+ # Expose the mandatory Hugging Face Space port
47
+ EXPOSE 7860
48
+
49
+ # Kickstart the engine
50
+ CMD ["/bin/bash", "./start.sh"]
deploy/cloudflare-worker.js ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Cloudflare Worker - Proxy Bypass for Larooza Scraper
3
+ * Deploy this to Cloudflare Workers (100% FREE)
4
+ *
5
+ * This worker acts as a middle-man to bypass IP bans
6
+ */
7
+
8
+ addEventListener('fetch', event => {
9
+ event.respondWith(handleRequest(event.request))
10
+ })
11
+
12
/**
 * Parse a proxy target and accept it only when it is an absolute http(s) URL.
 *
 * @param {string} raw - Value of the `url` query parameter.
 * @returns {URL|null} The parsed URL, or null when `raw` is malformed or
 *   uses a scheme other than http/https.
 */
function parseTargetUrl(raw) {
  let parsed
  try {
    parsed = new URL(raw)
  } catch (err) {
    return null
  }
  return parsed.protocol === 'http:' || parsed.protocol === 'https:' ? parsed : null
}

/**
 * Fetch the page named by the `url` query parameter and relay its body with
 * permissive CORS headers.
 *
 * Responses:
 *   - OPTIONS            -> empty response carrying only the CORS headers
 *   - missing `url`      -> 400 JSON error
 *   - invalid `url`      -> 400 JSON error (malformed or non-http(s) target)
 *   - upstream reachable -> upstream status, body relayed as text/html
 *   - fetch throws       -> 500 JSON error
 *
 * NOTE(review): any http(s) host is still accepted, so this worker is an
 * open proxy; consider restricting targets to an allow-list of hosts.
 *
 * @param {Request} request - Incoming worker request.
 * @returns {Promise<Response>}
 */
async function handleRequest(request) {
  // Enable CORS
  const corsHeaders = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET, POST, OPTIONS',
    'Access-Control-Allow-Headers': 'Content-Type',
  }
  const jsonHeaders = { ...corsHeaders, 'Content-Type': 'application/json' }

  // Handle CORS preflight
  if (request.method === 'OPTIONS') {
    return new Response(null, { headers: corsHeaders })
  }

  // Get target URL from query parameter
  const url = new URL(request.url)
  const targetUrl = url.searchParams.get('url')

  if (!targetUrl) {
    return new Response(JSON.stringify({ error: 'Missing url parameter' }), {
      status: 400,
      headers: jsonHeaders
    })
  }

  // Reject malformed or non-http(s) targets up front: they are a client
  // error, not an upstream failure.
  if (parseTargetUrl(targetUrl) === null) {
    return new Response(JSON.stringify({ error: 'Invalid url parameter' }), {
      status: 400,
      headers: jsonHeaders
    })
  }

  try {
    // Fetch the target URL with realistic headers
    const response = await fetch(targetUrl, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'ar,en-US;q=0.9,en;q=0.8',
        'Referer': 'https://www.google.com/',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
      },
      cf: {
        // Cloudflare-specific options
        cacheTtl: 300, // Cache for 5 minutes
        cacheEverything: true,
      }
    })

    // Get the HTML content
    const html = await response.text()

    // Return with CORS headers; the upstream status is passed through
    return new Response(html, {
      status: response.status,
      headers: {
        ...corsHeaders,
        'Content-Type': 'text/html; charset=utf-8',
        'Cache-Control': 'public, max-age=300',
      }
    })

  } catch (error) {
    return new Response(JSON.stringify({
      error: 'Failed to fetch target URL',
      message: error.message
    }), {
      status: 500,
      headers: jsonHeaders
    })
  }
}
deploy/render.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Render.com Deployment Configuration
2
+ # https://render.com
3
+
4
+ services:
5
+ - type: web
6
+ name: meih-movies-api
7
+ env: docker
8
+ dockerfilePath: ./Dockerfile
9
+ dockerContext: ./backend
10
+ plan: free
11
+ region: oregon
12
+ envVars:
13
+ - key: PYTHON_VERSION
14
+ value: 3.11
15
+ - key: PORT
16
+ value: 7860
17
+ healthCheckPath: /health
18
+ autoDeploy: true
downloader.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yt_dlp
2
+ import logging
3
+ import asyncio
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
class VideoDownloader:
    """Resolve a video URL into a title, thumbnail and direct download formats.

    Larooza links (direct, or base64-encoded inside local ``/watch/`` and
    ``/details/`` URLs) are routed to the project scraper; every other URL,
    and any Larooza lookup that fails, goes through yt-dlp.
    """

    # Substrings that identify a Larooza-hosted URL.
    _LAROOZA_MARKERS = ('larozavideo', 'larooza', 'laroza')

    def __init__(self):
        # NOTE(review): yt-dlp usually takes the user agent through
        # ``http_headers``; confirm the ``user_agent`` key has any effect.
        self.ydl_opts = {
            'quiet': True,
            'no_warnings': True,
            'format': 'best',
            'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'geo_bypass': True,
            # yt-dlp spells this option ``noplaylist`` (no underscore).
            'noplaylist': True,
            'nocheckcertificate': True,
        }

    async def get_info(self, url: str):
        """Return metadata and direct download formats for ``url``.

        Args:
            url: A local ``/watch/<id>`` or ``/details/<id>`` link, a Larooza
                link, or any URL yt-dlp understands.

        Returns:
            A dict with ``title``, ``thumbnail``, ``duration``, ``uploader``,
            ``source`` and ``formats`` on success, or ``{"error": message}``
            when nothing usable could be extracted. This method does not
            raise; unexpected failures are logged and reported in the dict.
        """
        # 1. Handle Local Watch/Details Links or Direct Larooza Links
        if "/watch/" in url or "/details/" in url or self._is_larooza(url):
            try:
                larooza_result = await self._larooza_info(url)
            except Exception as e:
                # Best effort: fall back to the generic yt-dlp path below.
                logger.error(f"Larooza-specific extraction failed: {e}")
            else:
                if larooza_result is not None:
                    return larooza_result

        # 2. Universal yt-dlp Path (YouTube, TikTok, etc.)
        try:
            loop = asyncio.get_running_loop()
            try:
                # wait_for only stops waiting; the worker thread running
                # yt-dlp is not interrupted by the timeout.
                info = await asyncio.wait_for(
                    loop.run_in_executor(None, self._extract, url),
                    timeout=30.0
                )
            except asyncio.TimeoutError:
                logger.error(f"Timeout extracting info for {url}")
                return {"error": "استغرق استخراج البيانات وقتاً طويلاً. حاول مرة أخرى."}

            # ``ignoreerrors`` makes yt-dlp return None instead of raising.
            if not info:
                return {"error": "فشل في استخراج بيانات الفيديو. تأكد من الرابط."}

            # Live stream check
            if info.get('is_live') or info.get('live_status') == 'is_upcoming':
                return {"error": "هذا الفيديو لم يبدأ عرضه بعد أو هو بث مباشر حالياً."}

            formats = self._collect_formats(info)
            if not formats:
                return {"error": "لم يتم العثور على روابط تحميل مباشرة مدعومة لهذا الفيديو."}

            return {
                'title': info.get('title', 'Video'),
                'thumbnail': info.get('thumbnail', ''),
                'duration': info.get('duration', 0),
                'uploader': info.get('uploader', 'Unknown'),
                'source': info.get('extractor_key', 'Unknown'),
                # Reverse of yt-dlp's listing order.
                'formats': formats[::-1]
            }
        except Exception as e:
            logger.error(f"Universal Downloader error for {url}: {e}")
            return {"error": f"حدث خطأ غير متوقع: {str(e)}"}

    @classmethod
    def _is_larooza(cls, url):
        """Return True when ``url`` contains one of the Larooza markers."""
        return any(marker in url for marker in cls._LAROOZA_MARKERS)

    @staticmethod
    def _resolve_target_url(url):
        """Decode the base64 id of a local ``/watch/`` or ``/details/`` link.

        Any other URL, or a local link whose last path segment already starts
        with ``http``, is returned unchanged. Missing ``=`` padding is
        restored before decoding, so ids with stripped padding still work.

        Raises:
            ValueError: if the id is not valid base64 or not valid UTF-8
                (``binascii.Error`` and ``UnicodeDecodeError`` are subclasses).
        """
        import base64

        if "/watch/" not in url and "/details/" not in url:
            return url
        id_part = url.split("/")[-1].split("?")[0]
        if id_part.startswith("http"):
            return url
        padded = id_part + "=" * (-len(id_part) % 4)
        return base64.urlsafe_b64decode(padded).decode()

    async def _larooza_info(self, url):
        """Look ``url`` up through the Larooza scraper.

        Returns the result dict (or an error dict when the page has no
        download links), or None when the link is not a Larooza link or the
        scraper returned nothing, so the caller can fall back to yt-dlp.
        """
        import base64

        target_url = self._resolve_target_url(url)
        if not self._is_larooza(target_url):
            return None

        from scraper.engine import scraper

        logger.info(f"Routing Larooza link to scraper: {target_url}")
        # Normalize: downloader works better with the video.php page
        target_url = target_url.replace('play.php', 'video.php').replace('download.php', 'video.php')

        safe_id = base64.urlsafe_b64encode(target_url.encode()).decode()
        data = await scraper.fetch_details(safe_id)
        if not data:
            return None

        links = data.get('download_links')
        if not links:
            return {"error": "لم يتم العثور على روابط تحميل لهذا الفيديو (ربما محمي أو غير متاح حالياً)."}

        formats = [
            {
                'ext': 'mp4',
                'resolution': dl['quality'],
                'url': dl['url'],
                'type': 'video'
            }
            for dl in links
        ]
        return {
            'title': data.get('title'),
            'thumbnail': data.get('poster'),
            'duration': 0,
            'uploader': 'Larooza',
            'source': 'Larooza',
            'formats': formats
        }

    @staticmethod
    def _collect_formats(info):
        """Build the list of directly downloadable formats from a yt-dlp info dict.

        Entries without a URL and HLS/DASH manifests (``.m3u8``/``.mpd``) are
        dropped; only the first entry per (resolution label, has-video) pair
        is kept.
        """
        raw_formats = info.get('formats') or []
        if not raw_formats and info.get('url'):
            raw_formats = [info]  # For direct links

        formats = []
        seen_resolutions = set()
        for f in raw_formats:
            if not f:
                continue
            # Filter out formats without a direct URL or those that are just manifests
            f_url = f.get('url')
            if not f_url or '.m3u8' in f_url or '.mpd' in f_url:
                continue

            has_video = f.get('vcodec') != 'none'
            res = f.get('resolution') or f.get('format_note') or f.get('height') or 'Unknown'
            # Clean resolution label
            if isinstance(res, int):
                res = f"{res}p"

            # Avoid duplicates and prioritize video formats
            res_key = f"{res}_{has_video}"
            if res_key in seen_resolutions:
                continue
            seen_resolutions.add(res_key)

            formats.append({
                'id': f.get('format_id', 'unknown'),
                'ext': f.get('ext', 'mp4'),
                'resolution': res,
                'filesize': f.get('filesize') or f.get('filesize_approx') or 0,
                'url': f_url,
                'type': 'video' if has_video else 'audio'
            })
        return formats

    def _extract(self, url):
        """Run yt-dlp metadata extraction for ``url`` (blocking, no download)."""
        # Add extra robustness for TikTok and newer sites
        opts = {
            **self.ydl_opts,
            'nocheckcertificate': True,
            'ignoreerrors': True,
            'socket_timeout': 15,
        }
        with yt_dlp.YoutubeDL(opts) as ydl:
            return ydl.extract_info(url, download=False)
144
+
145
+ downloader = VideoDownloader()
flaresolverr/bottle_plugins/__init__.py ADDED
File without changes
flaresolverr/bottle_plugins/error_plugin.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bottle import response
2
+ import logging
3
+
4
+
5
def error_plugin(callback):
    """
    Bottle plugin that turns an exception raised by the route callback into
    a 500 response with an ``{"error": <message>}`` body.
    https://stackoverflow.com/a/32764250
    """

    def wrapper(*args, **kwargs):
        try:
            return callback(*args, **kwargs)
        except Exception as exc:
            message = str(exc)
            logging.error(message)
            response.status = 500
            return {"error": message}

    return wrapper
flaresolverr/bottle_plugins/logger_plugin.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bottle import request, response
2
+ import logging
3
+
4
+
5
def logger_plugin(callback):
    """
    Bottle plugin that logs each handled request through the logging module.
    https://bottlepy.org/docs/dev/plugindev.html

    After the callback returns, one INFO line with the remote address,
    method, URL and response status is emitted. Requests whose URL ends
    with "/health" are not logged.
    """

    def wrapper(*args, **kwargs):
        result = callback(*args, **kwargs)
        is_healthcheck = request.url.endswith("/health")
        if not is_healthcheck:
            fields = (request.remote_addr, request.method, request.url, response.status)
            logging.info('%s %s %s %s' % fields)
        return result

    return wrapper
flaresolverr/bottle_plugins/prometheus_plugin.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import urllib.parse
4
+
5
+ from bottle import request
6
+ from dtos import V1RequestBase, V1ResponseBase
7
+ from metrics import start_metrics_http_server, REQUEST_COUNTER, REQUEST_DURATION
8
+
9
+ PROMETHEUS_ENABLED = os.environ.get('PROMETHEUS_ENABLED', 'false').lower() == 'true'
10
+ PROMETHEUS_PORT = int(os.environ.get('PROMETHEUS_PORT', 8192))
11
+
12
+
13
def setup():
    """Start the metrics HTTP server on ``PROMETHEUS_PORT`` when metrics are enabled."""
    if not PROMETHEUS_ENABLED:
        return
    start_metrics_http_server(PROMETHEUS_PORT)
16
+
17
+
18
def prometheus_plugin(callback):
    """
    Bottle plugin to expose Prometheus metrics
    https://bottlepy.org/docs/dev/plugindev.html

    The route callback always runs and its response is returned unchanged.
    When ``PROMETHEUS_ENABLED`` is true the response is additionally turned
    into a duration observation and a result counter increment; any failure
    while exporting is logged as a warning and never reaches the client.
    """
    def wrapper(*args, **kwargs):
        actual_response = callback(*args, **kwargs)

        if PROMETHEUS_ENABLED:
            try:
                export_metrics(actual_response)
            except Exception as e:
                logging.warning("Error exporting metrics: " + str(e))

        return actual_response

    def export_metrics(actual_response):
        """Record duration and outcome metrics for one handled response."""
        res = V1ResponseBase(actual_response)

        if res.startTimestamp is None or res.endTimestamp is None:
            # skip management and healthcheck endpoints
            return

        domain = "unknown"
        if res.solution and res.solution.url:
            domain = parse_domain_url(res.solution.url)
        else:
            # timeout error: fall back to the URL of the incoming request.
            # The body may be absent or not JSON, in which case it is None.
            req = V1RequestBase(request.json or {})
            if req.url:
                domain = parse_domain_url(req.url)

        # Timestamps are divided by 1000 before being observed.
        run_time = (res.endTimestamp - res.startTimestamp) / 1000
        REQUEST_DURATION.labels(domain=domain).observe(run_time)

        # A response without a message is counted as "unknown" instead of
        # raising and losing the counter increment.
        message = res.message or ""
        result = "unknown"
        if message == "Challenge solved!":
            result = "solved"
        elif message == "Challenge not detected!":
            result = "not_detected"
        elif message.startswith("Error"):
            result = "error"
        REQUEST_COUNTER.labels(domain=domain, result=result).inc()

    def parse_domain_url(url):
        """Return the hostname of ``url``, or "unknown" when it has none."""
        parsed_url = urllib.parse.urlparse(url)
        return parsed_url.hostname or "unknown"

    return wrapper
flaresolverr/build_package.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import platform
3
+ import shutil
4
+ import subprocess
5
+ import sys
6
+ import zipfile
7
+ import tarfile
8
+
9
+ import requests
10
+
11
+
12
def clean_files():
    """Remove the ``build``, ``dist`` and ``dist_chrome`` folders of a previous build.

    The folders are looked up in the parent of this script's directory.
    Removal is best effort: any error (including a missing folder) is ignored.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    for folder_name in ('build', 'dist', 'dist_chrome'):
        try:
            shutil.rmtree(os.path.join(script_dir, os.pardir, folder_name))
        except Exception:
            pass
25
+
26
+
27
def download_chromium():
    """Download a pinned Chromium snapshot and unpack it into ``dist_chrome/chrome``.

    The snapshot archive for the current OS (Windows or Linux x64) is
    streamed to disk, extracted, and the extracted folder is renamed to
    ``chrome``. On non-Windows systems the bundled executables are made
    executable.

    Raises:
        FileExistsError: if ``dist_chrome`` already exists.
        requests.RequestException: on HTTP errors, or when the server stops
            responding for longer than the timeout.
    """
    # https://commondatastorage.googleapis.com/chromium-browser-snapshots/index.html?prefix=Linux_x64/
    # The same pinned revision is used for Windows and Linux.
    revision = "1522586"
    arch = 'Win_x64' if os.name == 'nt' else 'Linux_x64'
    dl_file = 'chrome-win' if os.name == 'nt' else 'chrome-linux'
    dl_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'dist_chrome')
    dl_path_folder = os.path.join(dl_path, dl_file)
    dl_path_zip = dl_path_folder + '.zip'

    # response = requests.get(
    #     f'https://commondatastorage.googleapis.com/chromium-browser-snapshots/{arch}/LAST_CHANGE',
    #     timeout=30)
    # revision = response.text.strip()
    print("Downloading revision: " + revision)

    os.mkdir(dl_path)
    # The timeout bounds connect time and each wait for data, not the whole
    # download, so a stalled server cannot hang the build forever.
    with requests.get(
            f'https://commondatastorage.googleapis.com/chromium-browser-snapshots/{arch}/{revision}/{dl_file}.zip',
            stream=True, timeout=30) as r:
        r.raise_for_status()
        with open(dl_path_zip, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    print("File downloaded: " + dl_path_zip)
    with zipfile.ZipFile(dl_path_zip, 'r') as zip_ref:
        zip_ref.extractall(dl_path)
    os.remove(dl_path_zip)

    chrome_path = os.path.join(dl_path, "chrome")
    shutil.move(dl_path_folder, chrome_path)
    print("Extracted in: " + chrome_path)

    if os.name != 'nt':
        # Give executable permissions for *nix
        # file * | grep executable | cut -d: -f1
        print("Giving executable permissions...")
        execs = ['chrome', 'chrome_crashpad_handler', 'chrome_sandbox', 'chrome-wrapper', 'xdg-mime', 'xdg-settings']
        for exec_file in execs:
            exec_path = os.path.join(chrome_path, exec_file)
            os.chmod(exec_path, 0o755)
67
+
68
+
69
def run_pyinstaller():
    """Build the executable by running PyInstaller from the parent directory.

    ``package.json`` and the downloaded ``dist_chrome/chrome`` folder are
    bundled as data files.

    Raises:
        Exception: if PyInstaller exits with a non-zero status; its stderr
            is printed first.
    """
    # PyInstaller separates source and destination of --add-data with ';'
    # on Windows and ':' elsewhere.
    sep = ';' if os.name == 'nt' else ':'
    chrome_source = os.path.join('dist_chrome', 'chrome')
    # NOTE(review): the entry script is looked up under "src/" relative to
    # the parent directory - confirm that matches this repository's layout.
    entry_script = os.path.join("src", "flaresolverr.py")
    command = [
        sys.executable, "-m", "PyInstaller",
        "--icon", "resources/flaresolverr_logo.ico",
        "--add-data", f"package.json{sep}.",
        "--add-data", f"{chrome_source}{sep}chrome",
        entry_script,
    ]
    result = subprocess.run(command, cwd=os.pardir,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if result.returncode == 0:
        return
    print(result.stderr.decode('utf-8'))
    raise Exception("Error running pyInstaller")
80
+
81
+
82
def compress_package():
    """Move the built ``flaresolverr`` folder into ``dist/package`` and archive it.

    A ``.zip`` is produced on Windows and a ``.tar.gz`` elsewhere. Tar
    entries have their owner and group information reset.
    """
    is_windows = os.name == 'nt'
    dist_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'dist')
    package_folder = os.path.join(dist_folder, 'package')
    shutil.move(os.path.join(dist_folder, 'flaresolverr'), os.path.join(package_folder, 'flaresolverr'))
    print("Package folder: " + package_folder)

    archive_name = 'flaresolverr_windows_x64' if is_windows else 'flaresolverr_linux_x64'
    archive_base = os.path.join(dist_folder, archive_name)

    if is_windows:
        shutil.make_archive(archive_base, 'zip', package_folder)
        print("Compressed file path: " + archive_base)
        return

    def _reset_tarinfo(tarinfo):
        # Reset uid/gid and clear the owner names recorded in the archive.
        tarinfo.uid = 0
        tarinfo.gid = 0
        tarinfo.uname = ""
        tarinfo.gname = ""
        return tarinfo

    tar_path = archive_base + '.tar.gz'
    with tarfile.open(tar_path, 'w:gz') as tar:
        for entry in os.listdir(package_folder):
            tar.add(os.path.join(package_folder, entry), arcname=entry, filter=_reset_tarinfo)
    print("Compressed file path: " + tar_path)
109
+
110
if __name__ == "__main__":
    print("Building package...")
    print("Platform: " + platform.platform())

    # Each build step is announced and then run, in this order.
    build_steps = (
        ("Cleaning previous build...", clean_files),
        ("Downloading Chromium...", download_chromium),
        ("Building pyinstaller executable... ", run_pyinstaller),
        ("Compressing package... ", compress_package),
    )
    for banner, step in build_steps:
        print(banner)
        step()
125
+
126
+ # NOTE: python -m pip install pyinstaller
flaresolverr/dtos.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ STATUS_OK = "ok"
3
+ STATUS_ERROR = "error"
4
+
5
+
6
+ class ChallengeResolutionResultT:
7
+ url: str = None
8
+ status: int = None
9
+ headers: list = None
10
+ response: str = None
11
+ cookies: list = None
12
+ userAgent: str = None
13
+ screenshot: str | None = None
14
+ turnstile_token: str = None
15
+
16
+ def __init__(self, _dict):
17
+ self.__dict__.update(_dict)
18
+
19
+
20
class ChallengeResolutionT:
    """Outcome of a challenge resolution: status, message and optional result.

    The constructor overlays the supplied dict on the instance and, when a
    ``result`` is present, wraps it in ``ChallengeResolutionResultT``.
    """
    status: str = None
    message: str = None
    result: ChallengeResolutionResultT = None

    def __init__(self, _dict):
        vars(self).update(_dict)
        raw_result = self.result
        if raw_result is not None:
            self.result = ChallengeResolutionResultT(raw_result)
29
+
30
+
31
class V1RequestBase(object):
    """Request body of the ``/v1`` endpoint.

    All fields default to None at class level; the constructor overlays the
    keys of the supplied dict as instance attributes.
    """
    # V1RequestBase
    cmd: str = None
    cookies: list = None
    maxTimeout: int = None
    proxy: dict = None
    session: str = None
    session_ttl_minutes: int = None
    headers: list = None  # deprecated v2.0.0, not used
    userAgent: str = None  # deprecated v2.0.0, not used

    # V1Request
    url: str = None
    postData: str = None
    returnOnlyCookies: bool = None
    returnScreenshot: bool = None
    download: bool = None  # deprecated v2.0.0, not used
    returnRawHtml: bool = None  # deprecated v2.0.0, not used
    waitInSeconds: int = None
    # Optional resource blocking flag (blocks images, CSS, and fonts)
    disableMedia: bool = None
    # Optional when you've got a turnstile captcha that needs to be clicked after X number of Tab presses
    tabs_till_verify : int = None

    def __init__(self, _dict):
        # Copy every key of the dict onto the instance, known field or not.
        vars(self).update(_dict)
57
+
58
+
59
class V1ResponseBase(object):
    """Response body of the ``/v1`` endpoint.

    The constructor overlays the supplied dict on the instance and, when a
    ``solution`` is present, wraps it in ``ChallengeResolutionResultT``.
    """
    # V1ResponseBase
    status: str = None
    message: str = None
    session: str = None
    sessions: list[str] = None
    startTimestamp: int = None
    endTimestamp: int = None
    version: str = None

    # V1ResponseSolution
    solution: ChallengeResolutionResultT = None

    # hidden vars
    __error_500__: bool = False

    def __init__(self, _dict):
        vars(self).update(_dict)
        raw_solution = self.solution
        if raw_solution is not None:
            self.solution = ChallengeResolutionResultT(raw_solution)
79
+
80
+
81
class IndexResponse(object):
    """Index response: message, version and user agent.

    Fields default to None; the constructor overlays the supplied dict.
    """
    msg: str = None
    version: str = None
    userAgent: str = None

    def __init__(self, _dict):
        vars(self).update(_dict)
88
+
89
+
90
class HealthResponse(object):
    """Health response carrying a single ``status`` field.

    ``status`` defaults to None; the constructor overlays the supplied dict.
    """
    status: str = None

    def __init__(self, _dict):
        vars(self).update(_dict)
flaresolverr/flaresolverr.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import sys
5
+
6
+ import certifi
7
+ from bottle import run, response, Bottle, request, ServerAdapter
8
+
9
+ from bottle_plugins.error_plugin import error_plugin
10
+ from bottle_plugins.logger_plugin import logger_plugin
11
+ from bottle_plugins import prometheus_plugin
12
+ from dtos import V1RequestBase
13
+ import flaresolverr_service
14
+ import utils
15
+
16
+ env_proxy_url = os.environ.get('PROXY_URL', None)
17
+ env_proxy_username = os.environ.get('PROXY_USERNAME', None)
18
+ env_proxy_password = os.environ.get('PROXY_PASSWORD', None)
19
+
20
+
21
class JSONErrorBottle(Bottle):
    """
    Bottle application whose default error handler (e.g. for 404 errors)
    answers with a JSON body instead of Bottle's default error page.
    """
    def default_error_handler(self, res):
        payload = {"error": res.body, "status_code": res.status_code}
        response.content_type = 'application/json'
        return json.dumps(payload)
28
+
29
+
30
+ app = JSONErrorBottle()
31
+
32
+
33
@app.route('/')
def index():
    """
    Show welcome message: the service's index response, converted to a dict.
    """
    return utils.object_to_dict(flaresolverr_service.index_endpoint())
40
+
41
+
42
@app.route('/health')
def health():
    """
    Healthcheck endpoint.
    This endpoint is special because it doesn't print traces.
    Returns the service's health response, converted to a dict.
    """
    return utils.object_to_dict(flaresolverr_service.health_endpoint())
50
+
51
+
52
@app.post('/v1')
def controller_v1():
    """
    Controller v1

    Parses the JSON body, fills in the proxy from the PROXY_* environment
    variables when the request does not carry one, hands the request to the
    service layer and returns its response as a dict. The HTTP status is
    set to 500 when the service flags the response as an error.
    """
    data = request.json or {}

    # The environment proxy is used only when the request has no proxy of
    # its own (key missing or falsy) and PROXY_URL is set.
    if not data.get('proxy') and env_proxy_url is not None:
        if env_proxy_username is None and env_proxy_password is None:
            logging.info('Using proxy URL ENV')
            data['proxy'] = {"url": env_proxy_url}
        else:
            logging.info('Using proxy URL, username & password ENVs')
            data['proxy'] = {"url": env_proxy_url, "username": env_proxy_username, "password": env_proxy_password}

    res = flaresolverr_service.controller_v1_endpoint(V1RequestBase(data))
    if res.__error_500__:
        response.status = 500
    return utils.object_to_dict(res)
69
+
70
+
71
if __name__ == "__main__":
    # check python version
    if sys.version_info < (3, 9):
        raise Exception("The Python version is less than 3.9, a version equal to or higher is required.")

    # fix for HEADLESS=false in Windows binary
    # https://stackoverflow.com/a/27694505
    if os.name == 'nt':
        import multiprocessing
        multiprocessing.freeze_support()

    # fix ssl certificates for compiled binaries
    # https://github.com/pyinstaller/pyinstaller/issues/7229
    # https://stackoverflow.com/q/55736855
    ca_bundle = certifi.where()
    os.environ["REQUESTS_CA_BUNDLE"] = ca_bundle
    os.environ["SSL_CERT_FILE"] = ca_bundle

    # validate configuration
    log_level = os.environ.get('LOG_LEVEL', 'info').upper()
    log_file = os.environ.get('LOG_FILE', None)
    log_html = utils.get_config_log_html()
    headless = utils.get_config_headless()
    server_host = os.environ.get('HOST', '0.0.0.0')
    server_port = int(os.environ.get('PORT', 8191))

    # configure logger: always log to stdout, and to LOG_FILE when it is set
    if log_level == 'DEBUG':
        logger_format = '%(asctime)s %(levelname)-8s ReqId %(thread)s %(message)s'
    else:
        logger_format = '%(asctime)s %(levelname)-8s %(message)s'
    log_handlers = [logging.StreamHandler(sys.stdout)]
    if log_file:
        log_file = os.path.realpath(log_file)
        # make sure the folder of the log file exists before opening it
        os.makedirs(os.path.dirname(log_file), exist_ok=True)
        log_handlers.append(logging.FileHandler(log_file))
    logging.basicConfig(
        format=logger_format,
        level=log_level,
        datefmt='%Y-%m-%d %H:%M:%S',
        handlers=log_handlers
    )

    # disable warning traces from urllib3
    logging.getLogger('urllib3').setLevel(logging.ERROR)
    logging.getLogger('selenium.webdriver.remote.remote_connection').setLevel(logging.WARNING)
    logging.getLogger('undetected_chromedriver').setLevel(logging.WARNING)

    logging.info(f'FlareSolverr {utils.get_flaresolverr_version()}')
    logging.debug('Debug log enabled')

    # Get current OS for global variable
    utils.get_current_platform()

    # test browser installation
    skip_browser_test = os.environ.get('SKIP_BROWSER_TEST', 'false').lower() == 'true'
    if skip_browser_test:
        logging.info("Skipping browser installation test for faster boot.")
    else:
        flaresolverr_service.test_browser_installation()

    # start bottle plugins
    # plugin order is important
    app.install(logger_plugin)
    app.install(error_plugin)
    prometheus_plugin.setup()
    app.install(prometheus_plugin.prometheus_plugin)

    # start webserver
    # default server 'wsgiref' does not support concurrent requests
    # https://github.com/FlareSolverr/FlareSolverr/issues/680
    # https://github.com/Pylons/waitress/issues/31
    class WaitressServerPoll(ServerAdapter):
        def run(self, handler):
            from waitress import serve
            serve(handler, host=self.host, port=self.port, asyncore_use_poll=True)

    run(app, host=server_host, port=server_port, quiet=True, server=WaitressServerPoll)
flaresolverr/flaresolverr_service.py ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import platform
3
+ import sys
4
+ import time
5
+ from datetime import timedelta
6
+ from html import escape
7
+ from urllib.parse import unquote, quote
8
+
9
+ from func_timeout import FunctionTimedOut, func_timeout
10
+ from selenium.common import TimeoutException
11
+ from selenium.webdriver.chrome.webdriver import WebDriver
12
+ from selenium.webdriver.common.by import By
13
+ from selenium.webdriver.common.keys import Keys
14
+ from selenium.webdriver.support.expected_conditions import (
15
+ presence_of_element_located, staleness_of, title_is)
16
+ from selenium.webdriver.common.action_chains import ActionChains
17
+ from selenium.webdriver.support.wait import WebDriverWait
18
+
19
+ import utils
20
+ from dtos import (STATUS_ERROR, STATUS_OK, ChallengeResolutionResultT,
21
+ ChallengeResolutionT, HealthResponse, IndexResponse,
22
+ V1RequestBase, V1ResponseBase)
23
+ from sessions import SessionsStorage
24
+
25
# Page-title prefixes that mean the request was blocked outright
# (matched with str.startswith in _evil_logic).
ACCESS_DENIED_TITLES = [
    # Cloudflare
    'Access denied',
    # Cloudflare http://bitturk.net/ Firefox
    'Attention Required! | Cloudflare'
]
# CSS selectors whose presence on the page also means the request was blocked.
ACCESS_DENIED_SELECTORS = [
    # Cloudflare
    'div.cf-error-title span.cf-code-label span',
    # Cloudflare http://bitturk.net/ Firefox
    '#cf-error-details div.cf-error-overview h1'
]
# Page titles that identify a challenge page (compared case-insensitively in _evil_logic).
CHALLENGE_TITLES = [
    # Cloudflare
    'Just a moment...',
    # DDoS-GUARD
    'DDoS-Guard'
]
# CSS selectors that identify a challenge page; the challenge is considered
# finished once none of them is present any more.
CHALLENGE_SELECTORS = [
    # Cloudflare
    '#cf-challenge-running', '.ray_id', '.attack-box', '#cf-please-wait', '#challenge-spinner', '#trk_jschal_js', '#turnstile-wrapper', '.lds-ring',
    # Custom CloudFlare for EbookParadijs, Film-Paleis, MuziekFabriek and Puur-Hollands
    'td.info #js_info',
    # Fairlane / pararius.com
    'div.vc div.text-box h2'
]

# CSS selectors used to detect a Turnstile widget (the hidden response input).
TURNSTILE_SELECTORS = [
    "input[name='cf-turnstile-response']"
]

# Timeout handed to every WebDriverWait in this module (WebDriverWait takes seconds).
SHORT_TIMEOUT = 1
# Module-wide registry of browser sessions shared by all request handlers.
SESSIONS_STORAGE = SessionsStorage()
58
+
59
+
60
def test_browser_installation():
    """Log the detected browser setup and abort the process if it is unusable.

    Exits with status 1 when no Chrome / Chromium executable is found or when
    its major version cannot be determined. Otherwise logs the platform, the
    executable path, the major version and the User-Agent reported by utils.
    """
    logging.info("Testing web browser installation...")
    logging.info("Platform: " + platform.platform())

    browser_path = utils.get_chrome_exe_path()
    if browser_path is not None:
        logging.info("Chrome / Chromium path: " + browser_path)
    else:
        logging.error("Chrome / Chromium web browser not installed!")
        sys.exit(1)

    major_version = utils.get_chrome_major_version()
    if major_version != '':
        logging.info("Chrome / Chromium major version: " + major_version)
    else:
        logging.error("Chrome / Chromium version not detected!")
        sys.exit(1)

    logging.info("Launching web browser...")
    # Resolving the User-Agent is the actual browser smoke test.
    detected_user_agent = utils.get_user_agent()
    logging.info("FlareSolverr User-Agent: " + detected_user_agent)
    logging.info("Test successful!")
82
+
83
+
84
def index_endpoint() -> IndexResponse:
    """Build the payload for the index route: ready message, version and User-Agent."""
    response = IndexResponse({})
    response.msg = "FlareSolverr is ready!"
    response.version = utils.get_flaresolverr_version()
    response.userAgent = utils.get_user_agent()
    return response
90
+
91
+
92
def health_endpoint() -> HealthResponse:
    """Build the health-check payload, which always reports STATUS_OK."""
    response = HealthResponse({})
    response.status = STATUS_OK
    return response
96
+
97
+
98
def controller_v1_endpoint(req: V1RequestBase) -> V1ResponseBase:
    """Handle a POST /v1 request and always return a response object.

    Any exception raised by the command handler is converted into an error
    response (status STATUS_ERROR, message prefixed with "Error: ") flagged
    with ``__error_500__``. Start/end timestamps (milliseconds) and the
    FlareSolverr version are stamped on every response.
    """
    started_ms = int(time.time() * 1000)
    logging.info(f"Incoming request => POST /v1 body: {utils.object_to_dict(req)}")

    response: V1ResponseBase
    try:
        response = _controller_v1_handler(req)
    except Exception as error:
        # Turn the failure into a regular response instead of propagating it.
        response = V1ResponseBase({})
        response.__error_500__ = True
        response.status = STATUS_ERROR
        response.message = "Error: " + str(error)
        logging.error(response.message)

    response.startTimestamp = started_ms
    response.endTimestamp = int(time.time() * 1000)
    response.version = utils.get_flaresolverr_version()

    logging.debug(f"Response => POST /v1 body: {utils.object_to_dict(response)}")
    logging.info(f"Response in {(response.endTimestamp - response.startTimestamp) / 1000} s")
    return response
117
+
118
+
119
+ def _controller_v1_handler(req: V1RequestBase) -> V1ResponseBase:
120
+ # do some validations
121
+ if req.cmd is None:
122
+ raise Exception("Request parameter 'cmd' is mandatory.")
123
+ if req.headers is not None:
124
+ logging.warning("Request parameter 'headers' was removed in FlareSolverr v2.")
125
+ if req.userAgent is not None:
126
+ logging.warning("Request parameter 'userAgent' was removed in FlareSolverr v2.")
127
+
128
+ # set default values
129
+ if req.maxTimeout is None or int(req.maxTimeout) < 1:
130
+ req.maxTimeout = 60000
131
+
132
+ # execute the command
133
+ res: V1ResponseBase
134
+ if req.cmd == 'sessions.create':
135
+ res = _cmd_sessions_create(req)
136
+ elif req.cmd == 'sessions.list':
137
+ res = _cmd_sessions_list(req)
138
+ elif req.cmd == 'sessions.destroy':
139
+ res = _cmd_sessions_destroy(req)
140
+ elif req.cmd == 'request.get':
141
+ res = _cmd_request_get(req)
142
+ elif req.cmd == 'request.post':
143
+ res = _cmd_request_post(req)
144
+ else:
145
+ raise Exception(f"Request parameter 'cmd' = '{req.cmd}' is invalid.")
146
+
147
+ return res
148
+
149
+
150
+ def _cmd_request_get(req: V1RequestBase) -> V1ResponseBase:
151
+ # do some validations
152
+ if req.url is None:
153
+ raise Exception("Request parameter 'url' is mandatory in 'request.get' command.")
154
+ if req.postData is not None:
155
+ raise Exception("Cannot use 'postBody' when sending a GET request.")
156
+ if req.returnRawHtml is not None:
157
+ logging.warning("Request parameter 'returnRawHtml' was removed in FlareSolverr v2.")
158
+ if req.download is not None:
159
+ logging.warning("Request parameter 'download' was removed in FlareSolverr v2.")
160
+
161
+ challenge_res = _resolve_challenge(req, 'GET')
162
+ res = V1ResponseBase({})
163
+ res.status = challenge_res.status
164
+ res.message = challenge_res.message
165
+ res.solution = challenge_res.result
166
+ return res
167
+
168
+
169
+ def _cmd_request_post(req: V1RequestBase) -> V1ResponseBase:
170
+ # do some validations
171
+ if req.postData is None:
172
+ raise Exception("Request parameter 'postData' is mandatory in 'request.post' command.")
173
+ if req.returnRawHtml is not None:
174
+ logging.warning("Request parameter 'returnRawHtml' was removed in FlareSolverr v2.")
175
+ if req.download is not None:
176
+ logging.warning("Request parameter 'download' was removed in FlareSolverr v2.")
177
+
178
+ challenge_res = _resolve_challenge(req, 'POST')
179
+ res = V1ResponseBase({})
180
+ res.status = challenge_res.status
181
+ res.message = challenge_res.message
182
+ res.solution = challenge_res.result
183
+ return res
184
+
185
+
186
def _cmd_sessions_create(req: V1RequestBase) -> V1ResponseBase:
    """Run the 'sessions.create' command.

    Asks the session storage for a session with the requested id and proxy and
    reports whether it was newly created or already existed.
    """
    logging.debug("Creating new session...")

    session, fresh = SESSIONS_STORAGE.create(session_id=req.session, proxy=req.proxy)
    session_id = session.session_id

    message = "Session created successfully." if fresh else "Session already exists."
    return V1ResponseBase({
        "status": STATUS_OK,
        "message": message,
        "session": session_id
    })
204
+
205
+
206
def _cmd_sessions_list(req: V1RequestBase) -> V1ResponseBase:
    """Run the 'sessions.list' command: return the ids of all stored sessions."""
    payload = {
        "status": STATUS_OK,
        "message": "",
        "sessions": SESSIONS_STORAGE.session_ids()
    }
    return V1ResponseBase(payload)
214
+
215
+
216
def _cmd_sessions_destroy(req: V1RequestBase) -> V1ResponseBase:
    """Run the 'sessions.destroy' command.

    Raises:
        Exception: if the storage reports that the session did not exist.
    """
    if not SESSIONS_STORAGE.destroy(req.session):
        raise Exception("The session doesn't exist.")

    return V1ResponseBase({
        "status": STATUS_OK,
        "message": "The session has been removed."
    })
227
+
228
+
229
def _resolve_challenge(req: V1RequestBase, method: str) -> ChallengeResolutionT:
    """Obtain a webdriver and run the challenge logic under a time limit.

    A session-bound request uses the session's driver (fetched from the
    session storage, honouring ``session_ttl_minutes``). Otherwise a one-off
    driver is created and destroyed again when the call finishes.

    Raises:
        Exception: on timeout (``req.maxTimeout`` milliseconds) or on any
            error raised while solving; the message is prefixed with
            "Error solving the challenge.".
    """
    timeout = int(req.maxTimeout) / 1000
    driver = None
    try:
        if not req.session:
            driver = utils.get_webdriver(req.proxy)
            logging.debug('New instance of webdriver has been created to perform the request')
        else:
            session_id = req.session
            ttl = None
            if req.session_ttl_minutes:
                ttl = timedelta(minutes=req.session_ttl_minutes)
            session, fresh = SESSIONS_STORAGE.get(session_id, ttl)

            if not fresh:
                logging.debug(f"existing session is used to perform the request (session_id={session_id}, "
                              f"lifetime={str(session.lifetime())}, ttl={str(ttl)})")
            else:
                logging.debug(f"new session created to perform the request (session_id={session_id})")

            driver = session.driver
        return func_timeout(timeout, _evil_logic, (req, driver, method))
    except FunctionTimedOut:
        raise Exception(f'Error solving the challenge. Timeout after {timeout} seconds.')
    except Exception as e:
        # Keep the message on a single line by escaping embedded newlines.
        raise Exception('Error solving the challenge. ' + str(e).replace('\n', '\\n'))
    finally:
        # Only one-off drivers are torn down here; session drivers stay alive.
        if driver is not None and not req.session:
            if utils.PLATFORM_VERSION == "nt":
                driver.close()
            driver.quit()
            logging.debug('A used instance of webdriver has been destroyed')
259
+
260
+
261
def click_verify(driver: WebDriver, num_tabs: int = 1):
    """Try to activate the Cloudflare verification control in two ways.

    First sends ``num_tabs`` TAB key presses followed by SPACE (keyboard
    route), then looks for a 'Verify you are human' button and clicks it.
    Failures of either attempt are logged at debug level and ignored.
    The function always sleeps 2 seconds before returning.

    Args:
        driver: webdriver showing the challenge page.
        num_tabs: number of TAB presses sent before pressing SPACE.
    """
    try:
        logging.debug("Try to find the Cloudflare verify checkbox...")
        actions = ActionChains(driver)
        # initial pause before sending any key
        actions.pause(5)
        for _ in range(num_tabs):
            actions.send_keys(Keys.TAB).pause(0.1)
        actions.pause(1)
        actions.send_keys(Keys.SPACE).perform()

        logging.debug(f"Cloudflare verify checkbox clicked after {num_tabs} tabs!")
    except Exception:
        logging.debug("Cloudflare verify checkbox not found on the page.")
    finally:
        # always return to the top-level document
        driver.switch_to.default_content()

    try:
        logging.debug("Try to find the Cloudflare 'Verify you are human' button...")
        button = driver.find_element(
            by=By.XPATH,
            value="//input[@type='button' and @value='Verify you are human']",
        )
        if button:
            actions = ActionChains(driver)
            actions.move_to_element_with_offset(button, 5, 7)
            actions.click(button)
            actions.perform()
            logging.debug("The Cloudflare 'Verify you are human' button found and clicked!")
    except Exception:
        logging.debug("The Cloudflare 'Verify you are human' button not found on the page.")

    # fixed delay after the click attempts
    time.sleep(2)
293
+
294
def _get_turnstile_token(driver: WebDriver, tabs: int):
    """Click the Turnstile widget until its response input holds a new token.

    Loops without an own exit condition other than success: each round calls
    ``click_verify`` and re-reads the ``cf-turnstile-response`` input; a
    non-empty value that differs from the initial one is returned.
    """
    response_field = driver.find_element(By.CSS_SELECTOR, "input[name='cf-turnstile-response']")
    initial_value = response_field.get_attribute("value")
    while True:
        click_verify(driver, num_tabs=tabs)
        candidate = response_field.get_attribute("value")
        if candidate and candidate != initial_value:
            logging.info(f"Turnstile token: {candidate}")
            return candidate
        logging.debug("Failed to extract token possibly click failed")

        # reset focus
        driver.execute_script("""
            let el = document.createElement('button');
            el.style.position='fixed';
            el.style.top='0';
            el.style.left='0';
            document.body.prepend(el);
            el.focus();
        """)
        time.sleep(1)
316
+
317
+ def _resolve_turnstile_captcha(req: V1RequestBase, driver: WebDriver):
318
+ turnstile_token = None
319
+ if req.tabs_till_verify is not None:
320
+ logging.debug(f'Navigating to... {req.url} in order to pass the turnstile challenge')
321
+ driver.get(req.url)
322
+
323
+ turnstile_challenge_found = False
324
+ for selector in TURNSTILE_SELECTORS:
325
+ found_elements = driver.find_elements(By.CSS_SELECTOR, selector)
326
+ if len(found_elements) > 0:
327
+ turnstile_challenge_found = True
328
+ logging.info("Turnstile challenge detected. Selector found: " + selector)
329
+ break
330
+ if turnstile_challenge_found:
331
+ turnstile_token = _get_turnstile_token(driver=driver, tabs=req.tabs_till_verify)
332
+ else:
333
+ logging.debug(f'Turnstile challenge not found')
334
+ return turnstile_token
335
+
336
def _evil_logic(req: V1RequestBase, driver: WebDriver, method: str) -> ChallengeResolutionT:
    """Load the requested page, wait out any detected challenge and collect the result.

    Steps, in order: optionally block media resources through CDP, navigate
    (GET, POST or the Turnstile route), apply request cookies and reload,
    fail fast on access-denied markers, detect a challenge by title or
    selector, poll until the challenge markers disappear, then build the
    result from the driver state.

    This function has no overall time limit of its own; ``_resolve_challenge``
    runs it through ``func_timeout``.

    Args:
        req: the incoming request (url, cookies, flags such as disableMedia,
            returnOnlyCookies, waitInSeconds, returnScreenshot, tabs_till_verify).
        driver: webdriver used to perform the navigation.
        method: "POST" selects the form-submission route; anything else is a GET.

    Returns:
        A ChallengeResolutionT with status STATUS_OK, a message telling whether
        a challenge was detected, and the collected result.

    Raises:
        Exception: when an access-denied title or selector is found.
    """
    res = ChallengeResolutionT({})
    res.status = STATUS_OK
    res.message = ""

    # optionally block resources like images/css/fonts using CDP
    # (the per-request flag, when given, overrides the configured default)
    disable_media = utils.get_config_disable_media()
    if req.disableMedia is not None:
        disable_media = req.disableMedia
    if disable_media:
        block_urls = [
            # Images
            "*.png", "*.jpg", "*.jpeg", "*.gif", "*.webp", "*.bmp", "*.svg", "*.ico",
            "*.PNG", "*.JPG", "*.JPEG", "*.GIF", "*.WEBP", "*.BMP", "*.SVG", "*.ICO",
            "*.tiff", "*.tif", "*.jpe", "*.apng", "*.avif", "*.heic", "*.heif",
            "*.TIFF", "*.TIF", "*.JPE", "*.APNG", "*.AVIF", "*.HEIC", "*.HEIF",
            # Stylesheets
            "*.css",
            "*.CSS",
            # Fonts
            "*.woff", "*.woff2", "*.ttf", "*.otf", "*.eot",
            "*.WOFF", "*.WOFF2", "*.TTF", "*.OTF", "*.EOT"
        ]
        try:
            logging.debug("Network.setBlockedURLs: %s", block_urls)
            driver.execute_cdp_cmd("Network.enable", {})
            driver.execute_cdp_cmd("Network.setBlockedURLs", {"urls": block_urls})
        except Exception:
            # if CDP commands are not available or fail, ignore and continue
            logging.debug("Network.setBlockedURLs failed or unsupported on this webdriver")

    # navigate to the page
    logging.debug(f"Navigating to... {req.url}")
    turnstile_token = None

    if method == "POST":
        _post_request(req, driver)
    else:
        if req.tabs_till_verify is None:
            driver.get(req.url)
        else:
            # the Turnstile route performs the navigation itself
            turnstile_token = _resolve_turnstile_captcha(req, driver)

    # set cookies if required
    if req.cookies is not None and len(req.cookies) > 0:
        logging.debug(f'Setting cookies...')
        for cookie in req.cookies:
            # replace any existing cookie of the same name
            driver.delete_cookie(cookie['name'])
            driver.add_cookie(cookie)
        # reload the page
        if method == 'POST':
            _post_request(req, driver)
        else:
            driver.get(req.url)

    # wait for the page
    if utils.get_config_log_html():
        logging.debug(f"Response HTML:\n{driver.page_source}")
    # remembered so that a later page replacement can be detected via staleness
    html_element = driver.find_element(By.TAG_NAME, "html")
    page_title = driver.title

    # find access denied titles
    for title in ACCESS_DENIED_TITLES:
        if page_title.startswith(title):
            raise Exception('Cloudflare has blocked this request. '
                            'Probably your IP is banned for this site, check in your web browser.')
    # find access denied selectors
    for selector in ACCESS_DENIED_SELECTORS:
        found_elements = driver.find_elements(By.CSS_SELECTOR, selector)
        if len(found_elements) > 0:
            raise Exception('Cloudflare has blocked this request. '
                            'Probably your IP is banned for this site, check in your web browser.')

    # find challenge by title
    challenge_found = False
    for title in CHALLENGE_TITLES:
        if title.lower() == page_title.lower():
            challenge_found = True
            logging.info("Challenge detected. Title found: " + page_title)
            break
    if not challenge_found:
        # find challenge by selectors
        for selector in CHALLENGE_SELECTORS:
            found_elements = driver.find_elements(By.CSS_SELECTOR, selector)
            if len(found_elements) > 0:
                challenge_found = True
                logging.info("Challenge detected. Selector found: " + selector)
                break

    attempt = 0
    if challenge_found:
        # Repeat until one full pass sees no challenge title and no challenge
        # selector; every timed-out pass triggers a click attempt.
        while True:
            try:
                attempt = attempt + 1
                # wait until the title changes
                for title in CHALLENGE_TITLES:
                    logging.debug("Waiting for title (attempt " + str(attempt) + "): " + title)
                    WebDriverWait(driver, SHORT_TIMEOUT).until_not(title_is(title))

                # then wait until all the selectors disappear
                for selector in CHALLENGE_SELECTORS:
                    logging.debug("Waiting for selector (attempt " + str(attempt) + "): " + selector)
                    WebDriverWait(driver, SHORT_TIMEOUT).until_not(
                        presence_of_element_located((By.CSS_SELECTOR, selector)))

                # all elements not found
                break

            except TimeoutException:
                logging.debug("Timeout waiting for selector")

                click_verify(driver)

                # update the html (cloudflare reloads the page every 5 s)
                html_element = driver.find_element(By.TAG_NAME, "html")

        # waits until cloudflare redirection ends
        logging.debug("Waiting for redirect")
        # noinspection PyBroadException
        try:
            WebDriverWait(driver, SHORT_TIMEOUT).until(staleness_of(html_element))
        except Exception:
            logging.debug("Timeout waiting for redirect")

        logging.info("Challenge solved!")
        res.message = "Challenge solved!"
    else:
        logging.info("Challenge not detected!")
        res.message = "Challenge not detected!"

    challenge_res = ChallengeResolutionResultT({})
    challenge_res.url = driver.current_url
    challenge_res.status = 200  # todo: fix, selenium not provides this info
    challenge_res.cookies = driver.get_cookies()
    challenge_res.userAgent = utils.get_user_agent(driver)
    challenge_res.turnstile_token = turnstile_token

    # headers and page source are only filled in when more than cookies is requested
    if not req.returnOnlyCookies:
        challenge_res.headers = {}  # todo: fix, selenium not provides this info

        if req.waitInSeconds and req.waitInSeconds > 0:
            logging.info("Waiting " + str(req.waitInSeconds) + " seconds before returning the response...")
            time.sleep(req.waitInSeconds)

        challenge_res.response = driver.page_source

    if req.returnScreenshot:
        challenge_res.screenshot = driver.get_screenshot_as_base64()

    res.result = challenge_res
    return res
487
+
488
+
489
+ def _post_request(req: V1RequestBase, driver: WebDriver):
490
+ post_form = f'<form id="hackForm" action="{req.url}" method="POST">'
491
+ query_string = req.postData if req.postData and req.postData[0] != '?' else req.postData[1:] if req.postData else ''
492
+ pairs = query_string.split('&')
493
+ for pair in pairs:
494
+ parts = pair.split('=', 1)
495
+ # noinspection PyBroadException
496
+ try:
497
+ name = unquote(parts[0])
498
+ except Exception:
499
+ name = parts[0]
500
+ if name == 'submit':
501
+ continue
502
+ # noinspection PyBroadException
503
+ try:
504
+ value = unquote(parts[1]) if len(parts) > 1 else ''
505
+ except Exception:
506
+ value = parts[1] if len(parts) > 1 else ''
507
+ # Protection of " character, for syntax
508
+ value=value.replace('"','&quot;')
509
+ post_form += f'<input type="text" name="{escape(quote(name))}" value="{escape(quote(value))}"><br>'
510
+ post_form += '</form>'
511
+ html_content = f"""
512
+ <!DOCTYPE html>
513
+ <html>
514
+ <body>
515
+ {post_form}
516
+ <script>document.getElementById('hackForm').submit();</script>
517
+ </body>
518
+ </html>"""
519
+ driver.get("data:text/html;charset=utf-8,{html_content}".format(html_content=html_content))
flaresolverr/metrics.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ from prometheus_client import Counter, Histogram, start_http_server
4
+ import time
5
+
6
# Counter of handled requests, labelled by target domain and result.
REQUEST_COUNTER = Counter(
    name='flaresolverr_request',
    documentation='Total requests with result',
    labelnames=['domain', 'result']
)
# Histogram of request durations, labelled by target domain.
# NOTE(review): bucket bounds are presumably seconds, matching the documentation string.
REQUEST_DURATION = Histogram(
    name='flaresolverr_request_duration',
    documentation='Request duration in seconds',
    labelnames=['domain'],
    buckets=[0, 10, 25, 50]
)
17
+
18
+
19
def serve(port):
    """Start the Prometheus HTTP exporter on ``port`` and then block forever.

    Never returns: after starting the exporter it sleeps in 600-second
    intervals, so it is meant to be run in its own thread.
    """
    start_http_server(port=port)
    idle_seconds = 600
    while True:
        time.sleep(idle_seconds)
23
+
24
+
25
def start_metrics_http_server(prometheus_port: int):
    """Launch the Prometheus exporter in a daemon thread running ``serve``."""
    from threading import Thread

    logging.info(f"Serving Prometheus exporter on http://0.0.0.0:{prometheus_port}/metrics")
    exporter_thread = Thread(
        target=serve,
        kwargs={"port": prometheus_port},
        daemon=True,
    )
    exporter_thread.start()
flaresolverr/sessions.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from dataclasses import dataclass
3
+ from datetime import datetime, timedelta
4
+ from typing import Optional, Tuple
5
+ from uuid import uuid1
6
+
7
+ from selenium.webdriver.chrome.webdriver import WebDriver
8
+
9
+ import utils
10
+
11
+
12
@dataclass
class Session:
    """A browser session: its id, its webdriver and its creation time."""

    session_id: str
    driver: WebDriver
    created_at: datetime

    def lifetime(self) -> timedelta:
        """Return how long ago the session was created."""
        now = datetime.now()
        return now - self.created_at
20
+
21
+
22
class SessionsStorage:
    """Creates, stores and disposes of browser sessions keyed by session id."""

    def __init__(self):
        # session_id -> Session
        self.sessions = {}

    def create(self, session_id: Optional[str] = None, proxy: Optional[dict] = None,
               force_new: Optional[bool] = False) -> Tuple[Session, bool]:
        """Return the session for ``session_id``, creating it if necessary.

        The call is idempotent: when ``session_id`` is already stored, no new
        webdriver is created and the existing session is returned.

        Args:
            session_id: id to use; a uuid1-based id is generated when falsy.
            proxy: passed to ``utils.get_webdriver`` when a driver is created.
            force_new: destroy any existing session with this id first.

        Returns:
            ``(session, fresh)`` where ``fresh`` is True if a new session was
            created and False if an existing one was reused.
        """
        session_id = session_id or str(uuid1())

        if force_new:
            self.destroy(session_id)

        if self.exists(session_id):
            return self.sessions[session_id], False

        driver = utils.get_webdriver(proxy)
        created_at = datetime.now()
        session = Session(session_id, driver, created_at)

        self.sessions[session_id] = session

        return session, True

    def exists(self, session_id: str) -> bool:
        """Return True if ``session_id`` is currently stored."""
        return session_id in self.sessions

    def destroy(self, session_id: str) -> bool:
        """Remove the session from the storage and shut its driver down.

        Returns:
            True if the session was found and destroyed, False (and nothing
            else happens) if ``session_id`` is not stored.
        """
        if not self.exists(session_id):
            return False

        session = self.sessions.pop(session_id)
        try:
            if utils.PLATFORM_VERSION == "nt":
                session.driver.close()
        finally:
            # The session is already gone from the storage, so quit() must run
            # even if close() raised; otherwise the browser would be orphaned.
            session.driver.quit()
        return True

    def get(self, session_id: str, ttl: Optional[timedelta] = None) -> Tuple[Session, bool]:
        """Return the session for ``session_id``, recreating it when older than ``ttl``.

        NOTE(review): no proxy is forwarded to ``create`` here, so a session
        that is implicitly created or recreated by this method gets
        ``proxy=None``.

        Returns:
            ``(session, fresh)`` with the same meaning as in ``create``.
        """
        session, fresh = self.create(session_id)

        if ttl is not None and not fresh and session.lifetime() > ttl:
            logging.debug(f'session\'s lifetime has expired, so the session is recreated (session_id={session_id})')
            session, fresh = self.create(session_id, force_new=True)

        return session, fresh

    def session_ids(self) -> list[str]:
        """Return the ids of all stored sessions."""
        return list(self.sessions.keys())
flaresolverr/tests.py ADDED
@@ -0,0 +1,655 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from typing import Optional
3
+
4
+ from webtest import TestApp
5
+
6
+ from dtos import IndexResponse, HealthResponse, V1ResponseBase, STATUS_OK, STATUS_ERROR
7
+ import flaresolverr
8
+ import utils
9
+
10
+
11
+ def _find_obj_by_key(key: str, value: str, _list: list) -> Optional[dict]:
12
+ for obj in _list:
13
+ if obj[key] == value:
14
+ return obj
15
+ return None
16
+
17
+
18
+ class TestFlareSolverr(unittest.TestCase):
19
+
20
+ proxy_url = "http://127.0.0.1:8888"
21
+ proxy_socks_url = "socks5://127.0.0.1:1080"
22
+ google_url = "https://www.google.com"
23
+ post_url = "https://httpbin.org/post"
24
+ cloudflare_url = "https://nowsecure.nl/"
25
+ cloudflare_url_2 = "https://idope.se/torrent-list/harry/"
26
+ ddos_guard_url = "https://www.litres.ru/"
27
+ fairlane_url = "https://www.pararius.com/apartments/amsterdam"
28
+ custom_cloudflare_url = "https://www.muziekfabriek.org/"
29
+ cloudflare_blocked_url = "https://cpasbiens3.fr/index.php?do=search&subaction=search"
30
+
31
+ app = TestApp(flaresolverr.app)
32
+ # wait until the server is ready
33
+ app.get('/')
34
+
35
+ def test_wrong_endpoint(self):
36
+ res = self.app.get('/wrong', status=404)
37
+ self.assertEqual(res.status_code, 404)
38
+
39
+ body = res.json
40
+ self.assertEqual("Not found: '/wrong'", body['error'])
41
+ self.assertEqual(404, body['status_code'])
42
+
43
+ def test_index_endpoint(self):
44
+ res = self.app.get('/')
45
+ self.assertEqual(res.status_code, 200)
46
+
47
+ body = IndexResponse(res.json)
48
+ self.assertEqual("FlareSolverr is ready!", body.msg)
49
+ self.assertEqual(utils.get_flaresolverr_version(), body.version)
50
+ self.assertIn("Chrome/", body.userAgent)
51
+
52
+ def test_health_endpoint(self):
53
+ res = self.app.get('/health')
54
+ self.assertEqual(res.status_code, 200)
55
+
56
+ body = HealthResponse(res.json)
57
+ self.assertEqual(STATUS_OK, body.status)
58
+
59
+ def test_v1_endpoint_wrong_cmd(self):
60
+ res = self.app.post_json('/v1', {
61
+ "cmd": "request.bad",
62
+ "url": self.google_url
63
+ }, status=500)
64
+ self.assertEqual(res.status_code, 500)
65
+
66
+ body = V1ResponseBase(res.json)
67
+ self.assertEqual(STATUS_ERROR, body.status)
68
+ self.assertEqual("Error: Request parameter 'cmd' = 'request.bad' is invalid.", body.message)
69
+ self.assertGreater(body.startTimestamp, 10000)
70
+ self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
71
+ self.assertEqual(utils.get_flaresolverr_version(), body.version)
72
+
73
+ def test_v1_endpoint_request_get_no_cloudflare(self):
74
+ res = self.app.post_json('/v1', {
75
+ "cmd": "request.get",
76
+ "url": self.google_url
77
+ })
78
+ self.assertEqual(res.status_code, 200)
79
+
80
+ body = V1ResponseBase(res.json)
81
+ self.assertEqual(STATUS_OK, body.status)
82
+ self.assertEqual("Challenge not detected!", body.message)
83
+ self.assertGreater(body.startTimestamp, 10000)
84
+ self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
85
+ self.assertEqual(utils.get_flaresolverr_version(), body.version)
86
+
87
+ solution = body.solution
88
+ self.assertIn(self.google_url, solution.url)
89
+ self.assertEqual(solution.status, 200)
90
+ self.assertIs(len(solution.headers), 0)
91
+ self.assertIn("<title>Google</title>", solution.response)
92
+ self.assertGreater(len(solution.cookies), 0)
93
+ self.assertIn("Chrome/", solution.userAgent)
94
+
95
+ def test_v1_endpoint_request_get_disable_resources(self):
96
+ res = self.app.post_json("/v1", {
97
+ "cmd": "request.get",
98
+ "url": self.google_url,
99
+ "disableMedia": True
100
+ })
101
+ self.assertEqual(res.status_code, 200)
102
+
103
+ body = V1ResponseBase(res.json)
104
+ self.assertEqual(STATUS_OK, body.status)
105
+ self.assertEqual("Challenge not detected!", body.message)
106
+ self.assertGreater(body.startTimestamp, 10000)
107
+ self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
108
+ self.assertEqual(utils.get_flaresolverr_version(), body.version)
109
+
110
+ solution = body.solution
111
+ self.assertIn(self.google_url, solution.url)
112
+ self.assertEqual(solution.status, 200)
113
+ self.assertIs(len(solution.headers), 0)
114
+ self.assertIn("<title>Google</title>", solution.response)
115
+ self.assertGreater(len(solution.cookies), 0)
116
+ self.assertIn("Chrome/", solution.userAgent)
117
+
118
+ def test_v1_endpoint_request_get_cloudflare_js_1(self):
119
+ res = self.app.post_json('/v1', {
120
+ "cmd": "request.get",
121
+ "url": self.cloudflare_url
122
+ })
123
+ self.assertEqual(res.status_code, 200)
124
+
125
+ body = V1ResponseBase(res.json)
126
+ self.assertEqual(STATUS_OK, body.status)
127
+ self.assertEqual("Challenge solved!", body.message)
128
+ self.assertGreater(body.startTimestamp, 10000)
129
+ self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
130
+ self.assertEqual(utils.get_flaresolverr_version(), body.version)
131
+
132
+ solution = body.solution
133
+ self.assertIn(self.cloudflare_url, solution.url)
134
+ self.assertEqual(solution.status, 200)
135
+ self.assertIs(len(solution.headers), 0)
136
+ self.assertIn("<title>nowSecure</title>", solution.response)
137
+ self.assertGreater(len(solution.cookies), 0)
138
+ self.assertIn("Chrome/", solution.userAgent)
139
+
140
+ cf_cookie = _find_obj_by_key("name", "cf_clearance", solution.cookies)
141
+ self.assertIsNotNone(cf_cookie, "Cloudflare cookie not found")
142
+ self.assertGreater(len(cf_cookie["value"]), 30)
143
+
144
def test_v1_endpoint_request_get_cloudflare_js_2(self):
    """'request.get' against ``self.cloudflare_url_2`` must solve the challenge.

    Verifies the response envelope, the solution payload and that a
    ``cf_clearance`` cookie longer than 30 characters is returned.
    """
    res = self.app.post_json('/v1', {
        "cmd": "request.get",
        "url": self.cloudflare_url_2
    })
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge solved!", body.message)
    self.assertGreater(body.startTimestamp, 10000)
    self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), body.version)

    solution = body.solution
    self.assertIn(self.cloudflare_url_2, solution.url)
    self.assertEqual(solution.status, 200)
    # Compare by value: assertIs on an int only passes because CPython
    # caches small integers, which is an implementation detail.
    self.assertEqual(len(solution.headers), 0)
    self.assertIn("<title>harry - idope torrent search</title>", solution.response)
    self.assertGreater(len(solution.cookies), 0)
    self.assertIn("Chrome/", solution.userAgent)

    cf_cookie = _find_obj_by_key("name", "cf_clearance", solution.cookies)
    self.assertIsNotNone(cf_cookie, "Cloudflare cookie not found")
    self.assertGreater(len(cf_cookie["value"]), 30)
169
+
170
def test_v1_endpoint_request_get_ddos_guard_js(self):
    """'request.get' against ``self.ddos_guard_url`` must solve the challenge.

    Verifies the response envelope, the solution payload and that a
    ``__ddg1_`` cookie longer than 10 characters is returned.
    """
    res = self.app.post_json('/v1', {
        "cmd": "request.get",
        "url": self.ddos_guard_url
    })
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge solved!", body.message)
    self.assertGreater(body.startTimestamp, 10000)
    self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), body.version)

    solution = body.solution
    self.assertIn(self.ddos_guard_url, solution.url)
    self.assertEqual(solution.status, 200)
    # Compare by value: assertIs on an int only passes because CPython
    # caches small integers, which is an implementation detail.
    self.assertEqual(len(solution.headers), 0)
    self.assertIn("<title>Литрес", solution.response)
    self.assertGreater(len(solution.cookies), 0)
    self.assertIn("Chrome/", solution.userAgent)

    # Named after what it holds: this is the DDoS-Guard cookie, not Cloudflare's.
    ddg_cookie = _find_obj_by_key("name", "__ddg1_", solution.cookies)
    self.assertIsNotNone(ddg_cookie, "DDOS-Guard cookie not found")
    self.assertGreater(len(ddg_cookie["value"]), 10)
195
+
196
def test_v1_endpoint_request_get_fairlane_js(self):
    """'request.get' against ``self.fairlane_url`` must solve the challenge.

    Verifies the response envelope, the solution payload and that a
    ``fl_pass_v2_b`` cookie longer than 50 characters is returned.
    """
    res = self.app.post_json('/v1', {
        "cmd": "request.get",
        "url": self.fairlane_url
    })
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge solved!", body.message)
    self.assertGreater(body.startTimestamp, 10000)
    self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), body.version)

    solution = body.solution
    self.assertIn(self.fairlane_url, solution.url)
    self.assertEqual(solution.status, 200)
    # Compare by value: assertIs on an int only passes because CPython
    # caches small integers, which is an implementation detail.
    self.assertEqual(len(solution.headers), 0)
    self.assertIn("<title>Rental Apartments Amsterdam</title>", solution.response)
    self.assertGreater(len(solution.cookies), 0)
    self.assertIn("Chrome/", solution.userAgent)

    # Named after what it holds: this is the Fairlane cookie, not Cloudflare's.
    fairlane_cookie = _find_obj_by_key("name", "fl_pass_v2_b", solution.cookies)
    self.assertIsNotNone(fairlane_cookie, "Fairlane cookie not found")
    self.assertGreater(len(fairlane_cookie["value"]), 50)
221
+
222
def test_v1_endpoint_request_get_custom_cloudflare_js(self):
    """'request.get' against ``self.custom_cloudflare_url`` must solve the challenge.

    Verifies the response envelope, the solution payload and that a
    ``ct_anti_ddos_key`` cookie longer than 10 characters is returned.
    """
    res = self.app.post_json('/v1', {
        "cmd": "request.get",
        "url": self.custom_cloudflare_url
    })
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge solved!", body.message)
    self.assertGreater(body.startTimestamp, 10000)
    self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), body.version)

    solution = body.solution
    self.assertIn(self.custom_cloudflare_url, solution.url)
    self.assertEqual(solution.status, 200)
    # Compare by value: assertIs on an int only passes because CPython
    # caches small integers, which is an implementation detail.
    self.assertEqual(len(solution.headers), 0)
    self.assertIn("<title>MuziekFabriek : Aanmelden</title>", solution.response)
    self.assertGreater(len(solution.cookies), 0)
    self.assertIn("Chrome/", solution.userAgent)

    anti_ddos_cookie = _find_obj_by_key("name", "ct_anti_ddos_key", solution.cookies)
    self.assertIsNotNone(anti_ddos_cookie, "Custom Cloudflare cookie not found")
    self.assertGreater(len(anti_ddos_cookie["value"]), 10)
247
+
248
+ # todo: test Cmd 'request.get' should return fail with Cloudflare CAPTCHA
249
+
250
def test_v1_endpoint_request_get_cloudflare_blocked(self):
    """'request.get' on ``self.cloudflare_blocked_url`` must fail with HTTP 500.

    The error envelope must carry the "Cloudflare has blocked this request"
    message together with valid timestamps and the service version.
    """
    payload = {
        "cmd": "request.get",
        "url": self.cloudflare_blocked_url
    }
    response = self.app.post_json('/v1', payload, status=500)
    self.assertEqual(response.status_code, 500)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_ERROR, parsed.status)
    expected_message = (
        "Error: Error solving the challenge. Cloudflare has blocked this request. "
        "Probably your IP is banned for this site, check in your web browser."
    )
    self.assertEqual(expected_message, parsed.message)
    self.assertGreater(parsed.startTimestamp, 10000)
    self.assertGreaterEqual(parsed.endTimestamp, parsed.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), parsed.version)
264
+
265
def test_v1_endpoint_request_get_cookies_param(self):
    """Cookies passed in the 'cookies' parameter must come back in the solution.

    Sends two user cookies with a 'request.get' to ``self.google_url`` and
    checks that both are present, with their values, among the returned cookies.
    """
    res = self.app.post_json('/v1', {
        "cmd": "request.get",
        "url": self.google_url,
        "cookies": [
            {
                "name": "testcookie1",
                "value": "testvalue1"
            },
            {
                "name": "testcookie2",
                "value": "testvalue2"
            }
        ]
    })
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge not detected!", body.message)
    self.assertGreater(body.startTimestamp, 10000)
    self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), body.version)

    solution = body.solution
    self.assertIn(self.google_url, solution.url)
    self.assertEqual(solution.status, 200)
    # Compare by value: assertIs on an int only passes because CPython
    # caches small integers, which is an implementation detail.
    self.assertEqual(len(solution.headers), 0)
    self.assertIn("<title>Google</title>", solution.response)
    self.assertGreater(len(solution.cookies), 1)
    self.assertIn("Chrome/", solution.userAgent)

    user_cookie1 = _find_obj_by_key("name", "testcookie1", solution.cookies)
    self.assertIsNotNone(user_cookie1, "User cookie 1 not found")
    self.assertEqual("testvalue1", user_cookie1["value"])

    user_cookie2 = _find_obj_by_key("name", "testcookie2", solution.cookies)
    self.assertIsNotNone(user_cookie2, "User cookie 2 not found")
    self.assertEqual("testvalue2", user_cookie2["value"])
304
+
305
def test_v1_endpoint_request_get_returnOnlyCookies_param(self):
    """With 'returnOnlyCookies' set, headers and response body must be ``None``.

    Cookies and the user agent are still expected in the solution.
    """
    payload = {
        "cmd": "request.get",
        "url": self.google_url,
        "returnOnlyCookies": True
    }
    response = self.app.post_json('/v1', payload)
    self.assertEqual(response.status_code, 200)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_OK, parsed.status)
    self.assertEqual("Challenge not detected!", parsed.message)
    self.assertGreater(parsed.startTimestamp, 10000)
    self.assertGreaterEqual(parsed.endTimestamp, parsed.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), parsed.version)

    result = parsed.solution
    self.assertIn(self.google_url, result.url)
    self.assertEqual(result.status, 200)
    self.assertIsNone(result.headers)
    self.assertIsNone(result.response)
    self.assertGreater(len(result.cookies), 0)
    self.assertIn("Chrome/", result.userAgent)
327
+
328
def test_v1_endpoint_request_get_proxy_http_param(self):
    """'request.get' through the HTTP proxy at ``self.proxy_url`` must succeed.

    To configure TinyProxy in local:
       * sudo vim /etc/tinyproxy/tinyproxy.conf
          * edit => LogFile "/tmp/tinyproxy.log"
          * edit => Syslog Off
       * sudo tinyproxy -d
       * sudo tail -f /tmp/tinyproxy.log
    """
    res = self.app.post_json('/v1', {
        "cmd": "request.get",
        "url": self.google_url,
        "proxy": {
            "url": self.proxy_url
        }
    })
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge not detected!", body.message)
    self.assertGreater(body.startTimestamp, 10000)
    self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), body.version)

    solution = body.solution
    self.assertIn(self.google_url, solution.url)
    self.assertEqual(solution.status, 200)
    # Compare by value: assertIs on an int only passes because CPython
    # caches small integers, which is an implementation detail.
    self.assertEqual(len(solution.headers), 0)
    self.assertIn("<title>Google</title>", solution.response)
    self.assertGreater(len(solution.cookies), 0)
    self.assertIn("Chrome/", solution.userAgent)
360
+
361
def test_v1_endpoint_request_get_proxy_http_param_with_credentials(self):
    """'request.get' through an authenticated HTTP proxy must succeed.

    To configure TinyProxy in local:
       * sudo vim /etc/tinyproxy/tinyproxy.conf
          * edit => LogFile "/tmp/tinyproxy.log"
          * edit => Syslog Off
          * add => BasicAuth testuser testpass
       * sudo tinyproxy -d
       * sudo tail -f /tmp/tinyproxy.log
    """
    res = self.app.post_json('/v1', {
        "cmd": "request.get",
        "url": self.google_url,
        "proxy": {
            "url": self.proxy_url,
            "username": "testuser",
            "password": "testpass"
        }
    })
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge not detected!", body.message)
    self.assertGreater(body.startTimestamp, 10000)
    self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), body.version)

    solution = body.solution
    self.assertIn(self.google_url, solution.url)
    self.assertEqual(solution.status, 200)
    # Compare by value: assertIs on an int only passes because CPython
    # caches small integers, which is an implementation detail.
    self.assertEqual(len(solution.headers), 0)
    self.assertIn("<title>Google</title>", solution.response)
    self.assertGreater(len(solution.cookies), 0)
    self.assertIn("Chrome/", solution.userAgent)
396
+
397
def test_v1_endpoint_request_get_proxy_socks_param(self):
    """'request.get' through the SOCKS proxy at ``self.proxy_socks_url`` must succeed.

    To configure Dante in local:
       * https://linuxhint.com/set-up-a-socks5-proxy-on-ubuntu-with-dante/
       * sudo vim /etc/sockd.conf
       * sudo systemctl restart sockd.service
       * curl --socks5 socks5://127.0.0.1:1080 https://www.google.com
    """
    res = self.app.post_json('/v1', {
        "cmd": "request.get",
        "url": self.google_url,
        "proxy": {
            "url": self.proxy_socks_url
        }
    })
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge not detected!", body.message)
    self.assertGreater(body.startTimestamp, 10000)
    self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), body.version)

    solution = body.solution
    self.assertIn(self.google_url, solution.url)
    self.assertEqual(solution.status, 200)
    # Compare by value: assertIs on an int only passes because CPython
    # caches small integers, which is an implementation detail.
    self.assertEqual(len(solution.headers), 0)
    self.assertIn("<title>Google</title>", solution.response)
    self.assertGreater(len(solution.cookies), 0)
    self.assertIn("Chrome/", solution.userAgent)
428
+
429
def test_v1_endpoint_request_get_proxy_wrong_param(self):
    """A proxy URL that cannot be connected to must yield HTTP 500.

    The error message must mention ``net::ERR_PROXY_CONNECTION_FAILED``.
    """
    payload = {
        "cmd": "request.get",
        "url": self.google_url,
        "proxy": {
            "url": "http://127.0.0.1:43210"
        }
    }
    response = self.app.post_json('/v1', payload, status=500)
    self.assertEqual(response.status_code, 500)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_ERROR, parsed.status)
    expected_fragment = ("Error: Error solving the challenge. "
                         "Message: unknown error: net::ERR_PROXY_CONNECTION_FAILED")
    self.assertIn(expected_fragment, parsed.message)
    self.assertGreater(parsed.startTimestamp, 10000)
    self.assertGreaterEqual(parsed.endTimestamp, parsed.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), parsed.version)
446
+
447
def test_v1_endpoint_request_get_fail_timeout(self):
    """A 'maxTimeout' of 10 must fail with HTTP 500 and a timeout error.

    The expected message reports the timeout as 0.01 seconds.
    """
    payload = {
        "cmd": "request.get",
        "url": self.google_url,
        "maxTimeout": 10
    }
    response = self.app.post_json('/v1', payload, status=500)
    self.assertEqual(response.status_code, 500)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_ERROR, parsed.status)
    self.assertEqual("Error: Error solving the challenge. Timeout after 0.01 seconds.", parsed.message)
    self.assertGreater(parsed.startTimestamp, 10000)
    self.assertGreaterEqual(parsed.endTimestamp, parsed.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), parsed.version)
461
+
462
def test_v1_endpoint_request_get_fail_bad_domain(self):
    """An unresolvable host name must yield HTTP 500 with ``ERR_NAME_NOT_RESOLVED``."""
    payload = {
        "cmd": "request.get",
        "url": "https://www.google.combad"
    }
    response = self.app.post_json('/v1', payload, status=500)
    self.assertEqual(response.status_code, 500)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_ERROR, parsed.status)
    self.assertIn("Message: unknown error: net::ERR_NAME_NOT_RESOLVED", parsed.message)
472
+
473
def test_v1_endpoint_request_get_deprecated_param(self):
    """A 'request.get' carrying the removed 'userAgent' parameter must still succeed."""
    payload = {
        "cmd": "request.get",
        "url": self.google_url,
        "userAgent": "Test User-Agent"  # was removed in v2, not used
    }
    response = self.app.post_json('/v1', payload)
    self.assertEqual(response.status_code, 200)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_OK, parsed.status)
    self.assertEqual("Challenge not detected!", parsed.message)
484
+
485
def test_v1_endpoint_request_post_no_cloudflare(self):
    """'request.post' to ``self.post_url`` must echo the posted form fields.

    No challenge is expected, and the solution must carry no cookies.
    """
    res = self.app.post_json('/v1', {
        "cmd": "request.post",
        "url": self.post_url,
        "postData": "param1=value1&param2=value2"
    })
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge not detected!", body.message)
    self.assertGreater(body.startTimestamp, 10000)
    self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), body.version)

    solution = body.solution
    self.assertIn(self.post_url, solution.url)
    self.assertEqual(solution.status, 200)
    # Compare by value: assertIs on an int only passes because CPython
    # caches small integers, which is an implementation detail.
    self.assertEqual(len(solution.headers), 0)
    # NOTE(review): the whitespace inside this literal must match the server's
    # pretty-printed JSON exactly - confirm it against the upstream test source.
    self.assertIn('"form": {\n "param1": "value1", \n "param2": "value2"\n }', solution.response)
    self.assertEqual(len(solution.cookies), 0)
    self.assertIn("Chrome/", solution.userAgent)
507
+
508
def test_v1_endpoint_request_post_cloudflare(self):
    """'request.post' to ``self.cloudflare_url`` must solve the challenge.

    The page returned after solving is expected to be a "405 Not Allowed"
    document, and a ``cf_clearance`` cookie longer than 30 characters
    must be present.
    """
    res = self.app.post_json('/v1', {
        "cmd": "request.post",
        "url": self.cloudflare_url,
        "postData": "param1=value1&param2=value2"
    })
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge solved!", body.message)
    self.assertGreater(body.startTimestamp, 10000)
    self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), body.version)

    solution = body.solution
    self.assertIn(self.cloudflare_url, solution.url)
    self.assertEqual(solution.status, 200)
    # Compare by value: assertIs on an int only passes because CPython
    # caches small integers, which is an implementation detail.
    self.assertEqual(len(solution.headers), 0)
    self.assertIn("<title>405 Not Allowed</title>", solution.response)
    self.assertGreater(len(solution.cookies), 0)
    self.assertIn("Chrome/", solution.userAgent)

    cf_cookie = _find_obj_by_key("name", "cf_clearance", solution.cookies)
    self.assertIsNotNone(cf_cookie, "Cloudflare cookie not found")
    self.assertGreater(len(cf_cookie["value"]), 30)
534
+
535
def test_v1_endpoint_request_post_fail_no_post_data(self):
    """'request.post' without 'postData' must be rejected with HTTP 500."""
    payload = {
        "cmd": "request.post",
        "url": self.google_url
    }
    response = self.app.post_json('/v1', payload, status=500)
    self.assertEqual(response.status_code, 500)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_ERROR, parsed.status)
    self.assertIn("Request parameter 'postData' is mandatory in 'request.post' command", parsed.message)
545
+
546
def test_v1_endpoint_request_post_deprecated_param(self):
    """A 'request.post' carrying the removed 'userAgent' parameter must still succeed."""
    payload = {
        "cmd": "request.post",
        "url": self.google_url,
        "postData": "param1=value1&param2=value2",
        "userAgent": "Test User-Agent"  # was removed in v2, not used
    }
    response = self.app.post_json('/v1', payload)
    self.assertEqual(response.status_code, 200)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_OK, parsed.status)
    self.assertEqual("Challenge not detected!", parsed.message)
558
+
559
def test_v1_endpoint_sessions_create_without_session(self):
    """'sessions.create' without a name must succeed and return a session id."""
    payload = {"cmd": "sessions.create"}
    response = self.app.post_json('/v1', payload)
    self.assertEqual(response.status_code, 200)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_OK, parsed.status)
    self.assertEqual("Session created successfully.", parsed.message)
    self.assertIsNotNone(parsed.session)
569
+
570
def test_v1_endpoint_sessions_create_with_session(self):
    """'sessions.create' with an explicit name must return that same name."""
    payload = {
        "cmd": "sessions.create",
        "session": "test_create_session"
    }
    response = self.app.post_json('/v1', payload)
    self.assertEqual(response.status_code, 200)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_OK, parsed.status)
    self.assertEqual("Session created successfully.", parsed.message)
    self.assertEqual(parsed.session, "test_create_session")
581
+
582
def test_v1_endpoint_sessions_create_with_proxy(self):
    """'sessions.create' with a proxy must succeed and return a session id."""
    payload = {
        "cmd": "sessions.create",
        "proxy": {
            "url": self.proxy_url
        }
    }
    response = self.app.post_json('/v1', payload)
    self.assertEqual(response.status_code, 200)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_OK, parsed.status)
    self.assertEqual("Session created successfully.", parsed.message)
    self.assertIsNotNone(parsed.session)
595
+
596
def test_v1_endpoint_sessions_list(self):
    """'sessions.list' must include a session created just beforehand."""
    create_payload = {
        "cmd": "sessions.create",
        "session": "test_list_sessions"
    }
    self.app.post_json('/v1', create_payload)

    list_payload = {"cmd": "sessions.list"}
    response = self.app.post_json('/v1', list_payload)
    self.assertEqual(response.status_code, 200)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_OK, parsed.status)
    self.assertEqual("", parsed.message)
    self.assertGreaterEqual(len(parsed.sessions), 1)
    self.assertIn("test_list_sessions", parsed.sessions)
611
+
612
def test_v1_endpoint_sessions_destroy_existing_session(self):
    """'sessions.destroy' on a session created just beforehand must succeed."""
    session_name = "test_destroy_sessions"
    self.app.post_json('/v1', {
        "cmd": "sessions.create",
        "session": session_name
    })
    response = self.app.post_json('/v1', {
        "cmd": "sessions.destroy",
        "session": session_name
    })
    self.assertEqual(response.status_code, 200)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_OK, parsed.status)
    self.assertEqual("The session has been removed.", parsed.message)
626
+
627
def test_v1_endpoint_sessions_destroy_non_existing_session(self):
    """'sessions.destroy' on an unknown session name must fail with HTTP 500."""
    payload = {
        "cmd": "sessions.destroy",
        "session": "non_existing_session_name"
    }
    response = self.app.post_json('/v1', payload, status=500)
    self.assertEqual(response.status_code, 500)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_ERROR, parsed.status)
    self.assertEqual("Error: The session doesn't exist.", parsed.message)
637
+
638
def test_v1_endpoint_request_get_with_session(self):
    """'request.get' that names a previously created session must succeed."""
    session_name = "test_request_sessions"
    self.app.post_json('/v1', {
        "cmd": "sessions.create",
        "session": session_name
    })
    response = self.app.post_json('/v1', {
        "cmd": "request.get",
        "session": session_name,
        "url": self.google_url
    })
    self.assertEqual(response.status_code, 200)

    parsed = V1ResponseBase(response.json)
    self.assertEqual(STATUS_OK, parsed.status)
652
+
653
+
654
# Run the whole suite when this module is executed as a script.
if __name__ == "__main__":
    unittest.main()
flaresolverr/tests_sites.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+
3
+ from webtest import TestApp
4
+
5
+ from dtos import V1ResponseBase, STATUS_OK
6
+ import flaresolverr
7
+ import utils
8
+
9
+
10
+ def _find_obj_by_key(key: str, value: str, _list: list) -> dict | None:
11
+ for obj in _list:
12
+ if obj[key] == value:
13
+ return obj
14
+ return None
15
+
16
+
17
def asset_cloudflare_solution(self, res, site_url, site_text):
    """Assert that ``res`` is a successful, challenge-solved '/v1' response.

    Parameters:
        self: the running ``unittest.TestCase``, used for its assert methods.
        res: the response object returned by the '/v1' request.
        site_url: URL expected to be contained in the solution URL.
        site_text: text expected to be contained in the solution body.

    Checks the response envelope, the solution payload and that a
    ``cf_clearance`` cookie longer than 30 characters is present.
    """
    self.assertEqual(res.status_code, 200)

    body = V1ResponseBase(res.json)
    self.assertEqual(STATUS_OK, body.status)
    self.assertEqual("Challenge solved!", body.message)
    self.assertGreater(body.startTimestamp, 10000)
    self.assertGreaterEqual(body.endTimestamp, body.startTimestamp)
    self.assertEqual(utils.get_flaresolverr_version(), body.version)

    solution = body.solution
    self.assertIn(site_url, solution.url)
    self.assertEqual(solution.status, 200)
    # Compare by value: assertIs on an int only passes because CPython
    # caches small integers, which is an implementation detail.
    self.assertEqual(len(solution.headers), 0)
    self.assertIn(site_text, solution.response)
    self.assertGreater(len(solution.cookies), 0)
    self.assertIn("Chrome/", solution.userAgent)

    cf_cookie = _find_obj_by_key("name", "cf_clearance", solution.cookies)
    self.assertIsNotNone(cf_cookie, "Cloudflare cookie not found")
    self.assertGreater(len(cf_cookie["value"]), 30)
38
+
39
+
40
class TestFlareSolverr(unittest.TestCase):
    """Runs 'request.get' / 'request.post' against a list of real sites.

    Each site is exercised in its own ``subTest`` and validated with
    ``asset_cloudflare_solution``.
    """

    app = TestApp(flaresolverr.app)
    # wait until the server is ready
    app.get('/')

    def test_v1_endpoint_request_get_cloudflare(self):
        """Issue 'request.get' for every (name, url, expected text) entry."""
        sites_get = [
            ('nowsecure', 'https://nowsecure.nl', '<title>nowSecure</title>'),
            ('0magnet', 'https://0magnet.com/search?q=2022', 'Torrent Search - ØMagnet'),
            ('1337x', 'https://1337x.unblockit.cat/cat/Movies/time/desc/1/', ''),
            ('avistaz', 'https://avistaz.to/api/v1/jackett/torrents?in=1&type=0&search=',
             '<title>Access denied</title>'),
            ('badasstorrents', 'https://badasstorrents.com/torrents/search/720p/date/desc',
             '<title>Latest Torrents - BadassTorrents</title>'),
            ('bt4g', 'https://bt4g.org/search/2022', '<title>Download 2022 Torrents - BT4G</title>'),
            ('cinemaz', 'https://cinemaz.to/api/v1/jackett/torrents?in=1&type=0&search=',
             '<title>Access denied</title>'),
            ('epublibre', 'https://epublibre.unblockit.cat/catalogo/index/0/nuevo/todos/sin/todos/--/ajax',
             '<title>epublibre - catálogo</title>'),
            ('ext', 'https://ext.to/latest/?order=age&sort=desc',
             '<title>Download Latest Torrents - EXT Torrents</title>'),
            ('extratorrent', 'https://extratorrent.st/search/?srt=added&order=desc&search=720p&new=1&x=0&y=0',
             'Page 1 - ExtraTorrent'),
            ('idope', 'https://idope.se/browse.html', '<title>Recent Torrents</title>'),
            ('limetorrents', 'https://limetorrents.unblockninja.com/latest100',
             '<title>Latest 100 torrents - LimeTorrents</title>'),
            ('privatehd', 'https://privatehd.to/api/v1/jackett/torrents?in=1&type=0&search=',
             '<title>Access denied</title>'),
            ('torrentcore', 'https://torrentcore.xyz/index', '<title>Torrent[CORE] - Torrent community.</title>'),
            ('torrentqq223', 'https://torrentqq223.com/torrent/newest.html', 'https://torrentqq223.com/ads/'),
            ('36dm', 'https://www.36dm.club/1.html', 'https://www.36dm.club/yesterday-1.html'),
            ('erai-raws', 'https://www.erai-raws.info/feed/?type=magnet', '403 Forbidden'),
            ('teamos', 'https://www.teamos.xyz/torrents/?filename=&freeleech=',
             '<title>Log in | Team OS : Your Only Destination To Custom OS !!</title>'),
            ('yts', 'https://yts.unblockninja.com/api/v2/list_movies.json?query_term=&limit=50&sort=date_added',
             '{"movie_count":')
        ]
        for label, target_url, expected_text in sites_get:
            with self.subTest(msg=label):
                payload = {
                    "cmd": "request.get",
                    "url": target_url
                }
                response = self.app.post_json('/v1', payload)
                asset_cloudflare_solution(self, response, target_url, expected_text)

    def test_v1_endpoint_request_post_cloudflare(self):
        """Issue 'request.post' for every (name, url, expected text, post data) entry."""
        sites_post = [
            ('nnmclub', 'https://nnmclub.to/forum/tracker.php', '<title>Трекер :: NNM-Club</title>',
             'prev_sd=0&prev_a=0&prev_my=0&prev_n=0&prev_shc=0&prev_shf=1&prev_sha=1&prev_shs=0&prev_shr=0&prev_sht=0&f%5B%5D=-1&o=1&s=2&tm=-1&shf=1&sha=1&ta=-1&sns=-1&sds=-1&nm=&pn=&submit=%CF%EE%E8%F1%EA')
        ]

        for label, target_url, expected_text, form_data in sites_post:
            with self.subTest(msg=label):
                payload = {
                    "cmd": "request.post",
                    "url": target_url,
                    "postData": form_data
                }
                response = self.app.post_json('/v1', payload)
                asset_cloudflare_solution(self, response, target_url, expected_text)
99
+
100
+
101
# Run the site checks when this module is executed as a script.
if __name__ == "__main__":
    unittest.main()
flaresolverr/undetected_chromedriver/__init__.py ADDED
@@ -0,0 +1,910 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+
5
+ 888 888 d8b
6
+ 888 888 Y8P
7
+ 888 888
8
+ .d8888b 88888b. 888d888 .d88b. 88888b.d88b. .d88b. .d88888 888d888 888 888 888 .d88b. 888d888
9
+ d88P" 888 "88b 888P" d88""88b 888 "888 "88b d8P Y8b d88" 888 888P" 888 888 888 d8P Y8b 888P"
10
+ 888 888 888 888 888 888 888 888 888 88888888 888 888 888 888 Y88 88P 88888888 888
11
+ Y88b. 888 888 888 Y88..88P 888 888 888 Y8b. Y88b 888 888 888 Y8bd8P Y8b. 888
12
+ "Y8888P 888 888 888 "Y88P" 888 888 888 "Y8888 "Y88888 888 888 Y88P "Y8888 888 88888888
13
+
14
+ by UltrafunkAmsterdam (https://github.com/ultrafunkamsterdam)
15
+
16
+ """
17
+ from __future__ import annotations
18
+
19
+
20
+ __version__ = "3.5.5"
21
+
22
+ import json
23
+ import logging
24
+ import os
25
+ import pathlib
26
+ import re
27
+ import shutil
28
+ import subprocess
29
+ import sys
30
+ import tempfile
31
+ import time
32
+ from weakref import finalize
33
+
34
+ import selenium.webdriver.chrome.service
35
+ import selenium.webdriver.chrome.webdriver
36
+ from selenium.webdriver.common.by import By
37
+ import selenium.webdriver.chromium.service
38
+ import selenium.webdriver.remote.command
39
+ import selenium.webdriver.remote.webdriver
40
+
41
+ from .cdp import CDP
42
+ from .dprocess import start_detached
43
+ from .options import ChromeOptions
44
+ from .patcher import IS_POSIX
45
+ from .patcher import Patcher
46
+ from .reactor import Reactor
47
+ from .webelement import UCWebElement
48
+ from .webelement import WebElement
49
+
50
+
51
+ __all__ = (
52
+ "Chrome",
53
+ "ChromeOptions",
54
+ "Patcher",
55
+ "Reactor",
56
+ "CDP",
57
+ "find_chrome_executable",
58
+ )
59
+
60
+ logger = logging.getLogger("uc")
61
+ logger.setLevel(logging.getLogger().getEffectiveLevel())
62
+
63
+
64
+ class Chrome(selenium.webdriver.chrome.webdriver.WebDriver):
65
+ """
66
+
67
+ Controls the ChromeDriver and allows you to drive the browser.
68
+
69
+ The webdriver file will be downloaded by this module automatically,
70
+ you do not need to specify this. however, you may if you wish.
71
+
72
+ Attributes
73
+ ----------
74
+
75
+ Methods
76
+ -------
77
+
78
+ reconnect()
79
+
80
+ this can be useful in case of heavy detection methods
81
+ -stops the chromedriver service which runs in the background
82
+ -starts the chromedriver service which runs in the background
83
+ -recreate session
84
+
85
+
86
+ start_session(capabilities=None, browser_profile=None)
87
+
88
+ differentiates from the regular method in that it does not
89
+ require a capabilities argument. The capabilities are automatically
90
+ recreated from the options at creation time.
91
+
92
+ --------------------------------------------------------------------------
93
+ NOTE:
94
+ Chrome has everything included to work out of the box.
95
+ it does not `need` customizations.
96
+ any customizations MAY lead to trigger bot migitation systems.
97
+
98
+ --------------------------------------------------------------------------
99
+ """
100
+
101
+ _instances = set()
102
+ session_id = None
103
+ debug = False
104
+
105
+ def __init__(
106
+ self,
107
+ options=None,
108
+ user_data_dir=None,
109
+ driver_executable_path=None,
110
+ browser_executable_path=None,
111
+ port=0,
112
+ enable_cdp_events=False,
113
+ # service_args=None,
114
+ # service_creationflags=None,
115
+ desired_capabilities=None,
116
+ advanced_elements=False,
117
+ # service_log_path=None,
118
+ keep_alive=True,
119
+ log_level=0,
120
+ headless=False,
121
+ version_main=None,
122
+ patcher_force_close=False,
123
+ suppress_welcome=True,
124
+ use_subprocess=False,
125
+ debug=False,
126
+ no_sandbox=True,
127
+ windows_headless=False,
128
+ user_multi_procs: bool = False,
129
+ **kw,
130
+ ):
131
+ """
132
+ Creates a new instance of the chrome driver.
133
+
134
+ Starts the service and then creates new instance of chrome driver.
135
+
136
+ Parameters
137
+ ----------
138
+
139
+ options: ChromeOptions, optional, default: None - automatic useful defaults
140
+ this takes an instance of ChromeOptions, mainly to customize browser behavior.
141
+ anything other dan the default, for example extensions or startup options
142
+ are not supported in case of failure, and can probably lowers your undetectability.
143
+
144
+
145
+ user_data_dir: str , optional, default: None (creates temp profile)
146
+ if user_data_dir is a path to a valid chrome profile directory, use it,
147
+ and turn off automatic removal mechanism at exit.
148
+
149
+ driver_executable_path: str, optional, default: None(=downloads and patches new binary)
150
+
151
+ browser_executable_path: str, optional, default: None - use find_chrome_executable
152
+ Path to the browser executable.
153
+ If not specified, make sure the executable's folder is in $PATH
154
+
155
+ port: int, optional, default: 0
156
+ port to be used by the chromedriver executable, this is NOT the debugger port.
157
+ leave it at 0 unless you know what you are doing.
158
+ the default value of 0 automatically picks an available port.
159
+
160
+ enable_cdp_events: bool, default: False
161
+ :: currently for chrome only
162
+ this enables the handling of wire messages
163
+ when enabled, you can subscribe to CDP events by using:
164
+
165
+ driver.add_cdp_listener("Network.dataReceived", yourcallback)
166
+ # yourcallback is an callable which accepts exactly 1 dict as parameter
167
+
168
+
169
+ service_args: list of str, optional, default: None
170
+ arguments to pass to the driver service
171
+
172
+ desired_capabilities: dict, optional, default: None - auto from config
173
+ Dictionary object with non-browser specific capabilities only, such as "item" or "loggingPref".
174
+
175
+ advanced_elements: bool, optional, default: False
176
+ makes it easier to recognize elements like you know them from html/browser inspection, especially when working
177
+ in an interactive environment
178
+
179
+ default webelement repr:
180
+ <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")>
181
+
182
+ advanced webelement repr
183
+ <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)>
184
+
185
+ note: when retrieving large amounts of elements ( example: find_elements_by_tag("*") ) and print them, it does take a little more time.
186
+
187
+
188
+ service_log_path: str, optional, default: None
189
+ path to log information from the driver.
190
+
191
+ keep_alive: bool, optional, default: True
192
+ Whether to configure ChromeRemoteConnection to use HTTP keep-alive.
193
+
194
+ log_level: int, optional, default: adapts to python global log level
195
+
196
+ headless: bool, optional, default: False
197
+ can also be specified in the options instance.
198
+ Specify whether you want to use the browser in headless mode.
199
+ warning: this lowers undetectability and not fully supported.
200
+
201
+ version_main: int, optional, default: None (=auto)
202
+ if you, for god knows whatever reason, use
203
+ an older version of Chrome. You can specify it's full rounded version number
204
+ here. Example: 87 for all versions of 87
205
+
206
+ patcher_force_close: bool, optional, default: False
207
+ instructs the patcher to do whatever it can to access the chromedriver binary
208
+ if the file is locked, it will force shutdown all instances.
209
+ setting it is not recommended, unless you know the implications and think
210
+ you might need it.
211
+
212
+ suppress_welcome: bool, optional , default: True
213
+ a "welcome" alert might show up on *nix-like systems asking whether you want to set
214
+ chrome as your default browser, and if you want to send even more data to google.
215
+ now, in case you are nag-fetishist, or a diagnostics data feeder to google, you can set this to False.
216
+ Note: if you don't handle the nag screen in time, the browser loses it's connection and throws an Exception.
217
+
218
+ use_subprocess: bool, optional , default: True,
219
+
220
+ False (the default) makes sure Chrome will get it's own process (so no subprocess of chromedriver.exe or python
221
+ This fixes a LOT of issues, like multithreaded run, but mst importantly. shutting corectly after
222
+ program exits or using .quit()
223
+ you should be knowing what you're doing, and know how python works.
224
+
225
+ unfortunately, there is always an edge case in which one would like to write an single script with the only contents being:
226
+ --start script--
227
+ import undetected_chromedriver as uc
228
+ d = uc.Chrome()
229
+ d.get('https://somesite/')
230
+ ---end script --
231
+
232
+ and will be greeted with an error, since the program exists before chrome has a change to launch.
233
+ in that case you can set this to `True`. The browser will start via subprocess, and will keep running most of times.
234
+ ! setting it to True comes with NO support when being detected. !
235
+
236
+ no_sandbox: bool, optional, default=True
237
+ uses the --no-sandbox option, and additionally does suppress the "unsecure option" status bar
238
+ this option has a default of True since many people seem to run this as root (....) , and chrome does not start
239
+ when running as root without using --no-sandbox flag.
240
+
241
+ user_multi_procs:
242
+ set to true when you are using multithreads/multiprocessing
243
+ ensures not all processes are trying to modify a binary which is in use by another.
244
+ for this to work. YOU MUST HAVE AT LEAST 1 UNDETECTED_CHROMEDRIVER BINARY IN YOUR ROAMING DATA FOLDER.
245
+ this requirement can be easily satisfied, by just running this program "normal" and close/kill it.
246
+
247
+
248
+ """
249
+
250
+ finalize(self, self._ensure_close, self)
251
+ self.debug = debug
252
+ self.patcher = Patcher(
253
+ executable_path=driver_executable_path,
254
+ force=patcher_force_close,
255
+ version_main=version_main,
256
+ user_multi_procs=user_multi_procs,
257
+ )
258
+ # self.patcher.auto(user_multiprocess = user_multi_num_procs)
259
+ self.patcher.auto()
260
+
261
+ # self.patcher = patcher
262
+ if not options:
263
+ options = ChromeOptions()
264
+
265
+ try:
266
+ if hasattr(options, "_session") and options._session is not None:
267
+ # prevent reuse of options,
268
+ # as it just appends arguments, not replace them
269
+ # you'll get conflicts starting chrome
270
+ raise RuntimeError("you cannot reuse the ChromeOptions object")
271
+ except AttributeError:
272
+ pass
273
+
274
+ options._session = self
275
+
276
+ if not options.debugger_address:
277
+ debug_port = (
278
+ port
279
+ if port != 0
280
+ else selenium.webdriver.common.service.utils.free_port()
281
+ )
282
+ debug_host = "127.0.0.1"
283
+ options.debugger_address = "%s:%d" % (debug_host, debug_port)
284
+ else:
285
+ debug_host, debug_port = options.debugger_address.split(":")
286
+ debug_port = int(debug_port)
287
+
288
+ if enable_cdp_events:
289
+ options.set_capability(
290
+ "goog:loggingPrefs", {"performance": "ALL", "browser": "ALL"}
291
+ )
292
+
293
+ options.add_argument("--remote-debugging-host=%s" % debug_host)
294
+ options.add_argument("--remote-debugging-port=%s" % debug_port)
295
+
296
+ if user_data_dir:
297
+ options.add_argument("--user-data-dir=%s" % user_data_dir)
298
+
299
+ language, keep_user_data_dir = None, bool(user_data_dir)
300
+
301
+ # see if a custom user profile is specified in options
302
+ for arg in options.arguments:
303
+
304
+ if any([_ in arg for _ in ("--headless", "headless")]):
305
+ options.arguments.remove(arg)
306
+ options.headless = True
307
+
308
+ if "lang" in arg:
309
+ m = re.search("(?:--)?lang(?:[ =])?(.*)", arg)
310
+ try:
311
+ language = m[1]
312
+ except IndexError:
313
+ logger.debug("will set the language to en-US,en;q=0.9")
314
+ language = "en-US,en;q=0.9"
315
+
316
+ if "user-data-dir" in arg:
317
+ m = re.search("(?:--)?user-data-dir(?:[ =])?(.*)", arg)
318
+ try:
319
+ user_data_dir = m[1]
320
+ logger.debug(
321
+ "user-data-dir found in user argument %s => %s" % (arg, m[1])
322
+ )
323
+ keep_user_data_dir = True
324
+
325
+ except IndexError:
326
+ logger.debug(
327
+ "no user data dir could be extracted from supplied argument %s "
328
+ % arg
329
+ )
330
+
331
+ if not user_data_dir:
332
+ # backward compatiblity
333
+ # check if an old uc.ChromeOptions is used, and extract the user data dir
334
+
335
+ if hasattr(options, "user_data_dir") and getattr(
336
+ options, "user_data_dir", None
337
+ ):
338
+ import warnings
339
+
340
+ warnings.warn(
341
+ "using ChromeOptions.user_data_dir might stop working in future versions."
342
+ "use uc.Chrome(user_data_dir='/xyz/some/data') in case you need existing profile folder"
343
+ )
344
+ options.add_argument("--user-data-dir=%s" % options.user_data_dir)
345
+ keep_user_data_dir = True
346
+ logger.debug(
347
+ "user_data_dir property found in options object: %s" % user_data_dir
348
+ )
349
+
350
+ else:
351
+ user_data_dir = os.path.normpath(tempfile.mkdtemp())
352
+ keep_user_data_dir = False
353
+ arg = "--user-data-dir=%s" % user_data_dir
354
+ options.add_argument(arg)
355
+ logger.debug(
356
+ "created a temporary folder in which the user-data (profile) will be stored during this\n"
357
+ "session, and added it to chrome startup arguments: %s" % arg
358
+ )
359
+
360
+ if not language:
361
+ try:
362
+ import locale
363
+
364
+ language = locale.getdefaultlocale()[0].replace("_", "-")
365
+ except Exception:
366
+ pass
367
+ if not language:
368
+ language = "en-US"
369
+
370
+ options.add_argument("--lang=%s" % language)
371
+
372
+ if not options.binary_location:
373
+ options.binary_location = (
374
+ browser_executable_path or find_chrome_executable()
375
+ )
376
+
377
+ if not options.binary_location or not \
378
+ pathlib.Path(options.binary_location).exists():
379
+ raise FileNotFoundError(
380
+ "\n---------------------\n"
381
+ "Could not determine browser executable."
382
+ "\n---------------------\n"
383
+ "Make sure your browser is installed in the default location (path).\n"
384
+ "If you are sure about the browser executable, you can specify it using\n"
385
+ "the `browser_executable_path='{}` parameter.\n\n"
386
+ .format("/path/to/browser/executable" if IS_POSIX else "c:/path/to/your/browser.exe")
387
+ )
388
+
389
+ self._delay = 3
390
+
391
+ self.user_data_dir = user_data_dir
392
+ self.keep_user_data_dir = keep_user_data_dir
393
+
394
+ if suppress_welcome:
395
+ options.arguments.extend(["--no-default-browser-check", "--no-first-run"])
396
+ if no_sandbox:
397
+ options.arguments.extend(["--no-sandbox", "--test-type"])
398
+
399
+ if headless or getattr(options, 'headless', None):
400
+ #workaround until a better checking is found
401
+ try:
402
+ v_main = int(self.patcher.version_main) if self.patcher.version_main else 108
403
+ if v_main < 108:
404
+ options.add_argument("--headless=chrome")
405
+ elif v_main >= 108:
406
+ options.add_argument("--headless=new")
407
+ except:
408
+ logger.warning("could not detect version_main."
409
+ "therefore, we are assuming it is chrome 108 or higher")
410
+ options.add_argument("--headless=new")
411
+
412
+ options.add_argument("--window-size=1920,1080")
413
+ options.add_argument("--start-maximized")
414
+ options.add_argument("--no-sandbox")
415
+ # fixes "could not connect to chrome" error when running
416
+ # on linux using privileged user like root (which i don't recommend)
417
+
418
+ options.add_argument(
419
+ "--log-level=%d" % log_level
420
+ or divmod(logging.getLogger().getEffectiveLevel(), 10)[0]
421
+ )
422
+
423
+ if hasattr(options, "handle_prefs"):
424
+ options.handle_prefs(user_data_dir)
425
+
426
+ # fix exit_type flag to prevent tab-restore nag
427
+ try:
428
+ with open(
429
+ os.path.join(user_data_dir, "Default/Preferences"),
430
+ encoding="latin1",
431
+ mode="r+",
432
+ ) as fs:
433
+ config = json.load(fs)
434
+ if config["profile"]["exit_type"] is not None:
435
+ # fixing the restore-tabs-nag
436
+ config["profile"]["exit_type"] = None
437
+ fs.seek(0, 0)
438
+ json.dump(config, fs)
439
+ fs.truncate() # the file might be shorter
440
+ logger.debug("fixed exit_type flag")
441
+ except Exception as e:
442
+ logger.debug("did not find a bad exit_type flag ")
443
+
444
+ self.options = options
445
+
446
+ if not desired_capabilities:
447
+ desired_capabilities = options.to_capabilities()
448
+
449
+ if not use_subprocess and not windows_headless:
450
+ self.browser_pid = start_detached(
451
+ options.binary_location, *options.arguments
452
+ )
453
+ else:
454
+ startupinfo = None
455
+ if os.name == 'nt' and windows_headless:
456
+ # STARTUPINFO() is Windows only
457
+ startupinfo = subprocess.STARTUPINFO()
458
+ startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
459
+ browser = subprocess.Popen(
460
+ [options.binary_location, *options.arguments],
461
+ stdin=subprocess.PIPE,
462
+ stdout=subprocess.PIPE,
463
+ stderr=subprocess.PIPE,
464
+ close_fds=IS_POSIX,
465
+ startupinfo=startupinfo
466
+ )
467
+ self.browser_pid = browser.pid
468
+
469
+
470
+ service = selenium.webdriver.chromium.service.ChromiumService(
471
+ self.patcher.executable_path
472
+ )
473
+
474
+ super().__init__(
475
+ service=service,
476
+ options=options,
477
+ keep_alive=keep_alive,
478
+ )
479
+
480
+ self.reactor = None
481
+
482
+ if enable_cdp_events:
483
+ if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
484
+ logging.getLogger(
485
+ "selenium.webdriver.remote.remote_connection"
486
+ ).setLevel(20)
487
+ reactor = Reactor(self)
488
+ reactor.start()
489
+ self.reactor = reactor
490
+
491
+ if advanced_elements:
492
+ self._web_element_cls = UCWebElement
493
+ else:
494
+ self._web_element_cls = WebElement
495
+
496
+ if headless or getattr(options, 'headless', None):
497
+ self._configure_headless()
498
+
499
def _configure_headless(self):
    """
    Replace ``self.get`` with a wrapper that installs headless evasions.

    The wrapper asks the page for ``navigator.webdriver``; when that is
    truthy it registers scripts through the CDP command
    ``Page.addScriptToEvaluateOnNewDocument`` and overrides the user-agent
    (removing the substring "Headless"), and finally delegates to the
    original ``get``.

    NOTE(review): the indentation of this block was reconstructed; whether
    the user-agent override and the second script live inside the
    ``navigator.webdriver`` check should be confirmed against upstream.
    """
    # keep a reference to the current get, the wrapper delegates to it
    orig_get = self.get
    logger.info("setting properties for headless")

    def get_wrapped(*args, **kwargs):
        # only patch when the page currently reports navigator.webdriver
        if self.execute_script("return navigator.webdriver"):
            logger.info("patch navigator.webdriver")
            # proxy window.navigator so "webdriver" reads as false / absent
            self.execute_cdp_cmd(
                "Page.addScriptToEvaluateOnNewDocument",
                {
                    "source": """
                        Object.defineProperty(window, "navigator", {
                            value: new Proxy(navigator, {
                                has: (target, key) => (key === "webdriver" ? false : key in target),
                                get: (target, key) =>
                                    key === "webdriver"
                                        ? false
                                        : typeof target[key] === "function"
                                            ? target[key].bind(target)
                                            : target[key],
                            }),
                        });
                    """
                },
            )

            logger.info("patch user-agent string")
            # re-use the browser's own user-agent, minus the "Headless" marker
            self.execute_cdp_cmd(
                "Network.setUserAgentOverride",
                {
                    "userAgent": self.execute_script(
                        "return navigator.userAgent"
                    ).replace("Headless", "")
                },
            )
            # second script: touch points, connection rtt, window.chrome,
            # Notification / permissions.query and Function.prototype.toString
            self.execute_cdp_cmd(
                "Page.addScriptToEvaluateOnNewDocument",
                {
                    "source": """
                        Object.defineProperty(navigator, 'maxTouchPoints', {get: () => 1});
                        Object.defineProperty(navigator.connection, 'rtt', {get: () => 100});

                        // https://github.com/microlinkhq/browserless/blob/master/packages/goto/src/evasions/chrome-runtime.js
                        window.chrome = {
                            app: {
                                isInstalled: false,
                                InstallState: {
                                    DISABLED: 'disabled',
                                    INSTALLED: 'installed',
                                    NOT_INSTALLED: 'not_installed'
                                },
                                RunningState: {
                                    CANNOT_RUN: 'cannot_run',
                                    READY_TO_RUN: 'ready_to_run',
                                    RUNNING: 'running'
                                }
                            },
                            runtime: {
                                OnInstalledReason: {
                                    CHROME_UPDATE: 'chrome_update',
                                    INSTALL: 'install',
                                    SHARED_MODULE_UPDATE: 'shared_module_update',
                                    UPDATE: 'update'
                                },
                                OnRestartRequiredReason: {
                                    APP_UPDATE: 'app_update',
                                    OS_UPDATE: 'os_update',
                                    PERIODIC: 'periodic'
                                },
                                PlatformArch: {
                                    ARM: 'arm',
                                    ARM64: 'arm64',
                                    MIPS: 'mips',
                                    MIPS64: 'mips64',
                                    X86_32: 'x86-32',
                                    X86_64: 'x86-64'
                                },
                                PlatformNaclArch: {
                                    ARM: 'arm',
                                    MIPS: 'mips',
                                    MIPS64: 'mips64',
                                    X86_32: 'x86-32',
                                    X86_64: 'x86-64'
                                },
                                PlatformOs: {
                                    ANDROID: 'android',
                                    CROS: 'cros',
                                    LINUX: 'linux',
                                    MAC: 'mac',
                                    OPENBSD: 'openbsd',
                                    WIN: 'win'
                                },
                                RequestUpdateCheckStatus: {
                                    NO_UPDATE: 'no_update',
                                    THROTTLED: 'throttled',
                                    UPDATE_AVAILABLE: 'update_available'
                                }
                            }
                        }

                        // https://github.com/microlinkhq/browserless/blob/master/packages/goto/src/evasions/navigator-permissions.js
                        if (!window.Notification) {
                            window.Notification = {
                                permission: 'denied'
                            }
                        }

                        const originalQuery = window.navigator.permissions.query
                        window.navigator.permissions.__proto__.query = parameters =>
                            parameters.name === 'notifications'
                                ? Promise.resolve({ state: window.Notification.permission })
                                : originalQuery(parameters)

                        const oldCall = Function.prototype.call
                        function call() {
                            return oldCall.apply(this, arguments)
                        }
                        Function.prototype.call = call

                        const nativeToStringFunctionString = Error.toString().replace(/Error/g, 'toString')
                        const oldToString = Function.prototype.toString

                        function functionToString() {
                            if (this === window.navigator.permissions.query) {
                                return 'function query() { [native code] }'
                            }
                            if (this === functionToString) {
                                return nativeToStringFunctionString
                            }
                            return oldCall.call(oldToString, this)
                        }
                        // eslint-disable-next-line
                        Function.prototype.toString = functionToString
                    """
                },
            )
        return orig_get(*args, **kwargs)

    # instance attribute shadows the class-level get from now on
    self.get = get_wrapped
638
+
639
+ # def _get_cdc_props(self):
640
+ # return self.execute_script(
641
+ # """
642
+ # let objectToInspect = window,
643
+ # result = [];
644
+ # while(objectToInspect !== null)
645
+ # { result = result.concat(Object.getOwnPropertyNames(objectToInspect));
646
+ # objectToInspect = Object.getPrototypeOf(objectToInspect); }
647
+ #
648
+ # return result.filter(i => i.match(/^([a-zA-Z]){27}(Array|Promise|Symbol)$/ig))
649
+ # """
650
+ # )
651
+ #
652
+ # def _hook_remove_cdc_props(self):
653
+ # self.execute_cdp_cmd(
654
+ # "Page.addScriptToEvaluateOnNewDocument",
655
+ # {
656
+ # "source": """
657
+ # let objectToInspect = window,
658
+ # result = [];
659
+ # while(objectToInspect !== null)
660
+ # { result = result.concat(Object.getOwnPropertyNames(objectToInspect));
661
+ # objectToInspect = Object.getPrototypeOf(objectToInspect); }
662
+ # result.forEach(p => p.match(/^([a-zA-Z]){27}(Array|Promise|Symbol)$/ig)
663
+ # &&delete window[p]&&console.log('removed',p))
664
+ # """
665
+ # },
666
+ # )
667
+
668
def get(self, url):
    """Load *url* by delegating to the parent class and return its result."""
    # if self._get_cdc_props():
    #     self._hook_remove_cdc_props()
    result = super().get(url)
    return result
672
+
673
def add_cdp_listener(self, event_name, callback):
    """
    Register *callback* for the CDP event *event_name* on the reactor.

    Returns the reactor's ``handlers`` after registration, or ``False``
    when there is no usable reactor on this instance.
    """
    reactor = self.reactor
    # guard clause: nothing to register on without a Reactor instance
    if not (reactor and reactor is not None and isinstance(reactor, Reactor)):
        return False
    reactor.add_event_handler(event_name, callback)
    return reactor.handlers
682
+
683
def clear_cdp_listeners(self):
    """Remove every registered CDP handler; a no-op without a reactor."""
    reactor = self.reactor
    if not reactor or not isinstance(reactor, Reactor):
        return
    reactor.handlers.clear()
686
+
687
def window_new(self):
    """Send the WebDriver ``NEW_WINDOW`` command with type ``"window"``."""
    params = {"type": "window"}
    self.execute(selenium.webdriver.remote.command.Command.NEW_WINDOW, params)
691
+
692
def tab_new(self, url: str):
    """
    this opens a url in a new tab.
    apparently, that passes all tests directly!

    Uses the ``cdp`` attribute of this instance when one is present,
    otherwise a temporary ``CDP`` client built from ``self.options``.
    (Previously the call silently did nothing when ``self.cdp`` existed.)

    Parameters
    ----------
    url : str
        address to open in the new tab

    Returns
    -------
    None
    """
    cdp = getattr(self, "cdp", None)
    if cdp is None:
        # imported lazily, as before, so the module is only loaded on demand
        from .cdp import CDP

        cdp = CDP(self.options)
    cdp.tab_new(url)
710
+
711
def reconnect(self, timeout=0.1):
    """
    Stop the driver service, wait *timeout* seconds, start it again and
    open a new session.

    Each step is best-effort: an exception raised by a step is logged at
    debug level and the next step still runs.
    """

    def attempt(step):
        # run one step, only logging whatever it raises
        try:
            step()
        except Exception as e:
            logger.debug(e)

    attempt(lambda: self.service.stop())
    time.sleep(timeout)
    attempt(lambda: self.service.start())
    attempt(lambda: self.start_session())
726
+
727
def start_session(self, capabilities=None, browser_profile=None):
    """
    Start a session through the parent class.

    When *capabilities* is falsy, the capabilities are derived from
    ``self.options``. *browser_profile* is accepted but not forwarded.
    """
    caps = capabilities or self.options.to_capabilities()
    super().start_session(caps)
    # super(Chrome, self).start_session(capabilities, browser_profile)  # Original explicit call commented out
732
+
733
def find_elements_recursive(self, by, value):
    """
    find elements in all frames
    this is a generator function, which is needed
    since if it would return a list of elements, they
    will be stale on arrival.
    using generator, when the element is returned we are in the correct frame
    to use it directly
    Args:
        by: By
        value: str
    Returns: Generator[webelement.WebElement]
    """

    def search_frame(frame=None):
        # select the frame to search: main content when none is given
        if frame:
            self.switch_to.frame(frame)
        else:
            self.switch_to.default_content()
        yield from self.find_elements(by, value)
        # switch back to main content, otherwise we will get StaleElementReferenceException
        self.switch_to.default_content()

    # root document first
    yield from search_frame()

    # then every iframe found in the root document
    for frame in self.find_elements('css selector', 'iframe'):
        yield from search_frame(frame)
767
+
768
def quit(self):
    """
    Shut everything down, best-effort: the driver service, the CDP
    reactor (if any), the browser process and - unless the profile is to
    be kept - the user-data directory.

    Every stage swallows its own expected errors so that a failure in one
    stage never prevents the following stages from running.
    """
    try:
        self.service.stop()
        self.service.process.kill()
        self.command_executor.close()
        self.service.process.wait(5)
        logger.debug("webdriver process ended")
    except (AttributeError, RuntimeError, OSError, subprocess.TimeoutExpired):
        # TimeoutExpired: the driver did not exit within 5 seconds; keep
        # going so the browser and the temp profile are still cleaned up
        pass
    try:
        self.reactor.event.set()
        logger.debug("shutting down reactor")
    except AttributeError:
        # no reactor was created (cdp events not enabled)
        pass
    try:
        # 15 == SIGTERM
        os.kill(self.browser_pid, 15)
        logger.debug("gracefully closed browser")
    except Exception:  # noqa
        pass
    if (
        hasattr(self, "keep_user_data_dir")
        and hasattr(self, "user_data_dir")
        and not self.keep_user_data_dir
    ):
        for _ in range(5):
            try:
                shutil.rmtree(self.user_data_dir, ignore_errors=False)
            except FileNotFoundError:
                # already gone: nothing to retry, do not sleep for nothing
                break
            except (RuntimeError, OSError, PermissionError) as e:
                logger.debug(
                    "When removing the temp profile, a %s occured: %s\nretrying..."
                    % (e.__class__.__name__, e)
                )
            else:
                logger.debug("successfully removed %s" % self.user_data_dir)
                break

            # give the browser a moment to release its files before retrying
            try:
                time.sleep(0.1)
            except OSError:
                pass

    # dereference patcher, so patcher can start cleaning up as well.
    # this must come last, otherwise it will throw 'in use' errors
    self.patcher = None
814
+
815
def __getattribute__(self, item):
    """
    Attribute access hook: when ``self.debug`` is truthy, bound methods are
    returned wrapped in a function that logs the call (name, args, kwargs)
    at debug level before invoking the method. Otherwise plain lookup.
    """
    if not super().__getattribute__("debug"):
        return super().__getattribute__(item)

    import inspect

    original = super().__getattribute__(item)
    # anything that is not a bound method is handed out untouched
    if not inspect.ismethod(original) or inspect.isclass(original):
        return original

    def newfunc(*args, **kwargs):
        logger.debug(
            "calling %s with args %s and kwargs %s\n"
            % (original.__qualname__, args, kwargs)
        )
        return original(*args, **kwargs)

    return newfunc
833
+
834
def __enter__(self):
    """Context-manager entry: the instance itself is the managed object."""
    driver = self
    return driver
836
+
837
def __exit__(self, exc_type, exc_val, exc_tb):
    """
    Context-manager exit: stop the driver service, sleep ``self._delay``
    seconds, start the service again and open a new session.

    Returns None, so an exception raised inside the ``with`` body propagates.
    """
    service = self.service
    service.stop()
    time.sleep(self._delay)
    service.start()
    self.start_session()
842
+
843
def __hash__(self):
    """Hash the instance by the debugger address stored in its options."""
    address = self.options.debugger_address
    return hash(address)
845
+
846
def __dir__(self):
    """Return the attribute names as computed by ``object.__dir__``."""
    names = object.__dir__(self)
    return names
848
+
849
def __del__(self):
    """
    Finalizer: kill the driver process and run ``quit()``.

    Both steps are best-effort. A finalizer can run on a partially
    initialised instance or during interpreter shutdown, so ordinary
    errors are suppressed here; KeyboardInterrupt / SystemExit are no
    longer swallowed (the original used a bare ``except``).
    """
    try:
        self.service.process.kill()
    except Exception:  # noqa
        pass
    try:
        self.quit()
    except Exception:  # noqa
        # nothing sensible can be done with an error at this point
        pass
855
+
856
@classmethod
def _ensure_close(cls, self):
    """Kill the driver service process of *self* when it has one."""
    # needs to be a classmethod so finalize can find the reference
    logger.info("ensuring close")
    service = getattr(self, "service", None)
    process = getattr(service, "process", None)
    if hasattr(process, "kill"):
        self.service.process.kill()
866
+
867
+
868
def find_chrome_executable():
    """
    Finds the chrome, chrome beta, chrome canary, chromium executable

    On POSIX every directory of $PATH is probed, in $PATH order, for the
    known binary names, followed on darwin by the default application
    bundles. Elsewhere the Windows installation folders taken from the
    environment are probed.

    Returns
    -------
    executable_path : str or None
        the normalized path of the first candidate that is an executable
        file, or None when no candidate qualifies

    """
    # a list keeps the probing order deterministic (a set did not)
    candidates = []
    if IS_POSIX:
        binary_names = (
            "google-chrome",
            "chromium",
            "chromium-browser",
            "chrome",
            "google-chrome-stable",
        )
        # PATH may be unset; an empty entry would otherwise probe "/<name>"
        for item in os.environ.get("PATH", "").split(os.pathsep):
            if not item:
                continue
            for subitem in binary_names:
                candidates.append(os.sep.join((item, subitem)))
        if "darwin" in sys.platform:
            candidates.extend(
                [
                    "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
                    "/Applications/Chromium.app/Contents/MacOS/Chromium",
                ]
            )
    else:
        for item in map(
            os.environ.get,
            ("PROGRAMFILES", "PROGRAMFILES(X86)", "LOCALAPPDATA", "PROGRAMW6432"),
        ):
            if item is not None:
                for subitem in (
                    "Google/Chrome/Application",
                ):
                    candidates.append(os.sep.join((item, subitem, "chrome.exe")))
    for candidate in candidates:
        logger.debug('checking if %s exists and is executable' % candidate)
        # isfile: a directory that happens to be named "chrome" is not a browser
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            logger.debug('found! using %s' % candidate)
            return os.path.normpath(candidate)
    return None
flaresolverr/undetected_chromedriver/cdp.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # this module is part of undetected_chromedriver
3
+
4
+ import json
5
+ import logging
6
+
7
+ import requests
8
+ import websockets
9
+
10
+
11
+ log = logging.getLogger(__name__)
12
+
13
+
14
class CDPObject(dict):
    """
    dict whose keys are also readable as attributes (``obj.key``).

    Nested dicts, and dicts found directly inside list values, are
    converted to ``CDPObject`` as well.
    """

    def __init__(self, *a, **k):
        super().__init__(*a, **k)
        # the dict itself serves as the attribute namespace
        self.__dict__ = self
        # iterate over a snapshot: values are replaced while looping
        for key, value in list(self.items()):
            if isinstance(value, dict):
                self[key] = CDPObject(value)
            elif isinstance(value, list):
                # wrap each dict element (the original wrapped `self` here,
                # which recursed until RecursionError)
                self[key] = [
                    CDPObject(entry) if isinstance(entry, dict) else entry
                    for entry in value
                ]

    def __repr__(self):
        tpl = f"{self.__class__.__name__}(\n\t{{}}\n\t)"
        return tpl.format("\n ".join(f"{k} = {v}" for k, v in self.items()))
29
+
30
+
31
class PageElement(CDPObject):
    """A ``CDPObject`` describing one entry of the DevTools target list."""
33
+
34
+
35
class CDP:
    """
    Small client for the DevTools HTTP endpoints (``/json/...``) exposed
    at the debugger address, plus a one-shot websocket ``send``.
    """

    log = logging.getLogger("CDP")

    # endpoint templates, relative to the debugger address
    endpoints = CDPObject(
        {
            "json": "/json",
            "protocol": "/json/protocol",
            "list": "/json/list",
            "new": "/json/new?{url}",
            "activate": "/json/activate/{id}",
            "close": "/json/close/{id}",
        }
    )

    def __init__(self, options: "ChromeOptions"):  # noqa
        """Connect to ``options.debugger_address`` ("host:port") and record
        the id and websocket url of the first listed target."""
        self.server_addr = "http://{0}:{1}".format(*options.debugger_address.split(":"))

        self._reqid = 0
        self._session = requests.Session()
        self._last_resp = None
        self._last_json = None

        resp = self.get(self.endpoints.json)  # noqa
        self.sessionId = resp[0]["id"]
        self.wsurl = resp[0]["webSocketDebuggerUrl"]

    def tab_activate(self, id=None):
        """Activate the tab with the given *id*; without an id the first
        listed tab is used and its websocket url becomes ``self.wsurl``."""
        if not id:
            active_tab = self.tab_list()[0]
            id = active_tab.id  # noqa
            self.wsurl = active_tab.webSocketDebuggerUrl  # noqa
        return self.post(self.endpoints["activate"].format(id=id))

    def tab_list(self):
        """Return the listed targets as ``PageElement`` objects."""
        retval = self.get(self.endpoints["list"])
        return [PageElement(o) for o in retval]

    def tab_new(self, url):
        """Request a new tab for *url*."""
        return self.post(self.endpoints["new"].format(url=url))

    def tab_close_last_opened(self):
        """Close the last listed target whose type is ``"page"``."""
        sessions = self.tab_list()
        opentabs = [s for s in sessions if s["type"] == "page"]
        return self.post(self.endpoints["close"].format(id=opentabs[-1]["id"]))

    async def send(self, method: str, params: dict):
        """Send one CDP command over a fresh websocket connection and store
        the first received message in ``last_json``."""
        self._reqid += 1
        async with websockets.connect(self.wsurl) as ws:
            await ws.send(
                json.dumps({"method": method, "params": params, "id": self._reqid})
            )
            self._last_resp = await ws.recv()
            self._last_json = json.loads(self._last_resp)
            self.log.info(self._last_json)

    def get(self, uri):
        """GET *uri*; return the decoded JSON body, or None when the body
        cannot be decoded as JSON."""
        resp = self._session.get(self.server_addr + uri)
        try:
            self._last_resp = resp
            self._last_json = resp.json()
        except Exception:
            return
        else:
            return self._last_json

    def post(self, uri, data: dict = None):
        """POST *data* (as JSON) to *uri*; return the decoded JSON body, or
        the raw response object when the body is not JSON."""
        if not data:
            data = {}
        resp = self._session.post(self.server_addr + uri, json=data)
        try:
            self._last_resp = resp
            self._last_json = resp.json()
        except Exception:
            return self._last_resp
        else:
            # previously fell through and returned None on success
            return self._last_json

    @property
    def last_json(self):
        """The most recently decoded JSON payload (None initially)."""
        return self._last_json
flaresolverr/undetected_chromedriver/devtool.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from collections.abc import Mapping
3
+ from collections.abc import Sequence
4
+ from functools import wraps
5
+ import os
6
+ import logging
7
+ import threading
8
+ import time
9
+ import traceback
10
+ from typing import Any
11
+ from typing import Awaitable
12
+ from typing import Callable
13
+ from typing import List
14
+ from typing import Optional
15
+
16
+
17
class Structure(dict):
    """
    This is a dict-like object structure, which you should subclass
    Only properties defined in the class context are used on initialization.

    Keys are also available as attributes; nested mappings (also those
    found directly inside sequences) are converted to the same class.
    """

    _store = {}

    def __init__(self, *a, **kw):
        """
        Instantiate a new instance.

        :param a: positional arguments as accepted by ``dict``
        :param kw: keyword arguments as accepted by ``dict``
        """

        super().__init__()

        cls = self.__class__
        for k, v in dict(*a, **kw).items():
            if isinstance(v, Mapping):
                self[k] = cls(v)
            elif isinstance(v, Sequence) and not isinstance(v, (str, bytes)):
                # only mappings are wrapped; scalars inside a sequence are
                # kept as they are (wrapping them used to raise TypeError)
                self[k] = [cls(i) if isinstance(i, Mapping) else i for i in v]
            else:
                self[k] = v
        # the dict itself becomes the attribute namespace
        super().__setattr__("__dict__", self)

    def __getattr__(self, item):
        # only reached when normal lookup failed
        return getattr(super(), item)

    def __getitem__(self, item):
        return super().__getitem__(item)

    def __setattr__(self, key, value):
        # attribute assignment is item assignment
        self.__setitem__(key, value)

    def __setitem__(self, key, value):
        super().__setitem__(key, value)

    def update(self, *a, **kw):
        super().update(*a, **kw)

    def __eq__(self, other):
        """Equal when *other* is a mapping with the same items."""
        if not isinstance(other, Mapping):
            return NotImplemented
        # plain dict comparison also works when values are unhashable
        return dict(self.items()) == dict(other.items())

    def __hash__(self):
        # requires all values to be hashable
        return hash(frozenset(self.items()))

    @classmethod
    def __init_subclass__(cls, **kwargs):
        # every subclass gets its own store
        cls._store = {}

    def _normalize_strings(self):
        """Strip surrounding whitespace from every str value, in place."""
        for k, v in self.copy().items():
            if isinstance(v, (str)):
                self[k] = v.strip()
77
+
78
+
79
def timeout(seconds=3, on_timeout: Optional[Callable[[callable], Any]] = None):
    """
    Decorator factory that arms a ``threading.Timer`` around each call.

    If the call is still running after *seconds*, the timer fires:
    ``on_timeout(func)`` is called when given, otherwise ``TimeoutError``
    is raised inside the timer callback. The timer is cancelled as soon as
    the call returns or raises. The decorated call itself is not aborted
    by this code.
    """

    def wrapper(func):
        def on_expiry():
            if not on_timeout:
                raise TimeoutError("function call timed out")
            on_timeout(func)

        @wraps(func)
        def wrapped(*args, **kwargs):
            timer = threading.Timer(interval=seconds, function=on_expiry)
            timer.start()
            try:
                result = func(*args, **kwargs)
            finally:
                # runs on success and on error alike
                timer.cancel()
            return result

        return wrapped

    return wrapper
102
+
103
+
104
def test():
    """
    Manual smoke test: starts a browser, polls the driver logs from a
    background thread and prints every batch while a page is loaded.

    The collector thread is told to stop, and the driver is shut down,
    even when the page load fails (previously the thread was never
    stopped and kept looping).
    """
    import sys
    import os

    sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
    import undetected_chromedriver as uc
    import threading

    def collector(
        driver: uc.Chrome,
        stop_event: threading.Event,
        on_event_coro: Optional[Callable[[List[str]], Awaitable[Any]]] = None,
        listen_events: Sequence = ("browser", "network", "performance"),
    ):
        """Poll the *listen_events* log types on a separate thread until
        *stop_event* is set, passing non-empty batches to *on_event_coro*."""

        def threaded(driver, stop_event, on_event_coro):
            async def _ensure_service_started():
                # wait while the service process reports a truthy exit code
                while (
                    getattr(driver, "service", False)
                    and getattr(driver.service, "process", False)
                    and driver.service.process.poll()
                ):
                    print("waiting for driver service to come back on")
                    await asyncio.sleep(0.05)
                    # await asyncio.sleep(driver._delay or .25)

            async def get_log_lines(typ):
                await _ensure_service_started()
                return driver.get_log(typ)

            async def looper():
                while not stop_event.is_set():
                    log_lines = []
                    try:
                        for _ in listen_events:
                            try:
                                log_lines += await get_log_lines(_)
                            except Exception:
                                # a failing log type must not stop the others
                                if logging.getLogger().getEffectiveLevel() <= 10:
                                    traceback.print_exc()
                                continue
                        if log_lines and on_event_coro:
                            await on_event_coro(log_lines)
                    except Exception:
                        if logging.getLogger().getEffectiveLevel() <= 10:
                            traceback.print_exc()

            # this thread gets its own event loop
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            loop.run_until_complete(looper())

        t = threading.Thread(target=threaded, args=(driver, stop_event, on_event_coro))
        t.start()

    async def on_event(data):
        print("on_event")
        print("data:", data)

    def func_called(fn):
        def wrapped(*args, **kwargs):
            print(
                "func called! %s (args: %s, kwargs: %s)" % (fn.__name__, args, kwargs)
            )
            # `driver` is bound further down, before the wrapper is ever called
            while driver.service.process and driver.service.process.poll() is not None:
                time.sleep(0.1)
            res = fn(*args, **kwargs)
            print("func completed! (result: %s)" % res)
            return res

        return wrapped

    logging.basicConfig(level=10)

    options = uc.ChromeOptions()
    options.set_capability(
        "goog:loggingPrefs", {"performance": "ALL", "browser": "ALL", "network": "ALL"}
    )

    driver = uc.Chrome(version_main=96, options=options)

    # driver.command_executor._request = timeout(seconds=1)(driver.command_executor._request)
    driver.command_executor._request = func_called(driver.command_executor._request)
    collector_stop = threading.Event()
    collector(driver, collector_stop, on_event)

    try:
        driver.get("https://nowsecure.nl")

        time.sleep(10)
    finally:
        # let the collector loop end before the driver goes away
        collector_stop.set()
        if os.name == "nt":
            driver.close()
        driver.quit()
flaresolverr/undetected_chromedriver/dprocess.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import atexit
2
+ import logging
3
+ import multiprocessing
4
+ import os
5
+ import platform
6
+ import signal
7
+ from subprocess import PIPE
8
+ from subprocess import Popen
9
+ import sys
10
+
11
+
12
+ CREATE_NEW_PROCESS_GROUP = 0x00000200
13
+ DETACHED_PROCESS = 0x00000008
14
+
15
+ REGISTERED = []
16
+
17
+
18
def start_detached(executable, *args):
    """
    Starts a fully independent subprocess (with no parent)

    A short-lived helper process launches the executable and reports the
    pid of the launched program through a one-way pipe before exiting.

    :param executable: executable
    :param args: arguments to the executable, eg: ['--param1_key=param1_val', '-vvv' ...]
    :return: pid of the grandchild process
    :raises RuntimeError: when the helper process exits without reporting a pid
    """
    # one-way pipe: the helper writes the pid, we read it
    reader, writer = multiprocessing.Pipe(False)
    try:
        # do not keep reference
        process = multiprocessing.Process(
            target=_start_detached,
            args=(executable, *args),
            kwargs={"writer": writer},
            daemon=True,
        )
        process.start()
        process.join()
        try:
            # The helper has already exited, so the pid is either buffered in
            # the pipe or will never arrive. Checking first avoids blocking
            # forever in recv() when the launch failed inside the helper.
            if not reader.poll():
                raise RuntimeError(
                    "could not start detached process: %r" % (executable,)
                )
            pid = reader.recv()
        finally:
            process.close()
    finally:
        # close pipes, also on failure
        writer.close()
        reader.close()

    REGISTERED.append(pid)
    return pid
47
+
48
+
49
def _start_detached(executable, *args, writer: multiprocessing.Pipe = None):
    """
    Helper-process entry point: launch ``executable`` in its own
    session / process group, send the pid of the launched program
    through ``writer`` and exit.
    """
    # pick the platform specific way of detaching the child
    if platform.system() == "Windows":
        launch_options = {
            "creationflags": DETACHED_PROCESS | CREATE_NEW_PROCESS_GROUP
        }
    elif sys.version_info < (3, 2):
        # assume posix
        launch_options = {"preexec_fn": os.setsid}
    else:  # Python 3.2+ and Unix
        launch_options = {"start_new_session": True}

    command = [executable]
    command.extend(args)

    # run
    child = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE, **launch_options)

    # send pid to pipe, then end this helper process
    writer.send(child.pid)
    sys.exit()
66
+
67
+
68
+ def _cleanup():
69
+ for pid in REGISTERED:
70
+ try:
71
+ logging.getLogger(__name__).debug("cleaning up pid %d " % pid)
72
+ os.kill(pid, signal.SIGTERM)
73
+ except: # noqa
74
+ pass
75
+
76
+
77
+ atexit.register(_cleanup)
flaresolverr/undetected_chromedriver/options.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # this module is part of undetected_chromedriver
3
+
4
+
5
+ import json
6
+ import os
7
+
8
+ from selenium.webdriver.chromium.options import ChromiumOptions as _ChromiumOptions
9
+
10
+
11
class ChromeOptions(_ChromiumOptions):
    """
    Chromium options with two additions: a ``user_data_dir`` property and
    ``handle_prefs``, which writes the "prefs" experimental option into the
    profile's Preferences file.
    """

    _session = None
    _user_data_dir = None

    @property
    def user_data_dir(self):
        return self._user_data_dir

    @user_data_dir.setter
    def user_data_dir(self, path: str):
        """
        Sets the browser profile folder to use, or creates a new profile
        at given <path>.

        Parameters
        ----------
        path: str
            the path to a chrome profile folder
            if it does not exist, a new profile will be created at given location
        """
        apath = os.path.abspath(path)
        self._user_data_dir = os.path.normpath(apath)

    @staticmethod
    def _undot_key(key, value):
        """turn a (dotted key, value) into a proper nested dict"""
        if "." in key:
            key, rest = key.split(".", 1)
            value = ChromeOptions._undot_key(rest, value)
        return {key: value}

    @staticmethod
    def _merge_nested(a, b):
        """
        merges b into a
        leaf values in a are overwritten with values from b
        """
        for key in b:
            if key in a:
                if isinstance(a[key], dict) and isinstance(b[key], dict):
                    ChromeOptions._merge_nested(a[key], b[key])
                    continue
            a[key] = b[key]
        return a

    def handle_prefs(self, user_data_dir):
        """
        Write the "prefs" experimental option into
        ``<user_data_dir>/Default/Preferences`` and remove it from the
        experimental options. Does nothing when no prefs are set.

        Parameters
        ----------
        user_data_dir: str
            profile folder; falls back to the ``user_data_dir`` property
            when empty
        """
        prefs = self.experimental_options.get("prefs")
        if prefs:
            user_data_dir = user_data_dir or self._user_data_dir
            default_path = os.path.join(user_data_dir, "Default")
            os.makedirs(default_path, exist_ok=True)

            # undot prefs dict keys
            undot_prefs = {}
            for key, value in prefs.items():
                undot_prefs = self._merge_nested(
                    undot_prefs, self._undot_key(key, value)
                )

            prefs_file = os.path.join(default_path, "Preferences")
            # The file is JSON text, so it is handled as UTF-8. Decoding it
            # as latin1 split multi-byte characters into several code points
            # which were then written back as corrupted values.
            if os.path.exists(prefs_file):
                with open(prefs_file, encoding="utf-8", mode="r") as f:
                    # existing settings first, requested prefs win
                    undot_prefs = self._merge_nested(json.load(f), undot_prefs)

            with open(prefs_file, encoding="utf-8", mode="w") as f:
                json.dump(undot_prefs, f)

            # remove the experimental_options to avoid an error
            del self._experimental_options["prefs"]

    @classmethod
    def from_options(cls, options):
        """Create an instance carrying a shallow copy of <options>' attributes."""
        o = cls()
        o.__dict__.update(options.__dict__)
        return o
flaresolverr/undetected_chromedriver/patcher.py ADDED
@@ -0,0 +1,473 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # this module is part of undetected_chromedriver
3
+
4
+ from packaging.version import Version as LooseVersion
5
+ import io
6
+ import json
7
+ import logging
8
+ import os
9
+ import pathlib
10
+ import platform
11
+ import random
12
+ import re
13
+ import shutil
14
+ import string
15
+ import subprocess
16
+ import sys
17
+ import time
18
+ from urllib.request import urlopen
19
+ from urllib.request import urlretrieve
20
+ import zipfile
21
+ from multiprocessing import Lock
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ IS_POSIX = sys.platform.startswith(("darwin", "cygwin", "linux", "linux2", "freebsd"))
26
+
27
+
28
class Patcher(object):
    """
    Obtains a chromedriver executable (download, or a copy of the installed
    one on FreeBSD) and patches the binary in place.
    """

    # created once at import time and shared by all instances
    lock = Lock()
    exe_name = "chromedriver%s"

    platform = sys.platform
    if platform.endswith("win32"):
        d = "~/appdata/roaming/undetected_chromedriver"
    elif "LAMBDA_TASK_ROOT" in os.environ:
        d = "/tmp/undetected_chromedriver"
    elif platform.startswith(("linux", "linux2")):
        d = "~/.local/share/undetected_chromedriver"
    elif platform.endswith("darwin"):
        d = "~/Library/Application Support/undetected_chromedriver"
    else:
        d = "~/.undetected_chromedriver"
    data_path = os.path.abspath(os.path.expanduser(d))

    def __init__(
        self,
        executable_path=None,
        force=False,
        version_main: int = 0,
        user_multi_procs=False,
    ):
        """
        Args:
            executable_path: None = automatic
                             a full file path to the chromedriver executable
            force: False
                    terminate processes which are holding lock
            version_main: 0 = auto
                specify main chrome version (rounded, ex: 82)
            user_multi_procs: False
                when set, auto() reuses the most recent binary found in
                the data folder
        """
        self.force = force
        self._custom_exe_path = False
        prefix = "undetected"
        self.user_multi_procs = user_multi_procs

        try:
            # Try to convert version_main into an integer
            version_main_int = int(version_main)
            # check if version_main_int is less than or equal to e.g 114
            self.is_old_chromedriver = bool(version_main) and version_main_int <= 114
        except (ValueError, TypeError):
            # Check not running inside Docker
            if not os.path.exists("/app/chromedriver"):
                # If the conversion fails, log an error message
                logging.info("version_main cannot be converted to an integer")
            # Set self.is_old_chromedriver to False if the conversion fails
            self.is_old_chromedriver = False

        # Needs to be called before self.exe_name is accessed
        self._set_platform_name()

        if not os.path.exists(self.data_path):
            os.makedirs(self.data_path, exist_ok=True)

        if not executable_path:
            if sys.platform.startswith("freebsd"):
                self.executable_path = os.path.join(self.data_path, self.exe_name)
            else:
                self.executable_path = os.path.join(
                    self.data_path, "_".join([prefix, self.exe_name])
                )

        if not IS_POSIX and executable_path:
            if executable_path[-4:] != ".exe":
                executable_path += ".exe"

        self.zip_path = os.path.join(self.data_path, prefix)

        if not executable_path and not self.user_multi_procs:
            self.executable_path = os.path.abspath(
                os.path.join(".", self.executable_path)
            )

        if executable_path:
            self._custom_exe_path = True
            self.executable_path = executable_path

        # Set the correct repository to download the Chromedriver from
        if self.is_old_chromedriver:
            self.url_repo = "https://chromedriver.storage.googleapis.com"
        else:
            self.url_repo = "https://googlechromelabs.github.io/chrome-for-testing"

        self.version_main = version_main
        self.version_full = None

    def _set_platform_name(self):
        """
        Set the platform and exe name based on the platform undetected_chromedriver is running on
        in order to download the correct chromedriver.
        """
        if self.platform.endswith("win32"):
            self.platform_name = "win32"
            self.exe_name %= ".exe"
        if self.platform.endswith(("linux", "linux2")):
            self.platform_name = "linux64"
            self.exe_name %= ""
        if self.platform.endswith("darwin"):
            if self.is_old_chromedriver:
                self.platform_name = "mac64"
            else:
                self.platform_name = "mac-x64"
            self.exe_name %= ""
        if self.platform.startswith("freebsd"):
            self.platform_name = "freebsd"
            self.exe_name %= ""

    def auto(self, executable_path=None, force=False, version_main=None, _=None):
        """
        Make a patched driver binary available at self.executable_path.

        Args:
            executable_path: use (and patch) this binary instead of
                obtaining one
            force: kill processes holding the binary when it can not be
                removed
            version_main: main chrome version to fetch the driver for

        Returns:
            True when a patched binary is in place. For a user supplied
            binary the result of patch_exe() is returned when patching was
            needed, otherwise None. None as well when no chromedriver is
            installed on FreeBSD.
        """
        p = pathlib.Path(self.data_path)
        if self.user_multi_procs:
            # use the class level lock: a lock created on the spot is never
            # contended and therefore protects nothing
            with self.lock:
                files = list(p.rglob("*chromedriver*"))
                # nothing present yet: fall through and obtain a binary
                if files:
                    most_recent = max(files, key=lambda f: f.stat().st_mtime)
                    files.remove(most_recent)
                    for stale in files:
                        stale.unlink()
                    if self.is_binary_patched(most_recent):
                        self.executable_path = str(most_recent)
                        return True

        if executable_path:
            self.executable_path = executable_path
            self._custom_exe_path = True

        if self._custom_exe_path:
            if not self.is_binary_patched(self.executable_path):
                return self.patch_exe()
            return

        if version_main:
            self.version_main = version_main
        if force is True:
            self.force = force

        if self.platform_name == "freebsd":
            chromedriver_path = shutil.which("chromedriver")

            # shutil.which gives None when nothing is found on PATH
            if (
                not chromedriver_path
                or not os.path.isfile(chromedriver_path)
                or not os.access(chromedriver_path, os.X_OK)
            ):
                logging.error("Chromedriver not installed!")
                return

            version_path = os.path.join(
                os.path.dirname(self.executable_path), "version.txt"
            )

            process = os.popen(f'"{chromedriver_path}" --version')
            chromedriver_version = process.read().split(' ')[1].split(' ')[0]
            process.close()

            current_version = None
            if os.path.isfile(version_path) or os.access(version_path, os.X_OK):
                with open(version_path, 'r') as f:
                    current_version = f.read()

            if current_version != chromedriver_version:
                logging.info("Copying chromedriver executable...")
                shutil.copy(chromedriver_path, self.executable_path)
                os.chmod(self.executable_path, 0o755)

                with open(version_path, 'w') as f:
                    f.write(chromedriver_version)

                logging.info("Chromedriver executable copied!")
        else:
            try:
                os.unlink(self.executable_path)
            except PermissionError:
                if self.force:
                    self.force_kill_instances(self.executable_path)
                    return self.auto(force=not self.force)
                try:
                    if self.is_binary_patched():
                        # assumes already running AND patched
                        return True
                except PermissionError:
                    pass
                # return False
            except FileNotFoundError:
                pass

            release = self.fetch_release_number()
            self.version_main = release.major
            self.version_full = release
            self.unzip_package(self.fetch_package())

        return self.patch()

    def driver_binary_in_use(self, path: str = None) -> bool:
        """
        naive test to check if a found chromedriver binary is
        currently in use

        Args:
            path: a string or PathLike object to the binary to check.
                  if not specified, we check use this object's executable_path

        Returns:
            True when seeking or reading the opened file raised a
            PermissionError, False when both worked. None when the file
            could not be opened at all.

        Raises:
            OSError: when the file does not exist
        """
        if not path:
            path = self.executable_path
        p = pathlib.Path(path)

        if not p.exists():
            raise OSError("file does not exist: %s" % p)
        try:
            with open(p, mode="a+b") as fs:
                exc = []
                try:
                    fs.seek(0, 0)
                except PermissionError as e:
                    exc.append(e)  # since some systems apparently allow seeking
                    # we conduct another test
                try:
                    fs.readline()
                except PermissionError as e:
                    exc.append(e)

                return bool(exc)
        except Exception:
            # logger.exception("whoops ", e)
            pass

    def cleanup_unused_files(self):
        """Remove every entry matching *undetected* in the data folder; failures are ignored."""
        p = pathlib.Path(self.data_path)
        for item in list(p.glob("*undetected*")):
            try:
                item.unlink()
            except OSError:
                # still in use, or not a plain file: leave it
                pass

    def patch(self):
        """Patch the binary and report whether the patch marker is present."""
        self.patch_exe()
        return self.is_binary_patched()

    def fetch_release_number(self):
        """
        Gets the latest major version available, or the latest release of self.version_main if set explicitly.
        :return: version string
        :rtype: LooseVersion
        """
        # Endpoint for old versions of Chromedriver (114 and below)
        if self.is_old_chromedriver:
            path = f"/latest_release_{self.version_main}"
            path = path.upper()
            logger.debug("getting release number from %s" % path)
            with urlopen(self.url_repo + path) as conn:
                return LooseVersion(conn.read().decode())

        # Endpoint for new versions of Chromedriver (115+)
        if not self.version_main:
            # Fetch the latest version
            path = "/last-known-good-versions-with-downloads.json"
            logger.debug("getting release number from %s" % path)
            with urlopen(self.url_repo + path) as conn:
                response = conn.read().decode()

            last_versions = json.loads(response)
            return LooseVersion(last_versions["channels"]["Stable"]["version"])

        # Fetch the latest minor version of the major version provided
        path = "/latest-versions-per-milestone-with-downloads.json"
        logger.debug("getting release number from %s" % path)
        with urlopen(self.url_repo + path) as conn:
            response = conn.read().decode()

        major_versions = json.loads(response)
        return LooseVersion(
            major_versions["milestones"][str(self.version_main)]["version"]
        )

    def parse_exe_version(self):
        """Return the version found in the driver binary, or None when no line matches."""
        with io.open(self.executable_path, "rb") as f:
            for line in iter(lambda: f.readline(), b""):
                match = re.search(rb"platform_handle\x00content\x00([0-9.]*)", line)
                if match:
                    return LooseVersion(match[1].decode())

    def fetch_package(self):
        """
        Downloads ChromeDriver from source

        :return: path to downloaded file
        """
        zip_name = f"chromedriver_{self.platform_name}.zip"
        if self.is_old_chromedriver:
            download_url = "%s/%s/%s" % (self.url_repo, str(self.version_full), zip_name)
        else:
            zip_name = zip_name.replace("_", "-", 1)
            download_url = "https://storage.googleapis.com/chrome-for-testing-public/%s/%s/%s"
            download_url %= (str(self.version_full), self.platform_name, zip_name)

        logger.debug("downloading from %s" % download_url)
        return urlretrieve(download_url)[0]

    def unzip_package(self, fp):
        """
        Extracts the downloaded archive, moves the driver binary to
        self.executable_path and removes the archive and the extraction
        folder.

        :param fp: path to the downloaded zip file
        :return: path to unpacked executable
        """
        exe_path = self.exe_name
        if not self.is_old_chromedriver:
            # The new chromedriver unzips into its own folder
            zip_name = f"chromedriver-{self.platform_name}"
            exe_path = os.path.join(zip_name, self.exe_name)

        logger.debug("unzipping %s" % fp)
        try:
            os.unlink(self.zip_path)
        except (FileNotFoundError, OSError):
            pass

        os.makedirs(self.zip_path, mode=0o755, exist_ok=True)
        with zipfile.ZipFile(fp, mode="r") as zf:
            zf.extractall(self.zip_path)
        os.rename(os.path.join(self.zip_path, exe_path), self.executable_path)
        os.remove(fp)
        # The function has to be called: a bare ``shutil.rmtree`` is a no-op
        # and left the extraction folder behind on every download.
        shutil.rmtree(self.zip_path, ignore_errors=True)
        os.chmod(self.executable_path, 0o755)
        return self.executable_path

    @staticmethod
    def force_kill_instances(exe_name):
        """
        kills running instances.
        :param: executable name to kill, may be a path as well

        :return: True on success else False
        """
        exe_name = os.path.basename(exe_name)
        if IS_POSIX:
            try:
                # argument list instead of a shell string: names containing
                # spaces or shell metacharacters are passed through verbatim
                result = subprocess.run(
                    ["pidof", exe_name], capture_output=True, text=True, check=True
                )
                pids = result.stdout.strip().split()
                if pids:
                    subprocess.run(["kill", "-9"] + pids, check=False)
                    return True
                return False  # No PIDs found
            except subprocess.CalledProcessError:  # pidof returns 1 if no process found
                return False
            except Exception as e:
                logger.debug(f"Error killing process on POSIX: {e}")
                return False
        else:
            try:
                # TASKKILL /F /IM chromedriver.exe
                result = subprocess.run(
                    ["taskkill", "/f", "/im", exe_name], check=False, capture_output=True
                )
                # taskkill returns 0 if process was killed, 128 if not found.
                return result.returncode == 0
            except Exception as e:
                logger.debug(f"Error killing process on Windows: {e}")
                return False

    @staticmethod
    def gen_random_cdc():
        """Return 27 random ascii letters as bytes."""
        cdc = random.choices(string.ascii_letters, k=27)
        return "".join(cdc).encode()

    def is_binary_patched(self, executable_path=None):
        """
        Tell whether the binary contains the marker written by patch_exe().

        :param executable_path: file to check, defaults to self.executable_path
        :return: True when the marker is found; False otherwise, also when
                 the file does not exist
        """
        executable_path = executable_path or self.executable_path
        try:
            with io.open(executable_path, "rb") as fh:
                return fh.read().find(b"undetected chromedriver") != -1
        except FileNotFoundError:
            return False

    def patch_exe(self):
        """
        Replace the injected ``{window.cdc...;}`` code block inside the
        binary with a console.log statement padded to the same length, so
        the file size does not change.
        """
        start = time.perf_counter()
        logger.info("patching driver executable %s" % self.executable_path)
        with io.open(self.executable_path, "r+b") as fh:
            content = fh.read()
            # match_injected_codeblock = re.search(rb"{window.*;}", content)
            match_injected_codeblock = re.search(rb"\{window\.cdc.*?;\}", content)
            if match_injected_codeblock:
                target_bytes = match_injected_codeblock[0]
                new_target_bytes = (
                    b'{console.log("undetected chromedriver 1337!")}'.ljust(
                        len(target_bytes), b" "
                    )
                )
                new_content = content.replace(target_bytes, new_target_bytes)
                if new_content == content:
                    logger.warning(
                        "something went wrong patching the driver binary. could not find injection code block"
                    )
                else:
                    logger.debug(
                        "found block:\n%s\nreplacing with:\n%s"
                        % (target_bytes, new_target_bytes)
                    )
                fh.seek(0)
                fh.write(new_content)
        logger.debug(
            "patching took us {:.2f} seconds".format(time.perf_counter() - start)
        )

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, self.executable_path)

    def __del__(self):
        """
        Remove the driver binary this instance manages.

        Nothing is removed for a user supplied binary or in multi-process
        mode. Removal is retried for at most 3 seconds.
        """
        # getattr: __del__ also runs for instances whose __init__ failed early
        if getattr(self, "_custom_exe_path", True):
            # if the driver binary is specified by user
            # we assume it is important enough to not delete it
            return
        if getattr(self, "user_multi_procs", False):
            return
        path = getattr(self, "executable_path", None)
        if not path:
            return

        timeout = 3  # stop trying after this many seconds
        started = time.monotonic()
        # keep trying while we are within the timeout; the former condition
        # was inverted, so the loop body never ran
        while time.monotonic() - started < timeout:
            # we don't want to wait until the end of time
            try:
                os.unlink(path)
                logger.debug("successfully unlinked %s" % path)
                break
            except FileNotFoundError:
                # must come before OSError (its base class): already gone
                break
            except (OSError, RuntimeError):
                time.sleep(0.01)
flaresolverr/undetected_chromedriver/reactor.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # this module is part of undetected_chromedriver
3
+
4
+ import asyncio
5
+ import json
6
+ import logging
7
+ import threading
8
+
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class Reactor(threading.Thread):
    """
    Daemon thread which polls the driver's "performance" log and passes
    the decoded messages to registered handlers.
    """

    def __init__(self, driver: "Chrome"):
        super().__init__()

        self.driver = driver
        self.loop = asyncio.new_event_loop()

        self.lock = threading.Lock()
        # setting this event ends the listen loop
        self.event = threading.Event()
        self.daemon = True
        self.handlers = {}

    def add_event_handler(self, method_name, callback: callable):
        """
        Register <callback> for messages whose method equals <method_name>
        (compared case-insensitively). A handler registered as "*" receives
        every message and takes precedence over the specific handlers.

        Parameters
        ----------
        method_name: str
            example "Network.responseReceived"

        callback: callable
            callable which accepts 1 parameter: the message object dictionary

        Returns
        -------

        """
        with self.lock:
            self.handlers[method_name.lower()] = callback

    @property
    def running(self):
        return not self.event.is_set()

    def run(self):
        try:
            asyncio.set_event_loop(self.loop)
            self.loop.run_until_complete(self.listen())
        except Exception as e:
            logger.warning("Reactor.run() => %s", e)

    async def _wait_service_started(self):
        """Sleep in steps for as long as the driver service process reports a truthy poll() value."""
        while True:
            with self.lock:
                service = getattr(self.driver, "service", None)
                process = getattr(service, "process", None) if service else None
                must_wait = bool(process and process.poll())
            if not must_wait:
                break
            # sleep after releasing the lock, so other threads are not
            # blocked on it for the duration of the delay
            await asyncio.sleep(self.driver._delay or 0.25)

    async def listen(self):
        """Poll the performance log about once per second and dispatch each entry."""
        while self.running:
            await self._wait_service_started()
            await asyncio.sleep(1)

            try:
                with self.lock:
                    log_entries = self.driver.get_log("performance")

                for entry in log_entries:
                    obj_serialized: str = entry.get("message")
                    obj = json.loads(obj_serialized)
                    message = obj.get("message")
                    method = message.get("method")

                    # handlers run in the default executor
                    if "*" in self.handlers:
                        await self.loop.run_in_executor(
                            None, self.handlers["*"], message
                        )
                    elif method.lower() in self.handlers:
                        await self.loop.run_in_executor(
                            None, self.handlers[method.lower()], message
                        )

            except Exception as e:
                if "invalid session id" in str(e):
                    pass
                else:
                    # the message needs a placeholder for its argument,
                    # otherwise logging reports a formatting error instead
                    logger.debug("exception ignored : %s", e)
flaresolverr/undetected_chromedriver/webelement.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ from selenium.webdriver.common.by import By
4
+ import selenium.webdriver.remote.webelement
5
+
6
+
7
class WebElement(selenium.webdriver.remote.webelement.WebElement):
    """WebElement with a reconnecting click and child lookup."""

    def click_safe(self):
        """Click the element, then have the parent driver reconnect."""
        super().click()
        self._parent.reconnect(0.1)

    def children(
        self, tag=None, recursive=False
    ) -> List[selenium.webdriver.remote.webelement.WebElement]:
        """
        returns the child elements of the current element

        :param tag: str, if supplied, returns <tag> nodes only
        :param recursive: bool, if set, collects the children of all levels
                          instead of the direct children only
        """
        script = "return [... arguments[0].children]"
        if tag:
            script = script + ".filter( node => node.tagName === '%s')" % tag.upper()
        if recursive:
            found = _recursive_children(self, tag)
        else:
            found = self._parent.execute_script(script, self)
        return list(found)
25
+
26
+
27
class UCWebElement(WebElement):
    """
    Custom WebElement class which makes it easier to view elements when
    working in an interactive environment.

    standard webelement repr:
    <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")>

    using this WebElement class:
    UCWebElement <a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">

    """

    def __init__(self, parent, id_):
        super().__init__(parent, id_)
        # filled on first access of ``attrs``
        self._attrs = None

    @property
    def attrs(self):
        """The element's attributes as a name -> value mapping, fetched once."""
        # ``is None`` rather than truthiness: an element without attributes
        # yields an empty mapping, which must count as fetched as well
        if self._attrs is None:
            self._attrs = self._parent.execute_script(
                """
                var items = {};
                for (index = 0; index < arguments[0].attributes.length; ++index)
                {
                 items[arguments[0].attributes[index].name] = arguments[0].attributes[index].value
                };
                return items;
                """,
                self,
            )
        return self._attrs

    def __repr__(self):
        strattrs = " ".join([f'{k}="{v}"' for k, v in self.attrs.items()])
        if strattrs:
            strattrs = " " + strattrs
        return f"{self.__class__.__name__} <{self.tag_name}{strattrs}>"
65
+
66
+
67
+ def _recursive_children(element, tag: str = None, _results=None):
68
+ """
69
+ returns all children of <element> recursively
70
+
71
+ :param element: `WebElement` object.
72
+ find children below this <element>
73
+
74
+ :param tag: str = None.
75
+ if provided, return only <tag> elements. example: 'a', or 'img'
76
+ :param _results: do not use!
77
+ """
78
+ results = _results or set()
79
+ for element in element.children():
80
+ if tag:
81
+ if element.tag_name == tag:
82
+ results.add(element)
83
+ else:
84
+ results.add(element)
85
+ results |= _recursive_children(element, tag, results)
86
+ return results
flaresolverr/utils.py ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import platform
5
+ import re
6
+ import shutil
7
+ import sys
8
+ import tempfile
9
+ import urllib.parse
10
+
11
+ from selenium.webdriver.chrome.webdriver import WebDriver
12
+ import undetected_chromedriver as uc
13
+
14
+ FLARESOLVERR_VERSION = None
15
+ PLATFORM_VERSION = None
16
+ CHROME_EXE_PATH = None
17
+ CHROME_MAJOR_VERSION = None
18
+ USER_AGENT = None
19
+ XVFB_DISPLAY = None
20
+ PATCHED_DRIVER_PATH = None
21
+
22
+
23
def get_config_log_html() -> bool:
    """Return True when the LOG_HTML environment variable is 'true' (any case); unset means False."""
    raw_value = os.getenv('LOG_HTML', 'false')
    return raw_value.lower() == 'true'
25
+
26
+
27
def get_config_headless() -> bool:
    """Return True when the HEADLESS environment variable is 'true' (any case); unset means True."""
    raw_value = os.getenv('HEADLESS', 'true')
    return raw_value.lower() == 'true'
29
+
30
+
31
def get_config_disable_media() -> bool:
    """Return True when the DISABLE_MEDIA environment variable is 'true' (any case); unset means False."""
    raw_value = os.getenv('DISABLE_MEDIA', 'false')
    return raw_value.lower() == 'true'
33
+
34
+
35
def get_flaresolverr_version() -> str:
    """
    Return the 'version' entry of package.json, reading the file on the
    first call only and caching the value in FLARESOLVERR_VERSION.

    The file is looked up in the parent folder of this module first and
    next to this module otherwise.
    """
    global FLARESOLVERR_VERSION
    if FLARESOLVERR_VERSION is None:
        module_dir = os.path.dirname(os.path.abspath(__file__))
        package_path = os.path.join(module_dir, os.pardir, 'package.json')
        if not os.path.isfile(package_path):
            package_path = os.path.join(module_dir, 'package.json')
        with open(package_path) as f:
            package_info = json.loads(f.read())
        FLARESOLVERR_VERSION = package_info['version']
    return FLARESOLVERR_VERSION
46
+
47
def get_current_platform() -> str:
    """Return os.name, cached in PLATFORM_VERSION after the first call."""
    global PLATFORM_VERSION
    if PLATFORM_VERSION is None:
        PLATFORM_VERSION = os.name
    return PLATFORM_VERSION
53
+
54
+
55
def create_proxy_extension(proxy: dict) -> str:
    """
    Write a Chrome extension which configures a fixed proxy server and
    answers its authentication requests.

    :param proxy: dict with the keys 'url' (scheme, host and port are
                  taken from it), 'username' and 'password'
    :return: path of a new temporary folder holding manifest.json and
             background.js; the folder is not removed by this function
    """
    parsed_url = urllib.parse.urlparse(proxy['url'])
    # json.dumps produces a quoted and escaped string literal which is valid
    # JavaScript too. Without it, a quote or backslash inside one of these
    # values would end the literal early and corrupt the generated script.
    scheme = json.dumps(parsed_url.scheme)
    host = json.dumps(parsed_url.hostname)
    port = parsed_url.port
    username = json.dumps(proxy['username'])
    password = json.dumps(proxy['password'])
    manifest_json = """
    {
        "version": "1.0.0",
        "manifest_version": 3,
        "name": "Chrome Proxy",
        "permissions": [
            "proxy",
            "tabs",
            "storage",
            "webRequest",
            "webRequestAuthProvider"
        ],
        "host_permissions": [
          "<all_urls>"
        ],
        "background": {
          "service_worker": "background.js"
        },
        "minimum_chrome_version": "76.0.0"
    }
    """

    # the string placeholders are unquoted: the quotes come from json.dumps
    background_js = """
    var config = {
        mode: "fixed_servers",
        rules: {
            singleProxy: {
                scheme: %s,
                host: %s,
                port: %d
            },
            bypassList: ["localhost"]
        }
    };

    chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});

    function callbackFn(details) {
        return {
            authCredentials: {
                username: %s,
                password: %s
            }
        };
    }

    chrome.webRequest.onAuthRequired.addListener(
        callbackFn,
        { urls: ["<all_urls>"] },
        ['blocking']
    );
    """ % (
        scheme,
        host,
        port,
        username,
        password
    )

    proxy_extension_dir = tempfile.mkdtemp()

    with open(os.path.join(proxy_extension_dir, "manifest.json"), "w") as f:
        f.write(manifest_json)

    with open(os.path.join(proxy_extension_dir, "background.js"), "w") as f:
        f.write(background_js)

    return proxy_extension_dir
130
+
131
+
132
def get_webdriver(proxy: dict = None) -> WebDriver:
    """Launch a Chrome instance through undetected_chromedriver and return it.

    Args:
        proxy: Optional proxy settings. When it contains ``url``, ``username``
            and ``password`` a temporary proxy extension is generated with
            ``create_proxy_extension`` and loaded into Chrome; when it only
            contains ``url`` the value is passed via ``--proxy-server``.

    Returns:
        The started ``uc.Chrome`` driver.

    Raises:
        Exception: Whatever was raised while starting Chrome; the error is
            logged and re-raised unchanged.
    """
    global PATCHED_DRIVER_PATH, USER_AGENT
    logging.debug('Launching web browser...')

    uses_proxy_extension = bool(proxy) and all(
        key in proxy for key in ('url', 'username', 'password'))

    # undetected_chromedriver
    options = uc.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--window-size=1280,1024')  # Smaller window for less overhead
    options.add_argument('--disable-search-engine-choice-screen')
    options.add_argument('--disable-setuid-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--no-zygote')
    options.add_argument('--disable-gpu')  # Disable GPU for faster headless boot
    options.add_argument('--mute-audio')
    options.add_argument('--disable-notifications')
    options.add_argument('--disable-popup-blocking')
    if not uses_proxy_extension:
        # --disable-extensions would also keep the proxy-auth extension from
        # loading, so it is only passed when no extension has to be loaded.
        options.add_argument('--disable-extensions')
    options.add_argument('--disable-blink-features=AutomationControlled')

    # Force headless and invisibility
    options.add_argument('--headless=new')

    if platform.machine().startswith(('arm', 'aarch')):
        options.add_argument('--disable-gpu-sandbox')
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--ignore-ssl-errors')

    language = os.environ.get('LANG', None)
    if language is not None:
        options.add_argument('--accept-lang=%s' % language)

    # Fix for Chrome 117 | https://github.com/FlareSolverr/FlareSolverr/issues/910
    if USER_AGENT is not None:
        options.add_argument('--user-agent=%s' % USER_AGENT)

    proxy_extension_dir = None
    try:
        if uses_proxy_extension:
            proxy_extension_dir = create_proxy_extension(proxy)
            options.add_argument("--disable-features=DisableLoadExtensionCommandLineSwitch")
            options.add_argument("--load-extension=%s" % os.path.abspath(proxy_extension_dir))
        elif proxy and 'url' in proxy:
            proxy_url = proxy['url']
            logging.debug("Using webdriver proxy: %s", proxy_url)
            options.add_argument('--proxy-server=%s' % proxy_url)

        # note: headless mode is detected (headless = True)
        # we launch the browser in head-full mode with the window hidden
        windows_headless = os.name == 'nt'
        headless = get_config_headless()
        if headless and os.name != 'nt':
            start_xvfb_display()

        # Override for absolute invisibility on Windows
        if os.name == 'nt':
            options.add_argument('--hide-scrollbars')
            options.add_argument('--disable-logging')
            options.add_argument('--log-level=3')

        # if we are inside the Docker container, we avoid downloading the driver
        driver_exe_path = None
        version_main = None
        if os.path.exists("/app/chromedriver"):
            # running inside Docker
            driver_exe_path = "/app/chromedriver"
        else:
            version_main = get_chrome_major_version()
            if PATCHED_DRIVER_PATH is not None:
                driver_exe_path = PATCHED_DRIVER_PATH

        # detect chrome path
        browser_executable_path = get_chrome_exe_path()

        # Clean up the undetected_chromedriver cache on Windows to avoid
        # WinError 183. Best effort: a missing or locked file is not fatal.
        if os.name == 'nt':
            stale_exe = os.path.join(os.environ.get('APPDATA', ''),
                                     'undetected_chromedriver',
                                     'undetected_chromedriver.exe')
            try:
                os.remove(stale_exe)
            except OSError:
                pass

        # downloads and patches the chromedriver
        # if we don't set driver_executable_path it downloads, patches, and deletes the driver each time
        try:
            driver = uc.Chrome(options=options, browser_executable_path=browser_executable_path,
                               driver_executable_path=driver_exe_path, version_main=version_main,
                               windows_headless=windows_headless, headless=headless)
        except Exception as e:
            logging.error("Error starting Chrome: %s" % e)
            # No point in continuing if we cannot retrieve the driver
            raise
    finally:
        # clean up proxy extension directory, also when Chrome failed to start
        if proxy_extension_dir is not None:
            shutil.rmtree(proxy_extension_dir, ignore_errors=True)

    # save the patched driver to avoid re-downloads
    if driver_exe_path is None:
        try:
            target_path = os.path.join(driver.patcher.data_path, driver.patcher.exe_name)
            if target_path != driver.patcher.executable_path:
                # On Windows, we might get WinError 183 if the file is locked or exists
                try:
                    os.remove(target_path)
                except OSError:
                    pass
                shutil.copy(driver.patcher.executable_path, target_path)
            PATCHED_DRIVER_PATH = target_path
        except Exception as e:
            logging.warning(f"Failed to save patched driver: {e}")

    return driver
255
+
256
+
257
def get_chrome_exe_path() -> str:
    """Return the path of the Chrome binary, resolving it once and caching it.

    Lookup order: a ``chrome/chrome`` binary next to this module (Linux
    bundle), then ``chrome/chrome.exe`` (Windows bundle), then whatever
    ``uc.find_chrome_executable()`` reports.

    Raises:
        Exception: If the bundled Linux binary exists but is not executable.
    """
    global CHROME_EXE_PATH
    if CHROME_EXE_PATH is None:
        bundle_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'chrome')
        linux_binary = os.path.join(bundle_dir, "chrome")
        windows_binary = os.path.join(bundle_dir, "chrome.exe")

        if os.path.exists(linux_binary):
            # linux pyinstaller bundle
            if not os.access(linux_binary, os.X_OK):
                raise Exception(f'Chrome binary "{linux_binary}" is not executable. '
                                f'Please, extract the archive with "tar xzf <file.tar.gz>".')
            CHROME_EXE_PATH = linux_binary
        elif os.path.exists(windows_binary):
            # windows pyinstaller bundle
            CHROME_EXE_PATH = windows_binary
        else:
            # system installation
            CHROME_EXE_PATH = uc.find_chrome_executable()
    return CHROME_EXE_PATH
277
+
278
+
279
def get_chrome_major_version() -> str:
    """Return the major version of the installed Chrome as a string.

    The value is computed once and cached in ``CHROME_MAJOR_VERSION``.
    On Windows the version is read from the executable, then the registry,
    then the installation folder. Elsewhere ``<chrome> --version`` is run.

    Returns:
        The text before the first ``.`` of the version, e.g. ``'104'``.
    """
    global CHROME_MAJOR_VERSION
    if CHROME_MAJOR_VERSION is not None:
        return CHROME_MAJOR_VERSION

    if os.name == 'nt':
        # Example: '104.0.5112.79'
        try:
            complete_version = extract_version_nt_executable(get_chrome_exe_path())
        except Exception:
            try:
                complete_version = extract_version_nt_registry()
            except Exception:
                # Example: '104.0.5112.79'
                complete_version = extract_version_nt_folder()
    else:
        import subprocess

        chrome_path = get_chrome_exe_path()
        # An argument list avoids the shell, so paths containing quotes or
        # shell metacharacters cannot break or alter the command.
        try:
            completed = subprocess.run([chrome_path, '--version'],
                                       stdout=subprocess.PIPE,
                                       universal_newlines=True)
            # Example 1: 'Chromium 104.0.5112.79 Arch Linux\n'
            # Example 2: 'Google Chrome 104.0.5112.79 Arch Linux\n'
            complete_version = completed.stdout
        except OSError:
            # Same result the shell-based call produced for a missing binary.
            complete_version = ''

    CHROME_MAJOR_VERSION = complete_version.split('.')[0].split(' ')[-1]
    return CHROME_MAJOR_VERSION
304
+
305
+
306
def extract_version_nt_executable(exe_path: str) -> str:
    """Read the ``FileVersion`` string from a Windows executable's resources.

    Args:
        exe_path: Path of the PE file to inspect.

    Returns:
        The decoded ``FileVersion`` entry of the first string table.
    """
    import pefile

    pe = pefile.PE(exe_path, fast_load=True)
    try:
        # Only the resource directory is needed for the version info.
        pe.parse_data_directories(
            directories=[pefile.DIRECTORY_ENTRY["IMAGE_DIRECTORY_ENTRY_RESOURCE"]]
        )
        return pe.FileInfo[0][0].StringTable[0].entries[b"FileVersion"].decode('utf-8')
    finally:
        # Release the file mapping held by pefile.
        pe.close()
313
+
314
+
315
def extract_version_nt_registry() -> str:
    """Read the Chrome version from the Windows uninstall registry key.

    Returns:
        The text following the last ``DisplayVersion REG_SZ`` entry, stripped.

    Raises:
        ValueError: If the ``reg query`` output has no ``DisplayVersion``
            entry, so the caller can fall back to another lookup.
    """
    with os.popen(
            'reg query "HKLM\\SOFTWARE\\Wow6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\Google Chrome"'
    ) as stream:
        output = stream.read()

    # Match on the field names instead of a fixed character offset so the
    # amount of column padding in the output does not matter.
    versions = re.findall(r'DisplayVersion\s+REG_SZ[ \t]*([^\r\n]*)', output)
    if not versions:
        raise ValueError('DisplayVersion not found in registry output')
    return versions[-1].strip()
326
+
327
+
328
def extract_version_nt_folder() -> str:
    """Find a Chrome version from the installation folder names on Windows.

    Looks in ``Program Files`` and then ``Program Files (x86)`` for a
    sub-directory whose name contains a four-part version number.

    Returns:
        The first version found, or ``''`` when there is none.
    """
    version_pattern = r'\d+\.\d+\.\d+\.\d+'
    for suffix in ('', ' (x86)'):
        app_dir = 'C:\\Program Files' + suffix + '\\Google\\Chrome\\Application'
        if not os.path.isdir(app_dir):
            continue
        subdirs = [entry.path for entry in os.scandir(app_dir) if entry.is_dir()]
        for subdir in subdirs:
            found = re.search(version_pattern, os.path.basename(subdir))
            if found and found.group():
                # Found a Chrome version.
                return found.group(0)
    return ''
342
+
343
+
344
def get_user_agent(driver=None) -> str:
    """Return the browser User-Agent, resolving it once and caching it.

    Args:
        driver: Optional web driver to query. When omitted a new one is
            created with ``get_webdriver()``.

    Returns:
        The User-Agent with any ``HEADLESS`` marker removed.

    Raises:
        Exception: If the User-Agent cannot be retrieved; the original error
            is attached as the cause.

    Note:
        Unless the cached value is returned, the driver is always shut down
        before returning, including a driver supplied by the caller.
    """
    global USER_AGENT
    if USER_AGENT is not None:
        return USER_AGENT

    try:
        if driver is None:
            driver = get_webdriver()
        USER_AGENT = driver.execute_script("return navigator.userAgent")
        # Fix for Chrome 117 | https://github.com/FlareSolverr/FlareSolverr/issues/910
        USER_AGENT = re.sub('HEADLESS', '', USER_AGENT, flags=re.IGNORECASE)
        return USER_AGENT
    except Exception as e:
        raise Exception("Error getting browser User-Agent. " + str(e)) from e
    finally:
        if driver is not None:
            try:
                if PLATFORM_VERSION == "nt":
                    driver.close()
            finally:
                # Always quit, even if close() failed, so no browser is leaked.
                driver.quit()
363
+
364
+
365
def start_xvfb_display():
    """Start the shared Xvfb virtual display if none has been created yet."""
    global XVFB_DISPLAY
    if XVFB_DISPLAY is not None:
        # A display was already created by an earlier call.
        return

    from xvfbwrapper import Xvfb

    XVFB_DISPLAY = Xvfb()
    XVFB_DISPLAY.start()
371
+
372
+
373
def object_to_dict(_object):
    """Convert an object into a plain dict via a JSON round trip.

    Objects that JSON cannot serialise are replaced by their ``__dict__``.
    Top-level keys starting with ``__`` are left out of the result.
    """
    serialized = json.dumps(_object, default=lambda obj: obj.__dict__)
    plain = json.loads(serialized)

    public = {}
    for key, value in plain.items():
        # remove hidden fields
        if key.startswith('__'):
            continue
        public[key] = value
    return public
keep_alive.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Keep-Alive Service to prevent Render.com from sleeping
3
+ Pings the server every 10 minutes to maintain activity
4
+ """
5
+ import asyncio
6
+ import httpx
7
+ import logging
8
+ from datetime import datetime
9
+
10
+ logger = logging.getLogger("keep_alive")
11
+
12
class KeepAliveService:
    """Periodically request ``<base_url>/health`` so the host stays active."""

    def __init__(self, base_url: str = "http://localhost:7860", ping_interval: float = 600):
        """Store the target URL and the delay between pings.

        Args:
            base_url: Root URL of the service to ping.
            ping_interval: Seconds to wait between pings (default 10 minutes).
        """
        self.base_url = base_url
        self.running = False
        self.ping_interval = ping_interval

    async def start(self):
        """Run the ping loop until ``stop()`` clears the running flag.

        Each iteration sleeps for ``ping_interval`` seconds first, then pings.
        """
        self.running = True
        logger.info(f"🔄 Keep-Alive service started (pinging every {self.ping_interval / 60:g} minutes)")

        while self.running:
            try:
                await asyncio.sleep(self.ping_interval)
                await self._ping()
            except Exception as e:
                logger.error(f"Keep-Alive error: {e}")

    async def _ping(self):
        """Send one GET to the health endpoint and log the outcome."""
        # Tolerate a trailing slash on base_url so the path is not doubled.
        health_url = f"{self.base_url.rstrip('/')}/health"
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.get(health_url)
                if response.status_code == 200:
                    logger.info(f"✅ Keep-Alive ping successful at {datetime.now().strftime('%H:%M:%S')}")
                else:
                    logger.warning(f"⚠️ Keep-Alive ping returned {response.status_code}")
        except Exception as e:
            logger.warning(f"Keep-Alive ping failed: {e}")

    def stop(self):
        """Ask the ping loop to exit after its current iteration."""
        self.running = False
        logger.info("Keep-Alive service stopped")
46
+
47
+ keep_alive = KeepAliveService()
main.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import time
3
+ from typing import List, Optional
4
+ from fastapi import FastAPI, Request, HTTPException, Query
5
+ from fastapi.responses import JSONResponse, FileResponse, StreamingResponse, RedirectResponse
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ from fastapi.middleware.gzip import GZipMiddleware
8
+ import httpx
9
+ from scraper.engine import scraper
10
+ from downloader import downloader
11
+ import os
12
+ import re
13
+ from urllib.parse import unquote, quote
14
+ from fastapi.staticfiles import StaticFiles
15
+ from database import init_db
16
+ from keep_alive import keep_alive
17
+ import asyncio
18
+ import io
19
+
20
+ # Configure logging
21
+ logging.basicConfig(
22
+ level=logging.INFO,
23
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
24
+ datefmt="%Y-%m-%d %H:%M:%S",
25
+ )
26
+ logger = logging.getLogger("backend")
27
+
28
+ app = FastAPI(title="MEIH Movies API", version="2.0.0")
29
+
30
+ # --- Simple Caching Layer ---
31
class MemoryCache:
    """In-process key/value store whose entries expire after a TTL."""

    def __init__(self):
        # key -> (absolute expiry timestamp, stored value)
        self._cache = {}

    def get(self, key: str):
        """Return the value stored under *key*, or ``None`` if absent or expired.

        An expired entry is removed as a side effect of the lookup.
        """
        entry = self._cache.get(key)
        if not entry:
            return None

        expires_at, payload = entry
        if time.time() >= expires_at:
            del self._cache[key]
            return None
        return payload

    def set(self, key: str, data, ttl_seconds: int = 600):  # Default 10 mins
        """Store *data* under *key* for *ttl_seconds* seconds."""
        expires_at = time.time() + ttl_seconds
        self._cache[key] = (expires_at, data)
47
+
48
+ cache = MemoryCache()
49
+
50
async def warm_scraper():
    """Prime the scraper with one home-page fetch; failures are only logged."""
    logger.info("🔥 Warming up scraper in background...")
    startup_grace_seconds = 5
    try:
        # Give services a few more seconds to be truly ready
        await asyncio.sleep(startup_grace_seconds)
        await scraper.fetch_home(page=1)
    except Exception as e:
        logger.warning(f"⚠️ Scraper warmup failed (will retry on first request): {e}")
    else:
        logger.info("✅ Scraper warmed up and cookies synced")
60
+
61
# Strong references to fire-and-forget tasks. The event loop only keeps weak
# references, so an unreferenced task may be garbage-collected mid-execution.
_background_tasks = set()


def _spawn_background(coro):
    """Schedule *coro* as a task and keep it referenced until it finishes."""
    task = asyncio.create_task(coro)
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)
    return task


@app.on_event("startup")
async def startup_event():
    """Initialise the database and launch the background services.

    Outside Hugging Face the keep-alive loop, the scraper warm-up and (when
    the scraper provides ``_turbo_prefetch``) the prefetcher are started.
    On Hugging Face only the warm-up is started.
    """
    await init_db()
    logger.info("🚀 Database initialized and ready")

    # Detect if running on Hugging Face
    is_hf = os.environ.get("SPACE_ID") is not None or os.environ.get("HF_SPACE") is not None

    if not is_hf:
        # Start Keep-Alive service (only for non-HF environments)
        _spawn_background(keep_alive.start())
        # Start Warm-up service
        _spawn_background(warm_scraper())
        # Start Nitro Pre-fetch (Populates cache in background)
        if hasattr(scraper, '_turbo_prefetch'):
            _spawn_background(scraper._turbo_prefetch())
        logger.info("🔄 Background services activated")
    else:
        logger.info("🤗 Running on Hugging Face - Lightweight mode enabled")
        # Just warm up the scraper without heavy pre-fetching
        _spawn_background(warm_scraper())
82
+
83
+
84
# Enable CORS for frontend
# NOTE(review): every origin, method and header is allowed here together with
# allow_credentials=True — confirm this is intended for the deployed frontend.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# GZip middleware is registered with minimum_size=1000.
app.add_middleware(GZipMiddleware, minimum_size=1000)
93
+
94
@app.get("/")
async def root():
    """Report service status, scraper warm-up state and the current cache keys."""
    engine_state = "COLD"
    if scraper._cookies_synced:
        engine_state = "WARM"

    return {
        "status": "online",
        "engine": "Nitro-Power Larooza Engine",
        "engine_status": engine_state,
        "cached_keys": list(cache._cache),
    }
102
+
103
@app.get("/latest")
async def get_latest(page: int = 1):
    """Return the home listing for *page*, served from cache when possible.

    Raises:
        HTTPException: 500 with the error text when the scraper fails.
    """
    key = f"latest_{page}"
    listing = cache.get(key)
    if not listing:
        # Cache miss: fetch from the scraper and remember non-empty results.
        try:
            listing = await scraper.fetch_home(page=page)
            if listing:
                cache.set(key, listing)
        except Exception as e:
            logger.error(f"Error fetching latest: {e}")
            raise HTTPException(status_code=500, detail=str(e))
    return listing
118
+
119
@app.get("/category/{cat_id}")
async def get_category(cat_id: str, page: int = 1):
    """Return one page of a category listing, served from cache when possible.

    Raises:
        HTTPException: 500 with the error text when the scraper fails.
    """
    key = f"cat_{cat_id}_{page}"
    listing = cache.get(key)
    if not listing:
        # Cache miss: fetch from the scraper and remember non-empty results.
        try:
            listing = await scraper.fetch_category(cat_id, page=page)
            if listing:
                cache.set(key, listing)
        except Exception as e:
            logger.error(f"Error fetching category {cat_id}: {e}")
            raise HTTPException(status_code=500, detail=str(e))
    return listing
134
+
135
@app.get("/search")
async def search(q: str):
    """Return search results for *q*, cached for one hour when non-empty.

    Raises:
        HTTPException: 500 with the error text when the scraper fails.
    """
    key = f"search_{q}"
    matches = cache.get(key)
    if not matches:
        try:
            matches = await scraper.search(q)
            if matches:
                cache.set(key, matches, ttl_seconds=3600)  # Search results cache longer
        except Exception as e:
            logger.error(f"Error searching for {q}: {e}")
            raise HTTPException(status_code=500, detail=str(e))
    return matches
150
+
151
@app.get("/details/{safe_id}")
async def get_details(safe_id: str):
    """Return the details for *safe_id*, cached for 24 hours once found.

    Responds with a 404 JSON body when the scraper returns nothing.

    Raises:
        HTTPException: 500 with the error text when the scraper fails.
    """
    key = f"details_{safe_id}"
    hit = cache.get(key)
    if hit:
        return hit

    try:
        found = await scraper.fetch_details(safe_id)
        if found:
            cache.set(key, found, ttl_seconds=86400)  # Details cache for 24h
            return found
        return JSONResponse(status_code=404, content={"error": "Content not found"})
    except Exception as e:
        logger.error(f"Error fetching details for {safe_id}: {e}")
        raise HTTPException(status_code=500, detail=str(e))
168
+
169
@app.get("/proxy/image")
async def proxy_image(url: str):
    """Fetch a remote image and serve it, using a one-week disk cache.

    Args:
        url: Address of the image; it is URL-unquoted before use.

    Returns:
        The cached file when a fresh copy exists, otherwise the downloaded
        bytes. Upstream failures are returned as JSON with the upstream
        status, 504 on timeout, or 500 on any other error.

    Raises:
        HTTPException: 400 when *url* is empty or is not an http(s) URL.
    """
    import hashlib
    import tempfile
    from urllib.parse import urlsplit

    if not url:
        raise HTTPException(status_code=400, detail="URL is required")

    url = unquote(url)

    # NOTE(review): the target host is caller-controlled; only the scheme is
    # validated here, internal addresses are not filtered.
    if urlsplit(url).scheme not in ("http", "https"):
        raise HTTPException(status_code=400, detail="Only http(s) URLs are supported")

    # --- Image Disk Cache ---
    cache_dir = os.path.join(base_dir, "cache", "images")
    os.makedirs(cache_dir, exist_ok=True)

    # The MD5 digest is only used as a file name, not for security.
    url_hash = hashlib.md5(url.encode()).hexdigest()
    cache_path = os.path.join(cache_dir, f"{url_hash}.img")

    # 1. Serve from cache when the copy is younger than one week
    if os.path.exists(cache_path):
        if time.time() - os.path.getmtime(cache_path) < 604800:
            return FileResponse(
                cache_path,
                media_type="image/jpeg",  # Approximate, browser will handle
                headers={"Cache-Control": "public, max-age=31536000"}
            )

    try:
        # Using follow_redirects and a longer timeout for images
        async with httpx.AsyncClient(timeout=20.0, follow_redirects=True) as client:
            resp = await client.get(url, headers={"User-Agent": scraper.headers["User-Agent"]})
            if resp.status_code == 200:
                content = resp.content

                # Write to a temp file and rename it into place so a
                # concurrent request never reads a half-written cache entry.
                # A caching failure must not fail an otherwise good response.
                tmp_path = None
                try:
                    fd, tmp_path = tempfile.mkstemp(dir=cache_dir, suffix=".tmp")
                    with os.fdopen(fd, "wb") as f:
                        f.write(content)
                    os.replace(tmp_path, cache_path)
                except OSError as cache_error:
                    logger.warning(f"Could not cache image {url}: {cache_error}")
                    if tmp_path is not None:
                        try:
                            os.remove(tmp_path)
                        except OSError:
                            pass

                # Return the image stream directly
                return StreamingResponse(
                    io.BytesIO(content),
                    media_type=resp.headers.get("Content-Type", "image/jpeg"),
                    headers={"Cache-Control": "public, max-age=31536000"}
                )
            else:
                logger.warning(f"Failed to proxy image {url} (Status: {resp.status_code})")
                return JSONResponse(status_code=resp.status_code, content={"error": f"Failed (Status {resp.status_code})"})
    except httpx.TimeoutException:
        logger.warning(f"Timeout proxying image: {url}")
        return JSONResponse(status_code=504, content={"error": "Image timeout"})
    except Exception as e:
        logger.error(f"Proxy image error for {url}: {type(e).__name__} - {str(e)}")
        return JSONResponse(status_code=500, content={"error": str(e)})
220
+
221
@app.get("/download/info")
async def get_download_info(url: str):
    """Return what the downloader reports for *url*.

    Raises:
        HTTPException: 500 with the error text when the lookup fails.
    """
    try:
        return await downloader.get_info(url)
    except Exception as e:
        logger.error(f"Download info error for {url}: {e}")
        raise HTTPException(status_code=500, detail=str(e))
229
+
230
+
231
+
232
@app.get("/download/file")
async def download_file(url: str, filename: str = "video.mp4"):
    """Handle a file download, proxying it when the source host requires it.

    Hosts on the proxy list (IP-bound or hotlink-protected) are streamed
    through this server as an attachment; every other URL is answered with
    a redirect to the source.

    Args:
        url: Address of the file; it is URL-unquoted before use.
        filename: Name offered to the client for the download.

    Raises:
        HTTPException: 400 when *url* is empty or is not an http(s) URL.
    """
    from urllib.parse import urlsplit

    if not url:
        raise HTTPException(status_code=400, detail="URL is required")

    url = unquote(url)

    parsed = urlsplit(url)
    if parsed.scheme not in ("http", "https"):
        raise HTTPException(status_code=400, detail="Only http(s) URLs are supported")

    # Domains that REQUIRE proxying (IP-bound or strict hotlink protection)
    proxy_domains = (
        "googlevideo.com",
        "manifest.googlevideo.com",
        "larozavideo.net",
        "larooza.site",
        "larooza.mom",
        "laroza-tv.net",
        "youtube.com",
        "youtu.be",
    )

    # Compare against the host only, so a domain name appearing in the path
    # or query string of an unrelated URL does not trigger proxying.
    host = (parsed.hostname or "").lower()
    should_proxy = any(host == domain or host.endswith("." + domain) for domain in proxy_domains)

    if should_proxy:
        logger.info(f"🛡️ Proxying download: {filename[:50]}...")

        # ASCII fallback name for Content-Disposition: non-ASCII runs become
        # "_", then quotes, backslashes and control characters are replaced
        # so the quoted header value stays well-formed.
        ascii_filename = re.sub(r'[^\x00-\x7F]+', '_', filename)
        ascii_filename = re.sub(r'[\x00-\x1f\x7f"\\]', '_', ascii_filename)
        encoded_filename = quote(filename, safe='')

        async def stream_generator():
            async with httpx.AsyncClient(timeout=None, follow_redirects=True) as client:
                try:
                    async with client.stream("GET", url, headers={"User-Agent": scraper.headers["User-Agent"]}) as resp:
                        if resp.status_code != 200:
                            logger.error(f"Proxy source returned {resp.status_code}")
                            return

                        async for chunk in resp.aiter_bytes(chunk_size=1024*1024):
                            yield chunk
                except Exception as e:
                    logger.error(f"Streaming error: {e}")

        # Get initial headers to find content length/type if possible
        try:
            async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
                head_resp = await client.head(url, headers={"User-Agent": scraper.headers["User-Agent"]})
                content_length = head_resp.headers.get("Content-Length")
                content_type = head_resp.headers.get("Content-Type", "video/mp4")
        except Exception:
            # Best effort only; not a bare except, so cancellation propagates.
            content_length = None
            content_type = "video/mp4"

        headers = {
            "Content-Disposition": f"attachment; filename=\"{ascii_filename}\"; filename*=UTF-8''{encoded_filename}",
            "Access-Control-Expose-Headers": "Content-Disposition"
        }
        if content_length:
            headers["Content-Length"] = content_length

        return StreamingResponse(stream_generator(), media_type=content_type, headers=headers)

    # For other sources, a simple redirect is much faster and saves server bandwidth
    return RedirectResponse(url=url)
298
+
299
@app.get("/health")
async def health():
    """Report backend status plus whether the local FlareSolverr answers.

    FlareSolverr is reported ``ONLINE`` only when its ``/health`` endpoint
    returns 200 within five seconds; any failure leaves it ``OFFLINE``.
    """
    # Check FlareSolverr
    fs_status = "OFFLINE"
    try:
        # Increase timeout as solver might be busy
        async with httpx.AsyncClient(timeout=5.0) as client:
            resp = await client.get("http://localhost:8191/health")
            if resp.status_code == 200:
                fs_status = "ONLINE"
    except Exception:
        # Not a bare except: task cancellation must still propagate.
        pass

    return {
        "backend": "ONLINE",
        "flaresolverr": fs_status,
        "scraper_sync": scraper._cookies_synced,
        "timestamp": time.time()
    }
318
+
319
# --- Frontend Mounting ---
# This ensures that our React app is served directly by FastAPI in production
# Check both relative and same-level structures for Docker/Local compatibility
base_dir = os.path.dirname(__file__)
frontend_path = os.path.join(base_dir, "meih-netflix-clone", "dist")

if not os.path.exists(frontend_path):
    # Try one level up (local dev structure)
    frontend_path = os.path.join(base_dir, "..", "meih-netflix-clone", "dist")

if os.path.exists(frontend_path):
    # Assets are usually in dist/assets and referenced as /assets/ in Vite
    assets_path = os.path.join(frontend_path, "assets")
    if os.path.exists(assets_path):
        app.mount("/assets", StaticFiles(directory=assets_path), name="assets")

    # Canonical root used to confirm a requested file is inside the build.
    frontend_root = os.path.realpath(frontend_path)

    @app.get("/{full_path:path}")
    async def serve_frontend(full_path: str):
        """Serve a built frontend file, or ``index.html`` for client routes.

        Paths that look like backend routes get a JSON 404. Requests that
        resolve outside the build directory fall back to ``index.html``.
        """
        # Unknown backend-style paths get a 404 instead of the SPA shell
        if full_path.startswith(("api/", "latest", "category/", "search", "details", "proxy", "download", "health")):
            return JSONResponse(status_code=404, content={"error": "Not Found"})

        # Resolve ".." segments, absolute paths and symlinks before serving,
        # so the request cannot read files outside the build directory.
        file_path = os.path.realpath(os.path.join(frontend_root, full_path))
        inside_root = file_path.startswith(frontend_root + os.sep)
        if inside_root and os.path.isfile(file_path):
            return FileResponse(file_path)
        # Otherwise, serve the main index.html for React Router to handle
        return FileResponse(os.path.join(frontend_path, "index.html"))
else:
    logger.warning(f"Frontend dist folder not found at {frontend_path}. Frontend serving disabled.")
348
+
349
+ if __name__ == "__main__":
350
+ import uvicorn
351
+ # Use port 7860 for Hugging Face Spaces compatibility
352
+ uvicorn.run(app, host="0.0.0.0", port=7860)
package.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "meih-movies-api",
3
+ "version": "1.0.0",
4
+ "description": "Nitro-powered movie scraping API",
5
+ "main": "main.py",
6
+ "scripts": {
7
+ "start": "bash start.sh"
8
+ },
9
+ "engines": {
10
+ "node": ">=18.x"
11
+ }
12
+ }
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ httpx[http2]
4
+ beautifulsoup4
5
+ curl-cffi
6
+ yt-dlp
7
+ pydantic
8
+ python-multipart
9
+ aiohttp
10
+ aiosqlite
11
+ certifi
12
+ websockets
13
+ packaging
14
+ setuptools
scraper/engine.py ADDED
@@ -0,0 +1,996 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import httpx
3
+ import re
4
+ import logging
5
+ import base64
6
+ import random
7
+ import os
8
+ import time
9
+ from typing import List, Dict, Optional
10
+ from bs4 import BeautifulSoup
11
+ from curl_cffi.requests import AsyncSession
12
+ from urllib.parse import urljoin, quote
13
+ from scraper.proxy_fetcher import proxy_fetcher
14
# Clean, strictly used logger. It is created before the optional imports
# because their ImportError fallback logs a warning through it.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("scraper")

# Optional dependencies for heavy bypasses
try:
    import undetected_chromedriver as uc
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    HAS_SELENIUM = True
except ImportError:
    HAS_SELENIUM = False
    logger.warning("⚠️ Selenium/Undetected-Chromedriver not installed. Nuclear bypass will be disabled.")
28
+
29
+ class LaroozaScraper:
30
+ MIRRORS = ["https://q.larozavideo.net", "https://larooza.mom", "https://larooza.site", "https://m.laroza-tv.net"]
31
+ BASE_URL = "https://q.larozavideo.net"
32
+ TARGET_URL = "https://q.larozavideo.net/newvideos1.php"
33
+ _blacklisted_mirrors = {}
34
+
35
+ # Permanent Aliases -> Keywords search
36
+ CATEGORY_KEYWORDS = {
37
+ "arabic-movies": ["أفلام عربية", "افلام عربية", "افلام عربي", "arabic-movies33"],
38
+ "english-movies": ["افلام اجنبية", "أفلام أجنبية", "افلام اجنبي", "أجنبي", "all_movies_13"],
39
+ "indian-movies": ["افلام هندي", "أفلام هندية", "هندي", "indian-movies9"],
40
+ "anime-movies": ["افلام انمي", "أفلام أنمي", "انمي", "anime-movies-7"],
41
+ "dubbed-movies": ["افلام مدبلجة", "أفلام مدبلجة", "مدبلج", "7-aflammdblgh"],
42
+ "turkish-series": ["مسلسلات تركية", "تركي", "turkish-3isk-seriess47"],
43
+ "arabic-series": ["مسلسلات عربية", "عربي", "arabic-series46"],
44
+ "english-series": ["مسلسلات اجنبية", "أجنبي", "english-series10"],
45
+ "ramadan-2025": ["رمضان 2025", "13-ramadan-2025"],
46
+ "ramadan-2024": ["رمضان 2024", "28-ramadan-2024"],
47
+ "ramadan-2023": ["رمضان 2023", "10-ramadan-2023"],
48
+ "asian-movies": ["آسيوي", "اسيوي", "آسيوية", "6-asian-movies"],
49
+ "asian-series": ["مسلسلات اسياوية", "اسياوية", "6-asya"],
50
+ "turkish-movies": ["افلام تركية", "أفلام تركية", "8-aflam3isk"],
51
+ "anime-series": ["مسلسلات انمي", "كرتون", "6-anime-series"],
52
+ "indian-series": ["مسلسلات هندية", "11indian-series"],
53
+ "tv-programs": ["برامج تلفزيون", "tv-programs12"],
54
+ "plays": ["مسرحيات", "masrh-5"]
55
+ }
56
+
57
+ # Manual Fallbacks for reliability
58
+ HARDCODED_FALLBACKS = {
59
+ "arabic-movies": "arabic-movies33",
60
+ "english-movies": "all_movies_13",
61
+ "indian-movies": "indian-movies9",
62
+ "asian-movies": "6-asian-movies",
63
+ "anime-movies": "anime-movies-7",
64
+ "dubbed-movies": "7-aflammdblgh",
65
+ "turkish-movies": "8-aflam3isk",
66
+ "arabic-series": "arabic-series46",
67
+ "ramadan-2025": "13-ramadan-2025",
68
+ "ramadan-2024": "28-ramadan-2024",
69
+ "ramadan-2023": "10-ramadan-2023",
70
+ "english-series": "english-series10",
71
+ "turkish-series": "turkish-3isk-seriess47",
72
+ "indian-series": "11indian-series",
73
+ "tv-programs": "tv-programs12",
74
+ "plays": "masrh-5",
75
+ "anime-series": "6-anime-series",
76
+ "asian-series": "6-asya"
77
+ }
78
+
79
+ def __init__(self):
80
+ # Primary fetcher: curl-cffi (Fastest, TLS Impersonation)
81
+ # Using chrome120 and disabling SSL verify for maximum compatibility
82
+ self.session = AsyncSession(impersonate="chrome120", timeout=30, verify=False)
83
+ self._cookies_synced = False
84
+ self._last_pw_solve = 0
85
+ self._ua_synced = None
86
+ self._chrome_version = None
87
+ self._domain_lock = asyncio.Lock()
88
+ self._warming_lock = asyncio.Lock()
89
+ self._proxy_refresh_interval = 1800 # 30 minutes
90
+ self._proxy_refresh_time = 0
91
+ self._semaphore = asyncio.Semaphore(5) # Reduced from 15 for stability
92
+ self._optimization_started = False
93
+ self._is_prefetching = False
94
+ self._domain_detected = False
95
+
96
+
97
+ # Hybrid Configuration
98
+ self.REMOTE_SOLVER_URL = "https://meih-movies-api.onrender.com/remote-fetch"
99
+ self.IS_RENDER = os.environ.get("RENDER") is not None
100
+ self.IS_HUGGINGFACE = os.environ.get("SPACE_ID") is not None
101
+
102
+ # Free Proxy Pool for Hugging Face (to bypass IP bans)
103
+ self._free_proxy_pool = []
104
+ self._proxy_pool_last_refresh = 0
105
+
106
+ self.headers = {
107
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
108
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
109
+ "Accept-Language": "ar,en-US;q=0.9,en;q=0.8",
110
+ "Accept-Encoding": "gzip, deflate, br",
111
+ "Referer": "https://www.google.com/",
112
+ "Connection": "keep-alive",
113
+ "Sec-Fetch-Dest": "document",
114
+ "Sec-Fetch-Mode": "navigate",
115
+ "Sec-Fetch-Site": "cross-site",
116
+ }
117
+ self._session_initialized = False
118
+ self._session_warmed_at = 0
119
+ self._httpx_client = None
120
+
121
+ # --- Proxy Rotation System ---
122
+ proxy_str = os.getenv("PROXY_LIST", "")
123
+ self.proxies = [p.strip() for p in proxy_str.split(",") if p.strip()]
124
+ self._current_proxy_idx = 0
125
+ if self.proxies:
126
+ logger.info(f"✓ Proxy rotation enabled with {len(self.proxies)} endpoints")
127
+ self._category_map = {}
128
+ self._last_discovery = 0
129
+ self._discovery_lock = asyncio.Lock()
130
+
131
+ # --- Mirror & Performance ---
132
+ self._cache = {} # {url: (timestamp, data)}
133
+ self._cache_ttl = 3600 # 1 hour for data
134
+ self._free_proxies = []
135
+ self._optimization_started = False
136
+ self._uc_lock = asyncio.Lock()
137
+ self._solver_lock = asyncio.Lock() # Guard against multiple solvers
138
+
139
+ # We'll start optimization on the first request to avoid "no running loop" error
140
+
141
async def _optimize_connection(self):
    """Probe every mirror concurrently and switch to the fastest one.

    Each mirror's ``/newvideos1.php`` page is requested with a 1.5 s timeout.
    The quickest mirror answering HTTP 200 becomes ``BASE_URL`` /
    ``TARGET_URL`` and that result is treated as fresh for one hour. When no
    mirror answers, the current defaults are kept and the freshness stamp is
    back-dated so that a later call re-probes after about 300 seconds.
    """
    now = time.time()
    # The stamp is only created by a previous probe, hence the getattr default.
    last_probe = getattr(self, '_fastest_mirror_detected_at', None)
    if last_probe is not None and now - last_probe < 3600:
        return

    mirrors = list(self.MIRRORS)
    if not mirrors:
        # Nothing to probe; indexing the empty result list used to raise IndexError.
        logger.warning("⚠️ No mirrors responded quickly, using default.")
        self._fastest_mirror_detected_at = now - 3300  # Retry sooner
        return

    logger.info("🔍 Testing mirror speeds (Optimized)...")

    async def test_mirror(mirror):
        """Return ``(elapsed_seconds, mirror)``; 999 marks an unusable mirror."""
        start = time.time()
        try:
            # Very aggressive timeout: this is discovery, not a real fetch.
            async with httpx.AsyncClient(timeout=1.5, follow_redirects=True, verify=False) as client:
                resp = await client.get(f"{mirror}/newvideos1.php")
                if resp.status_code == 200:
                    return (time.time() - start, mirror)
        except Exception as e:
            # `except Exception` rather than a bare except, so that task
            # cancellation is not silently swallowed.
            logger.debug(f"Mirror probe failed for {mirror}: {e}")
        return (999, mirror)

    results = await asyncio.gather(*(test_mirror(m) for m in mirrors))
    # Tuples compare by elapsed time first, exactly like the former sort()[0].
    min_time, fastest_mirror = min(results)

    if min_time < 999:
        logger.info(f"⚡ Fastest mirror: {fastest_mirror} ({min_time:.2f}s)")
        self.BASE_URL = fastest_mirror
        self.TARGET_URL = f"{fastest_mirror}/newvideos1.php"
        self._fastest_mirror_detected_at = now
    else:
        logger.warning("⚠️ No mirrors responded quickly, using default.")
        self._fastest_mirror_detected_at = now - 3300  # Retry sooner
176
+
177
+
178
async def _refresh_free_proxies(self):
    """Rebuild ``_free_proxy_pool`` from public proxy-list APIs.

    Does nothing unless running on Hugging Face or Render, and at most once
    every 300 seconds. Up to ten ``host:port`` entries are taken from each
    source and stored with an ``http://`` prefix; the pool is only replaced
    when at least one entry was collected.

    NOTE(review): a second method with this same name appears later in this
    class and would replace this definition at class-creation time - confirm
    which of the two is meant to be live.
    """
    on_cloud = self.IS_HUGGINGFACE or self.IS_RENDER
    if not on_cloud:
        return

    now = time.time()
    if now - self._proxy_pool_last_refresh < 300:  # Refresh every 5 minutes
        return

    logger.info("🔄 Refreshing free proxy pool...")
    # Stamp first, so a failing source does not trigger a refresh on every call.
    self._proxy_pool_last_refresh = now

    proxy_sources = [
        "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all",
        "https://www.proxy-list.download/api/v1/get?type=http",
    ]

    collected = []
    for source in proxy_sources:
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                resp = await client.get(source)
                if resp.status_code != 200:
                    continue
                # Take first 10 from each source
                head = (entry.strip() for entry in resp.text.strip().split('\n')[:10])
                collected.extend(f"http://{entry}" for entry in head if ':' in entry)
        except Exception as e:
            logger.warning(f"Failed to fetch proxies from {source}: {e}")

    if not collected:
        logger.warning("⚠️ No free proxies available")
        return

    self._free_proxy_pool = collected
    logger.info(f"✅ Loaded {len(collected)} free proxies")
215
+
216
async def _discover_categories(self, force=False):
    """Rebuild the alias -> category-id map from links on the homepage.

    The map is refreshed at most once per hour unless ``force`` is true.
    Every ``<a>`` whose href contains ``cat=`` is examined; the first link
    whose text contains one of an alias's keywords supplies that alias's id.
    The stored map and timestamp are only updated when something was mapped.
    """
    async with self._discovery_lock:
        if not force:
            age = time.time() - self._last_discovery
            if age < 3600:  # Cache for 1 hour
                return

        logger.info("Refreshing category mapping...")
        homepage = await self._get_html(self.BASE_URL)
        if not homepage:
            return

        discovered = {}
        for anchor in BeautifulSoup(homepage, 'html.parser').find_all('a', href=True):
            link = anchor['href']
            if 'cat=' not in link:
                continue

            # Value of the last "cat=" parameter, up to the next "&".
            cat_id = link.split('cat=')[-1].split('&')[0]
            label = anchor.get_text(strip=True).lower()

            for alias, keywords in self.CATEGORY_KEYWORDS.items():
                if alias in discovered:
                    continue  # first matching link wins
                if any(k in label for k in keywords):
                    discovered[alias] = cat_id

        if discovered:
            self._category_map = discovered
            self._last_discovery = time.time()
            logger.info(f"✓ Mapped {len(discovered)} categories: {discovered}")
247
+
248
async def _resolve_cat_id(self, cat_id: str) -> str:
    """Translate a category alias into a real id.

    Lookup order: the dynamically discovered map, then the hard-coded
    fallbacks. A value found in neither is returned unchanged.
    """
    await self._discover_categories()

    dynamic = self._category_map
    if cat_id in dynamic:
        return dynamic[cat_id]

    # Hard-coded table is only consulted when discovery had no entry.
    return self.HARDCODED_FALLBACKS.get(cat_id, cat_id)
260
+
261
async def _warm_session(self):
    """Mark the target domain and the session as ready.

    No request is made here: the method only logs the target URL the first
    time it runs and raises the two readiness flags.
    """
    already_detected = self._domain_detected
    if not already_detected:
        # Defaults were set elsewhere; this just confirms them in the log.
        logger.info(f"🚀 Targeting exclusive source: {self.TARGET_URL}")
        self._domain_detected = True

    # Mark as initialised even without a successful fetch; a solver handles that.
    self._session_initialized = self._session_initialized or True
270
+
271
async def _refresh_free_proxies(self):
    """Reload the validated free-proxy list once the refresh interval elapsed.

    NOTE(review): an earlier method with this same name appears above in
    this class; being defined later, this is the one that would take effect,
    which leaves ``_free_proxy_pool`` unpopulated - confirm that is intended.
    """
    elapsed = time.time() - self._proxy_refresh_time
    if elapsed <= self._proxy_refresh_interval:
        return

    logger.info("Refreshing free proxy pool...")
    self._free_proxies = await proxy_fetcher.get_working_proxies(max_count=15)
    self._proxy_refresh_time = time.time()
    logger.info(f"Loaded {len(self._free_proxies)} working free proxies")
278
+
279
def _get_proxy(self) -> Optional[str]:
    """Return the next proxy in round-robin order, or None when none exist.

    Pool priority: the cloud free-proxy pool (only on Hugging Face / Render),
    then the legacy validated free proxies, then the configured ``PROXY_LIST``
    entries. One shared counter drives the rotation for all three pools.
    """
    on_cloud = self.IS_HUGGINGFACE or self.IS_RENDER
    if on_cloud and self._free_proxy_pool:
        pool = self._free_proxy_pool
    elif self._free_proxies:
        pool = self._free_proxies
    elif self.proxies:
        pool = self.proxies
    else:
        return None

    chosen = pool[self._current_proxy_idx % len(pool)]
    self._current_proxy_idx += 1
    return chosen
297
+
298
+
299
async def _get_html_with_undetected_chrome(self, url: str) -> Optional[str]:
    """Fetch ``url`` with a headless undetected-chromedriver browser.

    Last-resort path. Only one browser runs at a time (``_uc_lock``). The
    blocking Selenium work is pushed to the default executor so the event
    loop stays responsive. On success the browser's user agent is copied
    into ``self.headers``.

    Returns:
        The page source, or None when Selenium is unavailable or the browser
        run failed.
    """
    if not HAS_SELENIUM:
        logger.error("❌ Cannot use UC: Selenium/Undetected-Chromedriver not installed.")
        return None

    async with self._uc_lock:
        logger.info(f"💣 Launching Undetected-Chrome NUCLEAR Bypass for {url}...")

        def get_chrome_version():
            """Read Chrome's major version from the Windows registry (120 if unknown)."""
            try:
                import winreg
                key = winreg.OpenKey(winreg.HKEY_CURRENT_USER, r'Software\Google\Chrome\BLBeacon')
                version, _ = winreg.QueryValueEx(key, 'version')
                return int(version.split('.')[0])
            except Exception:
                # Covers non-Windows hosts (ImportError) and missing keys (OSError).
                return 120  # Fallback

        if not self._chrome_version:
            self._chrome_version = get_chrome_version()

        def chrome_task():
            """Blocking browser session; runs in a worker thread."""
            driver = None
            try:
                options = uc.ChromeOptions()
                for flag in (
                    '--headless',
                    '--no-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-gpu',
                    '--window-size=1280,1024',
                    '--mute-audio',
                    '--disable-notifications',
                    '--disable-popup-blocking',
                    '--hide-scrollbars',
                    '--disable-logging',
                    '--log-level=3',
                    '--no-first-run',
                    '--no-default-browser-check',
                    '--no-pings',
                    '--disable-blink-features=AutomationControlled',
                ):
                    options.add_argument(flag)

                # Disable images for maximum speed
                prefs = {
                    'profile.managed_default_content_settings.images': 2,
                    'profile.default_content_settings.images': 2
                }
                options.add_experimental_option('prefs', prefs)

                driver = uc.Chrome(options=options, version_main=self._chrome_version)
                driver.set_page_load_timeout(60)

                logger.info(f"💣 UC Fetching: {url}")
                driver.get(url)

                # Fixed wait so either the content or the challenge can finish.
                time.sleep(10)  # Heavy sleep for UC

                html = driver.page_source

                # Keep later plain HTTP requests on the same user agent.
                ua = driver.execute_script("return navigator.userAgent")
                if ua:
                    self.headers["User-Agent"] = ua

                return html
            except Exception as e:
                logger.error(f"Undetected-Chrome failure: {e}")
                return None
            finally:
                if driver:
                    try:
                        driver.quit()
                    except Exception as e:
                        # Best effort: a failing quit must not mask the result.
                        logger.debug(f"Undetected-Chrome quit failed: {e}")

        # get_running_loop() is the supported call from inside a coroutine.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, chrome_task)
374
+
375
async def _get_html_with_flaresolverr(self, url: str) -> Optional[str]:
    """Fetch ``url`` through the local FlareSolverr service.

    Calls are serialised by ``_solver_lock``. After a successful solve, the
    solver's user agent and cookies are copied into this scraper's headers
    and session so that later direct requests can reuse the clearance.

    Returns:
        The solved HTML, a still-fresh cached copy, or None when every
        attempt failed.
    """
    async with self._solver_lock:
        # Re-check the cache inside the lock: another task may have solved
        # this URL while we waited. Expired entries are ignored here too,
        # otherwise a stale page would be served forever.
        cached = self._cache.get(url)
        if cached and time.time() - cached[0] < self._cache_ttl:
            return cached[1]

        logger.info(f"✨ Requesting FlareSolverr solve for {url}...")

        flaresolverr_url = "http://localhost:8191/v1"
        payload = {
            "cmd": "request.get",
            "url": url,
            "maxTimeout": 60000
        }

        max_conn_retries = 5  # Increased retries
        for conn_attempt in range(max_conn_retries):
            try:
                async with httpx.AsyncClient(timeout=90.0) as client:
                    response = await client.post(flaresolverr_url, json=payload)
                if response.status_code != 200:
                    logger.warning(f"FlareSolverr returned status {response.status_code}")
                    continue
                data = response.json()
            except Exception as e:
                if conn_attempt < max_conn_retries - 1:
                    logger.warning(f"FlareSolverr comm failed (attempt {conn_attempt+1}/{max_conn_retries}): {e}. Retrying...")
                    await asyncio.sleep(2)
                else:
                    logger.error(f"FlareSolverr comm failed after {max_conn_retries} attempts: {e}")
                continue

            if not isinstance(data, dict) or data.get('status') != 'ok':
                message = data.get('message') if isinstance(data, dict) else data
                logger.warning(f"FlareSolverr error: {message}")
                continue

            solution = data.get('solution', {})
            html = solution.get('response', '')

            ua = solution.get('userAgent', '')
            if ua:
                self._ua_synced = ua
                self.headers["User-Agent"] = ua

            for cookie in solution.get('cookies', []):
                # A cookie that cannot be stored must not throw away the
                # solved page or be reported as a communication failure.
                try:
                    domain = cookie.get('domain')
                    if not domain and url:
                        domain = urlparse(url).netloc
                        if domain.startswith('www.'):
                            domain = domain[4:]
                    if not domain:
                        continue

                    cookie_kwargs = dict(
                        domain=domain,
                        path=cookie.get('path', '/'),
                        secure=cookie.get('secure', False),
                    )
                    try:
                        self.session.cookies.set(
                            cookie['name'],
                            cookie['value'],
                            expires=cookie.get('expires'),
                            **cookie_kwargs
                        )
                    except TypeError:
                        # NOTE(review): the session's cookie jar may not accept
                        # `expires`; retry without it - confirm against the
                        # installed curl_cffi version.
                        self.session.cookies.set(cookie['name'], cookie['value'], **cookie_kwargs)
                except Exception as e:
                    logger.warning(f"Skipping unusable FlareSolverr cookie: {e}")

            self._cookies_synced = True
            self._last_pw_solve = time.time()
            logger.info("✅ Session Synced!")
            return html
        return None
446
+
447
async def _turbo_prefetch(self):
    """Warm the cache by fetching the home page and key categories at once.

    Re-entrant calls are ignored while a prefetch is running. Page 1 of the
    home listing and of the first 15 category aliases are requested
    concurrently; individual failures are collected instead of raised.
    """
    if self._is_prefetching:
        return
    self._is_prefetching = True
    logger.info("🚀 NITRO MODE: Starting concurrent background pre-fetch...")

    try:
        jobs = [self.fetch_home(page=1)]
        jobs.extend(
            self.fetch_category(alias, page=1)
            for alias in list(self.CATEGORY_KEYWORDS.keys())[:15]
        )

        # The fetch semaphore inside _get_html bounds the real parallelism.
        await asyncio.gather(*jobs, return_exceptions=True)
        logger.info(f"⚡ NITRO MODE complete! Cache primed with {len(self._cache)} items.")
    except Exception as e:
        logger.error(f"Nitro pre-fetch failed: {e}")
    finally:
        self._is_prefetching = False
469
+
470
async def _get_html(self, url: str, max_retries: int = 1, follow_meta=True) -> Optional[str]:
    """Return the HTML for ``url``, using the cache, a direct fetch, then solvers.

    Order of attempts: in-memory cache (TTL ``_cache_ttl``), a direct
    curl-cffi request, FlareSolverr (up to ``max_retries`` times) and finally
    undetected Chrome. Meta-refresh redirects, landing pages and mirror 404s
    are followed by one further call to this method.

    Args:
        url: Page to fetch.
        max_retries: Number of solver attempts after the direct fetch fails.
        follow_meta: Whether meta-refresh / landing-page redirects are followed.

    Returns:
        The page HTML, or None when every path failed.
    """
    if not self._optimization_started:
        self._optimization_started = True
        # Keep a reference: the event loop only holds tasks weakly, so an
        # unreferenced task may be garbage-collected before it finishes.
        self._optimization_task = asyncio.create_task(self._optimize_connection())

    async with self._semaphore:
        now = time.time()

        # 0. Cache Check
        cached = self._cache.get(url)
        if cached and now - cached[0] < self._cache_ttl:
            return cached[1]

        html, redirect = await self._fetch_uncached(url, now, max_retries, follow_meta)

    if redirect is None:
        return html

    # Follow the redirect only after the semaphore slot has been released;
    # recursing while holding it could deadlock once all slots were taken.
    next_url, next_follow_meta = redirect
    return await self._get_html(next_url, max_retries=max_retries, follow_meta=next_follow_meta)

async def _fetch_uncached(self, url, now, max_retries, follow_meta):
    """Perform one uncached fetch; return ``(html, redirect)``.

    Exactly one element is meaningful: ``html`` holds the page (or None on
    failure) and ``redirect`` is ``(next_url, follow_meta)`` when the caller
    should fetch a different URL instead.
    """
    # Sanitize URL - Skip landing pages
    if any(x in url for x in ["/gaza.20", "/gaza.18", "/gaza.22"]):
        logger.info(f"Sanitizing landing page URL: {url} -> {self.TARGET_URL}")
        url = self.TARGET_URL

    # Refresh free proxies if on cloud platforms
    if self.IS_HUGGINGFACE or self.IS_RENDER:
        await self._refresh_free_proxies()

    proxy = self._get_proxy()
    proxy_dict = {"http": proxy, "https": proxy} if proxy else None

    # 1. Nitro Path (curl-cffi)
    logger.info(f"🚀 Nitro Path (curl-cffi) for {url}")
    try:
        # 45s timeout to cope with extremely slow responses
        resp = await self.session.get(url, headers=self.headers, timeout=45, proxies=proxy_dict)
        status_code = resp.status_code
        logger.info(f"📡 Nitro Path response: {status_code} ({len(resp.content)} bytes)")

        if status_code == 200:
            text = resp.text
            # Meta refresh detection (used heavily for domain rotation)
            refresh_match = re.search(r'http-equiv=["\']refresh["\'].*?content=["\']\d+;\s*url=(.*?)["\']', text, re.I)
            if not refresh_match:
                refresh_match = re.search(r'content=["\']\d+;\s*url=(.*?)["\']', text, re.I)

            if refresh_match and follow_meta:
                new_url_raw = refresh_match.group(1).strip("'\" ")
                new_url = urljoin(url, new_url_raw)

                # Preserve query parameters if the new URL doesn't have them but the old one did
                if "?" not in new_url and "?" in url:
                    query = url.split("?")[-1]
                    new_url = f"{new_url}?{query}" if not new_url.endswith("?") else f"{new_url}{query}"

                # If redirecting to a known landing page or ad-trap, skip it
                if any(x in new_url for x in ["gaza.20", "gaza.18", "gaza.22", "gaza.24"]):
                    logger.info(f"🚫 Skipping ad-trap redirect: {new_url}")
                    new_url = self.TARGET_URL

                logger.info(f"🔄 Following meta refresh to: {new_url}")
                return None, (new_url, False)

            # Cloudflare & landing page detection
            text_lower = text.lower()
            cf_markers = ["challenge-running", "cf-ray", "cloudflare-static", "just a moment", "verify you are human", "checking your browser"]
            is_cf = any(x in text_lower for x in cf_markers) or "id=\"challenge-form\"" in text_lower

            # Detect landing page even if 200 OK (gaza.20 redirect in JS or Meta)
            is_landing = "gaza.20" in text_lower or "gaza.18" in text_lower or "gaza.22" in text_lower

            if is_cf:
                logger.warning(f"⚠️ Cloudflare detected in Nitro response for {url}")
            elif is_landing and follow_meta:
                logger.info(f"🔄 Landing page detected in content for {url}, forcing target...")
                return None, (self.TARGET_URL, False)
            else:
                self._cache[url] = (now, text)
                return text, None
        elif status_code == 404:
            logger.warning(f"⚠️ Nitro Path 404 for {url} on mirror {self.BASE_URL}")
            # If this was a mirror, fallback to primary domain
            primary_primary = self.MIRRORS[0]
            if self.BASE_URL != primary_primary:
                fallback_url = url.replace(self.BASE_URL, primary_primary)
                # Only redirect when the URL really changed; a 404 on the
                # primary host itself used to recurse on the same URL forever.
                if fallback_url != url:
                    logger.info(f"🔁 Falling back to primary domain: {fallback_url}")
                    return None, (fallback_url, True)
        elif status_code == 403:
            logger.warning(f"🚫 Nitro Path 403 for {url}, falling back to solvers...")
    except Exception as e:
        logger.error(f"❌ Nitro Path error for {url}: {e}")

    # 2. Solver Path
    for att in range(max_retries):
        # Another task may have filled the cache meanwhile; honour the TTL so
        # an expired entry does not block the solver from refreshing it.
        cached = self._cache.get(url)
        if cached and time.time() - cached[0] < self._cache_ttl:
            return cached[1], None

        html = await self._get_html_with_flaresolverr(url)
        if html:
            self._cache[url] = (time.time(), html)
            return html, None

        # UC Fallback for critical pages
        if att == max_retries - 1:
            logger.info(f"UC Fallback for: {url}")
            res = await self._get_html_with_undetected_chrome(url)
            if res:
                # Cache it like every other path so Chrome is not relaunched per request.
                self._cache[url] = (time.time(), res)
                return res, None

    return None, None
579
+
580
def _extract_items(self, soup: "BeautifulSoup") -> List[Dict]:
    """Extract video cards from a parsed listing page.

    Args:
        soup: Parsed page, or a falsy value.

    Returns:
        One dict per unique video link with the keys ``id`` (URL-safe base64
        of the absolute link), ``title`` (noise words and 4-digit runs
        removed), ``poster`` (image-proxy path), ``type`` (``"series"`` or
        ``"movie"``) and ``duration``. An empty list is returned for a falsy
        ``soup`` or a page whose title looks like a Cloudflare challenge.
    """
    items = []
    if not soup:
        return []

    if soup.title:
        logger.info(f"Extracting: {soup.title.string}")
        page_title = str(soup.title).lower()
        if "challenge" in page_title or "cloudflare" in page_title:
            return []

    # Card containers across the known site variants and mirrors
    containers = soup.select('.thumbnail, .pm-li-video, .pm-video-thumb, .video-block, .movie-item, li.col-xs-6, .box, .video-box, .video-item, .post-item')
    if not containers:
        # Deep scan for any link that looks like a video
        containers = soup.select('a[href*="video.php"], a[href*="watch.php"], .video-listing-content, .card-video')

    duration_selector = '.duration, .pm-label-duration, .time'
    noise_words = ["مشاهدة", "فيلم", "مسلسل", "كامل", "HDCAM", "HD", "WEB-DL", "Cam", "مترجم", "اون لاين", "مدبلج"]
    series_markers = ['حلقة', 'مسلسل', 'episode', 'season', 'series']

    seen_urls = set()
    for tag in containers:
        # 1. Link detection: the container itself, or a link inside it
        if tag.name == 'a' and 'video.php' in tag.get('href', ''):
            link = tag
        else:
            link = tag.select_one('a.ellipsis') or tag.find('a', href=lambda x: x and 'video.php' in x)

        if not link:
            continue
        href = link.get('href')
        if not href:
            continue

        full_link = urljoin(self.BASE_URL, href)
        if full_link in seen_urls:
            continue
        seen_urls.add(full_link)

        # 2. Title
        title_node = tag.select_one('h3, h2, .title, .ellipsis, .video-title, p')
        title = title_node.get_text(strip=True) if title_node else ""
        if not title and link:
            title = link.get('title') or link.get_text(strip=True)

        # Classify on the raw title: the cleaning below strips "مسلسل", so
        # checking afterwards could never see that marker.
        raw_title_lower = title.lower()
        content_type = "series" if any(x in raw_title_lower for x in series_markers) else "movie"

        # Clean title (remove noisy tags)
        for t_tag in noise_words:
            title = title.replace(t_tag, "").strip()
        title = re.sub(r'\d{4}', '', title).strip("- ").strip()  # Remove Year

        # 3. Image probing
        img_node = tag.select_one('img')
        img_url = ""
        if img_node:
            # Lazy-load attributes first; skip inline base64 data URIs
            candidates = [
                img_node.get('data-src'),
                img_node.get('data-lazy-src'),
                img_node.get('data-original'),
                img_node.get('srcset'),
                img_node.get('src')
            ]
            for c in candidates:
                if c and not c.startswith('data:'):
                    # Ensure it's a real URL
                    if c.startswith('http') or c.startswith('//') or c.startswith('/'):
                        img_url = c
                        break

            # If still no image, try ANY attribute that looks like a URL
            if not img_url:
                for attr, val in img_node.attrs.items():
                    if isinstance(val, str) and (val.startswith('http') or '.jpg' in val or '.png' in val) and not val.startswith('data:'):
                        img_url = val
                        break

            if img_url and "," in img_url:  # Handle srcset
                img_url = img_url.split(",")[0].split(" ")[0]

        # Fallback: background-image in the container's inline style
        if not img_url:
            style = tag.get('style') or ""
            if 'background-image' in style:
                m = re.search(r'url\([\'"]?(.*?)[\'"]?\)', style)
                if m:
                    img_url = m.group(1)

        if not img_url or img_url.startswith('data:'):
            img_url = "https://placehold.co/600x400/000000/FFFFFF?text=No+Poster"

        # Absolute URL correction
        if img_url.startswith('//'):
            img_url = 'https:' + img_url
        elif img_url.startswith('/'):
            img_url = self.BASE_URL + img_url

        # Proxy through our backend for stability
        poster = f"/proxy/image?url={quote(img_url)}"

        duration_node = tag.select_one(duration_selector)

        items.append({
            "id": base64.urlsafe_b64encode(full_link.encode()).decode(),
            "title": title,
            "poster": poster,
            "type": content_type,
            "duration": duration_node.get_text(strip=True) if duration_node else ""
        })
    return items
680
+
681
async def fetch_home(self, page: int = 1) -> List[Dict]:
    """Return the items listed on the given page of the home listing.

    Up to three solver attempts are allowed. A page that could not be
    fetched is logged as an error and yields an empty list.
    """
    target = f"{self.TARGET_URL}?page={page}"
    html = await self._get_html(target, max_retries=3)
    if html:
        items = self._extract_items(BeautifulSoup(html, 'html.parser'))
        logger.info(f"Fetched {len(items)} items from {target}")
        return items

    logger.error(f"Failed to fetch home page: {target}")
    return []
691
+
692
async def fetch_category(self, cat_id: str, page: int = 1) -> List[Dict]:
    """Return the items on one page of a category.

    ``cat_id`` may be an alias; it is resolved to the site's real id first.
    An empty list is returned when the page could not be fetched.
    """
    real_id = await self._resolve_cat_id(cat_id)
    listing_url = f"{self.BASE_URL}/category.php?cat={real_id}&page={page}"
    markup = await self._get_html(listing_url, max_retries=3)
    if not markup:
        return []
    return self._extract_items(BeautifulSoup(markup, 'html.parser'))
697
+
698
def _normalize_number(self, text: str) -> int:
    """Extract an episode number from Arabic or English text.

    Digits win when present. Otherwise Arabic ordinal words are looked up,
    with "الاخيرة" (the last one) mapped to 999.

    Returns:
        The first run of digits as an int, else the value of the longest
        matching ordinal phrase, else 0.
    """
    # Numeric digits first (most reliable)
    match = re.search(r'(\d+)', text)
    if match:
        return int(match.group(1))

    # Arabic number words mapping
    arabic_map = {
        'الأولى': 1, 'الاولى': 1, 'الثانية': 2, 'الثالثة': 3, 'الرابعة': 4,
        'الخامسة': 5, 'السادسة': 6, 'السابعة': 7, 'الثامنة': 8, 'التاسعة': 9,
        'العاشرة': 10, 'الحادية': 11, 'الثانية عشر': 12, 'الثالثة عشر': 13,
        'الرابعة عشر': 14, 'الخامسة عشر': 15, 'السادسة عشر': 16, 'السابعة عشر': 17,
        'الثامنة عشر': 18, 'التاسعة عشر': 19, 'العشرون': 20, 'الاخيرة': 999
    }

    text_lower = text.lower()
    # Longest phrases first, so "الثانية عشر" (12) is not mistaken for its
    # prefix "الثانية" (2). The sort is stable, so equal lengths keep map order.
    for arabic_word in sorted(arabic_map, key=len, reverse=True):
        if arabic_word in text_lower:
            return arabic_map[arabic_word]

    # The former keyword+digit patterns were unreachable: any text containing
    # a digit has already returned above.
    return 0
731
+
732
def _safe_get_episode(self, text: str, name_hint: str = None) -> int:
    """Work out an episode number from a title.

    Parenthesised and bracketed parts are dropped, and the series name (when
    given) is removed so digits inside it are not picked up. The number after
    an episode keyword is preferred, then the first digits found, then the
    word-based lookup of ``_normalize_number``.
    """
    stripped = re.sub(r'\[.*?\]', '', re.sub(r'\(.*?\)', '', text))
    if name_hint:
        stripped = stripped.replace(name_hint, "").strip()

    attempts = (
        # 1. Number right after a keyword (most reliable)
        (r'(?:الحلقة|حلقة|ep|episode|part|p)\s*(\d+)', re.I),
        # 2. Any digits at all (fallback)
        (r'(\d+)', 0),
    )
    for pattern, flags in attempts:
        found = re.search(pattern, stripped, flags)
        if found:
            return int(found.group(1))

    # 3. Word matches
    return self._normalize_number(stripped)
752
+
753
async def search(self, query: str) -> List[Dict]:
    """Run a keyword search on the site and return the matching items.

    The query is URL-quoted. An empty list is returned when the results page
    could not be fetched.
    """
    search_url = f"{self.BASE_URL}/search.php?keywords={quote(query)}"
    page = await self._get_html(search_url, max_retries=2)
    if not page:
        return []
    return self._extract_items(BeautifulSoup(page, 'html.parser'))
757
+
758
async def fetch_details(self, safe_id: str) -> Dict:
    """Build the detail record for one video.

    Args:
        safe_id: URL-safe base64 of the video page URL, as produced by
            ``_extract_items``.

    Returns:
        A dict with ``id``, ``title``, ``description``, ``poster``, ``type``,
        ``seasons``, ``episodes``, ``servers`` and ``download_links``. An
        empty dict is returned when the id cannot be decoded, does not decode
        to an http(s) URL, or the page cannot be fetched.
    """
    try:
        url = base64.urlsafe_b64decode(safe_id).decode()
    except (ValueError, TypeError):
        # binascii.Error and UnicodeDecodeError are both ValueError subclasses.
        return {}

    # The id is supplied by the client: refuse anything that is not a web URL
    # before handing it to the fetchers.
    if not url.lower().startswith(('http://', 'https://')):
        return {}

    html = await self._get_html(url)
    if not html:
        return {}

    soup = BeautifulSoup(html, 'html.parser')

    # Follow play.php for watch servers
    watch_html = html
    watch_soup = soup
    play_a = soup.select_one('a[href*="play.php"]')
    if play_a:
        p_url = urljoin(self.BASE_URL, play_a.get('href'))
        p_html = await self._get_html(p_url)
        if p_html:
            watch_soup = BeautifulSoup(p_html, 'html.parser')
            watch_html = p_html

    h1 = soup.find('h1')
    title = h1.get_text(strip=True) if h1 else "Unknown"
    is_series = bool(soup.select('.episodes-list, .season-episodes, .vid-episodes')) or any(x in title for x in ["حلقة", "مسلسل", "الموسم"])

    # .get() so a meta tag without a content attribute does not raise KeyError.
    og_image = soup.select_one('meta[property="og:image"]')
    raw_poster = (og_image.get('content') or "") if og_image else ""
    if not raw_poster:
        img_tag = soup.select_one('.poster img, .movie-poster img, .pm-video-watch-main img')
        if img_tag:
            raw_poster = img_tag.get('src') or img_tag.get('data-src')

    poster = ""
    if raw_poster:
        full_poster_url = urljoin(self.BASE_URL, raw_poster)
        poster = f"/proxy/image?url={quote(full_poster_url)}"

    desc_node = soup.select_one('.story, .desc, .entry-content')
    response = {
        "id": safe_id, "title": title,
        "description": desc_node.get_text(strip=True) if desc_node else "",
        "poster": poster,
        "type": "series" if is_series else "movie",
        "seasons": [], "episodes": [], "servers": [], "download_links": []
    }

    # --- Episodes ---
    if is_series:
        unique_eps = {}

        # 1. Proactive Search: Look for a "Series Category" link
        cat_link = None

        # A. Breadcrumbs: the last matching crumb wins, but a ser= link ends the scan.
        for bc in soup.select('.breadcrumb a, .bread-crumb a, .breadcrumbs a, .pm-breadcrumb a'):
            href = bc.get('href')
            if href and ('cat=' in href or 'ser=' in href):
                # Generic categories are acceptable: items are filtered by title later.
                cat_link = urljoin(self.BASE_URL, href)
                if 'ser=' in href:  # Prefer ser= over cat=
                    break

        # Extract clean series name for filtering
        clean_title = title.replace("مسلسل", "").strip()
        # Name is whatever precedes the episode / season keyword
        series_name = re.split(r'الحلقة|الموسم|حلقة|season|episode', clean_title, flags=re.I)[0].strip()
        # Same name with Arabic numerals, for filtering
        series_name_alt = series_name.replace('0','٠').replace('1','١').replace('2','٢').replace('3','٣').replace('4','٤').replace('5','٥').replace('6','٦').replace('7','٧').replace('8','٨').replace('9','٩')

        logger.info(f"Targeting series name: {series_name} (Alt: {series_name_alt})")

        # B. Check if Title itself is a link to the category or series
        if not cat_link:
            title_link = soup.select_one('h1 a[href*="cat="], h1 a[href*="ser="], h1 a[href*="tag.php"]')
            if title_link:
                cat_link = urljoin(self.BASE_URL, title_link['href'])

        # C. General search in links with strict patterns
        if not cat_link:
            for a in soup.find_all('a', href=True):
                href = a['href']
                a_text = a.get_text(strip=True)
                # High-confidence patterns
                if any(x in a_text for x in ["المسلسل:", "جميع الحلقات", "حلقات المسلسل", "كل الحلقات"]):
                    cat_link = urljoin(self.BASE_URL, href)
                    logger.info(f"Found cat_link via labels: {cat_link}")
                    break

        # D. Fallback search by title
        if not cat_link:
            for a in soup.find_all('a', href=True):
                href = a['href']
                if any(x in href for x in ['ser=', 'cat=', 'tag.php']):
                    a_text = a.get_text(strip=True)
                    if (series_name and series_name in a_text) or (series_name_alt and series_name_alt in a_text):
                        cat_link = urljoin(self.BASE_URL, href)
                        logger.info(f"Found cat_link via fallback title search: {cat_link}")
                        break

        if cat_link:
            try:
                # Determine type: view-serie.php, category.php, tag.php
                is_view_serie = 'view-serie' in cat_link
                param_name = 'ser' if is_view_serie else ('t' if 'tag.php' in cat_link else 'cat')

                # Robust ID extraction
                match = re.search(f'[?&]{param_name}=([^&]+)', cat_link)
                if match:
                    cat_id = match.group(1)
                    if param_name == 't':
                        base_deep_url = f"{self.BASE_URL}/tag.php?t={cat_id}"
                    elif is_view_serie:
                        base_deep_url = f"{self.BASE_URL}/view-serie.php?ser={cat_id}"
                    else:
                        base_deep_url = f"{self.BASE_URL}/category.php?cat={cat_id}"

                    # Fuzzy key: the first two words of the name when it has
                    # that many, else the whole name. Loop-invariant, so it
                    # is computed once.
                    name_parts = series_name.split()
                    match_key = " ".join(name_parts[:2]) if len(name_parts) >= 2 else series_name

                    logger.info(f"Deep scraping episodes from {cat_link} (ID: {cat_id})")
                    # Fetch first 5 pages
                    for p in range(1, 6):
                        target_p = f"{base_deep_url}&page={p}" if p > 1 else base_deep_url
                        p_html = await self._get_html(target_p)
                        if not p_html:
                            break
                        p_items = self._extract_items(BeautifulSoup(p_html, 'html.parser'))

                        if not p_items:
                            break
                        for item in p_items:
                            i_title = item['title']
                            if match_key in i_title or series_name in i_title or series_name_alt in i_title:
                                e_num = self._safe_get_episode(i_title, name_hint=series_name)
                                if e_num and e_num not in unique_eps:
                                    unique_eps[e_num] = {
                                        "id": item['id'],
                                        "episode": e_num,
                                        "title": i_title
                                    }
                        # A short page is taken to be the last one.
                        if len(p_items) < 10:
                            break
            except Exception as e:
                logger.error(f"Category episode fetch failed: {e}")

        # 2. Local fallback: Scrape episodes from the current page
        for ep in soup.select('.episodes-list a, .season-episodes a, .vid-episodes a, ul.episodes li a, div.caption h3 a, .movie-item a, .related-vids a'):
            ep_href = ep.get('href')
            if not ep_href or 'video.php' not in ep_href:
                continue
            ep_url = urljoin(self.BASE_URL, ep_href)
            ep_text = ep.get_text(strip=True)

            # If text is empty, check for nested title
            if not ep_text:
                inner = ep.find(['h3', 'span', 'strong'])
                if inner:
                    ep_text = inner.get_text(strip=True)

            # CRITICAL FILTER: Item must belong to this series
            if series_name and series_name not in ep_text:
                continue

            ep_num = self._safe_get_episode(ep_text, name_hint=series_name)
            if ep_num and ep_num not in unique_eps:
                unique_eps[ep_num] = {
                    "id": base64.urlsafe_b64encode(ep_url.encode()).decode(),
                    "episode": ep_num,
                    "title": ep_text
                }

        response['episodes'] = sorted(unique_eps.values(), key=lambda x: x['episode'])
        response['seasons'] = [{"number": 1, "episodes": response['episodes']}]

    # --- WATCH SERVERS ---
    watch_urls = set()

    def is_valid_srv(url_str: str) -> bool:
        """Reject empty/javascript links, the site's own video pages and trackers."""
        if not url_str or 'javascript' in url_str:
            return False
        if 'larooza' in url_str and 'video.php' in url_str:
            return False
        if any(x in url_str.lower() for x in ['beacon', 'analytics', 'pixel', 'ads.', 'google', 'facebook']):
            return False
        return True

    # 1. Primary: WatchList & Source tags
    server_selectors = [
        'ul.WatchList li', '.server-list li', '#servers li', '.watch-servers li',
        '.video-servers-list li', 'div.servers a', '.player-servers li'
    ]

    for sel in server_selectors:
        for li in watch_soup.select(sel):
            s_url = li.get('data-embed-url') or li.get('data-link') or li.get('data-embed') or li.get('data-src') or li.get('data-url')
            if not s_url:
                a_tag = li.find('a', href=True)
                if a_tag and not a_tag['href'].startswith('javascript'):
                    s_url = a_tag['href']

            if s_url and is_valid_srv(s_url):
                if s_url.startswith('//'):
                    s_url = "https:" + s_url
                full_s_url = urljoin(self.BASE_URL, s_url)
                if full_s_url not in watch_urls:
                    watch_urls.add(full_s_url)
                    name = li.get_text(strip=True) or f"سيرفر {len(response['servers']) + 1}"
                    response['servers'].append({"name": name, "url": full_s_url, "type": "iframe"})

    # 2. Secondary: Deep Iframe Scan
    for ifr in watch_soup.select('iframe[src], embed[src], video source[src]'):
        src = ifr.get('src')
        if is_valid_srv(src):
            if src.startswith('//'):
                src = "https:" + src
            full_s_url = urljoin(self.BASE_URL, src)
            if full_s_url not in watch_urls:
                watch_urls.add(full_s_url)
                response['servers'].append({"name": f"سيرفر سريع {len(response['servers']) + 1}", "url": full_s_url, "type": "iframe"})

    # 3. Regex Fallback (Scripts & Global)
    patterns = [
        r'iframe.*?src=["\'](https?://[^"\']+)["\']',
        r'embedUrl["\']\s*:\s*["\'](https?://[^"\']+)["\']',
        r'file["\']\s*:\s*["\'](https?://[^"\']+\.m3u8)["\']',
        r'source\s*src=["\'](https?://[^"\']+)["\']'
    ]
    for pattern in patterns:
        for match in re.findall(pattern, watch_html, re.I):
            if is_valid_srv(match) and match not in watch_urls:
                watch_urls.add(match)
                response['servers'].append({"name": f"سيرفر احتياطي {len(response['servers']) + 1}", "url": match, "type": "iframe"})

    # --- Downloads ---
    dl_url = url.replace('video.php', 'download.php').replace('play.php', 'download.php')
    dl_html = await self._get_html(dl_url)
    if dl_html:
        dl_soup = BeautifulSoup(dl_html, 'html.parser')
        for mirror in dl_soup.select('a[target="_blank"]'):
            m_url = mirror.get('href')
            if m_url and 'http' in m_url:
                # Skip share buttons
                if any(x in m_url.lower() for x in ['wa.me', 'facebook.com', 'twitter.com', 'telegram.me', 't.me', 'sharer.php']):
                    continue
                q_text = mirror.get_text(strip=True).replace("اضغط هنا للتحميل", "").replace("تحميل الملف", "").strip() or "رابط تحميل"
                response['download_links'].append({"quality": q_text, "url": m_url})

    return response
995
+
996
# Shared module-level scraper instance, created when this module is imported.
scraper = LaroozaScraper()
scraper/proxy_fetcher.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Free Proxy Fetcher - Automatically fetches and validates free proxies
3
+ """
4
+ import aiohttp
5
+ import asyncio
6
+ import logging
7
+
8
logger = logging.getLogger("proxy_fetcher")


class FreeProxyFetcher:
    """Download free HTTP proxy lists from public APIs and probe them for liveness."""

    def __init__(self):
        self.proxies = []
        # Set to 0 here; nothing in this class updates it afterwards.
        self.last_fetch = 0

    @staticmethod
    def _parse_proxy_list(text):
        """Turn a newline-separated ``host:port`` listing into ``http://host:port`` URLs."""
        parsed = []
        for raw in text.splitlines():
            entry = raw.strip()
            host, _, port = entry.rpartition(":")
            # Drop blank lines and anything that is not host:port, e.g. an
            # HTML error page that a source served with status 200.
            if host and port.isdigit() and not any(ch.isspace() for ch in entry):
                parsed.append(f"http://{entry}")
        return parsed

    async def fetch_free_proxies(self):
        """Fetch free proxies from public APIs.

        Keeps at most the first 20 entries of every source, stores the
        combined list on ``self.proxies`` (replacing the previous one) and
        returns it. A failing source is logged and skipped.
        """
        proxy_sources = [
            "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all",
            "https://www.proxy-list.download/api/v1/get?type=http",
        ]
        # aiohttp expects a ClientTimeout object; passing a bare number is deprecated.
        timeout = aiohttp.ClientTimeout(total=10)

        all_proxies = []
        async with aiohttp.ClientSession() as session:
            for source in proxy_sources:
                try:
                    async with session.get(source, timeout=timeout) as resp:
                        if resp.status != 200:
                            logger.warning("Proxy source %s answered HTTP %s", source, resp.status)
                            continue
                        text = await resp.text()
                except Exception as e:
                    logger.error(f"Failed to fetch from {source}: {e}")
                    continue

                proxies = self._parse_proxy_list(text)
                all_proxies.extend(proxies[:20])  # Take first 20 from each source
                logger.info(f"Fetched {len(proxies)} proxies from {source}")

        self.proxies = all_proxies
        logger.info(f"Total free proxies loaded: {len(self.proxies)}")
        return self.proxies

    async def validate_proxy(self, proxy, test_url="https://httpbin.org/ip"):
        """Return True when ``test_url`` answers HTTP 200 through ``proxy``, else False.

        Any ordinary error counts as "not working". Unlike a bare ``except``,
        ``except Exception`` lets task cancellation and KeyboardInterrupt
        propagate.
        """
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(
                    test_url, proxy=proxy, timeout=aiohttp.ClientTimeout(total=5)
                ) as resp:
                    return resp.status == 200
        except Exception as e:
            logger.debug("Proxy %s rejected: %s", proxy, e)
            return False

    async def get_working_proxies(self, max_count=10):
        """Probe up to the first 30 known proxies concurrently.

        Fetches the proxy list first when none is loaded. Returns at most
        ``max_count`` proxies that validated, in their original order.
        """
        if not self.proxies:
            await self.fetch_free_proxies()

        candidates = self.proxies[:30]
        results = await asyncio.gather(
            *(self.validate_proxy(p) for p in candidates),
            return_exceptions=True,
        )

        working = []
        for proxy, outcome in zip(candidates, results):
            # gather() hands back exception objects, which are truthy;
            # only a literal True means the probe succeeded.
            if outcome is True and len(working) < max_count:
                working.append(proxy)

        logger.info(f"Validated {len(working)} working proxies")
        return working


proxy_fetcher = FreeProxyFetcher()
start.sh ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Boot script: starts FlareSolverr in the background, waits for its health
# endpoint, then runs the FastAPI app in the foreground.
set -e

echo "--- STARTING MULTI-SERVICE BOOT ---"

# Step 1: Start FlareSolverr
echo "[1/3] Launching FlareSolverr in background..."
# Avoid a leading ':' (an empty path entry) when PYTHONPATH was unset.
export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}/app/flaresolverr"
export PORT=8191
export LOG_LEVEL=info

# Run FlareSolverr with its own directory as CWD
(cd /app/flaresolverr && python3 flaresolverr.py) &

# Step 2: Health Check for FlareSolverr
echo "[2/3] Waiting for FlareSolverr to bind to port 8191..."
MAX_RETRIES=30
COUNT=0
READY=1
# -f makes curl exit non-zero on HTTP error statuses, so an error page is
# not mistaken for a healthy service.
while ! curl -sf http://localhost:8191/health > /dev/null; do
    sleep 1
    COUNT=$((COUNT+1))
    if [ "$COUNT" -ge "$MAX_RETRIES" ]; then
        echo "⚠️ FlareSolverr failed to start in time, continuing to FastAPI anyway..."
        READY=0
        break
    fi
done
# Only claim readiness when the health check actually succeeded.
if [ "$READY" -eq 1 ]; then
    echo "✅ FlareSolverr is ready!"
fi

# Step 3: Start FastAPI
echo "[3/3] Launching FastAPI on port 7860..."
uvicorn main:app --host 0.0.0.0 --port 7860 --log-level info
start_render.sh ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Render.com boot script: starts FlareSolverr in the background, waits for
# its health endpoint, then runs the FastAPI app on Render's $PORT.
set -e

echo "--- RENDER.COM DEPLOYMENT ---"

# Step 1: Start FlareSolverr
echo "[1/2] Launching FlareSolverr in background..."
# Avoid a leading ':' (an empty path entry) when PYTHONPATH was unset.
export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}/opt/render/project/src/flaresolverr"
export PORT_FS=8191
export LOG_LEVEL=info

# NOTE(review): the sibling start.sh configures FlareSolverr through PORT.
# Here PORT is the web-service port used by uvicorn below, so override it for
# the child process only; otherwise both servers may try to bind the same
# port. TODO confirm which variable the bundled flaresolverr.py reads.
(cd /opt/render/project/src/flaresolverr && PORT="$PORT_FS" python3 flaresolverr.py) &

# Wait for FlareSolverr: poll the health endpoint instead of a blind sleep.
# python3 is used for the probe because it is known to be available here.
echo "[2/2] Waiting for FlareSolverr..."
MAX_RETRIES=30
COUNT=0
READY=1
until python3 -c "import urllib.request; urllib.request.urlopen('http://localhost:${PORT_FS}/health', timeout=2)" > /dev/null 2>&1; do
    sleep 1
    COUNT=$((COUNT+1))
    if [ "$COUNT" -ge "$MAX_RETRIES" ]; then
        echo "⚠️ FlareSolverr failed to start in time, continuing to FastAPI anyway..."
        READY=0
        break
    fi
done
if [ "$READY" -eq 1 ]; then
    echo "✅ FlareSolverr ready!"
fi
echo "--- Starting FastAPI on port ${PORT:-7860} ---"

# Render provides $PORT automatically
uvicorn main:app --host 0.0.0.0 --port "${PORT:-7860}" --log-level info
tools/analyze_structure.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+ import sys
3
+ import io
4
+
5
+ # Set encoding for Windows terminal
6
+ if sys.platform == 'win32':
7
+ sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
8
+
9
def analyze_html(path="flaresolverr_output.html"):
    """Print a quick structural survey of a saved HTML page.

    Lists the interesting links among the first 100 anchors, then the CSS
    class names that look like item containers.

    Args:
        path: HTML file to inspect; defaults to the dump name this tool
            has always read.
    """
    with open(path, "r", encoding="utf-8") as f:
        html = f.read()

    soup = BeautifulSoup(html, 'html.parser')

    print("--- Analyzing Links ---")
    link_markers = ('cat=', 'video', 'movie', 'series')
    links = soup.find_all('a', href=True)
    for i, a in enumerate(links[:100]):
        href = a['href']
        if any(marker in href for marker in link_markers):
            print(f"{i}: Text: {a.get_text(strip=True)} | Href: {href}")

    print("\n--- Analyzing Containers ---")
    # Look for common patterns in classes
    classes = {c for tag in soup.find_all(True, class_=True) for c in tag['class']}

    print(f"Found {len(classes)} unique classes.")
    # Print classes that might be containers; sorted so repeated runs over
    # the same file produce identical output (set order is arbitrary).
    container_hints = ('item', 'video', 'movie', 'thumb', 'card', 'block', 'col')
    potential = sorted(c for c in classes if any(x in c.lower() for x in container_hints))
    print(f"Potential container classes: {potential}")
34
+
35
+ if __name__ == "__main__":
36
+ analyze_html()
tools/check_mirrors.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import httpx
3
+ from curl_cffi.requests import AsyncSession
4
+
5
async def check_mirrors():
    """Probe every known mirror with curl-cffi and with httpx, printing status and a preview.

    The two clients are tried independently, so a failure of one does not
    hide the result of the other.
    """
    mirrors = [
        "https://larooza.mom",
        "https://larooza.site",
        "https://laroza-tv.net",
        "https://larozavideo.net",
        "https://larooza.video",
        "https://q.larozavideo.net"
    ]

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    }

    def preview(text):
        # Kept out of the f-strings below: a backslash inside an f-string
        # expression is a SyntaxError before Python 3.12.
        return text[:100].replace('\n', ' ')

    for mirror in mirrors:
        print(f"Checking {mirror}...")

        # Try curl-cffi first
        try:
            async with AsyncSession(impersonate="chrome110") as s:
                resp = await s.get(mirror, headers=headers, timeout=10)
                print(f" [curl-cffi] {mirror}: {resp.status_code} | Title: {preview(resp.text)}")
        except Exception as e:
            print(f" [Error] {mirror} (curl-cffi): {e}")

        # Separate try block so httpx is still probed when curl-cffi fails.
        try:
            async with httpx.AsyncClient(http2=True, timeout=10) as client:
                resp = await client.get(mirror, headers=headers)
                print(f" [httpx] {mirror}: {resp.status_code} | Title: {preview(resp.text)}")
        except Exception as e:
            print(f" [Error] {mirror} (httpx): {e}")
32
+
33
+ if __name__ == "__main__":
34
+ asyncio.run(check_mirrors())
tools/debug_fs.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import httpx
3
+ import json
4
+ import sys
5
+
6
+ # Set encoding to utf-8 for windows console
7
+ if sys.platform == "win32":
8
+ import codecs
9
+ sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
10
+
11
async def test():
    """Request each probe URL through the local FlareSolverr endpoint and report the outcome."""
    flaresolverr_url = "http://127.0.0.1:8191/v1"
    urls = [
        "https://q.larozavideo.net/home.24",
        "https://q.larozavideo.net/newvideos1.php",
        "https://q.larozavideo.net/category.php?cat=all_movies_13"
    ]
    # Substrings whose presence in the returned HTML counts as "items found".
    item_markers = ("video.php", ".thumbnail", ".box")

    async with httpx.AsyncClient(timeout=90.0) as client:
        for url in urls:
            print(f"\n--- Testing {url} ---")
            payload = {"cmd": "request.get", "url": url, "maxTimeout": 60000}
            try:
                response = await client.post(flaresolverr_url, json=payload)
                if response.status_code != 200:
                    print(f"Server error: {response.status_code}")
                    continue

                data = response.json()
                if data.get('status') != 'ok':
                    print(f"FlareSolverr message: {data.get('message')}")
                    continue

                solution = data.get('solution', {})
                html = solution.get('response', '')
                print(f"Title found: {solution.get('title', '')}")

                if any(marker in html for marker in item_markers):
                    print("FOUND: Movie items are present in HTML!")
                else:
                    print("FAILED: No movie items in HTML.")
                    print(f"Snippet: {html[:500]}")
            except Exception as e:
                print(f"Script error: {e}")
49
+
50
+ if __name__ == "__main__":
51
+ asyncio.run(test())
tools/debug_mirrors.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import httpx
3
+ from bs4 import BeautifulSoup
4
+
5
async def debug_fetch():
    """Fetch each mirror's front page and report which item selectors match."""
    selectors = ['.thumbnail', '.pm-li-video', '.pm-video-thumb', '.video-block', '.movie-item', 'li.col-xs-6', '.box', '.video-box', '.video-item', '.post-item']
    mirrors = ["https://q.larozavideo.net", "https://larooza.mom", "https://larooza.site", "https://m.laroza-tv.net"]

    async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
        for mirror in mirrors:
            print(f"\n--- Checking mirror: {mirror} ---")
            try:
                resp = await client.get(mirror, headers={"User-Agent": "Mozilla/5.0"})
                print(f"Status: {resp.status_code}")
                if resp.status_code != 200:
                    print(f" Snippet: {resp.text[:200]}")
                    continue

                soup = BeautifulSoup(resp.text, 'html.parser')
                title = soup.title.string if soup.title else "No title"
                print(f"Title: {title}")

                matching_selectors = 0
                for sel in selectors:
                    count = len(soup.select(sel))
                    if count > 0:
                        print(f" Found {count} items with selector {sel}")
                        matching_selectors += 1

                # Fall back to counting direct video/watch links when no
                # container selector matched.
                if matching_selectors == 0:
                    video_links = len(soup.select('a[href*="video.php"], a[href*="watch.php"]'))
                    print(f" Found {video_links} video/watch links.")
            except Exception as e:
                print(f" Error: {e}")
33
+
34
+ if __name__ == "__main__":
35
+ asyncio.run(debug_fetch())
tools/debug_scraper.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import sys
3
+ import os
4
+
5
+ # Add the current directory to path
6
+ sys.path.append(os.getcwd())
7
+
8
+ from scraper.engine import LaroozaScraper
9
+
10
+ # Set encoding to utf-8 for windows console
11
+ if sys.platform == "win32":
12
+ import codecs
13
+ sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
14
+
15
async def test():
    """Run ``fetch_home`` for page 1 and print the first three results."""
    scraper = LaroozaScraper()
    print("DEBUG: Fetching latest movies...")
    items = await scraper.fetch_home(page=1)
    print(f"DEBUG: Found {len(items)} items.")

    if not items:
        print("DEBUG: ❌ fetch_home returned 0 items.")
        return

    for position, item in enumerate(items[:3], start=1):
        print(f" {position}. {item['title']} - ID: {item['id'][:20]}...")
25
+
26
+ if __name__ == "__main__":
27
+ asyncio.run(test())
tools/dump_html.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import httpx
2
+ import asyncio
3
+ from bs4 import BeautifulSoup
4
+
5
async def dump_html():
    """Download one mirror's front page, save it to ``dump.html`` and list its first links."""
    url = "https://larooza.mom"  # Using the one that gave 0 links
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    }

    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
        print(f"Fetching {url}...")
        resp = await client.get(url, headers=headers)
        print(f"Status: {resp.status_code}")

        page = resp.text
        with open("dump.html", "w", encoding="utf-8") as out:
            out.write(page)
        print("HTML dumped to dump.html")

        anchors = BeautifulSoup(page, 'html.parser').select('a')
        print(f"Total links: {len(anchors)}")
        for anchor in anchors[:20]:
            label = anchor.get_text(strip=True)[:30]
            print(f"Link: {anchor.get('href')} | Text: {label}")
23
+
24
+ if __name__ == "__main__":
25
+ asyncio.run(dump_html())
tools/dump_html_v2.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import httpx
2
+ import asyncio
3
+ from bs4 import BeautifulSoup
4
+
5
async def dump_html():
    """Fetch the new-videos page and report how many item containers it holds."""
    url = "https://q.larozavideo.net/newvideos1.php"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    }
    item_selector = '.thumbnail, .pm-li-video, .pm-video-thumb, .video-block, .movie-item, li.col-xs-6, .box, .video-box, .video-item, .post-item'

    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
        print(f"Fetching {url}...")
        resp = await client.get(url, headers=headers)
        print(f"Status: {resp.status_code}")
        print(f"Final URL: {resp.url}")

        containers = BeautifulSoup(resp.text, 'html.parser').select(item_selector)
        print(f"Found {len(containers)} item containers.")

        # Show the start of the page when nothing matched.
        if not containers:
            print("Snippet of HTML:")
            print(resp.text[:1000])
23
+
24
+ if __name__ == "__main__":
25
+ asyncio.run(dump_html())
tools/extra/diagnose.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import httpx
3
+ import asyncio
4
+ import os
5
+
6
async def check_service(name, url):
    """GET ``url`` and print whether the service answered.

    Returns True when a response was received (whatever its status code),
    False when the request raised.
    """
    try:
        async with httpx.AsyncClient(timeout=5.0) as client:
            resp = await client.get(url)
            reachable = True
            print(f"✅ {name} is UP ({url}) - Status: {resp.status_code}")
    except Exception as e:
        reachable = False
        print(f"❌ {name} is DOWN ({url}) - Error: {e}")
    return reachable
15
+
16
async def main():
    """Check the local backend and FlareSolverr health endpoints, one after the other."""
    print("--- Diagnostics ---")
    services = (
        ("Backend", "http://localhost:8000/health"),
        ("FlareSolverr", "http://localhost:8191/health"),
    )
    for name, url in services:
        await check_service(name, url)

    # Try to find the tunnel URL from local logs if possible
    print("\n--- Searching for Tunnel URL ---")
    # Only the header is printed; no search is implemented here.
    # Cloudflared usually doesn't log to a file unless specified.
25
+
26
+ if __name__ == "__main__":
27
+ asyncio.run(main())
tools/extra/expose_to_internet.bat ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
@echo off
REM Exposes the local backend on port 8000 through a Cloudflare quick tunnel.
echo ==========================================
echo CLOUDFLARE TUNNEL - EXPOSE TO INTERNET
echo ==========================================

REM Download Cloudflared (if not exists)
if not exist cloudflared.exe (
    echo Downloading Cloudflare Tunnel...
    powershell -Command "Invoke-WebRequest -Uri 'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe' -OutFile 'cloudflared.exe'"
)

REM Stop with a clear message when the download did not produce the binary
if not exist cloudflared.exe (
    echo ERROR: cloudflared.exe is missing - the download failed.
    pause
    exit /b 1
)

REM Start tunnel
echo Starting Cloudflare Tunnel...
echo Your backend will be accessible via a public URL in a moment...
echo.
cloudflared.exe tunnel --url http://localhost:8000

pause