Spaces:
Paused
Paused
Upload server.js
Browse files
server.js
ADDED
|
@@ -0,0 +1,442 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import express from "express";
|
| 2 |
+
import cors from "cors";
|
| 3 |
+
import { chromium } from "playwright-extra";
|
| 4 |
+
import stealthPlugin from "puppeteer-extra-plugin-stealth";
|
| 5 |
+
|
| 6 |
+
// Register the stealth plugin on playwright-extra's chromium before any browser is launched.
chromium.use(stealthPlugin());

// Express application that serves the scraper endpoints.
const app = express();
// Listening port: the PORT environment variable when set, otherwise 7860.
const PORT = process.env.PORT || 7860;

// CORS middleware with its default options, plus JSON request-body parsing.
app.use(cors());
app.use(express.json());
|
| 13 |
+
|
| 14 |
+
// Language detection from subtitle filenames.
//
// Each LANG_PATTERNS entry pairs a case-insensitive filename pattern with a
// display name and a two-letter language code. A pattern matches a
// three-letter suffix such as "_eng", or a two-letter suffix such as "-en" /
// "_en", placed directly before the file extension. Both ".vtt" and ".srt"
// are recognised because the scraper collects both kinds of subtitle file.
const SUBTITLE_EXTENSION_SOURCE = String.raw`\.(?:vtt|srt)`;

// [three-letter suffix, display name, two-letter code, also match "-xx"/"_xx"?]
const LANG_SUFFIXES = [
  ["eng", "English", "en", true],
  ["ara", "Arabic", "ar", true],
  ["fre", "French", "fr", true],
  ["spa", "Spanish", "es", true],
  ["ger", "German", "de", true],
  ["tur", "Turkish", "tr", true],
  ["por", "Portuguese", "pt", true],
  ["ita", "Italian", "it", true],
  ["dut", "Dutch", "nl", true],
  ["rus", "Russian", "ru", true],
  ["chi", "Chinese", "zh", true],
  ["jpn", "Japanese", "ja", true],
  ["kor", "Korean", "ko", true],
  ["hin", "Hindi", "hi", true],
  ["ind", "Indonesian", "id", true],
  ["may", "Malay", "ms", true],
  // Slovenian is only recognised through its three-letter suffix.
  ["sli", "Slovenian", "sl", false],
];

const LANG_PATTERNS = LANG_SUFFIXES.map(([iso3, lang, code, matchShort]) => {
  const suffix = matchShort ? `(_${iso3}|[-_]${code})` : `_${iso3}`;
  return {
    // No "g" flag: .test() on these patterns must stay stateless.
    pattern: new RegExp(`${suffix}${SUBTITLE_EXTENSION_SOURCE}`, "i"),
    lang,
    code,
  };
});
|
| 34 |
+
|
| 35 |
+
// Global browser instance with memory management
let browser; // shared browser; undefined/null until launched and after recycling
let browserLaunch = null; // in-flight chromium.launch() promise shared by concurrent callers
let requestCount = 0; // requests handled since the current browser was launched
const MAX_REQUESTS_BEFORE_RECYCLE = 10; // Recycle browser every N requests
let activeRequests = 0; // scraping requests currently in progress
const MAX_CONCURRENT = 2; // Max simultaneous scraping requests

/**
 * Return the shared browser, launching a new one when none exists or the
 * current one is no longer connected.
 *
 * Concurrent callers that arrive while a launch is in progress all receive the
 * same launch promise, so only one browser process is started. A successful
 * launch resets `requestCount` to 0.
 *
 * @returns {Promise<object>} the connected browser instance
 * @throws whatever `chromium.launch` rejects with; the next call retries
 */
async function getBrowser() {
  if (browser && browser.isConnected()) {
    return browser;
  }

  if (!browserLaunch) {
    console.log("Launching fresh browser instance...");
    browserLaunch = chromium
      .launch({
        headless: true,
        args: [
          "--no-sandbox",
          "--disable-setuid-sandbox",
          "--disable-dev-shm-usage",
          "--disable-gpu",
          "--disable-extensions",
          "--disable-background-networking",
        ],
      })
      .then((launched) => {
        browser = launched;
        requestCount = 0;
        return launched;
      })
      .finally(() => {
        // Clear the marker on success and on failure so a failed launch can be retried.
        browserLaunch = null;
      });
  }

  return browserLaunch;
}

/**
 * Close the shared browser (if any) so the next getBrowser() call starts a
 * fresh one.
 *
 * The shared reference is cleared before awaiting close(): a browser launched
 * by another request while the old one is shutting down must not be wiped out
 * when close() finally settles.
 *
 * @returns {Promise<void>} resolves once the old browser has been closed; close
 *   failures are logged, never thrown
 */
async function recycleBrowser() {
  if (!browser) {
    return;
  }

  const stale = browser;
  browser = null;
  console.log(`[MEMORY] Recycling browser after ${requestCount} requests...`);
  try {
    await stale.close();
  } catch (err) {
    console.warn(`[MEMORY] Failed to close browser cleanly: ${err.message}`);
  }
}
|
| 68 |
+
|
| 69 |
+
// Label-to-ISO-code mapping for metadata-based subtitle labels
const LABEL_TO_CODE = {
  'arabic': 'ar', 'english': 'en', 'french': 'fr', 'spanish': 'es',
  'german': 'de', 'turkish': 'tr', 'portuguese': 'pt', 'italian': 'it',
  'dutch': 'nl', 'russian': 'ru', 'chinese': 'zh', 'japanese': 'ja',
  'korean': 'ko', 'hindi': 'hi', 'indonesian': 'id', 'malay': 'ms',
  'slovenian': 'sl', 'swedish': 'sv', 'norwegian': 'no', 'danish': 'da',
  'finnish': 'fi', 'polish': 'pl', 'romanian': 'ro', 'croatian': 'hr',
  'czech': 'cs', 'hungarian': 'hu', 'greek': 'el', 'thai': 'th',
  'vietnamese': 'vi', 'hebrew': 'he', 'persian': 'fa', 'urdu': 'ur',
};

/**
 * Map a human-readable subtitle label to its two-letter language code.
 *
 * The label is lower-cased, trailing digits/whitespace are removed and a
 * trailing " hi" marker is dropped, e.g. "English Hi2" -> "english hi" ->
 * "english" -> "en".
 *
 * @param {*} label - label text; anything that is not a non-empty string yields null
 * @returns {string|null} the code from LABEL_TO_CODE, or null when unknown
 */
function labelToCode(label) {
  if (typeof label !== "string" || label === "") return null;

  const base = label.toLowerCase().replace(/[\d\s]+$/g, "").trim();
  const clean = base.replace(/\s+hi$/i, "").trim();

  // Own-property lookup only: a label such as "constructor" must not resolve
  // to a member inherited from Object.prototype.
  for (const key of [clean, base]) {
    if (Object.hasOwn(LABEL_TO_CODE, key)) return LABEL_TO_CODE[key];
  }
  return null;
}
|
| 87 |
+
|
| 88 |
+
/**
 * Guess the language of a subtitle file from its URL.
 *
 * The URL is first tested against every LANG_PATTERNS entry. If none matches,
 * a filename that is itself a language name (e.g. "/Arabic.vtt",
 * "/english2.srt") is resolved through labelToCode().
 *
 * @param {*} url - subtitle URL; non-string values are reported as unknown
 * @returns {{lang: string, code: string}} display name and language code, or
 *   { lang: "Unknown", code: "und" } when nothing matches
 */
function detectLang(url) {
  if (typeof url !== "string") {
    return { lang: "Unknown", code: "und" };
  }

  const lowerUrl = url.toLowerCase();
  for (const { pattern, lang, code } of LANG_PATTERNS) {
    if (pattern.test(lowerUrl)) return { lang, code };
  }

  // Also check if the filename itself is a language name (e.g. /Arabic.vtt)
  const filenameMatch = lowerUrl.match(/\/([a-z]+\d*)\.(?:vtt|srt)/);
  if (filenameMatch) {
    const [, name] = filenameMatch;
    const code = labelToCode(name);
    if (code) {
      // Capitalise so the display name reads like the LANG_PATTERNS names
      // ("Arabic") whatever casing the URL used.
      return { lang: name.charAt(0).toUpperCase() + name.slice(1), code };
    }
  }

  return { lang: "Unknown", code: "und" };
}
|
| 101 |
+
|
| 102 |
+
/**
 * Inspect an already-created page: navigate to pageUrl and return the src of
 * the most likely player iframe, or null when none is found or navigation
 * fails. Always closes the page before returning.
 */
async function locateEmbedUrl(page, pageUrl, tmdbId) {
  try {
    await page.goto(pageUrl, { waitUntil: "networkidle", timeout: 25000 });

    // Wait for potential redirects and iframe loading
    await page.waitForTimeout(4000);

    // Find the most likely player iframe
    const embedUrl = await page.evaluate(() => {
      const iframes = Array.from(document.querySelectorAll('iframe'));
      // prioritize known domains, then fall back to any iframe with src
      const playerIframe = iframes.find(f =>
        f.src && (
          f.src.includes('vidora.stream') ||
          f.src.includes('flixcdn.cyou') ||
          f.src.includes('/embed/') ||
          f.src.includes('vidsrc') ||
          f.src.includes('rabbitstream') ||
          f.src.includes('2embed')
        )
      ) || iframes.find(f => f.src && f.src.startsWith('http'));
      return playerIframe ? playerIframe.src : null;
    });

    if (embedUrl) {
      console.log(`[STEP1] Found embed URL: ${embedUrl}`);
      return embedUrl;
    }

    // Fallback to searching the whole HTML if iframe not found via selector
    const html = await page.content();
    const iframeMatch = html.match(/src=["'](https?:\/\/[^"']+(vidora\.stream|flixcdn\.cyou|vidsrc|embed|rabbitstream|2embed)[^"']*)["']/i)
      || html.match(/src=["'](https?:\/\/[^"']+)["'].*?<\/iframe>/i);

    if (iframeMatch) {
      console.log(`[STEP1] Found embed URL (Regex): ${iframeMatch[1]}`);
      return iframeMatch[1];
    }

    // Log page details for debugging when nothing is found
    const pageText = await page.evaluate(() => document.body?.innerText || '');
    console.log(`[STEP1] No player iframe found for ID ${tmdbId}. Page text: ${pageText.substring(0, 300)}`);
    console.log(`[STEP1] Page URL after redirects: ${page.url()}`);
    console.log(`[STEP1] Iframes found: ${await page.evaluate(() => document.querySelectorAll('iframe').length)}`);
    return null;
  } catch (err) {
    console.error(`[STEP1 ERROR] ${err.message}`);
    return null;
  } finally {
    await page.close().catch(() => { });
  }
}

/**
 * Step 1: Open the ww2.moviesapi.to page for a title in Playwright and return
 * the src URL of its player iframe.
 *
 * A TV URL (/tv/{id}-{season}-{episode}) is used only when type is "tv" and
 * both season and episode are provided; 0 counts as provided. In every other
 * case the movie URL (/movie/{id}) is used. The id, season and episode are
 * percent-encoded because they originate from the incoming request.
 *
 * @param {string|number} tmdbId - title id placed in the page URL
 * @param {string} [type="movie"] - "tv" selects the episode URL form
 * @param {string|number} [season] - season number, used when type is "tv"
 * @param {string|number} [episode] - episode number, used when type is "tv"
 * @returns {Promise<string|null>} the embed URL, or null when none was found or
 *   the page could not be loaded
 * @throws when the browser, context or page cannot be created
 */
async function getEmbedUrl(tmdbId, type = "movie", season, episode) {
  const isProvided = (value) => value !== undefined && value !== null && value !== "";
  const id = encodeURIComponent(tmdbId);

  let pageUrl;
  if (type === "tv" && isProvided(season) && isProvided(episode)) {
    pageUrl = `https://ww2.moviesapi.to/tv/${id}-${encodeURIComponent(season)}-${encodeURIComponent(episode)}`;
  } else {
    pageUrl = `https://ww2.moviesapi.to/movie/${id}`;
  }

  console.log(`[STEP1] Fetching ${pageUrl} via Playwright...`);
  const b = await getBrowser();
  const context = await b.newContext({
    userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
  });

  try {
    const page = await context.newPage();
    return await locateEmbedUrl(page, pageUrl, tmdbId);
  } finally {
    // Runs even when newPage() fails, so the context is never left open.
    await context.close().catch(() => { });
  }
}
|
| 173 |
+
|
| 174 |
+
// Upper bound for a single subtitle download, so a stalled server cannot hold a request open indefinitely.
const SUBTITLE_DOWNLOAD_TIMEOUT_MS = 15000;

/**
 * Decode the JSON carried by a 'subs' URL parameter. The percent-decoded form
 * is tried first; when decoding or parsing fails (e.g. the value contains a
 * literal "%"), the raw value is parsed instead. Throws if neither is JSON.
 */
function parseSubsParam(raw) {
  try {
    return JSON.parse(decodeURIComponent(raw));
  } catch {
    return JSON.parse(raw);
  }
}

/**
 * Read subtitle tracks from a 'subs' parameter in the embed URL's query string
 * or fragment and add them to `tracks` (Map keyed by subtitle URL). Entries
 * without a string `url` are skipped; a string `label` is used as the display
 * name and, when labelToCode() recognises it, as the language code.
 */
function collectTracksFromUrl(embedUrl, tracks) {
  try {
    const urlObj = new URL(embedUrl);
    const subsParam = urlObj.searchParams.get('subs')
      || (urlObj.hash ? new URLSearchParams(urlObj.hash.slice(1)).get('subs') : null);
    if (!subsParam) return;

    console.log(`[STEP2] Found 'subs' parameter in URL`);
    const entries = parseSubsParam(subsParam);
    if (!Array.isArray(entries)) return;

    for (const entry of entries) {
      // The metadata is untrusted: ignore malformed entries instead of aborting the whole list.
      if (!entry || typeof entry.url !== 'string' || entry.url === '') continue;
      if (tracks.has(entry.url)) continue;

      let { lang, code } = detectLang(entry.url);
      // If metadata provides a label, use it for display and, when recognised, for the code
      if (typeof entry.label === 'string' && entry.label !== '') {
        lang = entry.label;
        code = labelToCode(entry.label) || code;
      }
      console.log(`[STEP2] Found subtitle (URL Metadata - ${lang} [${code}]): ${entry.url}`);
      tracks.set(entry.url, { url: entry.url, lang, code });
    }
  } catch (err) {
    // Not a parseable URL or the 'subs' value is not valid JSON: best effort only.
    console.log(`[STEP2] Could not read 'subs' metadata from URL: ${err.message}`);
  }
}

/**
 * Load the embed page in Playwright and add every subtitle URL it reveals to
 * `tracks`: network requests ending in .vtt/.srt, JWPlayer playlist tracks,
 * URLs found in script tags and <track> elements. Navigation errors are
 * logged, not thrown. The context is always closed.
 */
async function huntTracksWithBrowser(embedUrl, tracks) {
  const b = await getBrowser();
  const context = await b.newContext({
    userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120 Safari/537.36",
  });

  try {
    const page = await context.newPage();

    page.on("request", (request) => {
      const reqUrl = request.url();
      if (!/\.(vtt|srt)(\?.*)?$/i.test(reqUrl) || tracks.has(reqUrl)) return;
      const { lang, code } = detectLang(reqUrl);
      console.log(`[STEP2] Found subtitle (${lang}): ${reqUrl}`);
      tracks.set(reqUrl, { url: reqUrl, lang, code });
    });

    try {
      await page.goto(embedUrl, { waitUntil: "domcontentloaded", timeout: 30000 });
      await page.waitForTimeout(3000);

      // Try extracting tracks directly from DOM/JWPlayer config
      const found = await page.evaluate(() => {
        const found = [];

        // 1. Look for JWPlayer tracks
        if (window.jwplayer && window.jwplayer().getConfig) {
          const config = window.jwplayer().getConfig();
          if (config.playlist && config.playlist[0] && config.playlist[0].tracks) {
            config.playlist[0].tracks.forEach(t => {
              if (t.file && (t.file.includes('.vtt') || t.file.includes('.srt'))) {
                found.push(t.file);
              }
            });
          }
        }

        // 2. Look for script tags with JSON configs
        document.querySelectorAll('script').forEach(s => {
          const content = s.textContent;
          if (content.includes('tracks') && content.includes('.vtt')) {
            const matches = content.match(/https?:\/\/[^"']+\.(vtt|srt)[^"']*/g);
            if (matches) found.push(...matches);
          }
        });

        // 3. Look for video/track elements
        document.querySelectorAll('track').forEach(t => {
          if (t.src) found.push(t.src);
        });

        return found;
      });

      for (const url of found) {
        console.log(`[STEP2] Evaluated Track: ${url}`);
        if (tracks.has(url)) continue;
        const { lang, code } = detectLang(url);
        console.log(`[STEP2] Found subtitle (DOM): ${url} [${code}]`);
        tracks.set(url, { url, lang, code });
      }

      // Click the centre of the page, then wait so any resulting subtitle
      // requests can be seen by the request listener above.
      const box = await page.locator("body").boundingBox();
      if (box) {
        await page.mouse.click(box.x + box.width / 2, box.y + box.height / 2);
      }

      await page.waitForTimeout(5000);
    } catch (err) {
      console.error(`[STEP2] Navigation error: ${err.message}`);
    }

    await page.close().catch(() => { });
  } finally {
    // Runs even when newPage() fails, so the context is never left open.
    await context.close().catch(() => { });
  }
}

/**
 * Download one subtitle track. Returns the result record, or null when the
 * request fails, times out, is not OK or the body is 50 characters or shorter.
 */
async function downloadSubtitle(track, referer) {
  try {
    console.log(`[DOWNLOAD] Attempting ${track.url}`);
    const resp = await fetch(track.url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120 Safari/537.36',
        'Referer': referer
      },
      signal: AbortSignal.timeout(SUBTITLE_DOWNLOAD_TIMEOUT_MS),
    });
    console.log(`[DOWNLOAD] Status: ${resp.status} for ${track.url}`);
    if (!resp.ok) return null;

    const content = await resp.text();
    console.log(`[DOWNLOAD] Content length: ${content.length}`);
    if (content.length <= 50) return null;

    return {
      lang: track.lang,
      lang_code: track.code,
      url: track.url,
      content,
    };
  } catch (e) {
    console.error(`[DOWNLOAD ERROR] ${e.message}`);
    return null;
  }
}

/**
 * Step 2: Collect subtitle tracks for an embed URL and download the ones in
 * the requested languages.
 *
 * Tracks listed in a 'subs' parameter of the embed URL are used when present;
 * only when there are none is the page loaded in Playwright to discover
 * VTT/SRT files. Tracks whose language could not be determined (code "und")
 * are always kept.
 *
 * @param {string} embedUrl - player embed URL; also sent as the Referer of each download
 * @param {string[]} [langs=["en","ar"]] - language codes to keep
 * @returns {Promise<Array<{lang: string, lang_code: string, url: string, content: string}>>}
 *   successfully downloaded subtitles, in discovery order
 * @throws when the browser, context or page cannot be created for the Playwright step
 */
async function scrapeSubtitles(embedUrl, langs = ["en", "ar"]) {
  console.log(`[STEP2] Scraping subtitles from ${embedUrl} ...`);
  // Keyed by subtitle URL: O(1) duplicate detection, insertion order preserved.
  const tracks = new Map();

  // Check if the URL itself contains subtitle metadata (common in flixcdn)
  collectTracksFromUrl(embedUrl, tracks);

  // MEMORY OPTIMIZATION: If we found subtitles in the URL metadata, skip Playwright!
  if (tracks.size === 0) {
    console.log(`[STEP2] No subtitles in URL metadata. Launching Playwright to hunt for tracks...`);
    await huntTracksWithBrowser(embedUrl, tracks);
  } else {
    console.log(`[STEP2] [MEMORY OPTIMIZATION] Skipping Playwright since ${tracks.size} tracks were found in metadata.`);
  }

  const filtered = [...tracks.values()].filter((v) => langs.includes(v.code) || v.code === "und");
  console.log(`[STEP2] Total VTTs: ${tracks.size}, filtered: ${filtered.length}`);

  const results = [];
  for (const track of filtered) {
    const downloaded = await downloadSubtitle(track, embedUrl);
    if (downloaded) results.push(downloaded);
  }

  return results;
}
|
| 328 |
+
|
| 329 |
+
// ─── Subtitle Endpoint ─────────────────────────────────────────────────────
|
| 330 |
+
|
| 331 |
+
/**
 * Normalise the requested languages into a list of lower-case codes.
 * Accepts a comma-separated string or an array (JSON body, repeated query
 * parameter); falls back to ["ar", "en"] when nothing usable is given.
 */
function parseRequestedLangs(raw) {
  const parts = Array.isArray(raw) ? raw : String(raw || "ar,en").split(",");
  const langs = parts.map((l) => String(l).trim().toLowerCase()).filter(Boolean);
  return langs.length > 0 ? langs : ["ar", "en"];
}

/**
 * Express handler for GET and POST /get-subtitles.
 *
 * Parameters are read from the JSON body for POST and from the query string
 * otherwise: tmdb_id (required), type (default "movie"), season, episode and
 * langs (default "ar,en").
 *
 * Responses:
 *   400 - tmdb_id is missing
 *   429 - MAX_CONCURRENT scraping requests are already running
 *   200 - { tmdb_id, count, subtitles } (plus `error` when no embed URL was found)
 *   500 - scraping threw
 *
 * Every request that passes the concurrency check is counted, and the shared
 * browser is recycled once MAX_REQUESTS_BEFORE_RECYCLE requests have been
 * counted and no request is active.
 */
async function handleGetSubtitles(req, res) {
  // req.body can be absent when no body was parsed for the request.
  const data = (req.method === "POST" ? req.body : req.query) ?? {};
  const tmdb_id = data.tmdb_id;

  if (!tmdb_id) {
    return res.status(400).json({ error: "Missing tmdb_id parameter" });
  }

  // Concurrency limiter
  if (activeRequests >= MAX_CONCURRENT) {
    console.log(`[API] Rejecting request for ${tmdb_id} — too many concurrent requests (${activeRequests}/${MAX_CONCURRENT})`);
    return res.status(429).json({ error: "Server busy, try again in a few seconds", tmdb_id });
  }

  const type = data.type || "movie";
  const { season, episode } = data;
  const langs = parseRequestedLangs(data.langs);

  // Take the slot immediately before the try block: nothing between the
  // increment and the finally-decrement may throw, or the slot would be lost.
  activeRequests++;
  try {
    console.log(`\n${"═".repeat(44)}`);
    console.log(`[API] ${req.method} Request: tmdb_id=${tmdb_id}, type=${type}, langs=${langs.join(",")} (active: ${activeRequests})`);

    const embedUrl = await getEmbedUrl(tmdb_id, type, season, episode);
    if (!embedUrl) {
      return res.json({ tmdb_id, count: 0, subtitles: [], error: "No embed URL found" });
    }

    const subtitles = await scrapeSubtitles(embedUrl, langs);

    console.log(`[API] Returning ${subtitles.length} subtitles for tmdb_id=${tmdb_id}`);
    res.json({ tmdb_id, count: subtitles.length, subtitles });
  } catch (err) {
    console.error(`[API ERROR] ${err.message}`);
    res.status(500).json({ error: "Scraping failed", details: err.message });
  } finally {
    activeRequests--;
    requestCount++;
    // Recycle browser periodically to free memory
    if (requestCount >= MAX_REQUESTS_BEFORE_RECYCLE && activeRequests === 0) {
      await recycleBrowser();
    }
  }
}
|
| 376 |
+
|
| 377 |
+
// Both verbs share one handler; it reads parameters from the query string for
// GET and from the JSON body for POST.
app.get("/get-subtitles", handleGetSubtitles);
app.post("/get-subtitles", handleGetSubtitles);

// Status endpoint: reports the request counters and the memory usage of this
// Node process in whole megabytes.
app.get("/", (req, res) => {
  const { rss, heapUsed, heapTotal } = process.memoryUsage();
  const toMb = (bytes) => `${Math.round(bytes / 1024 / 1024)}MB`;

  res.json({
    status: "running",
    message: "🎬 Subtitle Scraper API",
    requestCount,
    activeRequests,
    memory: {
      rss: toMb(rss),
      heap: `${toMb(heapUsed)} / ${toMb(heapTotal)}`,
    },
  });
});
|
| 393 |
+
|
| 394 |
+
// ─── Start ─────────────────────────────────────────────────────────────────
|
| 395 |
+
|
| 396 |
+
// Debug endpoint: GET /debug-screenshot?url=<http(s) URL> loads the page in the
// shared browser and responds with a full-page PNG screenshot.
//
// NOTE(review): this endpoint makes the server fetch a caller-supplied URL and
// is not authenticated. The scheme is restricted to http/https below, but
// internal hosts are still reachable; consider disabling it or adding an
// allow-list before exposing the service publicly.
app.get("/debug-screenshot", async (req, res) => {
  const { url } = req.query;
  if (!url) return res.status(400).send("URL required");

  // A repeated ?url= parameter is not a string; reject it along with unparseable values.
  let target;
  try {
    if (typeof url !== "string") throw new TypeError("url must be a single string");
    target = new URL(url);
  } catch {
    return res.status(400).send("Invalid URL");
  }
  // Refuse file:, data:, chrome: and other non-web schemes.
  if (target.protocol !== "http:" && target.protocol !== "https:") {
    return res.status(400).send("Only http and https URLs are allowed");
  }

  let page;
  try {
    const browserInstance = await getBrowser();
    page = await browserInstance.newPage();
    await page.goto(target.href, { waitUntil: 'networkidle', timeout: 30000 });
    const buffer = await page.screenshot({ fullPage: true });
    res.setHeader('Content-Type', 'image/png');
    res.send(buffer);
  } catch (err) {
    res.status(500).send(err.message);
  } finally {
    // A failing close must not turn into an unhandled rejection after the response was sent.
    if (page) await page.close().catch(() => { });
  }
});
|
| 414 |
+
|
| 415 |
+
// Start the HTTP server on all interfaces and, once it is listening, launch the
// shared browser. A failed launch is logged and the server keeps running.
app.listen(PORT, "0.0.0.0", async () => {
  console.log(`Subtitle Scraper API listening on port ${PORT}`);
  try {
    await getBrowser();
    console.log("Browser initialized. Ready to scrape.");
  } catch (err) {
    console.error("CRITICAL: Failed to initialize browser on startup:", err.message);
  }
});
|
| 423 |
+
|
| 424 |
+
/**
 * Close the shared browser (if one exists) and exit the process.
 * The exit happens in `finally`, so a failing browser.close() is logged but
 * can never prevent the process from terminating.
 *
 * @param {string} signal - name of the signal that triggered the shutdown (used in the log)
 */
async function shutdown(signal) {
  try {
    if (browser) await browser.close();
  } catch (err) {
    console.error(`[SHUTDOWN] Failed to close browser on ${signal}: ${err?.message || err}`);
  } finally {
    process.exit();
  }
}

process.on("SIGINT", () => shutdown("SIGINT"));
process.on("SIGTERM", () => shutdown("SIGTERM"));
|
| 432 |
+
|
| 433 |
+
// Prevent crashes from killing the server

/**
 * Drop the shared browser reference so the next getBrowser() call launches a
 * fresh instance, and close the old instance in the background. Without the
 * close, the discarded browser would keep running with nothing left to shut it
 * down. Pages still open on the old instance will fail once it closes.
 */
function discardBrowserAfterCrash() {
  const stale = browser;
  browser = null;
  if (stale) {
    // Best effort only: an error raised here must not re-enter the crash guards.
    Promise.resolve()
      .then(() => stale.close())
      .catch(() => { });
  }
}

process.on("uncaughtException", (err) => {
  // A thrown value is not necessarily an Error (it can even be null).
  console.error("[CRASH GUARD] Uncaught exception:", err?.message || err);
  // Reset browser on crash
  discardBrowserAfterCrash();
});
process.on("unhandledRejection", (reason) => {
  console.error("[CRASH GUARD] Unhandled rejection:", reason?.message || reason);
  discardBrowserAfterCrash();
});
|