Hugoglu99 committed on
Commit
a0a672c
·
verified ·
1 Parent(s): 966cb6c

Upload server.js

Browse files
Files changed (1) hide show
  1. server.js +442 -0
server.js ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import express from "express";
2
+ import cors from "cors";
3
+ import { chromium } from "playwright-extra";
4
+ import stealthPlugin from "puppeteer-extra-plugin-stealth";
5
+
6
// Register the stealth plugin on Playwright's Chromium at module load.
chromium.use(stealthPlugin());

// Listening port: PORT from the environment when set, otherwise 7860.
const PORT = process.env.PORT || 7860;

// Express application; CORS and JSON body parsing are installed as
// application-level middleware.
const app = express();
for (const middleware of [cors(), express.json()]) {
  app.use(middleware);
}
13
+
14
// Language detection from subtitle filenames.
//
// Each entry recognises a language suffix placed directly before the subtitle
// extension: the three-letter form after "_" (e.g. "_eng.vtt") or the
// two-letter form after "-" or "_" (e.g. "-en.vtt"). Slovenian is only matched
// by its "_sli" suffix. Both ".vtt" and ".srt" are accepted because the scraper
// in this file collects URLs with either extension; matching only ".vtt" left
// every ".srt" track classified as "und".
//
// The patterns are unanchored, so a trailing query string does not prevent a
// match, and none of them uses the /g flag, so `.test()` is stateless.
const LANG_PATTERNS = [
  { pattern: /(_eng|[-_]en)\.(vtt|srt)/i, lang: "English", code: "en" },
  { pattern: /(_ara|[-_]ar)\.(vtt|srt)/i, lang: "Arabic", code: "ar" },
  { pattern: /(_fre|[-_]fr)\.(vtt|srt)/i, lang: "French", code: "fr" },
  { pattern: /(_spa|[-_]es)\.(vtt|srt)/i, lang: "Spanish", code: "es" },
  { pattern: /(_ger|[-_]de)\.(vtt|srt)/i, lang: "German", code: "de" },
  { pattern: /(_tur|[-_]tr)\.(vtt|srt)/i, lang: "Turkish", code: "tr" },
  { pattern: /(_por|[-_]pt)\.(vtt|srt)/i, lang: "Portuguese", code: "pt" },
  { pattern: /(_ita|[-_]it)\.(vtt|srt)/i, lang: "Italian", code: "it" },
  { pattern: /(_dut|[-_]nl)\.(vtt|srt)/i, lang: "Dutch", code: "nl" },
  { pattern: /(_rus|[-_]ru)\.(vtt|srt)/i, lang: "Russian", code: "ru" },
  { pattern: /(_chi|[-_]zh)\.(vtt|srt)/i, lang: "Chinese", code: "zh" },
  { pattern: /(_jpn|[-_]ja)\.(vtt|srt)/i, lang: "Japanese", code: "ja" },
  { pattern: /(_kor|[-_]ko)\.(vtt|srt)/i, lang: "Korean", code: "ko" },
  { pattern: /(_hin|[-_]hi)\.(vtt|srt)/i, lang: "Hindi", code: "hi" },
  { pattern: /(_ind|[-_]id)\.(vtt|srt)/i, lang: "Indonesian", code: "id" },
  { pattern: /(_may|[-_]ms)\.(vtt|srt)/i, lang: "Malay", code: "ms" },
  { pattern: /_sli\.(vtt|srt)/i, lang: "Slovenian", code: "sl" },
];
34
+
35
// Shared scraping state (module-wide).

// Upper bound on scraping requests handled at the same time.
const MAX_CONCURRENT = 2;
// Number of handled requests after which the browser is recycled.
const MAX_REQUESTS_BEFORE_RECYCLE = 10;

// The single Chromium instance reused across requests; undefined until launched.
let browser;
// Requests handled since the current browser was launched.
let requestCount = 0;
// Scraping requests currently in flight.
let activeRequests = 0;
41
+
42
// In-flight Chromium launch, shared by every caller that arrives while the
// browser is still starting. Null when no launch is pending.
let pendingBrowserLaunch = null;

/**
 * Return the shared Chromium instance, launching it when there is none or the
 * existing one has disconnected.
 *
 * Callers that arrive while a launch is already in progress wait for that same
 * launch instead of starting their own; previously each concurrent caller
 * launched a separate Chromium and all but the last one were leaked.
 *
 * Side effects: assigns the module-level `browser` and resets `requestCount`
 * to 0 whenever a fresh instance is launched.
 *
 * @returns {Promise<object>} The connected browser instance.
 * @throws Rejects with the launch error; the next call retries the launch.
 */
async function getBrowser() {
  if (browser && browser.isConnected()) {
    return browser;
  }

  if (!pendingBrowserLaunch) {
    console.log("Launching fresh browser instance...");
    pendingBrowserLaunch = chromium
      .launch({
        headless: true,
        args: [
          "--no-sandbox",
          "--disable-setuid-sandbox",
          "--disable-dev-shm-usage",
          "--disable-gpu",
          "--disable-extensions",
          "--disable-background-networking",
        ],
      })
      .then((instance) => {
        browser = instance;
        requestCount = 0;
        return instance;
      })
      .finally(() => {
        // Clear the marker on success and on failure so a failed launch can be retried.
        pendingBrowserLaunch = null;
      });
  }

  return pendingBrowserLaunch;
}
60
+
61
/**
 * Close the shared browser (if any) so the next `getBrowser()` call launches a
 * fresh one.
 *
 * The module-level `browser` reference is cleared BEFORE awaiting `close()`.
 * Otherwise a request arriving while the close is still pending could be
 * handed the instance that is being shut down.
 *
 * A failure to close is logged and otherwise ignored: recycling is best-effort
 * and must not fail the request that triggered it.
 *
 * @returns {Promise<void>}
 */
async function recycleBrowser() {
  if (!browser) {
    return;
  }

  console.log(`[MEMORY] Recycling browser after ${requestCount} requests...`);
  const staleBrowser = browser;
  browser = null;

  try {
    await staleBrowser.close();
  } catch (err) {
    console.warn(`[MEMORY] Failed to close recycled browser: ${err?.message ?? err}`);
  }
}
68
+
69
// Label-to-ISO-code mapping for metadata-based subtitle labels.
//
// Keys are lower-case English language names; values are two-letter codes.
// The table has a null prototype so that a label such as "constructor" or
// "__proto__" cannot resolve to an inherited Object.prototype member, and it
// is frozen because it is shared, read-only lookup data.
const LABEL_TO_CODE = Object.freeze(
  Object.assign(Object.create(null), {
    arabic: "ar",
    english: "en",
    french: "fr",
    spanish: "es",
    german: "de",
    turkish: "tr",
    portuguese: "pt",
    italian: "it",
    dutch: "nl",
    russian: "ru",
    chinese: "zh",
    japanese: "ja",
    korean: "ko",
    hindi: "hi",
    indonesian: "id",
    malay: "ms",
    slovenian: "sl",
    swedish: "sv",
    norwegian: "no",
    danish: "da",
    finnish: "fi",
    polish: "pl",
    romanian: "ro",
    croatian: "hr",
    czech: "cs",
    hungarian: "hu",
    greek: "el",
    thai: "th",
    vietnamese: "vi",
    hebrew: "he",
    persian: "fa",
    urdu: "ur",
  }),
);
80
+
81
/**
 * Map a human-readable subtitle label to its language code via LABEL_TO_CODE.
 *
 * Normalisation, in order:
 *   1. lower-case the label and strip trailing digits/whitespace
 *      ("English Hi2" -> "english hi");
 *   2. additionally strip a trailing " hi" marker ("english hi" -> "english")
 *      — presumably a hearing-impaired tag; confirm against the data source.
 * The form without the marker is looked up first, then the form from step 1.
 *
 * Only the table's own keys are consulted, so labels like "constructor" no
 * longer resolve to inherited Object.prototype members, and non-string labels
 * (possible when the label comes from parsed JSON) yield null instead of
 * throwing.
 *
 * @param {*} label - Label text, e.g. "Arabic" or "English Hi2".
 * @returns {string|null} The language code, or null when the label is unknown.
 */
function labelToCode(label) {
  if (typeof label !== "string") {
    return null;
  }

  const base = label.toLowerCase().replace(/[\d\s]+$/g, "").trim();
  const clean = base.replace(/\s+hi$/i, "").trim();

  for (const key of [clean, base]) {
    if (Object.hasOwn(LABEL_TO_CODE, key)) {
      return LABEL_TO_CODE[key];
    }
  }
  return null;
}
87
+
88
/**
 * Guess the language of a subtitle file from its URL.
 *
 * Resolution order:
 *   1. the first LANG_PATTERNS entry whose pattern matches the lower-cased URL;
 *   2. a filename that is itself a language name, optionally followed by
 *      digits (e.g. "/Arabic.vtt", "/english2.srt"), resolved via labelToCode;
 *   3. otherwise { lang: "Unknown", code: "und" }.
 *
 * The filename fallback accepts ".srt" as well as ".vtt" because the scraper
 * collects both, and a non-string URL yields the "Unknown" result instead of
 * throwing.
 *
 * @param {*} url - Subtitle URL.
 * @returns {{lang: string, code: string}} Display name and language code. For
 *   the filename fallback, `lang` is the lower-cased filename stem.
 */
function detectLang(url) {
  const unknown = { lang: "Unknown", code: "und" };
  if (typeof url !== "string") {
    return unknown;
  }

  const lowerUrl = url.toLowerCase();

  const known = LANG_PATTERNS.find(({ pattern }) => pattern.test(lowerUrl));
  if (known) {
    return { lang: known.lang, code: known.code };
  }

  // Also check if the filename itself is a language name (e.g. /Arabic.vtt).
  const filenameMatch = lowerUrl.match(/\/([a-z]+\d*)\.(?:vtt|srt)/);
  if (filenameMatch) {
    const code = labelToCode(filenameMatch[1]);
    if (code) {
      return { lang: filenameMatch[1], code };
    }
  }

  return unknown;
}
101
+
102
/**
 * Build the moviesapi page URL for a title.
 *
 * Every caller-supplied value is percent-encoded, so it cannot introduce extra
 * path segments, a query string or a fragment. Season/episode count as present
 * when they are neither undefined, null nor "", which keeps a numeric 0 usable.
 */
function buildMoviesApiPageUrl(tmdbId, type, season, episode) {
  const isPresent = (value) => value !== undefined && value !== null && value !== "";
  const segment = (value) => encodeURIComponent(String(value));

  if (type === "tv" && isPresent(season) && isPresent(episode)) {
    return `https://ww2.moviesapi.to/tv/${segment(tmdbId)}-${segment(season)}-${segment(episode)}`;
  }
  return `https://ww2.moviesapi.to/movie/${segment(tmdbId)}`;
}

/**
 * Step 1: Load the ww2.moviesapi.to page for a title in the shared headless
 * browser and return the URL of its embedded player iframe.
 *
 * Lookup order:
 *   1. an <iframe> whose src contains a known player marker (vidora.stream,
 *      flixcdn.cyou, /embed/, vidsrc, rabbitstream, 2embed), else the first
 *      iframe with an http(s) src;
 *   2. a regex search over the rendered HTML for the same markers.
 *
 * @param {string|number} tmdbId - Title id interpolated into the page URL.
 * @param {string} [type="movie"] - "tv" selects the episode page when season
 *   and episode are both supplied; anything else uses the movie page.
 * @param {string|number} [season] - Season number (tv only).
 * @param {string|number} [episode] - Episode number (tv only).
 * @returns {Promise<string|null>} The embed URL, or null when none is found or
 *   navigation/evaluation fails (the error is logged).
 * @throws Propagates failures to obtain the browser, create the context or
 *   open the page; the context is still closed in that case.
 */
async function getEmbedUrl(tmdbId, type = "movie", season, episode) {
  const pageUrl = buildMoviesApiPageUrl(tmdbId, type, season, episode);

  console.log(`[STEP1] Fetching ${pageUrl} via Playwright...`);
  const b = await getBrowser();
  const context = await b.newContext({
    userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
  });

  try {
    // Inside the outer try so the context is released even if this throws.
    const page = await context.newPage();

    try {
      await page.goto(pageUrl, { waitUntil: "networkidle", timeout: 25000 });

      // Wait for potential redirects and iframe loading
      await page.waitForTimeout(4000);

      // Find the most likely player iframe (runs in the page, so it must not
      // reference any Node-side variables).
      const embedUrl = await page.evaluate(() => {
        const iframes = Array.from(document.querySelectorAll('iframe'));
        // prioritize known domains, then fall back to any iframe with src
        const playerIframe = iframes.find(f =>
          f.src && (
            f.src.includes('vidora.stream') ||
            f.src.includes('flixcdn.cyou') ||
            f.src.includes('/embed/') ||
            f.src.includes('vidsrc') ||
            f.src.includes('rabbitstream') ||
            f.src.includes('2embed')
          )
        ) || iframes.find(f => f.src && f.src.startsWith('http'));
        return playerIframe ? playerIframe.src : null;
      });

      if (embedUrl) {
        console.log(`[STEP1] Found embed URL: ${embedUrl}`);
        return embedUrl;
      }

      // Fallback to searching the whole HTML if iframe not found via selector
      const html = await page.content();
      const iframeMatch = html.match(/src=["'](https?:\/\/[^"']+(vidora\.stream|flixcdn\.cyou|vidsrc|embed|rabbitstream|2embed)[^"']*)["']/i)
        || html.match(/src=["'](https?:\/\/[^"']+)["'].*?<\/iframe>/i);

      if (iframeMatch) {
        console.log(`[STEP1] Found embed URL (Regex): ${iframeMatch[1]}`);
        return iframeMatch[1];
      }

      // Log page details for debugging when nothing is found
      const pageText = await page.evaluate(() => document.body?.innerText || '');
      console.log(`[STEP1] No player iframe found for ID ${tmdbId}. Page text: ${pageText.substring(0, 300)}`);
      console.log(`[STEP1] Page URL after redirects: ${page.url()}`);
      console.log(`[STEP1] Iframes found: ${await page.evaluate(() => document.querySelectorAll('iframe').length)}`);
      return null;
    } catch (err) {
      console.error(`[STEP1 ERROR] ${err.message}`);
      return null;
    }
  } finally {
    // Closing the context also closes its pages. Cleanup is best-effort and
    // must not mask the lookup result.
    await context.close().catch(() => { });
  }
}
173
+
174
// User agent presented by the subtitle-hunting browser context and by the
// subtitle downloads.
const SUBTITLE_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120 Safari/537.36";

// Upper bound for a single subtitle download; without it a stalled CDN
// connection would keep the request (and its concurrency slot) busy forever.
const SUBTITLE_DOWNLOAD_TIMEOUT_MS = 15000;

/**
 * Read subtitle tracks advertised in the embed URL itself: a `subs` parameter
 * in the query string or in the fragment (common in flixcdn), holding a JSON
 * array of `{ url, label }` objects.
 *
 * The value returned by URLSearchParams is already percent-decoded, so it is
 * parsed as-is first; decoding it again is only a fallback for double-encoded
 * values. (Always decoding twice threw on any literal "%" and corrupted
 * percent-escapes inside the track URLs.) Malformed entries are skipped
 * individually rather than aborting the whole list.
 *
 * @returns {Array<{url: string, lang: string, code: string}>}
 */
function readSubtitleMetadata(embedUrl) {
  let rawSubs;
  try {
    const parsedUrl = new URL(embedUrl);
    rawSubs = parsedUrl.searchParams.get("subs")
      || new URLSearchParams(parsedUrl.hash.slice(1)).get("subs");
  } catch {
    return []; // Not a parseable URL, so there is no metadata to read.
  }
  if (!rawSubs) {
    return [];
  }

  console.log(`[STEP2] Found 'subs' parameter in URL`);

  let entries;
  try {
    entries = JSON.parse(rawSubs);
  } catch {
    try {
      entries = JSON.parse(decodeURIComponent(rawSubs));
    } catch (err) {
      console.warn(`[STEP2] Ignoring unparsable 'subs' parameter: ${err.message}`);
      return [];
    }
  }
  if (!Array.isArray(entries)) {
    return [];
  }

  const tracks = [];
  for (const entry of entries) {
    if (!entry || typeof entry.url !== "string" || entry.url === "") {
      continue;
    }
    let { lang, code } = detectLang(entry.url);
    // If metadata provides a label, use it for display, and for the code
    // whenever the label maps to a known language.
    if (typeof entry.label === "string" && entry.label !== "") {
      lang = entry.label;
      code = labelToCode(entry.label) || code;
    }
    tracks.push({ url: entry.url, lang, code });
  }
  return tracks;
}

/**
 * Open the embed page in the shared browser and add every subtitle URL it can
 * find to `tracksByUrl`: .vtt/.srt network requests, JWPlayer playlist tracks,
 * URLs inside inline scripts and <track> elements. Navigation errors are
 * logged, and whatever was collected up to that point is kept.
 */
async function huntSubtitlesWithBrowser(embedUrl, tracksByUrl) {
  const b = await getBrowser();
  const context = await b.newContext({ userAgent: SUBTITLE_USER_AGENT });

  try {
    const page = await context.newPage();

    page.on("request", (request) => {
      const reqUrl = request.url();
      if (/\.(vtt|srt)(\?.*)?$/i.test(reqUrl) && !tracksByUrl.has(reqUrl)) {
        const { lang, code } = detectLang(reqUrl);
        console.log(`[STEP2] Found subtitle (${lang}): ${reqUrl}`);
        tracksByUrl.set(reqUrl, { url: reqUrl, lang, code });
      }
    });

    try {
      await page.goto(embedUrl, { waitUntil: "domcontentloaded", timeout: 30000 });
      await page.waitForTimeout(3000);

      // Try extracting tracks directly from DOM/JWPlayer config (more
      // reliable). Runs in the page, so it must not reference Node-side
      // variables.
      const tracks = await page.evaluate(() => {
        const found = [];

        // 1. Look for JWPlayer tracks
        if (window.jwplayer && window.jwplayer().getConfig) {
          const config = window.jwplayer().getConfig();
          if (config.playlist && config.playlist[0] && config.playlist[0].tracks) {
            config.playlist[0].tracks.forEach(t => {
              if (t.file && (t.file.includes('.vtt') || t.file.includes('.srt'))) {
                found.push(t.file);
              }
            });
          }
        }

        // 2. Look for script tags with JSON configs
        document.querySelectorAll('script').forEach(s => {
          const content = s.textContent;
          if (content.includes('tracks') && content.includes('.vtt')) {
            const matches = content.match(/https?:\/\/[^"']+\.(vtt|srt)[^"']*/g);
            if (matches) found.push(...matches);
          }
        });

        // 3. Look for video/track elements
        document.querySelectorAll('track').forEach(t => {
          if (t.src) found.push(t.src);
        });

        return found;
      });

      for (const url of tracks) {
        console.log(`[STEP2] Evaluated Track: ${url}`);
        if (!tracksByUrl.has(url)) {
          const { lang, code } = detectLang(url);
          console.log(`[STEP2] Found subtitle (DOM): ${url} [${code}]`);
          tracksByUrl.set(url, { url, lang, code });
        }
      }

      // Click the centre of the page, then give any resulting subtitle
      // requests time to reach the request listener above.
      const box = await page.locator("body").boundingBox();
      if (box) {
        await page.mouse.click(box.x + box.width / 2, box.y + box.height / 2);
      }

      await page.waitForTimeout(5000);
    } catch (err) {
      console.error(`[STEP2] Navigation error: ${err.message}`);
    }
  } finally {
    // Closing the context also closes its pages; this now runs even when
    // newPage() itself fails. Cleanup is best-effort.
    await context.close().catch(() => { });
  }
}

/**
 * Download one subtitle track.
 *
 * @returns {Promise<object|null>} `{ lang, lang_code, url, content }`, or null
 *   when the request fails, times out, returns a non-2xx status, or the body
 *   is 50 characters or shorter.
 */
async function downloadSubtitle(track, referer) {
  try {
    console.log(`[DOWNLOAD] Attempting ${track.url}`);
    const resp = await fetch(track.url, {
      headers: {
        "User-Agent": SUBTITLE_USER_AGENT,
        "Referer": referer,
      },
      signal: AbortSignal.timeout(SUBTITLE_DOWNLOAD_TIMEOUT_MS),
    });
    console.log(`[DOWNLOAD] Status: ${resp.status} for ${track.url}`);
    if (!resp.ok) {
      return null;
    }

    const content = await resp.text();
    console.log(`[DOWNLOAD] Content length: ${content.length}`);
    if (content.length <= 50) {
      return null;
    }

    return { lang: track.lang, lang_code: track.code, url: track.url, content };
  } catch (e) {
    console.error(`[DOWNLOAD ERROR] ${e.message}`);
    return null;
  }
}

/**
 * Step 2: Collect the subtitle tracks for an embed URL and download them.
 *
 * Tracks advertised in the URL's own `subs` metadata are preferred. Only when
 * there are none is the page opened in Playwright to intercept VTT/SRT
 * requests and inspect the player configuration. Tracks are de-duplicated by
 * URL, filtered to the requested language codes plus undetermined ("und")
 * ones, and downloaded concurrently with the embed URL as Referer.
 *
 * @param {string} embedUrl - Player embed URL from step 1.
 * @param {string[]} [langs=["en", "ar"]] - Language codes to keep.
 * @returns {Promise<Array<{lang: string, lang_code: string, url: string, content: string}>>}
 *   Downloaded subtitles, in discovery order; failed downloads are omitted.
 */
async function scrapeSubtitles(embedUrl, langs = ["en", "ar"]) {
  console.log(`[STEP2] Scraping subtitles from ${embedUrl} ...`);

  // Keyed by URL: a Map gives O(1) de-duplication and preserves insertion order.
  const tracksByUrl = new Map();

  for (const track of readSubtitleMetadata(embedUrl)) {
    if (!tracksByUrl.has(track.url)) {
      console.log(`[STEP2] Found subtitle (URL Metadata - ${track.lang} [${track.code}]): ${track.url}`);
      tracksByUrl.set(track.url, track);
    }
  }

  // MEMORY OPTIMIZATION: If we found subtitles in the URL metadata, skip Playwright!
  if (tracksByUrl.size === 0) {
    console.log(`[STEP2] No subtitles in URL metadata. Launching Playwright to hunt for tracks...`);
    await huntSubtitlesWithBrowser(embedUrl, tracksByUrl);
  } else {
    console.log(`[STEP2] [MEMORY OPTIMIZATION] Skipping Playwright since ${tracksByUrl.size} tracks were found in metadata.`);
  }

  const vttUrls = [...tracksByUrl.values()];
  const filtered = vttUrls.filter((v) => langs.includes(v.code) || v.code === "und");
  console.log(`[STEP2] Total VTTs: ${vttUrls.length}, filtered: ${filtered.length}`);

  const downloads = await Promise.all(filtered.map((track) => downloadSubtitle(track, embedUrl)));
  return downloads.filter((result) => result !== null);
}
328
+
329
// ─── Subtitle Endpoint ──────────────────────────────────────────────────────

/**
 * Normalise the `langs` request parameter into a list of lower-case codes.
 *
 * Accepts a comma-separated string ("ar,en") or an array (a JSON body, or a
 * repeated query parameter). Blank entries are dropped, and an empty result
 * falls back to the default ["ar", "en"].
 */
function parseRequestedLangs(rawLangs) {
  const parts = Array.isArray(rawLangs) ? rawLangs : [rawLangs ?? ""];
  const langs = parts
    .flatMap((part) => String(part).split(","))
    .map((lang) => lang.trim().toLowerCase())
    .filter((lang) => lang !== "");
  return langs.length > 0 ? langs : ["ar", "en"];
}

/**
 * Express handler for GET/POST /get-subtitles.
 *
 * Parameters come from the JSON body (POST) or the query string (otherwise):
 *   - tmdb_id (required): title id; limited to letters, digits, "_" and "-"
 *     because it becomes part of an upstream URL;
 *   - type: "movie" (default) or "tv";
 *   - season, episode: used for type "tv";
 *   - langs: comma-separated string or array of codes (default "ar,en").
 *
 * Responses: 400 on a missing/invalid tmdb_id, 429 when MAX_CONCURRENT
 * requests are already running, 500 on scraping failure, otherwise 200 with
 * `{ tmdb_id, count, subtitles }` (plus `error` when no embed URL was found).
 *
 * Everything after the concurrency slot is taken runs inside try/finally, so
 * the slot is always released. Previously a non-string `langs` threw between
 * `activeRequests++` and the `try`, which leaked the slot for good and, after
 * MAX_CONCURRENT such requests, left the server answering 429 to everything.
 */
async function handleGetSubtitles(req, res) {
  // req.body can be undefined when no body parser matched the request.
  const data = (req.method === "POST" ? req.body : req.query) ?? {};
  const tmdb_id = data.tmdb_id;

  if (!tmdb_id) {
    return res.status(400).json({ error: "Missing tmdb_id parameter" });
  }
  if (!/^[\w-]+$/.test(String(tmdb_id))) {
    return res.status(400).json({ error: "Invalid tmdb_id parameter" });
  }

  // Concurrency limiter
  if (activeRequests >= MAX_CONCURRENT) {
    console.log(`[API] Rejecting request for ${tmdb_id} — too many concurrent requests (${activeRequests}/${MAX_CONCURRENT})`);
    return res.status(429).json({ error: "Server busy, try again in a few seconds", tmdb_id });
  }

  activeRequests++;
  try {
    const type = data.type || "movie";
    const season = data.season;
    const episode = data.episode;
    const langs = parseRequestedLangs(data.langs);

    console.log(`\n${"═".repeat(44)}`);
    console.log(`[API] ${req.method} Request: tmdb_id=${tmdb_id}, type=${type}, langs=${langs.join(",")} (active: ${activeRequests})`);

    const embedUrl = await getEmbedUrl(tmdb_id, type, season, episode);
    if (!embedUrl) {
      return res.json({ tmdb_id, count: 0, subtitles: [], error: "No embed URL found" });
    }

    const subtitles = await scrapeSubtitles(embedUrl, langs);

    console.log(`[API] Returning ${subtitles.length} subtitles for tmdb_id=${tmdb_id}`);
    res.json({ tmdb_id, count: subtitles.length, subtitles });
  } catch (err) {
    console.error(`[API ERROR] ${err.message}`);
    res.status(500).json({ error: "Scraping failed", details: err.message });
  } finally {
    activeRequests--;
    requestCount++;
    // Recycle browser periodically to free memory, but only once idle so no
    // in-flight request loses its browser.
    if (requestCount >= MAX_REQUESTS_BEFORE_RECYCLE && activeRequests === 0) {
      await recycleBrowser();
    }
  }
}
376
+
377
// The subtitle lookup is exposed on the same path for GET and POST; the
// handler picks the query string or the body based on the request method.
for (const method of ["get", "post"]) {
  app[method]("/get-subtitles", handleGetSubtitles);
}

// Status endpoint: reports request counters and process memory in megabytes.
app.get("/", (req, res) => {
  const toMegabytes = (bytes) => Math.round(bytes / 1024 / 1024);
  const { rss, heapUsed, heapTotal } = process.memoryUsage();

  res.json({
    status: "running",
    message: "🎬 Subtitle Scraper API",
    requestCount,
    activeRequests,
    memory: {
      rss: `${toMegabytes(rss)}MB`,
      heap: `${toMegabytes(heapUsed)}MB / ${toMegabytes(heapTotal)}MB`,
    },
  });
});
393
+
394
// ─── Debug ──────────────────────────────────────────────────────────────────

/**
 * GET /debug-screenshot?url=… — render a page in the shared browser and return
 * a full-page PNG screenshot.
 *
 * The target must be an absolute http(s) URL. Without the scheme check a
 * caller could pass e.g. a file:// URL and receive a rendering of files on
 * the server.
 *
 * NOTE(review): this endpoint is unauthenticated, is not covered by the
 * scraping concurrency limiter, and can still reach hosts on the server's
 * internal network. Consider disabling it outside of development.
 *
 * Responses: 400 for a missing, unparsable or non-http(s) URL; 500 with the
 * error message when navigation or the screenshot fails; otherwise image/png.
 */
app.get("/debug-screenshot", async (req, res) => {
  const { url } = req.query;
  if (!url) return res.status(400).send("URL required");

  let target;
  try {
    target = new URL(String(url));
  } catch {
    return res.status(400).send("Invalid URL");
  }
  if (target.protocol !== "http:" && target.protocol !== "https:") {
    return res.status(400).send("Only http(s) URLs are allowed");
  }

  let page;
  try {
    const browserInstance = await getBrowser();
    page = await browserInstance.newPage();
    await page.goto(target.href, { waitUntil: 'networkidle', timeout: 30000 });
    const buffer = await page.screenshot({ fullPage: true });
    res.setHeader('Content-Type', 'image/png');
    res.send(buffer);
  } catch (err) {
    res.status(500).send(err.message);
  } finally {
    // Best-effort cleanup: a close failure after the response was sent must
    // not surface as an unhandled rejection.
    if (page) await page.close().catch(() => { });
  }
});
414
+
415
// Bind on all interfaces, then warm up the shared browser so the first request
// does not pay for the launch. A failed warm-up is only logged; the server
// keeps listening.
app.listen(PORT, "0.0.0.0", async () => {
  console.log(`Subtitle Scraper API listening on port ${PORT}`);
  try {
    await getBrowser();
    console.log("Browser initialized. Ready to scrape.");
  } catch (err) {
    console.error("CRITICAL: Failed to initialize browser on startup:", err.message);
  }
});
423
+
424
/**
 * Signal handler: close the shared browser (if any), then exit.
 *
 * `process.exit()` sits in `finally` so the process terminates even when
 * closing the browser fails. Previously a rejected `browser.close()` skipped
 * the exit call and left the process running after SIGINT/SIGTERM.
 */
async function shutdownOnSignal() {
  try {
    if (browser) await browser.close();
  } catch (err) {
    console.error("[SHUTDOWN] Failed to close browser:", err?.message ?? err);
  } finally {
    process.exit();
  }
}

for (const signal of ["SIGINT", "SIGTERM"]) {
  process.on(signal, shutdownOnSignal);
}
432
+
433
// Prevent crashes from killing the server.
//
// After an uncaught exception or unhandled rejection the shared browser is
// treated as suspect and discarded, so the next getBrowser() call launches a
// fresh one. The old instance is closed (best-effort) instead of merely
// dropping the reference: an unreferenced browser object would leave its
// Chromium process running with nothing left to close it.

/** Detach the shared browser and close it in the background. */
function discardBrowserAfterCrash() {
  const staleBrowser = browser;
  browser = null;
  if (!staleBrowser) {
    return;
  }
  // Wrapped in a promise chain so that neither a synchronous throw nor a
  // rejection from close() can escape from inside a crash handler.
  Promise.resolve()
    .then(() => staleBrowser.close())
    .catch((err) => {
      console.error("[CRASH GUARD] Failed to close discarded browser:", err?.message ?? err);
    });
}

process.on("uncaughtException", (err) => {
  console.error("[CRASH GUARD] Uncaught exception:", err?.message ?? err);
  discardBrowserAfterCrash();
});
process.on("unhandledRejection", (reason) => {
  console.error("[CRASH GUARD] Unhandled rejection:", reason?.message || reason);
  discardBrowserAfterCrash();
});