Update sever.js
Browse files
sever.js
CHANGED
|
@@ -1,23 +1,20 @@
|
|
| 1 |
import express, { json } from "express";
|
| 2 |
import cors from "cors";
|
| 3 |
-
import { chromium } from "playwright
|
| 4 |
-
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
| 5 |
import pLimit from "p-limit";
|
| 6 |
-
import BrowserPool from './pool/BrowserPool.js'; //
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
import fetch from 'node-fetch';
|
| 9 |
import { JSDOM } from 'jsdom';
|
| 10 |
|
| 11 |
-
// 1. 初始化 Playwright Stealth 插件
|
| 12 |
-
chromium.use(StealthPlugin());
|
| 13 |
-
|
| 14 |
-
// 2. Express 服务器基础设置
|
| 15 |
const app = express();
|
| 16 |
-
const PORT = process.env.PORT ||
|
|
|
|
| 17 |
app.use(cors());
|
| 18 |
app.use(json());
|
| 19 |
|
| 20 |
-
// 3. 配置常量
|
| 21 |
const PROVIDERS = [
|
| 22 |
"https://vidsrc.xyz",
|
| 23 |
"https://vidsrc.in",
|
|
@@ -28,87 +25,71 @@ const PROVIDERS = [
|
|
| 28 |
];
|
| 29 |
|
| 30 |
let browserPool;
|
| 31 |
-
const cache = new LRU({ max: 500, ttl: 15 * 60 * 1000 });
|
| 32 |
-
const limit = pLimit(3); // 并发任务数,可根据服务器性能调整
|
| 33 |
|
| 34 |
-
//
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
if (!response.ok) {
|
| 40 |
-
throw new Error(`HTTP error! Status: ${response.status}`);
|
| 41 |
-
}
|
| 42 |
-
|
| 43 |
-
const html = await response.text();
|
| 44 |
-
const dom = new JSDOM(html);
|
| 45 |
-
const document = dom.window.document;
|
| 46 |
-
|
| 47 |
-
const script = Array.from(document.querySelectorAll('script')).find(s => s.textContent?.includes('new Playerjs'));
|
| 48 |
-
if (!script) {
|
| 49 |
-
console.warn(`[Helper] Player.js script not found on ${finalUrl}`);
|
| 50 |
-
return { hls_url: null, subtitles: [] };
|
| 51 |
-
}
|
| 52 |
-
|
| 53 |
-
const scriptContent = script.textContent;
|
| 54 |
-
let hls_url = null;
|
| 55 |
-
let subtitles = [];
|
| 56 |
-
|
| 57 |
-
const fileMatch = scriptContent.match(/file:"(.*?m3u8.*?)"/);
|
| 58 |
-
if (fileMatch && fileMatch[1]) {
|
| 59 |
-
hls_url = fileMatch[1];
|
| 60 |
-
}
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
}
|
| 70 |
-
|
| 71 |
-
return { hls_url, subtitles };
|
| 72 |
-
} catch (error) {
|
| 73 |
-
console.error(`[Helper] Error in getVideoAndSubtitles for ${finalUrl}:`, error);
|
| 74 |
-
throw error;
|
| 75 |
-
}
|
| 76 |
}
|
| 77 |
|
| 78 |
-
//
|
| 79 |
-
async function
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
console.log(`[${domain}] [Method B] Turnstile not found on page, skipping JS bypass.`);
|
| 97 |
-
} else {
|
| 98 |
-
console.warn(`[${domain}] [Method B] An error occurred during Turnstile JS handling: ${error.message}`);
|
| 99 |
}
|
| 100 |
-
}
|
| 101 |
}
|
| 102 |
|
| 103 |
-
|
|
|
|
| 104 |
async function scrapeProvider(domain, url, signal) {
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
const cleanup = async () => {
|
| 109 |
-
if (
|
| 110 |
-
if (
|
| 111 |
-
if (context) await context.close().catch(()=>{});
|
| 112 |
if (browserInstance) {
|
| 113 |
console.log(`[${domain}] Releasing browser ${browserInstance.id} back to pool.`);
|
| 114 |
await browserPool.release(browserInstance);
|
|
@@ -116,64 +97,76 @@ async function scrapeProvider(domain, url, signal) {
|
|
| 116 |
};
|
| 117 |
|
| 118 |
try {
|
| 119 |
-
if (signal.aborted) throw new Error('Aborted before starting');
|
| 120 |
-
|
| 121 |
browserInstance = await browserPool.get();
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
});
|
| 126 |
-
await context.addInitScript(() => { delete navigator.__proto__.webdriver; });
|
| 127 |
-
|
| 128 |
-
// 方法 A: 设置网络层拦截
|
| 129 |
-
const setupRequestInterception = async (p) => {
|
| 130 |
-
await p.route('**/*', (route) => {
|
| 131 |
-
if (signal.aborted) return route.abort();
|
| 132 |
-
if (route.request().url().includes('/rcp_verify')) {
|
| 133 |
-
console.log(`[${domain}] [Method A] Mocking /rcp_verify network request.`);
|
| 134 |
-
return route.fulfill({ status: 200, contentType: 'application/json', body: '1' });
|
| 135 |
-
}
|
| 136 |
-
return route.continue();
|
| 137 |
-
});
|
| 138 |
-
};
|
| 139 |
|
| 140 |
-
//
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
await page.goto(url, { waitUntil: 'networkidle', timeout: 30000, signal });
|
| 144 |
-
await handleTurnstile(page, domain, signal);
|
| 145 |
-
|
| 146 |
-
const firstIframeSrc = await page.evaluate(() => {
|
| 147 |
-
const iframe = document.querySelector('#player_iframe');
|
| 148 |
-
if (!iframe) return null;
|
| 149 |
-
let src = iframe.getAttribute('src') || '';
|
| 150 |
-
return src.startsWith('//') ? `https:${src}` : src;
|
| 151 |
});
|
| 152 |
-
if (!firstIframeSrc) throw new Error('First iframe (#player_iframe) not found');
|
| 153 |
-
console.log(`[${domain}] Found first iframe src: ${firstIframeSrc}`);
|
| 154 |
|
| 155 |
-
//
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
|
|
|
| 160 |
|
| 161 |
-
|
| 162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
});
|
| 164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
console.log(`[${domain}] Found final iframe src: ${finalIframeSrc}`);
|
|
|
|
| 166 |
|
| 167 |
-
//
|
| 168 |
-
const {
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
-
return { source_domain: domain, hls_url, subtitles, error: null };
|
| 172 |
} catch (error) {
|
| 173 |
-
if (
|
| 174 |
-
console.log(`[${domain}]
|
| 175 |
} else {
|
| 176 |
-
console.error(`[${domain}]
|
| 177 |
}
|
| 178 |
throw error;
|
| 179 |
} finally {
|
|
@@ -181,7 +174,7 @@ async function scrapeProvider(domain, url, signal) {
|
|
| 181 |
}
|
| 182 |
}
|
| 183 |
|
| 184 |
-
//
|
| 185 |
app.get("/extract", async (req, res) => {
|
| 186 |
const type = req.query.type || "movie";
|
| 187 |
const tmdb_id = req.query.tmdb_id;
|
|
@@ -222,7 +215,7 @@ app.get("/extract", async (req, res) => {
|
|
| 222 |
|
| 223 |
console.log(`\nSuccess from [${firstSuccessfulResult.source_domain}]. Aborting other scrapers.`);
|
| 224 |
controller.abort();
|
| 225 |
-
|
| 226 |
const response = { success: true, result: firstSuccessfulResult };
|
| 227 |
cache.set(cacheKey, response);
|
| 228 |
res.json(response);
|
|
@@ -238,14 +231,16 @@ app.get("/extract", async (req, res) => {
|
|
| 238 |
}
|
| 239 |
});
|
| 240 |
|
| 241 |
-
// 8. 启动服务器和浏览器池
|
| 242 |
(async () => {
|
| 243 |
try {
|
| 244 |
browserPool = new BrowserPool({
|
| 245 |
chromium: chromium,
|
| 246 |
minSize: 1,
|
| 247 |
-
maxSize:
|
| 248 |
-
maxUsage:
|
|
|
|
|
|
|
|
|
|
| 249 |
});
|
| 250 |
await browserPool.initialize();
|
| 251 |
console.log("Browser pool initialized successfully.");
|
|
@@ -256,7 +251,6 @@ app.get("/extract", async (req, res) => {
|
|
| 256 |
}
|
| 257 |
})();
|
| 258 |
|
| 259 |
-
// 9. 优雅关停
|
| 260 |
process.on("SIGINT", async () => {
|
| 261 |
console.log("Shutting down gracefully...");
|
| 262 |
if (browserPool) await browserPool.shutdown();
|
|
|
|
| 1 |
import express, { json } from "express";
|
| 2 |
import cors from "cors";
|
| 3 |
+
import { chromium } from "playwright";
|
|
|
|
| 4 |
import pLimit from "p-limit";
|
| 5 |
+
import BrowserPool from './pool/BrowserPool.js'; // 您的浏览器池代码保持不变
|
| 6 |
+
// [MODIFICATION] 修正 lru-cache 的导入语法
|
| 7 |
+
import { LRUCache } from 'lru-cache';
|
| 8 |
+
// [MODIFICATION] 导入新的辅助库
|
| 9 |
import fetch from 'node-fetch';
|
| 10 |
import { JSDOM } from 'jsdom';
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
const app = express();
|
| 13 |
+
const PORT = process.env.PORT || 7860; // 建议在 Hugging Face 上使用 7860
|
| 14 |
+
|
| 15 |
app.use(cors());
|
| 16 |
app.use(json());
|
| 17 |
|
|
|
|
| 18 |
const PROVIDERS = [
|
| 19 |
"https://vidsrc.xyz",
|
| 20 |
"https://vidsrc.in",
|
|
|
|
| 25 |
];
|
| 26 |
|
| 27 |
let browserPool;
|
|
|
|
|
|
|
| 28 |
|
| 29 |
+
// [MODIFICATION] 修正 lru-cache 的实例化语法
|
| 30 |
+
const cache = new LRUCache({
|
| 31 |
+
max: 500,
|
| 32 |
+
ttl: 15 * 60 * 1000,
|
| 33 |
+
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
+
const limit = pLimit(2);
|
| 36 |
+
|
| 37 |
+
// [MODIFICATION] 新增的辅助函数,用于处理 CF Turnstile 的前端 JS 绕过
|
| 38 |
+
/**
 * Best-effort handling of a Cloudflare Turnstile widget on the given page.
 *
 * Waits up to 5 seconds for a visible `.cf-turnstile` element. If one shows
 * up, calls the page-global `window.cftCallback` with a random placeholder
 * token (only when that global is a function) and then pauses for 2 seconds.
 *
 * This helper never rejects: every failure is logged and swallowed so the
 * caller can carry on with its own checks.
 *
 * @param {object} page - Playwright-style page exposing `waitForSelector`,
 *   `evaluate` and `waitForTimeout`.
 * @returns {Promise<void>}
 */
async function handleTurnstile(page) {
    try {
        await page.waitForSelector('.cf-turnstile', { state: 'visible', timeout: 5000 });
        console.log('Turnstile detected - attempting generic bypass...');

        // Report back whether the callback actually existed so the log below
        // reflects what really happened inside the page.
        const callbackInvoked = await page.evaluate(() => {
            if (typeof window.cftCallback !== 'function') {
                return false;
            }
            const mockToken = 'mock-token-' + Math.random().toString(36).substring(2);
            window.cftCallback(mockToken);
            return true;
        });

        if (callbackInvoked) {
            console.log('Turnstile JS callback triggered.');
        } else {
            console.log('Turnstile detected but window.cftCallback is not a function; no callback triggered.');
        }
        await page.waitForTimeout(2000);
    } catch (error) {
        // The thrown value is not guaranteed to be an Error with a string
        // `name`; reading it defensively keeps this handler from throwing.
        const errorName = typeof error?.name === 'string' ? error.name : '';
        if (errorName.includes('Timeout')) {
            console.log('Turnstile not found on page, skipping bypass.');
        } else {
            console.warn('An error occurred during Turnstile handling:', error?.message ?? String(error));
        }
    }
}
|
| 58 |
|
| 59 |
+
// [MODIFICATION] 新增的辅助函数,用于从最终页面提取视频链接
|
| 60 |
+
/**
 * Downloads the final player page and extracts the HLS playlist URL and the
 * subtitle entries from its inline Playerjs configuration.
 *
 * The page is fetched over HTTP, parsed with JSDOM, and the first `<script>`
 * whose text contains `new Playerjs` is inspected for `file:"..."` (must
 * contain `m3u8`) and `subtitle:"..."` (comma-separated list).
 *
 * @param {string} finalUrl - URL of the page embedding the player. A
 *   protocol-relative value (`//host/path`) is resolved against `https:`.
 * @returns {Promise<{hlsUrl: string|null, subtitles: string[]}>} `hlsUrl` is
 *   `null` when no Playerjs script or no m3u8 `file` entry is found.
 * @throws {Error} When the request fails or answers with a non-OK status;
 *   the error is logged and rethrown unchanged.
 */
async function getVideoAndSubtitles(finalUrl) {
    try {
        // iframe `src` attributes are often protocol-relative, which fetch()
        // cannot resolve without a base URL.
        const requestUrl = typeof finalUrl === 'string' && finalUrl.startsWith('//')
            ? `https:${finalUrl}`
            : finalUrl;

        const response = await fetch(requestUrl);
        if (!response.ok) throw new Error(`HTTP error! Status: ${response.status}`);

        const html = await response.text();
        const dom = new JSDOM(html);
        const script = Array.from(dom.window.document.querySelectorAll('script'))
            .find((s) => s.textContent?.includes('new Playerjs'));
        if (!script) return { hlsUrl: null, subtitles: [] };

        const scriptText = script.textContent;
        // [^"]* keeps each capture inside its own quoted value, so a later
        // property containing "m3u8" cannot be glued onto the `file` value.
        const fileMatch = scriptText.match(/file:"([^"]*m3u8[^"]*)"/);
        const subtitlesMatch = scriptText.match(/subtitle:"([^"]*)"/);

        return {
            hlsUrl: fileMatch ? fileMatch[1] : null,
            subtitles: subtitlesMatch
                ? subtitlesMatch[1].split(',').map((s) => s.trim()).filter(Boolean)
                : [],
        };
    } catch (error) {
        console.error('Error in getVideoAndSubtitles:', error);
        throw error;
    }
}
|
| 79 |
|
| 80 |
+
|
| 81 |
+
// [MODIFICATION] 对 scrapeProvider 函数进行“外科手术式”升级
|
| 82 |
async function scrapeProvider(domain, url, signal) {
|
| 83 |
+
if (signal.aborted) throw new Error('Scraping aborted before starting.');
|
| 84 |
+
console.log(`\n[${domain}] Starting UPGRADED scrape for URL: ${url}`);
|
| 85 |
+
|
| 86 |
+
let browserInstance = null;
|
| 87 |
+
let context = null;
|
| 88 |
+
let page = null;
|
| 89 |
|
| 90 |
const cleanup = async () => {
|
| 91 |
+
if (page && !page.isClosed()) await page.close().catch(() => {});
|
| 92 |
+
if (context) await context.close().catch(() => {});
|
|
|
|
| 93 |
if (browserInstance) {
|
| 94 |
console.log(`[${domain}] Releasing browser ${browserInstance.id} back to pool.`);
|
| 95 |
await browserPool.release(browserInstance);
|
|
|
|
| 97 |
};
|
| 98 |
|
| 99 |
try {
|
|
|
|
|
|
|
| 100 |
browserInstance = await browserPool.get();
|
| 101 |
+
const browser = browserInstance.browser;
|
| 102 |
+
console.log(`[${domain}] Acquired browser ${browserInstance.id}`);
|
| 103 |
+
if (signal.aborted) throw new Error('Scraping aborted.');
|
| 104 |
+
|
| 105 |
+
context = await browser.newContext({
|
| 106 |
+
userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
| 107 |
+
ignoreHTTPSErrors: true
|
| 108 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
+
// [MODIFICATION] 注入脚本,手动实现 StealthPlugin 的核心功能
|
| 111 |
+
await context.addInitScript(() => {
|
| 112 |
+
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
});
|
|
|
|
|
|
|
| 114 |
|
| 115 |
+
// [MODIFICATION] 设置强大的网络拦截,应用于所有页面
|
| 116 |
+
await context.route("**/*", (route) => {
|
| 117 |
+
if (signal.aborted) return route.abort();
|
| 118 |
+
const request = route.request();
|
| 119 |
+
const reqUrl = request.url();
|
| 120 |
+
const resourceType = request.resourceType();
|
| 121 |
|
| 122 |
+
// 1. 伪造 Turnstile 验证
|
| 123 |
+
if (reqUrl.includes('/rcp_verify')) {
|
| 124 |
+
return route.fulfill({ status: 200, contentType: 'application/json', body: '1' });
|
| 125 |
+
}
|
| 126 |
+
// 2. 阻止不必要的资源以节省内存
|
| 127 |
+
if (['image', 'stylesheet', 'font', 'media'].includes(resourceType)) {
|
| 128 |
+
return route.abort();
|
| 129 |
+
}
|
| 130 |
+
// 3. 放行其他请求
|
| 131 |
+
return route.continue();
|
| 132 |
});
|
| 133 |
+
|
| 134 |
+
page = await context.newPage();
|
| 135 |
+
if (signal.aborted) throw new Error('Scraping aborted.');
|
| 136 |
+
|
| 137 |
+
// --- 新的、多阶段的抓取逻辑 ---
|
| 138 |
+
// 阶段 1: 访问初始页面
|
| 139 |
+
await page.goto(url, { waitUntil: "networkidle", timeout: 60000, signal });
|
| 140 |
+
await handleTurnstile(page);
|
| 141 |
+
|
| 142 |
+
const firstIframeSrc = await page.locator('#player_iframe').getAttribute('src');
|
| 143 |
+
if (!firstIframeSrc) throw new Error('First iframe (#player_iframe) not found');
|
| 144 |
+
console.log(`[${domain}] Found first iframe src: ${firstIframeSrc}`);
|
| 145 |
+
if (signal.aborted) throw new Error('Scraping aborted.');
|
| 146 |
+
|
| 147 |
+
// 阶段 2: 访问第一个 iframe
|
| 148 |
+
await page.goto(firstIframeSrc, { waitUntil: "networkidle", timeout: 60000, signal });
|
| 149 |
+
await handleTurnstile(page);
|
| 150 |
+
|
| 151 |
+
const finalIframeSrc = await page.frameLocator('iframe').locator('iframe').getAttribute('src') || await page.locator('iframe').getAttribute('src');
|
| 152 |
+
if (!finalIframeSrc) throw new Error('Final iframe source not found');
|
| 153 |
console.log(`[${domain}] Found final iframe src: ${finalIframeSrc}`);
|
| 154 |
+
if (signal.aborted) throw new Error('Scraping aborted.');
|
| 155 |
|
| 156 |
+
// 阶段 3: 从最终源提取数据
|
| 157 |
+
const { hlsUrl, subtitles } = await getVideoAndSubtitles(finalIframeSrc);
|
| 158 |
+
|
| 159 |
+
if (!hlsUrl) {
|
| 160 |
+
throw new Error("HLS URL not found after all stages");
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
return { source_domain: domain, hls_url: hlsUrl, subtitles, error: null };
|
| 164 |
|
|
|
|
| 165 |
} catch (error) {
|
| 166 |
+
if (error.name === 'AbortError' || (signal && signal.aborted)) {
|
| 167 |
+
console.log(`[${domain}] Scraping was aborted.`);
|
| 168 |
} else {
|
| 169 |
+
console.error(`[${domain}] Error in scrapeProvider: ${error.message}`);
|
| 170 |
}
|
| 171 |
throw error;
|
| 172 |
} finally {
|
|
|
|
| 174 |
}
|
| 175 |
}
|
| 176 |
|
| 177 |
+
// [MODIFICATION] 您的 /extract 路由和启动逻辑完全保持不变,因为它们的设计已经非常优秀
|
| 178 |
app.get("/extract", async (req, res) => {
|
| 179 |
const type = req.query.type || "movie";
|
| 180 |
const tmdb_id = req.query.tmdb_id;
|
|
|
|
| 215 |
|
| 216 |
console.log(`\nSuccess from [${firstSuccessfulResult.source_domain}]. Aborting other scrapers.`);
|
| 217 |
controller.abort();
|
| 218 |
+
|
| 219 |
const response = { success: true, result: firstSuccessfulResult };
|
| 220 |
cache.set(cacheKey, response);
|
| 221 |
res.json(response);
|
|
|
|
| 231 |
}
|
| 232 |
});
|
| 233 |
|
|
|
|
| 234 |
(async () => {
|
| 235 |
try {
|
| 236 |
browserPool = new BrowserPool({
|
| 237 |
chromium: chromium,
|
| 238 |
minSize: 1,
|
| 239 |
+
maxSize: 5,
|
| 240 |
+
maxUsage: 100,
|
| 241 |
+
launchOptions: { // [MODIFICATION] 为 Docker/HuggingFace 环境添加必要的启动参数
|
| 242 |
+
args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
|
| 243 |
+
}
|
| 244 |
});
|
| 245 |
await browserPool.initialize();
|
| 246 |
console.log("Browser pool initialized successfully.");
|
|
|
|
| 251 |
}
|
| 252 |
})();
|
| 253 |
|
|
|
|
| 254 |
process.on("SIGINT", async () => {
|
| 255 |
console.log("Shutting down gracefully...");
|
| 256 |
if (browserPool) await browserPool.shutdown();
|