stnh70 committed on
Commit
e69ef9d
·
verified ·
1 Parent(s): 872dee4

Update server.js

Browse files
Files changed (1) hide show
  1. server.js +86 -95
server.js CHANGED
@@ -1,51 +1,29 @@
1
- // server.js
2
 
3
  import express, { json } from "express";
4
  import cors from "cors";
5
- import puppeteer from 'puppeteer-extra';
6
  import StealthPlugin from 'puppeteer-extra-plugin-stealth';
7
- import { executablePath } from 'puppeteer';
8
  import fetch from 'node-fetch';
9
  import { JSDOM } from 'jsdom';
 
10
 
11
- // --- 1. 初始化 Puppeteer 和插件 ---
12
- puppeteer.use(StealthPlugin());
13
  const app = express();
14
- const PORT = process.env.PORT || 3000;
15
  app.use(cors());
16
  app.use(json());
17
 
 
18
 
19
- // --- 2. 优化后的辅助函数 ---
 
 
20
 
21
- /**
22
- * [优化] 移除了对特定 sitekey 的检查,使其更通用和健壮。
23
- * 现在它会等待任何 Turnstile 控件出现,而不是只寻找特定的那一个。
24
- */
25
- async function handleTurnstile(page) {
26
- try {
27
- await page.waitForSelector('.cf-turnstile', { timeout: 5000 });
28
- console.log('Turnstile detected - attempting generic bypass...');
29
-
30
- await page.evaluate(() => {
31
- if (typeof window.cftCallback === 'function') {
32
- const mockToken = 'mock-token-' + Math.random().toString(36).substring(2);
33
- window.cftCallback(mockToken);
34
- }
35
- });
36
- console.log('Turnstile JS callback triggered.');
37
- await page.waitForTimeout(2000);
38
- } catch (error) {
39
- if (error.name === 'TimeoutError') {
40
- console.log('Turnstile not found on page, skipping bypass.');
41
- } else {
42
- console.warn('An error occurred during Turnstile handling:', error.message);
43
- }
44
- }
45
- }
46
 
47
  /**
48
- * 您的参考代码中用于提取最终视频链接的函数,保持不变。
49
  */
50
  async function getVideoAndSubtitles(finalUrl) {
51
  try {
@@ -56,8 +34,11 @@ async function getVideoAndSubtitles(finalUrl) {
56
  const html = await response.text();
57
  const dom = new JSDOM(html);
58
  const script = Array.from(dom.window.document.querySelectorAll('script')).find(s => s.textContent?.includes('new Playerjs'));
59
- if (!script) return { videoFileUrl: null, subtitleSources: [] };
60
-
 
 
 
61
  const fileMatch = script.textContent.match(/file:"(.*?m3u8.*?)"/);
62
  const subtitlesMatch = script.textContent.match(/subtitle:"(.*?)"/);
63
 
@@ -71,78 +52,93 @@ async function getVideoAndSubtitles(finalUrl) {
71
  }
72
  }
73
 
74
-
75
- // --- 3. 核心抓取逻辑 ---
76
-
77
  /**
78
- * [重大重构]
79
- * - 这是一个统一的抓取函数,支持电影和电视剧。
80
- * - 它接收一个已经启动的 browser 实例,避免了每次请求都重复启动浏览器的巨大开销。
81
- * - 使用了隔离的浏览器上下文 (Incognito Context) 来确保每个请求之间互不干扰。
82
  */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  async function scrapeSource(browser, { type, id, season, episode }) {
84
  let context = null;
85
- console.log(`\nScraping for: type=${type}, id=${id}, s=${season}, e=${episode}`);
86
 
87
  try {
88
- // --- URL 构建 ---
89
  let initialUrl;
90
- const domain = "https://vidsrc.xyz"; // 您可以从一个列表中选择,这里以一个为例
91
  if (type === 'tv') {
92
  initialUrl = `${domain}/embed/tv?tmdb=${id}&season=${season}&episode=${episode}`;
93
- } else { // 默认为 movie
94
  initialUrl = `${domain}/embed/movie/${id}`;
95
  }
96
  console.log(`Initial URL: ${initialUrl}`);
97
 
98
- // --- 浏览器操作 ---
99
- context = await browser.createBrowserContext();
100
- const page = await context.newPage();
101
 
102
- // 设置通用的请求拦截,绕过 Turnstile API 验证
103
- await page.setRequestInterception(true);
104
- page.on('request', (request) => {
105
- if (request.url().includes('/rcp_verify')) {
106
- request.respond({ status: 200, contentType: 'application/json', body: '1' });
107
- } else {
108
- request.continue();
109
  }
 
 
 
 
110
  });
111
 
112
- // 阶段 1: 访问初始页面
113
- await page.goto(initialUrl, { waitUntil: 'networkidle2', timeout: 45000 });
 
 
114
  await handleTurnstile(page);
115
-
116
- const firstIframeSrc = await page.evaluate(() => document.querySelector('#player_iframe')?.src);
117
  if (!firstIframeSrc) throw new Error('First iframe (#player_iframe) not found');
118
  console.log(`First iframe src: ${firstIframeSrc}`);
119
 
120
- // 阶段 2: 访问第一个 Iframe
121
- // [优化] 我们可以在同一个页面对象上导航,无需创建新页面
122
- await page.goto(firstIframeSrc, { waitUntil: 'networkidle2', timeout: 45000 });
123
  await handleTurnstile(page);
124
-
125
- const finalIframeSrc = await page.evaluate(() => document.querySelector('iframe')?.src);
126
  if (!finalIframeSrc) throw new Error('Final iframe source not found');
127
  console.log(`Final iframe src: ${finalIframeSrc}`);
128
 
129
- // 阶段 3: 提取数据
130
  return await getVideoAndSubtitles(finalIframeSrc);
131
 
132
  } catch (error) {
133
  console.error('Full error during scraping process:', error.message);
134
- throw error; // 将错误向上抛出,由 Express 路由统一处理
135
  } finally {
136
  if (context) {
137
- await context.close(); // 关闭上下文,释放所有页面和资源,非常高效
138
  }
139
  }
140
  }
141
 
142
-
143
- // --- 4. Express 服务器和路由 ---
144
-
145
- let browser; // 全局浏览器实例
146
 
147
  app.get("/extract", async (req, res) => {
148
  const type = req.query.type || 'movie';
@@ -150,19 +146,26 @@ app.get("/extract", async (req, res) => {
150
  const season = req.query.season;
151
  const episode = req.query.episode;
152
 
153
- // --- 参数校验 ---
154
  if (!id) {
155
  return res.status(400).json({ success: false, error: "tmdb_id is required" });
156
  }
157
  if (type === 'tv' && (!season || !episode)) {
158
  return res.status(400).json({ success: false, error: "season and episode are required for type 'tv'" });
159
  }
 
 
 
 
 
 
 
160
 
161
  try {
162
  const result = await scrapeSource(browser, { type, id, season, episode });
163
-
164
  if (result && result.videoFileUrl) {
165
- res.status(200).json({ success: true, result });
 
 
166
  } else {
167
  res.status(404).json({ success: false, error: "Could not find video stream from the source." });
168
  }
@@ -171,31 +174,18 @@ app.get("/extract", async (req, res) => {
171
  }
172
  });
173
 
174
-
175
- // --- 5. 服务器启动和优雅关停 ---
176
-
177
  (async () => {
178
  try {
179
- console.log("Launching a persistent browser instance...");
180
- browser = await puppeteer.launch({
181
- headless: true, // 在生产环境中建议设为 true
182
- executablePath: executablePath(),
183
- args: [
184
- '--no-sandbox',
185
- '--disable-setuid-sandbox',
186
- '--disable-dev-shm-usage', // 解决临时文件空间不足的问题
187
- '--disable-accelerated-2d-canvas',
188
- '--no-first-run',
189
- '--no-zygote',
190
- '--single-process', // 在某些资源受限的环境中有帮助
191
- '--disable-gpu'
192
- ],
193
- defaultViewport: null
194
  });
195
  console.log("Browser launched successfully.");
196
-
197
  app.listen(PORT, () => {
198
- console.log(`🚀 Scraper server running at http://localhost:${PORT}`);
199
  });
200
  } catch (error) {
201
  console.error("Failed to launch browser:", error);
@@ -203,6 +193,7 @@ app.get("/extract", async (req, res) => {
203
  }
204
  })();
205
 
 
206
  const gracefulShutdown = async () => {
207
  console.log("\nShutting down gracefully...");
208
  if (browser) {
 
1
+ // server.js (Playwright Final Version - COMPLETE)
2
 
3
  import express, { json } from "express";
4
  import cors from "cors";
5
+ import { chromium } from "playwright-extra";
6
  import StealthPlugin from 'puppeteer-extra-plugin-stealth';
 
7
  import fetch from 'node-fetch';
8
  import { JSDOM } from 'jsdom';
9
+ import LRU from 'lru-cache';
10
 
11
+ // --- 1. 初始化 ---
12
+ chromium.use(StealthPlugin());
13
  const app = express();
 
14
  app.use(cors());
15
  app.use(json());
16
 
17
+ const cache = new LRU({ max: 500, ttl: 15 * 60 * 1000 });
18
 
19
+ // 定义要阻止的资源类型和广告域名,以减少内存消耗
20
+ const blockedResourceTypes = new Set(['image', 'stylesheet', 'font', 'media']);
21
+ const blockedDomains = ['googlesyndication.com', 'googletagmanager.com', 'google-analytics.com', 'doubleclick.net'];
22
 
23
+ // --- 2. 辅助函数 ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  /**
26
+ * 从最终的 HTML 内容中提取视频和字幕信息
27
  */
28
  async function getVideoAndSubtitles(finalUrl) {
29
  try {
 
34
  const html = await response.text();
35
  const dom = new JSDOM(html);
36
  const script = Array.from(dom.window.document.querySelectorAll('script')).find(s => s.textContent?.includes('new Playerjs'));
37
+ if (!script) {
38
+ console.warn(`[Helper] Player.js script not found on ${finalUrl}`);
39
+ return { videoFileUrl: null, subtitleSources: [] };
40
+ }
41
+
42
  const fileMatch = script.textContent.match(/file:"(.*?m3u8.*?)"/);
43
  const subtitlesMatch = script.textContent.match(/subtitle:"(.*?)"/);
44
 
 
52
  }
53
  }
54
 
 
 
 
55
  /**
56
+ * 尝试在页面上找到 Cloudflare Turnstile 并触发其 JS 回调
 
 
 
57
  */
58
+ async function handleTurnstile(page) {
59
+ try {
60
+ await page.waitForSelector('.cf-turnstile', { state: 'visible', timeout: 5000 });
61
+ console.log('Turnstile detected - attempting generic bypass...');
62
+ await page.evaluate(() => {
63
+ if (typeof window.cftCallback === 'function') {
64
+ const mockToken = 'mock-token-' + Math.random().toString(36).substring(2);
65
+ window.cftCallback(mockToken);
66
+ }
67
+ });
68
+ console.log('Turnstile JS callback triggered.');
69
+ await page.waitForTimeout(2000);
70
+ } catch (error) {
71
+ if (error.name.includes('Timeout')) {
72
+ console.log('Turnstile not found on page, skipping bypass.');
73
+ } else {
74
+ console.warn('An error occurred during Turnstile handling:', error.message);
75
+ }
76
+ }
77
+ }
78
+
79
+ // --- 3. 核心抓取逻辑 (Playwright 版本) ---
80
  async function scrapeSource(browser, { type, id, season, episode }) {
81
  let context = null;
82
+ console.log(`\nScraping with Playwright for: type=${type}, id=${id}, s=${season}, e=${episode}`);
83
 
84
  try {
 
85
  let initialUrl;
86
+ const domain = "https://vidsrc.xyz"; // 可以修改为其他域名
87
  if (type === 'tv') {
88
  initialUrl = `${domain}/embed/tv?tmdb=${id}&season=${season}&episode=${episode}`;
89
+ } else {
90
  initialUrl = `${domain}/embed/movie/${id}`;
91
  }
92
  console.log(`Initial URL: ${initialUrl}`);
93
 
94
+ context = await browser.newContext();
 
 
95
 
96
+ await context.route('**/*', (route) => {
97
+ const request = route.request();
98
+ const url = request.url();
99
+ const resourceType = request.resourceType();
100
+
101
+ if (url.includes('/rcp_verify')) {
102
+ return route.fulfill({ status: 200, contentType: 'application/json', body: '1' });
103
  }
104
+ if (blockedResourceTypes.has(resourceType) || blockedDomains.some(domain => url.includes(domain))) {
105
+ return route.abort();
106
+ }
107
+ return route.continue();
108
  });
109
 
110
+ const page = await context.newPage();
111
+
112
+ // 阶段 1
113
+ await page.goto(initialUrl, { waitUntil: 'networkidle', timeout: 60000 });
114
  await handleTurnstile(page);
115
+ const firstIframeSrc = await page.locator('#player_iframe').getAttribute('src');
 
116
  if (!firstIframeSrc) throw new Error('First iframe (#player_iframe) not found');
117
  console.log(`First iframe src: ${firstIframeSrc}`);
118
 
119
+ // 阶段 2
120
+ await page.goto(firstIframeSrc, { waitUntil: 'networkidle', timeout: 60000 });
 
121
  await handleTurnstile(page);
122
+ // 尝试获取嵌套的 iframe src,如果不存在,则获取第一层 iframe src
123
+ const finalIframeSrc = await page.frameLocator('iframe').locator('iframe').getAttribute('src') || await page.locator('iframe').getAttribute('src');
124
  if (!finalIframeSrc) throw new Error('Final iframe source not found');
125
  console.log(`Final iframe src: ${finalIframeSrc}`);
126
 
127
+ // 阶段 3
128
  return await getVideoAndSubtitles(finalIframeSrc);
129
 
130
  } catch (error) {
131
  console.error('Full error during scraping process:', error.message);
132
+ throw error;
133
  } finally {
134
  if (context) {
135
+ await context.close();
136
  }
137
  }
138
  }
139
 
140
+ // --- 4. Express API 路由 ---
141
+ let browser;
 
 
142
 
143
  app.get("/extract", async (req, res) => {
144
  const type = req.query.type || 'movie';
 
146
  const season = req.query.season;
147
  const episode = req.query.episode;
148
 
 
149
  if (!id) {
150
  return res.status(400).json({ success: false, error: "tmdb_id is required" });
151
  }
152
  if (type === 'tv' && (!season || !episode)) {
153
  return res.status(400).json({ success: false, error: "season and episode are required for type 'tv'" });
154
  }
155
+
156
+ const cacheKey = JSON.stringify(req.query);
157
+ const cached = cache.get(cacheKey);
158
+ if (cached) {
159
+ console.log("Serving from cache");
160
+ return res.json(cached);
161
+ }
162
 
163
  try {
164
  const result = await scrapeSource(browser, { type, id, season, episode });
 
165
  if (result && result.videoFileUrl) {
166
+ const response = { success: true, result };
167
+ cache.set(cacheKey, response);
168
+ res.status(200).json(response);
169
  } else {
170
  res.status(404).json({ success: false, error: "Could not find video stream from the source." });
171
  }
 
174
  }
175
  });
176
 
177
+ // --- 5. 服务器启动 ---
 
 
178
  (async () => {
179
  try {
180
+ const PORT = process.env.PORT || 7860;
181
+ console.log("Launching a persistent Playwright browser instance...");
182
+ browser = await chromium.launch({
183
+ headless: true,
184
+ args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
 
 
 
 
 
 
 
 
 
 
185
  });
186
  console.log("Browser launched successfully.");
 
187
  app.listen(PORT, () => {
188
+ console.log(`🚀 Scraper server running on port ${PORT}`);
189
  });
190
  } catch (error) {
191
  console.error("Failed to launch browser:", error);
 
193
  }
194
  })();
195
 
196
+ // --- 6. 优雅关停 ---
197
  const gracefulShutdown = async () => {
198
  console.log("\nShutting down gracefully...");
199
  if (browser) {