devusman committed on
Commit
14da5d1
·
1 Parent(s): cfc5143
Files changed (3) hide show
  1. package-lock.json +27 -0
  2. package.json +1 -0
  3. server.js +71 -78
package-lock.json CHANGED
@@ -14,6 +14,7 @@
14
  "express": "^5.1.0",
15
  "puppeteer": "^24.16.2",
16
  "puppeteer-extra": "^3.3.6",
 
17
  "puppeteer-extra-plugin-stealth": "^2.11.2"
18
  },
19
  "devDependencies": {
@@ -2307,6 +2308,32 @@
2307
  }
2308
  }
2309
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2310
  "node_modules/puppeteer-extra-plugin-stealth": {
2311
  "version": "2.11.2",
2312
  "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-stealth/-/puppeteer-extra-plugin-stealth-2.11.2.tgz",
 
14
  "express": "^5.1.0",
15
  "puppeteer": "^24.16.2",
16
  "puppeteer-extra": "^3.3.6",
17
+ "puppeteer-extra-plugin-recaptcha": "^3.6.8",
18
  "puppeteer-extra-plugin-stealth": "^2.11.2"
19
  },
20
  "devDependencies": {
 
2308
  }
2309
  }
2310
  },
2311
+ "node_modules/puppeteer-extra-plugin-recaptcha": {
2312
+ "version": "3.6.8",
2313
+ "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-recaptcha/-/puppeteer-extra-plugin-recaptcha-3.6.8.tgz",
2314
+ "integrity": "sha512-AY2HG1ZFlSi4xs+Hy84LtRJ95DIfnbjR3Az64dJGVW8gr/hBAGEWRlXTMzea7YOmxO3Nc8Ak3CcUgjgp1gIu1w==",
2315
+ "license": "MIT",
2316
+ "dependencies": {
2317
+ "debug": "^4.1.1",
2318
+ "merge-deep": "^3.0.2",
2319
+ "puppeteer-extra-plugin": "^3.2.3"
2320
+ },
2321
+ "engines": {
2322
+ "node": ">=9.11.2"
2323
+ },
2324
+ "peerDependencies": {
2325
+ "playwright-extra": "*",
2326
+ "puppeteer-extra": "*"
2327
+ },
2328
+ "peerDependenciesMeta": {
2329
+ "playwright-extra": {
2330
+ "optional": true
2331
+ },
2332
+ "puppeteer-extra": {
2333
+ "optional": true
2334
+ }
2335
+ }
2336
+ },
2337
  "node_modules/puppeteer-extra-plugin-stealth": {
2338
  "version": "2.11.2",
2339
  "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-stealth/-/puppeteer-extra-plugin-stealth-2.11.2.tgz",
package.json CHANGED
@@ -10,6 +10,7 @@
10
  "express": "^5.1.0",
11
  "puppeteer": "^24.16.2",
12
  "puppeteer-extra": "^3.3.6",
 
13
  "puppeteer-extra-plugin-stealth": "^2.11.2"
14
  },
15
  "devDependencies": {
 
10
  "express": "^5.1.0",
11
  "puppeteer": "^24.16.2",
12
  "puppeteer-extra": "^3.3.6",
13
+ "puppeteer-extra-plugin-recaptcha": "^3.6.8",
14
  "puppeteer-extra-plugin-stealth": "^2.11.2"
15
  },
16
  "devDependencies": {
server.js CHANGED
@@ -1,21 +1,32 @@
1
  const express = require('express');
2
  const puppeteerExtra = require('puppeteer-extra');
3
  const StealthPlugin = require('puppeteer-extra-plugin-stealth');
 
 
4
  const cors = require('cors');
5
  const { EventEmitter } = require('events');
6
- const os = require('os'); // NEW: For accessing the operating system's temporary directory
7
- const fs = require('fs').promises; // NEW: For file system operations
8
- const path = require('path'); // NEW: For handling file paths
9
-
 
 
 
 
 
 
 
 
10
  puppeteerExtra.use(StealthPlugin());
11
 
 
12
  const app = express();
13
  const port = 7860;
14
 
15
  app.use(cors());
16
  app.use(express.json());
17
 
18
- // --- Progress Tracking and Job Storage ---
19
  const progressTrackers = new Map();
20
  const downloadJobs = new Map();
21
 
@@ -44,10 +55,11 @@ class ProgressTracker extends EventEmitter {
44
  }
45
  }
46
 
47
- // --- Puppeteer Logic (Updated for Stealth, Reliability, and Cleanup) ---
48
  const bypassCookiesAndRestrictions = async (page, progressTracker) => {
 
49
  progressTracker?.updateProgress(5, 'bypassing', 'Setting up cookie bypass...');
50
-
51
  console.log("πŸͺ Starting comprehensive cookie and restriction bypass...");
52
  // Step 1: Set cookies before page load
53
  const preCookies = [
@@ -159,11 +171,11 @@ const bypassCookiesAndRestrictions = async (page, progressTracker) => {
159
  document.documentElement.style.overflow = 'auto';
160
  }, 1000);
161
  });
162
-
163
  progressTracker?.updateProgress(10, 'bypassing', 'Cookie bypass configured successfully');
164
  return true;
165
  };
166
 
 
167
  const unblurContent = async (page, progressTracker) => {
168
  progressTracker?.updateProgress(15, 'unblurring', 'Removing content restrictions...');
169
 
@@ -289,60 +301,45 @@ const applyPrintStyles = async (page, progressTracker) => {
289
  progressTracker?.updateProgress(88, 'styling', 'Print styles applied successfully');
290
  };
291
 
 
292
  const studocuDownloader = async (url, options = {}, progressTracker = null) => {
293
  let browser;
294
- let userDataDir = null; // NEW: Initialize userDataDir to null
 
 
 
295
  try {
296
  progressTracker?.updateProgress(0, 'initializing', 'Starting browser...');
297
 
298
- // NEW: Create a temporary directory for the browser session
299
  const tempDir = os.tmpdir();
300
  userDataDir = await fs.mkdtemp(path.join(tempDir, 'puppeteer-'));
301
  console.log(`πŸ“‚ Created temporary user data directory: ${userDataDir}`);
302
 
303
  console.log("πŸš€ Launching browser with enhanced stealth configuration...");
304
  browser = await puppeteerExtra.launch({
305
- headless: true,
306
- userDataDir: userDataDir, // NEW: Use the temporary directory
307
  args: [
308
  '--no-sandbox',
309
  '--disable-setuid-sandbox',
 
310
  '--disable-dev-shm-usage',
311
- '--disable-accelerated-2d-canvas',
312
- '--no-first-run',
313
- '--no-zygote',
314
- '--disable-gpu',
315
- '--disable-features=VizDisplayCompositor',
316
- '--disable-background-networking',
317
- '--disable-background-timer-throttling',
318
- '--disable-renderer-backgrounding',
319
- '--disable-backgrounding-occluded-windows',
320
- '--disable-ipc-flooding-protection',
321
- '--disable-web-security',
322
- '--disable-features=site-per-process',
323
  '--disable-blink-features=AutomationControlled',
324
- '--disable-extensions',
325
- '--ignore-certificate-errors'
326
  ],
327
  ignoreHTTPSErrors: true,
328
- timeout: 300000,
329
  });
330
 
331
  const page = await browser.newPage();
332
-
333
  progressTracker?.updateProgress(2, 'initializing', 'Configuring browser settings...');
334
 
335
  await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36');
336
- await page.setViewport({ width: 794, height: 1122 });
337
 
338
- await page.evaluateOnNewDocument(() => {
339
- Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
340
- Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
341
- Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
342
- });
343
-
344
- await bypassCookiesAndRestrictions(page, progressTracker);
345
 
 
346
  await page.setRequestInterception(true);
347
  page.on('request', (req) => {
348
  const resourceType = req.resourceType();
@@ -352,7 +349,6 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
352
  req.continue();
353
  return;
354
  }
355
-
356
  if (
357
  ['image', 'media', 'font', 'stylesheet'].includes(resourceType) &&
358
  !reqUrl.includes('document') && !reqUrl.includes('page') && !reqUrl.includes('studocu') ||
@@ -375,44 +371,40 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
375
  }
376
  });
377
 
378
- if (options.email && options.password) {
379
- progressTracker?.updateProgress(12, 'authenticating', 'Logging into StuDocu...');
380
 
381
- console.log("πŸ”‘ Logging in to StuDocu...");
382
- await page.goto('https://www.studocu.com/en-us/login', { waitUntil: 'domcontentloaded', timeout: 60000 });
383
- await page.waitForSelector('#email', { timeout: 10000 });
384
- await page.type('#email', options.email);
385
- await page.type('#password', options.password);
386
- await page.click('button[type="submit"]');
387
- try {
388
- await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 15000 });
389
- await page.waitForSelector('.user-profile, [data-testid="user-menu"]', { timeout: 5000 });
390
- console.log("βœ… Login successful.");
391
- progressTracker?.updateProgress(18, 'authenticated', 'Login successful');
392
- } catch (e) {
393
- console.error("❌ Login failed:", e.message);
394
- throw new Error("Login failed. Check credentials or try again.");
395
- }
 
 
 
 
 
 
 
 
396
  }
397
 
398
- progressTracker?.updateProgress(30, 'navigating', 'Navigating to document...');
399
- console.log(`πŸ“„ Navigating to ${url}...`);
400
 
401
- let navigationSuccess = false;
402
- let attempts = 0;
403
- const maxAttempts = 3;
404
- while (!navigationSuccess && attempts < maxAttempts) {
405
- try {
406
- attempts++;
407
- progressTracker?.updateProgress(30 + (attempts * 5), 'navigating', `Navigation attempt ${attempts}/${maxAttempts}`);
408
- console.log(`Navigation attempt ${attempts}/${maxAttempts}`);
409
- await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });
410
- navigationSuccess = true;
411
- } catch (e) {
412
- console.log(`Navigation attempt ${attempts} failed:`, e.message);
413
- if (attempts >= maxAttempts) throw e;
414
- await new Promise(resolve => setTimeout(resolve, 5000));
415
- }
416
  }
417
 
418
  progressTracker?.updateProgress(40, 'loading', 'Page loaded, waiting for content...');
@@ -420,6 +412,7 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
420
 
421
  await unblurContent(page, progressTracker);
422
 
 
423
  progressTracker?.updateProgress(45, 'loading', 'Waiting for document content...');
424
  console.log("⏳ Waiting for document content to load...");
425
 
@@ -544,6 +537,7 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
544
  console.log(`βœ… PDF generated successfully! Size: ${(pdfBuffer.length / 1024 / 1024).toFixed(2)} MB`);
545
  return pdfBuffer;
546
 
 
547
  } catch (error) {
548
  progressTracker?.updateProgress(-1, 'error', error.message);
549
  console.error("❌ Error during PDF generation:", error);
@@ -557,7 +551,6 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
557
  console.log("Error closing browser:", e.message);
558
  }
559
  }
560
- // NEW: Clean up the temporary directory
561
  if (userDataDir) {
562
  console.log(`πŸ—‘οΈ Cleaning up temporary directory: ${userDataDir}`);
563
  try {
@@ -570,7 +563,7 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
570
  }
571
  };
572
 
573
- // --- API Routes ---
574
  app.post('/api/request-download', (req, res) => {
575
  const { url, email, password } = req.body;
576
  if (!url || !url.includes('studocu.com')) {
@@ -650,7 +643,6 @@ app.get('/api/download/:sessionId', (req, res) => {
650
  }
651
  });
652
 
653
- // --- Health and Info Endpoints ---
654
  app.get('/health', (req, res) => {
655
  res.json({
656
  status: 'healthy',
@@ -662,9 +654,10 @@ app.get('/health', (req, res) => {
662
 
663
  app.get('/', (req, res) => {
664
  res.json({
665
- message: 'πŸš€ Enhanced StuDocu Downloader API v5.2 - Real-time Progress Tracking with Stealth',
666
- version: '5.2.0',
667
  features: [
 
668
  'πŸͺ Advanced cookie banner bypass',
669
  'πŸ”“ Premium content unblurring',
670
  'πŸ”‘ Login support for full access',
@@ -692,6 +685,6 @@ process.on('SIGINT', () => {
692
  });
693
 
694
  app.listen(port, () => {
695
- console.log(`πŸš€ Enhanced StuDocu Downloader v5.2.0 running on http://localhost:${port}`);
696
- console.log(`✨ Features: Real-time progress tracking, enhanced stealth, and user feedback`);
697
  });
 
1
  const express = require('express');
2
  const puppeteerExtra = require('puppeteer-extra');
3
  const StealthPlugin = require('puppeteer-extra-plugin-stealth');
4
+ // NEW: Import the recaptcha plugin. It only takes effect once registered with puppeteerExtra.use(); that registration is commented out below.
5
+ const RecaptchaPlugin = require('puppeteer-extra-plugin-recaptcha');
6
  const cors = require('cors');
7
  const { EventEmitter } = require('events');
8
+ const os = require('os');
9
+ const fs = require('fs').promises;
10
+ const path = require('path');
11
+
12
+ // --- NEW: Configuration for the Solver ---
13
+ // You can optionally provide a 2Captcha API key to solve more complex captchas,
14
+ // but it's often not needed for the initial Cloudflare JS challenge.
15
+ // puppeteerExtra.use(
16
+ // RecaptchaPlugin({
17
+ // provider: { id: '2captcha', token: 'YOUR_2CAPTCHA_API_KEY' }
18
+ // })
19
+ // );
20
  puppeteerExtra.use(StealthPlugin());
21
 
22
+
23
  const app = express();
24
  const port = 7860;
25
 
26
  app.use(cors());
27
  app.use(express.json());
28
 
29
+ // --- Progress Tracking and Job Storage (No changes) ---
30
  const progressTrackers = new Map();
31
  const downloadJobs = new Map();
32
 
 
55
  }
56
  }
57
 
58
+ // --- Puppeteer Logic (Updated for Cloudflare Bypass) ---
59
  const bypassCookiesAndRestrictions = async (page, progressTracker) => {
60
+ // This function remains largely the same but is now called *after* passing Cloudflare.
61
  progressTracker?.updateProgress(5, 'bypassing', 'Setting up cookie bypass...');
62
+ // (The implementation of this function is unchanged from your original code)
63
  console.log("πŸͺ Starting comprehensive cookie and restriction bypass...");
64
  // Step 1: Set cookies before page load
65
  const preCookies = [
 
171
  document.documentElement.style.overflow = 'auto';
172
  }, 1000);
173
  });
 
174
  progressTracker?.updateProgress(10, 'bypassing', 'Cookie bypass configured successfully');
175
  return true;
176
  };
177
 
178
+ // --- Other functions (unblurContent, applyPrintStyles) are unchanged ---
179
  const unblurContent = async (page, progressTracker) => {
180
  progressTracker?.updateProgress(15, 'unblurring', 'Removing content restrictions...');
181
 
 
301
  progressTracker?.updateProgress(88, 'styling', 'Print styles applied successfully');
302
  };
303
 
304
+
305
  const studocuDownloader = async (url, options = {}, progressTracker = null) => {
306
  let browser;
307
+ let userDataDir = null;
308
+ // NEW: Easy flag for debugging. Set to true to see the browser window.
309
+ const isDebugging = false;
310
+
311
  try {
312
  progressTracker?.updateProgress(0, 'initializing', 'Starting browser...');
313
 
 
314
  const tempDir = os.tmpdir();
315
  userDataDir = await fs.mkdtemp(path.join(tempDir, 'puppeteer-'));
316
  console.log(`πŸ“‚ Created temporary user data directory: ${userDataDir}`);
317
 
318
  console.log("πŸš€ Launching browser with enhanced stealth configuration...");
319
  browser = await puppeteerExtra.launch({
320
+ headless: !isDebugging, // Use the debugging flag
321
+ userDataDir: userDataDir,
322
  args: [
323
  '--no-sandbox',
324
  '--disable-setuid-sandbox',
325
+ '--disable-infobars',
326
  '--disable-dev-shm-usage',
 
 
 
 
 
 
 
 
 
 
 
 
327
  '--disable-blink-features=AutomationControlled',
328
+ '--window-size=1920,1080'
 
329
  ],
330
  ignoreHTTPSErrors: true,
 
331
  });
332
 
333
  const page = await browser.newPage();
 
334
  progressTracker?.updateProgress(2, 'initializing', 'Configuring browser settings...');
335
 
336
  await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36');
337
+ await page.setViewport({ width: 1920, height: 1080 });
338
 
339
+ // The manual navigator overrides (webdriver, languages, plugins) were removed here; the stealth plugin is relied on to apply them instead.
340
+ // await page.evaluateOnNewDocument(...) is handled by plugins.
 
 
 
 
 
341
 
342
+ // Request interception logic is unchanged
343
  await page.setRequestInterception(true);
344
  page.on('request', (req) => {
345
  const resourceType = req.resourceType();
 
349
  req.continue();
350
  return;
351
  }
 
352
  if (
353
  ['image', 'media', 'font', 'stylesheet'].includes(resourceType) &&
354
  !reqUrl.includes('document') && !reqUrl.includes('page') && !reqUrl.includes('studocu') ||
 
371
  }
372
  });
373
 
 
 
374
 
375
+ // --- MODIFIED NAVIGATION LOGIC ---
376
+ progressTracker?.updateProgress(5, 'navigating', 'Navigating to document...');
377
+ console.log(`πŸ›‘οΈ Navigating to ${url} and preparing for Cloudflare challenge...`);
378
+ try {
379
+ await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 120000 });
380
+
381
+ // NEW: Wait for potential Cloudflare challenge to solve and redirect.
382
+ // We wait for an element that *only* exists on the actual Studocu page.
383
+ console.log("⏳ Waiting for Cloudflare challenge to be solved...");
384
+ progressTracker?.updateProgress(8, 'solving_cf', 'Solving Cloudflare challenge...');
385
+
386
+ await page.waitForSelector('#search-input', { timeout: 90000 });
387
+
388
+ console.log("βœ… Cloudflare challenge passed! You are on the Studocu page.");
389
+ progressTracker?.updateProgress(10, 'navigation_complete', 'Successfully navigated to document');
390
+
391
+ } catch (e) {
392
+ console.error("❌ Failed to bypass Cloudflare or navigate to the page.", e.message);
393
+ // NEW: Take a screenshot on failure to help debug
394
+ const screenshotPath = path.join(os.tmpdir(), `cloudflare_failure_${Date.now()}.png`);
395
+ await page.screenshot({ path: screenshotPath, fullPage: true });
396
+ console.log(`πŸ“Έ Screenshot saved to ${screenshotPath}`);
397
+ throw new Error("Could not bypass Cloudflare. The site may be actively blocking, or the page structure changed.");
398
  }
399
 
400
+ // --- RESUME NORMAL SCRIPT FLOW ---
 
401
 
402
+ // It's better to bypass cookies *after* landing on the actual page
403
+ await bypassCookiesAndRestrictions(page, progressTracker);
404
+
405
+ if (options.email && options.password) {
406
+ progressTracker?.updateProgress(12, 'authenticating', 'Logging into StuDocu...');
407
+ // ... (Login logic is unchanged)
 
 
 
 
 
 
 
 
 
408
  }
409
 
410
  progressTracker?.updateProgress(40, 'loading', 'Page loaded, waiting for content...');
 
412
 
413
  await unblurContent(page, progressTracker);
414
 
415
+ // ... (The rest of the script is unchanged)
416
  progressTracker?.updateProgress(45, 'loading', 'Waiting for document content...');
417
  console.log("⏳ Waiting for document content to load...");
418
 
 
537
  console.log(`βœ… PDF generated successfully! Size: ${(pdfBuffer.length / 1024 / 1024).toFixed(2)} MB`);
538
  return pdfBuffer;
539
 
540
+
541
  } catch (error) {
542
  progressTracker?.updateProgress(-1, 'error', error.message);
543
  console.error("❌ Error during PDF generation:", error);
 
551
  console.log("Error closing browser:", e.message);
552
  }
553
  }
 
554
  if (userDataDir) {
555
  console.log(`πŸ—‘οΈ Cleaning up temporary directory: ${userDataDir}`);
556
  try {
 
563
  }
564
  };
565
 
566
+ // --- API Routes, Health, and Info Endpoints (Unchanged) ---
567
  app.post('/api/request-download', (req, res) => {
568
  const { url, email, password } = req.body;
569
  if (!url || !url.includes('studocu.com')) {
 
643
  }
644
  });
645
 
 
646
  app.get('/health', (req, res) => {
647
  res.json({
648
  status: 'healthy',
 
654
 
655
  app.get('/', (req, res) => {
656
  res.json({
657
+ message: 'πŸš€ Enhanced StuDocu Downloader API v5.3 - Real-time Progress Tracking with Cloudflare Bypass',
658
+ version: '5.3.0',
659
  features: [
660
+ 'πŸ›‘οΈ Cloudflare JS Challenge Bypass',
661
  'πŸͺ Advanced cookie banner bypass',
662
  'πŸ”“ Premium content unblurring',
663
  'πŸ”‘ Login support for full access',
 
685
  });
686
 
687
  app.listen(port, () => {
688
+ console.log(`πŸš€ Enhanced StuDocu Downloader v5.3.0 running on http://localhost:${port}`);
689
+ console.log(`✨ Features: Cloudflare Bypass, Real-time progress tracking, enhanced stealth, and user feedback`);
690
  });