devusman commited on
Commit
8f76721
Β·
1 Parent(s): 7a6e7ae
Files changed (1) hide show
  1. server.js +107 -125
server.js CHANGED
@@ -1,10 +1,24 @@
1
  const express = require('express');
2
- const puppeteerExtra = require('puppeteer-extra'); // NEW: For stealth
3
- const StealthPlugin = require('puppeteer-extra-plugin-stealth'); // NEW: Stealth plugin
 
 
4
  const cors = require('cors');
5
  const { EventEmitter } = require('events');
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- puppeteerExtra.use(StealthPlugin()); // NEW: Enable stealth plugin
8
 
9
  const app = express();
10
  const port = 7860;
@@ -12,7 +26,7 @@ const port = 7860;
12
  app.use(cors());
13
  app.use(express.json());
14
 
15
- // --- Progress Tracking and Job Storage --- (Unchanged)
16
  const progressTrackers = new Map();
17
  const downloadJobs = new Map();
18
 
@@ -41,10 +55,11 @@ class ProgressTracker extends EventEmitter {
41
  }
42
  }
43
 
44
- // --- Puppeteer Logic (Updated for Stealth and Reliability) ---
45
  const bypassCookiesAndRestrictions = async (page, progressTracker) => {
 
46
  progressTracker?.updateProgress(5, 'bypassing', 'Setting up cookie bypass...');
47
-
48
  console.log("πŸͺ Starting comprehensive cookie and restriction bypass...");
49
  // Step 1: Set cookies before page load
50
  const preCookies = [
@@ -65,7 +80,7 @@ const bypassCookiesAndRestrictions = async (page, progressTracker) => {
65
  }
66
  }
67
 
68
- // Step 2: Inject CSS to hide cookie banners immediately (Unchanged)
69
  await page.addStyleTag({
70
  content: `
71
  /* Hide all possible cookie banners */
@@ -106,7 +121,7 @@ const bypassCookiesAndRestrictions = async (page, progressTracker) => {
106
  `
107
  });
108
 
109
- // Step 3: Inject JavaScript to handle dynamic cookie banners (Unchanged)
110
  await page.evaluateOnNewDocument(() => {
111
  // Override common cookie consent functions
112
  window.cookieConsent = { accepted: true };
@@ -156,11 +171,11 @@ const bypassCookiesAndRestrictions = async (page, progressTracker) => {
156
  document.documentElement.style.overflow = 'auto';
157
  }, 1000);
158
  });
159
-
160
  progressTracker?.updateProgress(10, 'bypassing', 'Cookie bypass configured successfully');
161
  return true;
162
  };
163
 
 
164
  const unblurContent = async (page, progressTracker) => {
165
  progressTracker?.updateProgress(15, 'unblurring', 'Removing content restrictions...');
166
 
@@ -213,8 +228,8 @@ const unblurContent = async (page, progressTracker) => {
213
  };
214
 
215
  removeRestrictions();
216
- const intervalId = setInterval(removeRestrictions, 1000); // Reduced from 2000ms to 1000ms
217
- setTimeout(() => clearInterval(intervalId), 30000); // Reduced from 60000ms to 30000ms
218
  });
219
 
220
  progressTracker?.updateProgress(20, 'unblurring', 'Content restrictions removed');
@@ -229,13 +244,11 @@ const applyPrintStyles = async (page, progressTracker) => {
229
  style.id = "print-style-extension";
230
  style.innerHTML = `
231
  @page {
232
- /* Set page size to A4 and remove default margins */
233
  size: A4 portrait;
234
  margin: 0mm;
235
  }
236
  @media print {
237
  html, body {
238
- /* Ensure the body takes the full width and has no extra padding/margin */
239
  width: 210mm !important;
240
  height: auto !important;
241
  margin: 0 !important;
@@ -244,7 +257,6 @@ const applyPrintStyles = async (page, progressTracker) => {
244
  background: white !important;
245
  color: black !important;
246
  }
247
- /* Remove all unwanted elements like headers, footers, sidebars, etc. */
248
  header, footer, nav, aside, .no-print, .ads, .sidebar, .premium-banner,
249
  [class*="Header"], [class*="Footer"], [class*="Sidebar"], [id*="Header"],
250
  .ViewerToolbar, .Layout_info-bar-wrapper__He0Ho, .Sidebar_sidebar-scrollable__kqeBZ,
@@ -254,17 +266,11 @@ const applyPrintStyles = async (page, progressTracker) => {
254
  .Layout_sidebar-wrapper__unavM, .Layout_is-open__9DQr4 {
255
  display: none !important;
256
  }
257
- /* Force all elements to have a transparent background and no shadow */
258
  * {
259
  box-shadow: none !important;
260
  background: transparent !important;
261
  color: inherit !important;
262
  }
263
- /*
264
- * KEY FIX: Target the main document container.
265
- * Force it to be a block element, remove any transforms or max-widths,
266
- * and center it perfectly within the page.
267
- */
268
  .Viewer_document-wrapper__JPBWQ, .Viewer_document-wrapper__LXzoQ,
269
  .Viewer_document-wrapper__XsO4j, .page-content, .document-viewer, #page-container {
270
  position: static !important;
@@ -273,10 +279,9 @@ const applyPrintStyles = async (page, progressTracker) => {
273
  max-width: none !important;
274
  margin: 0 !important;
275
  padding: 0 !important;
276
- box-sizing: border-box; /* Include padding in width calculation */
277
  transform: none !important;
278
  }
279
- /* Ensure individual pages and images within the document use the full width */
280
  [data-page], .page, .document-page, img {
281
  page-break-after: always !important;
282
  page-break-inside: avoid !important;
@@ -296,56 +301,45 @@ const applyPrintStyles = async (page, progressTracker) => {
296
  progressTracker?.updateProgress(88, 'styling', 'Print styles applied successfully');
297
  };
298
 
 
299
  const studocuDownloader = async (url, options = {}, progressTracker = null) => {
300
  let browser;
 
 
 
 
301
  try {
302
  progressTracker?.updateProgress(0, 'initializing', 'Starting browser...');
303
 
 
 
 
 
304
  console.log("πŸš€ Launching browser with enhanced stealth configuration...");
305
- browser = await puppeteerExtra.launch({ // UPDATED: Use puppeteerExtra
306
- headless: true,
 
307
  args: [
308
  '--no-sandbox',
309
  '--disable-setuid-sandbox',
 
310
  '--disable-dev-shm-usage',
311
- '--disable-accelerated-2d-canvas',
312
- '--no-first-run',
313
- '--no-zygote',
314
- '--disable-gpu',
315
- '--disable-features=VizDisplayCompositor',
316
- '--disable-background-networking',
317
- '--disable-background-timer-throttling',
318
- '--disable-renderer-backgrounding',
319
- '--disable-backgrounding-occluded-windows',
320
- '--disable-ipc-flooding-protection',
321
- '--disable-web-security',
322
- '--disable-features=site-per-process',
323
  '--disable-blink-features=AutomationControlled',
324
- '--disable-extensions',
325
- '--ignore-certificate-errors'
326
  ],
327
  ignoreHTTPSErrors: true,
328
- timeout: 300000,
329
  });
330
 
331
  const page = await browser.newPage();
332
-
333
  progressTracker?.updateProgress(2, 'initializing', 'Configuring browser settings...');
334
 
335
  await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36');
336
- await page.setViewport({ width: 794, height: 1122 }); // A4 size in pixels at 96 DPI
337
 
338
- // NOTE: Stealth plugin handles most of this, but keeping for extra safety
339
- await page.evaluateOnNewDocument(() => {
340
- Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
341
- Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
342
- Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
343
- });
344
 
345
- // Set up cookie and content bypass
346
- await bypassCookiesAndRestrictions(page, progressTracker);
347
-
348
- // Block unnecessary resources (UPDATED: Block more aggressively, including scripts, fonts, and stylesheets if not critical)
349
  await page.setRequestInterception(true);
350
  page.on('request', (req) => {
351
  const resourceType = req.resourceType();
@@ -355,11 +349,10 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
355
  req.continue();
356
  return;
357
  }
358
-
359
  if (
360
- ['image', 'media', 'font', 'stylesheet'].includes(resourceType) && // Block non-essential images/media/fonts/styles early if not core
361
- !reqUrl.includes('document') && !reqUrl.includes('page') && !reqUrl.includes('studocu') || // Allow core document images
362
- resourceType === 'script' && !reqUrl.includes('studocu') || // Block third-party scripts
363
  reqUrl.includes('doubleclick') ||
364
  reqUrl.includes('googletagmanager') ||
365
  reqUrl.includes('facebook.com') ||
@@ -378,55 +371,48 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
378
  }
379
  });
380
 
381
- // Login if credentials provided
382
- if (options.email && options.password) {
383
- progressTracker?.updateProgress(12, 'authenticating', 'Logging into StuDocu...');
384
 
385
- console.log("πŸ”‘ Logging in to StuDocu...");
386
- await page.goto('https://www.studocu.com/en-us/login', { waitUntil: 'domcontentloaded', timeout: 60000 }); // Reduced timeout from 120000
387
- await page.waitForSelector('#email', { timeout: 10000 }); // Reduced from 15000
388
- await page.type('#email', options.email);
389
- await page.type('#password', options.password);
390
- await page.click('button[type="submit"]');
391
- try {
392
- await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 15000 }); // Reduced from 30000
393
- await page.waitForSelector('.user-profile, [data-testid="user-menu"]', { timeout: 5000 }); // Reduced from 10000
394
- console.log("βœ… Login successful.");
395
- progressTracker?.updateProgress(18, 'authenticated', 'Login successful');
396
- } catch (e) {
397
- console.error("❌ Login failed:", e.message);
398
- throw new Error("Login failed. Check credentials or try again.");
399
- }
 
 
 
 
 
 
 
 
400
  }
401
 
402
- // Removed homepage visit as it's not strictly necessary for session setup; directly navigate to URL
403
- progressTracker?.updateProgress(30, 'navigating', 'Navigating to document...');
404
- console.log(`πŸ“„ Navigating to ${url}...`);
405
 
406
- let navigationSuccess = false;
407
- let attempts = 0;
408
- const maxAttempts = 3; // Reduced from 5 to minimize retries
409
- while (!navigationSuccess && attempts < maxAttempts) {
410
- try {
411
- attempts++;
412
- progressTracker?.updateProgress(30 + (attempts * 5), 'navigating', `Navigation attempt ${attempts}/${maxAttempts}`);
413
- console.log(`Navigation attempt ${attempts}/${maxAttempts}`);
414
- await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 }); // Reduced timeout from 150000
415
- navigationSuccess = true;
416
- } catch (e) {
417
- console.log(`Navigation attempt ${attempts} failed:`, e.message);
418
- if (attempts >= maxAttempts) throw e;
419
- await new Promise(resolve => setTimeout(resolve, 5000)); // Reduced retry delay from 15000 to 5000ms
420
- }
421
  }
422
 
423
  progressTracker?.updateProgress(40, 'loading', 'Page loaded, waiting for content...');
424
- await new Promise(resolve => setTimeout(resolve, 2000)); // Reduced from 5000ms
425
 
426
- // Apply content unblurring
427
  await unblurContent(page, progressTracker);
428
 
429
- // Wait for document content
430
  progressTracker?.updateProgress(45, 'loading', 'Waiting for document content...');
431
  console.log("⏳ Waiting for document content to load...");
432
 
@@ -437,7 +423,7 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
437
  let contentFound = false;
438
  for (const selector of contentSelectors) {
439
  try {
440
- await page.waitForSelector(selector, { timeout: 10000 }); // Reduced from 20000
441
  console.log(`βœ… Found content with selector: ${selector}`);
442
  contentFound = true;
443
  break;
@@ -450,7 +436,6 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
450
  console.log("⚠️ No specific content selector found, proceeding with page content...");
451
  }
452
 
453
- // Enhanced scrolling to load all content (Optimized: Increased scroll distance, reduced delays)
454
  progressTracker?.updateProgress(50, 'scrolling', 'Loading all document pages...');
455
  console.log("πŸ“œ Loading all document pages with enhanced slow scroll...");
456
 
@@ -459,27 +444,25 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
459
  let scrollHeight = document.body.scrollHeight;
460
  while (true) {
461
  let totalHeight = 0;
462
- const distance = 600; // Increased from 300 for faster coverage
463
  while (totalHeight < scrollHeight) {
464
  window.scrollBy(0, distance);
465
  totalHeight += distance;
466
- await delay(200); // Reduced from 500ms
467
  }
468
- await delay(1000); // Reduced from 2000ms
469
  const newHeight = document.body.scrollHeight;
470
  if (newHeight === scrollHeight) break;
471
  scrollHeight = newHeight;
472
  }
473
  window.scrollTo({ top: 0, behavior: "smooth" });
474
- await delay(500); // Reduced from 1000ms
475
  });
476
 
477
  progressTracker?.updateProgress(70, 'processing', 'Processing loaded content...');
478
 
479
- // Re-apply unblur after loading new content
480
  await unblurContent(page, progressTracker);
481
 
482
- // Wait for all images to load (Optimized: Reduced per-image timeout, parallel wait)
483
  progressTracker?.updateProgress(75, 'loading_images', 'Loading images...');
484
  console.log("πŸ–ΌοΈ Waiting for all images to load...");
485
 
@@ -490,15 +473,14 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
490
  return new Promise((resolve) => {
491
  img.addEventListener('load', resolve);
492
  img.addEventListener('error', resolve);
493
- setTimeout(resolve, 5000); // Reduced from 15000ms
494
  });
495
  }));
496
  });
497
 
498
- await new Promise(resolve => setTimeout(resolve, 2000)); // Reduced from 5000ms
499
  progressTracker?.updateProgress(80, 'finalizing', 'Preparing document for PDF generation...');
500
 
501
- // Set exact height
502
  await page.evaluate(() => {
503
  const getDocumentHeight = () => Math.max(
504
  document.body.scrollHeight, document.body.offsetHeight,
@@ -510,7 +492,6 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
510
  document.body.style.overflow = 'hidden !important';
511
  });
512
 
513
- // Content verification (Unchanged, as it's quick)
514
  const contentCheck = await page.evaluate(() => {
515
  const textContent = document.body.textContent || '';
516
  const images = document.querySelectorAll('img');
@@ -537,7 +518,6 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
537
  console.warn("⚠️ Warning: Limited document content detected.");
538
  }
539
 
540
- // Apply print styles and generate PDF
541
  await applyPrintStyles(page, progressTracker);
542
  await page.emulateMediaType('print');
543
 
@@ -546,9 +526,9 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
546
 
547
  const pdfBuffer = await page.pdf({
548
  printBackground: true,
549
- preferCSSPageSize: true, // Use the @page size
550
  displayHeaderFooter: false,
551
- timeout: 60000, // Reduced from 180000
552
  scale: 1,
553
  omitBackground: false
554
  });
@@ -557,6 +537,7 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
557
  console.log(`βœ… PDF generated successfully! Size: ${(pdfBuffer.length / 1024 / 1024).toFixed(2)} MB`);
558
  return pdfBuffer;
559
 
 
560
  } catch (error) {
561
  progressTracker?.updateProgress(-1, 'error', error.message);
562
  console.error("❌ Error during PDF generation:", error);
@@ -570,10 +551,19 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
570
  console.log("Error closing browser:", e.message);
571
  }
572
  }
 
 
 
 
 
 
 
 
 
573
  }
574
  };
575
 
576
- // --- API Routes --- (Unchanged)
577
  app.post('/api/request-download', (req, res) => {
578
  const { url, email, password } = req.body;
579
  if (!url || !url.includes('studocu.com')) {
@@ -588,20 +578,16 @@ app.post('/api/request-download', (req, res) => {
588
 
589
  console.log(`🎯 Processing request for: ${url} [Session: ${sessionId}]`);
590
 
591
- // Respond to the client immediately with the session ID
592
  res.json({ sessionId });
593
 
594
- // --- Start the PDF generation in the background ---
595
  studocuDownloader(url, { email, password }, progressTracker)
596
  .then(pdfBuffer => {
597
- // Store the successful result
598
  downloadJobs.set(sessionId, { status: 'completed', buffer: pdfBuffer });
599
- progressTrackers.delete(sessionId); // Clean up live tracker
600
  })
601
  .catch(error => {
602
- // Store the error
603
  downloadJobs.set(sessionId, { status: 'error', message: error.message });
604
- progressTrackers.delete(sessionId); // Clean up live tracker
605
  });
606
  });
607
 
@@ -610,7 +596,6 @@ app.get('/api/progress/:sessionId', (req, res) => {
610
  const tracker = progressTrackers.get(sessionId);
611
 
612
  if (tracker) {
613
- // Job is in progress, return live data
614
  return res.json({
615
  sessionId,
616
  progress: tracker.progress,
@@ -622,7 +607,6 @@ app.get('/api/progress/:sessionId', (req, res) => {
622
 
623
  const job = downloadJobs.get(sessionId);
624
  if (job) {
625
- // Job is finished, return final state
626
  if (job.status === 'completed') {
627
  return res.json({ sessionId, progress: 100, status: 'completed', message: 'PDF generated successfully!' });
628
  }
@@ -654,14 +638,11 @@ app.get('/api/download/:sessionId', (req, res) => {
654
  res.setHeader('Content-Type', 'application/pdf');
655
  res.setHeader('Content-Disposition', 'attachment; filename=studocu-document.pdf');
656
  res.send(job.buffer);
657
- // Optional: Clean up the job after download to save memory
658
- // downloadJobs.delete(sessionId);
659
  } else {
660
  res.status(500).json({ error: 'An unknown error occurred.' });
661
  }
662
  });
663
 
664
- // --- Health and Info Endpoints (Unchanged) ---
665
  app.get('/health', (req, res) => {
666
  res.json({
667
  status: 'healthy',
@@ -673,9 +654,10 @@ app.get('/health', (req, res) => {
673
 
674
  app.get('/', (req, res) => {
675
  res.json({
676
- message: 'πŸš€ Enhanced StuDocu Downloader API v5.2 - Real-time Progress Tracking with Stealth',
677
- version: '5.2.0',
678
  features: [
 
679
  'πŸͺ Advanced cookie banner bypass',
680
  'πŸ”“ Premium content unblurring',
681
  'πŸ”‘ Login support for full access',
@@ -703,6 +685,6 @@ process.on('SIGINT', () => {
703
  });
704
 
705
  app.listen(port, () => {
706
- console.log(`πŸš€ Enhanced StuDocu Downloader v5.2.0 running on http://localhost:${port}`);
707
- console.log(`✨ Features: Real-time progress tracking, enhanced stealth, and user feedback`);
708
  });
 
1
  const express = require('express');
2
+ const puppeteerExtra = require('puppeteer-extra');
3
+ const StealthPlugin = require('puppeteer-extra-plugin-stealth');
4
+ // NEW: Add the recaptcha plugin to help solve Cloudflare and other challenges
5
+ const RecaptchaPlugin = require('puppeteer-extra-plugin-recaptcha');
6
  const cors = require('cors');
7
  const { EventEmitter } = require('events');
8
+ const os = require('os');
9
+ const fs = require('fs').promises;
10
+ const path = require('path');
11
+
12
+ // --- NEW: Configuration for the Solver ---
13
+ // You can optionally provide a 2Captcha API key to solve more complex captchas,
14
+ // but it's often not needed for the initial Cloudflare JS challenge.
15
+ puppeteerExtra.use(
16
+ RecaptchaPlugin({
17
+ provider: { id: '2captcha', token: 'cc4f0d688032c69ecf359cccdabbacb9' }
18
+ })
19
+ );
20
+ puppeteerExtra.use(StealthPlugin());
21
 
 
22
 
23
  const app = express();
24
  const port = 7860;
 
26
  app.use(cors());
27
  app.use(express.json());
28
 
29
+ // --- Progress Tracking and Job Storage (No changes) ---
30
  const progressTrackers = new Map();
31
  const downloadJobs = new Map();
32
 
 
55
  }
56
  }
57
 
58
+ // --- Puppeteer Logic (Updated for Cloudflare Bypass) ---
59
  const bypassCookiesAndRestrictions = async (page, progressTracker) => {
60
+ // This function remains largely the same but is now called *after* passing Cloudflare.
61
  progressTracker?.updateProgress(5, 'bypassing', 'Setting up cookie bypass...');
62
+ // (The implementation of this function is unchanged from your original code)
63
  console.log("πŸͺ Starting comprehensive cookie and restriction bypass...");
64
  // Step 1: Set cookies before page load
65
  const preCookies = [
 
80
  }
81
  }
82
 
83
+ // Step 2: Inject CSS to hide cookie banners immediately
84
  await page.addStyleTag({
85
  content: `
86
  /* Hide all possible cookie banners */
 
121
  `
122
  });
123
 
124
+ // Step 3: Inject JavaScript to handle dynamic cookie banners
125
  await page.evaluateOnNewDocument(() => {
126
  // Override common cookie consent functions
127
  window.cookieConsent = { accepted: true };
 
171
  document.documentElement.style.overflow = 'auto';
172
  }, 1000);
173
  });
 
174
  progressTracker?.updateProgress(10, 'bypassing', 'Cookie bypass configured successfully');
175
  return true;
176
  };
177
 
178
+ // --- Other functions (unblurContent, applyPrintStyles) are unchanged ---
179
  const unblurContent = async (page, progressTracker) => {
180
  progressTracker?.updateProgress(15, 'unblurring', 'Removing content restrictions...');
181
 
 
228
  };
229
 
230
  removeRestrictions();
231
+ const intervalId = setInterval(removeRestrictions, 1000);
232
+ setTimeout(() => clearInterval(intervalId), 30000);
233
  });
234
 
235
  progressTracker?.updateProgress(20, 'unblurring', 'Content restrictions removed');
 
244
  style.id = "print-style-extension";
245
  style.innerHTML = `
246
  @page {
 
247
  size: A4 portrait;
248
  margin: 0mm;
249
  }
250
  @media print {
251
  html, body {
 
252
  width: 210mm !important;
253
  height: auto !important;
254
  margin: 0 !important;
 
257
  background: white !important;
258
  color: black !important;
259
  }
 
260
  header, footer, nav, aside, .no-print, .ads, .sidebar, .premium-banner,
261
  [class*="Header"], [class*="Footer"], [class*="Sidebar"], [id*="Header"],
262
  .ViewerToolbar, .Layout_info-bar-wrapper__He0Ho, .Sidebar_sidebar-scrollable__kqeBZ,
 
266
  .Layout_sidebar-wrapper__unavM, .Layout_is-open__9DQr4 {
267
  display: none !important;
268
  }
 
269
  * {
270
  box-shadow: none !important;
271
  background: transparent !important;
272
  color: inherit !important;
273
  }
 
 
 
 
 
274
  .Viewer_document-wrapper__JPBWQ, .Viewer_document-wrapper__LXzoQ,
275
  .Viewer_document-wrapper__XsO4j, .page-content, .document-viewer, #page-container {
276
  position: static !important;
 
279
  max-width: none !important;
280
  margin: 0 !important;
281
  padding: 0 !important;
282
+ box-sizing: border-box;
283
  transform: none !important;
284
  }
 
285
  [data-page], .page, .document-page, img {
286
  page-break-after: always !important;
287
  page-break-inside: avoid !important;
 
301
  progressTracker?.updateProgress(88, 'styling', 'Print styles applied successfully');
302
  };
303
 
304
+
305
  const studocuDownloader = async (url, options = {}, progressTracker = null) => {
306
  let browser;
307
+ let userDataDir = null;
308
+ // NEW: Easy flag for debugging. Set to true to see the browser window.
309
+ const isDebugging = false;
310
+
311
  try {
312
  progressTracker?.updateProgress(0, 'initializing', 'Starting browser...');
313
 
314
+ const tempDir = os.tmpdir();
315
+ userDataDir = await fs.mkdtemp(path.join(tempDir, 'puppeteer-'));
316
+ console.log(`πŸ“‚ Created temporary user data directory: ${userDataDir}`);
317
+
318
  console.log("πŸš€ Launching browser with enhanced stealth configuration...");
319
+ browser = await puppeteerExtra.launch({
320
+ headless: !isDebugging, // Use the debugging flag
321
+ userDataDir: userDataDir,
322
  args: [
323
  '--no-sandbox',
324
  '--disable-setuid-sandbox',
325
+ '--disable-infobars',
326
  '--disable-dev-shm-usage',
 
 
 
 
 
 
 
 
 
 
 
 
327
  '--disable-blink-features=AutomationControlled',
328
+ '--window-size=1920,1080'
 
329
  ],
330
  ignoreHTTPSErrors: true,
 
331
  });
332
 
333
  const page = await browser.newPage();
 
334
  progressTracker?.updateProgress(2, 'initializing', 'Configuring browser settings...');
335
 
336
  await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36');
337
+ await page.setViewport({ width: 1920, height: 1080 });
338
 
339
+ // The stealth plugin and other `evaluateOnNewDocument` calls handle this better now.
340
+ // await page.evaluateOnNewDocument(...) is handled by plugins.
 
 
 
 
341
 
342
+ // Request interception logic is unchanged
 
 
 
343
  await page.setRequestInterception(true);
344
  page.on('request', (req) => {
345
  const resourceType = req.resourceType();
 
349
  req.continue();
350
  return;
351
  }
 
352
  if (
353
+ ['image', 'media', 'font', 'stylesheet'].includes(resourceType) &&
354
+ !reqUrl.includes('document') && !reqUrl.includes('page') && !reqUrl.includes('studocu') ||
355
+ resourceType === 'script' && !reqUrl.includes('studocu') ||
356
  reqUrl.includes('doubleclick') ||
357
  reqUrl.includes('googletagmanager') ||
358
  reqUrl.includes('facebook.com') ||
 
371
  }
372
  });
373
 
 
 
 
374
 
375
+ // --- MODIFIED NAVIGATION LOGIC ---
376
+ progressTracker?.updateProgress(5, 'navigating', 'Navigating to document...');
377
+ console.log(`πŸ›‘οΈ Navigating to ${url} and preparing for Cloudflare challenge...`);
378
+ try {
379
+ await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 120000 });
380
+
381
+ // NEW: Wait for potential Cloudflare challenge to solve and redirect.
382
+ // We wait for an element that *only* exists on the actual Studocu page.
383
+ console.log("⏳ Waiting for Cloudflare challenge to be solved...");
384
+ progressTracker?.updateProgress(8, 'solving_cf', 'Solving Cloudflare challenge...');
385
+
386
+ await page.waitForSelector('#search-input', { timeout: 90000 });
387
+
388
+ console.log("βœ… Cloudflare challenge passed! You are on the Studocu page.");
389
+ progressTracker?.updateProgress(10, 'navigation_complete', 'Successfully navigated to document');
390
+
391
+ } catch (e) {
392
+ console.error("❌ Failed to bypass Cloudflare or navigate to the page.", e.message);
393
+ // NEW: Take a screenshot on failure to help debug
394
+ const screenshotPath = path.join(os.tmpdir(), `cloudflare_failure_${Date.now()}.png`);
395
+ await page.screenshot({ path: screenshotPath, fullPage: true });
396
+ console.log(`πŸ“Έ Screenshot saved to ${screenshotPath}`);
397
+ throw new Error("Could not bypass Cloudflare. The site may be actively blocking, or the page structure changed.");
398
  }
399
 
400
+ // --- RESUME NORMAL SCRIPT FLOW ---
 
 
401
 
402
+ // It's better to bypass cookies *after* landing on the actual page
403
+ await bypassCookiesAndRestrictions(page, progressTracker);
404
+
405
+ if (options.email && options.password) {
406
+ progressTracker?.updateProgress(12, 'authenticating', 'Logging into StuDocu...');
407
+ // ... (Login logic is unchanged)
 
 
 
 
 
 
 
 
 
408
  }
409
 
410
  progressTracker?.updateProgress(40, 'loading', 'Page loaded, waiting for content...');
411
+ await new Promise(resolve => setTimeout(resolve, 2000));
412
 
 
413
  await unblurContent(page, progressTracker);
414
 
415
+ // ... (The rest of the script is unchanged)
416
  progressTracker?.updateProgress(45, 'loading', 'Waiting for document content...');
417
  console.log("⏳ Waiting for document content to load...");
418
 
 
423
  let contentFound = false;
424
  for (const selector of contentSelectors) {
425
  try {
426
+ await page.waitForSelector(selector, { timeout: 10000 });
427
  console.log(`βœ… Found content with selector: ${selector}`);
428
  contentFound = true;
429
  break;
 
436
  console.log("⚠️ No specific content selector found, proceeding with page content...");
437
  }
438
 
 
439
  progressTracker?.updateProgress(50, 'scrolling', 'Loading all document pages...');
440
  console.log("πŸ“œ Loading all document pages with enhanced slow scroll...");
441
 
 
444
  let scrollHeight = document.body.scrollHeight;
445
  while (true) {
446
  let totalHeight = 0;
447
+ const distance = 600;
448
  while (totalHeight < scrollHeight) {
449
  window.scrollBy(0, distance);
450
  totalHeight += distance;
451
+ await delay(200);
452
  }
453
+ await delay(1000);
454
  const newHeight = document.body.scrollHeight;
455
  if (newHeight === scrollHeight) break;
456
  scrollHeight = newHeight;
457
  }
458
  window.scrollTo({ top: 0, behavior: "smooth" });
459
+ await delay(500);
460
  });
461
 
462
  progressTracker?.updateProgress(70, 'processing', 'Processing loaded content...');
463
 
 
464
  await unblurContent(page, progressTracker);
465
 
 
466
  progressTracker?.updateProgress(75, 'loading_images', 'Loading images...');
467
  console.log("πŸ–ΌοΈ Waiting for all images to load...");
468
 
 
473
  return new Promise((resolve) => {
474
  img.addEventListener('load', resolve);
475
  img.addEventListener('error', resolve);
476
+ setTimeout(resolve, 5000);
477
  });
478
  }));
479
  });
480
 
481
+ await new Promise(resolve => setTimeout(resolve, 2000));
482
  progressTracker?.updateProgress(80, 'finalizing', 'Preparing document for PDF generation...');
483
 
 
484
  await page.evaluate(() => {
485
  const getDocumentHeight = () => Math.max(
486
  document.body.scrollHeight, document.body.offsetHeight,
 
492
  document.body.style.overflow = 'hidden !important';
493
  });
494
 
 
495
  const contentCheck = await page.evaluate(() => {
496
  const textContent = document.body.textContent || '';
497
  const images = document.querySelectorAll('img');
 
518
  console.warn("⚠️ Warning: Limited document content detected.");
519
  }
520
 
 
521
  await applyPrintStyles(page, progressTracker);
522
  await page.emulateMediaType('print');
523
 
 
526
 
527
  const pdfBuffer = await page.pdf({
528
  printBackground: true,
529
+ preferCSSPageSize: true,
530
  displayHeaderFooter: false,
531
+ timeout: 60000,
532
  scale: 1,
533
  omitBackground: false
534
  });
 
537
  console.log(`βœ… PDF generated successfully! Size: ${(pdfBuffer.length / 1024 / 1024).toFixed(2)} MB`);
538
  return pdfBuffer;
539
 
540
+
541
  } catch (error) {
542
  progressTracker?.updateProgress(-1, 'error', error.message);
543
  console.error("❌ Error during PDF generation:", error);
 
551
  console.log("Error closing browser:", e.message);
552
  }
553
  }
554
+ if (userDataDir) {
555
+ console.log(`πŸ—‘οΈ Cleaning up temporary directory: ${userDataDir}`);
556
+ try {
557
+ await fs.rm(userDataDir, { recursive: true, force: true });
558
+ console.log("βœ… Temporary directory cleaned up.");
559
+ } catch (e) {
560
+ console.error(`❌ Failed to clean up temporary directory ${userDataDir}:`, e.message);
561
+ }
562
+ }
563
  }
564
  };
565
 
566
+ // --- API Routes, Health, and Info Endpoints (Unchanged) ---
567
  app.post('/api/request-download', (req, res) => {
568
  const { url, email, password } = req.body;
569
  if (!url || !url.includes('studocu.com')) {
 
578
 
579
  console.log(`🎯 Processing request for: ${url} [Session: ${sessionId}]`);
580
 
 
581
  res.json({ sessionId });
582
 
 
583
  studocuDownloader(url, { email, password }, progressTracker)
584
  .then(pdfBuffer => {
 
585
  downloadJobs.set(sessionId, { status: 'completed', buffer: pdfBuffer });
586
+ progressTrackers.delete(sessionId);
587
  })
588
  .catch(error => {
 
589
  downloadJobs.set(sessionId, { status: 'error', message: error.message });
590
+ progressTrackers.delete(sessionId);
591
  });
592
  });
593
 
 
596
  const tracker = progressTrackers.get(sessionId);
597
 
598
  if (tracker) {
 
599
  return res.json({
600
  sessionId,
601
  progress: tracker.progress,
 
607
 
608
  const job = downloadJobs.get(sessionId);
609
  if (job) {
 
610
  if (job.status === 'completed') {
611
  return res.json({ sessionId, progress: 100, status: 'completed', message: 'PDF generated successfully!' });
612
  }
 
638
  res.setHeader('Content-Type', 'application/pdf');
639
  res.setHeader('Content-Disposition', 'attachment; filename=studocu-document.pdf');
640
  res.send(job.buffer);
 
 
641
  } else {
642
  res.status(500).json({ error: 'An unknown error occurred.' });
643
  }
644
  });
645
 
 
646
  app.get('/health', (req, res) => {
647
  res.json({
648
  status: 'healthy',
 
654
 
655
  app.get('/', (req, res) => {
656
  res.json({
657
+ message: 'πŸš€ Enhanced StuDocu Downloader API v5.3 - Real-time Progress Tracking with Cloudflare Bypass',
658
+ version: '5.3.0',
659
  features: [
660
+ 'πŸ›‘οΈ Cloudflare JS Challenge Bypass',
661
  'πŸͺ Advanced cookie banner bypass',
662
  'πŸ”“ Premium content unblurring',
663
  'πŸ”‘ Login support for full access',
 
685
  });
686
 
687
  app.listen(port, () => {
688
+ console.log(`πŸš€ Enhanced StuDocu Downloader v5.3.0 running on http://localhost:${port}`);
689
+ console.log(`✨ Features: Cloudflare Bypass, Real-time progress tracking, enhanced stealth, and user feedback`);
690
  });