devusman committed on
Commit
3ef8fed
·
1 Parent(s): 7d05c0c
Files changed (1) hide show
  1. server.js +91 -85
server.js CHANGED
@@ -1,20 +1,10 @@
1
  const express = require('express');
2
- const puppeteerExtra = require('puppeteer-extra');
3
- const StealthPlugin = require('puppeteer-extra-plugin-stealth');
4
- const RecaptchaPlugin = require('puppeteer-extra-plugin-recaptcha');
5
  const cors = require('cors');
6
  const { EventEmitter } = require('events');
7
- const os = require('os');
8
- const fs = require('fs').promises;
9
- const path = require('path');
10
-
11
- puppeteerExtra.use(
12
- RecaptchaPlugin({
13
- provider: { id: '2captcha', token: process.env.TWOCAPTCHA_API_KEY || 'YOUR_2CAPTCHA_API_KEY' },
14
- throwOnError: false
15
- })
16
- );
17
- puppeteerExtra.use(StealthPlugin());
18
 
19
  const app = express();
20
  const port = 7860;
@@ -22,7 +12,7 @@ const port = 7860;
22
  app.use(cors());
23
  app.use(express.json());
24
 
25
- // --- Progress Tracking and Job Storage ---
26
  const progressTrackers = new Map();
27
  const downloadJobs = new Map();
28
 
@@ -165,6 +155,7 @@ const bypassCookiesAndRestrictions = async (page, progressTracker) => {
165
  progressTracker?.updateProgress(5, 'bypassing', 'Setting up cookie bypass...');
166
 
167
  console.log("πŸͺ Starting comprehensive cookie and restriction bypass...");
 
168
  const preCookies = [
169
  { name: 'cookieConsent', value: 'accepted', domain: '.studocu.com' },
170
  { name: 'cookie_consent', value: 'true', domain: '.studocu.com' },
@@ -183,9 +174,10 @@ const bypassCookiesAndRestrictions = async (page, progressTracker) => {
183
  }
184
  }
185
 
186
- // Step 2: Inject CSS to hide cookie banners immediately (Unchanged)
187
  await page.addStyleTag({
188
  content: `
 
189
  [id*="cookie" i]:not(img):not(input), [class*="cookie" i]:not(img):not(input), [data-testid*="cookie" i], [aria-label*="cookie" i],
190
  .gdpr-banner, .gdpr-popup, .gdpr-modal, .consent-banner, .consent-popup, .consent-modal, .privacy-banner, .privacy-popup, .privacy-modal,
191
  .cookie-law, .cookie-policy, .cookie-compliance, .onetrust-banner-sdk, #onetrust-consent-sdk, .cmp-banner, .cmp-popup, .cmp-modal,
@@ -197,22 +189,26 @@ const bypassCookiesAndRestrictions = async (page, progressTracker) => {
197
  z-index: -9999 !important;
198
  pointer-events: none !important;
199
  }
200
- /* Remove blur and premium overlays */
201
- [class*="blur" i], [class*="premium" i], [class*="paywall" i], [class*="sample-preview-blur" i] {
 
202
  filter: none !important;
203
  backdrop-filter: none !important;
204
  opacity: 1 !important;
205
  visibility: visible !important;
206
  }
 
207
  .document-content, .page-content, [data-page] {
208
  filter: none !important;
209
  opacity: 1 !important;
210
  visibility: visible !important;
211
  pointer-events: auto !important;
212
  }
 
213
  .fixed-overlay, .sticky-overlay, .content-overlay {
214
  display: none !important;
215
  }
 
216
  html, body {
217
  overflow: auto !important;
218
  position: static !important;
@@ -224,20 +220,24 @@ const bypassCookiesAndRestrictions = async (page, progressTracker) => {
224
  `
225
  });
226
 
 
227
  await page.evaluateOnNewDocument(() => {
 
228
  window.cookieConsent = { accepted: true };
229
  window.gtag = () => { };
230
  window.ga = () => { };
231
  window.dataLayer = [];
232
 
 
233
  const observer = new MutationObserver((mutations) => {
234
  mutations.forEach((mutation) => {
235
  mutation.addedNodes.forEach((node) => {
236
- if (node.nodeType === 1) {
237
  const element = node;
238
  const text = element.textContent || '';
239
  const className = element.className || '';
240
  const id = element.id || '';
 
241
  if (
242
  text.toLowerCase().includes('cookie') ||
243
  text.toLowerCase().includes('consent') ||
@@ -257,6 +257,7 @@ const bypassCookiesAndRestrictions = async (page, progressTracker) => {
257
  });
258
  observer.observe(document.body, { childList: true, subtree: true });
259
 
 
260
  setInterval(() => {
261
  const cookieElements = document.querySelectorAll(`
262
  [id*="cookie" i]:not(img):not(input), [class*="cookie" i]:not(img):not(input), [data-testid*="cookie" i],
@@ -264,6 +265,7 @@ const bypassCookiesAndRestrictions = async (page, progressTracker) => {
264
  .cmp-banner, .cc-banner
265
  `);
266
  cookieElements.forEach(el => el.remove());
 
267
  document.body.style.overflow = 'auto';
268
  document.documentElement.style.overflow = 'auto';
269
  }, 1000);
@@ -372,11 +374,13 @@ const applyPrintStyles = async (page, progressTracker) => {
372
  style.id = "print-style-extension";
373
  style.innerHTML = `
374
  @page {
 
375
  size: A4 portrait;
376
  margin: 0mm;
377
  }
378
  @media print {
379
  html, body {
 
380
  width: 210mm !important;
381
  height: auto !important;
382
  margin: 0 !important;
@@ -387,6 +391,7 @@ const applyPrintStyles = async (page, progressTracker) => {
387
  display: flex;
388
  justify-content: center;
389
  }
 
390
  header, footer, nav, aside, .no-print, .ads, .sidebar, .premium-banner,
391
  [class*="Header"], [class*="Footer"], [class*="Sidebar"], [id*="Header"],
392
  .ViewerToolbar, .Layout_info-bar-wrapper__He0Ho, .Sidebar_sidebar-scrollable__kqeBZ,
@@ -396,11 +401,17 @@ const applyPrintStyles = async (page, progressTracker) => {
396
  .Layout_sidebar-wrapper__unavM, .Layout_is-open__9DQr4 {
397
  display: none !important;
398
  }
 
399
  * {
400
  box-shadow: none !important;
401
  background: transparent !important;
402
  color: inherit !important;
403
  }
 
 
 
 
 
404
  .Viewer_document-wrapper__JPBWQ, .Viewer_document-wrapper__LXzoQ,
405
  .Viewer_document-wrapper__XsO4j, .page-content, .document-viewer, #page-container {
406
  position: static !important;
@@ -409,9 +420,10 @@ const applyPrintStyles = async (page, progressTracker) => {
409
  max-width: none !important;
410
  margin: 0 auto !important; /* Center horizontally */
411
  padding: 0 !important;
412
- box-sizing: border-box;
413
  transform: none !important;
414
  }
 
415
  [data-page], .page, .document-page, img {
416
  page-break-after: always !important;
417
  page-break-inside: avoid !important;
@@ -433,18 +445,12 @@ const applyPrintStyles = async (page, progressTracker) => {
433
 
434
  const studocuDownloader = async (url, options = {}, progressTracker = null) => {
435
  let browser;
436
- let userDataDir = null;
437
  try {
438
  progressTracker?.updateProgress(0, 'initializing', 'Starting browser...');
439
 
440
- const tempDir = os.tmpdir();
441
- userDataDir = await fs.mkdtemp(path.join(tempDir, 'puppeteer-'));
442
- console.log(`πŸ“‚ Created temporary user data directory: ${userDataDir}`);
443
-
444
  console.log("πŸš€ Launching browser with enhanced stealth configuration...");
445
- browser = await puppeteerExtra.launch({
446
  headless: true,
447
- userDataDir: userDataDir,
448
  args: [
449
  '--no-sandbox',
450
  '--disable-setuid-sandbox',
@@ -484,9 +490,10 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
484
 
485
  progressTracker?.updateProgress(2, 'initializing', 'Configuring browser settings...');
486
 
487
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36');
488
- await page.setViewport({ width: 794, height: 1122 }); // A4 size in pixels at 96 DPI
489
 
 
490
  await page.evaluateOnNewDocument(() => {
491
  Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
492
  Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
@@ -505,9 +512,10 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
505
  };
506
  });
507
 
 
508
  await bypassCookiesAndRestrictions(page, progressTracker);
509
 
510
- // Block unnecessary resources (UPDATED: Block more aggressively, including scripts, fonts, and stylesheets if not critical)
511
  await page.setRequestInterception(true);
512
  page.on('request', (req) => {
513
  const resourceType = req.resourceType();
@@ -527,7 +535,7 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
527
  if (
528
  ['image', 'media', 'font', 'stylesheet'].includes(resourceType) && // Block non-essential images/media/fonts/styles early if not core
529
  !reqUrl.includes('document') && !reqUrl.includes('page') && !reqUrl.includes('studocu') || // Allow core document images
530
- resourceType === 'script' && !reqUrl.includes('studocu') || // Block third-party scripts
531
  reqUrl.includes('doubleclick') ||
532
  reqUrl.includes('googletagmanager') ||
533
  reqUrl.includes('facebook.com') ||
@@ -546,18 +554,21 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
546
  }
547
  });
548
 
 
549
  if (options.email && options.password) {
550
  progressTracker?.updateProgress(12, 'authenticating', 'Logging into StuDocu...');
551
 
552
  console.log("πŸ”‘ Logging in to StuDocu...");
553
- await page.goto('https://www.studocu.com/en-us/login', { waitUntil: 'domcontentloaded', timeout: 60000 }); // Reduced timeout from 120000
554
- await page.waitForSelector('#email', { timeout: 10000 }); // Reduced from 15000
 
 
555
  await page.type('#email', options.email);
556
  await page.type('#password', options.password);
557
  await page.click('button[type="submit"]');
558
  try {
559
- await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 15000 }); // Reduced from 30000
560
- await page.waitForSelector('.user-profile, [data-testid="user-menu"]', { timeout: 5000 }); // Reduced from 10000
561
  console.log("βœ… Login successful.");
562
  progressTracker?.updateProgress(18, 'authenticated', 'Login successful');
563
  } catch (e) {
@@ -578,21 +589,25 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
578
  attempts++;
579
  progressTracker?.updateProgress(30 + (attempts * 5), 'navigating', `Navigation attempt ${attempts}/${maxAttempts}`);
580
  console.log(`Navigation attempt ${attempts}/${maxAttempts}`);
581
- await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 }); // Reduced timeout from 150000
582
  navigationSuccess = true;
583
  } catch (e) {
584
  console.log(`Navigation attempt ${attempts} failed:`, e.message);
585
  if (attempts >= maxAttempts) throw e;
586
- await new Promise(resolve => setTimeout(resolve, 5000)); // Reduced retry delay from 15000 to 5000ms
587
  }
588
  }
589
 
 
 
 
590
  progressTracker?.updateProgress(40, 'loading', 'Page loaded, waiting for content...');
591
- await new Promise(resolve => setTimeout(resolve, 2000)); // Reduced from 5000ms
592
 
593
  // Apply content unblurring
594
  await unblurContent(page, progressTracker);
595
 
 
596
  progressTracker?.updateProgress(45, 'loading', 'Waiting for document content...');
597
  console.log("⏳ Waiting for document content to load...");
598
 
@@ -603,7 +618,7 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
603
  let contentFound = false;
604
  for (const selector of contentSelectors) {
605
  try {
606
- await page.waitForSelector(selector, { timeout: 10000 }); // Reduced from 20000
607
  console.log(`βœ… Found content with selector: ${selector}`);
608
  contentFound = true;
609
  break;
@@ -616,6 +631,7 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
616
  console.log("⚠️ No specific content selector found, proceeding with page content...");
617
  }
618
 
 
619
  progressTracker?.updateProgress(50, 'scrolling', 'Loading all document pages...');
620
  console.log("πŸ“œ Loading all document pages with enhanced slow scroll...");
621
 
@@ -624,26 +640,28 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
624
  let scrollHeight = document.body.scrollHeight;
625
  while (true) {
626
  let totalHeight = 0;
627
- const distance = 600;
628
  while (totalHeight < scrollHeight) {
629
  window.scrollBy(0, distance);
630
  totalHeight += distance;
631
- await delay(200); // Reduced from 500ms
632
  }
633
- await delay(1000); // Reduced from 2000ms
634
  const newHeight = document.body.scrollHeight;
635
  if (newHeight === scrollHeight) break;
636
  scrollHeight = newHeight;
637
  }
638
  window.scrollTo({ top: 0, behavior: "smooth" });
639
- await delay(500); // Reduced from 1000ms
640
  });
641
 
 
642
  await unblurContent(page, progressTracker);
643
 
644
  // New: Fetch clear images for blurred pages
645
  await fetchClearImages(page, progressTracker);
646
 
 
647
  progressTracker?.updateProgress(75, 'loading_images', 'Loading images...');
648
  console.log("πŸ–ΌοΈ Waiting for all images to load...");
649
 
@@ -654,14 +672,15 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
654
  return new Promise((resolve) => {
655
  img.addEventListener('load', resolve);
656
  img.addEventListener('error', resolve);
657
- setTimeout(resolve, 5000); // Reduced from 15000ms
658
  });
659
  }));
660
  });
661
 
662
- await new Promise(resolve => setTimeout(resolve, 2000)); // Reduced from 5000ms
663
  progressTracker?.updateProgress(80, 'finalizing', 'Preparing document for PDF generation...');
664
 
 
665
  await page.evaluate(() => {
666
  const getDocumentHeight = () => Math.max(
667
  document.body.scrollHeight, document.body.offsetHeight,
@@ -673,6 +692,7 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
673
  document.body.style.overflow = 'hidden !important';
674
  });
675
 
 
676
  const contentCheck = await page.evaluate(() => {
677
  const textContent = document.body.textContent || '';
678
  const images = document.querySelectorAll('img');
@@ -699,6 +719,7 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
699
  console.warn("⚠️ Warning: Limited document content detected.");
700
  }
701
 
 
702
  await applyPrintStyles(page, progressTracker);
703
  await page.emulateMediaType('print');
704
 
@@ -707,9 +728,9 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
707
 
708
  const pdfBuffer = await page.pdf({
709
  printBackground: true,
710
- preferCSSPageSize: true,
711
  displayHeaderFooter: false,
712
- timeout: 60000, // Reduced from 180000
713
  scale: 1,
714
  omitBackground: false
715
  });
@@ -731,19 +752,10 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
731
  console.log("Error closing browser:", e.message);
732
  }
733
  }
734
- if (userDataDir) {
735
- console.log(`πŸ—‘οΈ Cleaning up temporary directory: ${userDataDir}`);
736
- try {
737
- await fs.rm(userDataDir, { recursive: true, force: true });
738
- console.log("βœ… Temporary directory cleaned up.");
739
- } catch (e) {
740
- console.error(`❌ Failed to clean up temporary directory ${userDataDir}:`, e.message);
741
- }
742
- }
743
  }
744
  };
745
 
746
- // --- API Routes ---
747
  app.post('/api/request-download', (req, res) => {
748
  const { url, email, password } = req.body;
749
  if (!url || !url.includes('studocu.com')) {
@@ -754,30 +766,25 @@ app.post('/api/request-download', (req, res) => {
754
  const progressTracker = new ProgressTracker(sessionId);
755
 
756
  progressTrackers.set(sessionId, progressTracker);
757
- downloadJobs.set(sessionId, { status: 'processing', createdAt: Date.now() }); // MODIFIED: Added createdAt
758
 
759
  console.log(`🎯 Processing request for: ${url} [Session: ${sessionId}]`);
760
 
 
761
  res.json({ sessionId });
762
 
 
763
  studocuDownloader(url, { email, password }, progressTracker)
764
  .then(pdfBuffer => {
765
- downloadJobs.set(sessionId, { status: 'completed', buffer: pdfBuffer, createdAt: Date.now() });
766
- progressTrackers.delete(sessionId);
 
767
  })
768
  .catch(error => {
769
- downloadJobs.set(sessionId, { status: 'error', message: error.message, createdAt: Date.now() });
770
- progressTrackers.delete(sessionId);
 
771
  });
772
-
773
- // NEW: Timeout for job cleanup
774
- setTimeout(() => {
775
- if (downloadJobs.has(sessionId) && downloadJobs.get(sessionId).status === 'processing') {
776
- downloadJobs.set(sessionId, { status: 'error', message: 'Job timed out after 5 minutes', createdAt: Date.now() });
777
- progressTrackers.delete(sessionId);
778
- console.log(`πŸ•’ Session ${sessionId} timed out`);
779
- }
780
- }, 300000); // 5 minutes
781
  });
782
 
783
  app.get('/api/progress/:sessionId', (req, res) => {
@@ -785,6 +792,7 @@ app.get('/api/progress/:sessionId', (req, res) => {
785
  const tracker = progressTrackers.get(sessionId);
786
 
787
  if (tracker) {
 
788
  return res.json({
789
  sessionId,
790
  progress: tracker.progress,
@@ -796,6 +804,7 @@ app.get('/api/progress/:sessionId', (req, res) => {
796
 
797
  const job = downloadJobs.get(sessionId);
798
  if (job) {
 
799
  if (job.status === 'completed') {
800
  return res.json({ sessionId, progress: 100, status: 'completed', message: 'PDF generated successfully!' });
801
  }
@@ -816,12 +825,7 @@ app.get('/api/download/:sessionId', (req, res) => {
816
  }
817
 
818
  if (job.status === 'processing') {
819
- const elapsed = Date.now() - job.createdAt;
820
- if (elapsed > 300000) { // 5 minutes
821
- downloadJobs.set(sessionId, { status: 'error', message: 'Download timed out after 5 minutes' });
822
- return res.status(500).json({ error: 'Download timed out after 5 minutes.' });
823
- }
824
- return res.status(400).json({ error: 'Download is still processing. Please try again in a few seconds.' });
825
  }
826
 
827
  if (job.status === 'error') {
@@ -832,13 +836,14 @@ app.get('/api/download/:sessionId', (req, res) => {
832
  res.setHeader('Content-Type', 'application/pdf');
833
  res.setHeader('Content-Disposition', 'attachment; filename=studocu-document.pdf');
834
  res.send(job.buffer);
835
- downloadJobs.delete(sessionId); // MODIFIED: Clean up after successful download
 
836
  } else {
837
  res.status(500).json({ error: 'An unknown error occurred.' });
838
  }
839
  });
840
 
841
- // --- Health and Info Endpoints ---
842
  app.get('/health', (req, res) => {
843
  res.json({
844
  status: 'healthy',
@@ -850,16 +855,17 @@ app.get('/health', (req, res) => {
850
 
851
  app.get('/', (req, res) => {
852
  res.json({
853
- message: 'πŸš€ Enhanced StuDocu Downloader API v5.2 - Real-time Progress Tracking with Stealth',
854
- version: '5.2.0',
855
  features: [
856
- 'πŸ›‘οΈ Cloudflare CAPTCHA bypass with 2Captcha',
857
  'πŸͺ Advanced cookie banner bypass',
858
  'πŸ”“ Premium content unblurring',
859
  'πŸ”‘ Login support for full access',
860
  'πŸ“Š Real-time progress tracking via polling',
861
  'πŸ“„ Clean PDF generation with print styles',
862
- 'πŸ•΅οΈ Enhanced stealth to evade bot detection'
 
 
863
  ],
864
  endpoints: {
865
  request: 'POST /api/request-download (body: {url, filename?, email?, password?})',
@@ -881,6 +887,6 @@ process.on('SIGINT', () => {
881
  });
882
 
883
  app.listen(port, () => {
884
- console.log(`πŸš€ Enhanced StuDocu Downloader v5.2.0 running on http://localhost:${port}`);
885
- console.log(`✨ Features: Real-time progress tracking, enhanced stealth, and user feedback`);
886
  });
 
1
  const express = require('express');
2
+ const puppeteerExtra = require('puppeteer-extra'); // NEW: For stealth
3
+ const StealthPlugin = require('puppeteer-extra-plugin-stealth'); // NEW: Stealth plugin
 
4
  const cors = require('cors');
5
  const { EventEmitter } = require('events');
6
+
7
+ puppeteerExtra.use(StealthPlugin()); // NEW: Enable stealth plugin
 
 
 
 
 
 
 
 
 
8
 
9
  const app = express();
10
  const port = 7860;
 
12
  app.use(cors());
13
  app.use(express.json());
14
 
15
+ // --- Progress Tracking and Job Storage --- (Unchanged)
16
  const progressTrackers = new Map();
17
  const downloadJobs = new Map();
18
 
 
155
  progressTracker?.updateProgress(5, 'bypassing', 'Setting up cookie bypass...');
156
 
157
  console.log("πŸͺ Starting comprehensive cookie and restriction bypass...");
158
+ // Step 1: Set cookies before page load
159
  const preCookies = [
160
  { name: 'cookieConsent', value: 'accepted', domain: '.studocu.com' },
161
  { name: 'cookie_consent', value: 'true', domain: '.studocu.com' },
 
174
  }
175
  }
176
 
177
+ // Step 2: Inject CSS to hide cookie banners immediately (Updated: Added more selectors for previews and blurred overlays)
178
  await page.addStyleTag({
179
  content: `
180
+ /* Hide all possible cookie banners */
181
  [id*="cookie" i]:not(img):not(input), [class*="cookie" i]:not(img):not(input), [data-testid*="cookie" i], [aria-label*="cookie" i],
182
  .gdpr-banner, .gdpr-popup, .gdpr-modal, .consent-banner, .consent-popup, .consent-modal, .privacy-banner, .privacy-popup, .privacy-modal,
183
  .cookie-law, .cookie-policy, .cookie-compliance, .onetrust-banner-sdk, #onetrust-consent-sdk, .cmp-banner, .cmp-popup, .cmp-modal,
 
189
  z-index: -9999 !important;
190
  pointer-events: none !important;
191
  }
192
+ /* Remove blur and premium overlays, including previews */
193
+ [class*="blur" i], [class*="premium" i], [class*="paywall" i], [class*="sample-preview-blur" i], [class*="preview" i], [class*="blurred-container" i], [class*="blurred" i] {
194
+ display: none !important;
195
  filter: none !important;
196
  backdrop-filter: none !important;
197
  opacity: 1 !important;
198
  visibility: visible !important;
199
  }
200
+ /* Ensure document content is visible */
201
  .document-content, .page-content, [data-page] {
202
  filter: none !important;
203
  opacity: 1 !important;
204
  visibility: visible !important;
205
  pointer-events: auto !important;
206
  }
207
+ /* Remove fixed overlays */
208
  .fixed-overlay, .sticky-overlay, .content-overlay {
209
  display: none !important;
210
  }
211
+ /* Restore scrolling */
212
  html, body {
213
  overflow: auto !important;
214
  position: static !important;
 
220
  `
221
  });
222
 
223
+ // Step 3: Inject JavaScript to handle dynamic cookie banners (Unchanged)
224
  await page.evaluateOnNewDocument(() => {
225
+ // Override common cookie consent functions
226
  window.cookieConsent = { accepted: true };
227
  window.gtag = () => { };
228
  window.ga = () => { };
229
  window.dataLayer = [];
230
 
231
+ // Mutation observer to catch dynamically added cookie banners
232
  const observer = new MutationObserver((mutations) => {
233
  mutations.forEach((mutation) => {
234
  mutation.addedNodes.forEach((node) => {
235
+ if (node.nodeType === 1) { // Element node
236
  const element = node;
237
  const text = element.textContent || '';
238
  const className = element.className || '';
239
  const id = element.id || '';
240
+ // Check if this looks like a cookie banner
241
  if (
242
  text.toLowerCase().includes('cookie') ||
243
  text.toLowerCase().includes('consent') ||
 
257
  });
258
  observer.observe(document.body, { childList: true, subtree: true });
259
 
260
+ // Set up periodic cleanup
261
  setInterval(() => {
262
  const cookieElements = document.querySelectorAll(`
263
  [id*="cookie" i]:not(img):not(input), [class*="cookie" i]:not(img):not(input), [data-testid*="cookie" i],
 
265
  .cmp-banner, .cc-banner
266
  `);
267
  cookieElements.forEach(el => el.remove());
268
+ // Restore body scroll
269
  document.body.style.overflow = 'auto';
270
  document.documentElement.style.overflow = 'auto';
271
  }, 1000);
 
374
  style.id = "print-style-extension";
375
  style.innerHTML = `
376
  @page {
377
+ /* Set page size to A4 and remove default margins */
378
  size: A4 portrait;
379
  margin: 0mm;
380
  }
381
  @media print {
382
  html, body {
383
+ /* Ensure the body takes the full width and has no extra padding/margin */
384
  width: 210mm !important;
385
  height: auto !important;
386
  margin: 0 !important;
 
391
  display: flex;
392
  justify-content: center;
393
  }
394
+ /* Remove all unwanted elements like headers, footers, sidebars, etc. */
395
  header, footer, nav, aside, .no-print, .ads, .sidebar, .premium-banner,
396
  [class*="Header"], [class*="Footer"], [class*="Sidebar"], [id*="Header"],
397
  .ViewerToolbar, .Layout_info-bar-wrapper__He0Ho, .Sidebar_sidebar-scrollable__kqeBZ,
 
401
  .Layout_sidebar-wrapper__unavM, .Layout_is-open__9DQr4 {
402
  display: none !important;
403
  }
404
+ /* Force all elements to have a transparent background and no shadow */
405
  * {
406
  box-shadow: none !important;
407
  background: transparent !important;
408
  color: inherit !important;
409
  }
410
+ /*
411
+ * KEY FIX: Target the main document container.
412
+ * Force it to be a block element, remove any transforms or max-widths,
413
+ * and center it perfectly within the page.
414
+ */
415
  .Viewer_document-wrapper__JPBWQ, .Viewer_document-wrapper__LXzoQ,
416
  .Viewer_document-wrapper__XsO4j, .page-content, .document-viewer, #page-container {
417
  position: static !important;
 
420
  max-width: none !important;
421
  margin: 0 auto !important; /* Center horizontally */
422
  padding: 0 !important;
423
+ box-sizing: border-box; /* Include padding in width calculation */
424
  transform: none !important;
425
  }
426
+ /* Ensure individual pages and images within the document use the full width */
427
  [data-page], .page, .document-page, img {
428
  page-break-after: always !important;
429
  page-break-inside: avoid !important;
 
445
 
446
  const studocuDownloader = async (url, options = {}, progressTracker = null) => {
447
  let browser;
 
448
  try {
449
  progressTracker?.updateProgress(0, 'initializing', 'Starting browser...');
450
 
 
 
 
 
451
  console.log("πŸš€ Launching browser with enhanced stealth configuration...");
452
+ browser = await puppeteerExtra.launch({ // UPDATED: Use puppeteerExtra
453
  headless: true,
 
454
  args: [
455
  '--no-sandbox',
456
  '--disable-setuid-sandbox',
 
490
 
491
  progressTracker?.updateProgress(2, 'initializing', 'Configuring browser settings...');
492
 
493
+ await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
494
+ await page.setViewport({ width: 1920, height: 1080 }); // NEW: Use full HD for more realistic viewport, adjust back if needed for A4
495
 
496
+ // NOTE: Stealth plugin handles most of this, but keeping for extra safety
497
  await page.evaluateOnNewDocument(() => {
498
  Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
499
  Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
 
512
  };
513
  });
514
 
515
+ // Set up cookie and content bypass
516
  await bypassCookiesAndRestrictions(page, progressTracker);
517
 
518
+ // Block unnecessary resources (UPDATED: Loosened for Cloudflare - allow cloudflare.com requests)
519
  await page.setRequestInterception(true);
520
  page.on('request', (req) => {
521
  const resourceType = req.resourceType();
 
535
  if (
536
  ['image', 'media', 'font', 'stylesheet'].includes(resourceType) && // Block non-essential images/media/fonts/styles early if not core
537
  !reqUrl.includes('document') && !reqUrl.includes('page') && !reqUrl.includes('studocu') || // Allow core document images
538
+ resourceType === 'script' && !reqUrl.includes('studocu') && !reqUrl.includes('cloudflare') || // Block third-party scripts except Cloudflare
539
  reqUrl.includes('doubleclick') ||
540
  reqUrl.includes('googletagmanager') ||
541
  reqUrl.includes('facebook.com') ||
 
554
  }
555
  });
556
 
557
+ // Login if credentials provided
558
  if (options.email && options.password) {
559
  progressTracker?.updateProgress(12, 'authenticating', 'Logging into StuDocu...');
560
 
561
  console.log("πŸ”‘ Logging in to StuDocu...");
562
+ await page.goto('https://www.studocu.com/en-us/login', { waitUntil: 'domcontentloaded', timeout: 120000 });
563
+ // NEW: Handle potential Cloudflare on login page
564
+ await handleCloudflareChallenge(page, progressTracker);
565
+ await page.waitForSelector('#email', { timeout: 15000 });
566
  await page.type('#email', options.email);
567
  await page.type('#password', options.password);
568
  await page.click('button[type="submit"]');
569
  try {
570
+ await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 30000 });
571
+ await page.waitForSelector('.user-profile, [data-testid="user-menu"]', { timeout: 10000 });
572
  console.log("βœ… Login successful.");
573
  progressTracker?.updateProgress(18, 'authenticated', 'Login successful');
574
  } catch (e) {
 
589
  attempts++;
590
  progressTracker?.updateProgress(30 + (attempts * 5), 'navigating', `Navigation attempt ${attempts}/${maxAttempts}`);
591
  console.log(`Navigation attempt ${attempts}/${maxAttempts}`);
592
+ await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 120000 }); // Increased from 60000
593
  navigationSuccess = true;
594
  } catch (e) {
595
  console.log(`Navigation attempt ${attempts} failed:`, e.message);
596
  if (attempts >= maxAttempts) throw e;
597
+ await new Promise(resolve => setTimeout(resolve, 10000)); // Increased retry delay to 10s for stability
598
  }
599
  }
600
 
601
+ // NEW: Handle Cloudflare after navigation
602
+ await handleCloudflareChallenge(page, progressTracker);
603
+
604
  progressTracker?.updateProgress(40, 'loading', 'Page loaded, waiting for content...');
605
+ await new Promise(resolve => setTimeout(resolve, 5000)); // Increased from 2000ms for better loading
606
 
607
  // Apply content unblurring
608
  await unblurContent(page, progressTracker);
609
 
610
+ // Wait for document content
611
  progressTracker?.updateProgress(45, 'loading', 'Waiting for document content...');
612
  console.log("⏳ Waiting for document content to load...");
613
 
 
618
  let contentFound = false;
619
  for (const selector of contentSelectors) {
620
  try {
621
+ await page.waitForSelector(selector, { timeout: 20000 }); // Increased from 10000
622
  console.log(`βœ… Found content with selector: ${selector}`);
623
  contentFound = true;
624
  break;
 
631
  console.log("⚠️ No specific content selector found, proceeding with page content...");
632
  }
633
 
634
+ // Enhanced scrolling to load all content (larger scroll distance; delays lengthened for stability on large documents)
635
  progressTracker?.updateProgress(50, 'scrolling', 'Loading all document pages...');
636
  console.log("πŸ“œ Loading all document pages with enhanced slow scroll...");
637
 
 
640
  let scrollHeight = document.body.scrollHeight;
641
  while (true) {
642
  let totalHeight = 0;
643
+ const distance = 600; // Increased from 300 for faster coverage
644
  while (totalHeight < scrollHeight) {
645
  window.scrollBy(0, distance);
646
  totalHeight += distance;
647
+ await delay(300); // Increased from 200ms for large docs stability
648
  }
649
+ await delay(2000); // Increased from 1000ms
650
  const newHeight = document.body.scrollHeight;
651
  if (newHeight === scrollHeight) break;
652
  scrollHeight = newHeight;
653
  }
654
  window.scrollTo({ top: 0, behavior: "smooth" });
655
+ await delay(1000); // Increased from 500ms
656
  });
657
 
658
+ // Re-apply unblur after loading new content
659
  await unblurContent(page, progressTracker);
660
 
661
  // New: Fetch clear images for blurred pages
662
  await fetchClearImages(page, progressTracker);
663
 
664
+ // Wait for all images to load (parallel wait; per-image timeout raised to 10s for large documents)
665
  progressTracker?.updateProgress(75, 'loading_images', 'Loading images...');
666
  console.log("πŸ–ΌοΈ Waiting for all images to load...");
667
 
 
672
  return new Promise((resolve) => {
673
  img.addEventListener('load', resolve);
674
  img.addEventListener('error', resolve);
675
+ setTimeout(resolve, 10000); // Increased from 5000ms for large docs
676
  });
677
  }));
678
  });
679
 
680
+ await new Promise(resolve => setTimeout(resolve, 5000)); // Increased from 2000ms
681
  progressTracker?.updateProgress(80, 'finalizing', 'Preparing document for PDF generation...');
682
 
683
+ // Set exact height
684
  await page.evaluate(() => {
685
  const getDocumentHeight = () => Math.max(
686
  document.body.scrollHeight, document.body.offsetHeight,
 
692
  document.body.style.overflow = 'hidden !important';
693
  });
694
 
695
+ // Content verification (Unchanged, as it's quick)
696
  const contentCheck = await page.evaluate(() => {
697
  const textContent = document.body.textContent || '';
698
  const images = document.querySelectorAll('img');
 
719
  console.warn("⚠️ Warning: Limited document content detected.");
720
  }
721
 
722
+ // Apply print styles and generate PDF
723
  await applyPrintStyles(page, progressTracker);
724
  await page.emulateMediaType('print');
725
 
 
728
 
729
  const pdfBuffer = await page.pdf({
730
  printBackground: true,
731
+ preferCSSPageSize: true, // Use the @page size
732
  displayHeaderFooter: false,
733
+ timeout: 180000, // Increased back to 180000 for large PDFs
734
  scale: 1,
735
  omitBackground: false
736
  });
 
752
  console.log("Error closing browser:", e.message);
753
  }
754
  }
 
 
 
 
 
 
 
 
 
755
  }
756
  };
757
 
758
+ // --- API Routes --- (Unchanged)
759
  app.post('/api/request-download', (req, res) => {
760
  const { url, email, password } = req.body;
761
  if (!url || !url.includes('studocu.com')) {
 
766
  const progressTracker = new ProgressTracker(sessionId);
767
 
768
  progressTrackers.set(sessionId, progressTracker);
769
+ downloadJobs.set(sessionId, { status: 'processing' });
770
 
771
  console.log(`🎯 Processing request for: ${url} [Session: ${sessionId}]`);
772
 
773
+ // Respond to the client immediately with the session ID
774
  res.json({ sessionId });
775
 
776
+ // --- Start the PDF generation in the background ---
777
  studocuDownloader(url, { email, password }, progressTracker)
778
  .then(pdfBuffer => {
779
+ // Store the successful result
780
+ downloadJobs.set(sessionId, { status: 'completed', buffer: pdfBuffer });
781
+ progressTrackers.delete(sessionId); // Clean up live tracker
782
  })
783
  .catch(error => {
784
+ // Store the error
785
+ downloadJobs.set(sessionId, { status: 'error', message: error.message });
786
+ progressTrackers.delete(sessionId); // Clean up live tracker
787
  });
 
 
 
 
 
 
 
 
 
788
  });
789
 
790
  app.get('/api/progress/:sessionId', (req, res) => {
 
792
  const tracker = progressTrackers.get(sessionId);
793
 
794
  if (tracker) {
795
+ // Job is in progress, return live data
796
  return res.json({
797
  sessionId,
798
  progress: tracker.progress,
 
804
 
805
  const job = downloadJobs.get(sessionId);
806
  if (job) {
807
+ // Job is finished, return final state
808
  if (job.status === 'completed') {
809
  return res.json({ sessionId, progress: 100, status: 'completed', message: 'PDF generated successfully!' });
810
  }
 
825
  }
826
 
827
  if (job.status === 'processing') {
828
+ return res.status(400).json({ error: 'Download is still processing.' });
 
 
 
 
 
829
  }
830
 
831
  if (job.status === 'error') {
 
836
  res.setHeader('Content-Type', 'application/pdf');
837
  res.setHeader('Content-Disposition', 'attachment; filename=studocu-document.pdf');
838
  res.send(job.buffer);
839
+ // Optional: Clean up the job after download to save memory
840
+ // downloadJobs.delete(sessionId);
841
  } else {
842
  res.status(500).json({ error: 'An unknown error occurred.' });
843
  }
844
  });
845
 
846
+ // --- Health and Info Endpoints (Unchanged) ---
847
  app.get('/health', (req, res) => {
848
  res.json({
849
  status: 'healthy',
 
855
 
856
  app.get('/', (req, res) => {
857
  res.json({
858
+ message: '🚀 Enhanced StuDocu Downloader API v5.3 - Real-time Progress Tracking with Cloudflare Bypass',
859
+ version: '5.3.0',
860
  features: [
 
861
  '🍪 Advanced cookie banner bypass',
862
  '🔓 Premium content unblurring',
863
  '🔑 Login support for full access',
864
  '📊 Real-time progress tracking via polling',
865
  '📄 Clean PDF generation with print styles',
866
+ '🕵️ Enhanced stealth to evade bot detection',
867
+ '☁️ Automatic Cloudflare challenge handling',
868
+ '🧑 Human-like behavior simulation'
869
  ],
870
  endpoints: {
871
  request: 'POST /api/request-download (body: {url, filename?, email?, password?})',
 
887
  });
888
 
889
  app.listen(port, () => {
890
+ console.log(`🚀 Enhanced StuDocu Downloader v5.3.0 running on http://localhost:${port}`);
891
+ console.log(`✨ Features: Real-time progress tracking, enhanced stealth, Cloudflare bypass, and user feedback`);
892
  });