const express = require('express');
const puppeteerExtra = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
// The recaptcha plugin helps solve Cloudflare and other challenges.
const RecaptchaPlugin = require('puppeteer-extra-plugin-recaptcha');
const cors = require('cors');
const crypto = require('crypto');
const { EventEmitter } = require('events');
const os = require('os');
const fs = require('fs').promises;
const path = require('path');

// --- Configuration ---

// FIXME(security): the fallback token below was committed to source control.
// Rotate it and supply the key through TWOCAPTCHA_API_KEY instead; the literal
// is kept only so existing deployments keep working unchanged.
const TWO_CAPTCHA_TOKEN = process.env.TWOCAPTCHA_API_KEY || 'cc4f0d688032c69ecf359cccdabbacb9';

// How long a finished job (including its PDF buffer) stays in memory.
const JOB_TTL_MS = 30 * 60 * 1000;

// Upper bound on full scroll passes, so a page that keeps growing cannot hang a job.
const MAX_SCROLL_PASSES = 20;

puppeteerExtra.use(
  RecaptchaPlugin({
    provider: {
      id: '2captcha',
      token: TWO_CAPTCHA_TOKEN,
    },
  })
);
puppeteerExtra.use(StealthPlugin());

const app = express();
const port = Number(process.env.PORT) || 7860;

app.use(cors());
app.use(express.json());

// --- Progress Tracking and Job Storage ---

// sessionId -> ProgressTracker for jobs that are still running.
const progressTrackers = new Map();
// sessionId -> { status, buffer?, message? } for every known job.
const downloadJobs = new Map();

/**
 * Holds the latest progress of one download job and emits a `progress`
 * event on every update.
 */
class ProgressTracker extends EventEmitter {
  /**
   * @param {string} sessionId - Identifier echoed in every update and log line.
   */
  constructor(sessionId) {
    super();
    this.sessionId = sessionId;
    this.progress = 0;
    this.status = 'initializing';
    this.message = '';
  }

  /**
   * Records a new state, emits it as a `progress` event and logs it.
   *
   * The helpers in this file report fixed percentages and some run more than
   * once per job, so the stored percentage is clamped to never decrease.
   * A negative value is the error sentinel and is stored as-is.
   *
   * @param {number} progress - Percentage, or a negative number on error.
   * @param {string} status - Short machine-readable phase name.
   * @param {string} message - Human-readable description.
   */
  updateProgress(progress, status, message) {
    const effectiveProgress = progress < 0 ? progress : Math.max(this.progress, progress);
    this.progress = effectiveProgress;
    this.status = status;
    this.message = message;
    const update = {
      sessionId: this.sessionId,
      progress: effectiveProgress,
      status,
      message,
      timestamp: new Date().toISOString(),
    };
    this.emit('progress', update);
    console.log(`📊 [${this.sessionId}] ${effectiveProgress}% - ${status}: ${message}`);
  }
}

// --- Puppeteer Logic ---

/**
 * Pre-accepts cookie consent and hides consent banners / paywall overlays.
 *
 * Sets consent cookies, injects a stylesheet into the current document, and
 * registers a script for documents loaded by later navigations (the script
 * registered with `evaluateOnNewDocument` does not run in the current one).
 *
 * @param {object} page - Puppeteer page.
 * @param {ProgressTracker|null} progressTracker - Optional progress sink.
 * @returns {Promise<boolean>} Always resolves to `true`.
 */
const bypassCookiesAndRestrictions = async (page, progressTracker) => {
  progressTracker?.updateProgress(5, 'bypassing', 'Setting up cookie bypass...');
  console.log("🍪 Starting comprehensive cookie and restriction bypass...");

  // Step 1: set consent cookies.
  const preCookies = [
    { name: 'cookieConsent', value: 'accepted', domain: '.studocu.com' },
    { name: 'cookie_consent', value: 'true', domain: '.studocu.com' },
    { name: 'gdpr_consent', value: 'accepted', domain: '.studocu.com' },
    { name: 'privacy_policy_accepted', value: 'true', domain: '.studocu.com' },
    { name: 'user_consent', value: '1', domain: '.studocu.com' },
    { name: 'analytics_consent', value: 'false', domain: '.studocu.com' },
    { name: 'marketing_consent', value: 'false', domain: '.studocu.com' },
    { name: 'functional_consent', value: 'true', domain: '.studocu.com' },
  ];

  for (const cookie of preCookies) {
    try {
      await page.setCookie(cookie);
    } catch (e) {
      // Best effort: a rejected cookie must not abort the whole job.
      console.log(`Failed to set cookie ${cookie.name}:`, e.message);
    }
  }

  // Step 2: inject CSS to hide cookie banners immediately.
  // Only standard selectors are used here: a single invalid selector (such as
  // the non-CSS `:has-text()`) makes the browser drop the entire rule.
  await page.addStyleTag({
    content: `
      /* Hide all possible cookie banners */
      [id*="cookie" i]:not(img):not(input),
      [class*="cookie" i]:not(img):not(input),
      [data-testid*="cookie" i],
      [aria-label*="cookie" i],
      .gdpr-banner, .gdpr-popup, .gdpr-modal,
      .consent-banner, .consent-popup, .consent-modal,
      .privacy-banner, .privacy-popup, .privacy-modal,
      .cookie-law, .cookie-policy, .cookie-compliance,
      .onetrust-banner-sdk, #onetrust-consent-sdk,
      .cmp-banner, .cmp-popup, .cmp-modal,
      [class*="CookieBanner"], [class*="CookieNotice"],
      [class*="ConsentBanner"], [class*="ConsentManager"],
      .cc-banner, .cc-window, .cc-compliance {
        display: none !important;
        visibility: hidden !important;
        opacity: 0 !important;
        z-index: -9999 !important;
        pointer-events: none !important;
      }

      /* Remove blur and premium overlays */
      [class*="blur" i], [class*="premium" i], [class*="paywall" i], [class*="sample-preview-blur" i] {
        filter: none !important;
        backdrop-filter: none !important;
        opacity: 1 !important;
        visibility: visible !important;
      }

      /* Ensure document content is visible */
      .document-content, .page-content, [data-page] {
        filter: none !important;
        opacity: 1 !important;
        visibility: visible !important;
        pointer-events: auto !important;
      }

      /* Remove fixed overlays */
      .fixed-overlay, .sticky-overlay, .content-overlay {
        display: none !important;
      }

      /* Restore scrolling */
      html, body {
        overflow: auto !important;
        position: static !important;
      }
    `,
  });

  // Step 3: register a script that handles dynamically added cookie banners.
  await page.evaluateOnNewDocument(() => {
    // Override common cookie consent / analytics globals.
    window.cookieConsent = { accepted: true };
    window.gtag = () => { };
    window.ga = () => { };
    window.dataLayer = [];

    const looksLikeCookieBanner = (element) => {
      const text = (element.textContent || '').toLowerCase();
      // Read the attribute rather than `className`: on SVG elements
      // `className` is an object, not a string.
      const className = (element.getAttribute('class') || '').toLowerCase();
      const id = (element.id || '').toLowerCase();
      return (
        text.includes('cookie') ||
        text.includes('consent') ||
        text.includes('privacy policy') ||
        className.includes('cookie') ||
        className.includes('consent') ||
        className.includes('gdpr') ||
        id.includes('cookie') ||
        id.includes('consent')
      );
    };

    const startBannerCleanup = () => {
      // Mutation observer to catch dynamically added cookie banners.
      const observer = new MutationObserver((mutations) => {
        mutations.forEach((mutation) => {
          mutation.addedNodes.forEach((node) => {
            // nodeType 1 is an element node.
            if (node.nodeType === 1 && looksLikeCookieBanner(node)) {
              console.log('Removing detected cookie banner:', node);
              node.remove();
            }
          });
        });
      });
      observer.observe(document.body, { childList: true, subtree: true });

      // Periodic cleanup for banners the observer did not catch.
      setInterval(() => {
        const cookieElements = document.querySelectorAll(`
          [id*="cookie" i]:not(img):not(input),
          [class*="cookie" i]:not(img):not(input),
          [data-testid*="cookie" i],
          .gdpr-banner, .consent-banner, .privacy-banner,
          .onetrust-banner-sdk, #onetrust-consent-sdk,
          .cmp-banner, .cc-banner
        `);
        cookieElements.forEach((el) => el.remove());

        // Restore body scroll.
        if (document.body) {
          document.body.style.overflow = 'auto';
        }
        document.documentElement.style.overflow = 'auto';
      }, 1000);
    };

    // This script runs before the document is parsed, when `document.body`
    // is still null; defer until the DOM exists.
    if (document.body) {
      startBannerCleanup();
    } else {
      document.addEventListener('DOMContentLoaded', startBannerCleanup, { once: true });
    }
  });

  progressTracker?.updateProgress(10, 'bypassing', 'Cookie bypass configured successfully');
  return true;
};

/**
 * Removes ad / premium / blur elements from the current document and forces
 * the main content containers to be visible. The cleanup runs immediately
 * and then once per second for 30 seconds inside the page.
 *
 * @param {object} page - Puppeteer page.
 * @param {ProgressTracker|null} progressTracker - Optional progress sink.
 * @returns {Promise<void>}
 */
const unblurContent = async (page, progressTracker) => {
  progressTracker?.updateProgress(15, 'unblurring', 'Removing content restrictions...');
  console.log("🔓 Unblurring content and bypassing premium restrictions...");

  await page.evaluate(() => {
    const removeRestrictions = () => {
      const removeBySelector = (selector) => {
        document.querySelectorAll(selector).forEach((el) => el.remove());
      };

      removeBySelector("#adbox, .adsbox, .ad-box, .banner-ads, .advert");
      removeBySelector(".PremiumBannerBlobWrapper_overflow-wrapper__xsaS8");

      const removeBlur = () => {
        document.querySelectorAll("*").forEach((el) => {
          const style = window.getComputedStyle(el);
          const className = (el.getAttribute('class') || '').toLowerCase();
          if (
            style.filter?.includes("blur") ||
            style.backdropFilter?.includes("blur") ||
            parseFloat(style.opacity) < 1 ||
            className.includes("blur") ||
            className.includes("premium")
          ) {
            // `el.style.x = "value !important"` is rejected by the CSSOM;
            // the priority has to be passed through setProperty.
            el.style.setProperty('filter', 'none', 'important');
            el.style.setProperty('backdrop-filter', 'none', 'important');
            el.style.setProperty('opacity', '1', 'important');
            if (el.classList) {
              el.classList.remove("blur", "blurred", "premium-blur");
            }
          }
        });
      };

      removeBlur();
      removeBySelector('[class*="blur" i], [class*="premium" i], [class*="paywall" i]');

      const contentSelectors = [
        '.document-content', '.page-content', '.content', '[data-page]',
        '[data-testid*="document"]', '[data-testid*="page"]', '.page',
        '.document-page', 'main', 'article',
      ];
      contentSelectors.forEach((selector) => {
        document.querySelectorAll(selector).forEach((el) => {
          el.style.setProperty('filter', 'none', 'important');
          el.style.setProperty('opacity', '1', 'important');
          el.style.setProperty('visibility', 'visible', 'important');
          el.style.setProperty('display', 'block', 'important');
          el.style.setProperty('pointer-events', 'auto', 'important');
        });
      });
    };

    removeRestrictions();
    const intervalId = setInterval(removeRestrictions, 1000);
    setTimeout(() => clearInterval(intervalId), 30000);
  });

  progressTracker?.updateProgress(20, 'unblurring', 'Content restrictions removed');
};

/**
 * Appends a `<style id="print-style-extension">` element with A4 print rules
 * that hide page chrome and lay out document pages one per sheet.
 *
 * @param {object} page - Puppeteer page.
 * @param {ProgressTracker|null} progressTracker - Optional progress sink.
 * @returns {Promise<void>}
 */
const applyPrintStyles = async (page, progressTracker) => {
  progressTracker?.updateProgress(85, 'styling', 'Applying print styles...');
  console.log("🖨️ Applying print styles for clean PDF...");

  await page.evaluate(() => {
    const style = document.createElement("style");
    style.id = "print-style-extension";
    style.textContent = `
      @page { size: A4 portrait; margin: 0mm; }
      @media print {
        html, body {
          width: 210mm !important;
          height: auto !important;
          margin: 0 !important;
          padding: 0 !important;
          overflow: visible !important;
          background: white !important;
          color: black !important;
        }
        header, footer, nav, aside, .no-print, .ads, .sidebar, .premium-banner,
        [class*="Header"], [class*="Footer"], [class*="Sidebar"], [id*="Header"],
        .ViewerToolbar, .Layout_info-bar-wrapper__He0Ho, .Sidebar_sidebar-scrollable__kqeBZ,
        .HeaderWrapper_header-wrapper__mCmf3, .Layout_visible-content-bottom-wrapper-sticky__yaaAB,
        .Layout_bottom-section-wrapper__yBWWk, .Layout_footer-wrapper__bheJQ,
        .InlineBanner_inline-banner-wrapper__DAi5X, .banner-wrapper, #top-bar-wrapper,
        .Layout_sidebar-wrapper__unavM, .Layout_is-open__9DQr4 {
          display: none !important;
        }
        * {
          box-shadow: none !important;
          background: transparent !important;
          color: inherit !important;
        }
        .Viewer_document-wrapper__JPBWQ, .Viewer_document-wrapper__LXzoQ,
        .Viewer_document-wrapper__XsO4j, .page-content, .document-viewer, #page-container {
          position: static !important;
          display: block !important;
          width: 100% !important;
          max-width: none !important;
          margin: 0 !important;
          padding: 0 !important;
          box-sizing: border-box;
          transform: none !important;
        }
        [data-page], .page, .document-page, img {
          page-break-after: always !important;
          page-break-inside: avoid !important;
          page-break-before: avoid !important;
          width: 100% !important;
          max-width: 100% !important;
          height: auto !important;
          display: block !important;
          margin: 0 !important;
          padding: 0 !important;
        }
      }
    `;
    document.head.appendChild(style);
  });

  progressTracker?.updateProgress(88, 'styling', 'Print styles applied successfully');
};

// URL fragments of trackers / consent managers that are always aborted.
const BLOCKED_URL_FRAGMENTS = [
  'doubleclick', 'googletagmanager', 'facebook.com', 'twitter.com', 'analytics',
  'gtm', 'hotjar', 'mixpanel', 'onetrust', 'cookielaw',
];

// Resource types that are dropped unless they look document-related.
const STATIC_RESOURCE_TYPES = ['image', 'media', 'font', 'stylesheet'];

/**
 * Decides whether an intercepted request should be aborted.
 *
 * @param {string} resourceType - Value of `request.resourceType()`.
 * @param {string} reqUrl - Lower-cased request URL.
 * @returns {boolean} `true` to abort the request, `false` to let it continue.
 */
const shouldBlockRequest = (resourceType, reqUrl) => {
  if (resourceType === 'document') {
    return false;
  }
  // The navigation step waits for a Cloudflare challenge to complete, so the
  // challenge's own resources must not be caught by the third-party filters.
  if (reqUrl.includes('challenges.cloudflare.com') || reqUrl.includes('/cdn-cgi/challenge-platform/')) {
    return false;
  }

  const isStudocu = reqUrl.includes('studocu');
  if (
    STATIC_RESOURCE_TYPES.includes(resourceType) &&
    !reqUrl.includes('document') &&
    !reqUrl.includes('page') &&
    !isStudocu
  ) {
    return true;
  }
  if (resourceType === 'script' && !isStudocu) {
    return true;
  }
  if (BLOCKED_URL_FRAGMENTS.some((fragment) => reqUrl.includes(fragment))) {
    return true;
  }
  return resourceType === 'other' && reqUrl.includes('/track/');
};

/**
 * Navigates to the document and waits until `#search-input` is present,
 * which is used as the signal that the Cloudflare challenge was passed.
 * On failure a screenshot is written to the OS temp directory (best effort).
 *
 * @param {object} page - Puppeteer page.
 * @param {string} url - Document URL.
 * @param {ProgressTracker|null} progressTracker - Optional progress sink.
 * @returns {Promise<void>}
 * @throws {Error} When navigation fails or the selector never appears; the
 *   underlying error is attached as `cause`.
 */
const navigateToDocument = async (page, url, progressTracker) => {
  progressTracker?.updateProgress(5, 'navigating', 'Navigating to document...');
  console.log(`🛡️ Navigating to ${url} and preparing for Cloudflare challenge...`);

  try {
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 120000 });

    console.log("⏳ Waiting for Cloudflare challenge to be solved...");
    progressTracker?.updateProgress(8, 'solving_cf', 'Solving Cloudflare challenge...');
    await page.waitForSelector('#search-input', { timeout: 90000 });

    console.log("✅ Cloudflare challenge passed! You are on the Studocu page.");
    progressTracker?.updateProgress(10, 'navigation_complete', 'Successfully navigated to document');
  } catch (e) {
    console.error("❌ Failed to bypass Cloudflare or navigate to the page.", e.message);
    // The screenshot is a debugging aid only; if it fails too, the original
    // navigation error must still be the one that is reported.
    try {
      const screenshotPath = path.join(os.tmpdir(), `cloudflare_failure_${Date.now()}.png`);
      await page.screenshot({ path: screenshotPath, fullPage: true });
      console.log(`📸 Screenshot saved to ${screenshotPath}`);
    } catch (screenshotError) {
      console.error("Failed to capture failure screenshot:", screenshotError.message);
    }
    throw new Error(
      "Could not bypass Cloudflare. The site may be actively blocking, or the page structure changed.",
      { cause: e }
    );
  }
};

/**
 * Waits for the first of several candidate content selectors to appear
 * (10 s each, tried in order). Never throws when none match.
 *
 * @param {object} page - Puppeteer page.
 * @returns {Promise<void>}
 */
const waitForDocumentContent = async (page) => {
  const contentSelectors = [
    '.document-content', '.page-content', '[data-page]', '[data-testid*="document"]',
    'img[src*="document"]', 'img[src*="page"]', '.page', 'main img', 'article img',
  ];

  for (const selector of contentSelectors) {
    try {
      await page.waitForSelector(selector, { timeout: 10000 });
      console.log(`✅ Found content with selector: ${selector}`);
      return;
    } catch (e) {
      console.log(`❌ Selector ${selector} not found, trying next...`);
    }
  }
  console.log("⚠️ No specific content selector found, proceeding with page content...");
};

/**
 * Scrolls through the page in 600px steps until the body height stops
 * growing (or MAX_SCROLL_PASSES is reached), then scrolls back to the top.
 *
 * @param {object} page - Puppeteer page.
 * @returns {Promise<void>}
 */
const loadAllPages = async (page) => {
  await page.evaluate(async (maxPasses) => {
    const delay = (ms) => new Promise((res) => setTimeout(res, ms));
    const distance = 600;
    let scrollHeight = document.body.scrollHeight;

    for (let pass = 0; pass < maxPasses; pass += 1) {
      let totalHeight = 0;
      while (totalHeight < scrollHeight) {
        window.scrollBy(0, distance);
        totalHeight += distance;
        await delay(200);
      }
      await delay(1000);
      const newHeight = document.body.scrollHeight;
      if (newHeight === scrollHeight) break;
      scrollHeight = newHeight;
    }

    window.scrollTo({ top: 0, behavior: "smooth" });
    await delay(500);
  }, MAX_SCROLL_PASSES);
};

/**
 * Waits until every `<img>` in the page has loaded or errored, giving each
 * image at most five seconds.
 *
 * @param {object} page - Puppeteer page.
 * @returns {Promise<void>}
 */
const waitForImages = async (page) => {
  await page.evaluate(async () => {
    const images = Array.from(document.querySelectorAll('img'));
    await Promise.all(images.map((img) => {
      if (img.complete) return Promise.resolve();
      return new Promise((resolve) => {
        img.addEventListener('load', resolve);
        img.addEventListener('error', resolve);
        setTimeout(resolve, 5000);
      });
    }));
  });
};

/**
 * Renders a StuDocu document page to a PDF.
 *
 * Launches a browser with a throwaway profile directory, navigates to `url`,
 * strips banners and overlays, scrolls to trigger lazy loading, applies print
 * styles and prints the page. The browser and the profile directory are
 * always cleaned up.
 *
 * @param {string} url - Document URL.
 * @param {{email?: string, password?: string}} [options] - Optional credentials.
 * @param {ProgressTracker|null} [progressTracker] - Optional progress sink.
 * @returns {Promise<Buffer>} The value returned by `page.pdf()`.
 * @throws {Error} Any failure is reported to the tracker (progress -1) and rethrown.
 */
const studocuDownloader = async (url, options = {}, progressTracker = null) => {
  let browser;
  let userDataDir = null;

  // Easy flag for debugging. Set to true to see the browser window.
  const isDebugging = false;

  try {
    progressTracker?.updateProgress(0, 'initializing', 'Starting browser...');

    userDataDir = await fs.mkdtemp(path.join(os.tmpdir(), 'puppeteer-'));
    console.log(`📂 Created temporary user data directory: ${userDataDir}`);

    console.log("🚀 Launching browser with enhanced stealth configuration...");
    browser = await puppeteerExtra.launch({
      headless: !isDebugging,
      userDataDir,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-infobars',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        '--window-size=1920,1080',
      ],
      ignoreHTTPSErrors: true,
    });

    const page = await browser.newPage();

    progressTracker?.updateProgress(2, 'initializing', 'Configuring browser settings...');
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36');
    await page.setViewport({ width: 1920, height: 1080 });

    await page.setRequestInterception(true);
    page.on('request', (req) => {
      if (shouldBlockRequest(req.resourceType(), req.url().toLowerCase())) {
        req.abort();
      } else {
        req.continue();
      }
    });

    await navigateToDocument(page, url, progressTracker);

    // Cookies are bypassed *after* landing on the actual page.
    await bypassCookiesAndRestrictions(page, progressTracker);

    if (options?.email && options?.password) {
      progressTracker?.updateProgress(12, 'authenticating', 'Logging into StuDocu...');
      // ... (Login logic is unchanged)
    }

    progressTracker?.updateProgress(40, 'loading', 'Page loaded, waiting for content...');
    await new Promise((resolve) => setTimeout(resolve, 2000));

    await unblurContent(page, progressTracker);

    progressTracker?.updateProgress(45, 'loading', 'Waiting for document content...');
    console.log("⏳ Waiting for document content to load...");
    await waitForDocumentContent(page);

    progressTracker?.updateProgress(50, 'scrolling', 'Loading all document pages...');
    console.log("📜 Loading all document pages with enhanced slow scroll...");
    await loadAllPages(page);

    progressTracker?.updateProgress(70, 'processing', 'Processing loaded content...');
    await unblurContent(page, progressTracker);

    progressTracker?.updateProgress(75, 'loading_images', 'Loading images...');
    console.log("🖼️ Waiting for all images to load...");
    await waitForImages(page);

    await new Promise((resolve) => setTimeout(resolve, 2000));
    progressTracker?.updateProgress(80, 'finalizing', 'Preparing document for PDF generation...');

    await page.evaluate(() => {
      const getDocumentHeight = () => Math.max(
        document.body.scrollHeight, document.body.offsetHeight,
        document.documentElement.clientHeight, document.documentElement.scrollHeight,
        document.documentElement.offsetHeight
      );
      const height = getDocumentHeight();
      // NOTE(review): values ending in "!important" are rejected when assigned
      // through `style.<prop>`, so these three assignments currently have no
      // effect. They are left as-is on purpose: making them effective would
      // put important inline height/overflow on <body>, overriding the
      // `height: auto` / `overflow: visible` print rules. Confirm the intent
      // before changing.
      document.body.style.height = `${height}px !important`;
      document.documentElement.style.height = `${height}px !important`;
      document.body.style.overflow = 'hidden !important';
    });

    const contentCheck = await page.evaluate(() => {
      const textContent = document.body.textContent || '';
      const images = document.querySelectorAll('img');
      const documentImages = Array.from(images).filter((img) =>
        img.src.includes('document') || img.src.includes('page') ||
        img.alt.includes('document') || img.alt.includes('page')
      );
      return {
        totalText: textContent.length,
        totalImages: images.length,
        documentImages: documentImages.length,
        hasDocumentContent: documentImages.length > 0 || textContent.length > 1000,
      };
    });

    console.log("📊 Content verification:", {
      textLength: contentCheck.totalText,
      images: contentCheck.totalImages,
      documentImages: contentCheck.documentImages,
      hasContent: contentCheck.hasDocumentContent,
    });

    if (!contentCheck.hasDocumentContent) {
      console.warn("⚠️ Warning: Limited document content detected.");
    }

    await applyPrintStyles(page, progressTracker);
    await page.emulateMediaType('print');

    progressTracker?.updateProgress(90, 'generating', 'Generating PDF...');
    console.log("🔄 Generating PDF...");

    const pdfBuffer = await page.pdf({
      printBackground: true,
      preferCSSPageSize: true,
      displayHeaderFooter: false,
      timeout: 60000,
      scale: 1,
      omitBackground: false,
    });

    progressTracker?.updateProgress(100, 'completed', 'PDF generated successfully!');
    console.log(`✅ PDF generated successfully! Size: ${(pdfBuffer.length / 1024 / 1024).toFixed(2)} MB`);
    return pdfBuffer;
  } catch (error) {
    progressTracker?.updateProgress(-1, 'error', error.message);
    console.error("❌ Error during PDF generation:", error);
    throw error;
  } finally {
    if (browser) {
      console.log("🔒 Closing browser...");
      try {
        await browser.close();
      } catch (e) {
        console.log("Error closing browser:", e.message);
      }
    }
    if (userDataDir) {
      console.log(`🗑️ Cleaning up temporary directory: ${userDataDir}`);
      try {
        await fs.rm(userDataDir, { recursive: true, force: true });
        console.log("✅ Temporary directory cleaned up.");
      } catch (e) {
        console.error(`❌ Failed to clean up temporary directory ${userDataDir}:`, e.message);
      }
    }
  }
};

// --- API Routes, Health, and Info Endpoints ---

/**
 * Checks that a request-supplied value is an http(s) URL whose host is
 * studocu.com or one of its subdomains.
 *
 * @param {unknown} value - Untrusted value from the request body.
 * @returns {boolean}
 */
const isStudocuUrl = (value) => {
  if (typeof value !== 'string') {
    return false;
  }
  let parsed;
  try {
    parsed = new URL(value);
  } catch {
    return false;
  }
  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
    return false;
  }
  const host = parsed.hostname.toLowerCase();
  return host === 'studocu.com' || host.endsWith('.studocu.com');
};

/**
 * Drops a finished job after JOB_TTL_MS so PDF buffers do not accumulate.
 * The timer is unref'd so it never keeps the process alive.
 *
 * @param {string} sessionId - Job to expire.
 */
const scheduleJobExpiry = (sessionId) => {
  const timer = setTimeout(() => downloadJobs.delete(sessionId), JOB_TTL_MS);
  timer.unref();
};

app.post('/api/request-download', (req, res) => {
  const { url, email, password } = req.body ?? {};
  if (!isStudocuUrl(url)) {
    return res.status(400).json({ error: 'Please provide a valid StuDocu URL.' });
  }

  // Random ids cannot collide for requests arriving in the same millisecond
  // and cannot be guessed by other clients.
  const sessionId = crypto.randomUUID();
  const progressTracker = new ProgressTracker(sessionId);
  progressTrackers.set(sessionId, progressTracker);
  downloadJobs.set(sessionId, { status: 'processing' });

  console.log(`🎯 Processing request for: ${url} [Session: ${sessionId}]`);
  res.json({ sessionId });

  // The job runs in the background; the client polls for its outcome.
  studocuDownloader(url, { email, password }, progressTracker)
    .then((pdfBuffer) => {
      downloadJobs.set(sessionId, { status: 'completed', buffer: pdfBuffer });
    })
    .catch((error) => {
      downloadJobs.set(sessionId, { status: 'error', message: error.message });
    })
    .finally(() => {
      progressTrackers.delete(sessionId);
      scheduleJobExpiry(sessionId);
    });
});

app.get('/api/progress/:sessionId', (req, res) => {
  const { sessionId } = req.params;

  const tracker = progressTrackers.get(sessionId);
  if (tracker) {
    return res.json({
      sessionId,
      progress: tracker.progress,
      status: tracker.status,
      message: tracker.message,
      timestamp: new Date().toISOString(),
    });
  }

  // The tracker is removed once the job settles; fall back to the job record.
  const job = downloadJobs.get(sessionId);
  if (job?.status === 'completed') {
    return res.json({ sessionId, progress: 100, status: 'completed', message: 'PDF generated successfully!' });
  }
  if (job?.status === 'error') {
    return res.json({ sessionId, progress: -1, status: 'error', message: job.message });
  }
  return res.status(404).json({ error: 'Session not found' });
});

app.get('/api/download/:sessionId', (req, res) => {
  const { sessionId } = req.params;
  const job = downloadJobs.get(sessionId);

  if (!job) {
    return res.status(404).json({ error: 'Download session not found or expired.' });
  }
  if (job.status === 'processing') {
    return res.status(400).json({ error: 'Download is still processing.' });
  }
  if (job.status === 'error') {
    return res.status(500).json({ error: `Failed to generate PDF: ${job.message}` });
  }
  if (job.status === 'completed' && job.buffer) {
    res.setHeader('Content-Type', 'application/pdf');
    res.setHeader('Content-Disposition', 'attachment; filename=studocu-document.pdf');
    res.send(job.buffer);
  } else {
    res.status(500).json({ error: 'An unknown error occurred.' });
  }
});

app.get('/health', (req, res) => {
  res.json({
    status: 'healthy',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
    activeDownloads: progressTrackers.size,
  });
});

app.get('/', (req, res) => {
  res.json({
    message: '🚀 Enhanced StuDocu Downloader API v5.3 - Real-time Progress Tracking with Cloudflare Bypass',
    version: '5.3.0',
    features: [
      '🛡️ Cloudflare JS Challenge Bypass',
      '🍪 Advanced cookie banner bypass',
      '🔓 Premium content unblurring',
      '🔑 Login support for full access',
      '📊 Real-time progress tracking via polling',
      '📄 Clean PDF generation with print styles',
      '🕵️ Enhanced stealth to evade bot detection',
    ],
    endpoints: {
      request: 'POST /api/request-download (body: {url, filename?, email?, password?})',
      progress: 'GET /api/progress/:sessionId',
      download: 'GET /api/download/:sessionId',
      health: 'GET /health',
    },
  });
});

process.on('SIGTERM', () => {
  console.log('SIGTERM received, shutting down gracefully...');
  process.exit(0);
});

process.on('SIGINT', () => {
  console.log('SIGINT received, shutting down gracefully...');
  process.exit(0);
});

app.listen(port, () => {
  console.log(`🚀 Enhanced StuDocu Downloader v5.3.0 running on http://localhost:${port}`);
  console.log(`✨ Features: Cloudflare Bypass, Real-time progress tracking, enhanced stealth, and user feedback`);
});