const { randomUUID } = require('crypto');
const { EventEmitter } = require('events');

const cors = require('cors');
const express = require('express');
const puppeteer = require('puppeteer');

const app = express();
const port = 7860;

// How long a failed/abandoned tracker stays queryable after its job ends.
const TRACKER_TTL_MS = 5 * 60 * 1000;
// How long a finished tracker (which holds the whole PDF in memory) is kept.
const RESULT_TTL_MS = 15 * 60 * 1000;
// Upper bound on waiting for open connections to drain during shutdown.
const SHUTDOWN_GRACE_MS = 5000;

app.use(cors());
app.use(express.json());

/**
 * Progress tracking system.
 *
 * Holds the latest progress state of one download job and emits a
 * 'progress' event (payload: see {@link ProgressTracker#snapshot}) on every
 * update so SSE listeners can forward it.
 */
class ProgressTracker extends EventEmitter {
  /**
   * @param {string} sessionId - Key under which this tracker is registered.
   */
  constructor(sessionId) {
    super();
    this.sessionId = sessionId;
    this.progress = 0;
    this.status = 'initializing';
    this.message = '';
    // Filled in by the download route once the job has produced a PDF.
    this.pdfBuffer = null;
  }

  /**
   * Current state in the shape sent to clients by both progress endpoints.
   * @returns {{sessionId: string, progress: number, status: string, message: string, timestamp: string}}
   */
  snapshot() {
    return {
      sessionId: this.sessionId,
      progress: this.progress,
      status: this.status,
      message: this.message,
      timestamp: new Date().toISOString(),
    };
  }

  /**
   * @returns {boolean} true once the job reported 'completed' or 'error'.
   */
  isFinished() {
    return this.status === 'completed' || this.status === 'error';
  }

  /**
   * Record a new state, notify listeners and log it.
   * @param {number} progress - Percentage; -1 is used for errors.
   * @param {string} status - Machine-readable phase name.
   * @param {string} message - Human-readable description.
   */
  updateProgress(progress, status, message) {
    this.progress = progress;
    this.status = status;
    this.message = message;

    // Emit the 'progress' event for SSE listeners
    this.emit('progress', this.snapshot());
    console.log(`📊 [${this.sessionId}] ${progress}% - ${status}: ${message}`);
  }
}

// Store active progress trackers, keyed by session id (always a string).
const progressTrackers = new Map();

/**
 * Advanced cookie banner and content bypass for StuDocu.
 *
 * Sets consent cookies on the page, injects a stylesheet and registers a
 * script to run on every new document.
 *
 * @param {object} page - Puppeteer page.
 * @param {ProgressTracker|null} [progressTracker]
 * @returns {Promise<boolean>} always resolves to true.
 */
const bypassCookiesAndRestrictions = async (page, progressTracker) => {
  progressTracker?.updateProgress(5, 'bypassing', 'Setting up cookie bypass...');
  console.log("🍪 Starting comprehensive cookie and restriction bypass...");

  // Step 1: Set cookies before page load
  const preCookies = [
    { name: 'cookieConsent', value: 'accepted', domain: '.studocu.com' },
    { name: 'cookie_consent', value: 'true', domain: '.studocu.com' },
    { name: 'gdpr_consent', value: 'accepted', domain: '.studocu.com' },
    { name: 'privacy_policy_accepted', value: 'true', domain: '.studocu.com' },
    { name: 'user_consent', value: '1', domain: '.studocu.com' },
    { name: 'analytics_consent', value: 'false', domain: '.studocu.com' },
    { name: 'marketing_consent', value: 'false', domain: '.studocu.com' },
    { name: 'functional_consent', value: 'true', domain: '.studocu.com' },
  ];

  // Best effort: one rejected cookie must not abort the whole job.
  for (const cookie of preCookies) {
    try {
      await page.setCookie(cookie);
    } catch (e) {
      console.log(`Failed to set cookie ${cookie.name}:`, e.message);
    }
  }

  // Step 2: Inject CSS to hide cookie banners immediately
  // NOTE(review): studocuDownloader calls this before the first page.goto(),
  // so the tag is added to the initial blank document and presumably does not
  // survive navigation - confirm whether this step has any effect.
  // NOTE(review): `:has-text()` is not a standard CSS pseudo-class; verify the
  // browser does not discard the entire first rule because of it.
  await page.addStyleTag({
    content: `
      /* Hide all possible cookie banners */
      [id*="cookie" i]:not(img):not(input),
      [class*="cookie" i]:not(img):not(input),
      [data-testid*="cookie" i],
      [aria-label*="cookie" i],
      .gdpr-banner, .gdpr-popup, .gdpr-modal,
      .consent-banner, .consent-popup, .consent-modal,
      .privacy-banner, .privacy-popup, .privacy-modal,
      .cookie-law, .cookie-policy, .cookie-compliance,
      .onetrust-banner-sdk, #onetrust-consent-sdk,
      .cmp-banner, .cmp-popup, .cmp-modal,
      [class*="CookieBanner"], [class*="CookieNotice"],
      [class*="ConsentBanner"], [class*="ConsentManager"],
      .cc-banner, .cc-window, .cc-compliance,
      div[style*="position: fixed"]:has-text("cookie"),
      div[style*="position: fixed"]:has-text("consent"),
      .fixed:has-text("cookie"), .fixed:has-text("consent") {
        display: none !important;
        visibility: hidden !important;
        opacity: 0 !important;
        z-index: -9999 !important;
        pointer-events: none !important;
      }

      /* Remove blur and premium overlays */
      [class*="blur" i], [class*="premium" i], [class*="paywall" i], [class*="sample-preview-blur" i] {
        filter: none !important;
        backdrop-filter: none !important;
        opacity: 1 !important;
        visibility: visible !important;
      }

      /* Ensure document content is visible */
      .document-content, .page-content, [data-page] {
        filter: none !important;
        opacity: 1 !important;
        visibility: visible !important;
        pointer-events: auto !important;
      }

      /* Remove fixed overlays */
      .fixed-overlay, .sticky-overlay, .content-overlay {
        display: none !important;
      }

      /* Restore scrolling */
      html, body {
        overflow: auto !important;
        position: static !important;
      }
    `,
  });

  // Step 3: Inject JavaScript to handle dynamic cookie banners
  // NOTE(review): scripts registered with evaluateOnNewDocument run before the
  // page's own scripts; document.body is likely still null at that point, in
  // which case observe() below throws and the interval is never installed -
  // confirm. Behaviour is deliberately left unchanged here because enabling
  // the text-based removal could delete real document containers.
  await page.evaluateOnNewDocument(() => {
    // Override common cookie consent functions
    window.cookieConsent = { accepted: true };
    window.gtag = () => { };
    window.ga = () => { };
    window.dataLayer = [];

    // Mutation observer to catch dynamically added cookie banners
    const observer = new MutationObserver((mutations) => {
      mutations.forEach((mutation) => {
        mutation.addedNodes.forEach((node) => {
          if (node.nodeType === 1) { // Element node
            const element = node;
            const text = element.textContent || '';
            // NOTE(review): for SVG elements className is not a string, so
            // the toLowerCase() calls on it below would throw - confirm.
            const className = element.className || '';
            const id = element.id || '';

            // Check if this looks like a cookie banner
            if (
              text.toLowerCase().includes('cookie') ||
              text.toLowerCase().includes('consent') ||
              text.toLowerCase().includes('privacy policy') ||
              className.toLowerCase().includes('cookie') ||
              className.toLowerCase().includes('consent') ||
              className.toLowerCase().includes('gdpr') ||
              id.toLowerCase().includes('cookie') ||
              id.toLowerCase().includes('consent')
            ) {
              console.log('Removing detected cookie banner:', element);
              element.remove();
            }
          }
        });
      });
    });

    observer.observe(document.body, { childList: true, subtree: true });

    // Set up periodic cleanup
    setInterval(() => {
      const cookieElements = document.querySelectorAll(`
        [id*="cookie" i]:not(img):not(input),
        [class*="cookie" i]:not(img):not(input),
        [data-testid*="cookie" i],
        .gdpr-banner, .consent-banner, .privacy-banner,
        .onetrust-banner-sdk, #onetrust-consent-sdk,
        .cmp-banner, .cc-banner
      `);
      cookieElements.forEach(el => el.remove());

      // Restore body scroll
      document.body.style.overflow = 'auto';
      document.documentElement.style.overflow = 'auto';
    }, 1000);
  });

  progressTracker?.updateProgress(10, 'bypassing', 'Cookie bypass configured successfully');
  return true;
};

/**
 * Enhanced content unblurring and premium bypass.
 *
 * Runs a clean-up pass inside the page immediately and then every 2 s for
 * 60 s.
 *
 * @param {object} page - Puppeteer page.
 * @param {ProgressTracker|null} [progressTracker]
 * @returns {Promise<void>}
 */
const unblurContent = async (page, progressTracker) => {
  progressTracker?.updateProgress(15, 'unblurring', 'Removing content restrictions...');
  console.log("🔓 Unblurring content and bypassing premium restrictions...");

  await page.evaluate(() => {
    const removeRestrictions = () => {
      const removeBySelector = (selector) => {
        document.querySelectorAll(selector).forEach(el => el.remove());
      };

      removeBySelector("#adbox, .adsbox, .ad-box, .banner-ads, .advert");
      removeBySelector(".PremiumBannerBlobWrapper_overflow-wrapper__xsaS8");

      const removeBlur = (element = document) => {
        element.querySelectorAll("*").forEach(el => {
          const style = window.getComputedStyle(el);
          if (
            style.filter?.includes("blur") ||
            style.backdropFilter?.includes("blur") ||
            parseFloat(style.opacity) < 1 ||
            (el.className && el.className.toString().toLowerCase().includes("blur")) ||
            (el.className && el.className.toString().toLowerCase().includes("premium"))
          ) {
            // NOTE(review): a value containing "!important" is not valid for
            // a style property assignment, so these three writes are
            // presumably ignored; style.setProperty(name, value, 'important')
            // is the working form (used further down). Left as-is because
            // activating the opacity override would also reveal elements that
            // are intentionally transparent - decide before changing.
            el.style.filter = "none !important";
            el.style.backdropFilter = "none !important";
            el.style.opacity = "1 !important";
            if (el.classList) {
              el.classList.remove("blur", "blurred", "premium-blur");
            }
          }
        });
      };

      removeBlur();
      removeBySelector('[class*="blur" i], [class*="premium" i], [class*="paywall" i]');

      const contentSelectors = [
        '.document-content', '.page-content', '.content', '[data-page]',
        '[data-testid*="document"]', '[data-testid*="page"]',
        '.page', '.document-page', 'main', 'article',
      ];

      contentSelectors.forEach(selector => {
        document.querySelectorAll(selector).forEach(el => {
          el.style.setProperty('filter', 'none', 'important');
          el.style.setProperty('opacity', '1', 'important');
          el.style.setProperty('visibility', 'visible', 'important');
          el.style.setProperty('display', 'block', 'important');
          el.style.setProperty('pointer-events', 'auto', 'important');
        });
      });
    };

    removeRestrictions();
    // Keep cleaning for a minute to catch content that is added later.
    const intervalId = setInterval(removeRestrictions, 2000);
    setTimeout(() => clearInterval(intervalId), 60000);
  });

  progressTracker?.updateProgress(20, 'unblurring', 'Content restrictions removed');
};

/**
 * Apply print styles for clean PDF output.
 *
 * Appends a <style id="print-style-extension"> element to the page head.
 *
 * @param {object} page - Puppeteer page.
 * @param {ProgressTracker|null} [progressTracker]
 * @returns {Promise<void>}
 */
const applyPrintStyles = async (page, progressTracker) => {
  progressTracker?.updateProgress(85, 'styling', 'Applying print styles...');
  console.log("🖨️ Applying print styles for clean PDF...");

  await page.evaluate(() => {
    const style = document.createElement("style");
    style.id = "print-style-extension";
    style.innerHTML = `
      @page {
        size: A4 portrait;
        margin: 5mm;
      }
      @media print {
        html, body {
          margin: 0 !important;
          padding: 0 !important;
          overflow: visible !important;
        }
        header, footer, nav, aside, .no-print, .ads, .sidebar, .premium-banner,
        .ViewerToolbar, .Layout_info-bar-wrapper__He0Ho, .Sidebar_sidebar-scrollable__kqeBZ,
        .HeaderWrapper_header-wrapper__mCmf3, .Layout_visible-content-bottom-wrapper-sticky__yaaAB,
        .Layout_bottom-section-wrapper__yBWWk, .Layout_footer-wrapper__bheJQ,
        .InlineBanner_inline-banner-wrapper__DAi5X, .banner-wrapper, #top-bar-wrapper,
        .Layout_sidebar-wrapper__unavM, .Layout_is-open__9DQr4 {
          display: none !important;
        }
        body {
          background: white !important;
          color: black !important;
        }
        * {
          box-shadow: none !important;
          background: transparent !important;
        }
        .Viewer_document-wrapper__JPBWQ, .Viewer_document-wrapper__LXzoQ,
        .Viewer_document-wrapper__XsO4j, .page-content {
          display: flex !important;
          flex-direction: column !important;
          width: 100% !important;
          max-width: 210mm !important;
          margin: 0 auto !important;
        }
        [data-page], .page, .document-page, img {
          page-break-after: always !important;
          page-break-inside: avoid !important;
          page-break-before: avoid !important;
          width: 100% !important;
          max-width: 100% !important;
          height: auto !important;
        }
      }
    `;
    document.head.appendChild(style);
  });

  progressTracker?.updateProgress(88, 'styling', 'Print styles applied successfully');
};

// Chromium command-line switches used for every launch.
const BROWSER_ARGS = [
  '--no-sandbox',
  '--disable-setuid-sandbox',
  '--disable-dev-shm-usage',
  '--disable-accelerated-2d-canvas',
  '--no-first-run',
  '--no-zygote',
  '--disable-gpu',
  // One combined list: the switch used to be passed twice with one value
  // each, which relies on how Chromium resolves a repeated switch.
  '--disable-features=VizDisplayCompositor,site-per-process',
  '--disable-background-networking',
  '--disable-background-timer-throttling',
  '--disable-renderer-backgrounding',
  '--disable-backgrounding-occluded-windows',
  '--disable-ipc-flooding-protection',
  // NOTE(review): disables the same-origin policy for whatever the browser
  // loads; only acceptable because the download route restricts target URLs.
  '--disable-web-security',
  '--disable-blink-features=AutomationControlled',
  '--disable-extensions',
];

// Requests whose URL contains any of these fragments are aborted.
const BLOCKED_URL_FRAGMENTS = [
  'doubleclick',
  'googletagmanager',
  'facebook.com',
  'twitter.com',
  'analytics',
  'gtm',
  'hotjar',
  'mixpanel',
  'onetrust',
  'cookielaw',
];

const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Enhanced StuDocu downloader with progress tracking.
 *
 * Launches a headless browser, optionally logs in, loads `url`, scrolls
 * through it, and renders it to PDF. The browser is always closed before the
 * returned promise settles.
 *
 * @param {string} url - Page to render.
 * @param {{email?: string, password?: string, filename?: string}} [options]
 *   Login is attempted only when both email and password are given.
 *   `filename` is accepted but not used by this function.
 * @param {ProgressTracker|null} [progressTracker] - Receives progress updates;
 *   on failure it receives a single (-1, 'error', message) update.
 * @returns {Promise<Buffer|Uint8Array>} PDF bytes as returned by page.pdf().
 * @throws {Error} Any navigation, login or rendering failure (re-thrown).
 */
const studocuDownloader = async (url, options = {}, progressTracker = null) => {
  let browser;
  try {
    progressTracker?.updateProgress(0, 'initializing', 'Starting browser...');
    console.log("🚀 Launching browser with stealth configuration...");

    browser = await puppeteer.launch({
      headless: true,
      args: BROWSER_ARGS,
      timeout: 300000,
    });

    const page = await browser.newPage();

    progressTracker?.updateProgress(2, 'initializing', 'Configuring browser settings...');

    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
    // 794 x 1122 px - presumably A4 at 96 dpi, matching the print styles.
    await page.setViewport({ width: 794, height: 1122 });

    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
      Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
      Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
    });

    // Set up cookie and content bypass
    await bypassCookiesAndRestrictions(page, progressTracker);

    // Block unnecessary resources
    await page.setRequestInterception(true);
    page.on('request', (req) => {
      const reqUrl = req.url();
      const isBlocked =
        BLOCKED_URL_FRAGMENTS.some((fragment) => reqUrl.includes(fragment)) ||
        (req.resourceType() === 'other' && reqUrl.includes('track'));

      if (isBlocked) {
        req.abort();
      } else {
        req.continue();
      }
    });

    // Login if credentials provided
    if (options.email && options.password) {
      progressTracker?.updateProgress(12, 'authenticating', 'Logging into StuDocu...');
      console.log("🔑 Logging in to StuDocu...");

      await page.goto('https://www.studocu.com/en-us/login', { waitUntil: 'domcontentloaded', timeout: 60000 });
      await page.waitForSelector('#email', { timeout: 15000 });
      await page.type('#email', options.email);
      await page.type('#password', options.password);
      await page.click('button[type="submit"]');

      try {
        await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 30000 });
        await page.waitForSelector('.user-profile, [data-testid="user-menu"]', { timeout: 10000 });
        console.log("✅ Login successful.");
        progressTracker?.updateProgress(18, 'authenticated', 'Login successful');
      } catch (e) {
        console.error("❌ Login failed:", e.message);
        throw new Error("Login failed. Check credentials or try again.");
      }
    }

    progressTracker?.updateProgress(25, 'navigating', 'Navigating to document...');
    console.log(`📄 Navigating to ${url}...`);

    // Retry navigation up to maxAttempts times, pausing 5 s between attempts;
    // the last failure is re-thrown.
    let navigationSuccess = false;
    let attempts = 0;
    const maxAttempts = 3;
    while (!navigationSuccess && attempts < maxAttempts) {
      try {
        attempts++;
        progressTracker?.updateProgress(25 + (attempts * 5), 'navigating', `Navigation attempt ${attempts}/${maxAttempts}`);
        console.log(`Navigation attempt ${attempts}/${maxAttempts}`);
        await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });
        navigationSuccess = true;
      } catch (e) {
        console.log(`Navigation attempt ${attempts} failed:`, e.message);
        if (attempts >= maxAttempts) throw e;
        await sleep(5000);
      }
    }

    progressTracker?.updateProgress(40, 'loading', 'Page loaded, waiting for content...');
    await sleep(5000);

    // Apply content unblurring
    await unblurContent(page, progressTracker);

    // Wait for document content
    progressTracker?.updateProgress(45, 'loading', 'Waiting for document content...');
    console.log("⏳ Waiting for document content to load...");

    const contentSelectors = [
      '.document-content', '.page-content', '[data-page]', '[data-testid*="document"]',
      'img[src*="document"]', 'img[src*="page"]', '.page', 'main img', 'article img',
    ];

    // Selectors are tried one after another (20 s each); the first hit wins.
    let contentFound = false;
    for (const selector of contentSelectors) {
      try {
        await page.waitForSelector(selector, { timeout: 20000 });
        console.log(`✅ Found content with selector: ${selector}`);
        contentFound = true;
        break;
      } catch {
        console.log(`❌ Selector ${selector} not found, trying next...`);
      }
    }

    if (!contentFound) {
      console.log("⚠️ No specific content selector found, proceeding with page content...");
    }

    // Enhanced scrolling to load all content
    progressTracker?.updateProgress(50, 'scrolling', 'Loading all document pages...');
    console.log("📜 Loading all document pages with enhanced slow scroll...");

    await page.evaluate(async () => {
      const delay = (ms) => new Promise((res) => setTimeout(res, ms));
      // Safety net: a page that keeps growing must not keep this loop (and
      // the browser) alive forever.
      const maxPasses = 25;
      const distance = 300;
      let scrollHeight = document.body.scrollHeight;

      for (let pass = 0; pass < maxPasses; pass++) {
        let totalHeight = 0;
        while (totalHeight < scrollHeight) {
          window.scrollBy(0, distance);
          totalHeight += distance;
          await delay(500);
        }

        await delay(2000);
        const newHeight = document.body.scrollHeight;
        // Stop once a full pass no longer makes the page taller.
        if (newHeight === scrollHeight) break;
        scrollHeight = newHeight;
      }

      window.scrollTo({ top: 0, behavior: "smooth" });
      await delay(1000);
    });

    progressTracker?.updateProgress(70, 'processing', 'Processing loaded content...');

    // Re-apply unblur after loading new content
    await unblurContent(page, progressTracker);

    // Wait for all images to load
    progressTracker?.updateProgress(75, 'loading_images', 'Loading images...');
    console.log("🖼️ Waiting for all images to load...");

    await page.evaluate(async () => {
      const images = Array.from(document.querySelectorAll('img'));
      await Promise.all(images.map(img => {
        if (img.complete) return Promise.resolve();
        return new Promise((resolve) => {
          img.addEventListener('load', resolve);
          img.addEventListener('error', resolve);
          // Give up on a single image after 15 s.
          setTimeout(resolve, 15000);
        });
      }));
    });

    await sleep(5000);
    progressTracker?.updateProgress(80, 'finalizing', 'Preparing document for PDF generation...');

    // Set exact height
    // NOTE(review): the "!important" suffix makes these three values invalid
    // for a style property assignment, so they are presumably ignored. Left
    // unchanged on purpose: actually forcing a fixed height plus
    // overflow:hidden could clip the print layout - verify before "fixing".
    await page.evaluate(() => {
      const getDocumentHeight = () => Math.max(
        document.body.scrollHeight, document.body.offsetHeight,
        document.documentElement.clientHeight, document.documentElement.scrollHeight,
        document.documentElement.offsetHeight
      );

      const height = getDocumentHeight();
      document.body.style.height = `${height}px !important`;
      document.documentElement.style.height = `${height}px !important`;
      document.body.style.overflow = 'hidden !important';
    });

    // Content verification
    const contentCheck = await page.evaluate(() => {
      const textContent = document.body.textContent || '';
      const images = document.querySelectorAll('img');
      const documentImages = Array.from(images).filter(img =>
        img.src.includes('document') || img.src.includes('page') ||
        img.alt.includes('document') || img.alt.includes('page')
      );

      return {
        totalText: textContent.length,
        totalImages: images.length,
        documentImages: documentImages.length,
        hasDocumentContent: documentImages.length > 0 || textContent.length > 1000,
      };
    });

    console.log("📊 Content verification:", {
      textLength: contentCheck.totalText,
      images: contentCheck.totalImages,
      documentImages: contentCheck.documentImages,
      hasContent: contentCheck.hasDocumentContent,
    });

    if (!contentCheck.hasDocumentContent) {
      console.warn("⚠️ Warning: Limited document content detected.");
    }

    // Apply print styles and generate PDF
    await applyPrintStyles(page, progressTracker);
    await page.emulateMediaType('print');

    progressTracker?.updateProgress(90, 'generating', 'Generating PDF...');
    console.log("🔄 Generating PDF...");

    const pdfBuffer = await page.pdf({
      printBackground: true,
      preferCSSPageSize: true,
      displayHeaderFooter: false,
      timeout: 180000,
      scale: 1,
      omitBackground: false,
    });

    progressTracker?.updateProgress(100, 'completed', 'PDF generated successfully!');
    console.log(`✅ PDF generated successfully! Size: ${(pdfBuffer.length / 1024 / 1024).toFixed(2)} MB`);

    return pdfBuffer;
  } catch (error) {
    // This is the only place that publishes the 'error' progress event.
    progressTracker?.updateProgress(-1, 'error', error?.message || 'An unknown error occurred.');
    console.error("❌ Error during PDF generation:", error);
    throw error;
  } finally {
    if (browser) {
      console.log("🔒 Closing browser...");
      try {
        await browser.close();
      } catch (e) {
        console.log("Error closing browser:", e.message);
      }
    }
  }
};

/**
 * Validate and normalise a user-supplied StuDocu URL.
 *
 * Only http(s) URLs whose host is studocu.com or one of its subdomains are
 * accepted. A substring test is not enough: it would also accept e.g.
 * `file:///etc/passwd#studocu.com` or `http://example.org/?studocu.com`,
 * which the headless browser would then open.
 *
 * @param {unknown} rawUrl - Value taken from the request body.
 * @returns {string|null} Normalised absolute URL, or null when not acceptable.
 */
const parseStudocuUrl = (rawUrl) => {
  if (typeof rawUrl !== 'string') return null;

  const trimmed = rawUrl.trim();
  if (trimmed === '') return null;

  // Inputs without a scheme ("www.studocu.com/...") default to https.
  const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed);
  const candidate = hasScheme ? trimmed : `https://${trimmed}`;

  let parsed;
  try {
    parsed = new URL(candidate);
  } catch {
    return null;
  }

  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return null;

  const host = parsed.hostname.toLowerCase();
  if (host !== 'studocu.com' && !host.endsWith('.studocu.com')) return null;

  return parsed.href;
};

/**
 * Remove a tracker (and release its PDF) once its retention period is over.
 *
 * @param {string} sessionId
 * @param {ProgressTracker} tracker
 */
const scheduleTrackerEviction = (sessionId, tracker) => {
  const ttl = tracker.pdfBuffer ? RESULT_TTL_MS : TRACKER_TTL_MS;
  const timer = setTimeout(() => {
    // The id may have been reused by a newer job in the meantime; only remove
    // the entry if it still belongs to this job.
    if (progressTrackers.get(sessionId) === tracker) {
      progressTrackers.delete(sessionId);
    }
    tracker.pdfBuffer = null;
  }, ttl);
  // A pending eviction should not by itself keep the process alive.
  timer.unref();
};

/**
 * Run one download job in the background. Never rejects.
 *
 * @param {string} sessionId
 * @param {string} targetUrl - Already validated URL.
 * @param {{filename?: string, email?: string, password?: string}} options
 * @param {ProgressTracker} tracker
 * @returns {Promise<void>}
 */
const runDownloadJob = async (sessionId, targetUrl, options, tracker) => {
  try {
    // Store the result for the user to download later or handle as needed.
    // NOTE(review): no route in this file serves tracker.pdfBuffer yet.
    tracker.pdfBuffer = await studocuDownloader(targetUrl, options, tracker);
    console.log(`🎉 PDF is ready for download [Session: ${sessionId}]`);
  } catch (error) {
    // studocuDownloader has already published the 'error' progress event;
    // emitting it again here would deliver it to clients twice.
    console.error(`❌ Failed to process ${targetUrl}:`, error?.message);
  } finally {
    scheduleTrackerEviction(sessionId, tracker);
  }
};

// API Routes

// Enhanced download endpoint with progress tracking.
// Responds 202 immediately; the job keeps running in the background.
app.post('/api/download', (req, res) => {
  // The client may send its own sessionId in the body; otherwise one is
  // generated.
  const { url, filename, email, password, sessionId: reqSessionId } = req.body ?? {};

  if (!url) {
    return res.status(400).json({ error: 'URL is required.' });
  }

  const normalizedUrl = parseStudocuUrl(url);
  if (!normalizedUrl) {
    return res.status(400).json({ error: 'Please provide a valid StuDocu URL.' });
  }

  // Always a string, so it matches the :sessionId route parameter later.
  const sessionId = reqSessionId ? String(reqSessionId) : randomUUID();

  // Replacing a tracker that is still running would cut its listeners off
  // from the job they subscribed to.
  const existing = progressTrackers.get(sessionId);
  if (existing && !existing.isFinished()) {
    return res.status(409).json({ error: 'A download with this session ID is already in progress.' });
  }

  const progressTracker = new ProgressTracker(sessionId);
  progressTrackers.set(sessionId, progressTracker);

  console.log(`🎯 Processing request for: ${normalizedUrl} [Session: ${sessionId}]`);

  // Deliberately not awaited: runDownloadJob handles its own errors.
  void runDownloadJob(sessionId, normalizedUrl, { filename, email, password }, progressTracker);

  // Immediately respond to the client so it can start listening to the progress stream.
  return res.status(202).json({
    message: "Download process started.",
    sessionId: sessionId,
  });
});

// Server-Sent Events (SSE) endpoint for real-time progress
app.get('/api/progress-stream/:sessionId', (req, res) => {
  const { sessionId } = req.params;
  const tracker = progressTrackers.get(sessionId);

  if (!tracker) {
    return res.status(404).json({ error: 'Session not found' });
  }

  // Set headers for SSE
  res.setHeader('Content-Type', 'text/event-stream');
  res.setHeader('Cache-Control', 'no-cache');
  res.setHeader('Connection', 'keep-alive');
  res.flushHeaders(); // Flush the headers to establish the connection

  // The function that sends data to the client
  const sendProgress = (data) => {
    res.write(`data: ${JSON.stringify(data)}\n\n`);
  };

  // Send the current state right away: a client that connects after the last
  // update (e.g. after completion) would otherwise never receive anything.
  sendProgress(tracker.snapshot());

  // Attach the listener to the specific tracker instance
  tracker.on('progress', sendProgress);

  // Handle client disconnect
  req.on('close', () => {
    // Remove the listener for this specific client
    tracker.removeListener('progress', sendProgress);
    console.log(`🔌 Client disconnected for session: ${sessionId}`);
  });
});

// Polling endpoint (can be kept for debugging or removed)
app.get('/api/progress/:sessionId', (req, res) => {
  const { sessionId } = req.params;
  const tracker = progressTrackers.get(sessionId);

  if (!tracker) {
    return res.status(404).json({ error: 'Session not found' });
  }

  res.json(tracker.snapshot());
});

// Health and info endpoints
app.get('/health', (req, res) => {
  res.json({
    status: 'healthy',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
    // Counts every registered tracker, including finished ones that have not
    // been evicted yet.
    activeDownloads: progressTrackers.size,
  });
});

app.get('/', (req, res) => {
  res.json({
    message: '🚀 Enhanced StuDocu Downloader API v5.0 - Real-time Progress Tracking',
    version: '5.0.0',
    features: [
      '🍪 Advanced cookie banner bypass',
      '🔓 Premium content unblurring',
      '🔑 Login support for full access',
      '📊 Real-time progress tracking via SSE or polling',
      '📄 Clean PDF generation with print styles',
    ],
    endpoints: {
      download: 'POST /api/download (body: {url, filename?, email?, password?, sessionId?})',
      progressStream: 'GET /api/progress-stream/:sessionId',
      progress: 'GET /api/progress/:sessionId',
      health: 'GET /health',
    },
  });
});

const server = app.listen(port, () => {
  console.log(`🚀 Enhanced StuDocu Downloader v5.0.0 running on http://localhost:${port}`);
  console.log(`✨ Features: Real-time progress tracking and enhanced user feedback`);
});

/**
 * Stop accepting connections, then exit. Long-lived SSE connections can keep
 * server.close() from ever completing, hence the unref'd fallback timer.
 *
 * @param {string} signal - Name of the received signal, for the log line.
 */
const shutdown = (signal) => {
  console.log(`${signal} received, shutting down gracefully...`);
  server.close(() => process.exit(0));
  setTimeout(() => process.exit(0), SHUTDOWN_GRACE_MS).unref();
};

process.on('SIGTERM', () => shutdown('SIGTERM'));
process.on('SIGINT', () => shutdown('SIGINT'));