// test / server.js
// File-viewer residue kept for provenance (not part of the program):
// devusman's picture
// update
// dee4c6e
// raw
// history blame
// 29.4 kB
const express = require('express');
const puppeteer = require('puppeteer');
const cors = require('cors');
const { EventEmitter } = require('events');
// TCP port the HTTP server listens on.
const port = 7860;

// Express application with CORS enabled and JSON request bodies parsed.
const app = express();
app.use(cors(), express.json());
/**
 * Per-session progress reporter.
 *
 * Keeps the latest progress/status/message for one session and emits a
 * 'progress' event (consumed by the SSE route) every time it is updated.
 */
class ProgressTracker extends EventEmitter {
  /**
   * @param {string} sessionId - identifier echoed back in every progress event.
   */
  constructor(sessionId) {
    super();
    Object.assign(this, {
      sessionId,
      progress: 0,
      status: 'initializing',
      message: '',
    });
  }

  /**
   * Record a new state, notify 'progress' listeners, and log the update.
   *
   * @param {number} progress - percentage value (callers pass -1 on error).
   * @param {string} status - short machine-readable phase name.
   * @param {string} message - human-readable description.
   */
  updateProgress(progress, status, message) {
    Object.assign(this, { progress, status, message });

    const snapshot = {
      sessionId: this.sessionId,
      progress,
      status,
      message,
      timestamp: new Date().toISOString(),
    };

    // Listeners (e.g. SSE connections) receive the full snapshot.
    this.emit('progress', snapshot);
    console.log(`πŸ“Š [${this.sessionId}] ${progress}% - ${status}: ${message}`);
  }
}
// Registry of progress trackers keyed by session id.
// The download route registers a ProgressTracker here; the progress routes
// and the health route read from it.
const progressTrackers = new Map();
/**
 * Prepare a Puppeteer page so cookie/consent banners and blur overlays stay
 * out of the way on StuDocu.
 *
 * Steps, in order:
 *  1. Seed consent cookies for `.studocu.com` (best effort; failures are logged).
 *  2. Add the hiding stylesheet to the page's current document.
 *  3. Register a script for every future document that re-installs the same
 *     stylesheet, stubs common consent/analytics globals, removes banner-like
 *     elements as they are inserted, and sweeps leftovers once per second.
 *
 * Fixes compared with the previous version:
 *  - The stylesheet no longer contains `:has-text(...)`, which is not valid CSS
 *    and invalidated the whole banner-hiding rule.
 *  - The stylesheet is also installed from the new-document script; a style tag
 *    added before navigation is discarded when the real page loads.
 *  - The MutationObserver observes `document` instead of `document.body`, which
 *    is still null when the new-document script runs; the resulting TypeError
 *    used to abort the script before the periodic sweep was registered.
 *  - Banner detection uses class/id only. Matching on text content would delete
 *    any inserted subtree that merely mentions "cookie" or "consent".
 *  - <html>, <head> and <body> are never removed.
 *
 * @param {object} page - Puppeteer page (uses setCookie, addStyleTag, evaluateOnNewDocument).
 * @param {object} [progressTracker] - optional tracker exposing updateProgress(progress, status, message).
 * @returns {Promise<boolean>} resolves to true once everything is registered.
 */
const bypassCookiesAndRestrictions = async (page, progressTracker) => {
  progressTracker?.updateProgress(5, 'bypassing', 'Setting up cookie bypass...');
  console.log("πŸͺ Starting comprehensive cookie and restriction bypass...");

  // Step 1: set consent cookies before any page load.
  const preCookies = [
    { name: 'cookieConsent', value: 'accepted', domain: '.studocu.com' },
    { name: 'cookie_consent', value: 'true', domain: '.studocu.com' },
    { name: 'gdpr_consent', value: 'accepted', domain: '.studocu.com' },
    { name: 'privacy_policy_accepted', value: 'true', domain: '.studocu.com' },
    { name: 'user_consent', value: '1', domain: '.studocu.com' },
    { name: 'analytics_consent', value: 'false', domain: '.studocu.com' },
    { name: 'marketing_consent', value: 'false', domain: '.studocu.com' },
    { name: 'functional_consent', value: 'true', domain: '.studocu.com' },
  ];
  for (const cookie of preCookies) {
    try {
      await page.setCookie(cookie);
    } catch (e) {
      console.log(`Failed to set cookie ${cookie.name}:`, e.message);
    }
  }

  // Step 2: stylesheet that hides banners and neutralises blur/paywall styling.
  // Only standard CSS selectors are allowed here: one invalid selector in a
  // selector list makes the browser drop the entire rule.
  const bypassCss = `
    /* Hide all possible cookie banners */
    [id*="cookie" i]:not(img):not(input), [class*="cookie" i]:not(img):not(input), [data-testid*="cookie" i], [aria-label*="cookie" i],
    .gdpr-banner, .gdpr-popup, .gdpr-modal, .consent-banner, .consent-popup, .consent-modal, .privacy-banner, .privacy-popup, .privacy-modal,
    .cookie-law, .cookie-policy, .cookie-compliance, .onetrust-banner-sdk, #onetrust-consent-sdk, .cmp-banner, .cmp-popup, .cmp-modal,
    [class*="CookieBanner"], [class*="CookieNotice"], [class*="ConsentBanner"], [class*="ConsentManager"], .cc-banner, .cc-window, .cc-compliance {
      display: none !important;
      visibility: hidden !important;
      opacity: 0 !important;
      z-index: -9999 !important;
      pointer-events: none !important;
    }
    /* Remove blur and premium overlays */
    [class*="blur" i], [class*="premium" i], [class*="paywall" i], [class*="sample-preview-blur" i] {
      filter: none !important;
      backdrop-filter: none !important;
      opacity: 1 !important;
      visibility: visible !important;
    }
    /* Ensure document content is visible */
    .document-content, .page-content, [data-page] {
      filter: none !important;
      opacity: 1 !important;
      visibility: visible !important;
      pointer-events: auto !important;
    }
    /* Remove fixed overlays */
    .fixed-overlay, .sticky-overlay, .content-overlay {
      display: none !important;
    }
    /* Restore scrolling */
    html, body {
      overflow: auto !important;
      position: static !important;
    }
  `;

  // Covers the document that is loaded right now (if any).
  await page.addStyleTag({ content: bypassCss });

  // Step 3: runs in every new document before the page's own scripts.
  // The function is serialised by Puppeteer, so it must be self-contained;
  // the stylesheet text is passed in as an argument.
  await page.evaluateOnNewDocument((cssText) => {
    // Override common cookie consent / analytics globals.
    window.cookieConsent = { accepted: true };
    window.gtag = () => { };
    window.ga = () => { };
    window.dataLayer = [];

    // The id must not contain "cookie"/"consent"/"gdpr", otherwise the
    // cleanup below would remove our own style element.
    const STYLE_ID = 'banner-bypass-style';
    const installStyle = () => {
      if (document.getElementById(STYLE_ID)) return true;
      const host = document.head || document.documentElement;
      if (!host) return false;
      const styleEl = document.createElement('style');
      styleEl.id = STYLE_ID;
      styleEl.textContent = cssText;
      host.appendChild(styleEl);
      return true;
    };
    if (!installStyle()) {
      document.addEventListener('DOMContentLoaded', installStyle, { once: true });
    }

    const CLASS_MARKERS = ['cookie', 'consent', 'gdpr'];
    const ID_MARKERS = ['cookie', 'consent'];
    const PROTECTED_TAGS = ['HTML', 'HEAD', 'BODY'];

    // getAttribute works for SVG elements too, whose className is not a string.
    const readAttr = (el, name) =>
      ((typeof el.getAttribute === 'function' ? el.getAttribute(name) : null) || '').toLowerCase();

    const looksLikeBanner = (el) => {
      if (PROTECTED_TAGS.includes(el.tagName)) return false;
      const classText = readAttr(el, 'class');
      const idText = readAttr(el, 'id');
      return CLASS_MARKERS.some((marker) => classText.includes(marker)) ||
        ID_MARKERS.some((marker) => idText.includes(marker));
    };

    // Catch dynamically added cookie banners.
    const observer = new MutationObserver((mutations) => {
      for (const mutation of mutations) {
        for (const node of mutation.addedNodes) {
          if (node.nodeType === 1 && looksLikeBanner(node)) {
            console.log('Removing detected cookie banner:', node);
            node.remove();
          }
        }
      }
    });
    // `document` always exists at this point; `document.body` does not yet.
    observer.observe(document, { childList: true, subtree: true });

    const SWEEP_SELECTOR = [
      '[id*="cookie" i]:not(img):not(input)',
      '[class*="cookie" i]:not(img):not(input)',
      '[data-testid*="cookie" i]',
      '.gdpr-banner', '.consent-banner', '.privacy-banner',
      '.onetrust-banner-sdk', '#onetrust-consent-sdk',
      '.cmp-banner', '.cc-banner',
    ].join(', ');

    // Periodic cleanup for anything the observer missed.
    setInterval(() => {
      document.querySelectorAll(SWEEP_SELECTOR).forEach((el) => {
        const isRoot = el === document.documentElement || el === document.body || el === document.head;
        if (!isRoot) el.remove();
      });
      // Restore scrolling in case a banner locked it.
      if (document.body) document.body.style.overflow = 'auto';
      if (document.documentElement) document.documentElement.style.overflow = 'auto';
    }, 1000);
  }, bypassCss);

  progressTracker?.updateProgress(10, 'bypassing', 'Cookie bypass configured successfully');
  return true;
};
/**
 * Strip ads, premium banners and blur styling from the currently loaded page.
 *
 * Runs one cleanup pass immediately inside the page, then repeats it every
 * 2 seconds for 60 seconds so content inserted later is handled as well.
 *
 * Fix compared with the previous version: inline styles are now written with
 * `style.setProperty(name, value, 'important')`. Assigning a value such as
 * "none !important" through `el.style.filter = ...` is rejected by the CSSOM,
 * so the old un-blur assignments had no effect.
 *
 * @param {object} page - Puppeteer page (uses evaluate).
 * @param {object} [progressTracker] - optional tracker exposing updateProgress(progress, status, message).
 * @returns {Promise<void>}
 */
const unblurContent = async (page, progressTracker) => {
  progressTracker?.updateProgress(15, 'unblurring', 'Removing content restrictions...');
  console.log("πŸ”“ Unblurring content and bypassing premium restrictions...");

  // Everything inside this callback executes in the browser context.
  await page.evaluate(() => {
    const forceStyle = (el, property, value) => el.style.setProperty(property, value, 'important');

    const removeBySelector = (selector) => {
      document.querySelectorAll(selector).forEach((el) => el.remove());
    };

    const needsUnblur = (el) => {
      const computed = window.getComputedStyle(el);
      const classText = el.className ? el.className.toString().toLowerCase() : '';
      return Boolean(
        computed.filter?.includes("blur") ||
        computed.backdropFilter?.includes("blur") ||
        parseFloat(computed.opacity) < 1 ||
        classText.includes("blur") ||
        classText.includes("premium")
      );
    };

    const removeBlur = () => {
      document.querySelectorAll("*").forEach((el) => {
        if (!needsUnblur(el)) return;
        forceStyle(el, 'filter', 'none');
        forceStyle(el, 'backdrop-filter', 'none');
        forceStyle(el, 'opacity', '1');
        if (el.classList) {
          el.classList.remove("blur", "blurred", "premium-blur");
        }
      });
    };

    const contentSelectors = [
      '.document-content', '.page-content', '.content', '[data-page]', '[data-testid*="document"]',
      '[data-testid*="page"]', '.page', '.document-page', 'main', 'article',
    ];

    const removeRestrictions = () => {
      removeBySelector("#adbox, .adsbox, .ad-box, .banner-ads, .advert");
      removeBySelector(".PremiumBannerBlobWrapper_overflow-wrapper__xsaS8");
      removeBlur();
      // NOTE(review): this deletes every element whose class still mentions
      // blur/premium/paywall, including ones that were just un-blurred above.
      // Kept as-is; confirm these wrappers never hold document content.
      removeBySelector('[class*="blur" i], [class*="premium" i], [class*="paywall" i]');

      // Make sure the likely content containers are visible and interactive.
      for (const selector of contentSelectors) {
        document.querySelectorAll(selector).forEach((el) => {
          forceStyle(el, 'filter', 'none');
          forceStyle(el, 'opacity', '1');
          forceStyle(el, 'visibility', 'visible');
          forceStyle(el, 'display', 'block');
          forceStyle(el, 'pointer-events', 'auto');
        });
      }
    };

    removeRestrictions();
    // Keep cleaning for a minute to catch lazily inserted overlays.
    const intervalId = setInterval(removeRestrictions, 2000);
    setTimeout(() => clearInterval(intervalId), 60000);
  });

  progressTracker?.updateProgress(20, 'unblurring', 'Content restrictions removed');
};
/**
 * Inject the print stylesheet used for PDF output into the loaded page.
 *
 * The stylesheet sets an A4 portrait page with 5mm margins, hides site chrome
 * (headers, sidebars, banners) in print media, and forces page breaks after
 * each document page element.
 *
 * @param {object} page - Puppeteer page (uses evaluate).
 * @param {object} [progressTracker] - optional tracker exposing updateProgress(progress, status, message).
 * @returns {Promise<void>}
 */
const applyPrintStyles = async (page, progressTracker) => {
  progressTracker?.updateProgress(85, 'styling', 'Applying print styles...');
  console.log("πŸ–¨οΈ Applying print styles for clean PDF...");

  // Executed in the browser; must stay self-contained because it is serialised.
  const injectPrintSheet = () => {
    const printCss = `
@page {
size: A4 portrait;
margin: 5mm;
}
@media print {
html, body {
margin: 0 !important;
padding: 0 !important;
overflow: visible !important;
}
header, footer, nav, aside, .no-print, .ads, .sidebar, .premium-banner,
.ViewerToolbar, .Layout_info-bar-wrapper__He0Ho, .Sidebar_sidebar-scrollable__kqeBZ,
.HeaderWrapper_header-wrapper__mCmf3, .Layout_visible-content-bottom-wrapper-sticky__yaaAB,
.Layout_bottom-section-wrapper__yBWWk, .Layout_footer-wrapper__bheJQ,
.InlineBanner_inline-banner-wrapper__DAi5X, .banner-wrapper, #top-bar-wrapper,
.Layout_sidebar-wrapper__unavM, .Layout_is-open__9DQr4 {
display: none !important;
}
body {
background: white !important;
color: black !important;
}
* {
box-shadow: none !important;
background: transparent !important;
}
.Viewer_document-wrapper__JPBWQ, .Viewer_document-wrapper__LXzoQ, .Viewer_document-wrapper__XsO4j, .page-content {
display: flex !important;
flex-direction: column !important;
width: 100% !important;
max-width: 210mm !important;
margin: 0 auto !important;
}
[data-page], .page, .document-page, img {
page-break-after: always !important;
page-break-inside: avoid !important;
page-break-before: avoid !important;
width: 100% !important;
max-width: 100% !important;
height: auto !important;
}
}
`;
    const sheet = Object.assign(document.createElement("style"), {
      id: "print-style-extension",
      innerHTML: printCss,
    });
    document.head.appendChild(sheet);
  };

  await page.evaluate(injectPrintSheet);
  progressTracker?.updateProgress(88, 'styling', 'Print styles applied successfully');
};
// Chromium flags used for every download session.
const STUDOCU_LAUNCH_ARGS = [
  '--no-sandbox',
  '--disable-setuid-sandbox',
  '--disable-dev-shm-usage',
  '--disable-accelerated-2d-canvas',
  '--no-first-run',
  '--no-zygote',
  '--disable-gpu',
  '--disable-features=VizDisplayCompositor',
  '--disable-background-networking',
  '--disable-background-timer-throttling',
  '--disable-renderer-backgrounding',
  '--disable-backgrounding-occluded-windows',
  '--disable-ipc-flooding-protection',
  '--disable-web-security',
  '--disable-features=site-per-process',
  '--disable-blink-features=AutomationControlled',
  '--disable-extensions',
];

// Requests whose URL contains one of these fragments are aborted.
const STUDOCU_BLOCKED_URL_PARTS = [
  'doubleclick', 'googletagmanager', 'facebook.com', 'twitter.com', 'analytics',
  'gtm', 'hotjar', 'mixpanel', 'onetrust', 'cookielaw',
];

/**
 * Render a StuDocu document to PDF with a headless browser.
 *
 * Flow: launch browser -> configure page and cookie bypass -> block tracking
 * requests -> optional login -> navigate (up to 3 attempts) -> un-blur ->
 * scroll to load all pages -> wait for images -> print styles -> PDF.
 * The browser is always closed, whether the run succeeds or fails.
 *
 * Fixes compared with the previous version:
 *  - Login: the navigation wait is registered *before* the submit click
 *    (Promise.all). Awaiting the click first can miss a fast navigation and
 *    then time out even though the login worked.
 *  - Request interception: failures of abort()/continue() are caught and
 *    logged instead of surfacing as unhandled promise rejections.
 *  - The lazy-load scroll loop is capped so a page that keeps growing cannot
 *    block the run forever.
 *
 * @param {string} url - document URL to open.
 * @param {object} [options] - optional settings.
 * @param {string} [options.email] - StuDocu login e-mail (login happens only when both credentials are set).
 * @param {string} [options.password] - StuDocu login password.
 * @param {object|null} [progressTracker] - optional tracker exposing updateProgress(progress, status, message).
 * @returns {Promise<Buffer|Uint8Array>} the generated PDF bytes as returned by page.pdf().
 * @throws {Error} when login, navigation or PDF generation fails; a progress
 *   update with progress -1 and status 'error' is emitted before rethrowing.
 */
const studocuDownloader = async (url, options = {}, progressTracker = null) => {
  let browser;
  try {
    progressTracker?.updateProgress(0, 'initializing', 'Starting browser...');
    console.log("πŸš€ Launching browser with stealth configuration...");
    browser = await puppeteer.launch({
      headless: true,
      args: [...STUDOCU_LAUNCH_ARGS],
      timeout: 300000,
    });
    const page = await browser.newPage();

    progressTracker?.updateProgress(2, 'initializing', 'Configuring browser settings...');
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
    // 794 x 1122 px is the viewport used for rendering before printing.
    await page.setViewport({ width: 794, height: 1122 });
    // Mask the most common headless-automation fingerprints.
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
      Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
      Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
    });

    // Set up cookie and content bypass
    await bypassCookiesAndRestrictions(page, progressTracker);

    // Block unnecessary resources
    await page.setRequestInterception(true);
    page.on('request', async (req) => {
      const resourceType = req.resourceType();
      const reqUrl = req.url();
      const isBlocked =
        STUDOCU_BLOCKED_URL_PARTS.some((part) => reqUrl.includes(part)) ||
        (resourceType === 'other' && reqUrl.includes('track'));
      try {
        if (isBlocked) {
          await req.abort();
        } else {
          await req.continue();
        }
      } catch (e) {
        // E.g. the request was already handled or the page went away.
        console.log(`Request interception failed for ${reqUrl}:`, e.message);
      }
    });

    // Login if credentials provided
    if (options.email && options.password) {
      progressTracker?.updateProgress(12, 'authenticating', 'Logging into StuDocu...');
      console.log("πŸ”‘ Logging in to StuDocu...");
      await page.goto('https://www.studocu.com/en-us/login', { waitUntil: 'domcontentloaded', timeout: 60000 });
      await page.waitForSelector('#email', { timeout: 15000 });
      await page.type('#email', options.email);
      await page.type('#password', options.password);
      try {
        // Start waiting for the navigation before triggering it.
        await Promise.all([
          page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 30000 }),
          page.click('button[type="submit"]'),
        ]);
        await page.waitForSelector('.user-profile, [data-testid="user-menu"]', { timeout: 10000 });
        console.log("βœ… Login successful.");
        progressTracker?.updateProgress(18, 'authenticated', 'Login successful');
      } catch (e) {
        console.error("❌ Login failed:", e.message);
        throw new Error("Login failed. Check credentials or try again.", { cause: e });
      }
    }

    progressTracker?.updateProgress(25, 'navigating', 'Navigating to document...');
    console.log(`πŸ“„ Navigating to ${url}...`);
    let navigationSuccess = false;
    let attempts = 0;
    const maxAttempts = 3;
    while (!navigationSuccess && attempts < maxAttempts) {
      try {
        attempts++;
        progressTracker?.updateProgress(25 + (attempts * 5), 'navigating', `Navigation attempt ${attempts}/${maxAttempts}`);
        console.log(`Navigation attempt ${attempts}/${maxAttempts}`);
        await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });
        navigationSuccess = true;
      } catch (e) {
        console.log(`Navigation attempt ${attempts} failed:`, e.message);
        if (attempts >= maxAttempts) throw e;
        // Back off before retrying.
        await new Promise(resolve => setTimeout(resolve, 5000));
      }
    }

    progressTracker?.updateProgress(40, 'loading', 'Page loaded, waiting for content...');
    // Give client-side rendering a moment before touching the DOM.
    await new Promise(resolve => setTimeout(resolve, 5000));

    // Apply content unblurring
    await unblurContent(page, progressTracker);

    // Wait for document content
    progressTracker?.updateProgress(45, 'loading', 'Waiting for document content...');
    console.log("⏳ Waiting for document content to load...");
    const contentSelectors = [
      '.document-content', '.page-content', '[data-page]', '[data-testid*="document"]',
      'img[src*="document"]', 'img[src*="page"]', '.page', 'main img', 'article img'
    ];
    let contentFound = false;
    for (const selector of contentSelectors) {
      try {
        await page.waitForSelector(selector, { timeout: 20000 });
        console.log(`βœ… Found content with selector: ${selector}`);
        contentFound = true;
        break;
      } catch (e) {
        console.log(`❌ Selector ${selector} not found, trying next...`);
      }
    }
    if (!contentFound) {
      console.log("⚠️ No specific content selector found, proceeding with page content...");
    }

    // Enhanced scrolling to load all content
    progressTracker?.updateProgress(50, 'scrolling', 'Loading all document pages...');
    console.log("πŸ“œ Loading all document pages with enhanced slow scroll...");
    await page.evaluate(async (maxPasses) => {
      const delay = (ms) => new Promise((res) => setTimeout(res, ms));
      let scrollHeight = document.body.scrollHeight;
      // Repeat until the page stops growing; the cap guards against pages
      // that append content indefinitely.
      for (let pass = 0; pass < maxPasses; pass++) {
        let totalHeight = 0;
        const distance = 300;
        while (totalHeight < scrollHeight) {
          window.scrollBy(0, distance);
          totalHeight += distance;
          await delay(500);
        }
        await delay(2000);
        const newHeight = document.body.scrollHeight;
        if (newHeight === scrollHeight) break;
        scrollHeight = newHeight;
      }
      window.scrollTo({ top: 0, behavior: "smooth" });
      await delay(1000);
    }, 50);

    progressTracker?.updateProgress(70, 'processing', 'Processing loaded content...');
    // Re-apply unblur after loading new content
    await unblurContent(page, progressTracker);

    // Wait for all images to load
    progressTracker?.updateProgress(75, 'loading_images', 'Loading images...');
    console.log("πŸ–ΌοΈ Waiting for all images to load...");
    await page.evaluate(async () => {
      const images = Array.from(document.querySelectorAll('img'));
      await Promise.all(images.map(img => {
        if (img.complete) return Promise.resolve();
        return new Promise((resolve) => {
          img.addEventListener('load', resolve);
          img.addEventListener('error', resolve);
          // Do not wait more than 15s for any single image.
          setTimeout(resolve, 15000);
        });
      }));
    });
    await new Promise(resolve => setTimeout(resolve, 5000));

    progressTracker?.updateProgress(80, 'finalizing', 'Preparing document for PDF generation...');
    // Set exact height
    // NOTE(review): values containing "!important" are rejected when assigned
    // through element.style, so these three assignments currently have no
    // effect. They are left untouched because making them effective would put
    // `overflow: hidden` on <body> with higher priority than the print
    // stylesheet and could clip the PDF — confirm the intent before changing.
    await page.evaluate(() => {
      const getDocumentHeight = () => Math.max(
        document.body.scrollHeight, document.body.offsetHeight,
        document.documentElement.clientHeight, document.documentElement.scrollHeight, document.documentElement.offsetHeight
      );
      const height = getDocumentHeight();
      document.body.style.height = `${height}px !important`;
      document.documentElement.style.height = `${height}px !important`;
      document.body.style.overflow = 'hidden !important';
    });

    // Content verification
    const contentCheck = await page.evaluate(() => {
      const textContent = document.body.textContent || '';
      const images = document.querySelectorAll('img');
      const documentImages = Array.from(images).filter(img =>
        img.src.includes('document') || img.src.includes('page') ||
        img.alt.includes('document') || img.alt.includes('page')
      );
      return {
        totalText: textContent.length,
        totalImages: images.length,
        documentImages: documentImages.length,
        hasDocumentContent: documentImages.length > 0 || textContent.length > 1000
      };
    });
    console.log("πŸ“Š Content verification:", {
      textLength: contentCheck.totalText,
      images: contentCheck.totalImages,
      documentImages: contentCheck.documentImages,
      hasContent: contentCheck.hasDocumentContent
    });
    if (!contentCheck.hasDocumentContent) {
      console.warn("⚠️ Warning: Limited document content detected.");
    }

    // Apply print styles and generate PDF
    await applyPrintStyles(page, progressTracker);
    await page.emulateMediaType('print');
    progressTracker?.updateProgress(90, 'generating', 'Generating PDF...');
    console.log("πŸ”„ Generating PDF...");
    const pdfBuffer = await page.pdf({
      printBackground: true,
      preferCSSPageSize: true,
      displayHeaderFooter: false,
      timeout: 180000,
      scale: 1,
      omitBackground: false
    });

    progressTracker?.updateProgress(100, 'completed', 'PDF generated successfully!');
    console.log(`βœ… PDF generated successfully! Size: ${(pdfBuffer.length / 1024 / 1024).toFixed(2)} MB`);
    return pdfBuffer;
  } catch (error) {
    progressTracker?.updateProgress(-1, 'error', error.message);
    console.error("❌ Error during PDF generation:", error);
    throw error;
  } finally {
    if (browser) {
      console.log("πŸ”’ Closing browser...");
      try {
        await browser.close();
      } catch (e) {
        console.log("Error closing browser:", e.message);
      }
    }
  }
};
// API Routes

/**
 * POST /api/download — start a background PDF job and reply 202 immediately.
 *
 * Body: { url, filename?, email?, password?, sessionId? }.
 * The client can follow the job through the progress routes using the
 * returned sessionId.
 *
 * Fixes compared with the previous version:
 *  - The URL is parsed and its hostname must be studocu.com or a subdomain.
 *    The old substring test accepted any host that merely contained the text
 *    (e.g. in a query string) and threw on non-string input.
 *  - sessionId is normalised to a string so it matches the string route
 *    parameter used by the progress routes.
 *  - Finished jobs are evicted too; previously a tracker holding a PDF buffer
 *    stayed in memory forever.
 *  - The error progress event is not emitted twice when the downloader has
 *    already reported the failure.
 */
app.post('/api/download', async (req, res) => {
  const { url, filename, email, password, sessionId: reqSessionId } = req.body ?? {};
  if (!url) {
    return res.status(400).json({ error: 'URL is required.' });
  }
  if (typeof url !== 'string') {
    return res.status(400).json({ error: 'Please provide a valid StuDocu URL.' });
  }

  // Accept scheme-less input such as "www.studocu.com/...".
  const trimmedUrl = url.trim();
  const candidateUrl = /^https?:\/\//i.test(trimmedUrl) ? trimmedUrl : `https://${trimmedUrl}`;

  let parsedUrl = null;
  try {
    parsedUrl = new URL(candidateUrl);
  } catch {
    parsedUrl = null;
  }
  const hostname = parsedUrl ? parsedUrl.hostname.toLowerCase() : '';
  const isStudocuHost = hostname === 'studocu.com' || hostname.endsWith('.studocu.com');
  if (!isStudocuHost) {
    return res.status(400).json({ error: 'Please provide a valid StuDocu URL.' });
  }
  const normalizedUrl = parsedUrl.href;

  // Use the session ID from the request or create a new one.
  const sessionId = reqSessionId ? String(reqSessionId) : Date.now().toString();
  const progressTracker = new ProgressTracker(sessionId);
  progressTrackers.set(sessionId, progressTracker);
  console.log(`🎯 Processing request for: ${normalizedUrl} [Session: ${sessionId}]`);

  const FAILED_RETENTION_MS = 5 * 60 * 1000;
  const FINISHED_RETENTION_MS = 30 * 60 * 1000;

  // Fire and forget: the job runs in the background while the client
  // follows it through the progress routes.
  studocuDownloader(normalizedUrl, { filename, email, password }, progressTracker)
    .then((pdfBuffer) => {
      // Keep the result on the tracker for later retrieval.
      progressTracker.pdfBuffer = pdfBuffer;
      console.log(`πŸŽ‰ PDF is ready for download [Session: ${sessionId}]`);
    })
    .catch((error) => {
      console.error(`❌ Failed to process ${normalizedUrl}:`, error?.message);
      // The downloader normally reports its own failure; only fill the gap.
      if (progressTracker.status !== 'error') {
        progressTracker.updateProgress(-1, 'error', error?.message || 'An unknown error occurred.');
      }
    })
    .finally(() => {
      // Finished PDFs are kept longer than failed sessions, but never forever.
      const retentionMs = progressTracker.pdfBuffer ? FINISHED_RETENTION_MS : FAILED_RETENTION_MS;
      setTimeout(() => {
        // Only evict our own tracker; the id may have been reused meanwhile.
        if (progressTrackers.get(sessionId) === progressTracker) {
          progressTrackers.delete(sessionId);
        }
      }, retentionMs);
    });

  // Immediately respond to the client so it can start listening to the progress stream.
  res.status(202).json({
    message: "Download process started.",
    sessionId: sessionId
  });
});
/**
 * GET /api/progress-stream/:sessionId — Server-Sent Events stream of progress
 * updates for one session.
 *
 * Responds 404 when the session is unknown. Otherwise the connection stays
 * open and every 'progress' event of the session's tracker is forwarded as
 * one `data:` frame until the client disconnects.
 *
 * Fix compared with the previous version: the tracker's current state is sent
 * right after the stream opens. Without it, a client that connects after the
 * last update (for instance once the job has already finished) would never
 * receive any event.
 */
app.get('/api/progress-stream/:sessionId', (req, res) => {
  const { sessionId } = req.params;
  const tracker = progressTrackers.get(sessionId);
  if (!tracker) {
    return res.status(404).json({ error: 'Session not found' });
  }

  // Set headers for SSE
  res.setHeader('Content-Type', 'text/event-stream');
  res.setHeader('Cache-Control', 'no-cache');
  res.setHeader('Connection', 'keep-alive');
  res.flushHeaders(); // Flush the headers to establish the connection

  // Serialise one progress snapshot as an SSE frame.
  const sendProgress = (data) => {
    res.write(`data: ${JSON.stringify(data)}\n\n`);
  };

  // Replay the latest known state so late subscribers start in sync.
  sendProgress({
    sessionId,
    progress: tracker.progress,
    status: tracker.status,
    message: tracker.message,
    timestamp: new Date().toISOString(),
  });

  // Forward every subsequent update of this specific tracker.
  tracker.on('progress', sendProgress);

  // Handle client disconnect
  req.on('close', () => {
    // Remove the listener for this specific client
    tracker.removeListener('progress', sendProgress);
    console.log(`πŸ”Œ Client disconnected for session: ${sessionId}`);
  });
});
/**
 * GET /api/progress/:sessionId — polling variant of the progress API.
 * Returns the tracker's latest progress/status/message, or 404 when the
 * session is unknown.
 */
app.get('/api/progress/:sessionId', (req, res) => {
  const sessionId = req.params.sessionId;
  const tracker = progressTrackers.get(sessionId);

  if (!tracker) {
    return res.status(404).json({ error: 'Session not found' });
  }

  const { progress, status, message } = tracker;
  const timestamp = new Date().toISOString();
  res.json({ sessionId, progress, status, message, timestamp });
});
// Health and info endpoints

/**
 * GET /health — liveness probe reporting the current time, process uptime and
 * the number of sessions held in the tracker registry.
 */
app.get('/health', (req, res) => {
  const report = {
    status: 'healthy',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
    activeDownloads: progressTrackers.size,
  };
  res.json(report);
});
/**
 * GET / — static description of the service and its endpoints.
 *
 * Fix compared with the previous version: the endpoint listing now includes
 * the SSE progress stream route and the optional `sessionId` body field, both
 * of which the server implements but the listing omitted.
 */
app.get('/', (req, res) => {
  res.json({
    message: 'πŸš€ Enhanced StuDocu Downloader API v5.0 - Real-time Progress Tracking',
    version: '5.0.0',
    features: [
      'πŸͺ Advanced cookie banner bypass',
      'πŸ”“ Premium content unblurring',
      'πŸ”‘ Login support for full access',
      'πŸ“Š Real-time progress tracking via polling',
      'πŸ“„ Clean PDF generation with print styles'
    ],
    endpoints: {
      download: 'POST /api/download (body: {url, filename?, email?, password?, sessionId?})',
      progress: 'GET /api/progress/:sessionId',
      progressStream: 'GET /api/progress-stream/:sessionId (Server-Sent Events)',
      health: 'GET /health'
    }
  });
});
// Start the HTTP server and keep the handle so it can be closed on shutdown.
const server = app.listen(port, () => {
  console.log(`πŸš€ Enhanced StuDocu Downloader v5.0.0 running on http://localhost:${port}`);
  console.log(`✨ Features: Real-time progress tracking and enhanced user feedback`);
});

// Maximum time to wait for open connections before exiting anyway.
const SHUTDOWN_GRACE_MS = 10000;
let isShuttingDown = false;

/**
 * Stop accepting connections, then exit with code 0.
 *
 * The previous handlers logged a graceful shutdown but called process.exit(0)
 * immediately. Now the server is closed first; a timer forces the exit if
 * connections (for example open SSE streams) do not finish in time.
 *
 * @param {string} signal - name of the received signal, used for logging.
 */
const shutdown = (signal) => {
  console.log(`${signal} received, shutting down gracefully...`);
  if (isShuttingDown) return;
  isShuttingDown = true;

  server.close(() => process.exit(0));
  // SSE responses never end on their own; drop them when the runtime
  // supports it so close() can complete.
  server.closeAllConnections?.();
  // Safety net: do not hang forever. unref() keeps this timer from holding
  // the process open by itself.
  setTimeout(() => process.exit(0), SHUTDOWN_GRACE_MS).unref();
};

process.on('SIGTERM', () => shutdown('SIGTERM'));
process.on('SIGINT', () => shutdown('SIGINT'));