test / server.js
devusman's picture
captcha solver
0f0b365
raw
history blame
29.4 kB
const crypto = require('crypto');
const { EventEmitter } = require('events');
const fs = require('fs').promises;
const os = require('os');
const path = require('path');
const express = require('express');
const cors = require('cors');
const puppeteerExtra = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
// NEW: Add the recaptcha plugin to help solve Cloudflare and other challenges
const RecaptchaPlugin = require('puppeteer-extra-plugin-recaptcha');
// --- Configuration for the captcha solver ---
// SECURITY: a 2Captcha API token used to be hard-coded here only. It can now be
// supplied through the TWOCAPTCHA_API_KEY environment variable; the literal is
// kept solely as a backward-compatible fallback.
// FIXME: rotate this token and remove the fallback once deployments set the variable.
const twoCaptchaToken = process.env.TWOCAPTCHA_API_KEY || 'cc4f0d688032c69ecf359cccdabbacb9';
puppeteerExtra.use(
  RecaptchaPlugin({
    provider: { id: '2captcha', token: twoCaptchaToken }
  })
);
puppeteerExtra.use(StealthPlugin());

// --- HTTP application ---
const app = express();
const port = 7860;
app.use(cors());
app.use(express.json());

// --- Progress tracking and job storage ---
// progressTrackers: sessionId -> ProgressTracker for jobs that are still running.
// downloadJobs: sessionId -> { status, buffer?, message? } for every requested job.
const progressTrackers = new Map();
const downloadJobs = new Map();
/**
 * Holds the latest progress snapshot for one download session and broadcasts
 * every change as a 'progress' event.
 */
class ProgressTracker extends EventEmitter {
  /**
   * @param {string} sessionId - Identifier of the session this tracker reports on.
   */
  constructor(sessionId) {
    super();
    Object.assign(this, {
      sessionId,
      progress: 0,
      status: 'initializing',
      message: '',
    });
  }

  /**
   * Stores the new state, emits it as a 'progress' event and logs it.
   *
   * @param {number} progress - Progress value (callers pass -1 on error).
   * @param {string} status - Short machine-readable stage name.
   * @param {string} message - Human-readable description of the stage.
   */
  updateProgress(progress, status, message) {
    Object.assign(this, { progress, status, message });
    const { sessionId } = this;
    const timestamp = new Date().toISOString();
    this.emit('progress', { sessionId, progress, status, message, timestamp });
    console.log(`πŸ“Š [${this.sessionId}] ${progress}% - ${status}: ${message}`);
  }
}
// --- Puppeteer Logic (Updated for Cloudflare Bypass) ---
/**
 * Configures a page so cookie-consent UI and blur/paywall styling stay out of the way.
 *
 * Three things happen, in order:
 *   1. consent cookies are set for `.studocu.com` (failures are logged, not thrown),
 *   2. a stylesheet is injected into the current document,
 *   3. an init script is registered via `page.evaluateOnNewDocument`, which takes
 *      effect for documents created after this call.
 *
 * @param {object} page - Puppeteer page; must provide `setCookie`, `addStyleTag`
 *   and `evaluateOnNewDocument`.
 * @param {{updateProgress: Function}|null} [progressTracker] - Optional progress sink.
 * @returns {Promise<boolean>} Always resolves to `true`.
 */
const bypassCookiesAndRestrictions = async (page, progressTracker) => {
  progressTracker?.updateProgress(5, 'bypassing', 'Setting up cookie bypass...');
  console.log("πŸͺ Starting comprehensive cookie and restriction bypass...");

  // Step 1: Pre-accept the consent cookies. One rejected cookie must not abort the rest.
  const preCookies = [
    { name: 'cookieConsent', value: 'accepted', domain: '.studocu.com' },
    { name: 'cookie_consent', value: 'true', domain: '.studocu.com' },
    { name: 'gdpr_consent', value: 'accepted', domain: '.studocu.com' },
    { name: 'privacy_policy_accepted', value: 'true', domain: '.studocu.com' },
    { name: 'user_consent', value: '1', domain: '.studocu.com' },
    { name: 'analytics_consent', value: 'false', domain: '.studocu.com' },
    { name: 'marketing_consent', value: 'false', domain: '.studocu.com' },
    { name: 'functional_consent', value: 'true', domain: '.studocu.com' },
  ];
  for (const cookie of preCookies) {
    try {
      await page.setCookie(cookie);
    } catch (e) {
      console.log(`Failed to set cookie ${cookie.name}:`, e.message);
    }
  }

  // Step 2: Inject CSS to hide cookie banners immediately.
  // The selector list must contain only valid CSS: a single invalid selector (the
  // previous `:has-text(...)` entries are not CSS) invalidates the entire rule.
  await page.addStyleTag({
    content: `
/* Hide all possible cookie banners */
[id*="cookie" i]:not(img):not(input), [class*="cookie" i]:not(img):not(input), [data-testid*="cookie" i], [aria-label*="cookie" i],
.gdpr-banner, .gdpr-popup, .gdpr-modal, .consent-banner, .consent-popup, .consent-modal, .privacy-banner, .privacy-popup, .privacy-modal,
.cookie-law, .cookie-policy, .cookie-compliance, .onetrust-banner-sdk, #onetrust-consent-sdk, .cmp-banner, .cmp-popup, .cmp-modal,
[class*="CookieBanner"], [class*="CookieNotice"], [class*="ConsentBanner"], [class*="ConsentManager"], .cc-banner, .cc-window, .cc-compliance {
display: none !important;
visibility: hidden !important;
opacity: 0 !important;
z-index: -9999 !important;
pointer-events: none !important;
}
/* Remove blur and premium overlays */
[class*="blur" i], [class*="premium" i], [class*="paywall" i], [class*="sample-preview-blur" i] {
filter: none !important;
backdrop-filter: none !important;
opacity: 1 !important;
visibility: visible !important;
}
/* Ensure document content is visible */
.document-content, .page-content, [data-page] {
filter: none !important;
opacity: 1 !important;
visibility: visible !important;
pointer-events: auto !important;
}
/* Remove fixed overlays */
.fixed-overlay, .sticky-overlay, .content-overlay {
display: none !important;
}
/* Restore scrolling */
html, body {
overflow: auto !important;
position: static !important;
}
`
  });

  // Step 3: Register a script for future documents that removes dynamic cookie banners.
  // The callback is serialized and run in the page, so it must be fully self-contained.
  await page.evaluateOnNewDocument(() => {
    // Override common cookie consent functions
    window.cookieConsent = { accepted: true };
    window.gtag = () => { };
    window.ga = () => { };
    window.dataLayer = [];

    const lower = (value) => String(value ?? '').toLowerCase();

    // Heuristic: does this element look like a cookie/consent banner?
    const looksLikeCookieBanner = (element) => {
      const text = lower(element.textContent);
      // Read the attribute: on SVG elements `className` is an SVGAnimatedString,
      // which has no toLowerCase() and used to throw here.
      const className = lower(element.getAttribute('class'));
      const id = lower(element.id);
      return (
        text.includes('cookie') ||
        text.includes('consent') ||
        text.includes('privacy policy') ||
        className.includes('cookie') ||
        className.includes('consent') ||
        className.includes('gdpr') ||
        id.includes('cookie') ||
        id.includes('consent')
      );
    };

    // Mutation observer to catch dynamically added cookie banners
    const observer = new MutationObserver((mutations) => {
      mutations.forEach((mutation) => {
        mutation.addedNodes.forEach((node) => {
          if (node.nodeType !== 1) return; // only element nodes
          if (looksLikeCookieBanner(node)) {
            console.log('Removing detected cookie banner:', node);
            node.remove();
          }
        });
      });
    });

    // This script runs before the document is parsed, when document.body is still
    // null; observing null throws and would abort the rest of the script.
    const startObserving = () => {
      const root = document.body ?? document.documentElement;
      if (root) {
        observer.observe(root, { childList: true, subtree: true });
      }
    };
    if (document.body) {
      startObserving();
    } else {
      document.addEventListener('DOMContentLoaded', startObserving, { once: true });
    }

    // Set up periodic cleanup
    setInterval(() => {
      const cookieElements = document.querySelectorAll(`
[id*="cookie" i]:not(img):not(input), [class*="cookie" i]:not(img):not(input), [data-testid*="cookie" i],
.gdpr-banner, .consent-banner, .privacy-banner, .onetrust-banner-sdk, #onetrust-consent-sdk,
.cmp-banner, .cc-banner
`);
      cookieElements.forEach(el => el.remove());
      // Restore page scroll (either node may not exist yet on an early tick)
      if (document.body) {
        document.body.style.overflow = 'auto';
      }
      if (document.documentElement) {
        document.documentElement.style.overflow = 'auto';
      }
    }, 1000);
  });

  progressTracker?.updateProgress(10, 'bypassing', 'Cookie bypass configured successfully');
  return true;
};
// --- Content un-blurring ---
/**
 * Runs an in-page routine that strips ad/premium overlays and forces blurred or
 * faded elements back to full visibility.
 *
 * The routine runs once immediately and then every second; the repetition is
 * cancelled after 30 seconds.
 *
 * @param {object} page - Puppeteer page; must provide `evaluate`.
 * @param {{updateProgress: Function}|null} [progressTracker] - Optional progress sink.
 * @returns {Promise<void>}
 */
const unblurContent = async (page, progressTracker) => {
  progressTracker?.updateProgress(15, 'unblurring', 'Removing content restrictions...');
  console.log("πŸ”“ Unblurring content and bypassing premium restrictions...");
  await page.evaluate(() => {
    const removeRestrictions = () => {
      const removeBySelector = (selector) => {
        document.querySelectorAll(selector).forEach(el => el.remove());
      };
      removeBySelector("#adbox, .adsbox, .ad-box, .banner-ads, .advert");
      removeBySelector(".PremiumBannerBlobWrapper_overflow-wrapper__xsaS8");

      const removeBlur = (element = document) => {
        element.querySelectorAll("*").forEach(el => {
          const style = window.getComputedStyle(el);
          const classText = el.className ? el.className.toString().toLowerCase() : '';
          const needsReset =
            style.filter?.includes("blur") ||
            style.backdropFilter?.includes("blur") ||
            parseFloat(style.opacity) < 1 ||
            classText.includes("blur") ||
            classText.includes("premium");
          if (!needsReset) return;
          // Priority must go through setProperty(): assigning a value such as
          // "none !important" to el.style.<prop> is rejected by the CSSOM and ignored.
          el.style.setProperty('filter', 'none', 'important');
          el.style.setProperty('backdrop-filter', 'none', 'important');
          el.style.setProperty('opacity', '1', 'important');
          if (el.classList) {
            el.classList.remove("blur", "blurred", "premium-blur");
          }
        });
      };
      removeBlur();
      removeBySelector('[class*="blur" i], [class*="premium" i], [class*="paywall" i]');

      // Force the likely document containers to be displayed and interactive.
      const contentSelectors = [
        '.document-content', '.page-content', '.content', '[data-page]', '[data-testid*="document"]',
        '[data-testid*="page"]', '.page', '.document-page', 'main', 'article'
      ];
      contentSelectors.forEach(selector => {
        document.querySelectorAll(selector).forEach(el => {
          el.style.setProperty('filter', 'none', 'important');
          el.style.setProperty('opacity', '1', 'important');
          el.style.setProperty('visibility', 'visible', 'important');
          el.style.setProperty('display', 'block', 'important');
          el.style.setProperty('pointer-events', 'auto', 'important');
        });
      });
    };

    removeRestrictions();
    // Re-apply periodically to catch lazily rendered content, but only for 30 s.
    const intervalId = setInterval(removeRestrictions, 1000);
    setTimeout(() => clearInterval(intervalId), 30000);
  });
  progressTracker?.updateProgress(22 - 2, 'unblurring', 'Content restrictions removed');
};
/**
 * Appends a <style id="print-style-extension"> element to the page's <head>
 * containing the A4 print rules used for PDF rendering.
 *
 * @param {object} page - Puppeteer page; must provide `evaluate`.
 * @param {{updateProgress: Function}|null} [progressTracker] - Optional progress sink.
 * @returns {Promise<void>}
 */
const applyPrintStyles = async (page, progressTracker) => {
  progressTracker?.updateProgress(85, 'styling', 'Applying print styles...');
  console.log("πŸ–¨οΈ Applying print styles for clean PDF...");

  // Built on the Node side and handed to the page as an argument.
  const printCss = `
@page {
size: A4 portrait;
margin: 0mm;
}
@media print {
html, body {
width: 210mm !important;
height: auto !important;
margin: 0 !important;
padding: 0 !important;
overflow: visible !important;
background: white !important;
color: black !important;
}
header, footer, nav, aside, .no-print, .ads, .sidebar, .premium-banner,
[class*="Header"], [class*="Footer"], [class*="Sidebar"], [id*="Header"],
.ViewerToolbar, .Layout_info-bar-wrapper__He0Ho, .Sidebar_sidebar-scrollable__kqeBZ,
.HeaderWrapper_header-wrapper__mCmf3, .Layout_visible-content-bottom-wrapper-sticky__yaaAB,
.Layout_bottom-section-wrapper__yBWWk, .Layout_footer-wrapper__bheJQ,
.InlineBanner_inline-banner-wrapper__DAi5X, .banner-wrapper, #top-bar-wrapper,
.Layout_sidebar-wrapper__unavM, .Layout_is-open__9DQr4 {
display: none !important;
}
* {
box-shadow: none !important;
background: transparent !important;
color: inherit !important;
}
.Viewer_document-wrapper__JPBWQ, .Viewer_document-wrapper__LXzoQ,
.Viewer_document-wrapper__XsO4j, .page-content, .document-viewer, #page-container {
position: static !important;
display: block !important;
width: 100% !important;
max-width: none !important;
margin: 0 !important;
padding: 0 !important;
box-sizing: border-box;
transform: none !important;
}
[data-page], .page, .document-page, img {
page-break-after: always !important;
page-break-inside: avoid !important;
page-break-before: avoid !important;
width: 100% !important;
max-width: 100% !important;
height: auto !important;
display: block !important;
margin: 0 !important;
padding: 0 !important;
}
}
`;

  await page.evaluate((cssText) => {
    const styleElement = document.createElement("style");
    styleElement.id = "print-style-extension";
    styleElement.innerHTML = cssText;
    document.head.appendChild(styleElement);
  }, printCss);

  progressTracker?.updateProgress(88, 'styling', 'Print styles applied successfully');
};
/**
 * Opens `url` in a fresh headless browser, waits for the real page to appear,
 * strips overlays, scrolls through the document so lazy content loads, and
 * renders the result to a PDF.
 *
 * The browser and its temporary user-data directory are always released in
 * `finally`, whether the run succeeds or fails.
 *
 * @param {string} url - Document URL to open.
 * @param {{email?: string, password?: string}} [options] - When both fields are set,
 *   an 'authenticating' progress update is emitted (no login steps are present here).
 * @param {{updateProgress: Function}|null} [progressTracker] - Optional progress sink.
 * @returns {Promise<Buffer|Uint8Array>} The PDF bytes returned by `page.pdf()`.
 * @throws {Error} Any failure is reported to the tracker (progress -1) and re-thrown.
 *   Navigation/challenge failures are wrapped, with the original error as `cause`.
 */
const studocuDownloader = async (url, options = {}, progressTracker = null) => {
  let browser;
  let userDataDir = null;
  // Easy flag for debugging. Set to true to see the browser window.
  const isDebugging = false;
  try {
    progressTracker?.updateProgress(0, 'initializing', 'Starting browser...');
    const tempDir = os.tmpdir();
    userDataDir = await fs.mkdtemp(path.join(tempDir, 'puppeteer-'));
    console.log(`πŸ“‚ Created temporary user data directory: ${userDataDir}`);
    console.log("πŸš€ Launching browser with enhanced stealth configuration...");
    browser = await puppeteerExtra.launch({
      headless: !isDebugging, // Use the debugging flag
      userDataDir: userDataDir,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-infobars',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        '--window-size=1920,1080'
      ],
      ignoreHTTPSErrors: true,
    });
    const page = await browser.newPage();
    progressTracker?.updateProgress(2, 'initializing', 'Configuring browser settings...');
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36');
    await page.setViewport({ width: 1920, height: 1080 });

    // Request interception: let documents through, drop heavy assets, third-party
    // scripts and known trackers.
    // NOTE(review): scripts whose URL lacks "studocu" are aborted; if the challenge
    // page loads scripts from another host this would block them - confirm.
    await page.setRequestInterception(true);
    page.on('request', (req) => {
      const resourceType = req.resourceType();
      const reqUrl = req.url().toLowerCase();
      if (resourceType === 'document') {
        req.continue();
        return;
      }
      const isUnrelatedAsset =
        ['image', 'media', 'font', 'stylesheet'].includes(resourceType) &&
        !reqUrl.includes('document') && !reqUrl.includes('page') && !reqUrl.includes('studocu');
      const isThirdPartyScript = resourceType === 'script' && !reqUrl.includes('studocu');
      const isTracker = [
        'doubleclick', 'googletagmanager', 'facebook.com', 'twitter.com', 'analytics',
        'gtm', 'hotjar', 'mixpanel', 'onetrust', 'cookielaw'
      ].some((needle) => reqUrl.includes(needle));
      const isTrackingBeacon = resourceType === 'other' && reqUrl.includes('/track/');
      if (isUnrelatedAsset || isThirdPartyScript || isTracker || isTrackingBeacon) {
        req.abort();
      } else {
        req.continue();
      }
    });

    // --- Navigation ---
    progressTracker?.updateProgress(5, 'navigating', 'Navigating to document...');
    console.log(`πŸ›‘οΈ Navigating to ${url} and preparing for Cloudflare challenge...`);
    try {
      await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 120000 });
      // Wait for an element expected only on the actual Studocu page, i.e. after
      // any interstitial challenge has redirected to the real document.
      console.log("⏳ Waiting for Cloudflare challenge to be solved...");
      progressTracker?.updateProgress(8, 'solving_cf', 'Solving Cloudflare challenge...');
      await page.waitForSelector('#search-input', { timeout: 90000 });
      console.log("βœ… Cloudflare challenge passed! You are on the Studocu page.");
      progressTracker?.updateProgress(10, 'navigation_complete', 'Successfully navigated to document');
    } catch (e) {
      console.error("❌ Failed to bypass Cloudflare or navigate to the page.", e.message);
      // The screenshot is a best-effort debugging aid: if it fails (e.g. the page
      // crashed) it must not replace the navigation error reported below.
      try {
        const screenshotPath = path.join(os.tmpdir(), `cloudflare_failure_${Date.now()}.png`);
        await page.screenshot({ path: screenshotPath, fullPage: true });
        console.log(`πŸ“Έ Screenshot saved to ${screenshotPath}`);
      } catch (screenshotError) {
        console.error("Failed to capture failure screenshot:", screenshotError.message);
      }
      throw new Error(
        "Could not bypass Cloudflare. The site may be actively blocking, or the page structure changed.",
        { cause: e }
      );
    }

    // --- Resume normal flow ---
    // Cookie/overlay handling is applied after landing on the actual page.
    await bypassCookiesAndRestrictions(page, progressTracker);
    if (options.email && options.password) {
      progressTracker?.updateProgress(12, 'authenticating', 'Logging into StuDocu...');
      // ... (Login logic is unchanged)
    }
    progressTracker?.updateProgress(40, 'loading', 'Page loaded, waiting for content...');
    await new Promise(resolve => setTimeout(resolve, 2000));
    await unblurContent(page, progressTracker);

    progressTracker?.updateProgress(45, 'loading', 'Waiting for document content...');
    console.log("⏳ Waiting for document content to load...");
    // Try each candidate selector in turn (10 s each); the first hit wins.
    const contentSelectors = [
      '.document-content', '.page-content', '[data-page]', '[data-testid*="document"]',
      'img[src*="document"]', 'img[src*="page"]', '.page', 'main img', 'article img'
    ];
    let contentFound = false;
    for (const selector of contentSelectors) {
      try {
        await page.waitForSelector(selector, { timeout: 10000 });
        console.log(`βœ… Found content with selector: ${selector}`);
        contentFound = true;
        break;
      } catch (e) {
        console.log(`❌ Selector ${selector} not found, trying next...`);
      }
    }
    if (!contentFound) {
      console.log("⚠️ No specific content selector found, proceeding with page content...");
    }

    progressTracker?.updateProgress(50, 'scrolling', 'Loading all document pages...');
    console.log("πŸ“œ Loading all document pages with enhanced slow scroll...");
    // Scroll down in 600px steps; repeat full passes until the page height stops
    // growing, then return to the top.
    await page.evaluate(async () => {
      const delay = (ms) => new Promise((res) => setTimeout(res, ms));
      let scrollHeight = document.body.scrollHeight;
      while (true) {
        let totalHeight = 0;
        const distance = 600;
        while (totalHeight < scrollHeight) {
          window.scrollBy(0, distance);
          totalHeight += distance;
          await delay(200);
        }
        await delay(1000);
        const newHeight = document.body.scrollHeight;
        if (newHeight === scrollHeight) break;
        scrollHeight = newHeight;
      }
      window.scrollTo({ top: 0, behavior: "smooth" });
      await delay(500);
    });

    progressTracker?.updateProgress(70, 'processing', 'Processing loaded content...');
    await unblurContent(page, progressTracker);

    progressTracker?.updateProgress(75, 'loading_images', 'Loading images...');
    console.log("πŸ–ΌοΈ Waiting for all images to load...");
    // Each pending image gets at most 5 s; load and error both count as settled.
    await page.evaluate(async () => {
      const images = Array.from(document.querySelectorAll('img'));
      await Promise.all(images.map(img => {
        if (img.complete) return Promise.resolve();
        return new Promise((resolve) => {
          img.addEventListener('load', resolve);
          img.addEventListener('error', resolve);
          setTimeout(resolve, 5000);
        });
      }));
    });
    await new Promise(resolve => setTimeout(resolve, 2000));

    progressTracker?.updateProgress(80, 'finalizing', 'Preparing document for PDF generation...');
    // NOTE(review): values ending in "!important" assigned through el.style.<prop>
    // are presumably rejected by the CSSOM, making these assignments no-ops. Left
    // untouched on purpose: forcing them would pin body overflow to hidden during
    // printing - confirm intent before changing.
    await page.evaluate(() => {
      const getDocumentHeight = () => Math.max(
        document.body.scrollHeight, document.body.offsetHeight,
        document.documentElement.clientHeight, document.documentElement.scrollHeight, document.documentElement.offsetHeight
      );
      const height = getDocumentHeight();
      document.body.style.height = `${height}px !important`;
      document.documentElement.style.height = `${height}px !important`;
      document.body.style.overflow = 'hidden !important';
    });

    // Sanity check only: a weak result is logged but does not stop PDF generation.
    const contentCheck = await page.evaluate(() => {
      const textContent = document.body.textContent || '';
      const images = document.querySelectorAll('img');
      const documentImages = Array.from(images).filter(img =>
        img.src.includes('document') || img.src.includes('page') ||
        img.alt.includes('document') || img.alt.includes('page')
      );
      return {
        totalText: textContent.length,
        totalImages: images.length,
        documentImages: documentImages.length,
        hasDocumentContent: documentImages.length > 0 || textContent.length > 1000
      };
    });
    console.log("πŸ“Š Content verification:", {
      textLength: contentCheck.totalText,
      images: contentCheck.totalImages,
      documentImages: contentCheck.documentImages,
      hasContent: contentCheck.hasDocumentContent
    });
    if (!contentCheck.hasDocumentContent) {
      console.warn("⚠️ Warning: Limited document content detected.");
    }

    await applyPrintStyles(page, progressTracker);
    await page.emulateMediaType('print');
    progressTracker?.updateProgress(90, 'generating', 'Generating PDF...');
    console.log("πŸ”„ Generating PDF...");
    const pdfBuffer = await page.pdf({
      printBackground: true,
      preferCSSPageSize: true,
      displayHeaderFooter: false,
      timeout: 60000,
      scale: 1,
      omitBackground: false
    });
    progressTracker?.updateProgress(100, 'completed', 'PDF generated successfully!');
    console.log(`βœ… PDF generated successfully! Size: ${(pdfBuffer.length / 1024 / 1024).toFixed(2)} MB`);
    return pdfBuffer;
  } catch (error) {
    progressTracker?.updateProgress(-1, 'error', error.message);
    console.error("❌ Error during PDF generation:", error);
    throw error;
  } finally {
    // Cleanup is best-effort: failures are logged and never override the outcome.
    if (browser) {
      console.log("πŸ”’ Closing browser...");
      try {
        await browser.close();
      } catch (e) {
        console.log("Error closing browser:", e.message);
      }
    }
    if (userDataDir) {
      console.log(`πŸ—‘οΈ Cleaning up temporary directory: ${userDataDir}`);
      try {
        await fs.rm(userDataDir, { recursive: true, force: true });
        console.log("βœ… Temporary directory cleaned up.");
      } catch (e) {
        console.error(`❌ Failed to clean up temporary directory ${userDataDir}:`, e.message);
      }
    }
  }
};
// --- API Routes, Health, and Info Endpoints ---

// How long a finished job (including its PDF buffer) stays downloadable before
// it is evicted from memory.
const DOWNLOAD_JOB_TTL_MS = 15 * 60 * 1000;

/**
 * Returns true only for http(s) URLs whose host is studocu.com or a subdomain of it.
 * A plain substring check would also accept e.g. "https://evil.example/?studocu.com"
 * and would throw on non-string input.
 */
const isStudocuUrl = (candidate) => {
  if (typeof candidate !== 'string') return false;
  try {
    const { protocol, hostname } = new URL(candidate);
    const isHttp = protocol === 'http:' || protocol === 'https:';
    return isHttp && (hostname === 'studocu.com' || hostname.endsWith('.studocu.com'));
  } catch {
    return false; // not parseable as an absolute URL
  }
};

/**
 * POST /api/request-download
 * Body: { url, email?, password? }
 *
 * Validates the URL, registers a new session, answers immediately with
 * { sessionId } and runs the download in the background. The outcome is stored
 * in `downloadJobs` and evicted after DOWNLOAD_JOB_TTL_MS.
 */
app.post('/api/request-download', (req, res) => {
  const { url, email, password } = req.body ?? {};
  if (!isStudocuUrl(url)) {
    return res.status(400).json({ error: 'Please provide a valid StuDocu URL.' });
  }
  // Random ids: timestamp-based ids collide for requests arriving in the same
  // millisecond and are trivially guessable by other clients.
  const sessionId = crypto.randomUUID();
  const progressTracker = new ProgressTracker(sessionId);
  progressTrackers.set(sessionId, progressTracker);
  downloadJobs.set(sessionId, { status: 'processing' });
  console.log(`🎯 Processing request for: ${url} [Session: ${sessionId}]`);
  res.json({ sessionId });

  // Records the final job state and schedules its eviction so finished PDFs do
  // not accumulate in memory forever. unref() keeps the timer from holding the
  // process open on shutdown.
  const finishJob = (job) => {
    downloadJobs.set(sessionId, job);
    progressTrackers.delete(sessionId);
    setTimeout(() => downloadJobs.delete(sessionId), DOWNLOAD_JOB_TTL_MS).unref();
  };

  studocuDownloader(url, { email, password }, progressTracker)
    .then(pdfBuffer => finishJob({ status: 'completed', buffer: pdfBuffer }))
    .catch(error => finishJob({ status: 'error', message: error.message }));
});
/**
 * GET /api/progress/:sessionId
 * Reports the live tracker state while a job runs; once the tracker is gone it
 * falls back to the stored job outcome. Unknown sessions get a 404.
 */
app.get('/api/progress/:sessionId', (req, res) => {
  const sessionId = req.params.sessionId;

  const liveTracker = progressTrackers.get(sessionId);
  if (liveTracker) {
    const { progress, status, message } = liveTracker;
    return res.json({ sessionId, progress, status, message, timestamp: new Date().toISOString() });
  }

  const storedJob = downloadJobs.get(sessionId);
  switch (storedJob?.status) {
    case 'completed':
      return res.json({ sessionId, progress: 100, status: 'completed', message: 'PDF generated successfully!' });
    case 'error':
      return res.json({ sessionId, progress: -1, status: 'error', message: storedJob.message });
    default:
      return res.status(404).json({ error: 'Session not found' });
  }
});
/**
 * GET /api/download/:sessionId
 * Sends the generated PDF as an attachment when the job has completed;
 * otherwise answers with 404 (unknown), 400 (still processing) or 500 (failed).
 */
app.get('/api/download/:sessionId', (req, res) => {
  const job = downloadJobs.get(req.params.sessionId);
  if (!job) {
    return res.status(404).json({ error: 'Download session not found or expired.' });
  }

  const { status } = job;
  if (status === 'processing') {
    return res.status(400).json({ error: 'Download is still processing.' });
  }
  if (status === 'error') {
    return res.status(500).json({ error: `Failed to generate PDF: ${job.message}` });
  }

  const hasPdf = status === 'completed' && job.buffer;
  if (!hasPdf) {
    res.status(500).json({ error: 'An unknown error occurred.' });
    return;
  }
  res.setHeader('Content-Type', 'application/pdf');
  res.setHeader('Content-Disposition', 'attachment; filename=studocu-document.pdf');
  res.send(job.buffer);
});
/**
 * GET /health
 * Liveness probe: current time, process uptime and number of running jobs.
 */
app.get('/health', (req, res) => {
  const timestamp = new Date().toISOString();
  const uptime = process.uptime();
  const activeDownloads = progressTrackers.size;
  res.json({ status: 'healthy', timestamp, uptime, activeDownloads });
});
/**
 * GET /
 * Static service description: banner message, version, feature list and
 * a summary of the available endpoints.
 */
app.get('/', (req, res) => {
  const features = [
    'πŸ›‘οΈ Cloudflare JS Challenge Bypass',
    'πŸͺ Advanced cookie banner bypass',
    'πŸ”“ Premium content unblurring',
    'πŸ”‘ Login support for full access',
    'πŸ“Š Real-time progress tracking via polling',
    'πŸ“„ Clean PDF generation with print styles',
    'πŸ•΅οΈ Enhanced stealth to evade bot detection'
  ];
  const endpoints = {
    request: 'POST /api/request-download (body: {url, filename?, email?, password?})',
    progress: 'GET /api/progress/:sessionId',
    download: 'GET /api/download/:sessionId',
    health: 'GET /health'
  };
  res.json({
    message: 'πŸš€ Enhanced StuDocu Downloader API v5.3 - Real-time Progress Tracking with Cloudflare Bypass',
    version: '5.3.0',
    features,
    endpoints
  });
});
// On SIGTERM or SIGINT: log the signal name and exit with status 0.
for (const signalName of ['SIGTERM', 'SIGINT']) {
  process.on(signalName, () => {
    console.log(`${signalName} received, shutting down gracefully...`);
    process.exit(0);
  });
}
// Start accepting connections and print the startup banner once listening.
app.listen(port, function onListening() {
  const banner = [
    `πŸš€ Enhanced StuDocu Downloader v5.3.0 running on http://localhost:${port}`,
    `✨ Features: Cloudflare Bypass, Real-time progress tracking, enhanced stealth, and user feedback`,
  ];
  for (const line of banner) {
    console.log(line);
  }
});