// duebot-test/src/services/scraper.service.js
// Ali00922's picture
// Upload 12 files
// c4be319 verified
const puppeteer = require('puppeteer');
const { decrypt } = require('../utils/crypto');
/**
 * Scrapes the Moodle LMS dashboard timeline for upcoming deadlines.
 *
 * Flow: decrypt the stored password, launch headless Chrome, submit the Moodle
 * login form, wait for the dashboard Timeline block, and read every timeline
 * entry out of the DOM.
 *
 * This function never rejects: any failure (decryption, browser launch, login,
 * timeout, suspected outage) is logged and reported as an empty array, and the
 * browser is always closed before returning.
 * NOTE(review): because failures also resolve to `[]`, callers cannot tell
 * "no deadlines" apart from "scrape failed"; the return contract is kept as-is
 * for backward compatibility.
 *
 * @param {string} username - The plaintext LMS username.
 * @param {string} encryptedPassword - The encrypted password from the DB.
 * @param {string} iv - The initialization vector for decryption.
 * @returns {Promise<Array<{courseName: string, assignmentTitle: string, dateString: string, rawHtml: (string|undefined)}>>}
 *   Raw deadline objects; `rawHtml` is only populated on the first entry (debug aid).
 */
async function scrapeDeadlines(username, encryptedPassword, iv) {
  const DASHBOARD_URL = 'https://lms.nust.edu.pk/portal/my/';
  const TIMELINE_ITEM_SELECTOR = '[data-region="event-list-item"]';
  const BLOCKED_RESOURCE_TYPES = ['image', 'stylesheet', 'font', 'media'];

  let browser;
  try {
    // Decrypt inside the try so a bad key/IV is reported like any other failure.
    const plaintextPassword = decrypt(encryptedPassword, iv);

    console.log(`[Scraper] Launching browser for user: ${username}...`);
    browser = await puppeteer.launch({
      headless: 'new',
      executablePath: process.env.CHROME_BIN || null,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu'
      ]
    });
    const page = await browser.newPage();

    // Block heavy assets: the scraper only needs the DOM, not images/CSS/fonts/media.
    await page.setRequestInterception(true);
    page.on('request', async (req) => {
      try {
        if (BLOCKED_RESOURCE_TYPES.includes(req.resourceType())) {
          await req.abort();
        } else {
          await req.continue();
        }
      } catch (interceptErr) {
        // Best-effort: abort()/continue() can reject when the page or browser
        // goes away mid-request. Swallowing here prevents an unhandled promise
        // rejection from taking down the whole process.
      }
    });

    // Timeouts so that no navigation or wait can hang indefinitely.
    page.setDefaultNavigationTimeout(45000);
    page.setDefaultTimeout(45000);

    // Visiting /my/ while logged out redirects to the login screen.
    await page.goto(DASHBOARD_URL, { waitUntil: 'networkidle2' });

    // Wait for the login fields to appear (standard Moodle selectors).
    await page.waitForSelector('#username', { timeout: 10000 });
    await page.waitForSelector('#password', { timeout: 10000 });

    await page.type('#username', username);
    await page.type('#password', plaintextPassword);
    console.log(`[Scraper] Credentials entered for ${username}. Logging in...`);

    // Start waiting for the navigation before clicking so it cannot be missed.
    await Promise.all([
      page.waitForNavigation({ waitUntil: 'networkidle2' }),
      page.click('#loginbtn') // the standard Moodle login button ID
    ]);

    // If the login button is still on the page we were bounced back to the login
    // form (presumably wrong credentials). Fail fast with an accurate message
    // instead of waiting for a timeline that will never appear.
    const loginButtonAfterSubmit = await page.$('#loginbtn');
    if (loginButtonAfterSubmit !== null) {
      throw new Error('Login failed: the login form is still displayed after submitting credentials.');
    }
    console.log(`[Scraper] Successfully logged into dashboard for ${username}. Waiting for timeline...`);

    // Wait for at least one timeline entry (data-region="event-list-item").
    try {
      await page.waitForSelector(TIMELINE_ITEM_SELECTOR, { timeout: 15000 });
    } catch (timelineErr) {
      // No entries showed up: distinguish a genuinely empty timeline from a
      // page that failed to load, using the empty-state text Moodle renders.
      const bodyText = await page.evaluate(() => document.body.innerText);
      if (bodyText.includes("No upcoming activities due") || bodyText.includes("No activities require action")) {
        console.log(`[Scraper] Verified officially empty timeline for ${username}.`);
        return []; // Safe to return empty: the student has no assignments.
      }
      throw new Error("Timeline failed to load completely. Suspected Moodle outage. Aborting scrape.");
    }

    // Runs inside the browser page, so it must not reference anything from the
    // Node-side scope (the selector is repeated literally for that reason).
    const deadlines = await page.evaluate(() => {
      const results = [];
      const eventItems = document.querySelectorAll('[data-region="event-list-item"]');
      for (const item of eventItems) {
        try {
          // Course name: first truncated text element inside the entry.
          const courseEl = item.querySelector('.text-truncate');
          const courseName = courseEl ? courseEl.innerText.trim() : 'Unknown Course';

          // Assignment title: the link inside the entry.
          const titleEl = item.querySelector('.text-truncate a') || item.querySelector('.event-name-container a');
          const assignmentTitle = titleEl ? titleEl.innerText.trim() : 'Unknown Assignment';

          // Date: entries are grouped under a parent element that is preceded
          // by an <h5> holding the date for the whole group.
          let dateHeader = null;
          const parent = item.parentElement;
          if (parent && parent.previousElementSibling && parent.previousElementSibling.tagName === 'H5') {
            dateHeader = parent.previousElementSibling.innerText.trim();
          }

          // Time of day lives inside the entry itself.
          const timeEl = item.querySelector('.text-right') || item.querySelector('.date');
          const timeString = timeEl ? timeEl.innerText.trim() : '';

          const dateString = dateHeader && timeString ? `${dateHeader}, ${timeString}` : timeString;

          if (assignmentTitle && dateString) {
            results.push({
              courseName,
              assignmentTitle,
              dateString,
              // Debug aid: keep raw HTML for the first entry only, to avoid
              // shipping large strings back for every item.
              rawHtml: results.length === 0 ? item.innerHTML : undefined
            });
          }
        } catch (err) {
          // Deliberately skip malformed entries so one bad item does not
          // discard the rest of the timeline.
        }
      }
      return results;
    });

    console.log(`[Scraper] Extracted ${deadlines.length} raw deadlines for ${username}.`);
    return deadlines;
  } catch (error) {
    console.error(`[Scraper] Error scraping for ${username}:`, error.message);
    return [];
  } finally {
    if (browser) {
      try {
        await browser.close();
        console.log(`[Scraper] Browser closed for ${username}.`);
      } catch (closeErr) {
        console.error(`[Scraper] Error closing browser gracefully for ${username}, forcefully killing process...`, closeErr.message);
        // Last resort so a wedged Chrome process is not leaked.
        const chromeProcess = browser.process();
        if (chromeProcess != null) {
          chromeProcess.kill('SIGKILL');
        }
      }
    }
  }
}
// Public API of this module: only the scraper entry point is exported.
module.exports = { scrapeDeadlines };