import { chromium } from "playwright";
import { isAllowedUrl } from "./url-policy.js";

/** Maximum number of renderUrl() calls allowed to run at the same time. */
const MAX_CONCURRENT = 3;
/** Maximum number of characters of page text returned in `content`. */
const MAX_CONTENT_LENGTH = 100_000;

const USER_AGENT =
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";

// Only http(s) URLs hit the network; data:, blob:, about: etc. do not.
const NETWORK_SCHEME = /^https?:/i;

// Content-stabilisation polling: up to 20 samples, 250 ms apart.
const STABLE_POLL_LIMIT = 20;
const STABLE_POLL_INTERVAL_MS = 250;
const STABLE_MIN_LENGTH = 100;
const STABLE_REPEATS = 2;

// Lazy-load scrolling: 3 steps of 1000 px, 500 ms apart.
const SCROLL_STEPS = 3;
const SCROLL_STEP_PX = 1000;
const SCROLL_PAUSE_MS = 500;

/** Cached promise for the shared browser; null when none is running. */
let browserPromise = null;
/** Number of renderUrl() calls currently in flight. */
let activeRenders = 0;

/**
 * Returns the shared Chromium instance, launching it on first use.
 *
 * The cache is cleared when the launch fails and when the browser later
 * disconnects (crash or close), so the next call starts a fresh browser
 * instead of handing out a dead one.
 *
 * @returns {Promise<import("playwright").Browser>}
 */
async function getBrowser() {
  if (!browserPromise) {
    const launching = chromium.launch({ headless: true }).then(
      (browser) => {
        browser.on("disconnected", () => {
          // Only clear the cache if it still refers to this browser.
          if (browserPromise === launching) {
            browserPromise = null;
          }
        });
        return browser;
      },
      (err) => {
        if (browserPromise === launching) {
          browserPromise = null;
        }
        throw err;
      },
    );
    browserPromise = launching;
  }
  return browserPromise;
}

/**
 * Installs a route handler that aborts http(s) requests rejected by
 * isAllowedUrl and lets everything else through.
 *
 * Playwright invokes route handlers only for the first URL of a redirect
 * chain, so this alone does not stop a server-side redirect to a blocked
 * address; renderUrl re-checks the final URL after navigation for that.
 */
async function installRequestGuard(page) {
  await page.route("**", (route) => {
    const reqUrl = route.request().url();
    if (NETWORK_SCHEME.test(reqUrl) && !isAllowedUrl(reqUrl)) {
      return route.abort("blockedbyclient");
    }
    return route.continue();
  });
}

/** Polls the body text length until it stops changing or the poll limit is hit. */
async function waitForStableContent(page) {
  let prevLen = 0;
  let stableCount = 0;
  for (let i = 0; i < STABLE_POLL_LIMIT; i++) {
    await page.waitForTimeout(STABLE_POLL_INTERVAL_MS);
    // document.body is null for documents without a body (e.g. XML/SVG).
    const curLen = await page.evaluate(
      () => document.body?.innerText.length ?? 0,
    );
    if (curLen === prevLen && curLen > STABLE_MIN_LENGTH) {
      stableCount++;
      if (stableCount >= STABLE_REPEATS) {
        break;
      }
    } else {
      stableCount = 0;
    }
    prevLen = curLen;
  }
}

/** Scrolls down in steps to trigger lazy-loaded content. */
async function scrollForLazyContent(page) {
  for (let i = 0; i < SCROLL_STEPS; i++) {
    await page.evaluate((px) => window.scrollBy(0, px), SCROLL_STEP_PX);
    await page.waitForTimeout(SCROLL_PAUSE_MS);
  }
  await page.waitForTimeout(SCROLL_PAUSE_MS);
}

/** Strips non-content elements and collects title, description, headings, links and text. */
function extractPageData(page) {
  return page.evaluate(() => {
    const remove = document.querySelectorAll(
      'script, style, nav[aria-label="Footer"], [role="complementary"]',
    );
    remove.forEach((el) => el.remove());

    const title = document.title;
    const description =
      document.querySelector('meta[name="description"]')?.content ||
      document.querySelector('meta[property="og:description"]')?.content ||
      "";
    const headings = Array.from(
      document.querySelectorAll("h1, h2, h3"),
      (el) => ({ level: el.tagName, text: el.innerText.trim() }),
    ).filter((h) => h.text.length > 0);
    const links = Array.from(
      document.querySelectorAll("a[href]"),
      (el) => ({ text: el.innerText.trim(), href: el.href }),
    )
      .filter((l) => l.text.length > 0 && l.href.startsWith("http"))
      .slice(0, 50);
    const content = document.body?.innerText ?? "";

    return { title, description, headings, links, content };
  });
}

/**
 * Renders a URL in headless Chromium and extracts its readable content.
 *
 * @param {string} url - Page to load.
 * @param {object} [options]
 * @param {number} [options.timeout=30000] - Navigation timeout in milliseconds.
 * @param {boolean} [options.scroll=true] - Scroll the page to trigger lazy loading.
 * @returns {Promise<{title: string, description: string,
 *   headings: Array<{level: string, text: string}>,
 *   links: Array<{text: string, href: string}>,
 *   content: string, url: string, renderedAt: string}>}
 *   `content` is truncated to MAX_CONTENT_LENGTH characters; `url` is the
 *   URL that was passed in.
 * @throws {Error} When MAX_CONCURRENT renders are already running, when the
 *   browser or page cannot be created, when navigation ends on a URL rejected
 *   by isAllowedUrl, or when any page interaction fails.
 */
export async function renderUrl(url, { timeout = 30000, scroll = true } = {}) {
  if (activeRenders >= MAX_CONCURRENT) {
    throw new Error("Too many concurrent renders, try again later");
  }
  activeRenders++;

  let page;
  try {
    const browser = await getBrowser();
    // Route handlers do not see service-worker requests, so block service
    // workers to keep every request going through the guard.
    page = await browser.newPage({ serviceWorkers: "block" });
  } catch (err) {
    activeRenders--;
    // A dead browser has already been dropped from the cache by getBrowser.
    // If one is still cached and nobody else is using it, close it rather
    // than orphaning the process; the next call relaunches.
    if (activeRenders === 0) {
      await closeBrowser();
    }
    throw new Error(`Browser launch failed: ${err.message}`, { cause: err });
  }

  try {
    await page.setExtraHTTPHeaders({ "User-Agent": USER_AGENT });
    await installRequestGuard(page);

    await page.goto(url, { waitUntil: "domcontentloaded", timeout });

    // Server-side redirects are not re-checked by the route handler, so
    // refuse to return content if navigation ended on a disallowed URL.
    const finalUrl = page.url();
    if (NETWORK_SCHEME.test(finalUrl) && !isAllowedUrl(finalUrl)) {
      throw new Error(`Navigation redirected to a disallowed URL: ${finalUrl}`);
    }

    await waitForStableContent(page);
    if (scroll) {
      await scrollForLazyContent(page);
    }

    const extracted = await extractPageData(page);
    const content =
      extracted.content.length > MAX_CONTENT_LENGTH
        ? extracted.content.slice(0, MAX_CONTENT_LENGTH)
        : extracted.content;

    return {
      title: extracted.title,
      description: extracted.description,
      headings: extracted.headings,
      links: extracted.links,
      content,
      url,
      renderedAt: new Date().toISOString(),
    };
  } finally {
    activeRenders--;
    // Best-effort: the page may already be gone if the browser died.
    await page.close().catch(() => {});
  }
}

/**
 * Closes the shared browser, if any. Safe to call when none is running.
 *
 * The cache is detached before awaiting so that callers arriving during
 * shutdown launch a fresh browser instead of receiving the closing one, and
 * so that a browser launched in the meantime is not discarded.
 *
 * @returns {Promise<void>}
 */
export async function closeBrowser() {
  const pending = browserPromise;
  if (!pending) {
    return;
  }
  browserPromise = null;
  const browser = await pending.catch(() => null);
  if (browser) {
    await browser.close().catch(() => {});
  }
}