// Page-scrape residue, not part of the module: "Spaces: Running", file size 4,285 bytes, ref dcf8b6b.
import { chromium } from "playwright";
import { isAllowedUrl } from "./url-policy.js";
// --- Mutable module state -------------------------------------------------

// Count of renderUrl calls currently in flight; checked against MAX_CONCURRENT.
let activeRenders = 0;

// Cached promise for the shared browser instance. It is null until the first
// launch and is set back to null when the browser is closed or found unusable.
let browserPromise = null;

// --- Limits -----------------------------------------------------------------

// Maximum length of the `content` string returned by renderUrl; longer page
// text is truncated to this many UTF-16 code units.
const MAX_CONTENT_LENGTH = 100_000;

// renderUrl refuses new work once this many renders are already running.
const MAX_CONCURRENT = 3;
/**
 * Returns the shared headless Chromium instance, launching it on first use.
 *
 * The launch promise is cached in `browserPromise` so concurrent callers share
 * a single launch. The cache is cleared again when the launch fails or when
 * the browser later disconnects (crash or close), so the next call relaunches
 * instead of handing out a dead instance.
 *
 * @returns {Promise<import("playwright").Browser>} The connected browser.
 * @throws Rejects with the launch error when Chromium cannot be started.
 */
async function getBrowser() {
  if (!browserPromise) {
    // Both callbacks run asynchronously, after `launching` has been assigned,
    // so they can compare against it to avoid clearing a newer cached launch.
    const launching = chromium
      .launch({ headless: true })
      .then((browser) => {
        browser.on("disconnected", () => {
          if (browserPromise === launching) {
            browserPromise = null;
          }
        });
        return browser;
      })
      .catch((err) => {
        if (browserPromise === launching) {
          browserPromise = null;
        }
        throw err;
      });
    browserPromise = launching;
  }
  return browserPromise;
}
/**
 * Renders a URL in the shared headless browser and extracts its text content.
 *
 * Every http(s) request the page makes (navigation, redirects, subresources)
 * is checked with `isAllowedUrl` and aborted when the check fails.
 *
 * @param {string} url - Page to load.
 * @param {object} [options]
 * @param {number} [options.timeout=30000] - Navigation timeout in milliseconds,
 *   passed to `page.goto`.
 * @param {boolean} [options.scroll=true] - Scroll the page after load to
 *   trigger lazy-loaded content.
 * @returns {Promise<{title: string, description: string,
 *   headings: Array<{level: string, text: string}>,
 *   links: Array<{text: string, href: string}>,
 *   content: string, url: string, renderedAt: string}>}
 *   Extracted page data; `content` is capped at MAX_CONTENT_LENGTH, `links` at
 *   50 entries, `url` is the URL passed in, `renderedAt` is an ISO timestamp.
 * @throws {Error} "Too many concurrent renders, try again later" when
 *   MAX_CONCURRENT renders are already running.
 * @throws {Error} "Browser launch failed: ..." (original error in `cause`)
 *   when the browser or a new page cannot be obtained.
 * @throws Any error raised while navigating or evaluating in the page.
 */
export async function renderUrl(url, { timeout = 30000, scroll = true } = {}) {
  if (activeRenders >= MAX_CONCURRENT) {
    throw new Error("Too many concurrent renders, try again later");
  }
  activeRenders++;

  let browser;
  let page;
  try {
    browser = await getBrowser();
    page = await browser.newPage();
  } catch (err) {
    activeRenders--;
    // Drop the cached browser only when there is none or it is really gone.
    // Discarding a still-connected instance would orphan its process, because
    // nothing would ever close it.
    if (!browser || !browser.isConnected()) {
      browserPromise = null;
    }
    throw new Error(`Browser launch failed: ${err.message}`, { cause: err });
  }

  try {
    await page.setExtraHTTPHeaders({
      "User-Agent":
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    });

    // Block requests to disallowed addresses. Routing every request (main
    // navigation, iframes, fetch/XHR, images) also catches redirects that
    // would bypass a check done only on the entry-point URL.
    await page.route("**", (route) => {
      const requestUrl = route.request().url();
      // data:, blob:, about: and similar schemes never hit the network.
      if (!/^https?:/i.test(requestUrl)) {
        return route.continue();
      }
      let allowed;
      try {
        allowed = isAllowedUrl(requestUrl);
      } catch {
        // Fail closed: if the policy check itself throws, block the request
        // instead of leaving the route undecided.
        allowed = false;
      }
      return allowed ? route.continue() : route.abort("blockedbyclient");
    });

    await page.goto(url, { waitUntil: "domcontentloaded", timeout });
    await waitForStableContent(page);
    if (scroll) {
      await scrollForLazyContent(page);
    }

    const extracted = await extractPageData(page);
    return {
      ...extracted,
      content: extracted.content.slice(0, MAX_CONTENT_LENGTH),
      url,
      renderedAt: new Date().toISOString(),
    };
  } catch (err) {
    // Ask the browser whether it is still alive rather than matching error
    // message text, whose wording is not a stable API. This also avoids
    // discarding a healthy browser when only the page was closed.
    if (!browser.isConnected()) {
      browserPromise = null;
    }
    throw err;
  } finally {
    activeRenders--;
    await page.close().catch(() => {});
  }
}

/**
 * Polls the visible text length every 250 ms (at most 20 times) and returns
 * once it is above 100 characters and unchanged for two consecutive polls.
 */
async function waitForStableContent(page) {
  let previousLength = 0;
  let stablePolls = 0;
  for (let attempt = 0; attempt < 20; attempt++) {
    await page.waitForTimeout(250);
    // document.body can be null (a document without a <body> element).
    const currentLength = await page.evaluate(
      () => document.body?.innerText.length ?? 0,
    );
    if (currentLength === previousLength && currentLength > 100) {
      stablePolls++;
      if (stablePolls >= 2) break;
    } else {
      stablePolls = 0;
    }
    previousLength = currentLength;
  }
}

/** Scrolls down three times, 1000px each, pausing so lazy content can load. */
async function scrollForLazyContent(page) {
  for (let step = 0; step < 3; step++) {
    await page.evaluate(() => window.scrollBy(0, 1000));
    await page.waitForTimeout(500);
  }
  await page.waitForTimeout(500);
}

/**
 * Runs in the page: strips non-content elements, then collects the title,
 * description, h1-h3 headings, up to 50 http(s) links and the body text.
 */
async function extractPageData(page) {
  return page.evaluate(() => {
    document
      .querySelectorAll(
        'script, style, nav[aria-label="Footer"], [role="complementary"]',
      )
      .forEach((el) => el.remove());

    // SVG elements have no innerText; fall back to textContent for them.
    const textOf = (el) => (el.innerText ?? el.textContent ?? "").trim();

    // HTML anchors expose href as an absolute string. SVG <a> elements expose
    // an SVGAnimatedString whose baseVal is the raw attribute value.
    const hrefOf = (el) => {
      if (typeof el.href === "string") return el.href;
      try {
        return new URL(el.href?.baseVal ?? "", document.baseURI).href;
      } catch {
        return "";
      }
    };

    const description =
      document.querySelector('meta[name="description"]')?.content ||
      document.querySelector('meta[property="og:description"]')?.content ||
      "";

    const headings = Array.from(
      document.querySelectorAll("h1, h2, h3"),
      (el) => ({ level: el.tagName, text: textOf(el) }),
    ).filter((heading) => heading.text.length > 0);

    const links = Array.from(document.querySelectorAll("a[href]"), (el) => ({
      text: textOf(el),
      href: hrefOf(el),
    }))
      .filter((link) => link.text.length > 0 && link.href.startsWith("http"))
      .slice(0, 50);

    return {
      title: document.title,
      description,
      headings,
      links,
      content: document.body?.innerText ?? "",
    };
  });
}
/**
 * Closes the shared browser, if one has been launched, and clears the cache.
 *
 * The cached promise is detached before anything is awaited. A caller that
 * asks for a browser while this one is shutting down therefore triggers a
 * fresh launch instead of receiving the instance being closed, and a launch
 * cached in the meantime is not wiped afterwards.
 *
 * Launch and close errors are ignored on purpose: shutdown is best-effort.
 *
 * @returns {Promise<void>}
 */
export async function closeBrowser() {
  const pending = browserPromise;
  if (!pending) return;
  browserPromise = null;

  // A rejected launch resolves to null here, leaving nothing to close.
  const browser = await pending.catch(() => null);
  if (browser) {
    await browser.close().catch(() => {});
  }
}
// (Trailing table-cell residue from the page scrape removed.)