Spaces:
Sleeping
Sleeping
feat: structured output with description, headings, links
Browse files
- renderer.js +31 -6
renderer.js
CHANGED
|
@@ -63,20 +63,45 @@ export async function renderUrl(url, { timeout = 30000, scroll = true } = {}) {
|
|
| 63 |
await page.waitForTimeout(500);
|
| 64 |
}
|
| 65 |
|
| 66 |
-
const
|
| 67 |
-
let content = await page.evaluate(() => {
|
| 68 |
const remove = document.querySelectorAll(
|
| 69 |
'script, style, nav[aria-label="Footer"], [role="complementary"]',
|
| 70 |
);
|
| 71 |
remove.forEach((el) => el.remove());
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
});
|
| 74 |
|
| 75 |
-
if (content.length > MAX_CONTENT_LENGTH) {
|
| 76 |
-
content = content.slice(0, MAX_CONTENT_LENGTH);
|
| 77 |
}
|
| 78 |
|
| 79 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
} catch (err) {
|
| 81 |
// If page interaction fails, browser may be dead
|
| 82 |
if (err.message?.includes("Target closed") || err.message?.includes("Browser closed")) {
|
|
|
|
| 63 |
await page.waitForTimeout(500);
|
| 64 |
}
|
| 65 |
|
| 66 |
+
const extracted = await page.evaluate(() => {
|
|
|
|
| 67 |
const remove = document.querySelectorAll(
|
| 68 |
'script, style, nav[aria-label="Footer"], [role="complementary"]',
|
| 69 |
);
|
| 70 |
remove.forEach((el) => el.remove());
|
| 71 |
+
|
| 72 |
+
const title = document.title;
|
| 73 |
+
const description =
|
| 74 |
+
document.querySelector('meta[name="description"]')?.content ||
|
| 75 |
+
document.querySelector('meta[property="og:description"]')?.content ||
|
| 76 |
+
"";
|
| 77 |
+
const headings = Array.from(
|
| 78 |
+
document.querySelectorAll("h1, h2, h3"),
|
| 79 |
+
(el) => ({ level: el.tagName, text: el.innerText.trim() }),
|
| 80 |
+
).filter((h) => h.text.length > 0);
|
| 81 |
+
const links = Array.from(
|
| 82 |
+
document.querySelectorAll("a[href]"),
|
| 83 |
+
(el) => ({ text: el.innerText.trim(), href: el.href }),
|
| 84 |
+
)
|
| 85 |
+
.filter((l) => l.text.length > 0 && l.href.startsWith("http"))
|
| 86 |
+
.slice(0, 50);
|
| 87 |
+
const content = document.body.innerText;
|
| 88 |
+
|
| 89 |
+
return { title, description, headings, links, content };
|
| 90 |
});
|
| 91 |
|
| 92 |
+
if (extracted.content.length > MAX_CONTENT_LENGTH) {
|
| 93 |
+
extracted.content = extracted.content.slice(0, MAX_CONTENT_LENGTH);
|
| 94 |
}
|
| 95 |
|
| 96 |
+
return {
|
| 97 |
+
title: extracted.title,
|
| 98 |
+
description: extracted.description,
|
| 99 |
+
headings: extracted.headings,
|
| 100 |
+
links: extracted.links,
|
| 101 |
+
content: extracted.content,
|
| 102 |
+
url,
|
| 103 |
+
renderedAt: new Date().toISOString(),
|
| 104 |
+
};
|
| 105 |
} catch (err) {
|
| 106 |
// If page interaction fails, browser may be dead
|
| 107 |
if (err.message?.includes("Target closed") || err.message?.includes("Browser closed")) {
|