tantk committed on
Commit
c862a09
·
1 Parent(s): 543c521

feat: structured output with description, headings, links

Browse files
Files changed (1) hide show
  1. renderer.js +31 -6
renderer.js CHANGED
@@ -63,20 +63,45 @@ export async function renderUrl(url, { timeout = 30000, scroll = true } = {}) {
63
  await page.waitForTimeout(500);
64
  }
65
 
66
- const title = await page.title();
67
- let content = await page.evaluate(() => {
68
  const remove = document.querySelectorAll(
69
  'script, style, nav[aria-label="Footer"], [role="complementary"]',
70
  );
71
  remove.forEach((el) => el.remove());
72
- return document.body.innerText;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  });
74
 
75
- if (content.length > MAX_CONTENT_LENGTH) {
76
- content = content.slice(0, MAX_CONTENT_LENGTH);
77
  }
78
 
79
- return { title, content, url, renderedAt: new Date().toISOString() };
 
 
 
 
 
 
 
 
80
  } catch (err) {
81
  // If page interaction fails, browser may be dead
82
  if (err.message?.includes("Target closed") || err.message?.includes("Browser closed")) {
 
63
  await page.waitForTimeout(500);
64
  }
65
 
66
+ const extracted = await page.evaluate(() => {
 
67
  const remove = document.querySelectorAll(
68
  'script, style, nav[aria-label="Footer"], [role="complementary"]',
69
  );
70
  remove.forEach((el) => el.remove());
71
+
72
+ const title = document.title;
73
+ const description =
74
+ document.querySelector('meta[name="description"]')?.content ||
75
+ document.querySelector('meta[property="og:description"]')?.content ||
76
+ "";
77
+ const headings = Array.from(
78
+ document.querySelectorAll("h1, h2, h3"),
79
+ (el) => ({ level: el.tagName, text: el.innerText.trim() }),
80
+ ).filter((h) => h.text.length > 0);
81
+ const links = Array.from(
82
+ document.querySelectorAll("a[href]"),
83
+ (el) => ({ text: el.innerText.trim(), href: el.href }),
84
+ )
85
+ .filter((l) => l.text.length > 0 && l.href.startsWith("http"))
86
+ .slice(0, 50);
87
+ const content = document.body.innerText;
88
+
89
+ return { title, description, headings, links, content };
90
  });
91
 
92
+ if (extracted.content.length > MAX_CONTENT_LENGTH) {
93
+ extracted.content = extracted.content.slice(0, MAX_CONTENT_LENGTH);
94
  }
95
 
96
+ return {
97
+ title: extracted.title,
98
+ description: extracted.description,
99
+ headings: extracted.headings,
100
+ links: extracted.links,
101
+ content: extracted.content,
102
+ url,
103
+ renderedAt: new Date().toISOString(),
104
+ };
105
  } catch (err) {
106
  // If page interaction fails, browser may be dead
107
  if (err.message?.includes("Target closed") || err.message?.includes("Browser closed")) {