File size: 4,285 Bytes
dcf8b6b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import { chromium } from "playwright";
import { isAllowedUrl } from "./url-policy.js";

// Upper bound on renders in flight at once; renderUrl throws when a call
// arrives while this many are already running.
const MAX_CONCURRENT = 3;
// renderUrl cuts the extracted page text down to at most this many characters.
const MAX_CONTENT_LENGTH = 100_000;
// Cached promise for the shared Chromium instance. null until the first launch,
// and reset to null after a launch/browser failure or closeBrowser().
let browserPromise = null;
// Number of renderUrl calls currently holding a concurrency slot.
let activeRenders = 0;

/**
 * Returns the shared headless Chromium instance, launching it on first use.
 *
 * The launch promise is cached in `browserPromise` so concurrent callers share
 * one launch. The cache is cleared when the launch fails or when the browser
 * later disconnects (crash or close), so the next call starts a fresh browser
 * instead of handing out a dead one.
 *
 * @returns {Promise<import("playwright").Browser>} The connected browser.
 * @throws Rejects with the launch error if Chromium cannot be started.
 */
async function getBrowser() {
  if (!browserPromise) {
    const launching = chromium
      .launch({ headless: true })
      .then((browser) => {
        browser.on("disconnected", () => {
          // Only clear the cache if it still points at this launch; a newer
          // launch may already have replaced it.
          if (browserPromise === launching) {
            browserPromise = null;
          }
        });
        return browser;
      })
      .catch((err) => {
        if (browserPromise === launching) {
          browserPromise = null;
        }
        throw err;
      });
    browserPromise = launching;
  }
  return browserPromise;
}

/**
 * Registers a catch-all route on `page` that vets every http(s) request
 * (main navigation, redirects, iframes, fetch/XHR, images) with isAllowedUrl.
 *
 * Non-network schemes (data:, blob:, about:) pass straight through. A URL the
 * policy rejects is aborted with "blockedbyclient". If the policy check itself
 * throws, the request is aborted too, so an error never becomes an allow.
 */
async function installUrlGuard(page) {
  await page.route("**", (route) => {
    const reqUrl = route.request().url();
    if (!/^https?:/i.test(reqUrl)) {
      return route.continue();
    }
    let allowed = false;
    try {
      allowed = isAllowedUrl(reqUrl);
    } catch {
      // Fail closed: `allowed` stays false and the request is aborted below.
    }
    return allowed ? route.continue() : route.abort("blockedbyclient");
  });
}

/**
 * Polls the body text length every 250 ms (at most 20 times) and returns once
 * two consecutive polls match the poll before them and exceed 100 characters.
 * A document without a body counts as length 0.
 */
async function waitForStableContent(page) {
  let prevLen = 0;
  let stableCount = 0;
  for (let i = 0; i < 20; i++) {
    await page.waitForTimeout(250);
    const curLen = await page.evaluate(
      () => document.body?.innerText.length ?? 0,
    );
    if (curLen === prevLen && curLen > 100) {
      stableCount++;
      if (stableCount >= 2) break;
    } else {
      stableCount = 0;
    }
    prevLen = curLen;
  }
}

/** Scrolls down three times, 1000px each with a 500 ms pause, to trigger lazy-loaded content. */
async function scrollForLazyContent(page) {
  for (let i = 0; i < 3; i++) {
    await page.evaluate(() => window.scrollBy(0, 1000));
    await page.waitForTimeout(500);
  }
  await page.waitForTimeout(500);
}

/**
 * Runs in the page: strips script/style/footer-nav/complementary nodes, then
 * collects title, description, h1-h3 headings, up to 50 http(s) links and the
 * body text. The callback is serialized into the browser, so it must not
 * reference anything from this module.
 */
async function extractPageData(page) {
  return page.evaluate(() => {
    document
      .querySelectorAll(
        'script, style, nav[aria-label="Footer"], [role="complementary"]',
      )
      .forEach((el) => el.remove());

    const title = document.title;
    const description =
      document.querySelector('meta[name="description"]')?.content ||
      document.querySelector('meta[property="og:description"]')?.content ||
      "";
    const headings = Array.from(
      document.querySelectorAll("h1, h2, h3"),
      (el) => ({ level: el.tagName, text: el.innerText.trim() }),
    ).filter((h) => h.text.length > 0);
    // "a[href]" also matches inline-SVG <a> elements, which have no innerText
    // and a non-string href; map those to empty values so the filter drops
    // them instead of the whole extraction throwing.
    const links = Array.from(
      document.querySelectorAll("a[href]"),
      (el) => ({
        text: (el.innerText ?? "").trim(),
        href: typeof el.href === "string" ? el.href : "",
      }),
    )
      .filter((l) => l.text.length > 0 && l.href.startsWith("http"))
      .slice(0, 50);
    // Documents without a <body> yield empty content rather than a TypeError.
    const content = document.body?.innerText ?? "";

    return { title, description, headings, links, content };
  });
}

/**
 * Renders `url` in the shared headless browser and extracts its visible content.
 *
 * @param {string} url - Page to load. Every http(s) request the page makes is
 *   checked with isAllowedUrl and aborted when disallowed.
 * @param {object} [options]
 * @param {number} [options.timeout=30000] - Navigation timeout in milliseconds
 *   (applies to the initial `goto` only).
 * @param {boolean} [options.scroll=true] - Scroll the page to trigger
 *   lazy-loaded content before extracting.
 * @returns {Promise<{title: string, description: string,
 *   headings: {level: string, text: string}[],
 *   links: {text: string, href: string}[],
 *   content: string, url: string, renderedAt: string}>}
 *   `content` is cut to MAX_CONTENT_LENGTH characters; `url` is the requested
 *   URL; `renderedAt` is an ISO-8601 timestamp.
 * @throws {Error} "Too many concurrent renders, try again later" when
 *   MAX_CONCURRENT renders are already running; "Browser launch failed: ..."
 *   (with the original error as `cause`) when no page could be opened; any
 *   navigation or evaluation error is rethrown unchanged.
 */
export async function renderUrl(url, { timeout = 30000, scroll = true } = {}) {
  if (activeRenders >= MAX_CONCURRENT) {
    throw new Error("Too many concurrent renders, try again later");
  }
  activeRenders++;

  let browser;
  let page;
  try {
    browser = await getBrowser();
    page = await browser.newPage();
  } catch (err) {
    activeRenders--;
    browserPromise = null; // Reset on browser failure
    throw new Error(`Browser launch failed: ${err.message}`, { cause: err });
  }

  try {
    await page.setExtraHTTPHeaders({
      "User-Agent":
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    });

    await installUrlGuard(page);
    await page.goto(url, { waitUntil: "domcontentloaded", timeout });
    await waitForStableContent(page);
    if (scroll) {
      await scrollForLazyContent(page);
    }

    const extracted = await extractPageData(page);

    return {
      title: extracted.title,
      description: extracted.description,
      headings: extracted.headings,
      links: extracted.links,
      content: extracted.content.slice(0, MAX_CONTENT_LENGTH),
      url,
      renderedAt: new Date().toISOString(),
    };
  } catch (err) {
    // Drop the cached browser when it is gone. The connection state is the
    // reliable signal; the message checks are kept because the error text
    // differs between Playwright versions.
    const browserDead =
      !browser.isConnected() ||
      err.message?.includes("Target closed") ||
      err.message?.includes("Browser closed");
    if (browserDead) {
      browserPromise = null;
    }
    throw err;
  } finally {
    activeRenders--;
    await page.close().catch(() => {});
  }
}

/**
 * Shuts down the shared browser, if one was ever requested, and clears the
 * cached launch promise. Best-effort: a failed launch or a failing close() is
 * ignored so callers can always use this during shutdown.
 *
 * @returns {Promise<void>}
 */
export async function closeBrowser() {
  const pending = browserPromise;
  if (!pending) {
    return;
  }

  let browser = null;
  try {
    browser = await pending;
  } catch {
    // The launch never succeeded, so there is nothing to close.
  }

  if (browser) {
    await browser.close().catch(() => {});
  }
  browserPromise = null;
}