Spaces:
Paused
Paused
Always close playwright page (#1171)
Browse files
fix: always close playwright page
Co-authored-by: Nathan Sarrazin <sarrazin.nathan@gmail.com>
src/lib/server/websearch/scrape/scrape.ts
CHANGED
|
@@ -24,37 +24,41 @@ export const scrape = (maxCharsPerElem: number) =>
|
|
| 24 |
export async function scrapeUrl(url: string, maxCharsPerElem: number) {
|
| 25 |
const { res, page } = await loadPage(url);
|
| 26 |
|
| 27 |
-
|
|
|
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
title,
|
| 43 |
-
markdownTree: htmlToMarkdownTree(
|
| 44 |
title,
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
| 60 |
}
|
|
|
|
| 24 |
/**
 * Loads `url` with `loadPage` and converts the result into a markdown tree.
 *
 * Responses whose content type is plain text, markdown, JSON, XML or CSV are
 * not run through the spatial parser: the page content is wrapped in a single
 * `p` element and handed to `htmlToMarkdownTree` as-is. Every other response
 * is parsed by evaluating `spatialParser` inside the page.
 *
 * The page is closed before this function settles, whether it succeeds or
 * throws.
 *
 * @param url - Address of the page to scrape.
 * @param maxCharsPerElem - Forwarded unchanged to `htmlToMarkdownTree`.
 * @returns The page title and its markdown tree; for parsed HTML pages, also
 *   whatever other fields `spatialParser` produced besides `elements`.
 * @throws Error "Failed to load page" when `loadPage` yields no response.
 * @throws Error "Parsing failed" (original error attached as `cause`) when the
 *   parser evaluation, its timeout wrapper, or the markdown conversion fails.
 */
export async function scrapeUrl(url: string, maxCharsPerElem: number) {
	const { res, page } = await loadPage(url);

	try {
		if (!res) throw Error("Failed to load page");

		// Check if it's a non-html content type that we can handle directly
		// TODO: direct mappings to markdown can be added for markdown, csv and others
		const contentType = res.headers()["content-type"] ?? "";
		if (
			contentType.includes("text/plain") ||
			contentType.includes("text/markdown") ||
			contentType.includes("application/json") ||
			contentType.includes("application/xml") ||
			contentType.includes("text/csv")
		) {
			const title = await page.title();
			const content = await page.content();
			return {
				title,
				markdownTree: htmlToMarkdownTree(
					title,
					[{ tagName: "p", attributes: {}, content: [content] }],
					maxCharsPerElem
				),
			};
		}

		try {
			// 2000 is passed straight to `timeout`; presumably milliseconds — confirm against the helper.
			const { elements, ...parsed } = await timeout(page.evaluate(spatialParser), 2000);
			return {
				...parsed,
				markdownTree: htmlToMarkdownTree(parsed.title, elements, maxCharsPerElem),
			};
		} catch (cause) {
			throw Error("Parsing failed", { cause });
		}
	} finally {
		// Await the close so the page is actually released before we return and
		// a failed close surfaces to the caller instead of becoming an
		// unhandled promise rejection.
		await page.close();
	}
}
|