File size: 1,210 Bytes
2c00ea8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import type { AppendUpdate } from "../runWebSearch";
import type { WebSearchScrapedSource, WebSearchSource } from "$lib/types/WebSearch";
import { loadPage } from "./playwright";

import { spatialParser } from "./parser";
import { htmlToMarkdownTree } from "../markdown/tree";
import { timeout } from "$lib/utils/timeout";

export const scrape =
	(appendUpdate: AppendUpdate, maxCharsPerElem: number) =>
	async (source: WebSearchSource): Promise<WebSearchScrapedSource | undefined> => {
		try {
			const page = await scrapeUrl(source.link, maxCharsPerElem);
			appendUpdate("Browsing webpage", [source.link]);
			return { ...source, page };
		} catch (e) {
			const message = e instanceof Error ? e.message : String(e);
			appendUpdate("Failed to parse webpage", [message, source.link], "error");
		}
	};

export async function scrapeUrl(url: string, maxCharsPerElem: number) {
	const page = await loadPage(url);

	return timeout(page.evaluate(spatialParser), 2000)
		.then(({ elements, ...parsed }) => ({
			...parsed,
			markdownTree: htmlToMarkdownTree(parsed.title, elements, maxCharsPerElem),
		}))
		.catch((cause) => {
			throw Error("Parsing failed", { cause });
		})
		.finally(() => page.close());
}