| | import { JSDOM, VirtualConsole } from "jsdom"; |
| |
|
/** Maximum time allowed for the request and the download of its body. */
const PARSE_WEB_TIMEOUT_MS = 10000;

/**
 * Extracts the text of the content-bearing elements (`p`, `table`, `pre`,
 * `ul`, `ol`) from an HTML document.
 *
 * @param html - Raw HTML markup.
 * @returns The elements' text joined with single spaces, with double spaces
 *   and line breaks removed.
 * @throws Error when the document contains none of the selected elements.
 */
function extractReadableText(html: string): string {
  // Register an "error" listener that deliberately does nothing, so errors
  // reported by jsdom while parsing are discarded.
  const virtualConsole = new VirtualConsole();
  virtualConsole.on("error", () => {});

  const dom = new JSDOM(html, { virtualConsole });
  try {
    const elements = dom.window.document.querySelectorAll("p, table, pre, ul, ol");
    if (elements.length === 0) {
      throw new Error(`webpage doesn't have any parseable element`);
    }

    // NOTE(review): line breaks are deleted rather than replaced by a space,
    // so words separated only by a newline end up glued together. Kept as-is
    // because callers may rely on the current output — confirm before changing.
    return Array.from(elements, (element) => element.textContent ?? "")
      .join(" ")
      .replace(/ {2}|\r\n|\n|\r/gm, "");
  } finally {
    // Release the resources held by the jsdom window once the text is extracted.
    dom.window.close();
  }
}

/**
 * Downloads `url` and returns its textual content.
 *
 * HTML responses are reduced to the text of their paragraph, table, `pre` and
 * list elements; `text/plain` and `text/markdown` responses are returned
 * verbatim. The request is sent without credentials and is aborted after
 * {@link PARSE_WEB_TIMEOUT_MS} milliseconds.
 *
 * @param url - Address of the page to download.
 * @returns The extracted text.
 * @throws Error when the content type is unsupported or an HTML page has no
 *   parseable element. Rejections from `fetch` (network failure, abort on
 *   timeout) propagate unchanged.
 */
export async function parseWeb(url: string): Promise<string> {
  const abortController = new AbortController();
  const timer = setTimeout(() => abortController.abort(), PARSE_WEB_TIMEOUT_MS);

  try {
    const response = await fetch(url, {
      signal: abortController.signal,
      credentials: "omit",
    });

    // MIME types are case-insensitive, so normalise once before matching.
    const contentType = response.headers.get("content-type")?.toLowerCase() ?? "";

    if (contentType.includes("text/html")) {
      return extractReadableText(await response.text());
    }

    if (contentType.includes("text/plain") || contentType.includes("text/markdown")) {
      // Await here so the timeout still covers the body download and the
      // timer is cleared only after it completes.
      return await response.text();
    }

    throw new Error("Unsupported content type");
  } finally {
    // Without this the pending timer keeps the event loop alive for up to
    // ten seconds after the request has already completed.
    clearTimeout(timer);
  }
}
| |
|