| import { JSDOM, VirtualConsole } from "jsdom"; |
|
|
/**
 * Collapses every run of whitespace (spaces, tabs, CR/LF, ...) into a single
 * space and trims both ends, so that words separated only by a line break
 * stay separated instead of being glued together.
 */
function collapseWhitespace(raw: string): string {
  return raw.replace(/\s+/g, " ").trim();
}

/**
 * Downloads the page at `url` and returns the text of all of its `<p>`
 * elements as a single whitespace-normalized string.
 *
 * @param url - Address of the page to fetch.
 * @returns The text content of every `<p>` element, joined with single spaces.
 * @throws Whatever `fetch` / `response.text()` rejects with (network failure,
 *   or the abort triggered once the 10 second budget is exceeded).
 * @throws Error when the parsed document contains no `<p>` element.
 */
export async function parseWeb(url: string): Promise<string> {
  const fetchTimeoutMs = 10000;
  const abortController = new AbortController();
  const timeoutId = setTimeout(() => abortController.abort(), fetchTimeoutMs);

  let htmlString: string;
  try {
    // The same signal covers both the request and the body download.
    const response = await fetch(url, { signal: abortController.signal });
    htmlString = await response.text();
  } finally {
    // Always cancel the timer: a pending timeout would otherwise keep the
    // event loop alive for the full budget and fire a pointless abort later.
    clearTimeout(timeoutId);
  }

  // A dedicated virtual console is passed to jsdom; the no-op listener makes
  // sure "error" events always have a handler (an EventEmitter throws on an
  // unhandled "error" event).
  const virtualConsole = new VirtualConsole();
  virtualConsole.on("error", () => {
    // Intentionally ignored.
  });

  const dom = new JSDOM(htmlString, { virtualConsole });
  try {
    const textElTags = "p";
    const paragraphs = dom.window.document.querySelectorAll(textElTags);
    if (!paragraphs.length) {
      throw new Error(`webpage doesn't have any "${textElTags}" element`);
    }

    const paragraphTexts = Array.from(paragraphs).map(
      (p) => p.textContent ?? "",
    );
    return collapseWhitespace(paragraphTexts.join(" "));
  } finally {
    // Release the resources held by the jsdom window once the text is extracted.
    dom.window.close();
  }
}
|
|