// chat-v4/chat-ui/src/lib/server/websearch/parseNalogGovRu.ts
// muryshev: "added wake up request" (18cee9d)
import { JSDOM, VirtualConsole } from "jsdom";
/**
 * Downloads a page and extracts its readable text as a single line.
 *
 * The page is parsed with jsdom. Text is taken from `h1` and `.wrap-content p`
 * elements; when none match, every `p` element is used instead. Each fragment
 * has its whitespace collapsed, gets a trailing period if it lacks one, and
 * the fragments are joined with single spaces.
 *
 * @param url - Address of the page to download.
 * @returns The extracted text with no line breaks or repeated spaces.
 * @throws Error when the page contains no matching elements at all.
 * @throws Whatever `fetch` rejects with (network failure, or the abort raised
 *   when the request takes longer than 10 seconds).
 */
export async function parseNalogGovRu(url: string): Promise<string> {
	const abortController = new AbortController();
	const timeoutId = setTimeout(() => abortController.abort(), 10000);

	let htmlString: string;
	try {
		const response = await fetch(url, { signal: abortController.signal });
		htmlString = await response.text();
	} finally {
		// Always release the timer so it neither keeps the event loop alive
		// nor fires an abort after the request has already settled.
		clearTimeout(timeoutId);
	}

	const virtualConsole = new VirtualConsole();
	virtualConsole.on("error", () => {
		// No-op to skip console errors.
	});

	// put the html string into a DOM
	const dom = new JSDOM(htmlString ?? "", {
		virtualConsole,
	});
	const { document } = dom.window;

	const textElTags = "h1, .wrap-content p";
	let paragraphs = document.querySelectorAll(textElTags);
	if (!paragraphs.length) {
		const fallbackElTags = "p";
		paragraphs = document.querySelectorAll(fallbackElTags);
		if (!paragraphs.length) {
			throw new Error(`Произошла ошибка при обработке страницы. Возможно, на сайте включена защита от ботов.`);
		}
	}

	return (
		Array.from(paragraphs)
			// Collapse every whitespace run (newlines, tabs, repeated spaces) into
			// one space. Deleting them outright would glue neighbouring words together.
			.map((p) => (p.textContent ?? "").replace(/\s+/g, " ").trim())
			// Skip elements that carry no visible text so they do not yield a lone ".".
			.filter((t) => t !== "")
			// Terminate each fragment so headings and paragraphs stay separated.
			.map((t) => (t.endsWith(".") ? t : `${t}.`))
			.join(" ")
	);
}