NEON / backend /src /search /extract.ts
picklefried706's picture
Upload folder using huggingface_hub
40a9423 verified
import { JSDOM } from "jsdom";
import { Readability } from "@mozilla/readability";
import { fetchText } from "./fetch.js";
/** Collapses every run of whitespace into a single space and trims both ends. */
function collapseWhitespace(text: string): string {
  return text.replace(/\s+/g, " ").trim();
}

/**
 * Fetches the page at `url` and returns its title together with its text
 * content, whitespace-normalized to single spaces.
 *
 * The HTML is parsed with JSDOM and run through Readability. When Readability
 * yields non-blank text, that text and the article title are returned.
 * Otherwise the function falls back to the document's own title and the full
 * text content of `<body>`.
 *
 * @param url - Address of the page to fetch; also passed to JSDOM as the
 *   document URL.
 * @returns The extracted `title` and `content`; either may be an empty string
 *   when the page provides nothing usable.
 * @throws Whatever `fetchText` rejects with, and any error raised while
 *   parsing the document.
 */
export async function extractReadable(url: string): Promise<{ title: string; content: string }> {
  // A desktop-browser User-Agent is sent with the request — presumably so
  // sites that reject unknown clients still serve the page (confirm against
  // fetchText's defaults).
  const html = await fetchText(url, {
    headers: {
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"
    }
  });

  const dom = new JSDOM(html, { url });
  try {
    const { document } = dom.window;

    // NOTE(review): per the Readability README, parse() modifies the document
    // it is given, so the fallback below reads the post-parse DOM.
    const article = new Readability(document).parse();

    // Normalize before testing: whitespace-only article text must not count
    // as a successful extraction, otherwise the fallback is never reached and
    // an empty `content` is returned.
    const articleText = collapseWhitespace(article?.textContent ?? "");
    if (articleText !== "") {
      return {
        title: article?.title ?? "",
        content: articleText
      };
    }

    return {
      title: document.title ?? "",
      content: collapseWhitespace(document.body?.textContent ?? "")
    };
  } finally {
    // Shut the jsdom window down once the strings have been extracted so the
    // parsed document is not kept alive longer than needed.
    dom.window.close();
  }
}