/** Supported extraction modes. */
export type ExtractMode = "markdown" | "text";

/**
 * Decodes a small set of HTML character references found in `value`.
 *
 * Recognized forms (matched case-insensitively, and only when written as a
 * complete `&…;` reference):
 * - named: `&nbsp;` (becomes a plain space), `&amp;`, `&quot;`, `&apos;`, `&lt;`, `&gt;`
 * - decimal numeric: `&#39;`
 * - hexadecimal numeric: `&#x27;`
 *
 * Any other text is returned unchanged, including unknown named references and
 * numeric references above U+10FFFF.
 *
 * @param value Text that may contain HTML character references.
 * @returns The text with the recognized references replaced by their characters.
 */
function decodeEntities(value: string): string {
  const named: Record<string, string> = {
    nbsp: " ",
    amp: "&",
    quot: '"',
    apos: "'",
    lt: "<",
    gt: ">",
  };

  // A single pass over the input guarantees that the output of one replacement
  // is never re-scanned, so "&amp;lt;" decodes to "&lt;" rather than "<".
  return value.replace(
    /&(?:#x([0-9a-f]+)|#(\d+)|(nbsp|amp|quot|apos|lt|gt));/gi,
    (match: string, hex?: string, dec?: string, name?: string): string => {
      if (name !== undefined) {
        return named[name.toLowerCase()] ?? match;
      }
      const codePoint =
        hex !== undefined ? Number.parseInt(hex, 16) : Number.parseInt(dec ?? "", 10);
      // String.fromCodePoint throws a RangeError above U+10FFFF; keep such
      // references verbatim instead. fromCodePoint (unlike fromCharCode) also
      // handles code points beyond the Basic Multilingual Plane.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
    },
  );
}

/**
 * Removes every `<…>` span (a `<`, one or more non-`>` characters, then `>`)
 * from `value` and decodes character references in what remains.
 *
 * @param value Text that may contain HTML tags and character references.
 * @returns The text without tags, with references decoded by `decodeEntities`.
 */
function stripTags(value: string): string {
  return decodeEntities(value.replace(/<[^>]+>/g, ""));
}

/**
 * Tidies whitespace in `value`. The steps run in this order:
 * 1. drop all carriage returns,
 * 2. remove spaces/tabs that directly precede a newline,
 * 3. collapse runs of three or more newlines into exactly two,
 * 4. collapse runs of two or more spaces/tabs into a single space,
 * 5. trim leading and trailing whitespace.
 *
 * @param value Text to normalize.
 * @returns The normalized text.
 */
function normalizeWhitespace(value: string): string {
  return value
    .replace(/\r/g, "")
    .replace(/[ \t]+\n/g, "\n")
    .replace(/\n{3,}/g, "\n\n")
    .replace(/[ \t]{2,}/g, " ")
    .trim();
}

export function htmlToMarkdown(html: string): { text: string; title?: string } {
  const titleMatch = html.match(/