import { load as loadHtml } from "cheerio";

/**
 * Converts a PDF date string into an ISO-8601 UTC timestamp.
 *
 * Accepted input is `D:YYYY` followed by optional two-digit month, day, hour,
 * minute and second fields, and an optional time zone: `Z`, or a sign with
 * two-digit hours and optional two-digit minutes (apostrophes optional).
 * Missing month/day default to `01`; missing time fields default to `00`.
 * A missing time zone is treated as UTC.
 *
 * @param pdfDate Raw PDF date string, e.g. `D:20240102153045-08'00'`.
 * @returns The instant as produced by `Date.prototype.toISOString()`, or
 *   `null` when the input is empty, does not match the format, or names a
 *   date/time that does not exist.
 */
function parsePdfDate(pdfDate?: string | null): string | null {
  if (!pdfDate) return null;

  // PDF date format: D:YYYYMMDDHHmmSSOHH'mm'
  // Example: D:20240102153045-08'00'
  const m = pdfDate.match(
    /^D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?([Zz]|([+\-])(\d{2})'?(?:(\d{2})'?)?)?$/,
  );
  if (!m) return null;

  // Unmatched optional groups are `undefined`, so the defaults below apply.
  // Index 0 (whole match) and index 7 (whole zone token) are not needed.
  const [, yyyy, MM = "01", dd = "01", HH = "00", mm = "00", SS = "00", , sign, tzH, tzM = "00"] = m;

  // Some engines roll an impossible day (e.g. Feb 30) over into the next
  // month instead of rejecting it, so check the day against the month length.
  const year = Number(yyyy);
  const isLeapYear = (year % 4 === 0 && year % 100 !== 0) || year % 400 === 0;
  const monthLengths = [31, isLeapYear ? 29 : 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31];
  const daysInMonth = monthLengths[Number(MM) - 1];
  const day = Number(dd);
  if (daysInMonth === undefined || day < 1 || day > daysInMonth) return null;

  // A signed offset is used when present; "Z" or no zone at all means UTC.
  const offset = sign && tzH ? `${sign}${tzH}:${tzM}` : "Z";

  // Build ISO string; out-of-range time or offset fields yield an invalid Date.
  const iso = `${yyyy}-${MM}-${dd}T${HH}:${mm}:${SS}${offset}`;
  const date = new Date(iso);
  return isNaN(date.getTime()) ? null : date.toISOString();
}

/** Metadata record using HTML-page style field names. */
type HtmlLikeMetadata = {
  title: string | null;
  language: string | null;
  urlSource: string | null;
  timestamp: string;
  description: string | null;
  keywords: string | null;
  author: string | null;
  ogTitle: string | null;
  ogDescription: string | null;
  ogImage: string | null;
  ogUrl: string | null;
  ogSiteName: string | null;
  articleAuthor: string | null;
  publishedTime: string | null;
  modifiedTime: string | null;
  canonical: string | null;
  favicon: string | null;
  jsonLd: any[];
  statusCode: number;
};

/**
 * Collects every `<a>` element that carries an `href` attribute.
 *
 * @param html HTML markup to scan.
 * @returns One entry per matching anchor, in document order. `url` is the raw
 *   `href` value (not resolved or validated) and `text` is the anchor's
 *   trimmed text content; either may be an empty string.
 */
export function extractLinksFromConvertedHtml(html: string): { url: string; text: string }[] {
  const $ = loadHtml(html);
  return $("a[href]")
    .map((_, a) => {
      const url = $(a).attr("href") || "";
      const text = $(a).text()?.trim() || "";
      return { url, text };
    })
    .get();
}

export function buildHtmlLikeMetadataFromPdf(
  pdfMeta: any,
  opts: { urlSource?: string | null; statusCode?: number; htmlForFallback?: string | null },
): HtmlLikeMetadata {
  const { urlSource = null, statusCode = 200, htmlForFallback = null } = opts;

  // Try to get a title from meta, fallback to