/**
 * Normalises raw report text before any pattern-based scrubbing.
 *
 * Steps, in order: carriage returns become newlines, the filler tokens
 * "letras blanco", "blanco" and "blancobl" are blanked out, runs of spaces/tabs
 * collapse to one space, runs of newlines collapse to one newline, and the
 * result is trimmed.
 *
 * @param rawText Text to normalise; falsy values are treated as an empty string.
 * @returns The normalised text.
 */
export function normalizeInputText(rawText: string): string {
  return (
    String(rawText || "")
      .replace(/\r/g, "\n")
      // The longer phrase must be handled first: once the bare "blanco" has
      // been blanked, "letras blanco" can no longer match and "letras" leaks.
      .replace(/\bletras blanco\b/gi, " ")
      .replace(/\bblanco\b/gi, " ")
      .replace(/\bblancobl\b/gi, " ")
      .replace(/[ \t]+/g, " ")
      .replace(/\n{2,}/g, "\n")
      .trim()
  );
}

/**
 * Administrative header/footer fragments. Most entries consume from the label
 * to the end of the line (`.*$` with the multiline flag); the page counter and
 * the underscore rule only remove the matched fragment itself.
 */
const ADMINISTRATIVE_PATTERNS: readonly RegExp[] = [
  /Nombre:.*$/gim,
  /Número:.*$/gim,
  /Historia:.*$/gim,
  /T\.\s*Sanitaria:.*$/gim,
  /Solicitante:.*$/gim,
  /Servicio:.*$/gim,
  /Destino[s]?:.*$/gim, // also covers "Destinos:"
  /Centro:.*$/gim,
  /Sexo:.*$/gim,
  /Edad:.*$/gim,
  /Habitación:.*$/gim,
  /Cama:.*$/gim,
  /Recep\.?Muestra.*$/gim,
  /Fch\.?Informe.*$/gim,
  /Fecha de análisis:.*$/gim,
  /Resultados validados por:.*$/gim,
  /Tipo de Muestra:.*$/gim,
  /Tipo de informe:.*$/gim,
  /Página\s*\d+\/\d+/gim,
  /Hospital Ernest Lluch.*$/gim,
  /H\.\s*ERNEST LLUCH.*$/gim,
  /INFORME DE RESULTADOS.*$/gim,
  /Calatayud,.*$/gim,
  /T[eé]cnica:.*$/gim, // also covers the unaccented "Tecnica:"
  /_{5,}/g,
];

/**
 * Removes administrative labels (patient, centre, report metadata, page
 * counters, separator rules) from the text.
 *
 * Line breaks are left in place, so a fully removed line becomes an empty line.
 *
 * @param text Text to clean, expected to use "\n" as the line separator.
 * @returns The text with every administrative pattern replaced by "".
 */
export function removeAdministrativePatterns(text: string): string {
  let cleaned = text;
  for (const pattern of ADMINISTRATIVE_PATTERNS) {
    cleaned = cleaned.replace(pattern, "");
  }
  return cleaned;
}

/**
 * Blanks out tokens that look like direct identifiers, replacing each with a
 * single space:
 *  - two capital letters followed by 9+ digits and an optional capital letter,
 *  - any standalone run of 6+ digits,
 *  - d/m/y style dates and h:mm:ss times,
 *  - hyphenated all-caps codes (e.g. "AB-CD"),
 *  - "SURNAME, GIVEN NAMES" written in capitals.
 *
 * @param text Text to scrub.
 * @returns The scrubbed text; line breaks outside matches are preserved.
 */
export function removeDirectIdentifiers(text: string): string {
  return (
    text
      .replace(/\b[A-Z]{2}\d{9,}[A-Z]?\b/g, " ")
      .replace(/\b\d{6,}\b/g, " ")
      .replace(/\b\d{1,2}\/\d{1,2}\/\d{2,4}\b/g, " ")
      .replace(/\b\d{1,2}:\d{2}:\d{2}\b/g, " ")
      .replace(/\b[A-Z]{2,}-[A-Z]{2,}\b/g, " ")
      // The given-name part only allows spaces/tabs, never newlines. Allowing
      // "\s" there let the match run on through following lines and swallow
      // upper-case section headers plus the first capital of the next analyte.
      // A line break directly after the comma is still tolerated via "\s*".
      .replace(/[A-ZÁÉÍÓÚÑ]{2,},\s*[A-ZÁÉÍÓÚÑ \t]{2,}/g, " ")
  );
}

/** Section headers that are always kept (whole-line match, case-insensitive). */
const LAB_SECTION_PATTERNS: readonly RegExp[] = [
  /^BIOQUIMICA$/i,
  /^BIOQUIMICA GENERAL$/i,
  /^HEMATOLOGIA$/i,
  /^HEMATIMETRIA$/i,
  /^HEMOSTASIA$/i,
  /^GASOMETRIA$/i,
  /^ORINA$/i,
  /^COAGULACION$/i,
  /^OTRAS PRUEBAS:?$/i,
];

/** Lines that are always dropped, checked before any "keep" rule. */
const LAB_NOISE_PATTERNS: readonly RegExp[] = [
  /^Magnitud Resultado Unidades Intervalo de Referencia Biológico$/i,
  /^Tipo de informe:.*$/i,
  /^Resultados validados por:.*$/i,
  /^Tipo de Muestra:.*$/i,
  /^\.+$/,
  /^-$/,
  /^Hospital .*$/i,
  /^INFORME DE RESULTADOS$/i,
  /^Nombre:.*$/i,
  /^Historia:.*$/i,
  /^Sexo:.*$/i,
  /^Edad:.*$/i,
  /^Recep\.Muestra.*$/i,
  /^Fch\.Informe.*$/i,
];

const LAB_PENDING_PATTERN = /\bPEND\b/i;
const LAB_VALUE_PATTERN = /\b\d+[.,]?\d*\b/;
const LAB_RANGE_PATTERN = /\b\d+[.,]?\d*\s*-\s*\d+[.,]?\d*\b/;
// "%" is matched outside the \b(...)\b group: it is not a word character, so
// wrapped in word boundaries it could only match when glued between two
// letters/digits, which never happens for a percentage.
const LAB_UNITS_PATTERN =
  /(?:\b(?:mg\/dL|g\/dL|mmol\/L|mEq\/L|mil\/mm3|mill\/mm3|fl|pg|u\/L|ui\/L|ng\/mL|mL\/min\/1\.73m\^?2|mL\/min|seg|s|mill|mil)\b|%)/i;
const LAB_PARAMETER_PATTERN = /^[A-ZÁÉÍÓÚÑ][A-ZÁÉÍÓÚÑ0-9\s\.\-\(\)\/]{3,}$/i;

/**
 * Filters already-trimmed lines down to those that plausibly carry lab content.
 *
 * Decision order per line:
 *  1. empty or shorter than 2 characters: dropped;
 *  2. matches a known noise pattern: dropped;
 *  3. matches a known section header: kept;
 *  4. otherwise kept if it contains "PEND", a number, a numeric range, a known
 *     unit (including "%"), or looks like a parameter name (starts with a
 *     letter and consists only of letters, digits, whitespace and `. - ( ) /`).
 *
 * @param lines Candidate lines; the array is not mutated.
 * @returns A new array with the retained lines in their original order.
 */
export function keepUsefulLabLines(lines: string[]): string[] {
  return lines.filter((line) => {
    if (!line || line.length < 2) return false;
    if (LAB_NOISE_PATTERNS.some((rx) => rx.test(line))) return false;
    if (LAB_SECTION_PATTERNS.some((rx) => rx.test(line))) return true;

    return (
      LAB_PENDING_PATTERN.test(line) ||
      LAB_RANGE_PATTERN.test(line) ||
      LAB_UNITS_PATTERN.test(line) ||
      LAB_VALUE_PATTERN.test(line) ||
      LAB_PARAMETER_PATTERN.test(line)
    );
  });
}

/**
 * Splits text on "\n" and returns its non-empty lines, each with inner runs of
 * spaces/tabs collapsed to a single space and surrounding whitespace trimmed.
 *
 * @param text Text using "\n" as the line separator.
 * @returns The cleaned, non-empty lines in order.
 */
export function finalizeCleanLines(text: string): string[] {
  return text
    .split("\n")
    .map((line) => line.replace(/[ \t]+/g, " ").trim())
    .filter(Boolean);
}

/**
 * Shared front half of both anonymisers: normalise, strip administrative
 * labels, blank direct identifiers, then split into clean non-empty lines.
 */
function scrubToCleanLines(rawText: string): string[] {
  const normalized = normalizeInputText(rawText);
  const withoutAdmin = removeAdministrativePatterns(normalized);
  const withoutIdentifiers = removeDirectIdentifiers(withoutAdmin);
  return finalizeCleanLines(withoutIdentifiers);
}

/**
 * Anonymises a pasted lab report and keeps only the lines that look like lab
 * content (see {@link keepUsefulLabLines}).
 *
 * Logs the length of the result to the console.
 *
 * @param rawText Raw report text.
 * @returns The retained lines joined with "\n".
 */
export function anonymizeLabsText(rawText: string): string {
  const usefulLines = keepUsefulLabLines(scrubToCleanLines(rawText));
  const result = usefulLines.join("\n").trim();
  console.log("Anonymized Labs Text length:", result.length);
  return result;
}

const TREATMENT_DOSE_PATTERN = /\b\d+[.,]?\d*\s?(mg|mcg|g|ui|ml|ug)\b/i;
const TREATMENT_FREQUENCY_PATTERN =
  /(cada\s+\d+\s*h|cada\s+\d+\s*horas|al d[ií]a|si precisa|por la noche|por la ma[ñn]ana|semanal|q\d+h)/i;
const TREATMENT_DRUG_PATTERN =
  /^[A-ZÁÉÍÓÚÑa-záéíóúñ][A-ZÁÉÍÓÚÑa-záéíóúñ0-9\s\/\.\-]+$/i;

/**
 * Anonymises a pasted treatment list and keeps only the lines that look like a
 * prescription entry: a line made solely of letters, digits, whitespace and
 * `/ . -` (starting with a letter), or any line containing a dose or a
 * frequency expression.
 *
 * Logs the length of the result to the console.
 *
 * @param rawText Raw treatment text.
 * @returns The retained lines joined with "\n".
 */
export function anonymizeTreatmentsText(rawText: string): string {
  const treatmentLines = scrubToCleanLines(rawText).filter(
    (line) =>
      TREATMENT_DRUG_PATTERN.test(line) ||
      TREATMENT_DOSE_PATTERN.test(line) ||
      TREATMENT_FREQUENCY_PATTERN.test(line)
  );
  const result = treatmentLines.join("\n").trim();
  console.log("Anonymized Treatments Text length:", result.length);
  return result;
}