File size: 4,738 Bytes
2a40140 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
/**
 * Normalises raw extracted text before any anonymisation step runs.
 *
 * Steps, in order:
 *  1. coerce a falsy input to the empty string;
 *  2. turn every carriage return into a newline;
 *  3. blank out the filler tokens "letras blanco", "blanco" and "blancobl"
 *     (whole words, case-insensitive);
 *  4. collapse runs of spaces/tabs into a single space;
 *  5. collapse runs of two or more newlines into one;
 *  6. trim the result.
 *
 * @param rawText - Text to normalise; a falsy value is treated as "".
 * @returns The normalised text.
 */
export function normalizeInputText(rawText: string): string {
  return String(rawText || "")
    .replace(/\r/g, "\n")
    // The two-word phrase must be removed before the bare word "blanco":
    // otherwise "blanco" is already gone when this pattern runs and the
    // leftover "letras" stays in the output.
    .replace(/\bletras blanco\b/gi, " ")
    .replace(/\bblanco\b/gi, " ")
    .replace(/\bblancobl\b/gi, " ")
    .replace(/[ \t]+/g, " ")
    .replace(/\n{2,}/g, "\n")
    .trim();
}
/**
 * Patterns for administrative header/footer fragments of a report.
 *
 * Most entries match a label and everything after it up to the end of that
 * line (`.*$` with the `m` flag); they are not anchored to the line start, so
 * text preceding the label on the same line is kept.
 *
 * Built once at module load. Reusing global (`g`) regexes with
 * `String.prototype.replace` is safe because `replace` restarts the search
 * from index 0 on every call.
 */
const ADMINISTRATIVE_PATTERNS: readonly RegExp[] = [
  /Nombre:.*$/gim,
  /Número:.*$/gim,
  /Historia:.*$/gim,
  /T\.\s*Sanitaria:.*$/gim,
  /Solicitante:.*$/gim,
  /Servicio:.*$/gim,
  // Covers both "Destino:" and "Destinos:".
  /Destino[s]?:.*$/gim,
  /Centro:.*$/gim,
  /Sexo:.*$/gim,
  /Edad:.*$/gim,
  /Habitación:.*$/gim,
  /Cama:.*$/gim,
  /Recep\.?Muestra.*$/gim,
  /Fch\.?Informe.*$/gim,
  /Fecha de análisis:.*$/gim,
  /Resultados validados por:.*$/gim,
  /Tipo de Muestra:.*$/gim,
  /Tipo de informe:.*$/gim,
  /Página\s*\d+\/\d+/gim,
  /Hospital Ernest Lluch.*$/gim,
  /H\.\s*ERNEST LLUCH.*$/gim,
  /INFORME DE RESULTADOS.*$/gim,
  /Calatayud,.*$/gim,
  // Covers both the accented and unaccented spelling.
  /T[eé]cnica:.*$/gim,
  // Horizontal rules made of five or more underscores.
  /_{5,}/g
];

/**
 * Deletes administrative fragments (patient/request labels, page markers,
 * hospital banners, underscore rules) from the text.
 *
 * Matches are replaced with the empty string; line breaks are left in place,
 * so a fully matched line becomes an empty line.
 *
 * @param text - Text to clean.
 * @returns The text with every administrative pattern removed.
 */
export function removeAdministrativePatterns(text: string): string {
  return ADMINISTRATIVE_PATTERNS.reduce(
    (cleaned, pattern) => cleaned.replace(pattern, ""),
    text
  );
}
/**
 * Replaces token shapes that look like direct identifiers with a single space.
 *
 * Shapes removed, in order:
 *  - two uppercase letters followed by nine or more digits and an optional
 *    trailing uppercase letter;
 *  - any standalone run of six or more digits;
 *  - `d/m/y`-style dates (1-2 digits / 1-2 digits / 2-4 digits);
 *  - `h:mm:ss` timestamps;
 *  - two uppercase tokens joined by a hyphen (e.g. `AB-CD`);
 *  - `SURNAME, NAME`-style uppercase sequences around a comma.
 *
 * @param text - Text to scrub.
 * @returns The text with each match replaced by one space.
 */
export function removeDirectIdentifiers(text: string): string {
  return text
    .replace(/\b[A-Z]{2}\d{9,}[A-Z]?\b/g, " ")
    .replace(/\b\d{6,}\b/g, " ")
    .replace(/\b\d{1,2}\/\d{1,2}\/\d{2,4}\b/g, " ")
    .replace(/\b\d{1,2}:\d{2}:\d{2}\b/g, " ")
    .replace(/\b[A-Z]{2,}-[A-Z]{2,}\b/g, " ")
    // The part after the comma accepts only uppercase letters, spaces and
    // tabs. Allowing `\s` there let the match run across line breaks and
    // swallow whatever uppercase text followed (section headers, the first
    // capital letter of the next line, ...).
    .replace(/[A-ZÁÉÍÓÚÑ]{2,},\s*[A-ZÁÉÍÓÚÑ \t]{2,}/g, " ");
}
/**
 * Filters already-cleaned lines down to those that look like lab content.
 *
 * A line is rejected when it is empty, shorter than two characters, or matches
 * one of the known noise patterns. Otherwise it is kept when it is a
 * recognised section header, or when at least ONE content signal is present:
 * the token `PEND`, a numeric range, a unit, any number, or a
 * parameter-like shape (a letter followed by three or more letters, digits,
 * whitespace or `. - ( ) /` characters).
 *
 * @param lines - Candidate lines.
 * @returns The subset of `lines` to keep, in their original order.
 */
export function keepUsefulLabLines(lines: string[]): string[] {
  // Whole-line section titles that are always kept.
  const sectionHeaders: RegExp[] = [
    /^BIOQUIMICA$/i,
    /^BIOQUIMICA GENERAL$/i,
    /^HEMATOLOGIA$/i,
    /^HEMATIMETRIA$/i,
    /^HEMOSTASIA$/i,
    /^GASOMETRIA$/i,
    /^ORINA$/i,
    /^COAGULACION$/i,
    /^OTRAS PRUEBAS:?$/i
  ];

  // Whole-line boilerplate that is always dropped; checked before the headers.
  const noise: RegExp[] = [
    /^Magnitud Resultado Unidades Intervalo de Referencia Biológico$/i,
    /^Tipo de informe:.*$/i,
    /^Resultados validados por:.*$/i,
    /^Tipo de Muestra:.*$/i,
    /^\.+$/,
    /^-$/,
    /^Hospital .*$/i,
    /^INFORME DE RESULTADOS$/i,
    /^Nombre:.*$/i,
    /^Historia:.*$/i,
    /^Sexo:.*$/i,
    /^Edad:.*$/i,
    /^Recep\.Muestra.*$/i,
    /^Fch\.Informe.*$/i
  ];

  // Any single one of these is enough to keep a line.
  const contentSignals: RegExp[] = [
    // Pending-result marker.
    /\bPEND\b/i,
    // Numeric range such as "3.5 - 5.0".
    /\b\d+[.,]?\d*\s*-\s*\d+[.,]?\d*\b/,
    // Measurement unit.
    // NOTE(review): the `%` alternative sits between two `\b` assertions, so
    // it only matches when `%` is flanked by word characters on both sides.
    /\b(mg\/dL|g\/dL|mmol\/L|mEq\/L|mil\/mm3|mill\/mm3|fl|pg|%|u\/L|ui\/L|ng\/mL|mL\/min\/1\.73m\^?2|mL\/min|seg|s|mill|mil|pg|fl)\b/i,
    // Any integer or decimal number.
    /\b\d+[.,]?\d*\b/,
    // Parameter-like text.
    /^[A-ZÁÉÍÓÚÑ][A-ZÁÉÍÓÚÑ0-9\s\.\-\(\)\/]{3,}$/i
  ];

  const matchesAny = (patterns: RegExp[], line: string): boolean =>
    patterns.some((pattern) => pattern.test(line));

  const kept: string[] = [];
  for (const line of lines) {
    if (!line || line.length < 2) continue;
    if (matchesAny(noise, line)) continue;
    if (matchesAny(sectionHeaders, line) || matchesAny(contentSignals, line)) {
      kept.push(line);
    }
  }
  return kept;
}
/**
 * Splits text into lines and tidies each one.
 *
 * Every line has its runs of spaces/tabs collapsed to a single space and its
 * surrounding whitespace trimmed; lines that end up empty are discarded.
 *
 * @param text - Newline-separated text.
 * @returns The non-empty, tidied lines in their original order.
 */
export function finalizeCleanLines(text: string): string[] {
  const tidied: string[] = [];
  for (const rawLine of text.split("\n")) {
    const line = rawLine.replace(/[ \t]+/g, " ").trim();
    if (line) {
      tidied.push(line);
    }
  }
  return tidied;
}
/**
 * Runs the full anonymisation pipeline for lab-report text.
 *
 * The text is normalised, stripped of administrative patterns and direct
 * identifiers, split into tidy lines, filtered with `keepUsefulLabLines`,
 * and joined back with newlines. The length of the result (not its content)
 * is written to the console.
 *
 * @param rawText - Raw lab-report text.
 * @returns The anonymised text, trimmed.
 */
export function anonymizeLabsText(rawText: string): string {
  const normalized = normalizeInputText(rawText);
  const scrubbed = removeDirectIdentifiers(
    removeAdministrativePatterns(normalized)
  );
  const result = keepUsefulLabLines(finalizeCleanLines(scrubbed))
    .join("\n")
    .trim();

  console.log("Anonymized Labs Text length:", result.length);
  return result;
}
/**
 * Runs the anonymisation pipeline for treatment/medication text.
 *
 * The text is normalised, stripped of administrative patterns and direct
 * identifiers, and split into tidy lines. A line is kept when it matches at
 * least one of: a drug-name-like shape (a letter followed by letters, digits,
 * whitespace or `/ . -`), a dose (number plus mg/mcg/g/ui/ml/ug), or a
 * frequency expression. The length of the result (not its content) is written
 * to the console.
 *
 * @param rawText - Raw treatment text.
 * @returns The anonymised text, trimmed.
 */
export function anonymizeTreatmentsText(rawText: string): string {
  const dosePattern = /\b\d+[.,]?\d*\s?(mg|mcg|g|ui|ml|ug)\b/i;
  const frequencyPattern =
    /(cada\s+\d+\s*h|cada\s+\d+\s*horas|al d[ií]a|si precisa|por la noche|por la ma[ñn]ana|semanal|q\d+h)/i;
  const drugNamePattern =
    /^[A-ZÁÉÍÓÚÑa-záéíóúñ][A-ZÁÉÍÓÚÑa-záéíóúñ0-9\s\/\.\-]+$/i;

  const isTreatmentLine = (line: string): boolean =>
    drugNamePattern.test(line) ||
    dosePattern.test(line) ||
    frequencyPattern.test(line);

  const scrubbed = removeDirectIdentifiers(
    removeAdministrativePatterns(normalizeInputText(rawText))
  );
  const result = finalizeCleanLines(scrubbed)
    .filter((line) => isTreatmentLine(line))
    .join("\n")
    .trim();

  console.log("Anonymized Treatments Text length:", result.length);
  return result;
}
|