File size: 3,833 Bytes
use once_cell::sync::Lazy;
use regex::Regex;
use serde::{Deserialize, Serialize};
/// A single piece of sensitive text that was swapped out for a placeholder.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct PiiMatch {
    /// Category label of the pattern that flagged the text (for example `"SSN"`).
    pub entity_type: String,
    /// The text from the input that the placeholder stands in for.
    pub original: String,
    /// The token written into the redacted text in place of `original`.
    pub placeholder: String,
    /// Byte offset in the input where the replaced text begins.
    pub start: usize,
    /// Byte offset in the input just past the end of the replaced text.
    pub end: usize,
}
/// Outcome of a redaction pass: the scrubbed text plus a record of what was replaced.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct RedactionResult {
    /// The input text with each detected span replaced by its placeholder.
    pub redacted_text: String,
    /// One entry per placeholder, in the order the placeholders appear in `redacted_text`.
    pub pii_map: Vec<PiiMatch>,
}
/// Compiled detection patterns, each paired with its entity label.
///
/// Initialised through `compile_patterns` the first time the static is accessed.
static PATTERNS: Lazy<Vec<(Regex, &'static str)>> = Lazy::new(compile_patterns);
/// Builds the list of PII detection regexes, each tagged with its entity label.
///
/// Rules are returned in the order they are listed in the table below. A rule whose
/// pattern fails to compile is left out of the result without reporting an error.
fn compile_patterns() -> Vec<(Regex, &'static str)> {
    // (pattern, entity label) pairs.
    const RULES: &[(&str, &str)] = &[
        (r"\b(?:Dr\.|Dr|Professor|Prof\.)\s+[A-Z][a-z]+\b", "PROVIDER_NAME"),
        (r"\b(?:Patient|Pt\.|Mr\.|Mrs\.|Ms\.)\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b", "PERSON_NAME"),
        (r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b", "PERSON_NAME"),
        (r"\b\d{1,2}/\d{1,2}/\d{2,4}\b", "DATE"),
        (r"\b\d{4}-\d{2}-\d{2}\b", "DATE"),
        (r"\b\d{3}-\d{2}-\d{4}\b", "SSN"),
        (r"\bMRN[:\s-]*[A-Z0-9-]{3,}\b", "MRN"),
        (r"\b\d{10}\b", "PHONE"),
        (r"\b\+?\d[\d\s().-]{7,}\b", "PHONE"),
        (r"\b[A-Z]{2}\d{2}\s?\d{2}\s?\d{2}\s?\d\b", "NHS_NUMBER"),
        (r"\b\d{2,3}\s?(?:years?|yo|yrs?)\b", "AGE"),
        (r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", "EMAIL"),
        (r"\b(?:insurance|member|policy)\s*(?:id|number|#)?\s*[:=]?\s*[A-Z0-9-]{5,}\b", "INSURANCE_ID"),
        (r"\b(?:\d{1,3}\s)?[A-Za-z]+\s(?:Street|St\.|Road|Rd\.|Avenue|Ave\.|Boulevard|Blvd\.|Lane|Ln\.)\b", "ADDRESS"),
    ];

    RULES
        .iter()
        .filter_map(|&(pattern, label)| Regex::new(pattern).ok().map(|re| (re, label)))
        .collect()
}
/// Replaces every span of `raw_text` flagged by a pattern in `PATTERNS` with a numbered
/// placeholder of the form `[LABEL_N]`.
///
/// `N` starts at 1 and is counted separately for each label, in order of appearance.
///
/// # Overlapping matches
///
/// Candidates are visited by ascending start offset, longest first. Candidates with
/// identical bounds keep the order of the pattern list, so the earlier pattern's label wins.
///
/// * A candidate that lies entirely inside text that is already redacted is dropped.
/// * A candidate that begins inside already-redacted text but runs past its end is clipped
///   to the uncovered tail (leading whitespace excluded) and redacted as its own entry.
///   Without this, the surname in `"Dr. John Smith"` would remain in clear text because the
///   `Dr. John` match shadows the `John Smith` match.
///
/// # Returns
///
/// The redacted text together with one [`PiiMatch`] per placeholder, in order of
/// appearance. For each entry, `start`/`end` are byte offsets into `raw_text` and
/// `original` is exactly `raw_text[start..end]`.
pub fn redact_pii(raw_text: &str) -> RedactionResult {
    // Gather (start, end, label) for every match of every pattern. The matched text is
    // sliced out of `raw_text` later, only for the spans that are actually redacted.
    let mut spans: Vec<(usize, usize, &'static str)> = Vec::new();
    for (re, label) in PATTERNS.iter() {
        spans.extend(re.find_iter(raw_text).map(|m| (m.start(), m.end(), *label)));
    }

    // The sort must stay stable: for candidates with equal bounds, pattern-list order
    // decides which label is used. For equal starts, a larger end means a longer span.
    spans.sort_by(|a, b| a.0.cmp(&b.0).then_with(|| b.1.cmp(&a.1)));

    let mut counts = std::collections::HashMap::<&'static str, usize>::new();
    let mut redacted = String::with_capacity(raw_text.len() + 64);
    let mut pii_map = Vec::new();
    // Byte offset just past the last redacted span; everything before it is already emitted.
    let mut cursor = 0usize;

    for (start, end, label) in spans {
        let start = if start >= cursor {
            start
        } else if end > cursor {
            // Partial overlap: keep only the part not yet covered. `cursor` and `end` are
            // both ends of regex matches, so slicing here is on UTF-8 boundaries.
            let tail = raw_text[cursor..end].trim_start();
            if tail.is_empty() {
                continue;
            }
            end - tail.len()
        } else {
            // Fully covered by text that has already been redacted.
            continue;
        };

        // Copy the untouched text between the previous span and this one.
        redacted.push_str(&raw_text[cursor..start]);

        let count = counts.entry(label).or_insert(0);
        *count += 1;
        let placeholder = format!("[{}_{}]", label, count);
        redacted.push_str(&placeholder);

        pii_map.push(PiiMatch {
            entity_type: label.to_string(),
            original: raw_text[start..end].to_string(),
            placeholder,
            start,
            end,
        });
        cursor = end;
    }

    // Trailing text after the last redacted span.
    redacted.push_str(&raw_text[cursor..]);

    RedactionResult { redacted_text: redacted, pii_map }
}
#[cfg(test)]
mod tests {
    use super::*;

    // The sample sentence mixes a name, an age, an MRN, a phone number and an SSN; the
    // name and phone number must disappear and labelled placeholders must show up.
    #[test]
    fn redaction_replaces_sensitive_text() {
        let redacted = redact_pii(
            "Patient John Smith, 45 yo, MRN 847291A, phone 5551234567, SSN 123-45-6789.",
        )
        .redacted_text;

        for leaked in ["John Smith", "5551234567"] {
            assert!(!redacted.contains(leaked));
        }

        let name_tags = ["[PERSON_NAME_1]", "[PERSON_NAME_2]"];
        assert!(name_tags.iter().any(|tag| redacted.contains(*tag)));

        for expected in ["[PHONE_1]", "[SSN_1]"] {
            assert!(redacted.contains(expected));
        }
    }
}
|