File size: 3,833 Bytes
74f2b46
 
 
99f62cc
 
 
 
 
 
74f2b46
 
99f62cc
 
74f2b46
 
 
 
 
 
 
 
 
 
 
99f62cc
74f2b46
 
99f62cc
74f2b46
99f62cc
74f2b46
99f62cc
74f2b46
99f62cc
74f2b46
99f62cc
74f2b46
 
 
 
 
 
 
 
 
99f62cc
74f2b46
 
99f62cc
74f2b46
99f62cc
74f2b46
99f62cc
 
 
74f2b46
99f62cc
74f2b46
 
 
 
 
 
99f62cc
 
 
74f2b46
 
 
 
 
 
 
 
 
 
99f62cc
74f2b46
 
 
 
 
 
 
 
99f62cc
74f2b46
 
 
 
 
99f62cc
 
74f2b46
99f62cc
 
 
 
 
 
 
74f2b46
 
 
 
 
 
 
 
99f62cc
74f2b46
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
use once_cell::sync::Lazy;
use regex::Regex;
use serde::{Deserialize, Serialize};

/// One piece of detected text that `redact_pii` replaced with a placeholder.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PiiMatch {
    /// Category label of the pattern that produced the match
    /// (for example `PERSON_NAME`, `SSN` or `PHONE`).
    pub entity_type: String,
    /// The exact text taken from the input that was replaced.
    pub original: String,
    /// The token written into the redacted text, formatted as `[LABEL_N]`
    /// where `N` is a per-label counter starting at 1.
    pub placeholder: String,
    /// Byte offset in the input text where the replaced text begins.
    pub start: usize,
    /// Byte offset in the input text just past the replaced text (exclusive).
    pub end: usize,
}

/// Output of `redact_pii`: the sanitized text plus a record of every replacement.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RedactionResult {
    /// The input text with each selected match replaced by its placeholder.
    pub redacted_text: String,
    /// One entry per replacement, in order of appearance in the input.
    pub pii_map: Vec<PiiMatch>,
}

/// Compiled detection patterns paired with their entity labels.
///
/// Built by `compile_patterns` the first time the static is accessed and
/// reused for every later call.
static PATTERNS: Lazy<Vec<(Regex, &'static str)>> = Lazy::new(compile_patterns);

/// Compiles the built-in PII detection patterns.
///
/// Returns `(regex, label)` pairs in table order. The order is significant:
/// when two patterns match exactly the same span, `redact_pii` keeps the one
/// that appears earlier in this table (it uses a stable sort).
///
/// # Panics
///
/// Panics if any built-in pattern fails to compile. The patterns are string
/// literals, so a failure is a programming or build-configuration error.
/// Silently dropping a pattern would make the redactor leak that category of
/// PII without any signal, so failing loudly is the safer behavior.
fn compile_patterns() -> Vec<(Regex, &'static str)> {
    let mut patterns = Vec::new();
    for (pattern, label) in [
        // Title followed by one or more capitalised words, so that the
        // surname in "Dr. John Smith" is covered and not only the first name.
        (r"\b(?:Dr\.|Dr|Professor|Prof\.)\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b", "PROVIDER_NAME"),
        (r"\b(?:Patient|Pt\.|Mr\.|Mrs\.|Ms\.)\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b", "PERSON_NAME"),
        // Any run of two or more capitalised words.
        (r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b", "PERSON_NAME"),
        (r"\b\d{1,2}/\d{1,2}/\d{2,4}\b", "DATE"),
        (r"\b\d{4}-\d{2}-\d{2}\b", "DATE"),
        // Listed before the PHONE patterns: the loose PHONE pattern also
        // matches SSN-shaped text, and the earlier entry wins on equal spans.
        (r"\b\d{3}-\d{2}-\d{4}\b", "SSN"),
        (r"\bMRN[:\s-]*[A-Z0-9-]{3,}\b", "MRN"),
        (r"\b\d{10}\b", "PHONE"),
        (r"\b\+?\d[\d\s().-]{7,}\b", "PHONE"),
        (r"\b[A-Z]{2}\d{2}\s?\d{2}\s?\d{2}\s?\d\b", "NHS_NUMBER"),
        (r"\b\d{2,3}\s?(?:years?|yo|yrs?)\b", "AGE"),
        (r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", "EMAIL"),
        (r"\b(?:insurance|member|policy)\s*(?:id|number|#)?\s*[:=]?\s*[A-Z0-9-]{5,}\b", "INSURANCE_ID"),
        (r"\b(?:\d{1,3}\s)?[A-Za-z]+\s(?:Street|St\.|Road|Rd\.|Avenue|Ave\.|Boulevard|Blvd\.|Lane|Ln\.)\b", "ADDRESS"),
    ] {
        let re = Regex::new(pattern).unwrap_or_else(|err| {
            panic!("built-in PII pattern for {} failed to compile: {}", label, err)
        });
        patterns.push((re, label));
    }
    patterns
}

/// Replaces detected PII in `raw_text` with numbered placeholders.
///
/// Every pattern in `PATTERNS` is run over the input. The resulting matches
/// are ordered by start offset; among matches that start at the same offset
/// the longest comes first, and exact ties keep pattern-table order.
///
/// Matches that overlap are merged into a single span that covers their
/// union and carries the label of the first match in that order. Merging
/// (rather than discarding the later match) guarantees that no part of any
/// detected match is left in the output; the trade-off is that a merged span
/// may be reported under only one of the labels that matched.
///
/// Returns the redacted text together with one `PiiMatch` per replacement, in
/// order of appearance. Placeholders have the form `[LABEL_N]`, where `N`
/// counts replacements of that label starting at 1. `start`/`end` are byte
/// offsets into `raw_text`.
pub fn redact_pii(raw_text: &str) -> RedactionResult {
    let mut spans: Vec<(usize, usize, &'static str)> = Vec::new();
    for (re, label) in PATTERNS.iter() {
        spans.extend(re.find_iter(raw_text).map(|mat| (mat.start(), mat.end(), *label)));
    }

    // Stable sort: earliest start first, longest first on equal start, and
    // pattern-table order on exact ties.
    spans.sort_by(|a, b| a.0.cmp(&b.0).then_with(|| (b.1 - b.0).cmp(&(a.1 - a.0))));

    let mut selected: Vec<(usize, usize, &'static str)> = Vec::new();
    for (start, end, label) in spans {
        if let Some(prev) = selected.last_mut() {
            if start < prev.1 {
                // Overlaps the span already accepted: grow that span instead
                // of dropping this match, so its tail is redacted too.
                prev.1 = prev.1.max(end);
                continue;
            }
        }
        selected.push((start, end, label));
    }

    let mut counts = std::collections::HashMap::<&'static str, usize>::new();
    let mut redacted = String::with_capacity(raw_text.len() + 64);
    let mut pii_map = Vec::with_capacity(selected.len());
    let mut index = 0usize;

    for (start, end, label) in selected {
        // Copy the untouched text between the previous span and this one.
        redacted.push_str(&raw_text[index..start]);

        let count = counts.entry(label).or_insert(0);
        *count += 1;
        let placeholder = format!("[{}_{}]", label, count);
        redacted.push_str(&placeholder);

        pii_map.push(PiiMatch {
            entity_type: label.to_string(),
            // Span edges are always regex match edges, so they fall on UTF-8
            // character boundaries and the slice cannot panic.
            original: raw_text[start..end].to_string(),
            placeholder,
            start,
            end,
        });
        index = end;
    }

    // Trailing text after the last span (the whole input when nothing matched).
    redacted.push_str(&raw_text[index..]);

    RedactionResult { redacted_text: redacted, pii_map }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A sentence containing a name, age, MRN, phone number and SSN must come
    /// back with the name and phone number removed and with name, phone and
    /// SSN placeholders present.
    #[test]
    fn redaction_replaces_sensitive_text() {
        let outcome = redact_pii(
            "Patient John Smith, 45 yo, MRN 847291A, phone 5551234567, SSN 123-45-6789.",
        );
        let text = &outcome.redacted_text;

        // The raw values must no longer appear anywhere in the output.
        for leaked in ["John Smith", "5551234567"] {
            assert!(!text.contains(leaked));
        }

        // The name may be numbered 1 or 2 depending on which pattern fired first.
        let name_tokens = ["[PERSON_NAME_1]", "[PERSON_NAME_2]"];
        assert!(name_tokens.iter().any(|token| text.contains(*token)));

        for expected in ["[PHONE_1]", "[SSN_1]"] {
            assert!(text.contains(expected));
        }
    }
}