File size: 1,995 Bytes
c1f3739
 
 
279c5c4
afe6838
233d8ee
389c5f0
c1f3739
 
 
 
 
afe6838
c1f3739
f0e2099
afe6838
 
 
c1f3739
 
 
afe6838
 
 
 
c1f3739
 
 
 
 
afe6838
c1f3739
 
 
afe6838
c1f3739
 
 
 
 
afe6838
c1f3739
 
afe6838
c1f3739
 
 
 
afe6838
c1f3739
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from pathlib import Path
import re

from .helpers import dateparser_detect, describe_relative_date


def load_kenyan_names(filepath="data/processed/kenyan_names.txt"):
    """Read a name list from *filepath* into a set of lowercase strings.

    The file is read as UTF-8, one entry per line. Surrounding whitespace is
    stripped from each line and lines that are empty after stripping are
    skipped. An empty set is returned when the path does not exist.
    """
    source = Path(filepath)
    if not source.exists():
        return set()

    names = set()
    with source.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if entry:
                names.add(entry.lower())
    return names


# Lowercased name lexicon, loaded once at import time from the default path;
# it is an empty set when that file does not exist.
kenyan_names = load_kenyan_names()


def name_list_detect(text_names):
    """Return the words of *text_names* that appear in ``kenyan_names``.

    Words are runs of ``\\w`` characters delimited by word boundaries. The
    comparison against the name set is done on the lowercased word, while the
    returned list keeps each word as written in the input, in order of
    appearance and with repeats preserved.
    """
    hits = []
    for token in re.finditer(r"\b\w+\b", text_names):
        word = token.group()
        if word.lower() in kenyan_names:
            hits.append(word)
    return hits


def detect_and_redact_phi(text_input):
    """Detect names and dates in *text_input* and return a redacted copy.

    Names come from ``name_list_detect`` and dates from ``dateparser_detect``,
    whose result is iterated as ``(matched_text, value)`` pairs. Every
    occurrence of a matched date string is replaced by
    ``describe_relative_date(value)``, and every whole-word, case-insensitive
    occurrence of a matched name is replaced by the literal ``[name]``.

    Args:
        text_input: The text to scan.

    Returns:
        A dict with the keys:
          * ``phi_detected`` - True when at least one name or date was found.
          * ``kenyan_name_matches`` - matched words as returned by
            ``name_list_detect``.
          * ``dates`` - the matched text of each detected date.
          * ``redacted_text`` - the input with dates and names replaced.
    """
    names_found = name_list_detect(text_input)
    dates_found = dateparser_detect(text_input)

    redacted = text_input

    # Dates are rewritten first, exactly as matched in the text.
    for match, dt in dates_found:
        redacted = redacted.replace(match, describe_relative_date(dt))

    if names_found:
        # Redact all names in ONE pass. Substituting name by name rescans text
        # that already contains "[name]" placeholders, so a matched word such
        # as "name" would wrap earlier placeholders again ("[[name]]").
        # dict.fromkeys drops duplicate matches while keeping their order, and
        # re.escape keeps the interpolated words literal.
        unique_names = dict.fromkeys(names_found)
        alternation = "|".join(re.escape(name) for name in unique_names)
        name_pattern = re.compile(rf"\b(?:{alternation})\b", re.IGNORECASE)
        redacted = name_pattern.sub("[name]", redacted)

    return {
        "phi_detected": bool(names_found or dates_found),
        "kenyan_name_matches": names_found,
        "dates": [d[0] for d in dates_found],
        "redacted_text": redacted,
    }


if __name__ == "__main__":
    # Interactive console loop: read a line of text, run PHI detection and
    # redaction on it, and print a short report. Entering 'q' (any case) or
    # closing stdin ends the session.
    print("\n🔍 PHI Detection Tool (Kenyan context + redaction with relative dates)\n")
    while True:
        try:
            text = input("Enter clinical text (or 'q' to quit):\n> ")
        except EOFError:
            # stdin was closed (Ctrl-D, or piped input ran out): leave the
            # loop quietly instead of ending with a traceback.
            break
        if text.lower() == "q":
            break
        results = detect_and_redact_phi(text)

        if results["phi_detected"]:
            print("\n⚠️  Possible PHI detected!")
            if results["kenyan_name_matches"]:
                print(" - Possible Kenyan names:", results["kenyan_name_matches"])
            if results["dates"]:
                print(" - Dates detected:", results["dates"])

            print("\n🛡️  Redacted text:")
            print(results["redacted_text"])
        else:
            print("\n✅ No PHI detected.")
        print("\n---\n")