Spaces:
Sleeping
Sleeping
File size: 1,995 Bytes
c1f3739 279c5c4 afe6838 233d8ee 389c5f0 c1f3739 afe6838 c1f3739 f0e2099 afe6838 c1f3739 afe6838 c1f3739 afe6838 c1f3739 afe6838 c1f3739 afe6838 c1f3739 afe6838 c1f3739 afe6838 c1f3739 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 | from pathlib import Path
import re
from .helpers import dateparser_detect, describe_relative_date
def load_kenyan_names(filepath="data/processed/kenyan_names.txt"):
    """Read a newline-separated name list into a set of lowercase strings.

    Args:
        filepath: Location of a UTF-8 text file holding one name per line.

    Returns:
        A set with every non-blank line stripped of surrounding whitespace
        and lowercased. An empty set is returned when ``filepath`` does not
        exist.
    """
    source = Path(filepath)
    if not source.exists():
        return set()
    with source.open("r", encoding="utf-8") as handle:
        # Strip first so whitespace-only lines become empty and are dropped.
        stripped = (raw.strip() for raw in handle)
        return {entry.lower() for entry in stripped if entry}
# Lowercased name lexicon, loaded once at import time from the default path.
# It is an empty set when that file does not exist, in which case no names
# will ever be matched.
kenyan_names = load_kenyan_names()
def name_list_detect(text_names):
    """Collect the words of ``text_names`` that are in the known-names set.

    The text is split into runs of word characters; each run is compared,
    lowercased, against the module-level ``kenyan_names`` set.

    Args:
        text_names: Free text to scan.

    Returns:
        A list of the matching words in order of appearance, with their
        original casing. A word that occurs several times is listed once
        per occurrence.
    """
    hits = []
    for token in re.finditer(r"\b\w+\b", text_names):
        word = token.group()
        if word.lower() in kenyan_names:
            hits.append(word)
    return hits
def detect_and_redact_phi(text_input):
    """Detect names and dates in clinical text and return a redacted copy.

    Dates are replaced by the text that ``describe_relative_date`` produces
    for them, and every detected name is replaced by the literal ``[name]``.

    Args:
        text_input: The free text to scan.

    Returns:
        A dict with the keys:
            ``phi_detected``: True when at least one name or date was found.
            ``kenyan_name_matches``: The list returned by
                ``name_list_detect`` for the original text.
            ``dates``: The first element of every item returned by
                ``dateparser_detect``.
            ``redacted_text``: ``text_input`` after the replacements.
    """
    names_found = name_list_detect(text_input)
    # NOTE(review): used below as an iterable of (matched_text, parsed_date)
    # pairs that can be iterated twice - confirm against the helper.
    dates_found = dateparser_detect(text_input)
    phi_detected = bool(names_found or dates_found)

    for match, dt in dates_found:
        relative = describe_relative_date(dt)
        text_input = text_input.replace(match, relative)

    # Redact all names in ONE pass. Substituting name by name lets a later
    # pattern match inside a "[name]" placeholder inserted by an earlier one,
    # and repeats the whole scan once per duplicate occurrence.
    unique_names = list({name.lower(): name for name in names_found}.values())
    if unique_names:
        # Escape each name so it is always treated as literal text.
        alternation = "|".join(re.escape(name) for name in unique_names)
        pattern = re.compile(rf"\b(?:{alternation})\b", re.IGNORECASE)
        text_input = pattern.sub("[name]", text_input)

    return {
        "phi_detected": phi_detected,
        "kenyan_name_matches": names_found,
        "dates": [d[0] for d in dates_found],
        "redacted_text": text_input,
    }
if __name__ == "__main__":
    # Interactive loop: read one piece of text per prompt, report what was
    # detected, and print the redacted version until the user quits.
    print("\n🔍 PHI Detection Tool (Kenyan context + redaction with relative dates)\n")
    while True:
        try:
            text = input("Enter clinical text (or 'q' to quit):\n> ")
        except (EOFError, KeyboardInterrupt):
            # Exhausted piped input, Ctrl-D or Ctrl-C: exit without a traceback.
            print()
            break
        # Ignore stray whitespace around the quit command.
        if text.strip().lower() == "q":
            break
        results = detect_and_redact_phi(text)
        if results["phi_detected"]:
            print("\n⚠️ Possible PHI detected!")
            if results["kenyan_name_matches"]:
                print(" - Possible Kenyan names:", results["kenyan_name_matches"])
            if results["dates"]:
                print(" - Dates detected:", results["dates"])
            print("\n🛡️ Redacted text:")
            print(results["redacted_text"])
        else:
            print("\n✅ No PHI detected.")
        print("\n---\n")
|