File size: 4,909 Bytes
c443e05
 
 
2f87d34
c443e05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f87d34
 
c443e05
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# regulatory_ai_gradio.py
import re
import gradio as gr

# Optional Hugging Face NER.
# If importing transformers or constructing the pipeline raises anything, the
# app keeps running without NER: ner_pipeline is None and HF_AVAILABLE is False.
# NOTE(review): newer transformers releases document `aggregation_strategy`
# as the replacement for `grouped_entities` — confirm before upgrading.
try:
    from transformers import pipeline

    ner_pipeline = pipeline("ner", grouped_entities=True)
except Exception:
    ner_pipeline = None
    HF_AVAILABLE = False
else:
    HF_AVAILABLE = True

# Gazetteer of hospital names; exact substring hits are reported as ORG.
KENYAN_HOSPITALS = [
    "Kenyatta National Hospital",
    "Moi Teaching and Referral Hospital",
    "Aga Khan University Hospital",
]

# Gazetteer of county names; exact substring hits are reported as LOC.
KENYAN_COUNTIES = [
    "Nairobi",
    "Kisumu",
    "Mombasa",
    "Nakuru",
]

# Label -> regular expression. Iteration order is the detection order.
REGEX_PATTERNS = {
    # local-part "@" domain "." TLD of two or more letters
    "EMAIL": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
    # "+254" or "0", then 7 or 1, then eight digits
    "PHONE": r"(\+254|0)(7|1)\d{8}",
    # "NHIF", optionally "No." / "Number", then digits
    "NHIF": r"NHIF\s?(No\.|Number)?\s?\d+",
    # stand-alone run of seven or eight digits
    "NAT_ID": r"\b\d{7,8}\b",
    # "NCT" followed by eight digits
    "NCT_ID": r"NCT\d{8}",
    # three numeric groups separated by "/" or "-"
    "DATE": r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b",
}

# Label -> (action, advice). The action selects how a finding is rewritten
# in the sanitized text; the advice is echoed in the compliance notes.
COMPLIANCE_RULES = {
    "EMAIL": (
        "redact",
        "Direct contact identifier. Remove.",
    ),
    "PHONE": (
        "redact",
        "Direct contact identifier. Remove.",
    ),
    "NHIF": (
        "mask",
        "NHIF number is sensitive. Mask or pseudonymize.",
    ),
    "NAT_ID": (
        "mask",
        "National ID is sensitive. Mask or pseudonymize.",
    ),
    "NCT_ID": (
        "keep",
        "Public trial identifier. Keep unless policy dictates masking.",
    ),
    "DATE": (
        "generalize",
        "Generalize dates (e.g., keep only year).",
    ),
    "ORG": (
        "generalize",
        "Institution names may indirectly identify. Generalize.",
    ),
    "LOC": (
        "generalize",
        "Geographic info may re-identify. Generalize.",
    ),
    "PER": (
        "redact",
        "Personal names are identifiers. Remove.",
    ),
}

# Display colour for each entity label.
ENTITY_COLORS = {
    # direct contact identifiers
    "EMAIL": "red",
    "PHONE": "orange",
    # identity numbers share one colour
    "NHIF": "purple",
    "NAT_ID": "purple",
    # trial identifiers and dates
    "NCT_ID": "green",
    "DATE": "blue",
    # organisations, locations, persons
    "ORG": "teal",
    "LOC": "brown",
    "PER": "pink",
}

def analyze_text(text, use_ner=False):
    """Detect sensitive entities in ``text`` and produce a sanitized copy.

    Detection combines three sources: the ``REGEX_PATTERNS`` regexes, exact
    substring matches against ``KENYAN_HOSPITALS`` (label ``ORG``) and
    ``KENYAN_COUNTIES`` (label ``LOC``), and, when ``use_ner`` is true and the
    Hugging Face pipeline loaded, ``ner_pipeline``. Findings whose label has
    no entry in ``COMPLIANCE_RULES`` are ignored. When findings overlap, the
    longest span wins (ties go to the finding detected first) so that every
    character of the input is rewritten at most once.

    Args:
        text: Input text to scan. ``None`` or ``""`` yields an empty result.
        use_ner: Also run the Hugging Face NER pipeline if it is available.

    Returns:
        A tuple ``(sanitized, highlights, notes)``. ``sanitized`` is ``text``
        with each kept finding replaced according to its rule. ``highlights``
        is a list of ``(matched_text, label)`` pairs ordered from the last
        finding in the text to the first. ``notes`` holds one line per
        finding joined by newlines, or ``"No sensitive entities found."``
        when nothing was detected.
    """
    if not text:
        return "", [], "No sensitive entities found."

    findings = []

    # regex detections
    for label, pattern in REGEX_PATTERNS.items():
        for match in re.finditer(pattern, text):
            findings.append({
                "label": label,
                "text": match.group(),
                "span": match.span(),
                "source": "regex"
            })

    # gazetteer detections: report every occurrence, not only the first
    for label, names in (("ORG", KENYAN_HOSPITALS), ("LOC", KENYAN_COUNTIES)):
        for name in names:
            for match in re.finditer(re.escape(name), text):
                findings.append({
                    "label": label,
                    "text": name,
                    "span": match.span(),
                    "source": "gazetteer"
                })

    # HF NER detections
    if use_ner and HF_AVAILABLE:
        for ent in ner_pipeline(text):
            start, end = ent.get("start"), ent.get("end")
            # Offsets may be missing for some tokenizers; without them the
            # entity cannot be located in the text, so skip it.
            if start is None or end is None:
                continue
            findings.append({
                "label": ent["entity_group"],
                "text": ent["word"],
                "span": (start, end),
                "score": ent["score"],
                "source": "hf_ner"
            })

    # Drop labels that have no rule first, so they cannot shadow a finding
    # that does need rewriting.
    actionable = [f for f in findings if f["label"] in COMPLIANCE_RULES]

    # Resolve overlaps: longest span first; the stable sort keeps detection
    # order among equally long spans. Replacing overlapping spans one after
    # another would use stale offsets and corrupt the sanitized text.
    actionable.sort(key=lambda f: f["span"][0] - f["span"][1])
    selected = []
    for f in actionable:
        start, end = f["span"]
        if all(end <= s or start >= e
               for s, e in (kept["span"] for kept in selected)):
            selected.append(f)

    # apply rules + sanitize + highlights
    sanitized = text
    notes = []
    highlights = []

    # Replace from the end of the text backwards so earlier offsets stay valid.
    for f in sorted(selected, key=lambda f: f["span"][0], reverse=True):
        label = f["label"]
        action, advice = COMPLIANCE_RULES[label]
        original = f["text"]

        # replacement for sanitized
        if action == "redact":
            replacement = f"[REDACTED {label}]"
        elif action == "mask":
            # Keep only the last two characters visible.
            replacement = f"[MASKED: {'*' * (len(original) - 2)}{original[-2:]}]"
        elif action == "generalize":
            replacement = f"[{label} (GENERALIZED)]"
        else:
            replacement = original

        start, end = f["span"]
        sanitized = sanitized[:start] + replacement + sanitized[end:]

        notes.append(f"- [{label}] \"{original}\" → {action.upper()} | {advice}")
        highlights.append((original, label))

    return sanitized, highlights, "\n".join(notes) if notes else "No sensitive entities found."

# --- Gradio UI ---
# Single-column layout: input text, NER toggle, Analyze button, then the three
# outputs of analyze_text (sanitized text, highlighted entities, notes).
with gr.Blocks() as demo:
    gr.Markdown("## 🏥 Regulatory AI Demo (Kenya Context)\nPaste healthcare text to sanitize and get compliance notes.")
    with gr.Row():
        input_text = gr.Textbox(lines=8, label="Input Text")
    with gr.Row():
        use_ner = gr.Checkbox(label="Use Hugging Face NER (if available)", value=False)
    with gr.Row():
        btn = gr.Button("Analyze")
    with gr.Row():
        sanitized_output = gr.Textbox(lines=8, label="Sanitized Text")
    with gr.Row():
        # ENTITY_COLORS was defined per label but never applied; pass it as
        # the colour map so each label gets its intended colour.
        highlighted_output = gr.HighlightedText(
            label="Detected Entities (Highlighted)",
            color_map=ENTITY_COLORS,
        )
    with gr.Row():
        notes_output = gr.Textbox(lines=8, label="Compliance Notes")

    # The output order matches the tuple returned by analyze_text.
    btn.click(analyze_text, inputs=[input_text, use_ner],
              outputs=[sanitized_output, highlighted_output, notes_output])

# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()