Spaces:
Sleeping
Sleeping
File size: 4,909 Bytes
# regulatory_ai_gradio.py
import re
import gradio as gr
# Optional Hugging Face NER.
# Importing transformers or building the pipeline may fail (package missing,
# model unavailable, ...). Any such failure leaves ner_pipeline as None, and
# HF_AVAILABLE simply records whether a pipeline object was obtained.
try:
    from transformers import pipeline
    ner_pipeline = pipeline("ner", grouped_entities=True)
except Exception:
    ner_pipeline = None
HF_AVAILABLE = ner_pipeline is not None
# Gazetteers: literal names looked up verbatim (case-sensitive) in the input.
# Hospital names are reported under the "ORG" label.
KENYAN_HOSPITALS = [
    "Kenyatta National Hospital",
    "Moi Teaching and Referral Hospital",
    "Aga Khan University Hospital",
]

# County names are reported under the "LOC" label.
KENYAN_COUNTIES = [
    "Nairobi",
    "Kisumu",
    "Mombasa",
    "Nakuru",
]
# Entity label -> regular expression used to detect that entity in free text.
# Iteration order is the detection order.
REGEX_PATTERNS = dict(
    # name@domain.tld
    EMAIL=r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
    # "+254" or "0" prefix, then 7 or 1, then eight more digits
    PHONE=r"(\+254|0)(7|1)\d{8}",
    # "NHIF", optional "No."/"Number", then digits
    NHIF=r"NHIF\s?(No\.|Number)?\s?\d+",
    # any stand-alone 7- or 8-digit number
    NAT_ID=r"\b\d{7,8}\b",
    # "NCT" followed by eight digits
    NCT_ID=r"NCT\d{8}",
    # d/m/y style dates with "/" or "-" separators
    DATE=r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b",
)
# Entity label -> (action, advice).
# The action is one of "redact", "mask", "generalize" or "keep"; the advice is
# the human-readable explanation shown in the compliance notes.
COMPLIANCE_RULES = dict(
    EMAIL=("redact", "Direct contact identifier. Remove."),
    PHONE=("redact", "Direct contact identifier. Remove."),
    NHIF=("mask", "NHIF number is sensitive. Mask or pseudonymize."),
    NAT_ID=("mask", "National ID is sensitive. Mask or pseudonymize."),
    NCT_ID=("keep", "Public trial identifier. Keep unless policy dictates masking."),
    DATE=("generalize", "Generalize dates (e.g., keep only year)."),
    ORG=("generalize", "Institution names may indirectly identify. Generalize."),
    LOC=("generalize", "Geographic info may re-identify. Generalize."),
    PER=("redact", "Personal names are identifiers. Remove."),
)
# Display color assigned to each entity label.
ENTITY_COLORS = dict(
    EMAIL="red",
    PHONE="orange",
    NHIF="purple",
    NAT_ID="purple",
    NCT_ID="green",
    DATE="blue",
    ORG="teal",
    LOC="brown",
    PER="pink",
)
def _find_literal_spans(text, phrases, label):
    """Return one gazetteer finding for every occurrence of each phrase in *text*."""
    found = []
    for phrase in phrases:
        # finditer (not str.index) so that repeated mentions are all caught.
        for match in re.finditer(re.escape(phrase), text):
            found.append({
                "label": label,
                "text": phrase,
                "span": match.span(),
                "source": "gazetteer",
            })
    return found


def _drop_overlaps(findings):
    """Return non-overlapping findings in document order.

    The earliest start wins; among findings starting at the same offset the
    longest wins, and remaining ties keep detection order (the sort is stable).
    """
    ordered = sorted(
        findings,
        key=lambda f: (f["span"][0], -(f["span"][1] - f["span"][0])),
    )
    kept = []
    covered_until = 0
    for finding in ordered:
        start, end = finding["span"]
        if start < covered_until:
            continue  # lies inside/overlaps a span that is already handled
        kept.append(finding)
        covered_until = end
    return kept


def _replacement_for(action, label, matched):
    """Return the text that replaces *matched* under the given rule action."""
    if action == "redact":
        return f"[REDACTED {label}]"
    if action == "mask":
        # Keep only the last two characters visible.
        return f"[MASKED: {'*' * (len(matched) - 2)}{matched[-2:]}]"
    if action == "generalize":
        return f"[{label} (GENERALIZED)]"
    return matched  # "keep" (or any unknown action): leave the text as is


def analyze_text(text, use_ner=False):
    """Detect sensitive entities in *text* and produce a sanitized copy.

    Detection combines the regexes in REGEX_PATTERNS, literal lookups of
    KENYAN_HOSPITALS ("ORG") and KENYAN_COUNTIES ("LOC") and, when *use_ner*
    is true and the Hugging Face pipeline loaded, NER entities. Findings whose
    label has no entry in COMPLIANCE_RULES are ignored. Overlapping findings
    are reduced to one per region so every replacement is applied to
    untouched original text.

    Args:
        text: Input text; ``None`` or ``""`` yields an empty result.
        use_ner: Also run the NER pipeline if it is available.

    Returns:
        A 3-tuple ``(sanitized, highlights, notes)``: the sanitized text, a
        list of ``(entity_text, label)`` tuples in document order, and the
        compliance notes joined by newlines (or a "nothing found" message).
    """
    if not text:
        return "", [], "No sensitive entities found."

    findings = []

    # regex detections
    for label, pattern in REGEX_PATTERNS.items():
        for match in re.finditer(pattern, text):
            findings.append({
                "label": label,
                "text": match.group(),
                "span": match.span(),
                "source": "regex",
            })

    # gazetteer detections
    findings.extend(_find_literal_spans(text, KENYAN_HOSPITALS, "ORG"))
    findings.extend(_find_literal_spans(text, KENYAN_COUNTIES, "LOC"))

    # HF NER detections
    if use_ner and HF_AVAILABLE:
        for ent in ner_pipeline(text):
            start, end = ent.get("start"), ent.get("end")
            if start is None or end is None:
                # Without character offsets the entity cannot be replaced
                # safely, so it is skipped rather than guessed at.
                continue
            findings.append({
                "label": ent["entity_group"],
                "text": ent["word"],
                "span": (start, end),
                "score": ent["score"],
                "source": "hf_ner",
            })

    # Only labels with a compliance rule take part in sanitization; filtering
    # first keeps a rule-less finding from shadowing an actionable one.
    actionable = [f for f in findings if f["label"] in COMPLIANCE_RULES]

    # Build the sanitized text left to right from untouched slices of the
    # original, so offsets never go stale after a replacement.
    pieces = []
    notes = []
    highlights = []
    cursor = 0
    for finding in _drop_overlaps(actionable):
        label = finding["label"]
        matched = finding["text"]
        action, advice = COMPLIANCE_RULES[label]
        start, end = finding["span"]

        pieces.append(text[cursor:start])
        pieces.append(_replacement_for(action, label, matched))
        cursor = end

        notes.append(f'- [{label}] "{matched}" → {action.upper()} | {advice}')
        highlights.append((matched, label))
    pieces.append(text[cursor:])

    sanitized = "".join(pieces)
    summary = "\n".join(notes) if notes else "No sensitive entities found."
    return sanitized, highlights, summary
# --- Gradio UI ---
# Single-column page: input text, NER toggle and an Analyze button, followed
# by the three outputs of analyze_text (sanitized text, entity highlights and
# compliance notes).
with gr.Blocks() as demo:
    gr.Markdown("## 🏥 Regulatory AI Demo (Kenya Context)\nPaste healthcare text to sanitize and get compliance notes.")
    with gr.Row():
        input_text = gr.Textbox(lines=8, label="Input Text")
    with gr.Row():
        use_ner = gr.Checkbox(label="Use Hugging Face NER (if available)", value=False)
    with gr.Row():
        btn = gr.Button("Analyze")
    with gr.Row():
        sanitized_output = gr.Textbox(lines=8, label="Sanitized Text")
    with gr.Row():
        # ENTITY_COLORS was defined but never applied; passing it as the
        # color map gives every label its intended, stable color.
        highlighted_output = gr.HighlightedText(
            label="Detected Entities (Highlighted)",
            color_map=ENTITY_COLORS,
        )
    with gr.Row():
        notes_output = gr.Textbox(lines=8, label="Compliance Notes")

    # The output order must match the tuple returned by analyze_text.
    btn.click(
        analyze_text,
        inputs=[input_text, use_ner],
        outputs=[sanitized_output, highlighted_output, notes_output],
    )

if __name__ == "__main__":
    demo.launch()
|