# Source: Hugging Face Space "AndaiMD/Regulatory_ai" (status: Sleeping)
# File size: 4,909 Bytes

# regulatory_ai_gradio.py
import re
import gradio as gr

# Optional Hugging Face NER.
# The pipeline is built once at import time. If `transformers` is missing, or
# the default model cannot be loaded, the app falls back to the regex and
# gazetteer detectors only (HF_AVAILABLE is False and ner_pipeline is None).
try:
    from transformers import pipeline

    # `aggregation_strategy="simple"` is the supported spelling of the
    # deprecated `grouped_entities=True` flag: word pieces are merged into
    # whole entities that carry "entity_group", "word", "start" and "end".
    ner_pipeline = pipeline("ner", aggregation_strategy="simple")
    HF_AVAILABLE = True
except Exception:
    # Deliberately broad: any import, download or initialisation failure
    # should disable NER rather than stop the app from starting.
    ner_pipeline = None
    HF_AVAILABLE = False

# Gazetteer of hospital names. Entries are looked up in the input as exact,
# case-sensitive substrings and reported under the "ORG" label.
KENYAN_HOSPITALS = [
    "Kenyatta National Hospital",
    "Moi Teaching and Referral Hospital",
    "Aga Khan University Hospital",
]

# Gazetteer of county names, looked up the same way and reported as "LOC".
KENYAN_COUNTIES = [
    "Nairobi",
    "Kisumu",
    "Mombasa",
    "Nakuru",
]

# Label -> regular expression used to detect that kind of identifier.
# Insertion order is the order in which the detectors run. Each pattern is
# written as adjacent raw-string fragments so every part can be annotated;
# Python joins the fragments into a single pattern string.
REGEX_PATTERNS = {
    "EMAIL": (
        r"[a-zA-Z0-9._%+-]+"   # local part
        r"@[a-zA-Z0-9.-]+"     # "@" and the domain labels
        r"\.[a-zA-Z]{2,}"      # final dot and a suffix of 2+ letters
    ),
    "PHONE": (
        r"(\+254|0)"           # "+254" or a leading "0"
        r"(7|1)"               # next digit must be 7 or 1
        r"\d{8}"               # eight further digits (no boundary checks)
    ),
    "NHIF": (
        r"NHIF\s?"             # literal "NHIF", optional whitespace char
        r"(No\.|Number)?\s?"   # optional "No." / "Number", optional whitespace
        r"\d+"                 # the number itself
    ),
    # Any standalone run of 7 or 8 digits; this also matches the digits
    # inside an NHIF match when they happen to be 7-8 long.
    "NAT_ID": r"\b\d{7,8}\b",
    # "NCT" followed by exactly eight digits.
    "NCT_ID": r"NCT\d{8}",
    "DATE": (
        r"\b\d{1,2}[/-]"       # 1-2 digits, then "/" or "-"
        r"\d{1,2}[/-]"         # 1-2 digits, then "/" or "-"
        r"\d{2,4}\b"           # 2-4 digits
    ),
}

# Label -> (action, advice).
# `action` selects how a detected value is rewritten: "redact", "mask" and
# "generalize" each produce a placeholder, any other value (here "keep")
# leaves the text untouched. `advice` is the sentence shown to the user in
# the compliance notes. Labels without an entry here are ignored.
COMPLIANCE_RULES = {
    "EMAIL": (
        "redact",
        "Direct contact identifier. Remove.",
    ),
    "PHONE": (
        "redact",
        "Direct contact identifier. Remove.",
    ),
    "NHIF": (
        "mask",
        "NHIF number is sensitive. Mask or pseudonymize.",
    ),
    "NAT_ID": (
        "mask",
        "National ID is sensitive. Mask or pseudonymize.",
    ),
    "NCT_ID": (
        "keep",
        "Public trial identifier. Keep unless policy dictates masking.",
    ),
    "DATE": (
        "generalize",
        "Generalize dates (e.g., keep only year).",
    ),
    "ORG": (
        "generalize",
        "Institution names may indirectly identify. Generalize.",
    ),
    "LOC": (
        "generalize",
        "Geographic info may re-identify. Generalize.",
    ),
    "PER": (
        "redact",
        "Personal names are identifiers. Remove.",
    ),
}

# Display colour for each entity label, given as a colour name.
# The keys are the same nine labels that COMPLIANCE_RULES defines; NHIF and
# NAT_ID intentionally or not share "purple".
# NOTE(review): presumably meant as the colour map for the highlighted-entities
# output — confirm it is actually passed to the UI component.
ENTITY_COLORS = {
    "EMAIL": "red",
    "PHONE": "orange",
    "NHIF": "purple",
    "NAT_ID": "purple",
    "NCT_ID": "green",
    "DATE": "blue",
    "ORG": "teal",
    "LOC": "brown",
    "PER": "pink",
}

def _find_all(haystack, needle):
    """Return the (start, end) span of every non-overlapping *needle* in *haystack*."""
    spans = []
    if not needle:
        return spans
    position = haystack.find(needle)
    while position != -1:
        end = position + len(needle)
        spans.append((position, end))
        position = haystack.find(needle, end)
    return spans


def _collect_findings(text, use_ner):
    """Run the regex, gazetteer and (optionally) NER detectors over *text*.

    Findings are returned in detector order (regex, gazetteer, NER); that
    order is what later breaks ties between detectors reporting the same span.
    """
    findings = []

    # Regex detections.
    for label, pattern in REGEX_PATTERNS.items():
        for match in re.finditer(pattern, text):
            findings.append({
                "label": label,
                "text": match.group(),
                "span": match.span(),
                "source": "regex",
            })

    # Gazetteer detections: every occurrence, not just the first one, so a
    # name that is repeated later in the text cannot slip through.
    for label, terms in (("ORG", KENYAN_HOSPITALS), ("LOC", KENYAN_COUNTIES)):
        for term in terms:
            for span in _find_all(text, term):
                findings.append({
                    "label": label,
                    "text": term,
                    "span": span,
                    "source": "gazetteer",
                })

    # Hugging Face NER detections.
    if use_ner and HF_AVAILABLE:
        for ent in ner_pipeline(text):
            start, end = ent.get("start"), ent.get("end")
            if start is None or end is None:
                # Character offsets are only present when the tokenizer can
                # provide them; fall back to locating the entity text.
                spans = _find_all(text, ent["word"])
            else:
                spans = [(start, end)]
            for span in spans:
                findings.append({
                    "label": ent["entity_group"],
                    # Slice the input rather than trusting "word", which the
                    # tokenizer may have normalised.
                    "text": text[span[0]:span[1]],
                    "span": span,
                    "score": ent["score"],
                    "source": "hf_ner",
                })

    return findings


def _resolve_overlaps(findings, text):
    """Return findings in document order with no two spans overlapping.

    The earliest-starting finding wins, then the longest; equal spans keep
    detector order because the sort is stable. A finding that sticks out past
    the one before it is trimmed to its uncovered tail so that the union of
    all detected spans is still sanitized.
    """
    ordered = sorted(findings, key=lambda f: (f["span"][0], -f["span"][1]))
    resolved = []
    covered_to = 0
    for finding in ordered:
        start, end = finding["span"]
        if end <= start or end <= covered_to:
            continue  # empty, or entirely inside an earlier finding
        if start < covered_to:
            start = covered_to
            finding = {**finding, "span": (start, end), "text": text[start:end]}
        resolved.append(finding)
        covered_to = end
    return resolved


def _replacement_for(label, action, value):
    """Build the placeholder that stands in for *value* under *action*."""
    if action == "redact":
        return f"[REDACTED {label}]"
    if action == "mask":
        # Show the last two characters only when something is left to hide.
        visible = value[-2:] if len(value) > 2 else ""
        return f"[MASKED: {'*' * (len(value) - len(visible))}{visible}]"
    if action == "generalize":
        return f"[{label} (GENERALIZED)]"
    return value  # e.g. "keep": leave the text as it is


def analyze_text(text, use_ner=False):
    """Detect sensitive entities in *text* and produce a sanitized copy.

    Detection combines the regexes in REGEX_PATTERNS, the hospital/county
    gazetteers and, when *use_ner* is true and the pipeline loaded, Hugging
    Face NER. Only labels with an entry in COMPLIANCE_RULES are acted on.
    Overlapping detections are reconciled first, so each stretch of the input
    is replaced at most once and offsets never go stale.

    Args:
        text: Input text. ``None`` is treated as an empty string.
        use_ner: Whether to also run the NER pipeline (ignored if unavailable).

    Returns:
        A 3-tuple ``(sanitized, highlights, notes)``:
        ``sanitized`` is the text with placeholders substituted;
        ``highlights`` is a list of ``(entity_text, label)`` tuples in
        document order; ``notes`` is one advice line per entity joined by
        newlines, or "No sensitive entities found." when there are none.
    """
    text = text or ""

    actionable = [
        finding
        for finding in _collect_findings(text, use_ner)
        if COMPLIANCE_RULES.get(finding["label"])
    ]

    pieces = []
    notes = []
    highlights = []
    cursor = 0

    # Assemble the output left to right from slices of the *original* text,
    # which keeps every span valid regardless of placeholder lengths.
    for finding in _resolve_overlaps(actionable, text):
        label = finding["label"]
        value = finding["text"]
        start, end = finding["span"]
        action, advice = COMPLIANCE_RULES[label]

        pieces.append(text[cursor:start])
        pieces.append(_replacement_for(label, action, value))
        cursor = end

        notes.append(f'- [{label}] "{value}" → {action.upper()} | {advice}')
        highlights.append((value, label))

    pieces.append(text[cursor:])
    sanitized = "".join(pieces)

    return sanitized, highlights, "\n".join(notes) if notes else "No sensitive entities found."

# --- Gradio UI ---
# One column of rows: input box, NER toggle, Analyze button, then the three
# outputs of analyze_text (sanitized text, highlighted entities, notes).
with gr.Blocks() as demo:
    gr.Markdown("## 🏥 Regulatory AI Demo (Kenya Context)\nPaste healthcare text to sanitize and get compliance notes.")
    with gr.Row():
        input_text = gr.Textbox(lines=8, label="Input Text")
    with gr.Row():
        use_ner = gr.Checkbox(label="Use Hugging Face NER (if available)", value=False)
    with gr.Row():
        btn = gr.Button("Analyze")
    with gr.Row():
        sanitized_output = gr.Textbox(lines=8, label="Sanitized Text")
    with gr.Row():
        # Pass the per-label colours so each entity type is rendered in the
        # colour assigned to it in ENTITY_COLORS instead of Gradio's defaults.
        highlighted_output = gr.HighlightedText(
            label="Detected Entities (Highlighted)",
            color_map=ENTITY_COLORS,
        )
    with gr.Row():
        notes_output = gr.Textbox(lines=8, label="Compliance Notes")

    # The click handler's return tuple maps positionally onto `outputs`.
    btn.click(analyze_text, inputs=[input_text, use_ner],
              outputs=[sanitized_output, highlighted_output, notes_output])

if __name__ == "__main__":
    demo.launch()