Spaces:
Sleeping
Sleeping
| # regulatory_ai_gradio.py | |
| import re | |
| import gradio as gr | |
# Optional Hugging Face NER.
# The NER model is a best-effort extra: if `transformers` cannot be imported
# or the pipeline cannot be constructed, the app falls back to regex and
# gazetteer detection only.
try:
    from transformers import pipeline

    ner_pipeline = pipeline("ner", grouped_entities=True)
except Exception:
    # Any failure here simply disables the NER option.
    ner_pipeline = None
    HF_AVAILABLE = False
else:
    HF_AVAILABLE = True
# Gazetteer of institution names matched verbatim in the input (label ORG).
KENYAN_HOSPITALS = [
    "Kenyatta National Hospital",
    "Moi Teaching and Referral Hospital",
    "Aga Khan University Hospital",
]

# Gazetteer of county names matched verbatim in the input (label LOC).
KENYAN_COUNTIES = [
    "Nairobi",
    "Kisumu",
    "Mombasa",
    "Nakuru",
]
# Label -> regular expression used to detect that kind of identifier.
# Insertion order is the order in which the patterns are scanned.
REGEX_PATTERNS = {
    # user@host.tld
    "EMAIL": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
    # "+254" or "0", then 7 or 1, then eight more digits
    "PHONE": r"(\+254|0)(7|1)\d{8}",
    # "NHIF", optional "No." / "Number", then digits
    "NHIF": r"NHIF\s?(No\.|Number)?\s?\d+",
    # stand-alone run of 7 or 8 digits
    "NAT_ID": r"\b\d{7,8}\b",
    # "NCT" followed by eight digits
    "NCT_ID": r"NCT\d{8}",
    # d/m/y or d-m-y with a 2- to 4-digit year
    "DATE": r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b",
}
# Label -> (action, advice).
# `action` is one of "redact", "mask", "generalize" or "keep"; `advice` is the
# human-readable explanation shown in the compliance notes.
COMPLIANCE_RULES = {
    # Regex-detected identifiers
    "EMAIL": (
        "redact",
        "Direct contact identifier. Remove.",
    ),
    "PHONE": (
        "redact",
        "Direct contact identifier. Remove.",
    ),
    "NHIF": (
        "mask",
        "NHIF number is sensitive. Mask or pseudonymize.",
    ),
    "NAT_ID": (
        "mask",
        "National ID is sensitive. Mask or pseudonymize.",
    ),
    "NCT_ID": (
        "keep",
        "Public trial identifier. Keep unless policy dictates masking.",
    ),
    "DATE": (
        "generalize",
        "Generalize dates (e.g., keep only year).",
    ),
    # Gazetteer / NER labels
    "ORG": (
        "generalize",
        "Institution names may indirectly identify. Generalize.",
    ),
    "LOC": (
        "generalize",
        "Geographic info may re-identify. Generalize.",
    ),
    "PER": (
        "redact",
        "Personal names are identifiers. Remove.",
    ),
}
# Display colour associated with each entity label.
ENTITY_COLORS = dict(
    EMAIL="red",
    PHONE="orange",
    NHIF="purple",
    NAT_ID="purple",
    NCT_ID="green",
    DATE="blue",
    ORG="teal",
    LOC="brown",
    PER="pink",
)
def _resolve_overlaps(findings):
    """Return findings in document order with overlapping spans dropped.

    The finding that starts first wins; when two start at the same offset the
    longer one wins, and exact ties keep whichever was detected first.
    Zero-length spans are discarded.
    """
    ordered = sorted(findings, key=lambda f: (f["span"][0], -f["span"][1]))
    kept = []
    cursor = 0  # end offset of the last finding that was kept
    for finding in ordered:
        start, end = finding["span"]
        if start < cursor or end <= start:
            continue
        kept.append(finding)
        cursor = end
    return kept


def _replacement_for(label, action, matched):
    """Return the text that replaces `matched` under the given action."""
    if action == "redact":
        return f"[REDACTED {label}]"
    if action == "mask":
        # Keep only the last two characters visible.
        return f"[MASKED: {'*' * (len(matched) - 2)}{matched[-2:]}]"
    if action == "generalize":
        return f"[{label} (GENERALIZED)]"
    # "keep" (or any unrecognised action) leaves the text untouched.
    return matched


def analyze_text(text, use_ner=False):
    """Detect sensitive entities in `text` and produce a sanitized copy.

    Detection sources, in order: the patterns in REGEX_PATTERNS, verbatim
    matches of the KENYAN_HOSPITALS / KENYAN_COUNTIES gazetteers (every
    occurrence, not just the first), and - when `use_ner` is true and the
    pipeline loaded - the Hugging Face NER pipeline.

    Findings whose label has no entry in COMPLIANCE_RULES are ignored.
    Overlapping findings (for example the digits inside an NHIF match that
    also look like a national ID) are reduced to a single finding before any
    replacement happens, so each character of the input is rewritten at most
    once.

    Args:
        text: Input text. ``None`` is treated as an empty string.
        use_ner: Whether to additionally run the NER pipeline.

    Returns:
        A 3-tuple ``(sanitized, highlights, notes)``:
        ``sanitized`` is the text with replacements applied; ``highlights``
        is a list of ``(matched_text, label)`` tuples in document order;
        ``notes`` is one line per applied finding in document order, or
        "No sensitive entities found." when there are none.
    """
    text = text or ""
    findings = []

    # regex detections
    for label, pattern in REGEX_PATTERNS.items():
        for match in re.finditer(pattern, text):
            findings.append({
                "label": label,
                "text": match.group(),
                "span": match.span(),
                "source": "regex",
            })

    # gazetteer detections: record every occurrence of every term
    for label, terms in (("ORG", KENYAN_HOSPITALS), ("LOC", KENYAN_COUNTIES)):
        for term in terms:
            for match in re.finditer(re.escape(term), text):
                findings.append({
                    "label": label,
                    "text": term,
                    "span": match.span(),
                    "source": "gazetteer",
                })

    # HF NER detections
    if use_ner and HF_AVAILABLE and text:
        for ent in ner_pipeline(text):
            start, end = ent.get("start"), ent.get("end")
            if start is None or end is None:
                # Without character offsets the entity cannot be replaced.
                continue
            findings.append({
                "label": ent["entity_group"],
                "text": ent["word"],
                "span": (start, end),
                "score": ent["score"],
                "source": "hf_ner",
            })

    # Only findings with a compliance rule take part in overlap resolution,
    # so an ignored label can never shadow an actionable one.
    actionable = [f for f in findings if COMPLIANCE_RULES.get(f["label"])]

    # apply rules + sanitize + highlights, walking forward through the
    # original text so span offsets never go stale
    pieces = []
    notes = []
    highlights = []
    cursor = 0
    for finding in _resolve_overlaps(actionable):
        label = finding["label"]
        matched = finding["text"]
        action, advice = COMPLIANCE_RULES[label]
        start, end = finding["span"]

        pieces.append(text[cursor:start])
        pieces.append(_replacement_for(label, action, matched))
        cursor = end

        notes.append(f'- [{label}] "{matched}" → {action.upper()} | {advice}')
        highlights.append((matched, label))
    pieces.append(text[cursor:])

    sanitized = "".join(pieces)
    notes_text = "\n".join(notes) if notes else "No sensitive entities found."
    return sanitized, highlights, notes_text
# --- Gradio UI ---
# Single-column layout: input text, NER toggle, Analyze button, then the three
# outputs returned by analyze_text (sanitized text, highlights, notes).
with gr.Blocks() as demo:
    gr.Markdown("## 🏥 Regulatory AI Demo (Kenya Context)\nPaste healthcare text to sanitize and get compliance notes.")
    with gr.Row():
        input_text = gr.Textbox(lines=8, label="Input Text")
    with gr.Row():
        use_ner = gr.Checkbox(label="Use Hugging Face NER (if available)", value=False)
    with gr.Row():
        btn = gr.Button("Analyze")
    with gr.Row():
        sanitized_output = gr.Textbox(lines=8, label="Sanitized Text")
    with gr.Row():
        # ENTITY_COLORS gives each label its own colour; without color_map the
        # component picks colours itself and the mapping is never used.
        highlighted_output = gr.HighlightedText(
            label="Detected Entities (Highlighted)",
            color_map=ENTITY_COLORS,
        )
    with gr.Row():
        notes_output = gr.Textbox(lines=8, label="Compliance Notes")
    # Event wiring must happen inside the Blocks context.
    btn.click(
        analyze_text,
        inputs=[input_text, use_ner],
        outputs=[sanitized_output, highlighted_output, notes_output],
    )

if __name__ == "__main__":
    demo.launch()