Spaces:

AndaiMD
/

Regulatory_ai

Sleeping

App Files Files Community

Regulatory_ai / app.py

AndaiMD

gradio app

7f0a9f7 5 months ago

raw

history blame contribute delete

4.91 kB

	# regulatory_ai_gradio.py
	import re
	import gradio as gr

	# Optional Hugging Face NER.
	# The app must still start when `transformers` is not installed or the default
	# NER model cannot be loaded, so any failure here deliberately degrades to
	# "NER unavailable" (ner_pipeline is None, HF_AVAILABLE is False) instead of
	# crashing at import time.
	try:
	    from transformers import pipeline

	    # aggregation_strategy="simple" merges word pieces into whole entities; it
	    # replaces the deprecated grouped_entities=True flag.
	    ner_pipeline = pipeline("ner", aggregation_strategy="simple")
	    HF_AVAILABLE = True
	except Exception:
	    ner_pipeline = None
	    HF_AVAILABLE = False

	# Gazetteer of hospital names looked up verbatim (case-sensitive) in the input.
	KENYAN_HOSPITALS = [
	    "Kenyatta National Hospital",
	    "Moi Teaching and Referral Hospital",
	    "Aga Khan University Hospital",
	]

	# Gazetteer of county names looked up verbatim (case-sensitive) in the input.
	KENYAN_COUNTIES = [
	    "Nairobi",
	    "Kisumu",
	    "Mombasa",
	    "Nakuru",
	]

	# Label -> regular expression for identifiers that can be detected by shape.
	# Alternations use a plain `|`; an escaped `\|` would match a literal pipe
	# character and silently disable the PHONE and NHIF detectors.
	REGEX_PATTERNS = {
	    "EMAIL": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
	    # +254 or a leading 0, then a 7xx/1xx prefix digit and eight more digits.
	    "PHONE": r"(?:\+254|0)[17]\d{8}",
	    # "NHIF", optionally followed by "No." or "Number", then the digits.
	    "NHIF": r"NHIF\s?(?:No\.|Number)?\s?\d+",
	    # Stand-alone run of 7-8 digits.
	    "NAT_ID": r"\b\d{7,8}\b",
	    # "NCT" followed by exactly eight digits.
	    "NCT_ID": r"NCT\d{8}",
	    # d/m/y style dates separated by "/" or "-".
	    "DATE": r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b",
	}

	# Entity label -> (action, advice).
	# The action is one of "redact", "mask", "generalize" or "keep"; the advice is
	# the human-readable explanation paired with that action.
	COMPLIANCE_RULES = {
	    # Identifiers detected by regular expression.
	    "EMAIL": ("redact", "Direct contact identifier. Remove."),
	    "PHONE": ("redact", "Direct contact identifier. Remove."),
	    "NHIF": ("mask", "NHIF number is sensitive. Mask or pseudonymize."),
	    "NAT_ID": ("mask", "National ID is sensitive. Mask or pseudonymize."),
	    "NCT_ID": ("keep", "Public trial identifier. Keep unless policy dictates masking."),
	    "DATE": ("generalize", "Generalize dates (e.g., keep only year)."),
	    # Named-entity style labels.
	    "ORG": ("generalize", "Institution names may indirectly identify. Generalize."),
	    "LOC": ("generalize", "Geographic info may re-identify. Generalize."),
	    "PER": ("redact", "Personal names are identifiers. Remove."),
	}

	# Display color name for each entity label.
	# NOTE(review): nothing in the code visible here reads this mapping yet.
	ENTITY_COLORS = {
	    "EMAIL": "red",
	    "PHONE": "orange",
	    "NHIF": "purple",
	    "NAT_ID": "purple",
	    "NCT_ID": "green",
	    "DATE": "blue",
	    "ORG": "teal",
	    "LOC": "brown",
	    "PER": "pink",
	}

	def _find_all(needle, text):
	    """Yield the (start, end) span of every occurrence of *needle* in *text*."""
	    if not needle:
	        return
	    start = text.find(needle)
	    while start != -1:
	        end = start + len(needle)
	        yield start, end
	        start = text.find(needle, end)


	def _collect_findings(text, use_ner):
	    """Run the regex, gazetteer and (optionally) NER detectors over *text*.

	    Returns a list of dicts with ``label``, ``text``, ``span`` and ``source``
	    keys (NER findings also carry ``score``). Findings may overlap.
	    """
	    findings = []

	    # Regex detections.
	    for label, pattern in REGEX_PATTERNS.items():
	        findings.extend(
	            {
	                "label": label,
	                "text": match.group(),
	                "span": match.span(),
	                "source": "regex",
	            }
	            for match in re.finditer(pattern, text)
	        )

	    # Gazetteer detections: report every occurrence, not only the first one.
	    for label, names in (("ORG", KENYAN_HOSPITALS), ("LOC", KENYAN_COUNTIES)):
	        for name in names:
	            findings.extend(
	                {"label": label, "text": name, "span": span, "source": "gazetteer"}
	                for span in _find_all(name, text)
	            )

	    # Hugging Face NER detections (only when requested and loadable).
	    if use_ner and HF_AVAILABLE:
	        for ent in ner_pipeline(text):
	            start, end = ent.get("start"), ent.get("end")
	            if start is None or end is None:
	                # Without character offsets the entity cannot be located.
	                continue
	            findings.append({
	                "label": ent["entity_group"],
	                # Slice the input rather than trusting ent["word"], so the
	                # reported text always equals what gets replaced.
	                "text": text[start:end],
	                "span": (start, end),
	                "score": ent["score"],
	                "source": "hf_ner",
	            })

	    return findings


	def _select_non_overlapping(findings):
	    """Return findings in document order, keeping the leftmost-longest on overlap.

	    Overlapping spans (e.g. the digits of an NHIF number also matching NAT_ID)
	    must not both be replaced, otherwise the second replacement would splice
	    into text whose offsets have already shifted.
	    """
	    ordered = sorted(findings, key=lambda f: (f["span"][0], -f["span"][1]))
	    kept = []
	    cursor = 0
	    for finding in ordered:
	        start, end = finding["span"]
	        if start < cursor or end <= start:
	            continue
	        kept.append(finding)
	        cursor = end
	    return kept


	def _replacement_for(action, label, value):
	    """Return the text that replaces *value* under the given compliance action."""
	    if action == "redact":
	        return f"[REDACTED {label}]"
	    if action == "mask":
	        # Hide everything except the last two characters.
	        hidden = "*" * max(len(value) - 2, 0)
	        return f"[MASKED: {hidden}{value[-2:]}]"
	    if action == "generalize":
	        return f"[{label} (GENERALIZED)]"
	    return value  # "keep" (or any unknown action) leaves the text untouched


	def analyze_text(text, use_ner=False):
	    """Detect sensitive entities in *text* and apply the compliance rules.

	    Args:
	        text: Free text to analyze. ``None`` is treated as an empty string.
	        use_ner: Also run the Hugging Face NER pipeline when it is available.

	    Returns:
	        A ``(sanitized, highlights, notes)`` tuple:

	        * ``sanitized`` - the text with each detected entity replaced according
	          to its rule in ``COMPLIANCE_RULES``.
	        * ``highlights`` - ``(segment, label)`` tuples in document order that
	          together cover the whole input; unlabelled stretches use ``None``.
	        * ``notes`` - one line per detected entity, or
	          ``"No sensitive entities found."`` when nothing was detected.

	    Entities whose label has no entry in ``COMPLIANCE_RULES`` are ignored.
	    """
	    text = text or ""
	    if not text:
	        return "", [], "No sensitive entities found."

	    candidates = [
	        finding
	        for finding in _collect_findings(text, use_ner)
	        if finding["label"] in COMPLIANCE_RULES
	    ]

	    pieces = []
	    highlights = []
	    notes = []
	    cursor = 0

	    # Walk forward through non-overlapping findings and rebuild the text once,
	    # so no replacement can invalidate another finding's offsets.
	    for finding in _select_non_overlapping(candidates):
	        start, end = finding["span"]
	        label = finding["label"]
	        action, advice = COMPLIANCE_RULES[label]

	        if start > cursor:
	            plain = text[cursor:start]
	            pieces.append(plain)
	            highlights.append((plain, None))

	        pieces.append(_replacement_for(action, label, finding["text"]))
	        highlights.append((text[start:end], label))
	        notes.append(
	            f'- [{label}] "{finding["text"]}" → {action.upper()} | {advice}'
	        )
	        cursor = end

	    if cursor < len(text):
	        tail = text[cursor:]
	        pieces.append(tail)
	        highlights.append((tail, None))

	    sanitized = "".join(pieces)
	    notes_text = "\n".join(notes) if notes else "No sensitive entities found."
	    return sanitized, highlights, notes_text

	# --- Gradio UI ---
	# Layout, top to bottom: input box, NER toggle, Analyze button, then the three
	# outputs returned by analyze_text (sanitized text, highlighted entities,
	# compliance notes).
	with gr.Blocks() as demo:
	    gr.Markdown("## 🏥 Regulatory AI Demo (Kenya Context)\nPaste healthcare text to sanitize and get compliance notes.")
	    with gr.Row():
	        input_text = gr.Textbox(lines=8, label="Input Text")
	    with gr.Row():
	        use_ner = gr.Checkbox(label="Use Hugging Face NER (if available)", value=False)
	    with gr.Row():
	        btn = gr.Button("Analyze")
	    with gr.Row():
	        sanitized_output = gr.Textbox(lines=8, label="Sanitized Text")
	    with gr.Row():
	        highlighted_output = gr.HighlightedText(label="Detected Entities (Highlighted)")
	    with gr.Row():
	        notes_output = gr.Textbox(lines=8, label="Compliance Notes")

	    # Event listeners have to be registered while the Blocks context is open.
	    btn.click(
	        analyze_text,
	        inputs=[input_text, use_ner],
	        outputs=[sanitized_output, highlighted_output, notes_output],
	    )

	if __name__ == "__main__":
	    demo.launch()