"""Gradio app that redacts PII with a BERT NER model plus an optional regex."""

import re
from typing import Optional

import gradio as gr
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

NER_MODEL_NAME = "dslim/bert-base-NER"
REDACTION_TOKEN = "[REDACTED]"
NO_PRESET = "None"

# Dropdown label -> regex applied when that preset is selected.
PRESET_PATTERNS = {
    "Date (YYYY-MM-DD or DD/MM/YYYY)": r"\b\d{4}-\d{2}-\d{2}\b|\b\d{2}/\d{2}/\d{4}\b",
    "16-digit Number (e.g., Credit Card)": r"\b\d{16}\b",
}

# Load pretrained NER model
tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_NAME)
ner_pipeline = pipeline(
    "ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple"
)


# --- Core Redaction Function ---
def _merge_spans(spans):
    """Return ``(start, end)`` spans sorted by start, with overlaps coalesced."""
    merged = []
    for start, end in sorted(spans):
        if merged and start < merged[-1][1]:
            # Overlaps the previous span: extend it instead of emitting a
            # second placeholder for the same stretch of text.
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged


def redact_text(
    input_text: str,
    custom_pattern: Optional[str] = None,
    pattern_type: Optional[str] = None,
) -> str:
    """Replace detected PII in *input_text* with ``[REDACTED]``.

    Spans come from two sources, both evaluated against the original text:
    entities returned by ``ner_pipeline`` and matches of an optional regex.
    Overlapping spans are merged so each stretch is replaced exactly once and
    the regex can never match inside an inserted placeholder.

    Args:
        input_text: Text to redact.
        custom_pattern: User-supplied regex. Ignored when *pattern_type* names
            one of the presets in ``PRESET_PATTERNS`` (the preset wins).
        pattern_type: Dropdown selection; any value that is not a preset key
            (including ``"None"``) leaves *custom_pattern* in effect.

    Returns:
        The redacted text, or an empty string for empty input.

    Raises:
        gr.Error: If the effective regex does not compile.
    """
    if not input_text:
        # Nothing to redact; skip model inference entirely.
        return ""

    pattern = PRESET_PATTERNS.get(pattern_type, custom_pattern)

    # Compile before running the model so a bad pattern fails fast and the
    # user sees a readable message instead of a raw traceback.
    regex = None
    if pattern:
        try:
            regex = re.compile(pattern)
        except re.error as exc:
            raise gr.Error(f"Invalid regex pattern: {exc}") from exc

    spans = [(ent["start"], ent["end"]) for ent in ner_pipeline(input_text)]
    if regex is not None:
        # Zero-width matches (e.g. from r"\d*") cover no text, so they are
        # skipped rather than scattering placeholders between characters.
        spans.extend(
            match.span()
            for match in regex.finditer(input_text)
            if match.end() > match.start()
        )

    pieces = []
    cursor = 0
    for start, end in _merge_spans(spans):
        pieces.append(input_text[cursor:start])
        pieces.append(REDACTION_TOKEN)
        cursor = end
    pieces.append(input_text[cursor:])
    return "".join(pieces)


# --- Gradio Interface ---
with gr.Blocks(title="PII Redactor") as demo:
    gr.Markdown("## 🔒 PII Redactor using BERT NER + Custom Regex")

    with gr.Row():
        text_input = gr.Textbox(
            label="Input Text",
            placeholder="Enter text containing PII (e.g. names, dates, locations, credit cards)...",
            lines=5,
        )

    with gr.Row():
        pattern_type = gr.Dropdown(
            [NO_PRESET, *PRESET_PATTERNS],
            label="Select Regex Sample",
            value=NO_PRESET,
        )
        custom_pattern = gr.Textbox(
            label="Or Enter Custom Regex Pattern",
            placeholder=r"e.g. \b\d{3}-\d{2}-\d{4}\b for SSN",
        )

    redact_button = gr.Button("🔍 Redact PII")
    output_text = gr.Textbox(label="Redacted Output", lines=5)

    redact_button.click(
        fn=redact_text,
        inputs=[text_input, custom_pattern, pattern_type],
        outputs=output_text,
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()