open-redactor / app.py
Suchinthana
Init commit
d4fde93
import re
import gradio as gr
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
# Load the pretrained NER checkpoint once, at import time, so every request
# handled by the app reuses the same tokenizer/model pair.
NER_CHECKPOINT = "dslim/bert-base-NER"

tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)

# aggregation_strategy="simple" makes the pipeline return grouped entities;
# redact_text relies on the "start"/"end" character offsets of each result.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)
# --- Core Redaction Function ---
_REDACTION_MARK = "[REDACTED]"

# Built-in regex presets, keyed by the exact dropdown label the UI passes in.
# Any other label (including "None") selects no preset.
_PRESET_PATTERNS = {
    "Date (YYYY-MM-DD or DD/MM/YYYY)": r"\b\d{4}-\d{2}-\d{2}\b|\b\d{2}/\d{2}/\d{4}\b",
    "16-digit Number (e.g., Credit Card)": r"\b\d{16}\b",
}


def _merge_spans(spans):
    """Return the non-empty (start, end) spans sorted, with overlaps fused."""
    merged = []
    for start, end in sorted(spans):
        if start >= end:
            # Zero-width regex matches carry nothing to redact.
            continue
        if merged and start < merged[-1][1]:
            # Overlaps the previous span: widen it instead of emitting two
            # markers for the same stretch of text.
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged


def redact_text(input_text, custom_pattern=None, pattern_type=None):
    """Replace detected PII in ``input_text`` with ``[REDACTED]``.

    Spans to redact come from two sources, both evaluated against the
    *original* text:

    * entities reported by the module-level ``ner_pipeline``;
    * matches of the preset regex selected by ``pattern_type`` and of the
      user-supplied ``custom_pattern`` (both are applied when both are given).

    Working from the original text means a regex can never match inside a
    ``[REDACTED]`` marker inserted for an earlier hit, and overlapping hits
    collapse into a single marker.

    Args:
        input_text: Text to scan. Empty or ``None`` yields ``""``.
        custom_pattern: Optional regular expression; falsy values are ignored.
        pattern_type: Optional preset label; labels that are not a key of
            ``_PRESET_PATTERNS`` (e.g. ``"None"``) select no preset.

    Returns:
        The text with every detected span replaced by ``[REDACTED]``.

    Raises:
        gr.Error: If a regular expression fails to compile.
    """
    if not input_text:
        # Nothing to scan, so skip the model call entirely.
        return ""

    # Compile before running the model so a bad pattern fails fast and the
    # user gets a readable message instead of a raw traceback.
    compiled = []
    for source in (_PRESET_PATTERNS.get(pattern_type), custom_pattern):
        if not source:
            continue
        try:
            compiled.append(re.compile(source))
        except re.error as exc:
            raise gr.Error(f"Invalid regex pattern {source!r}: {exc}") from exc

    # Model-based spans (character offsets into input_text).
    spans = [(ent["start"], ent["end"]) for ent in ner_pipeline(input_text)]

    # Regex-based spans, also measured on the untouched input.
    for regex in compiled:
        spans.extend(match.span() for match in regex.finditer(input_text))

    # Single left-to-right splice: keep the gaps, replace the spans.
    pieces = []
    cursor = 0
    for start, end in _merge_spans(spans):
        pieces.append(input_text[cursor:start])
        pieces.append(_REDACTION_MARK)
        cursor = end
    pieces.append(input_text[cursor:])
    return "".join(pieces)
# --- Gradio Interface ---
# Layout: free-text input, a row with the regex preset dropdown and a custom
# regex box, a button, and the output box. The button feeds the three inputs
# to redact_text in the order of its parameters.
with gr.Blocks(title="PII Redactor") as demo:
    gr.Markdown("## 🔒 PII Redactor using BERT NER + Custom Regex")
    with gr.Row():
        text_input = gr.Textbox(
            label="Input Text",
            placeholder="Enter text containing PII (e.g. names, dates, locations, credit cards)...",
            lines=5,
        )
    with gr.Row():
        pattern_type = gr.Dropdown(
            ["None", "Date (YYYY-MM-DD or DD/MM/YYYY)", "16-digit Number (e.g., Credit Card)"],
            label="Select Regex Sample",
            value="None",
        )
        custom_pattern = gr.Textbox(
            label="Or Enter Custom Regex Pattern",
            placeholder=r"e.g. \b\d{3}-\d{2}-\d{4}\b for SSN",
        )
    redact_button = gr.Button("🔍 Redact PII")
    output_text = gr.Textbox(label="Redacted Output", lines=5)
    redact_button.click(
        fn=redact_text,
        inputs=[text_input, custom_pattern, pattern_type],
        outputs=output_text,
    )

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()