# piegoneer
# Removed the model link because Hugging Face does not allow people to open links.
# ef42238
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
# 1. Load the PII detection model
# MODEL_ID is the repository name handed to both from_pretrained() calls below.
MODEL_ID = "Negative-Star-Innovators/MiniLM-L6-finetuned-pii-detection"

print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)

# Wrap tokenizer and model in a single token-classification pipeline.
# redact_pii() relies on the aggregated output of this pipeline
# (one dict per entity with 'start', 'end' and 'entity_group').
pii_pipeline = pipeline(
    task="token-classification",
    tokenizer=tokenizer,
    model=model,
    aggregation_strategy="simple",
)
# 2. Define the redaction function
def redact_pii(text, *, detector=None):
    """Replace every detected PII span in *text* with a ``[REDACTED LABEL]`` tag.

    Args:
        text: Raw input string. ``None``, empty, or whitespace-only input
            yields ``""`` without running the detector.
        detector: Optional callable that takes the text and returns a list of
            entity dicts carrying ``start``, ``end`` and ``entity_group``
            keys. Defaults to the module-level ``pii_pipeline``.

    Returns:
        The text with each detected span replaced by ``[REDACTED <LABEL>]``
        (label upper-cased), or the unchanged text when nothing is detected.

    Raises:
        ValueError: If a detected entity has no character offsets. Without
            offsets the span cannot be located, so the function refuses to
            return text that might still contain the sensitive value.
    """
    # Guard against None as well as blank input (None has no .strip()).
    if text is None or not text.strip():
        return ""

    # Resolve the default lazily so blank input never touches the model.
    find_entities = pii_pipeline if detector is None else detector
    entities = find_entities(text)

    # If no PII is found, return the original text untouched.
    if not entities:
        return text

    # Slicing with None offsets would silently duplicate the whole input
    # (text[:None] is the full string), so fail loudly instead.
    if any(e["start"] is None or e["end"] is None for e in entities):
        raise ValueError(
            "PII detector returned an entity without character offsets; "
            "cannot redact reliably."
        )

    # Walk the spans left to right, copying the untouched text between them.
    # Working from the original string with a cursor means indexes never
    # shift, and overlapping spans cannot cut into an already-emitted tag.
    pieces = []
    cursor = 0
    for entity in sorted(entities, key=lambda e: (e["start"], e["end"])):
        start, end = entity["start"], entity["end"]
        if start < cursor and end <= cursor:
            # Entirely inside a region that is already redacted.
            continue
        start = max(start, cursor)  # clip a partial overlap
        pieces.append(text[cursor:start])
        pieces.append(f"[REDACTED {entity['entity_group'].upper()}]")
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)
# 3. Build the Gradio User Interface
# One multi-line textbox for input, one for the redacted result, and a few
# example sentences that can be loaded into the input box.
demo = gr.Interface(
    fn=redact_pii,
    inputs=gr.Textbox(
        lines=5,
        label="Input Text",
        placeholder="Paste text containing sensitive data (names, emails, routing numbers) here...",
    ),
    outputs=gr.Textbox(
        lines=5,
        label="Redacted Output",
    ),
    title="🛡️ Secure PII Redaction Playground",
    description=(
        "Test our highly efficient (90MB) PII detection model that is capable of running locally on your device. "
        "It quickly scrubs Personally Identifiable Information entirely on CPU, making it perfect "
        "for sanitizing data before sending it to third-party cloud LLMs and other parties."
    ),
    # Contact line shown with the demo ("consultating" typo corrected).
    article=(
        "📧 **Please reach out if you have a question or feedback. We also do custom projects, consulting, freelance and collaboration:** [thieves@negativestarinnovators.com](mailto:thieves@negativestarinnovators.com)"
    ),
    examples=[
        ["John Doe's routing number is 123456789 and his email is john.doe@email.com."],
        ["Please update the shipping address for Jane Smith to 123 Secure Lane. Her phone number is 555-0198."],
        ["The patient, Michael Johnson, was born on 10/12/1985. His SSN is 000-11-2222."],
    ],
    flagging_mode="never",  # Turns off the "Flag" button since we don't need to collect user data
)

# 4. Launch the app
# Only start the server when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()