Spaces:

Negative-Star-Innovators
/

PII-Redaction-Playground

Sleeping

File size: 3,107 Bytes

import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# 1. Load the fine-tuned PII detector from the Hugging Face Hub
# MODEL_ID is the Hub repository that both the tokenizer and the
# token-classification weights are fetched from.
MODEL_ID = "Negative-Star-Innovators/MiniLM-L6-finetuned-pii-detection"

print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)

# Wrap model + tokenizer in a token-classification pipeline. The code that
# consumes its output reads the 'start', 'end' and 'entity_group' keys of
# each result, which is the grouped-entity shape requested via
# aggregation_strategy="simple".
pii_pipeline = pipeline(
    task="token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# 2. Define the redaction function
def redact_pii(text):
    """Replace every PII span detected in *text* with a ``[REDACTED <LABEL>]`` tag.

    Args:
        text: Raw user input. ``None`` is treated like an empty string.

    Returns:
        ``""`` when the input is ``None``, empty, or whitespace-only; the
        unchanged text when the pipeline reports no entities; otherwise the
        text with each reported span replaced by ``[REDACTED <LABEL>]``,
        where ``<LABEL>`` is the upper-cased ``entity_group`` of that span.
    """
    # The UI layer can hand over None instead of a string (e.g. a request
    # with no value); calling .strip() on it would raise AttributeError.
    if text is None or not text.strip():
        return ""

    # Run the model on the input text
    entities = pii_pipeline(text)

    # If no PII is found, return original text
    if not entities:
        return text

    # Work from the rightmost span to the leftmost one. Each replacement
    # changes the string length, which would invalidate the start/end offsets
    # of every span to its right; spans to its left are unaffected.
    right_to_left = sorted(entities, key=lambda e: e['start'], reverse=True)

    redacted_text = text
    for entity in right_to_left:
        start = entity['start']
        end = entity['end']
        # Replace the sensitive text with a clean [REDACTED LABEL] tag
        tag = f"[REDACTED {entity['entity_group'].upper()}]"
        redacted_text = redacted_text[:start] + tag + redacted_text[end:]

    return redacted_text

# 3. Build the Gradio User Interface
# One text box in, one text box out, wired to redact_pii, plus a few canned
# examples so visitors can try the model without typing anything.
demo = gr.Interface(
    fn=redact_pii,
    inputs=gr.Textbox(
        lines=5,
        label="Input Text",
        placeholder="Paste text containing sensitive data (names, emails, routing numbers) here...",
    ),
    outputs=gr.Textbox(
        lines=5,
        label="Redacted Output",
    ),
    title="🛡️ Secure PII Redaction Playground",
    description=(
        "Test our highly efficient (90MB) PII detection model that is capable of running locally on your device. "
        "It quickly scrubs Personally Identifiable Information entirely on CPU, making it perfect "
        "for sanitizing data before sending it to third-party cloud LLMs and other parties."
    ),
    article=(
        "📧 **Please reach out if you have a question or feedback. We also do custom projects, consulting, freelance and collaboration:** [thieves@negativestarinnovators.com](mailto:thieves@negativestarinnovators.com)"
    ),
    examples=[
        ["John Doe's routing number is 123456789 and his email is john.doe@email.com."],
        ["Please update the shipping address for Jane Smith to 123 Secure Lane. Her phone number is 555-0198."],
        ["The patient, Michael Johnson, was born on 10/12/1985. His SSN is 000-11-2222."],
    ],
    # Turns off the "Flag" button since we don't need to collect user data
    flagging_mode="never",
)

# 4. Launch the app
# Only start the server when this file is executed as the entry point, so
# importing the module does not launch anything.
if __name__ == "__main__":
    demo.launch()