File size: 3,107 Bytes
285787d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d8e53d
 
 
285787d
07c6972
eb70d23
 
285787d
 
 
 
 
8fb21ee
285787d
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# 1. Load the fine-tuned PII detector
# MODEL_ID names the Hugging Face repository that provides both the
# tokenizer and the token-classification weights loaded below.
MODEL_ID = "Negative-Star-Innovators/MiniLM-L6-finetuned-pii-detection"

print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)

# Wrap the tokenizer and model in a token-classification pipeline.
# redact_pii relies on each result carrying 'entity_group', 'start' and
# 'end', which is the output shape of the "simple" aggregation strategy.
pii_pipeline = pipeline(
    task="token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# 2. Define the redaction function
def redact_pii(text):
    """Return *text* with every detected PII span replaced by a tag.

    The text is passed to the module-level ``pii_pipeline``; each entity it
    reports is replaced by ``[REDACTED <LABEL>]``, where ``<LABEL>`` is the
    upper-cased ``entity_group`` of that entity.

    Args:
        text: The raw input string. ``None`` is treated like an empty
            string so a missing value does not raise.

    Returns:
        ``""`` when *text* is ``None``, empty or whitespace-only; the
        unchanged *text* when the pipeline reports no entities; otherwise
        the redacted string.
    """
    # Guard against a missing value as well as blank input; calling
    # .strip() on None would raise AttributeError.
    if text is None or not text.strip():
        return ""

    # Run the model on the input text
    results = pii_pipeline(text)

    # If no PII is found, return original text
    if not results:
        return text

    # Walk the entities left to right and collect the untouched text between
    # them plus one tag per entity. All offsets refer to the ORIGINAL string,
    # so no index ever shifts and the result is assembled with a single join.
    pieces = []
    cursor = 0  # first index of `text` not yet emitted or redacted
    for entity in sorted(results, key=lambda item: item['start']):
        start = entity['start']
        end = entity['end']

        # Span lies entirely inside text that is already redacted.
        if end <= cursor:
            continue

        # Clamp to the cursor so a partially overlapping span never
        # re-emits characters that an earlier span already covered.
        pieces.append(text[cursor:max(start, cursor)])
        pieces.append(f"[REDACTED {entity['entity_group'].upper()}]")
        cursor = end

    # Keep whatever follows the last entity.
    pieces.append(text[cursor:])
    return "".join(pieces)

# 3. Build the Gradio User Interface
# A single-function interface: one multi-line input box, one multi-line
# output box, and a few clickable examples that are fed to redact_pii.
demo = gr.Interface(
    fn=redact_pii,
    inputs=gr.Textbox(
        lines=5,
        label="Input Text",
        placeholder="Paste text containing sensitive data (names, emails, routing numbers) here..."
    ),
    outputs=gr.Textbox(
        lines=5,
        label="Redacted Output"
    ),
    title="🛡️ Secure PII Redaction Playground",
    description=(
        "Test our highly efficient (90MB) PII detection model that is capable of running locally on your device. "
        "It quickly scrubs Personally Identifiable Information entirely on CPU, making it perfect "
        "for sanitizing data before sending it to third-party cloud LLMs and other parties."
    ),
    # Contact blurb passed as the interface's `article` text (Markdown).
    article=(
        "📧 **Please reach out if you have a question or feedback. We also do custom projects, consulting, freelance and collaboration:** [thieves@negativestarinnovators.com](mailto:thieves@negativestarinnovators.com)"
    ),
    examples=[
        ["John Doe's routing number is 123456789 and his email is john.doe@email.com."],
        ["Please update the shipping address for Jane Smith to 123 Secure Lane. Her phone number is 555-0198."],
        ["The patient, Michael Johnson, was born on 10/12/1985. His SSN is 000-11-2222."]
    ],
    flagging_mode="never"  # Turns off the "Flag" button since we don't need to collect user data
)

# 4. Launch the app
# Only start the app when this file is executed directly, so importing the
# module (e.g. to reuse redact_pii) does not launch it.
if __name__ == "__main__":
    demo.launch()