"""Gradio playground that redacts PII from free text.

A Hugging Face token-classification pipeline is loaded once at import time;
``redact_pii`` runs it on user input and swaps every detected span for a
``[REDACTED <LABEL>]`` tag. The ``demo`` interface exposes that function in a
simple web UI.
"""

import gradio as gr
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# 1. Load the model from your Hugging Face repository.
# Replace this with your exact model ID!
MODEL_ID = "Negative-Star-Innovators/MiniLM-L6-finetuned-pii-detection"

print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)

# Initialize the pipeline. With an aggregation strategy set, each result is a
# grouped entity exposing the "entity_group", "start" and "end" keys that
# redact_pii reads below.
pii_pipeline = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)


# 2. Define the redaction function
def redact_pii(text):
    """Return ``text`` with every detected PII span replaced by a tag.

    Args:
        text: Raw user input. ``None`` and whitespace-only input are treated
            as empty.

    Returns:
        ``""`` for empty input, the unchanged input when the model detects
        nothing, otherwise the input with each detected span replaced by
        ``[REDACTED <LABEL>]`` (label upper-cased).
    """
    # Guard against None as well as blank input so the callback never raises
    # AttributeError on ``None.strip()``.
    if text is None or not text.strip():
        return ""

    # Run the model on the input text.
    results = pii_pipeline(text)

    # If no PII is found, return the original text untouched.
    if not results:
        return text

    # Build the output in a single left-to-right pass over the ORIGINAL text.
    # Because we only ever slice the untouched input, replacements of a
    # different length cannot invalidate the offsets of later entities.
    pieces = []
    cursor = 0  # index in ``text`` up to which output has been emitted
    for entity in sorted(results, key=lambda item: item["start"]):
        end = entity["end"]
        if end <= cursor:
            # Span lies entirely inside one that was already redacted.
            continue
        # Clamp overlapping spans so no character is emitted twice and no
        # flagged character is left un-redacted.
        start = max(entity["start"], cursor)
        pieces.append(text[cursor:start])
        # Replace the sensitive text with a clean [REDACTED LABEL] tag.
        pieces.append(f"[REDACTED {entity['entity_group'].upper()}]")
        cursor = end
    pieces.append(text[cursor:])

    return "".join(pieces)


# 3. Build the Gradio User Interface
# We define the input box, the output box, and some default examples.
demo = gr.Interface(
    fn=redact_pii,
    inputs=gr.Textbox(
        lines=5,
        label="Input Text",
        placeholder="Paste text containing sensitive data (names, emails, routing numbers) here...",
    ),
    outputs=gr.Textbox(
        lines=5,
        label="Redacted Output",
    ),
    title="🛡️ Secure PII Redaction Playground",
    description=(
        "Test our highly efficient (90MB) PII detection model that is capable of running locally on your device. "
        "It quickly scrubs Personally Identifiable Information entirely on CPU, making it perfect "
        "for sanitizing data before sending it to third-party cloud LLMs and other parties."
    ),
    article=(
        "📧 **Please reach out if you have a question or feedback. We also do custom projects, consulting, freelance and collaboration:** [thieves@negativestarinnovators.com](mailto:thieves@negativestarinnovators.com)"
    ),
    examples=[
        ["John Doe's routing number is 123456789 and his email is john.doe@email.com."],
        ["Please update the shipping address for Jane Smith to 123 Secure Lane. Her phone number is 555-0198."],
        ["The patient, Michael Johnson, was born on 10/12/1985. His SSN is 000-11-2222."],
    ],
    # Turns off the "Flag" button since we don't need to collect user data.
    flagging_mode="never",
)

# 4. Launch the app
if __name__ == "__main__":
    demo.launch()