""" HuggingFace Space App for PII Detection This app uses a BERT model to identify Personal Identifiable Information in text. """ import gradio as gr from transformers import AutoTokenizer, AutoModelForTokenClassification import torch # Load the model and tokenizer MODEL_PATH = "./Bert_base_NER_PII43k" tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH) model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH) # Entity label colors for visualization ENTITY_COLORS = { "NAME": "#FF6B6B", "EMAIL": "#4ECDC4", "CREDITCARDNUM": "#FFE66D", "IP": "#95E1D3", "PASSWORD": "#F38181", "STREET": "#AA96DA", "ACCOUNTNAME": "#FCBAD3", "ACCOUNTNUM": "#FFFFD2", "USERNAME": "#A8E6CF", "ZIPCODE": "#FFD3B6", "IBAN": "#FFAAA5", "URL": "#FF8B94", "JOB": "#C7CEEA", "GENDER": "#FFDAC1", "ADDRESS": "#B5EAD7", "MAC": "#C9CBA3", "GEO": "#FFE2E2", "NEARBYGPSCOORDINATE": "#F7D9C4", "COINADDRESS": "#FAACA8", "CREDITCARDISSUER": "#DCD6F7", "CURRENCY": "#A6D9F7", "DISPLAYNAME": "#FAD9A1", "NUM": "#D4F1F4", "BIC": "#FFB6B9", "USERAGENT": "#C2E9FB", "ORDINALDIRECTION": "#F6EAC2", } def detect_pii(text): """ Detect PII entities in the input text. Args: text (str): Input text to analyze Returns: list: Highlighted entities for Gradio display str: Summary of detected entities """ if not text.strip(): return None, "Please enter some text to analyze." # Tokenize input inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512) # Get predictions with torch.no_grad(): outputs = model(**inputs) predictions = torch.argmax(outputs.logits, dim=2) # Convert tokens back to words and align with predictions tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]) predicted_labels = [model.config.id2label[pred.item()] for pred in predictions[0]] # Reconstruct words and their labels highlighted_entities = [] current_word = "" current_label = None for token, label in zip(tokens, predicted_labels): # Skip special tokens if token in ["[CLS]", "[SEP]", "[PAD]"]: continue # Handle subword tokens (starting with ##) if token.startswith("##"): current_word += token[2:] else: # Save previous word if it exists if current_word: if current_label and current_label != "O": highlighted_entities.append((current_word, current_label)) else: highlighted_entities.append((current_word, None)) current_word = " " # Add space between words current_word += token current_label = label # Add the last word if current_word.strip(): if current_label and current_label != "O": highlighted_entities.append((current_word, current_label)) else: highlighted_entities.append((current_word, None)) # Create summary detected_entities = {} for word, label in highlighted_entities: if label and label != "O": if label not in detected_entities: detected_entities[label] = [] detected_entities[label].append(word.strip()) if detected_entities: summary = "**Detected PII:**\n\n" for entity_type, words in detected_entities.items(): summary += f"- **{entity_type}**: {', '.join(words)}\n" else: summary = "No PII detected in the text." return highlighted_entities, summary # Example texts for users to try examples = [ ["My name is John Smith and my email is john.smith@example.com. I live at 123 Main Street."], ["Please send the payment to IBAN GB29 NWBK 6016 1331 9268 19 or call me at my office."], ["Contact Sarah Johnson at sarah.j@company.org for more details about the project."], ["My credit card number is 4532-1234-5678-9010 and my username is mike_user123."], ] # Create Gradio interface with gr.Blocks(title="PII Detection with BERT", theme=gr.themes.Soft()) as demo: gr.Markdown( """ # 🔍 Personal Identifiable Information (PII) Detector This tool uses a fine-tuned BERT model to automatically detect and highlight personal information in text. It can identify **27 different types** of PII including names, emails, addresses, credit cards, and more. ### How to use: 1. Enter or paste text in the box below 2. Click "Detect PII" to analyze 3. View highlighted entities and summary """ ) with gr.Row(): with gr.Column(): input_text = gr.Textbox( label="Input Text", placeholder="Enter text to analyze for PII...", lines=6, ) detect_btn = gr.Button("🔍 Detect PII", variant="primary") with gr.Column(): output_highlighted = gr.HighlightedText( label="Highlighted PII Entities", combine_adjacent=True, color_map=ENTITY_COLORS, ) output_summary = gr.Markdown(label="Summary") gr.Markdown("### 📝 Try these examples:") gr.Examples( examples=examples, inputs=input_text, ) gr.Markdown( """ ### 🏷️ Detectable Entity Types: **Identity**: NAME, USERNAME, DISPLAYNAME, GENDER, JOB **Contact**: EMAIL, STREET, ADDRESS, ZIPCODE, GEO, NEARBYGPSCOORDINATE **Financial**: CREDITCARDNUM, CREDITCARDISSUER, IBAN, BIC, ACCOUNTNAME, ACCOUNTNUM, CURRENCY, COINADDRESS **Technical**: IP, MAC, URL, USERAGENT, PASSWORD **Other**: NUM, ORDINALDIRECTION --- **Model**: BERT-base fine-tuned on [ai4privacy/pii-masking-43k](https://huggingface.co/datasets/ai4privacy/pii-masking-43k) dataset **Base Model**: [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) """ ) # Connect the button to the function detect_btn.click( fn=detect_pii, inputs=input_text, outputs=[output_highlighted, output_summary] ) # Also trigger on Enter key input_text.submit( fn=detect_pii, inputs=input_text, outputs=[output_highlighted, output_summary] ) # Launch the app if __name__ == "__main__": demo.launch()