"""
HuggingFace Space App for PII Detection

This app uses a BERT model to identify Personal Identifiable Information in text.
"""

import itertools
from collections import defaultdict

import gradio as gr
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Load the model and tokenizer directly from HuggingFace Hub.
# This avoids needing to upload the large 667MB model file to the Space.
MODEL_PATH = "vuminhtue/Bert_NER_PII_Multi_Lingual"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)

# Maximum number of tokens fed to the model; longer inputs are truncated.
MAX_TOKENS = 512

# Entity label colors for visualization
ENTITY_COLORS = {
    "NAME": "#FF6B6B",
    "EMAIL": "#4ECDC4",
    "CREDITCARDNUMBER": "#FFE66D",
    "IP": "#95E1D3",
    "PASSWORD": "#F38181",
    "STREET": "#AA96DA",
    "ACCOUNTNAME": "#FCBAD3",
    "USERNAME": "#A8E6CF",
    "ZIPCODE": "#FFD3B6",
    "IBAN": "#FFAAA5",
    "URL": "#FF8B94",
    "JOB": "#C7CEEA",
    "GENDER": "#FFDAC1",
    "ADDRESS": "#B5EAD7",
    "MAC": "#C9CBA3",
    "GEO": "#FFE2E2",
    "NEARBYGPSCOORDINATE": "#F7D9C4",
    "COINADDRESS": "#FAACA8",
    "CREDITCARDISSUER": "#DCD6F7",
    "CURRENCY": "#A6D9F7",
    "NUM": "#D4F1F4",
    "BIC": "#FFB6B9",
    "ORDINALDIRECTION": "#F6EAC2",
    "PHONENUMBER": "#FFB3BA",
    "SSN": "#FF677D",
    "DATE": "#BAE1FF",
    "TIME": "#FFFFB5",
    "AGE": "#FFDFBA",
    "ORG": "#BAFFC9",
    "VEHICLEVIN": "#D4A5A5",
    "VEHICLEVRM": "#9B9B9B",
    "PHONEIMEI": "#E0BBE4",
    "PREFIX": "#FFDFD3",
    "HEIGHT": "#C7CEEA",
    "WEIGHTS": "#F0E68C",
    "BLOODTYPE": "#FFB6C1",
    "COLOR": "#E6E6FA",
    "MISC": "#D3D3D3",
}


def _label_words(text, offsets, word_ids, labels):
    """Cut ``text`` into per-word ``(substring, label_or_None)`` segments.

    Args:
        text: The original input string.
        offsets: One ``(start, end)`` character span into ``text`` per token.
        word_ids: One word index per token; ``None`` marks a special token.
        labels: One predicted label string per token.

    Returns:
        A list of ``(substring, label)`` tuples whose substrings concatenate
        back to ``text``. ``label`` is ``None`` for words predicted as "O".
        Each word takes the label of its first sub-token.
    """
    segments = []
    cursor = 0  # end of the text already emitted
    current_word = None
    for (_start, end), word_id, label in zip(offsets, word_ids, labels):
        if word_id is None or end <= cursor:
            # Special token, or a token that maps to no new characters.
            continue
        # Slicing from ``cursor`` attaches any whitespace before a word to
        # that word, so adjacent same-label words still merge in the display.
        chunk = text[cursor:end]
        if word_id == current_word and segments:
            # Continuation sub-token: extend the word, keep its first label.
            word_text, word_label = segments[-1]
            segments[-1] = (word_text + chunk, word_label)
        else:
            segments.append((chunk, None if label == "O" else label))
            current_word = word_id
        cursor = end
    if cursor < len(text):
        # Trailing whitespace or text lost to truncation: keep it, unlabelled.
        segments.append((text[cursor:], None))
    return segments


def _summarize(segments):
    """Build the Markdown summary, merging adjacent words that share a label."""
    found = defaultdict(list)
    for label, run in itertools.groupby(segments, key=lambda seg: seg[1]):
        if label is None:
            continue
        entity = "".join(piece for piece, _ in run).strip()
        if entity:
            found[label].append(entity)
    if not found:
        return "No PII detected in the text."
    bullets = [
        f"- **{entity_type}**: {', '.join(values)}\n"
        for entity_type, values in found.items()
    ]
    return "**Detected PII:**\n\n" + "".join(bullets)


def detect_pii(text):
    """
    Detect PII entities in the input text.

    The highlighted output is sliced from the original text via the
    tokenizer's character offsets, so the user's spacing and punctuation
    are shown as typed rather than rebuilt from token strings.

    Args:
        text (str): Input text to analyze

    Returns:
        list: ``(substring, label_or_None)`` tuples for Gradio display, or
            ``None`` when the input is empty
        str: Summary of detected entities
    """
    if not text or not text.strip():
        return None, "Please enter some text to analyze."

    # Offsets and word ids require a fast tokenizer, which AutoTokenizer
    # returns by default when one is available for the model.
    encoding = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_TOKENS,
        return_offsets_mapping=True,
    )
    # The model's forward() does not accept offset_mapping; remove it first.
    offsets = encoding.pop("offset_mapping")[0].tolist()
    word_ids = encoding.word_ids(batch_index=0)

    with torch.no_grad():
        logits = model(**encoding).logits
    labels = [model.config.id2label[idx] for idx in logits.argmax(dim=-1)[0].tolist()]

    highlighted_entities = _label_words(text, offsets, word_ids, labels)
    summary = _summarize(highlighted_entities)

    # Tell the user when part of the input was never seen by the model.
    analyzed_end = max(
        (end for (_start, end), word_id in zip(offsets, word_ids) if word_id is not None),
        default=0,
    )
    if len(word_ids) >= MAX_TOKENS and text[analyzed_end:].strip():
        summary += (
            f"\n\n_Note: the input exceeds the model's {MAX_TOKENS}-token limit; "
            "text beyond that point was not analyzed._"
        )

    return highlighted_entities, summary


# Example texts for users to try (multilingual)
examples = [
    ["My name is John Smith and my email is john.smith@example.com. I was born on January 15, 1985."],
    ["Vui lòng gửi thanh toán đến IBAN GB29 NWBK 6016 1331 9268 19 hoặc gọi cho tôi theo số +1-555-123-4567."],
    ["Mi nombre es María García y vivo en Calle Mayor 123, Madrid. Mi teléfono es +34-91-123-4567."],
    ["Je m'appelle Pierre Dubois, mon email est pierre.dubois@email.fr et j'habite à Paris."],
    ["私の社会保障番号は123-45-6789、クレジットカード番号は4532-1234-5678-9010です。血液型はO型です。"],
    ["车辆识别号: 1HGBH41JXMN109186, 联系电话: +86-138-0013-8000"],
]

# Markdown shown above the input box. Kept flush-left so rendering does not
# depend on the Markdown component dedenting the text.
INTRO_MARKDOWN = """
# 🌍 Multilingual PII Detector

This tool uses a fine-tuned **multilingual BERT model** to automatically detect and highlight personal information in text.
It can identify **39 different types** of PII including names, emails, phone numbers, SSN, dates, and more.

**Supports multiple languages!** 🌏

### How to use:
1. Enter or paste text in the box below (in any supported language)
2. Click "Detect PII" to analyze
3. View highlighted entities and summary
"""

# Markdown shown below the examples: entity catalogue and model details.
ENTITY_TYPES_MARKDOWN = """
### 🏷️ Detectable Entity Types (39 types):

**Identity**: NAME, USERNAME, PREFIX, GENDER, AGE, JOB, BLOODTYPE

**Contact**: EMAIL, PHONENUMBER, PHONEIMEI, STREET, ADDRESS, ZIPCODE, GEO, NEARBYGPSCOORDINATE

**Financial**: CREDITCARDNUMBER, CREDITCARDISSUER, IBAN, BIC, ACCOUNTNAME, CURRENCY, COINADDRESS

**Government IDs**: SSN (Social Security Number)

**Vehicle**: VEHICLEVIN, VEHICLEVRM

**Technical**: IP, MAC, URL, PASSWORD

**Organization**: ORG

**Temporal**: DATE, TIME

**Physical**: HEIGHT, WEIGHTS, COLOR

**Other**: NUM, ORDINALDIRECTION, MISC

---

**Model**: Multilingual BERT-base fine-tuned for PII detection

**Base Model**: [google-bert/bert-base-multilingual-uncased](https://huggingface.co/google-bert/bert-base-multilingual-uncased)

**Languages**: Supports 100+ languages including English, Spanish, French, German, Chinese, Arabic, and more!
"""

# Create Gradio interface
with gr.Blocks(title="PII Detection with BERT", theme=gr.themes.Soft()) as demo:
    gr.Markdown(INTRO_MARKDOWN)

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to analyze for PII...",
                lines=6,
            )
            detect_btn = gr.Button("🔍 Detect PII", variant="primary")

        with gr.Column():
            output_highlighted = gr.HighlightedText(
                label="Highlighted PII Entities",
                combine_adjacent=True,
                color_map=ENTITY_COLORS,
            )
            output_summary = gr.Markdown(label="Summary")

    gr.Markdown("### 📝 Try these examples:")
    gr.Examples(
        examples=examples,
        inputs=input_text,
    )

    gr.Markdown(ENTITY_TYPES_MARKDOWN)

    # Connect the button to the function
    detect_btn.click(
        fn=detect_pii,
        inputs=input_text,
        outputs=[output_highlighted, output_summary],
    )

    # Also trigger on Enter key
    input_text.submit(
        fn=detect_pii,
        inputs=input_text,
        outputs=[output_highlighted, output_summary],
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()