Spaces:
Sleeping
Sleeping
| """ | |
| HuggingFace Space App for PII Detection | |
| This app uses a BERT model to identify Personally Identifiable Information (PII) in text. | |
| """ | |
| import gradio as gr | |
| from transformers import AutoTokenizer, AutoModelForTokenClassification | |
| import torch | |
# Both the tokenizer and the token-classification model are fetched from the
# HuggingFace Hub by repository id at import time, so the large (667MB) model
# file never has to be uploaded to the Space itself.
MODEL_PATH = "vuminhtue/Bert_NER_PII_Multi_Lingual"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
# Entity label -> highlight colour (hex) used when rendering detected entities.
# Every label has its own colour so that two different entity types can never
# look identical in the highlighted output.
ENTITY_COLORS = {
    "NAME": "#FF6B6B",
    "EMAIL": "#4ECDC4",
    "CREDITCARDNUMBER": "#FFE66D",
    "IP": "#95E1D3",
    "PASSWORD": "#F38181",
    "STREET": "#AA96DA",
    "ACCOUNTNAME": "#FCBAD3",
    "USERNAME": "#A8E6CF",
    "ZIPCODE": "#FFD3B6",
    "IBAN": "#FFAAA5",
    "URL": "#FF8B94",
    "JOB": "#C7CEEA",
    "GENDER": "#FFDAC1",
    "ADDRESS": "#B5EAD7",
    "MAC": "#C9CBA3",
    "GEO": "#FFE2E2",
    "NEARBYGPSCOORDINATE": "#F7D9C4",
    "COINADDRESS": "#FAACA8",
    "CREDITCARDISSUER": "#DCD6F7",
    "CURRENCY": "#A6D9F7",
    "NUM": "#D4F1F4",
    "BIC": "#FFB6B9",
    "ORDINALDIRECTION": "#F6EAC2",
    "PHONENUMBER": "#FFB3BA",
    "SSN": "#FF677D",
    "DATE": "#BAE1FF",
    "TIME": "#FFFFB5",
    "AGE": "#FFDFBA",
    "ORG": "#BAFFC9",
    "VEHICLEVIN": "#D4A5A5",
    "VEHICLEVRM": "#9B9B9B",
    "PHONEIMEI": "#E0BBE4",
    "PREFIX": "#FFDFD3",
    # Previously shared "#C7CEEA" with JOB, which made the two labels
    # indistinguishable when highlighted.
    "HEIGHT": "#B8E0D2",
    "WEIGHTS": "#F0E68C",
    "BLOODTYPE": "#FFB6C1",
    "COLOR": "#E6E6FA",
    "MISC": "#D3D3D3",
}
def _to_span(word, label):
    """Return a (word, label) pair, mapping a missing or "O" label to None."""
    if label and label != "O":
        return (word, label)
    return (word, None)


def detect_pii(text):
    """
    Detect PII entities in the input text.

    The text is tokenized (truncated to at most 512 tokens, so anything past
    that limit is not analyzed), every token is classified by the model, and
    "##" sub-word pieces are merged back into whole words. A word takes the
    label predicted for its first sub-token.

    Args:
        text (str): Input text to analyze. None, empty, or whitespace-only
            input is treated as "nothing to analyze".

    Returns:
        list: (word, label) tuples for Gradio display; label is None for
            words that are not PII. None when there was nothing to analyze.
        str: Markdown summary of detected entities
    """
    # Guard against None as well as blank input so the handler never raises
    # AttributeError on a missing value.
    if not text or not text.strip():
        return None, "Please enter some text to analyze."

    # Tokenize input
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # Get predictions (inference only, so no gradients are tracked)
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)

    # Convert ids back to token strings and class ids to label names
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    predicted_labels = [model.config.id2label[pred.item()] for pred in predictions[0]]
    # NOTE(review): labels are used exactly as id2label provides them; if the
    # model emits prefixed tags (e.g. "B-NAME"), they will not match the keys
    # of ENTITY_COLORS -- confirm against the model config.

    special_tokens = ("[CLS]", "[SEP]", "[PAD]")

    # Reconstruct words and their labels
    highlighted_entities = []
    current_word = ""
    current_label = None
    for token, label in zip(tokens, predicted_labels):
        if token in special_tokens:
            continue

        # A "##" piece continues the word that is currently being built.
        if token.startswith("##"):
            current_word += token[2:]
            continue

        # A new word starts here: emit the finished one first.
        if current_word:
            highlighted_entities.append(_to_span(current_word, current_label))
            # Every word after the first carries a leading space so the
            # pieces read as separate words when displayed side by side.
            current_word = " " + token
        else:
            current_word = token
        current_label = label

    # Add the last word
    if current_word.strip():
        highlighted_entities.append(_to_span(current_word, current_label))

    # Group the detected words by entity type, in order of first appearance.
    detected_entities = {}
    for word, label in highlighted_entities:
        if label:
            detected_entities.setdefault(label, []).append(word.strip())

    if not detected_entities:
        return highlighted_entities, "No PII detected in the text."

    summary = "**Detected PII:**\n\n" + "".join(
        f"- **{entity_type}**: {', '.join(words)}\n"
        for entity_type, words in detected_entities.items()
    )
    return highlighted_entities, summary
# Multilingual sample sentences users can try.
_EXAMPLE_SENTENCES = (
    "My name is John Smith and my email is john.smith@example.com. I was born on January 15, 1985.",
    "Vui lòng gửi thanh toán đến IBAN GB29 NWBK 6016 1331 9268 19 hoặc gọi cho tôi theo số +1-555-123-4567.",
    "Mi nombre es María García y vivo en Calle Mayor 123, Madrid. Mi teléfono es +34-91-123-4567.",
    "Je m'appelle Pierre Dubois, mon email est pierre.dubois@email.fr et j'habite à Paris.",
    "私の社会保障番号は123-45-6789、クレジットカード番号は4532-1234-5678-9010です。血液型はO型です。",
    "车辆识别号: 1HGBH41JXMN109186, 联系电话: +86-138-0013-8000",
)

# Each example is a one-item row because the examples feed a single input.
examples = [[sentence] for sentence in _EXAMPLE_SENTENCES]
# Markdown rendered at the top of the page: title, short pitch and usage steps.
_INTRO_MARKDOWN = """
# 🌍 Multilingual PII Detector
This tool uses a fine-tuned **multilingual BERT model** to automatically detect and highlight personal information in text.
It can identify **39 different types** of PII including names, emails, phone numbers, SSN, dates, and more.
**Supports multiple languages!** 🌏
### How to use:
1. Enter or paste text in the box below (in any supported language)
2. Click "Detect PII" to analyze
3. View highlighted entities and summary
"""

# Markdown rendered at the bottom of the page: entity catalogue and model info.
_REFERENCE_MARKDOWN = """
### 🏷️ Detectable Entity Types (39 types):
**Identity**: NAME, USERNAME, PREFIX, GENDER, AGE, JOB, BLOODTYPE
**Contact**: EMAIL, PHONENUMBER, PHONEIMEI, STREET, ADDRESS, ZIPCODE, GEO, NEARBYGPSCOORDINATE
**Financial**: CREDITCARDNUMBER, CREDITCARDISSUER, IBAN, BIC, ACCOUNTNAME, CURRENCY, COINADDRESS
**Government IDs**: SSN (Social Security Number)
**Vehicle**: VEHICLEVIN, VEHICLEVRM
**Technical**: IP, MAC, URL, PASSWORD
**Organization**: ORG
**Temporal**: DATE, TIME
**Physical**: HEIGHT, WEIGHTS, COLOR
**Other**: NUM, ORDINALDIRECTION, MISC
---
**Model**: Multilingual BERT-base fine-tuned for PII detection
**Base Model**: [google-bert/bert-base-multilingual-uncased](https://huggingface.co/google-bert/bert-base-multilingual-uncased)
**Languages**: Supports 100+ languages including English, Spanish, French, German, Chinese, Arabic, and more!
"""

# Build the Gradio page.
# NOTE(review): the row/column nesting below was reconstructed from a copy of
# this file whose indentation had been stripped -- confirm it matches the
# intended layout.
with gr.Blocks(title="PII Detection with BERT", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        # Left column: text entry and the trigger button.
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to analyze for PII...",
                lines=6,
            )
            detect_btn = gr.Button("🔍 Detect PII", variant="primary")

        # Right column: colour-coded entities plus the Markdown summary.
        with gr.Column():
            output_highlighted = gr.HighlightedText(
                label="Highlighted PII Entities",
                combine_adjacent=True,
                color_map=ENTITY_COLORS,
            )
            output_summary = gr.Markdown(label="Summary")

    gr.Markdown("### 📝 Try these examples:")
    gr.Examples(examples=examples, inputs=input_text)

    gr.Markdown(_REFERENCE_MARKDOWN)

    # Run the detector both when the button is clicked and when the textbox
    # is submitted; the two triggers share the same inputs and outputs.
    for trigger in (detect_btn.click, input_text.submit):
        trigger(
            fn=detect_pii,
            inputs=input_text,
            outputs=[output_highlighted, output_summary],
        )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()