Spaces:
Sleeping
Sleeping
"""
HuggingFace Space App for PII Detection
This app uses a BERT model to identify Personally Identifiable Information (PII) in text.
"""
| import gradio as gr | |
| from transformers import AutoTokenizer, AutoModelForTokenClassification | |
| import torch | |
# Checkpoint directory, resolved relative to the process working directory.
MODEL_PATH = "./Bert_base_NER_PII43k"

# Both objects are created once at import time and then shared by every
# request handled by the app.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
# Highlight colour (hex RGB) for each entity label, keyed by the label string.
# Passed to the HighlightedText component as its ``color_map``.
ENTITY_COLORS = {
    "NAME": "#FF6B6B",
    "EMAIL": "#4ECDC4",
    "CREDITCARDNUM": "#FFE66D",
    "IP": "#95E1D3",
    "PASSWORD": "#F38181",
    "STREET": "#AA96DA",
    "ACCOUNTNAME": "#FCBAD3",
    "ACCOUNTNUM": "#FFFFD2",
    "USERNAME": "#A8E6CF",
    "ZIPCODE": "#FFD3B6",
    "IBAN": "#FFAAA5",
    "URL": "#FF8B94",
    "JOB": "#C7CEEA",
    "GENDER": "#FFDAC1",
    "ADDRESS": "#B5EAD7",
    "MAC": "#C9CBA3",
    "GEO": "#FFE2E2",
    "NEARBYGPSCOORDINATE": "#F7D9C4",
    "COINADDRESS": "#FAACA8",
    "CREDITCARDISSUER": "#DCD6F7",
    "CURRENCY": "#A6D9F7",
    "DISPLAYNAME": "#FAD9A1",
    "NUM": "#D4F1F4",
    "BIC": "#FFB6B9",
    "USERAGENT": "#C2E9FB",
    "ORDINALDIRECTION": "#F6EAC2",
}
def detect_pii(text):
    """
    Detect PII entities in the input text.

    The text is tokenized with character offsets and every displayed chunk is
    sliced out of the *original* string, so casing, punctuation and spacing
    are shown exactly as the user typed them instead of being rebuilt from
    (possibly lower-cased or ``[UNK]``) WordPiece tokens.

    Requires a fast tokenizer: ``return_offsets_mapping`` and ``word_ids()``
    are not available on the pure-Python tokenizers.

    Args:
        text (str): Input text to analyze. ``None`` and whitespace-only input
            are both treated as empty.

    Returns:
        list: ``(chunk, label)`` tuples for Gradio's HighlightedText, where
            ``label`` is ``None`` for non-PII text; concatenating all chunks
            reproduces the input. ``None`` when the input is empty.
        str: Markdown summary of the detected entities grouped by label.
    """
    from itertools import groupby

    if not text or not text.strip():
        return None, "Please enter some text to analyze."

    max_tokens = 512

    # Tokenize input, asking for character offsets into the original text
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_tokens,
        return_offsets_mapping=True,
    )
    word_ids = inputs.word_ids(0)
    # The offsets are bookkeeping for this function only; remove them so they
    # are not forwarded to the model as an unexpected keyword argument.
    offsets = inputs.pop("offset_mapping")[0].tolist()

    # Get predictions
    with torch.no_grad():
        logits = model(**inputs).logits
    label_ids = logits.argmax(dim=-1)[0].tolist()
    id2label = model.config.id2label

    # Walk the tokens once, emitting one chunk per word. Each chunk also
    # carries the untokenized text (normally whitespace) that precedes it, so
    # adjacent chunks of the same label stay contiguous.
    highlighted_entities = []
    cursor = 0  # index of the first character not yet emitted
    previous_word = None
    for word_id, (_start, end), label_id in zip(word_ids, offsets, label_ids):
        if word_id is None:  # special tokens such as [CLS] / [SEP]
            continue
        piece = text[cursor:end]
        cursor = end
        if word_id == previous_word:
            # Continuation wordpiece: extend the current word and keep the
            # label that was assigned from its first wordpiece.
            chunk, chunk_label = highlighted_entities[-1]
            highlighted_entities[-1] = (chunk + piece, chunk_label)
            continue
        previous_word = word_id
        label = id2label[label_id]
        highlighted_entities.append(
            (piece, label if label and label != "O" else None)
        )

    # Whatever follows the last token (trailing whitespace, or everything past
    # the token limit) is kept in the display as unlabelled text.
    tail = text[cursor:]
    if tail:
        highlighted_entities.append((tail, None))
    truncated = len(word_ids) >= max_tokens and bool(tail.strip())

    # Create summary: merge runs of adjacent chunks that share a label so a
    # multi-token entity is reported as one span rather than token by token.
    detected_entities = {}
    for label, run in groupby(highlighted_entities, key=lambda pair: pair[1]):
        if label is None:
            continue
        span = "".join(chunk for chunk, _ in run).strip()
        detected_entities.setdefault(label, []).append(span)

    if detected_entities:
        bullet_lines = [
            f"- **{entity_type}**: {', '.join(spans)}"
            for entity_type, spans in detected_entities.items()
        ]
        summary = "**Detected PII:**\n\n" + "\n".join(bullet_lines) + "\n"
    else:
        summary = "No PII detected in the text."

    if truncated:
        summary += (
            f"\n\n*Note: the input exceeds the model's {max_tokens}-token "
            "limit; text past that point was not analyzed.*"
        )

    return highlighted_entities, summary
# Sample inputs offered in the UI. Each sentence is wrapped in its own
# one-element list, giving one row per example for the single input component.
examples = [
    [sentence]
    for sentence in (
        "My name is John Smith and my email is john.smith@example.com. I live at 123 Main Street.",
        "Please send the payment to IBAN GB29 NWBK 6016 1331 9268 19 or call me at my office.",
        "Contact Sarah Johnson at sarah.j@company.org for more details about the project.",
        "My credit card number is 4532-1234-5678-9010 and my username is mike_user123.",
    )
]
# Create Gradio interface: input box and button on the left, highlighted
# result and summary on the right, followed by examples and reference text.
with gr.Blocks(title="PII Detection with BERT", theme=gr.themes.Soft()) as demo:
    # NOTE(review): the text below advertises 27 entity types, but both
    # ENTITY_COLORS and the "Detectable Entity Types" list name 26 -- confirm
    # the figure against the model's label set.
    gr.Markdown(
        """
        # 🔒 Personal Identifiable Information (PII) Detector

        This tool uses a fine-tuned BERT model to automatically detect and highlight personal information in text.
        It can identify **27 different types** of PII including names, emails, addresses, credit cards, and more.

        ### How to use:

        1. Enter or paste text in the box below
        2. Click "Detect PII" to analyze
        3. View highlighted entities and summary
        """
    )

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to analyze for PII...",
                lines=6,
            )
            detect_btn = gr.Button("🔍 Detect PII", variant="primary")
        with gr.Column():
            output_highlighted = gr.HighlightedText(
                label="Highlighted PII Entities",
                combine_adjacent=True,
                color_map=ENTITY_COLORS,
            )
            output_summary = gr.Markdown(label="Summary")

    gr.Markdown("### 📝 Try these examples:")
    gr.Examples(
        examples=examples,
        inputs=input_text,
    )

    # Blank lines separate the paragraphs; in particular the rule ("---")
    # needs one before it, otherwise Markdown reads it as a heading underline
    # for the preceding line.
    gr.Markdown(
        """
        ### 🏷️ Detectable Entity Types:

        **Identity**: NAME, USERNAME, DISPLAYNAME, GENDER, JOB

        **Contact**: EMAIL, STREET, ADDRESS, ZIPCODE, GEO, NEARBYGPSCOORDINATE

        **Financial**: CREDITCARDNUM, CREDITCARDISSUER, IBAN, BIC, ACCOUNTNAME, ACCOUNTNUM, CURRENCY, COINADDRESS

        **Technical**: IP, MAC, URL, USERAGENT, PASSWORD

        **Other**: NUM, ORDINALDIRECTION

        ---

        **Model**: BERT-base fine-tuned on [ai4privacy/pii-masking-43k](https://huggingface.co/datasets/ai4privacy/pii-masking-43k) dataset

        **Base Model**: [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased)
        """
    )

    # Run detection from the button and from the textbox's keyboard submit
    # (Shift+Enter, because the textbox is multi-line). Both events share the
    # same handler, input and outputs.
    for register in (detect_btn.click, input_text.submit):
        register(
            fn=detect_pii,
            inputs=input_text,
            outputs=[output_highlighted, output_summary],
        )
# Start the Gradio server only when this file is executed as a script, so
# importing the module does not launch it.
if __name__ == "__main__":
    demo.launch()