Spaces:

vuminhtue
/

NER_PII_Bert_Multilingual

Running

App Files Files Community

vuminhtue committed on Oct 14, 2025

Commit

aab1356

verified ·

1 Parent(s): c607b13

Upload app.py

Browse files

Files changed (1) hide show

app.py +225 -0

app.py ADDED Viewed

	@@ -0,0 +1,225 @@

+"""
+HuggingFace Space App for PII Detection
+This app uses a BERT model to identify Personally Identifiable Information (PII) in text.
+"""
+import gradio as gr
+from transformers import AutoTokenizer, AutoModelForTokenClassification
+import torch
+# Hub repository that holds both the fine-tuned weights and the tokenizer files.
+# Fetching them at start-up means the large (667MB) checkpoint does not have to
+# be uploaded into the Space repository itself.
+MODEL_PATH = "vuminhtue/Bert_NER_PII_Multi_Lingual"
+model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+# Highlight colour (hex) for each entity label shown in the UI.
+# Every label must map to a distinct colour, otherwise two entity types become
+# indistinguishable in the highlighted output.
+ENTITY_COLORS = {
+    "NAME": "#FF6B6B",
+    "EMAIL": "#4ECDC4",
+    "CREDITCARDNUMBER": "#FFE66D",
+    "IP": "#95E1D3",
+    "PASSWORD": "#F38181",
+    "STREET": "#AA96DA",
+    "ACCOUNTNAME": "#FCBAD3",
+    "USERNAME": "#A8E6CF",
+    "ZIPCODE": "#FFD3B6",
+    "IBAN": "#FFAAA5",
+    "URL": "#FF8B94",
+    "JOB": "#C7CEEA",
+    "GENDER": "#FFDAC1",
+    "ADDRESS": "#B5EAD7",
+    "MAC": "#C9CBA3",
+    "GEO": "#FFE2E2",
+    "NEARBYGPSCOORDINATE": "#F7D9C4",
+    "COINADDRESS": "#FAACA8",
+    "CREDITCARDISSUER": "#DCD6F7",
+    "CURRENCY": "#A6D9F7",
+    "NUM": "#D4F1F4",
+    "BIC": "#FFB6B9",
+    "ORDINALDIRECTION": "#F6EAC2",
+    "PHONENUMBER": "#FFB3BA",
+    "SSN": "#FF677D",
+    "DATE": "#BAE1FF",
+    "TIME": "#FFFFB5",
+    "AGE": "#FFDFBA",
+    "ORG": "#BAFFC9",
+    "VEHICLEVIN": "#D4A5A5",
+    "VEHICLEVRM": "#9B9B9B",
+    "PHONEIMEI": "#E0BBE4",
+    "PREFIX": "#FFDFD3",
+    # Previously shared "#C7CEEA" with JOB; given its own colour.
+    "HEIGHT": "#E8A87C",
+    "WEIGHTS": "#F0E68C",
+    "BLOODTYPE": "#FFB6C1",
+    "COLOR": "#E6E6FA",
+    "MISC": "#D3D3D3",
+}
+def detect_pii(text):
+    """
+    Detect PII entities in the input text.
+    The text is tokenized (truncated to 512 tokens) and every token is
+    classified by the model. Word-piece continuations (tokens starting with
+    "##") take the label predicted for the first piece of their word. The
+    tokenizer's character offsets are then used to slice the original string,
+    so casing, accents, punctuation and spacing are shown exactly as typed
+    rather than being rebuilt from normalized tokens. Neighbouring words that
+    share a label and are separated only by whitespace are reported as one
+    entity.
+    Args:
+        text (str): Input text to analyze. None, empty or whitespace-only
+            input is answered with a prompt message.
+    Returns:
+        list: (text_span, label) tuples for Gradio display, covering the whole
+            input in order; label is None for spans without PII. None when
+            there is nothing to analyze.
+        str: Markdown summary of the detected entities grouped by label.
+    """
+    if not text or not text.strip():
+        return None, "Please enter some text to analyze."
+    max_tokens = 512
+    # NOTE: return_offsets_mapping is only available on fast tokenizers.
+    inputs = tokenizer(
+        text,
+        return_tensors="pt",
+        truncation=True,
+        max_length=max_tokens,
+        return_offsets_mapping=True,
+    )
+    # Offsets are bookkeeping for this function, not a model input.
+    offsets = inputs.pop("offset_mapping")[0].tolist()
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    label_ids = torch.argmax(logits, dim=2)[0].tolist()
+    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
+    # Group word pieces into words as [start, end, label] character ranges.
+    words = []
+    for token, (start, end), label_id in zip(tokens, offsets, label_ids):
+        # Tokens that map to no input characters ([CLS], [SEP], [PAD]) are skipped.
+        if start == end:
+            continue
+        if token.startswith("##") and words:
+            # Continuation piece: extend the current word, keep its label.
+            words[-1][1] = end
+            continue
+        label = model.config.id2label[label_id]
+        words.append([start, end, label if label and label != "O" else None])
+    # Merge neighbouring words with the same label into one entity span.
+    spans = []
+    for start, end, label in words:
+        if label is None:
+            continue
+        if spans and spans[-1][2] == label and not text[spans[-1][1]:start].strip():
+            spans[-1][1] = end
+        else:
+            spans.append([start, end, label])
+    # Emit the original text in order; everything between entities is unlabeled.
+    highlighted_entities = []
+    cursor = 0
+    for start, end, label in spans:
+        if start > cursor:
+            highlighted_entities.append((text[cursor:start], None))
+        highlighted_entities.append((text[start:end], label))
+        cursor = end
+    if cursor < len(text):
+        # Also keeps any text beyond the token limit visible (unlabeled).
+        highlighted_entities.append((text[cursor:], None))
+    # Create summary
+    detected_entities = {}
+    for start, end, label in spans:
+        # Collapse internal whitespace so an entity never breaks its bullet line.
+        detected_entities.setdefault(label, []).append(" ".join(text[start:end].split()))
+    if detected_entities:
+        summary = "**Detected PII:**\n\n" + "".join(
+            f"- **{entity_type}**: {', '.join(found)}\n"
+            for entity_type, found in detected_entities.items()
+        )
+    else:
+        summary = "No PII detected in the text."
+    # Tell the user when part of the input was cut off by truncation.
+    analyzed_end = words[-1][1] if words else 0
+    if len(tokens) >= max_tokens and text[analyzed_end:].strip():
+        summary += (
+            f"\n\n*Note: the input exceeds the {max_tokens}-token limit; "
+            "text beyond that point was not analyzed.*"
+        )
+    return highlighted_entities, summary
+# Sample inputs offered to users (English, Spanish, French and Chinese)
+_EXAMPLE_TEXTS = (
+    "My name is John Smith and my email is john.smith@example.com. I was born on January 15, 1985.",
+    "Please send the payment to IBAN GB29 NWBK 6016 1331 9268 19 or call me at +1-555-123-4567.",
+    "Mi nombre es María García y vivo en Calle Mayor 123, Madrid. Mi teléfono es +34-91-123-4567.",
+    "Je m'appelle Pierre Dubois, mon email est pierre.dubois@email.fr et j'habite à Paris.",
+    "My SSN is 123-45-6789 and my credit card number is 4532-1234-5678-9010. My blood type is O+.",
+    "车辆识别号: 1HGBH41JXMN109186, 联系电话: +86-138-0013-8000",
+)
+# One single-element row per sample text
+examples = [[sample] for sample in _EXAMPLE_TEXTS]
+# Assemble the page: intro, input/output columns, examples, reference notes
+with gr.Blocks(title="PII Detection with BERT", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # 🌍 Multilingual PII Detector
+        This tool uses a fine-tuned **multilingual BERT model** to automatically detect and highlight personal information in text.
+        It can identify **39 different types** of PII including names, emails, phone numbers, SSN, dates, and more.
+        **Supports multiple languages!** 🌏
+        ### How to use:
+        1. Enter or paste text in the box below (in any supported language)
+        2. Click "Detect PII" to analyze
+        3. View highlighted entities and summary
+        """
+    )
+    with gr.Row():
+        # Left column: text entry and the action button
+        with gr.Column():
+            input_text = gr.Textbox(
+                label="Input Text",
+                placeholder="Enter text to analyze for PII...",
+                lines=6,
+            )
+            detect_btn = gr.Button("🔍 Detect PII", variant="primary")
+        # Right column: colour-coded text and the Markdown summary
+        with gr.Column():
+            output_highlighted = gr.HighlightedText(
+                label="Highlighted PII Entities",
+                combine_adjacent=True,
+                color_map=ENTITY_COLORS,
+            )
+            output_summary = gr.Markdown(label="Summary")
+    gr.Markdown("### 📝 Try these examples:")
+    # No fn/outputs are passed here, only the example rows and the input box
+    gr.Examples(examples=examples, inputs=input_text)
+    gr.Markdown(
+        """
+        ### 🏷️ Detectable Entity Types (39 types):
+        **Identity**: NAME, USERNAME, PREFIX, GENDER, AGE, JOB, BLOODTYPE
+        **Contact**: EMAIL, PHONENUMBER, PHONEIMEI, STREET, ADDRESS, ZIPCODE, GEO, NEARBYGPSCOORDINATE
+        **Financial**: CREDITCARDNUMBER, CREDITCARDISSUER, IBAN, BIC, ACCOUNTNAME, CURRENCY, COINADDRESS
+        **Government IDs**: SSN (Social Security Number)
+        **Vehicle**: VEHICLEVIN, VEHICLEVRM
+        **Technical**: IP, MAC, URL, PASSWORD
+        **Organization**: ORG
+        **Temporal**: DATE, TIME
+        **Physical**: HEIGHT, WEIGHTS, COLOR
+        **Other**: NUM, ORDINALDIRECTION, MISC
+        ---
+        **Model**: Multilingual BERT-base fine-tuned for PII detection
+        **Base Model**: [google-bert/bert-base-multilingual-uncased](https://huggingface.co/google-bert/bert-base-multilingual-uncased)
+        **Languages**: Supports 100+ languages including English, Spanish, French, German, Chinese, Arabic, and more!
+        """
+    )
+    # The button click and the textbox submit share one handler configuration
+    detect_event = dict(
+        fn=detect_pii,
+        inputs=input_text,
+        outputs=[output_highlighted, output_summary],
+    )
+    detect_btn.click(**detect_event)
+    # NOTE(review): with lines=6 the submit event is expected to fire on
+    # Shift+Enter rather than plain Enter; confirm against the Gradio version in use.
+    input_text.submit(**detect_event)
+# Start the server only when this file is run as a script
+if __name__ == "__main__":
+    demo.launch()