# Source: HuggingFace Space vuminhtue/NER_PII_Bert_Multilingual
# (Space status when captured: Sleeping; file size: 7,754 bytes)

"""
HuggingFace Space App for PII Detection
This app uses a BERT model to identify Personally Identifiable Information (PII) in text.
"""

import gradio as gr
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load the model and tokenizer directly from HuggingFace Hub
# This avoids needing to upload the large 667MB model file to the Space
# Both objects are created once, when this module is imported, and are then
# shared by every call to detect_pii().
MODEL_PATH = "vuminhtue/Bert_NER_PII_Multi_Lingual"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)

# Entity label colors for visualization.
# Maps an entity label to the hex color used for it by the HighlightedText
# component (passed as its `color_map`). Every label has its own color so that
# two entity types can never be confused in the highlighted output.
# NOTE(review): the keys must match the label strings produced by
# model.config.id2label exactly; confirm the model does not emit prefixed
# labels (e.g. "B-NAME"/"I-NAME"), which would not match these keys.
ENTITY_COLORS = {
    "NAME": "#FF6B6B",
    "EMAIL": "#4ECDC4",
    "CREDITCARDNUMBER": "#FFE66D",
    "IP": "#95E1D3",
    "PASSWORD": "#F38181",
    "STREET": "#AA96DA",
    "ACCOUNTNAME": "#FCBAD3",
    "USERNAME": "#A8E6CF",
    "ZIPCODE": "#FFD3B6",
    "IBAN": "#FFAAA5",
    "URL": "#FF8B94",
    "JOB": "#C7CEEA",
    "GENDER": "#FFDAC1",
    "ADDRESS": "#B5EAD7",
    "MAC": "#C9CBA3",
    "GEO": "#FFE2E2",
    "NEARBYGPSCOORDINATE": "#F7D9C4",
    "COINADDRESS": "#FAACA8",
    "CREDITCARDISSUER": "#DCD6F7",
    "CURRENCY": "#A6D9F7",
    "NUM": "#D4F1F4",
    "BIC": "#FFB6B9",
    "ORDINALDIRECTION": "#F6EAC2",
    "PHONENUMBER": "#FFB3BA",
    "SSN": "#FF677D",
    "DATE": "#BAE1FF",
    "TIME": "#FFFFB5",
    "AGE": "#FFDFBA",
    "ORG": "#BAFFC9",
    "VEHICLEVIN": "#D4A5A5",
    "VEHICLEVRM": "#9B9B9B",
    "PHONEIMEI": "#E0BBE4",
    "PREFIX": "#FFDFD3",
    # Previously "#C7CEEA", which duplicated the JOB color.
    "HEIGHT": "#8FBC8F",
    "WEIGHTS": "#F0E68C",
    "BLOODTYPE": "#FFB6C1",
    "COLOR": "#E6E6FA",
    "MISC": "#D3D3D3",
}


# Marker tokens added by the tokenizer that carry no user text.
_SKIPPED_TOKENS = ("[CLS]", "[SEP]", "[PAD]")

# Maximum number of tokens (marker tokens included) sent to the model.
_MAX_TOKENS = 512


def _merge_wordpieces(tokens, labels, offsets=None):
    """Group WordPiece tokens into whole words.

    A token starting with ``##`` continues the previous word; any other token
    starts a new one. A word takes the label predicted for its first piece.

    Args:
        tokens (list[str]): Tokens as returned by ``convert_ids_to_tokens``.
        labels (list[str]): Predicted label for each token.
        offsets (list | None): Per-token ``(start, end)`` character offsets
            into the original text, or None when they are unavailable.

    Returns:
        list[dict]: One dict per word with keys ``text`` (the word rebuilt
        from its pieces), ``label`` and ``end`` (character offset just past
        the word, or None without offsets).
    """
    words = []
    for index, (token, label) in enumerate(zip(tokens, labels)):
        if token in _SKIPPED_TOKENS:
            continue
        end = offsets[index][1] if offsets is not None else None
        is_continuation = token.startswith("##")
        piece = token[2:] if is_continuation else token
        if is_continuation and words:
            words[-1]["text"] += piece
            words[-1]["end"] = end
        else:
            words.append({"text": piece, "label": label, "end": end})
    return words


def detect_pii(text):
    """
    Detect PII entities in the input text.

    When the tokenizer can report character offsets, the highlighted chunks
    are slices of the original text, so casing, accents, punctuation spacing
    and characters unknown to the vocabulary are displayed exactly as typed.
    Otherwise the chunks are rebuilt from the tokens, separated by spaces.

    Args:
        text (str): Input text to analyze

    Returns:
        list: Highlighted entities for Gradio display, as (chunk, label)
            tuples where label is None for non-PII chunks. None when the
            input is empty, None or whitespace only.
        str: Summary of detected entities (Markdown)
    """
    if not text or not text.strip():
        return None, "Please enter some text to analyze."

    # Character offsets are only available from "fast" tokenizers; asking a
    # slow tokenizer for them raises, so check first.
    use_offsets = bool(getattr(tokenizer, "is_fast", False))
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=_MAX_TOKENS,
        return_offsets_mapping=use_offsets,
    )
    # offset_mapping is not a model input, so remove it before the forward pass.
    offsets = inputs.pop("offset_mapping")[0].tolist() if use_offsets else None

    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    predicted_labels = [
        model.config.id2label[label_id] for label_id in predictions[0].tolist()
    ]

    words = _merge_wordpieces(tokens, predicted_labels, offsets)

    highlighted_entities = []
    cursor = 0  # Offset in `text` just past the last emitted chunk.
    for word in words:
        if offsets is None:
            # No offsets: fall back to the token text, space-separated.
            chunk = " " + word["text"] if highlighted_entities else word["text"]
        else:
            # Take the word together with the whitespace preceding it, so the
            # chunks concatenate back to the original text.
            end = max(word["end"], cursor)
            chunk = text[cursor:end]
            cursor = end
        if not chunk:
            continue
        label = word["label"]
        highlighted_entities.append(
            (chunk, label if label and label != "O" else None)
        )

    # Keep any text after the last token so the display shows the full input.
    if offsets is not None and cursor < len(text):
        highlighted_entities.append((text[cursor:], None))

    # Create summary
    detected_entities = {}
    for chunk, label in highlighted_entities:
        if label:
            detected_entities.setdefault(label, []).append(chunk.strip())

    if detected_entities:
        summary = "**Detected PII:**\n\n" + "".join(
            f"- **{entity_type}**: {', '.join(found)}\n"
            for entity_type, found in detected_entities.items()
        )
    else:
        summary = "No PII detected in the text."

    # Truncation is otherwise silent: tell the user the tail was not analyzed.
    if len(tokens) >= _MAX_TOKENS:
        summary += (
            f"\n\n*Note: the input reached the {_MAX_TOKENS}-token limit; "
            "any text beyond it was not analyzed.*"
        )

    return highlighted_entities, summary


# Sample inputs offered to the user, one per language (English, Vietnamese,
# Spanish, French, Japanese, Chinese).
_EXAMPLE_TEXTS = (
    "My name is John Smith and my email is john.smith@example.com. I was born on January 15, 1985.",
    "Vui lòng gửi thanh toán đến IBAN GB29 NWBK 6016 1331 9268 19 hoặc gọi cho tôi theo số +1-555-123-4567.",
    "Mi nombre es María García y vivo en Calle Mayor 123, Madrid. Mi teléfono es +34-91-123-4567.",
    "Je m'appelle Pierre Dubois, mon email est pierre.dubois@email.fr et j'habite à Paris.",
    "私の社会保障番号は123-45-6789、クレジットカード番号は4532-1234-5678-9010です。血液型はO型です。",
    "车辆识别号: 1HGBH41JXMN109186, 联系电话: +86-138-0013-8000",
)

# gr.Examples expects one row per example, each row holding the value for
# every input component; there is a single input here, hence one-item rows.
examples = [[sample] for sample in _EXAMPLE_TEXTS]

# Create Gradio interface
# Components are declared inside the `gr.Blocks` context in the order they
# should appear: intro text, an input/output row, examples, a reference list
# of entity types, and finally the event wiring.
with gr.Blocks(title="PII Detection with BERT", theme=gr.themes.Soft()) as demo:
    # NOTE(review): the text below advertises 39 entity types, while the entity
    # list further down and ENTITY_COLORS each name 38 - confirm whether the
    # count is meant to include the non-entity "O" label.
    gr.Markdown(
        """
        # 🌍 Multilingual PII Detector
        
        This tool uses a fine-tuned **multilingual BERT model** to automatically detect and highlight personal information in text.
        It can identify **39 different types** of PII including names, emails, phone numbers, SSN, dates, and more.
        
        **Supports multiple languages!** 🌏
        
        ### How to use:
        1. Enter or paste text in the box below (in any supported language)
        2. Click "Detect PII" to analyze
        3. View highlighted entities and summary
        """
    )
    
    with gr.Row():
        # Left column: text input and the button that starts the analysis.
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to analyze for PII...",
                lines=6,
            )
            detect_btn = gr.Button("🔍 Detect PII", variant="primary")
        
        # Right column: the two outputs of detect_pii (highlighted text and
        # Markdown summary).
        with gr.Column():
            output_highlighted = gr.HighlightedText(
                label="Highlighted PII Entities",
                combine_adjacent=True,
                color_map=ENTITY_COLORS,
            )
            output_summary = gr.Markdown(label="Summary")
    
    gr.Markdown("### 📝 Try these examples:")
    # No `fn`/`outputs` are given, so the examples only feed `input_text`.
    gr.Examples(
        examples=examples,
        inputs=input_text,
    )
    
    gr.Markdown(
        """
        ### 🏷️ Detectable Entity Types (39 types):
        
        **Identity**: NAME, USERNAME, PREFIX, GENDER, AGE, JOB, BLOODTYPE  
        **Contact**: EMAIL, PHONENUMBER, PHONEIMEI, STREET, ADDRESS, ZIPCODE, GEO, NEARBYGPSCOORDINATE  
        **Financial**: CREDITCARDNUMBER, CREDITCARDISSUER, IBAN, BIC, ACCOUNTNAME, CURRENCY, COINADDRESS  
        **Government IDs**: SSN (Social Security Number)  
        **Vehicle**: VEHICLEVIN, VEHICLEVRM  
        **Technical**: IP, MAC, URL, PASSWORD  
        **Organization**: ORG  
        **Temporal**: DATE, TIME  
        **Physical**: HEIGHT, WEIGHTS, COLOR  
        **Other**: NUM, ORDINALDIRECTION, MISC
        
        ---
        **Model**: Multilingual BERT-base fine-tuned for PII detection  
        **Base Model**: [google-bert/bert-base-multilingual-uncased](https://huggingface.co/google-bert/bert-base-multilingual-uncased)  
        **Languages**: Supports 100+ languages including English, Spanish, French, German, Chinese, Arabic, and more!
        """
    )
    
    # Connect the button to the function
    detect_btn.click(
        fn=detect_pii,
        inputs=input_text,
        outputs=[output_highlighted, output_summary]
    )
    
    # Also run the same handler when the textbox fires its `submit` event.
    # NOTE(review): the textbox is multi-line (lines=6); confirm which key
    # combination fires `submit` there (plain Enter may just insert a newline).
    input_text.submit(
        fn=detect_pii,
        inputs=input_text,
        outputs=[output_highlighted, output_summary]
    )

# Launch the app
# Only start the server when this file is run as a script, so that importing
# the module (e.g. to reuse detect_pii) does not launch the UI.
if __name__ == "__main__":
    demo.launch()