Spaces:

vuminhtue
/

Bert_base_NER_PII_43k

Sleeping

File size: 6,646 Bytes

2a4d835

"""
HuggingFace Space App for PII Detection
This app uses a BERT model to identify Personally Identifiable Information (PII) in text.
"""

import gradio as gr
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load the model and tokenizer.
# Both objects are created once, at import time, and are then shared as
# module-level globals by every call to detect_pii().
# MODEL_PATH is a relative path, so it is resolved against the process's
# current working directory; the directory is expected to hold both the
# tokenizer files and the token-classification weights, since the same path
# is passed to both from_pretrained() calls.
MODEL_PATH = "./Bert_base_NER_PII43k"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)

# Highlight palette: one (entity label, hex color) row per entity type.
# Row order is kept as-is because it becomes the dict's iteration order.
_ENTITY_COLOR_TABLE = (
    ("NAME", "#FF6B6B"),
    ("EMAIL", "#4ECDC4"),
    ("CREDITCARDNUM", "#FFE66D"),
    ("IP", "#95E1D3"),
    ("PASSWORD", "#F38181"),
    ("STREET", "#AA96DA"),
    ("ACCOUNTNAME", "#FCBAD3"),
    ("ACCOUNTNUM", "#FFFFD2"),
    ("USERNAME", "#A8E6CF"),
    ("ZIPCODE", "#FFD3B6"),
    ("IBAN", "#FFAAA5"),
    ("URL", "#FF8B94"),
    ("JOB", "#C7CEEA"),
    ("GENDER", "#FFDAC1"),
    ("ADDRESS", "#B5EAD7"),
    ("MAC", "#C9CBA3"),
    ("GEO", "#FFE2E2"),
    ("NEARBYGPSCOORDINATE", "#F7D9C4"),
    ("COINADDRESS", "#FAACA8"),
    ("CREDITCARDISSUER", "#DCD6F7"),
    ("CURRENCY", "#A6D9F7"),
    ("DISPLAYNAME", "#FAD9A1"),
    ("NUM", "#D4F1F4"),
    ("BIC", "#FFB6B9"),
    ("USERAGENT", "#C2E9FB"),
    ("ORDINALDIRECTION", "#F6EAC2"),
)

# Entity label -> color lookup used for visualization of detected entities.
ENTITY_COLORS = dict(_ENTITY_COLOR_TABLE)


def detect_pii(text):
    """
    Detect PII entities in the input text.

    The text is tokenized with character offsets, and every returned segment
    is a slice of the *original* input. This keeps the user's casing,
    punctuation and spacing intact instead of showing re-joined word pieces
    (which, for an uncased BERT tokenizer, are lower-cased and separated
    around every punctuation mark).

    Requires a "fast" tokenizer, because only those can return offsets.

    Args:
        text (str): Input text to analyze. ``None`` is treated like blank
            input.

    Returns:
        list: ``(segment, label)`` tuples for Gradio's HighlightedText.
            Segments are consecutive slices of ``text``; ``label`` is
            ``None`` for text that is not PII. ``None`` is returned instead
            of a list when the input is blank.
        str: Markdown summary of detected entities, grouped by entity type.
            A note is appended when part of the input was cut off by the
            token limit and therefore not analyzed.
    """
    # `not text` also covers None, which would otherwise crash on .strip().
    if not text or not text.strip():
        return None, "Please enter some text to analyze."

    max_tokens = 512

    # Tokenize input, asking for the character span of every token.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_tokens,
        return_offsets_mapping=True,
    )
    # The offsets are only needed here; the model's forward() does not
    # accept them, so they must be removed before calling the model.
    offsets = inputs.pop("offset_mapping")[0].tolist()
    # One entry per token: index of the word it belongs to, None for
    # special tokens such as [CLS] / [SEP].
    word_ids = inputs.word_ids(0)

    # Get predictions
    with torch.no_grad():
        logits = model(**inputs).logits
    label_ids = torch.argmax(logits, dim=2)[0].tolist()

    # Collapse word pieces into words. Each entry is [end_offset, label];
    # a word keeps the label predicted for its first piece.
    word_spans = []
    previous_word_id = None
    for word_id, (_, end), label_id in zip(word_ids, offsets, label_ids):
        if word_id is None:
            continue
        if word_id == previous_word_id:
            word_spans[-1][0] = end
        else:
            word_spans.append([end, model.config.id2label[label_id]])
            previous_word_id = word_id

    # Slice the original text. Each segment starts where the previous one
    # ended, so whitespace between words travels with the following word and
    # adjacent same-label words stay adjacent for the highlighter.
    highlighted_entities = []
    cursor = 0
    for end, label in word_spans:
        shown_label = label if label and label != "O" else None
        highlighted_entities.append((text[cursor:end], shown_label))
        cursor = end

    # Whatever is left is trailing whitespace or text cut off by truncation;
    # keep it (unlabeled) so the display shows the complete input.
    tail = text[cursor:]
    if tail:
        highlighted_entities.append((tail, None))
    truncated = len(word_ids) >= max_tokens and bool(tail.strip())

    # Group mentions by entity type, gluing directly adjacent segments that
    # share a label into a single mention (e.g. "John" + " Smith").
    detected_entities = {}
    previous_label = None
    for segment, label in highlighted_entities:
        if label is not None:
            mentions = detected_entities.setdefault(label, [])
            if label == previous_label:
                mentions[-1] += segment
            else:
                mentions.append(segment)
        previous_label = label

    # Create summary
    if detected_entities:
        summary = "**Detected PII:**\n\n"
        for entity_type, mentions in detected_entities.items():
            # split/join trims the ends and collapses inner line breaks that
            # would otherwise break the Markdown list item.
            cleaned = [" ".join(mention.split()) for mention in mentions]
            summary += f"- **{entity_type}**: {', '.join(cleaned)}\n"
    else:
        summary = "No PII detected in the text."

    if truncated:
        summary += (
            f"\n\n_Note: the input exceeds the model's {max_tokens}-token "
            "limit; the remaining text was not analyzed._"
        )

    return highlighted_entities, summary


# Sample sentences users can click to try the detector
_EXAMPLE_SENTENCES = (
    "My name is John Smith and my email is john.smith@example.com. I live at 123 Main Street.",
    "Please send the payment to IBAN GB29 NWBK 6016 1331 9268 19 or call me at my office.",
    "Contact Sarah Johnson at sarah.j@company.org for more details about the project.",
    "My credit card number is 4532-1234-5678-9010 and my username is mike_user123.",
)

# One row per example; each row is a single-item list holding the sentence.
examples = [[sentence] for sentence in _EXAMPLE_SENTENCES]

# Static Markdown for the page header (title, blurb, usage steps)
_INTRO_MD = """
        # 🔍 Personal Identifiable Information (PII) Detector
        
        This tool uses a fine-tuned BERT model to automatically detect and highlight personal information in text.
        It can identify **27 different types** of PII including names, emails, addresses, credit cards, and more.
        
        ### How to use:
        1. Enter or paste text in the box below
        2. Click "Detect PII" to analyze
        3. View highlighted entities and summary
        """

# Static Markdown for the page footer (entity catalogue and model credits)
_FOOTER_MD = """
        ### 🏷️ Detectable Entity Types:
        
        **Identity**: NAME, USERNAME, DISPLAYNAME, GENDER, JOB  
        **Contact**: EMAIL, STREET, ADDRESS, ZIPCODE, GEO, NEARBYGPSCOORDINATE  
        **Financial**: CREDITCARDNUM, CREDITCARDISSUER, IBAN, BIC, ACCOUNTNAME, ACCOUNTNUM, CURRENCY, COINADDRESS  
        **Technical**: IP, MAC, URL, USERAGENT, PASSWORD  
        **Other**: NUM, ORDINALDIRECTION
        
        ---
        **Model**: BERT-base fine-tuned on [ai4privacy/pii-masking-43k](https://huggingface.co/datasets/ai4privacy/pii-masking-43k) dataset  
        **Base Model**: [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased)
        """

# Build the page: header, input/output columns, examples, footer, events
with gr.Blocks(title="PII Detection with BERT", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_INTRO_MD)

    with gr.Row():
        # Left column: where the user types and triggers the analysis
        with gr.Column():
            input_text = gr.Textbox(
                lines=6,
                label="Input Text",
                placeholder="Enter text to analyze for PII...",
            )
            detect_btn = gr.Button("🔍 Detect PII", variant="primary")

        # Right column: highlighted text plus the Markdown summary
        with gr.Column():
            output_highlighted = gr.HighlightedText(
                color_map=ENTITY_COLORS,
                combine_adjacent=True,
                label="Highlighted PII Entities",
            )
            output_summary = gr.Markdown(label="Summary")

    gr.Markdown("### 📝 Try these examples:")
    gr.Examples(inputs=input_text, examples=examples)

    gr.Markdown(_FOOTER_MD)

    # The button click and the textbox submit event run the same detection
    # with the same input and outputs; the click listener is registered first.
    for register_listener in (detect_btn.click, input_text.submit):
        register_listener(
            fn=detect_pii,
            inputs=input_text,
            outputs=[output_highlighted, output_summary],
        )

# Start the server only when run as a script, not when imported
if __name__ == "__main__":
    demo.launch()