# Source: HuggingFace Space by vuminhtue
# Commit 543c945 (verified): "Update app.py"
"""
HuggingFace Space App for PII Detection
This app uses a BERT model to identify Personally Identifiable Information (PII) in text.
"""
import gradio as gr
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
# Load the model and tokenizer directly from the HuggingFace Hub.
# This avoids needing to upload the large 667MB model file to the Space.
# Both objects are created once at import time and reused by every request;
# MODEL_PATH is the single identifier handed to both from_pretrained() calls.
MODEL_PATH = "vuminhtue/Bert_NER_PII_Multi_Lingual"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
# Entity label colors for visualization.
# Maps each entity label to the hex color used by the HighlightedText
# component. Every label must have its own color so that two different
# entity types are never rendered identically (HEIGHT previously shared
# "#C7CEEA" with JOB).
ENTITY_COLORS = {
    "NAME": "#FF6B6B",
    "EMAIL": "#4ECDC4",
    "CREDITCARDNUMBER": "#FFE66D",
    "IP": "#95E1D3",
    "PASSWORD": "#F38181",
    "STREET": "#AA96DA",
    "ACCOUNTNAME": "#FCBAD3",
    "USERNAME": "#A8E6CF",
    "ZIPCODE": "#FFD3B6",
    "IBAN": "#FFAAA5",
    "URL": "#FF8B94",
    "JOB": "#C7CEEA",
    "GENDER": "#FFDAC1",
    "ADDRESS": "#B5EAD7",
    "MAC": "#C9CBA3",
    "GEO": "#FFE2E2",
    "NEARBYGPSCOORDINATE": "#F7D9C4",
    "COINADDRESS": "#FAACA8",
    "CREDITCARDISSUER": "#DCD6F7",
    "CURRENCY": "#A6D9F7",
    "NUM": "#D4F1F4",
    "BIC": "#FFB6B9",
    "ORDINALDIRECTION": "#F6EAC2",
    "PHONENUMBER": "#FFB3BA",
    "SSN": "#FF677D",
    "DATE": "#BAE1FF",
    "TIME": "#FFFFB5",
    "AGE": "#FFDFBA",
    "ORG": "#BAFFC9",
    "VEHICLEVIN": "#D4A5A5",
    "VEHICLEVRM": "#9B9B9B",
    "PHONEIMEI": "#E0BBE4",
    "PREFIX": "#FFDFD3",
    "HEIGHT": "#B8E0D2",
    "WEIGHTS": "#F0E68C",
    "BLOODTYPE": "#FFB6C1",
    "COLOR": "#E6E6FA",
    "MISC": "#D3D3D3",
}
# Tokens inserted by the tokenizer that never correspond to user text.
_SPECIAL_TOKENS = ("[CLS]", "[SEP]", "[PAD]")


def _display_label(label):
    """Return the label to show in the UI, or None for non-entity text."""
    return label if label and label != "O" else None


def _merge_wordpieces(tokens, labels):
    """Join word-piece tokens back into words, each paired with a label.

    A word takes the label predicted for its first word piece. Every word
    after the first carries a leading space so the pieces read as running
    text when concatenated.
    """
    segments = []
    word = ""
    word_label = None
    for token, label in zip(tokens, labels):
        if token in _SPECIAL_TOKENS:
            continue
        if token.startswith("##"):
            # Continuation piece: glue onto the word in progress.
            word += token[2:]
            continue
        if word:
            # A new word starts here, so emit the finished one first.
            segments.append((word, _display_label(word_label)))
            word = " "  # separator between consecutive words
        word += token
        word_label = label
    if word.strip():
        segments.append((word, _display_label(word_label)))
    return segments


def _summarize(segments):
    """Build the Markdown summary listing the detected words per entity type."""
    by_label = {}
    for word, label in segments:
        if label:
            by_label.setdefault(label, []).append(word.strip())
    if not by_label:
        return "No PII detected in the text."
    bullet_lines = [
        f"- **{entity_type}**: {', '.join(words)}\n"
        for entity_type, words in by_label.items()
    ]
    return "**Detected PII:**\n\n" + "".join(bullet_lines)


def detect_pii(text):
    """
    Detect PII entities in the input text.

    Args:
        text (str): Input text to analyze. ``None``, empty and
            whitespace-only input are all treated as "nothing to analyze".

    Returns:
        list: ``(text, label)`` tuples for Gradio's HighlightedText, where
            ``label`` is ``None`` for non-entity text; ``None`` when there
            is nothing to analyze.
        str: Markdown summary of the detected entities.

    Notes:
        The input is truncated to 512 tokens, so anything past that limit
        is neither analyzed nor shown in the output.
        The displayed text is rebuilt from the tokenizer's tokens rather
        than sliced from the input, so it can differ from what the user
        typed wherever the tokenizer normalizes or splits the text.
    """
    # Guard against None as well as blank strings so the UI never crashes.
    if not text or not text.strip():
        return None, "Please enter some text to analyze."

    # Tokenize input
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # Get predictions (inference only, no gradients needed)
    with torch.no_grad():
        outputs = model(**inputs)
    label_ids = torch.argmax(outputs.logits, dim=2)[0].tolist()

    # Align each token with its predicted label name
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    predicted_labels = [model.config.id2label[label_id] for label_id in label_ids]

    highlighted_entities = _merge_wordpieces(tokens, predicted_labels)
    return highlighted_entities, _summarize(highlighted_entities)
# Multilingual sample inputs offered to users in the UI.
# Each sample is wrapped in its own one-element list, giving one row per
# example with a single value for the single input component.
examples = [
    [sample]
    for sample in (
        "My name is John Smith and my email is john.smith@example.com. I was born on January 15, 1985.",
        "Vui lòng gửi thanh toán đến IBAN GB29 NWBK 6016 1331 9268 19 hoặc gọi cho tôi theo số +1-555-123-4567.",
        "Mi nombre es María García y vivo en Calle Mayor 123, Madrid. Mi teléfono es +34-91-123-4567.",
        "Je m'appelle Pierre Dubois, mon email est pierre.dubois@email.fr et j'habite à Paris.",
        "私の社会保障番号は123-45-6789、クレジットカード番号は4532-1234-5678-9010です。血液型はO型です。",
        "车辆识别号: 1HGBH41JXMN109186, 联系电话: +86-138-0013-8000",
    )
]
# Create Gradio interface.
# Layout: an intro, then a two-column row (input + button on the left,
# highlighted result + summary on the right), then clickable examples and a
# reference list of entity types.
# Markdown paragraphs are separated by blank lines on purpose: without them
# consecutive lines merge into one paragraph, and a "---" directly under a
# text line is parsed as a heading underline instead of a horizontal rule.
with gr.Blocks(title="PII Detection with BERT", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🌍 Multilingual PII Detector

        This tool uses a fine-tuned **multilingual BERT model** to automatically detect and highlight personal information in text.
        It can identify **39 different types** of PII including names, emails, phone numbers, SSN, dates, and more.

        **Supports multiple languages!** 🌏

        ### How to use:

        1. Enter or paste text in the box below (in any supported language)
        2. Click "Detect PII" to analyze
        3. View highlighted entities and summary
        """
    )

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to analyze for PII...",
                lines=6,
            )
            detect_btn = gr.Button("🔍 Detect PII", variant="primary")

        with gr.Column():
            output_highlighted = gr.HighlightedText(
                label="Highlighted PII Entities",
                combine_adjacent=True,
                color_map=ENTITY_COLORS,
            )
            output_summary = gr.Markdown(label="Summary")

    gr.Markdown("### 📝 Try these examples:")
    gr.Examples(
        examples=examples,
        inputs=input_text,
    )

    gr.Markdown(
        """
        ### 🏷️ Detectable Entity Types (39 types):

        **Identity**: NAME, USERNAME, PREFIX, GENDER, AGE, JOB, BLOODTYPE

        **Contact**: EMAIL, PHONENUMBER, PHONEIMEI, STREET, ADDRESS, ZIPCODE, GEO, NEARBYGPSCOORDINATE

        **Financial**: CREDITCARDNUMBER, CREDITCARDISSUER, IBAN, BIC, ACCOUNTNAME, CURRENCY, COINADDRESS

        **Government IDs**: SSN (Social Security Number)

        **Vehicle**: VEHICLEVIN, VEHICLEVRM

        **Technical**: IP, MAC, URL, PASSWORD

        **Organization**: ORG

        **Temporal**: DATE, TIME

        **Physical**: HEIGHT, WEIGHTS, COLOR

        **Other**: NUM, ORDINALDIRECTION, MISC

        ---

        **Model**: Multilingual BERT-base fine-tuned for PII detection

        **Base Model**: [google-bert/bert-base-multilingual-uncased](https://huggingface.co/google-bert/bert-base-multilingual-uncased)

        **Languages**: Supports 100+ languages including English, Spanish, French, German, Chinese, Arabic, and more!
        """
    )

    # Run detection both when the button is clicked and when the textbox
    # fires its submit event; the two triggers share one handler and the
    # same input/output components, so they are registered in a single loop.
    for register_trigger in (detect_btn.click, input_text.submit):
        register_trigger(
            fn=detect_pii,
            inputs=input_text,
            outputs=[output_highlighted, output_summary],
        )

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()