vuminhtue's picture
Upload 12 files
2a4d835 verified
"""
HuggingFace Space App for PII Detection
This app uses a BERT model to identify Personally Identifiable Information (PII) in text.
"""
import gradio as gr
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
# Directory (relative to the working directory) that holds the fine-tuned
# checkpoint; the tokenizer and the model weights are both read from it.
MODEL_PATH = "./Bert_base_NER_PII43k"

# Loaded once at import time and reused by detect_pii for every request.
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Highlight colour (hex RGB) for each entity label; passed to the
# HighlightedText component as its color_map.
ENTITY_COLORS = {
    "NAME": "#FF6B6B",
    "EMAIL": "#4ECDC4",
    "CREDITCARDNUM": "#FFE66D",
    "IP": "#95E1D3",
    "PASSWORD": "#F38181",
    "STREET": "#AA96DA",
    "ACCOUNTNAME": "#FCBAD3",
    "ACCOUNTNUM": "#FFFFD2",
    "USERNAME": "#A8E6CF",
    "ZIPCODE": "#FFD3B6",
    "IBAN": "#FFAAA5",
    "URL": "#FF8B94",
    "JOB": "#C7CEEA",
    "GENDER": "#FFDAC1",
    "ADDRESS": "#B5EAD7",
    "MAC": "#C9CBA3",
    "GEO": "#FFE2E2",
    "NEARBYGPSCOORDINATE": "#F7D9C4",
    "COINADDRESS": "#FAACA8",
    "CREDITCARDISSUER": "#DCD6F7",
    "CURRENCY": "#A6D9F7",
    "DISPLAYNAME": "#FAD9A1",
    "NUM": "#D4F1F4",
    "BIC": "#FFB6B9",
    "USERAGENT": "#C2E9FB",
    "ORDINALDIRECTION": "#F6EAC2",
}
# Marker tokens added by the tokenizer; they are never shown in the output.
_SPECIAL_TOKENS = frozenset({"[CLS]", "[SEP]", "[PAD]"})

_EMPTY_INPUT_MESSAGE = "Please enter some text to analyze."


def _to_span(word, label):
    """Pair *word* with its label, mapping "O" (or a missing label) to None."""
    return (word, label if label and label != "O" else None)


def _merge_wordpieces(tokens, labels):
    """Rebuild whole words from wordpiece tokens, one label per word.

    Args:
        tokens: Token strings in sequence order; pieces that continue a word
            start with "##".
        labels: Predicted label for each token, aligned with ``tokens``.

    Returns:
        list[tuple[str, str | None]]: ``(word, label)`` pairs. Every word
        after the first carries a leading space so the pieces read as running
        text when concatenated. The label is the one predicted for the word's
        first piece, or None when that label is "O".
    """
    spans = []
    word = ""
    word_label = None
    for token, label in zip(tokens, labels):
        if token in _SPECIAL_TOKENS:
            continue
        if token.startswith("##"):
            # Continuation piece: glue it onto the word being built.
            word += token[2:]
            continue
        if word:
            # A new word starts here, so the previous one is complete.
            spans.append(_to_span(word, word_label))
            word = " "  # separator in front of the next word
        word += token
        word_label = label
    if word.strip():
        spans.append(_to_span(word, word_label))
    return spans


def _build_summary(spans):
    """Format the labelled words in *spans* as a Markdown bullet list per label."""
    by_label = {}
    for word, label in spans:
        if label:
            by_label.setdefault(label, []).append(word.strip())
    if not by_label:
        return "No PII detected in the text."
    bullets = [
        f"- **{entity_type}**: {', '.join(words)}\n"
        for entity_type, words in by_label.items()
    ]
    return "**Detected PII:**\n\n" + "".join(bullets)


def detect_pii(text):
    """
    Detect PII entities in the input text.

    The displayed words are rebuilt from the tokenizer's wordpieces rather
    than sliced from ``text``, so spacing around punctuation can differ from
    the input (and casing too, if the tokenizer lowercases - TODO confirm).
    Input longer than 512 tokens is truncated by the tokenizer call, so text
    past that point does not appear in the result.

    Args:
        text (str | None): Input text to analyze.

    Returns:
        tuple: ``(highlighted_entities, summary)`` where
            highlighted_entities is a list of ``(word, label_or_None)`` pairs
            for Gradio's HighlightedText (None for empty input), and summary
            is a Markdown string listing the detected words per entity type.
    """
    # None can arrive from a cleared textbox or an API call; treat it like "".
    if not text or not text.strip():
        return None, _EMPTY_INPUT_MESSAGE

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)

    # Single-sequence batch: index 0 is the only row.
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    predicted_labels = [
        model.config.id2label[label_id] for label_id in predictions[0].tolist()
    ]

    highlighted_entities = _merge_wordpieces(tokens, predicted_labels)
    return highlighted_entities, _build_summary(highlighted_entities)
# Sample inputs offered in the UI. gr.Examples expects one row per example,
# each row being a list with one value per input component (here: one textbox).
_EXAMPLE_TEXTS = (
    "My name is John Smith and my email is john.smith@example.com. I live at 123 Main Street.",
    "Please send the payment to IBAN GB29 NWBK 6016 1331 9268 19 or call me at my office.",
    "Contact Sarah Johnson at sarah.j@company.org for more details about the project.",
    "My credit card number is 4532-1234-5678-9010 and my username is mike_user123.",
)
examples = [[sample_text] for sample_text in _EXAMPLE_TEXTS]
# Static Markdown for the page. Written as joined line literals so the
# rendered text does not depend on how this file is indented.
# NOTE(review): the intro advertises 27 entity types, while the list further
# down and ENTITY_COLORS both enumerate 26 - confirm against the model's
# label set before changing either number.
_INTRO_MARKDOWN = (
    "# 🔍 Personal Identifiable Information (PII) Detector\n"
    "This tool uses a fine-tuned BERT model to automatically detect and highlight personal information in text.\n"
    "It can identify **27 different types** of PII including names, emails, addresses, credit cards, and more.\n"
    "### How to use:\n"
    "1. Enter or paste text in the box below\n"
    '2. Click "Detect PII" to analyze\n'
    "3. View highlighted entities and summary\n"
)

# Blank lines separate the categories so each renders as its own paragraph,
# and isolate "---" so it is a horizontal rule rather than a setext-heading
# underline for the line above it.
_ENTITY_TYPES_MARKDOWN = (
    "### 🏷️ Detectable Entity Types:\n\n"
    "**Identity**: NAME, USERNAME, DISPLAYNAME, GENDER, JOB\n\n"
    "**Contact**: EMAIL, STREET, ADDRESS, ZIPCODE, GEO, NEARBYGPSCOORDINATE\n\n"
    "**Financial**: CREDITCARDNUM, CREDITCARDISSUER, IBAN, BIC, ACCOUNTNAME, ACCOUNTNUM, CURRENCY, COINADDRESS\n\n"
    "**Technical**: IP, MAC, URL, USERAGENT, PASSWORD\n\n"
    "**Other**: NUM, ORDINALDIRECTION\n\n"
    "---\n\n"
    "**Model**: BERT-base fine-tuned on [ai4privacy/pii-masking-43k](https://huggingface.co/datasets/ai4privacy/pii-masking-43k) dataset\n\n"
    "**Base Model**: [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased)\n"
)

# Build the Gradio UI: input on the left, results on the right, then the
# clickable examples and the reference list of entity types.
with gr.Blocks(title="PII Detection with BERT", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to analyze for PII...",
                lines=6,
            )
            detect_btn = gr.Button("🔍 Detect PII", variant="primary")
        with gr.Column():
            output_highlighted = gr.HighlightedText(
                label="Highlighted PII Entities",
                combine_adjacent=True,
                color_map=ENTITY_COLORS,
            )
            output_summary = gr.Markdown(label="Summary")

    gr.Markdown("### 📝 Try these examples:")
    gr.Examples(
        examples=examples,
        inputs=input_text,
    )

    gr.Markdown(_ENTITY_TYPES_MARKDOWN)

    # Run detection both on button click and when the textbox is submitted;
    # the two triggers share exactly the same wiring.
    for register_trigger in (detect_btn.click, input_text.submit):
        register_trigger(
            fn=detect_pii,
            inputs=input_text,
            outputs=[output_highlighted, output_summary],
        )
# Start the Gradio server only when this file is run as a script, so that
# importing the module does not launch it.
if __name__ == "__main__":
    demo.launch()