Spaces:
Sleeping
Sleeping
Delete app.py
Browse files
app.py
DELETED
|
@@ -1,224 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
HuggingFace Space App for PII Detection
|
| 3 |
-
This app uses a BERT model to identify Personal Identifiable Information in text.
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
-
import gradio as gr
|
| 7 |
-
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
| 8 |
-
import torch
|
| 9 |
-
|
| 10 |
-
# Load the model and tokenizer once, at import time, so that every request
# handled by detect_pii() reuses the same objects.
MODEL_PATH = "./Bert_NER_PII_Multilingual"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
|
| 14 |
-
|
| 15 |
-
# Entity label colors for visualization.
# Keys are entity label names; the mapping is handed to gr.HighlightedText as
# its color_map, so a label without an entry here gets no custom color.
ENTITY_COLORS = {
    "NAME": "#FF6B6B",
    "EMAIL": "#4ECDC4",
    "CREDITCARDNUMBER": "#FFE66D",
    "IP": "#95E1D3",
    "PASSWORD": "#F38181",
    "STREET": "#AA96DA",
    "ACCOUNTNAME": "#FCBAD3",
    "USERNAME": "#A8E6CF",
    "ZIPCODE": "#FFD3B6",
    "IBAN": "#FFAAA5",
    "URL": "#FF8B94",
    "JOB": "#C7CEEA",
    "GENDER": "#FFDAC1",
    "ADDRESS": "#B5EAD7",
    "MAC": "#C9CBA3",
    "GEO": "#FFE2E2",
    "NEARBYGPSCOORDINATE": "#F7D9C4",
    "COINADDRESS": "#FAACA8",
    "CREDITCARDISSUER": "#DCD6F7",
    "CURRENCY": "#A6D9F7",
    "NUM": "#D4F1F4",
    "BIC": "#FFB6B9",
    "ORDINALDIRECTION": "#F6EAC2",
    "PHONENUMBER": "#FFB3BA",
    "SSN": "#FF677D",
    "DATE": "#BAE1FF",
    "TIME": "#FFFFB5",
    "AGE": "#FFDFBA",
    "ORG": "#BAFFC9",
    "VEHICLEVIN": "#D4A5A5",
    "VEHICLEVRM": "#9B9B9B",
    "PHONEIMEI": "#E0BBE4",
    "PREFIX": "#FFDFD3",
    "HEIGHT": "#C7CEEA",
    "WEIGHTS": "#F0E68C",
    "BLOODTYPE": "#FFB6C1",
    "COLOR": "#E6E6FA",
    "MISC": "#D3D3D3",
}
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
def detect_pii(text):
    """
    Detect PII entities in the input text.

    The text is tokenized and every token is classified by the model.
    Sub-word pieces are grouped into words, and each word takes the label
    predicted for its first piece. Entity spans are then cut out of the
    original string using the tokenizer's character offsets, so the
    highlighted text and the summary show the input exactly as typed instead
    of text re-assembled from normalized word-piece tokens. Neighbouring
    words of the same entity type that are separated by nothing or only by
    whitespace are reported as a single entity.

    Only the first 512 tokens are classified (the input is truncated for the
    model); text beyond that point is returned unlabeled rather than dropped.

    Args:
        text (str): Input text to analyze. None is treated like blank input.

    Returns:
        list: (segment, label) tuples for Gradio's HighlightedText, where
            label is None for non-PII segments. None when the input is blank.
        str: Markdown summary of detected entities

    Note:
        Character offsets are only provided by "fast" tokenizers.
        AutoTokenizer returns one by default when available -- TODO confirm
        this holds for the checkpoint at MODEL_PATH.
    """
    if not text or not text.strip():
        return None, "Please enter some text to analyze."

    # Tokenize input, also asking for each token's character offsets so that
    # spans can be sliced from the original string afterwards.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        return_offsets_mapping=True,
    )
    # The offsets are bookkeeping for this function, not a model input.
    offsets = inputs.pop("offset_mapping")[0].tolist()
    # One entry per token: index of the word it belongs to, None for special
    # tokens added by the tokenizer.
    word_ids = inputs.word_ids(0)

    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_ids = torch.argmax(outputs.logits, dim=2)[0].tolist()

    # Group sub-word pieces into words: [start, end, entity_type].
    words = []
    last_word_id = None
    for position, word_id in enumerate(word_ids):
        if word_id is None:
            continue  # special token, carries no text
        start, end = offsets[position]
        if word_id == last_word_id:
            # Continuation piece: extend the word, keep the first piece's label.
            words[-1][1] = end
            continue
        label = model.config.id2label[predicted_ids[position]]
        words.append([start, end, _entity_type(label)])
        last_word_id = word_id

    entity_spans = _merge_spans(
        text,
        [
            (start, end, entity)
            for start, end, entity in words
            if entity is not None and end > start
        ],
    )

    # Walk the original text once, emitting unlabeled gaps between entities.
    highlighted_entities = []
    detected_entities = {}
    cursor = 0
    for start, end, entity in entity_spans:
        if start > cursor:
            highlighted_entities.append((text[cursor:start], None))
        highlighted_entities.append((text[start:end], entity))
        detected_entities.setdefault(entity, []).append(text[start:end])
        cursor = end
    if cursor < len(text):
        # Trailing non-PII text, including anything past the truncation point.
        highlighted_entities.append((text[cursor:], None))

    # Create summary
    if detected_entities:
        summary = "**Detected PII:**\n\n" + "".join(
            f"- **{entity_type}**: {', '.join(values)}\n"
            for entity_type, values in detected_entities.items()
        )
    else:
        summary = "No PII detected in the text."

    return highlighted_entities, summary


def _entity_type(label):
    """Return the bare entity type for a model label, or None for "O"/empty.

    NOTE(review): the checkpoint's label scheme is not visible here. If it
    uses BIO/BILOU-style tags ("B-NAME", "I-NAME") the prefix is removed so
    labels line up with the un-prefixed ENTITY_COLORS keys; bare labels pass
    through unchanged.
    """
    if not label or label == "O":
        return None
    prefix, sep, rest = label.partition("-")
    if sep and rest and prefix in ("B", "I", "L", "U", "E", "S"):
        return rest
    return label


def _merge_spans(text, spans):
    """Fuse consecutive same-type spans separated only by whitespace (or nothing)."""
    merged = []
    for start, end, entity in spans:
        if merged:
            prev_start, prev_end, prev_entity = merged[-1]
            if entity == prev_entity and not text[prev_end:start].strip():
                merged[-1] = (prev_start, end, entity)
                continue
        merged.append((start, end, entity))
    return merged
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
# Example texts for users to try (multilingual).
# Each entry is a one-element list: one value for the single input component
# that gr.Examples is wired to.
examples = [
    ["My name is John Smith and my email is john.smith@example.com. I was born on January 15, 1985."],
    ["Please send the payment to IBAN GB29 NWBK 6016 1331 9268 19 or call me at +1-555-123-4567."],
    ["Mi nombre es María García y vivo en Calle Mayor 123, Madrid. Mi teléfono es +34-91-123-4567."],
    ["Je m'appelle Pierre Dubois, mon email est pierre.dubois@email.fr et j'habite à Paris."],
    ["My SSN is 123-45-6789 and my credit card number is 4532-1234-5678-9010. My blood type is O+."],
    ["车辆识别号: 1HGBH41JXMN109186, 联系电话: +86-138-0013-8000"],
]
|
| 143 |
-
|
| 144 |
-
# Create Gradio interface: an input column (textbox + button) next to an
# output column (highlighted text + summary), followed by examples and a
# reference list of entity types.
with gr.Blocks(title="PII Detection with BERT", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🌍 Multilingual PII Detector

        This tool uses a fine-tuned **multilingual BERT model** to automatically detect and highlight personal information in text.
        It can identify **39 different types** of PII including names, emails, phone numbers, SSN, dates, and more.

        **Supports multiple languages!** 🌏

        ### How to use:
        1. Enter or paste text in the box below (in any supported language)
        2. Click "Detect PII" to analyze
        3. View highlighted entities and summary
        """
    )

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to analyze for PII...",
                lines=6,
            )
            detect_btn = gr.Button("🔍 Detect PII", variant="primary")

        with gr.Column():
            output_highlighted = gr.HighlightedText(
                label="Highlighted PII Entities",
                combine_adjacent=True,
                color_map=ENTITY_COLORS,
            )
            output_summary = gr.Markdown(label="Summary")

    gr.Markdown("### 📝 Try these examples:")
    gr.Examples(
        examples=examples,
        inputs=input_text,
    )

    gr.Markdown(
        """
        ### 🏷️ Detectable Entity Types (39 types):

        **Identity**: NAME, USERNAME, PREFIX, GENDER, AGE, JOB, BLOODTYPE
        **Contact**: EMAIL, PHONENUMBER, PHONEIMEI, STREET, ADDRESS, ZIPCODE, GEO, NEARBYGPSCOORDINATE
        **Financial**: CREDITCARDNUMBER, CREDITCARDISSUER, IBAN, BIC, ACCOUNTNAME, CURRENCY, COINADDRESS
        **Government IDs**: SSN (Social Security Number)
        **Vehicle**: VEHICLEVIN, VEHICLEVRM
        **Technical**: IP, MAC, URL, PASSWORD
        **Organization**: ORG
        **Temporal**: DATE, TIME
        **Physical**: HEIGHT, WEIGHTS, COLOR
        **Other**: NUM, ORDINALDIRECTION, MISC

        ---
        **Model**: Multilingual BERT-base fine-tuned for PII detection
        **Base Model**: [google-bert/bert-base-multilingual-uncased](https://huggingface.co/google-bert/bert-base-multilingual-uncased)
        **Languages**: Supports 100+ languages including English, Spanish, French, German, Chinese, Arabic, and more!
        """
    )

    # Connect the button to the function
    detect_btn.click(
        fn=detect_pii,
        inputs=input_text,
        outputs=[output_highlighted, output_summary],
    )

    # Also trigger on Enter key
    input_text.submit(
        fn=detect_pii,
        inputs=input_text,
        outputs=[output_highlighted, output_summary],
    )
|
| 220 |
-
|
| 221 |
-
# Launch the app only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
|
| 224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|