vuminhtue committed on
Commit
aab1356
·
verified ·
1 Parent(s): c607b13

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +225 -0
app.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HuggingFace Space App for PII Detection
3
+ This app uses a BERT model to identify Personally Identifiable Information (PII) in text.
4
+ """
5
+
6
+ import gradio as gr
7
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
8
+ import torch
9
+
10
# Load the model and tokenizer directly from the HuggingFace Hub.
# This avoids needing to upload the large 667MB model file to the Space.
# Both objects are created once at import time and reused by every request.
MODEL_PATH = "vuminhtue/Bert_NER_PII_Multi_Lingual"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
15
+
16
# Entity label colors for visualization: maps an entity label to the hex
# color used when that label is highlighted in the UI.
# NOTE(review): this map has 38 entries while the UI copy advertises 39
# entity types -- confirm against the model's label set.
# NOTE(review): "JOB" and "HEIGHT" share the same color (#C7CEEA).
ENTITY_COLORS = {
    "NAME": "#FF6B6B",
    "EMAIL": "#4ECDC4",
    "CREDITCARDNUMBER": "#FFE66D",
    "IP": "#95E1D3",
    "PASSWORD": "#F38181",
    "STREET": "#AA96DA",
    "ACCOUNTNAME": "#FCBAD3",
    "USERNAME": "#A8E6CF",
    "ZIPCODE": "#FFD3B6",
    "IBAN": "#FFAAA5",
    "URL": "#FF8B94",
    "JOB": "#C7CEEA",
    "GENDER": "#FFDAC1",
    "ADDRESS": "#B5EAD7",
    "MAC": "#C9CBA3",
    "GEO": "#FFE2E2",
    "NEARBYGPSCOORDINATE": "#F7D9C4",
    "COINADDRESS": "#FAACA8",
    "CREDITCARDISSUER": "#DCD6F7",
    "CURRENCY": "#A6D9F7",
    "NUM": "#D4F1F4",
    "BIC": "#FFB6B9",
    "ORDINALDIRECTION": "#F6EAC2",
    "PHONENUMBER": "#FFB3BA",
    "SSN": "#FF677D",
    "DATE": "#BAE1FF",
    "TIME": "#FFFFB5",
    "AGE": "#FFDFBA",
    "ORG": "#BAFFC9",
    "VEHICLEVIN": "#D4A5A5",
    "VEHICLEVRM": "#9B9B9B",
    "PHONEIMEI": "#E0BBE4",
    "PREFIX": "#FFDFD3",
    "HEIGHT": "#C7CEEA",
    "WEIGHTS": "#F0E68C",
    "BLOODTYPE": "#FFB6C1",
    "COLOR": "#E6E6FA",
    "MISC": "#D3D3D3",
}
57
+
58
+
59
# Tokens the tokenizer inserts that do not correspond to any input text.
_SPECIAL_TOKENS = frozenset({"[CLS]", "[SEP]", "[PAD]"})

# Label the model uses for tokens that are not part of any entity.
_OUTSIDE_LABEL = "O"

_EMPTY_INPUT_MESSAGE = "Please enter some text to analyze."
_NO_PII_MESSAGE = "No PII detected in the text."


def _entity_or_none(label):
    """Return ``label`` if it names an entity, otherwise ``None``."""
    return label if label and label != _OUTSIDE_LABEL else None


def _merge_wordpieces(tokens, labels):
    """Merge word-piece tokens into words and pair each word with a label.

    Tokens starting with ``##`` are appended to the word in progress; any
    other token starts a new word. A word takes the label predicted for its
    first token. Every word after the first is prefixed with one space so the
    pieces read as text when concatenated.

    Args:
        tokens: Token strings, special tokens included.
        labels: Predicted label string for each token, in the same order.

    Returns:
        list[tuple[str, str | None]]: ``(word, label)`` pairs, where the
        label is ``None`` for words outside any entity.
    """
    segments = []
    word = ""
    word_label = None

    for token, label in zip(tokens, labels):
        if token in _SPECIAL_TOKENS:
            continue

        if token.startswith("##"):
            # Continuation piece: extend the word, keep its first label.
            word += token[2:]
            continue

        if word:
            segments.append((word, _entity_or_none(word_label)))
            word = " "  # separator between consecutive words

        word += token
        word_label = label

    # Flush the word still in progress when the tokens run out.
    if word.strip():
        segments.append((word, _entity_or_none(word_label)))

    return segments


def _build_summary(segments):
    """Build the Markdown summary that lists detected words per entity type."""
    words_by_entity = {}
    for word, label in segments:
        if label:
            words_by_entity.setdefault(label, []).append(word.strip())

    if not words_by_entity:
        return _NO_PII_MESSAGE

    bullet_lines = [
        f"- **{entity_type}**: {', '.join(words)}\n"
        for entity_type, words in words_by_entity.items()
    ]
    return "**Detected PII:**\n\n" + "".join(bullet_lines)


def detect_pii(text):
    """
    Detect PII entities in the input text.

    The text is tokenized (truncated to at most 512 tokens), classified token
    by token with the module-level ``model``, and the word pieces are merged
    back into words. The displayed words are rebuilt from tokenizer tokens,
    not sliced from the original string.

    Args:
        text (str): Input text to analyze. ``None``, empty, or
            whitespace-only input is rejected with a prompt message.

    Returns:
        list: Highlighted entities for Gradio display, as ``(word, label)``
            tuples with ``label`` set to ``None`` for non-entity words;
            ``None`` when there is no input to analyze.
        str: Summary of detected entities
    """
    # Guard against None as well as blank strings; None has no .strip().
    if not text or not text.strip():
        return None, _EMPTY_INPUT_MESSAGE

    # Tokenize input
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # Get predictions (inference only, so no gradients are tracked)
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)

    # Convert ids back to token strings and map class ids to label names
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    id2label = model.config.id2label
    predicted_labels = [id2label[class_id] for class_id in predictions[0].tolist()]

    highlighted_entities = _merge_wordpieces(tokens, predicted_labels)
    summary = _build_summary(highlighted_entities)

    return highlighted_entities, summary
133
+
134
+
135
# Example texts for users to try (multilingual).
# Each entry is a one-element list holding the example input text.
examples = [
    ["My name is John Smith and my email is john.smith@example.com. I was born on January 15, 1985."],
    ["Please send the payment to IBAN GB29 NWBK 6016 1331 9268 19 or call me at +1-555-123-4567."],
    ["Mi nombre es María García y vivo en Calle Mayor 123, Madrid. Mi teléfono es +34-91-123-4567."],
    ["Je m'appelle Pierre Dubois, mon email est pierre.dubois@email.fr et j'habite à Paris."],
    ["My SSN is 123-45-6789 and my credit card number is 4532-1234-5678-9010. My blood type is O+."],
    ["车辆识别号: 1HGBH41JXMN109186, 联系电话: +86-138-0013-8000"],
]
144
+
145
# Create Gradio interface: intro text, an input/output row, clickable
# examples, a reference list of entity types, and the event wiring.
with gr.Blocks(title="PII Detection with BERT", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🌍 Multilingual PII Detector

        This tool uses a fine-tuned **multilingual BERT model** to automatically detect and highlight personal information in text.
        It can identify **39 different types** of PII including names, emails, phone numbers, SSN, dates, and more.

        **Supports multiple languages!** 🌏

        ### How to use:
        1. Enter or paste text in the box below (in any supported language)
        2. Click "Detect PII" to analyze
        3. View highlighted entities and summary
        """
    )

    with gr.Row():
        # Left column: text input and the trigger button
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to analyze for PII...",
                lines=6,
            )
            detect_btn = gr.Button("🔍 Detect PII", variant="primary")

        # Right column: highlighted text plus the Markdown summary
        with gr.Column():
            output_highlighted = gr.HighlightedText(
                label="Highlighted PII Entities",
                combine_adjacent=True,
                color_map=ENTITY_COLORS,
            )
            output_summary = gr.Markdown(label="Summary")

    gr.Markdown("### 📝 Try these examples:")
    gr.Examples(
        examples=examples,
        inputs=input_text,
    )

    gr.Markdown(
        """
        ### 🏷️ Detectable Entity Types (39 types):

        **Identity**: NAME, USERNAME, PREFIX, GENDER, AGE, JOB, BLOODTYPE
        **Contact**: EMAIL, PHONENUMBER, PHONEIMEI, STREET, ADDRESS, ZIPCODE, GEO, NEARBYGPSCOORDINATE
        **Financial**: CREDITCARDNUMBER, CREDITCARDISSUER, IBAN, BIC, ACCOUNTNAME, CURRENCY, COINADDRESS
        **Government IDs**: SSN (Social Security Number)
        **Vehicle**: VEHICLEVIN, VEHICLEVRM
        **Technical**: IP, MAC, URL, PASSWORD
        **Organization**: ORG
        **Temporal**: DATE, TIME
        **Physical**: HEIGHT, WEIGHTS, COLOR
        **Other**: NUM, ORDINALDIRECTION, MISC

        ---
        **Model**: Multilingual BERT-base fine-tuned for PII detection
        **Base Model**: [google-bert/bert-base-multilingual-uncased](https://huggingface.co/google-bert/bert-base-multilingual-uncased)
        **Languages**: Supports 100+ languages including English, Spanish, French, German, Chinese, Arabic, and more!
        """
    )

    # Connect the button to the function
    detect_btn.click(
        fn=detect_pii,
        inputs=input_text,
        outputs=[output_highlighted, output_summary],
    )

    # Also trigger on the textbox's submit event (Enter key).
    # NOTE(review): with a multi-line Textbox (lines=6) Gradio may require
    # Shift+Enter to submit -- confirm against the installed Gradio version.
    input_text.submit(
        fn=detect_pii,
        inputs=input_text,
        outputs=[output_highlighted, output_summary],
    )
221
+
222
# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
225
+