vuminhtue committed on
Commit
c607b13
·
verified ·
1 Parent(s): de2069b

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -224
app.py DELETED
@@ -1,224 +0,0 @@
1
- """
2
- HuggingFace Space App for PII Detection
3
- This app uses a BERT model to identify Personal Identifiable Information in text.
4
- """
5
-
6
- import gradio as gr
7
- from transformers import AutoTokenizer, AutoModelForTokenClassification
8
- import torch
9
-
10
# Directory (relative to the working directory) holding the fine-tuned checkpoint.
MODEL_PATH = "./Bert_NER_PII_Multilingual"

# Both objects are built once at import time and reused by detect_pii().
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
14
-
15
# Highlight colour for each entity label, passed to gr.HighlightedText as
# ``color_map``.  Every label must map to a *distinct* colour so that two
# different entity types can be told apart in the rendered output.
ENTITY_COLORS = {
    "NAME": "#FF6B6B",
    "EMAIL": "#4ECDC4",
    "CREDITCARDNUMBER": "#FFE66D",
    "IP": "#95E1D3",
    "PASSWORD": "#F38181",
    "STREET": "#AA96DA",
    "ACCOUNTNAME": "#FCBAD3",
    "USERNAME": "#A8E6CF",
    "ZIPCODE": "#FFD3B6",
    "IBAN": "#FFAAA5",
    "URL": "#FF8B94",
    "JOB": "#C7CEEA",
    "GENDER": "#FFDAC1",
    "ADDRESS": "#B5EAD7",
    "MAC": "#C9CBA3",
    "GEO": "#FFE2E2",
    "NEARBYGPSCOORDINATE": "#F7D9C4",
    "COINADDRESS": "#FAACA8",
    "CREDITCARDISSUER": "#DCD6F7",
    "CURRENCY": "#A6D9F7",
    "NUM": "#D4F1F4",
    "BIC": "#FFB6B9",
    "ORDINALDIRECTION": "#F6EAC2",
    "PHONENUMBER": "#FFB3BA",
    "SSN": "#FF677D",
    "DATE": "#BAE1FF",
    "TIME": "#FFFFB5",
    "AGE": "#FFDFBA",
    "ORG": "#BAFFC9",
    "VEHICLEVIN": "#D4A5A5",
    "VEHICLEVRM": "#9B9B9B",
    "PHONEIMEI": "#E0BBE4",
    "PREFIX": "#FFDFD3",
    # Previously "#C7CEEA", which collided with JOB and made the two labels
    # indistinguishable when highlighted.
    "HEIGHT": "#B2F7EF",
    "WEIGHTS": "#F0E68C",
    "BLOODTYPE": "#FFB6C1",
    "COLOR": "#E6E6FA",
    "MISC": "#D3D3D3",
}
56
-
57
-
58
# Marker tokens that carry no user text and must not appear in the output.
_SKIPPED_TOKENS = frozenset({"[CLS]", "[SEP]", "[PAD]"})

# Upper bound on the tokenized input length handed to the model.
_MAX_TOKENS = 512


def _merge_wordpieces(tokens, labels):
    """Join WordPiece tokens into ``[word, label]`` pairs.

    A token starting with ``##`` is glued onto the preceding word; any other
    token starts a new word, which takes the label predicted for that first
    piece.  Tokens listed in ``_SKIPPED_TOKENS`` are dropped.
    """
    words = []
    for token, label in zip(tokens, labels):
        if token in _SKIPPED_TOKENS:
            continue
        if token.startswith("##"):
            if words:
                words[-1][0] += token[2:]
            else:
                # Continuation piece with nothing before it: keep the text,
                # but there is no word-level label to attach.
                words.append([token[2:], None])
        else:
            words.append([token, label])
    return words


def detect_pii(text):
    """
    Detect PII entities in the input text.

    Args:
        text (str): Input text to analyze. ``None`` and blank/whitespace-only
            strings are treated as "no input".

    Returns:
        tuple: ``(highlighted_entities, summary)`` where

        * ``highlighted_entities`` is a list of ``(text, label)`` pairs for
          Gradio's HighlightedText (``label`` is ``None`` for words that are
          not PII), or ``None`` when there was no input;
        * ``summary`` is a Markdown string listing the detected entities
          grouped by label, or a short message when nothing was found.
    """
    # A missing value would crash on .strip(); handle it like a blank input.
    if not text or not text.strip():
        return None, "Please enter some text to analyze."

    # Tokenize input (anything beyond _MAX_TOKENS tokens is cut off).
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=_MAX_TOKENS
    )

    # Get predictions; inference only, so gradients are not needed.
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)

    # Map ids back to token strings and label names (first/only batch item).
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    predicted_labels = [
        model.config.id2label[label_id] for label_id in predictions[0].tolist()
    ]

    # Rebuild words; every word after the first gets a leading space so the
    # concatenated spans read as a sentence.  "O" (outside) means "not PII".
    highlighted_entities = []
    for index, (word, label) in enumerate(_merge_wordpieces(tokens, predicted_labels)):
        shown = f" {word}" if index else word
        is_entity = bool(label) and label != "O"
        highlighted_entities.append((shown, label if is_entity else None))

    # Group detected words by entity type, preserving first-seen order.
    detected_entities = {}
    for word, label in highlighted_entities:
        if label:
            detected_entities.setdefault(label, []).append(word.strip())

    if detected_entities:
        summary = "**Detected PII:**\n\n" + "".join(
            f"- **{entity_type}**: {', '.join(words)}\n"
            for entity_type, words in detected_entities.items()
        )
    else:
        summary = "No PII detected in the text."

    # Hitting the cap means the tail of the text may have been dropped by
    # truncation; say so instead of silently reporting on a partial input.
    if inputs["input_ids"].shape[1] >= _MAX_TOKENS:
        summary += (
            f"\n\n_Note: the input reached the {_MAX_TOKENS}-token limit; "
            "any text beyond it was not analyzed._"
        )

    return highlighted_entities, summary
132
-
133
-
134
# Sample inputs (several languages) offered to the user in the UI.
_EXAMPLE_SENTENCES = (
    "My name is John Smith and my email is john.smith@example.com. I was born on January 15, 1985.",
    "Please send the payment to IBAN GB29 NWBK 6016 1331 9268 19 or call me at +1-555-123-4567.",
    "Mi nombre es María García y vivo en Calle Mayor 123, Madrid. Mi teléfono es +34-91-123-4567.",
    "Je m'appelle Pierre Dubois, mon email est pierre.dubois@email.fr et j'habite à Paris.",
    "My SSN is 123-45-6789 and my credit card number is 4532-1234-5678-9010. My blood type is O+.",
    "车辆识别号: 1HGBH41JXMN109186, 联系电话: +86-138-0013-8000",
)

# One single-element row per sentence: each row fills the single text input.
examples = [[sentence] for sentence in _EXAMPLE_SENTENCES]
143
-
144
# Markdown shown at the top of the page: title, short description, usage steps.
_INTRO_MARKDOWN = """
# 🌍 Multilingual PII Detector

This tool uses a fine-tuned **multilingual BERT model** to automatically detect and highlight personal information in text.
It can identify **39 different types** of PII including names, emails, phone numbers, SSN, dates, and more.

**Supports multiple languages!** 🌏

### How to use:
1. Enter or paste text in the box below (in any supported language)
2. Click "Detect PII" to analyze
3. View highlighted entities and summary
"""

# Markdown shown at the bottom of the page: entity catalogue and model details.
_ENTITY_TYPES_MARKDOWN = """
### 🏷️ Detectable Entity Types (39 types):

**Identity**: NAME, USERNAME, PREFIX, GENDER, AGE, JOB, BLOODTYPE
**Contact**: EMAIL, PHONENUMBER, PHONEIMEI, STREET, ADDRESS, ZIPCODE, GEO, NEARBYGPSCOORDINATE
**Financial**: CREDITCARDNUMBER, CREDITCARDISSUER, IBAN, BIC, ACCOUNTNAME, CURRENCY, COINADDRESS
**Government IDs**: SSN (Social Security Number)
**Vehicle**: VEHICLEVIN, VEHICLEVRM
**Technical**: IP, MAC, URL, PASSWORD
**Organization**: ORG
**Temporal**: DATE, TIME
**Physical**: HEIGHT, WEIGHTS, COLOR
**Other**: NUM, ORDINALDIRECTION, MISC

---
**Model**: Multilingual BERT-base fine-tuned for PII detection
**Base Model**: [google-bert/bert-base-multilingual-uncased](https://huggingface.co/google-bert/bert-base-multilingual-uncased)
**Languages**: Supports 100+ languages including English, Spanish, French, German, Chinese, Arabic, and more!
"""

# Build the page: intro, input column + results column, examples, reference text.
with gr.Blocks(title="PII Detection with BERT", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        # Left column: text entry and the button that starts the analysis.
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to analyze for PII...",
                lines=6,
            )
            detect_btn = gr.Button("🔍 Detect PII", variant="primary")

        # Right column: colour-coded text plus the Markdown summary.
        with gr.Column():
            output_highlighted = gr.HighlightedText(
                label="Highlighted PII Entities",
                combine_adjacent=True,
                color_map=ENTITY_COLORS,
            )
            output_summary = gr.Markdown(label="Summary")

    gr.Markdown("### 📝 Try these examples:")
    gr.Examples(examples=examples, inputs=input_text)

    gr.Markdown(_ENTITY_TYPES_MARKDOWN)

    # The button click and the textbox submit event run the same handler
    # with the same input and outputs.
    detection_wiring = {
        "fn": detect_pii,
        "inputs": input_text,
        "outputs": [output_highlighted, output_summary],
    }
    detect_btn.click(**detection_wiring)
    input_text.submit(**detection_wiring)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
224
-