rishabh5752 committed on
Commit
98e775c
·
verified ·
1 Parent(s): 3e57e50

Added App.py and Requirements.txt

Browse files
Files changed (2) hide show
  1. app (9).py +939 -0
  2. requirements (8).txt +5 -0
app (9).py ADDED
@@ -0,0 +1,939 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ import re
4
+ import json
5
+ import torch
6
+ from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
7
+ import faker
8
+ from typing import List, Dict, Any, Optional
9
+ import pandas as pd
10
+
11
class EnhancedPiiProtectionPipeline:
    """
    A comprehensive PII protection pipeline that:
    1. Uses regex for all detectable patterns first
    2. Uses multiple custom NER models for remaining detection
    3. Provides three protection methods: labeling, masking, and synthesis
    4. Handles general, Indian-specific, address, and medical contexts
    """

    def __init__(
        self,
        main_model_name: str = "Kashish-jain/pii-protection-model",
        medical_model_name: str = "Kashish-jain/pii-protection-medical",
        use_medical_model: bool = False
    ):
        """
        Initialize the comprehensive PII protection pipeline.

        Args:
            main_model_name: HuggingFace model name or path for the main PII model
            medical_model_name: HuggingFace model name for the medical NER model
            use_medical_model: Whether to load and use the medical model
        """
        has_cuda = torch.cuda.is_available()
        # Always defined (previously only set when the medical model was requested).
        self.device = torch.device("cuda" if has_cuda else "cpu")

        # Main model
        self.main_tokenizer = AutoTokenizer.from_pretrained(main_model_name)
        self.main_model = pipeline(
            "ner",
            model=main_model_name,
            tokenizer=self.main_tokenizer,
            aggregation_strategy="simple"
        )

        # Address-specific model - implementation simplified
        self.address_model = self.main_model  # Fallback to main model for simplicity

        # Medical model
        self.use_medical_model = use_medical_model
        self.medical_model = None
        self.medical_tokenizer = None

        if use_medical_model and medical_model_name:
            try:
                self.medical_tokenizer = AutoTokenizer.from_pretrained(medical_model_name)
                self.medical_model = pipeline(
                    "ner",
                    model=medical_model_name,
                    tokenizer=self.medical_tokenizer,
                    aggregation_strategy="simple",
                    device=0 if has_cuda else -1
                )
                print(f"Medical model '{medical_model_name}' loaded successfully")
            except Exception as e:
                # Best effort: fall back to regex-only detection for medical text.
                print(f"Warning: Could not load medical model. Error: {str(e)}")
                self.use_medical_model = False

        self.faker = faker.Faker('en_IN')

        self._init_regex_patterns()

    def _init_regex_patterns(self) -> None:
        """Populate the regex pattern dictionaries used by regex_detection."""
        # Set up regex patterns for common PII entities
        self.regex_patterns = {
            # Phone numbers - Fixed to prevent partial matches
            'PHONENUMBER': r'(?<!\w)(?:\+91[\-\s]?[789]\d{9}|(?:\+91[\-\s]?)?\d{3}[\-\.\s]?\d{3}[\-\.\s]?\d{4}|(?:\d{3}[\-\s]?){2}\d{4})(?!\d)',

            # Email
            'EMAIL': r'(?<!\w)[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}(?!\w)',

            # IP addresses
            'IPV4': r'(?<!\w)(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?!\w)',

            # Credit cards
            'CREDITCARDNUMBER': r'(?<!\w)(?:4\d{12}(?:\d{3})?|5[1-5]\d{14}|6(?:011|5\d{2})\d{12}|3[47]\d{13}|3(?:0[0-5]|[68]\d)\d{11}|(?:2131|1800|35\d{3})\d{11})(?!\w)',

            # PAN (Indian Permanent Account Number)
            'PAN': r'(?<!\w)[A-Z]{5}[0-9]{4}[A-Z](?!\w)',

            # Aadhar (Indian ID)
            'AADHAR': r'(?<!\w)(?:\d{4}\s\d{4}\s\d{4}|\d{12})(?!\d)',

            # Passport
            'PASSPORT': r'(?<!\w)[A-Z]{1,2}\d{7}(?!\w)',

            # URL
            'URL': r'(?<!\w)https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*)(?!\w)',

            # Dates
            'DOB': r'(?<!\w)(?:0[1-9]|[12][0-9]|3[01])[/\-\.](?:0[1-9]|1[0-2])[/\-\.](?:19|20)\d{2}(?!\w)',

            # PINCODE
            'PINCODE': r'(?<!\w)(?:PIN[\s-]*)?\d{6}(?!\d)',

            # Bank account & IBAN
            'ACCOUNTNUMBER': r'(?<!\w)(?:A/C|Account|ACC)(?:ount)?\s*(?:Number|No|#)?[:\s-]*(\d{9,17})(?!\d)',
            'IBAN_CODE': r'(?<!\w)(?:IBAN|International Bank Account Number)?[:\s]*[A-Z]{2}\d{2}[A-Z0-9]{4}[0-9]{7}(?:[0-9]{0,16})(?!\w)',

            # Social Security Number (US)
            'SSN': r'(?<!\w)\d{3}[-\s]?\d{2}[-\s]?\d{4}(?!\w)',

            # Driver's License (simplified)
            'DRIVER_LICENSE': r'(?<!\w)(?:[A-Z]{1,2}-\d{5,8}|\d{7,9}|[A-Z]\d{3}-\d{4}-\d{4}|\d{3}-\d{2}-\d{4})(?!\w)'
        }

        # Medical entity regex patterns - the capture group holds the value only, not the label
        self.medical_regex_patterns = {
            'DOCTORNAME': r'(?:Dr\.?|Doctor)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)',
            'PATIENTID': r'(?:Patient\s+ID|ID|MRN)[\s-]*[:]\s*([A-Z0-9]{5,12})',
            'MEDICALID': r'(?:Medical\s+Record|MRN|Patient\s+ID)[\s-]*[:]\s*([A-Z0-9]{4,15})',
        }

        # Labeled measurements; the capture group holds just the value
        self.measurement_patterns = {
            'HEIGHT': r'(?:Height|Ht)[\s-]*[:]\s*((?:\d{1,2}\'\s*(?:\d{1,2}\")?|\d{3}\s*cm|\d{1,2}\.\d{1,2}\s*m))',
            'WEIGHT': r'(?:Weight|Wt)[\s-]*[:]\s*((?:\d{1,3}(?:\.\d{1,2})?\s*(?:kg|lbs?|pounds?|kilograms?)))',
            'BLOOD_TYPE': r'(?:Blood\s+[Tt]ype|Blood\s+[Gg]roup)[\s-]*[:]\s*((?:A|B|AB|O)[+-])',
        }

        # Standalone measurement patterns (no labels)
        self.standalone_medical_patterns = {
            'HEIGHT_STANDALONE': r'(?<!\w)(?:\d{1,2}\'\s*\d{1,2}\"|\d{1,2}\'\d{1,2}\"|\d{1,2}\'|\d{3}\s*cm|\d{1,2}\.\d{1,2}\s*m)(?!\w)',
            'WEIGHT_STANDALONE': r'(?<!\w)(?:\d{1,3}(?:\.\d{1,2})?\s*(?:kg|lbs?|pounds?|kilograms?))(?!\w)',
            'BLOOD_TYPE_STANDALONE': r'(?<!\w)(?:A|B|AB|O)[+-](?!\w)'
        }

        # Combine all regex patterns
        self.all_regex_patterns = {
            **self.regex_patterns,
            **self.medical_regex_patterns,
            **self.measurement_patterns,
            **self.standalone_medical_patterns
        }

    def regex_detection(self, text: str) -> List[Dict[str, Any]]:
        """
        Detect PII using the configured regex patterns.

        For patterns with a capture group, only the first group (the value,
        not its label) is reported; otherwise the whole match is used.
        Patterns named ``*_STANDALONE`` are reported under the base label.

        Args:
            text: Text to analyze

        Returns:
            List of entity dicts with text, label, start, end, score and
            the original text for context
        """
        entities = []
        suffix = '_STANDALONE'

        for pattern_name, pattern in self.all_regex_patterns.items():
            # Standalone measurement patterns share the label of the labeled ones.
            # A separate variable is used so the loop key is never overwritten.
            if pattern_name.endswith(suffix):
                label = pattern_name[:-len(suffix)]
            else:
                label = pattern_name

            # NOTE(review): IGNORECASE also relaxes uppercase-only classes such as
            # those in PAN and DOCTORNAME - confirm this is intended.
            for match in re.finditer(pattern, text, re.IGNORECASE):
                group = 1 if (match.groups() and match.group(1)) else 0
                entities.append({
                    "text": match.group(group),
                    "label": label,
                    "start": match.start(group),
                    "end": match.end(group),
                    "score": 0.95,  # High confidence for regex matches
                    "_original_text": text  # Store original text for context
                })

        return entities

    def ner_detection(self, text: str, model_type: str = "main") -> List[Dict[str, Any]]:
        """
        Detect PII using NER models

        Args:
            text: Text to analyze
            model_type: Type of model to use ("main", "medical")

        Returns:
            List of entity dicts; empty when the medical model is requested but
            unavailable, when the text is blank, or when the model call fails
        """
        if model_type == "medical" and not self.use_medical_model:
            return []

        if not text or not text.strip():
            return []

        model = self.medical_model if model_type == "medical" else self.main_model

        try:
            results = model(text)

            # Convert to standard format
            entities = []
            for result in results:
                score = result.get('score', 0)
                # Skip low confidence predictions
                if score < 0.5:
                    continue

                # Strip a leading BIO prefix only; the tag body is left untouched
                label = result.get('entity_group', result.get('entity', ''))
                if label[:2] in ('B-', 'I-'):
                    label = label[2:]

                start, end = result['start'], result['end']
                entities.append({
                    # Slice the source text: the pipeline's 'word' may be
                    # tokenizer-normalised and not match the span length.
                    "text": text[start:end],
                    "label": label,
                    "start": start,
                    "end": end,
                    "score": float(score),
                    "_original_text": text  # Store original text for context
                })

            return entities
        except Exception as e:
            # Best effort: a failing model must not break regex-based protection
            print(f"Error with NER detection: {str(e)}")
            return []

    def merge_entities(self, entities: List[Dict[str, Any]], max_gap: int = 5) -> List[Dict[str, Any]]:
        """
        Merge adjacent entities of the same or related types that likely form a single entity

        Args:
            entities: Detected entities (not modified)
            max_gap: Maximum number of characters allowed between two entities
                for them to be merged

        Returns:
            New list of entities sorted by start position
        """
        if not entities:
            return []

        # Related entity groups (entities that could be part of the same larger entity)
        related_types = {
            'NAME': ['FIRSTNAME', 'MIDDLENAME', 'LASTNAME', 'PREFIX'],
            'ADDRESS': ['STREET', 'CITY', 'STATE', 'ZIPCODE', 'BUILDINGNUMBER'],
            'PHONENUMBER': ['PHONENUMBER']  # Only ever merges with itself
        }
        related_types_flat = {
            sub_type: main_type
            for main_type, sub_types in related_types.items()
            for sub_type in sub_types
        }

        def group_of(label):
            # A group name belongs to its own group; sub-types map to their group.
            if label in related_types:
                return label
            return related_types_flat.get(label)

        def are_related(type1, type2):
            if type1 == type2:
                return True
            group1 = group_of(type1)
            return group1 is not None and group1 == group_of(type2)

        merged = []
        # sorted() keeps the caller's list order intact
        for entity in sorted(entities, key=lambda x: x['start']):
            if not merged:
                merged.append(entity.copy())
                continue

            last = merged[-1]
            gap = entity['start'] - last['end']  # negative when the spans overlap

            if not (are_related(entity['label'], last['label']) and gap <= max_gap):
                merged.append(entity.copy())
                continue

            has_context = '_original_text' in entity and '_original_text' in last
            between_text = entity['_original_text'][last['end']:entity['start']] if has_context else ' '

            # Only merge if the gap contains just spaces or very simple punctuation
            if between_text.strip() not in ('', '.', ',', '-', '_'):
                merged.append(entity.copy())
                continue

            # Never shrink the span when the new entity lies inside the previous one
            new_end = max(last['end'], entity['end'])
            if has_context:
                last['text'] = last['_original_text'][last['start']:new_end]
            elif entity['end'] > last['end']:
                last['text'] = last['text'] + between_text + entity['text']
            last['end'] = new_end

            # When merging different entity types, use the broader category
            if entity['label'] != last['label']:
                last['label'] = group_of(last['label'])

            last['score'] = max(last.get('score', 0), entity.get('score', 0))

        return merged

    def remove_overlapping_entities(self, entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Remove overlapping entities by keeping the highest scoring one

        Entities are considered best-first (higher score, then longer span,
        then earlier start) and kept only if they do not overlap an entity
        that was already kept, so the result never contains overlapping spans.

        Args:
            entities: Detected entities (not modified)

        Returns:
            Non-overlapping entities sorted by start position
        """
        if not entities:
            return []

        ranked = sorted(
            entities,
            key=lambda x: (-x.get('score', 0), -(x['end'] - x['start']), x['start'])
        )

        kept = []
        for candidate in ranked:
            if all(candidate['end'] <= other['start'] or candidate['start'] >= other['end']
                   for other in kept):
                kept.append(candidate)

        kept.sort(key=lambda x: x['start'])
        return kept

    def generate_synthetic_value(self, entity_type: str, original_value: Optional[str] = None) -> str:
        """
        Generate realistic synthetic data for PII.

        Args:
            entity_type: Label of the detected entity
            original_value: Detected text; used to decide whether a doctor
                name needs its "Dr." title re-added

        Returns:
            Synthetic replacement, or "[SYNTHETIC_<type>]" for unknown types
            or when generation fails
        """
        try:
            if entity_type in ('PERSON', 'NAME', 'FIRSTNAME', 'LASTNAME'):
                return self.faker.name()

            elif entity_type == 'EMAIL':
                return self.faker.email()

            elif entity_type == 'PHONENUMBER':
                return self.faker.phone_number()

            elif entity_type == 'PAN':
                return self.faker.bothify('?????####?').upper()

            elif entity_type == 'AADHAR':
                return ' '.join([self.faker.numerify('####') for _ in range(3)])

            elif entity_type in ('CREDITCARDNUMBER', 'CREDIT_CARD'):
                return self.faker.credit_card_number()

            elif entity_type in ('ACCOUNTNUMBER', 'IBAN_CODE', 'BANK_NUMBER'):
                return self.faker.bban()

            elif entity_type in ('PASSPORT', 'US_PASSPORT'):
                # One letter + seven digits, so the value matches the PASSPORT pattern
                return self.faker.bothify('?#######').upper()

            elif entity_type in ('DOB', 'DATE_TIME'):
                return self.faker.date_of_birth(minimum_age=18, maximum_age=90).strftime('%d/%m/%Y')

            elif entity_type in ('IPV4', 'IP_ADDRESS'):
                return self.faker.ipv4()

            elif entity_type == 'URL':
                return self.faker.url()

            elif entity_type in ('PINCODE', 'ZIPCODE'):
                return self.faker.postcode()

            elif entity_type == 'ADDRESS':
                # Keep the replacement on one line
                return self.faker.address().replace('\n', ', ')

            elif entity_type in ('CITY', 'LOCATION'):
                return self.faker.city()

            elif entity_type == 'STATE':
                return self.faker.state()

            elif entity_type in ('SSN', 'US_SSN'):
                return self.faker.ssn()

            elif entity_type in ('DRIVER_LICENSE', 'US_DRIVER_LICENSE'):
                return self.faker.bothify('?#######').upper()

            elif entity_type == 'CRYPTO':
                return self.faker.cryptocurrency_code() + self.faker.bothify('??##??##??##??')

            # Medical entity generation
            elif entity_type == 'DOCTORNAME':
                surname = self.faker.last_name()
                # The DOCTORNAME regex captures only the name after the title, so
                # re-adding "Dr." there would yield "Dr. Dr. X" in the output.
                if original_value is None or re.match(
                        r'\s*(?:Dr\.?|Doctor)(?=\s|$)', original_value, re.IGNORECASE):
                    return f"Dr. {surname}"
                return surname

            elif entity_type in ('PATIENTID', 'MEDICALID'):
                return self.faker.bothify('PT#######')

            elif entity_type == 'HEIGHT':
                # Generate a realistic height in feet and inches
                feet = self.faker.random_int(min=4, max=6)
                inches = self.faker.random_int(min=0, max=11)
                return f"{feet}'{inches}\""

            elif entity_type == 'WEIGHT':
                # Generate a realistic weight in kg
                weight = self.faker.random_int(min=45, max=100)
                return f"{weight}kg"

            elif entity_type == 'BLOOD_TYPE':
                blood_groups = ['A+', 'A-', 'B+', 'B-', 'AB+', 'AB-', 'O+', 'O-']
                return self.faker.random_element(blood_groups)

            else:
                # Fallback for unknown types
                return f"[SYNTHETIC_{entity_type}]"

        except Exception as e:
            print(f"Error generating synthetic value: {str(e)}")
            return f"[SYNTHETIC_{entity_type}]"

    def process_text(self, text: str, model_type: str = "main", protection_method: str = "replace") -> Dict[str, Any]:
        """
        Process text to detect and protect PII

        Args:
            text: Input text to process
            model_type: Type of model to use ("main", "medical")
            protection_method: Protection method ("replace", "mask", "synthesize");
                any other value behaves like "replace"

        Returns:
            Dict containing protected text and detected entities
        """
        # Step 1: Get entities from regex
        regex_entities = self.regex_detection(text)

        # Step 2: Get entities from NER model
        ner_entities = self.ner_detection(text, model_type)

        # Step 3: Combine and process entities
        merged_entities = self.merge_entities(regex_entities + ner_entities)
        final_entities = self.remove_overlapping_entities(merged_entities)

        # Step 4: Create protected text based on method
        protected_text = text

        # Replace from the end of the text so earlier offsets stay valid
        for entity in sorted(final_entities, key=lambda x: x['start'], reverse=True):
            if protection_method == "mask":
                # Mask the exact span that is being replaced
                replacement = '*' * (entity['end'] - entity['start'])
            elif protection_method == "synthesize":
                replacement = self.generate_synthetic_value(entity['label'], entity['text'])
            else:  # replace (default)
                replacement = f"[{entity['label']}]"

            protected_text = protected_text[:entity['start']] + replacement + protected_text[entity['end']:]

        # Create findings table
        findings = [
            {
                "index": i,
                "entity_type": entity['label'],
                "text": entity['text'],
                "start": entity['start'],
                "end": entity['end'],
                "confidence": round(entity.get('score', 1.0), 2)
            }
            for i, entity in enumerate(final_entities)
        ]

        return {
            "protected_text": protected_text,
            "entities": final_entities,
            "findings": findings
        }
501
+
502
+
503
+ # Example input text
504
+ example_text = """
505
+ Hi, my name is John Doe and I'm originally from Delhi.
506
+ On 11/10/2024 I visited https://www.google.com and sent an email to abc@gmail.com, from IP 192.168.0.1.
507
+ My phone number: +91-1234321216.
508
+ """
509
+
510
+ medical_example_text = """
511
+ Patient name: John Doe
512
+ Date of Birth: 05/12/1982
513
+ Patient ID: PT789456
514
+ Contact: +91-9876543210
515
+ Dr. Robert Johnson has prescribed medication penicillin on 12/12/2024.
516
+ Blood type: O+, Height: 5'6", Weight: 145kg
517
+ """
518
+
519
+ # Create Gradio Interface
520
# Loaded pipelines keyed by whether the medical model is enabled, so the
# tokenizers and models are loaded once instead of on every request.
_PIPELINE_CACHE = {}


def _get_pipeline(use_medical):
    """Return the cached pipeline for the given mode, creating it on first use."""
    if use_medical not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[use_medical] = EnhancedPiiProtectionPipeline(
            main_model_name="Kashish-jain/pii-protection-model",
            medical_model_name="Kashish-jain/pii-protection-medical",
            use_medical_model=use_medical
        )
    return _PIPELINE_CACHE[use_medical]


def process_input(text, model_type, protection_method):
    """
    Gradio handler: protect PII in `text` and summarise the findings.

    Args:
        text: Input text from the UI
        model_type: "main" or "medical"
        protection_method: "replace", "mask" or "synthesize"

    Returns:
        Tuple of (protected text, findings DataFrame, entity summary string)
    """
    columns = ["#", "Entity type", "Text", "Start", "End", "Confidence"]

    # Nothing to analyse: skip model loading and inference altogether
    if not text or not text.strip():
        return text or "", pd.DataFrame(columns=columns), "No entities detected"

    # Named to avoid shadowing the imported transformers `pipeline` factory
    pii_pipeline = _get_pipeline(model_type == "medical")

    # Process the text
    result = pii_pipeline.process_text(text, model_type, protection_method)
    findings = result["findings"]

    if not findings:
        return result["protected_text"], pd.DataFrame(columns=columns), "No entities detected"

    # Create findings table
    df = pd.DataFrame(findings).rename(columns={
        "index": "#",
        "entity_type": "Entity type",
        "text": "Text",
        "start": "Start",
        "end": "End",
        "confidence": "Confidence"
    })

    # Count detected entities by type
    entity_counts = df["Entity type"].value_counts().to_dict()
    entity_summary = ", ".join([f"{count} {entity}" for entity, count in entity_counts.items()])

    return result["protected_text"], df, entity_summary
557
+
558
+ # Update input text based on model type
559
def update_input_text(model_type):
    """Return the sample input matching the selected model type."""
    # The medical sample is shown only for the "medical" choice;
    # every other value falls back to the general example.
    return medical_example_text if model_type == "medical" else example_text
564
+
565
+ # Custom CSS for a minimalistic, clean design
566
+ custom_css = """
567
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Playfair+Display:wght@400;700&display=swap');
568
+
569
+ :root {
570
+ --font-sans: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
571
+ --font-serif: 'Playfair Display', Georgia, Cambria, 'Times New Roman', Times, serif;
572
+
573
+ --color-primary: #2563eb;
574
+ --color-primary-light: #3b82f6;
575
+ --color-primary-dark: #1d4ed8;
576
+
577
+ --color-secondary: #64748b;
578
+ --color-secondary-light: #94a3b8;
579
+
580
+ --color-background: #00000f;
581
+ --color-surface: #f8fafc;
582
+ --color-border: #e2e8f0;
583
+
584
+ --color-text: #1e293b;
585
+ --color-text-light: #64748b;
586
+
587
+ --color-success: #10b981;
588
+ --color-warning: #f59e0b;
589
+ --color-error: #ef4444;
590
+
591
+ --shadow-sm: 0 1px 2px 0 rgba(0, 0, 0, 0.05);
592
+ --shadow: 0 1px 3px 0 rgba(0, 0, 0, 0.1), 0 1px 2px 0 rgba(0, 0, 0, 0.06);
593
+ --shadow-md: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
594
+ --shadow-lg: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);
595
+
596
+ --radius-sm: 0.25rem;
597
+ --radius: 0.375rem;
598
+ --radius-md: 0.5rem;
599
+ --radius-lg: 0.75rem;
600
+
601
+ --spacing-1: 0.25rem;
602
+ --spacing-2: 0.5rem;
603
+ --spacing-3: 0.75rem;
604
+ --spacing-4: 1rem;
605
+ --spacing-6: 1.5rem;
606
+ --spacing-8: 2rem;
607
+ --spacing-12: 3rem;
608
+ }
609
+
610
+ body, .gradio-container {
611
+ font-family: var(--font-sans);
612
+ color: var(--color-text);
613
+ background-color: var(--color-background);
614
+ line-height: 1.5;
615
+ }
616
+
617
+ /* Typography */
618
+ h1, h2, h3 {
619
+ font-family: var(--font-serif);
620
+ font-weight: 700;
621
+ line-height: 1.2;
622
+ margin-bottom: var(--spacing-4);
623
+ }
624
+
625
+ h1 {
626
+ font-size: 2.25rem;
627
+ color: var(--color-text-light);
628
+ }
629
+
630
+ h2 {
631
+ font-size: 1.5rem;
632
+ color: var(--color-text);
633
+ }
634
+
635
+ h3 {
636
+ font-size: 1.25rem;
637
+ color: var(--color-text);
638
+ }
639
+
640
+ p {
641
+ margin-bottom: var(--spacing-4);
642
+ }
643
+
644
+ /* Layout Components */
645
+ .container {
646
+ max-width: 1500px;
647
+ margin: 0 auto;
648
+ padding: var(--spacing-6);
649
+ }
650
+
651
+ .card {
652
+ background-color: var(--color-surface);
653
+ border-radius: var(--radius);
654
+ box-shadow: var(--shadow);
655
+ padding: var(--spacing-6);
656
+ margin-bottom: var(--spacing-6);
657
+ border: 1px solid var(--color-border);
658
+ }
659
+
660
+ /* Form Elements */
661
+ .gradio-button.primary {
662
+ background-color: var(--color-secondary-light);
663
+ color: white;
664
+ font-weight: 500;
665
+ border-radius: var(--radius);
666
+ padding: var(--spacing-3) var(--spacing-6);
667
+ transition: all 0.2s ease;
668
+ border: none;
669
+ box-shadow: var(--shadow);
670
+ }
671
+
672
+ .gradio-button.primary:hover {
673
+ background-color: var(--color-secondary);
674
+ box-shadow: var(--shadow-md);
675
+ transform: translateY(-1px);
676
+ }
677
+
678
+ .gradio-button.primary:active {
679
+ transform: translateY(0);
680
+ }
681
+
682
+ .gradio-dropdown, .gradio-textbox, .gradio-textarea {
683
+ border-radius: var(--radius);
684
+ border: 1px solid var(--color-border);
685
+ padding: var(--spacing-3);
686
+ background-color: var(--color-background);
687
+ transition: border-color 0.2s ease;
688
+ }
689
+
690
+ .gradio-dropdown:focus, .gradio-textbox:focus, .gradio-textarea:focus {
691
+ border-color: var(--color-primary-light);
692
+ outline: none;
693
+ box-shadow: 0 0 0 3px rgba(37, 99, 235, 0.1);
694
+ }
695
+
696
+ /* Tabs */
697
+ .gradio-tabs {
698
+ margin-bottom: var(--spacing-6);
699
+ }
700
+
701
+ .gradio-tab-button {
702
+ padding: var(--spacing-3) var(--spacing-6);
703
+ font-weight: 500;
704
+ color: var(--color-text-light);
705
+ border-bottom: 2px solid transparent;
706
+ transition: all 0.2s ease;
707
+ }
708
+
709
+ .gradio-tab-button.selected {
710
+ color: var(--color-primary);
711
+ border-bottom-color: var(--color-primary);
712
+ }
713
+
714
+ /* Accordion */
715
+ .gradio-accordion {
716
+ border: 1px solid var(--color-border);
717
+ border-radius: var(--radius);
718
+ margin-bottom: var(--spacing-6);
719
+ overflow: hidden;
720
+ }
721
+
722
+ .gradio-accordion-header {
723
+ padding: var(--spacing-4);
724
+ font-weight: 500;
725
+ background-color: var(--color-surface);
726
+ border-bottom: 1px solid var(--color-border);
727
+ cursor: pointer;
728
+ }
729
+
730
+ .gradio-accordion-content {
731
+ padding: var(--spacing-4);
732
+ background-color: var(--color-background);
733
+ }
734
+
735
+ /* Table */
736
+ table {
737
+ width: 100%;
738
+ border-collapse: collapse;
739
+ margin-bottom: var(--spacing-6);
740
+ }
741
+
742
+ th {
743
+ background-color: var(--color-surface);
744
+ padding: var(--spacing-3) var(--spacing-4);
745
+ text-align: left;
746
+ font-weight: 600;
747
+ color: var(--color-text);
748
+ border-bottom: 2px solid var(--color-border);
749
+ }
750
+
751
+ td {
752
+ padding: var(--spacing-3) var(--spacing-4);
753
+ border-bottom: 1px solid var(--color-border);
754
+ }
755
+
756
+ /* Dark mode support */
757
+ @media (prefers-color-scheme: dark) {
758
+ :root {
759
+ --color-background: #0f172a;
760
+ --color-surface: #1e293b;
761
+ --color-border: #334155;
762
+ --color-text: #f8fafc;
763
+ --color-text-light: #cbd5e1;
764
+ }
765
+ }
766
+
767
+ /* Custom components */
768
+ .entity-badge {
769
+ display: inline-block;
770
+ padding: 0.25rem 0.5rem;
771
+ border-radius: 9999px;
772
+ font-size: 0.75rem;
773
+ font-weight: 500;
774
+ background-color: var(--color-primary-light);
775
+ color: white;
776
+ margin-right: 0.5rem;
777
+ margin-bottom: 0.5rem;
778
+ }
779
+
780
+ .summary-container {
781
+ background-color: var(--color-surface);
782
+ border-radius: var(--radius);
783
+ padding: var(--spacing-4);
784
+ margin-bottom: var(--spacing-6);
785
+ border: 1px solid var(--color-border);
786
+ }
787
+
788
+ .icon-text {
789
+ display: flex;
790
+ align-items: center;
791
+ gap: var(--spacing-2);
792
+ }
793
+
794
+ .icon-text svg {
795
+ width: 1.25rem;
796
+ height: 1.25rem;
797
+ color: var(--color-primary);
798
+ }
799
+
800
+ /* Responsive adjustments */
801
+ @media (max-width: 768px) {
802
+ .container {
803
+ padding: var(--spacing-4);
804
+ }
805
+
806
+ h1 {
807
+ font-size: 1.75rem;
808
+ }
809
+
810
+ .card {
811
+ padding: var(--spacing-4);
812
+ }
813
+ }
814
+ """
815
+
816
+ # Create the Gradio interface with enhanced styling
817
+ with gr.Blocks(css=custom_css, theme=gr.themes.Base()) as demo:
818
+ # Header section
819
+ with gr.Column(elem_classes="container"):
820
+ gr.Markdown("""
821
+ # 🛡️ PII Protection Tool
822
+
823
+ Detect, protect and de-identify personally identifiable information.
824
+ """)
825
+
826
+ # Main content area
827
+ with gr.Column(elem_classes="card"):
828
+ # Configuration section
829
+ with gr.Row():
830
+ with gr.Column(scale=1):
831
+ model_dropdown = gr.Dropdown(
832
+ choices=[
833
+ ("General Purpose", "main"),
834
+ ("Medical Context", "medical")
835
+ ],
836
+ value="main",
837
+ label="Model Type",
838
+ elem_classes="form-control"
839
+ )
840
+
841
+ with gr.Column(scale=1):
842
+ protection_dropdown = gr.Dropdown(
843
+ choices=[
844
+ ("Replace with Tags", "replace"),
845
+ ("Mask with Asterisks", "mask"),
846
+ ("Generate Synthetic Data", "synthesize")
847
+ ],
848
+ value="replace",
849
+ label="Protection Method",
850
+ elem_classes="form-control"
851
+ )
852
+
853
+ # Divider
854
+ gr.Markdown("---")
855
+
856
+ # Input/Output section
857
+ with gr.Row():
858
+ # Input column
859
+ with gr.Column():
860
+ gr.Markdown("### Input Text")
861
+ input_text = gr.TextArea(
862
+ label="",
863
+ value=example_text,
864
+ lines=10,
865
+ elem_classes="text-input"
866
+ )
867
+
868
+ # Output column
869
+ with gr.Column():
870
+ gr.Markdown("### Protected Output")
871
+ output_text = gr.TextArea(
872
+ label="",
873
+ lines=10,
874
+ elem_classes="text-output"
875
+ )
876
+
877
+ # Summary section
878
+ with gr.Column(elem_classes="summary-container"):
879
+ gr.Markdown("### Entity Summary")
880
+ entity_summary = gr.Textbox(
881
+ label="",
882
+ interactive=False,
883
+ elem_classes="entity-summary"
884
+ )
885
+
886
+ # Action button
887
+ submit_btn = gr.Button(
888
+ "Process Text",
889
+ variant="primary",
890
+ elem_classes="submit-button"
891
+ )
892
+
893
+ # Findings section
894
+ with gr.Column(elem_classes="card"):
895
+ gr.Markdown("### Detected Entities")
896
+ findings_table = gr.DataFrame(
897
+ headers=["#", "Entity type", "Text", "Start", "End", "Confidence"],
898
+ elem_classes="findings-table"
899
+ )
900
+
901
+ # Help section
902
+ with gr.Accordion("Help & Information", open=False, elem_classes="help-accordion"):
903
+ gr.Markdown("""
904
+ #### De-identification Methods
905
+
906
+ - **Replace with Tags**: Replaces each detected entity with its entity type tag (e.g., [NAME])
907
+ - **Mask with Asterisks**: Replaces each detected entity with asterisks (*)
908
+ - **Generate Synthetic Data**: Replaces each detected entity with realistic synthetic data
909
+
910
+ #### Model Types
911
+
912
+ - **General Purpose**: Optimized for common PII elements
913
+ - **Medical Context**: Enhanced detection for healthcare-related PII
914
+
915
+ #### Entity Types Detected
916
+
917
+ - **Personal**: NAME, EMAIL, PHONENUMBER, DOB
918
+ - **Financial**: CREDITCARDNUMBER, ACCOUNTNUMBER, PAN, IBAN_CODE, SSN
919
+ - **Location**: ADDRESS, CITY, STATE, PINCODE, IPV4
920
+ - **Medical**: DOCTORNAME, PATIENTID, MEDICALID
921
+ - **Other**: URL, PASSPORT, DRIVER_LICENSE
922
+ """)
923
+
924
+ # Set up event handlers
925
+ submit_btn.click(
926
+ fn=process_input,
927
+ inputs=[input_text, model_dropdown, protection_dropdown],
928
+ outputs=[output_text, findings_table, entity_summary]
929
+ )
930
+
931
+ model_dropdown.change(
932
+ fn=update_input_text,
933
+ inputs=[model_dropdown],
934
+ outputs=[input_text]
935
+ )
936
+
937
+ # Launch the app
938
+ if __name__ == "__main__":
939
+ demo.launch()
requirements (8).txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ transformers>=4.30.0
3
+ torch>=2.0.0
4
+ faker>=18.4.0
5
+ pandas>=2.0.0