darwinkernelpanic committed on
Commit
ad750fd
·
verified ·
1 Parent(s): a68c636

Upload pii_extension.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. pii_extension.py +115 -12
pii_extension.py CHANGED
@@ -18,6 +18,93 @@ class PIILabel(Enum):
18
  SOCIAL_MEDIA = "social_media"
19
  URL = "url"
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  class PIIDetector:
22
  """Detect PII in text with context awareness"""
23
 
@@ -168,6 +255,7 @@ class PIIDetector:
168
  def scan(self, text: str, age: int) -> Dict:
169
  """
170
  Full PII scan with age-appropriate rules
 
171
 
172
  Returns:
173
  {
@@ -177,45 +265,53 @@ class PIIDetector:
177
  "social_media_allowed": bool,
178
  "grooming_risk": float,
179
  "action": "allow" | "block" | "flag",
180
- "reason": str
 
 
181
  }
182
  """
 
 
 
 
 
 
183
  pii_found = []
184
  pii_types = set()
185
 
186
- # Detect various PII types
187
- emails = self.detect_emails(text)
188
  if emails:
189
  pii_types.add(PIILabel.EMAIL)
190
  for email, start, end in emails:
191
  pii_found.append({"type": "email", "value": email, "start": start, "end": end})
192
 
193
- phones = self.detect_phones(text)
194
  if phones:
195
  pii_types.add(PIILabel.PHONE)
196
  for phone, start, end in phones:
197
  pii_found.append({"type": "phone", "value": phone, "start": start, "end": end})
198
 
199
- addresses = self.detect_addresses(text)
200
  if addresses:
201
  pii_types.add(PIILabel.ADDRESS)
202
  for addr, start, end in addresses:
203
  pii_found.append({"type": "address", "value": addr, "start": start, "end": end})
204
 
205
- credit_cards = self.detect_credit_cards(text)
206
  if credit_cards:
207
  pii_types.add(PIILabel.CREDIT_CARD)
208
  for cc, start, end in credit_cards:
209
  pii_found.append({"type": "credit_card", "value": cc, "start": start, "end": end})
210
 
211
- ssns = self.detect_ssn(text)
212
  if ssns:
213
  pii_types.add(PIILabel.SSN)
214
  for ssn, start, end in ssns:
215
  pii_found.append({"type": "ssn", "value": ssn, "start": start, "end": end})
216
 
217
- # Social media detection
218
- social_links = self.detect_social_media(text)
219
  has_social_media = len(social_links) > 0
220
 
221
  if has_social_media:
@@ -234,8 +330,8 @@ class PIIDetector:
234
  action = "block"
235
  reason = f"PII detected: {', '.join([p.value for p in critical_pii])}"
236
  elif has_social_media:
237
- # Social media rules
238
- is_grooming, grooming_risk, grooming_keywords = self.detect_grooming_context(text)
239
 
240
  if age < 13:
241
  # Under 13: Block ALL social media sharing
@@ -261,6 +357,10 @@ class PIIDetector:
261
  elif grooming_risk > 0:
262
  social_media_allowed = False
263
 
 
 
 
 
264
  return {
265
  "has_pii": len(pii_types) > 0,
266
  "pii_types": [p.value for p in pii_types],
@@ -270,7 +370,10 @@ class PIIDetector:
270
  "grooming_keywords": grooming_keywords,
271
  "action": action,
272
  "reason": reason,
273
- "age": age
 
 
 
274
  }
275
 
276
 
 
18
  SOCIAL_MEDIA = "social_media"
19
  URL = "url"
20
 
21
class UnicodeDeobfuscator:
    """Detect and normalize unicode obfuscation attempts.

    Lookalike characters (circled letters, letterlike / double-struck
    symbols, fullwidth forms, mathematical alphanumerics) are reported as
    suspicious and replaced with their plain counterpart.  Every
    replacement is exactly one character long, so the normalized text has
    the same length as the input and character offsets stay aligned.
    """

    # Unicode ranges for suspicious characters (upper bound is exclusive)
    CIRCLED_LETTERS = range(0x24B6, 0x24EA)  # circled Latin A-Z and a-z
    MATHEMATICAL_CHARS = range(0x1D400, 0x1D800)  # mathematical alphanumerics
    FULLWIDTH_CHARS = range(0xFF01, 0xFF5F)  # fullwidth '!' through '~'
    DOUBLE_STRUCK = range(0x2100, 0x2150)  # letterlike symbols block
    BOX_DRAWING = range(0x2500, 0x2580)  # box drawing
    BLOCK_ELEMENTS = range(0x2580, 0x25A0)  # block elements

    # Distance between a fullwidth form and its ASCII counterpart
    _FULLWIDTH_OFFSET = 0xFEE0

    # Mapping of circled letters to normal letters.  Built from code points
    # rather than literal glyphs so the table cannot be damaged by a
    # mis-encoded source file.
    CIRCLED_MAP = {
        # Uppercase: U+24B6 .. U+24CF
        **{chr(0x24B6 + i): chr(ord("A") + i) for i in range(26)},
        # Lowercase: U+24D0 .. U+24E9
        **{chr(0x24D0 + i): chr(ord("a") + i) for i in range(26)},
    }

    @classmethod
    def detect_obfuscation(cls, text: str) -> Tuple[bool, List[Tuple[str, str]], str]:
        """
        Detect unicode obfuscation.

        Args:
            text: The text to inspect.

        Returns:
            (is_obfuscated, [(char, type)], normalized_text) where ``type``
            is one of 'circled', 'double-struck', 'fullwidth' or
            'mathematical', listed in order of appearance.
        """
        # Imported locally so the class does not depend on the file's
        # top-level import block.
        import unicodedata

        def fold(char: str) -> str:
            # NFKC maps styled letters/digits to their plain form.  Symbols
            # that would expand to several characters are left untouched so
            # the output keeps the input's length.
            folded = unicodedata.normalize("NFKC", char)
            return folded if len(folded) == 1 else char

        suspicious = []
        normalized = []

        for char in text:
            code = ord(char)

            if char in cls.CIRCLED_MAP:
                suspicious.append((char, 'circled'))
                normalized.append(cls.CIRCLED_MAP[char])
            elif code in cls.DOUBLE_STRUCK:
                suspicious.append((char, 'double-struck'))
                normalized.append(fold(char))
            elif code in cls.FULLWIDTH_CHARS:
                suspicious.append((char, 'fullwidth'))
                normalized.append(chr(code - cls._FULLWIDTH_OFFSET))
            elif code in cls.MATHEMATICAL_CHARS:
                # Previously flagged but never normalized, which let
                # obfuscated text slip past downstream detection.
                suspicious.append((char, 'mathematical'))
                normalized.append(fold(char))
            else:
                normalized.append(char)

        return bool(suspicious), suspicious, ''.join(normalized)

    @classmethod
    def normalize(cls, text: str) -> str:
        """Quick normalize without detection details."""
        _, _, normalized = cls.detect_obfuscation(text)
        return normalized
106
+
107
+
108
  class PIIDetector:
109
  """Detect PII in text with context awareness"""
110
 
 
255
  def scan(self, text: str, age: int) -> Dict:
256
  """
257
  Full PII scan with age-appropriate rules
258
+ Also detects unicode obfuscation
259
 
260
  Returns:
261
  {
 
265
  "social_media_allowed": bool,
266
  "grooming_risk": float,
267
  "action": "allow" | "block" | "flag",
268
+ "reason": str,
269
+ "obfuscation_detected": bool,
270
+ "normalized_text": str
271
  }
272
  """
273
+ # Step 0: Detect unicode obfuscation
274
+ is_obfuscated, suspicious_chars, normalized_text = UnicodeDeobfuscator.detect_obfuscation(text)
275
+
276
+ # Use normalized text for detection if obfuscated
277
+ detection_text = normalized_text if is_obfuscated else text
278
+
279
  pii_found = []
280
  pii_types = set()
281
 
282
+ # Detect various PII types (using normalized text if obfuscated)
283
+ emails = self.detect_emails(detection_text)
284
  if emails:
285
  pii_types.add(PIILabel.EMAIL)
286
  for email, start, end in emails:
287
  pii_found.append({"type": "email", "value": email, "start": start, "end": end})
288
 
289
+ phones = self.detect_phones(detection_text)
290
  if phones:
291
  pii_types.add(PIILabel.PHONE)
292
  for phone, start, end in phones:
293
  pii_found.append({"type": "phone", "value": phone, "start": start, "end": end})
294
 
295
+ addresses = self.detect_addresses(detection_text)
296
  if addresses:
297
  pii_types.add(PIILabel.ADDRESS)
298
  for addr, start, end in addresses:
299
  pii_found.append({"type": "address", "value": addr, "start": start, "end": end})
300
 
301
+ credit_cards = self.detect_credit_cards(detection_text)
302
  if credit_cards:
303
  pii_types.add(PIILabel.CREDIT_CARD)
304
  for cc, start, end in credit_cards:
305
  pii_found.append({"type": "credit_card", "value": cc, "start": start, "end": end})
306
 
307
+ ssns = self.detect_ssn(detection_text)
308
  if ssns:
309
  pii_types.add(PIILabel.SSN)
310
  for ssn, start, end in ssns:
311
  pii_found.append({"type": "ssn", "value": ssn, "start": start, "end": end})
312
 
313
+ # Social media detection (also on normalized text)
314
+ social_links = self.detect_social_media(detection_text)
315
  has_social_media = len(social_links) > 0
316
 
317
  if has_social_media:
 
330
  action = "block"
331
  reason = f"PII detected: {', '.join([p.value for p in critical_pii])}"
332
  elif has_social_media:
333
+ # Social media rules (use normalized text for grooming detection)
334
+ is_grooming, grooming_risk, grooming_keywords = self.detect_grooming_context(detection_text)
335
 
336
  if age < 13:
337
  # Under 13: Block ALL social media sharing
 
357
  elif grooming_risk > 0:
358
  social_media_allowed = False
359
 
360
+ # Add obfuscation info to reason if detected
361
+ if is_obfuscated and action == "allow":
362
+ reason = f"Unicode obfuscation detected and normalized. {reason}"
363
+
364
  return {
365
  "has_pii": len(pii_types) > 0,
366
  "pii_types": [p.value for p in pii_types],
 
370
  "grooming_keywords": grooming_keywords,
371
  "action": action,
372
  "reason": reason,
373
+ "age": age,
374
+ "obfuscation_detected": is_obfuscated,
375
+ "obfuscation_chars": [(c, t) for c, t in suspicious_chars] if is_obfuscated else [],
376
+ "normalized_text": normalized_text if is_obfuscated else text
377
  }
378
 
379