Upload pii_extension.py with huggingface_hub
Browse files
- pii_extension.py +29 -25
pii_extension.py
CHANGED
|
@@ -37,14 +37,15 @@ class PIIDetector:
|
|
| 37 |
re.compile(r'\b\d{7,10}\b'), # Plain digits 7-10 chars
|
| 38 |
]
|
| 39 |
|
| 40 |
-
# Address patterns (
|
| 41 |
self.address_patterns = [
|
| 42 |
-
re.compile(r'\b\d+\s+[A-Za-z]
|
| 43 |
re.compile(r'\b(?:PO|P\.O\.)\s*Box\s*\d+\b', re.IGNORECASE),
|
|
|
|
| 44 |
]
|
| 45 |
|
| 46 |
-
# Credit card (
|
| 47 |
-
self.cc_pattern = re.compile(r'\b(?:\d{4}[-\s]?){3}\d{4}\b')
|
| 48 |
|
| 49 |
# SSN (US Social Security Number)
|
| 50 |
self.ssn_pattern = re.compile(r'\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b')
|
|
@@ -225,38 +226,41 @@ class PIIDetector:
|
|
| 225 |
# Check grooming context for social media
|
| 226 |
grooming_risk = 0.0
|
| 227 |
grooming_keywords = []
|
| 228 |
-
social_media_allowed = True
|
| 229 |
|
| 230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
is_grooming, grooming_risk, grooming_keywords = self.detect_grooming_context(text)
|
| 232 |
|
| 233 |
-
# Rules by age
|
| 234 |
if age < 13:
|
| 235 |
# Under 13: Block ALL social media sharing
|
| 236 |
-
social_media_allowed = False
|
| 237 |
action = "block"
|
| 238 |
reason = "Social media sharing not permitted under 13"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
else:
|
| 240 |
-
# 13+: Allow
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
reason = f"Potential grooming detected (risk: {grooming_risk:.0%})"
|
| 245 |
-
else:
|
| 246 |
-
social_media_allowed = True
|
| 247 |
-
action = "allow"
|
| 248 |
-
reason = "Social media permitted for 13+ (no grooming signals)"
|
| 249 |
-
|
| 250 |
-
# Check other PII (blocked for all ages)
|
| 251 |
-
critical_pii = pii_types.intersection({PIILabel.EMAIL, PIILabel.PHONE, PIILabel.ADDRESS, PIILabel.CREDIT_CARD, PIILabel.SSN})
|
| 252 |
-
|
| 253 |
-
if critical_pii:
|
| 254 |
-
action = "block"
|
| 255 |
-
reason = f"PII detected: {', '.join([p.value for p in critical_pii])}"
|
| 256 |
-
elif not has_social_media and not pii_types:
|
| 257 |
action = "allow"
|
| 258 |
reason = "No PII detected"
|
| 259 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
return {
|
| 261 |
"has_pii": len(pii_types) > 0,
|
| 262 |
"pii_types": [p.value for p in pii_types],
|
|
|
|
| 37 |
re.compile(r'\b\d{7,10}\b'), # Plain digits 7-10 chars
|
| 38 |
]
|
| 39 |
|
| 40 |
+
# Address patterns (enhanced street address detection)
|
| 41 |
self.address_patterns = [
|
| 42 |
+
re.compile(r'\b\d+\s+\d*[A-Za-z]+(?:\s+[A-Za-z]+)?\s+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Way|Place|Pl|Circle|Cir|Trail|Trl|Parkway|Pkwy)\b', re.IGNORECASE),
|
| 43 |
re.compile(r'\b(?:PO|P\.O\.)\s*Box\s*\d+\b', re.IGNORECASE),
|
| 44 |
+
re.compile(r'\b\d+\s+[A-Za-z]+\s+(?:Street|St|Ave|Road|Rd)\b', re.IGNORECASE),
|
| 45 |
]
|
| 46 |
|
| 47 |
+
# Credit card (enhanced pattern)
|
| 48 |
+
self.cc_pattern = re.compile(r'\b(?:\d{4}[-\s]?){3}\d{4}\b|\b\d{16}\b')
|
| 49 |
|
| 50 |
# SSN (US Social Security Number)
|
| 51 |
self.ssn_pattern = re.compile(r'\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b')
|
|
|
|
| 226 |
# Check grooming context for social media
|
| 227 |
grooming_risk = 0.0
|
| 228 |
grooming_keywords = []
|
|
|
|
| 229 |
|
| 230 |
+
# Check other PII first (blocked for all ages)
|
| 231 |
+
critical_pii = pii_types.intersection({PIILabel.EMAIL, PIILabel.PHONE, PIILabel.ADDRESS, PIILabel.CREDIT_CARD, PIILabel.SSN})
|
| 232 |
+
|
| 233 |
+
if critical_pii:
|
| 234 |
+
action = "block"
|
| 235 |
+
reason = f"PII detected: {', '.join([p.value for p in critical_pii])}"
|
| 236 |
+
elif has_social_media:
|
| 237 |
+
# Social media rules
|
| 238 |
is_grooming, grooming_risk, grooming_keywords = self.detect_grooming_context(text)
|
| 239 |
|
|
|
|
| 240 |
if age < 13:
|
| 241 |
# Under 13: Block ALL social media sharing
|
|
|
|
| 242 |
action = "block"
|
| 243 |
reason = "Social media sharing not permitted under 13"
|
| 244 |
+
elif is_grooming:
|
| 245 |
+
# 13+: Block if grooming detected
|
| 246 |
+
action = "block"
|
| 247 |
+
reason = f"Potential grooming detected (risk: {grooming_risk:.0%})"
|
| 248 |
else:
|
| 249 |
+
# 13+: Allow social media, no grooming
|
| 250 |
+
action = "allow"
|
| 251 |
+
reason = "Social media permitted for 13+ (no grooming signals)"
|
| 252 |
+
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
action = "allow"
|
| 254 |
reason = "No PII detected"
|
| 255 |
|
| 256 |
+
# Determine if social media is allowed for return value
|
| 257 |
+
social_media_allowed = True
|
| 258 |
+
if has_social_media:
|
| 259 |
+
if age < 13:
|
| 260 |
+
social_media_allowed = False
|
| 261 |
+
elif grooming_risk > 0:
|
| 262 |
+
social_media_allowed = False
|
| 263 |
+
|
| 264 |
return {
|
| 265 |
"has_pii": len(pii_types) > 0,
|
| 266 |
"pii_types": [p.value for p in pii_types],
|