jentegeo
/

jennifer_email_classifier

Model card Files Files and versions

jennifer_email_classifier / utils.py

jentegeo's picture

Upload 8 files

e1992da verified 10 months ago

history blame contribute delete

5.08 kB

	import re
	from typing import List, Dict, Tuple
	from datetime import datetime

	class PIIDetector:
	    """
	    Detect and mask Personally Identifiable Information (PII) in text.

	    Detection is purely regex based. Each entry in ``self.patterns`` maps an
	    entity type name to a compiled pattern; credit/debit card candidates are
	    additionally checked with the Luhn algorithm and date-of-birth candidates
	    must parse as a real calendar date.
	    """

	    def __init__(self):
	        # One compiled pattern per PII type. Alternation uses a plain ``|``:
	        # inside a raw-string regex ``\|`` is a literal pipe character and
	        # would stop the date / expiry patterns from ever matching.
	        self.patterns = {
	            "full_name": re.compile(r'\b([A-Z][a-z]+(\s[A-Z][a-z]+)+)\b'),
	            "email": re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'),
	            "phone_number": re.compile(r'(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b'),
	            "dob": re.compile(r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2}, \d{4})\b'),
	            "aadhar_num": re.compile(r'\b\d{4}[ -]?\d{4}[ -]?\d{4}\b'),
	            "credit_debit_no": re.compile(r'\b(?:\d[ -]*?){13,16}\b'),
	            "cvv_no": re.compile(r'\b\d{3,4}\b'),
	            "expiry_no": re.compile(r'\b(0[1-9]|1[0-2])[-/]\d{2}\b')
	        }

	    def detect_pii(self, text: str) -> List[Dict]:
	        """
	        Detect all PII entities in the given text.

	        Every pattern is applied independently, so the returned entities may
	        overlap (for example a 4-digit year inside a date also matches
	        ``cvv_no``).

	        Args:
	            text: Input text to scan for PII

	        Returns:
	            List of dictionaries with keys ``position`` ([start, end]),
	            ``classification`` (entity type) and ``entity`` (matched text),
	            sorted by start position. Empty list for empty/None input.
	        """
	        if not text:
	            return []

	        entities = []

	        for entity_type, pattern in self.patterns.items():
	            for match in pattern.finditer(text):
	                start, end = match.span()
	                entity_value = match.group()

	                # Additional validation for specific entity types
	                if entity_type == "credit_debit_no" and not self._validate_luhn(entity_value):
	                    continue
	                if entity_type == "dob" and not self._validate_date(entity_value):
	                    continue

	                entities.append({
	                    "position": [start, end],
	                    "classification": entity_type,
	                    "entity": entity_value
	                })

	        # Sort entities by start position (stable: ties keep pattern order)
	        entities.sort(key=lambda x: x["position"][0])
	        return entities

	    def mask_pii(self, text: str, entities: List[Dict]) -> Tuple[str, List[Dict]]:
	        """
	        Mask detected PII entities in the text.

	        Each masked span is replaced by ``[<classification>]``. Entities are
	        processed left to right; when several entities overlap, the one that
	        starts first (longest first on ties) is masked and the others are
	        skipped, because replacing overlapping spans would corrupt the text.

	        Args:
	            text: Original text containing PII
	            entities: List of detected PII entities (any order)

	        Returns:
	            Tuple of (masked_text, list_of_masked_entities). The list holds
	            only the entities that were actually masked, with positions
	            referring to the original text.
	        """
	        # Order by start; for equal starts prefer the span that ends last.
	        ordered = sorted(
	            entities,
	            key=lambda e: (e["position"][0], -e["position"][1]),
	        )

	        pieces = []
	        masked_entities = []
	        cursor = 0  # index in the original text up to which output is built

	        for entity in ordered:
	            start, end = entity["position"]

	            # Span begins inside text that is already masked: skip it.
	            if start < cursor:
	                continue

	            entity_type = entity["classification"]
	            pieces.append(text[cursor:start])
	            pieces.append(f"[{entity_type}]")
	            cursor = end

	            # Store masked entity info
	            masked_entities.append({
	                "position": [start, end],
	                "classification": entity_type,
	                "entity": entity["entity"]
	            })

	        pieces.append(text[cursor:])
	        return "".join(pieces), masked_entities

	    def _validate_luhn(self, card_number: str) -> bool:
	        """
	        Validate a credit/debit card number using the Luhn algorithm.

	        Args:
	            card_number: Candidate number; non-digit characters are ignored.

	        Returns:
	            True if the number has 13-19 digits and its last digit is the
	            correct Luhn check digit, False otherwise.
	        """
	        # Remove non-digit characters (spaces, dashes)
	        digits_only = re.sub(r'[^0-9]', '', card_number)

	        if not 13 <= len(digits_only) <= 19:
	            return False

	        digits = [int(ch) for ch in digits_only]
	        check_digit = digits[-1]
	        total = 0

	        # Walk the payload from the right: the digit next to the check digit
	        # is doubled, then every second digit. Counting from the left instead
	        # is only correct for even-length numbers and rejects e.g. 15-digit
	        # cards.
	        for i, digit in enumerate(reversed(digits[:-1])):
	            if i % 2 == 0:
	                digit *= 2
	                if digit > 9:
	                    digit -= 9
	            total += digit

	        return (total * 9) % 10 == check_digit

	    def _validate_date(self, date_str: str) -> bool:
	        """
	        Check that a date-of-birth candidate is a real calendar date.

	        Four-digit-year formats are tried first; two-digit-year variants
	        follow because the ``dob`` pattern also accepts two-digit years.

	        Args:
	            date_str: Text matched by the ``dob`` pattern.

	        Returns:
	            True if the string parses under any supported format.
	        """
	        formats = (
	            '%m/%d/%Y', '%m-%d-%Y', '%d/%m/%Y', '%d-%m-%Y',
	            '%b %d, %Y', '%B %d, %Y',
	            '%m/%d/%y', '%m-%d-%y', '%d/%m/%y', '%d-%m-%y',
	        )
	        for fmt in formats:
	            try:
	                datetime.strptime(date_str, fmt)
	            except (TypeError, ValueError):
	                # Wrong format, impossible date, or non-string input
	                continue
	            return True
	        return False