Spaces:
Sleeping
Sleeping
| import re | |
| from typing import Dict, List, Tuple | |
def mask_full_name(text: str, ner_pipeline) -> Tuple[str, List[Dict]]:
    """
    Replace every person name reported by the NER pipeline with ``[full_name]``.

    Args:
        text (str): Input text
        ner_pipeline: Callable invoked as ``ner_pipeline(text)``; each item it
            returns is read via the keys ``'start'``, ``'end'`` and
            ``'entity_group'``

    Returns:
        Tuple[str, List[Dict]]: The masked text and one record per masked name
        (``position``, ``classification``, ``entity``). Records are listed from
        the last name in the text to the first, and ``position`` holds offsets
        into the text as it was passed in.
    """
    person_labels = ['PER', 'Person', 'full_name']
    found = []

    # Walk the detections from the end of the text towards the beginning so
    # that a replacement never shifts the offsets of detections still pending.
    right_to_left = sorted(ner_pipeline(text), key=lambda item: item['start'], reverse=True)
    for detection in right_to_left:
        if detection['entity_group'] not in person_labels:
            continue

        begin = detection['start']
        finish = detection['end']
        found.append({
            "position": [begin, finish],
            "classification": "full_name",
            "entity": text[begin:finish]
        })
        text = text[:begin] + '[full_name]' + text[finish:]

    return text, found
# Ordered masking rules: (classification label, compiled pattern).
# The rules are applied one after another to the progressively masked text, so
# the order is significant: the long digit patterns (phone, date, card,
# Aadhaar) must claim their digits before the short ones (CVV, expiry) run.
_REGEX_RULES = (
    # Email address. The domain part accepts hyphens and any number of
    # dot-separated labels so that "a@mail.example.co.in" is masked in full;
    # the local part accepts "+" so that "user+tag@x.com" is not split.
    ("email", re.compile(r'\b[\w.+-]+@[\w-]+(?:\.[\w-]+)+\b')),
    # Phone number: 10 digits, optionally prefixed by "+91" or "091".
    # The "+91" alternative carries no leading \b because \b cannot match
    # between whitespace (or the start of the text) and "+".
    ("phone_number", re.compile(r'(?:\+91[\s.-]?|\b(?:091[\s.-]?)?)\d{10}(?!\d)\b')),
    # Date of birth: dd-mm-yyyy / dd/mm/yyyy or yyyy-mm-dd / yyyy/mm/dd.
    ("dob", re.compile(r'\b\d{2}[-/]\d{2}[-/]\d{4}\b|\b\d{4}[-/]\d{2}[-/]\d{2}\b')),
    # Credit/debit card number: 13-19 digits, optionally separated by
    # spaces or hyphens.
    ("credit_debit_no", re.compile(r'\b(?:\d[ -]*?){13,19}\b')),
    # Aadhar number: 12 digits in groups of four.
    ("aadhar_num", re.compile(r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}\b')),
    # CVV number. NOTE: this matches any standalone three-digit number.
    ("cvv_no", re.compile(r'\b\d{3}\b')),
    # Card expiry date: MM/YY, MM/YYYY, MMYY or MMYYYY.
    ("expiry_no", re.compile(r'\b(0[1-9]|1[0-2])\/?([0-9]{2}|[0-9]{4})\b')),
)


def mask_with_regex(text: str) -> Tuple[str, List[Dict]]:
    """
    Mask PII using regex patterns.

    The patterns in ``_REGEX_RULES`` are applied in order; every match is
    replaced by ``[<classification>]``.

    Args:
        text (str): Input text

    Returns:
        Tuple[str, List[Dict]]: Masked text and list of masked entities. Each
        entity has ``position`` (``[start, end]`` offsets into the text as it
        was passed in), ``classification`` and ``entity`` (the original
        substring). Entities are grouped by rule in rule order and, within a
        rule, listed from the last match in the text to the first.
    """
    masked_entities = []

    # origin[i] is the offset in the *input* text that offset i of the working
    # text corresponds to. Placeholders differ in length from what they
    # replace, so without this map the offsets of later rules would refer to
    # an intermediate string the caller never sees.
    origin = list(range(len(text) + 1))

    for label, pattern in _REGEX_RULES:
        placeholder = '[' + label + ']'
        # Replace right-to-left so that the spans of matches still pending
        # (all further left) stay valid for both `text` and `origin`.
        for match in reversed(list(pattern.finditer(text))):
            start, end = match.span()
            masked_entities.append({
                "position": [origin[start], origin[end]],
                "classification": label,
                "entity": match.group()
            })
            text = text[:start] + placeholder + text[end:]
            # Every placeholder character maps to the start of the span it
            # replaced; the entry after the slice keeps mapping to its end.
            origin[start:end] = [origin[start]] * len(placeholder)

    return text, masked_entities
def mask_pii(text: str, ner_pipeline) -> Tuple[str, List[Dict]]:
    """
    Run both masking stages over the text: NER-based name masking first,
    then regex-based masking on the result of the first stage.

    Args:
        text (str): Input text
        ner_pipeline: NER pipeline handed through to ``mask_full_name``

    Returns:
        Tuple[str, List[Dict]]: The text after both stages, and the entities
        of the name stage followed by the entities of the regex stage
    """
    names_masked, name_hits = mask_full_name(text, ner_pipeline)
    fully_masked, pattern_hits = mask_with_regex(names_masked)
    return fully_masked, name_hits + pattern_hits