Spaces:
Running
Running
import re

from gliner import GLiNER
# Load the GLiNER PII checkpoint once, at import time, so every call to the
# pipeline below reuses the same `model` object. The two prints bracket the
# load so the console shows when it starts and when it has finished.
print("Loading model and tokenizer...")
model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")
print("Model and tokenizer loaded.")
# Maps each supported entity label (the strings handed to the model) to the
# placeholder that replaces matching text. Several labels intentionally share
# a placeholder (e.g. "email" / "email address", and the space- and
# underscore-separated spellings of the passport and SSN labels).
label_to_mask_map = {
    "name": "[NAME]",
    "nric": "[NRIC]",
    "phone number": "[PHONE]",
    "address": "[ADDRESS]",
    "email": "[EMAIL]",
    "person": "[PERSON]",
    "organization": "[ORGANIZATION]",
    "passport number": "[PASSPORT_NUMBER]",
    "credit card number": "[CREDIT_CARD]",
    "social security number": "[SSN]",
    "health insurance id number": "[HEALTH_INS_ID]",
    "date of birth": "[DOB]",
    "mobile phone number": "[MOBILE_PHONE]",
    "bank account number": "[BANK_ACCOUNT]",
    "medication": "[MEDICATION]",
    "cpf": "[CPF]",
    "driver's license number": "[DRIVER_LICENSE]",
    "tax identification number": "[TAX_ID]",
    "medical condition": "[MEDICAL_CONDITION]",
    "identity card number": "[IDENTITY_CARD]",
    "national id number": "[NATIONAL_ID]",
    "ip address": "[IP]",
    "email address": "[EMAIL]",
    "iban": "[IBAN]",
    "credit card expiration date": "[CREDIT_CARD_EXP]",
    "username": "[USERNAME]",
    "health insurance number": "[HEALTH_INS_NUM]",
    "registration number": "[REG_NUM]",
    "student id number": "[STUDENT_ID]",
    "insurance number": "[INSURANCE_NUM]",
    "flight number": "[FLIGHT_NUM]",
    "landline phone number": "[LANDLINE_PHONE]",
    "blood type": "[BLOOD_TYPE]",
    "cvv": "[CVV]",
    "reservation number": "[RESERVATION_NUM]",
    "digital signature": "[DIGITAL_SIGNATURE]",
    "social media handle": "[SOCIAL_MEDIA]",
    "license plate number": "[LICENSE_PLATE]",
    "cnpj": "[CNPJ]",
    "postal code": "[POSTAL_CODE]",
    "passport_number": "[PASSPORT_NUMBER]",
    "serial number": "[SERIAL_NUM]",
    "vehicle registration number": "[VEHICLE_REG_NUM]",
    "credit card brand": "[CREDIT_CARD_BRAND]",
    "fax number": "[FAX]",
    "visa number": "[VISA]",
    "insurance company": "[INSURANCE_COMPANY]",
    "identity document number": "[IDENTITY_DOCUMENT]",
    "transaction number": "[TRANSACTION_NUM]",
    "national health insurance number": "[NATIONAL_HEALTH_INS]",
    "cvc": "[CVC]",
    "birth certificate number": "[BIRTH_CERT]",
    "train ticket number": "[TRAIN_TICKET]",
    "passport expiration date": "[PASSPORT_EXP_DATE]",
    "social_security_number": "[SSN]",
}

# Every label the pipeline accepts, in the same order as the mapping above.
valid_labels = list(label_to_mask_map)
def mask_text(text, entities):
    """
    Masks the original text by replacing entities with corresponding labels.

    Every occurrence of each entity string is replaced, not only the first.
    All replacements happen in a single pass over the text, so:

    - when one entity string contains another (e.g. "John Smith" and
      "John"), the longer one is matched first and is masked as a whole;
    - placeholders that have already been inserted are never rewritten by a
      later entity;
    - entities whose "text" is empty are ignored, since replacing the empty
      string would insert a placeholder between every character.

    If the same entity string appears with several labels, the label of its
    first occurrence in `entities` is used.

    Args:
        text (str): The original text.
        entities (list): A list of dictionaries where each dictionary contains:
            - "text": the extracted entity string.
            - "label": the label for the entity.

    Returns:
        str: The masked text.

    Raises:
        KeyError: If an entity's label is not a key of `label_to_mask_map`.
    """
    placeholder_by_text = {}
    for entity in entities:
        # Resolve the label before looking at the text so an unknown label
        # always raises KeyError, even for an entity that will be skipped.
        placeholder = label_to_mask_map[entity["label"]]
        entity_text = entity["text"]
        if entity_text:
            # setdefault keeps the first label seen for a repeated string.
            placeholder_by_text.setdefault(entity_text, placeholder)

    if not placeholder_by_text:
        return text

    # Regex alternation tries alternatives left to right, so listing the
    # longest strings first makes a longer entity win over any shorter entity
    # that starts at the same position.
    alternatives = sorted(placeholder_by_text, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(alt) for alt in alternatives))
    return pattern.sub(lambda match: placeholder_by_text[match.group(0)], text)
def pii_masking_pipeline(
    input_text,
    labels=("name", "nric", "phone number", "address", "email"),
):
    """
    Detects PII entities in the input text and returns the text with them masked.

    Args:
        input_text (str): The input text to mask.
        labels (list): The PII entity labels to look for; each one must be
            present in `valid_labels`.

    Returns:
        str: The masked text.

    Raises:
        ValueError: If any requested label is not in `valid_labels`.
    """
    # Reject the request as soon as any label falls outside the supported set.
    unsupported = set(labels).difference(valid_labels)
    if unsupported:
        raise ValueError("Invalid labels provided.")

    # Let the model find the entities, then mask them in the original text.
    detected = model.predict_entities(input_text, labels)
    return mask_text(input_text, detected)