| import os |
| import torch |
| from transformers import pipeline |
|
|
class PiiDebertaAnalyzer:
    """Detect PII entities in text with a Hugging Face token-classification pipeline.

    The model is loaded once in ``__init__``. If loading fails, the instance
    stays usable but ``scan`` returns no detections (``model_loaded`` is False).
    Raw model labels are translated through ``label_mapping``; entities whose
    label is not in that mapping are dropped.
    """

    def __init__(self, model_name="lakshyakh93/deberta-large-finetuned-pii"):
        """Load the token-classification pipeline for *model_name*.

        Args:
            model_name: Hugging Face model identifier passed to ``pipeline``.

        The ``HF_TOKEN`` environment variable, when set, is forwarded as the
        access token. Loading errors are reported on stdout and recorded in
        ``model_loaded`` instead of being raised.
        """
        # The transformers pipeline uses 0 for the first CUDA device, -1 for CPU.
        self.device = 0 if torch.cuda.is_available() else -1
        print(f"Loading DeBERTa Model on device: {'GPU' if self.device == 0 else 'CPU'}...")

        # Raw model label -> label reported by scan().
        # NOTE(review): confirm these keys match the labels the configured
        # model actually emits (its id2label); unmatched labels are silently
        # dropped by scan().
        self.label_mapping = {
            "NAME_STUDENT": "FIRST_NAME",
            "EMAIL": "EMAIL",
            "PHONE_NUM": "PHONE",
            "STREET_ADDRESS": "LOCATION",
            "ID_NUM": "SSN",
            "USERNAME": "FIRST_NAME",
            "URL_PERSONAL": "URL",
            "PER": "FIRST_NAME",
            "LOC": "LOCATION",
            "ORG": "LOCATION",
        }

        # Bind the attribute up front so it exists even when loading fails.
        self.pipe = None
        try:
            hf_token = os.getenv('HF_TOKEN')
            self.pipe = pipeline(
                "token-classification",
                model=model_name,
                device=self.device,
                token=hf_token,
                aggregation_strategy="simple",
            )
            self.model_loaded = True
            print(f"✅ DeBERTa model '{model_name}' loaded successfully.")
        except Exception as e:
            # Deliberate best effort: a missing model must not crash the caller.
            print(f"❌ Failed to load DeBERTa model: {e}")
            self.model_loaded = False

    @staticmethod
    def _resolve_span(text, entity):
        """Return ``(surface, start, end)`` for one pipeline entity.

        When the entity carries usable character offsets, the surface text is
        sliced from the original input and the offsets are tightened so that
        ``text[start:end] == surface`` (no surrounding whitespace). Otherwise
        the stripped ``word`` is used and the offsets are passed through as-is.
        """
        start = entity.get('start')
        end = entity.get('end')
        if start is not None and end is not None and 0 <= start <= end <= len(text):
            raw = text[start:end]
            trimmed = raw.strip()
            if trimmed:
                # Shift the start past any leading whitespace inside the span.
                start = int(start) + (len(raw) - len(raw.lstrip()))
                return trimmed, start, start + len(trimmed)
        # Offsets unavailable or unusable: fall back to the pipeline's `word`.
        return entity['word'].strip(), start, end

    def scan(self, text: str):
        """Run the model on *text* and return the mapped PII detections.

        Args:
            text: Input string to analyse.

        Returns:
            A list of dicts with keys ``text``, ``label``, ``start``, ``end``,
            ``source`` (always ``"DeBERTa"``) and ``score`` (float). The list is
            empty when the model is not loaded, *text* is empty, or inference
            raises; errors are printed rather than propagated.
        """
        if not self.model_loaded or not text:
            return []

        try:
            detections = []
            for entity in self.pipe(text):
                original_label = entity.get('entity_group', 'UNKNOWN')
                mapped_label = self.label_mapping.get(original_label)
                if mapped_label is None:
                    # Label has no mapping: not reported.
                    continue

                surface, start, end = self._resolve_span(text, entity)
                detections.append({
                    "text": surface,
                    "label": mapped_label,
                    "start": start,
                    "end": end,
                    "source": "DeBERTa",
                    "score": float(entity['score']),
                })
            return detections
        except Exception as e:
            # Best effort: a failed scan yields no detections instead of raising.
            print(f"DeBERTa scan error: {e}")
            return []