File size: 2,981 Bytes
9872b76
fb991ac
 
9872b76
 
fb991ac
 
 
 
9872b76
fb991ac
 
9872b76
 
fb991ac
9872b76
 
fb991ac
 
 
 
 
 
9872b76
 
fb991ac
9872b76
 
fb991ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
478f522
01c8e17
478f522
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fb991ac
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
import torch
from transformers import pipeline

class PiiDebertaAnalyzer:
    """
    Implements the DeBERTa V3 model, widely recognized for winning the Kaggle PII Detection competition.

    Wraps a HuggingFace token-classification pipeline and maps the model's
    raw entity labels onto the application's standard PII label set.
    Attributes set by __init__:
      - device: 0 (first GPU) or -1 (CPU), per the transformers pipeline API
      - pipe: the loaded pipeline, or None if loading failed
      - model_loaded: bool flag checked by scan()
      - label_mapping: dict mapping model labels -> app labels
    """
    def __init__(self, model_name="lakshyakh93/deberta-large-finetuned-pii"):
        """Load the token-classification pipeline for *model_name*.

        Never raises: on failure, model_loaded is set to False and pipe
        stays None so scan() degrades to returning [].
        """
        self.device = 0 if torch.cuda.is_available() else -1
        print(f"Loading DeBERTa Model on device: {'GPU' if self.device == 0 else 'CPU'}...")

        # Initialize pipe up front so attribute access is always safe, even
        # when pipeline() raises below (previously pipe was left undefined).
        self.pipe = None
        try:
            # Get HuggingFace token from environment (for private/gated models)
            hf_token = os.getenv('HF_TOKEN')

            # Aggregation strategy 'simple' merges B-TAG and I-TAG into a single entity automatically
            self.pipe = pipeline(
                "token-classification",
                model=model_name,
                device=self.device,
                token=hf_token,  # Use 'token' parameter (use_auth_token is deprecated)
                aggregation_strategy="simple"
            )
            self.model_loaded = True
            print(f"✅ DeBERTa model '{model_name}' loaded successfully.")
        except Exception as e:
            # Best-effort: report and fall back to a disabled analyzer rather
            # than crashing the app at construction time.
            print(f"❌ Failed to load DeBERTa model: {e}")
            self.model_loaded = False

        # Map Kaggle/DeBERTa labels to your App's standard labels
        self.label_mapping = {
            "NAME_STUDENT": "FIRST_NAME",
            "EMAIL": "EMAIL",
            "PHONE_NUM": "PHONE",
            "STREET_ADDRESS": "LOCATION",
            "ID_NUM": "SSN",
            "USERNAME": "FIRST_NAME",
            "URL_PERSONAL": "URL",
            "PER": "FIRST_NAME",  # Generic NER label
            "LOC": "LOCATION",    # Generic NER label
            "ORG": "LOCATION"     # Mapping ORG to Location or ignore based on preference
        }

    def scan(self, text: str):
        """Detect PII entities in *text*.

        Returns a list of dicts with keys: text, label, start, end,
        source, score. Returns [] when the model is not loaded, *text*
        is empty, or inference raises.
        """
        if not self.model_loaded or not text:
            return []

        try:
            results = self.pipe(text)
            detections = []

            for entity in results:
                # entity looks like: {'entity_group': 'NAME_STUDENT', 'score': 0.99, 'word': 'John Doe', 'start': 0, 'end': 8}
                original_label = entity.get('entity_group', 'UNKNOWN')
                mapped_label = self.label_mapping.get(original_label, "DEFAULT")

                # Only include known PII types; unmapped labels are skipped.
                if mapped_label == "DEFAULT":
                    continue

                start, end = entity['start'], entity['end']
                detections.append({
                    # Slice the original text by character offsets instead of
                    # using entity['word'], which can carry tokenizer artifacts
                    # (injected spaces / subword markers) after aggregation.
                    "text": text[start:end].strip(),
                    "label": mapped_label,
                    "start": start,
                    "end": end,
                    "source": "DeBERTa",
                    "score": float(entity['score'])
                })
            return detections

        except Exception as e:
            # Swallow inference errors so a single bad input cannot crash
            # the caller; an empty result signals "nothing detected".
            print(f"DeBERTa scan error: {e}")
            return []