Spaces:

WORKWITHSHAFISK
/

Segmento-Sense-Backend

Paused

App Files Files Community

WORKWITHSHAFISK committed on Jan 3

Commit

fb991ac

verified ·

1 Parent(s): 01c8e17

Update classifier_manager/deberta_model.py

Browse files

Files changed (1) hide show

classifier_manager/deberta_model.py +33 -19

classifier_manager/deberta_model.py CHANGED Viewed

@@ -1,34 +1,48 @@
 import os
-from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 class PiiDebertaAnalyzer:
     def __init__(self, model_name="lakshyakh93/deberta-large-finetuned-pii"):
-        self.model = None
-        self.available = False
         try:
-            print(f"Loading DeBERTa Model on device: CPU...")
-            # Get HuggingFace token from environment
             hf_token = os.getenv('HF_TOKEN')
-            # Load with token if available
-            self.model = pipeline(
-                "token-classification",
-                model=model_name,
-                tokenizer=model_name,
-                device=-1,  # CPU
-                use_auth_token=hf_token,  # Add this line
                 aggregation_strategy="simple"
             )
-            self.available = True
             print(f"✅ DeBERTa model '{model_name}' loaded successfully.")
         except Exception as e:
-            print(f"Failed to load DeBERTa model: {e}")
     def scan(self, text: str):
         if not self.model_loaded or not text:
             return []
@@ -56,4 +70,4 @@ class PiiDebertaAnalyzer:
         except Exception as e:
             print(f"DeBERTa scan error: {e}")
-            return []

 import os
+import torch
+from transformers import pipeline
 class PiiDebertaAnalyzer:
+    """
+    Implements the DeBERTa V3 model, widely recognized for winning the Kaggle PII Detection competition.
+    It uses a token-classification pipeline to detect PII entities.
+    """
     def __init__(self, model_name="lakshyakh93/deberta-large-finetuned-pii"):
+        self.device = 0 if torch.cuda.is_available() else -1
+        print(f"Loading DeBERTa Model on device: {'GPU' if self.device == 0 else 'CPU'}...")
         try:
+            # Get HuggingFace token from environment (for private/gated models)
             hf_token = os.getenv('HF_TOKEN')
+            # Aggregation strategy 'simple' merges B-TAG and I-TAG into a single entity automatically
+            self.pipe = pipeline(
+                "token-classification",
+                model=model_name,
+                device=self.device,
+                token=hf_token,  # Use 'token' parameter (use_auth_token is deprecated)
                 aggregation_strategy="simple"
             )
+            self.model_loaded = True
             print(f"✅ DeBERTa model '{model_name}' loaded successfully.")
         except Exception as e:
+            print(f"❌ Failed to load DeBERTa model: {e}")
+            self.model_loaded = False
+        # Map Kaggle/DeBERTa labels to your App's standard labels
+        self.label_mapping = {
+            "NAME_STUDENT": "FIRST_NAME",
+            "EMAIL": "EMAIL",
+            "PHONE_NUM": "PHONE",
+            "STREET_ADDRESS": "LOCATION",
+            "ID_NUM": "SSN",
+            "USERNAME": "FIRST_NAME",
+            "URL_PERSONAL": "URL",
+            "PER": "FIRST_NAME",  # Generic NER label
+            "LOC": "LOCATION",    # Generic NER label
+            "ORG": "LOCATION"     # Mapping ORG to Location or ignore based on preference
+        }
     def scan(self, text: str):
         if not self.model_loaded or not text:
             return []
         except Exception as e:
             print(f"DeBERTa scan error: {e}")
+            return []