WORKWITHSHAFISK committed on
Commit
fb991ac
·
verified ·
1 Parent(s): 01c8e17

Update classifier_manager/deberta_model.py

Browse files
Files changed (1) hide show
  1. classifier_manager/deberta_model.py +33 -19
classifier_manager/deberta_model.py CHANGED
@@ -1,34 +1,48 @@
1
  import os
2
- from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 
3
 
4
  class PiiDebertaAnalyzer:
 
 
 
 
5
  def __init__(self, model_name="lakshyakh93/deberta-large-finetuned-pii"):
6
- self.model = None
7
- self.available = False
8
 
9
  try:
10
- print(f"Loading DeBERTa Model on device: CPU...")
11
-
12
- # Get HuggingFace token from environment
13
  hf_token = os.getenv('HF_TOKEN')
14
 
15
- # Load with token if available
16
- self.model = pipeline(
17
- "token-classification",
18
- model=model_name,
19
- tokenizer=model_name,
20
- device=-1, # CPU
21
- use_auth_token=hf_token, # Add this line
22
  aggregation_strategy="simple"
23
  )
24
-
25
- self.available = True
26
  print(f"✅ DeBERTa model '{model_name}' loaded successfully.")
27
-
28
  except Exception as e:
29
- print(f"Failed to load DeBERTa model: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
-
32
  def scan(self, text: str):
33
  if not self.model_loaded or not text:
34
  return []
@@ -56,4 +70,4 @@ class PiiDebertaAnalyzer:
56
 
57
  except Exception as e:
58
  print(f"DeBERTa scan error: {e}")
59
- return []
 
1
  import os
2
+ import torch
3
+ from transformers import pipeline
4
 
5
  class PiiDebertaAnalyzer:
6
+ """
7
+ Implements the DeBERTa V3 model, widely recognized for winning the Kaggle PII Detection competition.
8
+ It uses a token-classification pipeline to detect PII entities.
9
+ """
10
  def __init__(self, model_name="lakshyakh93/deberta-large-finetuned-pii"):
11
+ self.device = 0 if torch.cuda.is_available() else -1
12
+ print(f"Loading DeBERTa Model on device: {'GPU' if self.device == 0 else 'CPU'}...")
13
 
14
  try:
15
+ # Get HuggingFace token from environment (for private/gated models)
 
 
16
  hf_token = os.getenv('HF_TOKEN')
17
 
18
+ # Aggregation strategy 'simple' merges B-TAG and I-TAG into a single entity automatically
19
+ self.pipe = pipeline(
20
+ "token-classification",
21
+ model=model_name,
22
+ device=self.device,
23
+ token=hf_token, # Use 'token' parameter (use_auth_token is deprecated)
 
24
  aggregation_strategy="simple"
25
  )
26
+ self.model_loaded = True
 
27
  print(f"✅ DeBERTa model '{model_name}' loaded successfully.")
 
28
  except Exception as e:
29
+ print(f"Failed to load DeBERTa model: {e}")
30
+ self.model_loaded = False
31
+
32
+ # Map Kaggle/DeBERTa labels to your App's standard labels
33
+ self.label_mapping = {
34
+ "NAME_STUDENT": "FIRST_NAME",
35
+ "EMAIL": "EMAIL",
36
+ "PHONE_NUM": "PHONE",
37
+ "STREET_ADDRESS": "LOCATION",
38
+ "ID_NUM": "SSN",
39
+ "USERNAME": "FIRST_NAME",
40
+ "URL_PERSONAL": "URL",
41
+ "PER": "FIRST_NAME", # Generic NER label
42
+ "LOC": "LOCATION", # Generic NER label
43
+ "ORG": "LOCATION" # Mapping ORG to Location or ignore based on preference
44
+ }
45
 
 
46
  def scan(self, text: str):
47
  if not self.model_loaded or not text:
48
  return []
 
70
 
71
  except Exception as e:
72
  print(f"DeBERTa scan error: {e}")
73
+ return []