VGreatVig07 committed (verified)
Commit 507fdc6 · Parent: 90c9fc5

Upload 6 files
utils/__pycache__/masker3.cpython-311.pyc ADDED
Binary file (4.13 kB)

utils/__pycache__/masker4.cpython-311.pyc ADDED
Binary file (3.07 kB)

utils/__pycache__/preprocessor.cpython-311.pyc ADDED
Binary file (7.05 kB)
utils/masker3.py ADDED
@@ -0,0 +1,74 @@
+ import re
+ import spacy
+ from typing import Dict, Any
+
+ # Load the spaCy model once at import time
+ nlp = spacy.load("en_core_web_sm")
+
+
+ def mask_pii(text: str) -> Dict[str, Any]:
+     """
+     Enhanced PII masking with a JSON-style output format.
+
+     All recorded positions refer to the original input text.
+     """
+     entities = []
+
+     def overlaps(start, end):
+         # True if [start, end) intersects an already recorded entity
+         return any(e["position"][0] < end and start < e["position"][1]
+                    for e in entities)
+
+     def record(pattern, label, group=0):
+         for match in re.finditer(pattern, text):
+             start, end = match.span(group)
+             if overlaps(start, end):
+                 continue
+             entities.append({
+                 "position": [start, end],
+                 "classification": label,
+                 "entity": match.group(group),
+             })
+
+     # Specific patterns first so they take precedence over broader ones
+     record(r'\b(\d{4}[ -]?\d{4}[ -]?\d{4})\b', 'aadhar_num')
+     record(r'\b((?:\d[ -]*?){15,18}\d)\b', 'credit_debit_no')
+     record(r'(?:CVV|CVC|Security Code)[: ]*(\d{3,4})\b', 'cvv_no', 1)
+     record(r'\b((0[1-9]|1[0-2])[/-](\d{2}|\d{4}))\b', 'expiry_no', 1)
+
+     dob_patterns = [
+         r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b',
+         r'\b(\d{4}[/-]\d{1,2}[/-]\d{1,2})\b',
+         r'\b((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4})\b',
+     ]
+     for pattern in dob_patterns:
+         record(pattern, 'dob', 1)
+
+     record(r'(\+?\d{1,3}[-.\s]?)?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,9}', 'phone_number')
+     record(r'\b[\w.-]+@[\w.-]+\.\w+\b', 'email')
+
+     # spaCy NER for person names
+     doc = nlp(text)
+     for ent in doc.ents:
+         if ent.label_ == "PERSON" and not overlaps(ent.start_char, ent.end_char):
+             entities.append({
+                 "position": [ent.start_char, ent.end_char],
+                 "classification": "full_name",
+                 "entity": ent.text,
+             })
+
+     # Apply replacements from the end so earlier offsets stay valid
+     masked_text = text
+     for e in sorted(entities, key=lambda x: x["position"][0], reverse=True):
+         start, end = e["position"]
+         masked_text = masked_text[:start] + f"[{e['classification']}]" + masked_text[end:]
+
+     # Optional: set the category via a simple rule or an ML model
+     category = "sensitive_information"
+
+     return {
+         "input_email_body": text,
+         "list_of_masked_entities": sorted(entities, key=lambda x: x["position"][0]),
+         "masked_email": masked_text,
+         "category_of_the_email": category,
+     }
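For reference, a minimal usage sketch of the mask_pii function added above. It is not part of the commit: the sample email and the expected output in the comments are illustrative only, and the entities actually detected depend on the spaCy model and the regex patterns in the diff.

# Illustrative sketch (not part of the commit): exercising mask_pii from
# utils/masker3.py, assuming the repository root is on sys.path.
from utils.masker3 import mask_pii

result = mask_pii("Hi, I'm John Smith. Reach me at john.smith@example.com.")
print(result["masked_email"])
# e.g. "Hi, I'm [full_name]. Reach me at [email]."
for entity in result["list_of_masked_entities"]:
    print(entity["classification"], entity["position"], entity["entity"])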
utils/preprocessor.py ADDED
@@ -0,0 +1,115 @@
+ import os
+ import re
+ from pathlib import Path
+
+ import joblib
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+ from sklearn.exceptions import NotFittedError
+
+
+ class IntentClassifier:
+     def __init__(self, model_paths):
+         # Configure the NLTK data path (Docker compatible)
+         self._setup_nltk()
+
+         # Verify and load models
+         self._verify_model_paths(model_paths)
+         self._load_models(model_paths)
+
+         # Initialize preprocessing tools
+         self.stop_words = set(stopwords.words('english'))
+         self.lemmatizer = WordNetLemmatizer()
+
+     def _setup_nltk(self):
+         """Point NLTK at the local models/nltk_data directory."""
+         nltk_data_path = Path(__file__).parent.parent / "models" / "nltk_data"
+         nltk.data.path.append(str(nltk_data_path))
+
+         # Don't download here; just check that the data is present
+         try:
+             stopwords.words('english')
+             WordNetLemmatizer().lemmatize('test')
+         except LookupError as e:
+             raise RuntimeError(
+                 f"Required NLTK resources missing in {nltk_data_path}: {e}"
+             )
+
+     def _verify_model_paths(self, model_paths):
+         """Verify that every model file exists."""
+         for name, path in model_paths.items():
+             if not Path(path).exists():
+                 raise FileNotFoundError(
+                     f"Model file not found: {path} ({name}). "
+                     f"Current working directory: {os.getcwd()}"
+                 )
+
+     def _load_models(self, model_paths):
+         """Safely load all required models with validation."""
+         try:
+             # Load the TF-IDF vectorizer and confirm it has been fitted
+             self.tfidf = joblib.load(model_paths['tfidf'])
+             if not hasattr(self.tfidf, 'vocabulary_'):
+                 raise NotFittedError("TF-IDF vectorizer is not fitted")
+
+             # Load the classifier model and the label encoder
+             self.model = joblib.load(model_paths['model'])
+             self.le = joblib.load(model_paths['label_encoder'])
+         except Exception as e:
+             raise ValueError(f"Failed to load models: {e}")
+
+     def preprocess_text(self, text):
+         """Standalone text-cleaning function."""
+         if not isinstance(text, str):
+             return ""
+
+         text = text.lower()
+
+         # Remove email addresses and URLs
+         text = re.sub(r'\S+@\S+', ' ', text)
+         text = re.sub(r'http\S+', ' ', text)
+         text = re.sub(r'www\S+', ' ', text)
+
+         # Remove punctuation and numbers
+         text = re.sub(r'[^\w\s]', ' ', text)
+         text = re.sub(r'\d+', ' ', text)
+
+         # Tokenize, drop stop words and short tokens, lemmatize the rest
+         tokens = [self.lemmatizer.lemmatize(token)
+                   for token in text.split()
+                   if token not in self.stop_words and len(token) > 2]
+         return ' '.join(tokens)
+
+     def predict(self, text):
+         """Predict the intent label for new text, with error handling."""
+         if not all(hasattr(self, attr) for attr in ('tfidf', 'model', 'le')):
+             raise RuntimeError("Classifier not properly initialized")
+
+         try:
+             cleaned_text = self.preprocess_text(text)
+             vectorized = self.tfidf.transform([cleaned_text])
+             prediction = self.model.predict(vectorized)
+             # Return the human-readable label
+             return self.le.inverse_transform(prediction)[0]
+         except Exception as e:
+             raise ValueError(f"Prediction failed: {e}")
+
+
+ # Model artifacts live next to this package (Docker compatible)
+ MODEL_DIR = Path(__file__).parent.parent / "models"
+ model_paths = {
+     'tfidf': MODEL_DIR / "tfidf_vectorizer_stack.pkl",
+     'model': MODEL_DIR / "intent_classifier_stack.pkl",
+     'label_encoder': MODEL_DIR / "label_encoder_stack.pkl",
+ }
+
+ # Initialize the classifier at import time with error handling
+ try:
+     classifier = IntentClassifier(model_paths)
+     # Smoke test: confirm the TF-IDF vectorizer is fitted and usable
+     classifier.tfidf.transform(["test email"])
+     print("Classifier initialized successfully")
+ except Exception as e:
+     print(f"Failed to initialize classifier: {e}")
+     classifier = None
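For reference, a minimal usage sketch of the classifier above (not part of the commit). Importing utils/preprocessor.py triggers model loading at import time, so classifier may be None if the pickled models are missing; the sample email is illustrative, and the returned label comes from the committed label encoder.

# Illustrative sketch (not part of the commit): assumes the pickled models
# exist under models/ so that import-time initialization succeeded.
from utils.preprocessor import classifier

if classifier is not None:
    label = classifier.predict("I was charged twice for my subscription, please refund one payment.")
    print(label)  # one of the labels known to the committed label encoder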
utils/utils.py ADDED
@@ -0,0 +1,6 @@
+ # One-off setup: download NLTK resources into the local models/nltk_data directory
+ import nltk
+
+ nltk.download('stopwords', download_dir='models/nltk_data')
+ nltk.download('wordnet', download_dir='models/nltk_data')
+ nltk.download('omw-1.4', download_dir='models/nltk_data')
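utils/utils.py is meant to run once (for example, during the Docker image build) so the resources land in models/nltk_data, the same directory that IntentClassifier._setup_nltk appends to nltk.data.path. A quick sanity check, assuming the script has already been run from the repository root:

# Illustrative sketch (not part of the commit): verifies the downloaded NLTK
# resources are discoverable from the local models/nltk_data directory.
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.data.path.append("models/nltk_data")
print(stopwords.words("english")[:5])           # raises LookupError if missing
print(WordNetLemmatizer().lemmatize("emails"))  # should print "email"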