Upload 6 files
utils/__pycache__/masker3.cpython-311.pyc
ADDED
Binary file (4.13 kB)

utils/__pycache__/masker4.cpython-311.pyc
ADDED
Binary file (3.07 kB)

utils/__pycache__/preprocessor.cpython-311.pyc
ADDED
Binary file (7.05 kB)
utils/masker3.py
ADDED
@@ -0,0 +1,72 @@
import re
import spacy
from typing import Dict, Any, List

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def mask_pii(text: str) -> Dict[str, Any]:
    """
    Enhanced PII masking with JSON output format
    """
    masked_text = text
    entities = []

    def mask_and_record(pattern, label, group=0):
        nonlocal masked_text, entities
        for match in reversed(list(re.finditer(pattern, masked_text))):
            start, end = match.span(group)
            original = match.group(group)

            # Skip if already masked or overlaps
            if any(e['position'][0] <= start < e['position'][1] for e in entities):
                continue

            masked_text = masked_text[:start] + f"[{label}]" + masked_text[end:]
            entities.append({
                "position": [start, end],
                "classification": label,
                "entity": original
            })

    # Specific patterns first
    mask_and_record(r'\b(\d{4}[ -]?\d{4}[ -]?\d{4})\b', 'aadhar_num')
    mask_and_record(r'\b((?:\d[ -]*?){15,18}\d)\b', 'credit_debit_no')
    mask_and_record(r'(?:CVV|CVC|Security Code)[: ]*(\d{3,4})\b', 'cvv_no', 1)
    mask_and_record(r'\b((0[1-9]|1[0-2])[/-](\d{2}|\d{4}))\b', 'expiry_no', 1)

    dob_patterns = [
        r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b',
        r'\b(\d{4}[/-]\d{1,2}[/-]\d{1,2})\b',
        r'\b((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4})\b'
    ]
    for pattern in dob_patterns:
        mask_and_record(pattern, 'dob', 1)

    mask_and_record(r'(\+?\d{1,3}[-.\s]?)?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,9}', 'phone_number')
    mask_and_record(r'(\b[\w.-]+@[\w.-]+\.\w+\b)', 'email')

    # spaCy for full names
    doc = nlp(masked_text)
    for ent in reversed(doc.ents):
        if ent.label_ == "PERSON":
            if any(e['position'][0] <= ent.start_char < e['position'][1] for e in entities):
                continue
            masked_text = masked_text[:ent.start_char] + "[full_name]" + masked_text[ent.end_char:]
            entities.append({
                "position": [ent.start_char, ent.end_char],
                "classification": "full_name",
                "entity": ent.text
            })

    # Optional: Set category based on simple rule or ML model
    category = "sensitive_information"

    return {
        "input_email_body": text,
        "list_of_masked_entities": sorted(entities, key=lambda x: x["position"][0]),
        "masked_email": masked_text,
        "category_of_the_email": category
    }
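For reference, a minimal usage sketch for mask_pii follows. It assumes utils is importable as a package; the sample email and the masked output shown in comments are illustrative assumptions, not output captured from this commit.

# Hypothetical usage sketch for utils/masker3.py (illustrative only)
from utils.masker3 import mask_pii

result = mask_pii("Hi, I am John Doe, you can reach me at john.doe@example.com")
print(result["masked_email"])
# Roughly: "Hi, I am [full_name], you can reach me at [email]"
for entity in result["list_of_masked_entities"]:
    print(entity["position"], entity["classification"], entity["entity"])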
utils/preprocessor.py
ADDED
@@ -0,0 +1,130 @@
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from pathlib import Path
from sklearn.exceptions import NotFittedError

class IntentClassifier:
    def __init__(self, model_paths):
        # Configure NLTK data path (Docker compatible)
        self._setup_nltk()

        # Verify and load models
        self._verify_model_paths(model_paths)
        self._load_models(model_paths)

        # Initialize preprocessing tools
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def _setup_nltk(self):
        """Set up NLTK data path to use local directory only"""
        nltk_data_path = Path(__file__).parent.parent / "models" / "nltk_data"
        nltk.data.path.append(str(nltk_data_path))

        # Don't download here; just check if data is present
        try:
            stopwords.words('english')
            WordNetLemmatizer().lemmatize('test')
        except LookupError as e:
            raise RuntimeError(f"Required NLTK resources missing in {nltk_data_path}: {str(e)}")

    def _verify_model_paths(self, model_paths):
        """Verify all model files exist"""
        for name, path in model_paths.items():
            if not Path(path).exists():
                raise FileNotFoundError(
                    f"Model file not found: {path} ({name}). "
                    f"Current working directory: {os.getcwd()}"
                )

    def _load_models(self, model_paths):
        """Safely load all required models with validation"""
        try:
            # Load TF-IDF vectorizer with validation
            self.tfidf = joblib.load(model_paths['tfidf'])
            if not hasattr(self.tfidf, 'vocabulary_'):
                raise NotFittedError("TF-IDF vectorizer is not fitted")

            # Load classifier model
            self.model = joblib.load(model_paths['model'])

            # Load label encoder
            self.le = joblib.load(model_paths['label_encoder'])

        except Exception as e:
            raise ValueError(f"Failed to load models: {str(e)}")

    def preprocess_text(self, text):
        """Standalone text cleaning function"""
        if not isinstance(text, str):
            return ""

        # Lowercase
        text = text.lower()

        # Remove email-specific patterns
        text = re.sub(r'\S+@\S+', ' ', text)  # Email addresses
        text = re.sub(r'http\S+', ' ', text)  # URLs
        text = re.sub(r'www\S+', ' ', text)   # URLs

        # Remove punctuation and numbers
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\d+', ' ', text)

        # Tokenize and process
        tokens = text.split()
        tokens = [self.lemmatizer.lemmatize(token)
                  for token in tokens
                  if token not in self.stop_words and len(token) > 2]

        return ' '.join(tokens)

    def predict(self, text):
        """Make prediction on new text with error handling"""
        if not self.tfidf or not self.model or not self.le:
            raise RuntimeError("Classifier not properly initialized")

        try:
            # Preprocess
            cleaned_text = self.preprocess_text(text)

            # Vectorize
            vectorized = self.tfidf.transform([cleaned_text])

            # Predict
            prediction = self.model.predict(vectorized)

            # Return human-readable label
            return self.le.inverse_transform(prediction)[0]

        except Exception as e:
            raise ValueError(f"Prediction failed: {str(e)}")

# Initialize with Docker-compatible paths (resolved relative to this file,
# so imports work regardless of the current working directory)
MODEL_DIR = Path(__file__).parent.parent / "models"
model_paths = {
    'tfidf': str(MODEL_DIR / "tfidf_vectorizer_stack.pkl"),
    'model': str(MODEL_DIR / "intent_classifier_stack.pkl"),
    'label_encoder': str(MODEL_DIR / "label_encoder_stack.pkl")
}

# Initialize classifier with comprehensive error handling
try:
    classifier = IntentClassifier(model_paths)
    # Verify the TF-IDF vectorizer is properly fitted
    test_vector = classifier.tfidf.transform(["test email"])
    print("Classifier initialized successfully")
except Exception as e:
    print(f"Failed to initialize classifier: {str(e)}")
    classifier = None
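A short usage sketch for the module-level classifier follows; the sample text is an assumption for illustration, and the returned label depends entirely on the pickled label encoder shipped with the Space.

# Hypothetical usage sketch for utils/preprocessor.py (illustrative only)
from utils.preprocessor import classifier

if classifier is not None:
    label = classifier.predict("My order arrived damaged and I need a replacement.")
    print(label)  # actual labels come from label_encoder_stack.pkl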
utils/utils.py
ADDED
@@ -0,0 +1,4 @@
import nltk
nltk.download('stopwords', download_dir='models/nltk_data')
nltk.download('wordnet', download_dir='models/nltk_data')
nltk.download('omw-1.4', download_dir='models/nltk_data')