Spaces:

VGreatVig07
/

Email_Classifier

Sleeping

File size: 4,489 Bytes

507fdc6

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from pathlib import Path
from sklearn.exceptions import NotFittedError

class IntentClassifier:
    def __init__(self, model_paths):
        # Configure NLTK data path (Docker compatible)
        self._setup_nltk()
        
        # Verify and load models
        self._verify_model_paths(model_paths)
        self._load_models(model_paths)
        
        # Initialize preprocessing tools
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    
    def _setup_nltk(self):
        """Set up NLTK data path to use local directory only"""
        nltk_data_path = Path(__file__).parent.parent / "models" / "nltk_data"
        nltk.data.path.append(str(nltk_data_path))
    
        # Don't download here; just check if data is present
        try:
            stopwords.words('english')
            WordNetLemmatizer().lemmatize('test')
        except LookupError as e:
            raise RuntimeError(f"Required NLTK resources missing in {nltk_data_path}: {str(e)}")

    def _verify_model_paths(self, model_paths):
        """Verify all model files exist"""
        for name, path in model_paths.items():
            if not Path(path).exists():
                raise FileNotFoundError(
                    f"Model file not found: {path} ({name}). "
                    f"Current working directory: {os.getcwd()}"
                )

    def _load_models(self, model_paths):
        """Safely load all required models with validation"""
        try:
            # Load TF-IDF vectorizer with validation
            self.tfidf = joblib.load(model_paths['tfidf'])
            if not hasattr(self.tfidf, 'vocabulary_'):
                raise NotFittedError("TF-IDF vectorizer is not fitted")
                
            # Load classifier model
            self.model = joblib.load(model_paths['model'])
            
            # Load label encoder
            self.le = joblib.load(model_paths['label_encoder'])
            
        except Exception as e:
            raise ValueError(f"Failed to load models: {str(e)}")

    def preprocess_text(self, text):
        """Standalone text cleaning function"""
        if not isinstance(text, str):
            return ""
            
        # Lowercase
        text = text.lower()
        
        # Remove email-specific patterns
        text = re.sub(r'\S+@\S+', ' ', text)  # Email addresses
        text = re.sub(r'http\S+', ' ', text)  # URLs
        text = re.sub(r'www\S+', ' ', text)   # URLs
        
        # Remove punctuation and numbers
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\d+', ' ', text)
        
        # Tokenize and process
        tokens = text.split()
        tokens = [self.lemmatizer.lemmatize(token) 
                 for token in tokens 
                 if token not in self.stop_words and len(token) > 2]
        
        return ' '.join(tokens)
    
    def predict(self, text):
        """Make prediction on new text with error handling"""
        if not self.tfidf or not self.model or not self.le:
            raise RuntimeError("Classifier not properly initialized")
            
        try:
            # Preprocess
            cleaned_text = self.preprocess_text(text)
            
            # Vectorize
            vectorized = self.tfidf.transform([cleaned_text])
            
            # Predict
            prediction = self.model.predict(vectorized)
            
            # Return human-readable label
            return self.le.inverse_transform(prediction)[0]
            
        except Exception as e:
            raise ValueError(f"Prediction failed: {str(e)}")


# Initialize with Docker-compatible paths
MODEL_DIR = Path(__file__).parent.parent / "models"
model_paths = {
    'tfidf': "models/tfidf_vectorizer_stack.pkl",
    'model': "models/intent_classifier_stack.pkl",
    'label_encoder': "models/label_encoder_stack.pkl"
}

# Initialize classifier with comprehensive error handling
try:
    classifier = IntentClassifier(model_paths)
    # Verify the TF-IDF vectorizer is properly fitted
    test_vector = classifier.tfidf.transform(["test email"])
    print("Classifier initialized successfully")
except Exception as e:
    print(f"Failed to initialize classifier: {str(e)}")
    classifier = None