# Email_Classifier/utils/preprocessor.py
# (Hugging Face page metadata removed — originally: "VGreatVig07's picture /
#  Upload 6 files / 507fdc6 verified" — it is not Python and broke the module.)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from pathlib import Path
from sklearn.exceptions import NotFittedError
class IntentClassifier:
    """Email intent classifier.

    Cleans raw email text (lowercasing, stripping addresses/URLs/punctuation/
    digits, stop-word removal, lemmatization), vectorizes it with a fitted
    TF-IDF model, and predicts a human-readable intent label.
    """

    # Precompiled cleaning patterns, hoisted so preprocess_text does not
    # recompile them on every call.
    _EMAIL_RE = re.compile(r'\S+@\S+')
    _HTTP_RE = re.compile(r'http\S+')
    _WWW_RE = re.compile(r'www\S+')
    _PUNCT_RE = re.compile(r'[^\w\s]')
    _DIGIT_RE = re.compile(r'\d+')

    def __init__(self, model_paths):
        """Load NLTK resources and pickled model artifacts.

        Args:
            model_paths: dict with keys 'tfidf', 'model', 'label_encoder'
                mapping to paths of joblib-pickled artifacts.

        Raises:
            RuntimeError: required NLTK data is missing locally.
            FileNotFoundError: any model file does not exist.
            ValueError: a model file exists but fails to load/validate.
        """
        # Configure NLTK data path (Docker compatible)
        self._setup_nltk()
        # Verify and load models
        self._verify_model_paths(model_paths)
        self._load_models(model_paths)
        # Initialize preprocessing tools
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def _setup_nltk(self):
        """Set up NLTK data path to use local directory only."""
        nltk_data_path = Path(__file__).parent.parent / "models" / "nltk_data"
        nltk.data.path.append(str(nltk_data_path))
        # Don't download here; just check if data is present.
        try:
            stopwords.words('english')
            WordNetLemmatizer().lemmatize('test')
        except LookupError as e:
            # Chain the cause so the missing-resource detail is not lost.
            raise RuntimeError(
                f"Required NLTK resources missing in {nltk_data_path}: {str(e)}"
            ) from e

    def _verify_model_paths(self, model_paths):
        """Verify all model files exist before attempting to load them."""
        for name, path in model_paths.items():
            if not Path(path).exists():
                raise FileNotFoundError(
                    f"Model file not found: {path} ({name}). "
                    f"Current working directory: {os.getcwd()}"
                )

    def _load_models(self, model_paths):
        """Safely load all required models with validation."""
        try:
            # Load TF-IDF vectorizer with validation
            self.tfidf = joblib.load(model_paths['tfidf'])
            if not hasattr(self.tfidf, 'vocabulary_'):
                raise NotFittedError("TF-IDF vectorizer is not fitted")
            # Load classifier model
            self.model = joblib.load(model_paths['model'])
            # Load label encoder
            self.le = joblib.load(model_paths['label_encoder'])
        except Exception as e:
            raise ValueError(f"Failed to load models: {str(e)}") from e

    def preprocess_text(self, text):
        """Standalone text cleaning function.

        Returns a space-joined string of lemmatized tokens with emails,
        URLs, punctuation, digits, stop words, and tokens of length <= 2
        removed. Non-string input yields the empty string.
        """
        if not isinstance(text, str):
            return ""
        # Lowercase
        text = text.lower()
        # Remove email-specific patterns
        text = self._EMAIL_RE.sub(' ', text)   # Email addresses
        text = self._HTTP_RE.sub(' ', text)    # URLs
        text = self._WWW_RE.sub(' ', text)     # URLs
        # Remove punctuation and numbers
        text = self._PUNCT_RE.sub(' ', text)
        text = self._DIGIT_RE.sub(' ', text)
        # Tokenize and process
        tokens = text.split()
        tokens = [self.lemmatizer.lemmatize(token)
                  for token in tokens
                  if token not in self.stop_words and len(token) > 2]
        return ' '.join(tokens)

    def predict(self, text):
        """Make a prediction on new text with error handling.

        Raises:
            RuntimeError: the classifier's models were never loaded.
            ValueError: preprocessing/vectorizing/predicting failed.
        """
        # Truthiness checks on sklearn estimators are unreliable (and a
        # missing attribute would raise AttributeError); test identity.
        if any(getattr(self, attr, None) is None
               for attr in ('tfidf', 'model', 'le')):
            raise RuntimeError("Classifier not properly initialized")
        try:
            # Preprocess
            cleaned_text = self.preprocess_text(text)
            # Vectorize
            vectorized = self.tfidf.transform([cleaned_text])
            # Predict
            prediction = self.model.predict(vectorized)
            # Return human-readable label
            return self.le.inverse_transform(prediction)[0]
        except Exception as e:
            raise ValueError(f"Prediction failed: {str(e)}") from e
# Initialize with Docker-compatible paths: resolve every artifact relative
# to this file (not the CWD), so loading works no matter where the process
# was launched from. Previously MODEL_DIR was computed but never used and
# the paths were CWD-relative strings.
MODEL_DIR = Path(__file__).parent.parent / "models"
model_paths = {
    'tfidf': str(MODEL_DIR / "tfidf_vectorizer_stack.pkl"),
    'model': str(MODEL_DIR / "intent_classifier_stack.pkl"),
    'label_encoder': str(MODEL_DIR / "label_encoder_stack.pkl"),
}

# Initialize the classifier with comprehensive error handling: on any
# failure fall back to `classifier = None` so importing this module never
# raises (callers must check for None).
try:
    classifier = IntentClassifier(model_paths)
    # Smoke-test that the TF-IDF vectorizer is properly fitted and usable.
    classifier.tfidf.transform(["test email"])
    print("Classifier initialized successfully")
except Exception as e:
    print(f"Failed to initialize classifier: {str(e)}")
    classifier = None