# Email_Classifier/utils/preprocessor.py
# (Hugging Face page metadata removed — originally: "VGreatVig07's picture /
#  Upload 6 files / 507fdc6 verified" — it is not Python and broke the module.)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from pathlib import Path
from sklearn.exceptions import NotFittedError
class IntentClassifier:
    """Email intent classifier.

    Cleans raw email text (lowercasing, stripping addresses/URLs/punctuation/
    digits, stop-word removal, lemmatization), vectorizes it with a fitted
    TF-IDF model, and predicts a human-readable intent label.
    """

    # Precompiled cleaning patterns, hoisted so preprocess_text does not
    # recompile them on every call.
    _EMAIL_RE = re.compile(r'\S+@\S+')
    _HTTP_RE = re.compile(r'http\S+')
    _WWW_RE = re.compile(r'www\S+')
    _PUNCT_RE = re.compile(r'[^\w\s]')
    _DIGIT_RE = re.compile(r'\d+')

    def __init__(self, model_paths):
        """Load NLTK resources and pickled model artifacts.

        Args:
            model_paths: dict with keys 'tfidf', 'model', 'label_encoder'
                mapping to paths of joblib-pickled artifacts.

        Raises:
            RuntimeError: required NLTK data is missing locally.
            FileNotFoundError: any model file does not exist.
            ValueError: a model file exists but fails to load/validate.
        """
        # Configure NLTK data path (Docker compatible)
        self._setup_nltk()
        # Verify and load models
        self._verify_model_paths(model_paths)
        self._load_models(model_paths)
        # Initialize preprocessing tools
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def _setup_nltk(self):
        """Set up NLTK data path to use local directory only."""
        nltk_data_path = Path(__file__).parent.parent / "models" / "nltk_data"
        nltk.data.path.append(str(nltk_data_path))
        # Don't download here; just check if data is present.
        try:
            stopwords.words('english')
            WordNetLemmatizer().lemmatize('test')
        except LookupError as e:
            # Chain the cause so the missing-resource detail is not lost.
            raise RuntimeError(
                f"Required NLTK resources missing in {nltk_data_path}: {str(e)}"
            ) from e

    def _verify_model_paths(self, model_paths):
        """Verify all model files exist before attempting to load them."""
        for name, path in model_paths.items():
            if not Path(path).exists():
                raise FileNotFoundError(
                    f"Model file not found: {path} ({name}). "
                    f"Current working directory: {os.getcwd()}"
                )

    def _load_models(self, model_paths):
        """Safely load all required models with validation."""
        try:
            # Load TF-IDF vectorizer with validation
            self.tfidf = joblib.load(model_paths['tfidf'])
            if not hasattr(self.tfidf, 'vocabulary_'):
                raise NotFittedError("TF-IDF vectorizer is not fitted")
            # Load classifier model
            self.model = joblib.load(model_paths['model'])
            # Load label encoder
            self.le = joblib.load(model_paths['label_encoder'])
        except Exception as e:
            raise ValueError(f"Failed to load models: {str(e)}") from e

    def preprocess_text(self, text):
        """Standalone text cleaning function.

        Returns a space-joined string of lemmatized tokens with emails,
        URLs, punctuation, digits, stop words, and tokens of length <= 2
        removed. Non-string input yields the empty string.
        """
        if not isinstance(text, str):
            return ""
        # Lowercase
        text = text.lower()
        # Remove email-specific patterns
        text = self._EMAIL_RE.sub(' ', text)   # Email addresses
        text = self._HTTP_RE.sub(' ', text)    # URLs
        text = self._WWW_RE.sub(' ', text)     # URLs
        # Remove punctuation and numbers
        text = self._PUNCT_RE.sub(' ', text)
        text = self._DIGIT_RE.sub(' ', text)
        # Tokenize and process
        tokens = text.split()
        tokens = [self.lemmatizer.lemmatize(token)
                  for token in tokens
                  if token not in self.stop_words and len(token) > 2]
        return ' '.join(tokens)

    def predict(self, text):
        """Make a prediction on new text with error handling.

        Raises:
            RuntimeError: the classifier's models were never loaded.
            ValueError: preprocessing/vectorizing/predicting failed.
        """
        # Truthiness checks on sklearn estimators are unreliable (and a
        # missing attribute would raise AttributeError); test identity.
        if any(getattr(self, attr, None) is None
               for attr in ('tfidf', 'model', 'le')):
            raise RuntimeError("Classifier not properly initialized")
        try:
            # Preprocess
            cleaned_text = self.preprocess_text(text)
            # Vectorize
            vectorized = self.tfidf.transform([cleaned_text])
            # Predict
            prediction = self.model.predict(vectorized)
            # Return human-readable label
            return self.le.inverse_transform(prediction)[0]
        except Exception as e:
            raise ValueError(f"Prediction failed: {str(e)}") from e
# Initialize with Docker-compatible paths: resolve every artifact relative
# to this file (not the CWD), so loading works no matter where the process
# was launched from. Previously MODEL_DIR was computed but never used and
# the paths were CWD-relative strings.
MODEL_DIR = Path(__file__).parent.parent / "models"
model_paths = {
    'tfidf': str(MODEL_DIR / "tfidf_vectorizer_stack.pkl"),
    'model': str(MODEL_DIR / "intent_classifier_stack.pkl"),
    'label_encoder': str(MODEL_DIR / "label_encoder_stack.pkl"),
}

# Initialize the classifier with comprehensive error handling: on any
# failure fall back to `classifier = None` so importing this module never
# raises (callers must check for None).
try:
    classifier = IntentClassifier(model_paths)
    # Smoke-test that the TF-IDF vectorizer is properly fitted and usable.
    classifier.tfidf.transform(["test email"])
    print("Classifier initialized successfully")
except Exception as e:
    print(f"Failed to initialize classifier: {str(e)}")
    classifier = None