Spaces:
Sleeping
Sleeping
File size: 4,489 Bytes
507fdc6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from pathlib import Path
from sklearn.exceptions import NotFittedError
class IntentClassifier:
    """Predict an intent label for a piece of text.

    The classifier is assembled from three persisted artifacts loaded with
    ``joblib``: a fitted TF-IDF vectorizer, a classifier model and a label
    encoder. Text is cleaned with :meth:`preprocess_text` before it is
    vectorized and classified.
    """

    # Keys that ``model_paths`` has to provide.
    _REQUIRED_MODELS = ('tfidf', 'model', 'label_encoder')

    # Substitutions applied (in this order) by ``preprocess_text``; every match
    # is replaced by a single space.
    # NOTE(review): ``[^\w\s]`` leaves underscores in place because ``\w``
    # matches "_". It is kept as-is since the fitted vectorizer presumably saw
    # text cleaned the same way - confirm before changing it.
    _CLEANUP_PATTERNS = (
        re.compile(r'\S+@\S+'),   # email addresses
        re.compile(r'http\S+'),   # URLs starting with "http"
        re.compile(r'www\S+'),    # URLs starting with "www"
        re.compile(r'[^\w\s]'),   # punctuation
        re.compile(r'\d+'),       # digit runs
    )

    def __init__(self, model_paths):
        """Load NLTK resources and the persisted model artifacts.

        Args:
            model_paths: Mapping with the keys ``'tfidf'``, ``'model'`` and
                ``'label_encoder'`` whose values are paths to joblib files.

        Raises:
            RuntimeError: If the required NLTK resources cannot be found.
            FileNotFoundError: If any path in ``model_paths`` does not exist.
            ValueError: If a required key is missing or an artifact fails to
                load or validate.
        """
        # Configure NLTK data path (Docker compatible)
        self._setup_nltk()
        # Verify and load models
        self._verify_model_paths(model_paths)
        self._load_models(model_paths)
        # Initialize preprocessing tools
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def _setup_nltk(self):
        """Register the bundled NLTK data directory and check the resources.

        The directory ``<two levels above this file>/models/nltk_data`` is
        added to NLTK's search path. Nothing is downloaded; the English
        stopword list and the WordNet lemmatizer are merely exercised once.

        Raises:
            RuntimeError: If either resource raises ``LookupError``.
        """
        nltk_data_path = Path(__file__).parent.parent / "models" / "nltk_data"
        nltk_data_dir = str(nltk_data_path)
        # Avoid growing the global search path every time an instance is built.
        if nltk_data_dir not in nltk.data.path:
            nltk.data.path.append(nltk_data_dir)
        # Don't download here; just check if data is present
        try:
            stopwords.words('english')
            WordNetLemmatizer().lemmatize('test')
        except LookupError as e:
            raise RuntimeError(
                f"Required NLTK resources missing in {nltk_data_path}: {str(e)}"
            ) from e

    def _verify_model_paths(self, model_paths):
        """Check that every path in ``model_paths`` exists on disk.

        Raises:
            FileNotFoundError: For the first path that does not exist; the
                message includes the current working directory to help with
                diagnosing relative paths.
        """
        for name, path in model_paths.items():
            if not Path(path).exists():
                raise FileNotFoundError(
                    f"Model file not found: {path} ({name}). "
                    f"Current working directory: {os.getcwd()}"
                )

    def _load_models(self, model_paths):
        """Load the vectorizer, classifier and label encoder from disk.

        Sets ``self.tfidf``, ``self.model`` and ``self.le``.

        Raises:
            ValueError: If a required key is absent from ``model_paths``, if
                loading fails, or if the vectorizer has no ``vocabulary_``
                attribute (i.e. it is not fitted). The underlying exception,
                when there is one, is attached as ``__cause__``.
        """
        missing = [key for key in self._REQUIRED_MODELS if key not in model_paths]
        if missing:
            raise ValueError(
                f"Failed to load models: missing path(s) for {', '.join(missing)}"
            )
        # NOTE(review): joblib.load unpickles its input and can execute
        # arbitrary code - only point these paths at trusted artifacts.
        try:
            # Load TF-IDF vectorizer with validation
            self.tfidf = joblib.load(model_paths['tfidf'])
            if not hasattr(self.tfidf, 'vocabulary_'):
                raise NotFittedError("TF-IDF vectorizer is not fitted")
            # Load classifier model
            self.model = joblib.load(model_paths['model'])
            # Load label encoder
            self.le = joblib.load(model_paths['label_encoder'])
        except Exception as e:
            raise ValueError(f"Failed to load models: {str(e)}") from e

    def preprocess_text(self, text):
        """Return a cleaned, lemmatized version of ``text``.

        The text is lower-cased, stripped of email addresses, URLs,
        punctuation and digits, split on whitespace, filtered (stop words and
        tokens of two characters or fewer are dropped) and lemmatized.

        Args:
            text: The raw text. Anything that is not a ``str`` yields ``""``.

        Returns:
            The surviving lemmatized tokens joined by single spaces.
        """
        if not isinstance(text, str):
            return ""
        cleaned = text.lower()
        for pattern in self._CLEANUP_PATTERNS:
            cleaned = pattern.sub(' ', cleaned)
        # Stop-word filtering happens on the raw token, before lemmatization.
        tokens = [
            self.lemmatizer.lemmatize(token)
            for token in cleaned.split()
            if token not in self.stop_words and len(token) > 2
        ]
        return ' '.join(tokens)

    def predict(self, text):
        """Predict the label for ``text``.

        Args:
            text: Raw input text; it is cleaned with :meth:`preprocess_text`.

        Returns:
            The first element of the label encoder's ``inverse_transform``
            applied to the model's prediction.

        Raises:
            RuntimeError: If the vectorizer, model or label encoder is unset.
            ValueError: If preprocessing, vectorizing or predicting fails; the
                original exception is attached as ``__cause__``.
        """
        # Compare against None rather than relying on truthiness: a valid
        # component may define __len__/__bool__ and evaluate as falsy, and a
        # partially constructed instance may lack the attributes altogether.
        components = (
            getattr(self, 'tfidf', None),
            getattr(self, 'model', None),
            getattr(self, 'le', None),
        )
        if any(component is None for component in components):
            raise RuntimeError("Classifier not properly initialized")
        try:
            # Preprocess
            cleaned_text = self.preprocess_text(text)
            # Vectorize
            vectorized = self.tfidf.transform([cleaned_text])
            # Predict
            prediction = self.model.predict(vectorized)
            # Return human-readable label
            return self.le.inverse_transform(prediction)[0]
        except Exception as e:
            raise ValueError(f"Prediction failed: {str(e)}") from e
# Initialize with Docker-compatible paths
# Directory expected to hold the persisted artifacts, resolved relative to this
# file so it does not depend on the process's working directory.
MODEL_DIR = Path(__file__).parent.parent / "models"

# Artifact file names, keyed by the names IntentClassifier expects.
_MODEL_FILES = {
    'tfidf': "tfidf_vectorizer_stack.pkl",
    'model': "intent_classifier_stack.pkl",
    'label_encoder': "label_encoder_stack.pkl",
}

# Keep the historical working-directory-relative path whenever it exists (or
# when no alternative exists, so the resulting error message is unchanged);
# otherwise fall back to the copy under MODEL_DIR so startup also works when
# the process is launched from a different directory.
model_paths = {}
for _name, _filename in _MODEL_FILES.items():
    _cwd_relative = f"models/{_filename}"
    _anchored = MODEL_DIR / _filename
    if Path(_cwd_relative).exists() or not _anchored.exists():
        model_paths[_name] = _cwd_relative
    else:
        model_paths[_name] = str(_anchored)

# Initialize classifier with comprehensive error handling.
# Failure is deliberately non-fatal: the module still imports and exposes
# ``classifier = None`` so callers can check for it.
try:
    classifier = IntentClassifier(model_paths)
    # Verify the TF-IDF vectorizer is properly fitted (smoke test; the result
    # itself is not used).
    test_vector = classifier.tfidf.transform(["test email"])
    print("Classifier initialized successfully")
except Exception as e:
    print(f"Failed to initialize classifier: {str(e)}")
    classifier = None
|