Spaces:
Sleeping
Sleeping
import os
import re
import string
from pathlib import Path

import joblib
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.exceptions import NotFittedError
from sklearn.feature_extraction.text import TfidfVectorizer
class IntentClassifier:
    """Email intent classifier backed by a pre-trained TF-IDF + model stack.

    Loads a fitted TF-IDF vectorizer, a classifier, and a label encoder from
    disk, and exposes `predict(text) -> str` returning a human-readable label.

    Raises on construction:
        RuntimeError: required NLTK resources are not present locally.
        FileNotFoundError: a model file is missing.
        ValueError: a model file exists but fails to load/validate.
    """

    # Regexes hoisted to class level so preprocess_text does not recompile
    # them on every call.
    _EMAIL_RE = re.compile(r'\S+@\S+')
    _URL_RE = re.compile(r'http\S+')
    _WWW_RE = re.compile(r'www\S+')
    _PUNCT_RE = re.compile(r'[^\w\s]')
    _DIGIT_RE = re.compile(r'\d+')

    def __init__(self, model_paths):
        """Initialize from a dict with keys 'tfidf', 'model', 'label_encoder'
        mapping to model file paths."""
        # Configure NLTK data path (Docker compatible)
        self._setup_nltk()
        # Verify and load models
        self._verify_model_paths(model_paths)
        self._load_models(model_paths)
        # Initialize preprocessing tools
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def _setup_nltk(self):
        """Set up NLTK data path to use local directory only.

        Raises RuntimeError (chained from LookupError) when the bundled
        resources are absent — deliberately no download attempt here.
        """
        nltk_data_path = Path(__file__).parent.parent / "models" / "nltk_data"
        path_str = str(nltk_data_path)
        # Guard against appending a duplicate entry on every instantiation.
        if path_str not in nltk.data.path:
            nltk.data.path.append(path_str)
        # Don't download here; just check if data is present.
        try:
            stopwords.words('english')
            WordNetLemmatizer().lemmatize('test')
        except LookupError as e:
            raise RuntimeError(
                f"Required NLTK resources missing in {nltk_data_path}: {str(e)}"
            ) from e

    def _verify_model_paths(self, model_paths):
        """Verify all model files exist; raise FileNotFoundError with the
        offending path, its role, and the cwd for easier diagnosis."""
        for name, path in model_paths.items():
            if not Path(path).exists():
                raise FileNotFoundError(
                    f"Model file not found: {path} ({name}). "
                    f"Current working directory: {os.getcwd()}"
                )

    def _load_models(self, model_paths):
        """Safely load all required models with validation.

        Any failure (including an unfitted vectorizer) is re-raised as
        ValueError with the original exception chained as the cause.
        """
        try:
            # Load TF-IDF vectorizer with validation: vocabulary_ only
            # exists after fitting.
            self.tfidf = joblib.load(model_paths['tfidf'])
            if not hasattr(self.tfidf, 'vocabulary_'):
                raise NotFittedError("TF-IDF vectorizer is not fitted")
            # Load classifier model
            self.model = joblib.load(model_paths['model'])
            # Load label encoder
            self.le = joblib.load(model_paths['label_encoder'])
        except Exception as e:
            raise ValueError(f"Failed to load models: {str(e)}") from e

    def preprocess_text(self, text):
        """Clean raw text for vectorization.

        Lowercases; strips email addresses, URLs, punctuation, and digits;
        then lemmatizes whitespace tokens, dropping stopwords and tokens of
        length <= 2. Non-string input yields "".
        """
        if not isinstance(text, str):
            return ""
        # Lowercase
        text = text.lower()
        # Remove email-specific patterns
        text = self._EMAIL_RE.sub(' ', text)   # Email addresses
        text = self._URL_RE.sub(' ', text)     # URLs
        text = self._WWW_RE.sub(' ', text)     # URLs
        # Remove punctuation and numbers
        text = self._PUNCT_RE.sub(' ', text)
        text = self._DIGIT_RE.sub(' ', text)
        # Tokenize and process
        tokens = text.split()
        tokens = [self.lemmatizer.lemmatize(token)
                  for token in tokens
                  if token not in self.stop_words and len(token) > 2]
        return ' '.join(tokens)

    def predict(self, text):
        """Predict the intent label for `text`.

        Raises RuntimeError if models were never loaded; any other failure is
        re-raised as ValueError with the cause chained.
        """
        # Explicit None checks — estimator truthiness is not a reliable
        # signal of initialization.
        if self.tfidf is None or self.model is None or self.le is None:
            raise RuntimeError("Classifier not properly initialized")
        try:
            # Preprocess
            cleaned_text = self.preprocess_text(text)
            # Vectorize
            vectorized = self.tfidf.transform([cleaned_text])
            # Predict
            prediction = self.model.predict(vectorized)
            # Return human-readable label
            return self.le.inverse_transform(prediction)[0]
        except Exception as e:
            raise ValueError(f"Prediction failed: {str(e)}") from e
# Resolve model files relative to this source file so loading works no matter
# what the current working directory is (Docker compatible). Previously the
# paths were cwd-relative strings and MODEL_DIR was computed but never used.
MODEL_DIR = Path(__file__).parent.parent / "models"
model_paths = {
    'tfidf': MODEL_DIR / "tfidf_vectorizer_stack.pkl",
    'model': MODEL_DIR / "intent_classifier_stack.pkl",
    'label_encoder': MODEL_DIR / "label_encoder_stack.pkl",
}

# Initialize classifier with comprehensive error handling: on any failure,
# fall back to `classifier = None` so importers can check availability
# instead of crashing at import time.
try:
    classifier = IntentClassifier(model_paths)
    # Smoke-test that the TF-IDF vectorizer is actually fitted.
    classifier.tfidf.transform(["test email"])
    print("Classifier initialized successfully")
except Exception as e:
    print(f"Failed to initialize classifier: {str(e)}")
    classifier = None