# classification_model.py - Developed by nitinprajwal
"""Train, load, and apply an email-category classifier.

The model is a scikit-learn Pipeline (TF-IDF features + Random Forest)
fitted on PII-masked email text from the 'email' column, predicting the
'type' column. PII is masked before vectorization so the model never
sees raw personal data.
"""
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import joblib
import os
import numpy as np
import re
from collections import Counter

# Project-local helpers (assumed to live in the same directory).
from utils import load_data
# PII masking and the spaCy model it loaded (may be None if loading failed).
from pii_masking import mask_pii_details, nlp as spacy_nlp_model_for_training
# Advanced feature extractor (kept importable for pickled pipelines that use it).
from feature_extractor import AdvancedTextFeatureExtractor
from config import CLASSIFICATION_MODEL_PATH

MODEL_FILENAME = CLASSIFICATION_MODEL_PATH
DEFAULT_DATASET_PATH = "combined_emails_with_natural_pii.csv"


def train_classification_model(data_path: str = DEFAULT_DATASET_PATH,
                               model_save_path: str = MODEL_FILENAME) -> bool:
    """Train the email classification model and persist it.

    Uses the 'email' column for text and 'type' for the category label.
    Each email is PII-masked before being fed to the vectorizer.

    Args:
        data_path: CSV file with 'email' and 'type' columns.
        model_save_path: Destination path for the fitted pipeline (joblib).

    Returns:
        True on success, False if loading the data or saving the model failed.
    """
    print(f"Starting model training with dataset: {data_path}")
    df = load_data(data_path)
    if df is None:
        print("Failed to load data. Aborting training.")
        return False

    # Preprocessing: fill NaN in 'email' (text content) and 'type' (labels).
    df['email'] = df['email'].fillna('')
    df['type'] = df['type'].fillna('Unknown')

    if df.empty or df['email'].empty or df['type'].empty:
        print("Data is empty or lacks required 'email' or 'type' columns after preprocessing. Aborting training.")
        return False

    print("Applying PII masking to training data...")
    # The spaCy model may have failed to load; masking then falls back to regex.
    if spacy_nlp_model_for_training is None:
        print("Warning: spaCy model not loaded in pii_masking. Training will use regex-only masked data.")

    # Mask PII in the training data.
    # NOTE: this is O(n) spaCy calls and can be slow for large datasets.
    masked_emails = []
    total = len(df['email'])  # hoisted out of the loop
    for i, email_text in enumerate(df['email']):
        if pd.isna(email_text):
            # Defensive: NaNs should already be gone after fillna('').
            masked_emails.append("")
            continue
        masked_text, _ = mask_pii_details(str(email_text), nlp_model=spacy_nlp_model_for_training)
        masked_emails.append(masked_text)
        if (i + 1) % 100 == 0:
            print(f"Masked {i+1}/{total} emails for training...")
    df['masked_email_for_training'] = masked_emails
    print("PII masking for training data complete.")

    X = df['masked_email_for_training']
    y = df['type']

    # Cross-version compatible model optimized for deployment: stable
    # scikit-learn components only (no custom transformers in the pickle).
    print("Building deployment-compatible advanced model...")
    model = Pipeline([
        # Enhanced TF-IDF with parameters tuned for short email text.
        ('tfidf', TfidfVectorizer(
            stop_words='english',
            max_df=0.85,
            min_df=2,
            ngram_range=(1, 3),             # unigrams, bigrams, and trigrams
            max_features=5000,              # balanced feature count
            sublinear_tf=True,              # apply sublinear tf scaling
            norm='l2',
            strip_accents='unicode',
            lowercase=True,
            token_pattern=r'\b[a-zA-Z]+\b'  # only alphabetic tokens
        )),
        # Random Forest - robust and stable across scikit-learn versions.
        ('classifier', RandomForestClassifier(
            n_estimators=100,               # balance of performance and speed
            max_depth=15,                   # prevent overfitting
            min_samples_split=5,
            min_samples_leaf=2,
            max_features='sqrt',            # feature sampling
            random_state=42,
            class_weight='balanced',        # handle class imbalance
            n_jobs=1                        # single job for compatibility
        ))
    ])
    print("Compatible model created: Enhanced TF-IDF (1-3 grams) + Random Forest")
    print("Optimized for cross-version compatibility and deployment stability")

    print("Training the model...")
    model.fit(X, y)  # train on the full dataset; evaluation is done separately
    print("Model training complete.")

    try:
        # BUGFIX: previously saved to CLASSIFICATION_MODEL_PATH, silently
        # ignoring the model_save_path parameter.
        joblib.dump(model, model_save_path)
        print(f"Model saved to {model_save_path}")
        return True
    except Exception as e:
        print(f"Error saving model: {e}")
        return False


def load_classification_model(model_path: str = CLASSIFICATION_MODEL_PATH):
    """Load the trained classification model from *model_path*.

    If the file is missing, attempts to train a new model on the default
    dataset (saving it to *model_path*) before loading.

    Returns:
        The fitted pipeline, or None if training/loading failed.
    """
    # BUGFIX: previously the model_path parameter was ignored and the
    # module-level CLASSIFICATION_MODEL_PATH was used everywhere.
    if not os.path.exists(model_path):
        print(f"Error: Model file not found at {model_path}. Train the model first or ensure path is correct.")
        print(f"Attempting to train a new model with default dataset: {DEFAULT_DATASET_PATH}")
        success = train_classification_model(data_path=DEFAULT_DATASET_PATH, model_save_path=model_path)
        if not success:
            print("Failed to train a new model. Cannot load model.")
            return None
        # If training was successful, the model file should now exist.

    try:
        model = joblib.load(model_path)
        print(f"Model loaded successfully from {model_path}")
        return model
    except FileNotFoundError:
        # Should be prevented by the existence check + auto-train above.
        print(f"Error: Model file not found at {model_path} even after attempting to train.")
        return None
    except Exception as e:
        print(f"Error loading model from {model_path}: {e}")
        return None


def classify_email_category(masked_email_text: str, model):
    """Classify masked email text into a category.

    Args:
        masked_email_text: Email body with PII already masked.
        model: A fitted pipeline with a predict() method, or None.

    Returns:
        The predicted category label, or an error string if the model is
        unavailable or prediction fails.
    """
    if model is None:
        print("Error: Classification model not loaded.")
        # Fallback category rather than raising, per application requirements.
        return "Error: Model not available"
    try:
        # The model expects an iterable of texts.
        prediction = model.predict([masked_email_text])
        return prediction[0]
    except Exception as e:
        print(f"Error during classification: {e}")
        return "Error: Classification failed"


if __name__ == "__main__":
    print("Running classification_model.py script...")
    # Train the model using the provided dataset and save it to MODEL_FILENAME.
    training_successful = train_classification_model(data_path=DEFAULT_DATASET_PATH,
                                                     model_save_path=MODEL_FILENAME)

    if training_successful:
        # BUGFIX: banner previously had a stray "_model" suffix.
        print("\n--- Testing loaded model ---")
        loaded_model = load_classification_model(MODEL_FILENAME)
        if loaded_model:
            sample_emails_for_testing = [
                ("Subject: Urgent - Server down! Our main application server is not responding. We need immediate assistance.", "Incident"),
                ("Subject: Password Reset Request. Hi, I forgot my password and need to reset it. My username is testuser.", "Request"),
                ("Subject: Inquiry about new billing plans. Could you please provide more information on your enterprise billing options?", "Request"),
                ("Subject: System Update Notification for 2023-01-15. We will be performing scheduled maintenance.", "Change"),
                ("Subject: Recurring login issue. I've been unable to login for the past three days, the error says 'invalid credentials' but I am sure they are correct.", "Problem"),
            ]
            print("\nClassifying sample emails:")
            for email_text, expected_category in sample_emails_for_testing:
                # The API masks before classifying; simulate that here so the
                # direct model test matches production inputs.
                print(f"\nOriginal sample for testing: {email_text[:60]}...")
                masked_sample_text, _ = mask_pii_details(email_text, nlp_model=spacy_nlp_model_for_training)
                print(f"Masked sample for testing: {masked_sample_text[:60]}...")
                category = classify_email_category(masked_sample_text, loaded_model)
                print(f"-> Predicted: {category} (Expected: {expected_category})")
    else:
        print("Model training failed. Cannot proceed with testing.")