email-pii-classifier / classification_model.py
nitinprajwal's picture
Update classification_model.py
77fc733 verified
# classification_model.py - Developed by nitinprajwal
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import joblib
import os
import numpy as np
import re
from collections import Counter
# Assuming utils.py is in the same directory
from utils import load_data
# Import PII masking functionality
from pii_masking import mask_pii_details, nlp as spacy_nlp_model_for_training # Use the loaded spaCy model
# Import the advanced feature extractor
from feature_extractor import AdvancedTextFeatureExtractor
from config import CLASSIFICATION_MODEL_PATH
MODEL_FILENAME = CLASSIFICATION_MODEL_PATH
DEFAULT_DATASET_PATH = "combined_emails_with_natural_pii.csv"
# AdvancedTextFeatureExtractor is now imported from feature_extractor.py
def train_classification_model(data_path: str = DEFAULT_DATASET_PATH, model_save_path: str = MODEL_FILENAME):
"""
Trains the email classification model and saves it.
Uses 'email' column for text and 'type' for category.
"""
print(f"Starting model training with dataset: {data_path}")
df = load_data(data_path)
if df is None:
print("Failed to load data. Aborting training.")
return False
# Preprocessing: Fill NaN in 'email' (text content) and 'type' (labels)
df['email'] = df['email'].fillna('')
df['type'] = df['type'].fillna('Unknown')
df.dropna(subset=['type'], inplace=True) # Ensure labels are present
if df.empty or df['email'].empty or df['type'].empty:
print("Data is empty or lacks required 'email' or 'type' columns after preprocessing. Aborting training.")
return False
print("Applying PII masking to training data...")
# Ensure the spaCy model is available for masking
if spacy_nlp_model_for_training is None:
print("Warning: spaCy model not loaded in pii_masking. Training will use regex-only masked data.")
# Mask PII in the training data
# This can be slow for large datasets; consider optimizations if needed
masked_emails = []
for i, email_text in enumerate(df['email']):
if pd.isna(email_text):
masked_emails.append("") # Handle potential NaN after fillna('') if any slip through
continue
masked_text, _ = mask_pii_details(str(email_text), nlp_model=spacy_nlp_model_for_training)
masked_emails.append(masked_text)
if (i + 1) % 100 == 0:
print(f"Masked {i+1}/{len(df['email'])} emails for training...")
df['masked_email_for_training'] = masked_emails
print("PII masking for training data complete.")
X = df['masked_email_for_training']
y = df['type']
# Optional: Split data for evaluation (not strictly required by assignment but good practice)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Cross-version compatible model optimized for deployment
print("Building deployment-compatible advanced model...")
# Simple but effective pipeline using stable scikit-learn components
model = Pipeline([
# Enhanced TF-IDF with optimized parameters for better classification
('tfidf', TfidfVectorizer(
stop_words='english',
max_df=0.85,
min_df=2,
ngram_range=(1,3), # Unigrams, bigrams, and trigrams
max_features=5000, # Balanced feature count
sublinear_tf=True, # Apply sublinear tf scaling
norm='l2',
strip_accents='unicode',
lowercase=True,
token_pattern=r'\b[a-zA-Z]+\b' # Only alphabetic tokens
)),
# Random Forest - highly compatible and robust across versions
('classifier', RandomForestClassifier(
n_estimators=100, # Good balance of performance and speed
max_depth=15, # Prevent overfitting
min_samples_split=5,
min_samples_leaf=2,
max_features='sqrt', # Feature sampling
random_state=42,
class_weight='balanced', # Handle class imbalance
n_jobs=1 # Single job for compatibility
))
])
print("Compatible model created: Enhanced TF-IDF (1-3 grams) + Random Forest")
print("Optimized for cross-version compatibility and deployment stability")
print("Training the model...")
# model.fit(X_train, y_train) # If using train_test_split
model.fit(X, y) # Train on full dataset as per typical assignment flow unless evaluation is separate
print("Model training complete.")
# Optional: Evaluate the model
# print("\nModel Evaluation on Test Set:")
# predictions = model.predict(X_test)
# print(classification_report(y_test, predictions))
try:
joblib.dump(model, CLASSIFICATION_MODEL_PATH)
print(f"Model saved to {CLASSIFICATION_MODEL_PATH}")
return True
except Exception as e:
print(f"Error saving model: {e}")
return False
def load_classification_model(model_path: str = CLASSIFICATION_MODEL_PATH):
"""
Loads the trained classification model.
"""
if not os.path.exists(CLASSIFICATION_MODEL_PATH):
print(f"Error: Model file not found at {CLASSIFICATION_MODEL_PATH}. Train the model first or ensure path is correct.")
print(f"Attempting to train a new model with default dataset: {DEFAULT_DATASET_PATH}")
success = train_classification_model(data_path=DEFAULT_DATASET_PATH, model_save_path=CLASSIFICATION_MODEL_PATH)
if not success:
print("Failed to train a new model. Cannot load model.")
return None
# If training was successful, the model file should now exist.
try:
model = joblib.load(CLASSIFICATION_MODEL_PATH)
print(f"Model loaded successfully from {CLASSIFICATION_MODEL_PATH}")
return model
except FileNotFoundError:
# This case should be handled by the os.path.exists check and auto-train attempt now.
print(f"Error: Model file not found at {CLASSIFICATION_MODEL_PATH} even after attempting to train.")
return None
except Exception as e:
print(f"Error loading model from {model_path}: {e}")
return None
def classify_email_category(masked_email_text: str, model):
"""
Classifies the masked email text into a category.
"""
if model is None:
print("Error: Classification model not loaded.")
# Fallback category or raise an error, as per application requirements
return "Error: Model not available"
try:
# The model expects a list or iterable of texts
prediction = model.predict([masked_email_text])
return prediction[0]
except Exception as e:
print(f"Error during classification: {e}")
return "Error: Classification failed"
if __name__ == "__main__":
print("Running classification_model.py script...")
# Train the model using the provided dataset
# This will save the model as 'email_classifier.joblib' in the root directory
training_successful = train_classification_model(data_path=DEFAULT_DATASET_PATH, model_save_path=MODEL_FILENAME)
if training_successful:
print("\n--- Testing loaded model ---_model")
# Load the just-trained model
loaded_model = load_classification_model(MODEL_FILENAME)
if loaded_model:
sample_emails_for_testing = [
("Subject: Urgent - Server down! Our main application server is not responding. We need immediate assistance.", "Incident"),
("Subject: Password Reset Request. Hi, I forgot my password and need to reset it. My username is testuser.", "Request"),
("Subject: Inquiry about new billing plans. Could you please provide more information on your enterprise billing options?", "Request"),
("Subject: System Update Notification for 2023-01-15. We will be performing scheduled maintenance.", "Change"),
("Subject: Recurring login issue. I've been unable to login for the past three days, the error says 'invalid credentials' but I am sure they are correct.", "Problem"),
]
print("\nClassifying sample emails:")
for email_text, expected_category in sample_emails_for_testing:
# For testing the endpoint, the API will handle masking.
# For this direct model test, we should simulate that by masking first.
print(f"\nOriginal sample for testing: {email_text[:60]}...")
masked_sample_text, _ = mask_pii_details(email_text, nlp_model=spacy_nlp_model_for_training) # Use the same nlp model
print(f"Masked sample for testing: {masked_sample_text[:60]}...")
category = classify_email_category(masked_sample_text, loaded_model)
print(f"-> Predicted: {category} (Expected: {expected_category})")
else:
print("Model training failed. Cannot proceed with testing.")