Spaces:

nitinprajwal
/

email-pii-classifier

Sleeping

File size: 9,520 Bytes

# classification_model.py - Developed by nitinprajwal
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import joblib
import os
import numpy as np
import re
from collections import Counter

# Assuming utils.py is in the same directory
from utils import load_data
# Import PII masking functionality
from pii_masking import mask_pii_details, nlp as spacy_nlp_model_for_training # Use the loaded spaCy model
# Import the advanced feature extractor
from feature_extractor import AdvancedTextFeatureExtractor

from config import CLASSIFICATION_MODEL_PATH
# Default value of `model_save_path` for training; an alias of CLASSIFICATION_MODEL_PATH imported from config.
MODEL_FILENAME = CLASSIFICATION_MODEL_PATH
# Default `data_path` for training; also the dataset used by the auto-train fallback when no saved model file exists.
DEFAULT_DATASET_PATH = "combined_emails_with_natural_pii.csv"

# AdvancedTextFeatureExtractor is now imported from feature_extractor.py

def train_classification_model(data_path: str = DEFAULT_DATASET_PATH, model_save_path: str = MODEL_FILENAME):
    """
    Train the email classification pipeline and persist it with joblib.

    The 'email' column supplies the text and the 'type' column supplies the
    category label. Every email is passed through ``mask_pii_details`` before
    fitting, so the classifier is trained on masked text.

    Args:
        data_path: Dataset location handed to ``load_data``.
        model_save_path: File the fitted pipeline is written to.

    Returns:
        True when the model was trained and saved. False when the data could
        not be loaded, lacks the 'email'/'type' columns, is empty, or the
        fitted model could not be saved.
    """
    print(f"Starting model training with dataset: {data_path}")
    df = load_data(data_path)

    if df is None:
        print("Failed to load data. Aborting training.")
        return False

    # Validate the schema up front so a malformed dataset produces the abort
    # message below rather than a KeyError on the first column access.
    missing_columns = [col for col in ('email', 'type') if col not in df.columns]
    if missing_columns or df.empty:
        print("Data is empty or lacks required 'email' or 'type' columns after preprocessing. Aborting training.")
        return False

    # Preprocessing: missing text becomes an empty string and missing labels
    # become the 'Unknown' category, so no row is dropped.
    df['email'] = df['email'].fillna('')
    df['type'] = df['type'].fillna('Unknown')

    print("Applying PII masking to training data...")
    # Ensure the spaCy model is available for masking
    if spacy_nlp_model_for_training is None:
        print("Warning: spaCy model not loaded in pii_masking. Training will use regex-only masked data.")

    # Mask PII in the training data
    # This can be slow for large datasets; consider optimizations if needed
    total_emails = len(df)
    masked_emails = []
    for position, email_text in enumerate(df['email'], start=1):
        # str() guards against non-string cells (e.g. numeric values in the CSV).
        masked_text, _ = mask_pii_details(str(email_text), nlp_model=spacy_nlp_model_for_training)
        masked_emails.append(masked_text)
        if position % 100 == 0:
            print(f"Masked {position}/{total_emails} emails for training...")

    df['masked_email_for_training'] = masked_emails
    print("PII masking for training data complete.")

    X = df['masked_email_for_training']
    y = df['type']

    # Cross-version compatible model optimized for deployment
    print("Building deployment-compatible advanced model...")

    # Simple but effective pipeline using stable scikit-learn components
    model = Pipeline([
        # Enhanced TF-IDF with optimized parameters for better classification
        ('tfidf', TfidfVectorizer(
            stop_words='english',
            max_df=0.85,
            min_df=2,
            ngram_range=(1, 3),  # Unigrams, bigrams, and trigrams
            max_features=5000,  # Balanced feature count
            sublinear_tf=True,  # Apply sublinear tf scaling
            norm='l2',
            strip_accents='unicode',
            lowercase=True,
            token_pattern=r'\b[a-zA-Z]+\b',  # Only alphabetic tokens
        )),
        # Random Forest - highly compatible and robust across versions
        ('classifier', RandomForestClassifier(
            n_estimators=100,  # Good balance of performance and speed
            max_depth=15,  # Prevent overfitting
            min_samples_split=5,
            min_samples_leaf=2,
            max_features='sqrt',  # Feature sampling
            random_state=42,
            class_weight='balanced',  # Handle class imbalance
            n_jobs=1,  # Single job for compatibility
        )),
    ])

    print("Compatible model created: Enhanced TF-IDF (1-3 grams) + Random Forest")
    print("Optimized for cross-version compatibility and deployment stability")

    print("Training the model...")
    # Train on the full dataset; no hold-out split is made here.
    model.fit(X, y)
    print("Model training complete.")

    try:
        # Honor the caller-supplied destination instead of the module default.
        joblib.dump(model, model_save_path)
    except Exception as e:
        print(f"Error saving model: {e}")
        return False

    print(f"Model saved to {model_save_path}")
    return True

def load_classification_model(model_path: str = CLASSIFICATION_MODEL_PATH):
    """
    Load the trained classification model, training one first if none exists.

    Args:
        model_path: File to load the model from. When it does not exist, a
            new model is trained on ``DEFAULT_DATASET_PATH`` with this path
            requested as the save location, and loading is then attempted.

    Returns:
        The object returned by ``joblib.load``, or None when the fallback
        training fails or the file cannot be loaded.
    """
    if not os.path.exists(model_path):
        print(f"Error: Model file not found at {model_path}. Train the model first or ensure path is correct.")
        print(f"Attempting to train a new model with default dataset: {DEFAULT_DATASET_PATH}")
        success = train_classification_model(data_path=DEFAULT_DATASET_PATH, model_save_path=model_path)
        if not success:
            print("Failed to train a new model. Cannot load model.")
            return None
        # If training was successful, the model file should now exist.

    try:
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code; only point model_path at trusted model files.
        model = joblib.load(model_path)
    except FileNotFoundError:
        # Reached only if the file is still absent after the auto-train attempt.
        print(f"Error: Model file not found at {model_path} even after attempting to train.")
        return None
    except Exception as e:
        print(f"Error loading model from {model_path}: {e}")
        return None

    print(f"Model loaded successfully from {model_path}")
    return model

def classify_email_category(masked_email_text: str, model):
    """
    Predict the category label for a single, already-masked email.

    Returns the first element of ``model.predict([masked_email_text])``.
    Instead of raising, returns "Error: Model not available" when ``model``
    is None and "Error: Classification failed" when prediction raises.
    """
    if model is not None:
        try:
            # predict() takes a batch, so wrap the text and unwrap the single label.
            labels = model.predict([masked_email_text])
            return labels[0]
        except Exception as exc:
            print(f"Error during classification: {exc}")
            return "Error: Classification failed"

    # No model supplied: report it and hand back the sentinel string.
    print("Error: Classification model not loaded.")
    return "Error: Model not available"

if __name__ == "__main__":
    # Script entry point: train a model, reload it from disk, and print
    # predictions for a handful of hand-labelled sample emails.
    print("Running classification_model.py script...")
    # Train the model using the default dataset; MODEL_FILENAME is passed as
    # the requested save location.
    training_successful = train_classification_model(data_path=DEFAULT_DATASET_PATH, model_save_path=MODEL_FILENAME)

    if training_successful:
        print("\n--- Testing loaded model ---")
        # Load the just-trained model
        loaded_model = load_classification_model(MODEL_FILENAME)
        # Compare against None explicitly: scikit-learn's Pipeline defines
        # __len__, so plain truthiness would depend on the number of steps.
        if loaded_model is not None:
            sample_emails_for_testing = [
                ("Subject: Urgent - Server down! Our main application server is not responding. We need immediate assistance.", "Incident"),
                ("Subject: Password Reset Request. Hi, I forgot my password and need to reset it. My username is testuser.", "Request"),
                ("Subject: Inquiry about new billing plans. Could you please provide more information on your enterprise billing options?", "Request"),
                ("Subject: System Update Notification for 2023-01-15. We will be performing scheduled maintenance.", "Change"),
                ("Subject: Recurring login issue. I've been unable to login for the past three days, the error says 'invalid credentials' but I am sure they are correct.", "Problem"),
            ]
            print("\nClassifying sample emails:")
            for email_text, expected_category in sample_emails_for_testing:
                # The model was trained on masked text, so mask each sample
                # with the same spaCy model before classifying it.
                print(f"\nOriginal sample for testing: {email_text[:60]}...")
                masked_sample_text, _ = mask_pii_details(email_text, nlp_model=spacy_nlp_model_for_training)
                print(f"Masked sample for testing: {masked_sample_text[:60]}...")
                category = classify_email_category(masked_sample_text, loaded_model)
                print(f"-> Predicted: {category} (Expected: {expected_category})")
    else:
        print("Model training failed. Cannot proceed with testing.")