Spaces:

nitinprajwal
/

email-pii-classifier

Sleeping

App Files Files Community

email-pii-classifier / classification_model.py

nitinprajwal

Update classification_model.py

77fc733 verified 9 months ago

raw

history blame contribute delete

9.52 kB

	# classification_model.py - Developed by nitinprajwal
	from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
	from sklearn.naive_bayes import MultinomialNB
	from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
	from sklearn.linear_model import LogisticRegression
	from sklearn.svm import SVC
	from sklearn.pipeline import Pipeline, FeatureUnion
	from sklearn.model_selection import train_test_split, GridSearchCV
	from sklearn.metrics import classification_report, accuracy_score
	from sklearn.preprocessing import StandardScaler
	from sklearn.base import BaseEstimator, TransformerMixin
	import pandas as pd
	import joblib
	import os
	import numpy as np
	import re
	from collections import Counter

	# Assuming utils.py is in the same directory
	from utils import load_data
	# Import PII masking functionality
	from pii_masking import mask_pii_details, nlp as spacy_nlp_model_for_training # Use the loaded spaCy model
	# Import the advanced feature extractor
	from feature_extractor import AdvancedTextFeatureExtractor

	from config import CLASSIFICATION_MODEL_PATH
	# Default path of the serialized classifier; an alias of CLASSIFICATION_MODEL_PATH from config.
	MODEL_FILENAME = CLASSIFICATION_MODEL_PATH
	# Dataset path used when train_classification_model() is called without data_path.
	DEFAULT_DATASET_PATH = "combined_emails_with_natural_pii.csv"

	# AdvancedTextFeatureExtractor is now imported from feature_extractor.py

	def train_classification_model(data_path: str = DEFAULT_DATASET_PATH, model_save_path: str = MODEL_FILENAME):
	    """
	    Train the email classification model and save it with joblib.

	    The dataset is read with load_data(); its 'email' column is used as the
	    text and its 'type' column as the category label. Every email is passed
	    through mask_pii_details() before fitting, so the model is trained on
	    masked text.

	    Args:
	        data_path: Path of the dataset handed to load_data().
	        model_save_path: Destination file for the fitted pipeline.

	    Returns:
	        True when the model was trained and written to model_save_path,
	        False when the data could not be loaded, required columns are
	        missing, the dataset is empty, or saving failed.
	    """
	    print(f"Starting model training with dataset: {data_path}")
	    df = load_data(data_path)

	    if df is None:
	        print("Failed to load data. Aborting training.")
	        return False

	    # Fail with a clear message instead of a KeyError when a column is absent.
	    missing_columns = [col for col in ('email', 'type') if col not in df.columns]
	    if missing_columns:
	        print(f"Dataset is missing required column(s): {', '.join(missing_columns)}. Aborting training.")
	        return False

	    # Preprocessing: missing text becomes an empty string; missing labels are
	    # kept as their own 'Unknown' class (so no row is dropped for lacking a label).
	    df['email'] = df['email'].fillna('')
	    df['type'] = df['type'].fillna('Unknown')

	    if df.empty:
	        print("Data is empty or lacks required 'email' or 'type' columns after preprocessing. Aborting training.")
	        return False

	    print("Applying PII masking to training data...")
	    # Ensure the spaCy model is available for masking
	    if spacy_nlp_model_for_training is None:
	        print("Warning: spaCy model not loaded in pii_masking. Training will use regex-only masked data.")

	    # Mask PII in the training data.
	    # This can be slow for large datasets; consider optimizations if needed.
	    total_emails = len(df)
	    masked_emails = []
	    for count, email_text in enumerate(df['email'], start=1):
	        masked_text, _ = mask_pii_details(str(email_text), nlp_model=spacy_nlp_model_for_training)
	        masked_emails.append(masked_text)
	        if count % 100 == 0:
	            print(f"Masked {count}/{total_emails} emails for training...")

	    df['masked_email_for_training'] = masked_emails
	    print("PII masking for training data complete.")

	    X = df['masked_email_for_training']
	    y = df['type']

	    # Cross-version compatible model optimized for deployment
	    print("Building deployment-compatible advanced model...")

	    # Simple but effective pipeline using stable scikit-learn components
	    model = Pipeline([
	        # Enhanced TF-IDF with optimized parameters for better classification
	        ('tfidf', TfidfVectorizer(
	            stop_words='english',
	            max_df=0.85,
	            min_df=2,
	            ngram_range=(1, 3),  # Unigrams, bigrams, and trigrams
	            max_features=5000,  # Balanced feature count
	            sublinear_tf=True,  # Apply sublinear tf scaling
	            norm='l2',
	            strip_accents='unicode',
	            lowercase=True,
	            token_pattern=r'\b[a-zA-Z]+\b',  # Only alphabetic tokens
	        )),
	        # Random Forest - highly compatible and robust across versions
	        ('classifier', RandomForestClassifier(
	            n_estimators=100,  # Good balance of performance and speed
	            max_depth=15,  # Prevent overfitting
	            min_samples_split=5,
	            min_samples_leaf=2,
	            max_features='sqrt',  # Feature sampling
	            random_state=42,
	            class_weight='balanced',  # Handle class imbalance
	            n_jobs=1,  # Single job for compatibility
	        )),
	    ])

	    print("Compatible model created: Enhanced TF-IDF (1-3 grams) + Random Forest")
	    print("Optimized for cross-version compatibility and deployment stability")

	    print("Training the model...")
	    # The model is fitted on the full dataset; no hold-out evaluation is done here.
	    model.fit(X, y)
	    print("Model training complete.")

	    # Save to the path the caller asked for rather than always the config default.
	    try:
	        joblib.dump(model, model_save_path)
	    except Exception as e:
	        print(f"Error saving model: {e}")
	        return False
	    print(f"Model saved to {model_save_path}")
	    return True

	def load_classification_model(model_path: str = CLASSIFICATION_MODEL_PATH):
	    """
	    Load the trained classification model, training one first if it is missing.

	    When no file exists at model_path, train_classification_model() is called
	    with the default dataset and asked to save to model_path; the file is then
	    loaded with joblib.

	    Args:
	        model_path: Location of the serialized model file.

	    Returns:
	        The loaded model object, or None when training fails or the file
	        cannot be loaded.
	    """
	    if not os.path.exists(model_path):
	        print(f"Error: Model file not found at {model_path}. Train the model first or ensure path is correct.")
	        print(f"Attempting to train a new model with default dataset: {DEFAULT_DATASET_PATH}")
	        success = train_classification_model(data_path=DEFAULT_DATASET_PATH, model_save_path=model_path)
	        if not success:
	            print("Failed to train a new model. Cannot load model.")
	            return None
	        # If training was successful, the model file should now exist.

	    try:
	        # NOTE: joblib.load is pickle-based; only load model files from a trusted source.
	        model = joblib.load(model_path)
	    except FileNotFoundError:
	        # Normally covered by the existence check and auto-train attempt above.
	        print(f"Error: Model file not found at {model_path} even after attempting to train.")
	        return None
	    except Exception as e:
	        print(f"Error loading model from {model_path}: {e}")
	        return None
	    print(f"Model loaded successfully from {model_path}")
	    return model

	def classify_email_category(masked_email_text: str, model):
	    """
	    Predict the category of a single (already masked) email text.

	    Returns the first element of model.predict() for the text. If no model is
	    supplied, or prediction raises, a message is printed and an "Error: ..."
	    string is returned instead of raising.
	    """
	    if model is not None:
	        try:
	            # predict() works on a batch, so wrap the single text in a list.
	            labels = model.predict([masked_email_text])
	            top_label = labels[0]
	        except Exception as exc:
	            print(f"Error during classification: {exc}")
	            return "Error: Classification failed"
	        return top_label

	    # No model available: report it and hand back the fallback marker string.
	    print("Error: Classification model not loaded.")
	    return "Error: Model not available"

	if __name__ == "__main__":
	    # Script entry point: train on the default dataset, reload the saved model,
	    # and print predictions for a few hand-written sample emails.
	    print("Running classification_model.py script...")
	    # The trained model is written to MODEL_FILENAME (the path taken from config).
	    training_successful = train_classification_model(data_path=DEFAULT_DATASET_PATH, model_save_path=MODEL_FILENAME)

	    if training_successful:
	        print("\n--- Testing loaded model ---")
	        # Load the just-trained model
	        loaded_model = load_classification_model(MODEL_FILENAME)
	        # Compare against None explicitly: a Pipeline's truthiness goes through
	        # __len__ (its number of steps), which is not what is being asked here.
	        if loaded_model is not None:
	            sample_emails_for_testing = [
	                ("Subject: Urgent - Server down! Our main application server is not responding. We need immediate assistance.", "Incident"),
	                ("Subject: Password Reset Request. Hi, I forgot my password and need to reset it. My username is testuser.", "Request"),
	                ("Subject: Inquiry about new billing plans. Could you please provide more information on your enterprise billing options?", "Request"),
	                ("Subject: System Update Notification for 2023-01-15. We will be performing scheduled maintenance.", "Change"),
	                ("Subject: Recurring login issue. I've been unable to login for the past three days, the error says 'invalid credentials' but I am sure they are correct.", "Problem"),
	            ]
	            print("\nClassifying sample emails:")
	            for email_text, expected_category in sample_emails_for_testing:
	                # The model was trained on masked text, so mask each sample
	                # the same way before classifying it.
	                print(f"\nOriginal sample for testing: {email_text[:60]}...")
	                masked_sample_text, _ = mask_pii_details(email_text, nlp_model=spacy_nlp_model_for_training)  # Use the same nlp model
	                print(f"Masked sample for testing: {masked_sample_text[:60]}...")
	                category = classify_email_category(masked_sample_text, loaded_model)
	                print(f"-> Predicted: {category} (Expected: {expected_category})")
	    else:
	        print("Model training failed. Cannot proceed with testing.")