Spaces:

nguyennp86
/

email-classifier-api

Sleeping

App Files Files Community

email-classifier-api / train_model.py

nguyennp86

Deploy Email Classifier API

1291f7a 6 months ago

raw

history blame contribute delete

6.24 kB

	#!/usr/bin/env python3
	"""
	Standalone script to train the email classifier model
	"""

	import pandas as pd
	import numpy as np
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.naive_bayes import MultinomialNB
	from sklearn.pipeline import Pipeline
	from sklearn.model_selection import train_test_split, cross_val_score
	from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
	import joblib
	import re
	import os
	from datetime import datetime

	def preprocess_text(text: str) -> str:
	    """Normalize raw email text before it is vectorized.

	    The text is lowercased, every character that is not a word character,
	    whitespace, or one of ``, . - ! ?`` is replaced by a space, and runs of
	    whitespace are then collapsed into a single space.

	    Args:
	        text: Raw message text.

	    Returns:
	        The cleaned text, with no leading/trailing whitespace and no
	        consecutive whitespace characters.
	    """
	    text = text.lower()
	    # Replace disallowed characters first: each replacement inserts a space,
	    # so whitespace must be collapsed afterwards or the result can contain
	    # runs of spaces (e.g. "a @ b" would become "a   b").
	    text = re.sub(r'[^\w\s,.\-!?]', ' ', text)
	    text = re.sub(r'\s+', ' ', text)
	    return text.strip()

	def load_data(file_path: str):
	    """Load the CSV dataset and return preprocessed messages with their labels.

	    The CSV must contain a ``message`` column (email text) and a ``label``
	    column. Rows where either value is missing are dropped, and every
	    remaining message is converted to ``str`` and passed through
	    ``preprocess_text``.

	    Args:
	        file_path: Path to the CSV dataset.

	    Returns:
	        A ``(messages, labels)`` tuple of pandas Series, or ``(None, None)``
	        when the file does not exist or a required column is absent. Errors
	        raised by ``pd.read_csv`` itself are not caught here.
	    """
	    print(f"Loading dataset from {file_path}...")

	    if not os.path.exists(file_path):
	        print(f"Error: Dataset file {file_path} not found!")
	        return None, None

	    df = pd.read_csv(file_path)
	    print(f"Dataset loaded: {len(df)} samples")
	    print(f"Columns: {list(df.columns)}")

	    # Report a wrong schema the same way as a missing file instead of
	    # failing later with a bare KeyError.
	    missing_columns = [col for col in ('message', 'label') if col not in df.columns]
	    if missing_columns:
	        print(f"Error: Dataset is missing required column(s): {missing_columns}")
	        return None, None

	    # Empty CSV cells are read as NaN (a float), which has no .lower();
	    # rows without text or without a label cannot be used for training.
	    incomplete = df[['message', 'label']].isna().any(axis=1)
	    if incomplete.any():
	        print(f"Dropping {int(incomplete.sum())} rows with missing message or label")
	        df = df[~incomplete].copy()

	    # Basic data info
	    print("\nLabel distribution:")
	    print(df['label'].value_counts())

	    # Preprocess messages; astype(str) guards against purely numeric cells.
	    df['processed_message'] = df['message'].astype(str).apply(preprocess_text)

	    return df['processed_message'], df['label']

	def train_model(X, y):
	    """Fit and evaluate the TF-IDF + Multinomial Naive Bayes pipeline.

	    Holds out 20% of the data as a stratified test set (seed 42), fits the
	    pipeline on the remainder, reports 5-fold cross-validation accuracy on
	    the training portion, then prints test-set metrics and, per class, the
	    20 features with the highest log probability.

	    Args:
	        X: Preprocessed message texts.
	        y: Labels aligned with ``X``.

	    Returns:
	        A ``(pipeline, test_accuracy)`` tuple with the fitted pipeline and
	        its accuracy on the held-out test set.
	    """
	    class_names = ['No Attachment', 'Has Attachment']

	    print("\nSplitting data...")
	    train_text, test_text, train_labels, test_labels = train_test_split(
	        X, y, test_size=0.2, random_state=42, stratify=y
	    )
	    print(f"Training set: {len(train_text)} samples")
	    print(f"Test set: {len(test_text)} samples")

	    # Assemble the two pipeline stages separately for readability.
	    print("\nCreating model pipeline...")
	    vectorizer = TfidfVectorizer(
	        max_features=1000,
	        ngram_range=(1, 2),
	        stop_words='english',
	        lowercase=True,
	        min_df=1,
	        max_df=0.95
	    )
	    classifier = MultinomialNB(alpha=1.0)
	    pipeline = Pipeline([
	        ('tfidf', vectorizer),
	        ('classifier', classifier)
	    ])

	    print("Training model...")
	    pipeline.fit(train_text, train_labels)

	    # Cross-validation runs on the training portion only.
	    print("Performing cross-validation...")
	    fold_scores = cross_val_score(
	        pipeline, train_text, train_labels, cv=5, scoring='accuracy'
	    )
	    print(f"Cross-validation scores: {fold_scores}")
	    print(f"Mean CV accuracy: {fold_scores.mean():.4f} (+/- {fold_scores.std() * 2:.4f})")

	    # Held-out evaluation.
	    print("\nEvaluating on test set...")
	    predictions = pipeline.predict(test_text)
	    test_accuracy = accuracy_score(test_labels, predictions)
	    print(f"Test accuracy: {test_accuracy:.4f}")

	    print("\nClassification Report:")
	    print(classification_report(test_labels, predictions, target_names=class_names))

	    print("\nConfusion Matrix:")
	    print(confusion_matrix(test_labels, predictions))

	    # Rank vocabulary terms by per-class log probability, highest first.
	    print("\nAnalyzing most important features...")
	    vocabulary = pipeline.named_steps['tfidf'].get_feature_names_out()
	    log_probs = pipeline.named_steps['classifier'].feature_log_prob_

	    for class_idx, class_name in enumerate(class_names):
	        best_first = np.argsort(log_probs[class_idx])[-20:][::-1]
	        ranked_terms = [vocabulary[i] for i in best_first]
	        print(f"\nTop 20 features for {class_name}:")
	        print(", ".join(ranked_terms))

	    return pipeline, test_accuracy

	def save_model(pipeline, accuracy, output_path='email_classifier_model.pkl'):
	    """Write the fitted pipeline and descriptive metadata to disk with joblib.

	    The saved object is a dict holding the pipeline, its accuracy, the size
	    of the TF-IDF vocabulary, the current timestamp in ISO format, and two
	    descriptive strings.

	    Args:
	        pipeline: Fitted pipeline exposing a ``'tfidf'`` named step.
	        accuracy: Accuracy value stored alongside the model.
	        output_path: Destination file for the serialized bundle.
	    """
	    print(f"\nSaving model to {output_path}...")

	    # Gather the metadata values before assembling the bundle.
	    vocabulary_size = len(pipeline.named_steps['tfidf'].vocabulary_)
	    trained_at = datetime.now().isoformat()
	    bundle = {
	        'pipeline': pipeline,
	        'accuracy': accuracy,
	        'feature_count': vocabulary_size,
	        'training_date': trained_at,
	        'model_type': 'Multinomial Naive Bayes',
	        'preprocessing': 'TF-IDF with 1-2 grams'
	    }

	    joblib.dump(bundle, output_path)

	    print("Model saved successfully!")
	    print("Model info:")
	    print(f" - Accuracy: {accuracy:.4f}")
	    print(f" - Features: {bundle['feature_count']}")
	    print(f" - Training date: {bundle['training_date']}")

	def test_model_predictions(pipeline):
	    """Print predictions for a fixed set of sample messages.

	    Each sample is preprocessed, classified one at a time, and reported
	    together with the highest class probability and the first two
	    probability entries.

	    Args:
	        pipeline: Fitted model exposing ``predict`` and ``predict_proba``.
	    """
	    banner = "=" * 50
	    print("\n" + banner)
	    print("TESTING MODEL WITH SAMPLE PREDICTIONS")
	    print(banner)

	    samples = [
	        "Hello, please find attached the document you requested.",
	        "Good morning, I'm sharing the report as discussed.",
	        "Hi team, attached is the presentation for tomorrow's meeting.",
	        "Dear all, kindly review the attached files.",
	        "Hello, how are you doing today?",
	        "I will send you the information later.",
	        "Please let me know if you need any clarification.",
	        "The meeting is scheduled for 3 PM tomorrow."
	    ]

	    for raw_message in samples:
	        cleaned = preprocess_text(raw_message)
	        predicted = pipeline.predict([cleaned])[0]
	        class_probs = pipeline.predict_proba([cleaned])[0]
	        top_prob = max(class_probs)

	        # A predicted label of 1 is reported as "Has Attachment".
	        if predicted == 1:
	            verdict = "Has Attachment"
	        else:
	            verdict = "No Attachment"

	        print(f"\nMessage: '{raw_message}'")
	        print(f"Prediction: {verdict} (confidence: {top_prob:.3f})")
	        print(f"Probabilities: No={class_probs[0]:.3f}, Yes={class_probs[1]:.3f}")

	def main():
	    """Run the whole workflow: load data, train, save, and sample-test the model.

	    Returns early, after printing a message, when the dataset cannot be
	    loaded.
	    """
	    rule = "=" * 60
	    print(rule)
	    print("EMAIL ATTACHMENT CLASSIFIER TRAINING")
	    print(rule)

	    # Step 1: dataset
	    X, y = load_data('Synthetic_Email_Dataset.csv')
	    if X is None:
	        print("Failed to load dataset. Exiting...")
	        return

	    # Step 2: fit and evaluate, persist, then print sample predictions
	    pipeline, accuracy = train_model(X, y)
	    save_model(pipeline, accuracy)
	    test_model_predictions(pipeline)

	    print("\n" + rule)
	    print("TRAINING COMPLETED SUCCESSFULLY!")
	    print(rule)
	    print(f"Final model accuracy: {accuracy:.4f}")
	    print("Model saved as 'email_classifier_model.pkl'")
	    print("You can now deploy the API using 'python app.py'")

	# Run the training workflow only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()