Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Standalone script to train the email classifier model | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.naive_bayes import MultinomialNB | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.model_selection import train_test_split, cross_val_score | |
| from sklearn.metrics import classification_report, accuracy_score, confusion_matrix | |
| import joblib | |
| import re | |
| import os | |
| from datetime import datetime | |
def preprocess_text(text: str) -> str:
    """Normalize raw email text before it is vectorized.

    The text is lower-cased, every character that is not a word character,
    whitespace, or one of ``, . - ! ?`` is replaced by a space, and runs of
    whitespace are then collapsed to a single space.

    Args:
        text: The raw message. ``None`` and float NaN (what pandas yields for
            an empty CSV cell) are treated as an empty message; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned text with no leading/trailing whitespace and no repeated
        spaces.
    """
    if not isinstance(text, str):
        # NaN is the only float value that is not equal to itself.
        is_nan = isinstance(text, float) and text != text
        text = "" if text is None or is_nan else str(text)

    text = text.lower()
    # Replace unwanted characters first: each replacement inserts a space, so
    # whitespace must be collapsed afterwards or runs of spaces would remain.
    text = re.sub(r'[^\w\s,.\-!?]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def load_data(file_path: str):
    """Load the CSV dataset and return cleaned messages with their labels.

    The CSV must provide a ``message`` column (raw email text) and a ``label``
    column (the target). Rows where either value is missing are dropped,
    because they cannot be preprocessed or used as a training target.

    Args:
        file_path: Path to the CSV file.

    Returns:
        A ``(messages, labels)`` pair of pandas Series, where ``messages``
        holds the output of ``preprocess_text``. ``(None, None)`` is returned
        (after printing an error) when the file does not exist, a required
        column is absent, or no usable rows remain.
    """
    print(f"Loading dataset from {file_path}...")
    if not os.path.exists(file_path):
        print(f"Error: Dataset file {file_path} not found!")
        return None, None

    df = pd.read_csv(file_path)
    print(f"Dataset loaded: {len(df)} samples")
    print(f"Columns: {list(df.columns)}")

    # Fail with a readable message instead of a bare KeyError further down.
    missing_columns = [col for col in ('message', 'label') if col not in df.columns]
    if missing_columns:
        print(f"Error: Dataset is missing required column(s): {missing_columns}")
        return None, None

    # Basic data info
    print("\nLabel distribution:")
    print(df['label'].value_counts())

    # Empty CSV cells are read as NaN; such rows are unusable for training.
    incomplete = df[['message', 'label']].isna().any(axis=1)
    if incomplete.any():
        print(f"Dropping {int(incomplete.sum())} rows with a missing message or label")
        df = df.loc[~incomplete].copy()

    if df.empty:
        print("Error: Dataset contains no usable rows!")
        return None, None

    # Preprocess messages; astype(str) guards against cells parsed as numbers.
    df['processed_message'] = df['message'].astype(str).apply(preprocess_text)
    return df['processed_message'], df['label']
def train_model(X, y):
    """Fit and evaluate a TF-IDF + Multinomial Naive Bayes pipeline.

    The data is split 80/20 (stratified on ``y``, ``random_state=42``), the
    pipeline is fitted on the training part, scored with 5-fold
    cross-validation on that same training part, and finally evaluated on the
    held-out part. Progress, metrics and the highest-scoring features per
    class are printed along the way.

    Args:
        X: Preprocessed message texts.
        y: Labels aligned with ``X``.

    Returns:
        A ``(pipeline, test_accuracy)`` tuple: the fitted pipeline and its
        accuracy on the held-out test split.
    """
    # NOTE(review): the display names assume exactly two classes whose sorted
    # order is "no attachment" then "has attachment" (e.g. labels 0 and 1) -
    # confirm against the dataset.
    class_names = ['No Attachment', 'Has Attachment']

    print("\nSplitting data...")
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"Training set: {len(train_texts)} samples")
    print(f"Test set: {len(test_texts)} samples")

    # Assemble the vectorizer and classifier into a single estimator.
    print("\nCreating model pipeline...")
    vectorizer = TfidfVectorizer(
        max_features=1000,
        ngram_range=(1, 2),
        stop_words='english',
        lowercase=True,
        min_df=1,
        max_df=0.95
    )
    classifier = MultinomialNB(alpha=1.0)
    pipeline = Pipeline([
        ('tfidf', vectorizer),
        ('classifier', classifier)
    ])

    print("Training model...")
    pipeline.fit(train_texts, train_labels)

    # Cross-validate on the training split only, so the test split stays unseen.
    print("Performing cross-validation...")
    cv_scores = cross_val_score(
        pipeline, train_texts, train_labels, cv=5, scoring='accuracy'
    )
    print(f"Cross-validation scores: {cv_scores}")
    cv_mean = cv_scores.mean()
    cv_spread = cv_scores.std() * 2
    print(f"Mean CV accuracy: {cv_mean:.4f} (+/- {cv_spread:.4f})")

    # Held-out evaluation.
    print("\nEvaluating on test set...")
    predicted_labels = pipeline.predict(test_texts)
    test_accuracy = accuracy_score(test_labels, predicted_labels)
    print(f"Test accuracy: {test_accuracy:.4f}")

    print("\nClassification Report:")
    report = classification_report(
        test_labels, predicted_labels, target_names=class_names
    )
    print(report)

    print("\nConfusion Matrix:")
    print(confusion_matrix(test_labels, predicted_labels))

    # List, per class, the features with the highest log-probability.
    print("\nAnalyzing most important features...")
    vocabulary = pipeline.named_steps['tfidf'].get_feature_names_out()
    log_probs = pipeline.named_steps['classifier'].feature_log_prob_

    for class_idx, class_name in enumerate(class_names):
        # argsort is ascending, so reverse it and keep the first 20 entries.
        best_first = np.argsort(log_probs[class_idx])[::-1][:20]
        print(f"\nTop 20 features for {class_name}:")
        print(", ".join(vocabulary[i] for i in best_first))

    return pipeline, test_accuracy
def save_model(pipeline, accuracy, output_path='email_classifier_model.pkl'):
    """Persist the fitted pipeline together with descriptive metadata.

    A dictionary holding the pipeline, its accuracy, the size of the TF-IDF
    vocabulary, the current timestamp (ISO format) and two descriptive
    strings is written to ``output_path`` with ``joblib.dump``. A short
    summary is printed afterwards.

    Args:
        pipeline: Fitted pipeline containing a ``'tfidf'`` step.
        accuracy: Accuracy value to store and report alongside the model.
        output_path: Destination file for the serialized bundle.
    """
    print(f"\nSaving model to {output_path}...")

    # Collect the metadata values up front, then bundle them with the model.
    vocabulary_size = len(pipeline.named_steps['tfidf'].vocabulary_)
    trained_at = datetime.now().isoformat()

    model_info = dict(
        pipeline=pipeline,
        accuracy=accuracy,
        feature_count=vocabulary_size,
        training_date=trained_at,
        model_type='Multinomial Naive Bayes',
        preprocessing='TF-IDF with 1-2 grams',
    )
    joblib.dump(model_info, output_path)

    summary_lines = (
        "Model saved successfully!",
        "Model info:",
        f" - Accuracy: {accuracy:.4f}",
        f" - Features: {vocabulary_size}",
        f" - Training date: {trained_at}",
    )
    for line in summary_lines:
        print(line)
def test_model_predictions(pipeline, messages=None):
    """Print the model's predictions for a handful of sample messages.

    Despite the ``test_`` prefix this is a manual smoke check that prints
    results, not an automated test.

    Args:
        pipeline: Fitted pipeline exposing ``predict`` and ``predict_proba``.
        messages: Optional iterable of raw message strings to classify. When
            omitted, a built-in list of example emails is used.
    """
    print("\n" + "="*50)
    print("TESTING MODEL WITH SAMPLE PREDICTIONS")
    print("="*50)

    if messages is None:
        messages = [
            "Hello, please find attached the document you requested.",
            "Good morning, I'm sharing the report as discussed.",
            "Hi team, attached is the presentation for tomorrow's meeting.",
            "Dear all, kindly review the attached files.",
            "Hello, how are you doing today?",
            "I will send you the information later.",
            "Please let me know if you need any clarification.",
            "The meeting is scheduled for 3 PM tomorrow."
        ]
    messages = list(messages)
    if not messages:
        # Nothing to classify; avoid calling the model with an empty batch.
        return

    # Classify all messages in one batch so the vectorizer runs once per
    # call instead of twice per message.
    processed = [preprocess_text(msg) for msg in messages]
    predictions = pipeline.predict(processed)
    all_probabilities = pipeline.predict_proba(processed)

    for msg, prediction, probabilities in zip(messages, predictions, all_probabilities):
        confidence = max(probabilities)
        # NOTE(review): assumes labels are encoded as 0 (no attachment) and
        # 1 (has attachment), so column 0/1 of predict_proba line up with
        # "No"/"Yes" - confirm against the training data.
        label = "Has Attachment" if prediction == 1 else "No Attachment"
        print(f"\nMessage: '{msg}'")
        print(f"Prediction: {label} (confidence: {confidence:.3f})")
        print(f"Probabilities: No={probabilities[0]:.3f}, Yes={probabilities[1]:.3f}")
def main(dataset_path='Synthetic_Email_Dataset.csv',
         output_path='email_classifier_model.pkl'):
    """Run the full training workflow: load, train, save, smoke-check.

    Args:
        dataset_path: CSV file to train on.
        output_path: File the trained model bundle is written to.

    Returns:
        ``0`` when training completed, ``1`` when the dataset could not be
        loaded. The value is used as the process exit status when the module
        is run as a script.
    """
    print("="*60)
    print("EMAIL ATTACHMENT CLASSIFIER TRAINING")
    print("="*60)

    # Load data
    X, y = load_data(dataset_path)
    if X is None:
        print("Failed to load dataset. Exiting...")
        # Signal the failure to the caller / shell instead of exiting with 0.
        return 1

    # Train model
    pipeline, accuracy = train_model(X, y)

    # Save model
    save_model(pipeline, accuracy, output_path)

    # Test predictions
    test_model_predictions(pipeline)

    print("\n" + "="*60)
    print("TRAINING COMPLETED SUCCESSFULLY!")
    print("="*60)
    print(f"Final model accuracy: {accuracy:.4f}")
    # Report the path that was actually written rather than a fixed name.
    print(f"Model saved as '{output_path}'")
    print("You can now deploy the API using 'python app.py'")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())