Upload 3 files
- training/train_model_legacy.py +193 -0
- training/train_model_lite.py +212 -0
- training/train_model_mbo.py +265 -0
training/train_model_legacy.py
ADDED
@@ -0,0 +1,193 @@
import os
import numpy as np
import pandas as pd
import nltk, string, logging, pickle
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def transform_text(text):
    """Lowercase, tokenize, keep alphanumeric non-stopword tokens, then stem."""
    ps = PorterStemmer()
    text = text.lower()
    text = nltk.word_tokenize(text)

    y = []
    for i in text:
        if i.isalnum():
            y.append(i)

    text = y[:]
    y.clear()

    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)

    text = y[:]
    y.clear()

    for i in text:
        y.append(ps.stem(i))

    return " ".join(y)

def plot_dataset_insights(df):
    plt.figure(figsize=(15, 5))

    plt.subplot(131)
    sns.histplot(data=df, x='num_characters', hue='target', bins=50)
    plt.title('Message Length Distribution')

    plt.subplot(132)
    df['target'].value_counts().plot(kind='bar')
    plt.title('Class Distribution')

    plt.subplot(133)
    sns.boxplot(data=df, x='target', y='num_words')
    plt.title('Word Count by Class')

    plt.tight_layout()
    plt.savefig('./graphs/dataset_insights.png')
    plt.close()

def plot_word_clouds(df):
    from wordcloud import WordCloud
    plt.figure(figsize=(15, 5))

    # Map text labels to numeric (kept for reference; not used below)
    df['target_num'] = df['target'].map({'ham': 0, 'spam': 1})

    for idx, label in enumerate(['ham', 'spam']):
        # Gather all transformed text for the current label
        text = ' '.join(df[df['target'] == label]['transformed_text'])

        if not text.strip():
            logger.warning(f"No text found for label: {label}")
            continue

        try:
            wordcloud = WordCloud(width=800, height=400).generate(text)
            plt.subplot(1, 2, idx+1)
            plt.imshow(wordcloud)
            plt.axis('off')
            plt.title(f'Word Cloud - {label.upper()}')
        except Exception as e:
            logger.error(f"Error generating wordcloud for {label}: {e}")

    plt.savefig('./graphs/wordclouds.png')
    plt.close()

def plot_performance_metrics(y_test, y_pred, model):
    plt.figure(figsize=(15, 5))

    plt.subplot(131)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')

    plt.subplot(132)
    performance_df = pd.DataFrame({
        'Metric': ['Accuracy', 'Precision'],
        'Score': [accuracy_score(y_test, y_pred), precision_score(y_test, y_pred)]
    })
    sns.barplot(x='Metric', y='Score', data=performance_df)
    plt.title('Model Performance')

    plt.subplot(133)
    etc = model.named_estimators_['et']
    importances = pd.Series(etc.feature_importances_)
    importances.nlargest(10).plot(kind='bar')
    plt.title('Top 10 Important Features')

    plt.tight_layout()
    plt.savefig('./graphs/performance_metrics.png')
    plt.close()

def save_metrics(metrics):
    with open('./models/metrics.txt', 'w') as f:
        for metric, value in metrics.items():
            f.write(f"{metric}: {value:.4f}\n")

def main():
    try:
        # Ensure output directories exist before writing plots and models
        os.makedirs('./graphs', exist_ok=True)
        os.makedirs('./models', exist_ok=True)

        # Load and preprocess data
        logger.info("Loading data...")
        df = pd.read_csv('./data/spam.csv', encoding='latin-1')
        df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
        df = df.rename(columns={'v1': 'target', 'v2': 'text'})

        logger.info(f"Target value counts:\n{df['target'].value_counts()}")

        # Add numerical features
        df['num_characters'] = df['text'].apply(len)
        df['num_words'] = df['text'].apply(lambda x: len(nltk.word_tokenize(x)))
        df['num_sentences'] = df['text'].apply(lambda x: len(nltk.sent_tokenize(x)))

        logger.info("Transforming text...")
        df['transformed_text'] = df['text'].apply(transform_text)

        # Verify transformed text
        logger.info(f"Sample transformed text:\n{df['transformed_text'].head()}")

        logger.info("Generating visualizations...")
        plot_dataset_insights(df)
        plot_word_clouds(df)

        # Text vectorization
        tfidf = TfidfVectorizer(max_features=3000)
        X = tfidf.fit_transform(df['transformed_text']).toarray()
        # Convert target to numeric for the model (spam=1, ham=0)
        y = (df['target'] == 'spam').astype(int)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

        # Create ensemble
        logger.info("Training model...")
        svc = SVC(kernel='sigmoid', gamma=1.0, probability=True)
        mnb = MultinomialNB()
        etc = ExtraTreesClassifier(n_estimators=50, random_state=2)

        voting = VotingClassifier([('svm', svc), ('nb', mnb), ('et', etc)], voting='soft')
        voting.fit(X_train, y_train)

        y_pred = voting.predict(X_test)

        metrics = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred)
        }

        save_metrics(metrics)
        for metric, value in metrics.items():
            logger.info(f"{metric}: {value:.4f}")

        plot_performance_metrics(y_test, y_pred, voting)

        logger.info("Saving models...")
        with open('./models/vectorizer.pkl', 'wb') as f:
            pickle.dump(tfidf, f)
        with open('./models/model.pkl', 'wb') as f:
            pickle.dump(voting, f)

        logger.info("Training completed successfully")

    except Exception as e:
        logger.error(f"An error occurred: {e}")
        raise

if __name__ == "__main__":
    try:
        nltk.download('punkt')
        nltk.download('punkt_tab')
        nltk.download('stopwords')
        main()
    except Exception as e:
        print(f"Fatal error: {e}")
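For reference, a minimal inference sketch against the artifacts this script saves (./models/vectorizer.pkl and ./models/model.pkl). The classify_message helper is hypothetical and assumes the repo root is on the import path; incoming text must pass through the same transform_text preprocessing used at training time.

import pickle

# Assumes transform_text from the script above is importable (and NLTK data downloaded)
from training.train_model_legacy import transform_text

def classify_message(message: str) -> str:
    # Load the fitted TF-IDF vectorizer and voting ensemble saved by main()
    with open('./models/vectorizer.pkl', 'rb') as f:
        tfidf = pickle.load(f)
    with open('./models/model.pkl', 'rb') as f:
        model = pickle.load(f)

    # Apply the same preprocessing and vectorization as training (dense, as in fit)
    vector = tfidf.transform([transform_text(message)]).toarray()
    return 'spam' if model.predict(vector)[0] == 1 else 'ham'

print(classify_message("WIN a FREE prize now! Text CLAIM to enter"))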
training/train_model_lite.py
ADDED
@@ -0,0 +1,212 @@
import os
import pandas as pd
import nltk, string, logging, pickle
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Download required NLTK data
try:
    nltk.download('punkt')
    nltk.download('punkt_tab')
    nltk.download('wordnet')
    nltk.download('stopwords')
except Exception as e:
    logger.error(f"Failed to download NLTK data: {e}")

def improved_transform_text(text):
    try:
        lemmatizer = WordNetLemmatizer()
        # Build the stopword set once per call instead of once per token
        stop_words = set(stopwords.words('english'))

        text = str(text).lower()
        words = nltk.word_tokenize(text)

        words = [lemmatizer.lemmatize(word) for word in words
                 if word.isalnum() and
                 word not in stop_words and
                 word not in string.punctuation]

        return " ".join(words)
    except Exception as e:
        logger.error(f"Error in text transformation: {e}")
        return text

def extract_features(df):
    try:
        df['text_length'] = df['text'].str.len()
        df['word_count'] = df['text'].str.split().str.len()
        df['unique_word_count'] = df['text'].apply(lambda x: len(set(str(x).split())))
        df['uppercase_count'] = df['text'].apply(lambda x: sum(1 for c in str(x) if c.isupper()))
        df['special_char_count'] = df['text'].apply(lambda x: sum(not c.isalnum() for c in str(x)))
        return df
    except Exception as e:
        logger.error(f"Error in feature extraction: {e}")
        return df

def create_optimized_ensemble():
    try:
        svc = SVC(kernel='rbf', C=10, gamma='auto', probability=True, random_state=42)
        mnb = MultinomialNB(alpha=0.1)
        etc = ExtraTreesClassifier(n_estimators=200, max_depth=None,
                                   min_samples_split=2, random_state=42)

        estimators = [('svc', svc), ('mnb', mnb), ('etc', etc)]
        voting_clf = VotingClassifier(estimators=estimators,
                                      voting='soft',
                                      weights=[2, 1, 2])
        return voting_clf
    except Exception as e:
        logger.error(f"Error creating ensemble: {e}")
        raise

def plot_dataset_insights(df):
    plt.figure(figsize=(15, 5))

    # Message length distribution
    plt.subplot(131)
    sns.histplot(data=df, x='text_length', hue='target', bins=50)
    plt.title('Message Length Distribution')

    # Class distribution
    plt.subplot(132)
    df['target'].value_counts().plot(kind='bar')
    plt.title('Class Distribution')

    # Word count distribution
    plt.subplot(133)
    sns.boxplot(data=df, x='target', y='word_count')
    plt.title('Word Count by Class')

    plt.tight_layout()
    plt.savefig('./graphs/dataset_insights.png')
    plt.close()

def plot_word_clouds(df):
    from wordcloud import WordCloud

    plt.figure(figsize=(15, 5))

    for idx, label in enumerate(['ham', 'spam']):
        text = ' '.join(df[df['target'] == label]['transformed_text'])
        wordcloud = WordCloud(width=800, height=400).generate(text)

        plt.subplot(1, 2, idx+1)
        plt.imshow(wordcloud)
        plt.axis('off')
        plt.title(f'Word Cloud - {label.upper()}')

    plt.savefig('./graphs/wordclouds.png')
    plt.close()

def plot_performance_metrics(y_test, y_pred, model):
    plt.figure(figsize=(15, 5))

    # Confusion Matrix
    plt.subplot(131)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')

    # Classification Report Visualization
    plt.subplot(132)
    report = classification_report(y_test, y_pred, output_dict=True)
    sns.heatmap(pd.DataFrame(report).iloc[:-1, :].T, annot=True, cmap='RdYlGn')
    plt.title('Classification Report')

    # Feature Importance (for ExtraTreesClassifier)
    plt.subplot(133)
    etc = model.named_estimators_['etc']
    importances = pd.Series(etc.feature_importances_)
    importances.nlargest(10).plot(kind='bar')
    plt.title('Top 10 Important Features')

    plt.tight_layout()
    plt.savefig('./graphs/performance_metrics.png')
    plt.close()

def save_metrics(metrics):
    with open('./models/metrics.txt', 'w') as f:
        for metric, value in metrics.items():
            f.write(f"{metric}: {value:.4f}\n")

def main():
    try:
        # Ensure output directories exist before writing plots and models
        os.makedirs('./graphs', exist_ok=True)
        os.makedirs('./models', exist_ok=True)

        # Load and preprocess data
        df = pd.read_csv('./data/spam.csv', encoding='latin-1')
        df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, errors='ignore')
        df = df.rename(columns={'v1': 'target', 'v2': 'text'})

        logger.info("Preprocessing text...")
        df['transformed_text'] = df['text'].apply(improved_transform_text)
        df = extract_features(df)

        logger.info("Generating dataset insights...")
        plot_dataset_insights(df)
        plot_word_clouds(df)

        # Vectorization with optimized parameters
        tfidf = TfidfVectorizer(
            max_features=5000,
            ngram_range=(1, 3),
            min_df=2,
            max_df=0.95
        )

        X = tfidf.fit_transform(df['transformed_text'])
        y = (df['target'] == 'spam').astype(int)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        logger.info("Training model...")
        model = create_optimized_ensemble()
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        metrics = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred),
            "F1": f1_score(y_test, y_pred)
        }

        # Save metrics to file
        save_metrics(metrics)

        for metric, value in metrics.items():
            logger.info(f"{metric}: {value:.4f}")

        plot_performance_metrics(y_test, y_pred, model)

        with open('./models/vectorizer_optimized.pkl', 'wb') as f:
            pickle.dump(tfidf, f)
        with open('./models/model_optimized.pkl', 'wb') as f:
            pickle.dump(model, f)

        logger.info(f"Training completed. Metrics:\n{metrics}")

    except Exception as e:
        logger.error(f"An error occurred: {e}")
        raise

if __name__ == "__main__":
    main()
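As a quick sanity check on the TfidfVectorizer settings above, a toy sketch (illustrative corpus, not the spam data) showing how ngram_range=(1, 3) with min_df=2 keeps only the unigrams, bigrams, and trigrams that recur across documents:

from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: only n-grams appearing in at least 2 documents survive min_df=2
docs = [
    "free prize call now",
    "free prize claim now",
    "see you at lunch",
    "see you at dinner",
]
tfidf = TfidfVectorizer(ngram_range=(1, 3), min_df=2)
X = tfidf.fit_transform(docs)
print(sorted(tfidf.get_feature_names_out()))
# ['at', 'free', 'free prize', 'now', 'prize', 'see', 'see you', 'see you at', 'you', 'you at']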
training/train_model_mbo.py
ADDED
@@ -0,0 +1,265 @@
import os
import numpy as np
import pandas as pd
import string, logging, pickle, torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score
from torch.cuda import is_available as cuda_available

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class MonarchButterflyOptimizer:
    def __init__(self, bounds, n_butterflies=20, p_period=1.2, migration_ratio=0.85, max_iter=30, use_gpu=False):
        self.bounds = bounds
        self.n_butterflies = n_butterflies
        self.p_period = p_period
        self.migration_ratio = migration_ratio
        self.max_iter = max_iter
        self.best_solution = None
        self.best_fitness = float('-inf')

        # GPU setup
        self.use_gpu = use_gpu and cuda_available()
        self.device = torch.device('cuda' if self.use_gpu else 'cpu')
        logger.info(f"Using device: {self.device}")

    def initialize(self):
        try:
            population = []
            for _ in range(self.n_butterflies):
                butterfly = {}
                for param, (low, high) in self.bounds.items():
                    # Integer bounds get integer samples; float bounds get uniform samples
                    if isinstance(low, int) and isinstance(high, int):
                        butterfly[param] = int(torch.randint(low, high+1, (1,), device=self.device).item())
                    else:
                        butterfly[param] = float(torch.rand(1, device=self.device).item() * (high - low) + low)
                population.append(butterfly)
            return population
        except RuntimeError as e:
            logger.error(f"CUDA error during initialization: {e}")
            self.device = torch.device('cpu')
            logger.info("Falling back to CPU")
            return self.initialize()

    def migration(self, population):
        try:
            new_population = []
            migration_tensor = torch.rand(len(population), device=self.device)

            for idx, butterfly in enumerate(population):
                if migration_tensor[idx].item() < self.migration_ratio:
                    # Move the butterfly toward the best solution found so far
                    new_butterfly = {}
                    for param in butterfly:
                        r = torch.rand(1, device=self.device).item()
                        new_val = butterfly[param] + self.p_period * r * (self.best_solution[param] - butterfly[param])
                        new_butterfly[param] = self.clip(new_val, param)
                    new_population.append(new_butterfly)
                else:
                    new_population.append(butterfly.copy())
            return new_population
        except RuntimeError as e:
            logger.error(f"CUDA error during migration: {e}")
            self.device = torch.device('cpu')
            logger.info("Falling back to CPU")
            return self.migration(population)

    def clip(self, value, param):
        low, high = self.bounds[param]
        if isinstance(low, int) and isinstance(high, int):
            return int(np.clip(value, low, high))
        return np.clip(value, low, high)

    def optimize(self, fitness_func):
        population = self.initialize()

        for _ in range(self.max_iter):
            for butterfly in population:
                fitness = fitness_func(butterfly)
                if fitness > self.best_fitness:
                    self.best_fitness = fitness
                    self.best_solution = butterfly.copy()

            population = self.migration(population)

        return self.best_solution, self.best_fitness

def plot_dataset_insights(df):
    plt.figure(figsize=(15, 5))

    plt.subplot(131)
    sns.histplot(data=df, x='feature_length', hue='target', bins=50)
    plt.title('Message Length Distribution')

    plt.subplot(132)
    df['target'].value_counts().plot(kind='bar')
    plt.title('Class Distribution')

    plt.subplot(133)
    sns.boxplot(data=df, x='target', y='word_count')
    plt.title('Word Count by Class')

    plt.tight_layout()
    plt.savefig('./graphs/dataset_insights.png')
    plt.close()

def plot_word_clouds(df):
    from wordcloud import WordCloud
    plt.figure(figsize=(15, 5))

    for idx, label in enumerate(['ham', 'spam']):
        text = ' '.join(df[df['target'] == label]['transformed_text'])
        wordcloud = WordCloud(width=800, height=400).generate(text)

        plt.subplot(1, 2, idx+1)
        plt.imshow(wordcloud)
        plt.axis('off')
        plt.title(f'Word Cloud - {label.upper()}')

    plt.savefig('./graphs/wordclouds.png')
    plt.close()

def plot_performance_metrics(y_test, y_pred, model):
    plt.figure(figsize=(15, 5))

    plt.subplot(131)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')

    plt.subplot(132)
    report = classification_report(y_test, y_pred, output_dict=True)
    sns.heatmap(pd.DataFrame(report).iloc[:-1, :].T, annot=True, cmap='RdYlGn')
    plt.title('Classification Report')

    plt.subplot(133)
    etc = model.named_estimators_['etc']
    importances = pd.Series(etc.feature_importances_)
    importances.nlargest(10).plot(kind='bar')
    plt.title('Top 10 Important Features')

    plt.tight_layout()
    plt.savefig('./graphs/performance_metrics.png')
    plt.close()

def save_metrics(metrics):
    with open('./models/metrics.txt', 'w') as f:
        for metric, value in metrics.items():
            f.write(f"{metric}: {value:.4f}\n")

def create_optimized_ensemble(X_train, y_train, mbo_params):
    param_bounds = {
        'svc_C': (0.1, 20.0),
        'svc_gamma': (0.001, 1.0),
        'mnb_alpha': (0.1, 2.0),
        'etc_n_estimators': (100, 300),
        'w1': (0, 5),
        'w2': (0, 5),
        'w3': (0, 5)
    }

    mbo = MonarchButterflyOptimizer(
        param_bounds,
        n_butterflies=int(mbo_params.get('n_butterflies', 20)),
        p_period=float(mbo_params.get('p_period', 1.2)),
        migration_ratio=float(mbo_params.get('migration_ratio', 0.85)),
        max_iter=int(mbo_params.get('max_iter', 30)),
        use_gpu=bool(mbo_params.get('use_gpu', False))
    )

    def fitness_function(params):
        svc = SVC(kernel='rbf', C=params['svc_C'],
                  gamma=params['svc_gamma'], probability=True)
        mnb = MultinomialNB(alpha=params['mnb_alpha'])
        etc = ExtraTreesClassifier(n_estimators=int(params['etc_n_estimators']))

        estimators = [('svc', svc), ('mnb', mnb), ('etc', etc)]
        weights = [params['w1'], params['w2'], params['w3']]
        # All-zero voting weights would make soft voting undefined; score them as worthless
        if sum(weights) == 0:
            return 0.0

        clf = VotingClassifier(estimators=estimators, voting='soft', weights=weights)
        scores = cross_val_score(clf, X_train, y_train, cv=5)
        return np.mean(scores)

    # Run MBO with the configured optimizer
    best_params, _ = mbo.optimize(fitness_function)
    # Create final model with optimized parameters
    svc = SVC(kernel='rbf', C=best_params['svc_C'],
              gamma=best_params['svc_gamma'], probability=True)
    mnb = MultinomialNB(alpha=best_params['mnb_alpha'])
    etc = ExtraTreesClassifier(n_estimators=int(best_params['etc_n_estimators']))

    estimators = [('svc', svc), ('mnb', mnb), ('etc', etc)]
    weights = [best_params['w1'], best_params['w2'], best_params['w3']]

    return VotingClassifier(estimators=estimators, voting='soft', weights=weights)

def main(mbo_params=None):
    try:
        # Ensure output directories exist before writing plots and models
        os.makedirs('./graphs', exist_ok=True)
        os.makedirs('./models', exist_ok=True)

        logger.info("Loading data...")
        # Load and preprocess data
        df = pd.read_csv('./data/spam.csv', encoding='latin-1')
        df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
        df = df.rename(columns={'v1': 'target', 'v2': 'text'})

        logger.info("Preprocessing text...")
        df['transformed_text'] = df['text'].apply(lambda x: x.lower().translate(str.maketrans('', '', string.punctuation)))
        df['word_count'] = df['transformed_text'].str.split().str.len()
        df['feature_length'] = df['transformed_text'].apply(len)

        logger.info("Generating visualizations...")
        plot_dataset_insights(df)
        plot_word_clouds(df)

        tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
        X = tfidf.fit_transform(df['transformed_text'])
        y = (df['target'] == 'spam').astype(int)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        logger.info("Training model with MBO...")
        if mbo_params and mbo_params.get('use_gpu'):
            logger.info("GPU acceleration enabled")
        model = create_optimized_ensemble(X_train, y_train, mbo_params or {})

        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        metrics = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred),
            "F1": f1_score(y_test, y_pred)
        }

        save_metrics(metrics)
        for metric, value in metrics.items():
            logger.info(f"{metric}: {value:.4f}")

        plot_performance_metrics(y_test, y_pred, model)

        logger.info("Saving models...")
        with open('./models/vectorizer_mbo.pkl', 'wb') as f:
            pickle.dump(tfidf, f)
        with open('./models/model_mbo.pkl', 'wb') as f:
            pickle.dump(model, f)

        logger.info("MBO optimization completed successfully")

    except Exception as e:
        logger.error(f"An error occurred: {e}")
        raise

if __name__ == "__main__":
    main()
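A minimal sketch of driving MonarchButterflyOptimizer on its own, assuming train_model_mbo.py is importable from the working directory; the quadratic fitness is a toy stand-in for the cross-validated ensemble score used above:

# Hypothetical standalone usage; assumes the training directory is on the import path
from train_model_mbo import MonarchButterflyOptimizer

# One float-bounded parameter and one integer-bounded parameter
bounds = {'x': (-5.0, 5.0), 'k': (1, 10)}
mbo = MonarchButterflyOptimizer(bounds, n_butterflies=10, max_iter=15)

def fitness(params):
    # Maximized (fitness 0.0) near x = 2; 'k' only exercises integer handling
    return -(params['x'] - 2.0) ** 2

best, score = mbo.optimize(fitness)
print(best, score)  # expect best['x'] close to 2.0, score close to 0.0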