"""Train an SMS text classifier and persist it to disk.

Reads the labelled messages from ``data/sms_process_data_main.xlsx``
(columns ``MessageText`` and ``label``), fits a TF-IDF vectorizer and a
logistic-regression classifier on an 80% training split, and pickles both
fitted objects into the ``models`` directory.
"""

import os
import pickle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Load the labelled SMS data set.
file_path = "data/sms_process_data_main.xlsx"
df = pd.read_excel(file_path)

# Rows with a missing message or label cannot be used for training:
# TfidfVectorizer rejects NaN documents and the classifier rejects NaN
# targets, so drop them up front instead of failing mid-pipeline.
df = df.dropna(subset=['MessageText', 'label'])

# Features are the raw message text, targets are the labels.
# Cells may come back from Excel as non-string values (e.g. a message made
# only of digits); the vectorizer needs str, so normalise the column here.
X = df['MessageText'].astype(str)
y = df['label']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
# NOTE: the held-out part (X_test / y_test) is not used further in this script.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the vocabulary (capped at 5000 terms) from the training split only
# and turn the training messages into TF-IDF vectors.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)

# Fit the classifier on the vectorized training messages.
classifier = LogisticRegression()
classifier.fit(X_train_vec, y_train)

# Make sure the output directory exists; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
models_dir = "models"
os.makedirs(models_dir, exist_ok=True)

# Persist the fitted classifier.
with open(os.path.join(models_dir, 'sms_classifier_model.pkl'), 'wb') as model_file:
    pickle.dump(classifier, model_file)

# Persist the fitted vectorizer; it is needed to transform new messages
# the same way before they are passed to the classifier.
with open(os.path.join(models_dir, 'tfidf_vectorizer.pkl'), 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print("Model and vectorizer saved successfully!")