# linear / services /train_model.py
# tharu22's picture
# message
# 877a8c0
# app/services/train_model.py
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pickle
import os
# Train a TF-IDF + logistic-regression SMS classifier and pickle the fitted
# classifier and vectorizer. Everything below runs at module top level, so
# importing this module performs the training as well.

# Load the dataset
file_path = "data/sms_process_data_main.xlsx"
df = pd.read_excel(file_path)

# Drop rows with a missing message or label: TfidfVectorizer rejects NaN
# documents, so a single blank spreadsheet cell would otherwise abort training.
df = df.dropna(subset=['MessageText', 'label'])

# Prepare the features and labels
# Cast to str because numeric spreadsheet cells come back from read_excel as
# numbers, and the vectorizer's preprocessing expects strings.
X = df['MessageText'].astype(str)  # SMS messages
y = df['label']  # Labels: 'Transaction' or 'Offer'

# Split the data into training and testing sets (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer on the training data only and transform the training data
X_train_vec = vectorizer.fit_transform(X_train)

# Initialize and train the logistic regression model
classifier = LogisticRegression()
classifier.fit(X_train_vec, y_train)

# Report accuracy on the held-out split. Use transform (not fit_transform) so
# the test messages are encoded with the vocabulary learned from training data.
X_test_vec = vectorizer.transform(X_test)
accuracy = classifier.score(X_test_vec, y_test)
print(f"Held-out accuracy: {accuracy:.4f}")

# Save the trained model and vectorizer
models_dir = "models"
# exist_ok avoids the check-then-create race of testing os.path.exists first.
os.makedirs(models_dir, exist_ok=True)

# Save the classifier model
with open(os.path.join(models_dir, 'sms_classifier_model.pkl'), 'wb') as model_file:
    pickle.dump(classifier, model_file)

# Save the vectorizer (needed at inference time to encode new messages the
# same way the training data was encoded)
with open(os.path.join(models_dir, 'tfidf_vectorizer.pkl'), 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print("Model and vectorizer saved successfully!")