# LogReg_model / train.py
# subbunanepalli's picture
# Create train.py
# ee375e6 verified
# train.py
import os
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from config import (
DATA_PATH, TEXT_COLUMN, LABEL_COLUMNS,
MODEL_SAVE_DIR, VECTORIZER_PATH, MODEL_PATH
)
from utils.helpers import create_text_column
# === Load Dataset ===
# Read the raw CSV and build the text input column row by row with the
# project helper `create_text_column`.
df = pd.read_csv(DATA_PATH)
df[TEXT_COLUMN] = df.apply(create_text_column, axis=1)

# === Train-Test Split ===
# 80/20 split; the fixed random_state keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[TEXT_COLUMN], df[LABEL_COLUMNS], test_size=0.2, random_state=42
)

# === TF-IDF Vectorizer ===
# Fit on the training split only so the held-out rows do not influence the
# learned vocabulary / IDF weights.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)

# === Model ===
# MultiOutputClassifier fits one LogisticRegression per label column.
model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train_tfidf, y_train)

# === Save ===
# Persist the vectorizer and the model as separate artifacts; both are needed
# at inference time.
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
joblib.dump(tfidf, VECTORIZER_PATH)
joblib.dump(model, MODEL_PATH)
print(" Training completed and models saved.")

# === Evaluate ===
# The held-out split was created above but previously never used, so the
# saved model had no quality signal. Score it here, after saving, so that a
# problem during evaluation cannot lose the trained artifacts.
X_test_tfidf = tfidf.transform(X_test)
y_pred = model.predict(X_test_tfidf)
# Element-wise comparison against the predictions, averaged per column, gives
# the fraction of held-out rows predicted correctly for each label.
per_label_accuracy = (y_test == y_pred).mean()
print("Held-out accuracy per label:")
print(per_label_accuracy.to_string())