# train.py
"""Train a multi-label text classifier and persist it to disk.

Steps performed when this script runs:

1. Read the CSV at ``DATA_PATH`` and build ``TEXT_COLUMN`` by applying
   ``create_text_column`` to every row.
2. Split the text and the ``LABEL_COLUMNS`` targets 80/20 (fixed seed).
3. Fit a TF-IDF vectorizer on the training text only.
4. Fit a ``MultiOutputClassifier`` wrapping ``LogisticRegression`` on the
   vectorized training text.
5. Save the vectorizer and the model with joblib.
6. Report per-label accuracy on the held-out split.
"""
import os

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

from config import (
    DATA_PATH,
    TEXT_COLUMN,
    LABEL_COLUMNS,
    MODEL_SAVE_DIR,
    VECTORIZER_PATH,
    MODEL_PATH,
)
from utils.helpers import create_text_column

# === Load Dataset ===
df = pd.read_csv(DATA_PATH)
# create_text_column receives one row at a time (axis=1).
df[TEXT_COLUMN] = df.apply(create_text_column, axis=1)

# === Train-Test Split ===
# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df[TEXT_COLUMN],
    df[LABEL_COLUMNS],
    test_size=0.2,
    random_state=42,
)

# === TF-IDF Vectorizer ===
# Fit on the training text only so the held-out split does not influence
# the learned vocabulary or IDF weights.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)

# === Model ===
model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train_tfidf, y_train)

# === Save ===
# NOTE(review): only MODEL_SAVE_DIR is created here; this presumes both
# VECTORIZER_PATH and MODEL_PATH live inside it -- confirm in config.
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
joblib.dump(tfidf, VECTORIZER_PATH)
joblib.dump(model, MODEL_PATH)

print(" Training completed and models saved.")

# === Evaluate ===
# The held-out split was previously created but never used. Score it with
# the already-fitted vectorizer (transform only, never re-fit). This runs
# after saving so the artifacts are written even if evaluation fails.
X_test_tfidf = tfidf.transform(X_test)
predictions = model.predict(X_test_tfidf)
per_label_accuracy = (predictions == y_test.to_numpy()).mean(axis=0)
for label, accuracy in zip(y_test.columns, per_label_accuracy):
    print(f"Held-out accuracy [{label}]: {accuracy:.4f}")