Spaces:
Sleeping
Sleeping
| # train.py | |
| import os | |
| import pandas as pd | |
| import joblib | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.multioutput import MultiOutputClassifier | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.pipeline import Pipeline | |
| from config import ( | |
| DATA_PATH, TEXT_COLUMN, LABEL_COLUMNS, | |
| MODEL_SAVE_DIR, VECTORIZER_PATH, MODEL_PATH | |
| ) | |
| from utils.helpers import create_text_column | |
# === Load dataset ===
# Read the raw CSV and build the model's text input row by row with the
# project helper `create_text_column` (its exact output format is defined in
# utils.helpers, not here).
df = pd.read_csv(DATA_PATH)
df[TEXT_COLUMN] = df.apply(create_text_column, axis=1)

# === Train/test split ===
# Hold out 20% of the rows; the fixed seed keeps the split reproducible
# between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df[TEXT_COLUMN], df[LABEL_COLUMNS], test_size=0.2, random_state=42
)

# === TF-IDF vectorizer ===
# The vocabulary (capped at 5000 terms) is fitted on the training rows only,
# so the held-out rows do not influence the features.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)

# === Model ===
# MultiOutputClassifier fits one LogisticRegression per label column.
model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train_tfidf, y_train)

# === Save ===
# The vectorizer and the classifier are persisted as two separate artifacts;
# inference code must load both and apply them in the same order.
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
joblib.dump(tfidf, VECTORIZER_PATH)
joblib.dump(model, MODEL_PATH)
print(" Training completed and models saved.")

# === Evaluate ===
# The held-out split used to be created and then ignored, so the saved model
# shipped without any quality signal. Score it here with the already-fitted
# vectorizer (transform only, never re-fit on test data).
# This runs after saving on purpose: a failure while scoring cannot prevent
# the artifacts above from being written.
# MultiOutputClassifier.score counts a row as correct only when every label
# column is predicted correctly.
X_test_tfidf = tfidf.transform(X_test)
test_accuracy = model.score(X_test_tfidf, y_test)
print(f"Held-out exact-match accuracy: {test_accuracy:.4f}")