Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import pickle | |
| import os | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.preprocessing import LabelEncoder | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.linear_model import LogisticRegression | |
| from config import ( | |
| DATA_PATH, TEXT_COLUMN, LABEL_COLUMNS, | |
| TFIDF_MAX_FEATURES, NGRAM_RANGE, USE_STOPWORDS, | |
| RANDOM_STATE, TEST_SIZE, | |
| MODEL_SAVE_DIR, LABEL_ENCODERS_PATH, TFIDF_VECTORIZER_PATH | |
| ) | |
def load_data(path):
    """Read the CSV at ``path`` and drop rows with missing text or labels.

    A row is discarded when the ``TEXT_COLUMN`` value or any of the
    ``LABEL_COLUMNS`` values is NaN. The surviving rows keep their original
    index labels.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The filtered ``pandas.DataFrame``.
    """
    required_columns = [TEXT_COLUMN] + LABEL_COLUMNS
    frame = pd.read_csv(path)
    return frame.dropna(subset=required_columns)
def save_pickle(obj, path):
    """Pickle ``obj`` to ``path``, creating missing parent directories.

    Args:
        obj: Any picklable object.
        path: Destination file path (``str``, ``bytes`` or ``os.PathLike``).
            An existing file at that location is overwritten. Other values
            accepted by ``open`` (e.g. an integer file descriptor) are passed
            through unchanged.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            opened for writing.
    """
    # Only real filesystem paths have a parent directory to create.
    if isinstance(path, (str, bytes, os.PathLike)):
        parent = os.path.dirname(path)
        # A bare filename has an empty dirname, and os.makedirs("") raises.
        if parent:
            os.makedirs(parent, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f)
def train():
    """Train one logistic-regression classifier per label column and save them.

    Steps:
        1. Load the dataset at ``DATA_PATH`` with ``load_data``.
        2. Fit a TF-IDF vectorizer on the full text column and pickle it to
           ``TFIDF_VECTORIZER_PATH``.
        3. For every column in ``LABEL_COLUMNS``: label-encode the targets,
           split into train/test with ``TEST_SIZE`` and ``RANDOM_STATE``, fit
           a ``LogisticRegression`` on the training part and print its
           accuracy on the held-out part.
        4. Pickle the ``{label: model}`` dict to
           ``MODEL_SAVE_DIR/logreg_model.pkl`` and the ``{label: encoder}``
           dict to ``LABEL_ENCODERS_PATH``.

    Returns:
        None. Results are written to disk and progress is printed to stdout.
    """
    # Create the output directory up front so a missing folder fails fast
    # instead of after all models have been trained.
    if MODEL_SAVE_DIR:
        os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

    print(" Loading data...")
    df = load_data(DATA_PATH)
    X = df[TEXT_COLUMN]

    print(" Fitting TF-IDF vectorizer...")
    stop_words = 'english' if USE_STOPWORDS else None
    tfidf = TfidfVectorizer(
        max_features=TFIDF_MAX_FEATURES,
        ngram_range=NGRAM_RANGE,
        stop_words=stop_words
    )
    # NOTE(review): the vectorizer is fitted on all rows before the split, so
    # the held-out accuracy printed below is slightly optimistic.
    X_tfidf = tfidf.fit_transform(X)
    # Report success only after the file has actually been written.
    save_pickle(tfidf, TFIDF_VECTORIZER_PATH)
    print(f" Saved TF-IDF vectorizer to {TFIDF_VECTORIZER_PATH}")

    models = {}
    label_encoders = {}
    for label in LABEL_COLUMNS:
        print(f"\n Processing label: {label}")
        le = LabelEncoder()
        y = le.fit_transform(df[label])

        print(" Splitting train/test...")
        X_train, X_test, y_train, y_test = train_test_split(
            X_tfidf, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
        )

        print(" Training Logistic Regression model...")
        model = LogisticRegression(
            max_iter=1000,
            random_state=RANDOM_STATE
        )
        model.fit(X_train, y_train)

        # Put the held-out split to use; previously it was computed and dropped.
        accuracy = model.score(X_test, y_test)
        print(f" Held-out accuracy for {label}: {accuracy:.4f}")

        models[label] = model
        label_encoders[label] = le
        print(f" Finished training: {label}")

    models_path = os.path.join(MODEL_SAVE_DIR, "logreg_model.pkl")
    print(f"\n Saving all models to: {models_path}")
    save_pickle(models, models_path)
    print(f" Saving label encoders to: {LABEL_ENCODERS_PATH}")
    save_pickle(label_encoders, LABEL_ENCODERS_PATH)
    print("\n Logistic Regression training complete.")
# Script entry point: run the training pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    train()