Spaces:

point9
/

ml_tfidf_logreg_project

Sleeping

App Files Files Community

subbunanepalli committed on Jun 13, 2025

Commit

1a2d931

verified ·

1 Parent(s): f83328a

Update train.py

Browse files

Files changed (1) hide show

train.py +60 -48

train.py CHANGED Viewed

@@ -1,64 +1,76 @@
-import os
 import pandas as pd
-import joblib
-from sklearn.model_selection import train_test_split
 from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.linear_model import LogisticRegression
-from sklearn.multioutput import MultiOutputClassifier
-from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import LabelEncoder
 from config import (
     DATA_PATH, TEXT_COLUMN, LABEL_COLUMNS,
-    MODEL_SAVE_DIR, LABEL_ENCODERS_PATH,
-    TFIDF_MAX_FEATURES, NGRAM_RANGE,
-    USE_STOPWORDS, RANDOM_STATE, TEST_SIZE
 )
-#  Load and preprocess data
-print(" Loading dataset...")
-df = pd.read_csv(DATA_PATH)
-df.dropna(subset=[TEXT_COLUMN] + LABEL_COLUMNS, inplace=True)
-#  Encode each label
-label_encoders = {}
-for col in LABEL_COLUMNS:
-    le = LabelEncoder()
-    df[col] = le.fit_transform(df[col])
-    label_encoders[col] = le
-#  Features and targets
-X = df[TEXT_COLUMN]
-Y = df[LABEL_COLUMNS]
-#  Train-test split
-X_train, X_test, y_train, y_test = train_test_split(
-    X, Y, test_size=TEST_SIZE, random_state=RANDOM_STATE
-)
-#  Build pipeline
-stop_words = "english" if USE_STOPWORDS else None
-pipeline = Pipeline([
-    ('tfidf', TfidfVectorizer(max_features=TFIDF_MAX_FEATURES, ngram_range=NGRAM_RANGE, stop_words=stop_words)),
-    ('clf', MultiOutputClassifier(LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)))
-])
-#  Train model
-print(" Training model...")
-pipeline.fit(X_train, y_train)
-#  Save full model pipeline
-model_path = os.path.join(MODEL_SAVE_DIR, "logreg_model.pkl")
-print(f" Saving model to {model_path}")
-joblib.dump(pipeline, model_path)
-# Save label encoders
-print(f" Saving label encoders to {LABEL_ENCODERS_PATH}")
-joblib.dump(label_encoders, LABEL_ENCODERS_PATH)
-#  Save TF-IDF vectorizer separately
-tfidf_vectorizer = pipeline.named_steps['tfidf']
-tfidf_path = os.path.join(MODEL_SAVE_DIR, "tfidf_vectorizer.pkl")
-print(f" Saving TF-IDF vectorizer to {tfidf_path}")
-joblib.dump(tfidf_vectorizer, tfidf_path)
-print("Training complete.")

 import pandas as pd
+import pickle
+import os
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.preprocessing import LabelEncoder
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LogisticRegression
 from config import (
     DATA_PATH, TEXT_COLUMN, LABEL_COLUMNS,
+    TFIDF_MAX_FEATURES, NGRAM_RANGE, USE_STOPWORDS,
+    RANDOM_STATE, TEST_SIZE,
+    MODEL_SAVE_DIR, LABEL_ENCODERS_PATH, TFIDF_VECTORIZER_PATH
 )
+def load_data(path):
+    """Read the CSV at ``path`` and return only the complete rows.
+
+    A row is dropped when the text column or any of the label columns
+    holds a missing value.
+    """
+    required_columns = [TEXT_COLUMN] + LABEL_COLUMNS
+    return pd.read_csv(path).dropna(subset=required_columns)
+def save_pickle(obj, path):
+    """Serialize ``obj`` to the file at ``path`` using :mod:`pickle`.
+
+    The destination directory is created when it does not exist yet, so a
+    fresh checkout without the model directory does not fail on the first
+    save. An existing file at ``path`` is overwritten.
+    """
+    parent_dir = os.path.dirname(path)
+    # An empty dirname means "current directory"; os.makedirs("") would raise.
+    if parent_dir:
+        os.makedirs(parent_dir, exist_ok=True)
+    with open(path, "wb") as f:
+        pickle.dump(obj, f)
+def train():
+    """Train one logistic-regression classifier per label column and save the artifacts.
+
+    Steps:
+      1. Load the dataset from ``DATA_PATH`` (incomplete rows are dropped by
+         ``load_data``).
+      2. Fit a single TF-IDF vectorizer on the text column and pickle it to
+         ``TFIDF_VECTORIZER_PATH``.
+      3. For every column in ``LABEL_COLUMNS``: label-encode the targets,
+         split into train/test, fit a ``LogisticRegression`` and report its
+         accuracy on the held-out split.
+      4. Pickle the ``{label: model}`` dict to
+         ``MODEL_SAVE_DIR/logreg_model.pkl`` and the ``{label: LabelEncoder}``
+         dict to ``LABEL_ENCODERS_PATH``.
+
+    Raises:
+        ValueError: if no rows remain after dropping incomplete ones.
+    """
+    print(" Loading data...")
+    df = load_data(DATA_PATH)
+    if df.empty:
+        # Fail early with a clear message instead of an opaque vectorizer error.
+        raise ValueError(f"No usable rows found in {DATA_PATH}")
+    X = df[TEXT_COLUMN]
+    print(" Fitting TF-IDF vectorizer...")
+    stop_words = 'english' if USE_STOPWORDS else None
+    tfidf = TfidfVectorizer(
+        max_features=TFIDF_MAX_FEATURES,
+        ngram_range=NGRAM_RANGE,
+        stop_words=stop_words
+    )
+    X_tfidf = tfidf.fit_transform(X)
+    # Announce the save only after it has actually succeeded.
+    save_pickle(tfidf, TFIDF_VECTORIZER_PATH)
+    print(f" Saved TF-IDF vectorizer to {TFIDF_VECTORIZER_PATH}")
+    models = {}
+    label_encoders = {}
+    for label in LABEL_COLUMNS:
+        print(f"\n Processing label: {label}")
+        le = LabelEncoder()
+        y = le.fit_transform(df[label])
+        print("    Splitting train/test...")
+        X_train, X_test, y_train, y_test = train_test_split(
+            X_tfidf, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
+        )
+        print("    Training Logistic Regression model...")
+        model = LogisticRegression(
+            max_iter=1000,
+            random_state=RANDOM_STATE
+        )
+        model.fit(X_train, y_train)
+        # NOTE: the vectorizer above was fitted on the full corpus (train and
+        # test rows alike), so this held-out score is slightly optimistic.
+        accuracy = model.score(X_test, y_test)
+        print(f"    Held-out accuracy: {accuracy:.4f}")
+        models[label] = model
+        label_encoders[label] = le
+        print(f" Finished training: {label}")
+    models_path = os.path.join(MODEL_SAVE_DIR, "logreg_model.pkl")
+    print(f"\n Saving all models to: {models_path}")
+    save_pickle(models, models_path)
+    print(f" Saving label encoders to: {LABEL_ENCODERS_PATH}")
+    save_pickle(label_encoders, LABEL_ENCODERS_PATH)
+    print("\n Logistic Regression training complete.")
+# Run training only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    train()