subbunanepalli committed on
Commit
1a2d931
·
verified ·
1 Parent(s): f83328a

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +60 -48
train.py CHANGED
@@ -1,64 +1,76 @@
1
- import os
2
  import pandas as pd
3
- import joblib
4
- from sklearn.model_selection import train_test_split
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
- from sklearn.linear_model import LogisticRegression
7
- from sklearn.multioutput import MultiOutputClassifier
8
- from sklearn.pipeline import Pipeline
9
  from sklearn.preprocessing import LabelEncoder
 
 
 
10
  from config import (
11
  DATA_PATH, TEXT_COLUMN, LABEL_COLUMNS,
12
- MODEL_SAVE_DIR, LABEL_ENCODERS_PATH,
13
- TFIDF_MAX_FEATURES, NGRAM_RANGE,
14
- USE_STOPWORDS, RANDOM_STATE, TEST_SIZE
15
  )
16
 
17
# Legacy flat training script: fits one TF-IDF + multi-output Logistic
# Regression pipeline over all label columns and persists the artifacts.

# Load the dataset and drop rows missing the text or any of the labels.
print(" Loading dataset...")
df = pd.read_csv(DATA_PATH)
df.dropna(subset=[TEXT_COLUMN] + LABEL_COLUMNS, inplace=True)

# Integer-encode every label column, keeping each fitted encoder so the
# predictions can be mapped back to the original label values later.
label_encoders = {}
for col in LABEL_COLUMNS:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Features (raw text) and targets (one encoded column per label).
X = df[TEXT_COLUMN]
Y = df[LABEL_COLUMNS]

# Train-test split; only the training part is used for fitting below.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Build pipeline: TF-IDF features feeding one Logistic Regression per label.
stop_words = "english" if USE_STOPWORDS else None
pipeline = Pipeline([
    (
        "tfidf",
        TfidfVectorizer(
            max_features=TFIDF_MAX_FEATURES,
            ngram_range=NGRAM_RANGE,
            stop_words=stop_words,
        ),
    ),
    (
        "clf",
        MultiOutputClassifier(
            LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
        ),
    ),
])

# Make sure the output locations exist *before* the (potentially long)
# training run; joblib.dump does not create missing directories and would
# otherwise fail with FileNotFoundError only after training has finished.
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
_encoders_dir = os.path.dirname(LABEL_ENCODERS_PATH)
if _encoders_dir:  # a bare file name has no directory part to create
    os.makedirs(_encoders_dir, exist_ok=True)

# Train model
print(" Training model...")
pipeline.fit(X_train, y_train)

# Save full model pipeline
model_path = os.path.join(MODEL_SAVE_DIR, "logreg_model.pkl")
print(f" Saving model to {model_path}")
joblib.dump(pipeline, model_path)

# Save label encoders
print(f" Saving label encoders to {LABEL_ENCODERS_PATH}")
joblib.dump(label_encoders, LABEL_ENCODERS_PATH)

# Save TF-IDF vectorizer separately (it is also contained in the pipeline
# pickle above; this copy allows loading the vectorizer on its own).
tfidf_vectorizer = pipeline.named_steps["tfidf"]
tfidf_path = os.path.join(MODEL_SAVE_DIR, "tfidf_vectorizer.pkl")
print(f" Saving TF-IDF vectorizer to {tfidf_path}")
joblib.dump(tfidf_vectorizer, tfidf_path)

print("Training complete.")
 
 
 
1
  import pandas as pd
2
+ import pickle
3
+ import os
4
  from sklearn.feature_extraction.text import TfidfVectorizer
 
 
 
5
  from sklearn.preprocessing import LabelEncoder
6
+ from sklearn.model_selection import train_test_split
7
+ from sklearn.linear_model import LogisticRegression
8
+
9
  from config import (
10
  DATA_PATH, TEXT_COLUMN, LABEL_COLUMNS,
11
+ TFIDF_MAX_FEATURES, NGRAM_RANGE, USE_STOPWORDS,
12
+ RANDOM_STATE, TEST_SIZE,
13
+ MODEL_SAVE_DIR, LABEL_ENCODERS_PATH, TFIDF_VECTORIZER_PATH
14
  )
15
 
16
def load_data(path):
    """Read the CSV at *path* and discard incomplete rows.

    A row is removed when it has a missing value in ``TEXT_COLUMN`` or in
    any of the ``LABEL_COLUMNS``.

    Args:
        path: Location of the CSV file, handed straight to ``pd.read_csv``.

    Returns:
        The loaded ``DataFrame`` without the incomplete rows.
    """
    frame = pd.read_csv(path)
    required_columns = [TEXT_COLUMN] + LABEL_COLUMNS
    return frame.dropna(subset=required_columns)
20
 
21
def save_pickle(obj, path):
    """Serialize *obj* to the file at *path* using ``pickle``.

    Missing parent directories of *path* are created first, so saving into
    an output folder that does not exist yet no longer fails with
    ``FileNotFoundError``.

    Args:
        obj: Any picklable object.
        path: Destination file path; an existing file is overwritten.
    """
    parent_dir = os.path.dirname(path)
    # A bare file name has no directory part, and os.makedirs("") would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f)
 
 
 
24
 
25
def train():
    """Train one Logistic Regression classifier per label column.

    Steps:
      1. Load the dataset from ``DATA_PATH`` via ``load_data``.
      2. Fit a TF-IDF vectorizer on the whole text column and pickle it to
         ``TFIDF_VECTORIZER_PATH``.
      3. For every column in ``LABEL_COLUMNS``: label-encode the targets,
         split into train/test, fit ``LogisticRegression`` on the training
         part and print the accuracy on the held-out part.
      4. Pickle the ``{label: model}`` dict to
         ``MODEL_SAVE_DIR/logreg_model.pkl`` and the ``{label: encoder}``
         dict to ``LABEL_ENCODERS_PATH``.

    Returns:
        None. All results are written to disk and progress is printed.
    """
    models_path = os.path.join(MODEL_SAVE_DIR, "logreg_model.pkl")

    # Create every output directory up front: open(..., "wb") does not create
    # missing directories, and failing after training would waste the run.
    for target in (TFIDF_VECTORIZER_PATH, models_path, LABEL_ENCODERS_PATH):
        parent_dir = os.path.dirname(target)
        if parent_dir:  # a bare file name has no directory part to create
            os.makedirs(parent_dir, exist_ok=True)

    print(" Loading data...")
    df = load_data(DATA_PATH)
    X = df[TEXT_COLUMN]

    print(" Fitting TF-IDF vectorizer...")
    stop_words = 'english' if USE_STOPWORDS else None
    tfidf = TfidfVectorizer(
        max_features=TFIDF_MAX_FEATURES,
        ngram_range=NGRAM_RANGE,
        stop_words=stop_words,
    )
    X_tfidf = tfidf.fit_transform(X)

    # Save first, then report, so the "Saved" message is only printed once
    # the file has actually been written.
    save_pickle(tfidf, TFIDF_VECTORIZER_PATH)
    print(f" Saved TF-IDF vectorizer to {TFIDF_VECTORIZER_PATH}")

    models = {}
    label_encoders = {}

    for label in LABEL_COLUMNS:
        print(f"\n Processing label: {label}")
        le = LabelEncoder()
        y = le.fit_transform(df[label])

        print(" Splitting train/test...")
        X_train, X_test, y_train, y_test = train_test_split(
            X_tfidf, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
        )

        print(" Training Logistic Regression model...")
        model = LogisticRegression(
            max_iter=1000,
            random_state=RANDOM_STATE,
        )
        model.fit(X_train, y_train)

        # Put the held-out split to use. NOTE: the vectorizer above was
        # fitted on the full dataset (test rows included), so this figure is
        # a quick sanity check and may be slightly optimistic.
        holdout_accuracy = model.score(X_test, y_test)
        print(f" Holdout accuracy for {label}: {holdout_accuracy:.4f}")

        models[label] = model
        label_encoders[label] = le
        print(f" Finished training: {label}")

    print(f"\n Saving all models to: {models_path}")
    save_pickle(models, models_path)

    print(f" Saving label encoders to: {LABEL_ENCODERS_PATH}")
    save_pickle(label_encoders, LABEL_ENCODERS_PATH)

    print("\n Logistic Regression training complete.")
 
 
 
 
74
 
75
# Start training only when this file is executed as a script; importing it
# as a module just makes the functions above available.
if __name__ == "__main__":
    train()