subbunanepalli committed on
Commit
80ed56a
·
verified ·
1 Parent(s): 37b08eb

Create train.py

Browse files
Files changed (1) hide show
  1. train.py +64 -0
train.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Train a multi-output text classifier and persist its artifacts.

The script reads the CSV at ``DATA_PATH``, drops rows that are missing the
text column or any label column, integer-encodes every label column with its
own ``LabelEncoder``, and fits a ``TfidfVectorizer`` ->
``MultiOutputClassifier(LogisticRegression)`` pipeline on the training split.

Artifacts written with ``joblib``:
    * ``<MODEL_SAVE_DIR>/logreg_model.pkl``     - the fitted pipeline
    * ``LABEL_ENCODERS_PATH``                   - dict of column -> LabelEncoder
    * ``<MODEL_SAVE_DIR>/tfidf_vectorizer.pkl`` - the fitted TF-IDF step

All settings are imported from the project-local ``config`` module.
"""
import os

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from config import (
    DATA_PATH, TEXT_COLUMN, LABEL_COLUMNS,
    MODEL_SAVE_DIR, LABEL_ENCODERS_PATH,
    TFIDF_MAX_FEATURES, NGRAM_RANGE,
    USE_STOPWORDS, RANDOM_STATE, TEST_SIZE
)

# Load and preprocess data
print(" Loading dataset...")
df = pd.read_csv(DATA_PATH)
# Rows missing the text or any of the labels cannot be used for training.
df.dropna(subset=[TEXT_COLUMN] + LABEL_COLUMNS, inplace=True)

# Encode each label column independently; the fitted encoders are kept so the
# integer predictions can later be mapped back to the original label values.
label_encoders = {}
for col in LABEL_COLUMNS:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Features and targets
X = df[TEXT_COLUMN]
Y = df[LABEL_COLUMNS]

# Train-test split
# NOTE: the held-out split (X_test / y_test) is created but not evaluated in
# this script; only the training portion is used for fitting.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Build pipeline
stop_words = "english" if USE_STOPWORDS else None
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=TFIDF_MAX_FEATURES, ngram_range=NGRAM_RANGE, stop_words=stop_words)),
    ('clf', MultiOutputClassifier(LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)))
])

# Train model
print(" Training model...")
pipeline.fit(X_train, y_train)

# Make sure every output directory exists before writing; otherwise
# joblib.dump raises FileNotFoundError and the trained model is lost.
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
label_encoders_dir = os.path.dirname(LABEL_ENCODERS_PATH)
if label_encoders_dir:  # empty when the path is a bare file name in the CWD
    os.makedirs(label_encoders_dir, exist_ok=True)

# Save full model pipeline
model_path = os.path.join(MODEL_SAVE_DIR, "logreg_model.pkl")
print(f" Saving model to {model_path}")
joblib.dump(pipeline, model_path)

# Save label encoders
print(f" Saving label encoders to {LABEL_ENCODERS_PATH}")
joblib.dump(label_encoders, LABEL_ENCODERS_PATH)

# Save TF-IDF vectorizer separately
tfidf_vectorizer = pipeline.named_steps['tfidf']
tfidf_path = os.path.join(MODEL_SAVE_DIR, "tfidf_vectorizer.pkl")
print(f" Saving TF-IDF vectorizer to {tfidf_path}")
joblib.dump(tfidf_vectorizer, tfidf_path)

print("Training complete.")