junaid17 commited on
Commit
d59cad8
·
verified ·
1 Parent(s): 0999e27

Update src/train.py

Browse files
Files changed (1) hide show
  1. src/train.py +51 -52
src/train.py CHANGED
@@ -1,52 +1,51 @@
1
- import pandas as pd
2
- import joblib
3
- from sklearn.preprocessing import MinMaxScaler
4
- from imblearn.combine import SMOTETomek
5
- from xgboost import XGBClassifier
6
-
7
- from src.utils import load_config, get_versioned_path
8
- from src.ingestion import load_raw_data
9
- from src.preprocessing import clean_and_engineer
10
-
11
-
12
- def train_pipeline():
13
- config = load_config()
14
-
15
- df = load_raw_data()
16
- df = clean_and_engineer(df)
17
-
18
- target = config["data"]["target"]
19
-
20
- X = df.drop(columns=[target])
21
- y = df[target]
22
-
23
- # One-hot encoding
24
- X_encoded = pd.get_dummies(X, drop_first=True)
25
-
26
- scaler = MinMaxScaler()
27
- X_scaled = scaler.fit_transform(X_encoded)
28
-
29
- smt = SMOTETomek(random_state=config["training"]["random_state"])
30
- X_res, y_res = smt.fit_resample(X_scaled, y)
31
-
32
- params = config["model"]["params"]
33
- model = XGBClassifier(**params)
34
- model.fit(X_res, y_res)
35
-
36
- model_path = get_versioned_path(config["artifacts"]["model_dir"], "credit_model", "pkl")
37
- scaler_path = get_versioned_path(config["artifacts"]["model_dir"], "scaler", "pkl")
38
- columns_path = get_versioned_path(config["artifacts"]["model_dir"], "columns", "pkl")
39
-
40
- joblib.dump(model, model_path)
41
- joblib.dump(scaler, scaler_path)
42
- joblib.dump(list(X_encoded.columns), columns_path)
43
-
44
- print(f"Model saved at: {model_path}")
45
- print(f"Scaler saved at: {scaler_path}")
46
- print(f"Columns saved at: {columns_path}")
47
-
48
- return model_path, scaler_path, columns_path
49
-
50
-
51
- if __name__ == "__main__":
52
- train_pipeline()
 
1
+ import pandas as pd
2
+ import joblib
3
+ from sklearn.preprocessing import MinMaxScaler
4
+ from imblearn.combine import SMOTETomek
5
+ from sklearn.linear_model import LogisticRegression
6
+
7
+ from src.utils import load_config, get_versioned_path
8
+ from src.ingestion import load_raw_data
9
+ from src.preprocessing import clean_and_engineer
10
+
11
+
12
def train_pipeline():
    """Train the credit-scoring model end to end and persist its artifacts.

    Pipeline: load raw data -> clean/engineer features -> one-hot encode ->
    min-max scale -> rebalance classes with SMOTETomek -> fit a
    LogisticRegression with params from the config file -> dump model,
    scaler, and the encoded column list as versioned .pkl files.

    Returns:
        tuple[str, str, str]: paths of the saved model, scaler, and
        column-list artifacts, in that order.
    """
    cfg = load_config()

    frame = clean_and_engineer(load_raw_data())

    target_col = cfg["data"]["target"]
    features = frame.drop(columns=[target_col])
    labels = frame[target_col]

    # One-hot encode categoricals; drop_first avoids the dummy-variable trap.
    encoded = pd.get_dummies(features, drop_first=True)

    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(encoded)

    # Rebalance the classes with combined over-/under-sampling (SMOTE + Tomek links).
    resampler = SMOTETomek(random_state=cfg["training"]["random_state"])
    features_res, labels_res = resampler.fit_resample(scaled, labels)

    model = LogisticRegression(**cfg["model"]["params"])
    model.fit(features_res, labels_res)

    # Versioned artifact paths so repeated runs never clobber earlier models.
    artifact_dir = cfg["artifacts"]["model_dir"]
    model_path = get_versioned_path(artifact_dir, "credit_model", "pkl")
    scaler_path = get_versioned_path(artifact_dir, "scaler", "pkl")
    columns_path = get_versioned_path(artifact_dir, "columns", "pkl")

    joblib.dump(model, model_path)
    joblib.dump(scaler, scaler_path)
    # Column order is needed at inference time to realign one-hot features.
    joblib.dump(list(encoded.columns), columns_path)

    print(f"Model saved at: {model_path}")
    print(f"Scaler saved at: {scaler_path}")
    print(f"Columns saved at: {columns_path}")

    return model_path, scaler_path, columns_path


if __name__ == "__main__":
    train_pipeline()