DaniAcosta04 committed on
Commit
33f15f3
·
verified ·
1 Parent(s): 07e6bf0

Upload train.py

Browse files
Files changed (1) hide show
  1. train.py +65 -0
train.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+ import sklearn
4
+ import joblib
5
+
6
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
7
+ from sklearn.compose import make_column_transformer
8
+
9
+ from sklearn.pipeline import make_pipeline
10
+
11
+ from sklearn.model_selection import train_test_split
12
+
13
+ from sklearn.linear_model import LinearRegression
14
+ from sklearn.metrics import mean_squared_error, r2_score
15
+ from math import sqrt
16
+
# Training script: fit a linear-regression pipeline that predicts `charges`
# from the columns of insurance.csv, report R-squared on a held-out split,
# and serialize the fitted pipeline to model.joblib.

# Ask scikit-learn to render estimators as diagrams in rich front ends.
sklearn.set_config(display='diagram')

data = pd.read_csv('insurance.csv')

# `drop` and `drop_duplicates` both return new frames, so `data` is left
# untouched without an explicit deep copy. `errors='ignore'` keeps the script
# working when the CSV was exported without the extra `index` column.
df = data.drop(columns=['index'], errors='ignore').drop_duplicates()

target = 'charges'
numeric_features = ['age', 'bmi', 'children']
categorical_features = ['sex', 'smoker', 'region']

print('Creating data subsets')

# Column order is fixed here (numeric first, then categorical); both splits
# inherit it, so no later re-selection of columns is required.
X = df[numeric_features + categorical_features]
y = df[target]

Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)

# Scale numeric columns and one-hot encode categorical ones; categories not
# seen during fitting are ignored at prediction time instead of raising.
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    (OneHotEncoder(handle_unknown='ignore'), categorical_features),
)

model_linear_regression = LinearRegression(n_jobs=-1)

print('Estimating Best Model Pipeline')

model_pipeline = make_pipeline(
    preprocessor,
    model_linear_regression,
)

model_pipeline.fit(Xtrain, ytrain)

print("Logging Metrics")
# Predict once on the held-out split and score against the true targets.
ytest_pred = model_pipeline.predict(Xtest)
print(f"R-squared: {r2_score(ytest, ytest_pred)}")

print("Serializing Model")

saved_model_path = "model.joblib"

# Persist the whole pipeline (preprocessing + regressor) as a single artifact.
joblib.dump(model_pipeline, saved_model_path)