# Pre_insurance / app.py
# thanhcong2001's picture
# Create app.py
# 889747e verified
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the insurance records; the 'charges' column is the prediction target.
df = pd.read_csv('insurance.csv')

# Numeric predictors: every int64/float64 column except the target itself.
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
num_feature = numeric_columns.drop('charges').to_list()

# Categorical predictors: every object-dtype column.
cat_feature = df.select_dtypes(include=['object']).columns.to_list()

# Scale numeric and encode category
preprocess = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_feature),
        # drop='first' omits one dummy column per categorical feature.
        ('cat', OneHotEncoder(drop='first'), cat_feature),
    ]
)
# Split dataset
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split

# Predictors are every column except the target.
x = df.drop('charges', axis=1)

# The models are trained on log(1 + charges); predictions are mapped back
# to the original scale with np.expm1 wherever they are reported.
y = np.log1p(df['charges'])

# Hold out 40% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=42,
)

# Test targets converted back to the original charge scale, used as the
# reference when computing RMSE / MAE.
y_test_rvs = np.expm1(y_test)
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# define models
# Candidate regressors keyed by the display name used in the comparison
# report and when looking the winner back up.
# The random forest draws random bootstrap samples and feature subsets, so
# it is seeded (same seed as the train/test split) to keep the model ranking
# and the later hyper-parameter search reproducible from run to run.
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'SVR': SVR(),
}
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
from sklearn.pipeline import Pipeline

# Fit each candidate on the training split and score it on the held-out
# split. Errors are computed on the original charge scale, not log scale.
result = []
for name, model in models.items():
    candidate = Pipeline(steps=[('p', preprocess), ('m', model)])
    candidate.fit(x_train, y_train)

    # The pipeline predicts log1p(charges); invert before scoring.
    log_predictions = candidate.predict(x_test).flatten()
    predicted = np.expm1(log_predictions)

    rmse = root_mean_squared_error(y_test_rvs, predicted)
    mae = mean_absolute_error(y_test_rvs, predicted)

    # One row per model; these rows become the comparison table.
    result.append({'Model': name, 'RMSE': rmse, 'MAE': mae})
    print(f'Model: {name} - RMSE: {rmse:,.0f}$ - MAE: {mae:,.0f}$')
# Rank the candidates by test RMSE and keep the lowest-error one for tuning.
result_df = pd.DataFrame(result)
best_model_name = result_df.sort_values(by='RMSE').iloc[0]['Model']
best_model = models[best_model_name]
print(f'Best model: {best_model_name}')

best_pipeline = Pipeline(steps=[
    ('p', preprocess),
    ('b', best_model),
])

# Search space per candidate, keyed by the same names as `models`.
# The grid must match whichever estimator actually won: passing the random
# forest parameters to, say, SVR makes GridSearchCV fail with an
# invalid-parameter error. The 'b__' prefix routes each parameter to the
# pipeline step named 'b'.
param_grids = {
    'LinearRegression': {
        'b__fit_intercept': [True, False],
    },
    'Ridge': {
        'b__alpha': [0.01, 0.1, 1.0, 10.0, 100.0],
    },
    'Lasso': {
        'b__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0],
    },
    'RandomForestRegressor': {
        'b__n_estimators': [100, 200, 300],
        'b__max_depth': [None, 5, 10, 20],
        'b__min_samples_split': [2, 5, 10],
        'b__min_samples_leaf': [1, 2, 4],
        'b__max_features': ['sqrt', 'log2'],
    },
    'SVR': {
        'b__C': [0.1, 1, 10, 100],
        'b__gamma': ['scale', 'auto'],
        'b__epsilon': [0.01, 0.1, 0.5],
    },
}

# 5-fold cross-validated search on the training split only; the score is
# the negated MSE of the log-transformed target.
grid = GridSearchCV(
    best_pipeline,
    param_grid=param_grids[best_model_name],
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid.fit(x_train, y_train)

# Evaluate the tuned pipeline on the held-out split, back on the original
# charge scale.
best_y_pred = np.expm1(grid.predict(x_test))

# Show the first five predictions next to the true values. `.iloc` makes
# the slice explicitly positional, since y_test_rvs keeps the shuffled
# row labels of the original frame.
for pre, ac in zip(best_y_pred[:5], y_test_rvs.iloc[:5]):
    print(f'Predict: {pre:,.0f}$ - Actual: {ac:,.0f}$')
print(f'RMSE: {root_mean_squared_error(y_test_rvs,best_y_pred)}')
print(f'MAE: {mean_absolute_error(y_test_rvs,best_y_pred)}')
def pre_insurance(age, sex, bmi, children, smoker, region):
    """Predict the insurance charge for one person from the form inputs.

    Args:
        age: Age entered in the form.
        sex: Selected sex option.
        bmi: Body-mass index entered in the form.
        children: Selected number of children.
        smoker: Selected smoker option.
        region: Selected region option.

    Returns:
        A string of the form ``'Charges: 12,345$'``, or a message listing
        the fields that were left empty.
    """
    labels = ('Age', 'Sex', 'BMI', 'Children', 'Smoker', 'Region')
    vals = [age, sex, bmi, children, smoker, region]

    # Gradio passes None for an unselected radio group or an empty number
    # box. Report that to the user instead of letting it reach the
    # pipeline, where it would fail with an opaque error.
    # `is None` (not truthiness) so that 0 children stays a valid answer.
    missing = [label for label, val in zip(labels, vals) if val is None]
    if missing:
        return f"Please provide: {', '.join(missing)}"

    # Pair the values with the training feature columns by position.
    # NOTE(review): assumes x's columns are ordered age, sex, bmi,
    # children, smoker, region -- confirm against insurance.csv.
    cols = x.columns
    user = pd.DataFrame([dict(zip(cols, vals))])

    # The tuned pipeline predicts log1p(charges); expm1 maps it back.
    prediction = np.expm1(grid.predict(user)).item()
    return f'Charges: {prediction:,.0f}$'
import gradio as gr

# Input widgets, listed in the same order as pre_insurance's parameters
# (Gradio passes the values positionally).
inputs = [
    gr.Number(label="Age"),
    gr.Radio(choices=['female', 'male'], label='Sex'),
    gr.Number(label="BMI"),
    # Options shown in ascending order so the group reads naturally.
    gr.Radio(choices=[0, 1, 2, 3, 4, 5], label='Children'),
    gr.Radio(choices=['yes', 'no'], label='Smoker'),
    gr.Radio(
        choices=['southwest', 'southeast', 'northwest', 'northeast'],
        label='Region',
    ),
]

# Single-function web UI: the form values go to pre_insurance and its
# returned string is shown as plain text.
demo = gr.Interface(
    fn=pre_insurance,
    inputs=inputs,
    outputs='text',
    title='Predict Insurance',
)
demo.launch()