# Spaces: Sleeping
| import pandas as pd | |
# Load the dataset from 'insurance.csv' in the working directory.
df = pd.read_csv('insurance.csv')

# Scale numeric columns and one-hot encode categorical columns.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric predictors: every int64/float64 column except the target 'charges'.
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
num_feature = numeric_columns.drop('charges').to_list()

# Categorical predictors: every object-dtype column.
categorical_columns = df.select_dtypes(include=['object']).columns
cat_feature = categorical_columns.to_list()

# Standardise the numeric columns; one-hot encode the categorical columns,
# dropping the first level of each one.
scale_step = ('num', StandardScaler(), num_feature)
encode_step = ('cat', OneHotEncoder(drop='first'), cat_feature)
preprocess = ColumnTransformer(transformers=[scale_step, encode_step])
| # Split dataset | |
| import numpy as np | |
| from sklearn.model_selection import train_test_split,GridSearchCV | |
# Predictors are every column except the target.
x = df.drop('charges', axis=1)

# The models are trained on log(1 + charges) rather than on raw charges.
y = np.log1p(df['charges'])

# Hold out 40% of the rows for testing; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=42,
)

# Test targets mapped back from the log scale to the original charges scale,
# so error metrics can be reported in the original units.
y_test_rvs = np.expm1(y_test)
| from sklearn.linear_model import LinearRegression,Ridge,Lasso | |
| from sklearn.ensemble import RandomForestRegressor | |
| from sklearn.svm import SVR | |
# Candidate regressors, keyed by the display name used when reporting scores.
# The random forest is seeded (matching the seed used for the train/test
# split) so that repeated runs rank the candidates the same way; an unseeded
# forest can change which model comes out best from one run to the next.
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'SVR': SVR(),
}
| from sklearn.pipeline import Pipeline | |
| from sklearn.metrics import root_mean_squared_error,mean_absolute_error | |
def _score_on_test(name, estimator):
    """Fit ``preprocess`` + ``estimator`` on the training split and score it.

    Predictions are produced on the log scale and passed through
    ``np.expm1`` before being compared with ``y_test_rvs``.

    Returns a dict with the keys 'Model', 'RMSE' and 'MAE'.
    """
    candidate = Pipeline(steps=[('p', preprocess), ('m', estimator)])
    candidate.fit(x_train, y_train)
    predicted = np.expm1(candidate.predict(x_test).flatten())
    return {
        'Model': name,
        'RMSE': root_mean_squared_error(y_test_rvs, predicted),
        'MAE': mean_absolute_error(y_test_rvs, predicted),
    }


# Score every candidate model, collecting one row per model and printing a
# summary line as each one finishes.
result = []
for name, model in models.items():
    scores = _score_on_test(name, model)
    rmse, mae = scores['RMSE'], scores['MAE']
    result.append(scores)
    print(f'Model: {name} - RMSE: {rmse:,.0f}$ - MAE: {mae:,.0f}$')
# Rank the candidates by test RMSE and keep the one with the lowest error.
result_df = pd.DataFrame(result)
best_model_name = result_df.sort_values(by='RMSE').iloc[0]['Model']
best_model = models[best_model_name]
print(f'Best model: {best_model_name}')

# The tuned pipeline names its estimator step 'b', so every grid key below is
# prefixed with 'b__'.
best_pipeline = Pipeline(steps=[
    ('p', preprocess),
    ('b', best_model),
])

# One hyper-parameter grid per candidate. The winner is chosen at run time,
# so the grid has to match whichever estimator won: passing random-forest
# parameters to, say, SVR makes GridSearchCV.fit fail with an
# invalid-parameter error.
param_grids = {
    'LinearRegression': {
        'b__fit_intercept': [True, False],
    },
    'Ridge': {
        'b__alpha': [0.01, 0.1, 1.0, 10.0, 100.0],
    },
    'Lasso': {
        'b__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0],
    },
    'RandomForestRegressor': {
        'b__n_estimators': [100, 200, 300],
        'b__max_depth': [None, 5, 10, 20],
        'b__min_samples_split': [2, 5, 10],
        'b__min_samples_leaf': [1, 2, 4],
        'b__max_features': ['sqrt', 'log2'],
    },
    'SVR': {
        'b__C': [0.1, 1, 10, 100],
        'b__epsilon': [0.01, 0.1, 0.5],
        'b__gamma': ['scale', 'auto'],
    },
}

# A model without an entry falls back to an empty grid, i.e. a plain 5-fold
# cross-validated fit with its current settings.
grid = GridSearchCV(
    best_pipeline,
    param_grid=param_grids.get(best_model_name, {}),
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid.fit(x_train, y_train)

# Predictions come back on the log scale; expm1 converts them to charges.
best_y_pred = np.expm1(grid.predict(x_test))

# Show the first five predictions next to the actual charges.
for pre, ac in zip(best_y_pred[:5], y_test_rvs[:5]):
    print(f'Predict: {pre:,.0f}$ - Actual: {ac:,.0f}$')

print(f'RMSE: {root_mean_squared_error(y_test_rvs,best_y_pred)}')
print(f'MAE: {mean_absolute_error(y_test_rvs,best_y_pred)}')
def pre_insurance(age, sex, bmi, children, smoker, region):
    """Predict insurance charges for one person and format them for display.

    The six arguments are paired, in order, with the columns of the feature
    frame ``x`` (presumably age, sex, bmi, children, smoker, region; verify
    against the CSV header) and passed to the fitted ``grid`` search. The
    prediction is on the log scale, so it is converted back with ``np.expm1``.

    Returns:
        ``'Charges: <amount>$'`` on success, or a short prompt asking the
        user to complete the form when any argument is ``None`` (an empty
        number box or an unselected radio button).
    """
    vals = [age, sex, bmi, children, smoker, region]

    # Compare against None explicitly: 0 is a legitimate value (e.g. no
    # children) and must not be treated as missing.
    if any(value is None for value in vals):
        return 'Please fill in every field before predicting.'

    cols = x.columns
    user = pd.DataFrame([dict(zip(cols, vals))])
    prediction = np.expm1(grid.predict(user)).item()
    return f'Charges: {prediction:,.0f}$'
| import gradio as gr | |
# Input widgets, listed in the same order as pre_insurance's parameters.
inputs = [
    gr.Number(label="Age"),
    gr.Radio(choices=['female', 'male'], label='Sex'),
    gr.Number(label="BMI"),
    # Same six values as before, listed in ascending order so the options
    # read naturally in the UI.
    gr.Radio(choices=[0, 1, 2, 3, 4, 5], label='Children'),
    gr.Radio(choices=['yes', 'no'], label='Smoker'),
    gr.Radio(
        choices=['southwest', 'southeast', 'northwest', 'northeast'],
        label='Region',
    ),
]

# Wire the form to the prediction function and start the app.
demo = gr.Interface(
    fn=pre_insurance,
    inputs=inputs,
    outputs='text',
    title='Predict Insurance',
)
demo.launch()