Spaces:

thanhcong2001
/

Pre_insurance

Sleeping

App Files Files Community

Pre_insurance / app.py

thanhcong2001

Create app.py

889747e verified 8 months ago

raw

history blame contribute delete

3.26 kB

	import pandas as pd

	# Load the insurance table; the 'charges' column is the prediction target.
	df = pd.read_csv('insurance.csv')

	# Scale numeric and encode category
	from sklearn.compose import ColumnTransformer
	from sklearn.preprocessing import StandardScaler,OneHotEncoder

	# Numeric predictors: every int64/float64 column except the target.
	numeric_columns = df.select_dtypes(include=['int64','float64']).columns
	num_feature = numeric_columns.drop('charges').to_list()

	# Categorical predictors: every object-dtype column.
	categorical_columns = df.select_dtypes(include=['object']).columns
	cat_feature = categorical_columns.to_list()

	# Standardise the numeric columns and one-hot encode the categorical ones
	# (the first level of each category is dropped).
	scale_numeric = ('num', StandardScaler(), num_feature)
	encode_categories = ('cat', OneHotEncoder(drop='first'), cat_feature)
	preprocess = ColumnTransformer(transformers=[scale_numeric, encode_categories])
	# Split dataset
	import numpy as np
	from sklearn.model_selection import train_test_split,GridSearchCV

	# One seed for every stochastic step so that repeated runs rank the
	# candidate models identically.
	RANDOM_STATE = 42

	x = df.drop(columns=['charges'])
	# The models are trained on log1p(charges); predictions are mapped back
	# to the original scale with expm1.
	y = np.log1p(df['charges'])
	# Seeded split: 60% train / 40% test.
	x_train, x_test, y_train, y_test = train_test_split(
	    x, y, test_size=0.4, random_state=RANDOM_STATE
	)
	# Test targets back on the original scale, used when computing metrics.
	y_test_rvs = np.expm1(y_test)

	from sklearn.linear_model import LinearRegression,Ridge,Lasso
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.svm import SVR

	# define models
	models = {
	    'LinearRegression': LinearRegression(),
	    'Ridge': Ridge(),
	    "Lasso": Lasso(),
	    # Seeded like the split: an unseeded forest gives different scores on
	    # every run, which can change which model is picked as the best one.
	    'RandomForestRegressor': RandomForestRegressor(random_state=RANDOM_STATE),
	    "SVR": SVR(),
	}
	from sklearn.pipeline import Pipeline
	from sklearn.metrics import root_mean_squared_error,mean_absolute_error

	# Fit every candidate behind the shared preprocessing step and score it on
	# the held-out split. Errors are measured on the original scale, so the
	# log-scale predictions are passed through expm1 first.
	# NOTE(review): the winner is chosen on the same split that is later used
	# to report the tuned model's error - confirm this is intended.
	result = []
	for name, estimator in models.items():
	    candidate = Pipeline(steps=[('p', preprocess), ('m', estimator)])
	    candidate.fit(x_train, y_train)
	    log_scale_pred = candidate.predict(x_test).flatten()
	    y_pred = np.expm1(log_scale_pred)
	    rmse = root_mean_squared_error(y_test_rvs, y_pred)
	    mae = mean_absolute_error(y_test_rvs, y_pred)
	    result.append({'Model': name, 'RMSE': rmse, 'MAE': mae})
	    print(f'Model: {name} - RMSE: {rmse:,.0f}$ - MAE: {mae:,.0f}$')

	# Rank the candidates by RMSE (ascending) and keep the top one.
	result_df = pd.DataFrame(result)
	ranked = result_df.sort_values(by='RMSE')
	best_model_name = ranked['Model'].iloc[0]
	best_model = models[best_model_name]
	print(f'Best model: {best_model_name}')
	# Hyper-parameter search spaces, keyed by the names used in `models`.
	# The grid must match whichever estimator won the comparison: passing the
	# random-forest parameters to any other estimator makes GridSearchCV raise
	# on the unknown parameter names. An empty grid simply cross-validates the
	# estimator with its current settings.
	param_grids = {
	    'LinearRegression': {},
	    'Ridge': {'b__alpha': [0.01, 0.1, 1.0, 10.0, 100.0]},
	    'Lasso': {'b__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0]},
	    'RandomForestRegressor': {
	        'b__n_estimators': [100, 200, 300],
	        'b__max_depth': [None, 5, 10, 20],
	        'b__min_samples_split': [2, 5, 10],
	        'b__min_samples_leaf': [1, 2, 4],
	        'b__max_features': ['sqrt', 'log2'],
	    },
	    'SVR': {
	        'b__C': [0.1, 1, 10, 100],
	        'b__epsilon': [0.01, 0.1, 0.5],
	        'b__gamma': ['scale', 'auto'],
	    },
	}

	# Tune the winning estimator behind the same preprocessing step; the step
	# name 'b' is what the 'b__' prefixes in the grids refer to.
	best_pipeline = Pipeline(steps=[
	    ('p', preprocess),
	    ('b', best_model),
	])
	grid = GridSearchCV(
	    best_pipeline,
	    param_grid=param_grids.get(best_model_name, {}),
	    cv=5,
	    scoring='neg_mean_squared_error',
	    n_jobs=-1,
	)
	grid.fit(x_train, y_train)

	# Predictions come back on the log scale; expm1 restores the original one.
	best_y_pred = np.expm1(grid.predict(x_test))

	# Show the first five predictions next to the actual values. `.iloc`
	# makes the slice positional regardless of the shuffled row labels.
	for pre, ac in zip(best_y_pred[:5], y_test_rvs.iloc[:5]):
	    print(f'Predict: {pre:,.0f}$ - Actual: {ac:,.0f}$')
	print(f'RMSE: {root_mean_squared_error(y_test_rvs,best_y_pred)}')
	print(f'MAE: {mean_absolute_error(y_test_rvs,best_y_pred)}')
	def pre_insurance(age, sex, bmi, children, smoker, region):
	    """Predict the insurance charges for one person and format them as text.

	    The six arguments are paired, in order, with the feature columns of the
	    module-level frame ``x`` and sent through the tuned pipeline ``grid``.
	    NOTE(review): this assumes the feature columns appear in the order
	    age, sex, bmi, children, smoker, region - confirm against the CSV.

	    Returns:
	        ``'Charges: <amount>$'`` with the prediction converted back from the
	        log scale, or a short prompt when any field was left empty.
	    """
	    vals = [age, sex, bmi, children, smoker, region]
	    # An empty number box or an unselected radio group may arrive as None.
	    # Compare against None explicitly: 0 children is a valid answer.
	    if any(value is None for value in vals):
	        return 'Please fill in every field before predicting.'

	    user = pd.DataFrame([dict(zip(x.columns, vals))])
	    # The pipeline predicts log1p(charges); expm1 undoes the transform.
	    prediction = np.expm1(grid.predict(user)).item()
	    return f'Charges: {prediction:,.0f}$'
	import gradio as gr

	# One input widget per argument of pre_insurance, in the same order.
	inputs = [
	    gr.Number(label="Age"),
	    gr.Radio(choices=['female', 'male'], label='Sex'),
	    gr.Number(label="BMI"),
	    # Listed in ascending order so the options read naturally.
	    gr.Radio(choices=[0, 1, 2, 3, 4, 5], label='Children'),
	    gr.Radio(choices=['yes', 'no'], label='Smoker'),
	    gr.Radio(choices=['southwest', 'southeast', 'northwest', 'northeast'], label='Region'),
	]

	# Text-output interface around pre_insurance; launched as soon as the
	# script runs.
	demo = gr.Interface(fn=pre_insurance, inputs=inputs, outputs='text', title='Predict Insurance')
	demo.launch()