first commit
Browse files- app.py +89 -0
- data_config.py +30 -0
- random_forest_model.joblib +3 -0
- requirements.txt +4 -0
- scaler.joblib +3 -0
app.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import gradio as gr
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import joblib
|
| 5 |
+
from data_config import region_data, training_columns, unique_pincodes, unique_places
|
| 6 |
+
|
| 7 |
+
# Load the trained model and scaler once at import time so that every
# prediction request reuses the same in-memory objects.
try:
    model = joblib.load('random_forest_model.joblib')
    scaler = joblib.load('scaler.joblib')
except FileNotFoundError as err:
    # Abort start-up explicitly. Raising SystemExit with a message writes the
    # message to stderr and ends the process with a non-zero status, whereas
    # the bare `exit()` helper is injected by the `site` module for
    # interactive use and, called without arguments, reports success.
    raise SystemExit(
        "Error: Model or scaler file not found. Make sure "
        "'random_forest_model.joblib' and 'scaler.joblib' are in the same directory."
    ) from err
|
| 15 |
+
|
| 16 |
+
def predict_charges(age, sex, bmi, children, smoker, pincode, place):
    """Predict the insurance charge for a single applicant.

    The raw inputs are placed in a one-row DataFrame, the numeric columns are
    transformed with the module-level ``scaler``, the categorical columns are
    one-hot encoded, and the result is aligned to ``training_columns`` before
    being handed to the module-level ``model``.

    Args:
        age: Applicant age (scaled together with ``bmi`` and ``children``).
        sex: Category encoded into the ``sex_*`` columns.
        bmi: Body-mass index.
        children: Number of children.
        smoker: Category encoded into the ``smoker_*`` columns.
        pincode: Category encoded into the ``pincode_*`` columns. It must be
            the same string form used in ``training_columns`` (for example
            ``'02108'`` with its leading zero) to match a column.
        place: Category encoded into the ``place_*`` columns.

    Returns:
        float: The first (and only) element of ``model.predict`` for the row.
    """
    # Column groups as used during training.
    numerical_cols_to_scale = ['age', 'bmi', 'children']
    categorical_cols_to_encode = ['sex', 'smoker', 'pincode', 'place']

    # Create a one-row DataFrame from the raw input.
    input_data = pd.DataFrame([{
        'age': age,
        'sex': sex,
        'bmi': bmi,
        'children': children,
        'smoker': smoker,
        'pincode': pincode,
        'place': place,
    }])

    # Scale numerical features with the already-fitted scaler.
    input_data[numerical_cols_to_scale] = scaler.transform(input_data[numerical_cols_to_scale])

    # One-hot encode the categorical features. A single row only yields the
    # dummy columns for the categories actually present in that row.
    input_data_encoded = pd.get_dummies(
        input_data, columns=categorical_cols_to_encode, drop_first=False
    )

    # Align with the training layout in one step: reindex adds every missing
    # dummy column filled with 0, drops any column that is not listed in
    # training_columns (e.g. a category unseen in training), and puts the
    # columns in training order. This avoids concatenating onto an empty
    # frame, which can leave object-dtype columns behind.
    final_input = input_data_encoded.reindex(columns=training_columns, fill_value=0)

    # Dummy columns may be bool or integer depending on the pandas version;
    # give the model a single numeric dtype.
    final_input = final_input.astype(float)

    prediction = model.predict(final_input)

    # Return a plain Python float rather than a NumPy scalar.
    return float(prediction[0])
|
| 59 |
+
|
| 60 |
+
# Input widgets, declared in the same order as predict_charges' parameters.
age_input = gr.Slider(label="Age", minimum=18, maximum=100, step=1, value=30)
sex_input = gr.Radio(label="Sex", choices=["female", "male"], value="female")
bmi_input = gr.Slider(label="BMI", minimum=10.0, maximum=60.0, step=0.1, value=25.0)
children_input = gr.Slider(label="Children", minimum=0, maximum=5, step=1, value=1)
smoker_input = gr.Radio(label="Smoker", choices=["no", "yes"], value="no")

# The dropdowns default to their first option, or to no selection when the
# option list is empty.
pincode_input = gr.Dropdown(
    label="Pincode",
    choices=unique_pincodes,
    value=unique_pincodes[0] if unique_pincodes else None,
)
place_input = gr.Dropdown(
    label="Place",
    choices=unique_places,
    value=unique_places[0] if unique_places else None,
)

# Wire the widgets to the prediction function.
interface = gr.Interface(
    fn=predict_charges,
    inputs=[
        age_input,
        sex_input,
        bmi_input,
        children_input,
        smoker_input,
        pincode_input,
        place_input,
    ],
    outputs=gr.Number(label="Predicted Insurance Charges"),
    title="Insurance Charge Predictor",
    description="Enter the details to get an estimated insurance charge.",
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    interface.launch(share=True)
|
data_config.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# This file contains configuration data for the insurance charges prediction application.
|
| 3 |
+
|
| 4 |
+
import random
|
| 5 |
+
|
| 6 |
+
# Pincodes and places available for each region. Within a region the two
# lists have the same length.
region_data = {
    'southeast': {
        'pincodes': ['30301', '33101', '37201', '27514', '29501'],
        'places': [
            'Atlanta, Georgia',
            'Miami, Florida',
            'Nashville, Tennessee',
            'Chapel Hill, North Carolina',
            'Florence, South Carolina',
        ],
    },
    'southwest': {
        'pincodes': ['85001', '73301', '87501', '73102', '85701'],
        'places': [
            'Phoenix, Arizona',
            'Austin, Texas',
            'Santa Fe, New Mexico',
            'Oklahoma City, Oklahoma',
            'Tucson, Arizona',
        ],
    },
    'northwest': {
        'pincodes': ['98101', '97201', '83702', '99201', '59715'],
        'places': [
            'Seattle, Washington',
            'Portland, Oregon',
            'Boise, Idaho',
            'Spokane, Washington',
            'Bozeman, Montana',
        ],
    },
    'northeast': {
        'pincodes': ['10001', '02108', '19103', '07030', '06103'],
        'places': [
            'New York, New York',
            'Boston, Massachusetts',
            'Philadelphia, Pennsylvania',
            'Hoboken, New Jersey',
            'Hartford, Connecticut',
        ],
    },
}
|
| 8 |
+
|
| 9 |
+
# Column names, in order, that prediction input must be aligned to.
# Inputs are reindexed against this list so they carry the same columns in
# the same order.
training_columns = [
    # numeric features
    'age', 'bmi', 'children',
    # one-hot: sex
    'sex_female', 'sex_male',
    # one-hot: smoker
    'smoker_no', 'smoker_yes',
    # one-hot: pincode
    'pincode_02108', 'pincode_06103', 'pincode_07030', 'pincode_10001',
    'pincode_19103', 'pincode_27514', 'pincode_29501', 'pincode_30301',
    'pincode_33101', 'pincode_37201', 'pincode_59715', 'pincode_73102',
    'pincode_73301', 'pincode_83702', 'pincode_85001', 'pincode_85701',
    'pincode_87501', 'pincode_97201', 'pincode_98101', 'pincode_99201',
    # one-hot: place
    'place_Atlanta, Georgia',
    'place_Austin, Texas',
    'place_Boise, Idaho',
    'place_Boston, Massachusetts',
    'place_Bozeman, Montana',
    'place_Chapel Hill, North Carolina',
    'place_Florence, South Carolina',
    'place_Hartford, Connecticut',
    'place_Hoboken, New Jersey',
    'place_Miami, Florida',
    'place_Nashville, Tennessee',
    'place_New York, New York',
    'place_Oklahoma City, Oklahoma',
    'place_Philadelphia, Pennsylvania',
    'place_Phoenix, Arizona',
    'place_Portland, Oregon',
    'place_Santa Fe, New Mexico',
    'place_Seattle, Washington',
    'place_Spokane, Washington',
    'place_Tucson, Arizona',
]
|
| 12 |
+
|
| 13 |
+
def assign_location_data(row):
    """Pick a random (pincode, place) pair for the row's region.

    The ``pincodes`` and ``places`` lists of a region in ``region_data`` are
    treated as parallel: entry ``i`` of one belongs with entry ``i`` of the
    other. One pair is drawn, so the returned pincode and place always refer
    to the same location. Drawing each value independently could combine the
    pincode of one city with the name of another.

    Note: this makes one ``random.choice`` call instead of two, so the
    sequence produced under a fixed seed differs from the earlier version.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'region'`` key.

    Returns:
        tuple: ``(pincode, place)`` for a known region, otherwise
        ``(None, None)``.
    """
    locations = region_data.get(row['region'])
    if locations is None:
        # Unknown region: nothing to assign.
        return None, None

    # zip pairs each pincode with the place at the same position.
    paired_locations = list(zip(locations['pincodes'], locations['places']))
    return random.choice(paired_locations)
|
| 21 |
+
|
| 22 |
+
# Collect every pincode and place across all regions, then expose sorted,
# de-duplicated lists for use as dropdown options.
all_pincodes = []
all_places = []
for locations in region_data.values():
    all_pincodes += locations['pincodes']
    all_places += locations['places']

unique_pincodes = sorted(set(all_pincodes))
unique_places = sorted(set(all_places))
|
random_forest_model.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4e6afb614fefc63a80834f684cd21f53b194b1db360326c0ca5d9542c24f80bb
|
| 3 |
+
size 22880369
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas
|
| 2 |
+
scikit-learn
|
| 3 |
+
gradio
|
| 4 |
+
joblib
|
scaler.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2f9ae5dad94b3950eb2ddd5b9cc85d383daa67e8944421be1fd1ba2ebc1311f2
|
| 3 |
+
size 943
|