File size: 6,117 Bytes
8649673
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e95be4c
8649673
e95be4c
8649673
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82d3328
8649673
 
 
 
 
 
 
 
 
 
82d3328
8649673
82d3328
 
8649673
 
 
4da0f00
 
 
 
 
 
8649673
 
 
 
 
 
 
 
 
 
 
 
4da0f00
 
 
 
 
 
3538277
8649673
 
 
 
5218dd0
82d3328
 
 
5218dd0
e95be4c
82d3328
8649673
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# +++
# Import the libraries
#---------------------------------------------------------------------------------------------------------
import os
import uuid
import joblib
import json

# IMPORTANT: I already installed the package "gradio" in my current Virtual Environment (VEnvDSDIL_gpu_Py3.12) as:  pip install -q gradio_client
#            Do NOT install "gradio_client" package again in Anaconda otherwise it will mess up the package.
import gradio as gr
import pandas as pd

# must install the package "huggingface_hub" first in the current python Virtual Environment, with pip, not with conda, as follows
# pip install huggingface_hub
# i.e., in the command line interface within the activated Virtual Environment:
#  (VEnvDSDIL_gpu_Py3.12) epalvarez@DSDILmStation01:~ $ pip install huggingface_hub
from huggingface_hub import CommitScheduler
from pathlib import Path
#---------------------------------------------------------------------------------------------------------

# --- Train the model -------------------------------------------------------
# Run the training script placed in the same directory as app.py.
# The training script trains and persists a linear regression model
# with the filename 'model_ic.joblib'.
import subprocess
import sys

print("\n... Initializing train_ic.py\n")
# Use sys.executable (the interpreter running this app) rather than
# os.system('python ...') so the training run happens inside the same
# virtual environment as this process, not whatever 'python' is on PATH.
# check=False mirrors the original os.system behavior: a failed training
# run is not fatal here (joblib.load below will fail loudly if no model
# file was produced).
subprocess.run([sys.executable, "train_ic.py"], check=False)
print("\n... train_ic.py initialized.\n")

# --- Load the freshly trained model from disk ------------------------------
# joblib.load reconstructs the Python object persisted with joblib.dump,
# i.e. the fitted estimator/pipeline serialized by train_ic.py.
current_directory = Path.cwd()
print(f"current_directory: {current_directory}\n")
# Use joinpath to build the model file path relative to the working dir.
saved_model_file_path = current_directory.joinpath("model_ic.joblib")
print(f"saved_model_file_path: {saved_model_file_path}\n")
insurance_charge_predictor = joblib.load(filename=saved_model_file_path)

# --- Prepare the logging functionality -------------------------------------
# Each serving process writes to its own uniquely named JSON-lines file so
# concurrent replicas never collide on the same log file.
log_file = Path("logs/") / f"data_{uuid.uuid4()}.json"
log_folder = log_file.parent
print(f"\nInformation:\n\tlog_file: {log_file}\n\tlog_folder: {log_folder}\n")

# Background scheduler that commits the log folder to a Hugging Face
# dataset repo. NOTE: `every=2` means a commit every 2 *minutes* (the
# huggingface_hub CommitScheduler `every` parameter is in minutes), not
# every 2 API calls.
scheduler = CommitScheduler(
    repo_id="insurance-charge-mlops-logs",
    repo_type="dataset",
    folder_path=log_folder,
    path_in_repo="data",
    every=2,
)

# Define the "predict function" which takes the features, converts them to a DataFrame and makes a prediction using the saved model.
# The function runs when 'Submit' is clicked or when an API request is made.
# IMPORTANT Note: do not modify the feature key names; they must match the column names of the training DataFrame exactly.
#                 Otherwise, a run-time error will occur.
#-------------------------------------------------------------------------------------------------------------------------------------------------------------
def predict_insurance_charge(age, bmi, children, sex, smoker, region):
    """Predict the insurance charge for one applicant and log the request.

    Invoked on every 'Submit' click and on every API call. The feature
    keys below must match the training DataFrame column names exactly,
    otherwise the model pipeline raises at predict time.
    """
    features = {
        'age': age,
        'bmi': bmi,
        'children': children,
        'sex': sex,
        'smoker': smoker,
        'region': region,
    }
    # One-row DataFrame -> model -> nested list; [0][0] is the scalar charge.
    prediction = insurance_charge_predictor.predict(pd.DataFrame([features])).tolist()

    # Log the inputs together with the output as a single JSON line.
    # Hold the scheduler lock so a background commit never reads a
    # half-written record.
    record = dict(features)
    record['prediction'] = prediction[0][0]
    with scheduler.lock:
        with log_file.open("a") as sink:
            sink.write(json.dumps(record))
            sink.write("\n")

    prediction_result = prediction[0][0]
    print(f"\nPrediction result: {prediction_result} - {type(prediction_result)}\n")
    return prediction_result
#--------------------------------------------------------------------------------------------------------------------------------------------------------------

# ---------------------------------------------------------------------------
# Gradio UI components: one input widget per model feature column, plus a
# single label widget for the predicted charge.
age_input = gr.Number(label="Age [attained years]")
bmi_input = gr.Number(label="BMI")
children_input = gr.Number(label="Children [#]")
sex_input = gr.Dropdown(choices=["male", "female"], label="Sex")
smoker_input = gr.Dropdown(choices=["no", "yes"], label="Smoker")
region_input = gr.Dropdown(choices=["southeast", "southwest", "northeast", "northwest"], label="Region")
model_output = gr.Label(label="Insurance Charge [$]")

# Assemble the Gradio interface around the prediction function.
# NOTE(review): the original spec comment asked for the title
# "HealthyLife Insurance Charge Prediction", but the code ships
# "Insurance Charge Predictor" — confirm which title is intended.
demo = gr.Interface(
    fn=predict_insurance_charge,
    inputs=[
        age_input,
        bmi_input,
        children_input,
        sex_input,
        smoker_input,
        region_input,
    ],
    outputs=model_output,
    title="Insurance Charge Predictor",
    description="This API allows you to predict the appropriate insurance charge based on the input parameters.",
    # Automatically push flagged samples to the HuggingFace dataset.
    # NOTE(review): allow_flagging is deprecated in Gradio 4+ in favor of
    # flagging_mode — confirm the installed Gradio version before changing.
    allow_flagging="auto",
    concurrency_limit=8,
)

# Enable request queueing (acts as a simple load balancer), then serve.
# share=False keeps the app local; share=True would open a public
# gradio.live tunnel, which only works while this machine stays online.
demo.queue()
demo.launch(share=False)