# --- Hugging Face upload-page chrome captured with the file; not program code. ---
# Dalmace's picture
# Upload 7 files
# 6b124dc verified
import numpy as np
import time
import os, sys
import pandas as pd
from pathlib import Path
from concrete.ml.deployment import FHEModelClient
import requests
def to_json(python_object):
    """JSON ``default=`` hook: encode ``bytes`` as a tagged, serializable dict.

    Any other type is rejected with TypeError, as the json module's
    ``default`` protocol requires.
    """
    if not isinstance(python_object, bytes):
        raise TypeError(repr(python_object) + " is not JSON serializable")
    return {"__class__": "bytes", "__value__": list(python_object)}
def from_json(python_object):
    """Inverse of ``to_json``: rebuild ``bytes`` from a tagged dict.

    FIX: the original fell through to an implicit ``None`` for any input
    without a ``"__class__"`` key (and never checked the tag's value), which
    made decoding failures surface far away as mysterious NoneType errors.
    Untagged objects are now returned unchanged, the usual object-hook
    convention.
    """
    if isinstance(python_object, dict) and python_object.get("__class__") == "bytes":
        return bytes(python_object["__value__"])
    return python_object
# Inference endpoint (Hugging Face dedicated endpoint hosting the FHE server).
API_URL = "https://h0cvbig1fkmf57eb.eu-west-1.aws.endpoints.huggingface.cloud"

# FIX: fail fast with an explicit message when the token is missing. The
# original `os.environ.get("HF_TOKEN")` returned None and crashed on the
# string concatenation below with an opaque TypeError.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token is None:
    raise RuntimeError("HF_TOKEN environment variable is not set")
headers = {
    "Authorization": "Bearer " + _hf_token,
    "Content-Type": "application/json",
}
def query(payload, allowed_retries=2):
    """POST ``payload`` to the inference endpoint and return the decoded JSON.

    Retries up to ``allowed_retries`` extra times when the endpoint answers
    with an ``{"error": ...}`` body (the endpoint sometimes returns transient
    "Bad gateway" errors), then raises.

    FIXES vs. original: the response body is parsed once instead of up to
    four times per call, retries are iterative instead of recursive, and
    exhaustion raises RuntimeError instead of `assert False` (asserts are
    stripped under `python -O`).
    """
    body = None
    response = None
    for attempt in range(allowed_retries + 1):
        response = requests.post(API_URL, headers=headers, json=payload)
        body = response.json()
        if body is None or "error" not in body:
            return body
        if attempt < allowed_retries:
            print(f"Warning, error {response=} {body=} in the query, relaunching")
    raise RuntimeError(f"Got an error: {response=} {body=}")
# Client-side copy of the compiled model's deployment artifacts; consumed by
# FHEModelClient below (presumably produced by Concrete ML's model-saving
# step — confirm against the deployment pipeline).
path_to_model = Path("compiled_model")
# Decision-tree in FHE
# NOTE(review): fetch_openml and train_test_split are imported but never used
# in this file, and `numpy` duplicates the `np` alias imported at the top —
# candidates for cleanup once confirmed nothing else relies on them.
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
import numpy
####IMPORTING TRAINING AND TEST DATA SETS ######
# NOTE(review): both read_csv calls load the *Train* CSV, so "TestData" is an
# exact duplicate of the training set and the accuracy reported at the end of
# this script is measured on seen data. Confirm the intended test file path
# and fix it.
TrainData = pd.read_csv("/data/R_Module_Day_7.2_Credit_Risk_Train_data.csv")
TestData = pd.read_csv("/data/R_Module_Day_7.2_Credit_Risk_Train_data.csv")

####CREATING A NEW COLUMN SOURCE UNDER BOTH TRAIN AND TEST DATA
# Tag each row with its origin so the two splits can be separated again after
# the shared preprocessing below.
TrainData["Source"] = "Train"
TestData["Source"] = "Test"

####COMBINE BOTH TRAIN AND TEST AS FULL DATA
FullData = pd.concat([TrainData, TestData])

# Notebook-style inspection expressions (no effect when run as a script).
FullData.shape
FullData.head()
FullData.describe()

####Working on Categorical variable Dependents
# "3+" is a string category; collapse it to the number 3 and cast the column
# to float so it can be treated as numeric downstream.
FullData.Dependents.value_counts()
FullData.Dependents = np.where(FullData.Dependents == '3+', 3, FullData.Dependents).astype(float)
FullData.Dependents.value_counts()
FullData.Dependents.dtype

###Finding MISSING VALUES
FullData.isnull().sum()
## MISSING VALUE IMPUTATION
# Impute each feature column from *train-split-only* statistics (median for
# numeric columns, mode for categorical ones) to avoid leaking test-set
# information into the preprocessing.
# FIX: the original used `FullData[col_name].fillna(..., inplace=True)`, a
# chained assignment that operates on a possibly-temporary Series and is not
# guaranteed to write back into the DataFrame in modern pandas (it warns and
# becomes a no-op under Copy-on-Write). Assign the filled column back
# explicitly instead.
for col_name in FullData.columns:
    if col_name in ('Loan_ID', 'Loan_Status', 'Source'):
        continue
    if FullData[col_name].isnull().sum() > 0:
        train_values = FullData.loc[FullData.Source == "Train", col_name]
        if FullData[col_name].dtype != object:
            fill_value = train_values.median()
        else:
            fill_value = train_values.mode()[0]
        FullData[col_name] = FullData[col_name].fillna(fill_value)
FullData.isnull().sum()
###OUTLIER DETECTION AND CORRECTION
def _cap_at_train_percentiles(frame, column, percentiles):
    """Clip ``frame[column]`` from above at successive train-split percentiles.

    Mirrors the original cascade (cap at p99, then p95, then p90): each
    threshold is recomputed on the already-capped data, exactly as the
    repeated one-liners did, but without duplicating the percentile
    expression three times per column.
    """
    for pct in percentiles:
        threshold = np.percentile(frame.loc[frame.Source == "Train", column], pct)
        frame[column] = np.where(frame[column] > threshold, threshold, frame[column])

#ApplicantIncome
FullData.ApplicantIncome.dtype
# Inspection only: upper-tail percentiles of the train split.
np.percentile(FullData.loc[FullData.Source == "Train", "ApplicantIncome"], [95, 96, 97, 98, 99])
_cap_at_train_percentiles(FullData, "ApplicantIncome", (99, 95, 90))
# CoapplicantIncome
FullData.columns
np.percentile(FullData.loc[FullData.Source == "Train", "CoapplicantIncome"], 99)
_cap_at_train_percentiles(FullData, "CoapplicantIncome", (99, 95))
# LoanAmount
np.percentile(FullData.loc[FullData.Source == "Train", "LoanAmount"], 99)
_cap_at_train_percentiles(FullData, "LoanAmount", (99, 95, 90))
########ONE HOT ENCODING OF CATEGORICAL VARIABLES BY CREATING DUMMY VARIABLES ########
# One-hot encode every object-dtype column except the identifier, the split
# tag and the target; drop_first avoids the dummy-variable trap.
categorical_columns = FullData.loc[:, FullData.dtypes == object].columns
dummy_frame = pd.get_dummies(
    FullData[categorical_columns].drop(['Loan_ID', 'Source', 'Loan_Status'], axis=1),
    drop_first=True,
)
dummy_frame.shape
dummy_frame.columns
FullData2 = pd.concat([FullData, dummy_frame], axis=1)
FullData2.shape
# The raw categorical columns are now redundant (replaced by their dummies),
# and Loan_ID carries no predictive signal.
Cols_To_Drop = ['Loan_ID', 'Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']
FullData3 = FullData2.drop(Cols_To_Drop, axis=1).copy()
FullData3.columns
FullData3.shape
# Convert Dependent variable into 0,1. If Loan_Status = N, then 1 else 0
FullData3.Loan_Status = np.where(FullData3.Loan_Status == 'N', 1, 0)
FullData3.Loan_Status.value_counts()
FullData3.shape
FullData3.dtypes
######SAMPLING #######################
# Split the preprocessed table back into its original train/test halves using
# the Source tag, which is dropped once it has served its purpose.
Train = FullData3[FullData3.Source == "Train"].drop("Source", axis=1).copy()
Train.shape
Test = FullData3[FullData3.Source == "Test"].drop("Source", axis=1).copy()
Test.shape

###DIVIDE EACH DATA SET AS INDEPENDENT AND DEPENDENT VARAIBLES
# Features vs. target for each split.
X_train = Train.drop("Loan_Status", axis=1)
Y_train = Train["Loan_Status"].copy()
X_test = Test.drop("Loan_Status", axis=1)
Y_test = Test["Loan_Status"].copy()
# Recover parameters for client side
# Build the Concrete ML client from the deployed model artifacts (presumably
# this loads the quantization and crypto parameters — confirm against the
# FHEModelClient docs).
fhemodel_client = FHEModelClient(path_to_model)
# Generate the keys
# Only the *evaluation* keys are serialized and shipped to the server below;
# the private key never leaves this process.
fhemodel_client.generate_private_and_evaluation_keys()
evaluation_keys = fhemodel_client.get_serialized_evaluation_keys()
# Save the key in the database
# State for the chunked upload loop that follows: the key is sent in ~100 MB
# pieces (presumably to stay under the endpoint's request-size limit).
evaluation_keys_remaining = evaluation_keys[:]  # working copy consumed packet by packet
uid = None  # server-side identifier for the stored key; set by the first packet
is_first = True  # first packet uses "save_key", subsequent ones "append_key"
is_finished = False
i = 0  # packet counter, used only for logging
packet_size = 1024 * 1024 * 100
# Upload the serialized evaluation keys to the server in fixed-size packets.
# FIX: the original measured the remaining key with sys.getsizeof(), which
# returns the Python object size (payload plus object header), not the number
# of bytes that the [:packet_size] slice actually sends; len() is the correct
# measure for a bytes payload. The exhausted sentinel is b"" rather than None
# so the final log line reports a true 0-byte remainder instead of
# getsizeof(None).
while not is_finished:
    # Send by packets of 100M
    if len(evaluation_keys_remaining) > packet_size:
        evaluation_keys_piece = evaluation_keys_remaining[:packet_size]
        evaluation_keys_remaining = evaluation_keys_remaining[packet_size:]
    else:
        evaluation_keys_piece = evaluation_keys_remaining
        evaluation_keys_remaining = b""
        is_finished = True
    print(
        f"Sending {i}-th piece of the key (remaining size is {len(evaluation_keys_remaining) / 1024:.2f} kbytes)"
    )
    i += 1
    if is_first:
        # The first packet creates the server-side entry and returns the uid
        # under which the remaining packets (and inference calls) are filed.
        is_first = False
        payload = {
            "inputs": "fake",
            "evaluation_keys": to_json(evaluation_keys_piece),
            "method": "save_key",
        }
        uid = query(payload)["uid"]
        print(f"Storing the key in the database under {uid=}")
    else:
        payload = {
            "inputs": "fake",
            "evaluation_keys": to_json(evaluation_keys_piece),
            "method": "append_key",
            "uid": uid,
        }
        query(payload)
# Test the handler
nb_good = 0
nb_samples = len(X_test)
verbose = True
time_start = time.time()
duration = 0  # cumulative server-call time across all samples
is_first = True

# FIX: the original indexed `X_test[i]`, which pandas interprets as a *column
# label* lookup on a DataFrame and raises KeyError for an integer position
# (and `Y_test[i]` relied on the test split keeping a 0..n-1 label index).
# Convert both to NumPy once so every access below is plainly positional.
X_test_np = X_test.to_numpy()
Y_test_np = Y_test.to_numpy()

for i in range(nb_samples):
    # Quantize the input and encrypt it
    encrypted_inputs = fhemodel_client.quantize_encrypt_serialize(X_test_np[i].reshape(1, -1))
    # Prepare the payload
    payload = {
        "inputs": "fake",
        "encrypted_inputs": to_json(encrypted_inputs),
        "method": "inference",
        "uid": uid,
    }
    if is_first:
        print(f"Size of the payload: {sys.getsizeof(payload) / 1024:.2f} kilobytes")
        is_first = False
    # Run the inference on HF servers, tracking both the cumulative and the
    # per-call wall-clock time around the HTTP round trip.
    duration -= time.time()
    duration_inference = -time.time()
    encrypted_prediction = query(payload)
    duration += time.time()
    duration_inference += time.time()
    encrypted_prediction = from_json(encrypted_prediction)
    # Decrypt the result and dequantize
    prediction_proba = fhemodel_client.deserialize_decrypt_dequantize(encrypted_prediction)[0]
    prediction = np.argmax(prediction_proba)
    if verbose:
        print(
            f"for {i}-th input, {prediction=} with expected {Y_test_np[i]} in {duration_inference:.3f} seconds"
        )
    # Measure accuracy
    nb_good += Y_test_np[i] == prediction
print(f"Accuracy on {nb_samples} samples is {nb_good * 1. / nb_samples}")
print(f"Total time: {time.time() - time_start:.3f} seconds")
print(f"Duration per inference: {duration / nb_samples:.3f} seconds")