# --- Hugging Face upload-page chrome captured with the file; not program code. ---
# Dalmace's picture
# Upload 7 files
# 6b124dc verified
import numpy as np
import time
import os, sys
import pandas as pd
from pathlib import Path
from concrete.ml.deployment import FHEModelClient
import requests
def to_json(python_object):
    """JSON ``default=`` hook: encode ``bytes`` as a tagged, serializable dict.

    Any other type is rejected with TypeError, as the json module's
    ``default`` protocol requires.
    """
    if not isinstance(python_object, bytes):
        raise TypeError(repr(python_object) + " is not JSON serializable")
    return {"__class__": "bytes", "__value__": list(python_object)}
def from_json(python_object):
    """Inverse of ``to_json``: rebuild ``bytes`` from a tagged dict.

    FIX: the original fell through to an implicit ``None`` for any input
    without a ``"__class__"`` key (and never checked the tag's value), which
    made decoding failures surface far away as mysterious NoneType errors.
    Untagged objects are now returned unchanged, the usual object-hook
    convention.
    """
    if isinstance(python_object, dict) and python_object.get("__class__") == "bytes":
        return bytes(python_object["__value__"])
    return python_object
# Inference endpoint (Hugging Face dedicated endpoint hosting the FHE server).
API_URL = "https://h0cvbig1fkmf57eb.eu-west-1.aws.endpoints.huggingface.cloud"

# FIX: fail fast with an explicit message when the token is missing. The
# original `os.environ.get("HF_TOKEN")` returned None and crashed on the
# string concatenation below with an opaque TypeError.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token is None:
    raise RuntimeError("HF_TOKEN environment variable is not set")
headers = {
    "Authorization": "Bearer " + _hf_token,
    "Content-Type": "application/json",
}
def query(payload, allowed_retries=2):
    """POST ``payload`` to the inference endpoint and return the decoded JSON.

    Retries up to ``allowed_retries`` extra times when the endpoint answers
    with an ``{"error": ...}`` body (the endpoint sometimes returns transient
    "Bad gateway" errors), then raises.

    FIXES vs. original: the response body is parsed once instead of up to
    four times per call, retries are iterative instead of recursive, and
    exhaustion raises RuntimeError instead of `assert False` (asserts are
    stripped under `python -O`).
    """
    body = None
    response = None
    for attempt in range(allowed_retries + 1):
        response = requests.post(API_URL, headers=headers, json=payload)
        body = response.json()
        if body is None or "error" not in body:
            return body
        if attempt < allowed_retries:
            print(f"Warning, error {response=} {body=} in the query, relaunching")
    raise RuntimeError(f"Got an error: {response=} {body=}")
# Client-side copy of the compiled model's deployment artifacts; consumed by
# FHEModelClient below (presumably produced by Concrete ML's model-saving
# step — confirm against the deployment pipeline).
path_to_model = Path("compiled_model")
# Decision-tree in FHE
# NOTE(review): fetch_openml and train_test_split are imported but never used
# in this file, and `numpy` duplicates the `np` alias imported at the top —
# candidates for cleanup once confirmed nothing else relies on them.
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
import numpy
####IMPORTING TRAINING AND TEST DATA SETS ######
# NOTE(review): both read_csv calls load the *Train* CSV, so "TestData" is an
# exact duplicate of the training set and the accuracy reported at the end of
# this script is measured on seen data. Confirm the intended test file path
# and fix it.
TrainData = pd.read_csv("/data/R_Module_Day_7.2_Credit_Risk_Train_data.csv")
TestData = pd.read_csv("/data/R_Module_Day_7.2_Credit_Risk_Train_data.csv")

####CREATING A NEW COLUMN SOURCE UNDER BOTH TRAIN AND TEST DATA
# Tag each row with its origin so the two splits can be separated again after
# the shared preprocessing below.
TrainData["Source"] = "Train"
TestData["Source"] = "Test"

####COMBINE BOTH TRAIN AND TEST AS FULL DATA
FullData = pd.concat([TrainData, TestData])

# Notebook-style inspection expressions (no effect when run as a script).
FullData.shape
FullData.head()
FullData.describe()

####Working on Categorical variable Dependents
# "3+" is a string category; collapse it to the number 3 and cast the column
# to float so it can be treated as numeric downstream.
FullData.Dependents.value_counts()
FullData.Dependents = np.where(FullData.Dependents == '3+', 3, FullData.Dependents).astype(float)
FullData.Dependents.value_counts()
FullData.Dependents.dtype

###Finding MISSING VALUES
FullData.isnull().sum()
## MISSING VALUE IMPUTATION
# Impute each feature column from *train-split-only* statistics (median for
# numeric columns, mode for categorical ones) to avoid leaking test-set
# information into the preprocessing.
# FIX: the original used `FullData[col_name].fillna(..., inplace=True)`, a
# chained assignment that operates on a possibly-temporary Series and is not
# guaranteed to write back into the DataFrame in modern pandas (it warns and
# becomes a no-op under Copy-on-Write). Assign the filled column back
# explicitly instead.
for col_name in FullData.columns:
    if col_name in ('Loan_ID', 'Loan_Status', 'Source'):
        continue
    if FullData[col_name].isnull().sum() > 0:
        train_values = FullData.loc[FullData.Source == "Train", col_name]
        if FullData[col_name].dtype != object:
            fill_value = train_values.median()
        else:
            fill_value = train_values.mode()[0]
        FullData[col_name] = FullData[col_name].fillna(fill_value)
FullData.isnull().sum()
###OUTLIER DETECTION AND CORRECTION
def _cap_at_train_percentiles(frame, column, percentiles):
    """Clip ``frame[column]`` from above at successive train-split percentiles.

    Mirrors the original cascade (cap at p99, then p95, then p90): each
    threshold is recomputed on the already-capped data, exactly as the
    repeated one-liners did, but without duplicating the percentile
    expression three times per column.
    """
    for pct in percentiles:
        threshold = np.percentile(frame.loc[frame.Source == "Train", column], pct)
        frame[column] = np.where(frame[column] > threshold, threshold, frame[column])

#ApplicantIncome
FullData.ApplicantIncome.dtype
# Inspection only: upper-tail percentiles of the train split.
np.percentile(FullData.loc[FullData.Source == "Train", "ApplicantIncome"], [95, 96, 97, 98, 99])
_cap_at_train_percentiles(FullData, "ApplicantIncome", (99, 95, 90))
# CoapplicantIncome
FullData.columns
np.percentile(FullData.loc[FullData.Source == "Train", "CoapplicantIncome"], 99)
_cap_at_train_percentiles(FullData, "CoapplicantIncome", (99, 95))
# LoanAmount
np.percentile(FullData.loc[FullData.Source == "Train", "LoanAmount"], 99)
_cap_at_train_percentiles(FullData, "LoanAmount", (99, 95, 90))
########ONE HOT ENCODING OF CATEGORICAL VARIABLES BY CREATING DUMMY VARIABLES ########
# One-hot encode every object-dtype column except the identifier, the split
# tag and the target; drop_first avoids the dummy-variable trap.
categorical_columns = FullData.loc[:, FullData.dtypes == object].columns
dummy_frame = pd.get_dummies(
    FullData[categorical_columns].drop(['Loan_ID', 'Source', 'Loan_Status'], axis=1),
    drop_first=True,
)
dummy_frame.shape
dummy_frame.columns
FullData2 = pd.concat([FullData, dummy_frame], axis=1)
FullData2.shape
# The raw categorical columns are now redundant (replaced by their dummies),
# and Loan_ID carries no predictive signal.
Cols_To_Drop = ['Loan_ID', 'Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']
FullData3 = FullData2.drop(Cols_To_Drop, axis=1).copy()
FullData3.columns
FullData3.shape
# Convert Dependent variable into 0,1. If Loan_Status = N, then 1 else 0
FullData3.Loan_Status = np.where(FullData3.Loan_Status == 'N', 1, 0)
FullData3.Loan_Status.value_counts()
FullData3.shape
FullData3.dtypes
######SAMPLING #######################
# Split the preprocessed table back into its original train/test halves using
# the Source tag, which is dropped once it has served its purpose.
Train = FullData3[FullData3.Source == "Train"].drop("Source", axis=1).copy()
Train.shape
Test = FullData3[FullData3.Source == "Test"].drop("Source", axis=1).copy()
Test.shape

###DIVIDE EACH DATA SET AS INDEPENDENT AND DEPENDENT VARAIBLES
# Features vs. target for each split.
X_train = Train.drop("Loan_Status", axis=1)
Y_train = Train["Loan_Status"].copy()
X_test = Test.drop("Loan_Status", axis=1)
Y_test = Test["Loan_Status"].copy()
# Recover parameters for client side
# Build the Concrete ML client from the deployed model artifacts (presumably
# this loads the quantization and crypto parameters — confirm against the
# FHEModelClient docs).
fhemodel_client = FHEModelClient(path_to_model)
# Generate the keys
# Only the *evaluation* keys are serialized and shipped to the server below;
# the private key never leaves this process.
fhemodel_client.generate_private_and_evaluation_keys()
evaluation_keys = fhemodel_client.get_serialized_evaluation_keys()
# Save the key in the database
# State for the chunked upload loop that follows: the key is sent in ~100 MB
# pieces (presumably to stay under the endpoint's request-size limit).
evaluation_keys_remaining = evaluation_keys[:]  # working copy consumed packet by packet
uid = None  # server-side identifier for the stored key; set by the first packet
is_first = True  # first packet uses "save_key", subsequent ones "append_key"
is_finished = False
i = 0  # packet counter, used only for logging
packet_size = 1024 * 1024 * 100
# Upload the serialized evaluation keys to the server in fixed-size packets.
# FIX: the original measured the remaining key with sys.getsizeof(), which
# returns the Python object size (payload plus object header), not the number
# of bytes that the [:packet_size] slice actually sends; len() is the correct
# measure for a bytes payload. The exhausted sentinel is b"" rather than None
# so the final log line reports a true 0-byte remainder instead of
# getsizeof(None).
while not is_finished:
    # Send by packets of 100M
    if len(evaluation_keys_remaining) > packet_size:
        evaluation_keys_piece = evaluation_keys_remaining[:packet_size]
        evaluation_keys_remaining = evaluation_keys_remaining[packet_size:]
    else:
        evaluation_keys_piece = evaluation_keys_remaining
        evaluation_keys_remaining = b""
        is_finished = True
    print(
        f"Sending {i}-th piece of the key (remaining size is {len(evaluation_keys_remaining) / 1024:.2f} kbytes)"
    )
    i += 1
    if is_first:
        # The first packet creates the server-side entry and returns the uid
        # under which the remaining packets (and inference calls) are filed.
        is_first = False
        payload = {
            "inputs": "fake",
            "evaluation_keys": to_json(evaluation_keys_piece),
            "method": "save_key",
        }
        uid = query(payload)["uid"]
        print(f"Storing the key in the database under {uid=}")
    else:
        payload = {
            "inputs": "fake",
            "evaluation_keys": to_json(evaluation_keys_piece),
            "method": "append_key",
            "uid": uid,
        }
        query(payload)
# Test the handler
nb_good = 0
nb_samples = len(X_test)
verbose = True
time_start = time.time()
duration = 0  # cumulative server-call time across all samples
is_first = True

# FIX: the original indexed `X_test[i]`, which pandas interprets as a *column
# label* lookup on a DataFrame and raises KeyError for an integer position
# (and `Y_test[i]` relied on the test split keeping a 0..n-1 label index).
# Convert both to NumPy once so every access below is plainly positional.
X_test_np = X_test.to_numpy()
Y_test_np = Y_test.to_numpy()

for i in range(nb_samples):
    # Quantize the input and encrypt it
    encrypted_inputs = fhemodel_client.quantize_encrypt_serialize(X_test_np[i].reshape(1, -1))
    # Prepare the payload
    payload = {
        "inputs": "fake",
        "encrypted_inputs": to_json(encrypted_inputs),
        "method": "inference",
        "uid": uid,
    }
    if is_first:
        print(f"Size of the payload: {sys.getsizeof(payload) / 1024:.2f} kilobytes")
        is_first = False
    # Run the inference on HF servers, tracking both the cumulative and the
    # per-call wall-clock time around the HTTP round trip.
    duration -= time.time()
    duration_inference = -time.time()
    encrypted_prediction = query(payload)
    duration += time.time()
    duration_inference += time.time()
    encrypted_prediction = from_json(encrypted_prediction)
    # Decrypt the result and dequantize
    prediction_proba = fhemodel_client.deserialize_decrypt_dequantize(encrypted_prediction)[0]
    prediction = np.argmax(prediction_proba)
    if verbose:
        print(
            f"for {i}-th input, {prediction=} with expected {Y_test_np[i]} in {duration_inference:.3f} seconds"
        )
    # Measure accuracy
    nb_good += Y_test_np[i] == prediction
print(f"Accuracy on {nb_samples} samples is {nb_good * 1. / nb_samples}")
print(f"Total time: {time.time() - time_start:.3f} seconds")
print(f"Duration per inference: {duration / nb_samples:.3f} seconds")