|
|
import pickle |
|
|
import torch |
|
|
import pandas as pd |
|
|
from sklearn.model_selection import GridSearchCV, train_test_split |
|
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer |
|
|
from concrete.ml.sklearn import XGBClassifier |
|
|
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay |
|
|
import numpy as np |
|
|
import tqdm |
|
|
from pathlib import Path |
|
|
from concrete.ml.deployment import FHEModelDev |
|
|
|
|
|
|
|
|
# Load the airline tweets dataset; the first CSV column (tweet id) is the index.
train = pd.read_csv(
    "../dataset/local_datasets/twitter-airline-sentiment/Tweets.csv", index_col=0
)

# Raw tweet text is the model input.
text_X = train["text"]

# Encode sentiment labels as integers. An explicit `.map` with a dict replaces
# the original list-based `Series.replace`, which hits pandas' deprecated
# object-Series downcasting path and hides the label <-> code correspondence.
y = train["airline_sentiment"].map({"negative": 0, "neutral": 1, "positive": 2})

# Hold out 10% of the tweets for evaluation; fixed seed for reproducibility.
text_X_train, text_X_test, y_train, y_test = train_test_split(
    text_X, y, test_size=0.1, random_state=42
)
|
|
|
|
|
|
|
|
# Run the transformer on the first GPU when available, otherwise on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Pre-trained RoBERTa sentiment checkpoint from Cardiff NLP. The tokenizer and
# the classifier must come from the same checkpoint, so name it once.
checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
transformer_model = AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)
|
|
|
|
|
|
|
|
|
|
|
def text_to_tensor(list_text, transformer_model, tokenizer, device):
    """Embed each text as the mean of the transformer's last hidden state.

    Args:
        list_text: list of strings to embed.
        transformer_model: transformer that returns hidden states when called
            with ``output_hidden_states=True``.
        tokenizer: tokenizer matching ``transformer_model``.
        device: torch device string the model lives on (e.g. "cpu", "cuda:0").

    Returns:
        ``np.ndarray`` of shape ``(len(list_text), hidden_size)`` — one
        embedding row per input text.
    """
    # Texts are tokenized one by one (no padding/batching), so each tensor
    # keeps its own sequence length.
    tokenized_text = [tokenizer.encode(text, return_tensors="pt") for text in list_text]

    output_hidden_states_list = [None] * len(tokenized_text)

    # Inference only: disabling autograd avoids building the computation
    # graph. The original tracked gradients through the forward pass and
    # only `.detach()`ed afterwards, wasting memory and time.
    with torch.no_grad():
        for i, tokenized_x in enumerate(tqdm.tqdm(tokenized_text)):
            # Index [1] is the hidden-states tuple (index [0] is the logits);
            # [-1] selects the last transformer layer: (1, seq_len, hidden).
            last_layer = transformer_model(
                tokenized_x.to(device), output_hidden_states=True
            )[1][-1]
            # Average over the token (sequence) dimension -> (1, hidden_size).
            output_hidden_states_list[i] = last_layer.mean(dim=1).cpu().numpy()

    # Stack per-text rows into a single (n_texts, hidden_size) array.
    return np.concatenate(output_hidden_states_list, axis=0)
|
|
|
|
|
|
|
|
|
|
|
# Turn both text splits into fixed-size transformer embeddings; the XGBoost
# classifier below trains on these vectors, not on the raw text.
X_train_transformer, X_test_transformer = (
    text_to_tensor(split.tolist(), transformer_model, tokenizer, device)
    for split in (text_X_train, text_X_test)
)
|
|
|
|
|
|
|
|
# Quantization-aware XGBoost from Concrete ML; `n_bits` sets the quantization
# precision of the trained model.
model = XGBClassifier()
parameters = {"n_bits": [2, 3], "max_depth": [1], "n_estimators": [10, 30, 50]}

# 5-fold cross-validated search over the grid above, selected by accuracy.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    cv=5,
    scoring="accuracy",
)
grid_search.fit(X_train_transformer, y_train)
|
|
|
|
|
|
|
|
# Keep the best cross-validated estimator for evaluation and deployment.
best_model = grid_search.best_estimator_

# Predict on the held-out tweets (clear inference, not FHE).
y_pred = best_model.predict(X_test_transformer)

# Confusion matrix over the three classes (0=negative, 1=neutral, 2=positive,
# per the label encoding above).
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
ConfusionMatrixDisplay(matrix).plot()
|
|
|
|
|
|
|
|
# Compile the quantized model to its FHE circuit, using the training
# embeddings as the representative input set.
best_model.compile(X_train_transformer)

# End-to-end sanity check on one tweet: embed it, then run encrypted
# inference (fhe="execute" performs actual FHE, not simulation).
tested_tweet = ["AirFrance is awesome, almost as much as Zama!"]
X_tested_tweet = text_to_tensor(tested_tweet, transformer_model, tokenizer, device)
decrypted_proba = best_model.predict_proba(X_tested_tweet, fhe="execute")
|
|
|
|
|
|
|
|
# Directory holding every artifact needed to serve the model.
DEPLOYMENT_DIR = Path("../deployment")
DEPLOYMENT_DIR.mkdir(exist_ok=True)

# Export the FHE client/server deployment artifacts for the compiled model.
# NOTE(review): presumably `via_mlir=True` selects the MLIR serialization of
# the circuit — confirm against the Concrete ML deployment docs.
fhe_api = FHEModelDev(DEPLOYMENT_DIR / "sentiment_fhe_model", best_model)
fhe_api.save(via_mlir=True)

# Also keep a plain text dump of the model itself (opened in text mode, so
# `dump` is expected to write text).
with (DEPLOYMENT_DIR / "serialized_model").open("w") as file:
    best_model.dump(file)
|
|
|
|
|
|
|
|
# Persist the model's state dict with pickle as well (binary mode).
# NOTE(review): pickle is acceptable here because this file is both written
# and read by us; never unpickle files from untrusted sources.
with (DEPLOYMENT_DIR / "serialized_model_zkml").open("wb") as file:
    pickle.dump(best_model.dump_dict(), file)