MyronZhang committed on
Commit
8f80484
·
1 Parent(s): 5e6e795

Sync from GitHub

Browse files
app.py CHANGED
@@ -16,7 +16,7 @@ import time
16
  # This repository's directory
17
  REPO_DIR = Path(__file__).parent
18
 
19
- subprocess.Popen(["anvil", "-p", "3030"], cwd=REPO_DIR)
20
  subprocess.Popen(["uvicorn", "server:app", "--port", "8000"], cwd=REPO_DIR)
21
  subprocess.Popen(["uvicorn", "zkml_non_encrypted:app", "--port", "8001"], cwd=REPO_DIR)
22
  subprocess.Popen(["uvicorn", "zkml_encrypted:app", "--port", "8002"], cwd=REPO_DIR)
 
16
  # This repository's directory
17
  REPO_DIR = Path(__file__).parent
18
 
19
+ # subprocess.Popen(["anvil", "-p", "3030"], cwd=REPO_DIR)
20
  subprocess.Popen(["uvicorn", "server:app", "--port", "8000"], cwd=REPO_DIR)
21
  subprocess.Popen(["uvicorn", "zkml_non_encrypted:app", "--port", "8001"], cwd=REPO_DIR)
22
  subprocess.Popen(["uvicorn", "zkml_encrypted:app", "--port", "8002"], cwd=REPO_DIR)
hf_repo/.github/workflows/sync_to_hf.yml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ sync:
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - name: Checkout repository
14
+ uses: actions/checkout@v3
15
+
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v4
18
+ with:
19
+ python-version: '3.9' # Specify your Python version
20
+
21
+ - name: Install dependencies
22
+ run: |
23
+ pip install huggingface_hub
24
+
25
+ - name: Sync to Hugging Face
26
+ env:
27
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
28
+ run: |
29
+ # Configure git
30
+ git config --global user.email "myronzhangweb3@gmail.com"
31
+ git config --global user.name "Myron Zhang"
32
+
33
+ # Clone the Hugging Face repository
34
+ git clone https://myronzhangweb3:$HF_TOKEN@huggingface.co/spaces/PrivEcho/encrypted_sentiment_analysis hf_repo
35
+ cd hf_repo
36
+
37
+ # Copy files from the GitHub repository
38
+ rsync -av --exclude='.git' ../ .
39
+
40
+ # Commit and push changes to Hugging Face
41
+ git add .
42
+ git commit -m "Sync from GitHub"
43
+ git push
hf_repo/deployment/serialized_model_zkml ADDED
Binary file (523 kB). View file
 
hf_repo/hf_repo/.gitattributes ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.npy filter=lfs diff=lfs merge=lfs -text
14
+ *.npz filter=lfs diff=lfs merge=lfs -text
15
+ *.onnx filter=lfs diff=lfs merge=lfs -text
16
+ *.ot filter=lfs diff=lfs merge=lfs -text
17
+ *.parquet filter=lfs diff=lfs merge=lfs -text
18
+ *.pb filter=lfs diff=lfs merge=lfs -text
19
+ *.pickle filter=lfs diff=lfs merge=lfs -text
20
+ *.pkl filter=lfs diff=lfs merge=lfs -text
21
+ *.pt filter=lfs diff=lfs merge=lfs -text
22
+ *.pth filter=lfs diff=lfs merge=lfs -text
23
+ *.rar filter=lfs diff=lfs merge=lfs -text
24
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
25
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
26
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
27
+ *.tflite filter=lfs diff=lfs merge=lfs -text
28
+ *.tgz filter=lfs diff=lfs merge=lfs -text
29
+ *.wasm filter=lfs diff=lfs merge=lfs -text
30
+ *.xz filter=lfs diff=lfs merge=lfs -text
31
+ *.zip filter=lfs diff=lfs merge=lfs -text
32
+ *.zst filter=lfs diff=lfs merge=lfs -text
33
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
hf_repo/hf_repo/.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ tmp/
2
+ .venv
3
+ .fhe_keys
4
+ *.pyc
5
+ local_datasets/
6
+ .vscode/
7
+ /.idea
8
+ /zkml_encrypted
9
+ /zkml_non_encrypted
hf_repo/hf_repo/README.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Sentiment Analysis On Encrypted Data Using Fully Homomorphic Encryption
3
+ emoji: 🥷💬
4
+ colorFrom: yellow
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 4.44.0
8
+ app_file: app.py
9
+ pinned: true
10
+ tags: [FHE, PPML, privacy, privacy preserving machine learning, homomorphic encryption, security]
11
+ python_version: 3.10.11
12
+ ---
13
+
14
+ # Sentiment Analysis With FHE
15
+
16
+ ## Set up the app locally
17
+
18
+ - First, create a virtual env and activate it:
19
+
20
+ ```bash
21
+ python3 -m venv .venv
22
+ source .venv/bin/activate
23
+ ```
24
+
25
+ - Then, install required packages:
26
+
27
+ ```bash
28
+ pip3 install pip --upgrade
29
+ pip3 install -U pip wheel setuptools --ignore-installed
30
+ pip3 install -r requirements.txt --ignore-installed
31
+
32
+ # mac z3
33
+ brew install z3
34
+ pip3 uninstall z3-solver
35
+ pip3 install z3-solver
36
+ pip3 install more-itertools
37
+ ```
38
+
39
+ Check that it finishes well (with a "Done!"). Please note that the actual model initialization and training
40
+ can be found in the [SentimentClassification notebook](SentimentClassification.ipynb) (see below).
41
+
42
+ ### Launch the app locally
43
+
44
+ - In a terminal:
45
+
46
+ ```bash
47
+ source .venv/bin/activate
48
+ python3 app.py
49
+ ```
50
+
51
+ ## Interact with the application
52
+
53
+ Open the given URL link (search for a line like `Running on local URL: http://127.0.0.1:8888/` in the
54
+ terminal).
hf_repo/hf_repo/SentimentClassification.ipynb ADDED
@@ -0,0 +1,1053 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Sentiment Classification with FHE\n",
8
+ "\n",
9
+ "This notebook tackles sentiment classification with Fully Homomorphic Encryption. Let's imagine some client (could be a user or a company) wants to predict whether a specific text (e.g., a tweet) contains positive, neutral or negative feedback using a cloud service provider without actually revealing the text during the process.\n",
10
+ "\n",
11
+ "To do this, we use a machine learning model that can predict over encrypted data thanks to the Concrete-ML library available on [GitHub](https://github.com/zama-ai/concrete-ml).\n",
12
+ "\n",
13
+ "The dataset we use in this notebook can be found on [Kaggle](https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment). \n",
14
+ " \n",
15
+ "We present two different ways to encode the text:\n",
16
+ "1. A basic **TF-IDF** approach, which essentially looks at how often a word appears in the text.\n",
17
+ "2. An advanced **transformer** embedding of the text using the Huggingface repository.\n",
18
+ "\n",
19
+ "The main assumption of this notebook is that clients, who want to have their text analyzed in a privacy-preserving manner, can encode the text using a predefined representation before encrypting the data. The FHE-friendly model is thus trained in the clear beforehand for the given task, here classification, over these representations using a relevant training set."
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 1,
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "# Import the required packages\n",
29
+ "import os\n",
30
+ "import time\n",
31
+ "from pathlib import Path\n",
32
+ "\n",
33
+ "import numpy\n",
34
+ "import pandas as pd\n",
35
+ "from sklearn.metrics import average_precision_score\n",
36
+ "from sklearn.model_selection import GridSearchCV, train_test_split\n",
37
+ "\n",
38
+ "from concrete.ml.sklearn import XGBClassifier"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 2,
44
+ "metadata": {},
45
+ "outputs": [
46
+ {
47
+ "name": "stdout",
48
+ "output_type": "stream",
49
+ "text": [
50
+ "Proportion of positive examples: 16.14%\n",
51
+ "Proportion of negative examples: 62.69%\n",
52
+ "Proportion of neutral examples: 21.17%\n"
53
+ ]
54
+ }
55
+ ],
56
+ "source": [
57
+ "# Download the datasets\n",
58
+ "# The dataset can be downloaded through the `download_data.sh` script, which requires to set up\n",
59
+ "# Kaggle's CLI, or manually at https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment\n",
60
+ "if not os.path.isfile(\"local_datasets/twitter-airline-sentiment/Tweets.csv\"):\n",
61
+ " raise ValueError(\"Please launch the `download_data.sh` script to get datasets\")\n",
62
+ "\n",
63
+ "\n",
64
+ "train = pd.read_csv(\"local_datasets/twitter-airline-sentiment/Tweets.csv\", index_col=0)\n",
65
+ "text_X = train[\"text\"]\n",
66
+ "y = train[\"airline_sentiment\"]\n",
67
+ "y = y.replace([\"negative\", \"neutral\", \"positive\"], [0, 1, 2])\n",
68
+ "\n",
69
+ "pos_ratio = y.value_counts()[2] / y.value_counts().sum()\n",
70
+ "neg_ratio = y.value_counts()[0] / y.value_counts().sum()\n",
71
+ "neutral_ratio = y.value_counts()[1] / y.value_counts().sum()\n",
72
+ "print(f\"Proportion of positive examples: {round(pos_ratio * 100, 2)}%\")\n",
73
+ "print(f\"Proportion of negative examples: {round(neg_ratio * 100, 2)}%\")\n",
74
+ "print(f\"Proportion of neutral examples: {round(neutral_ratio * 100, 2)}%\")"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 3,
80
+ "metadata": {},
81
+ "outputs": [],
82
+ "source": [
83
+ "# Split in train test\n",
84
+ "text_X_train, text_X_test, y_train, y_test = train_test_split(\n",
85
+ " text_X, y, test_size=0.1, random_state=42\n",
86
+ ")"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "markdown",
91
+ "metadata": {},
92
+ "source": [
93
+ "### 1. Text representation using TF-IDF\n",
94
+ "\n",
95
+ "[Term Frequency-Inverse Document Frequency](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) (TF-IDF) is a numerical statistic that is used to compute the importance of a term in a document. The higher the TF-IDF score, the more important the term is to the document.\n",
96
+ "\n",
97
+ "We compute it as follows:\n",
98
+ "\n",
99
+ "$$ \\mathsf{TF\\textrm{-}IDF}(t,d,D) = \\mathsf{TF}(t,d) * \\mathsf{IDF}(t,D) $$\n",
100
+ "\n",
101
+ "where: $\\mathsf{TF}(t,d)$ is the term frequency of term $t$ in document $d$, $\\mathsf{IDF}(t,D)$ is the inverse document frequency of term $t$ in document collection $D$.\n",
102
+ "\n",
103
+ "Here we use the scikit-learn implementation of TF-IDF vectorizer."
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": 4,
109
+ "metadata": {},
110
+ "outputs": [],
111
+ "source": [
112
+ "# Let's first build a representation vector from the text\n",
113
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
114
+ "\n",
115
+ "tfidf_vectorizer = TfidfVectorizer(max_features=500, stop_words=\"english\")\n",
116
+ "X_train = tfidf_vectorizer.fit_transform(text_X_train)\n",
117
+ "X_test = tfidf_vectorizer.transform(text_X_test)\n",
118
+ "\n",
119
+ "# Make our train and test dense array\n",
120
+ "X_train = X_train.toarray()\n",
121
+ "X_test = X_test.toarray()"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "code",
126
+ "execution_count": 5,
127
+ "metadata": {},
128
+ "outputs": [],
129
+ "source": [
130
+ "# Let's build our model\n",
131
+ "model = XGBClassifier()\n",
132
+ "\n",
133
+ "# A gridsearch to find the best parameters\n",
134
+ "parameters = {\n",
135
+ " \"n_bits\": [2, 3],\n",
136
+ " \"max_depth\": [1],\n",
137
+ " \"n_estimators\": [10, 30, 50],\n",
138
+ " # \"n_jobs\": [-1],\n",
139
+ "}"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": 6,
145
+ "metadata": {},
146
+ "outputs": [
147
+ {
148
+ "data": {
149
+ "text/html": [
150
+ "<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>GridSearchCV(cv=3, estimator=XGBClassifier(n_jobs=1),\n",
151
+ " param_grid={&#x27;max_depth&#x27;: [1], &#x27;n_bits&#x27;: [2, 3],\n",
152
+ " &#x27;n_estimators&#x27;: [10, 30, 50]},\n",
153
+ " scoring=&#x27;accuracy&#x27;)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" ><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">GridSearchCV</label><div class=\"sk-toggleable__content\"><pre>GridSearchCV(cv=3, estimator=XGBClassifier(n_jobs=1),\n",
154
+ " param_grid={&#x27;max_depth&#x27;: [1], &#x27;n_bits&#x27;: [2, 3],\n",
155
+ " &#x27;n_estimators&#x27;: [10, 30, 50]},\n",
156
+ " scoring=&#x27;accuracy&#x27;)</pre></div></div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" ><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">estimator: XGBClassifier</label><div class=\"sk-toggleable__content\"><pre>XGBClassifier(n_jobs=1)</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-3\" type=\"checkbox\" ><label for=\"sk-estimator-id-3\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">XGBClassifier</label><div class=\"sk-toggleable__content\"><pre>XGBClassifier(n_jobs=1)</pre></div></div></div></div></div></div></div></div></div></div>"
157
+ ],
158
+ "text/plain": [
159
+ "GridSearchCV(cv=3, estimator=XGBClassifier(n_jobs=1),\n",
160
+ " param_grid={'max_depth': [1], 'n_bits': [2, 3],\n",
161
+ " 'n_estimators': [10, 30, 50]},\n",
162
+ " scoring='accuracy')"
163
+ ]
164
+ },
165
+ "execution_count": 6,
166
+ "metadata": {},
167
+ "output_type": "execute_result"
168
+ }
169
+ ],
170
+ "source": [
171
+ "# Run the gridsearch\n",
172
+ "grid_search = GridSearchCV(model, parameters, cv=3, scoring=\"accuracy\")\n",
173
+ "grid_search.fit(X_train, y_train)"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": 7,
179
+ "metadata": {},
180
+ "outputs": [
181
+ {
182
+ "name": "stdout",
183
+ "output_type": "stream",
184
+ "text": [
185
+ "Best score: 0.705980570734669\n",
186
+ "Best parameters: {'max_depth': 1, 'n_bits': 3, 'n_estimators': 50}\n"
187
+ ]
188
+ }
189
+ ],
190
+ "source": [
191
+ "# Check the accuracy of the best model\n",
192
+ "print(f\"Best score: {grid_search.best_score_}\")\n",
193
+ "\n",
194
+ "# Check best hyperparameters\n",
195
+ "print(f\"Best parameters: {grid_search.best_params_}\")\n",
196
+ "\n",
197
+ "# Extract best model\n",
198
+ "best_model = grid_search.best_estimator_"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": 8,
204
+ "metadata": {},
205
+ "outputs": [
206
+ {
207
+ "name": "stdout",
208
+ "output_type": "stream",
209
+ "text": [
210
+ "Accuracy: 0.7117\n",
211
+ "Average precision score for positive class: 0.6404\n",
212
+ "Average precision score for negative class: 0.8719\n",
213
+ "Average precision score for neutral class: 0.4349\n"
214
+ ]
215
+ }
216
+ ],
217
+ "source": [
218
+ "# Compute the average precision for each class\n",
219
+ "y_proba_test_tfidf = best_model.predict_proba(X_test)\n",
220
+ "\n",
221
+ "# Compute accuracy\n",
222
+ "y_pred_test_tfidf = numpy.argmax(y_proba_test_tfidf, axis=1)\n",
223
+ "accuracy_tfidf = numpy.mean(y_pred_test_tfidf == y_test)\n",
224
+ "print(f\"Accuracy: {accuracy_tfidf:.4f}\")\n",
225
+ "\n",
226
+ "y_pred_positive = y_proba_test_tfidf[:, 2]\n",
227
+ "y_pred_negative = y_proba_test_tfidf[:, 0]\n",
228
+ "y_pred_neutral = y_proba_test_tfidf[:, 1]\n",
229
+ "\n",
230
+ "ap_positive_tfidf = average_precision_score((y_test == 2), y_pred_positive)\n",
231
+ "ap_negative_tfidf = average_precision_score((y_test == 0), y_pred_negative)\n",
232
+ "ap_neutral_tfidf = average_precision_score((y_test == 1), y_pred_neutral)\n",
233
+ "\n",
234
+ "print(f\"Average precision score for positive class: \" f\"{ap_positive_tfidf:.4f}\")\n",
235
+ "print(f\"Average precision score for negative class: \" f\"{ap_negative_tfidf:.4f}\")\n",
236
+ "print(f\"Average precision score for neutral class: \" f\"{ap_neutral_tfidf:.4f}\")"
237
+ ]
238
+ },
239
+ {
240
+ "cell_type": "code",
241
+ "execution_count": 9,
242
+ "metadata": {},
243
+ "outputs": [
244
+ {
245
+ "data": {
246
+ "text/plain": [
247
+ "array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
248
+ " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
249
+ " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
250
+ " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
251
+ " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
252
+ " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
253
+ " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
254
+ " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
255
+ " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
256
+ " 2, 2, 2, 2, 2, 2])"
257
+ ]
258
+ },
259
+ "execution_count": 9,
260
+ "metadata": {},
261
+ "output_type": "execute_result"
262
+ }
263
+ ],
264
+ "source": [
265
+ "y_pred_test_tfidf[y_pred_test_tfidf == 2]"
266
+ ]
267
+ },
268
+ {
269
+ "cell_type": "code",
270
+ "execution_count": 10,
271
+ "metadata": {},
272
+ "outputs": [
273
+ {
274
+ "name": "stdout",
275
+ "output_type": "stream",
276
+ "text": [
277
+ "5 most positive tweets (class 2):\n",
278
+ "@JetBlue do bags still fly free or have you started charging? thanks!\n",
279
+ "@SouthwestAir Is there a way to receive a refund on a trip that was Cancelled Flight online instead of calling? Your phone lines are super busy.\n",
280
+ "@JetBlue bag is supposedly here in Boston\n",
281
+ "@AmericanAir Cancelled Flights my flight, doesn't send an email, text or call. Now I'm stranded in Louisville.\n",
282
+ "@SouthwestAir I need to Cancelled Flight one leg of a flight, but can't seem to do this online. Been on hold on the phone for 10 minutes. Any help?\n",
283
+ "----------------------------------------------------------------------------------------------------\n",
284
+ "5 most negative tweets (class 0):\n",
285
+ "@AmericanAir - keeping AA up in the Air! My crew chief cousin Alex Espinosa in DFW! http://t.co/0HXLNvZknP\n",
286
+ "@JetBlue Called JB 3 times!Everytime, Auto Vmsg:\"your wait time should not be longer than 9 mins\" waited longer than 18 mins and no answer!\n",
287
+ "@SouthwestAir can you outline the policies for both scenarios?\n",
288
+ "@united is not a company that values it's customer &amp; after reading tweets to them I'm not the only one who feels that way #lostmybusiness\n",
289
+ "@JetBlue how about free wifi on flt 1254 out of PBI to make up for 2.5 hr delay? Treat us right.\n"
290
+ ]
291
+ }
292
+ ],
293
+ "source": [
294
+ "# Let's see what are the top predictions based on the probabilities in y_pred_test\n",
295
+ "print(\"5 most positive tweets (class 2):\")\n",
296
+ "for i in range(5):\n",
297
+ " print(text_X_test.iloc[y_pred_test_tfidf[y_pred_test_tfidf==2].argsort()[-1 - i]])\n",
298
+ "\n",
299
+ "print(\"-\" * 100)\n",
300
+ "\n",
301
+ "print(\"5 most negative tweets (class 0):\")\n",
302
+ "for i in range(5):\n",
303
+ " print(text_X_test.iloc[y_pred_test_tfidf[y_pred_test_tfidf==0].argsort()[-1 - i]])"
304
+ ]
305
+ },
306
+ {
307
+ "cell_type": "code",
308
+ "execution_count": 11,
309
+ "metadata": {},
310
+ "outputs": [
311
+ {
312
+ "name": "stdout",
313
+ "output_type": "stream",
314
+ "text": [
315
+ "Compilation time: 5.3550 seconds\n",
316
+ "FHE inference time: 1.1162 seconds\n"
317
+ ]
318
+ }
319
+ ],
320
+ "source": [
321
+ "# Compile the model to get the FHE inference engine\n",
322
+ "# (this may take a few minutes depending on the selected model)\n",
323
+ "start = time.perf_counter()\n",
324
+ "best_model.compile(X_train)\n",
325
+ "end = time.perf_counter()\n",
326
+ "print(f\"Compilation time: {end - start:.4f} seconds\")\n",
327
+ "\n",
328
+ "# Let's write a custom example and predict in FHE\n",
329
+ "tested_tweet = [\"AirFrance is awesome, almost as much as Zama!\"]\n",
330
+ "X_tested_tweet = tfidf_vectorizer.transform(numpy.array(tested_tweet)).toarray()\n",
331
+ "clear_proba = best_model.predict_proba(X_tested_tweet)\n",
332
+ "\n",
333
+ "# Now let's predict with FHE over a single tweet and print the time it takes\n",
334
+ "start = time.perf_counter()\n",
335
+ "decrypted_proba = best_model.predict_proba(X_tested_tweet, fhe=\"execute\")\n",
336
+ "end = time.perf_counter()\n",
337
+ "print(f\"FHE inference time: {end - start:.4f} seconds\")"
338
+ ]
339
+ },
340
+ {
341
+ "cell_type": "code",
342
+ "execution_count": 12,
343
+ "metadata": {},
344
+ "outputs": [
345
+ {
346
+ "name": "stdout",
347
+ "output_type": "stream",
348
+ "text": [
349
+ "Probabilities from the FHE inference: [[0.30244059 0.17506451 0.5224949 ]]\n",
350
+ "Probabilities from the clear model: [[0.30244059 0.17506451 0.5224949 ]]\n"
351
+ ]
352
+ }
353
+ ],
354
+ "source": [
355
+ "print(f\"Probabilities from the FHE inference: {decrypted_proba}\")\n",
356
+ "print(f\"Probabilities from the clear model: {clear_proba}\")"
357
+ ]
358
+ },
359
+ {
360
+ "cell_type": "markdown",
361
+ "metadata": {},
362
+ "source": [
363
+ "To sum up, \n",
364
+ "- We trained an XGBoost model over the TF-IDF representation of the tweets and their respective sentiment class. \n",
365
+ "- The grid search gives us a model that achieves around 70% accuracy.\n",
366
+ "- Given the imbalance in the classes, we prefer to look at the average precision per class.\n",
367
+ "\n",
368
+ "Now we will see how we can approach the problem by leveraging the power of transformers."
369
+ ]
370
+ },
371
+ {
372
+ "cell_type": "markdown",
373
+ "metadata": {},
374
+ "source": [
375
+ "### 2. A transformer approach to text representation\n",
376
+ "\n",
377
+ "[**Transformers**](https://en.wikipedia.org/wiki/Transformer_(machine_learning_model\\)) are neural networks that are often trained to predict the next words to appear in a text (this is commonly called self-supervised learning). \n",
378
+ "\n",
379
+ "They are powerful tools for all kinds of Natural Language Processing tasks but supporting a transformer model in FHE might not always be ideal as they are quite big models. However, we can still leverage their hidden representation for any text and feed it to a more FHE-friendly machine learning model (in this notebook we will use XGBoost) for classification.\n",
380
+ "\n",
381
+ "Here we will use the transformer model from the amazing [**Huggingface**](https://huggingface.co/) repository."
382
+ ]
383
+ },
384
+ {
385
+ "cell_type": "code",
386
+ "execution_count": 13,
387
+ "metadata": {},
388
+ "outputs": [
389
+ {
390
+ "name": "stderr",
391
+ "output_type": "stream",
392
+ "text": [
393
+ "Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\n",
394
+ "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
395
+ "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
396
+ ]
397
+ }
398
+ ],
399
+ "source": [
400
+ "import torch\n",
401
+ "import tqdm\n",
402
+ "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n",
403
+ "\n",
404
+ "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
405
+ "\n",
406
+ "# Load the tokenizer (converts text to tokens)\n",
407
+ "tokenizer = AutoTokenizer.from_pretrained(\"cardiffnlp/twitter-roberta-base-sentiment-latest\")\n",
408
+ "\n",
409
+ "# Load the pre-trained model\n",
410
+ "transformer_model = AutoModelForSequenceClassification.from_pretrained(\n",
411
+ " \"cardiffnlp/twitter-roberta-base-sentiment-latest\"\n",
412
+ ")"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": 14,
418
+ "metadata": {},
419
+ "outputs": [
420
+ {
421
+ "name": "stderr",
422
+ "output_type": "stream",
423
+ "text": [
424
+ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
425
+ "To disable this warning, you can either:\n",
426
+ "\t- Avoid using `tokenizers` before the fork if possible\n",
427
+ "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
428
+ " 0%| | 0/30 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.\n",
429
+ "100%|██████████| 30/30 [00:20<00:00, 1.45it/s]\n"
430
+ ]
431
+ }
432
+ ],
433
+ "source": [
434
+ "# Let's first see how the model performs by itself\n",
435
+ "list_text_X_test = text_X_test.tolist()\n",
436
+ "\n",
437
+ "tokenized_text_X_test = tokenizer.batch_encode_plus(\n",
438
+ " list_text_X_test, pad_to_max_length=True, return_tensors=\"pt\"\n",
439
+ ")[\"input_ids\"]\n",
440
+ "\n",
441
+ "# Depending on the hardware used, the number of examples to be processed can be reduced\n",
442
+ "# Here we split the data into batches of 50 examples\n",
443
+ "tokenized_text_X_test_split = torch.split(tokenized_text_X_test, split_size_or_sections=50)\n",
444
+ "transformer_model = transformer_model.to(device)\n",
445
+ "\n",
446
+ "outputs = []\n",
447
+ "for tokenized_x_test in tqdm.tqdm(tokenized_text_X_test_split):\n",
448
+ " tokenized_x = tokenized_x_test.to(device)\n",
449
+ " output_batch = transformer_model(tokenized_x)[\"logits\"]\n",
450
+ " output_batch = output_batch.detach().cpu().numpy()\n",
451
+ " outputs.append(output_batch)\n",
452
+ "\n",
453
+ "outputs = numpy.concatenate(outputs, axis=0)"
454
+ ]
455
+ },
456
+ {
457
+ "cell_type": "code",
458
+ "execution_count": 15,
459
+ "metadata": {},
460
+ "outputs": [
461
+ {
462
+ "name": "stdout",
463
+ "output_type": "stream",
464
+ "text": [
465
+ "Predictions for the first 3 tweets:\n",
466
+ " [[-2.3807454 -0.61802197 2.9900734 ]\n",
467
+ " [ 2.0166504 0.49380752 -2.8006463 ]\n",
468
+ " [ 2.3892734 0.13443531 -2.6873832 ]]\n"
469
+ ]
470
+ }
471
+ ],
472
+ "source": [
473
+ "# Let's see what the transformer model predicts\n",
474
+ "print(f\"Predictions for the first 3 tweets:\\n {outputs[:3]}\")"
475
+ ]
476
+ },
477
+ {
478
+ "cell_type": "code",
479
+ "execution_count": 16,
480
+ "metadata": {},
481
+ "outputs": [
482
+ {
483
+ "name": "stdout",
484
+ "output_type": "stream",
485
+ "text": [
486
+ "Accuracy: 0.8053\n",
487
+ "Average precision score for positive class: 0.8548\n",
488
+ "Average precision score for negative class: 0.9548\n",
489
+ "Average precision score for neutral class: 0.6801\n"
490
+ ]
491
+ }
492
+ ],
493
+ "source": [
494
+ "# Compute the metrics for each class\n",
495
+ "\n",
496
+ "# Compute accuracy\n",
497
+ "accuracy_transformer_only = numpy.mean(numpy.argmax(outputs, axis=1) == y_test)\n",
498
+ "print(f\"Accuracy: {accuracy_transformer_only:.4f}\")\n",
499
+ "\n",
500
+ "y_pred_positive = outputs[:, 2]\n",
501
+ "y_pred_negative = outputs[:, 0]\n",
502
+ "y_pred_neutral = outputs[:, 1]\n",
503
+ "\n",
504
+ "ap_positive_transformer_only = average_precision_score((y_test == 2), y_pred_positive)\n",
505
+ "ap_negative_transformer_only = average_precision_score((y_test == 0), y_pred_negative)\n",
506
+ "ap_neutral_transformer_only = average_precision_score((y_test == 1), y_pred_neutral)\n",
507
+ "\n",
508
+ "print(f\"Average precision score for positive class: \" f\"{ap_positive_transformer_only:.4f}\")\n",
509
+ "print(f\"Average precision score for negative class: \" f\"{ap_negative_transformer_only:.4f}\")\n",
510
+ "print(f\"Average precision score for neutral class: \" f\"{ap_neutral_transformer_only:.4f}\")"
511
+ ]
512
+ },
513
+ {
514
+ "cell_type": "markdown",
515
+ "metadata": {},
516
+ "source": [
517
+ "It looks like the transformer outperforms the model built on the TF-IDF representation.\n",
518
+ "Unfortunately, running a transformer that big in FHE would be highly inefficient. \n",
519
+ "\n",
520
+ "Let's see if we can leverage the transformer representation and train an FHE model for the given classification task. "
521
+ ]
522
+ },
523
+ {
524
+ "cell_type": "code",
525
+ "execution_count": 17,
526
+ "metadata": {},
527
+ "outputs": [
528
+ {
529
+ "name": "stderr",
530
+ "output_type": "stream",
531
+ "text": [
532
+ "100%|██████████| 13176/13176 [09:24<00:00, 23.36it/s]\n",
533
+ "100%|██████████| 1464/1464 [01:00<00:00, 24.12it/s]\n"
534
+ ]
535
+ }
536
+ ],
537
+ "source": [
538
+ "# Function that transforms a list of texts to their representation\n",
539
+ "# learned by the transformer.\n",
540
+ "def text_to_tensor(\n",
541
+ " list_text_X_train: list,\n",
542
+ " transformer_model: AutoModelForSequenceClassification,\n",
543
+ " tokenizer: AutoTokenizer,\n",
544
+ " device: str,\n",
545
+ ") -> numpy.ndarray:\n",
546
+ " # Tokenize each text in the list one by one\n",
547
+ " tokenized_text_X_train_split = []\n",
548
+ " for text_x_train in list_text_X_train:\n",
549
+ " tokenized_text_X_train_split.append(tokenizer.encode(text_x_train, return_tensors=\"pt\"))\n",
550
+ "\n",
551
+ " # Send the model to the device\n",
552
+ " transformer_model = transformer_model.to(device)\n",
553
+ " output_hidden_states_list = []\n",
554
+ "\n",
555
+ " for tokenized_x in tqdm.tqdm(tokenized_text_X_train_split):\n",
556
+ " # Pass the tokens through the transformer model and get the hidden states\n",
557
+ " # Only keep the last hidden layer state for now\n",
558
+ " output_hidden_states = transformer_model(tokenized_x.to(device), output_hidden_states=True)[\n",
559
+ " 1\n",
560
+ " ][-1]\n",
561
+ " # Average over the tokens axis to get a representation at the text level.\n",
562
+ " output_hidden_states = output_hidden_states.mean(dim=1)\n",
563
+ " output_hidden_states = output_hidden_states.detach().cpu().numpy()\n",
564
+ " output_hidden_states_list.append(output_hidden_states)\n",
565
+ "\n",
566
+ " return numpy.concatenate(output_hidden_states_list, axis=0)\n",
567
+ "\n",
568
+ "\n",
569
+ "# Let's vectorize the text using the transformer\n",
570
+ "list_text_X_train = text_X_train.tolist()\n",
571
+ "list_text_X_test = text_X_test.tolist()\n",
572
+ "\n",
573
+ "X_train_transformer = text_to_tensor(list_text_X_train, transformer_model, tokenizer, device)\n",
574
+ "X_test_transformer = text_to_tensor(list_text_X_test, transformer_model, tokenizer, device)"
575
+ ]
576
+ },
577
+ {
578
+ "cell_type": "code",
579
+ "execution_count": 18,
580
+ "metadata": {},
581
+ "outputs": [
582
+ {
583
+ "data": {
584
+ "text/html": [
585
+ "<style>#sk-container-id-2 {color: black;background-color: white;}#sk-container-id-2 pre{padding: 0;}#sk-container-id-2 div.sk-toggleable {background-color: white;}#sk-container-id-2 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-2 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-2 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-2 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-2 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-2 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-2 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-2 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-2 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-2 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-2 div.sk-item {position: relative;z-index: 1;}#sk-container-id-2 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-2 div.sk-item::before, #sk-container-id-2 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-2 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-2 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-2 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-2 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-2 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-2 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-2 div.sk-label-container {text-align: center;}#sk-container-id-2 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-2 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>GridSearchCV(cv=3, estimator=XGBClassifier(n_jobs=1), n_jobs=1,\n",
586
+ " param_grid={&#x27;max_depth&#x27;: [1], &#x27;n_bits&#x27;: [2, 3],\n",
587
+ " &#x27;n_estimators&#x27;: [10, 30, 50]},\n",
588
+ " scoring=&#x27;accuracy&#x27;)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-4\" type=\"checkbox\" ><label for=\"sk-estimator-id-4\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">GridSearchCV</label><div class=\"sk-toggleable__content\"><pre>GridSearchCV(cv=3, estimator=XGBClassifier(n_jobs=1), n_jobs=1,\n",
589
+ " param_grid={&#x27;max_depth&#x27;: [1], &#x27;n_bits&#x27;: [2, 3],\n",
590
+ " &#x27;n_estimators&#x27;: [10, 30, 50]},\n",
591
+ " scoring=&#x27;accuracy&#x27;)</pre></div></div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-5\" type=\"checkbox\" ><label for=\"sk-estimator-id-5\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">estimator: XGBClassifier</label><div class=\"sk-toggleable__content\"><pre>XGBClassifier(n_jobs=1)</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-6\" type=\"checkbox\" ><label for=\"sk-estimator-id-6\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">XGBClassifier</label><div class=\"sk-toggleable__content\"><pre>XGBClassifier(n_jobs=1)</pre></div></div></div></div></div></div></div></div></div></div>"
592
+ ],
593
+ "text/plain": [
594
+ "GridSearchCV(cv=3, estimator=XGBClassifier(n_jobs=1), n_jobs=1,\n",
595
+ " param_grid={'max_depth': [1], 'n_bits': [2, 3],\n",
596
+ " 'n_estimators': [10, 30, 50]},\n",
597
+ " scoring='accuracy')"
598
+ ]
599
+ },
600
+ "execution_count": 18,
601
+ "metadata": {},
602
+ "output_type": "execute_result"
603
+ }
604
+ ],
605
+ "source": [
606
+ "# Now we have a representation for each tweet, we can train a model on these.\n",
607
+ "grid_search = GridSearchCV(model, parameters, cv=3, n_jobs=1, scoring=\"accuracy\")\n",
608
+ "grid_search.fit(X_train_transformer, y_train)"
609
+ ]
610
+ },
611
+ {
612
+ "cell_type": "code",
613
+ "execution_count": 19,
614
+ "metadata": {},
615
+ "outputs": [
616
+ {
617
+ "name": "stdout",
618
+ "output_type": "stream",
619
+ "text": [
620
+ "Best score: 0.8381147540983607\n",
621
+ "Best parameters: {'max_depth': 1, 'n_bits': 3, 'n_estimators': 50}\n"
622
+ ]
623
+ }
624
+ ],
625
+ "source": [
626
+ "# Check the accuracy of the best model\n",
627
+ "print(f\"Best score: {grid_search.best_score_}\")\n",
628
+ "\n",
629
+ "# Check best hyperparameters\n",
630
+ "print(f\"Best parameters: {grid_search.best_params_}\")\n",
631
+ "\n",
632
+ "# Extract best model\n",
633
+ "best_model = grid_search.best_estimator_"
634
+ ]
635
+ },
636
+ {
637
+ "cell_type": "code",
638
+ "execution_count": 20,
639
+ "metadata": {},
640
+ "outputs": [
641
+ {
642
+ "name": "stdout",
643
+ "output_type": "stream",
644
+ "text": [
645
+ "Accuracy: 0.8463\n",
646
+ "Average precision score for positive class: 0.8959\n",
647
+ "Average precision score for negative class: 0.9647\n",
648
+ "Average precision score for neutral class: 0.7449\n"
649
+ ]
650
+ }
651
+ ],
652
+ "source": [
653
+ "# Compute the metrics for each class\n",
654
+ "\n",
655
+ "y_proba = best_model.predict_proba(X_test_transformer)\n",
656
+ "\n",
657
+ "# Compute the accuracy\n",
658
+ "y_pred = numpy.argmax(y_proba, axis=1)\n",
659
+ "accuracy_transformer_xgboost = numpy.mean(y_pred == y_test)\n",
660
+ "print(f\"Accuracy: {accuracy_transformer_xgboost:.4f}\")\n",
661
+ "\n",
662
+ "y_pred_positive = y_proba[:, 2]\n",
663
+ "y_pred_negative = y_proba[:, 0]\n",
664
+ "y_pred_neutral = y_proba[:, 1]\n",
665
+ "\n",
666
+ "ap_positive_transformer_xgboost = average_precision_score((y_test == 2), y_pred_positive)\n",
667
+ "ap_negative_transformer_xgboost = average_precision_score((y_test == 0), y_pred_negative)\n",
668
+ "ap_neutral_transformer_xgboost = average_precision_score((y_test == 1), y_pred_neutral)\n",
669
+ "\n",
670
+ "print(f\"Average precision score for positive class: \" f\"{ap_positive_transformer_xgboost:.4f}\")\n",
671
+ "print(f\"Average precision score for negative class: \" f\"{ap_negative_transformer_xgboost:.4f}\")\n",
672
+ "print(f\"Average precision score for neutral class: \" f\"{ap_neutral_transformer_xgboost:.4f}\")"
673
+ ]
674
+ },
675
+ {
676
+ "cell_type": "markdown",
677
+ "metadata": {},
678
+ "source": [
679
+ "Our FHE-friendly XGBoost model does 38% better than the XGBoost model built over TF-IDF representation of the text. Note that here we are still not using FHE and only evaluating the model.\n",
680
+ "Interestingly, using XGBoost over the transformer representation of the text matches, and in this run slightly exceeds, the performance of the transformer model alone (accuracy 0.8463 vs. 0.8053)."
681
+ ]
682
+ },
683
+ {
684
+ "cell_type": "code",
685
+ "execution_count": 21,
686
+ "metadata": {},
687
+ "outputs": [
688
+ {
689
+ "name": "stdout",
690
+ "output_type": "stream",
691
+ "text": [
692
+ "5 most positive tweets (class 2):\n",
693
+ "@united I think this is the best first class I have ever gotten!! Denver to LAX and it's wonderful!!!\n",
694
+ "@AmericanAir Flight 236 was great. Fantastic cabin crew. A+ landing. #thankyou #JFK http://t.co/dRW08djHAI\n",
695
+ "@SouthwestAir Jason (108639) at Gate #3 in SAN made my afternoon!!! #southwestairlines #stellarservice #thanks!\n",
696
+ "@SouthwestAir love them! Always get the best deals!\n",
697
+ "@AmericanAir simply amazing. Smiles for miles.Thank u for my upgrade tomorrow for ORD.We are spending a lot of time together next few weeks!\n",
698
+ "----------------------------------------------------------------------------------------------------\n",
699
+ "5 most negative tweets (class 0):\n",
700
+ "@united first you lost all my bags, now you Cancelled Flight my flight home. 30 min wait to talk to somebody #poorservice #notgoodenough\n",
701
+ "@USAirways Not only did u lose the flight plan! Now ur flight crew is FAA timed out! Thx for havin us sit on the tarmac for an hr! #Pathetic\n",
702
+ "@AmericanAir Phone just disconnects if you stay on the line. Need to checkout of hotel in 2 hrs &amp; have no place to go. Can't keep calling.\n",
703
+ "@VirginAmerica I have lots of flights to book and your site it not working!!!! I've been on the phone waiting for over 10 minutes..........\n",
704
+ "@united 3 hour delay plus a jetway that won't move. This biz traveler is never flying u again!\n"
705
+ ]
706
+ }
707
+ ],
708
+ "source": [
709
+ "# Get probabilities predictions in clear\n",
710
+ "y_pred_test = best_model.predict_proba(X_test_transformer)\n",
711
+ "\n",
712
+ "# Let's see what are the top predictions based on the probabilities in y_pred_test\n",
713
+ "print(\"5 most positive tweets (class 2):\")\n",
714
+ "for i in range(5):\n",
715
+ " print(text_X_test.iloc[y_pred_test[:, 2].argsort()[-1 - i]])\n",
716
+ "\n",
717
+ "print(\"-\" * 100)\n",
718
+ "\n",
719
+ "print(\"5 most negative tweets (class 0):\")\n",
720
+ "for i in range(5):\n",
721
+ " print(text_X_test.iloc[y_pred_test[:, 0].argsort()[-1 - i]])"
722
+ ]
723
+ },
724
+ {
725
+ "cell_type": "code",
726
+ "execution_count": 22,
727
+ "metadata": {},
728
+ "outputs": [
729
+ {
730
+ "name": "stdout",
731
+ "output_type": "stream",
732
+ "text": [
733
+ "5 most positive (predicted) tweets that are actually negative (ground truth class 0):\n",
734
+ "@united thanks for the link, now finally arrived in Brussels, 9 h after schedule...\n",
735
+ "@USAirways as far as being delayed goes… Looks like tailwinds are going to make up for it. Good news!\n",
736
+ "@united thanks for having changed me. Managed to arrive with only 8 hours of delay and exhausted\n",
737
+ "@USAirways your saving grace was our flight attendant Dallas who was amazing. wish he would transfer to Delta where I would see him again\n",
738
+ "@AmericanAir that luggage you forgot...#mia.....he just won an oscar😄💝💝💝\n",
739
+ "----------------------------------------------------------------------------------------------------\n",
740
+ "5 most negative (predicted) tweets that are actually positive (ground truth class 2):\n",
741
+ "@united thanks for updating me about the 1+ hour delay the exact second I got to ATL. 🙅🙅🙅\n",
742
+ "@SouthwestAir save mile to visit family in 2015 and this will impact how many times I can see my mother. I planned and you change the rules\n",
743
+ "@JetBlue you don't remember our date Monday night back to NYC? #heartbroken\n",
744
+ "@SouthwestAir hot stewardess flipped me off\n",
745
+ "@SouthwestAir - We left iPad in a seat pocket. Filed lost item report. Received it exactly 1 week Late Flightr. Is that a record? #unbelievable\n"
746
+ ]
747
+ }
748
+ ],
749
+ "source": [
750
+ "# Now let's see where the model is wrong\n",
751
+ "y_pred_test_0 = y_pred_test[y_test == 0]\n",
752
+ "text_X_test_0 = text_X_test[y_test == 0]\n",
753
+ "\n",
754
+ "print(\"5 most positive (predicted) tweets that are actually negative (ground truth class 0):\")\n",
755
+ "for i in range(5):\n",
756
+ " print(text_X_test_0.iloc[y_pred_test_0[:, 2].argsort()[-1 - i]])\n",
757
+ "\n",
758
+ "print(\"-\" * 100)\n",
759
+ "\n",
760
+ "y_pred_test_2 = y_pred_test[y_test == 2]\n",
761
+ "text_X_test_2 = text_X_test[y_test == 2]\n",
762
+ "print(\"5 most negative (predicted) tweets that are actually positive (ground truth class 2):\")\n",
763
+ "for i in range(5):\n",
764
+ " print(text_X_test_2.iloc[y_pred_test_2[:, 0].argsort()[-1 - i]])"
765
+ ]
766
+ },
767
+ {
768
+ "cell_type": "markdown",
769
+ "metadata": {},
770
+ "source": [
771
+ "Interestingly, these misclassifications are not obvious and some of the tweets actually look mislabeled. Also, it seems that the model has a hard time detecting ironic tweets.\n",
772
+ "\n",
773
+ "Now we have our model trained which has some great accuracy. Let's have it predict over the encrypted representation."
774
+ ]
775
+ },
776
+ {
777
+ "cell_type": "markdown",
778
+ "metadata": {},
779
+ "source": [
780
+ "### Sentiment Analysis of the Tweet with Fully Homomorphic Encryption\n",
781
+ "\n",
782
+ "Now that we have our model ready for FHE inference and our data ready for encryption let's use the model in a privacy preserving manner with FHE."
783
+ ]
784
+ },
785
+ {
786
+ "cell_type": "code",
787
+ "execution_count": 23,
788
+ "metadata": {},
789
+ "outputs": [
790
+ {
791
+ "name": "stdout",
792
+ "output_type": "stream",
793
+ "text": [
794
+ "Compilation time: 5.8594 seconds\n"
795
+ ]
796
+ },
797
+ {
798
+ "name": "stderr",
799
+ "output_type": "stream",
800
+ "text": [
801
+ "100%|██████████| 1/1 [00:00<00:00, 17.16it/s]"
802
+ ]
803
+ },
804
+ {
805
+ "name": "stdout",
806
+ "output_type": "stream",
807
+ "text": [
808
+ "FHE inference time: 0.9319 seconds\n"
809
+ ]
810
+ },
811
+ {
812
+ "name": "stderr",
813
+ "output_type": "stream",
814
+ "text": [
815
+ "\n"
816
+ ]
817
+ }
818
+ ],
819
+ "source": [
820
+ "# Compile the model to get the FHE inference engine\n",
821
+ "# (this may take a few minutes depending on the selected model)\n",
822
+ "start = time.perf_counter()\n",
823
+ "best_model.compile(X_train_transformer)\n",
824
+ "end = time.perf_counter()\n",
825
+ "print(f\"Compilation time: {end - start:.4f} seconds\")\n",
826
+ "\n",
827
+ "\n",
828
+ "# Let's write a custom example and predict in FHE\n",
829
+ "tested_tweet = [\"AirFrance is awesome, almost as much as Zama!\"]\n",
830
+ "X_tested_tweet = text_to_tensor(tested_tweet, transformer_model, tokenizer, device)\n",
831
+ "clear_proba = best_model.predict_proba(X_tested_tweet)\n",
832
+ "\n",
833
+ "# Now let's predict with FHE over a single tweet and print the time it takes\n",
834
+ "start = time.perf_counter()\n",
835
+ "decrypted_proba = best_model.predict_proba(X_tested_tweet, fhe=\"execute\")\n",
836
+ "end = time.perf_counter()\n",
837
+ "fhe_exec_time = end - start\n",
838
+ "print(f\"FHE inference time: {fhe_exec_time:.4f} seconds\")"
839
+ ]
840
+ },
841
+ {
842
+ "cell_type": "code",
843
+ "execution_count": 24,
844
+ "metadata": {},
845
+ "outputs": [
846
+ {
847
+ "name": "stdout",
848
+ "output_type": "stream",
849
+ "text": [
850
+ "Probabilities from the FHE inference: [[0.05162184 0.04558276 0.90279541]]\n",
851
+ "Probabilities from the clear model: [[0.05162184 0.04558276 0.90279541]]\n"
852
+ ]
853
+ }
854
+ ],
855
+ "source": [
856
+ "print(f\"Probabilities from the FHE inference: {decrypted_proba}\")\n",
857
+ "print(f\"Probabilities from the clear model: {clear_proba}\")"
858
+ ]
859
+ },
860
+ {
861
+ "cell_type": "code",
862
+ "execution_count": 26,
863
+ "metadata": {},
864
+ "outputs": [],
865
+ "source": [
866
+ "DEPLOYMENT_DIR = Path(\"deployment\")\n",
867
+ "DEPLOYMENT_DIR.mkdir(exist_ok=True)\n",
868
+ "\n",
869
+ "# Let's export the final model such that we can reuse it in a client/server environment\n",
870
+ "\n",
871
+ "# Serialize the model (for development only)\n",
872
+ "with (DEPLOYMENT_DIR / \"serialized_model\").open(\"w\") as file:\n",
873
+ " best_model.dump(file)\n",
874
+ "\n",
875
+ "# Export some data to be used for compilation \n",
876
+ "X_train_numpy = X_train_transformer[:100]\n",
877
+ "\n",
878
+ "# Convert the array to a pandas dataframe\n",
879
+ "X_test_numpy_df = pd.DataFrame(X_train_numpy)\n",
880
+ "\n",
881
+ "# to csv\n",
882
+ "X_test_numpy_df.to_csv(DEPLOYMENT_DIR / \"samples_for_compilation.csv\")\n",
883
+ "\n",
884
+ "# Let's save the model to be pushed to a server later\n",
885
+ "from concrete.ml.deployment import FHEModelDev\n",
886
+ "\n",
887
+ "fhe_api = FHEModelDev(DEPLOYMENT_DIR / \"sentiment_fhe_model\", best_model)\n",
888
+ "fhe_api.save(via_mlir=True)"
889
+ ]
890
+ },
891
+ {
892
+ "cell_type": "code",
893
+ "execution_count": 27,
894
+ "metadata": {},
895
+ "outputs": [
896
+ {
897
+ "data": {
898
+ "text/html": [
899
+ "<div>\n",
900
+ "<style scoped>\n",
901
+ " .dataframe tbody tr th:only-of-type {\n",
902
+ " vertical-align: middle;\n",
903
+ " }\n",
904
+ "\n",
905
+ " .dataframe tbody tr th {\n",
906
+ " vertical-align: top;\n",
907
+ " }\n",
908
+ "\n",
909
+ " .dataframe thead th {\n",
910
+ " text-align: right;\n",
911
+ " }\n",
912
+ "</style>\n",
913
+ "<table border=\"1\" class=\"dataframe\">\n",
914
+ " <thead>\n",
915
+ " <tr style=\"text-align: right;\">\n",
916
+ " <th></th>\n",
917
+ " <th>Accuracy</th>\n",
918
+ " <th>Average Precision (positive)</th>\n",
919
+ " <th>Average Precision (negative)</th>\n",
920
+ " <th>Average Precision (neutral)</th>\n",
921
+ " </tr>\n",
922
+ " <tr>\n",
923
+ " <th>Model</th>\n",
924
+ " <th></th>\n",
925
+ " <th></th>\n",
926
+ " <th></th>\n",
927
+ " <th></th>\n",
928
+ " </tr>\n",
929
+ " </thead>\n",
930
+ " <tbody>\n",
931
+ " <tr>\n",
932
+ " <th>TF-IDF + XGBoost</th>\n",
933
+ " <td>0.711749</td>\n",
934
+ " <td>0.640422</td>\n",
935
+ " <td>0.871891</td>\n",
936
+ " <td>0.43486</td>\n",
937
+ " </tr>\n",
938
+ " <tr>\n",
939
+ " <th>Transformer Only</th>\n",
940
+ " <td>0.805328</td>\n",
941
+ " <td>0.854827</td>\n",
942
+ " <td>0.954804</td>\n",
943
+ " <td>0.68011</td>\n",
944
+ " </tr>\n",
945
+ " <tr>\n",
946
+ " <th>Transformer + XGBoost</th>\n",
947
+ " <td>0.846311</td>\n",
948
+ " <td>0.895930</td>\n",
949
+ " <td>0.964674</td>\n",
950
+ " <td>0.74489</td>\n",
951
+ " </tr>\n",
952
+ " </tbody>\n",
953
+ "</table>\n",
954
+ "</div>"
955
+ ],
956
+ "text/plain": [
957
+ " Accuracy Average Precision (positive) \\\n",
958
+ "Model \n",
959
+ "TF-IDF + XGBoost 0.711749 0.640422 \n",
960
+ "Transformer Only 0.805328 0.854827 \n",
961
+ "Transformer + XGBoost 0.846311 0.895930 \n",
962
+ "\n",
963
+ " Average Precision (negative) \\\n",
964
+ "Model \n",
965
+ "TF-IDF + XGBoost 0.871891 \n",
966
+ "Transformer Only 0.954804 \n",
967
+ "Transformer + XGBoost 0.964674 \n",
968
+ "\n",
969
+ " Average Precision (neutral) \n",
970
+ "Model \n",
971
+ "TF-IDF + XGBoost 0.43486 \n",
972
+ "Transformer Only 0.68011 \n",
973
+ "Transformer + XGBoost 0.74489 "
974
+ ]
975
+ },
976
+ "execution_count": 27,
977
+ "metadata": {},
978
+ "output_type": "execute_result"
979
+ }
980
+ ],
981
+ "source": [
982
+ "%matplotlib inline\n",
983
+ "# Let's print the results obtained in this notebook\n",
984
+ "df_results = pd.DataFrame(\n",
985
+ " {\n",
986
+ " \"Model\": [\"TF-IDF + XGBoost\", \"Transformer Only\", \"Transformer + XGBoost\"],\n",
987
+ " \"Accuracy\": [accuracy_tfidf, accuracy_transformer_only, accuracy_transformer_xgboost],\n",
988
+ " \"Average Precision (positive)\": [\n",
989
+ " ap_positive_tfidf,\n",
990
+ " ap_positive_transformer_only,\n",
991
+ " ap_positive_transformer_xgboost,\n",
992
+ " ],\n",
993
+ " \"Average Precision (negative)\": [\n",
994
+ " ap_negative_tfidf,\n",
995
+ " ap_negative_transformer_only,\n",
996
+ " ap_negative_transformer_xgboost,\n",
997
+ " ],\n",
998
+ " \"Average Precision (neutral)\": [\n",
999
+ " ap_neutral_tfidf,\n",
1000
+ " ap_neutral_transformer_only,\n",
1001
+ " ap_neutral_transformer_xgboost,\n",
1002
+ " ],\n",
1003
+ " }\n",
1004
+ ")\n",
1005
+ "df_results.set_index(\"Model\", inplace=True)\n",
1006
+ "df_results # pylint: disable=pointless-statement"
1007
+ ]
1008
+ },
1009
+ {
1010
+ "cell_type": "markdown",
1011
+ "metadata": {},
1012
+ "source": [
1013
+ "### Conclusion\n",
1014
+ "\n",
1015
+ "In this notebook we presented two different ways to represent a text.\n",
1016
+ "1. Using TF-IDF vectorization\n",
1017
+ "2. Using the hidden layers from a transformer\n",
1018
+ "\n",
1019
+ "Both representations are then used to train a machine learning model that will run in FHE (here XGBoost).\n",
1020
+ "\n",
1021
+ "Once the model is trained, clients can send an encrypted text representation to the server to get a sentiment analysis done, and they receive the probability for each class (negative, neutral and positive) in an encrypted format which can then be decrypted by the client. For now, all the FHE magic (encrypt, predict and decrypt) is done within the `predict_proba` function with the argument `fhe=\"execute\"`. In the next release, an API will be provided to split the server/client parts.\n",
1022
+ "\n",
1023
+ "Regarding the FHE execution times, the final XGBoost model can predict over an encrypted data point in ~40 seconds. This will change depending on the number of threads available. In the future, more hardware acceleration will be available to speed up the execution time.\n",
1024
+ "\n",
1025
+ "It seems that the combination of a transformer (thanks Huggingface!) with a \"simpler\" model such as XGBoost works pretty well. Thanks to the Concrete-ML library, we can easily use this text representation on the client machine and then encrypt it to send it to a remote server without having to deal with a transformer runtime in FHE."
1026
+ ]
1027
+ }
1028
+ ],
1029
+ "metadata": {
1030
+ "execution": {
1031
+ "timeout": 10800
1032
+ },
1033
+ "kernelspec": {
1034
+ "display_name": ".venv",
1035
+ "language": "python",
1036
+ "name": "python3"
1037
+ },
1038
+ "language_info": {
1039
+ "codemirror_mode": {
1040
+ "name": "ipython",
1041
+ "version": 3
1042
+ },
1043
+ "file_extension": ".py",
1044
+ "mimetype": "text/x-python",
1045
+ "name": "python",
1046
+ "nbconvert_exporter": "python",
1047
+ "pygments_lexer": "ipython3",
1048
+ "version": "3.10.11"
1049
+ }
1050
+ },
1051
+ "nbformat": 4,
1052
+ "nbformat_minor": 2
1053
+ }
hf_repo/hf_repo/app.py ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """A Gradio app that runs locally (analytics=False and share=False) for sentiment analysis on tweets."""
2
+
3
+ import gradio as gr
4
+ from transformer_vectorizer import TransformerVectorizer
5
+ from concrete.ml.deployment import FHEModelClient
6
+ import numpy
7
+ import os
8
+ from pathlib import Path
9
+ import requests
10
+ import json
11
+ import base64
12
+ import subprocess
13
+ import shutil
14
+ import time
15
+
16
# Directory that holds this file; the background processes below are started from it
REPO_DIR = Path(__file__).parent

# Commands launched in the background, in start order
_BACKGROUND_COMMANDS = (
    ["anvil", "-p", "3030"],
    ["uvicorn", "server:app", "--port", "8000"],
    ["uvicorn", "zkml_non_encrypted:app", "--port", "8001"],
    ["uvicorn", "zkml_encrypted:app", "--port", "8002"],
)
for _command in _BACKGROUND_COMMANDS:
    subprocess.Popen(_command, cwd=REPO_DIR)

# Fixed 30-second pause so the servers have time to start
time.sleep(30)

# Number of encrypted-data items shown in the browser
# (the full encrypted data is too large to display there)
ENCRYPTED_DATA_BROWSER_LIMIT = 500
N_USER_KEY_STORED = 20
FHE_MODEL_PATH = "deployment/sentiment_fhe_model"

print("Loading the transformer model...")

# Vectorizer instance created once at import time
transformer_vectorizer = TransformerVectorizer()
37
+
38
+
39
def clean_tmp_directory():
    """Evict the oldest stored user keys and the temporary files that belong to them.

    At most ``N_USER_KEY_STORED`` sub-directories are kept under ``.fhe_keys/``.
    When there are more, the least recently modified ones are removed, together
    with every ``tmp/*_<user_id>.npy`` file of the evicted users. Both paths are
    relative to the current working directory. Missing directories are treated
    as "nothing to clean".
    """
    keys_root = Path(".fhe_keys/")
    tmp_root = Path("tmp/")

    # No key directory yet means no user has anything to evict.
    if not keys_root.is_dir():
        return

    # Oldest first, so the head of the list holds the eviction candidates.
    key_dirs = sorted((d for d in keys_root.iterdir() if d.is_dir()), key=os.path.getmtime)
    n_excess = len(key_dirs) - N_USER_KEY_STORED
    if n_excess <= 0:
        return

    evicted_ids = []
    for key_dir in key_dirs[:n_excess]:
        evicted_ids.append(key_dir.name)
        shutil.rmtree(key_dir)

    if not tmp_root.is_dir():
        return

    # Match "_<user_id>.npy", not "<user_id>.npy": otherwise evicting user "5"
    # would also delete the files of user "15". A single endswith() call with a
    # tuple also guarantees each file is unlinked at most once.
    stale_suffixes = tuple(f"_{evicted_id}.npy" for evicted_id in evicted_ids)
    for tmp_file in tmp_root.iterdir():
        if tmp_file.name.endswith(stale_suffixes):
            tmp_file.unlink()
57
+
58
+
59
def keygen():
    """Create a new user id, generate its FHE keys and store the evaluation key.

    The serialized evaluation key is written to
    ``tmp/tmp_evaluation_key_<user_id>.npy`` because it is too large to pass
    through regular Gradio components
    (https://github.com/gradio-app/gradio/issues/1877).

    Returns:
        list: ``[preview, user_id]`` where ``preview`` holds the first
        ``ENCRYPTED_DATA_BROWSER_LIMIT`` values of the serialized evaluation key.
    """
    from itertools import islice

    # Clean tmp directory if needed
    clean_tmp_directory()

    print("Initializing FHEModelClient...")

    # Ask for a 64-bit dtype explicitly: where the default integer dtype is
    # 32-bit, an upper bound of 2 ** 32 is rejected. int() keeps the id a plain
    # Python integer.
    user_id = int(numpy.random.randint(0, 2 ** 32, dtype=numpy.int64))
    fhe_api = FHEModelClient(FHE_MODEL_PATH, f".fhe_keys/{user_id}")
    fhe_api.load()

    # Generate a fresh key
    fhe_api.generate_private_and_evaluation_keys(force=True)
    evaluation_key = fhe_api.get_serialized_evaluation_keys()

    numpy.save(f"tmp/tmp_evaluation_key_{user_id}.npy", evaluation_key)

    # Only a short prefix is displayed, so avoid expanding the whole key into a
    # Python list just to slice it afterwards.
    preview = list(islice(evaluation_key, ENCRYPTED_DATA_BROWSER_LIMIT))
    return [preview, user_id]
79
+
80
+
81
def encode_quantize_encrypt(text, user_id):
    """Vectorize ``text``, then quantize and encrypt it with the user's keys.

    The serialized ciphertext is also saved to
    ``tmp/tmp_encrypted_quantized_encoding_<user_id>.npy``, since it is too
    large to pass through regular Gradio components
    (https://github.com/gradio-app/gradio/issues/1877).

    Returns:
        tuple: (clear encoding, quantized encoding, ciphertext as a hex string).

    Raises:
        gr.Error: if no ``user_id`` is available yet.
    """
    if not user_id:
        raise gr.Error("You need to generate FHE keys first.")

    client = FHEModelClient(FHE_MODEL_PATH, f".fhe_keys/{user_id}")
    client.load()

    clear_encodings = transformer_vectorizer.transform([text])
    quantized = client.model.quantize_input(clear_encodings).astype(numpy.uint8)
    ciphertext = client.quantize_encrypt_serialize(clear_encodings)

    numpy.save(f"tmp/tmp_encrypted_quantized_encoding_{user_id}.npy", ciphertext)

    # Two lowercase hex digits per value of the serialized ciphertext.
    ciphertext_hex = ''.join(f'{value:02x}' for value in ciphertext)
    return clear_encodings[0], quantized[0], ciphertext_hex
103
+
104
+
105
def run_fhe(user_id):
    """Send the stored ciphertext and evaluation key to the prediction server.

    The encrypted prediction returned by ``/predict_sentiment`` is saved to
    ``tmp/tmp_encrypted_prediction_<user_id>.npy``, since it is too large to
    pass through regular Gradio components
    (https://github.com/gradio-app/gradio/issues/1877).

    Returns:
        str: the encrypted prediction as a hex string.

    Raises:
        gr.Error: if keys were not generated or no encrypted input was stored.
    """
    if not user_id:
        raise gr.Error("You need to generate FHE keys first.")

    ciphertext_file = Path(f"tmp/tmp_encrypted_quantized_encoding_{user_id}.npy")
    if not ciphertext_file.is_file():
        raise gr.Error("No encrypted data was found. Encrypt the data before trying to predict.")

    # Both artifacts were written to disk by the previous steps.
    stored_ciphertext = numpy.load(ciphertext_file)
    stored_key = numpy.load(f"tmp/tmp_evaluation_key_{user_id}.npy")

    # Binary payloads travel as base64 text inside the JSON body.
    payload = {
        "evaluation_key": base64.b64encode(stored_key).decode(),
        "encrypted_encoding": base64.b64encode(stored_ciphertext).decode(),
    }
    reply = requests.post(
        "http://localhost:8000/predict_sentiment",
        data=json.dumps(payload),
        headers={"Content-type": "application/json"},
    )
    prediction_bytes = base64.b64decode(reply.json()["encrypted_prediction"])

    numpy.save(f"tmp/tmp_encrypted_prediction_{user_id}.npy", prediction_bytes)
    return ''.join(f'{value:02x}' for value in prediction_bytes)
137
+
138
+
139
def decrypt_prediction(user_id):
    """Decrypt the stored encrypted prediction with the user's private key.

    Returns:
        dict: scores keyed by ``"negative"``, ``"neutral"`` and ``"positive"``,
        taken from the first row of the decrypted prediction.

    Raises:
        gr.Error: if keys were not generated or no encrypted prediction exists.
    """
    if not user_id:
        raise gr.Error("You need to generate FHE keys first.")

    prediction_file = Path(f"tmp/tmp_encrypted_prediction_{user_id}.npy")
    if not prediction_file.is_file():
        raise gr.Error("No encrypted prediction was found. Run the prediction over the encrypted data first.")

    serialized_prediction = numpy.load(prediction_file).tobytes()

    client = FHEModelClient(FHE_MODEL_PATH, f".fhe_keys/{user_id}")
    client.load()

    # We need to retrieve the private key that matches the client specs (see issue #18)
    client.generate_private_and_evaluation_keys(force=False)

    first_row = client.deserialize_decrypt_dequantize(serialized_prediction)[0]
    labels = ("negative", "neutral", "positive")
    return {label: first_row[position] for position, label in enumerate(labels)}
161
+
162
+
163
def get_zk_proof_non_encrypted(text):
    """Request a ZK proof of the sentiment computation for clear-text input.

    Posts ``text`` to the ``/get_zk_proof`` endpoint on port 8001 and derives
    the sentiment label from the scores in the reply.

    Args:
        text: the message to analyse.

    Returns:
        tuple: (sentiment label, proof, verifier contract address).
    """
    response = requests.post(
        "http://localhost:8001/get_zk_proof",
        data=json.dumps({"text": text}),
        headers={"Content-type": "application/json"},
    )
    result = response.json()

    # Pick the class with the highest score. Fixed 0.5 thresholds would report
    # "positive" whenever no class exceeds 0.5, even if another class leads.
    labels = ("negative", "neutral", "positive")
    scores = result["output"][: len(labels)]
    best_index = max(range(len(scores)), key=scores.__getitem__)
    sentiment = labels[best_index]

    return sentiment, result["proof"], result["verify_contract_addr"]
180
+
181
+
182
def get_zk_proof_encrypted(user_id):
    """Request a ZK proof of the sentiment computation for encrypted input.

    Sends the stored ciphertext and evaluation key to the ``/get_zk_proof``
    endpoint on port 8002.

    Returns:
        tuple: (output, proof, verifier contract address) as found in the reply.

    Raises:
        gr.Error: if keys were not generated or no encrypted input was stored.
    """
    if not user_id:
        raise gr.Error("You need to generate FHE keys first.")

    ciphertext_file = Path(f"tmp/tmp_encrypted_quantized_encoding_{user_id}.npy")
    if not ciphertext_file.is_file():
        raise gr.Error("No encrypted data was found. Encrypt the data before trying to predict.")

    # Both artifacts were written to disk by the previous steps.
    stored_ciphertext = numpy.load(ciphertext_file)
    stored_key = numpy.load(f"tmp/tmp_evaluation_key_{user_id}.npy")

    # Binary payloads travel as base64 text inside the JSON body.
    payload = {
        "evaluation_key": base64.b64encode(stored_key).decode(),
        "encrypted_encoding": base64.b64encode(stored_ciphertext).decode(),
    }
    reply = requests.post(
        "http://localhost:8002/get_zk_proof",
        data=json.dumps(payload),
        headers={"Content-type": "application/json"},
    )
    body = reply.json()
    return body["output"], body["proof"], body["verify_contract_addr"]
208
+
209
+
210
# Gradio front end: one section per step of the demo, then the button wiring.
demo = gr.Blocks()

print("Starting the demo...")
with demo:
    # Page header: logo, title, project links and overview picture.
    gr.Markdown(
        """
<p align="center">
<img width=200 src="https://user-images.githubusercontent.com/5758427/197816413-d9cddad3-ba38-4793-847d-120975e1da11.png">
</p>

<h2 align="center">Sentiment Analysis On Encrypted Data Using Homomorphic Encryption</h2>

<p align="center">
<a href="https://github.com/zama-ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="https://user-images.githubusercontent.com/5758427/197972109-faaaff3e-10e2-4ab6-80f5-7531f7cfb08f.png">Concrete-ML</a>

<a href="https://docs.zama.ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="https://user-images.githubusercontent.com/5758427/197976802-fddd34c5-f59a-48d0-9bff-7ad1b00cb1fb.png">Documentation</a>

<a href="https://zama.ai/community"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="https://user-images.githubusercontent.com/5758427/197977153-8c9c01a7-451a-4993-8e10-5a6ed5343d02.png">Community</a>

<a href="https://twitter.com/zama_fhe"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="https://user-images.githubusercontent.com/5758427/197975044-bab9d199-e120-433b-b3be-abd73b211a54.png">@zama_fhe</a>
</p>

<p align="center">
<img src="https://user-images.githubusercontent.com/56846628/219329304-6868be9e-5ce8-4279-9123-4cb1bc0c2fb5.png" width="60%" height="60%">
</p>
"""
    )

    gr.Markdown(
        """
<p align="center">
</p>
<p align="center">
</p>
"""
    )

    gr.Markdown("## Notes")
    gr.Markdown(
        """
- The private key is used to encrypt and decrypt the data and shall never be shared.
- The evaluation key is a public key that the server needs to process encrypted data.
"""
    )

    gr.Markdown("# Step 1: Generate the keys")

    b_gen_key_and_install = gr.Button("Generate the keys and send public part to server")

    evaluation_key = gr.Textbox(
        label="Evaluation key (truncated):",
        max_lines=4,
        interactive=False,
    )

    # Hidden field that carries the generated user id between the steps.
    user_id = gr.Textbox(
        label="",
        max_lines=4,
        interactive=False,
        visible=False
    )

    gr.Markdown("# Step 2: Provide a message")
    gr.Markdown("## Client side")
    gr.Markdown(
        "Enter a sensitive text message you received and would like to do sentiment analysis on (ideas: the last text message of your boss.... or lover)."
    )
    text = gr.Textbox(label="Enter a message:", value="I really like your work recently")

    gr.Markdown("# Step 3: Encode the message with the private key")
    b_encode_quantize_text = gr.Button(
        "Encode, quantize and encrypt the text with transformer vectorizer, and send to server"
    )

    with gr.Row():
        encoding = gr.Textbox(
            label="Transformer representation:",
            max_lines=4,
            interactive=False,
        )
        quantized_encoding = gr.Textbox(
            label="Quantized transformer representation:", max_lines=4, interactive=False
        )
        encrypted_quantized_encoding = gr.Textbox(
            label="Encrypted quantized transformer representation (truncated):",
            max_lines=4,
            interactive=False,
        )

    gr.Markdown("# Step 4: Run the FHE evaluation")
    gr.Markdown("## Server side")
    gr.Markdown(
        "The encrypted value is received by the server. Thanks to the evaluation key and to FHE, the server can compute the (encrypted) prediction directly over encrypted values. Once the computation is finished, the server returns the encrypted prediction to the client."
    )

    b_run_fhe = gr.Button("Run FHE execution there")
    encrypted_prediction = gr.Textbox(
        label="Encrypted prediction (truncated):",
        max_lines=4,
        interactive=False,
    )

    gr.Markdown("# Step 5: Decrypt the sentiment")
    gr.Markdown("## Client side")
    gr.Markdown(
        "The encrypted sentiment is sent back to client, who can finally decrypt it with its private key. Only the client is aware of the original tweet and the prediction."
    )
    b_decrypt_prediction = gr.Button("Decrypt prediction")

    labels_sentiment = gr.Label(label="Sentiment:")

    gr.Markdown("# Step 6: Get ZK Proof(non-encrypted input)")
    gr.Markdown("## Server side")
    gr.Markdown(
        "Get zero-knowledge proof of the sentiment analysis computation (for non-encrypted input)."
    )
    b_get_zk_proof_non_encrypted = gr.Button("Get ZK Proof(non-encrypted input)")

    with gr.Row():
        zk_sentiment_non_encrypted = gr.Textbox(
            label="Sentiment:",
            max_lines=1,
            interactive=False,
        )
        zk_proof_non_encrypted = gr.Textbox(
            label="ZK Proof:",
            max_lines=4,
            interactive=False,
        )
        zk_contract_non_encrypted = gr.Textbox(
            label="Verify Contract Address:",
            max_lines=1,
            interactive=False,
        )

    # Numbered 7: the heading used to repeat "Step 6" from the section above.
    gr.Markdown("# Step 7: Get ZK Proof(encrypted input)")
    gr.Markdown("## Server side")
    gr.Markdown(
        "Get zero-knowledge proof of the sentiment analysis computation (for encrypted input)."
    )
    b_get_zk_proof_encrypted = gr.Button("Get ZK Proof(encrypted input)")

    with gr.Row():
        zk_encrypted_prediction = gr.Textbox(
            label="Encrypted Prediction(same as Step 4 output):",
            max_lines=1,
            interactive=False,
        )
        zk_proof_encrypted = gr.Textbox(
            label="ZK Proof:",
            max_lines=4,
            interactive=False,
        )
        zk_contract_encrypted = gr.Textbox(
            label="Verify Contract Address:",
            max_lines=1,
            interactive=False,
        )

    # Button for key generation
    b_gen_key_and_install.click(keygen, inputs=[], outputs=[evaluation_key, user_id])

    # Button to quantize and encrypt
    b_encode_quantize_text.click(
        encode_quantize_encrypt,
        inputs=[text, user_id],
        outputs=[
            encoding,
            quantized_encoding,
            encrypted_quantized_encoding,
        ],
    )

    # Button to send the encodings to the server using post at (localhost:8000/predict_sentiment)
    b_run_fhe.click(run_fhe, inputs=[user_id], outputs=[encrypted_prediction])

    # Button to decrypt the prediction on the client
    b_decrypt_prediction.click(decrypt_prediction, inputs=[user_id], outputs=[labels_sentiment])

    # Button to get ZK proof(non encrypted)
    b_get_zk_proof_non_encrypted.click(
        get_zk_proof_non_encrypted,
        inputs=[text],
        outputs=[zk_sentiment_non_encrypted, zk_proof_non_encrypted, zk_contract_non_encrypted],
    )

    # Button to get ZK proof(encrypted)
    b_get_zk_proof_encrypted.click(
        get_zk_proof_encrypted,
        inputs=[user_id],
        outputs=[zk_encrypted_prediction, zk_proof_encrypted, zk_contract_encrypted],
    )

    gr.Markdown(
        "The app was built with [Concrete-ML](https://github.com/zama-ai/concrete-ml), a Privacy-Preserving Machine Learning (PPML) open-source set of tools by [Zama](https://zama.ai/). Try it yourself and don't forget to star on Github &#11088;."
    )
demo.launch(share=False)
hf_repo/hf_repo/compile.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from concrete.ml.deployment import FHEModelDev
3
+ from concrete.ml.common.serialization.loaders import load
4
+ import shutil
5
+ from pathlib import Path
6
+
7
+
8
# All inputs and outputs of this script live in "deployment/" next to this file.
script_dir = Path(__file__).parent
DEPLOYMENT_DIR = script_dir / "deployment"
dev_model_path = DEPLOYMENT_DIR / "sentiment_fhe_model"

print("Compiling the model...")

# Restore the serialized model.
with (DEPLOYMENT_DIR / "serialized_model").open("r") as file:
    model = load(file)

# Samples used for compilation; the first CSV column is the index.
data = pd.read_csv(DEPLOYMENT_DIR / "samples_for_compilation.csv", index_col=0).values

model.compile(data)

# Start from a clean output folder if a previous run left one behind.
if dev_model_path.is_dir():
    shutil.rmtree(dev_model_path)

fhe_api = FHEModelDev(model=model, path_dir=dev_model_path)
fhe_api.save(via_mlir=True)


print("Done!")
hf_repo/hf_repo/deployment/samples_for_compilation.csv ADDED
The diff for this file is too large to render. See raw diff
 
hf_repo/hf_repo/deployment/sentiment_fhe_model/client.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18d71ff210dfcfaeffa62d500eea3930694f2ded438589baa4458f971479ee31
3
+ size 1509958
hf_repo/hf_repo/deployment/sentiment_fhe_model/server.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23a0a8ef429c6d990fda93aa7ae786353968b1ec366326c02acbf8897b4f431b
3
+ size 2582
hf_repo/hf_repo/deployment/sentiment_fhe_model/versions.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"concrete-python": "2.8.1", "concrete-ml": "1.7.0", "python": "3.10.11"}
hf_repo/hf_repo/deployment/serialized_model ADDED
The diff for this file is too large to render. See raw diff
 
hf_repo/hf_repo/download_data.sh ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
#
# Download the Twitter airline sentiment dataset into ./local_datasets.
#
# Requires the kaggle CLI (installable with pip) and a valid
# ~/.kaggle/kaggle.json, which can be generated with "Create new API token" on
# your kaggle.com account page. Alternatively, download the dataset manually at
# https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment

set -e

DATASET_OWNER="crowdflower"
DATASET_NAME="twitter-airline-sentiment"
TARGET_DIR="local_datasets"

# Always start from an empty target directory.
rm -rf "${TARGET_DIR}"
mkdir "${TARGET_DIR}"
cd "${TARGET_DIR}"

kaggle datasets download -d "${DATASET_OWNER}/${DATASET_NAME}"

unzip "${DATASET_NAME}.zip" -d "${DATASET_NAME}"
hf_repo/hf_repo/requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ concrete-ml==1.7.0
2
+ gradio==3.50.2
3
+ pandas==2.0.3
4
+ transformers==4.36.0
5
+ jupyter==1.0.0
6
+ urllib3==1.25.4
7
+ more-itertools==10.5.0
8
+ ezkl==11.2.2
9
+ httpx[socks]
hf_repo/hf_repo/sentiment_analysis_banner.png ADDED
hf_repo/hf_repo/server.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Server that will listen for GET and POST requests from the client."""
2
+ import json
3
+
4
+ from fastapi import FastAPI
5
+ from concrete.ml.deployment import FHEModelServer
6
+ from pydantic import BaseModel
7
+ import base64
8
+ from pathlib import Path
9
+
10
# Directory that contains this file.
current_dir = Path(__file__).parent

# Load the model. The path is anchored on this file's directory so the server
# finds the deployment folder whatever the working directory is.
fhe_model = FHEModelServer(str(current_dir / "deployment" / "sentiment_fhe_model"))


class PredictRequest(BaseModel):
    """Body of a ``/predict_sentiment`` request; both fields are base64 text."""

    # Serialized evaluation key, base64-encoded.
    evaluation_key: str
    # Serialized encrypted input, base64-encoded.
    encrypted_encoding: str


# Initialize an instance of FastAPI
app = FastAPI()
23
+
24
+
25
+ # Define the default route
26
@app.get("/")
def root():
    """Return the welcome message served on the default route."""
    welcome = "Welcome to Your Sentiment Classification FHE Model Server!"
    return {"message": welcome}
29
+
30
+
31
@app.post("/predict_sentiment")
def predict_sentiment(query: PredictRequest):
    """Run the FHE model on an encrypted input.

    Both request fields are base64-decoded, passed to ``fhe_model.run`` and the
    result is returned base64-encoded under ``"encrypted_prediction"``.
    """
    raw_encoding = base64.b64decode(query.encrypted_encoding)
    raw_key = base64.b64decode(query.evaluation_key)

    raw_prediction = fhe_model.run(raw_encoding, raw_key)

    # JSON cannot carry raw bytes, so the prediction goes back as base64 text.
    return {"encrypted_prediction": base64.b64encode(raw_prediction).decode()}
hf_repo/hf_repo/transformer_vectorizer.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Let's import a few requirements
2
+ import torch
3
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
4
+ import numpy
5
+
6
class TransformerVectorizer:
    """Turn texts into fixed-size vectors with a pre-trained transformer.

    The tokenizer and model are both loaded from
    ``cardiffnlp/twitter-roberta-base-sentiment-latest``.
    """

    def __init__(self):
        # Load the tokenizer (converts text to tokens)
        self.tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

        # Load the pre-trained model
        self.transformer_model = AutoModelForSequenceClassification.from_pretrained(
            "cardiffnlp/twitter-roberta-base-sentiment-latest"
        )
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"

    def text_to_tensor(
        self,
        texts: list,
    ) -> numpy.ndarray:
        """Transform a list of texts into their learned representation.

        Each text is represented by the last hidden state of the model,
        averaged over the token axis. The result is also stored on
        ``self.encodings``.

        Args:
            texts (list): List of texts to be transformed.

        Returns:
            numpy.ndarray: One row per input text.
        """
        # NOTE(review): no padding is requested, so texts that tokenize to
        # different lengths presumably cannot be stacked into one tensor --
        # confirm before calling this with more than one text.
        token_ids = self.tokenizer.batch_encode_plus(
            texts, return_tensors="pt"
        )["input_ids"]

        # Depending on the hardware used, the number of examples to be processed can be reduced
        # Here we split the data into 50 examples per batch
        token_id_batches = torch.split(token_ids, split_size_or_sections=50)

        # Send the model to the device
        transformer_model = self.transformer_model.to(self.device)
        batch_embeddings = []

        # Inference only: skip building an autograd graph for every batch.
        with torch.no_grad():
            for token_id_batch in token_id_batches:
                # Pass the tokens through the transformer model and keep only
                # the last hidden layer state.
                last_hidden_state = transformer_model(
                    token_id_batch.to(self.device), output_hidden_states=True
                )[1][-1]
                # Average over the tokens axis to get a representation at the text level.
                text_embedding = last_hidden_state.mean(dim=1)
                batch_embeddings.append(text_embedding.detach().cpu().numpy())

        self.encodings = numpy.concatenate(batch_embeddings, axis=0)
        return self.encodings

    def transform(self, texts: list):
        """Alias of :meth:`text_to_tensor`."""
        return self.text_to_tensor(texts)
58
+
hf_repo/zkml_encrypted.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://docs.ezkl.xyz/
2
+ # https://colab.research.google.com/github/zkonduit/ezkl/blob/main/examples/notebooks/simple_demo_all_public.ipynb
3
+ import struct
4
+ import uuid
5
+
6
+ import numpy as np
7
+ from torch import nn
8
+ import ezkl
9
+ import os
10
+ import json
11
+ import torch
12
+ import base64
13
+ from concrete.ml.deployment import FHEModelServer
14
+ from fastapi import FastAPI
15
+ from pydantic import BaseModel
16
+
17
app = FastAPI()

# Evaluation key of the request being processed. It is set by the
# /get_zk_proof handler and read by AIModel.forward.
evaluation_key = None


# Defines the model
class AIModel(nn.Module):
    """Torch module that runs the FHE server model on serialized ciphertext."""

    def __init__(self):
        super().__init__()

        # Load the model
        self.fhe_model = FHEModelServer("deployment/sentiment_fhe_model")

    def forward(self, x):
        """Run the FHE model on the first row of ``x``.

        The row is converted to bytes, evaluated with the module-level
        ``evaluation_key`` and the resulting bytes are returned as a uint8
        tensor.
        """
        print(f"forward input: {x}")

        # Convert to bytes
        ciphertext = x[0].numpy().tobytes()
        prediction = self.fhe_model.run(ciphertext, evaluation_key)
        print(f"forward prediction hex: {prediction.hex()}")

        byte_tensor = torch.tensor(list(prediction), dtype=torch.uint8)
        print(f"tensor_output: {byte_tensor}")

        return byte_tensor


class ZKProofRequest(BaseModel):
    """Body of a ``/get_zk_proof`` request; both fields are base64 text."""

    encrypted_encoding: str
    evaluation_key: str


# Single model instance shared by every request.
circuit = AIModel()
51
+
52
+
53
@app.post("/get_zk_proof")
async def get_zk_proof(request: ZKProofRequest):
    """Run the FHE model on an encrypted input and produce an ezkl proof for it.

    Every request works in its own ``zkml_encrypted/<uuid>`` folder. The steps
    are: run the model, export it to ONNX, generate and calibrate the ezkl
    settings, compile the circuit, fetch the SRS, generate the witness, set up
    the keys, prove, verify locally, then deploy an EVM verifier and verify
    against it.

    Returns:
        dict: ``output`` (model output as a list), ``proof`` (base64 of the
        proof file) and ``verify_contract_addr``; or a dict with an ``error``
        entry when the contract address file was not written.
    """
    # AIModel.forward reads the key from this module-level variable.
    global evaluation_key

    # Decode into locals instead of overwriting the request fields typed `str`.
    encrypted_encoding = base64.b64decode(request.encrypted_encoding)
    evaluation_key = base64.b64decode(request.evaluation_key)

    folder_path = f"zkml_encrypted/{str(uuid.uuid4())}"
    os.makedirs(folder_path, exist_ok=True)

    def artifact(file_name):
        """Return the path of ``file_name`` inside this request's folder."""
        return os.path.join(folder_path, file_name)

    model_path = artifact('network.onnx')
    compiled_model_path = artifact('network.compiled')
    pk_path = artifact('test.pk')
    vk_path = artifact('test.vk')
    settings_path = artifact('settings.json')

    witness_path = artifact('witness.json')
    input_data_path = artifact('input.json')
    srs_path = artifact('kzg14.srs')
    output_path = artifact('output.json')

    # After training, export to onnx (network.onnx) and create a data file (input.json)
    x = torch.tensor(list([encrypted_encoding]), dtype=torch.uint8)

    # Flips the neural net into inference mode
    circuit.eval()

    # Get the output of the model
    with torch.no_grad():
        output = circuit(x)
    # Save the output to a file
    output_data = output.detach().numpy().tolist()
    with open(output_path, 'w') as f:
        json.dump(output_data, f)

    print("start")
    # Export the model
    torch.onnx.export(circuit,  # model being run
                      x,  # model input (or a tuple for multiple inputs)
                      model_path,  # where to save the model (can be a file or file-like object)
                      export_params=True,  # store the trained parameter weights inside the model file
                      opset_version=10,  # the ONNX version to export the model to
                      do_constant_folding=True,  # whether to execute constant folding for optimization
                      input_names=['input'],  # the model's input names
                      output_names=['output'],  # the model's output names
                      dynamic_axes={'input': {0: 'batch_size'},  # variable length axes
                                    'output': {0: 'batch_size'}})
    print("end")

    data = dict(input_data=x.tolist())

    # Serialize data into file. The handle is closed explicitly so the content
    # is on disk before ezkl reads it.
    with open(input_data_path, 'w') as f:
        json.dump(data, f)

    py_run_args = ezkl.PyRunArgs()
    py_run_args.input_visibility = "public"
    py_run_args.output_visibility = "public"
    py_run_args.param_visibility = "fixed"  # "fixed" for params means that the committed to params are used for all proofs

    res = ezkl.gen_settings(model_path, settings_path, py_run_args=py_run_args)
    assert res is True

    cal_path = artifact("calibration.json")

    # The same input doubles as calibration data.
    with open(cal_path, 'w') as f:
        json.dump(data, f)

    await ezkl.calibrate_settings(cal_path, model_path, settings_path, "resources")

    res = ezkl.compile_circuit(model_path, compiled_model_path, settings_path)
    assert res is True

    # srs path
    res = await ezkl.get_srs(settings_path, srs_path=srs_path)
    assert res is True

    # now generate the witness file
    await ezkl.gen_witness(input_data_path, compiled_model_path, witness_path)
    assert os.path.isfile(witness_path)

    # Set up the circuit parameters and the proving / verification keys.
    res = ezkl.setup(
        compiled_model_path,
        vk_path,
        pk_path,
        srs_path
    )

    assert res is True
    assert os.path.isfile(vk_path)
    assert os.path.isfile(pk_path)
    assert os.path.isfile(settings_path)

    # GENERATE A PROOF
    proof_path = artifact('test.pf')
    ezkl.prove(
        witness_path,
        compiled_model_path,
        pk_path,
        proof_path,
        "single",
        srs_path
    )
    assert os.path.isfile(proof_path)

    # VERIFY IT ON LOCAL
    res = ezkl.verify(
        proof_path,
        settings_path,
        vk_path,
        srs_path
    )
    assert res is True
    print("verified on local")

    # VERIFY IT ON CHAIN
    verify_sol_code_path = artifact('verify.sol')
    verify_sol_abi_path = artifact('verify.abi')
    res = await ezkl.create_evm_verifier(
        vk_path,
        settings_path,
        verify_sol_code_path,
        verify_sol_abi_path,
        srs_path
    )
    assert res is True
    verify_contract_addr_file = f"{folder_path}/addr.txt"
    # NOTE(review): hard-coded RPC endpoint -- confirm it is the intended node.
    rpc_url = "http://103.231.86.33:10219"
    await ezkl.deploy_evm(
        addr_path=verify_contract_addr_file,
        rpc_url=rpc_url,
        sol_code_path=verify_sol_code_path
    )
    if os.path.exists(verify_contract_addr_file):
        with open(verify_contract_addr_file, 'r') as file:
            verify_contract_addr = file.read()
    else:
        print(f"error: File {verify_contract_addr_file} does not exist.")
        return {"error": "Contract address file not found"}
    # TODO verify failed. maybe need to change the x
    res = await ezkl.verify_evm(
        addr_verifier=verify_contract_addr,
        proof_path=proof_path,
        rpc_url=rpc_url
    )
    assert res is True
    print("verified on chain")

    # Read proof file content
    with open(proof_path, 'rb') as f:
        proof_content = base64.b64encode(f.read()).decode('utf-8')

    return {"output": output_data, "proof": proof_content, "verify_contract_addr": verify_contract_addr}
hf_repo/zkml_non_encrypted.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://docs.ezkl.xyz/
2
+ # https://colab.research.google.com/github/zkonduit/ezkl/blob/main/examples/notebooks/simple_demo_all_public.ipynb
3
+ import pickle
4
+ import struct
5
+ import uuid
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ from sklearn.model_selection import GridSearchCV, train_test_split
10
+ from torch import nn
11
+ import ezkl
12
+ import os
13
+ import json
14
+ import torch
15
+ import base64
16
+ from concrete.ml.deployment import FHEModelServer
17
+ from concrete.ml.sklearn import XGBClassifier
18
+ import tqdm
19
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
20
+ from fastapi import FastAPI
21
+ from pydantic import BaseModel
22
+
23
+ app = FastAPI()
24
+
25
+ evaluation_key = None
26
+
27
+
28
+ # Defines the model
29
class AIWordsModel(nn.Module):
    """Tweet-sentiment classifier exposed as a ``torch.nn.Module``.

    Construction is expensive: it reads the airline-tweets CSV, loads the
    ``cardiffnlp/twitter-roberta-base-sentiment-latest`` tokenizer and model,
    embeds the training split, runs a grid search and compiles the selected
    estimator. The compiled estimator is stored as ``self.best_model2`` and
    is what :meth:`forward` delegates to.
    """

    def __init__(self):
        super().__init__()

        print("init ZK AIWordsModel")

        # Load the model
        self.model = XGBClassifier()
        train = pd.read_csv("./local_datasets/twitter-airline-sentiment/Tweets.csv", index_col=0)
        text_X = train["text"]
        # Map the three sentiment labels onto integer classes 0/1/2.
        y = train["airline_sentiment"].replace(["negative", "neutral", "positive"], [0, 1, 2])

        # Load the tokenizer and model
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
        self.transformer_model = AutoModelForSequenceClassification.from_pretrained(
            "cardiffnlp/twitter-roberta-base-sentiment-latest"
        ).to(self.device)

        # Only the training part of the split is used below; the held-out
        # 10% is discarded.
        text_X_train, _, y_train, _ = train_test_split(
            text_X, y, test_size=0.1, random_state=42
        )
        X_train_transformer = self.text_to_tensor(
            text_X_train.tolist(), self.transformer_model, self.tokenizer, self.device
        )

        # SECURITY NOTE: pickle.load executes arbitrary code embedded in the
        # file; this path must only ever point at a trusted artifact.
        with open("deployment/serialized_model_zkml", 'rb') as file:  # Open in binary read mode
            loaded_data = pickle.load(file)
        self.model.load_dict(loaded_data)

        parameters = {"n_bits": [2, 3], "max_depth": [1], "n_estimators": [10, 30, 50]}
        grid_search2 = GridSearchCV(self.model, parameters, cv=5, scoring="accuracy")
        grid_search2.fit(X_train_transformer, y_train)
        self.best_model2 = grid_search2.best_estimator_
        # NOTE(review): the return value of load_dict is discarded here and
        # above; confirm that it mutates the estimator in place.
        self.best_model2.load_dict(loaded_data)
        self.best_model2.compile(X_train_transformer)

        print("loaded_data finished")

    def forward(self, x):
        """Return class probabilities for ``x`` as a squeezed float32 tensor.

        The prediction comes from ``self.best_model2.predict_proba`` called
        with ``fhe="execute"``; size-1 dimensions are removed from the result.
        """
        # NOTE(review): the output is rebuilt with torch.tensor() from a
        # non-torch prediction, so a traced export presumably records it as a
        # constant rather than as a function of x -- confirm.
        prediction = self.best_model2.predict_proba(x, fhe="execute")

        prediction_tensor = torch.tensor(prediction, dtype=torch.float32)
        return prediction_tensor.squeeze()  # Remove extra dimensions if any

    # Function to convert text to tensor
    def text_to_tensor(self, list_text, transformer_model, tokenizer, device):
        """Embed each text as the token-mean of the model's last hidden state.

        Each text is encoded separately, pushed through ``transformer_model``
        on ``device`` and averaged over dimension 1; the per-text arrays are
        concatenated along axis 0 and returned as one NumPy array.

        Raises:
            ValueError: from ``np.concatenate`` when ``list_text`` is empty.
        """
        tokenized_text = [tokenizer.encode(text, return_tensors="pt") for text in list_text]
        embeddings = []

        # Inference only: disabling autograd avoids building a graph for
        # every forward pass without changing the computed values.
        with torch.no_grad():
            for tokenized_x in tqdm.tqdm(tokenized_text):
                # Index [1] selects the hidden-states tuple from the model
                # output, [-1] its last layer.
                last_hidden = transformer_model(tokenized_x.to(device), output_hidden_states=True)[1][-1]
                embeddings.append(last_hidden.mean(dim=1).detach().cpu().numpy())

        return np.concatenate(embeddings, axis=0)
85
+
86
+
87
# Request body schema for the POST /get_zk_proof endpoint.
class ZKProofRequest(BaseModel):
    # Raw text to classify; the endpoint wraps it in a one-element list
    # before passing it to AIWordsModel.text_to_tensor.
    text: str
89
+
90
+
91
# Single shared model instance, built once at import time. AIWordsModel.__init__
# reads the training CSV, loads the transformer, runs a grid search and compiles
# the estimator, so importing this module is slow.
circuit = AIWordsModel()
92
+
93
+
94
def _dump_json(path, payload):
    """Write *payload* to *path* as JSON, closing the file before returning."""
    with open(path, 'w') as f:
        json.dump(payload, f)


def _require_ok(result, step):
    """Raise ``RuntimeError`` naming *step* unless *result* is exactly ``True``."""
    if result is not True:
        raise RuntimeError(f"ezkl step failed: {step}")


def _require_file(path, step):
    """Raise ``RuntimeError`` naming *step* if no file exists at *path*."""
    if not os.path.isfile(path):
        raise RuntimeError(f"ezkl step '{step}' did not produce {path}")


@app.post("/get_zk_proof")
async def get_zk_proof(request: ZKProofRequest):
    """Classify ``request.text`` and produce a ZK proof for the run.

    All artifacts are written to a fresh ``zkml_non_encrypted/<uuid>``
    directory. The handler embeds the text, runs ``circuit`` on it, exports
    the module to ONNX, then drives the ezkl pipeline: settings, calibration,
    circuit compilation, SRS download, witness, key setup, proof and local
    verification. Finally it generates and deploys an EVM verifier contract
    and verifies the proof against it.

    Returns:
        A dict with ``output`` (model output as a list), ``proof``
        (base64-encoded proof file) and ``verify_contract_addr``; or
        ``{"error": ...}`` if the deployed-address file is missing.

    Raises:
        RuntimeError: when an ezkl step does not return ``True`` or does not
            produce its expected output file.
    """
    folder_path = f"zkml_non_encrypted/{uuid.uuid4()}"
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(folder_path, exist_ok=True)

    model_path = os.path.join(folder_path, 'network.onnx')
    compiled_model_path = os.path.join(folder_path, 'network.compiled')
    pk_path = os.path.join(folder_path, 'test.pk')
    vk_path = os.path.join(folder_path, 'test.vk')
    settings_path = os.path.join(folder_path, 'settings.json')

    witness_path = os.path.join(folder_path, 'witness.json')
    input_data_path = os.path.join(folder_path, 'input.json')
    srs_path = os.path.join(folder_path, 'kzg14.srs')
    output_path = os.path.join(folder_path, 'output.json')

    # After training, export to onnx (network.onnx) and create a data file (input.json)
    words = [request.text]
    x_list = circuit.text_to_tensor(words, circuit.transformer_model, circuit.tokenizer, circuit.device)
    x = torch.tensor(x_list, dtype=torch.float32)

    # Flips the neural net into inference mode
    circuit.eval()

    # Get the output of the model
    with torch.no_grad():
        output = circuit(x)
    # Save the output to a file
    output_data = output.detach().numpy().tolist()
    _dump_json(output_path, output_data)

    # Export the model
    torch.onnx.export(circuit,  # model being run
                      x,  # model input (or a tuple for multiple inputs)
                      model_path,  # where to save the model (can be a file or file-like object)
                      export_params=True,  # store the trained parameter weights inside the model file
                      opset_version=10,  # the ONNX version to export the model to
                      do_constant_folding=True,  # whether to execute constant folding for optimization
                      input_names=['input'],  # the model's input names
                      output_names=['output'],  # the model's output names
                      dynamic_axes={'input': {0: 'batch_size'},  # variable length axes
                                    'output': {0: 'batch_size'}})

    data = dict(input_data=x.tolist())

    # Serialize data into file (closed, hence flushed, before ezkl reads it):
    _dump_json(input_data_path, data)

    py_run_args = ezkl.PyRunArgs()
    py_run_args.input_visibility = "public"
    py_run_args.output_visibility = "public"
    py_run_args.param_visibility = "fixed"  # "fixed" for params means that the committed to params are used for all proofs

    _require_ok(ezkl.gen_settings(model_path, settings_path, py_run_args=py_run_args), "gen_settings")

    # The same single sample doubles as the calibration data set.
    cal_path = os.path.join(folder_path, 'calibration.json')
    _dump_json(cal_path, data)

    await ezkl.calibrate_settings(cal_path, model_path, settings_path, "resources")

    _require_ok(ezkl.compile_circuit(model_path, compiled_model_path, settings_path), "compile_circuit")

    # srs path
    _require_ok(await ezkl.get_srs(settings_path, srs_path=srs_path), "get_srs")

    # now generate the witness file
    await ezkl.gen_witness(input_data_path, compiled_model_path, witness_path)
    _require_file(witness_path, "gen_witness")

    # Set up the circuit: this produces the verification and proving keys.
    _require_ok(
        ezkl.setup(
            compiled_model_path,
            vk_path,
            pk_path,
            srs_path
        ),
        "setup",
    )
    _require_file(vk_path, "setup")
    _require_file(pk_path, "setup")
    _require_file(settings_path, "setup")

    # GENERATE A PROOF
    proof_path = os.path.join(folder_path, 'test.pf')
    ezkl.prove(
        witness_path,
        compiled_model_path,
        pk_path,
        proof_path,
        "single",
        srs_path
    )
    _require_file(proof_path, "prove")

    # VERIFY IT ON LOCAL
    _require_ok(
        ezkl.verify(
            proof_path,
            settings_path,
            vk_path,
            srs_path
        ),
        "verify",
    )
    print("verified on local")

    # VERIFY IT ON CHAIN
    verify_sol_code_path = os.path.join(folder_path, 'verify.sol')
    verify_sol_abi_path = os.path.join(folder_path, 'verify.abi')
    _require_ok(
        await ezkl.create_evm_verifier(
            vk_path,
            settings_path,
            verify_sol_code_path,
            verify_sol_abi_path,
            srs_path
        ),
        "create_evm_verifier",
    )
    verify_contract_addr_file = f"{folder_path}/addr.txt"
    # NOTE(review): hard-coded RPC endpoint; consider moving it to configuration.
    rpc_url = "http://103.231.86.33:10219"
    await ezkl.deploy_evm(
        addr_path=verify_contract_addr_file,
        rpc_url=rpc_url,
        sol_code_path=verify_sol_code_path
    )
    # The deployed contract address is read back from the file passed as addr_path.
    if not os.path.exists(verify_contract_addr_file):
        print(f"error: File {verify_contract_addr_file} does not exist.")
        return {"error": "Contract address file not found"}
    with open(verify_contract_addr_file, 'r') as file:
        verify_contract_addr = file.read()

    _require_ok(
        await ezkl.verify_evm(
            addr_verifier=verify_contract_addr,
            proof_path=proof_path,
            rpc_url=rpc_url
        ),
        "verify_evm",
    )
    print("verified on chain")

    # Read proof file content
    with open(proof_path, 'rb') as f:
        proof_content = base64.b64encode(f.read()).decode('utf-8')

    return {"output": output_data, "proof": proof_content, "verify_contract_addr": verify_contract_addr}
zkml_encrypted.py CHANGED
@@ -184,7 +184,7 @@ async def get_zk_proof(request: ZKProofRequest):
184
  )
185
  assert res is True
186
  verify_contract_addr_file = f"{folder_path}/addr.txt"
187
- rpc_url = "http://127.0.0.1:3030"
188
  await ezkl.deploy_evm(
189
  addr_path=verify_contract_addr_file,
190
  rpc_url=rpc_url,
 
184
  )
185
  assert res is True
186
  verify_contract_addr_file = f"{folder_path}/addr.txt"
187
+ rpc_url = "http://103.231.86.33:10219"
188
  await ezkl.deploy_evm(
189
  addr_path=verify_contract_addr_file,
190
  rpc_url=rpc_url,
zkml_non_encrypted.py CHANGED
@@ -64,15 +64,11 @@ class AIWordsModel(nn.Module):
64
  print(f"loaded_data finished")
65
 
66
  def forward(self, x):
67
- print(f"forward input: {x}")
68
-
69
  prediction = self.best_model2.predict_proba(x, fhe="execute")
70
- print(f"prediction: {prediction}")
71
 
72
  prediction_tensor = torch.tensor(prediction, dtype=torch.float32)
73
  prediction_tensor = prediction_tensor.squeeze() # Remove extra dimensions if any
74
 
75
- print(f"tensor_output: {prediction_tensor}")
76
  return prediction_tensor
77
 
78
  # Function to convert text to tensor
@@ -128,7 +124,6 @@ async def get_zk_proof(request: ZKProofRequest):
128
  with open(output_path, 'w') as f:
129
  json.dump(output_data, f)
130
 
131
- print("start")
132
  # Export the model
133
  torch.onnx.export(circuit, # model being run
134
  x, # model input (or a tuple for multiple inputs)
@@ -140,7 +135,6 @@ async def get_zk_proof(request: ZKProofRequest):
140
  output_names=['output'], # the model's output names
141
  dynamic_axes={'input': {0: 'batch_size'}, # variable length axes
142
  'output': {0: 'batch_size'}})
143
- print("end")
144
 
145
  data = dict(input_data=x.tolist())
146
 
@@ -224,7 +218,7 @@ async def get_zk_proof(request: ZKProofRequest):
224
  )
225
  assert res is True
226
  verify_contract_addr_file = f"{folder_path}/addr.txt"
227
- rpc_url = "http://127.0.0.1:3030"
228
  await ezkl.deploy_evm(
229
  addr_path=verify_contract_addr_file,
230
  rpc_url=rpc_url,
 
64
  print(f"loaded_data finished")
65
 
66
  def forward(self, x):
 
 
67
  prediction = self.best_model2.predict_proba(x, fhe="execute")
 
68
 
69
  prediction_tensor = torch.tensor(prediction, dtype=torch.float32)
70
  prediction_tensor = prediction_tensor.squeeze() # Remove extra dimensions if any
71
 
 
72
  return prediction_tensor
73
 
74
  # Function to convert text to tensor
 
124
  with open(output_path, 'w') as f:
125
  json.dump(output_data, f)
126
 
 
127
  # Export the model
128
  torch.onnx.export(circuit, # model being run
129
  x, # model input (or a tuple for multiple inputs)
 
135
  output_names=['output'], # the model's output names
136
  dynamic_axes={'input': {0: 'batch_size'}, # variable length axes
137
  'output': {0: 'batch_size'}})
 
138
 
139
  data = dict(input_data=x.tolist())
140
 
 
218
  )
219
  assert res is True
220
  verify_contract_addr_file = f"{folder_path}/addr.txt"
221
+ rpc_url = "http://103.231.86.33:10219"
222
  await ezkl.deploy_evm(
223
  addr_path=verify_contract_addr_file,
224
  rpc_url=rpc_url,