MyronZhang committed on
Commit
a0d56b3
·
1 Parent(s): 0b64ad2

Sync from GitHub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. 2_run/app.py +6 -6
  2. hf_repo/0_download_data.sh +3 -0
  3. hf_repo/2_run/app.py +3 -3
  4. hf_repo/2_run/zkml_encrypted_server.py +2 -1
  5. hf_repo/2_run/zkml_non_encrypted_server.py +2 -2
  6. hf_repo/config.py.example +3 -1
  7. hf_repo/hf_repo/2_run/app.py +8 -5
  8. hf_repo/hf_repo/hf_repo/.gitignore +1 -0
  9. hf_repo/hf_repo/hf_repo/README.md +1 -1
  10. hf_repo/hf_repo/hf_repo/hf_repo/2_run/app.py +3 -0
  11. hf_repo/hf_repo/hf_repo/hf_repo/2_run/zkml_encrypted_server.py +2 -2
  12. hf_repo/hf_repo/hf_repo/hf_repo/2_run/zkml_non_encrypted_server.py +2 -2
  13. hf_repo/hf_repo/hf_repo/hf_repo/config.py.example +3 -0
  14. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/README.md +2 -1
  15. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/README.md +2 -2
  16. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/app.py +2 -15
  17. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/README.md +0 -1
  18. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/zkml_encrypted_server.py +2 -2
  19. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/zkml_encrypted_server.py +11 -1
  20. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/zkml_encrypted_server.py +0 -2
  21. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/app.py +1 -1
  22. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/zkml_encrypted_server.py +11 -11
  23. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/app.py +1 -1
  24. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/zkml_non_encrypted_server.py +2 -1
  25. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/app.py +3 -3
  26. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/0_download_data.sh +1 -1
  27. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/app.py +3 -3
  28. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/README.md +6 -4
  29. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/0_download_data.sh +15 -0
  30. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/1_build/main.py +75 -0
  31. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/app.py +407 -0
  32. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/fhe_server.py +39 -0
  33. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/transformer_vectorizer.py +58 -0
  34. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/zkml_encrypted_server.py +212 -0
  35. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/zkml_non_encrypted_server.py +245 -0
  36. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/.gitignore +6 -2
  37. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/README.md +24 -10
  38. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/.github/workflows/sync_to_hf.yml +43 -0
  39. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/deployment/serialized_model_zkml +0 -0
  40. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/.gitattributes +33 -0
  41. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/.gitignore +9 -0
  42. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/README.md +54 -0
  43. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/SentimentClassification.ipynb +1053 -0
  44. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/app.py +408 -0
  45. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/compile.py +35 -0
  46. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/deployment/samples_for_compilation.csv +0 -0
  47. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/deployment/sentiment_fhe_model/client.zip +3 -0
  48. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/deployment/sentiment_fhe_model/server.zip +3 -0
  49. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/deployment/sentiment_fhe_model/versions.json +1 -0
  50. hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/deployment/serialized_model +0 -0
2_run/app.py CHANGED
@@ -22,9 +22,9 @@ if config.gpu_enable:
22
  # This repository's directory
23
  REPO_DIR = Path(__file__).parent
24
 
25
- subprocess.Popen(["uvicorn", "fhe_server:app", "--port", "8000"], cwd=REPO_DIR)
26
- subprocess.Popen(["uvicorn", "zkml_non_encrypted_server:app", "--port", "8001"], cwd=REPO_DIR)
27
- subprocess.Popen(["uvicorn", "zkml_encrypted_server:app", "--port", "8002"], cwd=REPO_DIR)
28
 
29
  # Wait 5 sec for the server to start
30
  time.sleep(5)
@@ -132,7 +132,7 @@ def run_fhe(user_id):
132
  query["encrypted_encoding"] = encrypted_quantized_encoding
133
  headers = {"Content-type": "application/json"}
134
  response = requests.post(
135
- "http://localhost:8000/predict_sentiment", data=json.dumps(query), headers=headers
136
  )
137
  encrypted_prediction = base64.b64decode(response.json()["encrypted_prediction"])
138
 
@@ -172,7 +172,7 @@ def get_zk_proof_non_encrypted(text):
172
  headers = {"Content-type": "application/json"}
173
  query = {"text": text}
174
  response = requests.post(
175
- "http://localhost:8001/get_zk_proof", data=json.dumps(query), headers=headers
176
  )
177
  result = response.json()
178
 
@@ -209,7 +209,7 @@ def get_zk_proof_encrypted(user_id):
209
  query["encrypted_encoding"] = encrypted_quantized_encoding
210
  headers = {"Content-type": "application/json"}
211
  response = requests.post(
212
- "http://localhost:8002/get_zk_proof", data=json.dumps(query), headers=headers
213
  )
214
  result = response.json()
215
  return result["output"], result["proof"], result["verify_contract_addr"]
 
22
  # This repository's directory
23
  REPO_DIR = Path(__file__).parent
24
 
25
+ subprocess.Popen(["uvicorn", "fhe_server:app", "--port", "8080"], cwd=REPO_DIR)
26
+ subprocess.Popen(["uvicorn", "zkml_non_encrypted_server:app", "--port", "8081"], cwd=REPO_DIR)
27
+ subprocess.Popen(["uvicorn", "zkml_encrypted_server:app", "--port", "8082"], cwd=REPO_DIR)
28
 
29
  # Wait 5 sec for the server to start
30
  time.sleep(5)
 
132
  query["encrypted_encoding"] = encrypted_quantized_encoding
133
  headers = {"Content-type": "application/json"}
134
  response = requests.post(
135
+ "http://localhost:8080/predict_sentiment", data=json.dumps(query), headers=headers
136
  )
137
  encrypted_prediction = base64.b64decode(response.json()["encrypted_prediction"])
138
 
 
172
  headers = {"Content-type": "application/json"}
173
  query = {"text": text}
174
  response = requests.post(
175
+ "http://localhost:8081/get_zk_proof", data=json.dumps(query), headers=headers
176
  )
177
  result = response.json()
178
 
 
209
  query["encrypted_encoding"] = encrypted_quantized_encoding
210
  headers = {"Content-type": "application/json"}
211
  response = requests.post(
212
+ "http://localhost:8082/get_zk_proof", data=json.dumps(query), headers=headers
213
  )
214
  result = response.json()
215
  return result["output"], result["proof"], result["verify_contract_addr"]
hf_repo/0_download_data.sh CHANGED
@@ -2,6 +2,9 @@
2
 
3
  set -e
4
 
 
 
 
5
  # You need to install kaggle using pip and then have a valid ~/.kaggle/kaggle.json, that you can
6
  # generate from "Create new API token" on your account page in kaggle.com
7
  # Alternatively, the dataset can be downloaded manually at
 
2
 
3
  set -e
4
 
5
+ export KAGGLE_USERNAME=myronzhangweb3
6
+ export KAGGLE_KEY=REDACTED_KAGGLE_API_KEY
7
+
8
  # You need to install kaggle using pip and then have a valid ~/.kaggle/kaggle.json, that you can
9
  # generate from "Create new API token" on your account page in kaggle.com
10
  # Alternatively, the dataset can be downloaded manually at
hf_repo/2_run/app.py CHANGED
@@ -339,7 +339,7 @@ with demo:
339
  interactive=False,
340
  )
341
  zk_contract_non_encrypted = gr.Textbox(
342
- label="Verify Contract Address:",
343
  max_lines=1,
344
  interactive=False,
345
  )
@@ -363,7 +363,7 @@ with demo:
363
  interactive=False,
364
  )
365
  zk_contract_encrypted = gr.Textbox(
366
- label="Verify Contract Address:",
367
  max_lines=1,
368
  interactive=False,
369
  )
@@ -397,4 +397,4 @@ with demo:
397
  b_get_zk_proof_encrypted.click(get_zk_proof_encrypted, inputs=[user_id],
398
  outputs=[zk_encrypted_prediction, zk_proof_encrypted, zk_contract_encrypted])
399
 
400
- demo.launch(share=False, server_name="0.0.0.0", server_port=10003)
 
339
  interactive=False,
340
  )
341
  zk_contract_non_encrypted = gr.Textbox(
342
+ label=f"Verify Contract Address: ({config.chain_name})",
343
  max_lines=1,
344
  interactive=False,
345
  )
 
363
  interactive=False,
364
  )
365
  zk_contract_encrypted = gr.Textbox(
366
+ label=f"Verify Contract Address: ({config.chain_name})",
367
  max_lines=1,
368
  interactive=False,
369
  )
 
397
  b_get_zk_proof_encrypted.click(get_zk_proof_encrypted, inputs=[user_id],
398
  outputs=[zk_encrypted_prediction, zk_proof_encrypted, zk_contract_encrypted])
399
 
400
+ demo.launch(share=False, server_name="127.0.0.1", server_port=config.port)
hf_repo/2_run/zkml_encrypted_server.py CHANGED
@@ -14,7 +14,7 @@ from concrete.ml.deployment import FHEModelServer
14
  from fastapi import FastAPI
15
  from pydantic import BaseModel
16
 
17
- from config import rpc_url
18
 
19
  app = FastAPI()
20
 
@@ -187,6 +187,7 @@ async def get_zk_proof(request: ZKProofRequest):
187
  await ezkl.deploy_evm(
188
  addr_path=verify_contract_addr_file,
189
  rpc_url=rpc_url,
 
190
  sol_code_path=verify_sol_code_path
191
  )
192
  if os.path.exists(verify_contract_addr_file):
 
14
  from fastapi import FastAPI
15
  from pydantic import BaseModel
16
 
17
+ from config import rpc_url, private_key
18
 
19
  app = FastAPI()
20
 
 
187
  await ezkl.deploy_evm(
188
  addr_path=verify_contract_addr_file,
189
  rpc_url=rpc_url,
190
+ private_key=private_key,
191
  sol_code_path=verify_sol_code_path
192
  )
193
  if os.path.exists(verify_contract_addr_file):
hf_repo/2_run/zkml_non_encrypted_server.py CHANGED
@@ -13,14 +13,13 @@ import os
13
  import json
14
  import torch
15
  import base64
16
- from concrete.ml.deployment import FHEModelServer
17
  from concrete.ml.sklearn import XGBClassifier
18
  import tqdm
19
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
20
  from fastapi import FastAPI
21
  from pydantic import BaseModel
22
 
23
- from config import rpc_url
24
 
25
  app = FastAPI()
26
 
@@ -223,6 +222,7 @@ async def get_zk_proof(request: ZKProofRequest):
223
  await ezkl.deploy_evm(
224
  addr_path=verify_contract_addr_file,
225
  rpc_url=rpc_url,
 
226
  sol_code_path=verify_sol_code_path
227
  )
228
  if os.path.exists(verify_contract_addr_file):
 
13
  import json
14
  import torch
15
  import base64
 
16
  from concrete.ml.sklearn import XGBClassifier
17
  import tqdm
18
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
19
  from fastapi import FastAPI
20
  from pydantic import BaseModel
21
 
22
+ from config import rpc_url, private_key
23
 
24
  app = FastAPI()
25
 
 
222
  await ezkl.deploy_evm(
223
  addr_path=verify_contract_addr_file,
224
  rpc_url=rpc_url,
225
+ private_key=private_key,
226
  sol_code_path=verify_sol_code_path
227
  )
228
  if os.path.exists(verify_contract_addr_file):
hf_repo/config.py.example CHANGED
@@ -1,3 +1,5 @@
 
1
  gpu_enable = False
 
2
  rpc_url = "http://103.231.86.33:10219"
3
- # rpc_url = "http://172.18.38.166:10001"
 
1
+ port = 9000
2
  gpu_enable = False
3
+ chain_name = "Sepolia"
4
  rpc_url = "http://103.231.86.33:10219"
5
+ private_key = "xxx"
hf_repo/hf_repo/2_run/app.py CHANGED
@@ -12,9 +12,12 @@ import base64
12
  import subprocess
13
  import shutil
14
  import time
 
15
 
16
- os.environ['ENABLE_ICICLE_GPU'] = 'true'
17
- os.environ['RUST_BACKTRACE']='full'
 
 
18
 
19
  # This repository's directory
20
  REPO_DIR = Path(__file__).parent
@@ -336,7 +339,7 @@ with demo:
336
  interactive=False,
337
  )
338
  zk_contract_non_encrypted = gr.Textbox(
339
- label="Verify Contract Address:",
340
  max_lines=1,
341
  interactive=False,
342
  )
@@ -360,7 +363,7 @@ with demo:
360
  interactive=False,
361
  )
362
  zk_contract_encrypted = gr.Textbox(
363
- label="Verify Contract Address:",
364
  max_lines=1,
365
  interactive=False,
366
  )
@@ -394,4 +397,4 @@ with demo:
394
  b_get_zk_proof_encrypted.click(get_zk_proof_encrypted, inputs=[user_id],
395
  outputs=[zk_encrypted_prediction, zk_proof_encrypted, zk_contract_encrypted])
396
 
397
- demo.launch(share=False, server_name="0.0.0.0", server_port=10003)
 
12
  import subprocess
13
  import shutil
14
  import time
15
+ import config
16
 
17
+ if config.gpu_enable:
18
+ print("gpu enable")
19
+ os.environ['ENABLE_ICICLE_GPU'] = 'true'
20
+ os.environ['RUST_BACKTRACE'] = 'full'
21
 
22
  # This repository's directory
23
  REPO_DIR = Path(__file__).parent
 
339
  interactive=False,
340
  )
341
  zk_contract_non_encrypted = gr.Textbox(
342
+ label=f"Verify Contract Address: ({config.chain_name})",
343
  max_lines=1,
344
  interactive=False,
345
  )
 
363
  interactive=False,
364
  )
365
  zk_contract_encrypted = gr.Textbox(
366
+ label=f"Verify Contract Address: ({config.chain_name})",
367
  max_lines=1,
368
  interactive=False,
369
  )
 
397
  b_get_zk_proof_encrypted.click(get_zk_proof_encrypted, inputs=[user_id],
398
  outputs=[zk_encrypted_prediction, zk_proof_encrypted, zk_contract_encrypted])
399
 
400
+ demo.launch(share=False, server_name="127.0.0.1", server_port=config.port)
hf_repo/hf_repo/hf_repo/.gitignore CHANGED
@@ -11,3 +11,4 @@ docker/3_run/docker-compose.yml
11
  /2_run/zkml_encrypted/
12
  /2_run/zkml_non_encrypted/
13
  /deployment/
 
 
11
  /2_run/zkml_encrypted/
12
  /2_run/zkml_non_encrypted/
13
  /deployment/
14
+ /2_run/config.py
hf_repo/hf_repo/hf_repo/README.md CHANGED
@@ -55,8 +55,8 @@ python3 main.py
55
  - In a terminal:
56
 
57
  ```bash
58
- cp config.py.example config.py
59
  cd 2_run
 
60
  python3 app.py
61
  ```
62
 
 
55
  - In a terminal:
56
 
57
  ```bash
 
58
  cd 2_run
59
+ cp ../config.py.example config.py
60
  python3 app.py
61
  ```
62
 
hf_repo/hf_repo/hf_repo/hf_repo/2_run/app.py CHANGED
@@ -13,6 +13,9 @@ import subprocess
13
  import shutil
14
  import time
15
 
 
 
 
16
  # This repository's directory
17
  REPO_DIR = Path(__file__).parent
18
 
 
13
  import shutil
14
  import time
15
 
16
+ os.environ['ENABLE_ICICLE_GPU'] = 'true'
17
+ os.environ['RUST_BACKTRACE']='full'
18
+
19
  # This repository's directory
20
  REPO_DIR = Path(__file__).parent
21
 
hf_repo/hf_repo/hf_repo/hf_repo/2_run/zkml_encrypted_server.py CHANGED
@@ -14,6 +14,8 @@ from concrete.ml.deployment import FHEModelServer
14
  from fastapi import FastAPI
15
  from pydantic import BaseModel
16
 
 
 
17
  app = FastAPI()
18
 
19
  evaluation_key = None
@@ -182,8 +184,6 @@ async def get_zk_proof(request: ZKProofRequest):
182
  )
183
  assert res is True
184
  verify_contract_addr_file = f"{folder_path}/addr.txt"
185
- # rpc_url = "http://172.18.38.166:10001"
186
- rpc_url = "http://103.231.86.33:10219"
187
  await ezkl.deploy_evm(
188
  addr_path=verify_contract_addr_file,
189
  rpc_url=rpc_url,
 
14
  from fastapi import FastAPI
15
  from pydantic import BaseModel
16
 
17
+ from config import rpc_url
18
+
19
  app = FastAPI()
20
 
21
  evaluation_key = None
 
184
  )
185
  assert res is True
186
  verify_contract_addr_file = f"{folder_path}/addr.txt"
 
 
187
  await ezkl.deploy_evm(
188
  addr_path=verify_contract_addr_file,
189
  rpc_url=rpc_url,
hf_repo/hf_repo/hf_repo/hf_repo/2_run/zkml_non_encrypted_server.py CHANGED
@@ -20,6 +20,8 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer
20
  from fastapi import FastAPI
21
  from pydantic import BaseModel
22
 
 
 
23
  app = FastAPI()
24
 
25
  evaluation_key = None
@@ -218,8 +220,6 @@ async def get_zk_proof(request: ZKProofRequest):
218
  )
219
  assert res is True
220
  verify_contract_addr_file = f"{folder_path}/addr.txt"
221
- rpc_url = "http://172.18.38.166:10001"
222
- # rpc_url = "http://103.231.86.33:10219"
223
  await ezkl.deploy_evm(
224
  addr_path=verify_contract_addr_file,
225
  rpc_url=rpc_url,
 
20
  from fastapi import FastAPI
21
  from pydantic import BaseModel
22
 
23
+ from config import rpc_url
24
+
25
  app = FastAPI()
26
 
27
  evaluation_key = None
 
220
  )
221
  assert res is True
222
  verify_contract_addr_file = f"{folder_path}/addr.txt"
 
 
223
  await ezkl.deploy_evm(
224
  addr_path=verify_contract_addr_file,
225
  rpc_url=rpc_url,
hf_repo/hf_repo/hf_repo/hf_repo/config.py.example ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gpu_enable = False
2
+ rpc_url = "http://103.231.86.33:10219"
3
+ # rpc_url = "http://172.18.38.166:10001"
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/README.md CHANGED
@@ -11,7 +11,7 @@ tags: [FHE, EZKL, PPML, privacy, privacy preserving machine learning, homomorphi
11
  python_version: 3.10.11
12
  ---
13
 
14
- # Sentiment Analysis With FHE
15
 
16
  ## Launch locally
17
 
@@ -55,6 +55,7 @@ python3 main.py
55
  - In a terminal:
56
 
57
  ```bash
 
58
  cd 2_run
59
  python3 app.py
60
  ```
 
11
  python_version: 3.10.11
12
  ---
13
 
14
+ # Sentiment Analysis With FHE And EZKL
15
 
16
  ## Launch locally
17
 
 
55
  - In a terminal:
56
 
57
  ```bash
58
+ cp config.py.example config.py
59
  cd 2_run
60
  python3 app.py
61
  ```
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Sentiment Analysis On Encrypted Data Using Fully Homomorphic Encryption
3
  emoji: 🥷💬
4
  colorFrom: yellow
5
  colorTo: yellow
@@ -7,7 +7,7 @@ sdk: gradio
7
  sdk_version: 4.44.0
8
  app_file: app.py
9
  pinned: true
10
- tags: [FHE, PPML, privacy, privacy preserving machine learning, homomorphic encryption, security]
11
  python_version: 3.10.11
12
  ---
13
 
 
1
  ---
2
+ title: Sentiment Analysis On Encrypted Data Using Fully Homomorphic Encryption And EZKL
3
  emoji: 🥷💬
4
  colorFrom: yellow
5
  colorTo: yellow
 
7
  sdk_version: 4.44.0
8
  app_file: app.py
9
  pinned: true
10
+ tags: [FHE, EZKL, PPML, privacy, privacy preserving machine learning, homomorphic encryption, security]
11
  python_version: 3.10.11
12
  ---
13
 
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/app.py CHANGED
@@ -223,20 +223,10 @@ with demo:
223
  <img width=200 src="https://user-images.githubusercontent.com/5758427/197816413-d9cddad3-ba38-4793-847d-120975e1da11.png">
224
  </p>
225
 
226
- <h2 align="center">Sentiment Analysis On Encrypted Data Using Homomorphic Encryption</h2>
227
 
228
  <p align="center">
229
- <a href="https://github.com/zama-ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="https://user-images.githubusercontent.com/5758427/197972109-faaaff3e-10e2-4ab6-80f5-7531f7cfb08f.png">Concrete-ML</a>
230
-
231
- <a href="https://docs.zama.ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="https://user-images.githubusercontent.com/5758427/197976802-fddd34c5-f59a-48d0-9bff-7ad1b00cb1fb.png">Documentation</a>
232
-
233
- <a href="https://zama.ai/community"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="https://user-images.githubusercontent.com/5758427/197977153-8c9c01a7-451a-4993-8e10-5a6ed5343d02.png">Community</a>
234
-
235
- <a href="https://twitter.com/zama_fhe"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="https://user-images.githubusercontent.com/5758427/197975044-bab9d199-e120-433b-b3be-abd73b211a54.png">@zama_fhe</a>
236
- </p>
237
-
238
- <p align="center">
239
- <img src="https://user-images.githubusercontent.com/56846628/219329304-6868be9e-5ce8-4279-9123-4cb1bc0c2fb5.png" width="60%" height="60%">
240
  </p>
241
  """
242
  )
@@ -401,7 +391,4 @@ with demo:
401
  b_get_zk_proof_encrypted.click(get_zk_proof_encrypted, inputs=[user_id],
402
  outputs=[zk_encrypted_prediction, zk_proof_encrypted, zk_contract_encrypted])
403
 
404
- gr.Markdown(
405
- "The app was built with [Concrete-ML](https://github.com/zama-ai/concrete-ml), a Privacy-Preserving Machine Learning (PPML) open-source set of tools by [Zama](https://zama.ai/). Try it yourself and don't forget to star on Github &#11088;."
406
- )
407
  demo.launch(share=False, server_name="0.0.0.0", server_port=10003)
 
223
  <img width=200 src="https://user-images.githubusercontent.com/5758427/197816413-d9cddad3-ba38-4793-847d-120975e1da11.png">
224
  </p>
225
 
226
+ <h2 align="center">Sentiment Analysis On Encrypted Data Using Homomorphic Encryption And EZKL</h2>
227
 
228
  <p align="center">
229
+ <img src="https://privecho.web3idea.xyz/images/sentiment-analysis.png" width="60%" height="60%">
 
 
 
 
 
 
 
 
 
 
230
  </p>
231
  """
232
  )
 
391
  b_get_zk_proof_encrypted.click(get_zk_proof_encrypted, inputs=[user_id],
392
  outputs=[zk_encrypted_prediction, zk_proof_encrypted, zk_contract_encrypted])
393
 
 
 
 
394
  demo.launch(share=False, server_name="0.0.0.0", server_port=10003)
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/README.md CHANGED
@@ -18,7 +18,6 @@ python_version: 3.10.11
18
  - First, create a virtual env and activate it:
19
 
20
  ```bash
21
- source ~/anaconda3/bin/activate
22
  conda create --name sentiment_analysis_demo python=3.10.11
23
  conda activate sentiment_analysis_demo
24
  ```
 
18
  - First, create a virtual env and activate it:
19
 
20
  ```bash
 
21
  conda create --name sentiment_analysis_demo python=3.10.11
22
  conda activate sentiment_analysis_demo
23
  ```
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/zkml_encrypted_server.py CHANGED
@@ -208,9 +208,9 @@ async def get_zk_proof(request: ZKProofRequest):
208
  with open(proof_path, 'rb') as f:
209
  proof_content = base64.b64encode(f.read()).decode('utf-8')
210
 
211
- return {"output": array_to_hex_string(output_data)[:100],
212
  "output_path": output_path,
213
- "proof": proof_content[:100],
214
  "proof_path": proof_path,
215
  "verify_contract_addr": verify_contract_addr}
216
 
 
208
  with open(proof_path, 'rb') as f:
209
  proof_content = base64.b64encode(f.read()).decode('utf-8')
210
 
211
+ return {"output": array_to_hex_string(output_data)[:1000],
212
  "output_path": output_path,
213
+ "proof": proof_content[:500],
214
  "proof_path": proof_path,
215
  "verify_contract_addr": verify_contract_addr}
216
 
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/zkml_encrypted_server.py CHANGED
@@ -182,6 +182,7 @@ async def get_zk_proof(request: ZKProofRequest):
182
  )
183
  assert res is True
184
  verify_contract_addr_file = f"{folder_path}/addr.txt"
 
185
  rpc_url = "http://103.231.86.33:10219"
186
  await ezkl.deploy_evm(
187
  addr_path=verify_contract_addr_file,
@@ -207,4 +208,13 @@ async def get_zk_proof(request: ZKProofRequest):
207
  with open(proof_path, 'rb') as f:
208
  proof_content = base64.b64encode(f.read()).decode('utf-8')
209
 
210
- return {"output": output_data, "proof": proof_content, "verify_contract_addr": verify_contract_addr}
 
 
 
 
 
 
 
 
 
 
182
  )
183
  assert res is True
184
  verify_contract_addr_file = f"{folder_path}/addr.txt"
185
+ # rpc_url = "http://172.18.38.166:10001"
186
  rpc_url = "http://103.231.86.33:10219"
187
  await ezkl.deploy_evm(
188
  addr_path=verify_contract_addr_file,
 
208
  with open(proof_path, 'rb') as f:
209
  proof_content = base64.b64encode(f.read()).decode('utf-8')
210
 
211
+ return {"output": array_to_hex_string(output_data)[:100],
212
+ "output_path": output_path,
213
+ "proof": proof_content[:100],
214
+ "proof_path": proof_path,
215
+ "verify_contract_addr": verify_contract_addr}
216
+
217
+
218
+ def array_to_hex_string(array):
219
+ hex_string = ''.join(format(num, '02x') for num in array)
220
+ return hex_string
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/zkml_encrypted_server.py CHANGED
@@ -87,7 +87,6 @@ async def get_zk_proof(request: ZKProofRequest):
87
  with open(output_path, 'w') as f:
88
  json.dump(output_data, f)
89
 
90
- print("start")
91
  # Export the model
92
  torch.onnx.export(circuit, # model being run
93
  x, # model input (or a tuple for multiple inputs)
@@ -99,7 +98,6 @@ async def get_zk_proof(request: ZKProofRequest):
99
  output_names=['output'], # the model's output names
100
  dynamic_axes={'input': {0: 'batch_size'}, # variable length axes
101
  'output': {0: 'batch_size'}})
102
- print("end")
103
 
104
  data = dict(input_data=x.tolist())
105
 
 
87
  with open(output_path, 'w') as f:
88
  json.dump(output_data, f)
89
 
 
90
  # Export the model
91
  torch.onnx.export(circuit, # model being run
92
  x, # model input (or a tuple for multiple inputs)
 
98
  output_names=['output'], # the model's output names
99
  dynamic_axes={'input': {0: 'batch_size'}, # variable length axes
100
  'output': {0: 'batch_size'}})
 
101
 
102
  data = dict(input_data=x.tolist())
103
 
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/app.py CHANGED
@@ -348,7 +348,7 @@ with demo:
348
  interactive=False,
349
  )
350
 
351
- gr.Markdown("# Step 6: Get ZK Proof(encrypted input, Coming Soon)")
352
  gr.Markdown("## Server side")
353
  gr.Markdown(
354
  "Get zero-knowledge proof of the sentiment analysis computation (for encrypted input)."
 
348
  interactive=False,
349
  )
350
 
351
+ gr.Markdown("# Step 7: Get ZK Proof(encrypted input)")
352
  gr.Markdown("## Server side")
353
  gr.Markdown(
354
  "Get zero-knowledge proof of the sentiment analysis computation (for encrypted input)."
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/zkml_encrypted_server.py CHANGED
@@ -28,16 +28,16 @@ class AIModel(nn.Module):
28
  self.fhe_model = FHEModelServer("../deployment/sentiment_fhe_model")
29
 
30
  def forward(self, x):
31
- print(f"forward input: {x}")
32
 
33
  # Convert to bytes
34
  x = x[0]
35
  _encrypted_encoding = x.numpy().tobytes()
36
  prediction = self.fhe_model.run(_encrypted_encoding, evaluation_key)
37
- print(f"forward prediction hex: {prediction.hex()}")
38
 
39
  byte_tensor = torch.tensor(list(prediction), dtype=torch.uint8)
40
- print(f"tensor_output: {byte_tensor}")
41
 
42
  return byte_tensor
43
 
@@ -196,14 +196,14 @@ async def get_zk_proof(request: ZKProofRequest):
196
  else:
197
  print(f"error: File {verify_contract_addr_file} does not exist.")
198
  return {"error": "Contract address file not found"}
199
- # TODO verify failed. maybe need to change the x
200
- res = await ezkl.verify_evm(
201
- addr_verifier=verify_contract_addr,
202
- proof_path=proof_path,
203
- rpc_url=rpc_url
204
- )
205
- assert res is True
206
- print("verified on chain")
207
 
208
  # Read proof file content
209
  with open(proof_path, 'rb') as f:
 
28
  self.fhe_model = FHEModelServer("../deployment/sentiment_fhe_model")
29
 
30
  def forward(self, x):
31
+ # print(f"forward input: {x}")
32
 
33
  # Convert to bytes
34
  x = x[0]
35
  _encrypted_encoding = x.numpy().tobytes()
36
  prediction = self.fhe_model.run(_encrypted_encoding, evaluation_key)
37
+ # print(f"forward prediction hex: {prediction.hex()}")
38
 
39
  byte_tensor = torch.tensor(list(prediction), dtype=torch.uint8)
40
+ # print(f"tensor_output: {byte_tensor}")
41
 
42
  return byte_tensor
43
 
 
196
  else:
197
  print(f"error: File {verify_contract_addr_file} does not exist.")
198
  return {"error": "Contract address file not found"}
199
+ # TODO verify failed. It may be because the proof is too large.
200
+ # res = await ezkl.verify_evm(
201
+ # addr_verifier=verify_contract_addr,
202
+ # proof_path=proof_path,
203
+ # rpc_url=rpc_url
204
+ # )
205
+ # assert res is True
206
+ # print("verified on chain")
207
 
208
  # Read proof file content
209
  with open(proof_path, 'rb') as f:
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/app.py CHANGED
@@ -133,7 +133,7 @@ def run_fhe(user_id):
133
  # Save encrypted_prediction in a file, since too large to pass through regular Gradio
134
  # buttons, https://github.com/gradio-app/gradio/issues/1877
135
  numpy.save(f"tmp/tmp_encrypted_prediction_{user_id}.npy", encrypted_prediction)
136
- encrypted_prediction_shorten = list(encrypted_prediction)
137
  encrypted_prediction_shorten_hex = ''.join(f'{i:02x}' for i in encrypted_prediction_shorten)
138
  return encrypted_prediction_shorten_hex
139
 
 
133
  # Save encrypted_prediction in a file, since too large to pass through regular Gradio
134
  # buttons, https://github.com/gradio-app/gradio/issues/1877
135
  numpy.save(f"tmp/tmp_encrypted_prediction_{user_id}.npy", encrypted_prediction)
136
+ encrypted_prediction_shorten = list(encrypted_prediction)[:ENCRYPTED_DATA_BROWSER_LIMIT]
137
  encrypted_prediction_shorten_hex = ''.join(f'{i:02x}' for i in encrypted_prediction_shorten)
138
  return encrypted_prediction_shorten_hex
139
 
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/zkml_non_encrypted_server.py CHANGED
@@ -218,7 +218,8 @@ async def get_zk_proof(request: ZKProofRequest):
218
  )
219
  assert res is True
220
  verify_contract_addr_file = f"{folder_path}/addr.txt"
221
- rpc_url = "http://103.231.86.33:10219"
 
222
  await ezkl.deploy_evm(
223
  addr_path=verify_contract_addr_file,
224
  rpc_url=rpc_url,
 
218
  )
219
  assert res is True
220
  verify_contract_addr_file = f"{folder_path}/addr.txt"
221
+ rpc_url = "http://172.18.38.166:10001"
222
+ # rpc_url = "http://103.231.86.33:10219"
223
  await ezkl.deploy_evm(
224
  addr_path=verify_contract_addr_file,
225
  rpc_url=rpc_url,
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/app.py CHANGED
@@ -20,8 +20,8 @@ subprocess.Popen(["uvicorn", "fhe_server:app", "--port", "8000"], cwd=REPO_DIR)
20
  subprocess.Popen(["uvicorn", "zkml_non_encrypted_server:app", "--port", "8001"], cwd=REPO_DIR)
21
  subprocess.Popen(["uvicorn", "zkml_encrypted_server:app", "--port", "8002"], cwd=REPO_DIR)
22
 
23
- # Wait 30 sec for the server to start
24
- time.sleep(30)
25
 
26
  # Encrypted data limit for the browser to display
27
  # (encrypted data is too large to display in the browser)
@@ -404,4 +404,4 @@ with demo:
404
  gr.Markdown(
405
  "The app was built with [Concrete-ML](https://github.com/zama-ai/concrete-ml), a Privacy-Preserving Machine Learning (PPML) open-source set of tools by [Zama](https://zama.ai/). Try it yourself and don't forget to star on Github &#11088;."
406
  )
407
- demo.launch(share=False, server_port=10003)
 
20
  subprocess.Popen(["uvicorn", "zkml_non_encrypted_server:app", "--port", "8001"], cwd=REPO_DIR)
21
  subprocess.Popen(["uvicorn", "zkml_encrypted_server:app", "--port", "8002"], cwd=REPO_DIR)
22
 
23
+ # Wait 5 sec for the server to start
24
+ time.sleep(5)
25
 
26
  # Encrypted data limit for the browser to display
27
  # (encrypted data is too large to display in the browser)
 
404
  gr.Markdown(
405
  "The app was built with [Concrete-ML](https://github.com/zama-ai/concrete-ml), a Privacy-Preserving Machine Learning (PPML) open-source set of tools by [Zama](https://zama.ai/). Try it yourself and don't forget to star on Github &#11088;."
406
  )
407
+ demo.launch(share=False, server_name="0.0.0.0", server_port=10003)
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/0_download_data.sh CHANGED
@@ -7,7 +7,7 @@ set -e
7
  # Alternatively, the dataset can be downloaded manually at
8
  # https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment
9
  rm -rf local_datasets
10
- mkdir dataset/local_datasets
11
  cd dataset/local_datasets
12
 
13
  kaggle datasets download -d crowdflower/twitter-airline-sentiment
 
7
  # Alternatively, the dataset can be downloaded manually at
8
  # https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment
9
  rm -rf local_datasets
10
+ mkdir -p dataset/local_datasets
11
  cd dataset/local_datasets
12
 
13
  kaggle datasets download -d crowdflower/twitter-airline-sentiment
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/app.py CHANGED
@@ -27,7 +27,7 @@ time.sleep(30)
27
  # (encrypted data is too large to display in the browser)
28
  ENCRYPTED_DATA_BROWSER_LIMIT = 500
29
  N_USER_KEY_STORED = 20
30
- FHE_MODEL_PATH = "deployment/sentiment_fhe_model"
31
 
32
  print("Loading the transformer model...")
33
 
@@ -324,7 +324,7 @@ with demo:
324
 
325
  labels_sentiment = gr.Label(label="Sentiment:")
326
 
327
- gr.Markdown("# Step 6: Get ZK Proof(non-encrypted input, Coming Soon)")
328
  gr.Markdown("## Server side")
329
  gr.Markdown(
330
  "Get zero-knowledge proof of the sentiment analysis computation (for non-encrypted input)."
@@ -348,7 +348,7 @@ with demo:
348
  interactive=False,
349
  )
350
 
351
- gr.Markdown("# Step 6: Get ZK Proof(encrypted input)")
352
  gr.Markdown("## Server side")
353
  gr.Markdown(
354
  "Get zero-knowledge proof of the sentiment analysis computation (for encrypted input)."
 
27
  # (encrypted data is too large to display in the browser)
28
  ENCRYPTED_DATA_BROWSER_LIMIT = 500
29
  N_USER_KEY_STORED = 20
30
+ FHE_MODEL_PATH = "../deployment/sentiment_fhe_model"
31
 
32
  print("Loading the transformer model...")
33
 
 
324
 
325
  labels_sentiment = gr.Label(label="Sentiment:")
326
 
327
+ gr.Markdown("# Step 6: Get ZK Proof(non-encrypted input)")
328
  gr.Markdown("## Server side")
329
  gr.Markdown(
330
  "Get zero-knowledge proof of the sentiment analysis computation (for non-encrypted input)."
 
348
  interactive=False,
349
  )
350
 
351
+ gr.Markdown("# Step 6: Get ZK Proof(encrypted input, Coming Soon)")
352
  gr.Markdown("## Server side")
353
  gr.Markdown(
354
  "Get zero-knowledge proof of the sentiment analysis computation (for encrypted input)."
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/README.md CHANGED
@@ -13,7 +13,7 @@ python_version: 3.10.11
13
 
14
  # Sentiment Analysis With FHE
15
 
16
- ## Set up the app locally
17
 
18
  - First, create a virtual env and activate it:
19
 
@@ -46,7 +46,8 @@ pip3 install kaggle
46
  ### Compile
47
 
48
  ```bash
49
- python3 1_build/main.py
 
50
  ```
51
 
52
 
@@ -55,10 +56,11 @@ python3 1_build/main.py
55
  - In a terminal:
56
 
57
  ```bash
58
- python3 2_run/app.py
 
59
  ```
60
 
61
- ## Launch the app with docker
62
 
63
  TODO
64
 
 
13
 
14
  # Sentiment Analysis With FHE
15
 
16
+ ## Launch locally
17
 
18
  - First, create a virtual env and activate it:
19
 
 
46
  ### Compile
47
 
48
  ```bash
49
+ cd 1_build
50
+ python3 main.py
51
  ```
52
 
53
 
 
56
  - In a terminal:
57
 
58
  ```bash
59
+ cd 2_run
60
+ python3 app.py
61
  ```
62
 
63
+ ## Launch docker
64
 
65
  TODO
66
 
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/0_download_data.sh ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+
3
+ set -e
4
+
5
+ # You need to install kaggle using pip and then have a valid ~/.kaggle/kaggle.json, that you can
6
+ # generate from "Create new API token" on your account page in kaggle.com
7
+ # Alternatively, the dataset can be downloaded manually at
8
+ # https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment
9
+ rm -rf local_datasets
10
+ mkdir dataset/local_datasets
11
+ cd dataset/local_datasets
12
+
13
+ kaggle datasets download -d crowdflower/twitter-airline-sentiment
14
+
15
+ unzip twitter-airline-sentiment.zip -d twitter-airline-sentiment
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/1_build/main.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import torch
3
+ import pandas as pd
4
+ from sklearn.model_selection import GridSearchCV, train_test_split
5
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
6
+ from concrete.ml.sklearn import XGBClassifier
7
+ from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
8
+ import numpy as np
9
+ import tqdm
10
+ from pathlib import Path
11
+ from concrete.ml.deployment import FHEModelDev
12
+
13
+ # Load and prepare the dataset
14
+ train = pd.read_csv("../dataset/local_datasets/twitter-airline-sentiment/Tweets.csv", index_col=0)
15
+ text_X = train["text"]
16
+ y = train["airline_sentiment"].replace(["negative", "neutral", "positive"], [0, 1, 2])
17
+
18
+ text_X_train, text_X_test, y_train, y_test = train_test_split(
19
+ text_X, y, test_size=0.1, random_state=42
20
+ )
21
+
22
+ # Load the tokenizer and model
23
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
24
+ tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
25
+ transformer_model = AutoModelForSequenceClassification.from_pretrained(
26
+ "cardiffnlp/twitter-roberta-base-sentiment-latest"
27
+ ).to(device)
28
+
29
+
30
+ # Function to convert text to tensor
31
+ def text_to_tensor(list_text, transformer_model, tokenizer, device):
32
+ tokenized_text = [tokenizer.encode(text, return_tensors="pt") for text in list_text]
33
+ output_hidden_states_list = [None] * len(tokenized_text)
34
+
35
+ for i, tokenized_x in enumerate(tqdm.tqdm(tokenized_text)):
36
+ output_hidden_states = transformer_model(tokenized_x.to(device), output_hidden_states=True)[1][-1]
37
+ output_hidden_states = output_hidden_states.mean(dim=1).detach().cpu().numpy()
38
+ output_hidden_states_list[i] = output_hidden_states
39
+
40
+ return np.concatenate(output_hidden_states_list, axis=0)
41
+
42
+
43
+ # Vectorize the text
44
+ X_train_transformer = text_to_tensor(text_X_train.tolist(), transformer_model, tokenizer, device)
45
+ X_test_transformer = text_to_tensor(text_X_test.tolist(), transformer_model, tokenizer, device)
46
+
47
+ # Train the model
48
+ model = XGBClassifier()
49
+ parameters = {"n_bits": [2, 3], "max_depth": [1], "n_estimators": [10, 30, 50]}
50
+ grid_search = GridSearchCV(model, parameters, cv=5, scoring="accuracy")
51
+ grid_search.fit(X_train_transformer, y_train)
52
+
53
+ # Evaluate the model
54
+ best_model = grid_search.best_estimator_
55
+ y_pred = best_model.predict(X_test_transformer)
56
+ matrix = confusion_matrix(y_test, y_pred)
57
+ ConfusionMatrixDisplay(matrix).plot()
58
+
59
+ # FHE Inference
60
+ best_model.compile(X_train_transformer)
61
+ tested_tweet = ["AirFrance is awesome, almost as much as Zama!"]
62
+ X_tested_tweet = text_to_tensor(tested_tweet, transformer_model, tokenizer, device)
63
+ decrypted_proba = best_model.predict_proba(X_tested_tweet, fhe="execute")
64
+
65
+ # Deployment
66
+ DEPLOYMENT_DIR = Path("../deployment")
67
+ DEPLOYMENT_DIR.mkdir(exist_ok=True)
68
+ fhe_api = FHEModelDev(DEPLOYMENT_DIR / "sentiment_fhe_model", best_model)
69
+ fhe_api.save(via_mlir=True)
70
+ with (DEPLOYMENT_DIR / "serialized_model").open("w") as file:
71
+ best_model.dump(file)
72
+
73
+ # TODO useless?
74
+ with (DEPLOYMENT_DIR / "serialized_model_zkml").open("wb") as file:
75
+ pickle.dump(best_model.dump_dict(), file)
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/app.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """A Gradio app that runs locally (analytics=False and share=False) for sentiment analysis on tweets."""
2
+
3
+ import gradio as gr
4
+ from transformer_vectorizer import TransformerVectorizer
5
+ from concrete.ml.deployment import FHEModelClient
6
+ import numpy
7
+ import os
8
+ from pathlib import Path
9
+ import requests
10
+ import json
11
+ import base64
12
+ import subprocess
13
+ import shutil
14
+ import time
15
+
16
+ # This repository's directory
17
+ REPO_DIR = Path(__file__).parent
18
+
19
+ subprocess.Popen(["uvicorn", "fhe_server:app", "--port", "8000"], cwd=REPO_DIR)
20
+ subprocess.Popen(["uvicorn", "zkml_non_encrypted_server:app", "--port", "8001"], cwd=REPO_DIR)
21
+ subprocess.Popen(["uvicorn", "zkml_encrypted_server:app", "--port", "8002"], cwd=REPO_DIR)
22
+
23
+ # Wait 30 sec for the server to start
24
+ time.sleep(30)
25
+
26
+ # Encrypted data limit for the browser to display
27
+ # (encrypted data is too large to display in the browser)
28
+ ENCRYPTED_DATA_BROWSER_LIMIT = 500
29
+ N_USER_KEY_STORED = 20
30
+ FHE_MODEL_PATH = "deployment/sentiment_fhe_model"
31
+
32
+ print("Loading the transformer model...")
33
+
34
+ # Initialize the transformer vectorizer
35
+ transformer_vectorizer = TransformerVectorizer()
36
+
37
+
38
+ def clean_tmp_directory():
39
+ # Allow 20 user keys to be stored.
40
+ # Once that limit is reached, delete the oldest.
41
+ path_sub_directories = sorted([f for f in Path(".fhe_keys/").iterdir() if f.is_dir()], key=os.path.getmtime)
42
+
43
+ user_ids = []
44
+ if len(path_sub_directories) > N_USER_KEY_STORED:
45
+ n_files_to_delete = len(path_sub_directories) - N_USER_KEY_STORED
46
+ for p in path_sub_directories[:n_files_to_delete]:
47
+ user_ids.append(p.name)
48
+ shutil.rmtree(p)
49
+
50
+ list_files_tmp = Path("tmp/").iterdir()
51
+ # Delete all files related to user_id
52
+ for file in list_files_tmp:
53
+ for user_id in user_ids:
54
+ if file.name.endswith(f"{user_id}.npy"):
55
+ file.unlink()
56
+
57
+
58
+ def keygen():
59
+ # Clean tmp directory if needed
60
+ clean_tmp_directory()
61
+
62
+ print("Initializing FHEModelClient...")
63
+
64
+ # Create .fhe_keys directory if it doesn't exist
65
+ Path(".fhe_keys/").mkdir(exist_ok=True)
66
+
67
+ # Let's create a user_id
68
+ user_id = numpy.random.randint(0, 2 ** 32)
69
+ fhe_api = FHEModelClient(FHE_MODEL_PATH, f".fhe_keys/{user_id}")
70
+ fhe_api.load()
71
+
72
+ # Generate a fresh key
73
+ fhe_api.generate_private_and_evaluation_keys(force=True)
74
+ evaluation_key = fhe_api.get_serialized_evaluation_keys()
75
+
76
+ # Save evaluation_key in a file, since too large to pass through regular Gradio
77
+ # buttons, https://github.com/gradio-app/gradio/issues/1877
78
+ numpy.save(f"tmp/tmp_evaluation_key_{user_id}.npy", evaluation_key)
79
+
80
+ return [list(evaluation_key)[:ENCRYPTED_DATA_BROWSER_LIMIT], user_id]
81
+
82
+
83
+ def encode_quantize_encrypt(text, user_id):
84
+ if not user_id:
85
+ raise gr.Error("You need to generate FHE keys first.")
86
+
87
+ fhe_api = FHEModelClient(FHE_MODEL_PATH, f".fhe_keys/{user_id}")
88
+ fhe_api.load()
89
+ encodings = transformer_vectorizer.transform([text])
90
+ quantized_encodings = fhe_api.model.quantize_input(encodings).astype(numpy.uint8)
91
+ encrypted_quantized_encoding = fhe_api.quantize_encrypt_serialize(encodings)
92
+
93
+ # Save encrypted_quantized_encoding in a file, since too large to pass through regular Gradio
94
+ # buttons, https://github.com/gradio-app/gradio/issues/1877
95
+ numpy.save(f"tmp/tmp_encrypted_quantized_encoding_{user_id}.npy", encrypted_quantized_encoding)
96
+
97
+ # Hex-encode the encrypted encoding so it can be displayed
98
+ encrypted_quantized_encoding_shorten = list(encrypted_quantized_encoding)
99
+ encrypted_quantized_encoding_shorten_hex = ''.join(f'{i:02x}' for i in encrypted_quantized_encoding_shorten)
100
+ return (
101
+ encodings[0],
102
+ quantized_encodings[0],
103
+ encrypted_quantized_encoding_shorten_hex,
104
+ )
105
+
106
+
107
+ def run_fhe(user_id):
108
+ encoded_data_path = Path(f"tmp/tmp_encrypted_quantized_encoding_{user_id}.npy")
109
+ if not user_id:
110
+ raise gr.Error("You need to generate FHE keys first.")
111
+ if not encoded_data_path.is_file():
112
+ raise gr.Error("No encrypted data was found. Encrypt the data before trying to predict.")
113
+
114
+ # Read encrypted_quantized_encoding from the file
115
+ encrypted_quantized_encoding = numpy.load(encoded_data_path)
116
+
117
+ # Read evaluation_key from the file
118
+ evaluation_key = numpy.load(f"tmp/tmp_evaluation_key_{user_id}.npy")
119
+
120
+ # Use base64 to encode the encodings and evaluation key
121
+ encrypted_quantized_encoding = base64.b64encode(encrypted_quantized_encoding).decode()
122
+ encoded_evaluation_key = base64.b64encode(evaluation_key).decode()
123
+
124
+ query = {}
125
+ query["evaluation_key"] = encoded_evaluation_key
126
+ query["encrypted_encoding"] = encrypted_quantized_encoding
127
+ headers = {"Content-type": "application/json"}
128
+ response = requests.post(
129
+ "http://localhost:8000/predict_sentiment", data=json.dumps(query), headers=headers
130
+ )
131
+ encrypted_prediction = base64.b64decode(response.json()["encrypted_prediction"])
132
+
133
+ # Save encrypted_prediction in a file, since too large to pass through regular Gradio
134
+ # buttons, https://github.com/gradio-app/gradio/issues/1877
135
+ numpy.save(f"tmp/tmp_encrypted_prediction_{user_id}.npy", encrypted_prediction)
136
+ encrypted_prediction_shorten = list(encrypted_prediction)
137
+ encrypted_prediction_shorten_hex = ''.join(f'{i:02x}' for i in encrypted_prediction_shorten)
138
+ return encrypted_prediction_shorten_hex
139
+
140
+
141
+ def decrypt_prediction(user_id):
142
+ encoded_data_path = Path(f"tmp/tmp_encrypted_prediction_{user_id}.npy")
143
+ if not user_id:
144
+ raise gr.Error("You need to generate FHE keys first.")
145
+ if not encoded_data_path.is_file():
146
+ raise gr.Error("No encrypted prediction was found. Run the prediction over the encrypted data first.")
147
+
148
+ # Read encrypted_prediction from the file
149
+ encrypted_prediction = numpy.load(encoded_data_path).tobytes()
150
+
151
+ fhe_api = FHEModelClient(FHE_MODEL_PATH, f".fhe_keys/{user_id}")
152
+ fhe_api.load()
153
+
154
+ # We need to retrieve the private key that matches the client specs (see issue #18)
155
+ fhe_api.generate_private_and_evaluation_keys(force=False)
156
+
157
+ predictions = fhe_api.deserialize_decrypt_dequantize(encrypted_prediction)
158
+ return {
159
+ "negative": predictions[0][0],
160
+ "neutral": predictions[0][1],
161
+ "positive": predictions[0][2],
162
+ }
163
+
164
+
165
+ def get_zk_proof_non_encrypted(text):
166
+ headers = {"Content-type": "application/json"}
167
+ query = {"text": text}
168
+ response = requests.post(
169
+ "http://localhost:8001/get_zk_proof", data=json.dumps(query), headers=headers
170
+ )
171
+ result = response.json()
172
+
173
+ sentiment = ""
174
+ if result["output"][0] > 0.5:
175
+ sentiment = "negative"
176
+ elif result["output"][1] > 0.5:
177
+ sentiment = "neutral"
178
+ else:
179
+ sentiment = "positive"
180
+
181
+ return sentiment, result["proof"], result["verify_contract_addr"]
182
+
183
+
184
+ def get_zk_proof_encrypted(user_id):
185
+ encoded_data_path = Path(f"tmp/tmp_encrypted_quantized_encoding_{user_id}.npy")
186
+ if not user_id:
187
+ raise gr.Error("You need to generate FHE keys first.")
188
+ if not encoded_data_path.is_file():
189
+ raise gr.Error("No encrypted data was found. Encrypt the data before trying to predict.")
190
+
191
+ # Read encrypted_quantized_encoding from the file
192
+ encrypted_quantized_encoding = numpy.load(encoded_data_path)
193
+
194
+ # Read evaluation_key from the file
195
+ evaluation_key = numpy.load(f"tmp/tmp_evaluation_key_{user_id}.npy")
196
+
197
+ # Use base64 to encode the encodings and evaluation key
198
+ encrypted_quantized_encoding = base64.b64encode(encrypted_quantized_encoding).decode()
199
+ encoded_evaluation_key = base64.b64encode(evaluation_key).decode()
200
+
201
+ query = {}
202
+ query["evaluation_key"] = encoded_evaluation_key
203
+ query["encrypted_encoding"] = encrypted_quantized_encoding
204
+ headers = {"Content-type": "application/json"}
205
+ response = requests.post(
206
+ "http://localhost:8002/get_zk_proof", data=json.dumps(query), headers=headers
207
+ )
208
+ result = response.json()
209
+ return result["output"], result["proof"], result["verify_contract_addr"]
210
+
211
+
212
+ # Create tmp directory if it doesn't exist
213
+ Path(".fhe_keys/").mkdir(exist_ok=True)
214
+ Path("tmp/").mkdir(exist_ok=True)
215
+
216
+ demo = gr.Blocks()
217
+
218
+ print("Starting the demo...")
219
+ with demo:
220
+ gr.Markdown(
221
+ """
222
+ <p align="center">
223
+ <img width=200 src="https://user-images.githubusercontent.com/5758427/197816413-d9cddad3-ba38-4793-847d-120975e1da11.png">
224
+ </p>
225
+
226
+ <h2 align="center">Sentiment Analysis On Encrypted Data Using Homomorphic Encryption</h2>
227
+
228
+ <p align="center">
229
+ <a href="https://github.com/zama-ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="https://user-images.githubusercontent.com/5758427/197972109-faaaff3e-10e2-4ab6-80f5-7531f7cfb08f.png">Concrete-ML</a>
230
+
231
+ <a href="https://docs.zama.ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="https://user-images.githubusercontent.com/5758427/197976802-fddd34c5-f59a-48d0-9bff-7ad1b00cb1fb.png">Documentation</a>
232
+
233
+ <a href="https://zama.ai/community"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="https://user-images.githubusercontent.com/5758427/197977153-8c9c01a7-451a-4993-8e10-5a6ed5343d02.png">Community</a>
234
+
235
+ <a href="https://twitter.com/zama_fhe"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="https://user-images.githubusercontent.com/5758427/197975044-bab9d199-e120-433b-b3be-abd73b211a54.png">@zama_fhe</a>
236
+ </p>
237
+
238
+ <p align="center">
239
+ <img src="https://user-images.githubusercontent.com/56846628/219329304-6868be9e-5ce8-4279-9123-4cb1bc0c2fb5.png" width="60%" height="60%">
240
+ </p>
241
+ """
242
+ )
243
+
244
+ gr.Markdown(
245
+ """
246
+ <p align="center">
247
+ </p>
248
+ <p align="center">
249
+ </p>
250
+ """
251
+ )
252
+
253
+ gr.Markdown("## Notes")
254
+ gr.Markdown(
255
+ """
256
+ - The private key is used to encrypt and decrypt the data and shall never be shared.
257
+ - The evaluation key is a public key that the server needs to process encrypted data.
258
+ """
259
+ )
260
+
261
+ gr.Markdown("# Step 1: Generate the keys")
262
+
263
+ b_gen_key_and_install = gr.Button("Generate the keys and send public part to server")
264
+
265
+ evaluation_key = gr.Textbox(
266
+ label="Evaluation key (truncated):",
267
+ max_lines=4,
268
+ interactive=False,
269
+ )
270
+
271
+ user_id = gr.Textbox(
272
+ label="",
273
+ max_lines=4,
274
+ interactive=False,
275
+ visible=False
276
+ )
277
+
278
+ gr.Markdown("# Step 2: Provide a message")
279
+ gr.Markdown("## Client side")
280
+ gr.Markdown(
281
+ "Enter a sensitive text message you received and would like to do sentiment analysis on (ideas: the last text message of your boss.... or lover)."
282
+ )
283
+ text = gr.Textbox(label="Enter a message:", value="I really like your work recently")
284
+
285
+ gr.Markdown("# Step 3: Encode the message with the private key")
286
+ b_encode_quantize_text = gr.Button(
287
+ "Encode, quantize and encrypt the text with transformer vectorizer, and send to server"
288
+ )
289
+
290
+ with gr.Row():
291
+ encoding = gr.Textbox(
292
+ label="Transformer representation:",
293
+ max_lines=4,
294
+ interactive=False,
295
+ )
296
+ quantized_encoding = gr.Textbox(
297
+ label="Quantized transformer representation:", max_lines=4, interactive=False
298
+ )
299
+ encrypted_quantized_encoding = gr.Textbox(
300
+ label="Encrypted quantized transformer representation (truncated):",
301
+ max_lines=4,
302
+ interactive=False,
303
+ )
304
+
305
+ gr.Markdown("# Step 4: Run the FHE evaluation")
306
+ gr.Markdown("## Server side")
307
+ gr.Markdown(
308
+ "The encrypted value is received by the server. Thanks to the evaluation key and to FHE, the server can compute the (encrypted) prediction directly over encrypted values. Once the computation is finished, the server returns the encrypted prediction to the client."
309
+ )
310
+
311
+ b_run_fhe = gr.Button("Run FHE execution there")
312
+ encrypted_prediction = gr.Textbox(
313
+ label="Encrypted prediction (truncated):",
314
+ max_lines=4,
315
+ interactive=False,
316
+ )
317
+
318
+ gr.Markdown("# Step 5: Decrypt the sentiment")
319
+ gr.Markdown("## Client side")
320
+ gr.Markdown(
321
+ "The encrypted sentiment is sent back to client, who can finally decrypt it with its private key. Only the client is aware of the original tweet and the prediction."
322
+ )
323
+ b_decrypt_prediction = gr.Button("Decrypt prediction")
324
+
325
+ labels_sentiment = gr.Label(label="Sentiment:")
326
+
327
+ gr.Markdown("# Step 6: Get ZK Proof(non-encrypted input, Coming Soon)")
328
+ gr.Markdown("## Server side")
329
+ gr.Markdown(
330
+ "Get zero-knowledge proof of the sentiment analysis computation (for non-encrypted input)."
331
+ )
332
+ b_get_zk_proof_non_encrypted = gr.Button("Get ZK Proof(non-encrypted input)")
333
+
334
+ with gr.Row():
335
+ zk_sentiment_non_encrypted = gr.Textbox(
336
+ label="Sentiment:",
337
+ max_lines=1,
338
+ interactive=False,
339
+ )
340
+ zk_proof_non_encrypted = gr.Textbox(
341
+ label="ZK Proof:",
342
+ max_lines=4,
343
+ interactive=False,
344
+ )
345
+ zk_contract_non_encrypted = gr.Textbox(
346
+ label="Verify Contract Address:",
347
+ max_lines=1,
348
+ interactive=False,
349
+ )
350
+
351
+ gr.Markdown("# Step 6: Get ZK Proof(encrypted input)")
352
+ gr.Markdown("## Server side")
353
+ gr.Markdown(
354
+ "Get zero-knowledge proof of the sentiment analysis computation (for encrypted input)."
355
+ )
356
+ b_get_zk_proof_encrypted = gr.Button("Get ZK Proof(encrypted input)")
357
+
358
+ with gr.Row():
359
+ zk_encrypted_prediction = gr.Textbox(
360
+ label="Encrypted Prediction(same as Step 4 output):",
361
+ max_lines=1,
362
+ interactive=False,
363
+ )
364
+ zk_proof_encrypted = gr.Textbox(
365
+ label="ZK Proof:",
366
+ max_lines=4,
367
+ interactive=False,
368
+ )
369
+ zk_contract_encrypted = gr.Textbox(
370
+ label="Verify Contract Address:",
371
+ max_lines=1,
372
+ interactive=False,
373
+ )
374
+
375
+ # Button for key generation
376
+ b_gen_key_and_install.click(keygen, inputs=[], outputs=[evaluation_key, user_id])
377
+
378
+ # Button to quantize and encrypt
379
+ b_encode_quantize_text.click(
380
+ encode_quantize_encrypt,
381
+ inputs=[text, user_id],
382
+ outputs=[
383
+ encoding,
384
+ quantized_encoding,
385
+ encrypted_quantized_encoding,
386
+ ],
387
+ )
388
+
389
+ # Button to send the encodings to the server using post at (localhost:8000/predict_sentiment)
390
+ b_run_fhe.click(run_fhe, inputs=[user_id], outputs=[encrypted_prediction])
391
+
392
+ # Button to decrypt the prediction on the client
393
+ b_decrypt_prediction.click(decrypt_prediction, inputs=[user_id], outputs=[labels_sentiment])
394
+
395
+ # Button to get ZK proof(non encrypted)
396
+ b_get_zk_proof_non_encrypted.click(get_zk_proof_non_encrypted, inputs=[text],
397
+ outputs=[zk_sentiment_non_encrypted, zk_proof_non_encrypted,
398
+ zk_contract_non_encrypted])
399
+
400
+ # Button to get ZK proof(encrypted)
401
+ b_get_zk_proof_encrypted.click(get_zk_proof_encrypted, inputs=[user_id],
402
+ outputs=[zk_encrypted_prediction, zk_proof_encrypted, zk_contract_encrypted])
403
+
404
+ gr.Markdown(
405
+ "The app was built with [Concrete-ML](https://github.com/zama-ai/concrete-ml), a Privacy-Preserving Machine Learning (PPML) open-source set of tools by [Zama](https://zama.ai/). Try it yourself and don't forget to star on Github &#11088;."
406
+ )
407
+ demo.launch(share=False, server_port=10003)
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/fhe_server.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Server that will listen for GET and POST requests from the client."""
2
+ import json
3
+
4
+ from fastapi import FastAPI
5
+ from concrete.ml.deployment import FHEModelServer
6
+ from pydantic import BaseModel
7
+ import base64
8
+ from pathlib import Path
9
+
10
+ current_dir = Path(__file__).parent
11
+
12
+ # Load the model
13
+ fhe_model = FHEModelServer("../deployment/sentiment_fhe_model")
14
+
15
+
16
+ class PredictRequest(BaseModel):
17
+ evaluation_key: str
18
+ encrypted_encoding: str
19
+
20
+
21
+ # Initialize an instance of FastAPI
22
+ app = FastAPI()
23
+
24
+
25
+ # Define the default route
26
+ @app.get("/")
27
+ def root():
28
+ return {"message": "Welcome to Your Sentiment Classification FHE Model Server!"}
29
+
30
+
31
+ @app.post("/predict_sentiment")
32
+ def predict_sentiment(query: PredictRequest):
33
+ encrypted_encoding = base64.b64decode(query.encrypted_encoding)
34
+ evaluation_key = base64.b64decode(query.evaluation_key)
35
+ prediction = fhe_model.run(encrypted_encoding, evaluation_key)
36
+
37
+ # Encode base64 the prediction
38
+ encoded_prediction = base64.b64encode(prediction).decode()
39
+ return {"encrypted_prediction": encoded_prediction}
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/transformer_vectorizer.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Let's import a few requirements
2
+ import torch
3
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
4
+ import numpy
5
+
6
class TransformerVectorizer:
    """Turn texts into fixed-size vectors using a pre-trained sentiment transformer."""

    def __init__(self, batch_size: int = 50):
        """Load the tokenizer and the pre-trained model.

        Args:
            batch_size (int): Number of texts passed through the model per
                forward pass. Lower it on memory-constrained hardware.
        """
        # Load the tokenizer (converts text to tokens)
        self.tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

        # Load the pre-trained model
        self.transformer_model = AutoModelForSequenceClassification.from_pretrained(
            "cardiffnlp/twitter-roberta-base-sentiment-latest"
        )
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.batch_size = batch_size

    def text_to_tensor(
        self,
        texts: list,
    ) -> numpy.ndarray:
        """Function that transforms a list of texts to their learned representation.

        Args:
            texts (list): List of texts to be transformed.

        Returns:
            numpy.ndarray: One row per text, obtained by averaging the last
            hidden layer over the (non-padding) tokens of that text.
        """
        # First, tokenize all the input text. Padding is required so that texts
        # of different lengths can be stacked into a single tensor; the
        # attention mask records which positions are real tokens.
        encoded = self.tokenizer.batch_encode_plus(texts, return_tensors="pt", padding=True)

        # Depending on the hardware used, the number of examples to be processed can be reduced
        # Here we split the data into `batch_size` examples per batch
        ids_batches = torch.split(encoded["input_ids"], split_size_or_sections=self.batch_size)
        mask_batches = torch.split(encoded["attention_mask"], split_size_or_sections=self.batch_size)

        # Send the model to the device
        transformer_model = self.transformer_model.to(self.device)
        pooled_batches = []

        # Inference only: no gradient bookkeeping is needed.
        with torch.no_grad():
            for token_ids, mask in zip(ids_batches, mask_batches):
                token_ids = token_ids.to(self.device)
                mask = mask.to(self.device)

                # Pass the tokens through the transformer model and get the hidden states
                # Only keep the last hidden layer state for now
                last_hidden = transformer_model(
                    token_ids, attention_mask=mask, output_hidden_states=True
                )[1][-1]

                # Average over the tokens axis to get a representation at the text level.
                # Padding positions are weighted 0 so they do not dilute the average;
                # without padding this is the plain mean over tokens.
                weights = mask.unsqueeze(-1).to(last_hidden.dtype)
                pooled = (last_hidden * weights).sum(dim=1) / weights.sum(dim=1)
                pooled_batches.append(pooled.cpu().numpy())

        self.encodings = numpy.concatenate(pooled_batches, axis=0)
        return self.encodings

    def transform(self, texts: list):
        """Alias of ``text_to_tensor``."""
        return self.text_to_tensor(texts)
58
+
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/zkml_encrypted_server.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://docs.ezkl.xyz/
2
+ # https://colab.research.google.com/github/zkonduit/ezkl/blob/main/examples/notebooks/simple_demo_all_public.ipynb
3
+ import struct
4
+ import uuid
5
+
6
+ import numpy as np
7
+ from torch import nn
8
+ import ezkl
9
+ import os
10
+ import json
11
+ import torch
12
+ import base64
13
+ from concrete.ml.deployment import FHEModelServer
14
+ from fastapi import FastAPI
15
+ from pydantic import BaseModel
16
+
17
+ app = FastAPI()
18
+
19
+ evaluation_key = None
20
+
21
+
22
+ # Defines the model
23
class AIModel(nn.Module):
    """``nn.Module`` wrapper that evaluates the serialized FHE model on ciphertext bytes."""

    def __init__(self):
        super().__init__()

        # Load the model
        self.fhe_model = FHEModelServer("../deployment/sentiment_fhe_model")

    def forward(self, x):
        """Run the FHE model on the first row of ``x``.

        The evaluation key is read from the module-level ``evaluation_key``
        variable, which must be set before calling this method.

        Returns:
            torch.Tensor: The bytes returned by the FHE model, one uint8
            element per byte.
        """
        print(f"forward input: {x}")

        # Convert to bytes
        ciphertext = x[0].numpy().tobytes()
        prediction = self.fhe_model.run(ciphertext, evaluation_key)
        print(f"forward prediction hex: {prediction.hex()}")

        byte_tensor = torch.tensor(list(prediction), dtype=torch.uint8)
        print(f"tensor_output: {byte_tensor}")

        return byte_tensor
43
+
44
+
45
class ZKProofRequest(BaseModel):
    """JSON body accepted by the ``/get_zk_proof`` route (both fields are base64 text)."""

    # Base64 text of the encrypted input encoding
    encrypted_encoding: str
    # Base64 text of the evaluation key
    evaluation_key: str
48
+
49
+
50
+ circuit = AIModel()
51
+
52
+
53
def _require(ok, step):
    """Raise ``RuntimeError`` naming ``step`` unless ``ok`` is true.

    Used instead of ``assert`` so the checks still run under ``python -O``.
    """
    if not ok:
        raise RuntimeError(f"zk proof pipeline failed at step: {step}")


@app.post("/get_zk_proof")
async def get_zk_proof(request: ZKProofRequest):
    """Run the FHE circuit on an encrypted input and build an ezkl proof of that run.

    The request fields are base64-decoded, the module-level ``circuit`` is
    executed and exported to ONNX, and the ezkl pipeline (settings,
    calibration, compile, SRS, witness, setup, prove, local verify) is run in
    a fresh ``zkml_encrypted/<uuid>`` folder. An EVM verifier is then created
    and deployed, and the proof is verified against the deployed contract.

    Args:
        request: Base64 text of the encrypted encoding and evaluation key.

    Returns:
        dict: ``output``, base64 ``proof`` and ``verify_contract_addr`` on
        success, or ``{"error": ...}`` when the contract address file was not
        written by the deployment step.

    Raises:
        RuntimeError: If an ezkl step does not report success or an expected
            artifact is missing.
    """
    # The circuit's forward() reads the evaluation key from this module-level variable.
    global evaluation_key

    # Keep the decoded bytes in locals instead of writing them back onto the
    # str-typed request fields.
    encrypted_encoding = base64.b64decode(request.encrypted_encoding)
    evaluation_key = base64.b64decode(request.evaluation_key)

    # Every request works in its own folder.
    folder_path = f"zkml_encrypted/{str(uuid.uuid4())}"
    os.makedirs(folder_path, exist_ok=True)

    model_path = os.path.join(folder_path, 'network.onnx')
    compiled_model_path = os.path.join(folder_path, 'network.compiled')
    pk_path = os.path.join(folder_path, 'test.pk')
    vk_path = os.path.join(folder_path, 'test.vk')
    settings_path = os.path.join(folder_path, 'settings.json')

    witness_path = os.path.join(folder_path, 'witness.json')
    input_data_path = os.path.join(folder_path, 'input.json')
    srs_path = os.path.join(folder_path, 'kzg14.srs')
    output_path = os.path.join(folder_path, 'output.json')

    # After training, export to onnx (network.onnx) and create a data file (input.json)
    x = torch.tensor([encrypted_encoding], dtype=torch.uint8)

    # Flips the neural net into inference mode
    circuit.eval()

    # Get the output of the model
    with torch.no_grad():
        output = circuit(x)
    # Save the output to a file
    output_data = output.detach().numpy().tolist()
    with open(output_path, 'w') as f:
        json.dump(output_data, f)

    print("start")
    # Export the model
    torch.onnx.export(circuit,  # model being run
                      x,  # model input (or a tuple for multiple inputs)
                      model_path,  # where to save the model (can be a file or file-like object)
                      export_params=True,  # store the trained parameter weights inside the model file
                      opset_version=10,  # the ONNX version to export the model to
                      do_constant_folding=True,  # whether to execute constant folding for optimization
                      input_names=['input'],  # the model's input names
                      output_names=['output'],  # the model's output names
                      dynamic_axes={'input': {0: 'batch_size'},  # variable length axes
                                    'output': {0: 'batch_size'}})
    print("end")

    data = dict(input_data=x.tolist())

    # Serialize data into file (closed before ezkl reads it back):
    with open(input_data_path, 'w') as f:
        json.dump(data, f)

    py_run_args = ezkl.PyRunArgs()
    py_run_args.input_visibility = "public"
    py_run_args.output_visibility = "public"
    py_run_args.param_visibility = "fixed"  # "fixed" for params means that the committed to params are used for all proofs

    res = ezkl.gen_settings(model_path, settings_path, py_run_args=py_run_args)
    _require(res is True, "gen_settings")

    cal_path = os.path.join(folder_path, "calibration.json")

    # Serialize data into file:
    with open(cal_path, 'w') as f:
        json.dump(data, f)

    await ezkl.calibrate_settings(cal_path, model_path, settings_path, "resources")

    res = ezkl.compile_circuit(model_path, compiled_model_path, settings_path)
    _require(res is True, "compile_circuit")

    # srs path
    res = await ezkl.get_srs(settings_path, srs_path=srs_path)
    _require(res is True, "get_srs")

    # now generate the witness file
    await ezkl.gen_witness(input_data_path, compiled_model_path, witness_path)
    _require(os.path.isfile(witness_path), "gen_witness")

    # Set up the circuit parameters: this produces the verification and proving keys.
    res = ezkl.setup(
        compiled_model_path,
        vk_path,
        pk_path,
        srs_path
    )

    _require(res is True, "setup")
    _require(os.path.isfile(vk_path), "setup (verification key file)")
    _require(os.path.isfile(pk_path), "setup (proving key file)")
    _require(os.path.isfile(settings_path), "setup (settings file)")

    # GENERATE A PROOF
    proof_path = os.path.join(folder_path, 'test.pf')
    ezkl.prove(
        witness_path,
        compiled_model_path,
        pk_path,
        proof_path,
        "single",
        srs_path
    )
    _require(os.path.isfile(proof_path), "prove")

    # VERIFY IT ON LOCAL
    res = ezkl.verify(
        proof_path,
        settings_path,
        vk_path,
        srs_path
    )
    _require(res is True, "verify")
    print("verified on local")

    # VERIFY IT ON CHAIN
    verify_sol_code_path = os.path.join(folder_path, 'verify.sol')
    verify_sol_abi_path = os.path.join(folder_path, 'verify.abi')
    res = await ezkl.create_evm_verifier(
        vk_path,
        settings_path,
        verify_sol_code_path,
        verify_sol_abi_path,
        srs_path
    )
    _require(res is True, "create_evm_verifier")
    verify_contract_addr_file = f"{folder_path}/addr.txt"
    rpc_url = "http://103.231.86.33:10219"
    await ezkl.deploy_evm(
        addr_path=verify_contract_addr_file,
        rpc_url=rpc_url,
        sol_code_path=verify_sol_code_path
    )
    if os.path.exists(verify_contract_addr_file):
        with open(verify_contract_addr_file, 'r') as file:
            verify_contract_addr = file.read()
    else:
        print(f"error: File {verify_contract_addr_file} does not exist.")
        return {"error": "Contract address file not found"}
    # TODO verify failed. maybe need to change the x
    res = await ezkl.verify_evm(
        addr_verifier=verify_contract_addr,
        proof_path=proof_path,
        rpc_url=rpc_url
    )
    _require(res is True, "verify_evm")
    print("verified on chain")

    # Read proof file content
    with open(proof_path, 'rb') as f:
        proof_content = base64.b64encode(f.read()).decode('utf-8')

    return {"output": output_data, "proof": proof_content, "verify_contract_addr": verify_contract_addr}
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/2_run/zkml_non_encrypted_server.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://docs.ezkl.xyz/
2
+ # https://colab.research.google.com/github/zkonduit/ezkl/blob/main/examples/notebooks/simple_demo_all_public.ipynb
3
+ import pickle
4
+ import struct
5
+ import uuid
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ from sklearn.model_selection import GridSearchCV, train_test_split
10
+ from torch import nn
11
+ import ezkl
12
+ import os
13
+ import json
14
+ import torch
15
+ import base64
16
+ from concrete.ml.deployment import FHEModelServer
17
+ from concrete.ml.sklearn import XGBClassifier
18
+ import tqdm
19
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
20
+ from fastapi import FastAPI
21
+ from pydantic import BaseModel
22
+
23
+ app = FastAPI()
24
+
25
+ evaluation_key = None
26
+
27
+
28
+ # Defines the model
29
class AIWordsModel(nn.Module):
    """``nn.Module`` wrapper around a Concrete-ML XGBoost sentiment classifier.

    Construction is expensive: it reads the tweets CSV, embeds the training
    split with the transformer, runs a grid search and compiles the best
    estimator.
    """

    def __init__(self):
        super().__init__()

        print("init ZK AIWordsModel")

        # Load the model
        self.model = XGBClassifier()
        train = pd.read_csv("../dataset/local_datasets/twitter-airline-sentiment/Tweets.csv", index_col=0)
        text_X = train["text"]
        y = train["airline_sentiment"].replace(["negative", "neutral", "positive"], [0, 1, 2])

        # Load the tokenizer and model
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
        self.transformer_model = AutoModelForSequenceClassification.from_pretrained(
            "cardiffnlp/twitter-roberta-base-sentiment-latest"
        ).to(self.device)

        # Only the training part of the split is used here.
        text_X_train, _, y_train, _ = train_test_split(
            text_X, y, test_size=0.1, random_state=42
        )
        X_train_transformer = self.text_to_tensor(text_X_train.tolist(), self.transformer_model, self.tokenizer,
                                                  self.device)

        # NOTE(review): pickle.load executes arbitrary code from the file; only
        # load artifacts produced by this project's own build step.
        with open("../deployment/serialized_model_zkml", 'rb') as file:  # Open in binary read mode
            loaded_data = pickle.load(file)
        # NOTE(review): the return value of load_dict is discarded here and below;
        # presumably it updates the estimator in place - confirm against Concrete-ML.
        self.model.load_dict(loaded_data)
        parameters = {"n_bits": [2, 3], "max_depth": [1], "n_estimators": [10, 30, 50]}
        grid_search2 = GridSearchCV(self.model, parameters, cv=5, scoring="accuracy")
        grid_search2.fit(X_train_transformer, y_train)
        self.best_model2 = grid_search2.best_estimator_
        self.best_model2.load_dict(loaded_data)
        self.best_model2.compile(X_train_transformer)

        print("loaded_data finished")

    def forward(self, x):
        """Return the class probabilities predicted for ``x`` as a float32 tensor."""
        prediction = self.best_model2.predict_proba(x, fhe="execute")

        prediction_tensor = torch.tensor(prediction, dtype=torch.float32)
        prediction_tensor = prediction_tensor.squeeze()  # Remove extra dimensions if any

        return prediction_tensor

    # Function to convert text to tensor
    def text_to_tensor(self, list_text, transformer_model, tokenizer, device):
        """Embed each text as the token-averaged last hidden layer of the transformer.

        Args:
            list_text: Texts to embed; each one is tokenized and processed on its own.
            transformer_model: Model called with ``output_hidden_states=True``.
            tokenizer: Tokenizer whose ``encode`` returns PyTorch tensors.
            device: Device the token tensors are moved to.

        Returns:
            numpy.ndarray: The per-text embeddings concatenated along axis 0.
        """
        embeddings = []
        # Inference only: skipping gradient tracking saves memory and time.
        with torch.no_grad():
            for text in tqdm.tqdm(list_text):
                token_ids = tokenizer.encode(text, return_tensors="pt").to(device)
                last_hidden = transformer_model(token_ids, output_hidden_states=True)[1][-1]
                embeddings.append(last_hidden.mean(dim=1).cpu().numpy())

        return np.concatenate(embeddings, axis=0)
85
+
86
+
87
class ZKProofRequest(BaseModel):
    """JSON body accepted by the ``/get_zk_proof`` route."""

    # Plain text whose sentiment prediction is to be proven
    text: str
89
+
90
+
91
+ circuit = AIWordsModel()
92
+
93
+
94
def _require(ok, step):
    """Raise ``RuntimeError`` naming ``step`` unless ``ok`` is true.

    Used instead of ``assert`` so the checks still run under ``python -O``.
    """
    if not ok:
        raise RuntimeError(f"zk proof pipeline failed at step: {step}")


@app.post("/get_zk_proof")
async def get_zk_proof(request: ZKProofRequest):
    """Classify a plain text and build an ezkl proof of that model run.

    The text is embedded with the circuit's transformer, the module-level
    ``circuit`` is executed and exported to ONNX, and the ezkl pipeline
    (settings, calibration, compile, SRS, witness, setup, prove, local verify)
    is run in a fresh ``zkml_non_encrypted/<uuid>`` folder. An EVM verifier is
    then created and deployed, and the proof is verified against the deployed
    contract.

    Args:
        request: Request carrying the text to classify.

    Returns:
        dict: ``output``, base64 ``proof`` and ``verify_contract_addr`` on
        success, or ``{"error": ...}`` when the contract address file was not
        written by the deployment step.

    Raises:
        RuntimeError: If an ezkl step does not report success or an expected
            artifact is missing.
    """
    # Every request works in its own folder.
    folder_path = f"zkml_non_encrypted/{str(uuid.uuid4())}"
    os.makedirs(folder_path, exist_ok=True)

    model_path = os.path.join(folder_path, 'network.onnx')
    compiled_model_path = os.path.join(folder_path, 'network.compiled')
    pk_path = os.path.join(folder_path, 'test.pk')
    vk_path = os.path.join(folder_path, 'test.vk')
    settings_path = os.path.join(folder_path, 'settings.json')

    witness_path = os.path.join(folder_path, 'witness.json')
    input_data_path = os.path.join(folder_path, 'input.json')
    srs_path = os.path.join(folder_path, 'kzg14.srs')
    output_path = os.path.join(folder_path, 'output.json')

    # After training, export to onnx (network.onnx) and create a data file (input.json)
    words = [request.text]
    x_list = circuit.text_to_tensor(words, circuit.transformer_model, circuit.tokenizer, circuit.device)
    x = torch.tensor(x_list, dtype=torch.float32)

    # Flips the neural net into inference mode
    circuit.eval()

    # Get the output of the model
    with torch.no_grad():
        output = circuit(x)
    # Save the output to a file
    output_data = output.detach().numpy().tolist()
    with open(output_path, 'w') as f:
        json.dump(output_data, f)

    # Export the model
    torch.onnx.export(circuit,  # model being run
                      x,  # model input (or a tuple for multiple inputs)
                      model_path,  # where to save the model (can be a file or file-like object)
                      export_params=True,  # store the trained parameter weights inside the model file
                      opset_version=10,  # the ONNX version to export the model to
                      do_constant_folding=True,  # whether to execute constant folding for optimization
                      input_names=['input'],  # the model's input names
                      output_names=['output'],  # the model's output names
                      dynamic_axes={'input': {0: 'batch_size'},  # variable length axes
                                    'output': {0: 'batch_size'}})

    data = dict(input_data=x.tolist())

    # Serialize data into file (closed before ezkl reads it back):
    with open(input_data_path, 'w') as f:
        json.dump(data, f)

    py_run_args = ezkl.PyRunArgs()
    py_run_args.input_visibility = "public"
    py_run_args.output_visibility = "public"
    py_run_args.param_visibility = "fixed"  # "fixed" for params means that the committed to params are used for all proofs

    res = ezkl.gen_settings(model_path, settings_path, py_run_args=py_run_args)
    _require(res is True, "gen_settings")

    cal_path = os.path.join(folder_path, "calibration.json")

    # Serialize data into file:
    with open(cal_path, 'w') as f:
        json.dump(data, f)

    await ezkl.calibrate_settings(cal_path, model_path, settings_path, "resources")

    res = ezkl.compile_circuit(model_path, compiled_model_path, settings_path)
    _require(res is True, "compile_circuit")

    # srs path
    res = await ezkl.get_srs(settings_path, srs_path=srs_path)
    _require(res is True, "get_srs")

    # now generate the witness file
    await ezkl.gen_witness(input_data_path, compiled_model_path, witness_path)
    _require(os.path.isfile(witness_path), "gen_witness")

    # Set up the circuit parameters: this produces the verification and proving keys.
    res = ezkl.setup(
        compiled_model_path,
        vk_path,
        pk_path,
        srs_path
    )

    _require(res is True, "setup")
    _require(os.path.isfile(vk_path), "setup (verification key file)")
    _require(os.path.isfile(pk_path), "setup (proving key file)")
    _require(os.path.isfile(settings_path), "setup (settings file)")

    # GENERATE A PROOF
    proof_path = os.path.join(folder_path, 'test.pf')
    ezkl.prove(
        witness_path,
        compiled_model_path,
        pk_path,
        proof_path,
        "single",
        srs_path
    )
    _require(os.path.isfile(proof_path), "prove")

    # VERIFY IT ON LOCAL
    res = ezkl.verify(
        proof_path,
        settings_path,
        vk_path,
        srs_path
    )
    _require(res is True, "verify")
    print("verified on local")

    # VERIFY IT ON CHAIN
    verify_sol_code_path = os.path.join(folder_path, 'verify.sol')
    verify_sol_abi_path = os.path.join(folder_path, 'verify.abi')
    res = await ezkl.create_evm_verifier(
        vk_path,
        settings_path,
        verify_sol_code_path,
        verify_sol_abi_path,
        srs_path
    )
    _require(res is True, "create_evm_verifier")
    verify_contract_addr_file = f"{folder_path}/addr.txt"
    rpc_url = "http://103.231.86.33:10219"
    await ezkl.deploy_evm(
        addr_path=verify_contract_addr_file,
        rpc_url=rpc_url,
        sol_code_path=verify_sol_code_path
    )
    if os.path.exists(verify_contract_addr_file):
        with open(verify_contract_addr_file, 'r') as file:
            verify_contract_addr = file.read()
    else:
        print(f"error: File {verify_contract_addr_file} does not exist.")
        return {"error": "Contract address file not found"}
    res = await ezkl.verify_evm(
        addr_verifier=verify_contract_addr,
        proof_path=proof_path,
        rpc_url=rpc_url
    )
    _require(res is True, "verify_evm")
    print("verified on chain")

    # Read proof file content
    with open(proof_path, 'rb') as f:
        proof_content = base64.b64encode(f.read()).decode('utf-8')

    return {"output": output_data, "proof": proof_content, "verify_contract_addr": verify_contract_addr}
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/.gitignore CHANGED
@@ -2,8 +2,12 @@ tmp/
2
  .venv
3
  .fhe_keys
4
  *.pyc
5
- local_datasets/
6
  .vscode/
7
  /.idea
8
  /zkml_encrypted
9
- /zkml_non_encrypted
 
 
 
 
 
 
2
  .venv
3
  .fhe_keys
4
  *.pyc
 
5
  .vscode/
6
  /.idea
7
  /zkml_encrypted
8
+ /zkml_non_encrypted
9
+ docker/3_run/docker-compose.yml
10
+ /dataset/
11
+ /2_run/zkml_encrypted/
12
+ /2_run/zkml_non_encrypted/
13
+ /deployment/
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/README.md CHANGED
@@ -18,8 +18,9 @@ python_version: 3.10.11
18
  - First, create a virtual env and activate it:
19
 
20
  ```bash
21
- python3 -m venv .venv
22
- source .venv/bin/activate
 
23
  ```
24
 
25
  - Then, install required packages:
@@ -28,26 +29,39 @@ source .venv/bin/activate
28
  pip3 install pip --upgrade
29
  pip3 install -U pip wheel setuptools --ignore-installed
30
  pip3 install -r requirements.txt --ignore-installed
31
-
32
- # mac z3
33
- brew install z3
34
- pip3 uninstall z3-solver
35
- pip3 install z3-solver
36
- pip3 install more-itertools
37
  ```
38
 
39
  Check that it finishes well (with a "Done!"). Please note that the actual model initialization and training
40
  can be found in the [SentimentClassification notebook](SentimentClassification.ipynb) (see below).
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  ### Launch the app locally
43
 
44
  - In a terminal:
45
 
46
  ```bash
47
- source .venv/bin/activate
48
- python3 app.py
49
  ```
50
 
 
 
 
 
51
  ## Interact with the application
52
 
53
  Open the given URL link (search for a line like `Running on local URL: http://127.0.0.1:8888/` in the
 
18
  - First, create a virtual env and activate it:
19
 
20
  ```bash
21
+ source ~/anaconda3/bin/activate
22
+ conda create --name sentiment_analysis_demo python=3.10.11
23
+ conda activate sentiment_analysis_demo
24
  ```
25
 
26
  - Then, install required packages:
 
29
  pip3 install pip --upgrade
30
  pip3 install -U pip wheel setuptools --ignore-installed
31
  pip3 install -r requirements.txt --ignore-installed
 
 
 
 
 
 
32
  ```
33
 
34
  Check that it finishes well (with a "Done!"). Please note that the actual model initialization and training
35
  can be found in the [SentimentClassification notebook](SentimentClassification.ipynb) (see below).
36
 
37
+ ## Compile the FHE algorithm
38
+
39
+ ### Download data
40
+
41
+ ```shell
42
+ pip3 install kaggle
43
+ ./0_download_data.sh
44
+ ```
45
+
46
+ ### Compile
47
+
48
+ ```bash
49
+ python3 1_build/main.py
50
+ ```
51
+
52
+
53
  ### Launch the app locally
54
 
55
  - In a terminal:
56
 
57
  ```bash
58
+ python3 2_run/app.py
 
59
  ```
60
 
61
+ ## Launch the app with docker
62
+
63
+ TODO
64
+
65
  ## Interact with the application
66
 
67
  Open the given URL link (search for a line like `Running on local URL: http://127.0.0.1:8888/` in the
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/.github/workflows/sync_to_hf.yml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Mirrors the main branch of the GitHub repository to the Hugging Face Space.
name: Sync to Hugging Face

on:
  push:
    branches:
      - main

jobs:
  sync:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'  # Specify your Python version

      - name: Install dependencies
        run: |
          pip install huggingface_hub

      - name: Sync to Hugging Face
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          # Configure git
          git config --global user.email "myronzhangweb3@gmail.com"
          git config --global user.name "Myron Zhang"

          # Clone the Hugging Face repository
          git clone https://myronzhangweb3:$HF_TOKEN@huggingface.co/spaces/PrivEcho/encrypted_sentiment_analysis hf_repo
          cd hf_repo

          # Copy files from the GitHub repository.
          # The clone lives inside the checkout, so it must be excluded: otherwise
          # every sync copies hf_repo into itself (hf_repo/hf_repo/...).
          rsync -av --exclude='.git' --exclude='/hf_repo/' ../ .

          # Commit and push changes to Hugging Face.
          # Skip the commit when nothing changed so the job does not fail.
          git add .
          git diff --cached --quiet || git commit -m "Sync from GitHub"
          git push
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/deployment/serialized_model_zkml ADDED
Binary file (523 kB). View file
 
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/.gitattributes ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.npy filter=lfs diff=lfs merge=lfs -text
14
+ *.npz filter=lfs diff=lfs merge=lfs -text
15
+ *.onnx filter=lfs diff=lfs merge=lfs -text
16
+ *.ot filter=lfs diff=lfs merge=lfs -text
17
+ *.parquet filter=lfs diff=lfs merge=lfs -text
18
+ *.pb filter=lfs diff=lfs merge=lfs -text
19
+ *.pickle filter=lfs diff=lfs merge=lfs -text
20
+ *.pkl filter=lfs diff=lfs merge=lfs -text
21
+ *.pt filter=lfs diff=lfs merge=lfs -text
22
+ *.pth filter=lfs diff=lfs merge=lfs -text
23
+ *.rar filter=lfs diff=lfs merge=lfs -text
24
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
25
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
26
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
27
+ *.tflite filter=lfs diff=lfs merge=lfs -text
28
+ *.tgz filter=lfs diff=lfs merge=lfs -text
29
+ *.wasm filter=lfs diff=lfs merge=lfs -text
30
+ *.xz filter=lfs diff=lfs merge=lfs -text
31
+ *.zip filter=lfs diff=lfs merge=lfs -text
32
+ *.zst filter=lfs diff=lfs merge=lfs -text
33
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ tmp/
2
+ .venv
3
+ .fhe_keys
4
+ *.pyc
5
+ local_datasets/
6
+ .vscode/
7
+ /.idea
8
+ /zkml_encrypted
9
+ /zkml_non_encrypted
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/README.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Sentiment Analysis On Encrypted Data Using Fully Homomorphic Encryption
3
+ emoji: 🥷💬
4
+ colorFrom: yellow
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 4.44.0
8
+ app_file: app.py
9
+ pinned: true
10
+ tags: [FHE, PPML, privacy, privacy preserving machine learning, homomorphic encryption, security]
11
+ python_version: 3.10.11
12
+ ---
13
+
14
+ # Sentiment Analysis With FHE
15
+
16
+ ## Set up the app locally
17
+
18
+ - First, create a virtual env and activate it:
19
+
20
+ ```bash
21
+ python3 -m venv .venv
22
+ source .venv/bin/activate
23
+ ```
24
+
25
+ - Then, install required packages:
26
+
27
+ ```bash
28
+ pip3 install pip --upgrade
29
+ pip3 install -U pip wheel setuptools --ignore-installed
30
+ pip3 install -r requirements.txt --ignore-installed
31
+
32
+ # mac z3
33
+ brew install z3
34
+ pip3 uninstall z3-solver
35
+ pip3 install z3-solver
36
+ pip3 install more-itertools
37
+ ```
38
+
39
+ Check that it finishes well (with a "Done!"). Please note that the actual model initialization and training
40
+ can be found in the [SentimentClassification notebook](SentimentClassification.ipynb) (see below).
41
+
42
+ ### Launch the app locally
43
+
44
+ - In a terminal:
45
+
46
+ ```bash
47
+ source .venv/bin/activate
48
+ python3 app.py
49
+ ```
50
+
51
+ ## Interact with the application
52
+
53
+ Open the given URL link (search for a line like `Running on local URL: http://127.0.0.1:8888/` in the
54
+ terminal).
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/SentimentClassification.ipynb ADDED
@@ -0,0 +1,1053 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Sentiment Classification with FHE\n",
8
+ "\n",
9
+ "This notebook tackles sentiment classification with Fully Homomorphic Encryption. Let's imagine some client (could be a user or a company) wants to predict whether a specific text (e.g., a tweet) contains positive, neutral or negative feedback using a cloud service provider without actually revealing the text during the process.\n",
10
+ "\n",
11
+ "To do this, we use a machine learning model that can predict over encrypted data thanks to the Concrete-ML library available on [GitHub](https://github.com/zama-ai/concrete-ml).\n",
12
+ "\n",
13
+ "The dataset we use in this notebook can be found on [Kaggle](https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment). \n",
14
+ " \n",
15
+ "We present two different ways to encode the text:\n",
16
+ "1. A basic **TF-IDF** approach, which essentially looks at how often a word appears in the text.\n",
17
+ "2. An advanced **transformer** embedding of the text using the Huggingface repository.\n",
18
+ "\n",
19
+ "The main assumption of this notebook is that clients, who want to have their text analyzed in a privacy-preserving manner, can encode the text using a predefined representation before encrypting the data. The FHE-friendly model is thus trained in the clear beforehand for the given task, here classification, over these representations using a relevant training set."
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 1,
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "# Import the required packages\n",
29
+ "import os\n",
30
+ "import time\n",
31
+ "from pathlib import Path\n",
32
+ "\n",
33
+ "import numpy\n",
34
+ "import pandas as pd\n",
35
+ "from sklearn.metrics import average_precision_score\n",
36
+ "from sklearn.model_selection import GridSearchCV, train_test_split\n",
37
+ "\n",
38
+ "from concrete.ml.sklearn import XGBClassifier"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 2,
44
+ "metadata": {},
45
+ "outputs": [
46
+ {
47
+ "name": "stdout",
48
+ "output_type": "stream",
49
+ "text": [
50
+ "Proportion of positive examples: 16.14%\n",
51
+ "Proportion of negative examples: 62.69%\n",
52
+ "Proportion of neutral examples: 21.17%\n"
53
+ ]
54
+ }
55
+ ],
56
+ "source": [
57
+ "# Download the datasets\n",
58
+ "# The dataset can be downloaded through the `download_data.sh` script, which requires setting up\n",
59
+ "# Kaggle's CLI, or manually at https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment\n",
60
+ "if not os.path.isfile(\"local_datasets/twitter-airline-sentiment/Tweets.csv\"):\n",
61
+ " raise ValueError(\"Please launch the `download_data.sh` script to get datasets\")\n",
62
+ "\n",
63
+ "\n",
64
+ "train = pd.read_csv(\"local_datasets/twitter-airline-sentiment/Tweets.csv\", index_col=0)\n",
65
+ "text_X = train[\"text\"]\n",
66
+ "y = train[\"airline_sentiment\"]\n",
67
+ "y = y.replace([\"negative\", \"neutral\", \"positive\"], [0, 1, 2])\n",
68
+ "\n",
69
+ "pos_ratio = y.value_counts()[2] / y.value_counts().sum()\n",
70
+ "neg_ratio = y.value_counts()[0] / y.value_counts().sum()\n",
71
+ "neutral_ratio = y.value_counts()[1] / y.value_counts().sum()\n",
72
+ "print(f\"Proportion of positive examples: {round(pos_ratio * 100, 2)}%\")\n",
73
+ "print(f\"Proportion of negative examples: {round(neg_ratio * 100, 2)}%\")\n",
74
+ "print(f\"Proportion of neutral examples: {round(neutral_ratio * 100, 2)}%\")"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 3,
80
+ "metadata": {},
81
+ "outputs": [],
82
+ "source": [
83
+ "# Split in train test\n",
84
+ "text_X_train, text_X_test, y_train, y_test = train_test_split(\n",
85
+ " text_X, y, test_size=0.1, random_state=42\n",
86
+ ")"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "markdown",
91
+ "metadata": {},
92
+ "source": [
93
+ "### 1. Text representation using TF-IDF\n",
94
+ "\n",
95
+ "[Term Frequency-Inverse Document Frequency](https://en.wikipedia.org/wiki/Tf%E2%80%93idf), also known as TF-IDF, is a numerical statistic that is used to compute the importance of a term in a document. The higher the TF-IDF score, the more important the term is to the document.\n",
96
+ "\n",
97
+ "We compute it as follows:\n",
98
+ "\n",
99
+ "$$ \\mathsf{TF\\textrm{-}IDF}(t,d,D) = \\mathsf{TF}(t,d) * \\mathsf{IDF}(t,D) $$\n",
100
+ "\n",
101
+ "where: $\\mathsf{TF}(t,d)$ is the term frequency of term $t$ in document $d$, $\\mathsf{IDF}(t,D)$ is the inverse document frequency of term $t$ in document collection $D$.\n",
102
+ "\n",
103
+ "Here we use the scikit-learn implementation of TF-IDF vectorizer."
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": 4,
109
+ "metadata": {},
110
+ "outputs": [],
111
+ "source": [
112
+ "# Let's first build a representation vector from the text\n",
113
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
114
+ "\n",
115
+ "tfidf_vectorizer = TfidfVectorizer(max_features=500, stop_words=\"english\")\n",
116
+ "X_train = tfidf_vectorizer.fit_transform(text_X_train)\n",
117
+ "X_test = tfidf_vectorizer.transform(text_X_test)\n",
118
+ "\n",
119
+ "# Make our train and test dense array\n",
120
+ "X_train = X_train.toarray()\n",
121
+ "X_test = X_test.toarray()"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "code",
126
+ "execution_count": 5,
127
+ "metadata": {},
128
+ "outputs": [],
129
+ "source": [
130
+ "# Let's build our model\n",
131
+ "model = XGBClassifier()\n",
132
+ "\n",
133
+ "# A gridsearch to find the best parameters\n",
134
+ "parameters = {\n",
135
+ " \"n_bits\": [2, 3],\n",
136
+ " \"max_depth\": [1],\n",
137
+ " \"n_estimators\": [10, 30, 50],\n",
138
+ " # \"n_jobs\": [-1],\n",
139
+ "}"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": 6,
145
+ "metadata": {},
146
+ "outputs": [
147
+ {
148
+ "data": {
149
+ "text/html": [
150
+ "<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>GridSearchCV(cv=3, estimator=XGBClassifier(n_jobs=1),\n",
151
+ " param_grid={&#x27;max_depth&#x27;: [1], &#x27;n_bits&#x27;: [2, 3],\n",
152
+ " &#x27;n_estimators&#x27;: [10, 30, 50]},\n",
153
+ " scoring=&#x27;accuracy&#x27;)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" ><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">GridSearchCV</label><div class=\"sk-toggleable__content\"><pre>GridSearchCV(cv=3, estimator=XGBClassifier(n_jobs=1),\n",
154
+ " param_grid={&#x27;max_depth&#x27;: [1], &#x27;n_bits&#x27;: [2, 3],\n",
155
+ " &#x27;n_estimators&#x27;: [10, 30, 50]},\n",
156
+ " scoring=&#x27;accuracy&#x27;)</pre></div></div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" ><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">estimator: XGBClassifier</label><div class=\"sk-toggleable__content\"><pre>XGBClassifier(n_jobs=1)</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-3\" type=\"checkbox\" ><label for=\"sk-estimator-id-3\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">XGBClassifier</label><div class=\"sk-toggleable__content\"><pre>XGBClassifier(n_jobs=1)</pre></div></div></div></div></div></div></div></div></div></div>"
157
+ ],
158
+ "text/plain": [
159
+ "GridSearchCV(cv=3, estimator=XGBClassifier(n_jobs=1),\n",
160
+ " param_grid={'max_depth': [1], 'n_bits': [2, 3],\n",
161
+ " 'n_estimators': [10, 30, 50]},\n",
162
+ " scoring='accuracy')"
163
+ ]
164
+ },
165
+ "execution_count": 6,
166
+ "metadata": {},
167
+ "output_type": "execute_result"
168
+ }
169
+ ],
170
+ "source": [
171
+ "# Run the gridsearch\n",
172
+ "grid_search = GridSearchCV(model, parameters, cv=3, scoring=\"accuracy\")\n",
173
+ "grid_search.fit(X_train, y_train)"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": 7,
179
+ "metadata": {},
180
+ "outputs": [
181
+ {
182
+ "name": "stdout",
183
+ "output_type": "stream",
184
+ "text": [
185
+ "Best score: 0.705980570734669\n",
186
+ "Best parameters: {'max_depth': 1, 'n_bits': 3, 'n_estimators': 50}\n"
187
+ ]
188
+ }
189
+ ],
190
+ "source": [
191
+ "# Check the accuracy of the best model\n",
192
+ "print(f\"Best score: {grid_search.best_score_}\")\n",
193
+ "\n",
194
+ "# Check best hyperparameters\n",
195
+ "print(f\"Best parameters: {grid_search.best_params_}\")\n",
196
+ "\n",
197
+ "# Extract best model\n",
198
+ "best_model = grid_search.best_estimator_"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": 8,
204
+ "metadata": {},
205
+ "outputs": [
206
+ {
207
+ "name": "stdout",
208
+ "output_type": "stream",
209
+ "text": [
210
+ "Accuracy: 0.7117\n",
211
+ "Average precision score for positive class: 0.6404\n",
212
+ "Average precision score for negative class: 0.8719\n",
213
+ "Average precision score for neutral class: 0.4349\n"
214
+ ]
215
+ }
216
+ ],
217
+ "source": [
218
+ "# Compute the average precision for each class\n",
219
+ "y_proba_test_tfidf = best_model.predict_proba(X_test)\n",
220
+ "\n",
221
+ "# Compute accuracy\n",
222
+ "y_pred_test_tfidf = numpy.argmax(y_proba_test_tfidf, axis=1)\n",
223
+ "accuracy_tfidf = numpy.mean(y_pred_test_tfidf == y_test)\n",
224
+ "print(f\"Accuracy: {accuracy_tfidf:.4f}\")\n",
225
+ "\n",
226
+ "y_pred_positive = y_proba_test_tfidf[:, 2]\n",
227
+ "y_pred_negative = y_proba_test_tfidf[:, 0]\n",
228
+ "y_pred_neutral = y_proba_test_tfidf[:, 1]\n",
229
+ "\n",
230
+ "ap_positive_tfidf = average_precision_score((y_test == 2), y_pred_positive)\n",
231
+ "ap_negative_tfidf = average_precision_score((y_test == 0), y_pred_negative)\n",
232
+ "ap_neutral_tfidf = average_precision_score((y_test == 1), y_pred_neutral)\n",
233
+ "\n",
234
+ "print(f\"Average precision score for positive class: \" f\"{ap_positive_tfidf:.4f}\")\n",
235
+ "print(f\"Average precision score for negative class: \" f\"{ap_negative_tfidf:.4f}\")\n",
236
+ "print(f\"Average precision score for neutral class: \" f\"{ap_neutral_tfidf:.4f}\")"
237
+ ]
238
+ },
239
+ {
240
+ "cell_type": "code",
241
+ "execution_count": 9,
242
+ "metadata": {},
243
+ "outputs": [
244
+ {
245
+ "data": {
246
+ "text/plain": [
247
+ "array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
248
+ " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
249
+ " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
250
+ " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
251
+ " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
252
+ " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
253
+ " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
254
+ " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
255
+ " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
256
+ " 2, 2, 2, 2, 2, 2])"
257
+ ]
258
+ },
259
+ "execution_count": 9,
260
+ "metadata": {},
261
+ "output_type": "execute_result"
262
+ }
263
+ ],
264
+ "source": [
265
+ "y_pred_test_tfidf[y_pred_test_tfidf == 2]"
266
+ ]
267
+ },
268
+ {
269
+ "cell_type": "code",
270
+ "execution_count": 10,
271
+ "metadata": {},
272
+ "outputs": [
273
+ {
274
+ "name": "stdout",
275
+ "output_type": "stream",
276
+ "text": [
277
+ "5 most positive tweets (class 2):\n",
278
+ "@JetBlue do bags still fly free or have you started charging? thanks!\n",
279
+ "@SouthwestAir Is there a way to receive a refund on a trip that was Cancelled Flight online instead of calling? Your phone lines are super busy.\n",
280
+ "@JetBlue bag is supposedly here in Boston\n",
281
+ "@AmericanAir Cancelled Flights my flight, doesn't send an email, text or call. Now I'm stranded in Louisville.\n",
282
+ "@SouthwestAir I need to Cancelled Flight one leg of a flight, but can't seem to do this online. Been on hold on the phone for 10 minutes. Any help?\n",
283
+ "----------------------------------------------------------------------------------------------------\n",
284
+ "5 most negative tweets (class 0):\n",
285
+ "@AmericanAir - keeping AA up in the Air! My crew chief cousin Alex Espinosa in DFW! http://t.co/0HXLNvZknP\n",
286
+ "@JetBlue Called JB 3 times!Everytime, Auto Vmsg:\"your wait time should not be longer than 9 mins\" waited longer than 18 mins and no answer!\n",
287
+ "@SouthwestAir can you outline the policies for both scenarios?\n",
288
+ "@united is not a company that values it's customer &amp; after reading tweets to them I'm not the only one who feels that way #lostmybusiness\n",
289
+ "@JetBlue how about free wifi on flt 1254 out of PBI to make up for 2.5 hr delay? Treat us right.\n"
290
+ ]
291
+ }
292
+ ],
293
+ "source": [
294
+ "# Let's see what are the top predictions based on the probabilities in y_pred_test\n",
295
+ "print(\"5 most positive tweets (class 2):\")\n",
296
+ "for i in range(5):\n",
297
+ " print(text_X_test.iloc[y_pred_test_tfidf[y_pred_test_tfidf==2].argsort()[-1 - i]])\n",
298
+ "\n",
299
+ "print(\"-\" * 100)\n",
300
+ "\n",
301
+ "print(\"5 most negative tweets (class 0):\")\n",
302
+ "for i in range(5):\n",
303
+ " print(text_X_test.iloc[y_pred_test_tfidf[y_pred_test_tfidf==0].argsort()[-1 - i]])"
304
+ ]
305
+ },
306
+ {
307
+ "cell_type": "code",
308
+ "execution_count": 11,
309
+ "metadata": {},
310
+ "outputs": [
311
+ {
312
+ "name": "stdout",
313
+ "output_type": "stream",
314
+ "text": [
315
+ "Compilation time: 5.3550 seconds\n",
316
+ "FHE inference time: 1.1162 seconds\n"
317
+ ]
318
+ }
319
+ ],
320
+ "source": [
321
+ "# Compile the model to get the FHE inference engine\n",
322
+ "# (this may take a few minutes depending on the selected model)\n",
323
+ "start = time.perf_counter()\n",
324
+ "best_model.compile(X_train)\n",
325
+ "end = time.perf_counter()\n",
326
+ "print(f\"Compilation time: {end - start:.4f} seconds\")\n",
327
+ "\n",
328
+ "# Let's write a custom example and predict in FHE\n",
329
+ "tested_tweet = [\"AirFrance is awesome, almost as much as Zama!\"]\n",
330
+ "X_tested_tweet = tfidf_vectorizer.transform(numpy.array(tested_tweet)).toarray()\n",
331
+ "clear_proba = best_model.predict_proba(X_tested_tweet)\n",
332
+ "\n",
333
+ "# Now let's predict with FHE over a single tweet and print the time it takes\n",
334
+ "start = time.perf_counter()\n",
335
+ "decrypted_proba = best_model.predict_proba(X_tested_tweet, fhe=\"execute\")\n",
336
+ "end = time.perf_counter()\n",
337
+ "print(f\"FHE inference time: {end - start:.4f} seconds\")"
338
+ ]
339
+ },
340
+ {
341
+ "cell_type": "code",
342
+ "execution_count": 12,
343
+ "metadata": {},
344
+ "outputs": [
345
+ {
346
+ "name": "stdout",
347
+ "output_type": "stream",
348
+ "text": [
349
+ "Probabilities from the FHE inference: [[0.30244059 0.17506451 0.5224949 ]]\n",
350
+ "Probabilities from the clear model: [[0.30244059 0.17506451 0.5224949 ]]\n"
351
+ ]
352
+ }
353
+ ],
354
+ "source": [
355
+ "print(f\"Probabilities from the FHE inference: {decrypted_proba}\")\n",
356
+ "print(f\"Probabilities from the clear model: {clear_proba}\")"
357
+ ]
358
+ },
359
+ {
360
+ "cell_type": "markdown",
361
+ "metadata": {},
362
+ "source": [
363
+ "To sum up, \n",
364
+ "- We trained an XGBoost model over the TF-IDF representation of the tweets and their respective sentiment class. \n",
365
+ "- The grid search gives us a model that achieves around 70% accuracy.\n",
366
+ "- Given the imbalance in the classes, we also report the average precision per class.\n",
367
+ "\n",
368
+ "Now we will see how we can approach the problem by leveraging the power of transformers."
369
+ ]
370
+ },
371
+ {
372
+ "cell_type": "markdown",
373
+ "metadata": {},
374
+ "source": [
375
+ "### 2. A transformer approach to text representation\n",
376
+ "\n",
377
+ "[**Transformers**](https://en.wikipedia.org/wiki/Transformer_(machine_learning_model\\)) are neural networks that are often trained to predict the next words to appear in a text (this is commonly called self-supervised learning). \n",
378
+ "\n",
379
+ "They are powerful tools for all kinds of Natural Language Processing tasks but supporting a transformer model in FHE might not always be ideal as they are quite big models. However, we can still leverage their hidden representation for any text and feed it to a more FHE-friendly machine learning model (in this notebook we will use XGBoost) for classification.\n",
380
+ "\n",
381
+ "Here we will use the transformer model from the amazing [**Huggingface**](https://huggingface.co/) repository."
382
+ ]
383
+ },
384
+ {
385
+ "cell_type": "code",
386
+ "execution_count": 13,
387
+ "metadata": {},
388
+ "outputs": [
389
+ {
390
+ "name": "stderr",
391
+ "output_type": "stream",
392
+ "text": [
393
+ "Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\n",
394
+ "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
395
+ "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
396
+ ]
397
+ }
398
+ ],
399
+ "source": [
400
+ "import torch\n",
401
+ "import tqdm\n",
402
+ "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n",
403
+ "\n",
404
+ "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
405
+ "\n",
406
+ "# Load the tokenizer (converts text to tokens)\n",
407
+ "tokenizer = AutoTokenizer.from_pretrained(\"cardiffnlp/twitter-roberta-base-sentiment-latest\")\n",
408
+ "\n",
409
+ "# Load the pre-trained model\n",
410
+ "transformer_model = AutoModelForSequenceClassification.from_pretrained(\n",
411
+ " \"cardiffnlp/twitter-roberta-base-sentiment-latest\"\n",
412
+ ")"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": 14,
418
+ "metadata": {},
419
+ "outputs": [
420
+ {
421
+ "name": "stderr",
422
+ "output_type": "stream",
423
+ "text": [
424
+ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
425
+ "To disable this warning, you can either:\n",
426
+ "\t- Avoid using `tokenizers` before the fork if possible\n",
427
+ "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
428
+ " 0%| | 0/30 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.\n",
429
+ "100%|██████████| 30/30 [00:20<00:00, 1.45it/s]\n"
430
+ ]
431
+ }
432
+ ],
433
+ "source": [
434
+ "# Let's first see what the model's performance is by itself\n",
435
+ "list_text_X_test = text_X_test.tolist()\n",
436
+ "\n",
437
+ "tokenized_text_X_test = tokenizer.batch_encode_plus(\n",
438
+ " list_text_X_test, pad_to_max_length=True, return_tensors=\"pt\"\n",
439
+ ")[\"input_ids\"]\n",
440
+ "\n",
441
+ "# Depending on the hardware used, the number of examples to be processed can be reduced\n",
442
+ "# Here we split the data into 50 examples per batch\n",
443
+ "tokenized_text_X_test_split = torch.split(tokenized_text_X_test, split_size_or_sections=50)\n",
444
+ "transformer_model = transformer_model.to(device)\n",
445
+ "\n",
446
+ "outputs = []\n",
447
+ "for tokenized_x_test in tqdm.tqdm(tokenized_text_X_test_split):\n",
448
+ " tokenized_x = tokenized_x_test.to(device)\n",
449
+ " output_batch = transformer_model(tokenized_x)[\"logits\"]\n",
450
+ " output_batch = output_batch.detach().cpu().numpy()\n",
451
+ " outputs.append(output_batch)\n",
452
+ "\n",
453
+ "outputs = numpy.concatenate(outputs, axis=0)"
454
+ ]
455
+ },
456
+ {
457
+ "cell_type": "code",
458
+ "execution_count": 15,
459
+ "metadata": {},
460
+ "outputs": [
461
+ {
462
+ "name": "stdout",
463
+ "output_type": "stream",
464
+ "text": [
465
+ "Predictions for the first 3 tweets:\n",
466
+ " [[-2.3807454 -0.61802197 2.9900734 ]\n",
467
+ " [ 2.0166504 0.49380752 -2.8006463 ]\n",
468
+ " [ 2.3892734 0.13443531 -2.6873832 ]]\n"
469
+ ]
470
+ }
471
+ ],
472
+ "source": [
473
+ "# Let's see what the transformer model predicts\n",
474
+ "print(f\"Predictions for the first 3 tweets:\\n {outputs[:3]}\")"
475
+ ]
476
+ },
477
+ {
478
+ "cell_type": "code",
479
+ "execution_count": 16,
480
+ "metadata": {},
481
+ "outputs": [
482
+ {
483
+ "name": "stdout",
484
+ "output_type": "stream",
485
+ "text": [
486
+ "Accuracy: 0.8053\n",
487
+ "Average precision score for positive class: 0.8548\n",
488
+ "Average precision score for negative class: 0.9548\n",
489
+ "Average precision score for neutral class: 0.6801\n"
490
+ ]
491
+ }
492
+ ],
493
+ "source": [
494
+ "# Compute the metrics for each class\n",
495
+ "\n",
496
+ "# Compute accuracy\n",
497
+ "accuracy_transformer_only = numpy.mean(numpy.argmax(outputs, axis=1) == y_test)\n",
498
+ "print(f\"Accuracy: {accuracy_transformer_only:.4f}\")\n",
499
+ "\n",
500
+ "y_pred_positive = outputs[:, 2]\n",
501
+ "y_pred_negative = outputs[:, 0]\n",
502
+ "y_pred_neutral = outputs[:, 1]\n",
503
+ "\n",
504
+ "ap_positive_transformer_only = average_precision_score((y_test == 2), y_pred_positive)\n",
505
+ "ap_negative_transformer_only = average_precision_score((y_test == 0), y_pred_negative)\n",
506
+ "ap_neutral_transformer_only = average_precision_score((y_test == 1), y_pred_neutral)\n",
507
+ "\n",
508
+ "print(f\"Average precision score for positive class: \" f\"{ap_positive_transformer_only:.4f}\")\n",
509
+ "print(f\"Average precision score for negative class: \" f\"{ap_negative_transformer_only:.4f}\")\n",
510
+ "print(f\"Average precision score for neutral class: \" f\"{ap_neutral_transformer_only:.4f}\")"
511
+ ]
512
+ },
513
+ {
514
+ "cell_type": "markdown",
515
+ "metadata": {},
516
+ "source": [
517
+ "It looks like the transformer outperforms the model built on the TF-IDF representation.\n",
518
+ "Unfortunately, running a transformer that big in FHE would be highly inefficient. \n",
519
+ "\n",
520
+ "Let's see if we can leverage the transformer representation and train an FHE model for the given classification task. "
521
+ ]
522
+ },
523
+ {
524
+ "cell_type": "code",
525
+ "execution_count": 17,
526
+ "metadata": {},
527
+ "outputs": [
528
+ {
529
+ "name": "stderr",
530
+ "output_type": "stream",
531
+ "text": [
532
+ "100%|██████████| 13176/13176 [09:24<00:00, 23.36it/s]\n",
533
+ "100%|██████████| 1464/1464 [01:00<00:00, 24.12it/s]\n"
534
+ ]
535
+ }
536
+ ],
537
+ "source": [
538
+ "# Function that transforms a list of texts to their representation\n",
539
+ "# learned by the transformer.\n",
540
+ "def text_to_tensor(\n",
541
+ " list_text_X_train: list,\n",
542
+ " transformer_model: AutoModelForSequenceClassification,\n",
543
+ " tokenizer: AutoTokenizer,\n",
544
+ " device: str,\n",
545
+ ") -> numpy.ndarray:\n",
546
+ " # Tokenize each text in the list one by one\n",
547
+ " tokenized_text_X_train_split = []\n",
548
+ " for text_x_train in list_text_X_train:\n",
549
+ " tokenized_text_X_train_split.append(tokenizer.encode(text_x_train, return_tensors=\"pt\"))\n",
550
+ "\n",
551
+ " # Send the model to the device\n",
552
+ " transformer_model = transformer_model.to(device)\n",
553
+ " output_hidden_states_list = []\n",
554
+ "\n",
555
+ " for tokenized_x in tqdm.tqdm(tokenized_text_X_train_split):\n",
556
+ " # Pass the tokens through the transformer model and get the hidden states\n",
557
+ " # Only keep the last hidden layer state for now\n",
558
+ " output_hidden_states = transformer_model(tokenized_x.to(device), output_hidden_states=True)[\n",
559
+ " 1\n",
560
+ " ][-1]\n",
561
+ " # Average over the tokens axis to get a representation at the text level.\n",
562
+ " output_hidden_states = output_hidden_states.mean(dim=1)\n",
563
+ " output_hidden_states = output_hidden_states.detach().cpu().numpy()\n",
564
+ " output_hidden_states_list.append(output_hidden_states)\n",
565
+ "\n",
566
+ " return numpy.concatenate(output_hidden_states_list, axis=0)\n",
567
+ "\n",
568
+ "\n",
569
+ "# Let's vectorize the text using the transformer\n",
570
+ "list_text_X_train = text_X_train.tolist()\n",
571
+ "list_text_X_test = text_X_test.tolist()\n",
572
+ "\n",
573
+ "X_train_transformer = text_to_tensor(list_text_X_train, transformer_model, tokenizer, device)\n",
574
+ "X_test_transformer = text_to_tensor(list_text_X_test, transformer_model, tokenizer, device)"
575
+ ]
576
+ },
577
+ {
578
+ "cell_type": "code",
579
+ "execution_count": 18,
580
+ "metadata": {},
581
+ "outputs": [
582
+ {
583
+ "data": {
584
+ "text/html": [
585
+ "<style>#sk-container-id-2 {color: black;background-color: white;}#sk-container-id-2 pre{padding: 0;}#sk-container-id-2 div.sk-toggleable {background-color: white;}#sk-container-id-2 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-2 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-2 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-2 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-2 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-2 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-2 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-2 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-2 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-2 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-2 div.sk-item {position: relative;z-index: 1;}#sk-container-id-2 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-2 div.sk-item::before, #sk-container-id-2 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-2 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-2 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-2 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-2 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-2 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-2 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-2 div.sk-label-container {text-align: center;}#sk-container-id-2 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-2 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>GridSearchCV(cv=3, estimator=XGBClassifier(n_jobs=1), n_jobs=1,\n",
586
+ " param_grid={&#x27;max_depth&#x27;: [1], &#x27;n_bits&#x27;: [2, 3],\n",
587
+ " &#x27;n_estimators&#x27;: [10, 30, 50]},\n",
588
+ " scoring=&#x27;accuracy&#x27;)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-4\" type=\"checkbox\" ><label for=\"sk-estimator-id-4\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">GridSearchCV</label><div class=\"sk-toggleable__content\"><pre>GridSearchCV(cv=3, estimator=XGBClassifier(n_jobs=1), n_jobs=1,\n",
589
+ " param_grid={&#x27;max_depth&#x27;: [1], &#x27;n_bits&#x27;: [2, 3],\n",
590
+ " &#x27;n_estimators&#x27;: [10, 30, 50]},\n",
591
+ " scoring=&#x27;accuracy&#x27;)</pre></div></div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-5\" type=\"checkbox\" ><label for=\"sk-estimator-id-5\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">estimator: XGBClassifier</label><div class=\"sk-toggleable__content\"><pre>XGBClassifier(n_jobs=1)</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-6\" type=\"checkbox\" ><label for=\"sk-estimator-id-6\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">XGBClassifier</label><div class=\"sk-toggleable__content\"><pre>XGBClassifier(n_jobs=1)</pre></div></div></div></div></div></div></div></div></div></div>"
592
+ ],
593
+ "text/plain": [
594
+ "GridSearchCV(cv=3, estimator=XGBClassifier(n_jobs=1), n_jobs=1,\n",
595
+ " param_grid={'max_depth': [1], 'n_bits': [2, 3],\n",
596
+ " 'n_estimators': [10, 30, 50]},\n",
597
+ " scoring='accuracy')"
598
+ ]
599
+ },
600
+ "execution_count": 18,
601
+ "metadata": {},
602
+ "output_type": "execute_result"
603
+ }
604
+ ],
605
+ "source": [
606
+ "# Now we have a representation for each tweet, we can train a model on these.\n",
607
+ "grid_search = GridSearchCV(model, parameters, cv=3, n_jobs=1, scoring=\"accuracy\")\n",
608
+ "grid_search.fit(X_train_transformer, y_train)"
609
+ ]
610
+ },
611
+ {
612
+ "cell_type": "code",
613
+ "execution_count": 19,
614
+ "metadata": {},
615
+ "outputs": [
616
+ {
617
+ "name": "stdout",
618
+ "output_type": "stream",
619
+ "text": [
620
+ "Best score: 0.8381147540983607\n",
621
+ "Best parameters: {'max_depth': 1, 'n_bits': 3, 'n_estimators': 50}\n"
622
+ ]
623
+ }
624
+ ],
625
+ "source": [
626
+ "# Check the accuracy of the best model\n",
627
+ "print(f\"Best score: {grid_search.best_score_}\")\n",
628
+ "\n",
629
+ "# Check best hyperparameters\n",
630
+ "print(f\"Best parameters: {grid_search.best_params_}\")\n",
631
+ "\n",
632
+ "# Extract best model\n",
633
+ "best_model = grid_search.best_estimator_"
634
+ ]
635
+ },
636
+ {
637
+ "cell_type": "code",
638
+ "execution_count": 20,
639
+ "metadata": {},
640
+ "outputs": [
641
+ {
642
+ "name": "stdout",
643
+ "output_type": "stream",
644
+ "text": [
645
+ "Accuracy: 0.8463\n",
646
+ "Average precision score for positive class: 0.8959\n",
647
+ "Average precision score for negative class: 0.9647\n",
648
+ "Average precision score for neutral class: 0.7449\n"
649
+ ]
650
+ }
651
+ ],
652
+ "source": [
653
+ "# Compute the metrics for each class\n",
654
+ "\n",
655
+ "y_proba = best_model.predict_proba(X_test_transformer)\n",
656
+ "\n",
657
+ "# Compute the accuracy\n",
658
+ "y_pred = numpy.argmax(y_proba, axis=1)\n",
659
+ "accuracy_transformer_xgboost = numpy.mean(y_pred == y_test)\n",
660
+ "print(f\"Accuracy: {accuracy_transformer_xgboost:.4f}\")\n",
661
+ "\n",
662
+ "y_pred_positive = y_proba[:, 2]\n",
663
+ "y_pred_negative = y_proba[:, 0]\n",
664
+ "y_pred_neutral = y_proba[:, 1]\n",
665
+ "\n",
666
+ "ap_positive_transformer_xgboost = average_precision_score((y_test == 2), y_pred_positive)\n",
667
+ "ap_negative_transformer_xgboost = average_precision_score((y_test == 0), y_pred_negative)\n",
668
+ "ap_neutral_transformer_xgboost = average_precision_score((y_test == 1), y_pred_neutral)\n",
669
+ "\n",
670
+ "print(f\"Average precision score for positive class: \" f\"{ap_positive_transformer_xgboost:.4f}\")\n",
671
+ "print(f\"Average precision score for negative class: \" f\"{ap_negative_transformer_xgboost:.4f}\")\n",
672
+ "print(f\"Average precision score for neutral class: \" f\"{ap_neutral_transformer_xgboost:.4f}\")"
673
+ ]
674
+ },
675
+ {
676
+ "cell_type": "markdown",
677
+ "metadata": {},
678
+ "source": [
679
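+ "Our FHE-friendly XGBoost model over the transformer representation reaches 84.6% accuracy, versus 71.2% for the XGBoost model built over the TF-IDF representation of the text (about 19% better in relative terms). Note that here we are still not using FHE and only evaluating the model in the clear.\n",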
680
+ "Interestingly, using XGBoost over the transformer representation of the text slightly outperforms the transformer model alone (84.6% vs. 80.5% accuracy)."
681
+ ]
682
+ },
683
+ {
684
+ "cell_type": "code",
685
+ "execution_count": 21,
686
+ "metadata": {},
687
+ "outputs": [
688
+ {
689
+ "name": "stdout",
690
+ "output_type": "stream",
691
+ "text": [
692
+ "5 most positive tweets (class 2):\n",
693
+ "@united I think this is the best first class I have ever gotten!! Denver to LAX and it's wonderful!!!\n",
694
+ "@AmericanAir Flight 236 was great. Fantastic cabin crew. A+ landing. #thankyou #JFK http://t.co/dRW08djHAI\n",
695
+ "@SouthwestAir Jason (108639) at Gate #3 in SAN made my afternoon!!! #southwestairlines #stellarservice #thanks!\n",
696
+ "@SouthwestAir love them! Always get the best deals!\n",
697
+ "@AmericanAir simply amazing. Smiles for miles.Thank u for my upgrade tomorrow for ORD.We are spending a lot of time together next few weeks!\n",
698
+ "----------------------------------------------------------------------------------------------------\n",
699
+ "5 most negative tweets (class 0):\n",
700
+ "@united first you lost all my bags, now you Cancelled Flight my flight home. 30 min wait to talk to somebody #poorservice #notgoodenough\n",
701
+ "@USAirways Not only did u lose the flight plan! Now ur flight crew is FAA timed out! Thx for havin us sit on the tarmac for an hr! #Pathetic\n",
702
+ "@AmericanAir Phone just disconnects if you stay on the line. Need to checkout of hotel in 2 hrs &amp; have no place to go. Can't keep calling.\n",
703
+ "@VirginAmerica I have lots of flights to book and your site it not working!!!! I've been on the phone waiting for over 10 minutes..........\n",
704
+ "@united 3 hour delay plus a jetway that won't move. This biz traveler is never flying u again!\n"
705
+ ]
706
+ }
707
+ ],
708
+ "source": [
709
+ "# Get probabilities predictions in clear\n",
710
+ "y_pred_test = best_model.predict_proba(X_test_transformer)\n",
711
+ "\n",
712
+ "# Let's see what are the top predictions based on the probabilities in y_pred_test\n",
713
+ "print(\"5 most positive tweets (class 2):\")\n",
714
+ "for i in range(5):\n",
715
+ " print(text_X_test.iloc[y_pred_test[:, 2].argsort()[-1 - i]])\n",
716
+ "\n",
717
+ "print(\"-\" * 100)\n",
718
+ "\n",
719
+ "print(\"5 most negative tweets (class 0):\")\n",
720
+ "for i in range(5):\n",
721
+ " print(text_X_test.iloc[y_pred_test[:, 0].argsort()[-1 - i]])"
722
+ ]
723
+ },
724
+ {
725
+ "cell_type": "code",
726
+ "execution_count": 22,
727
+ "metadata": {},
728
+ "outputs": [
729
+ {
730
+ "name": "stdout",
731
+ "output_type": "stream",
732
+ "text": [
733
+ "5 most positive (predicted) tweets that are actually negative (ground truth class 0):\n",
734
+ "@united thanks for the link, now finally arrived in Brussels, 9 h after schedule...\n",
735
+ "@USAirways as far as being delayed goes… Looks like tailwinds are going to make up for it. Good news!\n",
736
+ "@united thanks for having changed me. Managed to arrive with only 8 hours of delay and exhausted\n",
737
+ "@USAirways your saving grace was our flight attendant Dallas who was amazing. wish he would transfer to Delta where I would see him again\n",
738
+ "@AmericanAir that luggage you forgot...#mia.....he just won an oscar😄💝💝💝\n",
739
+ "----------------------------------------------------------------------------------------------------\n",
740
+ "5 most negative (predicted) tweets that are actually positive (ground truth class 2):\n",
741
+ "@united thanks for updating me about the 1+ hour delay the exact second I got to ATL. 🙅🙅🙅\n",
742
+ "@SouthwestAir save mile to visit family in 2015 and this will impact how many times I can see my mother. I planned and you change the rules\n",
743
+ "@JetBlue you don't remember our date Monday night back to NYC? #heartbroken\n",
744
+ "@SouthwestAir hot stewardess flipped me off\n",
745
+ "@SouthwestAir - We left iPad in a seat pocket. Filed lost item report. Received it exactly 1 week Late Flightr. Is that a record? #unbelievable\n"
746
+ ]
747
+ }
748
+ ],
749
+ "source": [
750
+ "# Now let's see where the model is wrong\n",
751
+ "y_pred_test_0 = y_pred_test[y_test == 0]\n",
752
+ "text_X_test_0 = text_X_test[y_test == 0]\n",
753
+ "\n",
754
+ "print(\"5 most positive (predicted) tweets that are actually negative (ground truth class 0):\")\n",
755
+ "for i in range(5):\n",
756
+ " print(text_X_test_0.iloc[y_pred_test_0[:, 2].argsort()[-1 - i]])\n",
757
+ "\n",
758
+ "print(\"-\" * 100)\n",
759
+ "\n",
760
+ "y_pred_test_2 = y_pred_test[y_test == 2]\n",
761
+ "text_X_test_2 = text_X_test[y_test == 2]\n",
762
+ "print(\"5 most negative (predicted) tweets that are actually positive (ground truth class 2):\")\n",
763
+ "for i in range(5):\n",
764
+ " print(text_X_test_2.iloc[y_pred_test_2[:, 0].argsort()[-1 - i]])"
765
+ ]
766
+ },
767
+ {
768
+ "cell_type": "markdown",
769
+ "metadata": {},
770
+ "source": [
771
+ "Interestingly, these misclassifications are not obvious, and some of the tweets actually look as if they were mislabeled. Also, it seems that the model has a hard time detecting ironic tweets.\n",
772
+ "\n",
773
+ "Now that our trained model reaches good accuracy, let's have it predict over the encrypted representation."
774
+ ]
775
+ },
776
+ {
777
+ "cell_type": "markdown",
778
+ "metadata": {},
779
+ "source": [
780
+ "### Sentiment Analysis of the Tweet with Fully Homomorphic Encryption\n",
781
+ "\n",
782
+ "Now that we have our model ready for FHE inference and our data ready for encryption, let's use the model in a privacy-preserving manner with FHE."
783
+ ]
784
+ },
785
+ {
786
+ "cell_type": "code",
787
+ "execution_count": 23,
788
+ "metadata": {},
789
+ "outputs": [
790
+ {
791
+ "name": "stdout",
792
+ "output_type": "stream",
793
+ "text": [
794
+ "Compilation time: 5.8594 seconds\n"
795
+ ]
796
+ },
797
+ {
798
+ "name": "stderr",
799
+ "output_type": "stream",
800
+ "text": [
801
+ "100%|██████████| 1/1 [00:00<00:00, 17.16it/s]"
802
+ ]
803
+ },
804
+ {
805
+ "name": "stdout",
806
+ "output_type": "stream",
807
+ "text": [
808
+ "FHE inference time: 0.9319 seconds\n"
809
+ ]
810
+ },
811
+ {
812
+ "name": "stderr",
813
+ "output_type": "stream",
814
+ "text": [
815
+ "\n"
816
+ ]
817
+ }
818
+ ],
819
+ "source": [
820
+ "# Compile the model to get the FHE inference engine\n",
821
+ "# (this may take a few minutes depending on the selected model)\n",
822
+ "start = time.perf_counter()\n",
823
+ "best_model.compile(X_train_transformer)\n",
824
+ "end = time.perf_counter()\n",
825
+ "print(f\"Compilation time: {end - start:.4f} seconds\")\n",
826
+ "\n",
827
+ "\n",
828
+ "# Let's write a custom example and predict in FHE\n",
829
+ "tested_tweet = [\"AirFrance is awesome, almost as much as Zama!\"]\n",
830
+ "X_tested_tweet = text_to_tensor(tested_tweet, transformer_model, tokenizer, device)\n",
831
+ "clear_proba = best_model.predict_proba(X_tested_tweet)\n",
832
+ "\n",
833
+ "# Now let's predict with FHE over a single tweet and print the time it takes\n",
834
+ "start = time.perf_counter()\n",
835
+ "decrypted_proba = best_model.predict_proba(X_tested_tweet, fhe=\"execute\")\n",
836
+ "end = time.perf_counter()\n",
837
+ "fhe_exec_time = end - start\n",
838
+ "print(f\"FHE inference time: {fhe_exec_time:.4f} seconds\")"
839
+ ]
840
+ },
841
+ {
842
+ "cell_type": "code",
843
+ "execution_count": 24,
844
+ "metadata": {},
845
+ "outputs": [
846
+ {
847
+ "name": "stdout",
848
+ "output_type": "stream",
849
+ "text": [
850
+ "Probabilities from the FHE inference: [[0.05162184 0.04558276 0.90279541]]\n",
851
+ "Probabilities from the clear model: [[0.05162184 0.04558276 0.90279541]]\n"
852
+ ]
853
+ }
854
+ ],
855
+ "source": [
856
+ "print(f\"Probabilities from the FHE inference: {decrypted_proba}\")\n",
857
+ "print(f\"Probabilities from the clear model: {clear_proba}\")"
858
+ ]
859
+ },
860
+ {
861
+ "cell_type": "code",
862
+ "execution_count": 26,
863
+ "metadata": {},
864
+ "outputs": [],
865
+ "source": [
866
+ "DEPLOYMENT_DIR = Path(\"deployment\")\n",
867
+ "DEPLOYMENT_DIR.mkdir(exist_ok=True)\n",
868
+ "\n",
869
+ "# Let's export the final model such that we can reuse it in a client/server environment\n",
870
+ "\n",
871
+ "# Serialize the model (for development only)\n",
872
+ "with (DEPLOYMENT_DIR / \"serialized_model\").open(\"w\") as file:\n",
873
+ " best_model.dump(file)\n",
874
+ "\n",
875
+ "# Export some data to be used for compilation \n",
876
+ "X_train_numpy = X_train_transformer[:100]\n",
877
+ "\n",
878
+ "# Wrap the compilation samples (taken from the training set) in a pandas dataframe\n",
879
+ "X_test_numpy_df = pd.DataFrame(X_train_numpy)\n",
880
+ "\n",
881
+ "# to csv\n",
882
+ "X_test_numpy_df.to_csv(DEPLOYMENT_DIR / \"samples_for_compilation.csv\")\n",
883
+ "\n",
884
+ "# Let's save the model to be pushed to a server later\n",
885
+ "from concrete.ml.deployment import FHEModelDev\n",
886
+ "\n",
887
+ "fhe_api = FHEModelDev(DEPLOYMENT_DIR / \"sentiment_fhe_model\", best_model)\n",
888
+ "fhe_api.save(via_mlir=True)"
889
+ ]
890
+ },
891
+ {
892
+ "cell_type": "code",
893
+ "execution_count": 27,
894
+ "metadata": {},
895
+ "outputs": [
896
+ {
897
+ "data": {
898
+ "text/html": [
899
+ "<div>\n",
900
+ "<style scoped>\n",
901
+ " .dataframe tbody tr th:only-of-type {\n",
902
+ " vertical-align: middle;\n",
903
+ " }\n",
904
+ "\n",
905
+ " .dataframe tbody tr th {\n",
906
+ " vertical-align: top;\n",
907
+ " }\n",
908
+ "\n",
909
+ " .dataframe thead th {\n",
910
+ " text-align: right;\n",
911
+ " }\n",
912
+ "</style>\n",
913
+ "<table border=\"1\" class=\"dataframe\">\n",
914
+ " <thead>\n",
915
+ " <tr style=\"text-align: right;\">\n",
916
+ " <th></th>\n",
917
+ " <th>Accuracy</th>\n",
918
+ " <th>Average Precision (positive)</th>\n",
919
+ " <th>Average Precision (negative)</th>\n",
920
+ " <th>Average Precision (neutral)</th>\n",
921
+ " </tr>\n",
922
+ " <tr>\n",
923
+ " <th>Model</th>\n",
924
+ " <th></th>\n",
925
+ " <th></th>\n",
926
+ " <th></th>\n",
927
+ " <th></th>\n",
928
+ " </tr>\n",
929
+ " </thead>\n",
930
+ " <tbody>\n",
931
+ " <tr>\n",
932
+ " <th>TF-IDF + XGBoost</th>\n",
933
+ " <td>0.711749</td>\n",
934
+ " <td>0.640422</td>\n",
935
+ " <td>0.871891</td>\n",
936
+ " <td>0.43486</td>\n",
937
+ " </tr>\n",
938
+ " <tr>\n",
939
+ " <th>Transformer Only</th>\n",
940
+ " <td>0.805328</td>\n",
941
+ " <td>0.854827</td>\n",
942
+ " <td>0.954804</td>\n",
943
+ " <td>0.68011</td>\n",
944
+ " </tr>\n",
945
+ " <tr>\n",
946
+ " <th>Transformer + XGBoost</th>\n",
947
+ " <td>0.846311</td>\n",
948
+ " <td>0.895930</td>\n",
949
+ " <td>0.964674</td>\n",
950
+ " <td>0.74489</td>\n",
951
+ " </tr>\n",
952
+ " </tbody>\n",
953
+ "</table>\n",
954
+ "</div>"
955
+ ],
956
+ "text/plain": [
957
+ " Accuracy Average Precision (positive) \\\n",
958
+ "Model \n",
959
+ "TF-IDF + XGBoost 0.711749 0.640422 \n",
960
+ "Transformer Only 0.805328 0.854827 \n",
961
+ "Transformer + XGBoost 0.846311 0.895930 \n",
962
+ "\n",
963
+ " Average Precision (negative) \\\n",
964
+ "Model \n",
965
+ "TF-IDF + XGBoost 0.871891 \n",
966
+ "Transformer Only 0.954804 \n",
967
+ "Transformer + XGBoost 0.964674 \n",
968
+ "\n",
969
+ " Average Precision (neutral) \n",
970
+ "Model \n",
971
+ "TF-IDF + XGBoost 0.43486 \n",
972
+ "Transformer Only 0.68011 \n",
973
+ "Transformer + XGBoost 0.74489 "
974
+ ]
975
+ },
976
+ "execution_count": 27,
977
+ "metadata": {},
978
+ "output_type": "execute_result"
979
+ }
980
+ ],
981
+ "source": [
982
+ "%matplotlib inline\n",
983
+ "# Let's print the results obtained in this notebook\n",
984
+ "df_results = pd.DataFrame(\n",
985
+ " {\n",
986
+ " \"Model\": [\"TF-IDF + XGBoost\", \"Transformer Only\", \"Transformer + XGBoost\"],\n",
987
+ " \"Accuracy\": [accuracy_tfidf, accuracy_transformer_only, accuracy_transformer_xgboost],\n",
988
+ " \"Average Precision (positive)\": [\n",
989
+ " ap_positive_tfidf,\n",
990
+ " ap_positive_transformer_only,\n",
991
+ " ap_positive_transformer_xgboost,\n",
992
+ " ],\n",
993
+ " \"Average Precision (negative)\": [\n",
994
+ " ap_negative_tfidf,\n",
995
+ " ap_negative_transformer_only,\n",
996
+ " ap_negative_transformer_xgboost,\n",
997
+ " ],\n",
998
+ " \"Average Precision (neutral)\": [\n",
999
+ " ap_neutral_tfidf,\n",
1000
+ " ap_neutral_transformer_only,\n",
1001
+ " ap_neutral_transformer_xgboost,\n",
1002
+ " ],\n",
1003
+ " }\n",
1004
+ ")\n",
1005
+ "df_results.set_index(\"Model\", inplace=True)\n",
1006
+ "df_results # pylint: disable=pointless-statement"
1007
+ ]
1008
+ },
1009
+ {
1010
+ "cell_type": "markdown",
1011
+ "metadata": {},
1012
+ "source": [
1013
+ "### Conclusion\n",
1014
+ "\n",
1015
+ "In this notebook we presented two different ways to represent a text.\n",
1016
+ "1. Using TF-IDF vectorization\n",
1017
+ "2. Using the hidden layers from a transformer\n",
1018
+ "\n",
1019
+ "Both representations are then used to train a machine learning model that will run in FHE (here XGBoost).\n",
1020
+ "\n",
1021
+ "Once the model is trained, clients can send an encrypted text representation to the server to get a sentiment analysis done, and they receive the probability for each class (negative, neutral and positive) in an encrypted format which can then be decrypted by the client. In this notebook, all the FHE magic (encrypt, predict and decrypt) is done within the `predict_proba` function with the argument `fhe=\"execute\"`. For a real client/server deployment, the `concrete.ml.deployment` API (used above to save the model) splits these steps between the client and the server.\n",
1022
+ "\n",
1023
+ "Regarding the FHE execution times, the final XGBoost model can predict over an encrypted data point in under one second (about 0.9 seconds in the run above). This will change depending on the number of threads available. In the future, more hardware acceleration will be available to speed up the execution time.\n",
1024
+ "\n",
1025
+ "It seems that the combination of a transformer (thanks Hugging Face!) with a \"simpler\" model such as XGBoost works pretty well. Thanks to the Concrete-ML library, we can easily compute this text representation on the client machine and then encrypt it to send it to a remote server, without having to deal with a transformer runtime in FHE."
1026
+ ]
1027
+ }
1028
+ ],
1029
+ "metadata": {
1030
+ "execution": {
1031
+ "timeout": 10800
1032
+ },
1033
+ "kernelspec": {
1034
+ "display_name": ".venv",
1035
+ "language": "python",
1036
+ "name": "python3"
1037
+ },
1038
+ "language_info": {
1039
+ "codemirror_mode": {
1040
+ "name": "ipython",
1041
+ "version": 3
1042
+ },
1043
+ "file_extension": ".py",
1044
+ "mimetype": "text/x-python",
1045
+ "name": "python",
1046
+ "nbconvert_exporter": "python",
1047
+ "pygments_lexer": "ipython3",
1048
+ "version": "3.10.11"
1049
+ }
1050
+ },
1051
+ "nbformat": 4,
1052
+ "nbformat_minor": 2
1053
+ }
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/app.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """A Gradio app that runs locally (analytics=False and share=False) and performs sentiment analysis on encrypted text."""
2
+
3
+ import gradio as gr
4
+ from transformer_vectorizer import TransformerVectorizer
5
+ from concrete.ml.deployment import FHEModelClient
6
+ import numpy
7
+ import os
8
+ from pathlib import Path
9
+ import requests
10
+ import json
11
+ import base64
12
+ import subprocess
13
+ import shutil
14
+ import time
15
+
16
+ # This repository's directory
17
+ REPO_DIR = Path(__file__).parent
18
+
19
+ # Download the required data files and launch the three backend servers (ports 8000, 8001 and 8002)
20
+ subprocess.Popen(["bash", "./download_data.sh"], cwd=REPO_DIR)
21
+ subprocess.Popen(["uvicorn", "server:app", "--port", "8000"], cwd=REPO_DIR)
22
+ subprocess.Popen(["uvicorn", "zkml_non_encrypted:app", "--port", "8001"], cwd=REPO_DIR)
23
+ subprocess.Popen(["uvicorn", "zkml_encrypted:app", "--port", "8002"], cwd=REPO_DIR)
24
+
25
+ # Wait 30 seconds for the servers to start
26
+ time.sleep(30)
27
+
28
+ # Encrypted data limit for the browser to display
29
+ # (encrypted data is too large to display in the browser)
30
+ ENCRYPTED_DATA_BROWSER_LIMIT = 500
31
+ N_USER_KEY_STORED = 20
32
+ FHE_MODEL_PATH = "deployment/sentiment_fhe_model"
33
+
34
+ print("Loading the transformer model...")
35
+
36
+ # Initialize the transformer vectorizer
37
+ transformer_vectorizer = TransformerVectorizer()
38
+
39
+
40
def clean_tmp_directory():
    """Prepare the working directories and evict the oldest stored user keys.

    Ensures that ``.fhe_keys/`` and ``tmp/`` exist. If more than
    ``N_USER_KEY_STORED`` per-user key directories are present, the least
    recently modified ones are removed, together with the
    ``tmp/*_<user_id>.npy`` files that belong to the evicted users.
    """
    keys_dir = Path(".fhe_keys/")
    tmp_dir = Path("tmp/")

    # Both directories are used by the app; creating ``tmp/`` here keeps the
    # ``iterdir()`` call below from failing on a fresh checkout.
    keys_dir.mkdir(exist_ok=True)
    tmp_dir.mkdir(exist_ok=True)

    # Per-user key directories, oldest first.
    user_dirs = sorted(
        (entry for entry in keys_dir.iterdir() if entry.is_dir()),
        key=os.path.getmtime,
    )

    n_dirs_to_delete = len(user_dirs) - N_USER_KEY_STORED
    if n_dirs_to_delete <= 0:
        return

    evicted_suffixes = []
    for user_dir in user_dirs[:n_dirs_to_delete]:
        # The leading underscore anchors the match on the full user id, so that
        # evicting user "11" does not delete the files of user "111".
        evicted_suffixes.append(f"_{user_dir.name}.npy")
        shutil.rmtree(user_dir)

    # Delete all temporary files related to the evicted users.
    suffixes = tuple(evicted_suffixes)
    for file in tmp_dir.iterdir():
        if file.name.endswith(suffixes):
            file.unlink()
61
+
62
+
63
def keygen():
    """Generate a fresh FHE key pair for a new user.

    A random user id is drawn, a ``FHEModelClient`` is created with
    ``.fhe_keys/<user_id>`` as its key directory, and the serialized evaluation
    key is saved to ``tmp/tmp_evaluation_key_<user_id>.npy``.

    Returns:
        A two-item list: the first ``ENCRYPTED_DATA_BROWSER_LIMIT`` bytes of the
        evaluation key (as a list of ints, for display) and the user id.
    """
    # ``tmp/`` must exist before the cleanup scans it and before the
    # evaluation key is saved into it.
    Path("tmp/").mkdir(exist_ok=True)

    # Clean tmp directory if needed
    clean_tmp_directory()

    print("Initializing FHEModelClient...")

    # Create .fhe_keys directory if it doesn't exist
    Path(".fhe_keys/").mkdir(exist_ok=True)

    # Let's create a user_id. The explicit 64-bit dtype keeps the upper bound
    # 2**32 valid on platforms where NumPy's default integer is 32-bit.
    user_id = numpy.random.randint(0, 2**32, dtype=numpy.int64)
    fhe_api = FHEModelClient(FHE_MODEL_PATH, f".fhe_keys/{user_id}")
    fhe_api.load()

    # Generate a fresh key
    fhe_api.generate_private_and_evaluation_keys(force=True)
    evaluation_key = fhe_api.get_serialized_evaluation_keys()

    # Save evaluation_key in a file, since too large to pass through regular Gradio
    # buttons, https://github.com/gradio-app/gradio/issues/1877
    numpy.save(f"tmp/tmp_evaluation_key_{user_id}.npy", evaluation_key)

    # Slice before converting: only the displayed prefix is turned into a list,
    # instead of materialising the whole key first.
    return [list(evaluation_key[:ENCRYPTED_DATA_BROWSER_LIMIT]), user_id]
86
+
87
+
88
def encode_quantize_encrypt(text, user_id):
    """Vectorize ``text``, then quantize and encrypt it with the user's keys.

    The serialized ciphertext is also written to
    ``tmp/tmp_encrypted_quantized_encoding_<user_id>.npy``.

    Returns:
        A tuple ``(clear encoding, quantized encoding, ciphertext as hex string)``
        for the single input text.

    Raises:
        gr.Error: if ``user_id`` is empty, i.e. no keys were generated yet.
    """
    if not user_id:
        raise gr.Error("You need to generate FHE keys first.")

    client = FHEModelClient(FHE_MODEL_PATH, f".fhe_keys/{user_id}")
    client.load()

    clear_vectors = transformer_vectorizer.transform([text])
    quantized_vectors = client.model.quantize_input(clear_vectors).astype(numpy.uint8)
    ciphertext = client.quantize_encrypt_serialize(clear_vectors)

    # The ciphertext is too large to pass through regular Gradio buttons
    # (https://github.com/gradio-app/gradio/issues/1877), so it goes to disk.
    numpy.save(f"tmp/tmp_encrypted_quantized_encoding_{user_id}.npy", ciphertext)

    # Hexadecimal rendering of the ciphertext for display.
    # NOTE(review): the whole ciphertext is rendered here, not a truncated
    # prefix -- confirm whether that is intended.
    hex_digits = [f'{value:02x}' for value in list(ciphertext)]
    ciphertext_hex = ''.join(hex_digits)

    return (clear_vectors[0], quantized_vectors[0], ciphertext_hex)
110
+
111
+
112
def run_fhe(user_id):
    """Send the user's encrypted encoding to the FHE server and store the result.

    Reads the encrypted encoding and the evaluation key saved under ``tmp/`` for
    ``user_id``, posts them (base64-encoded) to the local prediction server, and
    saves the returned encrypted prediction to
    ``tmp/tmp_encrypted_prediction_<user_id>.npy``.

    Returns:
        The encrypted prediction as a hexadecimal string.

    Raises:
        gr.Error: if no keys or encrypted data exist for ``user_id``, or if the
            server cannot be reached or answers with an error status.
    """
    if not user_id:
        raise gr.Error("You need to generate FHE keys first.")

    encoded_data_path = Path(f"tmp/tmp_encrypted_quantized_encoding_{user_id}.npy")
    if not encoded_data_path.is_file():
        raise gr.Error("No encrypted data was found. Encrypt the data before trying to predict.")

    evaluation_key_path = Path(f"tmp/tmp_evaluation_key_{user_id}.npy")
    if not evaluation_key_path.is_file():
        raise gr.Error("No evaluation key was found. Generate the FHE keys first.")

    # Read encrypted_quantized_encoding and evaluation_key from their files
    encrypted_quantized_encoding = numpy.load(encoded_data_path)
    evaluation_key = numpy.load(evaluation_key_path)

    # Use base64 to encode the encodings and evaluation key
    query = {
        "evaluation_key": base64.b64encode(evaluation_key).decode(),
        "encrypted_encoding": base64.b64encode(encrypted_quantized_encoding).decode(),
    }
    headers = {"Content-type": "application/json"}

    try:
        response = requests.post(
            "http://localhost:8000/predict_sentiment", data=json.dumps(query), headers=headers
        )
    except requests.RequestException as err:
        raise gr.Error(f"Could not reach the FHE server: {err}") from err

    # Surface server-side failures explicitly instead of failing later on a
    # missing "encrypted_prediction" key or an undecodable body.
    if not response.ok:
        raise gr.Error(f"The FHE server returned an error (HTTP {response.status_code}).")

    encrypted_prediction = base64.b64decode(response.json()["encrypted_prediction"])

    # Save encrypted_prediction in a file, since too large to pass through regular Gradio
    # buttons, https://github.com/gradio-app/gradio/issues/1877
    numpy.save(f"tmp/tmp_encrypted_prediction_{user_id}.npy", encrypted_prediction)

    # ``b64decode`` returns bytes, so ``hex()`` gives the same two-digit
    # lowercase rendering as formatting every byte by hand.
    return encrypted_prediction.hex()
144
+
145
+
146
def decrypt_prediction(user_id):
    """Decrypt the stored encrypted prediction of ``user_id``.

    Returns:
        A dict mapping "negative", "neutral" and "positive" to the first three
        values of the first decrypted prediction row.

    Raises:
        gr.Error: if ``user_id`` is empty or no encrypted prediction file exists.
    """
    if not user_id:
        raise gr.Error("You need to generate FHE keys first.")

    prediction_path = Path(f"tmp/tmp_encrypted_prediction_{user_id}.npy")
    if not prediction_path.is_file():
        raise gr.Error("No encrypted prediction was found. Run the prediction over the encrypted data first.")

    # Load the serialized prediction back as raw bytes
    serialized_prediction = numpy.load(prediction_path).tobytes()

    client = FHEModelClient(FHE_MODEL_PATH, f".fhe_keys/{user_id}")
    client.load()

    # force=False: the private key that matches the client specs must be
    # retrieved rather than regenerated (see issue #18)
    client.generate_private_and_evaluation_keys(force=False)

    decrypted = client.deserialize_decrypt_dequantize(serialized_prediction)
    scores = decrypted[0]
    return {
        "negative": scores[0],
        "neutral": scores[1],
        "positive": scores[2],
    }
168
+
169
+
170
def get_zk_proof_non_encrypted(text):
    """Request a sentiment prediction and its ZK proof for clear ``text``.

    Posts the text to the local non-encrypted zkML server and turns the returned
    per-class scores into a label.

    Returns:
        A tuple ``(sentiment, proof, verify_contract_addr)`` where ``sentiment``
        is "negative", "neutral" or "positive".

    Raises:
        gr.Error: if the server cannot be reached or answers with an error status.
    """
    headers = {"Content-type": "application/json"}
    query = {"text": text}

    try:
        response = requests.post(
            "http://localhost:8001/get_zk_proof", data=json.dumps(query), headers=headers
        )
    except requests.RequestException as err:
        raise gr.Error(f"Could not reach the zkML server: {err}") from err

    if not response.ok:
        raise gr.Error(f"The zkML server returned an error (HTTP {response.status_code}).")

    result = response.json()

    # Pick the class with the highest score. A fixed 0.5 threshold would fall
    # back to "positive" whenever no class exceeds it, even if another class
    # is the most likely one.
    # NOTE(review): assumes result["output"] holds one score per class in the
    # order (negative, neutral, positive) -- confirm against the server.
    labels = ("negative", "neutral", "positive")
    sentiment, _ = max(zip(labels, result["output"]), key=lambda pair: pair[1])

    return sentiment, result["proof"], result["verify_contract_addr"]
187
+
188
+
189
def get_zk_proof_encrypted(user_id):
    """Request a prediction and its ZK proof for the user's encrypted encoding.

    Reads the encrypted encoding and the evaluation key saved under ``tmp/`` for
    ``user_id`` and posts them (base64-encoded) to the local encrypted zkML
    server.

    Returns:
        A tuple ``(output, proof, verify_contract_addr)`` taken from the server's
        JSON response.

    Raises:
        gr.Error: if no keys or encrypted data exist for ``user_id``, or if the
            server cannot be reached or answers with an error status.
    """
    if not user_id:
        raise gr.Error("You need to generate FHE keys first.")

    encoded_data_path = Path(f"tmp/tmp_encrypted_quantized_encoding_{user_id}.npy")
    if not encoded_data_path.is_file():
        raise gr.Error("No encrypted data was found. Encrypt the data before trying to predict.")

    evaluation_key_path = Path(f"tmp/tmp_evaluation_key_{user_id}.npy")
    if not evaluation_key_path.is_file():
        raise gr.Error("No evaluation key was found. Generate the FHE keys first.")

    # Read encrypted_quantized_encoding and evaluation_key from their files
    encrypted_quantized_encoding = numpy.load(encoded_data_path)
    evaluation_key = numpy.load(evaluation_key_path)

    # Use base64 to encode the encodings and evaluation key
    query = {
        "evaluation_key": base64.b64encode(evaluation_key).decode(),
        "encrypted_encoding": base64.b64encode(encrypted_quantized_encoding).decode(),
    }
    headers = {"Content-type": "application/json"}

    try:
        response = requests.post(
            "http://localhost:8002/get_zk_proof", data=json.dumps(query), headers=headers
        )
    except requests.RequestException as err:
        raise gr.Error(f"Could not reach the zkML server: {err}") from err

    # Report server-side failures explicitly rather than failing on a missing
    # key in the response body.
    if not response.ok:
        raise gr.Error(f"The zkML server returned an error (HTTP {response.status_code}).")

    result = response.json()
    return result["output"], result["proof"], result["verify_contract_addr"]
215
+
216
+
217
# ---------------------------------------------------------------------------
# Gradio front end.
#
# Builds the step-by-step demo page (key generation -> encryption -> FHE
# evaluation -> decryption -> ZK proofs) and wires every button to its
# callback. All components and event handlers are declared inside the
# `with demo:` context so they belong to this Blocks app.
# ---------------------------------------------------------------------------
demo = gr.Blocks()

print("Starting the demo...")
with demo:
    # Page header: logo, title, project links and the overview diagram.
    gr.Markdown(
        """
        <p align="center">
        <img width=200 src="https://user-images.githubusercontent.com/5758427/197816413-d9cddad3-ba38-4793-847d-120975e1da11.png">
        </p>

        <h2 align="center">Sentiment Analysis On Encrypted Data Using Homomorphic Encryption</h2>

        <p align="center">
        <a href="https://github.com/zama-ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="https://user-images.githubusercontent.com/5758427/197972109-faaaff3e-10e2-4ab6-80f5-7531f7cfb08f.png">Concrete-ML</a>

        <a href="https://docs.zama.ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="https://user-images.githubusercontent.com/5758427/197976802-fddd34c5-f59a-48d0-9bff-7ad1b00cb1fb.png">Documentation</a>

        <a href="https://zama.ai/community"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="https://user-images.githubusercontent.com/5758427/197977153-8c9c01a7-451a-4993-8e10-5a6ed5343d02.png">Community</a>

        <a href="https://twitter.com/zama_fhe"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="https://user-images.githubusercontent.com/5758427/197975044-bab9d199-e120-433b-b3be-abd73b211a54.png">@zama_fhe</a>
        </p>

        <p align="center">
        <img src="https://user-images.githubusercontent.com/56846628/219329304-6868be9e-5ce8-4279-9123-4cb1bc0c2fb5.png" width="60%" height="60%">
        </p>
        """
    )

    # Two empty centered paragraphs, kept as in the original layout.
    gr.Markdown(
        """
        <p align="center">
        </p>
        <p align="center">
        </p>
        """
    )

    gr.Markdown("## Notes")
    gr.Markdown(
        """
        - The private key is used to encrypt and decrypt the data and shall never be shared.
        - The evaluation key is a public key that the server needs to process encrypted data.
        """
    )

    # ----- Step 1: key generation ------------------------------------------
    gr.Markdown("# Step 1: Generate the keys")

    b_gen_key_and_install = gr.Button("Generate the keys and send public part to server")

    evaluation_key = gr.Textbox(
        label="Evaluation key (truncated):",
        max_lines=4,
        interactive=False,
    )

    # Hidden field: filled by `keygen` and passed as input to the later
    # callbacks, so the user id travels between steps without being shown.
    user_id = gr.Textbox(
        label="",
        max_lines=4,
        interactive=False,
        visible=False,
    )

    # ----- Step 2: message input -------------------------------------------
    gr.Markdown("# Step 2: Provide a message")
    gr.Markdown("## Client side")
    gr.Markdown(
        "Enter a sensitive text message you received and would like to do sentiment analysis on (ideas: the last text message of your boss.... or lover)."
    )
    text = gr.Textbox(label="Enter a message:", value="I really like your work recently")

    # ----- Step 3: encode / quantize / encrypt -----------------------------
    gr.Markdown("# Step 3: Encode the message with the private key")
    b_encode_quantize_text = gr.Button(
        "Encode, quantize and encrypt the text with transformer vectorizer, and send to server"
    )

    with gr.Row():
        encoding = gr.Textbox(
            label="Transformer representation:",
            max_lines=4,
            interactive=False,
        )
        quantized_encoding = gr.Textbox(
            label="Quantized transformer representation:", max_lines=4, interactive=False
        )
        encrypted_quantized_encoding = gr.Textbox(
            label="Encrypted quantized transformer representation (truncated):",
            max_lines=4,
            interactive=False,
        )

    # ----- Step 4: FHE evaluation on the server ----------------------------
    gr.Markdown("# Step 4: Run the FHE evaluation")
    gr.Markdown("## Server side")
    gr.Markdown(
        "The encrypted value is received by the server. Thanks to the evaluation key and to FHE, the server can compute the (encrypted) prediction directly over encrypted values. Once the computation is finished, the server returns the encrypted prediction to the client."
    )

    b_run_fhe = gr.Button("Run FHE execution there")
    encrypted_prediction = gr.Textbox(
        label="Encrypted prediction (truncated):",
        max_lines=4,
        interactive=False,
    )

    # ----- Step 5: decryption on the client --------------------------------
    gr.Markdown("# Step 5: Decrypt the sentiment")
    gr.Markdown("## Client side")
    gr.Markdown(
        "The encrypted sentiment is sent back to client, who can finally decrypt it with its private key. Only the client is aware of the original tweet and the prediction."
    )
    b_decrypt_prediction = gr.Button("Decrypt prediction")

    labels_sentiment = gr.Label(label="Sentiment:")

    # ----- Step 6: ZK proof for the non-encrypted input --------------------
    gr.Markdown("# Step 6: Get ZK Proof(non-encrypted input)")
    gr.Markdown("## Server side")
    gr.Markdown(
        "Get zero-knowledge proof of the sentiment analysis computation (for non-encrypted input)."
    )
    b_get_zk_proof_non_encrypted = gr.Button("Get ZK Proof(non-encrypted input)")

    with gr.Row():
        zk_sentiment_non_encrypted = gr.Textbox(
            label="Sentiment:",
            max_lines=1,
            interactive=False,
        )
        zk_proof_non_encrypted = gr.Textbox(
            label="ZK Proof:",
            max_lines=4,
            interactive=False,
        )
        zk_contract_non_encrypted = gr.Textbox(
            label="Verify Contract Address:",
            max_lines=1,
            interactive=False,
        )

    # ----- Step 7: ZK proof for the encrypted input ------------------------
    # Fix: this heading previously repeated "Step 6", so the page showed two
    # different sections with the same step number.
    gr.Markdown("# Step 7: Get ZK Proof(encrypted input)")
    gr.Markdown("## Server side")
    gr.Markdown(
        "Get zero-knowledge proof of the sentiment analysis computation (for encrypted input)."
    )
    b_get_zk_proof_encrypted = gr.Button("Get ZK Proof(encrypted input)")

    with gr.Row():
        zk_encrypted_prediction = gr.Textbox(
            label="Encrypted Prediction(same as Step 4 output):",
            max_lines=1,
            interactive=False,
        )
        zk_proof_encrypted = gr.Textbox(
            label="ZK Proof:",
            max_lines=4,
            interactive=False,
        )
        zk_contract_encrypted = gr.Textbox(
            label="Verify Contract Address:",
            max_lines=1,
            interactive=False,
        )

    # ----- Event wiring ----------------------------------------------------
    # Button for key generation
    b_gen_key_and_install.click(keygen, inputs=[], outputs=[evaluation_key, user_id])

    # Button to quantize and encrypt
    b_encode_quantize_text.click(
        encode_quantize_encrypt,
        inputs=[text, user_id],
        outputs=[
            encoding,
            quantized_encoding,
            encrypted_quantized_encoding,
        ],
    )

    # Button to send the encodings to the server using post at (localhost:8000/predict_sentiment)
    b_run_fhe.click(run_fhe, inputs=[user_id], outputs=[encrypted_prediction])

    # Button to decrypt the prediction on the client
    b_decrypt_prediction.click(decrypt_prediction, inputs=[user_id], outputs=[labels_sentiment])

    # Button to get ZK proof(non encrypted): takes the plain text directly.
    b_get_zk_proof_non_encrypted.click(
        get_zk_proof_non_encrypted,
        inputs=[text],
        outputs=[
            zk_sentiment_non_encrypted,
            zk_proof_non_encrypted,
            zk_contract_non_encrypted,
        ],
    )

    # Button to get ZK proof(encrypted): keyed by the hidden user id.
    b_get_zk_proof_encrypted.click(
        get_zk_proof_encrypted,
        inputs=[user_id],
        outputs=[zk_encrypted_prediction, zk_proof_encrypted, zk_contract_encrypted],
    )

    gr.Markdown(
        "The app was built with [Concrete-ML](https://github.com/zama-ai/concrete-ml), a Privacy-Preserving Machine Learning (PPML) open-source set of tools by [Zama](https://zama.ai/). Try it yourself and don't forget to star on Github &#11088;."
    )

# Start the web server; share=False means no public share link is requested.
demo.launch(share=False)
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/compile.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Compile the serialized sentiment model and export its FHE deployment files."""
import shutil
from pathlib import Path

import pandas as pd
from concrete.ml.common.serialization.loaders import load
from concrete.ml.deployment import FHEModelDev

# All inputs and outputs live in the "deployment" folder next to this script.
script_dir = Path(__file__).parent
DEPLOYMENT_DIR = script_dir / "deployment"

print("Compiling the model...")

# Deserialize the previously saved model.
with open(DEPLOYMENT_DIR / "serialized_model", "r") as file:
    model = load(file)

# Sample inputs used for compilation; the first CSV column is read as the
# index, so it is not part of the resulting value array.
data = pd.read_csv(
    DEPLOYMENT_DIR / "samples_for_compilation.csv",
    index_col=0,
).values

# Compile the model on the sample inputs.
model.compile(data)

# Remove the model output folder if it already exists, so that a stale
# export is never left in place.
dev_model_path = DEPLOYMENT_DIR / "sentiment_fhe_model"
if dev_model_path.is_dir():
    shutil.rmtree(dev_model_path)

# Write the deployment files into the (now absent) output folder.
fhe_api = FHEModelDev(model=model, path_dir=dev_model_path)
fhe_api.save(via_mlir=True)

print("Done!")
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/deployment/samples_for_compilation.csv ADDED
The diff for this file is too large to render. See raw diff
 
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/deployment/sentiment_fhe_model/client.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18d71ff210dfcfaeffa62d500eea3930694f2ded438589baa4458f971479ee31
3
+ size 1509958
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/deployment/sentiment_fhe_model/server.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23a0a8ef429c6d990fda93aa7ae786353968b1ec366326c02acbf8897b4f431b
3
+ size 2582
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/deployment/sentiment_fhe_model/versions.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"concrete-python": "2.8.1", "concrete-ml": "1.7.0", "python": "3.10.11"}
hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/hf_repo/deployment/serialized_model ADDED
The diff for this file is too large to render. See raw diff