80cols committed on
Commit
17f70ea
·
verified ·
1 Parent(s): 8d3bec2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +817 -511
app.py CHANGED
@@ -1,503 +1,763 @@
1
- """A Gradio app for anonymizing text data using FHE."""
2
 
3
- import base64
4
- import os
5
- import re
6
- import subprocess
7
- import time
8
- import uuid
9
- from typing import Dict, List
10
 
11
- import gradio as gr
12
- import numpy
13
- import pandas as pd
14
- import requests
15
- from fhe_anonymizer import FHEAnonymizer
16
- from utils_demo import *
17
 
18
- from concrete.ml.deployment import FHEModelClient
19
 
20
 
21
- # Ensure the directory is clean before starting processes or reading files
22
- clean_directory()
23
 
24
- anonymizer = FHEAnonymizer()
 
 
25
 
26
- # Start the Uvicorn server hosting the FastAPI app
27
- subprocess.Popen(["uvicorn", "server:app"], cwd=CURRENT_DIR)
28
- time.sleep(3)
29
 
30
- # Load data from files required for the application
31
- UUID_MAP = read_json(MAPPING_UUID_PATH)
32
- ANONYMIZED_DOCUMENT = read_txt(ANONYMIZED_FILE_PATH)
33
- MAPPING_ANONYMIZED_SENTENCES = read_pickle(MAPPING_ANONYMIZED_SENTENCES_PATH)
34
- MAPPING_ENCRYPTED_SENTENCES = read_pickle(MAPPING_ENCRYPTED_SENTENCES_PATH)
35
- ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
36
- MAPPING_DOC_EMBEDDING = read_pickle(MAPPING_DOC_EMBEDDING_PATH)
37
 
38
- print(f"{ORIGINAL_DOCUMENT=}\n")
39
- print(f"{MAPPING_DOC_EMBEDDING.keys()=}")
40
 
41
- # 4. Data Processing and Operations (No specific operations shown here, assuming it's part of anonymizer or client usage)
 
 
42
 
43
- # 5. Utilizing External Services or APIs
44
- # (Assuming client initialization and anonymizer setup are parts of using external services or application-specific logic)
 
 
 
 
 
45
 
46
- # Generate a random user ID for this session
47
- USER_ID = numpy.random.randint(0, 2**32)
48
 
 
49
 
50
- def select_static_anonymized_sentences_fn(selected_sentences: List):
 
51
 
52
- selected_sentences = [MAPPING_ANONYMIZED_SENTENCES[sentence] for sentence in selected_sentences]
 
53
 
54
- anonymized_selected_sentence = sorted(selected_sentences, key=lambda x: x[0])
55
 
56
- anonymized_selected_sentence = [sentence for _, sentence in anonymized_selected_sentence]
57
 
58
- return "\n\n".join(anonymized_selected_sentence)
59
 
 
60
 
61
- def key_gen_fn() -> Dict:
62
- """Generate keys for a given user."""
63
 
64
- print("------------ Step 1: Key Generation:")
65
 
66
- print(f"Your user ID is: {USER_ID}....")
67
 
 
 
68
 
69
- client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
70
- client.load()
71
 
72
- # Creates the private and evaluation keys on the client side
73
- client.generate_private_and_evaluation_keys()
74
 
75
- # Get the serialized evaluation keys
76
- serialized_evaluation_keys = client.get_serialized_evaluation_keys()
77
- assert isinstance(serialized_evaluation_keys, bytes)
78
 
79
- # Save the evaluation key
80
- evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
81
 
82
- write_bytes(evaluation_key_path, serialized_evaluation_keys)
 
83
 
84
- # anonymizer.generate_key()
 
 
85
 
86
- if not evaluation_key_path.is_file():
87
- error_message = (
88
- f"Error Encountered While generating the evaluation {evaluation_key_path.is_file()=}"
89
- )
90
- print(error_message)
91
- return {gen_key_btn: gr.update(value=error_message)}
92
- else:
93
- print("Keys have been generated ✅")
94
- return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
95
 
 
96
 
97
- def encrypt_doc_fn(doc):
 
 
 
 
 
 
 
 
98
 
99
- print(f"\n------------ Step 2.1: Doc encryption: {doc=}")
100
 
101
- if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
102
- return {encrypted_doc_box: gr.update(value="Error ❌: Please generate the key first!", lines=10)}
103
 
104
- # Retrieve the client API
105
- client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
106
- client.load()
107
 
108
- encrypted_tokens = []
109
- tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+|\$\d+(?:\.\d+)?|\€\d+(?:\.\d+)?)", ' '.join(doc))
 
 
 
 
 
 
 
110
 
111
- for token in tokens:
112
- if token.strip() and re.match(r"\w+", token):
113
- emb_x = MAPPING_DOC_EMBEDDING[token]
114
- assert emb_x.shape == (1, 1024)
115
- encrypted_x = client.quantize_encrypt_serialize(emb_x)
116
- assert isinstance(encrypted_x, bytes)
117
- encrypted_tokens.append(encrypted_x)
118
 
119
- print("Doc encrypted ✅ on Client Side")
120
 
121
- # No need to save it
122
- # write_bytes(KEYS_DIR / f"{USER_ID}/encrypted_doc", b"".join(encrypted_tokens))
123
 
124
- encrypted_quant_tokens_hex = [token.hex()[500:510] for token in encrypted_tokens]
125
 
126
- return {
127
- encrypted_doc_box: gr.update(value=" ".join(encrypted_quant_tokens_hex), lines=10),
128
- anonymized_doc_output: gr.update(visible=True, value=None),
129
- }
130
 
131
 
132
- def encrypt_query_fn(query):
133
 
134
- print(f"\n------------ Step 2: Query encryption: {query=}")
135
 
136
- if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
137
- return {output_encrypted_box: gr.update(value="Error ❌: Please generate the key first!", lines=8)}
138
 
139
- if is_user_query_valid(query):
140
- return {
141
- query_box: gr.update(
142
- value=(
143
- "Unable to process ❌: The request exceeds the length limit or falls "
144
- "outside the scope of this document. Please refine your query."
145
- )
146
- )
147
- }
148
 
149
- # Retrieve the client API
150
- client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
151
- client.load()
152
 
153
- encrypted_tokens = []
154
 
155
- # Pattern to identify words and non-words (including punctuation, spaces, etc.)
156
- tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", query)
157
 
158
- for token in tokens:
159
 
160
- # 1- Ignore non-words tokens
161
- if bool(re.match(r"^\s+$", token)):
162
- continue
163
 
164
- # 2- Directly append non-word tokens or whitespace to processed_tokens
165
 
166
- # Prediction for each word
167
- emb_x = get_batch_text_representation([token], EMBEDDINGS_MODEL, TOKENIZER)
168
- encrypted_x = client.quantize_encrypt_serialize(emb_x)
169
- assert isinstance(encrypted_x, bytes)
170
 
171
- encrypted_tokens.append(encrypted_x)
172
 
173
- print("Data encrypted ✅ on Client Side")
174
 
175
- assert len({len(token) for token in encrypted_tokens}) == 1
176
 
177
- write_bytes(KEYS_DIR / f"{USER_ID}/encrypted_input", b"".join(encrypted_tokens))
178
- write_bytes(
179
- KEYS_DIR / f"{USER_ID}/encrypted_input_len", len(encrypted_tokens[0]).to_bytes(10, "big")
180
- )
181
 
182
- encrypted_quant_tokens_hex = [token.hex()[500:580] for token in encrypted_tokens]
183
 
184
- return {
185
- output_encrypted_box: gr.update(value=" ".join(encrypted_quant_tokens_hex), lines=8),
186
- anonymized_query_output: gr.update(visible=True, value=None),
187
- identified_words_output_df: gr.update(visible=False, value=None),
188
- }
189
 
190
 
191
- def send_input_fn(query) -> Dict:
192
- """Send the encrypted data and the evaluation key to the server."""
193
 
194
- print("------------ Step 3.1: Send encrypted_data to the Server")
195
 
196
- evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
197
- encrypted_input_path = KEYS_DIR / f"{USER_ID}/encrypted_input"
198
- encrypted_input_len_path = KEYS_DIR / f"{USER_ID}/encrypted_input_len"
199
 
200
- if not evaluation_key_path.is_file():
201
- error_message = (
202
- "Error Encountered While Sending Data to the Server: "
203
- f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
204
- )
205
- return {anonymized_query_output: gr.update(value=error_message)}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
- if not encrypted_input_path.is_file():
208
- error_message = (
209
- "Error Encountered While Sending Data to the Server: The data has not been encrypted "
210
- f"correctly on the client side - {encrypted_input_path.is_file()=}"
211
- )
212
- return {anonymized_query_output: gr.update(value=error_message)}
213
 
214
- # Define the data and files to post
215
- data = {"user_id": USER_ID, "input": query}
 
 
 
 
 
 
 
 
 
 
 
 
216
 
217
- files = [
218
- ("files", open(evaluation_key_path, "rb")),
219
- ("files", open(encrypted_input_path, "rb")),
220
- ("files", open(encrypted_input_len_path, "rb")),
221
- ]
 
 
 
 
 
222
 
223
- # Send the encrypted input and evaluation key to the server
224
- url = SERVER_URL + "send_input"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
 
226
- with requests.post(
227
- url=url,
228
- data=data,
229
- files=files,
230
- ) as resp:
231
- print("Data sent to the server ✅" if resp.ok else "Error ❌ in sending data to the server")
232
 
 
 
233
 
234
- def run_fhe_in_server_fn() -> Dict:
235
- """Run in FHE the anonymization of the query"""
236
 
237
- print("------------ Step 3.2: Run in FHE on the Server Side")
238
 
239
- evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
240
- encrypted_input_path = KEYS_DIR / f"{USER_ID}/encrypted_input"
241
 
242
- if not evaluation_key_path.is_file():
243
- error_message = (
244
- "Error Encountered While Sending Data to the Server: "
245
- f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
246
- )
247
- return {anonymized_query_output: gr.update(value=error_message)}
248
 
249
- if not encrypted_input_path.is_file():
250
- error_message = (
251
- "Error Encountered While Sending Data to the Server: The data has not been encrypted "
252
- f"correctly on the client side - {encrypted_input_path.is_file()=}"
253
- )
254
- return {anonymized_query_output: gr.update(value=error_message)}
255
 
256
- data = {
257
- "user_id": USER_ID,
258
- }
 
 
 
 
 
 
259
 
260
- url = SERVER_URL + "run_fhe"
261
 
262
- with requests.post(
263
- url=url,
264
- data=data,
265
- ) as response:
266
- if not response.ok:
267
- return {
268
- anonymized_query_output: gr.update(
269
- value=(
270
- "⚠️ An error occurred on the Server Side. "
271
- "Please check connectivity and data transmission."
272
- ),
273
- ),
274
- }
275
- else:
276
- time.sleep(1)
277
- print(f"The query anonymization was computed in {response.json():.2f} s per token.")
278
 
 
 
 
279
 
280
- def get_output_fn() -> Dict:
 
 
 
 
 
 
 
 
 
 
281
 
282
- print("------------ Step 3.3: Get the output from the Server Side")
 
283
 
284
- if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
285
- error_message = (
286
- "Error Encountered While Sending Data to the Server: "
287
- "The key has not been generated correctly"
288
- )
289
- return {anonymized_query_output: gr.update(value=error_message)}
290
 
291
- if not (KEYS_DIR / f"{USER_ID}/encrypted_input").is_file():
292
- error_message = (
293
- "Error Encountered While Sending Data to the Server: "
294
- "The data has not been encrypted correctly on the client side"
295
- )
296
- return {anonymized_query_output: gr.update(value=error_message)}
297
-
298
- data = {
299
- "user_id": USER_ID,
300
- }
301
-
302
- # Retrieve the encrypted output
303
- url = SERVER_URL + "get_output"
304
- with requests.post(
305
- url=url,
306
- data=data,
307
- ) as response:
308
- if response.ok:
309
- print("Data received ✅ from the remote Server")
310
- response_data = response.json()
311
- encrypted_output_base64 = response_data["encrypted_output"]
312
- length_encrypted_output_base64 = response_data["length"]
313
-
314
- # Decode the base64 encoded data
315
- encrypted_output = base64.b64decode(encrypted_output_base64)
316
- length_encrypted_output = base64.b64decode(length_encrypted_output_base64)
317
-
318
- # Save the encrypted output to bytes in a file as it is too large to pass through
319
- # regular Gradio buttons (see https://github.com/gradio-app/gradio/issues/1877)
320
-
321
- write_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output", encrypted_output)
322
- write_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output_len", length_encrypted_output)
323
-
324
- else:
325
- print("Error ❌ in getting data to the server")
326
-
327
-
328
- def decrypt_fn(text) -> Dict:
329
- """Dencrypt the data on the `Client Side`."""
330
-
331
- print("------------ Step 4: Dencrypt the data on the `Client Side`")
332
-
333
- # Get the encrypted output path
334
- encrypted_output_path = CLIENT_DIR / f"{USER_ID}_encrypted_output"
335
-
336
- if not encrypted_output_path.is_file():
337
- error_message = """⚠️ Please ensure that: \n
338
- - the connectivity \n
339
- - the query has been submitted \n
340
- - the evaluation key has been generated \n
341
- - the server processed the encrypted data \n
342
- - the Client received the data from the Server before decrypting the prediction
343
- """
344
- print(error_message)
345
 
346
- return error_message, None
 
347
 
348
- # Retrieve the client API
349
- client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
350
- client.load()
 
 
 
 
351
 
352
- # Load the encrypted output as bytes
353
- encrypted_output = read_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output")
354
- length = int.from_bytes(read_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output_len"), "big")
355
 
356
- tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", text)
357
 
358
- decrypted_output, identified_words_with_prob = [], []
359
 
360
- i = 0
361
- for token in tokens:
362
 
363
- # Directly append non-word tokens or whitespace to processed_tokens
364
- if bool(re.match(r"^\s+$", token)):
365
- continue
366
- else:
367
- encrypted_token = encrypted_output[i : i + length]
368
- prediction_proba = client.deserialize_decrypt_dequantize(encrypted_token)
369
- probability = prediction_proba[0][1]
370
- i += length
371
 
372
- if probability >= 0.77:
373
- identified_words_with_prob.append((token, probability))
374
 
375
- # Use the existing UUID if available, otherwise generate a new one
376
- tmp_uuid = UUID_MAP.get(token, str(uuid.uuid4())[:8])
377
- decrypted_output.append(tmp_uuid)
378
- UUID_MAP[token] = tmp_uuid
379
- else:
380
- decrypted_output.append(token)
381
 
382
- # Update the UUID map with query.
383
- write_json(MAPPING_UUID_PATH, UUID_MAP)
384
 
385
- # Removing Spaces Before Punctuation:
386
- anonymized_text = re.sub(r"\s([,.!?;:])", r"\1", " ".join(decrypted_output))
387
 
388
- # Convert the list of identified words and probabilities into a DataFrame
389
- if identified_words_with_prob:
390
- identified_df = pd.DataFrame(
391
- identified_words_with_prob, columns=["Identified Words", "Probability"]
392
- )
393
- else:
394
- identified_df = pd.DataFrame(columns=["Identified Words", "Probability"])
395
 
396
- print("Decryption done ✅ on Client Side")
397
 
398
- return anonymized_text, identified_df
399
 
 
400
 
401
- def anonymization_with_fn(selected_sentences, query):
 
 
402
 
403
- encrypt_query_fn(query)
 
 
404
 
405
- send_input_fn(query)
406
 
407
- run_fhe_in_server_fn()
 
 
 
 
 
 
 
 
 
408
 
409
- get_output_fn()
 
 
 
 
 
 
 
 
410
 
411
- anonymized_text, identified_df = decrypt_fn(query)
 
 
412
 
413
- return {
414
- anonymized_doc_output: gr.update(value=select_static_anonymized_sentences_fn(selected_sentences)),
415
- anonymized_query_output: gr.update(value=anonymized_text),
416
- identified_words_output_df: gr.update(value=identified_df, visible=False),
417
- }
418
 
 
 
 
 
 
419
 
420
- def query_chatgpt_fn(anonymized_query, anonymized_document):
 
 
 
 
421
 
422
- print("------------ Step 5: ChatGPT communication")
 
423
 
424
- if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
425
- error_message = "Error ❌: Please generate the key first!"
426
- return {chatgpt_response_anonymized: gr.update(value=error_message)}
427
 
428
- if not (CLIENT_DIR / f"{USER_ID}_encrypted_output").is_file():
429
- error_message = "Error ❌: Please encrypt your query first!"
430
- return {chatgpt_response_anonymized: gr.update(value=error_message)}
431
 
432
- context_prompt = read_txt(PROMPT_PATH)
433
 
434
- # Prepare prompt
435
- query = (
436
- "Document content:\n```\n"
437
- + anonymized_document
438
- + "\n\n```"
439
- + "Query:\n```\n"
440
- + anonymized_query
441
- + "\n```"
442
- )
443
- print(f'Prompt of CHATGPT:\n{query}')
444
-
445
- completion = client.chat.completions.create(
446
- model="gpt-4-1106-preview", # Replace with "gpt-4" if available
447
- messages=[
448
- {"role": "system", "content": context_prompt},
449
- {"role": "user", "content": query},
450
- ],
451
- )
452
- anonymized_response = completion.choices[0].message.content
453
- uuid_map = read_json(MAPPING_UUID_PATH)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454
 
455
- inverse_uuid_map = {
456
- v: k for k, v in uuid_map.items()
457
- } # TODO load the inverse mapping from disk for efficiency
 
 
 
 
 
 
 
458
 
459
- # Pattern to identify words and non-words (including punctuation, spaces, etc.)
460
- tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", anonymized_response)
461
- processed_tokens = []
 
 
462
 
463
- for token in tokens:
464
- # Directly append non-word tokens or whitespace to processed_tokens
465
- if not token.strip() or not re.match(r"\w+", token):
466
- processed_tokens.append(token)
467
- continue
468
 
469
- if token in inverse_uuid_map:
470
- processed_tokens.append(inverse_uuid_map[token])
471
- else:
472
- processed_tokens.append(token)
473
- deanonymized_response = "".join(processed_tokens)
 
474
 
475
- return {chatgpt_response_anonymized: gr.update(value=anonymized_response),
476
- chatgpt_response_deanonymized: gr.update(value=deanonymized_response)}
477
 
 
 
 
 
 
478
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
479
  demo = gr.Blocks(css=".markdown-body { font-size: 18px; }")
480
 
481
  with demo:
 
 
 
 
 
 
 
482
 
483
  gr.Markdown(
484
  """
 
485
  <p align="center">
486
- <img width=200 src="https://user-images.githubusercontent.com/5758427/197816413-d9cddad3-ba38-4793-847d-120975e1da11.png">
487
- </p>
488
- """)
489
-
490
- gr.Markdown(
491
- """
492
- <h1 style="text-align: center;">Encrypted Anonymization Using Fully Homomorphic Encryption</h1>
493
- <p align="center">
494
- <a href="https://github.com/zama-ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/github.png">Concrete-ML</a>
495
 
496
- <a href="https://docs.zama.ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/documentation.png">Documentation</a>
497
-
498
- <a href=" https://community.zama.ai/c/concrete-ml/8"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/community.png">Community</a>
499
-
500
- <a href="https://twitter.com/zama_fhe"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/x.png">@zama_fhe</a>
501
  </p>
502
  """
503
  )
@@ -505,35 +765,61 @@ with demo:
505
  gr.Markdown(
506
  """
507
  <p align="center" style="font-size: 16px;">
508
- Anonymization is the process of removing personally identifiable information (PII) data from
509
- a document in order to protect individual privacy.</p>
 
510
 
511
- <p align="center" style="font-size: 16px;">
512
- Encrypted anonymization uses Fully Homomorphic Encryption (FHE) to anonymize personally
513
- identifiable information (PII) within encrypted documents, enabling computations to be
514
- performed on the encrypted data.</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
515
 
516
- <p align="center" style="font-size: 16px;">
517
- In the example above, we're showing how encrypted anonymization can be leveraged to use LLM
518
- services such as ChatGPT in a privacy-preserving manner.</p>
519
- """
520
- )
521
-
522
  # gr.Markdown(
523
- # """
524
  # <p align="center">
525
- # <img width="75%" height="30%" src="https://raw.githubusercontent.com/kcelia/Img/main/fhe_anonymization_banner.png">
526
  # </p>
527
  # """
528
  # )
529
- gr.Markdown(
530
- f"""
531
- <p align="center">
532
- <img width="75%" height="30%" src="https://huggingface.co/spaces/Tenefix/private-fhe-fraud-detection/resolve/main/Img/schema.png">
533
- </p>
534
- """
535
- )
536
 
 
537
 
538
  ########################## Key Gen Part ##########################
539
 
@@ -549,158 +835,178 @@ with demo:
549
  gen_key_btn = gr.Button("Generate the secret and evaluation keys")
550
 
551
  gen_key_btn.click(
552
- key_gen_fn,
553
  inputs=[],
554
  outputs=[gen_key_btn],
555
- )
556
-
557
- ########################## Main document Part ##########################
558
 
559
  gr.Markdown("<hr />")
560
- gr.Markdown("## Step 2.1: Select the document you want to encrypt\n\n"
561
- """To make it simple, we pre-compiled the following document, but you are free to choose
562
- on which part you want to run this example.
563
- """
564
- )
565
-
566
- with gr.Row():
567
- with gr.Column(scale=5):
568
- original_sentences_box = gr.CheckboxGroup(
569
- ORIGINAL_DOCUMENT,
570
- value=ORIGINAL_DOCUMENT,
571
- label="Contract:",
572
- show_label=True,
573
- )
574
 
575
- with gr.Column(scale=1, min_width=6):
576
- gr.HTML("<div style='height: 77px;'></div>")
577
- encrypt_doc_btn = gr.Button("Encrypt the document")
578
 
579
- with gr.Column(scale=5):
580
- encrypted_doc_box = gr.Textbox(
581
- label="Encrypted document:", show_label=True, interactive=False, lines=10
582
- )
583
-
584
-
585
- ########################## User Query Part ##########################
 
586
 
587
- gr.Markdown("<hr />")
588
- gr.Markdown("## Step 2.2: Select the prompt you want to encrypt\n\n"
589
- """Please choose from the predefined options in
590
- <span style='color:grey'>“Prompt examples”</span> or craft a custom question in
591
- the <span style='color:grey'>“Customized prompt”</span> text box.
592
- Remain concise and relevant to the context. Any off-topic query will not be processed.""")
593
 
594
  with gr.Row():
595
- with gr.Column(scale=5):
596
-
597
- with gr.Column(scale=5):
598
- default_query_box = gr.Dropdown(
599
- list(DEFAULT_QUERIES.values()), label="PROMPT EXAMPLES:"
600
- )
601
 
602
- gr.Markdown("Or")
603
-
604
- query_box = gr.Textbox(
605
- value="What is Kate international bank account number?", label="CUSTOMIZED PROMPT:", interactive=True
 
 
 
 
 
606
  )
607
-
608
- default_query_box.change(
609
- fn=lambda default_query_box: default_query_box,
610
- inputs=[default_query_box],
611
- outputs=[query_box],
 
 
612
  )
613
-
614
- with gr.Column(scale=1, min_width=6):
615
- gr.HTML("<div style='height: 77px;'></div>")
616
- encrypt_query_btn = gr.Button("Encrypt the prompt")
617
- # gr.HTML("<div style='height: 50px;'></div>")
618
-
619
- with gr.Column(scale=5):
620
- output_encrypted_box = gr.Textbox(
621
- label="Encrypted anonymized query that will be sent to the anonymization server:",
622
- lines=8,
 
 
 
 
 
 
 
 
 
 
 
 
 
623
  )
624
 
625
- ########################## FHE processing Part ##########################
626
-
627
- gr.Markdown("<hr />")
628
- gr.Markdown("## Step 3: Anonymize the document and the prompt using FHE")
629
- gr.Markdown(
630
- """Once the client encrypts the document and the prompt locally, it will be sent to a remote
631
- server to perform the anonymization on encrypted data. When the computation is done, the
632
- server will return the result to the client for decryption.
633
- """
 
 
 
 
634
  )
635
 
636
- run_fhe_btn = gr.Button("Anonymize using FHE")
 
 
 
 
 
 
 
 
 
 
 
 
637
 
638
  with gr.Row():
639
- with gr.Column(scale=5):
640
-
641
- anonymized_doc_output = gr.Textbox(
642
- label="Decrypted and anonymized document", lines=10, interactive=True
643
- )
644
-
645
- with gr.Column(scale=5):
646
-
647
- anonymized_query_output = gr.Textbox(
648
- label="Decrypted and anonymized prompt", lines=10, interactive=True
649
  )
650
 
 
 
 
 
 
 
651
 
652
- identified_words_output_df = gr.Dataframe(label="Identified words:", visible=False)
653
 
654
- encrypt_doc_btn.click(
655
- fn=encrypt_doc_fn,
656
- inputs=[original_sentences_box],
657
- outputs=[encrypted_doc_box, anonymized_doc_output],
658
- )
659
 
660
- encrypt_query_btn.click(
661
- fn=encrypt_query_fn,
662
- inputs=[query_box],
663
- outputs=[
664
- query_box,
665
- output_encrypted_box,
666
- anonymized_query_output,
667
- identified_words_output_df,
668
- ],
669
  )
670
 
671
- run_fhe_btn.click(
672
- anonymization_with_fn,
673
- inputs=[original_sentences_box, query_box],
674
- outputs=[anonymized_doc_output, anonymized_query_output, identified_words_output_df],
675
  )
676
 
677
- ########################## ChatGpt Part ##########################
 
678
 
679
  gr.Markdown("<hr />")
680
- gr.Markdown("## Step 4: Send anonymized prompt to ChatGPT")
 
 
 
681
  gr.Markdown(
682
- """After securely anonymizing the query with FHE,
683
- you can forward it to ChatGPT without having any concern about information leakage."""
 
684
  )
685
 
686
- chatgpt_button = gr.Button("Query ChatGPT")
 
 
 
 
687
 
688
- with gr.Row():
689
- chatgpt_response_anonymized = gr.Textbox(label="ChatGPT's anonymized response:", lines=5)
690
- chatgpt_response_deanonymized = gr.Textbox(
691
- label="ChatGPT's non-anonymized response:", lines=5
692
- )
693
 
694
- chatgpt_button.click(
695
- query_chatgpt_fn,
696
- inputs=[anonymized_query_output, anonymized_doc_output],
697
- outputs=[chatgpt_response_anonymized, chatgpt_response_deanonymized],
698
  )
699
 
700
  gr.Markdown(
701
- """**Please note**: As this space is intended solely for demonstration purposes, some
702
- private information may be missed during by the anonymization algorithm. Please validate the
703
- following query before sending it to ChatGPT."""
704
  )
705
- # Launch the app
706
- demo.launch(share=False)
 
 
1
+ # """A Gradio app for anonymizing text data using FHE."""
2
 
3
+ # import os
4
+ # import re
5
+ # import subprocess
6
+ # import time
7
+ # import uuid
8
+ # from typing import Dict, List
 
9
 
10
+ # import numpy
11
+ # import pandas as pd
12
+ # import requests
13
+ # from fhe_anonymizer import FHEAnonymizer
14
+ # from utils_demo import *
 
15
 
16
+ # from concrete.ml.deployment import FHEModelClient
17
 
18
 
 
 
19
 
20
+ # import gradio as gr
21
+ # from predictor import predict, key_already_generated, pre_process_encrypt_send_purchase, decrypt_prediction
22
+ # import base64
23
 
 
 
 
24
 
25
+ # # Ensure the directory is clean before starting processes or reading files
26
+ # clean_directory()
 
 
 
 
 
27
 
28
+ # anonymizer = FHEAnonymizer()
 
29
 
30
+ # # Start the Uvicorn server hosting the FastAPI app
31
+ # subprocess.Popen(["uvicorn", "server:app"], cwd=CURRENT_DIR)
32
+ # time.sleep(3)
33
 
34
+ # # Load data from files required for the application
35
+ # UUID_MAP = read_json(MAPPING_UUID_PATH)
36
+ # ANONYMIZED_DOCUMENT = read_txt(ANONYMIZED_FILE_PATH)
37
+ # MAPPING_ANONYMIZED_SENTENCES = read_pickle(MAPPING_ANONYMIZED_SENTENCES_PATH)
38
+ # MAPPING_ENCRYPTED_SENTENCES = read_pickle(MAPPING_ENCRYPTED_SENTENCES_PATH)
39
+ # ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
40
+ # MAPPING_DOC_EMBEDDING = read_pickle(MAPPING_DOC_EMBEDDING_PATH)
41
 
42
+ # print(f"{ORIGINAL_DOCUMENT=}\n")
43
+ # print(f"{MAPPING_DOC_EMBEDDING.keys()=}")
44
 
45
+ # # 4. Data Processing and Operations (No specific operations shown here, assuming it's part of anonymizer or client usage)
46
 
47
+ # # 5. Utilizing External Services or APIs
48
+ # # (Assuming client initialization and anonymizer setup are parts of using external services or application-specific logic)
49
 
50
+ # # Generate a random user ID for this session
51
+ # USER_ID = numpy.random.randint(0, 2**32)
52
 
 
53
 
54
+ # def select_static_anonymized_sentences_fn(selected_sentences: List):
55
 
56
+ # selected_sentences = [MAPPING_ANONYMIZED_SENTENCES[sentence] for sentence in selected_sentences]
57
 
58
+ # anonymized_selected_sentence = sorted(selected_sentences, key=lambda x: x[0])
59
 
60
+ # anonymized_selected_sentence = [sentence for _, sentence in anonymized_selected_sentence]
 
61
 
62
+ # return "\n\n".join(anonymized_selected_sentence)
63
 
 
64
 
65
+ # def key_gen_fn() -> Dict:
66
+ # """Generate keys for a given user."""
67
 
68
+ # print("------------ Step 1: Key Generation:")
 
69
 
70
+ # print(f"Your user ID is: {USER_ID}....")
 
71
 
 
 
 
72
 
73
+ # client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
74
+ # client.load()
75
 
76
+ # # Creates the private and evaluation keys on the client side
77
+ # client.generate_private_and_evaluation_keys()
78
 
79
+ # # Get the serialized evaluation keys
80
+ # serialized_evaluation_keys = client.get_serialized_evaluation_keys()
81
+ # assert isinstance(serialized_evaluation_keys, bytes)
82
 
83
+ # # Save the evaluation key
84
+ # evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
85
+
86
+ # write_bytes(evaluation_key_path, serialized_evaluation_keys)
 
 
 
 
 
87
 
88
+ # # anonymizer.generate_key()
89
 
90
+ # if not evaluation_key_path.is_file():
91
+ # error_message = (
92
+ # f"Error Encountered While generating the evaluation {evaluation_key_path.is_file()=}"
93
+ # )
94
+ # print(error_message)
95
+ # return {gen_key_btn: gr.update(value=error_message)}
96
+ # else:
97
+ # print("Keys have been generated ✅")
98
+ # return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
99
 
 
100
 
101
+ # def encrypt_doc_fn(doc):
 
102
 
103
+ # print(f"\n------------ Step 2.1: Doc encryption: {doc=}")
 
 
104
 
105
+ # if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
106
+ # return {encrypted_doc_box: gr.update(value="Error ❌: Please generate the key first!", lines=10)}
107
+
108
+ # # Retrieve the client API
109
+ # client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
110
+ # client.load()
111
+
112
+ # encrypted_tokens = []
113
+ # tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+|\$\d+(?:\.\d+)?|\€\d+(?:\.\d+)?)", ' '.join(doc))
114
 
115
+ # for token in tokens:
116
+ # if token.strip() and re.match(r"\w+", token):
117
+ # emb_x = MAPPING_DOC_EMBEDDING[token]
118
+ # assert emb_x.shape == (1, 1024)
119
+ # encrypted_x = client.quantize_encrypt_serialize(emb_x)
120
+ # assert isinstance(encrypted_x, bytes)
121
+ # encrypted_tokens.append(encrypted_x)
122
 
123
+ # print("Doc encrypted ✅ on Client Side")
124
 
125
+ # # No need to save it
126
+ # # write_bytes(KEYS_DIR / f"{USER_ID}/encrypted_doc", b"".join(encrypted_tokens))
127
 
128
+ # encrypted_quant_tokens_hex = [token.hex()[500:510] for token in encrypted_tokens]
129
 
130
+ # return {
131
+ # encrypted_doc_box: gr.update(value=" ".join(encrypted_quant_tokens_hex), lines=10),
132
+ # anonymized_doc_output: gr.update(visible=True, value=None),
133
+ # }
134
 
135
 
136
+ # def encrypt_query_fn(query):
137
 
138
+ # print(f"\n------------ Step 2: Query encryption: {query=}")
139
 
140
+ # if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
141
+ # return {output_encrypted_box: gr.update(value="Error ❌: Please generate the key first!", lines=8)}
142
 
143
+ # if is_user_query_valid(query):
144
+ # return {
145
+ # query_box: gr.update(
146
+ # value=(
147
+ # "Unable to process ❌: The request exceeds the length limit or falls "
148
+ # "outside the scope of this document. Please refine your query."
149
+ # )
150
+ # )
151
+ # }
152
 
153
+ # # Retrieve the client API
154
+ # client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
155
+ # client.load()
156
 
157
+ # encrypted_tokens = []
158
 
159
+ # # Pattern to identify words and non-words (including punctuation, spaces, etc.)
160
+ # tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", query)
161
 
162
+ # for token in tokens:
163
 
164
+ # # 1- Skip whitespace-only tokens
165
+ # if bool(re.match(r"^\s+$", token)):
166
+ # continue
167
 
168
+ # # 2- Directly append non-word tokens or whitespace to processed_tokens
169
 
170
+ # # Prediction for each word
171
+ # emb_x = get_batch_text_representation([token], EMBEDDINGS_MODEL, TOKENIZER)
172
+ # encrypted_x = client.quantize_encrypt_serialize(emb_x)
173
+ # assert isinstance(encrypted_x, bytes)
174
 
175
+ # encrypted_tokens.append(encrypted_x)
176
 
177
+ # print("Data encrypted ✅ on Client Side")
178
 
179
+ # assert len({len(token) for token in encrypted_tokens}) == 1
180
 
181
+ # write_bytes(KEYS_DIR / f"{USER_ID}/encrypted_input", b"".join(encrypted_tokens))
182
+ # write_bytes(
183
+ # KEYS_DIR / f"{USER_ID}/encrypted_input_len", len(encrypted_tokens[0]).to_bytes(10, "big")
184
+ # )
185
 
186
+ # encrypted_quant_tokens_hex = [token.hex()[500:580] for token in encrypted_tokens]
187
 
188
+ # return {
189
+ # output_encrypted_box: gr.update(value=" ".join(encrypted_quant_tokens_hex), lines=8),
190
+ # anonymized_query_output: gr.update(visible=True, value=None),
191
+ # identified_words_output_df: gr.update(visible=False, value=None),
192
+ # }
193
 
194
 
195
+ # def send_input_fn(query) -> Dict:
196
+ # """Send the encrypted data and the evaluation key to the server."""
197
 
198
+ # print("------------ Step 3.1: Send encrypted_data to the Server")
199
 
200
+ # evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
201
+ # encrypted_input_path = KEYS_DIR / f"{USER_ID}/encrypted_input"
202
+ # encrypted_input_len_path = KEYS_DIR / f"{USER_ID}/encrypted_input_len"
203
 
204
+ # if not evaluation_key_path.is_file():
205
+ # error_message = (
206
+ # "Error Encountered While Sending Data to the Server: "
207
+ # f"The key has not been generated correctly - {evaluation_key_path.is_file()=}"
208
+ # )
209
+ # return {anonymized_query_output: gr.update(value=error_message)}
210
+
211
+ # if not encrypted_input_path.is_file():
212
+ # error_message = (
213
+ # "Error Encountered While Sending Data to the Server: The data has not been encrypted "
214
+ # f"correctly on the client side - {encrypted_input_path.is_file()=}"
215
+ # )
216
+ # return {anonymized_query_output: gr.update(value=error_message)}
217
+
218
+ # # Define the data and files to post
219
+ # data = {"user_id": USER_ID, "input": query}
220
+
221
+ # files = [
222
+ # ("files", open(evaluation_key_path, "rb")),
223
+ # ("files", open(encrypted_input_path, "rb")),
224
+ # ("files", open(encrypted_input_len_path, "rb")),
225
+ # ]
226
+
227
+ # # Send the encrypted input and evaluation key to the server
228
+ # url = SERVER_URL + "send_input"
229
+
230
+ # with requests.post(
231
+ # url=url,
232
+ # data=data,
233
+ # files=files,
234
+ # ) as resp:
235
+ # print("Data sent to the server ✅" if resp.ok else "Error ❌ in sending data to the server")
236
 
 
 
 
 
 
 
237
 
238
+ # def run_fhe_in_server_fn() -> Dict:
239
+ # """Run in FHE the anonymization of the query"""
240
+
241
+ # print("------------ Step 3.2: Run in FHE on the Server Side")
242
+
243
+ # evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
244
+ # encrypted_input_path = KEYS_DIR / f"{USER_ID}/encrypted_input"
245
+
246
+ # if not evaluation_key_path.is_file():
247
+ # error_message = (
248
+ # "Error Encountered While Sending Data to the Server: "
249
+ # f"The key has not been generated correctly - {evaluation_key_path.is_file()=}"
250
+ # )
251
+ # return {anonymized_query_output: gr.update(value=error_message)}
252
 
253
+ # if not encrypted_input_path.is_file():
254
+ # error_message = (
255
+ # "Error Encountered While Sending Data to the Server: The data has not been encrypted "
256
+ # f"correctly on the client side - {encrypted_input_path.is_file()=}"
257
+ # )
258
+ # return {anonymized_query_output: gr.update(value=error_message)}
259
+
260
+ # data = {
261
+ # "user_id": USER_ID,
262
+ # }
263
 
264
+ # url = SERVER_URL + "run_fhe"
265
+
266
+ # with requests.post(
267
+ # url=url,
268
+ # data=data,
269
+ # ) as response:
270
+ # if not response.ok:
271
+ # return {
272
+ # anonymized_query_output: gr.update(
273
+ # value=(
274
+ # "⚠️ An error occurred on the Server Side. "
275
+ # "Please check connectivity and data transmission."
276
+ # ),
277
+ # ),
278
+ # }
279
+ # else:
280
+ # time.sleep(1)
281
+ # print(f"The query anonymization was computed in {response.json():.2f} s per token.")
282
+
283
+
284
+ # def get_output_fn() -> Dict:
285
+
286
+ # print("------------ Step 3.3: Get the output from the Server Side")
287
+
288
+ # if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
289
+ # error_message = (
290
+ # "Error Encountered While Sending Data to the Server: "
291
+ # "The key has not been generated correctly"
292
+ # )
293
+ # return {anonymized_query_output: gr.update(value=error_message)}
294
+
295
+ # if not (KEYS_DIR / f"{USER_ID}/encrypted_input").is_file():
296
+ # error_message = (
297
+ # "Error Encountered While Sending Data to the Server: "
298
+ # "The data has not been encrypted correctly on the client side"
299
+ # )
300
+ # return {anonymized_query_output: gr.update(value=error_message)}
301
+
302
+ # data = {
303
+ # "user_id": USER_ID,
304
+ # }
305
+
306
+ # # Retrieve the encrypted output
307
+ # url = SERVER_URL + "get_output"
308
+ # with requests.post(
309
+ # url=url,
310
+ # data=data,
311
+ # ) as response:
312
+ # if response.ok:
313
+ # print("Data received ✅ from the remote Server")
314
+ # response_data = response.json()
315
+ # encrypted_output_base64 = response_data["encrypted_output"]
316
+ # length_encrypted_output_base64 = response_data["length"]
317
 
318
+ # # Decode the base64 encoded data
319
+ # encrypted_output = base64.b64decode(encrypted_output_base64)
320
+ # length_encrypted_output = base64.b64decode(length_encrypted_output_base64)
321
+
322
+ # # Save the encrypted output to bytes in a file as it is too large to pass through
323
+ # # regular Gradio buttons (see https://github.com/gradio-app/gradio/issues/1877)
324
 
325
+ # write_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output", encrypted_output)
326
+ # write_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output_len", length_encrypted_output)
327
 
328
+ # else:
329
+ # print("Error in getting data to the server")
330
 
 
331
 
332
+ # def decrypt_fn(text) -> Dict:
333
+ # """Decrypt the data on the `Client Side`."""
334
 
335
+ # print("------------ Step 4: Decrypt the data on the `Client Side`")
 
 
 
 
 
336
 
337
+ # # Get the encrypted output path
338
+ # encrypted_output_path = CLIENT_DIR / f"{USER_ID}_encrypted_output"
 
 
 
 
339
 
340
+ # if not encrypted_output_path.is_file():
341
+ # error_message = """⚠️ Please ensure that: \n
342
+ # - the connectivity \n
343
+ # - the query has been submitted \n
344
+ # - the evaluation key has been generated \n
345
+ # - the server processed the encrypted data \n
346
+ # - the Client received the data from the Server before decrypting the prediction
347
+ # """
348
+ # print(error_message)
349
 
350
+ # return error_message, None
351
 
352
+ # # Retrieve the client API
353
+ # client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
354
+ # client.load()
355
+
356
+ # # Load the encrypted output as bytes
357
+ # encrypted_output = read_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output")
358
+ # length = int.from_bytes(read_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output_len"), "big")
 
 
 
 
 
 
 
 
 
359
 
360
+ # tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", text)
361
+
362
+ # decrypted_output, identified_words_with_prob = [], []
363
 
364
+ # i = 0
365
+ # for token in tokens:
366
+
367
+ # # Skip whitespace-only tokens: they were never encrypted, so no ciphertext slice exists for them
368
+ # if bool(re.match(r"^\s+$", token)):
369
+ # continue
370
+ # else:
371
+ # encrypted_token = encrypted_output[i : i + length]
372
+ # prediction_proba = client.deserialize_decrypt_dequantize(encrypted_token)
373
+ # probability = prediction_proba[0][1]
374
+ # i += length
375
 
376
+ # if probability >= 0.77:
377
+ # identified_words_with_prob.append((token, probability))
378
 
379
+ # # Use the existing UUID if available, otherwise generate a new one
380
+ # tmp_uuid = UUID_MAP.get(token, str(uuid.uuid4())[:8])
381
+ # decrypted_output.append(tmp_uuid)
382
+ # UUID_MAP[token] = tmp_uuid
383
+ # else:
384
+ # decrypted_output.append(token)
385
 
386
+ # # Update the UUID map with query.
387
+ # write_json(MAPPING_UUID_PATH, UUID_MAP)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
388
 
389
+ # # Removing Spaces Before Punctuation:
390
+ # anonymized_text = re.sub(r"\s([,.!?;:])", r"\1", " ".join(decrypted_output))
391
 
392
+ # # Convert the list of identified words and probabilities into a DataFrame
393
+ # if identified_words_with_prob:
394
+ # identified_df = pd.DataFrame(
395
+ # identified_words_with_prob, columns=["Identified Words", "Probability"]
396
+ # )
397
+ # else:
398
+ # identified_df = pd.DataFrame(columns=["Identified Words", "Probability"])
399
 
400
+ # print("Decryption done on Client Side")
 
 
401
 
402
+ # return anonymized_text, identified_df
403
 
 
404
 
405
+ # def anonymization_with_fn(selected_sentences, query):
 
406
 
407
+ # encrypt_query_fn(query)
 
 
 
 
 
 
 
408
 
409
+ # send_input_fn(query)
 
410
 
411
+ # run_fhe_in_server_fn()
 
 
 
 
 
412
 
413
+ # get_output_fn()
 
414
 
415
+ # anonymized_text, identified_df = decrypt_fn(query)
 
416
 
417
+ # return {
418
+ # anonymized_doc_output: gr.update(value=select_static_anonymized_sentences_fn(selected_sentences)),
419
+ # anonymized_query_output: gr.update(value=anonymized_text),
420
+ # identified_words_output_df: gr.update(value=identified_df, visible=False),
421
+ # }
 
 
422
 
 
423
 
424
+ # def query_chatgpt_fn(anonymized_query, anonymized_document):
425
 
426
+ # print("------------ Step 5: ChatGPT communication")
427
 
428
+ # if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
429
+ # error_message = "Error ❌: Please generate the key first!"
430
+ # return {chatgpt_response_anonymized: gr.update(value=error_message)}
431
 
432
+ # if not (CLIENT_DIR / f"{USER_ID}_encrypted_output").is_file():
433
+ # error_message = "Error ❌: Please encrypt your query first!"
434
+ # return {chatgpt_response_anonymized: gr.update(value=error_message)}
435
 
436
+ # context_prompt = read_txt(PROMPT_PATH)
437
 
438
+ # # Prepare prompt
439
+ # query = (
440
+ # "Document content:\n```\n"
441
+ # + anonymized_document
442
+ # + "\n\n```"
443
+ # + "Query:\n```\n"
444
+ # + anonymized_query
445
+ # + "\n```"
446
+ # )
447
+ # print(f'Prompt of CHATGPT:\n{query}')
448
 
449
+ # completion = client.chat.completions.create(
450
+ # model="gpt-4-1106-preview", # Replace with "gpt-4" if available
451
+ # messages=[
452
+ # {"role": "system", "content": context_prompt},
453
+ # {"role": "user", "content": query},
454
+ # ],
455
+ # )
456
+ # anonymized_response = completion.choices[0].message.content
457
+ # uuid_map = read_json(MAPPING_UUID_PATH)
458
 
459
+ # inverse_uuid_map = {
460
+ # v: k for k, v in uuid_map.items()
461
+ # } # TODO load the inverse mapping from disk for efficiency
462
 
463
+ # # Pattern to identify words and non-words (including punctuation, spaces, etc.)
464
+ # tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", anonymized_response)
465
+ # processed_tokens = []
 
 
466
 
467
+ # for token in tokens:
468
+ # # Directly append non-word tokens or whitespace to processed_tokens
469
+ # if not token.strip() or not re.match(r"\w+", token):
470
+ # processed_tokens.append(token)
471
+ # continue
472
 
473
+ # if token in inverse_uuid_map:
474
+ # processed_tokens.append(inverse_uuid_map[token])
475
+ # else:
476
+ # processed_tokens.append(token)
477
+ # deanonymized_response = "".join(processed_tokens)
478
 
479
+ # return {chatgpt_response_anonymized: gr.update(value=anonymized_response),
480
+ # chatgpt_response_deanonymized: gr.update(value=deanonymized_response)}
481
 
 
 
 
482
 
483
+ # demo = gr.Blocks(css=".markdown-body { font-size: 18px; }")
 
 
484
 
485
+ # with demo:
486
 
487
+ # gr.Markdown(
488
+ # """
489
+ # <p align="center">
490
+ # <img width=200 src="https://user-images.githubusercontent.com/5758427/197816413-d9cddad3-ba38-4793-847d-120975e1da11.png">
491
+ # </p>
492
+ # """)
493
+
494
+ # gr.Markdown(
495
+ # """
496
+ # <h1 style="text-align: center;">Encrypted Anonymization Using Fully Homomorphic Encryption</h1>
497
+ # <p align="center">
498
+ # <a href="https://github.com/zama-ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/github.png">Concrete-ML</a>
499
+ #
500
+ # <a href="https://docs.zama.ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/documentation.png">Documentation</a>
501
+ # —
502
+ # <a href=" https://community.zama.ai/c/concrete-ml/8"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/community.png">Community</a>
503
+ # —
504
+ # <a href="https://twitter.com/zama_fhe"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/x.png">@zama_fhe</a>
505
+ # </p>
506
+ # """
507
+ # )
508
+
509
+ # gr.Markdown(
510
+ # """
511
+ # <p align="center" style="font-size: 16px;">
512
+ # Anonymization is the process of removing personally identifiable information (PII) data from
513
+ # a document in order to protect individual privacy.</p>
514
+
515
+ # <p align="center" style="font-size: 16px;">
516
+ # Encrypted anonymization uses Fully Homomorphic Encryption (FHE) to anonymize personally
517
+ # identifiable information (PII) within encrypted documents, enabling computations to be
518
+ # performed on the encrypted data.</p>
519
+
520
+ # <p align="center" style="font-size: 16px;">
521
+ # In the example above, we're showing how encrypted anonymization can be leveraged to use LLM
522
+ # services such as ChatGPT in a privacy-preserving manner.</p>
523
+ # """
524
+ # )
525
+
526
+ # # gr.Markdown(
527
+ # # """
528
+ # # <p align="center">
529
+ # # <img width="75%" height="30%" src="https://raw.githubusercontent.com/kcelia/Img/main/fhe_anonymization_banner.png">
530
+ # # </p>
531
+ # # """
532
+ # # )
533
+ # gr.Markdown(
534
+ # f"""
535
+ # <p align="center">
536
+ # <img width="75%" height="30%" src="https://huggingface.co/spaces/Tenefix/private-fhe-fraud-detection/resolve/main/Img/schema.png">
537
+ # </p>
538
+ # """
539
+ # )
540
+
541
+
542
+ # ########################## Key Gen Part ##########################
543
+
544
+ # gr.Markdown(
545
+ # "## Step 1: Generate the keys\n\n"
546
+ # """In Fully Homomorphic Encryption (FHE) methods, two types of keys are created. The first
547
+ # type, called secret keys, are used to encrypt and decrypt the user's data. The second type,
548
+ # called evaluation keys, enables a server to work on the encrypted data without seeing the
549
+ # actual data.
550
+ # """
551
+ # )
552
+
553
+ # gen_key_btn = gr.Button("Generate the secret and evaluation keys")
554
+
555
+ # gen_key_btn.click(
556
+ # key_gen_fn,
557
+ # inputs=[],
558
+ # outputs=[gen_key_btn],
559
+ # )
560
+
561
+ # ########################## Main document Part ##########################
562
+
563
+ # gr.Markdown("<hr />")
564
+ # gr.Markdown("## Step 2.1: Select the document you want to encrypt\n\n"
565
+ # """To make it simple, we pre-compiled the following document, but you are free to choose
566
+ # on which part you want to run this example.
567
+ # """
568
+ # )
569
+
570
+ # with gr.Row():
571
+ # with gr.Column(scale=5):
572
+ # original_sentences_box = gr.CheckboxGroup(
573
+ # ORIGINAL_DOCUMENT,
574
+ # value=ORIGINAL_DOCUMENT,
575
+ # label="Contract:",
576
+ # show_label=True,
577
+ # )
578
+
579
+ # with gr.Column(scale=1, min_width=6):
580
+ # gr.HTML("<div style='height: 77px;'></div>")
581
+ # encrypt_doc_btn = gr.Button("Encrypt the document")
582
+
583
+ # with gr.Column(scale=5):
584
+ # encrypted_doc_box = gr.Textbox(
585
+ # label="Encrypted document:", show_label=True, interactive=False, lines=10
586
+ # )
587
+
588
+
589
+ # ########################## User Query Part ##########################
590
+
591
+ # gr.Markdown("<hr />")
592
+ # gr.Markdown("## Step 2.2: Select the prompt you want to encrypt\n\n"
593
+ # """Please choose from the predefined options in
594
+ # <span style='color:grey'>“Prompt examples”</span> or craft a custom question in
595
+ # the <span style='color:grey'>“Customized prompt”</span> text box.
596
+ # Remain concise and relevant to the context. Any off-topic query will not be processed.""")
597
+
598
+ # with gr.Row():
599
+ # with gr.Column(scale=5):
600
+
601
+ # with gr.Column(scale=5):
602
+ # default_query_box = gr.Dropdown(
603
+ # list(DEFAULT_QUERIES.values()), label="PROMPT EXAMPLES:"
604
+ # )
605
+
606
+ # gr.Markdown("Or")
607
+
608
+ # query_box = gr.Textbox(
609
+ # value="What is Kate international bank account number?", label="CUSTOMIZED PROMPT:", interactive=True
610
+ # )
611
+
612
+ # default_query_box.change(
613
+ # fn=lambda default_query_box: default_query_box,
614
+ # inputs=[default_query_box],
615
+ # outputs=[query_box],
616
+ # )
617
+
618
+ # with gr.Column(scale=1, min_width=6):
619
+ # gr.HTML("<div style='height: 77px;'></div>")
620
+ # encrypt_query_btn = gr.Button("Encrypt the prompt")
621
+ # # gr.HTML("<div style='height: 50px;'></div>")
622
+
623
+ # with gr.Column(scale=5):
624
+ # output_encrypted_box = gr.Textbox(
625
+ # label="Encrypted anonymized query that will be sent to the anonymization server:",
626
+ # lines=8,
627
+ # )
628
+
629
+ # ########################## FHE processing Part ##########################
630
+
631
+ # gr.Markdown("<hr />")
632
+ # gr.Markdown("## Step 3: Anonymize the document and the prompt using FHE")
633
+ # gr.Markdown(
634
+ # """Once the client encrypts the document and the prompt locally, it will be sent to a remote
635
+ # server to perform the anonymization on encrypted data. When the computation is done, the
636
+ # server will return the result to the client for decryption.
637
+ # """
638
+ # )
639
+
640
+ # run_fhe_btn = gr.Button("Anonymize using FHE")
641
+
642
+ # with gr.Row():
643
+ # with gr.Column(scale=5):
644
+
645
+ # anonymized_doc_output = gr.Textbox(
646
+ # label="Decrypted and anonymized document", lines=10, interactive=True
647
+ # )
648
+
649
+ # with gr.Column(scale=5):
650
+
651
+ # anonymized_query_output = gr.Textbox(
652
+ # label="Decrypted and anonymized prompt", lines=10, interactive=True
653
+ # )
654
+
655
+
656
+ # identified_words_output_df = gr.Dataframe(label="Identified words:", visible=False)
657
+
658
+ # encrypt_doc_btn.click(
659
+ # fn=encrypt_doc_fn,
660
+ # inputs=[original_sentences_box],
661
+ # outputs=[encrypted_doc_box, anonymized_doc_output],
662
+ # )
663
 
664
+ # encrypt_query_btn.click(
665
+ # fn=encrypt_query_fn,
666
+ # inputs=[query_box],
667
+ # outputs=[
668
+ # query_box,
669
+ # output_encrypted_box,
670
+ # anonymized_query_output,
671
+ # identified_words_output_df,
672
+ # ],
673
+ # )
674
 
675
+ # run_fhe_btn.click(
676
+ # anonymization_with_fn,
677
+ # inputs=[original_sentences_box, query_box],
678
+ # outputs=[anonymized_doc_output, anonymized_query_output, identified_words_output_df],
679
+ # )
680
 
681
+ # ########################## ChatGpt Part ##########################
 
 
 
 
682
 
683
+ # gr.Markdown("<hr />")
684
+ # gr.Markdown("## Step 4: Send anonymized prompt to ChatGPT")
685
+ # gr.Markdown(
686
+ # """After securely anonymizing the query with FHE,
687
+ # you can forward it to ChatGPT without having any concern about information leakage."""
688
+ # )
689
 
690
+ # chatgpt_button = gr.Button("Query ChatGPT")
 
691
 
692
+ # with gr.Row():
693
+ # chatgpt_response_anonymized = gr.Textbox(label="ChatGPT's anonymized response:", lines=5)
694
+ # chatgpt_response_deanonymized = gr.Textbox(
695
+ # label="ChatGPT's non-anonymized response:", lines=5
696
+ # )
697
 
698
+ # chatgpt_button.click(
699
+ # query_chatgpt_fn,
700
+ # inputs=[anonymized_query_output, anonymized_doc_output],
701
+ # outputs=[chatgpt_response_anonymized, chatgpt_response_deanonymized],
702
+ # )
703
+
704
+ # gr.Markdown(
705
+ # """**Please note**: As this space is intended solely for demonstration purposes, some
706
+ # private information may be missed by the anonymization algorithm. Please validate the
707
+ # following query before sending it to ChatGPT."""
708
+ # )
709
+ # # Launch the app
710
+ # demo.launch(share=False)
711
+
712
+
713
+
714
+
715
+
716
+
717
+
718
+ import gradio as gr
719
+ from predictor import predict, key_already_generated, pre_process_encrypt_send_purchase, decrypt_prediction
720
+ import base64
721
+
722
+ def key_generated():
723
+ """
724
+ Check if the evaluation keys have already been generated.
725
+ Returns:
726
+ dict: A Gradio update for `gen_key_btn` whose value reports whether the evaluation keys were generated.
727
+ """
728
+ if not key_already_generated():
729
+ error_message = (
730
+ f"Error Encountered While generating the evaluation keys."
731
+ )
732
+ print(error_message)
733
+ return {gen_key_btn: gr.update(value=error_message)}
734
+ else:
735
+ print("Keys have been generated ✅")
736
+ return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
737
+
738
+
739
  demo = gr.Blocks(css=".markdown-body { font-size: 18px; }")
740
 
741
  with demo:
742
+ with gr.Row():
743
+ with gr.Column(elem_id="center_column"):
744
+ gr.Image("Img/zama.png", width=200, show_label=False)
745
+ with gr.Column(elem_id="center_column"):
746
+ gr.Image("Img/Epita.png", width=200, show_label=False)
747
+
748
+
749
 
750
  gr.Markdown(
751
  """
752
+ <h1 style="text-align: center;">Fraud Detection with FHE Model</h1>
753
  <p align="center">
754
+ <a href="https://github.com/CirSandro/private-fhe-fraud-detection">
755
+ <span style="vertical-align: middle; display:inline-block; margin-right: 3px;">💳</span>private-fhe-fraud-detection
756
+ </a>
 
 
 
 
 
 
757
 
758
+ <a href="https://docs.zama.ai/concrete-ml">
759
+ <span style="vertical-align: middle; display:inline-block; margin-right: 3px;">🔒</span>Documentation Concrete-ML
760
+ </a>
 
 
761
  </p>
762
  """
763
  )
 
765
  gr.Markdown(
766
  """
767
  <p align="center" style="font-size: 16px;">
768
+ How to detect bank fraud without using your personal data ?</p>
769
+ """
770
+ )
771
 
772
+ with gr.Accordion("What is bank fraud detection?", open=False):
773
+ gr.Markdown(
774
+ """
775
+ Bank fraud detection is the process of identifying fraudulent activities or transactions
776
+ that may pose a risk to a bank or its customers. It is essential to detect fraudulent
777
+ activities to prevent financial losses and protect the integrity of the banking system.
778
+ """
779
+ )
780
+
781
+ with gr.Accordion("Why is it important to protect this data?", open=False):
782
+ gr.Markdown(
783
+ """
784
+ Banking and financial data often contain sensitive personal information, such as income,
785
+ spending habits, and account numbers. Protecting this information ensures that customers'
786
+ privacy is respected and safeguarded from unauthorized access.
787
+ """
788
+ )
789
+
790
+ with gr.Accordion("Why is Fully Homomorphic Encryption (FHE) a good solution?", open=False):
791
+ gr.Markdown(
792
+ """
793
+ Fully Homomorphic Encryption (FHE) is a powerful technique for enhancing privacy and accuracy
794
+ in the context of fraud detection, particularly when dealing with sensitive banking data. FHE
795
+ allows for the encryption of data, which can then be processed and analyzed without ever needing
796
+ to decrypt it.
797
+ Each party involved in the detection process can collaborate without compromising user privacy,
798
+ minimizing the risk of data leaks or breaches. The data remains confidential throughout the entire
799
+ process, ensuring that the privacy of users is maintained.
800
+ """
801
+ )
802
+
803
+ gr.Markdown(
804
+ """
805
+ <p style="text-align: center;">
806
+ Below, we will explain the flow in the image by simulating a purchase you've just made, and show you how our fraud detection model processes the transaction.
807
+ </p>
808
+ """
809
+ )
810
 
 
 
 
 
 
 
811
  # gr.Markdown(
812
+ # f"""
813
  # <p align="center">
814
+ # <img width="75%" height="30%" src="https://huggingface.co/spaces/Tenefix/private-fhe-fraud-detection/resolve/main/Img/schema.png">
815
  # </p>
816
  # """
817
  # )
818
+ with gr.Row():
819
+ with gr.Column(elem_id="center_column"):
820
+ gr.Image("Img/schema.png", width=200, show_label=False)
 
 
 
 
821
 
822
+ gr.Markdown("<hr />")
823
 
824
  ########################## Key Gen Part ##########################
825
 
 
835
  gen_key_btn = gr.Button("Generate the secret and evaluation keys")
836
 
837
  gen_key_btn.click(
838
+ key_generated,
839
  inputs=[],
840
  outputs=[gen_key_btn],
841
+ )#547
 
 
842
 
843
  gr.Markdown("<hr />")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
844
 
845
+ ########################## Encrypt Data ##########################
 
 
846
 
847
+ gr.Markdown(
848
+ "## Step 2: Make your purchase\n\n"
849
+ """
850
+ 🛍️ It's time to shop! To simulate your latest purchase, please provide the details of your most recent transaction.
851
+
852
+ If you don't have an idea, you can pre-fill with an example of fraud or non-fraud.
853
+ """
854
+ )
855
 
856
+ def prefill_fraud():
857
+ return 34, 50, 3, False, False, False, True
858
+
859
+ def prefill_no_fraud():
860
+ return 12, 2, 0.7, True, False, True, False
 
861
 
862
  with gr.Row():
863
+ prefill_button = gr.Button("Exemple Fraud")
864
+ prefill_button_no = gr.Button("Exemple No-Fraud")
 
 
 
 
865
 
866
+ with gr.Row():
867
+ with gr.Column():
868
+ distance_home = gr.Number(
869
+ minimum=float(0),
870
+ maximum=float(22000),
871
+ step=1,
872
+ value=10,
873
+ label="Distance from Home",
874
+ info="How far was the purchase from your home (in km)?"
875
  )
876
+ distance_last = gr.Number(
877
+ minimum=float(0),
878
+ maximum=float(22000),
879
+ step=1,
880
+ value=1,
881
+ label="Distance from Last Transaction",
882
+ info="Distance between this purchase and the last one (in km)?"
883
  )
884
+ ratio = gr.Number(
885
+ minimum=float(0),
886
+ maximum=float(10000),
887
+ step=0.1,
888
+ value=1,
889
+ label="Ratio to Median Purchase Price",
890
+ info="Purchase ratio compared to your average purchase",
891
+ )
892
+ repeat_retailer = gr.Checkbox(
893
+ label="Repeat Retailer",
894
+ info="Check if you are purchasing from the same retailer as your last transaction"
895
+ )
896
+ used_chip = gr.Checkbox(
897
+ label="Used Chip",
898
+ info="Check if you used a chip card for this transaction"
899
+ )
900
+ used_pin_number = gr.Checkbox(
901
+ label="Used Pin Number",
902
+ info="Check if you used your PIN number during the transaction"
903
+ )
904
+ online = gr.Checkbox(
905
+ label="Online Order",
906
+ info="Check if you made your purchase online"
907
  )
908
 
909
+
910
+ prefill_button.click(
911
+ fn=prefill_fraud,
912
+ inputs=[],
913
+ outputs=[
914
+ distance_home,
915
+ distance_last,
916
+ ratio,
917
+ repeat_retailer,
918
+ used_chip,
919
+ used_pin_number,
920
+ online
921
+ ]
922
  )
923
 
924
+ prefill_button_no.click(
925
+ fn=prefill_no_fraud,
926
+ inputs=[],
927
+ outputs=[
928
+ distance_home,
929
+ distance_last,
930
+ ratio,
931
+ repeat_retailer,
932
+ used_chip,
933
+ used_pin_number,
934
+ online
935
+ ]
936
+ )
937
 
938
  with gr.Row():
939
+ with gr.Column(scale=2):
940
+ encrypt_button_applicant = gr.Button("Encrypt the inputs and send to server.")
941
+
942
+ encrypted_input_applicant = gr.Textbox(
943
+ label="Encrypted input representation:", max_lines=2, interactive=False
 
 
 
 
 
944
  )
945
 
946
+ encrypt_button_applicant.click(
947
+ pre_process_encrypt_send_purchase,
948
+ inputs=[distance_home, distance_last, ratio, repeat_retailer, used_chip, used_pin_number, \
949
+ online],
950
+ outputs=[encrypted_input_applicant, encrypt_button_applicant],
951
+ )
952
 
953
+ gr.Markdown("<hr />")
954
 
955
+ ########################## Model Prediction ##########################
 
 
 
 
956
 
957
+ gr.Markdown("## Step 3: Run the FHE evaluation.")
958
+ gr.Markdown("<span style='color:grey'>Server Side</span>")
959
+ gr.Markdown(
960
+ """
961
+ It's high time to launch our prediction, by pressing the button you will launch the
962
+ fraud analysis that our fictitious bank offers you.
963
+ This server employs a [Random Forest (by Concrete-ML)](https://github.com/zama-ai/concrete-ml/blob/release/1.8.x/docs/references/api/concrete.ml.sklearn.rf.md#class-randomforestclassifier)
964
+ classifier model that has been trained on a synthetic data-set.
965
+ """
966
  )
967
 
968
+ execute_fhe_button = gr.Button("Run the FHE evaluation.")
969
+ fhe_execution_time = gr.Textbox(
970
+ label="Total FHE execution time (in seconds):", max_lines=1, interactive=False
 
971
  )
972
 
973
+ # Button to send the encodings to the server using post method
974
+ execute_fhe_button.click(predict, outputs=[fhe_execution_time, execute_fhe_button])
975
 
976
  gr.Markdown("<hr />")
977
+
978
+ ########################## Decrypt Prediction ##########################
979
+
980
+ gr.Markdown("## Step 4: Receive the encrypted output from the server and decrypt.")
981
  gr.Markdown(
982
+ """
983
+ 🔔 You will receive a notification! Is this a Fraud? The message is decrypted by pressing the button.
984
+ """
985
  )
986
 
987
+ get_output_button = gr.Button("Decrypt the prediction.")
988
+ prediction_output = gr.Textbox(
989
+ label="Prediction", max_lines=1, interactive=False
990
+ )
991
+ prediction_bar = gr.HTML(label="Prediction Bar") # For the percentage bar
992
 
993
+ get_output_button.click(
994
+ decrypt_prediction,
995
+ outputs=[prediction_output, get_output_button, prediction_bar],
996
+ )
997
+
998
 
999
+ gr.Markdown(
1000
+ """
1001
+ You now know that it is possible to detect bank fraud without knowing your personal information.
1002
+ """
1003
  )
1004
 
1005
  gr.Markdown(
1006
+ "The app was built with [Concrete-ML](https://github.com/zama-ai/concrete-ml), a "
1007
+ "Privacy-Preserving Machine Learning (PPML) open-source set of tools by [Zama](https://zama.ai/). "
1008
+ "Try it yourself and don't forget to star on Github &#11088;."
1009
  )
1010
+
1011
+ if __name__ == "__main__":
1012
+ demo.launch()