Sparkonix committed on
Commit
b8cffa7
·
1 Parent(s): 70a901d

added db to store email for later demasking

Browse files
.env ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Database configuration - using SQLite instead of PostgreSQL
2
+ DATABASE_PATH=emails.db
3
+
4
+ # API configuration
5
+ PORT=8000
6
+
7
+ # Model configuration
8
+ MODEL_PATH=Sparkonix/email-classifier-model
Dockerfile CHANGED
@@ -14,13 +14,17 @@ COPY . .
14
  # Set environment variables
15
  ENV PORT=7860
16
  ENV MODEL_PATH="Sparkonix/email-classifier-model"
17
- # Replace YOUR_ACTUAL_USERNAME with your Hugging Face username after uploading the model
 
 
18
 
19
  # Add this line to set cache location to a writable directory
20
  ENV HF_HOME="/app/.cache/huggingface"
21
 
22
  # Create the Hugging Face cache directory and set permissions
23
  RUN mkdir -p /app/.cache/huggingface && chmod -R 777 /app/.cache/huggingface
 
 
24
 
25
  # Expose the port
26
  EXPOSE 7860
 
14
  # Set environment variables
15
  ENV PORT=7860
16
  ENV MODEL_PATH="Sparkonix/email-classifier-model"
17
+
18
+ # Location of the SQLite database file (inside the /data directory created below)
19
+ ENV DATABASE_PATH="/data/emails.db"
20
 
21
  # Add this line to set cache location to a writable directory
22
  ENV HF_HOME="/app/.cache/huggingface"
23
 
24
  # Create the Hugging Face cache directory and set permissions
25
  RUN mkdir -p /app/.cache/huggingface && chmod -R 777 /app/.cache/huggingface
26
+ # Create data directory for SQLite
27
+ RUN mkdir -p /data && chmod -R 777 /data
28
 
29
  # Expose the port
30
  EXPOSE 7860
Email_Classification_API_Tests.postman_collection.json ADDED
@@ -0,0 +1,596 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "info": {
3
+ "_postman_id": "7e5f2c6b-3e5d-4e40-a8f6-abc9f0c92a72",
4
+ "name": "Email Classification API Tests",
5
+ "description": "Tests for the Email Classification API hosted on Hugging Face Spaces",
6
+ "schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json"
7
+ },
8
+ "variable": [
9
+ {
10
+ "key": "baseUrl",
11
+ "value": "https://sparkonix-email-classification-model.hf.space",
12
+ "type": "string"
13
+ },
14
+ {
15
+ "key": "emailId",
16
+ "value": ""
17
+ },
18
+ {
19
+ "key": "accessKey",
20
+ "value": ""
21
+ },
22
+ {
23
+ "key": "piiEmailId",
24
+ "value": ""
25
+ },
26
+ {
27
+ "key": "piiAccessKey",
28
+ "value": ""
29
+ }
30
+ ],
31
+ "item": [
32
+ {
33
+ "name": "Health Check",
34
+ "request": {
35
+ "method": "GET",
36
+ "header": [],
37
+ "url": {
38
+ "raw": "{{baseUrl}}/health",
39
+ "host": [
40
+ "{{baseUrl}}"
41
+ ],
42
+ "path": [
43
+ "health"
44
+ ]
45
+ },
46
+ "description": "Check if the API is running"
47
+ },
48
+ "event": [
49
+ {
50
+ "listen": "test",
51
+ "script": {
52
+ "exec": [
53
+ "// Check if the response status is 200 OK",
54
+ "pm.test(\"Status code is 200\", function() {",
55
+ " pm.response.to.have.status(200);",
56
+ "});",
57
+ "",
58
+ "// Check if the health check returns the correct structure",
59
+ "pm.test(\"Health check returns correct structure\", function() {",
60
+ " var jsonData = pm.response.json();",
61
+ " pm.expect(jsonData).to.have.property(\"status\");",
62
+ " pm.expect(jsonData).to.have.property(\"message\");",
63
+ " pm.expect(jsonData.status).to.eql(\"healthy\");",
64
+ "});"
65
+ ],
66
+ "type": "text/javascript"
67
+ }
68
+ }
69
+ ],
70
+ "response": []
71
+ },
72
+ {
73
+ "name": "Basic Email Classification",
74
+ "request": {
75
+ "method": "POST",
76
+ "header": [],
77
+ "body": {
78
+ "mode": "raw",
79
+ "raw": "{\n \"input_email_body\": \"Hi, I am experiencing a problem with my login. The system is showing an error when I try to log in. Please help.\"\n}",
80
+ "options": {
81
+ "raw": {
82
+ "language": "json"
83
+ }
84
+ }
85
+ },
86
+ "url": {
87
+ "raw": "{{baseUrl}}/classify",
88
+ "host": [
89
+ "{{baseUrl}}"
90
+ ],
91
+ "path": [
92
+ "classify"
93
+ ]
94
+ },
95
+ "description": "Basic email classification test"
96
+ },
97
+ "event": [
98
+ {
99
+ "listen": "test",
100
+ "script": {
101
+ "exec": [
102
+ "// Check if the response status is 200 OK",
103
+ "pm.test(\"Status code is 200\", function() {",
104
+ " pm.response.to.have.status(200);",
105
+ "});",
106
+ "",
107
+ "// Check if the response structure is correct",
108
+ "pm.test(\"Response has correct structure\", function() {",
109
+ " var jsonData = pm.response.json();",
110
+ " pm.expect(jsonData).to.have.property(\"input_email_body\");",
111
+ " pm.expect(jsonData).to.have.property(\"masked_email\");",
112
+ " pm.expect(jsonData).to.have.property(\"list_of_masked_entities\");",
113
+ " pm.expect(jsonData).to.have.property(\"category_of_the_email\");",
114
+ "});",
115
+ "",
116
+ "// Check if the input email matches what we sent",
117
+ "pm.test(\"Input email matches request\", function() {",
118
+ " var jsonData = pm.response.json();",
119
+ " pm.expect(jsonData.input_email_body).to.eql(",
120
+ " \"Hi, I am experiencing a problem with my login. The system is showing an error when I try to log in. Please help.\"",
121
+ " );",
122
+ "});",
123
+ "",
124
+ "// Check if the category is one of the expected values",
125
+ "pm.test(\"Category is valid\", function() {",
126
+ " var jsonData = pm.response.json();",
127
+ " pm.expect([\"Incident\", \"Request\", \"Change\", \"Problem\"]).to.include(jsonData.category_of_the_email);",
128
+ "});",
129
+ "",
130
+ "// Store the email_id and access_key for database retrieval tests",
131
+ "var responseJson = pm.response.json();",
132
+ "if (responseJson.email_id && responseJson.access_key) {",
133
+ " pm.collectionVariables.set(\"emailId\", responseJson.email_id);",
134
+ " pm.collectionVariables.set(\"accessKey\", responseJson.access_key);",
135
+ " console.log(\"Stored email_id and access_key for retrieval tests\");",
136
+ "}"
137
+ ],
138
+ "type": "text/javascript"
139
+ }
140
+ }
141
+ ],
142
+ "response": []
143
+ },
144
+ {
145
+ "name": "Email with PII Classification",
146
+ "request": {
147
+ "method": "POST",
148
+ "header": [],
149
+ "body": {
150
+ "mode": "raw",
151
+ "raw": "{\n \"input_email_body\": \"Hello, my name is John Smith and my email is john.smith@example.com. I am having issues with my account. My phone number is 555-123-4567 and my credit card ending in 1234 seems to have been charged twice. Please help.\"\n}",
152
+ "options": {
153
+ "raw": {
154
+ "language": "json"
155
+ }
156
+ }
157
+ },
158
+ "url": {
159
+ "raw": "{{baseUrl}}/classify",
160
+ "host": [
161
+ "{{baseUrl}}"
162
+ ],
163
+ "path": [
164
+ "classify"
165
+ ]
166
+ },
167
+ "description": "Email classification with PII masking test"
168
+ },
169
+ "event": [
170
+ {
171
+ "listen": "test",
172
+ "script": {
173
+ "exec": [
174
+ "// Check if the response status is 200 OK",
175
+ "pm.test(\"Status code is 200\", function() {",
176
+ " pm.response.to.have.status(200);",
177
+ "});",
178
+ "",
179
+ "// Check if PII is being masked",
180
+ "pm.test(\"PII is masked properly\", function() {",
181
+ " var jsonData = pm.response.json();",
182
+ " pm.expect(jsonData.masked_email).to.not.include(\"John Smith\");",
183
+ " pm.expect(jsonData.masked_email).to.not.include(\"john.smith@example.com\");",
184
+ " pm.expect(jsonData.masked_email).to.not.include(\"555-123-4567\");",
185
+ " pm.expect(jsonData.masked_email).to.not.include(\"1234\");",
186
+ "});",
187
+ "",
188
+ "// Check if masked entities list is populated",
189
+ "pm.test(\"Masked entities list is populated\", function() {",
190
+ " var jsonData = pm.response.json();",
191
+ " pm.expect(jsonData.list_of_masked_entities).to.be.an(\"array\").that.is.not.empty;",
192
+ "});",
193
+ "",
194
+ "// Store the email_id and access_key for database retrieval tests specifically for PII emails",
195
+ "var responseJson = pm.response.json();",
196
+ "if (responseJson.email_id && responseJson.access_key) {",
197
+ " pm.collectionVariables.set(\"piiEmailId\", responseJson.email_id);",
198
+ " pm.collectionVariables.set(\"piiAccessKey\", responseJson.access_key);",
199
+ " console.log(\"Stored PII email_id and access_key for retrieval tests\");",
200
+ "}"
201
+ ],
202
+ "type": "text/javascript"
203
+ }
204
+ }
205
+ ],
206
+ "response": []
207
+ },
208
+ {
209
+ "name": "Incident Email Classification",
210
+ "request": {
211
+ "method": "POST",
212
+ "header": [],
213
+ "body": {
214
+ "mode": "raw",
215
+ "raw": "{\n \"input_email_body\": \"URGENT: The application is down. None of our users can access the system. This is causing severe business impact and we need immediate attention. Our customer operations are halted.\"\n}",
216
+ "options": {
217
+ "raw": {
218
+ "language": "json"
219
+ }
220
+ }
221
+ },
222
+ "url": {
223
+ "raw": "{{baseUrl}}/classify",
224
+ "host": [
225
+ "{{baseUrl}}"
226
+ ],
227
+ "path": [
228
+ "classify"
229
+ ]
230
+ },
231
+ "description": "Test for incident classification"
232
+ },
233
+ "event": [
234
+ {
235
+ "listen": "test",
236
+ "script": {
237
+ "exec": [
238
+ "// Check if the response status is 200 OK",
239
+ "pm.test(\"Status code is 200\", function() {",
240
+ " pm.response.to.have.status(200);",
241
+ "});",
242
+ "",
243
+ "// This should likely be classified as an Incident",
244
+ "pm.test(\"Should be classified as an Incident\", function() {",
245
+ " var jsonData = pm.response.json();",
246
+ " pm.expect(jsonData.category_of_the_email).to.eql(\"Incident\");",
247
+ "});"
248
+ ],
249
+ "type": "text/javascript"
250
+ }
251
+ }
252
+ ],
253
+ "response": []
254
+ },
255
+ {
256
+ "name": "Request Email Classification",
257
+ "request": {
258
+ "method": "POST",
259
+ "header": [],
260
+ "body": {
261
+ "mode": "raw",
262
+ "raw": "{\n \"input_email_body\": \"I would like to request access to the financial reporting system for my new team member. Their details are as follows: Name: Jane Doe, Department: Finance, Employee ID: 12345. Please provide access by end of week.\"\n}",
263
+ "options": {
264
+ "raw": {
265
+ "language": "json"
266
+ }
267
+ }
268
+ },
269
+ "url": {
270
+ "raw": "{{baseUrl}}/classify",
271
+ "host": [
272
+ "{{baseUrl}}"
273
+ ],
274
+ "path": [
275
+ "classify"
276
+ ]
277
+ },
278
+ "description": "Test for request classification"
279
+ },
280
+ "event": [
281
+ {
282
+ "listen": "test",
283
+ "script": {
284
+ "exec": [
285
+ "// Check if the response status is 200 OK",
286
+ "pm.test(\"Status code is 200\", function() {",
287
+ " pm.response.to.have.status(200);",
288
+ "});",
289
+ "",
290
+ "// This should likely be classified as a Request",
291
+ "pm.test(\"Should be classified as a Request\", function() {",
292
+ " var jsonData = pm.response.json();",
293
+ " pm.expect(jsonData.category_of_the_email).to.eql(\"Request\");",
294
+ "});",
295
+ "",
296
+ "// Check if PII is masked",
297
+ "pm.test(\"Name should be masked\", function() {",
298
+ " var jsonData = pm.response.json();",
299
+ " pm.expect(jsonData.masked_email).to.not.include(\"Jane Doe\");",
300
+ "});"
301
+ ],
302
+ "type": "text/javascript"
303
+ }
304
+ }
305
+ ],
306
+ "response": []
307
+ },
308
+ {
309
+ "name": "Change Email Classification",
310
+ "request": {
311
+ "method": "POST",
312
+ "header": [],
313
+ "body": {
314
+ "mode": "raw",
315
+ "raw": "{\n \"input_email_body\": \"We need to change the configuration of the production server to increase memory allocation. Please schedule this change for the next maintenance window this Sunday at 2 AM. Approval has been granted by the IT Director.\"\n}",
316
+ "options": {
317
+ "raw": {
318
+ "language": "json"
319
+ }
320
+ }
321
+ },
322
+ "url": {
323
+ "raw": "{{baseUrl}}/classify",
324
+ "host": [
325
+ "{{baseUrl}}"
326
+ ],
327
+ "path": [
328
+ "classify"
329
+ ]
330
+ },
331
+ "description": "Test for change classification"
332
+ },
333
+ "event": [
334
+ {
335
+ "listen": "test",
336
+ "script": {
337
+ "exec": [
338
+ "// Check if the response status is 200 OK",
339
+ "pm.test(\"Status code is 200\", function() {",
340
+ " pm.response.to.have.status(200);",
341
+ "});",
342
+ "",
343
+ "// This should likely be classified as a Change",
344
+ "pm.test(\"Should be classified as a Change\", function() {",
345
+ " var jsonData = pm.response.json();",
346
+ " pm.expect(jsonData.category_of_the_email).to.eql(\"Change\");",
347
+ "});"
348
+ ],
349
+ "type": "text/javascript"
350
+ }
351
+ }
352
+ ],
353
+ "response": []
354
+ },
355
+ {
356
+ "name": "Problem Email Classification",
357
+ "request": {
358
+ "method": "POST",
359
+ "header": [],
360
+ "body": {
361
+ "mode": "raw",
362
+ "raw": "{\n \"input_email_body\": \"We have noticed that the application has been running slow for the past week. This happens consistently during peak hours (10 AM - 2 PM). Can you investigate the root cause of this ongoing performance issue?\"\n}",
363
+ "options": {
364
+ "raw": {
365
+ "language": "json"
366
+ }
367
+ }
368
+ },
369
+ "url": {
370
+ "raw": "{{baseUrl}}/classify",
371
+ "host": [
372
+ "{{baseUrl}}"
373
+ ],
374
+ "path": [
375
+ "classify"
376
+ ]
377
+ },
378
+ "description": "Test for problem classification"
379
+ },
380
+ "event": [
381
+ {
382
+ "listen": "test",
383
+ "script": {
384
+ "exec": [
385
+ "// Check if the response status is 200 OK",
386
+ "pm.test(\"Status code is 200\", function() {",
387
+ " pm.response.to.have.status(200);",
388
+ "});",
389
+ "",
390
+ "// This should likely be classified as a Problem",
391
+ "pm.test(\"Should be classified as a Problem\", function() {",
392
+ " var jsonData = pm.response.json();",
393
+ " pm.expect(jsonData.category_of_the_email).to.eql(\"Problem\");",
394
+ "});"
395
+ ],
396
+ "type": "text/javascript"
397
+ }
398
+ }
399
+ ],
400
+ "response": []
401
+ },
402
+ {
403
+ "name": "Empty Email Test",
404
+ "request": {
405
+ "method": "POST",
406
+ "header": [],
407
+ "body": {
408
+ "mode": "raw",
409
+ "raw": "{\n \"input_email_body\": \"\"\n}",
410
+ "options": {
411
+ "raw": {
412
+ "language": "json"
413
+ }
414
+ }
415
+ },
416
+ "url": {
417
+ "raw": "{{baseUrl}}/classify",
418
+ "host": [
419
+ "{{baseUrl}}"
420
+ ],
421
+ "path": [
422
+ "classify"
423
+ ]
424
+ },
425
+ "description": "Test with empty email body"
426
+ },
427
+ "event": [
428
+ {
429
+ "listen": "test",
430
+ "script": {
431
+ "exec": [
432
+ "// The API should either return an error or a classification",
433
+ "pm.test(\"Response status is valid\", function() {",
434
+ " pm.expect(pm.response.code).to.be.oneOf([200, 400, 422, 500]);",
435
+ " ",
436
+ " if (pm.response.code === 200) {",
437
+ " var jsonData = pm.response.json();",
438
+ " pm.expect(jsonData).to.have.property(\"category_of_the_email\");",
439
+ " } else {",
440
+ " // If it is an error response, make sure it has a proper structure",
441
+ " var jsonData = pm.response.json();",
442
+ " pm.expect(jsonData).to.have.property(\"detail\");",
443
+ " }",
444
+ "});"
445
+ ],
446
+ "type": "text/javascript"
447
+ }
448
+ }
449
+ ],
450
+ "response": []
451
+ },
452
+ {
453
+ "name": "Retrieve Original Email",
454
+ "request": {
455
+ "method": "POST",
456
+ "header": [],
457
+ "body": {
458
+ "mode": "raw",
459
+ "raw": "{\n \"email_id\": \"{{piiEmailId}}\",\n \"access_key\": \"{{piiAccessKey}}\"\n}",
460
+ "options": {
461
+ "raw": {
462
+ "language": "json"
463
+ }
464
+ }
465
+ },
466
+ "url": {
467
+ "raw": "{{baseUrl}}/api/v1/original-email/retrieve",
468
+ "host": [
469
+ "{{baseUrl}}"
470
+ ],
471
+ "path": [
472
+ "api",
473
+ "v1",
474
+ "original-email",
475
+ "retrieve"
476
+ ]
477
+ },
478
+ "description": "Retrieve the original unmasked email from the database"
479
+ },
480
+ "event": [
481
+ {
482
+ "listen": "test",
483
+ "script": {
484
+ "exec": [
485
+ "// Check if the response status is 200 OK",
486
+ "pm.test(\"Status code is 200\", function() {",
487
+ " pm.response.to.have.status(200);",
488
+ "});",
489
+ "",
490
+ "// Check if the response has the correct structure",
491
+ "pm.test(\"Response has correct structure\", function() {",
492
+ " var jsonData = pm.response.json();",
493
+ " pm.expect(jsonData).to.have.property(\"status\");",
494
+ " pm.expect(jsonData).to.have.property(\"data\");",
495
+ " pm.expect(jsonData).to.have.property(\"message\");",
496
+ " pm.expect(jsonData.status).to.eql(\"success\");",
497
+ "});",
498
+ "",
499
+ "// Check if the data contains the original email with PII",
500
+ "pm.test(\"Data contains original email with PII\", function() {",
501
+ " var jsonData = pm.response.json();",
502
+ " pm.expect(jsonData.data).to.have.property(\"original_email\");",
503
+ " pm.expect(jsonData.data).to.have.property(\"masked_email\");",
504
+ " pm.expect(jsonData.data).to.have.property(\"masked_entities\");",
505
+ " ",
506
+ " // Check that the original email contains the PII (for the PII test email)",
507
+ " if (pm.collectionVariables.get(\"piiEmailId\")) {",
508
+ " pm.expect(jsonData.data.original_email).to.include(\"John Smith\");",
509
+ " pm.expect(jsonData.data.original_email).to.include(\"john.smith@example.com\");",
510
+ " pm.expect(jsonData.data.original_email).to.include(\"555-123-4567\");",
511
+ " }",
512
+ "});"
513
+ ],
514
+ "type": "text/javascript"
515
+ }
516
+ }
517
+ ],
518
+ "response": []
519
+ },
520
+ {
521
+ "name": "Retrieve With Invalid Access Key",
522
+ "request": {
523
+ "method": "POST",
524
+ "header": [],
525
+ "body": {
526
+ "mode": "raw",
527
+ "raw": "{\n \"email_id\": \"{{piiEmailId}}\",\n \"access_key\": \"invalid_access_key_123456\"\n}",
528
+ "options": {
529
+ "raw": {
530
+ "language": "json"
531
+ }
532
+ }
533
+ },
534
+ "url": {
535
+ "raw": "{{baseUrl}}/api/v1/original-email/retrieve",
536
+ "host": [
537
+ "{{baseUrl}}"
538
+ ],
539
+ "path": [
540
+ "api",
541
+ "v1",
542
+ "original-email",
543
+ "retrieve"
544
+ ]
545
+ },
546
+ "description": "Test security by attempting to retrieve email with invalid access key"
547
+ },
548
+ "event": [
549
+ {
550
+ "listen": "test",
551
+ "script": {
552
+ "exec": [
553
+ "// Check that we get an error (404) for invalid access key",
554
+ "pm.test(\"Should return error for invalid access key\", function() {",
555
+ " pm.expect(pm.response.code).to.equal(404);",
556
+ "});",
557
+ "",
558
+ "// Check error message",
559
+ "pm.test(\"Response contains appropriate error message\", function() {",
560
+ " var jsonData = pm.response.json();",
561
+ " pm.expect(jsonData).to.have.property(\"detail\");",
562
+ " pm.expect(jsonData.detail).to.include(\"not found\");",
563
+ "});"
564
+ ],
565
+ "type": "text/javascript"
566
+ }
567
+ }
568
+ ],
569
+ "response": []
570
+ }
571
+ ],
572
+ "event": [
573
+ {
574
+ "listen": "prerequest",
575
+ "script": {
576
+ "type": "text/javascript",
577
+ "exec": [
578
+ "// Reset the tests passed flag before each request",
579
+ "pm.variables.set(\"testsPassed\", true);"
580
+ ]
581
+ }
582
+ },
583
+ {
584
+ "listen": "test",
585
+ "script": {
586
+ "type": "text/javascript",
587
+ "exec": [
588
+ "// Update the tests passed flag if any tests failed",
589
+ "if (pm.test.allTests.filter(test => !test.passed).length > 0) {",
590
+ " pm.variables.set(\"testsPassed\", false);",
591
+ "}"
592
+ ]
593
+ }
594
+ }
595
+ ]
596
+ }
database.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Database module for handling email storage operations with SQLite.
3
+ """
4
+ import os
5
+ import json
6
+ import sqlite3
7
+ from typing import Dict, Any, Optional, List, Tuple
8
+ from datetime import datetime
9
+ import uuid
10
+ import hashlib
11
+
12
+
13
class EmailDatabase:
    """
    SQLite-backed store for emails and their PII-masked counterparts.

    Each row of the ``emails`` table holds the original text, the masked text,
    the list of masked entities (serialized as a JSON string), an optional
    category, a creation timestamp and a random access key. The access key
    must be presented together with the record ID to read the original text.
    """

    def __init__(self, connection_string: Optional[str] = None):
        """
        Resolve the database path, prepare its directory and create the schema.

        Args:
            connection_string: Path of the SQLite file. When omitted (or
                empty), the DATABASE_PATH environment variable is used, and
                when that is unset or empty, "/data/emails.db".
        """
        # The original author notes that /data persists between restarts on
        # Hugging Face Spaces; that is why it is the default location.
        # An empty path must not reach sqlite3.connect(): SQLite treats ""
        # as "private temporary database", which would silently lose data.
        self.db_path = (
            connection_string
            or os.environ.get("DATABASE_PATH")
            or "/data/emails.db"
        )

        self._ensure_data_directory()
        self._create_tables()

    def _ensure_data_directory(self):
        """
        Make sure the directory holding the database exists and is writable.

        If the directory cannot be created or written to, ``self.db_path`` is
        switched to "emails.db" in the current working directory and a warning
        is printed.
        """
        data_dir = os.path.dirname(self.db_path)
        if not data_dir:
            # Bare file name: the database lives in the working directory.
            return

        try:
            os.makedirs(data_dir, exist_ok=True)
            usable = os.access(data_dir, os.W_OK)
        except OSError:
            # Covers PermissionError and "path exists but is not a directory".
            usable = False

        if not usable:
            self.db_path = "emails.db"
            print(f"Warning: Using fallback database path: {self.db_path}")

    def _get_connection(self):
        """Open and return a new SQLite connection to ``self.db_path``."""
        return sqlite3.connect(self.db_path)

    def _create_tables(self):
        """Create the ``emails`` table and its access-key index if missing."""
        conn = self._get_connection()
        try:
            # As a context manager the connection commits on success and
            # rolls back if the block raises; it does NOT close itself.
            with conn:
                conn.execute('''
                    CREATE TABLE IF NOT EXISTS emails (
                        id TEXT PRIMARY KEY,
                        original_email TEXT NOT NULL,
                        masked_email TEXT NOT NULL,
                        masked_entities TEXT NOT NULL,
                        category TEXT,
                        created_at TEXT NOT NULL,
                        access_key TEXT NOT NULL
                    )
                ''')
                conn.execute('CREATE INDEX IF NOT EXISTS idx_access_key ON emails (access_key)')
        finally:
            conn.close()

    def _generate_id(self) -> str:
        """Return a new random UUID4 string to use as the record ID."""
        return str(uuid.uuid4())

    def _generate_access_key(self, email_id: str) -> str:
        """
        Generate the secret key required to read an original email.

        Args:
            email_id: ID of the record the key belongs to. Kept for interface
                compatibility; the key is purely random and does not depend
                on it.

        Returns:
            A 64-character lowercase hex string carrying 256 bits of
            randomness from the operating system's CSPRNG.
        """
        import secrets  # local import: only this class needs it

        return secrets.token_hex(32)

    def store_email(self, original_email: str, masked_email: str,
                    masked_entities: List[Dict[str, Any]], category: Optional[str] = None) -> Tuple[str, str]:
        """
        Persist an email together with its masked version.

        Args:
            original_email: The original email text, including PII.
            masked_email: The masked version of the email.
            masked_entities: Entities that were masked; stored as JSON text.
            category: Optional category of the email.

        Returns:
            Tuple of (email_id, access_key) needed to read the record later.

        Raises:
            sqlite3.Error: If the insert fails (the transaction is rolled back).
            TypeError: If ``masked_entities`` is not JSON-serializable.
        """
        email_id = self._generate_id()
        access_key = self._generate_access_key(email_id)
        record = (
            email_id,
            original_email,
            masked_email,
            json.dumps(masked_entities),  # SQLite has no list type
            category,
            datetime.now().isoformat(),
            access_key,
        )

        conn = self._get_connection()
        try:
            with conn:  # commit on success, rollback on error
                conn.execute(
                    'INSERT INTO emails (id, original_email, masked_email, masked_entities, category, created_at, access_key) '
                    'VALUES (?, ?, ?, ?, ?, ?, ?)',
                    record,
                )
        finally:
            conn.close()

        return email_id, access_key

    def get_original_email(self, email_id: str, access_key: str) -> Optional[Dict[str, Any]]:
        """
        Retrieve a record, including the original PII-containing text.

        Args:
            email_id: The ID of the email record.
            access_key: The key returned by ``store_email`` for that record.

        Returns:
            Dictionary with the keys "id", "original_email", "masked_email",
            "masked_entities", "category" and "created_at"; or None when the
            ID is unknown or the access key does not match.
        """
        import secrets  # local import: only this class needs it

        if not isinstance(access_key, str):
            return None

        conn = self._get_connection()
        try:
            row = conn.execute(
                'SELECT id, original_email, masked_email, masked_entities, category, created_at, access_key '
                'FROM emails WHERE id = ?',
                (email_id,)
            ).fetchone()
        finally:
            conn.close()

        if row is None:
            return None

        # Compare the secret in constant time rather than inside the SQL
        # WHERE clause. Encoding to bytes keeps compare_digest from rejecting
        # non-ASCII input with a TypeError.
        if not secrets.compare_digest(row[6].encode("utf-8"), access_key.encode("utf-8")):
            return None

        return {
            "id": row[0],
            "original_email": row[1],
            "masked_email": row[2],
            "masked_entities": json.loads(row[3]),  # JSON text back to Python objects
            "category": row[4],
            "created_at": row[5]
        }

    def get_email_by_id(self, email_id: str) -> Optional[Dict[str, Any]]:
        """
        Retrieve the masked data of a record, without the original text.

        Args:
            email_id: The ID of the email record.

        Returns:
            Dictionary with the keys "id", "masked_email", "masked_entities",
            "category" and "created_at"; or None when the ID is unknown.
        """
        conn = self._get_connection()
        try:
            row = conn.execute(
                'SELECT id, masked_email, masked_entities, category, created_at '
                'FROM emails WHERE id = ?',
                (email_id,)
            ).fetchone()
        finally:
            conn.close()

        if row is None:
            return None

        return {
            "id": row[0],
            "masked_email": row[1],
            "masked_entities": json.loads(row[2]),  # JSON text back to Python objects
            "category": row[3],
            "created_at": row[4]
        }
docker-compose.yml CHANGED
@@ -7,7 +7,12 @@ services:
7
  - "8000:7860"
8
  volumes:
9
  - .:/app
 
10
  environment:
11
  - PORT=7860
12
  - HF_HOME=/app/.cache/huggingface
13
- restart: unless-stopped
 
 
 
 
 
7
  - "8000:7860"
8
  volumes:
9
  - .:/app
10
+ - sqlite_data:/data # Mount a volume for persistent SQLite data
11
  environment:
12
  - PORT=7860
13
  - HF_HOME=/app/.cache/huggingface
14
+ - DATABASE_PATH=/data/emails.db
15
+ restart: unless-stopped
16
+
17
+ volumes:
18
+ sqlite_data: # Define the volume for SQLite database
main.py CHANGED
@@ -3,17 +3,30 @@ from fastapi import FastAPI, HTTPException
3
  from pydantic import BaseModel
4
  from typing import Dict, Any, List, Tuple, Optional
5
  import uvicorn
 
6
 
7
  from utils import PIIMasker
8
  from models import EmailClassifier
9
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  # Initialize the FastAPI application
11
  app = FastAPI(title="Email Classification API",
12
  description="API for classifying support emails and masking PII",
13
  version="1.0.0")
14
 
15
  # Initialize the PII masker and email classifier
16
- pii_masker = PIIMasker()
17
  email_classifier = EmailClassifier()
18
 
19
  class EmailInput(BaseModel):
@@ -33,6 +46,11 @@ class EmailOutput(BaseModel):
33
  masked_email: str
34
  category_of_the_email: str
35
 
 
 
 
 
 
36
  @app.post("/classify", response_model=EmailOutput)
37
  async def classify_email(email_input: EmailInput) -> Dict[str, Any]:
38
  """
@@ -51,10 +69,46 @@ async def classify_email(email_input: EmailInput) -> Dict[str, Any]:
51
  # Classify the masked email
52
  classified_data = email_classifier.process_email(processed_data)
53
 
54
- return classified_data
 
 
 
 
 
 
55
  except Exception as e:
56
  raise HTTPException(status_code=500, detail=f"Error processing email: {str(e)}")
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  @app.get("/health")
59
  async def health_check():
60
  """
 
3
  from pydantic import BaseModel
4
  from typing import Dict, Any, List, Tuple, Optional
5
  import uvicorn
6
from utils import PIIMasker
from models import EmailClassifier

# Load environment variables from a .env file when python-dotenv is available.
# The import has to live inside the try block: a module-level
# `from dotenv import load_dotenv` raises ImportError before any handler
# around the load_dotenv() call could catch it.
try:
    from dotenv import load_dotenv
except ImportError:
    pass  # dotenv might not be installed in production
else:
    load_dotenv()

# Resolve the SQLite database path.
# An explicit DATABASE_PATH (set in .env, the Dockerfile and docker-compose)
# takes precedence. Otherwise use /data when that directory exists (the
# original author notes it is persistent storage on Hugging Face Spaces),
# and fall back to the local directory when it does not.
db_path = os.environ.get("DATABASE_PATH")
if not db_path:
    if os.path.exists('/data'):
        db_path = "/data/emails.db"
    else:
        db_path = "emails.db"  # Fallback to local directory
22
+
23
  # Initialize the FastAPI application
24
  app = FastAPI(title="Email Classification API",
25
  description="API for classifying support emails and masking PII",
26
  version="1.0.0")
27
 
28
  # Initialize the PII masker and email classifier
29
+ pii_masker = PIIMasker(db_path=db_path)
30
  email_classifier = EmailClassifier()
31
 
32
  class EmailInput(BaseModel):
 
46
  masked_email: str
47
  category_of_the_email: str
48
 
49
class EmailRetrievalInput(BaseModel):
    """Request body for fetching a stored original email."""
    # ID of the stored email record.
    email_id: str
    # Secret key that must accompany the ID for the lookup to succeed.
    access_key: str
53
+
54
  @app.post("/classify", response_model=EmailOutput)
55
  async def classify_email(email_input: EmailInput) -> Dict[str, Any]:
56
  """
 
69
  # Classify the masked email
70
  classified_data = email_classifier.process_email(processed_data)
71
 
72
+ # Make sure we return only the fields expected in the response model
73
+ return {
74
+ "input_email_body": email_input.input_email_body,
75
+ "list_of_masked_entities": classified_data["list_of_masked_entities"],
76
+ "masked_email": classified_data["masked_email"],
77
+ "category_of_the_email": classified_data["category_of_the_email"]
78
+ }
79
  except Exception as e:
80
  raise HTTPException(status_code=500, detail=f"Error processing email: {str(e)}")
81
 
82
@app.post("/api/v1/original-email/retrieve", response_model=Dict[str, Any])
async def retrieve_original_email_v1(retrieval_input: EmailRetrievalInput) -> Dict[str, Any]:
    """
    Return the stored, unmasked email for a matching ID / access-key pair.

    Args:
        retrieval_input: Request body carrying `email_id` and `access_key`.

    Returns:
        A dict with "status", "data" (whatever
        `pii_masker.get_original_email` returned) and "message".

    Raises:
        HTTPException: 404 when the lookup returns nothing, 500 when the
            lookup itself raises.
    """
    try:
        record = pii_masker.get_original_email(
            retrieval_input.email_id,
            retrieval_input.access_key,
        )
        if not record:
            raise HTTPException(status_code=404, detail="Email not found or invalid access key")
    except HTTPException:
        # Let the deliberate 404 through instead of wrapping it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error retrieving email: {str(e)}")

    return {
        "status": "success",
        "data": record,
        "message": "Original email retrieved successfully",
    }
111
+
112
  @app.get("/health")
113
  async def health_check():
114
  """
requirements.in CHANGED
@@ -1,52 +1,74 @@
1
  # This file was autogenerated by uv via the following command:
2
  # uv pip compile requirements.in -o requirements.txt
3
  annotated-types==0.7.0
4
- # via pydantic
 
 
5
  anyio==4.9.0
6
- # via starlette
 
 
7
  blis==1.3.0
8
- # via thinc
 
 
9
  catalogue==2.0.10
10
  # via
 
11
  # spacy
12
  # srsly
13
  # thinc
14
  certifi==2025.4.26
15
- # via requests
 
 
16
  charset-normalizer==3.4.2
17
- # via requests
 
 
18
  click==8.2.0
19
  # via
 
20
  # typer
21
  # uvicorn
22
  cloudpathlib==0.21.1
23
- # via weasel
 
 
24
  confection==0.1.5
25
  # via
 
26
  # thinc
27
  # weasel
28
  cymem==2.0.11
29
  # via
 
30
  # preshed
31
  # spacy
32
  # thinc
33
  en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
34
  # via -r requirements.in
35
  exceptiongroup==1.3.0
36
- # via anyio
 
 
37
  fastapi==0.115.12
38
  # via -r requirements.in
39
  filelock==3.18.0
40
  # via
 
41
  # huggingface-hub
42
  # torch
43
  # transformers
44
  fsspec==2025.3.2
45
  # via
 
46
  # huggingface-hub
47
  # torch
48
  h11==0.16.0
49
- # via uvicorn
 
 
50
  huggingface-hub==0.31.2
51
  # via
52
  # -r requirements.in
@@ -54,79 +76,130 @@ huggingface-hub==0.31.2
54
  # transformers
55
  idna==3.10
56
  # via
 
57
  # anyio
58
  # requests
59
  jinja2==3.1.6
60
  # via
 
61
  # spacy
62
  # torch
63
  langcodes==3.5.0
64
- # via spacy
 
 
65
  language-data==1.3.0
66
- # via langcodes
 
 
67
  marisa-trie==1.2.1
68
- # via language-data
 
 
69
  markdown-it-py==3.0.0
70
- # via rich
 
 
71
  markupsafe==3.0.2
72
- # via jinja2
 
 
73
  mdurl==0.1.2
74
- # via markdown-it-py
 
 
75
  mpmath==1.3.0
76
- # via sympy
 
 
77
  murmurhash==1.0.12
78
  # via
 
79
  # preshed
80
  # spacy
81
  # thinc
82
  networkx==3.4.2
83
- # via torch
 
 
84
  numpy==2.2.5
85
  # via
 
86
  # blis
87
  # spacy
88
  # spacy-transformers
89
  # thinc
90
  # transformers
 
 
 
 
 
91
  nvidia-cublas-cu12==12.6.4.1
92
  # via
 
93
  # nvidia-cudnn-cu12
94
  # nvidia-cusolver-cu12
95
  # torch
96
  nvidia-cuda-cupti-cu12==12.6.80
97
- # via torch
 
 
98
  nvidia-cuda-nvrtc-cu12==12.6.77
99
- # via torch
 
 
100
  nvidia-cuda-runtime-cu12==12.6.77
101
- # via torch
 
 
102
  nvidia-cudnn-cu12==9.5.1.17
103
- # via torch
 
 
104
  nvidia-cufft-cu12==11.3.0.4
105
- # via torch
 
 
106
  nvidia-cufile-cu12==1.11.1.6
107
- # via torch
 
 
108
  nvidia-curand-cu12==10.3.7.77
109
- # via torch
 
 
110
  nvidia-cusolver-cu12==11.7.1.2
111
- # via torch
 
 
112
  nvidia-cusparse-cu12==12.5.4.2
113
  # via
 
114
  # nvidia-cusolver-cu12
115
  # torch
116
  nvidia-cusparselt-cu12==0.6.3
117
- # via torch
 
 
118
  nvidia-nccl-cu12==2.26.2
119
- # via torch
 
 
120
  nvidia-nvjitlink-cu12==12.6.85
121
  # via
 
122
  # nvidia-cufft-cu12
123
  # nvidia-cusolver-cu12
124
  # nvidia-cusparse-cu12
125
  # torch
126
  nvidia-nvtx-cu12==12.6.77
127
- # via torch
 
 
128
  packaging==25.0
129
  # via
 
130
  # huggingface-hub
131
  # spacy
132
  # thinc
@@ -134,6 +207,7 @@ packaging==25.0
134
  # weasel
135
  preshed==3.0.9
136
  # via
 
137
  # spacy
138
  # thinc
139
  pydantic==2.11.4
@@ -145,74 +219,109 @@ pydantic==2.11.4
145
  # thinc
146
  # weasel
147
  pydantic-core==2.33.2
148
- # via pydantic
 
 
149
  pygments==2.19.1
150
- # via rich
 
 
151
  python-multipart==0.0.20
152
  # via -r requirements.in
153
  pyyaml==6.0.2
154
  # via
 
155
  # huggingface-hub
156
  # transformers
157
  regex==2024.11.6
158
- # via transformers
 
 
159
  requests==2.32.3
160
  # via
 
161
  # huggingface-hub
162
  # spacy
163
  # transformers
164
  # weasel
165
  rich==14.0.0
166
- # via typer
 
 
167
  safetensors==0.5.3
168
  # via
169
  # -r requirements.in
170
  # transformers
 
 
171
  setuptools==80.7.1
172
  # via
 
173
  # marisa-trie
174
  # spacy
175
  # thinc
176
  # triton
177
  shellingham==1.5.4
178
- # via typer
 
 
179
  smart-open==7.1.0
180
- # via weasel
 
 
181
  sniffio==1.3.1
182
- # via anyio
 
 
183
  spacy==3.8.5
184
  # via
185
  # -r requirements.in
186
  # spacy-transformers
187
  spacy-alignments==0.9.1
188
- # via spacy-transformers
 
 
189
  spacy-legacy==3.0.12
190
- # via spacy
 
 
191
  spacy-loggers==1.0.5
192
- # via spacy
 
 
193
  spacy-transformers==1.3.8
194
  # via -r requirements.in
195
  srsly==2.5.1
196
  # via
 
197
  # confection
198
  # spacy
199
  # spacy-transformers
200
  # thinc
201
  # weasel
202
  starlette==0.46.2
203
- # via fastapi
 
 
204
  sympy==1.14.0
205
- # via torch
 
 
206
  thinc==8.3.6
207
- # via spacy
 
 
208
  tokenizers==0.21.1
209
- # via transformers
 
 
210
  torch==2.7.0
211
  # via
212
  # -r requirements.in
213
  # spacy-transformers
214
  tqdm==4.67.1
215
  # via
 
216
  # huggingface-hub
217
  # spacy
218
  # transformers
@@ -221,13 +330,17 @@ transformers==4.49.0
221
  # -r requirements.in
222
  # spacy-transformers
223
  triton==3.3.0
224
- # via torch
 
 
225
  typer==0.15.3
226
  # via
 
227
  # spacy
228
  # weasel
229
  typing-extensions==4.13.2
230
  # via
 
231
  # anyio
232
  # cloudpathlib
233
  # exceptiongroup
@@ -241,20 +354,28 @@ typing-extensions==4.13.2
241
  # typing-inspection
242
  # uvicorn
243
  typing-inspection==0.4.0
244
- # via pydantic
 
 
245
  urllib3==2.4.0
246
- # via requests
 
 
247
  uvicorn==0.34.2
248
  # via -r requirements.in
249
  wasabi==1.1.3
250
  # via
 
251
  # spacy
252
  # thinc
253
  # weasel
254
  weasel==0.4.1
255
- # via spacy
 
 
256
  wrapt==1.17.2
257
- # via smart-open
 
 
258
  xx-ent-wiki-sm @ https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.8.0/xx_ent_wiki_sm-3.8.0-py3-none-any.whl
259
  # via -r requirements.in
260
- sentencepiece
 
1
  # This file was autogenerated by uv via the following command:
2
  # uv pip compile requirements.in -o requirements.txt
3
  annotated-types==0.7.0
4
+ # via
5
+ # -r requirements.in
6
+ # pydantic
7
  anyio==4.9.0
8
+ # via
9
+ # -r requirements.in
10
+ # starlette
11
  blis==1.3.0
12
+ # via
13
+ # -r requirements.in
14
+ # thinc
15
  catalogue==2.0.10
16
  # via
17
+ # -r requirements.in
18
  # spacy
19
  # srsly
20
  # thinc
21
  certifi==2025.4.26
22
+ # via
23
+ # -r requirements.in
24
+ # requests
25
  charset-normalizer==3.4.2
26
+ # via
27
+ # -r requirements.in
28
+ # requests
29
  click==8.2.0
30
  # via
31
+ # -r requirements.in
32
  # typer
33
  # uvicorn
34
  cloudpathlib==0.21.1
35
+ # via
36
+ # -r requirements.in
37
+ # weasel
38
  confection==0.1.5
39
  # via
40
+ # -r requirements.in
41
  # thinc
42
  # weasel
43
  cymem==2.0.11
44
  # via
45
+ # -r requirements.in
46
  # preshed
47
  # spacy
48
  # thinc
49
  en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
50
  # via -r requirements.in
51
  exceptiongroup==1.3.0
52
+ # via
53
+ # -r requirements.in
54
+ # anyio
55
  fastapi==0.115.12
56
  # via -r requirements.in
57
  filelock==3.18.0
58
  # via
59
+ # -r requirements.in
60
  # huggingface-hub
61
  # torch
62
  # transformers
63
  fsspec==2025.3.2
64
  # via
65
+ # -r requirements.in
66
  # huggingface-hub
67
  # torch
68
  h11==0.16.0
69
+ # via
70
+ # -r requirements.in
71
+ # uvicorn
72
  huggingface-hub==0.31.2
73
  # via
74
  # -r requirements.in
 
76
  # transformers
77
  idna==3.10
78
  # via
79
+ # -r requirements.in
80
  # anyio
81
  # requests
82
  jinja2==3.1.6
83
  # via
84
+ # -r requirements.in
85
  # spacy
86
  # torch
87
  langcodes==3.5.0
88
+ # via
89
+ # -r requirements.in
90
+ # spacy
91
  language-data==1.3.0
92
+ # via
93
+ # -r requirements.in
94
+ # langcodes
95
  marisa-trie==1.2.1
96
+ # via
97
+ # -r requirements.in
98
+ # language-data
99
  markdown-it-py==3.0.0
100
+ # via
101
+ # -r requirements.in
102
+ # rich
103
  markupsafe==3.0.2
104
+ # via
105
+ # -r requirements.in
106
+ # jinja2
107
  mdurl==0.1.2
108
+ # via
109
+ # -r requirements.in
110
+ # markdown-it-py
111
  mpmath==1.3.0
112
+ # via
113
+ # -r requirements.in
114
+ # sympy
115
  murmurhash==1.0.12
116
  # via
117
+ # -r requirements.in
118
  # preshed
119
  # spacy
120
  # thinc
121
  networkx==3.4.2
122
+ # via
123
+ # -r requirements.in
124
+ # torch
125
  numpy==2.2.5
126
  # via
127
+ # -r requirements.in
128
  # blis
129
  # spacy
130
  # spacy-transformers
131
  # thinc
132
  # transformers
133
+
134
+ # SQLite support comes from the sqlite3 module in the Python standard library; no extra package is needed
135
+ python-dotenv
136
+ # for environment variable management
137
+
138
  nvidia-cublas-cu12==12.6.4.1
139
  # via
140
+ # -r requirements.in
141
  # nvidia-cudnn-cu12
142
  # nvidia-cusolver-cu12
143
  # torch
144
  nvidia-cuda-cupti-cu12==12.6.80
145
+ # via
146
+ # -r requirements.in
147
+ # torch
148
  nvidia-cuda-nvrtc-cu12==12.6.77
149
+ # via
150
+ # -r requirements.in
151
+ # torch
152
  nvidia-cuda-runtime-cu12==12.6.77
153
+ # via
154
+ # -r requirements.in
155
+ # torch
156
  nvidia-cudnn-cu12==9.5.1.17
157
+ # via
158
+ # -r requirements.in
159
+ # torch
160
  nvidia-cufft-cu12==11.3.0.4
161
+ # via
162
+ # -r requirements.in
163
+ # torch
164
  nvidia-cufile-cu12==1.11.1.6
165
+ # via
166
+ # -r requirements.in
167
+ # torch
168
  nvidia-curand-cu12==10.3.7.77
169
+ # via
170
+ # -r requirements.in
171
+ # torch
172
  nvidia-cusolver-cu12==11.7.1.2
173
+ # via
174
+ # -r requirements.in
175
+ # torch
176
  nvidia-cusparse-cu12==12.5.4.2
177
  # via
178
+ # -r requirements.in
179
  # nvidia-cusolver-cu12
180
  # torch
181
  nvidia-cusparselt-cu12==0.6.3
182
+ # via
183
+ # -r requirements.in
184
+ # torch
185
  nvidia-nccl-cu12==2.26.2
186
+ # via
187
+ # -r requirements.in
188
+ # torch
189
  nvidia-nvjitlink-cu12==12.6.85
190
  # via
191
+ # -r requirements.in
192
  # nvidia-cufft-cu12
193
  # nvidia-cusolver-cu12
194
  # nvidia-cusparse-cu12
195
  # torch
196
  nvidia-nvtx-cu12==12.6.77
197
+ # via
198
+ # -r requirements.in
199
+ # torch
200
  packaging==25.0
201
  # via
202
+ # -r requirements.in
203
  # huggingface-hub
204
  # spacy
205
  # thinc
 
207
  # weasel
208
  preshed==3.0.9
209
  # via
210
+ # -r requirements.in
211
  # spacy
212
  # thinc
213
  pydantic==2.11.4
 
219
  # thinc
220
  # weasel
221
  pydantic-core==2.33.2
222
+ # via
223
+ # -r requirements.in
224
+ # pydantic
225
  pygments==2.19.1
226
+ # via
227
+ # -r requirements.in
228
+ # rich
229
  python-multipart==0.0.20
230
  # via -r requirements.in
231
  pyyaml==6.0.2
232
  # via
233
+ # -r requirements.in
234
  # huggingface-hub
235
  # transformers
236
  regex==2024.11.6
237
+ # via
238
+ # -r requirements.in
239
+ # transformers
240
  requests==2.32.3
241
  # via
242
+ # -r requirements.in
243
  # huggingface-hub
244
  # spacy
245
  # transformers
246
  # weasel
247
  rich==14.0.0
248
+ # via
249
+ # -r requirements.in
250
+ # typer
251
  safetensors==0.5.3
252
  # via
253
  # -r requirements.in
254
  # transformers
255
+ sentencepiece==0.2.0
256
+ # via -r requirements.in
257
  setuptools==80.7.1
258
  # via
259
+ # -r requirements.in
260
  # marisa-trie
261
  # spacy
262
  # thinc
263
  # triton
264
  shellingham==1.5.4
265
+ # via
266
+ # -r requirements.in
267
+ # typer
268
  smart-open==7.1.0
269
+ # via
270
+ # -r requirements.in
271
+ # weasel
272
  sniffio==1.3.1
273
+ # via
274
+ # -r requirements.in
275
+ # anyio
276
  spacy==3.8.5
277
  # via
278
  # -r requirements.in
279
  # spacy-transformers
280
  spacy-alignments==0.9.1
281
+ # via
282
+ # -r requirements.in
283
+ # spacy-transformers
284
  spacy-legacy==3.0.12
285
+ # via
286
+ # -r requirements.in
287
+ # spacy
288
  spacy-loggers==1.0.5
289
+ # via
290
+ # -r requirements.in
291
+ # spacy
292
  spacy-transformers==1.3.8
293
  # via -r requirements.in
294
  srsly==2.5.1
295
  # via
296
+ # -r requirements.in
297
  # confection
298
  # spacy
299
  # spacy-transformers
300
  # thinc
301
  # weasel
302
  starlette==0.46.2
303
+ # via
304
+ # -r requirements.in
305
+ # fastapi
306
  sympy==1.14.0
307
+ # via
308
+ # -r requirements.in
309
+ # torch
310
  thinc==8.3.6
311
+ # via
312
+ # -r requirements.in
313
+ # spacy
314
  tokenizers==0.21.1
315
+ # via
316
+ # -r requirements.in
317
+ # transformers
318
  torch==2.7.0
319
  # via
320
  # -r requirements.in
321
  # spacy-transformers
322
  tqdm==4.67.1
323
  # via
324
+ # -r requirements.in
325
  # huggingface-hub
326
  # spacy
327
  # transformers
 
330
  # -r requirements.in
331
  # spacy-transformers
332
  triton==3.3.0
333
+ # via
334
+ # -r requirements.in
335
+ # torch
336
  typer==0.15.3
337
  # via
338
+ # -r requirements.in
339
  # spacy
340
  # weasel
341
  typing-extensions==4.13.2
342
  # via
343
+ # -r requirements.in
344
  # anyio
345
  # cloudpathlib
346
  # exceptiongroup
 
354
  # typing-inspection
355
  # uvicorn
356
  typing-inspection==0.4.0
357
+ # via
358
+ # -r requirements.in
359
+ # pydantic
360
  urllib3==2.4.0
361
+ # via
362
+ # -r requirements.in
363
+ # requests
364
  uvicorn==0.34.2
365
  # via -r requirements.in
366
  wasabi==1.1.3
367
  # via
368
+ # -r requirements.in
369
  # spacy
370
  # thinc
371
  # weasel
372
  weasel==0.4.1
373
+ # via
374
+ # -r requirements.in
375
+ # spacy
376
  wrapt==1.17.2
377
+ # via
378
+ # -r requirements.in
379
+ # smart-open
380
  xx-ent-wiki-sm @ https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.8.0/xx_ent_wiki_sm-3.8.0-py3-none-any.whl
381
  # via -r requirements.in
 
requirements.txt CHANGED
@@ -221,6 +221,8 @@ pygments==2.19.1
221
  # via
222
  # -r requirements.in
223
  # rich
 
 
224
  python-multipart==0.0.20
225
  # via -r requirements.in
226
  pyyaml==6.0.2
 
221
  # via
222
  # -r requirements.in
223
  # rich
224
+ python-dotenv==1.1.0
225
+ # via -r requirements.in
226
  python-multipart==0.0.20
227
  # via -r requirements.in
228
  pyyaml==6.0.2
utils.py CHANGED
@@ -1,6 +1,7 @@
1
  import re
2
  import spacy
3
- from typing import List, Dict, Tuple, Any
 
4
 
5
  class Entity:
6
  def __init__(self, start: int, end: int, entity_type: str, value: str):
@@ -20,7 +21,7 @@ class Entity:
20
  return f"Entity(type='{self.entity_type}', value='{self.value}', start={self.start}, end={self.end})"
21
 
22
  class PIIMasker:
23
- def __init__(self, spacy_model_name: str = "xx_ent_wiki_sm"): # Allow model choice
24
  # Load SpaCy model
25
  try:
26
  self.nlp = spacy.load(spacy_model_name)
@@ -39,7 +40,9 @@ class PIIMasker:
39
  spacy.cli.download("en_core_web_sm")
40
  self.nlp = spacy.load("en_core_web_sm")
41
 
42
-
 
 
43
  # Initialize regex patterns
44
  self._initialize_patterns()
45
 
@@ -320,12 +323,50 @@ class PIIMasker:
320
 
321
  def process_email(self, email_text: str) -> Dict[str, Any]:
322
  """
323
- Process an email by detecting and masking PII entities
 
324
  """
 
325
  masked_email, entity_info = self.mask_text(email_text)
 
 
 
 
 
 
 
 
 
326
  return {
327
- "input_email_body": email_text,
328
  "list_of_masked_entities": entity_info,
329
  "masked_email": masked_email,
330
- "category_of_the_email": ""
331
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import re
2
  import spacy
3
+ from typing import List, Dict, Tuple, Any, Optional
4
+ from database import EmailDatabase
5
 
6
  class Entity:
7
  def __init__(self, start: int, end: int, entity_type: str, value: str):
 
21
  return f"Entity(type='{self.entity_type}', value='{self.value}', start={self.start}, end={self.end})"
22
 
23
  class PIIMasker:
24
+ def __init__(self, spacy_model_name: str = "xx_ent_wiki_sm", db_path: str = None): # Allow model choice
25
  # Load SpaCy model
26
  try:
27
  self.nlp = spacy.load(spacy_model_name)
 
40
  spacy.cli.download("en_core_web_sm")
41
  self.nlp = spacy.load("en_core_web_sm")
42
 
43
+ # Initialize database connection with SQLite path
44
+ self.db = EmailDatabase(connection_string=db_path)
45
+
46
  # Initialize regex patterns
47
  self._initialize_patterns()
48
 
 
323
 
324
def process_email(self, email_text: str) -> Dict[str, Any]:
    """
    Mask PII in an email and store the original next to its masked copy.

    Args:
        email_text: Raw email body to be masked.

    Returns:
        A dict holding the original text, the list of masked entities, the
        masked text, an empty category string, and the ``email_id`` /
        ``access_key`` pair returned by the database store call.
    """
    # Detect and mask the PII entities first.
    masked_email, entity_info = self.mask_text(email_text)

    # Persist original + masked versions; the store call hands back the
    # identifiers needed to look the original up again later.
    stored_reference = self.db.store_email(
        original_email=email_text,
        masked_email=masked_email,
        masked_entities=entity_info,
    )
    email_id, access_key = stored_reference

    # Assemble the result key by key (same keys and order as before).
    result: Dict[str, Any] = {}
    result["input_email_body"] = email_text  # original input, kept for compatibility
    result["list_of_masked_entities"] = entity_info
    result["masked_email"] = masked_email
    result["category_of_the_email"] = ""
    result["email_id"] = email_id
    result["access_key"] = access_key  # allows immediate retrieval if needed
    return result
348
+
349
def get_original_email(self, email_id: str, access_key: str) -> Optional[Dict[str, Any]]:
    """
    Fetch the original, PII-containing email for an ID / access key pair.

    This is a thin delegate to the database object held in ``self.db``.

    Args:
        email_id: The ID of the stored email.
        access_key: The key required to read the original email.

    Returns:
        Whatever the database lookup returns: the original email data, or
        None (expected when the ID is unknown or the key is invalid —
        confirm against EmailDatabase).
    """
    original_record = self.db.get_original_email(email_id, access_key)
    return original_record
361
+
362
def get_masked_email_by_id(self, email_id: str) -> Optional[Dict[str, Any]]:
    """
    Fetch a stored email record by ID via the database's ``get_email_by_id``.

    Intended to return only the masked data, without the original
    PII-containing text (per the original author's note — confirm against
    EmailDatabase).

    Args:
        email_id: The ID of the stored email.

    Returns:
        Whatever the database lookup returns: the masked email data, or
        None when nothing is found.
    """
    masked_record = self.db.get_email_by_id(email_id)
    return masked_record