InfosysResponsibleAiToolKit committed on
Commit
f496f54
·
1 Parent(s): e40ae6e

Add large model file to Git LFS

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +4 -0
  2. LICENSE.md +8 -0
  3. api-spec/rai-privacy.yaml +389 -0
  4. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/build_config.yaml +14 -0
  5. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/create_wheel_file.py +44 -0
  6. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.0.5-py3-none-any.whl +0 -0
  7. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.0.5.tar.gz +3 -0
  8. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.0.6-py3-none-any.whl +0 -0
  9. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.0.6.tar.gz +3 -0
  10. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.1.0-py3-none-any.whl +0 -0
  11. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.1.0.tar.gz +3 -0
  12. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer.egg-info/PKG-INFO +12 -0
  13. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer.egg-info/SOURCES.txt +64 -0
  14. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer.egg-info/dependency_links.txt +1 -0
  15. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer.egg-info/top_level.txt +1 -0
  16. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/__init__.py +52 -0
  17. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/analysis_explanation.py +64 -0
  18. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/analyzer_engine.py +372 -0
  19. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/analyzer_request.py +36 -0
  20. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/app_tracer.py +27 -0
  21. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/batch_analyzer_engine.py +145 -0
  22. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/context_aware_enhancers/__init__.py +5 -0
  23. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/context_aware_enhancers/context_aware_enhancer.py +68 -0
  24. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/context_aware_enhancers/lemma_context_aware_enhancer.py +334 -0
  25. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/dict_analyzer_result.py +29 -0
  26. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/entity_recognizer.py +199 -0
  27. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/local_recognizer.py +7 -0
  28. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/__init__.py +19 -0
  29. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/client_nlp_engine.py +108 -0
  30. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py +74 -0
  31. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/nlp_engine.py +42 -0
  32. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py +128 -0
  33. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py +96 -0
  34. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py +39 -0
  35. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py +155 -0
  36. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/pattern.py +45 -0
  37. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/pattern_recognizer.py +253 -0
  38. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/Aadhaar_Number.py +46 -0
  39. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/PAN_Number.py +46 -0
  40. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/__init__.py +77 -0
  41. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/au_abn_recognizer.py +93 -0
  42. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/au_acn_recognizer.py +90 -0
  43. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/au_medicare_recognizer.py +89 -0
  44. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/au_tfn_recognizer.py +95 -0
  45. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/credit_card_recognizer.py +85 -0
  46. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/crypto_recognizer.py +54 -0
  47. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/data_recognizer.py +184 -0
  48. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/date_recognizer.py +127 -0
  49. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/email_recognizer.py +46 -0
  50. presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/es_nif_recognizer.py +58 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ rai_privacy_package/privacy/rai_privacy/privacy/privacy/util/face_detect/face_detector/res10_300x300_ssd_iter_140000.caffemodel filter=lfs diff=lfs merge=lfs -text
37
+ rai_privacy_package/privacy/rai_privacy/privacy/privacy/util/face_detect/doc/10.jpg filter=lfs diff=lfs merge=lfs -text
38
+ rai_privacy_package/privacy/rai_privacy/privacy/privacy/util/face_detect/doc/5.jpg filter=lfs diff=lfs merge=lfs -text
39
+ rai_privacy_package/privacy/rai_privacy/privacy/privacy/util/face_detect/doc/8.jpg filter=lfs diff=lfs merge=lfs -text
LICENSE.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+
2
+ MIT license https://opensource.org/licenses/MIT Copyright 2024-2025 Infosys Ltd
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
5
+
6
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
7
+
8
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
api-spec/rai-privacy.yaml ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ openapi: 3.0.2
2
+ info:
3
+ title: Infosys Responsible AI - responsible-ai-privacy - OpenAPI 3.0
4
+ description: API specs for Infosys Responsible AI Privacy pillar in OpenAPI 3.0 format
5
+ termsOfService: https://www.infosys.com
6
+ contact:
7
+ email: aina@infosys.com
8
+ license:
9
+ name: Infosys
10
+ url: https://www.infosys.com
11
+ version: v$version
12
+ security:
13
+ - oauth_auth:
14
+ - write:users
15
+ - read:users
16
+ paths:
17
+ /api/v1/privacy/pii/analyze:
18
+ post:
19
+ tags:
20
+ - PII Privacy
21
+ summary: Analyze
22
+ operationId: analyze_api_v1_privacy_pii_analyze_post
23
+ security:
24
+ - oauth_auth:
25
+ - write:users
26
+ requestBody:
27
+ content:
28
+ application/json:
29
+ schema:
30
+ $ref: '#/components/schemas/PIIAnalyzeRequest'
31
+ required: true
32
+ responses:
33
+ '200':
34
+ description: Successful Response
35
+ content:
36
+ application/json:
37
+ schema:
38
+ $ref: '#/components/schemas/PIIAnalyzeResponse'
39
+ '401':
40
+ description: Unauthorized
41
+ content:
42
+ application/json:
43
+ schema:
44
+ $ref: '#/components/schemas/Error'
45
+ '403':
46
+ description: Forbidden
47
+ content:
48
+ application/json:
49
+ schema:
50
+ $ref: '#/components/schemas/Error'
51
+ '422':
52
+ description: Validation Error
53
+ content:
54
+ application/json:
55
+ schema:
56
+ $ref: '#/components/schemas/HTTPValidationError'
57
+ /api/v1/privacy/pii/anonymize:
58
+ post:
59
+ tags:
60
+ - PII Privacy
61
+ summary: Anonymize
62
+ operationId: anonymize_api_v1_privacy_pii_anonymize_post
63
+ security:
64
+ - oauth_auth:
65
+ - write:users
66
+ requestBody:
67
+ content:
68
+ application/json:
69
+ schema:
70
+ $ref: '#/components/schemas/PIIAnonymizeRequest'
71
+ required: true
72
+ responses:
73
+ '200':
74
+ description: Successful Response
75
+ content:
76
+ application/json:
77
+ schema:
78
+ $ref: '#/components/schemas/PIIAnonymizeResponse'
79
+ '401':
80
+ description: Unauthorized
81
+ content:
82
+ application/json:
83
+ schema:
84
+ $ref: '#/components/schemas/Error'
85
+ '403':
86
+ description: Forbidden
87
+ content:
88
+ application/json:
89
+ schema:
90
+ $ref: '#/components/schemas/Error'
91
+ '422':
92
+ description: Validation Error
93
+ content:
94
+ application/json:
95
+ schema:
96
+ $ref: '#/components/schemas/HTTPValidationError'
97
+ /api/v1/privacy/pii/image/analyze:
98
+ post:
99
+ tags:
100
+ - PII Privacy
101
+ summary: Image Analyze
102
+ operationId: image_analyze_api_v1_privacy_pii_image_analyze_post
103
+ security:
104
+ - oauth_auth:
105
+ - write:users
106
+ requestBody:
107
+ content:
108
+ multipart/form-data:
109
+ schema:
110
+ $ref: '#/components/schemas/Body_image_analyze_api_v1_privacy_pii_image_analyze_post'
111
+ required: true
112
+ responses:
113
+ '200':
114
+ description: Successful Response
115
+ content:
116
+ application/json:
117
+ schema:
118
+ $ref: '#/components/schemas/PIIImageAnalyzeResponse'
119
+ '401':
120
+ description: Unauthorized
121
+ content:
122
+ application/json:
123
+ schema:
124
+ $ref: '#/components/schemas/Error'
125
+ '403':
126
+ description: Forbidden
127
+ content:
128
+ application/json:
129
+ schema:
130
+ $ref: '#/components/schemas/Error'
131
+ '422':
132
+ description: Validation Error
133
+ content:
134
+ application/json:
135
+ schema:
136
+ $ref: '#/components/schemas/HTTPValidationError'
137
+ /api/v1/privacy/pii/image/anonymize:
138
+ post:
139
+ tags:
140
+ - PII Privacy
141
+ summary: Image Anonymize
142
+ operationId: image_anonymize_api_v1_privacy_pii_image_anonymize_post
143
+ security:
144
+ - oauth_auth:
145
+ - write:users
146
+ requestBody:
147
+ content:
148
+ multipart/form-data:
149
+ schema:
150
+ $ref: '#/components/schemas/Body_image_anonymize_api_v1_privacy_pii_image_anonymize_post'
151
+ required: true
152
+ responses:
153
+ '200':
154
+ description: Successful Response
155
+ content:
156
+ application/json:
157
+ schema: {}
158
+ '401':
159
+ description: Unauthorized
160
+ content:
161
+ application/json:
162
+ schema:
163
+ $ref: '#/components/schemas/Error'
164
+ '403':
165
+ description: Forbidden
166
+ content:
167
+ application/json:
168
+ schema:
169
+ $ref: '#/components/schemas/Error'
170
+ '422':
171
+ description: Validation Error
172
+ content:
173
+ application/json:
174
+ schema:
175
+ $ref: '#/components/schemas/HTTPValidationError'
176
+ /api/v1/privacy/pii/image/verify:
177
+ post:
178
+ tags:
179
+ - PII Privacy
180
+ summary: Image Verify
181
+ operationId: image_verify_api_v1_privacy_pii_image_verify_post
182
+ security:
183
+ - oauth_auth:
184
+ - write:users
185
+ requestBody:
186
+ content:
187
+ multipart/form-data:
188
+ schema:
189
+ $ref: '#/components/schemas/Body_image_verify_api_v1_privacy_pii_image_verify_post'
190
+ required: true
191
+ responses:
192
+ '200':
193
+ description: Successful Response
194
+ content:
195
+ application/json:
196
+ schema: {}
197
+ '401':
198
+ description: Unauthorized
199
+ content:
200
+ application/json:
201
+ schema:
202
+ $ref: '#/components/schemas/Error'
203
+ '403':
204
+ description: Forbidden
205
+ content:
206
+ application/json:
207
+ schema:
208
+ $ref: '#/components/schemas/Error'
209
+ '422':
210
+ description: Validation Error
211
+ content:
212
+ application/json:
213
+ schema:
214
+ $ref: '#/components/schemas/HTTPValidationError'
215
+ components:
216
+ schemas:
217
+ Body_image_analyze_api_v1_privacy_pii_image_analyze_post:
218
+ title: Body_image_analyze_api_v1_privacy_pii_image_analyze_post
219
+ required:
220
+ - payload
221
+ type: object
222
+ properties:
223
+ payload:
224
+ title: Payload
225
+ type: string
226
+ format: binary
227
+ Body_image_anonymize_api_v1_privacy_pii_image_anonymize_post:
228
+ title: Body_image_anonymize_api_v1_privacy_pii_image_anonymize_post
229
+ required:
230
+ - payload
231
+ type: object
232
+ properties:
233
+ payload:
234
+ title: Payload
235
+ type: string
236
+ format: binary
237
+ Body_image_verify_api_v1_privacy_pii_image_verify_post:
238
+ title: Body_image_verify_api_v1_privacy_pii_image_verify_post
239
+ required:
240
+ - payload
241
+ type: object
242
+ properties:
243
+ payload:
244
+ title: Payload
245
+ type: string
246
+ format: binary
247
+ HTTPValidationError:
248
+ title: HTTPValidationError
249
+ type: object
250
+ properties:
251
+ detail:
252
+ title: Detail
253
+ type: array
254
+ items:
255
+ $ref: '#/components/schemas/ValidationError'
256
+ PIIAnalyzeRequest:
257
+ title: PIIAnalyzeRequest
258
+ required:
259
+ - inputText
260
+ type: object
261
+ properties:
262
+ inputText:
263
+ title: Inputtext
264
+ type: string
265
+ example: John Smith's SSN is 012884567
266
+ PIIAnalyzeResponse:
267
+ title: PIIAnalyzeResponse
268
+ required:
269
+ - PIIEntities
270
+ type: object
271
+ properties:
272
+ PIIEntities:
273
+ title: Piientities
274
+ type: array
275
+ items:
276
+ $ref: '#/components/schemas/PIIEntity'
277
+ PIIAnonymizeRequest:
278
+ title: PIIAnonymizeRequest
279
+ required:
280
+ - inputText
281
+ type: object
282
+ properties:
283
+ inputText:
284
+ title: Inputtext
285
+ type: string
286
+ example: John Smith's SSN is 012884567
287
+ piiEntitiesToBeRedacted:
288
+ title: Piientitiestoberedacted
289
+ type: array
290
+ items: {}
291
+ example:
292
+ - US_SSN
293
+ redactionType:
294
+ title: Redactiontype
295
+ type: string
296
+ example: replace
297
+ PIIAnonymizeResponse:
298
+ title: PIIAnonymizeResponse
299
+ required:
300
+ - anonymizedText
301
+ type: object
302
+ properties:
303
+ anonymizedText:
304
+ title: Anonymizedtext
305
+ type: string
306
+ example: John Smith's SSN is <US_SSN>
307
+ PIIEntity:
308
+ title: PIIEntity
309
+ required:
310
+ - type
311
+ - beginOffset
312
+ - endOffset
313
+ - confidenceScore
314
+ type: object
315
+ properties:
316
+ type:
317
+ title: Type
318
+ type: string
319
+ example: US_SSN
320
+ beginOffset:
321
+ title: Beginoffset
322
+ type: integer
323
+ example: 19
324
+ endOffset:
325
+ title: Endoffset
326
+ type: integer
327
+ example: 28
328
+ confidenceScore:
329
+ title: ConfidenceScore
330
+ type: number
331
+ example: 0.25
332
+ PIIImageAnalyzeResponse:
333
+ title: PIIImageAnalyzeResponse
334
+ required:
335
+ - PIIEntities
336
+ type: object
337
+ properties:
338
+ PIIEntities:
339
+ title: Piientities
340
+ type: array
341
+ items:
342
+ $ref: '#/components/schemas/PIIImageEntity'
343
+ PIIImageEntity:
344
+ title: PIIImageEntity
345
+ required:
346
+ - type
347
+ type: object
348
+ properties:
349
+ type:
350
+ title: Type
351
+ type: string
352
+ example: US_SSN
353
+ ValidationError:
354
+ title: ValidationError
355
+ required:
356
+ - loc
357
+ - msg
358
+ - type
359
+ type: object
360
+ properties:
361
+ loc:
362
+ title: Location
363
+ type: array
364
+ items:
365
+ anyOf:
366
+ - type: string
367
+ - type: integer
368
+ msg:
369
+ title: Message
370
+ type: string
371
+ type:
372
+ title: Error Type
373
+ type: string
374
+ securitySchemes:
375
+ oauth_auth:
376
+ type: oauth2
377
+ flows:
378
+ authorizationCode:
379
+ authorizationUrl: https://example.com/oauth/authorize
380
+ tokenUrl: https://example.com/oauth/token
381
+ scopes:
382
+ write:users: modify user profile
+ read:users: read user profile
383
+
384
+ tags:
385
+ - name: PII Privacy
386
+ description: Operations required for a PII entity (e.g. IN_ADHAAR, IN_PAN, US_SSN etc)
387
+ externalDocs:
388
+ description: Find out more
389
+ url: https://www.infosys.com
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/build_config.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -
2
+ name: presidio_analyzer
3
+ version: 4.1.0
4
+ build: 0.0.1
5
+ author: Amit Hegde
6
+ author_email: amitumamaheshwar.h@infosys.com
7
+ description: Infosys Intelligent Assistant
8
+ long_description: Infosys Intelligent Assistant
9
+ classifiers: ["Programming Language :: Python :: 3",
10
+ "License :: OSI Approved :: MIT License",
11
+ "Operating System :: OS Independent",]
12
+ package_dir: {"": "presidio_analyzer"}
13
+ packages: presidio_analyzer
14
+ python_requires: ['>=3.6']
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/create_wheel_file.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
__copyright__ = """ 2020 - 2021 Infosys Limited, Bangalore, India. All Rights Reserved.
Version: 2.5.0.0
Except for any free or open source software components embedded in this Infosys proprietary software program (“Program”), this Program is protected by copyright laws, international treaties and other pending or existing intellectual property rights in India, the United States and other countries.
Except as expressly permitted, any unauthorized reproduction, storage, transmission in any form or by any means (including without limitation electronic, mechanical, printing, photocopying, recording or otherwise), or any distribution of this Program, or any portion of it, may result in severe civil and criminal penalties, and will be prosecuted to the maximum extent possible under the law.
"""
# Build a wheel for each package entry in build_config.yaml:
# generate a setup.py, run `python -m build`, then bytecode-compile the
# resulting wheel with pyc_wheel.
import os
import subprocess

import yaml

# Plain relative path instead of the original r'.\build_config.yaml', which is
# a literal backslash path (and therefore broken) on non-Windows systems.
with open("build_config.yaml", encoding="utf-8") as build_file:
    build_config_list = yaml.safe_load(build_file)


for build_config in build_config_list:
    try:
        print(build_config)

        if os.path.exists(f"./{build_config['packages']}"):
            # Generate setup.py for this package. Join lines with real
            # newlines ("\n") — the original emitted bare "\r" separators and
            # only parsed thanks to universal-newline translation on read.
            setup_str = (
                "import setuptools\n"
                "setuptools.setup(\n"
                f"    name='{build_config['name']}',\n"
                f"    version='{build_config['version']}',\n"
                f"    author='{build_config['author']}',\n"
                f"    author_email='{build_config['author_email']}',\n"
                f"    description='{build_config['description']}',\n"
                f"    long_description='{build_config['long_description']}',\n"
                f"    classifiers={build_config['classifiers']},\n"
                f"    package_dir={build_config['package_dir']},\n"
                f"    packages=setuptools.find_packages(where='{build_config['packages']}'),\n"
                f"    python_requires='{build_config['python_requires'][0]}',\n"
                ")\n"
            )

            with open("setup.py", "w", encoding="utf-8") as file:
                file.write(setup_str)

            # check=True so a failed build raises (and is reported below)
            # instead of being silently ignored.
            subprocess.run(["python", "-m", "build"], check=True)

            # The generated setup.py uses the plain version string, so the
            # wheel produced by `python -m build` is named
            # '<name>-<version>-py3-none-any.whl'. The original appended
            # '_build_<build>' here and pointed pyc_wheel at a file that is
            # never created (the dist/ wheels in this repo confirm the plain
            # naming).
            wheel_file = f"{build_config['name']}-{build_config['version']}-py3-none-any.whl"
            print(f"wheel_file: {wheel_file}")
            # os.path.join replaces the hard-coded f"dist\{wheel_file}"
            # backslash path (invalid escape sequence, Windows-only).
            subprocess.run(
                ["python", "-m", "pyc_wheel", os.path.join("dist", wheel_file)],
                check=True,
            )
        else:
            print(f"Path does not exist ./{build_config['packages']}")
    except Exception as e:
        # Report the actual failure — the original captured `e` but discarded
        # it, hiding every build error.
        print(f"Exception occurred: {e}")
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.0.5-py3-none-any.whl ADDED
Binary file (78.9 kB). View file
 
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.0.5.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:effdee5c88badc2a4605dcabc7fe1ff43df586df0a7c2be3f4dbc4d440c7e4d6
3
+ size 44375
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.0.6-py3-none-any.whl ADDED
Binary file (79.1 kB). View file
 
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.0.6.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c49ca4ee3acda590bb69b68697e02cbfc81b89bd8dcfcaf9ff90b07fec062515
3
+ size 44656
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.1.0-py3-none-any.whl ADDED
Binary file (79.1 kB). View file
 
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.1.0.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:595ba3a58a473cc94a2a5c421eea075c5db52cb0181335a92f3a222f5cc76736
3
+ size 44675
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer.egg-info/PKG-INFO ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: presidio_analyzer
3
+ Version: 4.1.0
4
+ Summary: Infosys Intelligent Assistant
5
+ Author: Amit Hegde
6
+ Author-email: amitumamaheshwar.h@infosys.com
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.6
11
+
12
+ Infosys Intelligent Assistant
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ setup.py
2
+ presidio_analyzer/presidio_analyzer/__init__.py
3
+ presidio_analyzer/presidio_analyzer/analysis_explanation.py
4
+ presidio_analyzer/presidio_analyzer/analyzer_engine.py
5
+ presidio_analyzer/presidio_analyzer/analyzer_request.py
6
+ presidio_analyzer/presidio_analyzer/app_tracer.py
7
+ presidio_analyzer/presidio_analyzer/batch_analyzer_engine.py
8
+ presidio_analyzer/presidio_analyzer/dict_analyzer_result.py
9
+ presidio_analyzer/presidio_analyzer/entity_recognizer.py
10
+ presidio_analyzer/presidio_analyzer/local_recognizer.py
11
+ presidio_analyzer/presidio_analyzer/pattern.py
12
+ presidio_analyzer/presidio_analyzer/pattern_recognizer.py
13
+ presidio_analyzer/presidio_analyzer/recognizer_result.py
14
+ presidio_analyzer/presidio_analyzer/remote_recognizer.py
15
+ presidio_analyzer/presidio_analyzer.egg-info/PKG-INFO
16
+ presidio_analyzer/presidio_analyzer.egg-info/SOURCES.txt
17
+ presidio_analyzer/presidio_analyzer.egg-info/dependency_links.txt
18
+ presidio_analyzer/presidio_analyzer.egg-info/top_level.txt
19
+ presidio_analyzer/presidio_analyzer/context_aware_enhancers/__init__.py
20
+ presidio_analyzer/presidio_analyzer/context_aware_enhancers/context_aware_enhancer.py
21
+ presidio_analyzer/presidio_analyzer/context_aware_enhancers/lemma_context_aware_enhancer.py
22
+ presidio_analyzer/presidio_analyzer/nlp_engine/__init__.py
23
+ presidio_analyzer/presidio_analyzer/nlp_engine/client_nlp_engine.py
24
+ presidio_analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py
25
+ presidio_analyzer/presidio_analyzer/nlp_engine/nlp_engine.py
26
+ presidio_analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py
27
+ presidio_analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py
28
+ presidio_analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py
29
+ presidio_analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py
30
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/Aadhaar_Number.py
31
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/PAN_Number.py
32
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/__init__.py
33
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/au_abn_recognizer.py
34
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/au_acn_recognizer.py
35
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/au_medicare_recognizer.py
36
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/au_tfn_recognizer.py
37
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/credit_card_recognizer.py
38
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/crypto_recognizer.py
39
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/data_recognizer.py
40
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/date_recognizer.py
41
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/email_recognizer.py
42
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/es_nif_recognizer.py
43
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/iban_patterns.py
44
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/iban_recognizer.py
45
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/ip_recognizer.py
46
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/it_driver_license_recognizer.py
47
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/it_fiscal_code_recognizer.py
48
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/it_identity_card_recognizer.py
49
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/it_passport_recognizer.py
50
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/it_vat_code.py
51
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/medical_license_recognizer.py
52
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/phone_recognizer.py
53
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/sg_fin_recognizer.py
54
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py
55
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/stanza_recognizer.py
56
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py
57
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/uk_nhs_recognizer.py
58
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/url_recognizer.py
59
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/us_driver_license_recognizer.py
60
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/us_itin_recognizer.py
61
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/us_passport_recognizer.py
62
+ presidio_analyzer/presidio_analyzer/predefined_recognizers/us_ssn_recognizer.py
63
+ presidio_analyzer/presidio_analyzer/recognizer_registry/__init__.py
64
+ presidio_analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ presidio_analyzer
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/__init__.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Presidio analyzer package."""

import logging

# Re-export the package's public classes at the top level; the set of
# exported names is declared in ``__all__`` below.
from presidio_analyzer.pattern import Pattern
from presidio_analyzer.analysis_explanation import AnalysisExplanation
from presidio_analyzer.recognizer_result import RecognizerResult
from presidio_analyzer.dict_analyzer_result import DictAnalyzerResult
from presidio_analyzer.entity_recognizer import EntityRecognizer
from presidio_analyzer.local_recognizer import LocalRecognizer
from presidio_analyzer.pattern_recognizer import PatternRecognizer
from presidio_analyzer.remote_recognizer import RemoteRecognizer
from presidio_analyzer.recognizer_registry import RecognizerRegistry
from presidio_analyzer.analyzer_engine import AnalyzerEngine
from presidio_analyzer.batch_analyzer_engine import BatchAnalyzerEngine
from presidio_analyzer.analyzer_request import AnalyzerRequest
from presidio_analyzer.context_aware_enhancers import ContextAwareEnhancer
from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer


# Define default loggers behavior

# 1. presidio_analyzer logger
# A NullHandler keeps the library silent unless the host application
# configures logging itself (standard practice for library packages).

logging.getLogger("presidio_analyzer").addHandler(logging.NullHandler())

# 2. decision_process logger.
# Setting the decision process trace here as we would want it
# to be activated using a parameter to AnalyzeEngine and not by default.
# It writes to stderr via a StreamHandler at INFO level with a timestamped
# format.

decision_process_logger = logging.getLogger("decision_process")
ch = logging.StreamHandler()
formatter = logging.Formatter("[%(asctime)s][%(name)s][%(levelname)s]%(message)s")
ch.setFormatter(formatter)
decision_process_logger.addHandler(ch)
decision_process_logger.setLevel("INFO")

# Names exported by ``from presidio_analyzer import *``.
__all__ = [
    "Pattern",
    "AnalysisExplanation",
    "RecognizerResult",
    "DictAnalyzerResult",
    "EntityRecognizer",
    "LocalRecognizer",
    "PatternRecognizer",
    "RemoteRecognizer",
    "RecognizerRegistry",
    "AnalyzerEngine",
    "AnalyzerRequest",
    "ContextAwareEnhancer",
    "LemmaContextAwareEnhancer",
    "BatchAnalyzerEngine",
]
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/analysis_explanation.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict
2
+
3
+
4
class AnalysisExplanation:
    """
    Tracing information explaining why a PII entity was identified as such.

    :param recognizer: name of recognizer that made the decision
    :param original_score: recognizer's confidence in result
    :param pattern_name: name of pattern
    (if decision was made by a PatternRecognizer)
    :param pattern: regex pattern that was applied (if PatternRecognizer)
    :param validation_result: result of a validation (e.g. checksum)
    :param textual_explanation: Free text for describing
    a decision of a logic or model
    """

    def __init__(
        self,
        recognizer: str,
        original_score: float,
        pattern_name: str = None,
        pattern: str = None,
        validation_result: float = None,
        textual_explanation: str = None,
    ):
        self.recognizer = recognizer
        self.pattern_name = pattern_name
        self.pattern = pattern
        self.original_score = original_score
        # The effective score starts equal to the recognizer's raw score and
        # may later be raised by context enhancement (set_improved_score).
        self.score = original_score
        self.textual_explanation = textual_explanation
        self.score_context_improvement = 0
        self.supportive_context_word = ""
        self.validation_result = validation_result

    def __repr__(self):
        """Render all attributes as a string for debugging."""
        return str(self.__dict__)

    def set_improved_score(self, score: float) -> None:
        """Record a context-improved score and the delta from the original."""
        self.score = score
        self.score_context_improvement = self.score - self.original_score

    def set_supportive_context_word(self, word: str) -> None:
        """Remember the context word which helped increase the score."""
        self.supportive_context_word = word

    def append_textual_explanation_line(self, text: str) -> None:
        """Add one more line of free-text explanation."""
        if self.textual_explanation is None:
            self.textual_explanation = text
        else:
            self.textual_explanation = f"{self.textual_explanation}\n{text}"

    def to_dict(self) -> Dict:
        """
        Serialize self to dictionary.

        :return: a dictionary of all attributes (the live ``__dict__``)
        """
        return self.__dict__
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/analyzer_engine.py ADDED
@@ -0,0 +1,372 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ from typing import List, Optional
4
+
5
+ from presidio_analyzer import (
6
+ RecognizerRegistry,
7
+ RecognizerResult,
8
+ EntityRecognizer,
9
+ )
10
+ from presidio_analyzer.app_tracer import AppTracer
11
+ from presidio_analyzer.context_aware_enhancers import (
12
+ ContextAwareEnhancer,
13
+ LemmaContextAwareEnhancer,
14
+ )
15
+ from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider, NlpArtifacts
16
+
17
+ logger = logging.getLogger("presidio_analyzer")
18
+
19
+
20
class AnalyzerEngine:
    """
    Entry point for Presidio Analyzer.

    Orchestrating the detection of PII entities and all related logic.

    :param registry: instance of type RecognizerRegistry
    :param nlp_engine: instance of type NlpEngine
    (for example SpacyNlpEngine)
    :param app_tracer: instance of type AppTracer, used to trace the logic
    used during each request for interpretability reasons.
    :param log_decision_process: bool,
    defines whether the decision process within the analyzer should be logged or not.
    :param default_score_threshold: Minimum confidence value
    for detected entities to be returned
    :param supported_languages: List of possible languages this engine could be run on.
    Used for loading the right NLP models and recognizers for these languages.
    :param context_aware_enhancer: instance of type ContextAwareEnhancer for enhancing
    confidence score based on context words, (LemmaContextAwareEnhancer will be created
    by default if None passed)
    """

    def __init__(
        self,
        registry: RecognizerRegistry = None,
        nlp_engine: NlpEngine = None,
        app_tracer: AppTracer = None,
        log_decision_process: bool = False,
        default_score_threshold: float = 0,
        supported_languages: List[str] = None,
        context_aware_enhancer: Optional[ContextAwareEnhancer] = None,
    ):
        # Default to English when no language list is supplied.
        if not supported_languages:
            supported_languages = ["en"]

        if not nlp_engine:
            logger.info("nlp_engine not provided, creating default.")
            provider = NlpEngineProvider()
            nlp_engine = provider.create_engine()

        if not registry:
            logger.info("registry not provided, creating default.")
            registry = RecognizerRegistry()
        if not app_tracer:
            app_tracer = AppTracer()
        self.app_tracer = app_tracer

        self.supported_languages = supported_languages

        self.nlp_engine = nlp_engine
        self.registry = registry

        # load all recognizers (only if the registry came in empty)
        if not registry.recognizers:
            registry.load_predefined_recognizers(
                nlp_engine=self.nlp_engine, languages=self.supported_languages
            )

        self.log_decision_process = log_decision_process
        self.default_score_threshold = default_score_threshold

        if not context_aware_enhancer:
            logger.debug(
                "context aware enhancer not provided, creating default"
                + " lemma based enhancer."
            )
            context_aware_enhancer = LemmaContextAwareEnhancer()

        self.context_aware_enhancer = context_aware_enhancer

    def get_recognizers(self, language: Optional[str] = None) -> List[EntityRecognizer]:
        """
        Return a list of PII recognizers currently loaded.

        :param language: Return the recognizers supporting a given language.
        :return: List of [Recognizer] as a RecognizersAllResponse
        """
        if not language:
            languages = self.supported_languages
        else:
            languages = [language]

        recognizers = []
        for language in languages:
            logger.info(f"Fetching all recognizers for language {language}")
            recognizers.extend(
                self.registry.get_recognizers(language=language, all_fields=True)
            )

        # set() deduplicates recognizers shared across languages.
        return list(set(recognizers))

    def get_supported_entities(self, language: Optional[str] = None) -> List[str]:
        """
        Return a list of the entities that can be detected.

        :param language: Return only entities supported in a specific language.
        :return: List of entity names
        """
        recognizers = self.get_recognizers(language=language)
        supported_entities = []
        for recognizer in recognizers:
            supported_entities.extend(recognizer.get_supported_entities())

        return list(set(supported_entities))

    def analyze(
        self,
        text: str,
        language: str,
        entities: Optional[List[str]] = None,
        correlation_id: Optional[str] = None,
        score_threshold: Optional[float] = None,
        return_decision_process: Optional[bool] = False,
        ad_hoc_recognizers: Optional[List[EntityRecognizer]] = None,
        context: Optional[List[str]] = None,
        allow_list: Optional[List[str]] = None,
        nlp_artifacts: Optional[NlpArtifacts] = None,
    ) -> List[RecognizerResult]:
        """
        Find PII entities in text using different PII recognizers for a given language.

        :param text: the text to analyze
        :param language: the language of the text
        :param entities: List of PII entities that should be looked for in the text.
        If entities=None then all entities are looked for.
        :param correlation_id: cross call ID for this request
        :param score_threshold: A minimum value for which
        to return an identified entity
        :param return_decision_process: Whether the analysis decision process steps
        returned in the response.
        :param ad_hoc_recognizers: List of recognizers which will be used only
        for this specific request.
        :param context: List of context words to enhance confidence score if matched
        with the recognized entity's recognizer context
        :param allow_list: List of words that the user defines as being allowed to keep
        in the text
        :param nlp_artifacts: precomputed NlpArtifacts
        :return: an array of the found entities in the text

        :example:

        >>> from presidio_analyzer import AnalyzerEngine

        >>> # Set up the engine, loads the NLP module (spaCy model by default)
        >>> # and other PII recognizers
        >>> analyzer = AnalyzerEngine()

        >>> # Call analyzer to get results
        >>> results = analyzer.analyze(text='My phone number is 212-555-5555', entities=['PHONE_NUMBER'], language='en') # noqa D501
        >>> print(results)
        [type: PHONE_NUMBER, start: 19, end: 31, score: 0.85]
        """
        # No explicit entity list means "detect everything".
        all_fields = not entities

        recognizers = self.registry.get_recognizers(
            language=language,
            entities=entities,
            all_fields=all_fields,
            ad_hoc_recognizers=ad_hoc_recognizers,
        )

        if all_fields:
            # Since all_fields=True, list all entities by iterating
            # over all recognizers
            entities = self.get_supported_entities(language=language)

        # run the nlp pipeline over the given text, store the results in
        # a NlpArtifacts instance (skipped when the caller precomputed them)
        if not nlp_artifacts:
            nlp_artifacts = self.nlp_engine.process_text(text, language)

        if self.log_decision_process:
            self.app_tracer.trace(
                correlation_id, "nlp artifacts:" + nlp_artifacts.to_json()
            )

        results = []
        for recognizer in recognizers:
            # Lazy loading of the relevant recognizers
            if not recognizer.is_loaded:
                recognizer.load()
                recognizer.is_loaded = True

            # analyze using the current recognizer and append the results
            current_results = recognizer.analyze(
                text=text, entities=entities, nlp_artifacts=nlp_artifacts
            )

            if current_results:
                # add recognizer name to recognition metadata inside results
                # if not exists
                self.__add_recognizer_id_if_not_exists(current_results, recognizer)
                results.extend(current_results)

        # Context enhancement must run BEFORE dedup/threshold filtering, so
        # boosted scores are the ones compared against the threshold.
        results = self._enhance_using_context(
            text, results, nlp_artifacts, recognizers, context
        )

        if self.log_decision_process:
            self.app_tracer.trace(
                correlation_id,
                json.dumps([str(result.to_dict()) for result in results]),
            )

        # Remove duplicates or low score results
        results = EntityRecognizer.remove_duplicates(results)
        results = self.__remove_low_scores(results, score_threshold)

        if allow_list:
            results = self._remove_allow_list(results, allow_list, text)

        if not return_decision_process:
            results = self.__remove_decision_process(results)

        return results

    def _enhance_using_context(
        self,
        text: str,
        raw_results: List[RecognizerResult],
        nlp_artifacts: NlpArtifacts,
        recognizers: List[EntityRecognizer],
        context: Optional[List[str]] = None,
    ) -> List[RecognizerResult]:
        """
        Enhance confidence score using context words.

        :param text: The actual text that was analyzed
        :param raw_results: Recognizer results which didn't take
        context into consideration
        :param nlp_artifacts: The nlp artifacts contains elements
        such as lemmatized tokens for better
        accuracy of the context enhancement process
        :param recognizers: the list of recognizers
        :param context: list of context words
        """
        results = []

        for recognizer in recognizers:
            # Partition results into those produced by this recognizer
            # and everyone else's (matched via recognition metadata).
            recognizer_results = [
                r
                for r in raw_results
                if r.recognition_metadata[RecognizerResult.RECOGNIZER_IDENTIFIER_KEY]
                == recognizer.id
            ]
            other_recognizer_results = [
                r
                for r in raw_results
                if r.recognition_metadata[RecognizerResult.RECOGNIZER_IDENTIFIER_KEY]
                != recognizer.id
            ]

            # enhance score using context in recognizer level if implemented
            recognizer_results = recognizer.enhance_using_context(
                text=text,
                # each recognizer will get access to all recognizer results
                # to allow related entities context enhancement
                raw_recognizer_results=recognizer_results,
                other_raw_recognizer_results=other_recognizer_results,
                nlp_artifacts=nlp_artifacts,
                context=context,
            )

            results.extend(recognizer_results)

        # Update results in case surrounding words or external context are relevant to
        # the context words.
        results = self.context_aware_enhancer.enhance_using_context(
            text=text,
            raw_results=results,
            nlp_artifacts=nlp_artifacts,
            recognizers=recognizers,
            context=context,
        )

        return results

    def __remove_low_scores(
        self, results: List[RecognizerResult], score_threshold: float = None
    ) -> List[RecognizerResult]:
        """
        Remove results for which the confidence is lower than the threshold.

        :param results: List of RecognizerResult
        :param score_threshold: float value for minimum possible confidence
        :return: List[RecognizerResult]
        """
        # Fall back to the engine-wide threshold when none given per-call.
        if score_threshold is None:
            score_threshold = self.default_score_threshold

        new_results = [result for result in results if result.score >= score_threshold]
        return new_results

    @staticmethod
    def _remove_allow_list(
        results: List[RecognizerResult], allow_list: List[str], text: str
    ) -> List[RecognizerResult]:
        """
        Remove results which are part of the allow list.

        :param results: List of RecognizerResult
        :param allow_list: list of allowed terms
        :param text: the text to analyze
        :return: List[RecognizerResult]
        """
        new_results = []
        for result in results:
            # NOTE: exact (case-sensitive) match of the detected span only.
            word = text[result.start : result.end]
            # if the word is not specified to be allowed, keep in the PII entities
            if word not in allow_list:
                new_results.append(result)

        return new_results

    @staticmethod
    def __add_recognizer_id_if_not_exists(
        results: List[RecognizerResult], recognizer: EntityRecognizer
    ):
        """Ensure recognition metadata with recognizer id existence.

        Ensure recognizer result list contains recognizer id inside recognition
        metadata dictionary, and if not create it. recognizer_id is needed
        for context aware enhancement.

        :param results: List of RecognizerResult
        :param recognizer: Entity recognizer
        """
        for result in results:
            if not result.recognition_metadata:
                result.recognition_metadata = dict()
            if (
                RecognizerResult.RECOGNIZER_IDENTIFIER_KEY
                not in result.recognition_metadata
            ):
                result.recognition_metadata[
                    RecognizerResult.RECOGNIZER_IDENTIFIER_KEY
                ] = recognizer.id
            if RecognizerResult.RECOGNIZER_NAME_KEY not in result.recognition_metadata:
                result.recognition_metadata[
                    RecognizerResult.RECOGNIZER_NAME_KEY
                ] = recognizer.name

    @staticmethod
    def __remove_decision_process(
        results: List[RecognizerResult],
    ) -> List[RecognizerResult]:
        """Remove decision process / analysis explanation from response."""
        for result in results:
            result.analysis_explanation = None

        return results
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/analyzer_request.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict
2
+
3
+ from presidio_analyzer import PatternRecognizer
4
+
5
+
6
class AnalyzerRequest:
    """
    Analyzer request data.

    :param req_data: A request dictionary with the following fields:
    text: the text to analyze
    language: the language of the text
    entities: List of PII entities that should be looked for in the text.
    If entities=None then all entities are looked for.
    correlation_id: cross call ID for this request
    score_threshold: A minimum value for which to return an identified entity
    log_decision_process: Should the decision points within the analysis
    be logged
    return_decision_process: Should the decision points within the analysis
    returned as part of the response
    """

    def __init__(self, req_data: Dict):
        get = req_data.get
        self.text = get("text")
        self.language = get("language")
        self.entities = get("entities")
        self.correlation_id = get("correlation_id")
        self.score_threshold = get("score_threshold")
        self.return_decision_process = get("return_decision_process")
        # Ad-hoc recognizers arrive serialized as dicts; deserialize each
        # one into a PatternRecognizer. Missing/empty -> empty list.
        raw_recognizers = get("ad_hoc_recognizers") or []
        self.ad_hoc_recognizers = [
            PatternRecognizer.from_dict(recognizer_dict)
            for recognizer_dict in raw_recognizers
        ]
        self.context = get("context")
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/app_tracer.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+
4
class AppTracer:
    """
    Allow logging/tracing the system's decisions.

    Relevant in cases where we want to know which modules were used for detection,
    which logic was utilized, what results were given and potentially why.
    This can be useful for analyzing the detection accuracy of the system.
    :param enabled: Whether tracing should be activated.
    """

    def __init__(self, enabled: bool = True):
        # Writes to the dedicated "decision_process" logger configured by
        # the package __init__.
        self.logger = logging.getLogger("decision_process")
        self.enabled = enabled

    def trace(self, request_id: str, trace_data: str) -> None:
        """
        Write a value associated with a decision for a specific request into the trace.

        Tracing for further inspection if needed.
        :param request_id: A unique ID, to correlate across calls.
        :param trace_data: A string to write to the log.
        """
        if not self.enabled:
            return
        self.logger.info("[%s][%s]", request_id, trace_data)
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/batch_analyzer_engine.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import List, Iterable, Dict, Union, Any, Optional, Iterator, Tuple
3
+
4
+ from presidio_analyzer import DictAnalyzerResult, RecognizerResult, AnalyzerEngine
5
+ from presidio_analyzer.nlp_engine import NlpArtifacts
6
+
7
+ logger = logging.getLogger("presidio_analyzer")
8
+
9
+
10
+ class BatchAnalyzerEngine:
11
+ """
12
+ Batch analysis of documents (tables, lists, dicts).
13
+
14
+ Wrapper class to run Presidio Analyzer Engine on multiple values,
15
+ either lists/iterators of strings, or dictionaries.
16
+
17
+ :param: analyzer_engine: AnalyzerEngine instance to use
18
+ for handling the values in those collections.
19
+ """
20
+
21
+ def __init__(self, analyzer_engine: Optional[AnalyzerEngine] = None):
22
+
23
+ self.analyzer_engine = analyzer_engine
24
+ if not analyzer_engine:
25
+ self.analyzer_engine = AnalyzerEngine()
26
+
27
+ def analyze_iterator(
28
+ self,
29
+ texts: Iterable[Union[str, bool, float, int]],
30
+ language: str,
31
+ **kwargs,
32
+ ) -> List[List[RecognizerResult]]:
33
+ """
34
+ Analyze an iterable of strings.
35
+
36
+ :param texts: An list containing strings to be analyzed.
37
+ :param language: Input language
38
+ :param kwargs: Additional parameters for the `AnalyzerEngine.analyze` method.
39
+ """
40
+
41
+ # validate types
42
+ texts = self._validate_types(texts)
43
+
44
+ # Process the texts as batch for improved performance
45
+ nlp_artifacts_batch: Iterator[
46
+ Tuple[str, NlpArtifacts]
47
+ ] = self.analyzer_engine.nlp_engine.process_batch(
48
+ texts=texts, language=language
49
+ )
50
+
51
+ list_results = []
52
+ for text, nlp_artifacts in nlp_artifacts_batch:
53
+ results = self.analyzer_engine.analyze(
54
+ text=str(text), nlp_artifacts=nlp_artifacts, language=language, **kwargs
55
+ )
56
+
57
+ list_results.append(results)
58
+
59
+ return list_results
60
+
61
+ def analyze_dict(
62
+ self,
63
+ input_dict: Dict[str, Union[Any, Iterable[Any]]],
64
+ language: str,
65
+ keys_to_skip: Optional[List[str]] = None,
66
+ **kwargs,
67
+ ) -> Iterator[DictAnalyzerResult]:
68
+ """
69
+ Analyze a dictionary of keys (strings) and values/iterable of values.
70
+
71
+ Non-string values are returned as is.
72
+
73
+ :param input_dict: The input dictionary for analysis
74
+ :param language: Input language
75
+ :param keys_to_skip: Keys to ignore during analysis
76
+ :param kwargs: Additional keyword arguments
77
+ for the `AnalyzerEngine.analyze` method.
78
+ Use this to pass arguments to the analyze method,
79
+ such as `ad_hoc_recognizers`, `context`, `return_decision_process`.
80
+ See `AnalyzerEngine.analyze` for the full list.
81
+ """
82
+
83
+ context = []
84
+ if "context" in kwargs:
85
+ context = kwargs["context"]
86
+ del kwargs["context"]
87
+
88
+ if not keys_to_skip:
89
+ keys_to_skip = []
90
+
91
+ for key, value in input_dict.items():
92
+ if not value or key in keys_to_skip:
93
+ yield DictAnalyzerResult(key=key, value=value, recognizer_results=[])
94
+ continue # skip this key as requested
95
+
96
+ # Add the key as an additional context
97
+ specific_context = context[:]
98
+ specific_context.append(key)
99
+
100
+ if type(value) in (str, int, bool, float):
101
+ results: List[RecognizerResult] = self.analyzer_engine.analyze(
102
+ text=str(value), language=language, context=[key], **kwargs
103
+ )
104
+ elif isinstance(value, dict):
105
+ new_keys_to_skip = self._get_nested_keys_to_skip(key, keys_to_skip)
106
+ results = self.analyze_dict(
107
+ input_dict=value,
108
+ language=language,
109
+ context=specific_context,
110
+ keys_to_skip=new_keys_to_skip,
111
+ **kwargs,
112
+ )
113
+ elif isinstance(value, Iterable):
114
+ # Recursively iterate nested dicts
115
+
116
+ results: List[List[RecognizerResult]] = self.analyze_iterator(
117
+ texts=value,
118
+ language=language,
119
+ context=specific_context,
120
+ **kwargs,
121
+ )
122
+ else:
123
+ raise ValueError(f"type {type(value)} is unsupported.")
124
+
125
+ yield DictAnalyzerResult(key=key, value=value, recognizer_results=results)
126
+
127
+ @staticmethod
128
+ def _validate_types(value_iterator: Iterable[Any]) -> Iterator[Any]:
129
+ for val in value_iterator:
130
+ if val and not type(val) in (int, float, bool, str):
131
+ err_msg = (
132
+ "Analyzer.analyze_iterator only works "
133
+ "on primitive types (int, float, bool, str). "
134
+ "Lists of objects are not yet supported."
135
+ )
136
+ logger.error(err_msg)
137
+ raise ValueError(err_msg)
138
+ yield val
139
+
140
+ @staticmethod
141
+ def _get_nested_keys_to_skip(key, keys_to_skip):
142
+ new_keys_to_skip = [
143
+ k.replace(f"{key}.", "") for k in keys_to_skip if k.startswith(key)
144
+ ]
145
+ return new_keys_to_skip
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/context_aware_enhancers/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Recognizer registry init."""
2
+ from .context_aware_enhancer import ContextAwareEnhancer
3
+ from .lemma_context_aware_enhancer import LemmaContextAwareEnhancer
4
+
5
+ __all__ = ["ContextAwareEnhancer", "LemmaContextAwareEnhancer"]
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/context_aware_enhancers/context_aware_enhancer.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from abc import abstractmethod
3
+ from typing import List, Optional
4
+
5
+ from presidio_analyzer import RecognizerResult
6
+ from presidio_analyzer import EntityRecognizer
7
+ from presidio_analyzer.nlp_engine import NlpArtifacts
8
+
9
+ logger = logging.getLogger("presidio_analyzer")
10
+
11
+
12
class ContextAwareEnhancer:
    """
    A class representing an abstract context aware enhancer.

    Context words might enhance confidence score of a recognized entity,
    ContextAwareEnhancer is an abstract class to be inherited by a context aware
    enhancer logic.

    :param context_similarity_factor: How much to enhance confidence of match entity
    :param min_score_with_context_similarity: Minimum confidence score
    :param context_prefix_count: how many words before the entity to match context
    :param context_suffix_count: how many words after the entity to match context
    """

    # Bounds for any confidence score produced by an enhancer.
    MIN_SCORE = 0
    MAX_SCORE = 1.0

    def __init__(
        self,
        context_similarity_factor: float,
        min_score_with_context_similarity: float,
        context_prefix_count: int,
        context_suffix_count: int,
    ):
        # How many surrounding words (before/after the match) to inspect.
        self.context_prefix_count = context_prefix_count
        self.context_suffix_count = context_suffix_count
        # Score adjustment knobs used by concrete enhancers.
        self.context_similarity_factor = context_similarity_factor
        self.min_score_with_context_similarity = min_score_with_context_similarity

    @abstractmethod
    def enhance_using_context(
        self,
        text: str,
        raw_results: List[RecognizerResult],
        nlp_artifacts: NlpArtifacts,
        recognizers: List[EntityRecognizer],
        context: Optional[List[str]] = None,
    ) -> List[RecognizerResult]:
        """
        Update results in case surrounding words are relevant to the context words.

        Using the surrounding words of the actual word matches, look
        for specific strings that if found contribute to the score
        of the result, improving the confidence that the match is
        indeed of that PII entity type

        :param text: The actual text that was analyzed
        :param raw_results: Recognizer results which didn't take
        context into consideration
        :param nlp_artifacts: The nlp artifacts contains elements
        such as lemmatized tokens for better
        accuracy of the context enhancement process
        :param recognizers: the list of recognizers
        :param context: list of context words
        """
        # Base implementation: no enhancement, pass results through.
        return raw_results
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/context_aware_enhancers/lemma_context_aware_enhancer.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import logging
3
+ from typing import List, Optional
4
+
5
+ from presidio_analyzer import RecognizerResult
6
+ from presidio_analyzer import EntityRecognizer
7
+ from presidio_analyzer.nlp_engine import NlpArtifacts
8
+ from presidio_analyzer.context_aware_enhancers import ContextAwareEnhancer
9
+
10
+ logger = logging.getLogger("presidio_analyzer")
11
+
12
+
13
+ class LemmaContextAwareEnhancer(ContextAwareEnhancer):
14
+ """
15
+ A class representing a lemma based context aware enhancer logic.
16
+
17
+ Context words might enhance confidence score of a recognized entity,
18
+ LemmaContextAwareEnhancer is an implementation of Lemma based context aware logic,
19
+ it compares spacy lemmas of each word in context of the matched entity to given
20
+ context and the recognizer context words,
21
+ if matched it enhance the recognized entity confidence score by a given factor.
22
+
23
+ :param context_similarity_factor: How much to enhance confidence of match entity
24
+ :param min_score_with_context_similarity: Minimum confidence score
25
+ :param context_prefix_count: how many words before the entity to match context
26
+ :param context_suffix_count: how many words after the entity to match context
27
+ """
28
+
29
+ def __init__(
30
+ self,
31
+ context_similarity_factor: float = 0.35,
32
+ min_score_with_context_similarity: float = 0.4,
33
+ context_prefix_count: int = 5,
34
+ context_suffix_count: int = 0,
35
+ ):
36
+ super().__init__(
37
+ context_similarity_factor=context_similarity_factor,
38
+ min_score_with_context_similarity=min_score_with_context_similarity,
39
+ context_prefix_count=context_prefix_count,
40
+ context_suffix_count=context_suffix_count,
41
+ )
42
+
43
    def enhance_using_context(
        self,
        text: str,
        raw_results: List[RecognizerResult],
        nlp_artifacts: NlpArtifacts,
        recognizers: List[EntityRecognizer],
        context: Optional[List[str]] = None,
    ) -> List[RecognizerResult]:
        """
        Update results in case the lemmas of surrounding words or input context
        words are identical to the context words.

        Using the surrounding words of the actual word matches, look
        for specific strings that if found contribute to the score
        of the result, improving the confidence that the match is
        indeed of that PII entity type

        :param text: The actual text that was analyzed
        :param raw_results: Recognizer results which didn't take
                            context into consideration
        :param nlp_artifacts: The nlp artifacts contains elements
                              such as lemmatized tokens for better
                              accuracy of the context enhancement process
        :param recognizers: the list of recognizers
        :param context: list of context words
        :return: a new list of results (the input list is not mutated), with
                 scores boosted where a supportive context word was found
        """  # noqa D205 D400

        # create a deep copy of the results object, so we can manipulate it
        results = copy.deepcopy(raw_results)

        # create recognizer context dictionary, keyed by the per-instance
        # recognizer id so each result can be mapped back to its recognizer
        recognizers_dict = {recognizer.id: recognizer for recognizer in recognizers}

        # Create empty list in None or lowercase all context words in the list
        if not context:
            context = []
        else:
            context = [word.lower() for word in context]

        # Sanity: without NLP artifacts no context can be extracted,
        # so return the (copied) results unchanged
        if nlp_artifacts is None:
            logger.warning("NLP artifacts were not provided")
            return results

        for result in results:
            recognizer = None
            # get recognizer matching the result, if found.
            if (
                result.recognition_metadata
                and RecognizerResult.RECOGNIZER_IDENTIFIER_KEY
                in result.recognition_metadata.keys()
            ):
                recognizer = recognizers_dict.get(
                    result.recognition_metadata[
                        RecognizerResult.RECOGNIZER_IDENTIFIER_KEY
                    ]
                )

            if not recognizer:
                logger.debug(
                    "Recognizer name not found as part of the "
                    "recognition_metadata dict in the RecognizerResult. "
                )
                continue

            # skip recognizer result if the recognizer doesn't support
            # context enhancement
            if not recognizer.context:
                logger.debug(
                    "recognizer '%s' does not support context enhancement",
                    recognizer.name,
                )
                continue

            # skip context enhancement if already boosted by recognizer level
            if result.recognition_metadata.get(
                RecognizerResult.IS_SCORE_ENHANCED_BY_CONTEXT_KEY
            ):
                logger.debug("result score already boosted, skipping")
                continue

            # extract lemmatized context from the surrounding of the match
            word = text[result.start : result.end]

            surrounding_words = self._extract_surrounding_words(
                nlp_artifacts=nlp_artifacts, word=word, start=result.start
            )

            # combine other sources of context with surrounding words
            surrounding_words.extend(context)

            supportive_context_word = self._find_supportive_word_in_context(
                surrounding_words, recognizer.context
            )
            if supportive_context_word != "":
                # boost the score, but keep it within
                # [min_score_with_context_similarity, MAX_SCORE]
                result.score += self.context_similarity_factor
                result.score = max(result.score, self.min_score_with_context_similarity)
                result.score = min(result.score, ContextAwareEnhancer.MAX_SCORE)

                # Update the explainability object with context information
                # helped to improve the score
                result.analysis_explanation.set_supportive_context_word(
                    supportive_context_word
                )
                result.analysis_explanation.set_improved_score(result.score)
        return results
149
+
150
+ @staticmethod
151
+ def _find_supportive_word_in_context(
152
+ context_list: List[str], recognizer_context_list: List[str]
153
+ ) -> str:
154
+ """
155
+ Find words in the text which are relevant for context evaluation.
156
+
157
+ A word is considered a supportive context word if there's exact match
158
+ between a keyword in context_text and any keyword in context_list.
159
+
160
+ :param context_list words before and after the matched entity within
161
+ a specified window size
162
+ :param recognizer_context_list a list of words considered as
163
+ context keywords manually specified by the recognizer's author
164
+ """
165
+ word = ""
166
+ # If the context list is empty, no need to continue
167
+ if context_list is None or recognizer_context_list is None:
168
+ return word
169
+
170
+ for predefined_context_word in recognizer_context_list:
171
+ # result == true only if any of the predefined context words
172
+ # is found exactly or as a substring in any of the collected
173
+ # context words
174
+ result = next(
175
+ (
176
+ True
177
+ for keyword in context_list
178
+ if predefined_context_word in keyword
179
+ ),
180
+ False,
181
+ )
182
+ if result:
183
+ logger.debug("Found context keyword '%s'", predefined_context_word)
184
+ word = predefined_context_word
185
+ break
186
+
187
+ return word
188
+
189
    def _extract_surrounding_words(
        self, nlp_artifacts: NlpArtifacts, word: str, start: int
    ) -> List[str]:
        """Extract words surrounding another given word.

        The text from which the context is extracted is given in the nlp
        doc.

        :param nlp_artifacts: An abstraction layer which holds different
                              items which are the result of a NLP pipeline
                              execution on a given text
        :param word: The word to look for context around
        :param start: The start index of the word in the original text
        :return: a de-duplicated list of lowercase lemmatized context words
                 collected before and after the match (order not preserved)
        """
        if not nlp_artifacts.tokens:
            logger.info("Skipping context extraction due to lack of NLP artifacts")
            # if there are no nlp artifacts, this is ok, we can
            # extract context and we return a valid, yet empty
            # context
            return [""]

        # Get the already prepared words in the given text, in their
        # LEMMATIZED version
        lemmatized_keywords = nlp_artifacts.keywords

        # since the list of tokens is not necessarily aligned
        # with the actual index of the match, we look for the
        # token index which corresponds to the match
        token_index = self._find_index_of_match_token(
            word, start, nlp_artifacts.tokens, nlp_artifacts.tokens_indices
        )

        # index i belongs to the PII entity, take the preceding n words
        # and the successing m words into a context list

        backward_context = self._add_n_words_backward(
            token_index,
            self.context_prefix_count,
            nlp_artifacts.lemmas,
            lemmatized_keywords,
        )
        forward_context = self._add_n_words_forward(
            token_index,
            self.context_suffix_count,
            nlp_artifacts.lemmas,
            lemmatized_keywords,
        )

        context_list = []
        context_list.extend(backward_context)
        context_list.extend(forward_context)
        # de-duplicate via set(); note this intentionally drops ordering
        context_list = list(set(context_list))
        logger.debug("Context list is: %s", " ".join(context_list))
        return context_list
243
+
244
+ @staticmethod
245
+ def _find_index_of_match_token(
246
+ word: str, start: int, tokens, tokens_indices: List[int] # noqa ANN001
247
+ ) -> int:
248
+ found = False
249
+ # we use the known start index of the original word to find the actual
250
+ # token at that index, we are not checking for equivilance since the
251
+ # token might be just a substring of that word (e.g. for phone number
252
+ # 555-124564 the first token might be just '555' or for a match like '
253
+ # rocket' the actual token will just be 'rocket' hence the misalignment
254
+ # of indices)
255
+ # Note: we are iterating over the original tokens (not the lemmatized)
256
+ i = -1
257
+ for i, token in enumerate(tokens, 0):
258
+ # Either we found a token with the exact location, or
259
+ # we take a token which its characters indices covers
260
+ # the index we are looking for.
261
+ if (tokens_indices[i] == start) or (start < tokens_indices[i] + len(token)):
262
+ # found the interesting token, the one that around it
263
+ # we take n words, we save the matching lemma
264
+ found = True
265
+ break
266
+
267
+ if not found:
268
+ raise ValueError(
269
+ "Did not find word '" + word + "' "
270
+ "in the list of tokens although it "
271
+ "is expected to be found"
272
+ )
273
+ return i
274
+
275
+ @staticmethod
276
+ def _add_n_words(
277
+ index: int,
278
+ n_words: int,
279
+ lemmas: List[str],
280
+ lemmatized_filtered_keywords: List[str],
281
+ is_backward: bool,
282
+ ) -> List[str]:
283
+ """
284
+ Prepare a string of context words.
285
+
286
+ Return a list of words which surrounds a lemma at a given index.
287
+ The words will be collected only if exist in the filtered array
288
+
289
+ :param index: index of the lemma that its surrounding words we want
290
+ :param n_words: number of words to take
291
+ :param lemmas: array of lemmas
292
+ :param lemmatized_filtered_keywords: the array of filtered
293
+ lemmas from the original sentence,
294
+ :param is_backward: if true take the preceeding words, if false,
295
+ take the successing words
296
+ """
297
+ i = index
298
+ context_words = []
299
+ # The entity itself is no interest to us...however we want to
300
+ # consider it anyway for cases were it is attached with no spaces
301
+ # to an interesting context word, so we allow it and add 1 to
302
+ # the number of collected words
303
+
304
+ # collect at most n words (in lower case)
305
+ remaining = n_words + 1
306
+ while 0 <= i < len(lemmas) and remaining > 0:
307
+ lower_lemma = lemmas[i].lower()
308
+ if lower_lemma in lemmatized_filtered_keywords:
309
+ context_words.append(lower_lemma)
310
+ remaining -= 1
311
+ i = i - 1 if is_backward else i + 1
312
+ return context_words
313
+
314
    def _add_n_words_forward(
        self,
        index: int,
        n_words: int,
        lemmas: List[str],
        lemmatized_filtered_keywords: List[str],
    ) -> List[str]:
        """Collect up to ``n_words`` filtered lemmas following ``index``.

        Convenience wrapper around ``_add_n_words`` with is_backward=False.
        """
        return self._add_n_words(
            index, n_words, lemmas, lemmatized_filtered_keywords, False
        )
324
+
325
    def _add_n_words_backward(
        self,
        index: int,
        n_words: int,
        lemmas: List[str],
        lemmatized_filtered_keywords: List[str],
    ) -> List[str]:
        """Collect up to ``n_words`` filtered lemmas preceding ``index``.

        Convenience wrapper around ``_add_n_words`` with is_backward=True.
        """
        return self._add_n_words(
            index, n_words, lemmas, lemmatized_filtered_keywords, True
        )
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/dict_analyzer_result.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import List, Union, Iterator
3
+
4
+ from presidio_analyzer import RecognizerResult
5
+
6
+
7
@dataclass
class DictAnalyzerResult:
    """
    Data class for holding the output of the Presidio Analyzer on dictionaries.

    :param key: key in dictionary
    :param value: value to run analysis on (either string or list of strings)
    :param recognizer_results: Analyzer output for one value.
        Could be either:
        - A list of recognizer results if the input is one string
        - A list of lists of recognizer results, if the input is a list of strings.
        - An iterator of a DictAnalyzerResult, if the input is a dictionary.
        In this case the recognizer_results would be the iterator
        of the DictAnalyzerResults next level in the dictionary.
    """

    # Dictionary key this result belongs to.
    key: str
    # The analyzed value; a dict value produces nested DictAnalyzerResults.
    value: Union[str, List[str], dict]
    # Shape mirrors the shape of `value` (see class docstring).
    recognizer_results: Union[
        List[RecognizerResult],
        List[List[RecognizerResult]],
        Iterator["DictAnalyzerResult"],
    ]
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/entity_recognizer.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from abc import abstractmethod
3
+ from typing import List, Dict, Optional
4
+
5
+ from presidio_analyzer import RecognizerResult
6
+ from presidio_analyzer.nlp_engine import NlpArtifacts
7
+
8
+ logger = logging.getLogger("presidio_analyzer")
9
+
10
+
11
class EntityRecognizer:
    """
    A class representing an abstract PII entity recognizer.

    EntityRecognizer is an abstract class to be inherited by
    Recognizers which hold the logic for recognizing specific PII entities.

    EntityRecognizer exposes a method called enhance_using_context which
    can be overridden in case a custom context aware enhancement is needed
    in derived class of a recognizer.

    :param supported_entities: the entities supported by this recognizer
    (for example, phone number, address, etc.)
    :param supported_language: the language supported by this recognizer.
    The supported langauge code is iso6391Name
    :param name: the name of this recognizer (optional)
    :param version: the recognizer current version
    :param context: a list of words which can help boost confidence score
    when they appear in context of the matched entity
    """

    # Bounds for recognizer confidence scores.
    MIN_SCORE = 0
    MAX_SCORE = 1.0

    def __init__(
        self,
        supported_entities: List[str],
        name: Optional[str] = None,
        supported_language: str = "en",
        version: str = "0.0.1",
        context: Optional[List[str]] = None,
    ):

        self.supported_entities = supported_entities

        if name is None:
            self.name = self.__class__.__name__  # assign class name as name
        else:
            self.name = name

        # Unique per-instance identifier; used to map RecognizerResults
        # back to the recognizer that produced them.
        self._id = f"{self.name}_{id(self)}"

        self.supported_language = supported_language
        self.version = version
        self.is_loaded = False
        self.context = context if context else []

        # Assets (e.g. ML models) are loaded eagerly at construction time.
        self.load()
        logger.info("Loaded recognizer: %s", self.name)
        self.is_loaded = True

    @property
    def id(self):
        """Return a unique identifier of this recognizer."""

        return self._id

    @abstractmethod
    def load(self) -> None:
        """
        Initialize the recognizer assets if needed.

        (e.g. machine learning models)
        """

    @abstractmethod
    def analyze(
        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts
    ) -> List[RecognizerResult]:
        """
        Analyze text to identify entities.

        :param text: The text to be analyzed
        :param entities: The list of entities this recognizer is able to detect
        :param nlp_artifacts: A group of attributes which are the result of
                              an NLP process over the input text.
        :return: List of results detected by this recognizer.
        """
        return None

    def enhance_using_context(
        self,
        text: str,
        raw_recognizer_results: List[RecognizerResult],
        other_raw_recognizer_results: List[RecognizerResult],
        nlp_artifacts: NlpArtifacts,
        context: Optional[List[str]] = None,
    ) -> List[RecognizerResult]:
        """Enhance confidence score using context of the entity.

        Override this method in derived class in case a custom logic
        is needed, otherwise return value will be equal to
        raw_results.

        in case a result score is boosted, derived class need to update
        result.recognition_metadata[RecognizerResult.IS_SCORE_ENHANCED_BY_CONTEXT_KEY]

        :param text: The actual text that was analyzed
        :param raw_recognizer_results: This recognizer's results, to be updated
        based on recognizer specific context.
        :param other_raw_recognizer_results: Other recognizer results matched in
        the given text to allow related entity context enhancement
        :param nlp_artifacts: The nlp artifacts contains elements
                              such as lemmatized tokens for better
                              accuracy of the context enhancement process
        :param context: list of context words
        """
        # Default implementation performs no enhancement.
        return raw_recognizer_results

    def get_supported_entities(self) -> List[str]:
        """
        Return the list of entities this recognizer can identify.

        :return: A list of the supported entities by this recognizer
        """
        return self.supported_entities

    def get_supported_language(self) -> str:
        """
        Return the language this recognizer can support.

        :return: A list of the supported language by this recognizer
        """
        return self.supported_language

    def get_version(self) -> str:
        """
        Return the version of this recognizer.

        :return: The current version of this recognizer
        """
        return self.version

    def to_dict(self) -> Dict:
        """
        Serialize self to dictionary.

        :return: a dictionary
        """
        return_dict = {
            "supported_entities": self.supported_entities,
            "supported_language": self.supported_language,
            "name": self.name,
            "version": self.version,
        }
        return return_dict

    @classmethod
    def from_dict(cls, entity_recognizer_dict: Dict) -> "EntityRecognizer":
        """
        Create EntityRecognizer from a dict input.

        :param entity_recognizer_dict: Dict containing keys and values for instantiation
        """
        return cls(**entity_recognizer_dict)

    @staticmethod
    def remove_duplicates(results: List[RecognizerResult]) -> List[RecognizerResult]:
        """
        Remove duplicate results.

        Remove duplicates in case the two results
        have identical start and ends and types.
        :param results: List[RecognizerResult]
        :return: List[RecognizerResult]
        """
        # De-duplicate exact equals first, then order candidates by
        # higher score, earlier start, and longer span.
        results = list(set(results))
        results = sorted(results, key=lambda x: (-x.score, x.start, -(x.end - x.start)))
        filtered_results = []

        for result in results:
            # Zero-score results carry no signal; drop them.
            if result.score == 0:
                continue

            to_keep = result not in filtered_results  # equals based comparison
            if to_keep:
                for filtered in filtered_results:
                    # If result is contained in one of the other results
                    if (
                        result.contained_in(filtered)
                        and result.entity_type == filtered.entity_type
                    ):
                        to_keep = False
                        break

            if to_keep:
                filtered_results.append(result)

        return filtered_results
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/local_recognizer.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from abc import ABC
2
+
3
+ from presidio_analyzer import EntityRecognizer
4
+
5
+
6
# Marker base class: distinguishes in-process recognizers from remote ones.
class LocalRecognizer(ABC, EntityRecognizer):
    """PII entity recognizer which runs on the same process as the AnalyzerEngine."""
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """NLP engine package. Performs text pre-processing."""
2
+
3
+ from .nlp_artifacts import NlpArtifacts
4
+ from .nlp_engine import NlpEngine
5
+ from .spacy_nlp_engine import SpacyNlpEngine
6
+ from .client_nlp_engine import ClientNlpEngine
7
+ from .stanza_nlp_engine import StanzaNlpEngine
8
+ from .transformers_nlp_engine import TransformersNlpEngine
9
+ from .nlp_engine_provider import NlpEngineProvider
10
+
11
+ __all__ = [
12
+ "NlpArtifacts",
13
+ "NlpEngine",
14
+ "SpacyNlpEngine",
15
+ "ClientNlpEngine",
16
+ "StanzaNlpEngine",
17
+ "NlpEngineProvider",
18
+ "TransformersNlpEngine",
19
+ ]
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/client_nlp_engine.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging

try:
    # Optional dependencies: the client-based engine is only usable
    # when these packages are installed.
    import client
    import spacy_client
except ImportError:
    # Mark BOTH modules as unavailable so later references degrade
    # gracefully instead of raising NameError (the original only
    # assigned `client`, leaving `spacy_client` undefined).
    client = None
    spacy_client = None
9
+
10
+ from typing import Optional, Dict, Iterator, Tuple, Union, List
11
+
12
+ import spacy
13
+ from spacy.language import Language
14
+ from spacy.tokens import Doc
15
+
16
+ from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngine
17
+
18
+ logger = logging.getLogger("presidio_analyzer")
19
+
20
+
21
class ClientNlpEngine(NlpEngine):
    """
    ClientNlpEngine is an abstraction layer over the nlp module.

    It provides processing functionality as well as other queries
    on tokens.
    This engine currently uses SpaCy as its NLP module.

    NOTE(review): this class is presently an exact copy of SpacyNlpEngine;
    presumably it is intended to delegate to the optional ``client`` /
    ``spacy_client`` packages imported at module level — confirm intent.
    """

    # NOTE(review): sharing engine_name "spacy" with SpacyNlpEngine means
    # NlpEngineProvider's name->engine mapping keeps only one of the two
    # classes — confirm whether this should be a distinct name.
    engine_name = "spacy"

    # Engine availability is keyed off spacy, not the `client` package.
    is_available = bool(spacy)

    def __init__(self, models: Optional[Dict[str, str]] = None):
        """
        Initialize a wrapper on spaCy functionality.

        :param models: Dictionary with the name of the spaCy model per language.
                       For example: models = {"en": "en_core_web_lg"}
        """
        if not models:
            models = {"en": "en_core_web_lg"}
        logger.debug(f"Loading SpaCy models: {models.values()}")

        # One loaded pipeline per language code; parser disabled since
        # dependency parsing is not needed here.
        self.nlp = {
            lang_code: spacy.load(model_name, disable=["parser"])
            for lang_code, model_name in models.items()
        }

    def process_text(self, text: str, language: str) -> NlpArtifacts:
        """Execute the SpaCy NLP pipeline on the given text and language."""

        doc = self.nlp[language](text)
        return self._doc_to_nlp_artifact(doc, language)

    def process_batch(
        self,
        texts: Union[List[str], List[Tuple[str, object]]],
        language: str,
        as_tuples: bool = False,
    ) -> Iterator[Optional[NlpArtifacts]]:
        """Execute the NLP pipeline on a batch of texts using spacy pipe."""
        texts = (str(text) for text in texts)
        docs = self.nlp[language].pipe(texts, as_tuples=as_tuples)
        for doc in docs:
            yield doc.text, self._doc_to_nlp_artifact(doc, language)

    def is_stopword(self, word: str, language: str) -> bool:
        """
        Return true if the given word is a stop word.

        (within the given language)
        """
        return self.nlp[language].vocab[word].is_stop

    def is_punct(self, word: str, language: str) -> bool:
        """
        Return true if the given word is a punctuation word.

        (within the given language).
        """
        return self.nlp[language].vocab[word].is_punct

    def get_nlp(self, language: str) -> Language:
        """
        Return the language model loaded for a language.

        :param language: Name of language
        :return: Language model from spaCy
        """
        return self.nlp[language]

    def _doc_to_nlp_artifact(self, doc: Doc, language: str) -> NlpArtifacts:
        # Package per-token data from the spaCy doc for the recognizers.
        lemmas = [token.lemma_ for token in doc]
        tokens_indices = [token.idx for token in doc]
        entities = doc.ents
        return NlpArtifacts(
            entities=entities,
            tokens=doc,
            tokens_indices=tokens_indices,
            lemmas=lemmas,
            nlp_engine=self,
            language=language,
        )
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from typing import List
3
+
4
+ from spacy.tokens import Doc, Span
5
+
6
+
7
class NlpArtifacts:
    """
    NlpArtifacts is an abstraction layer over the results of an NLP pipeline.

    processing over a given text, it holds attributes such as entities,
    tokens and lemmas which can be used by any recognizer
    """

    def __init__(
        self,
        entities: List[Span],
        tokens: Doc,
        tokens_indices: List[int],
        lemmas: List[str],
        nlp_engine,  # noqa ANN001
        language: str,
    ):
        self.entities = entities
        self.tokens = tokens
        self.lemmas = lemmas
        self.tokens_indices = tokens_indices
        # Keywords are derived once here from the lemmas (see set_keywords).
        self.keywords = self.set_keywords(nlp_engine, lemmas, language)
        self.nlp_engine = nlp_engine

    @staticmethod
    def set_keywords(
        nlp_engine, lemmas: List[str], language: str  # noqa ANN001
    ) -> List[str]:
        """
        Return keywords for text.

        Extracts lemmas with certain conditions as keywords: lowercased,
        excluding stop words, punctuation and spaCy's legacy "-PRON-"
        pronoun lemma as well as the verb "be".
        """
        if not nlp_engine:
            return []
        keywords = [
            k.lower()
            for k in lemmas
            if not nlp_engine.is_stopword(k, language)
            and not nlp_engine.is_punct(k, language)
            and k != "-PRON-"
            and k != "be"
        ]

        # best effort, try even further to break tokens into sub tokens,
        # this can result in reducing false negatives
        keywords = [i.split(":") for i in keywords]

        # splitting the list can, if happened, will result in list of lists,
        # we flatten the list
        keywords = [item for sublist in keywords for item in sublist]
        return keywords

    def to_json(self) -> str:
        """Convert nlp artifacts to json."""

        return_dict = self.__dict__.copy()

        # Ignore NLP engine as it's not serializable currently
        del return_dict["nlp_engine"]

        # Converting spaCy tokens and spans to string as they are not serializable
        if "tokens" in return_dict:
            return_dict["tokens"] = [token.text for token in self.tokens]
        if "entities" in return_dict:
            return_dict["entities"] = [entity.text for entity in self.entities]

        return json.dumps(return_dict)
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/nlp_engine.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from typing import Iterable, Iterator, Tuple
3
+
4
+ from presidio_analyzer.nlp_engine import NlpArtifacts
5
+
6
+
7
class NlpEngine(ABC):
    """
    NlpEngine is an abstraction layer over the nlp module.

    It provides NLP preprocessing functionality as well as other queries
    on tokens.
    """

    @abstractmethod
    def process_text(self, text: str, language: str) -> NlpArtifacts:
        """Execute the NLP pipeline on the given text and language."""

    @abstractmethod
    def process_batch(
        self, texts: Iterable[str], language: str, **kwargs
    ) -> Iterator[Tuple[str, NlpArtifacts]]:
        """Execute the NLP pipeline on a batch of texts.

        Returns a tuple of (text, NlpArtifacts)
        """

    @abstractmethod
    def is_stopword(self, word: str, language: str) -> bool:
        """
        Return true if the given word is a stop word.

        (within the given language)
        """

    @abstractmethod
    def is_punct(self, word: str, language: str) -> bool:
        """
        Return true if the given word is a punctuation word.

        (within the given language)
        """
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Optional, Dict, Union, Tuple
4
+
5
+ import yaml
6
+
7
+ from presidio_analyzer.nlp_engine import (
8
+ StanzaNlpEngine,
9
+ SpacyNlpEngine,
10
+ NlpEngine,
11
+ ClientNlpEngine,
12
+ TransformersNlpEngine,
13
+ )
14
+
15
+ logger = logging.getLogger("presidio_analyzer")
16
+
17
+
18
class NlpEngineProvider:
    """Create different NLP engines from configuration.

    :param nlp_engines: List of available NLP engines.
        Default: (SpacyNlpEngine, StanzaNlpEngine,
        TransformersNlpEngine, ClientNlpEngine)
    :param nlp_configuration: Dict containing nlp configuration
    :example: configuration:
        {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en",
                        "model_name": "en_core_web_lg"
                      }]
        }
        Nlp engine names available by default: spacy, stanza.
    :param conf_file: Path to yaml file containing nlp engine configuration.
    """

    def __init__(
        self,
        nlp_engines: Optional[Tuple] = None,
        conf_file: Optional[Union[Path, str]] = None,
        nlp_configuration: Optional[Dict] = None,
    ):

        if not nlp_engines:
            nlp_engines = (
                SpacyNlpEngine,
                StanzaNlpEngine,
                TransformersNlpEngine,
                ClientNlpEngine,
            )

        # Keep only the engines whose optional dependencies are installed.
        self.nlp_engines = {
            engine.engine_name: engine for engine in nlp_engines if engine.is_available
        }

        logger.debug(
            f"Loaded these available nlp engines: {list(self.nlp_engines.keys())}"
        )

        if conf_file and nlp_configuration:
            raise ValueError(
                "Either conf_file or nlp_configuration should be provided, not both."
            )

        if nlp_configuration:
            self.nlp_configuration = nlp_configuration

        if conf_file:
            self.nlp_configuration = self._read_nlp_conf(conf_file)

        if not conf_file and not nlp_configuration:
            # Neither was given: fall back to the packaged default conf file.
            conf_file = self._get_full_conf_path()
            logger.debug(f"Reading default conf file from {conf_file}")
            self.nlp_configuration = self._read_nlp_conf(conf_file)

    def create_engine(self) -> NlpEngine:
        """Create an NLP engine instance.

        :return: an instance of the configured NlpEngine subclass
        :raises ValueError: when the configuration is missing/invalid or the
                            requested engine is not available
        """
        if (
            not self.nlp_configuration
            or not self.nlp_configuration.get("models")
            or not self.nlp_configuration.get("nlp_engine_name")
        ):
            raise ValueError(
                "Illegal nlp configuration. "
                "Configuration should include nlp_engine_name and models "
                "(list of model_name for each lang_code)."
            )
        nlp_engine_name = self.nlp_configuration["nlp_engine_name"]
        if nlp_engine_name not in self.nlp_engines:
            raise ValueError(
                f"NLP engine '{nlp_engine_name}' is not available. "
                "Make sure you have all required packages installed"
            )
        try:
            nlp_engine_class = self.nlp_engines[nlp_engine_name]
            # Map lang_code -> model_name as expected by the engine __init__.
            nlp_engine_opts = {
                m["lang_code"]: m["model_name"]
                for m in self.nlp_configuration["models"]
            }
            engine = nlp_engine_class(nlp_engine_opts)
            logger.info(
                f"Created NLP engine: {engine.engine_name}. "
                f"Loaded models: {list(engine.nlp.keys())}"
            )
            return engine
        except KeyError as err:
            # Chain the original KeyError so the offending key is visible.
            raise ValueError("Wrong NLP engine configuration") from err

    @staticmethod
    def _read_nlp_conf(conf_file: Union[Path, str]) -> dict:
        """Read the nlp configuration from a provided yaml file."""

        if not Path(conf_file).exists():
            nlp_configuration = {
                "nlp_engine_name": "spacy",
                "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
            }
            logger.warning(
                f"configuration file {conf_file} not found. "
                f"Using default config: {nlp_configuration}."
            )

        else:
            # Use a context manager so the file handle is always closed
            # (previously the handle from a bare open() was leaked).
            with open(conf_file) as f:
                nlp_configuration = yaml.safe_load(f)

        return nlp_configuration

    @staticmethod
    def _get_full_conf_path(
        default_conf_file: Union[Path, str] = "default.yaml"
    ) -> Path:
        """Return a Path to the default conf file."""
        return Path(Path(__file__).parent.parent.parent, "conf", default_conf_file)
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Optional, Dict, Iterator, Tuple, Union, List
3
+
4
+ import spacy
5
+ from spacy.language import Language
6
+ from spacy.tokens import Doc
7
+
8
+ from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngine
9
+
10
+ logger = logging.getLogger("presidio_analyzer")
11
+
12
+
13
class SpacyNlpEngine(NlpEngine):
    """
    SpacyNlpEngine is an abstraction layer over the nlp module.

    It provides processing functionality as well as other queries
    on tokens.
    The SpacyNlpEngine uses SpaCy as its NLP module
    """

    engine_name = "spacy"
    is_available = bool(spacy)

    def __init__(self, models: Optional[Dict[str, str]] = None):
        """
        Initialize a wrapper on spaCy functionality.

        :param models: Dictionary with the name of the spaCy model per language.
                       For example: models = {"en": "en_core_web_lg"}
        """
        # Fall back to the large English model when nothing is configured.
        models = models or {"en": "en_core_web_lg"}
        logger.debug(f"Loading SpaCy models: {models.values()}")

        # One loaded pipeline per language code; the dependency parser is
        # disabled as it is not required here.
        self.nlp = {}
        for lang_code, model_name in models.items():
            self.nlp[lang_code] = spacy.load(model_name, disable=["parser"])

    def process_text(self, text: str, language: str) -> NlpArtifacts:
        """Execute the SpaCy NLP pipeline on the given text and language."""
        return self._doc_to_nlp_artifact(self.nlp[language](text), language)

    def process_batch(
        self,
        texts: Union[List[str], List[Tuple[str, object]]],
        language: str,
        as_tuples: bool = False,
    ) -> Iterator[Optional[NlpArtifacts]]:
        """Execute the NLP pipeline on a batch of texts using spacy pipe."""
        stringified = (str(item) for item in texts)
        for doc in self.nlp[language].pipe(stringified, as_tuples=as_tuples):
            yield doc.text, self._doc_to_nlp_artifact(doc, language)

    def is_stopword(self, word: str, language: str) -> bool:
        """
        Return true if the given word is a stop word.

        (within the given language)
        """
        return self.nlp[language].vocab[word].is_stop

    def is_punct(self, word: str, language: str) -> bool:
        """
        Return true if the given word is a punctuation word.

        (within the given language).
        """
        return self.nlp[language].vocab[word].is_punct

    def get_nlp(self, language: str) -> Language:
        """
        Return the language model loaded for a language.

        :param language: Name of language
        :return: Language model from spaCy
        """
        return self.nlp[language]

    def _doc_to_nlp_artifact(self, doc: Doc, language: str) -> NlpArtifacts:
        # Extract per-token data once and package it for the recognizers.
        return NlpArtifacts(
            entities=doc.ents,
            tokens=doc,
            tokens_indices=[token.idx for token in doc],
            lemmas=[token.lemma_ for token in doc],
            nlp_engine=self,
            language=language,
        )
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
# Stanza support is optional: degrade gracefully when the extras
# (`stanza`, `spacy_stanza`) are not installed.
try:
    import stanza
    import spacy_stanza
except ImportError:
    # Reset BOTH names. Previously only `stanza` was set to None, so a
    # partial install (stanza present, spacy_stanza missing) left
    # `spacy_stanza` undefined while `is_available` still read truthy.
    stanza = None
    spacy_stanza = None
8
+
9
+ from presidio_analyzer.nlp_engine import SpacyNlpEngine
10
+
11
+ logger = logging.getLogger("presidio_analyzer")
12
+
13
+
14
class StanzaNlpEngine(SpacyNlpEngine):
    """
    StanzaNlpEngine is an abstraction layer over the nlp module.

    It provides processing functionality as well as other queries
    on tokens.
    The StanzaNlpEngine uses spacy-stanza and stanza as its NLP module

    :param models: Dictionary with the name of the stanza model per language.
    For example: models = {"en": "en"}
    """

    engine_name = "stanza"
    is_available = bool(stanza)

    def __init__(self, models=None):  # noqa ANN201
        # Default to the English stanza model when nothing is configured.
        if not models:
            models = {"en": "en"}
        logger.debug(f"Loading Stanza models: {models.values()}")

        # One spacy-stanza pipeline per configured language.
        self.nlp = {}
        for lang_code, model_name in models.items():
            self.nlp[lang_code] = spacy_stanza.load_pipeline(
                model_name,
                processors="tokenize,pos,lemma,ner",
            )
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Optional, Dict
3
+
4
+ import spacy
5
+ from spacy.language import Language
6
+ from spacy.tokens import Doc, Span
7
+
8
+ from presidio_analyzer.nlp_engine import SpacyNlpEngine
9
+
10
+
11
+ try:
12
+ import torch
13
+ import transformers
14
+ from transformers import (
15
+ AutoTokenizer,
16
+ AutoModelForTokenClassification,
17
+ pipeline,
18
+ )
19
+ except ImportError:
20
+ torch = None
21
+ transformers = None
22
+
23
+ logger = logging.getLogger("presidio_analyzer")
24
+
25
+
26
@Language.factory(
    "transformers",
    default_config={"pretrained_model_name_or_path": "dslim/bert-base-NER"},
)
def create_transformer_component(nlp, name, pretrained_model_name_or_path: str):
    """Spacy Language factory for creating custom component.

    Registered under the name "transformers" so the component can be added
    to a spaCy pipeline via ``nlp.add_pipe("transformers", config={...})``.
    """
    return TransformersComponent(
        pretrained_model_name_or_path=pretrained_model_name_or_path
    )
35
+
36
+
37
class TransformersComponent:
    """
    Custom component to use in spacy pipeline.

    Uses HuggingFace transformers pretrained models for entity recognition.

    :param pretrained_model_name_or_path: HuggingFace pretrained_model_name_or_path
    """

    def __init__(self, pretrained_model_name_or_path: str) -> None:
        # Expose per-entity model confidence on spans; force=True allows
        # re-registration when several pipelines are built in one process.
        Span.set_extension("confidence_score", default=1.0, force=True)
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
        model = AutoModelForTokenClassification.from_pretrained(
            pretrained_model_name_or_path
        )
        # "simple" aggregation merges word-piece tokens into whole entities.
        self.nlp = pipeline(
            "ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple"
        )

    def __call__(self, doc: Doc) -> Doc:
        """Write transformers results to doc entities."""

        res = self.nlp(doc.text)
        ents = []
        for d in res:
            # Transformers character offsets may not coincide with spaCy token
            # boundaries; "expand" snaps the span to the covering tokens.
            span = doc.char_span(
                d["start"], d["end"], label=d["entity_group"], alignment_mode="expand"
            )
            if span is not None:
                span._.confidence_score = d["score"]
                ents.append(span)
            else:
                logger.warning(
                    f"Transformers model returned {d} but no valid span was found."
                )
        doc.ents = ents
        return doc
73
+
74
+
75
class TransformersNlpEngine(SpacyNlpEngine):
    """
    SpacyTransformersNlpEngine is a transformers based NlpEngine.

    It comprises a spacy pipeline used for tokenization,
    lemmatization, pos, and a transformers component for NER.

    Both the underlying spacy pipeline and the transformers engine could be
    configured by the user.

    :param models: a dictionary containing the model names per language.
    :example:
        {
            "en": {
                "spacy": "en_core_web_sm",
                "transformers": "dslim/bert-base-NER"
            }
        }

    Note that since the spaCy model is not used for NER,
    we recommend using a simple model, such as en_core_web_sm for English.
    For potential Transformers models, see a list of models here:
    https://huggingface.co/models?pipeline_tag=token-classification
    It is further recommended to fine-tune these models
    to the specific scenario in hand.

    :raises KeyError: if ``models`` (or any of its values) has the wrong
        shape, or if a value lacks the "spacy"/"transformers" keys.
    """

    engine_name = "transformers"
    is_available = bool(spacy) and bool(transformers)

    def __init__(self, models: Optional[Dict[str, Dict[str, str]]] = None):
        # Default models if not specified.
        if not models:
            models = {
                "en": {"spacy": "en_core_web_sm", "transformers": "dslim/bert-base-NER"}
            }
        # Validate the overall `models` type.
        elif type(models) is not dict:
            logger.error(f"'models' argument must be dict, not {type(models)}")
            raise KeyError(f"Expected 'models' argument to be dict, not {type(models)}")
        # Validate that every per-language value is itself a dict.
        # BUGFIX: the previous error path indexed models["model_name"], which
        # raised an unrelated KeyError instead of the intended message.
        elif any(type(model_dict) is not dict for model_dict in models.values()):
            bad = {
                lang: type(model_dict).__name__
                for lang, model_dict in models.items()
                if type(model_dict) is not dict
            }
            logger.error(f"Each value of 'models' must be dict, got {bad}")
            raise KeyError(f"Expected each value of 'models' to be dict, got {bad}")
        # Check that each per-language dict includes both required keys.
        elif any(
            any(key not in model_dict for key in ("spacy", "transformers"))
            for model_dict in models.values()
        ):
            logger.error(
                "Each value of 'models' must contain 'spacy' and 'transformers' keys"
            )
            raise KeyError(
                "Expected keys ('spacy' and 'transformers') was not found in "
                "models.model_name dict"
            )

        logger.debug(f"Loading SpaCy and transformers models: {models.values()}")

        self.nlp = {}
        for lang_code, model_name in models.items():
            # spaCy handles tokenization/lemmas/POS; NER is delegated to the
            # transformers pipe appended below, so spaCy's own ner is disabled.
            nlp = spacy.load(model_name["spacy"], disable=["parser", "ner"])
            nlp.add_pipe(
                "transformers",
                config={"pretrained_model_name_or_path": model_name["transformers"]},
                last=True,
            )
            self.nlp[lang_code] = nlp
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/pattern.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from typing import Dict
3
+
4
+
5
class Pattern:
    """
    A named regular-expression pattern with a confidence score.

    :param name: the name of the pattern
    :param regex: the regex pattern to detect
    :param score: the pattern's strength (values varies 0-1)
    """

    def __init__(self, name: str, regex: str, score: float):
        self.name = name
        self.regex = regex
        self.score = score

    def to_dict(self) -> Dict:
        """
        Turn this instance into a dictionary.

        :return: a dictionary
        """
        return {"name": self.name, "score": self.score, "regex": self.regex}

    @classmethod
    def from_dict(cls, pattern_dict: Dict) -> "Pattern":
        """
        Load an instance from a dictionary.

        :param pattern_dict: a dictionary holding the pattern's parameters
        :return: a Pattern instance
        """
        return cls(**pattern_dict)

    def __repr__(self):
        """Return string representation of instance."""
        return json.dumps(self.to_dict())

    def __str__(self):
        """Return string representation of instance."""
        return self.__repr__()
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/pattern_recognizer.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import logging
3
+ from typing import List, Optional, Dict
4
+
5
+ import regex as re
6
+
7
+ from presidio_analyzer import (
8
+ LocalRecognizer,
9
+ Pattern,
10
+ RecognizerResult,
11
+ EntityRecognizer,
12
+ AnalysisExplanation,
13
+ )
14
+ from presidio_analyzer.nlp_engine import NlpArtifacts
15
+
16
+ logger = logging.getLogger("presidio_analyzer")
17
+
18
+
19
class PatternRecognizer(LocalRecognizer):
    """
    PII entity recognizer using regular expressions or deny-lists.

    :param supported_entity: The single entity type this recognizer detects
    :param name: Name of the recognizer (defaults per EntityRecognizer)
    :param supported_language: Language this recognizer supports
    :param patterns: A list of patterns to detect
    :param deny_list: A list of words to detect,
        in case our recognizer uses a predefined list of words (deny list)
    :param context: list of context words
    :param deny_list_score: confidence score for a term
        identified using a deny-list
    :param version: Version string of the recognizer
    :raises ValueError: if no entity, or neither patterns nor deny list, given
    """

    def __init__(
        self,
        supported_entity: str,
        name: Optional[str] = None,
        supported_language: str = "en",
        patterns: Optional[List[Pattern]] = None,
        deny_list: Optional[List[str]] = None,
        context: Optional[List[str]] = None,
        deny_list_score: float = 1.0,
        version: str = "0.0.1",
    ):

        if not supported_entity:
            raise ValueError("Pattern recognizer should be initialized with entity")

        if not patterns and not deny_list:
            raise ValueError(
                "Pattern recognizer should be initialized with patterns"
                " or with deny list"
            )

        super().__init__(
            supported_entities=[supported_entity],
            supported_language=supported_language,
            name=name,
            version=version,
        )
        if patterns is None:
            self.patterns = []
        else:
            self.patterns = patterns
        self.context = context
        self.deny_list_score = deny_list_score

        # A deny list is compiled to one regex pattern and treated exactly
        # like any other pattern from here on.
        if deny_list:
            deny_list_pattern = self._deny_list_to_regex(deny_list)
            self.patterns.append(deny_list_pattern)
            self.deny_list = deny_list
        else:
            self.deny_list = []

    def load(self):  # noqa D102
        # Nothing to load: patterns are provided at construction time.
        pass

    def analyze(
        self,
        text: str,
        entities: List[str],
        nlp_artifacts: Optional[NlpArtifacts] = None,
        regex_flags: Optional[int] = None,
    ) -> List[RecognizerResult]:
        """
        Analyzes text to detect PII using regular expressions or deny-lists.

        :param text: Text to be analyzed
        :param entities: Entities this recognizer can detect
        :param nlp_artifacts: Output values from the NLP engine
        :param regex_flags: Flags passed to the regex engine
            (defaults to DOTALL | MULTILINE inside __analyze_patterns)
        :return: List of detected entity results
        """
        results = []

        if self.patterns:
            pattern_result = self.__analyze_patterns(text, regex_flags)
            results.extend(pattern_result)

        return results

    def _deny_list_to_regex(self, deny_list: List[str]) -> Pattern:
        """
        Convert a list of words to a matching regex.

        To be analyzed by the analyze method as any other regex patterns.

        :param deny_list: the list of words to detect
        :return:the regex of the words for detection
        """

        # Escape deny list elements as preparation for regex
        escaped_deny_list = [re.escape(element) for element in deny_list]
        # Word-ish boundary: match only when preceded/followed by a
        # non-word char or the string edge.
        regex = r"(?:^|(?<=\W))(" + "|".join(escaped_deny_list) + r")(?:(?=\W)|$)"
        return Pattern(name="deny_list", regex=regex, score=self.deny_list_score)

    def validate_result(self, pattern_text: str) -> Optional[bool]:
        """
        Validate the pattern logic e.g., by running checksum on a detected pattern.

        :param pattern_text: the text to validated.
        Only the part in text that was detected by the regex engine
        :return: A bool indicating whether the validation was successful.
            None (the default) means "no validation performed".
        """
        return None

    def invalidate_result(self, pattern_text: str) -> Optional[bool]:
        """
        Logic to check for result invalidation by running pruning logic.

        For example, each SSN number group should not consist of all the same digits.

        :param pattern_text: the text to validated.
        Only the part in text that was detected by the regex engine
        :return: A bool indicating whether the result is invalidated
        """
        return None

    @staticmethod
    def build_regex_explanation(
        recognizer_name: str,
        pattern_name: str,
        pattern: str,
        original_score: float,
        validation_result: bool,
    ) -> AnalysisExplanation:
        """
        Construct an explanation for why this entity was detected.

        :param recognizer_name: Name of recognizer detecting the entity
        :param pattern_name: Regex pattern name which detected the entity
        :param pattern: Regex pattern logic
        :param original_score: Score given by the recognizer
        :param validation_result: Whether validation was used and its result
        :return: Analysis explanation
        """
        explanation = AnalysisExplanation(
            recognizer=recognizer_name,
            original_score=original_score,
            pattern_name=pattern_name,
            pattern=pattern,
            validation_result=validation_result,
        )
        return explanation

    def __analyze_patterns(
        self, text: str, flags: Optional[int] = None
    ) -> List[RecognizerResult]:
        """
        Evaluate all patterns in the provided text.

        Including words in the provided deny-list

        :param text: text to analyze
        :param flags: regex flags
        :return: A list of RecognizerResult
        """
        flags = flags if flags else re.DOTALL | re.MULTILINE
        results = []
        for pattern in self.patterns:
            # Time each pattern; slow regexes are a common perf problem.
            match_start_time = datetime.datetime.now()
            matches = re.finditer(pattern.regex, text, flags=flags)
            match_time = datetime.datetime.now() - match_start_time
            logger.debug(
                "--- match_time[%s]: %s.%s seconds",
                pattern.name,
                match_time.seconds,
                match_time.microseconds,
            )

            for match in matches:
                start, end = match.span()
                current_match = text[start:end]

                # Skip empty results
                if current_match == "":
                    continue

                score = pattern.score

                validation_result = self.validate_result(current_match)
                description = self.build_regex_explanation(
                    self.name, pattern.name, pattern.regex, score, validation_result
                )
                pattern_result = RecognizerResult(
                    entity_type=self.supported_entities[0],
                    start=start,
                    end=end,
                    score=score,
                    analysis_explanation=description,
                    recognition_metadata={
                        RecognizerResult.RECOGNIZER_NAME_KEY: self.name,
                        RecognizerResult.RECOGNIZER_IDENTIFIER_KEY: self.id,
                    },
                )

                # Checksum validation overrides the pattern score entirely:
                # success pins MAX_SCORE, failure pins MIN_SCORE.
                if validation_result is not None:
                    if validation_result:
                        pattern_result.score = EntityRecognizer.MAX_SCORE
                    else:
                        pattern_result.score = EntityRecognizer.MIN_SCORE

                invalidation_result = self.invalidate_result(current_match)
                if invalidation_result is not None and invalidation_result:
                    pattern_result.score = EntityRecognizer.MIN_SCORE

                # Results that ended up at MIN_SCORE are discarded.
                if pattern_result.score > EntityRecognizer.MIN_SCORE:
                    results.append(pattern_result)

                # Update analysis explanation score following validation or invalidation
                description.score = pattern_result.score

        results = EntityRecognizer.remove_duplicates(results)
        return results

    def to_dict(self) -> Dict:
        """Serialize instance into a dictionary."""
        return_dict = super().to_dict()

        # Flatten to the single-entity form used for pattern recognizers.
        return_dict["patterns"] = [pat.to_dict() for pat in self.patterns]
        return_dict["deny_list"] = self.deny_list
        return_dict["context"] = self.context
        return_dict["supported_entity"] = return_dict["supported_entities"][0]
        del return_dict["supported_entities"]

        return return_dict

    @classmethod
    def from_dict(cls, entity_recognizer_dict: Dict) -> "PatternRecognizer":
        """Create instance from a serialized dict."""
        patterns = entity_recognizer_dict.get("patterns")
        if patterns:
            patterns_list = [Pattern.from_dict(pat) for pat in patterns]
            entity_recognizer_dict["patterns"] = patterns_list

        return cls(**entity_recognizer_dict)
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/Aadhaar_Number.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, List
2
+
3
+ from presidio_analyzer import Pattern, PatternRecognizer
4
+
5
+
6
class Aadhaar_Number(PatternRecognizer):
    """
    Recognizes Indian Aadhaar numbers using regex.

    An Aadhaar number is a 12-digit identifier, commonly written as three
    space-separated groups of four digits; the pattern below requires the
    first digit to be 2-9.

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        # Raw string so "\s" is a regex token, not an invalid str escape.
        Pattern(
            name="aadhaar_number_pattern",
            regex=r"[2-9]{1}[0-9]{3}\s{1}[0-9]{4}\s{1}[0-9]{4}",
            score=0.5,
        ),
    ]

    # Context words that boost confidence when found near a match.
    CONTEXT = [
        "aadhaar",
        "aadhar",
        "uidai",
        # BUGFIX: "bank" previously lacked a trailing comma, so implicit
        # string concatenation produced the bogus token "bankcheck".
        "bank",
        "check",
        "account",
        "account#",
        "acct",
        "save",
        "debit",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "AADHAR_NUMBER",
    ):
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/PAN_Number.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, List
2
+
3
+ from presidio_analyzer import Pattern, PatternRecognizer
4
+
5
+
6
class PAN_Number(PatternRecognizer):
    """
    Recognizes Indian Permanent Account Numbers (PAN) using regex.

    A PAN is a ten-character identifier of the form five uppercase letters,
    four digits, and one uppercase letter (e.g. ABCDE1234F).

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        Pattern(
            name="pan_number_pattern",
            regex="[A-Z]{5}[0-9]{4}[A-Z]{1}",
            score=0.5,
        ),
    ]

    # Context words that boost confidence when found near a match.
    CONTEXT = [
        "pan",
        "permanent account number",
        # BUGFIX: "bank" previously lacked a trailing comma, so implicit
        # string concatenation produced the bogus token "bankcheck".
        "bank",
        "check",
        "account",
        "account#",
        "acct",
        "save",
        "debit",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        # NOTE(review): mixed-case entity name is inconsistent with the
        # UPPER_SNAKE convention (e.g. "AADHAR_NUMBER"); kept unchanged
        # because callers may match on this exact string.
        supported_entity: str = "PAN_Number",
    ):
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/__init__.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Predefined recognizers package. Holds all the default recognizers."""
2
+
3
+ from presidio_analyzer.predefined_recognizers.transformers_recognizer import (
4
+ TransformersRecognizer,
5
+ )
6
+ from .PAN_Number import PAN_Number
7
+ from .credit_card_recognizer import CreditCardRecognizer
8
+ from .crypto_recognizer import CryptoRecognizer
9
+ from .date_recognizer import DateRecognizer
10
+ from .email_recognizer import EmailRecognizer
11
+ from .iban_recognizer import IbanRecognizer
12
+ from .ip_recognizer import IpRecognizer
13
+ from .medical_license_recognizer import MedicalLicenseRecognizer
14
+ from .phone_recognizer import PhoneRecognizer
15
+ from .sg_fin_recognizer import SgFinRecognizer
16
+ from .spacy_recognizer import SpacyRecognizer
17
+ from .stanza_recognizer import StanzaRecognizer
18
+ from .uk_nhs_recognizer import NhsRecognizer
19
+ from .url_recognizer import UrlRecognizer
20
+ from .Aadhaar_Number import Aadhaar_Number
21
+ from .data_recognizer import ClientListRecognizer
22
+ from .us_driver_license_recognizer import UsLicenseRecognizer
23
+ from .us_itin_recognizer import UsItinRecognizer
24
+ from .us_passport_recognizer import UsPassportRecognizer
25
+ from .us_ssn_recognizer import UsSsnRecognizer
26
+ from .es_nif_recognizer import EsNifRecognizer
27
+ from .au_abn_recognizer import AuAbnRecognizer
28
+ from .au_acn_recognizer import AuAcnRecognizer
29
+ from .au_tfn_recognizer import AuTfnRecognizer
30
+ from .au_medicare_recognizer import AuMedicareRecognizer
31
+ from .it_driver_license_recognizer import ItDriverLicenseRecognizer
32
+ from .it_fiscal_code_recognizer import ItFiscalCodeRecognizer
33
+ from .it_vat_code import ItVatCodeRecognizer
34
+ from .it_identity_card_recognizer import ItIdentityCardRecognizer
35
+ from .it_passport_recognizer import ItPassportRecognizer
36
+
37
# Mapping from NLP engine name to the recognizer class that wraps it.
NLP_RECOGNIZERS = {
    "spacy": SpacyRecognizer,
    "stanza": StanzaRecognizer,
    "transformers": TransformersRecognizer,
    "client": ClientListRecognizer,
}

# Public API of this package. Every name listed here must actually be
# imported above: names listed but not defined make
# `from presidio_analyzer.predefined_recognizers import *` raise
# AttributeError.
# BUGFIX: removed "AbaRoutingRecognizer" and "UsBankRecognizer" (never
# imported in this module); added "PAN_Number" and "Aadhaar_Number"
# (imported above but previously unlisted).
__all__ = [
    "CreditCardRecognizer",
    "CryptoRecognizer",
    "DateRecognizer",
    "EmailRecognizer",
    "IbanRecognizer",
    "IpRecognizer",
    "NhsRecognizer",
    "MedicalLicenseRecognizer",
    "PhoneRecognizer",
    "SgFinRecognizer",
    "UrlRecognizer",
    "UsItinRecognizer",
    "UsLicenseRecognizer",
    "UsPassportRecognizer",
    "UsSsnRecognizer",
    "EsNifRecognizer",
    "SpacyRecognizer",
    "ClientListRecognizer",
    "StanzaRecognizer",
    "NLP_RECOGNIZERS",
    "AuAbnRecognizer",
    "AuAcnRecognizer",
    "AuTfnRecognizer",
    "AuMedicareRecognizer",
    "TransformersRecognizer",
    "ItDriverLicenseRecognizer",
    "ItFiscalCodeRecognizer",
    "ItVatCodeRecognizer",
    "ItIdentityCardRecognizer",
    "ItPassportRecognizer",
    "PAN_Number",
    "Aadhaar_Number",
]
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/au_abn_recognizer.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, List, Tuple
2
+
3
+ from presidio_analyzer import Pattern, PatternRecognizer
4
+
5
+
6
class AuAbnRecognizer(PatternRecognizer):
    """
    Recognizes Australian Business Number ("ABN").

    The Australian Business Number (ABN) is a unique 11
    digit identifier issued to all entities registered in
    the Australian Business Register (ABR).
    The 11 digit ABN is structured as a 9 digit identifier
    with two leading check digits.
    The leading check digits are derived using a modulus 89 calculation.
    This recognizer identifies ABN using regex, context words and checksum.
    Reference: https://abr.business.gov.au/Help/AbnFormat

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
    for different strings to be used during pattern matching.
    This can allow a greater variety in input, for example by removing dashes or spaces.
    """

    PATTERNS = [
        Pattern(
            name="ABN (Medium)",
            regex=r"\b\d{2}\s\d{3}\s\d{3}\s\d{3}\b",
            score=0.1,
        ),
        Pattern(
            name="ABN (Low)",
            regex=r"\b\d{11}\b",
            score=0.01,
        ),
    ]

    CONTEXT = [
        "australian business number",
        "abn",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "AU_ABN",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        # Default normalization strips dashes and spaces before checksumming.
        if replacement_pairs:
            self.replacement_pairs = replacement_pairs
        else:
            self.replacement_pairs = [("-", ""), (" ", "")]
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns if patterns else self.PATTERNS,
            context=context if context else self.CONTEXT,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> bool:
        """
        Run the ABR modulus-89 checksum on a detected candidate.

        :param pattern_text: the text to validated.
        Only the part in text that was detected by the regex engine
        :return: A bool indicating whether the validation was successful.
        """
        # Normalize, then convert each remaining digit.
        sanitized = self.__sanitize_value(pattern_text, self.replacement_pairs)
        digits = [int(ch) for ch in sanitized if not ch.isspace()]

        # Position weights defined by the ABR algorithm.
        weights = [10, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19]

        # Subtract 1 from the leading digit (wrapping 0 to 9), then the
        # weighted sum of all 11 digits must be divisible by 89.
        digits[0] = 9 if digits[0] == 0 else digits[0] - 1
        weighted_sum = sum(digits[i] * weights[i] for i in range(11))
        return weighted_sum % 89 == 0

    @staticmethod
    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        for search_string, replacement_string in replacement_pairs:
            text = text.replace(search_string, replacement_string)
        return text
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/au_acn_recognizer.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, List, Tuple
2
+
3
+ from presidio_analyzer import Pattern, PatternRecognizer
4
+
5
+
6
class AuAcnRecognizer(PatternRecognizer):
    """
    Recognizes Australian Company Number ("ACN").

    The Australian Company Number (ACN) is a nine digit number
    with the last digit being a check digit calculated using a
    modified modulus 10 calculation.
    This recognizer identifies ACN using regex, context words, and checksum.
    Reference: https://asic.gov.au/

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
    for different strings to be used during pattern matching.
    This can allow a greater variety in input, for example by removing dashes or spaces.
    """

    PATTERNS = [
        Pattern(
            "ACN (Medium)",
            r"\b\d{3}\s\d{3}\s\d{3}\b",
            0.1,
        ),
        Pattern(
            "ACN (Low)",
            r"\b\d{9}\b",
            0.01,
        ),
    ]

    CONTEXT = [
        "australian company number",
        "acn",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "AU_ACN",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        # Default normalization strips dashes and spaces before checksumming.
        self.replacement_pairs = (
            replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
        )
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> bool:
        """
        Validate the pattern logic e.g., by running checksum on a detected pattern.

        :param pattern_text: the text to validated.
        Only the part in text that was detected by the regex engine
        :return: A bool indicating whether the validation was successful.
        """
        # Pre-processing before validation checks
        text = self.__sanitize_value(pattern_text, self.replacement_pairs)
        acn_list = [int(digit) for digit in text if not digit.isspace()]

        # Set weights based on digit position
        weight = [8, 7, 6, 5, 4, 3, 2, 1]

        # Weighted sum of the first 8 digits, per the ASIC algorithm.
        sum_product = 0
        for i in range(8):
            sum_product += acn_list[i] * weight[i]
        remainder = sum_product % 10
        # BUGFIX: the check digit is (10 - remainder) MOD 10. When the
        # remainder is 0 the complement must be 0, not 10; the previous
        # `10 - remainder` rejected every valid ACN whose check digit is 0.
        complement = (10 - remainder) % 10
        return complement == acn_list[-1]

    @staticmethod
    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        for search_string, replacement_string in replacement_pairs:
            text = text.replace(search_string, replacement_string)
        return text
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/au_medicare_recognizer.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, List, Tuple
2
+
3
+ from presidio_analyzer import Pattern, PatternRecognizer
4
+
5
+
6
class AuMedicareRecognizer(PatternRecognizer):
    """
    Recognizes Australian Medicare number using regex, context words, and checksum.

    Medicare number is a unique identifier issued by Australian Government
    that enables the cardholder to receive a rebates of medical expenses
    under Australia's Medicare system.
    It uses a modulus 10 checksum scheme to validate the number.
    Reference: https://en.wikipedia.org/wiki/Medicare_card_(Australia)

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
    for different strings to be used during pattern matching.
    This can allow a greater variety in input, for example by removing dashes or spaces.
    """

    PATTERNS = [
        Pattern(
            name="Australian Medicare Number (Medium)",
            regex=r"\b[2-6]\d{3}\s\d{5}\s\d\b",
            score=0.1,
        ),
        Pattern(
            name="Australian Medicare Number (Low)",
            regex=r"\b[2-6]\d{9}\b",
            score=0.01,
        ),
    ]

    CONTEXT = [
        "medicare",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "AU_MEDICARE",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        # Default normalization strips dashes and spaces before checksumming.
        if replacement_pairs:
            self.replacement_pairs = replacement_pairs
        else:
            self.replacement_pairs = [("-", ""), (" ", "")]
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns if patterns else self.PATTERNS,
            context=context if context else self.CONTEXT,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> bool:
        """
        Run the Medicare modulus-10 checksum on a detected candidate.

        :param pattern_text: the text to validated.
        Only the part in text that was detected by the regex engine
        :return: A bool indicating whether the validation was successful.
        """
        # Normalize, then convert each remaining digit.
        sanitized = self.__sanitize_value(pattern_text, self.replacement_pairs)
        digits = [int(ch) for ch in sanitized if not ch.isspace()]

        # Position weights for the first eight digits.
        weights = [1, 3, 7, 9, 1, 3, 7, 9]

        # The weighted sum mod 10 must equal the ninth digit.
        checksum = sum(digits[i] * weights[i] for i in range(8))
        return checksum % 10 == digits[8]

    @staticmethod
    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        for search_string, replacement_string in replacement_pairs:
            text = text.replace(search_string, replacement_string)
        return text
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/au_tfn_recognizer.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, List, Tuple
2
+
3
+ from presidio_analyzer import Pattern, PatternRecognizer
4
+
5
+
6
class AuTfnRecognizer(PatternRecognizer):
    """
    Recognize Australian Tax File Numbers ("TFN").

    A TFN is a unique nine-digit identifier issued by the Australian
    Taxation Office to each taxpaying entity — an individual, company,
    superannuation fund, partnership, or trust — usually presented in
    the format NNN NNN NNN.  Its digits embed a check computed with a
    simple modulo-11 scheme, which this recognizer verifies on top of
    regex and context-word matching.
    Reference: https://www.ato.gov.au/individuals/tax-file-number/

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
        for different strings to be used during pattern matching.
        This can allow a greater variety in input, for example by removing
        dashes or spaces.
    """

    PATTERNS = [
        Pattern("TFN (Medium)", r"\b\d{3}\s\d{3}\s\d{3}\b", 0.1),
        Pattern("TFN (Low)", r"\b\d{9}\b", 0.01),
    ]

    CONTEXT = [
        "tax file number",
        "tfn",
    ]

    # Per-position weights used by the modulo-11 checksum.
    _CHECKSUM_WEIGHTS = (1, 4, 3, 7, 5, 8, 6, 9, 10)

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "AU_TFN",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        self.replacement_pairs = replacement_pairs or [("-", ""), (" ", "")]
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> bool:
        """
        Run the modulo-11 checksum over a regex-detected candidate.

        :param pattern_text: the text to validate.
            Only the part in text that was detected by the regex engine.
        :return: True when the weighted digit sum is divisible by 11.
        """
        # Strip separators, then collect the remaining digits.
        cleaned = self.__sanitize_value(pattern_text, self.replacement_pairs)
        digits = [int(ch) for ch in cleaned if not ch.isspace()]

        # Indexing (rather than zip) intentionally preserves the original
        # failure mode for malformed, too-short candidates.
        total = sum(
            digits[pos] * weight
            for pos, weight in enumerate(self._CHECKSUM_WEIGHTS)
        )
        return total % 11 == 0

    @staticmethod
    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        """Apply each (search, replacement) pair to *text* and return the result."""
        for needle, replacement in replacement_pairs:
            text = text.replace(needle, replacement)
        return text
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/credit_card_recognizer.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple, Optional
2
+
3
+ from presidio_analyzer import Pattern, PatternRecognizer
4
+
5
+
6
class CreditCardRecognizer(PatternRecognizer):
    """
    Recognize common credit card numbers using regex + checksum.

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
        for different strings to be used during pattern matching.
        This can allow a greater variety in input, for example by removing
        dashes or spaces.
    """

    PATTERNS = [
        Pattern(
            "All Credit Cards (weak)",
            r"\b((4\d{3})|(5[0-5]\d{2})|(6\d{3})|(1\d{3})|(3\d{3}))[- ]?(\d{3,4})[- ]?(\d{3,4})[- ]?(\d{3,5})\b",  # noqa: E501
            0.3,
        ),
    ]

    CONTEXT = [
        "credit",
        "card",
        "visa",
        "mastercard",
        "cc ",
        "amex",
        "discover",
        "jcb",
        "diners",
        "maestro",
        "instapayment",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "CREDIT_CARD",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        self.replacement_pairs = replacement_pairs or [("-", ""), (" ", "")]
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> bool:  # noqa D102
        # Strip separators, then apply the Luhn check.
        cleaned = self.__sanitize_value(pattern_text, self.replacement_pairs)
        return self.__luhn_checksum(cleaned)

    @staticmethod
    def __luhn_checksum(sanitized_value: str) -> bool:
        """Return True when *sanitized_value* passes the Luhn mod-10 check."""
        total = 0
        # Walk the digits right-to-left, doubling every second one.
        for position, ch in enumerate(reversed(sanitized_value)):
            digit = int(ch)
            if position % 2 == 1:
                digit *= 2
                # Summing the digits of 2d is equivalent to 2d - 9 when 2d > 9.
                if digit > 9:
                    digit -= 9
            total += digit
        return total % 10 == 0

    @staticmethod
    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        """Apply each (search, replacement) pair to *text* and return the result."""
        for needle, replacement in replacement_pairs:
            text = text.replace(needle, replacement)
        return text
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/crypto_recognizer.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from hashlib import sha256
2
+ from typing import List, Optional
3
+
4
+ from presidio_analyzer import Pattern, PatternRecognizer
5
+
6
+ # Copied from:
7
+ # http://rosettacode.org/wiki/Bitcoin/address_validation#Python
8
+
9
+
10
class CryptoRecognizer(PatternRecognizer):
    """Recognize common crypto account numbers using regex + checksum.

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        Pattern("Crypto (Medium)", r"\b[13][a-km-zA-HJ-NP-Z1-9]{26,33}\b", 0.5),
    ]

    CONTEXT = ["wallet", "btc", "bitcoin", "crypto"]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "CRYPTO",
    ):
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> bool:  # noqa D102
        # Base58Check: last 4 bytes must equal the first 4 bytes of the
        # double-SHA256 of the payload.
        # (Validation scheme copied from
        # http://rosettacode.org/wiki/Bitcoin/address_validation#Python)
        try:
            decoded = self.__decode_base58(pattern_text, 25)
        except ValueError:
            # Character outside the Base58 alphabet.
            return False
        return decoded[-4:] == sha256(sha256(decoded[:-4]).digest()).digest()[:4]

    @staticmethod
    def __decode_base58(bc: str, length: int) -> bytes:
        """Decode a Base58 string into *length* big-endian bytes."""
        alphabet = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"
        value = 0
        for symbol in bc:
            # .index raises ValueError on invalid symbols (handled by caller).
            value = value * 58 + alphabet.index(symbol)
        return value.to_bytes(length, "big")
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/data_recognizer.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Optional, List, Tuple, Set
3
+ import spacy
4
+ from spacy.matcher import PhraseMatcher
5
+ from presidio_analyzer.predefined_recognizers.spacy_recognizer import SpacyRecognizer
6
+ # from presidio_analyzer.predefined_recognizers import SpacyRecognizer
7
+ from presidio_analyzer import RecognizerResult
8
+ import copy
9
+
10
+
11
+
12
+
13
+ from presidio_analyzer import (
14
+ RecognizerResult,
15
+ LocalRecognizer,
16
+ AnalysisExplanation,
17
+ )
18
+
19
+ logger = logging.getLogger("presidio_analyzer")
20
+ # terms = ["1&1 Telecommunication SE","1010 data services LLC","AMA",
21
+ # "A O Smith Corporations","ABBMST","Addidas India","CITI","Cisco Systems","ERICSSON","Gati Ltd","IBM",
22
+ # "Infosys Ltd","Intel Corporation","Johnson","JTC Corporation","NSC Global","SUZUKI MOTOR CORPORATION",
23
+ # "Synopsys Ltd","TIBCOO", "T-Mobile UK","Toyota Systems Corporation","TSB Bank","UBS Bank"
24
+ # ,"United Health Corporation","Vodafone quickcom","Voltas","VOLVO CARS","WIPRO LIMITED",
25
+ # "Walmart", "CVS Health", "Walgreens Boots Alliance"]
26
# Module-level deny-list of phrases matched by ClientListRecognizer.
# Mutated in place so every holder of this reference sees updates.
terms = []


class DataList:
    """Registry for the deny-list terms and their entity label.

    ``setData``/``resetData`` mutate the module-level ``terms`` list in
    place; ``entity`` holds the entity label(s) assigned to any match.
    """

    # Entity labels configured externally before analysis runs.
    entity = []

    @staticmethod
    def setData(values):
        """Append *values* (an iterable of phrases) to the shared term list."""
        # FIX: declared as @staticmethod — previously an instance call
        # (inst.setData(x)) would have bound the instance as ``values``.
        terms.extend(values)

    @staticmethod
    def resetData():
        """Remove every registered term."""
        terms.clear()
41
+
42
+
43
+ nlp = spacy.load("en_core_web_lg")
44
+
45
+
46
+
47
+
48
+
49
class ClientListRecognizer(SpacyRecognizer):
    """
    Recognize client/company names from a configurable deny-list.

    Runs spaCy's PhraseMatcher over the input text with the phrases
    registered via ``DataList.setData`` and reports every hit using the
    first entity label stored in ``DataList.entity``.

    :param supported_language: Language this recognizer supports
    :param supported_entities: The entities this recognizer can detect
    :param ner_strength: Confidence score assigned to each match
    :param check_label_groups: Tuple containing Presidio entity names
        and spaCy entity names, for verifying that the right entity
        is translated into a Presidio entity.
    :param context: List of context words to increase confidence in detection
    """

    # NOTE: this is the live list from DataList, mutated externally.
    ENTITIES = DataList.entity

    DEFAULT_EXPLANATION = "Identified as {} by Spacy's Named Entity Recognition"

    CHECK_LABEL_GROUPS = []

    def __init__(
        self,
        supported_language: str = "en",
        supported_entities: Optional[List[str]] = None,
        ner_strength: float = 0.85,
        check_label_groups: Optional[Tuple[Set, Set]] = None,
        context: Optional[List[str]] = None,
    ):
        self.ner_strength = ner_strength
        self.check_label_groups = (
            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
        )
        supported_entities = supported_entities if supported_entities else self.ENTITIES
        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            context=context,
        )

    def load(self) -> None:  # noqa D102
        # Nothing to load: matching happens directly in analyze().
        pass

    def build_spacy_explanation(
        self, original_score: float, explanation: str
    ) -> AnalysisExplanation:
        """
        Create explanation for why this result was detected.

        :param original_score: Score given by this recognizer
        :param explanation: Explanation string
        :return: An AnalysisExplanation carrying the score and text
        """
        return AnalysisExplanation(
            recognizer=self.__class__.__name__,
            original_score=original_score,
            textual_explanation=explanation,
        )

    def analyze(self, text, entities, nlp_artifacts=None):  # noqa D102
        """Return a RecognizerResult for every registered term found in *text*."""
        results = []

        # FIX: guard against an unset label list — the original indexed
        # DataList.entity[0] per match and raised IndexError when empty.
        if not DataList.entity:
            return results

        matcher = PhraseMatcher(nlp.vocab)
        # Only run nlp.make_doc to speed things up.
        matcher.add("TerminologyList", [nlp.make_doc(term) for term in terms])

        doc = nlp(text)
        # Hoisted out of the loop (was a per-match copy.deepcopy).
        entity_type = DataList.entity[0]

        for _match_id, start, end in matcher(doc):
            span = doc[start:end]
            # FIX: the previous code gated on ``str(doc).find(str(span))``,
            # which is 0 (falsy) for a match at position 0 — dropping it —
            # and -1 (truthy) for "not found". Matcher spans always come
            # from ``doc``, so every hit is reported.
            results.append(
                RecognizerResult(
                    entity_type=entity_type,
                    start=span.start_char,
                    end=span.end_char,
                    score=self.ner_strength,
                    recognition_metadata={
                        RecognizerResult.RECOGNIZER_NAME_KEY: self.name,
                        RecognizerResult.RECOGNIZER_IDENTIFIER_KEY: self.id,
                    },
                )
            )

        return results

    @staticmethod
    def __check_label(
        entity: str, label: str, check_label_groups: Tuple[Set, Set]
    ) -> bool:
        return any(
            entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
        )
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/date_recognizer.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, List
2
+
3
+ from presidio_analyzer import Pattern, PatternRecognizer, RecognizerResult
4
+ from presidio_analyzer.nlp_engine import NlpArtifacts
5
+
6
+ import regex as re
7
+
8
+
9
class DateRecognizer(PatternRecognizer):
    """
    Recognize dates in common numeric and abbreviated-month formats using regex.

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        Pattern(
            "mm/dd/yyyy or mm/dd/yy",
            r"\b(([1-9]|0[1-9]|1[0-2])/([1-9]|0[1-9]|[1-2][0-9]|3[0-1])/(\d{4}|\d{2}))\b",  # noqa: E501
            0.6,
        ),
        Pattern(
            "dd/mm/yyyy or dd/mm/yy",
            r"\b(([1-9]|0[1-9]|[1-2][0-9]|3[0-1])/([1-9]|0[1-9]|1[0-2])/(\d{4}|\d{2}))\b",  # noqa: E501
            0.6,
        ),
        Pattern(
            "yyyy/mm/dd",
            r"\b(\d{4}/([1-9]|0[1-9]|1[0-2])/([1-9]|0[1-9]|[1-2][0-9]|3[0-1]))\b",
            0.6,
        ),
        Pattern(
            "mm-dd-yyyy",
            r"\b(([1-9]|0[1-9]|1[0-2])-([1-9]|0[1-9]|[1-2][0-9]|3[0-1])-\d{4})\b",
            0.6,
        ),
        Pattern(
            "dd-mm-yyyy",
            r"\b(([1-9]|0[1-9]|[1-2][0-9]|3[0-1])-([1-9]|0[1-9]|1[0-2])-\d{4})\b",
            0.6,
        ),
        Pattern(
            "yyyy-mm-dd",
            r"\b(\d{4}-([1-9]|0[1-9]|1[0-2])-([1-9]|0[1-9]|[1-2][0-9]|3[0-1]))\b",
            0.6,
        ),
        Pattern(
            "dd.mm.yyyy or dd.mm.yy",
            r"\b(([1-9]|0[1-9]|[1-2][0-9]|3[0-1])\.([1-9]|0[1-9]|1[0-2])\.(\d{4}|\d{2}))\b",  # noqa: E501
            0.6,
        ),
        Pattern(
            "dd-MMM-yyyy or dd-MMM-yy",
            r"\b(([1-9]|0[1-9]|[1-2][0-9]|3[0-1])-(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-(\d{4}|\d{2}))\b",  # noqa: E501
            0.6,
        ),
        Pattern(
            "MMM-yyyy or MMM-yy",
            r"\b((JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-(\d{4}|\d{2}))\b",  # noqa: E501
            0.6,
        ),
        Pattern(
            "dd-MMM or dd-MMM",
            r"\b(([1-9]|0[1-9]|[1-2][0-9]|3[0-1])-(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC))\b",  # noqa: E501
            0.6,
        ),
        Pattern(
            "mm/yyyy or m/yyyy",
            r"\b(([1-9]|0[1-9]|1[0-2])/\d{4})\b",
            0.2,
        ),
        Pattern(
            "mm/yy or m/yy",
            r"\b(([1-9]|0[1-9]|1[0-2])/\d{2})\b",
            0.1,
        ),
    ]

    CONTEXT = ["date", "birthday"]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "DATE_TIME",
    ):
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )

    def analyze(
        self,
        text: str,
        entities: List[str],
        nlp_artifacts: NlpArtifacts = None,
        regex_flags: int = None,
    ) -> List[RecognizerResult]:
        """
        Analyzes text to detect PII using regular expressions or deny-lists.

        Month abbreviations are matched case-insensitively, so IGNORECASE is
        always added to the regex flags.

        :param text: Text to be analyzed
        :param entities: Entities this recognizer can detect
        :param nlp_artifacts: Output values from the NLP engine
        :param regex_flags: Optional caller-supplied regex flags
        :return: Detected results
        """
        if regex_flags:
            effective_flags = regex_flags | re.IGNORECASE
        else:
            effective_flags = re.DOTALL | re.MULTILINE | re.IGNORECASE

        return super().analyze(
            text=text,
            entities=entities,
            nlp_artifacts=nlp_artifacts,
            regex_flags=effective_flags,
        )
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/email_recognizer.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional
2
+
3
+ import tldextract
4
+
5
+ from presidio_analyzer import Pattern, PatternRecognizer
6
+
7
+
8
class EmailRecognizer(PatternRecognizer):
    """
    Recognize email addresses using regex, validated via TLD extraction.

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        Pattern(
            "Email (Medium)",
            r"\b((([!#$%&'*+\-/=?^_`{|}~\w])|([!#$%&'*+\-/=?^_`{|}~\w][!#$%&'*+\-/=?^_`{|}~\.\w]{0,}[!#$%&'*+\-/=?^_`{|}~\w]))[@]\w+([-.]\w+)*\.\w+([-.]\w+)*)\b",  # noqa: E501
            0.5,
        ),
    ]

    CONTEXT = ["email"]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "EMAIL_ADDRESS",
    ):
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str):  # noqa D102
        # A genuine address must resolve to a fully-qualified domain name.
        return tldextract.extract(pattern_text).fqdn != ""
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/es_nif_recognizer.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple, Optional
2
+
3
+ from presidio_analyzer import Pattern, PatternRecognizer
4
+
5
+
6
class EsNifRecognizer(PatternRecognizer):
    """
    Recognize Spanish NIF numbers using regex and checksum.

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
        for different strings to be used during pattern matching.
        This can allow a greater variety in input, for example by removing
        dashes or spaces.
    """

    PATTERNS = [
        Pattern(
            "NIF",
            r"\b[0-9]?[0-9]{7}[-]?[A-Z]\b",
            0.5,
        ),
    ]

    CONTEXT = ["documento nacional de identidad", "DNI", "NIF", "identificación"]

    # Official DNI/NIF control-letter table, indexed by number mod 23.
    _CONTROL_LETTERS = "TRWAGMYFPDXBNJZSQVHLCKE"

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "es",
        supported_entity: str = "ES_NIF",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        self.replacement_pairs = replacement_pairs or [("-", ""), (" ", "")]
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> bool:  # noqa D102
        # The final character is the control letter; it must match the
        # lookup of the numeric part modulo 23.
        sanitized = EsNifRecognizer.__sanitize_value(pattern_text)
        control_letter = sanitized[-1]
        number = int("".join(ch for ch in sanitized if ch.isdigit()))
        return control_letter == self._CONTROL_LETTERS[number % 23]

    @staticmethod
    def __sanitize_value(text: str) -> str:
        """Strip dashes and spaces before validation."""
        for separator in ("-", " "):
            text = text.replace(separator, "")
        return text