Commit ·
f496f54
1
Parent(s): e40ae6e
Add large model file to Git LFS
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +4 -0
- LICENSE.md +8 -0
- api-spec/rai-privacy.yaml +389 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/build_config.yaml +14 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/create_wheel_file.py +44 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.0.5-py3-none-any.whl +0 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.0.5.tar.gz +3 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.0.6-py3-none-any.whl +0 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.0.6.tar.gz +3 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.1.0-py3-none-any.whl +0 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.1.0.tar.gz +3 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer.egg-info/PKG-INFO +12 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer.egg-info/SOURCES.txt +64 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer.egg-info/dependency_links.txt +1 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer.egg-info/top_level.txt +1 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/__init__.py +52 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/analysis_explanation.py +64 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/analyzer_engine.py +372 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/analyzer_request.py +36 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/app_tracer.py +27 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/batch_analyzer_engine.py +145 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/context_aware_enhancers/__init__.py +5 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/context_aware_enhancers/context_aware_enhancer.py +68 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/context_aware_enhancers/lemma_context_aware_enhancer.py +334 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/dict_analyzer_result.py +29 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/entity_recognizer.py +199 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/local_recognizer.py +7 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/__init__.py +19 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/client_nlp_engine.py +108 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py +74 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/nlp_engine.py +42 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py +128 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py +96 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py +39 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py +155 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/pattern.py +45 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/pattern_recognizer.py +253 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/Aadhaar_Number.py +46 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/PAN_Number.py +46 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/__init__.py +77 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/au_abn_recognizer.py +93 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/au_acn_recognizer.py +90 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/au_medicare_recognizer.py +89 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/au_tfn_recognizer.py +95 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/credit_card_recognizer.py +85 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/crypto_recognizer.py +54 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/data_recognizer.py +184 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/date_recognizer.py +127 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/email_recognizer.py +46 -0
- presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/es_nif_recognizer.py +58 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
rai_privacy_package/privacy/rai_privacy/privacy/privacy/util/face_detect/face_detector/res10_300x300_ssd_iter_140000.caffemodel filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
rai_privacy_package/privacy/rai_privacy/privacy/privacy/util/face_detect/doc/10.jpg filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
rai_privacy_package/privacy/rai_privacy/privacy/privacy/util/face_detect/doc/5.jpg filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
rai_privacy_package/privacy/rai_privacy/privacy/privacy/util/face_detect/doc/8.jpg filter=lfs diff=lfs merge=lfs -text
|
LICENSE.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
MIT license https://opensource.org/licenses/MIT Copyright 2024-2025 Infosys Ltd
|
| 3 |
+
|
| 4 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
| 5 |
+
|
| 6 |
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
| 7 |
+
|
| 8 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
api-spec/rai-privacy.yaml
ADDED
|
@@ -0,0 +1,389 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openapi: 3.0.2
|
| 2 |
+
info:
|
| 3 |
+
title: Infosys Responsible AI - responsible-ai-privacy - OpenAPI 3.0
|
| 4 |
+
description: API specs for Infosys Responsible AI Privacy pillar in OpenAPI 3.0 format
|
| 5 |
+
termsOfService: https://www.infosys.com
|
| 6 |
+
contact:
|
| 7 |
+
email: aina@infosys.com
|
| 8 |
+
license:
|
| 9 |
+
name: Infosys
|
| 10 |
+
url: https://www.infosys.com
|
| 11 |
+
version: v$version
|
| 12 |
+
security:
|
| 13 |
+
- oauth_auth:
|
| 14 |
+
- write:users
|
| 15 |
+
- read:users
|
| 16 |
+
paths:
|
| 17 |
+
/api/v1/privacy/pii/analyze:
|
| 18 |
+
post:
|
| 19 |
+
tags:
|
| 20 |
+
- PII Privacy
|
| 21 |
+
summary: Analyze
|
| 22 |
+
operationId: analyze_api_v1_privacy_pii_analyze_post
|
| 23 |
+
security:
|
| 24 |
+
- oauth_auth:
|
| 25 |
+
- write:users
|
| 26 |
+
requestBody:
|
| 27 |
+
content:
|
| 28 |
+
application/json:
|
| 29 |
+
schema:
|
| 30 |
+
$ref: '#/components/schemas/PIIAnalyzeRequest'
|
| 31 |
+
required: true
|
| 32 |
+
responses:
|
| 33 |
+
'200':
|
| 34 |
+
description: Successful Response
|
| 35 |
+
content:
|
| 36 |
+
application/json:
|
| 37 |
+
schema:
|
| 38 |
+
$ref: '#/components/schemas/PIIAnalyzeResponse'
|
| 39 |
+
'401':
|
| 40 |
+
description: Unauthorized
|
| 41 |
+
content:
|
| 42 |
+
application/json:
|
| 43 |
+
schema:
|
| 44 |
+
$ref: '#/components/schemas/Error'
|
| 45 |
+
'403':
|
| 46 |
+
description: Forbidden
|
| 47 |
+
content:
|
| 48 |
+
application/json:
|
| 49 |
+
schema:
|
| 50 |
+
$ref: '#/components/schemas/Error'
|
| 51 |
+
'422':
|
| 52 |
+
description: Validation Error
|
| 53 |
+
content:
|
| 54 |
+
application/json:
|
| 55 |
+
schema:
|
| 56 |
+
$ref: '#/components/schemas/HTTPValidationError'
|
| 57 |
+
/api/v1/privacy/pii/anonymize:
|
| 58 |
+
post:
|
| 59 |
+
tags:
|
| 60 |
+
- PII Privacy
|
| 61 |
+
summary: Anonymize
|
| 62 |
+
operationId: anonymize_api_v1_privacy_pii_anonymize_post
|
| 63 |
+
security:
|
| 64 |
+
- oauth_auth:
|
| 65 |
+
- write:users
|
| 66 |
+
requestBody:
|
| 67 |
+
content:
|
| 68 |
+
application/json:
|
| 69 |
+
schema:
|
| 70 |
+
$ref: '#/components/schemas/PIIAnonymizeRequest'
|
| 71 |
+
required: true
|
| 72 |
+
responses:
|
| 73 |
+
'200':
|
| 74 |
+
description: Successful Response
|
| 75 |
+
content:
|
| 76 |
+
application/json:
|
| 77 |
+
schema:
|
| 78 |
+
$ref: '#/components/schemas/PIIAnonymizeResponse'
|
| 79 |
+
'401':
|
| 80 |
+
description: Unauthorized
|
| 81 |
+
content:
|
| 82 |
+
application/json:
|
| 83 |
+
schema:
|
| 84 |
+
$ref: '#/components/schemas/Error'
|
| 85 |
+
'403':
|
| 86 |
+
description: Forbidden
|
| 87 |
+
content:
|
| 88 |
+
application/json:
|
| 89 |
+
schema:
|
| 90 |
+
$ref: '#/components/schemas/Error'
|
| 91 |
+
'422':
|
| 92 |
+
description: Validation Error
|
| 93 |
+
content:
|
| 94 |
+
application/json:
|
| 95 |
+
schema:
|
| 96 |
+
$ref: '#/components/schemas/HTTPValidationError'
|
| 97 |
+
/api/v1/privacy/pii/image/analyze:
|
| 98 |
+
post:
|
| 99 |
+
tags:
|
| 100 |
+
- PII Privacy
|
| 101 |
+
summary: Image Analyze
|
| 102 |
+
operationId: image_analyze_api_v1_privacy_pii_image_analyze_post
|
| 103 |
+
security:
|
| 104 |
+
- oauth_auth:
|
| 105 |
+
- write:users
|
| 106 |
+
requestBody:
|
| 107 |
+
content:
|
| 108 |
+
multipart/form-data:
|
| 109 |
+
schema:
|
| 110 |
+
$ref: '#/components/schemas/Body_image_analyze_api_v1_privacy_pii_image_analyze_post'
|
| 111 |
+
required: true
|
| 112 |
+
responses:
|
| 113 |
+
'200':
|
| 114 |
+
description: Successful Response
|
| 115 |
+
content:
|
| 116 |
+
application/json:
|
| 117 |
+
schema:
|
| 118 |
+
$ref: '#/components/schemas/PIIImageAnalyzeResponse'
|
| 119 |
+
'401':
|
| 120 |
+
description: Unauthorized
|
| 121 |
+
content:
|
| 122 |
+
application/json:
|
| 123 |
+
schema:
|
| 124 |
+
$ref: '#/components/schemas/Error'
|
| 125 |
+
'403':
|
| 126 |
+
description: Forbidden
|
| 127 |
+
content:
|
| 128 |
+
application/json:
|
| 129 |
+
schema:
|
| 130 |
+
$ref: '#/components/schemas/Error'
|
| 131 |
+
'422':
|
| 132 |
+
description: Validation Error
|
| 133 |
+
content:
|
| 134 |
+
application/json:
|
| 135 |
+
schema:
|
| 136 |
+
$ref: '#/components/schemas/HTTPValidationError'
|
| 137 |
+
/api/v1/privacy/pii/image/anonymize:
|
| 138 |
+
post:
|
| 139 |
+
tags:
|
| 140 |
+
- PII Privacy
|
| 141 |
+
summary: Image Anonymize
|
| 142 |
+
operationId: image_anonymize_api_v1_privacy_pii_image_anonymize_post
|
| 143 |
+
security:
|
| 144 |
+
- oauth_auth:
|
| 145 |
+
- write:users
|
| 146 |
+
requestBody:
|
| 147 |
+
content:
|
| 148 |
+
multipart/form-data:
|
| 149 |
+
schema:
|
| 150 |
+
$ref: '#/components/schemas/Body_image_anonymize_api_v1_privacy_pii_image_anonymize_post'
|
| 151 |
+
required: true
|
| 152 |
+
responses:
|
| 153 |
+
'200':
|
| 154 |
+
description: Successful Response
|
| 155 |
+
content:
|
| 156 |
+
application/json:
|
| 157 |
+
schema: {}
|
| 158 |
+
'401':
|
| 159 |
+
description: Unauthorized
|
| 160 |
+
content:
|
| 161 |
+
application/json:
|
| 162 |
+
schema:
|
| 163 |
+
$ref: '#/components/schemas/Error'
|
| 164 |
+
'403':
|
| 165 |
+
description: Forbidden
|
| 166 |
+
content:
|
| 167 |
+
application/json:
|
| 168 |
+
schema:
|
| 169 |
+
$ref: '#/components/schemas/Error'
|
| 170 |
+
'422':
|
| 171 |
+
description: Validation Error
|
| 172 |
+
content:
|
| 173 |
+
application/json:
|
| 174 |
+
schema:
|
| 175 |
+
$ref: '#/components/schemas/HTTPValidationError'
|
| 176 |
+
/api/v1/privacy/pii/image/verify:
|
| 177 |
+
post:
|
| 178 |
+
tags:
|
| 179 |
+
- PII Privacy
|
| 180 |
+
summary: Image Verify
|
| 181 |
+
operationId: image_verify_api_v1_privacy_pii_image_verify_post
|
| 182 |
+
security:
|
| 183 |
+
- oauth_auth:
|
| 184 |
+
- write:users
|
| 185 |
+
requestBody:
|
| 186 |
+
content:
|
| 187 |
+
multipart/form-data:
|
| 188 |
+
schema:
|
| 189 |
+
$ref: '#/components/schemas/Body_image_verify_api_v1_privacy_pii_image_verify_post'
|
| 190 |
+
required: true
|
| 191 |
+
responses:
|
| 192 |
+
'200':
|
| 193 |
+
description: Successful Response
|
| 194 |
+
content:
|
| 195 |
+
application/json:
|
| 196 |
+
schema: {}
|
| 197 |
+
'401':
|
| 198 |
+
description: Unauthorized
|
| 199 |
+
content:
|
| 200 |
+
application/json:
|
| 201 |
+
schema:
|
| 202 |
+
$ref: '#/components/schemas/Error'
|
| 203 |
+
'403':
|
| 204 |
+
description: Forbidden
|
| 205 |
+
content:
|
| 206 |
+
application/json:
|
| 207 |
+
schema:
|
| 208 |
+
$ref: '#/components/schemas/Error'
|
| 209 |
+
'422':
|
| 210 |
+
description: Validation Error
|
| 211 |
+
content:
|
| 212 |
+
application/json:
|
| 213 |
+
schema:
|
| 214 |
+
$ref: '#/components/schemas/HTTPValidationError'
|
| 215 |
+
components:
|
| 216 |
+
schemas:
|
| 217 |
+
Body_image_analyze_api_v1_privacy_pii_image_analyze_post:
|
| 218 |
+
title: Body_image_analyze_api_v1_privacy_pii_image_analyze_post
|
| 219 |
+
required:
|
| 220 |
+
- payload
|
| 221 |
+
type: object
|
| 222 |
+
properties:
|
| 223 |
+
payload:
|
| 224 |
+
title: Payload
|
| 225 |
+
type: string
|
| 226 |
+
format: binary
|
| 227 |
+
Body_image_anonymize_api_v1_privacy_pii_image_anonymize_post:
|
| 228 |
+
title: Body_image_anonymize_api_v1_privacy_pii_image_anonymize_post
|
| 229 |
+
required:
|
| 230 |
+
- payload
|
| 231 |
+
type: object
|
| 232 |
+
properties:
|
| 233 |
+
payload:
|
| 234 |
+
title: Payload
|
| 235 |
+
type: string
|
| 236 |
+
format: binary
|
| 237 |
+
Body_image_verify_api_v1_privacy_pii_image_verify_post:
|
| 238 |
+
title: Body_image_verify_api_v1_privacy_pii_image_verify_post
|
| 239 |
+
required:
|
| 240 |
+
- payload
|
| 241 |
+
type: object
|
| 242 |
+
properties:
|
| 243 |
+
payload:
|
| 244 |
+
title: Payload
|
| 245 |
+
type: string
|
| 246 |
+
format: binary
|
| 247 |
+
HTTPValidationError:
|
| 248 |
+
title: HTTPValidationError
|
| 249 |
+
type: object
|
| 250 |
+
properties:
|
| 251 |
+
detail:
|
| 252 |
+
title: Detail
|
| 253 |
+
type: array
|
| 254 |
+
items:
|
| 255 |
+
$ref: '#/components/schemas/ValidationError'
|
| 256 |
+
PIIAnalyzeRequest:
|
| 257 |
+
title: PIIAnalyzeRequest
|
| 258 |
+
required:
|
| 259 |
+
- inputText
|
| 260 |
+
type: object
|
| 261 |
+
properties:
|
| 262 |
+
inputText:
|
| 263 |
+
title: Inputtext
|
| 264 |
+
type: string
|
| 265 |
+
example: John Smith's SSN is 012884567
|
| 266 |
+
PIIAnalyzeResponse:
|
| 267 |
+
title: PIIAnalyzeResponse
|
| 268 |
+
required:
|
| 269 |
+
- PIIEntities
|
| 270 |
+
type: object
|
| 271 |
+
properties:
|
| 272 |
+
PIIEntities:
|
| 273 |
+
title: Piientities
|
| 274 |
+
type: array
|
| 275 |
+
items:
|
| 276 |
+
$ref: '#/components/schemas/PIIEntity'
|
| 277 |
+
PIIAnonymizeRequest:
|
| 278 |
+
title: PIIAnonymizeRequest
|
| 279 |
+
required:
|
| 280 |
+
- inputText
|
| 281 |
+
type: object
|
| 282 |
+
properties:
|
| 283 |
+
inputText:
|
| 284 |
+
title: Inputtext
|
| 285 |
+
type: string
|
| 286 |
+
example: John Smith's SSN is 012884567
|
| 287 |
+
piiEntitiesToBeRedacted:
|
| 288 |
+
title: Piientitiestoberedacted
|
| 289 |
+
type: array
|
| 290 |
+
items: {}
|
| 291 |
+
example:
|
| 292 |
+
- US_SSN
|
| 293 |
+
redactionType:
|
| 294 |
+
title: Redactiontype
|
| 295 |
+
type: string
|
| 296 |
+
example: replace
|
| 297 |
+
PIIAnonymizeResponse:
|
| 298 |
+
title: PIIAnonymizeResponse
|
| 299 |
+
required:
|
| 300 |
+
- anonymizedText
|
| 301 |
+
type: object
|
| 302 |
+
properties:
|
| 303 |
+
anonymizedText:
|
| 304 |
+
title: Anonymizedtext
|
| 305 |
+
type: string
|
| 306 |
+
example: John Smith's SSN is <US_SSN>
|
| 307 |
+
PIIEntity:
|
| 308 |
+
title: PIIEntity
|
| 309 |
+
required:
|
| 310 |
+
- type
|
| 311 |
+
- beginOffset
|
| 312 |
+
- endOffset
|
| 313 |
+
- confidenceScore
|
| 314 |
+
type: object
|
| 315 |
+
properties:
|
| 316 |
+
type:
|
| 317 |
+
title: Type
|
| 318 |
+
type: string
|
| 319 |
+
example: US_SSN
|
| 320 |
+
beginOffset:
|
| 321 |
+
title: Beginoffset
|
| 322 |
+
type: integer
|
| 323 |
+
example: 19
|
| 324 |
+
endOffset:
|
| 325 |
+
title: Endoffset
|
| 326 |
+
type: integer
|
| 327 |
+
example: 28
|
| 328 |
+
confidenceScore:
|
| 329 |
+
title: ConfidenceScore
|
| 330 |
+
type: number
|
| 331 |
+
example: 0.25
|
| 332 |
+
PIIImageAnalyzeResponse:
|
| 333 |
+
title: PIIImageAnalyzeResponse
|
| 334 |
+
required:
|
| 335 |
+
- PIIEntities
|
| 336 |
+
type: object
|
| 337 |
+
properties:
|
| 338 |
+
PIIEntities:
|
| 339 |
+
title: Piientities
|
| 340 |
+
type: array
|
| 341 |
+
items:
|
| 342 |
+
$ref: '#/components/schemas/PIIImageEntity'
|
| 343 |
+
PIIImageEntity:
|
| 344 |
+
title: PIIImageEntity
|
| 345 |
+
required:
|
| 346 |
+
- type
|
| 347 |
+
type: object
|
| 348 |
+
properties:
|
| 349 |
+
type:
|
| 350 |
+
title: Type
|
| 351 |
+
type: string
|
| 352 |
+
example: US_SSN
|
| 353 |
+
ValidationError:
|
| 354 |
+
title: ValidationError
|
| 355 |
+
required:
|
| 356 |
+
- loc
|
| 357 |
+
- msg
|
| 358 |
+
- type
|
| 359 |
+
type: object
|
| 360 |
+
properties:
|
| 361 |
+
loc:
|
| 362 |
+
title: Location
|
| 363 |
+
type: array
|
| 364 |
+
items:
|
| 365 |
+
anyOf:
|
| 366 |
+
- type: string
|
| 367 |
+
- type: integer
|
| 368 |
+
msg:
|
| 369 |
+
title: Message
|
| 370 |
+
type: string
|
| 371 |
+
type:
|
| 372 |
+
title: Error Type
|
| 373 |
+
type: string
|
| 374 |
+
securitySchemes:
|
| 375 |
+
oauth_auth:
|
| 376 |
+
type: oauth2
|
| 377 |
+
flows:
|
| 378 |
+
authorizationCode:
|
| 379 |
+
authorizationUrl: https://example.com/oauth/authorize
|
| 380 |
+
tokenUrl: https://example.com/oauth/token
|
| 381 |
+
scopes:
|
| 382 |
+
write:users: modify user profile
|
| 383 |
+
|
| 384 |
+
tags:
|
| 385 |
+
- name: PII Privacy
|
| 386 |
+
description: Operations required for a PII entity (e.g. IN_ADHAAR, IN_PAN, US_SSN etc)
|
| 387 |
+
externalDocs:
|
| 388 |
+
description: Find out more
|
| 389 |
+
url: https://www.infosys.com
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/build_config.yaml
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-
|
| 2 |
+
name: presidio_analyzer
|
| 3 |
+
version: 4.1.0
|
| 4 |
+
build: 0.0.1
|
| 5 |
+
author: Amit Hegde
|
| 6 |
+
author_email: amitumamaheshwar.h@infosys.com
|
| 7 |
+
description: Infosys Intelligent Assistant
|
| 8 |
+
long_description: Infosys Intelligent Assistant
|
| 9 |
+
classifiers: ["Programming Language :: Python :: 3",
|
| 10 |
+
"License :: OSI Approved :: MIT License",
|
| 11 |
+
"Operating System :: OS Independent",]
|
| 12 |
+
package_dir: {"": "presidio_analyzer"}
|
| 13 |
+
packages: presidio_analyzer
|
| 14 |
+
python_requires: ['>=3.6']
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/create_wheel_file.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__copyright__ = """ 2020 - 2021 Infosys Limited, Bangalore, India. All Rights Reserved.
|
| 2 |
+
Version: 2.5.0.0
|
| 3 |
+
Except for any free or open source software components embedded in this Infosys proprietary software program (“Program”), this Program is protected by copyright laws, international treaties and other pending or existing intellectual property rights in India, the United States and other countries.
|
| 4 |
+
Except as expressly permitted, any unauthorized reproduction, storage, transmission in any form or by any means (including without limitation electronic, mechanical, printing, photocopying, recording or otherwise), or any distribution of this Program, or any portion of it, may result in severe civil and criminal penalties, and will be prosecuted to the maximum extent possible under the law.
|
| 5 |
+
"""
|
| 6 |
+
import yaml
|
| 7 |
+
import subprocess
|
| 8 |
+
import os
|
| 9 |
+
with open(r'.\build_config.yaml') as build_file:
|
| 10 |
+
build_config_list = yaml.safe_load(build_file)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
for build_config in build_config_list:
|
| 14 |
+
|
| 15 |
+
try:
|
| 16 |
+
print(build_config)
|
| 17 |
+
|
| 18 |
+
if os.path.exists(f"./{build_config['packages']}"):
|
| 19 |
+
|
| 20 |
+
setup_str = f"import setuptools\r" \
|
| 21 |
+
f"setuptools.setup(\r \
|
| 22 |
+
name='{build_config['name']}',\r \
|
| 23 |
+
version='{build_config['version']}',\r \
|
| 24 |
+
author='{build_config['author']}',\r \
|
| 25 |
+
author_email='{build_config['author_email']}',\r \
|
| 26 |
+
description='{build_config['description']}',\r \
|
| 27 |
+
long_description='{build_config['long_description']}',\r \
|
| 28 |
+
classifiers={build_config['classifiers']},\r \
|
| 29 |
+
package_dir={build_config['package_dir']},\r \
|
| 30 |
+
packages=setuptools.find_packages(where='{build_config['packages']}'),\r \
|
| 31 |
+
python_requires='{build_config['python_requires'][0]}',\r \
|
| 32 |
+
)"
|
| 33 |
+
|
| 34 |
+
with open('setup.py','w') as file:
|
| 35 |
+
file.write(setup_str)
|
| 36 |
+
|
| 37 |
+
subprocess.run(["python", "-m","build"])
|
| 38 |
+
wheel_file = f"{build_config['name']}-{build_config['version']}_build_{build_config['build']}-py3-none-any.whl"
|
| 39 |
+
print(f"wheel_file: {wheel_file}")
|
| 40 |
+
subprocess.run(["python", "-m", "pyc_wheel", f"dist\{wheel_file}"])
|
| 41 |
+
else:
|
| 42 |
+
print(f"Path does not exist ./{build_config['packages']}")
|
| 43 |
+
except Exception as e:
|
| 44 |
+
print("Exception occurred")
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.0.5-py3-none-any.whl
ADDED
|
Binary file (78.9 kB). View file
|
|
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.0.5.tar.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:effdee5c88badc2a4605dcabc7fe1ff43df586df0a7c2be3f4dbc4d440c7e4d6
|
| 3 |
+
size 44375
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.0.6-py3-none-any.whl
ADDED
|
Binary file (79.1 kB). View file
|
|
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.0.6.tar.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c49ca4ee3acda590bb69b68697e02cbfc81b89bd8dcfcaf9ff90b07fec062515
|
| 3 |
+
size 44656
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.1.0-py3-none-any.whl
ADDED
|
Binary file (79.1 kB). View file
|
|
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/dist/presidio_analyzer-4.1.0.tar.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:595ba3a58a473cc94a2a5c421eea075c5db52cb0181335a92f3a222f5cc76736
|
| 3 |
+
size 44675
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.1
|
| 2 |
+
Name: presidio_analyzer
|
| 3 |
+
Version: 4.1.0
|
| 4 |
+
Summary: Infosys Intelligent Assistant
|
| 5 |
+
Author: Amit Hegde
|
| 6 |
+
Author-email: amitumamaheshwar.h@infosys.com
|
| 7 |
+
Classifier: Programming Language :: Python :: 3
|
| 8 |
+
Classifier: License :: OSI Approved :: MIT License
|
| 9 |
+
Classifier: Operating System :: OS Independent
|
| 10 |
+
Requires-Python: >=3.6
|
| 11 |
+
|
| 12 |
+
Infosys Intelligent Assistant
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
setup.py
|
| 2 |
+
presidio_analyzer/presidio_analyzer/__init__.py
|
| 3 |
+
presidio_analyzer/presidio_analyzer/analysis_explanation.py
|
| 4 |
+
presidio_analyzer/presidio_analyzer/analyzer_engine.py
|
| 5 |
+
presidio_analyzer/presidio_analyzer/analyzer_request.py
|
| 6 |
+
presidio_analyzer/presidio_analyzer/app_tracer.py
|
| 7 |
+
presidio_analyzer/presidio_analyzer/batch_analyzer_engine.py
|
| 8 |
+
presidio_analyzer/presidio_analyzer/dict_analyzer_result.py
|
| 9 |
+
presidio_analyzer/presidio_analyzer/entity_recognizer.py
|
| 10 |
+
presidio_analyzer/presidio_analyzer/local_recognizer.py
|
| 11 |
+
presidio_analyzer/presidio_analyzer/pattern.py
|
| 12 |
+
presidio_analyzer/presidio_analyzer/pattern_recognizer.py
|
| 13 |
+
presidio_analyzer/presidio_analyzer/recognizer_result.py
|
| 14 |
+
presidio_analyzer/presidio_analyzer/remote_recognizer.py
|
| 15 |
+
presidio_analyzer/presidio_analyzer.egg-info/PKG-INFO
|
| 16 |
+
presidio_analyzer/presidio_analyzer.egg-info/SOURCES.txt
|
| 17 |
+
presidio_analyzer/presidio_analyzer.egg-info/dependency_links.txt
|
| 18 |
+
presidio_analyzer/presidio_analyzer.egg-info/top_level.txt
|
| 19 |
+
presidio_analyzer/presidio_analyzer/context_aware_enhancers/__init__.py
|
| 20 |
+
presidio_analyzer/presidio_analyzer/context_aware_enhancers/context_aware_enhancer.py
|
| 21 |
+
presidio_analyzer/presidio_analyzer/context_aware_enhancers/lemma_context_aware_enhancer.py
|
| 22 |
+
presidio_analyzer/presidio_analyzer/nlp_engine/__init__.py
|
| 23 |
+
presidio_analyzer/presidio_analyzer/nlp_engine/client_nlp_engine.py
|
| 24 |
+
presidio_analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py
|
| 25 |
+
presidio_analyzer/presidio_analyzer/nlp_engine/nlp_engine.py
|
| 26 |
+
presidio_analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py
|
| 27 |
+
presidio_analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py
|
| 28 |
+
presidio_analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py
|
| 29 |
+
presidio_analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py
|
| 30 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/Aadhaar_Number.py
|
| 31 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/PAN_Number.py
|
| 32 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/__init__.py
|
| 33 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/au_abn_recognizer.py
|
| 34 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/au_acn_recognizer.py
|
| 35 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/au_medicare_recognizer.py
|
| 36 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/au_tfn_recognizer.py
|
| 37 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/credit_card_recognizer.py
|
| 38 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/crypto_recognizer.py
|
| 39 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/data_recognizer.py
|
| 40 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/date_recognizer.py
|
| 41 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/email_recognizer.py
|
| 42 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/es_nif_recognizer.py
|
| 43 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/iban_patterns.py
|
| 44 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/iban_recognizer.py
|
| 45 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/ip_recognizer.py
|
| 46 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/it_driver_license_recognizer.py
|
| 47 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/it_fiscal_code_recognizer.py
|
| 48 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/it_identity_card_recognizer.py
|
| 49 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/it_passport_recognizer.py
|
| 50 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/it_vat_code.py
|
| 51 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/medical_license_recognizer.py
|
| 52 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/phone_recognizer.py
|
| 53 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/sg_fin_recognizer.py
|
| 54 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py
|
| 55 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/stanza_recognizer.py
|
| 56 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py
|
| 57 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/uk_nhs_recognizer.py
|
| 58 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/url_recognizer.py
|
| 59 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/us_driver_license_recognizer.py
|
| 60 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/us_itin_recognizer.py
|
| 61 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/us_passport_recognizer.py
|
| 62 |
+
presidio_analyzer/presidio_analyzer/predefined_recognizers/us_ssn_recognizer.py
|
| 63 |
+
presidio_analyzer/presidio_analyzer/recognizer_registry/__init__.py
|
| 64 |
+
presidio_analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
presidio_analyzer
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/__init__.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Presidio analyzer package."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
|
| 5 |
+
from presidio_analyzer.pattern import Pattern
|
| 6 |
+
from presidio_analyzer.analysis_explanation import AnalysisExplanation
|
| 7 |
+
from presidio_analyzer.recognizer_result import RecognizerResult
|
| 8 |
+
from presidio_analyzer.dict_analyzer_result import DictAnalyzerResult
|
| 9 |
+
from presidio_analyzer.entity_recognizer import EntityRecognizer
|
| 10 |
+
from presidio_analyzer.local_recognizer import LocalRecognizer
|
| 11 |
+
from presidio_analyzer.pattern_recognizer import PatternRecognizer
|
| 12 |
+
from presidio_analyzer.remote_recognizer import RemoteRecognizer
|
| 13 |
+
from presidio_analyzer.recognizer_registry import RecognizerRegistry
|
| 14 |
+
from presidio_analyzer.analyzer_engine import AnalyzerEngine
|
| 15 |
+
from presidio_analyzer.batch_analyzer_engine import BatchAnalyzerEngine
|
| 16 |
+
from presidio_analyzer.analyzer_request import AnalyzerRequest
|
| 17 |
+
from presidio_analyzer.context_aware_enhancers import ContextAwareEnhancer
|
| 18 |
+
from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# Define default loggers behavior

# 1. presidio_analyzer logger:
# attach a NullHandler so the library stays silent unless the host
# application configures logging itself (standard library-logging practice).

logging.getLogger("presidio_analyzer").addHandler(logging.NullHandler())

# 2. decision_process logger.
# Setting the decision process trace here as we would want it
# to be activated using a parameter to AnalyzeEngine and not by default.

decision_process_logger = logging.getLogger("decision_process")
ch = logging.StreamHandler()
formatter = logging.Formatter("[%(asctime)s][%(name)s][%(levelname)s]%(message)s")
ch.setFormatter(formatter)
decision_process_logger.addHandler(ch)
decision_process_logger.setLevel("INFO")

# Public API of the presidio_analyzer package.
__all__ = [
    "Pattern",
    "AnalysisExplanation",
    "RecognizerResult",
    "DictAnalyzerResult",
    "EntityRecognizer",
    "LocalRecognizer",
    "PatternRecognizer",
    "RemoteRecognizer",
    "RecognizerRegistry",
    "AnalyzerEngine",
    "AnalyzerRequest",
    "ContextAwareEnhancer",
    "LemmaContextAwareEnhancer",
    "BatchAnalyzerEngine",
]
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/analysis_explanation.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class AnalysisExplanation:
    """
    Tracing information explaining why a PII entity was identified as such.

    :param recognizer: name of the recognizer that made the decision
    :param original_score: the recognizer's confidence in the result
    :param pattern_name: name of the pattern
        (if the decision was made by a PatternRecognizer)
    :param pattern: regex pattern that was applied (if PatternRecognizer)
    :param validation_result: result of a validation (e.g. checksum)
    :param textual_explanation: free text describing a decision
        of a logic or model
    """

    def __init__(
        self,
        recognizer: str,
        original_score: float,
        pattern_name: str = None,
        pattern: str = None,
        validation_result: float = None,
        textual_explanation: str = None,
    ):

        self.recognizer = recognizer
        self.pattern_name = pattern_name
        self.pattern = pattern
        self.original_score = original_score
        # Effective score starts at the original value; context enhancement
        # may later raise it via set_improved_score().
        self.score = original_score
        self.textual_explanation = textual_explanation
        self.score_context_improvement = 0
        self.supportive_context_word = ""
        self.validation_result = validation_result

    def __repr__(self):
        """Return a string representation built from the instance attributes."""
        return str(self.__dict__)

    def set_improved_score(self, score: float) -> None:
        """Record a context-enhanced score and the delta from the original."""
        self.score_context_improvement = score - self.original_score
        self.score = score

    def set_supportive_context_word(self, word: str) -> None:
        """Remember the context word which helped increase the score."""
        self.supportive_context_word = word

    def append_textual_explanation_line(self, text: str) -> None:
        """Add *text* as a new line of the textual explanation."""
        if self.textual_explanation is None:
            self.textual_explanation = text
        else:
            self.textual_explanation = "\n".join((self.textual_explanation, text))

    def to_dict(self) -> Dict:
        """
        Serialize self to dictionary.

        :return: a dictionary of the instance attributes
        """
        return self.__dict__
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/analyzer_engine.py
ADDED
|
@@ -0,0 +1,372 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
from typing import List, Optional
|
| 4 |
+
|
| 5 |
+
from presidio_analyzer import (
|
| 6 |
+
RecognizerRegistry,
|
| 7 |
+
RecognizerResult,
|
| 8 |
+
EntityRecognizer,
|
| 9 |
+
)
|
| 10 |
+
from presidio_analyzer.app_tracer import AppTracer
|
| 11 |
+
from presidio_analyzer.context_aware_enhancers import (
|
| 12 |
+
ContextAwareEnhancer,
|
| 13 |
+
LemmaContextAwareEnhancer,
|
| 14 |
+
)
|
| 15 |
+
from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider, NlpArtifacts
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger("presidio_analyzer")
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class AnalyzerEngine:
    """
    Entry point for Presidio Analyzer.

    Orchestrating the detection of PII entities and all related logic.

    :param registry: instance of type RecognizerRegistry
    :param nlp_engine: instance of type NlpEngine
        (for example SpacyNlpEngine)
    :param app_tracer: instance of type AppTracer, used to trace the logic
        used during each request for interpretability reasons.
    :param log_decision_process: bool,
        defines whether the decision process within the analyzer
        should be logged or not.
    :param default_score_threshold: Minimum confidence value
        for detected entities to be returned
    :param supported_languages: List of possible languages this engine could be
        run on. Used for loading the right NLP models and recognizers for these
        languages.
    :param context_aware_enhancer: instance of type ContextAwareEnhancer for
        enhancing confidence score based on context words,
        (LemmaContextAwareEnhancer will be created by default if None passed)
    """

    def __init__(
        self,
        registry: RecognizerRegistry = None,
        nlp_engine: NlpEngine = None,
        app_tracer: AppTracer = None,
        log_decision_process: bool = False,
        default_score_threshold: float = 0,
        supported_languages: List[str] = None,
        context_aware_enhancer: Optional[ContextAwareEnhancer] = None,
    ):
        # Default to English when no explicit language list was supplied.
        if not supported_languages:
            supported_languages = ["en"]

        if not nlp_engine:
            logger.info("nlp_engine not provided, creating default.")
            provider = NlpEngineProvider()
            nlp_engine = provider.create_engine()

        if not registry:
            logger.info("registry not provided, creating default.")
            registry = RecognizerRegistry()
        if not app_tracer:
            app_tracer = AppTracer()
        self.app_tracer = app_tracer

        self.supported_languages = supported_languages

        self.nlp_engine = nlp_engine
        self.registry = registry

        # load all recognizers
        if not registry.recognizers:
            registry.load_predefined_recognizers(
                nlp_engine=self.nlp_engine, languages=self.supported_languages
            )

        self.log_decision_process = log_decision_process
        self.default_score_threshold = default_score_threshold

        if not context_aware_enhancer:
            logger.debug(
                "context aware enhancer not provided, creating default"
                + " lemma based enhancer."
            )
            context_aware_enhancer = LemmaContextAwareEnhancer()

        self.context_aware_enhancer = context_aware_enhancer

    def get_recognizers(self, language: Optional[str] = None) -> List[EntityRecognizer]:
        """
        Return a list of PII recognizers currently loaded.

        :param language: Return the recognizers supporting a given language.
        :return: List of [Recognizer] as a RecognizersAllResponse
        """
        if not language:
            languages = self.supported_languages
        else:
            languages = [language]

        recognizers = []
        for language in languages:
            logger.info(f"Fetching all recognizers for language {language}")
            recognizers.extend(
                self.registry.get_recognizers(language=language, all_fields=True)
            )

        # De-duplicate recognizers shared between several languages.
        return list(set(recognizers))

    def get_supported_entities(self, language: Optional[str] = None) -> List[str]:
        """
        Return a list of the entities that can be detected.

        :param language: Return only entities supported in a specific language.
        :return: List of entity names
        """
        recognizers = self.get_recognizers(language=language)
        supported_entities = []
        for recognizer in recognizers:
            supported_entities.extend(recognizer.get_supported_entities())

        return list(set(supported_entities))

    def analyze(
        self,
        text: str,
        language: str,
        entities: Optional[List[str]] = None,
        correlation_id: Optional[str] = None,
        score_threshold: Optional[float] = None,
        return_decision_process: Optional[bool] = False,
        ad_hoc_recognizers: Optional[List[EntityRecognizer]] = None,
        context: Optional[List[str]] = None,
        allow_list: Optional[List[str]] = None,
        nlp_artifacts: Optional[NlpArtifacts] = None,
    ) -> List[RecognizerResult]:
        """
        Find PII entities in text using different PII recognizers for a given language.

        :param text: the text to analyze
        :param language: the language of the text
        :param entities: List of PII entities that should be looked for in the text.
        If entities=None then all entities are looked for.
        :param correlation_id: cross call ID for this request
        :param score_threshold: A minimum value for which
        to return an identified entity
        :param return_decision_process: Whether the analysis decision process steps
        returned in the response.
        :param ad_hoc_recognizers: List of recognizers which will be used only
        for this specific request.
        :param context: List of context words to enhance confidence score if matched
        with the recognized entity's recognizer context
        :param allow_list: List of words that the user defines as being allowed to keep
        in the text
        :param nlp_artifacts: precomputed NlpArtifacts
        :return: an array of the found entities in the text

        :example:

        >>> from presidio_analyzer import AnalyzerEngine

        >>> # Set up the engine, loads the NLP module (spaCy model by default)
        >>> # and other PII recognizers
        >>> analyzer = AnalyzerEngine()

        >>> # Call analyzer to get results
        >>> results = analyzer.analyze(text='My phone number is 212-555-5555', entities=['PHONE_NUMBER'], language='en') # noqa D501
        >>> print(results)
        [type: PHONE_NUMBER, start: 19, end: 31, score: 0.85]
        """
        # No explicit entity filter means every supported entity is in scope.
        all_fields = not entities

        recognizers = self.registry.get_recognizers(
            language=language,
            entities=entities,
            all_fields=all_fields,
            ad_hoc_recognizers=ad_hoc_recognizers,
        )

        if all_fields:
            # Since all_fields=True, list all entities by iterating
            # over all recognizers
            entities = self.get_supported_entities(language=language)

        # run the nlp pipeline over the given text, store the results in
        # a NlpArtifacts instance
        if not nlp_artifacts:
            nlp_artifacts = self.nlp_engine.process_text(text, language)

        if self.log_decision_process:
            self.app_tracer.trace(
                correlation_id, "nlp artifacts:" + nlp_artifacts.to_json()
            )

        results = []
        for recognizer in recognizers:
            # Lazy loading of the relevant recognizers
            if not recognizer.is_loaded:
                recognizer.load()
                recognizer.is_loaded = True

            # analyze using the current recognizer and append the results
            current_results = recognizer.analyze(
                text=text, entities=entities, nlp_artifacts=nlp_artifacts
            )

            if current_results:
                # add recognizer name to recognition metadata inside results
                # if not exists
                self.__add_recognizer_id_if_not_exists(current_results, recognizer)
                results.extend(current_results)

        results = self._enhance_using_context(
            text, results, nlp_artifacts, recognizers, context
        )

        if self.log_decision_process:
            self.app_tracer.trace(
                correlation_id,
                json.dumps([str(result.to_dict()) for result in results]),
            )

        # Remove duplicates or low score results
        results = EntityRecognizer.remove_duplicates(results)
        results = self.__remove_low_scores(results, score_threshold)

        if allow_list:
            results = self._remove_allow_list(results, allow_list, text)

        if not return_decision_process:
            results = self.__remove_decision_process(results)

        return results

    def _enhance_using_context(
        self,
        text: str,
        raw_results: List[RecognizerResult],
        nlp_artifacts: NlpArtifacts,
        recognizers: List[EntityRecognizer],
        context: Optional[List[str]] = None,
    ) -> List[RecognizerResult]:
        """
        Enhance confidence score using context words.

        :param text: The actual text that was analyzed
        :param raw_results: Recognizer results which didn't take
        context into consideration
        :param nlp_artifacts: The nlp artifacts contains elements
        such as lemmatized tokens for better
        accuracy of the context enhancement process
        :param recognizers: the list of recognizers
        :param context: list of context words
        """
        results = []

        for recognizer in recognizers:
            # Split results into those produced by this recognizer and the rest,
            # keyed on the recognizer id stored in recognition metadata.
            recognizer_results = [
                r
                for r in raw_results
                if r.recognition_metadata[RecognizerResult.RECOGNIZER_IDENTIFIER_KEY]
                == recognizer.id
            ]
            other_recognizer_results = [
                r
                for r in raw_results
                if r.recognition_metadata[RecognizerResult.RECOGNIZER_IDENTIFIER_KEY]
                != recognizer.id
            ]

            # enhance score using context in recognizer level if implemented
            recognizer_results = recognizer.enhance_using_context(
                text=text,
                # each recognizer will get access to all recognizer results
                # to allow related entities contex enhancement
                raw_recognizer_results=recognizer_results,
                other_raw_recognizer_results=other_recognizer_results,
                nlp_artifacts=nlp_artifacts,
                context=context,
            )

            results.extend(recognizer_results)

        # Update results in case surrounding words or external context are relevant to
        # the context words.
        results = self.context_aware_enhancer.enhance_using_context(
            text=text,
            raw_results=results,
            nlp_artifacts=nlp_artifacts,
            recognizers=recognizers,
            context=context,
        )

        return results

    def __remove_low_scores(
        self, results: List[RecognizerResult], score_threshold: float = None
    ) -> List[RecognizerResult]:
        """
        Remove results for which the confidence is lower than the threshold.

        :param results: List of RecognizerResult
        :param score_threshold: float value for minimum possible confidence
        :return: List[RecognizerResult]
        """
        # Fall back to the engine-wide threshold when none was given per-call.
        if score_threshold is None:
            score_threshold = self.default_score_threshold

        new_results = [result for result in results if result.score >= score_threshold]
        return new_results

    @staticmethod
    def _remove_allow_list(
        results: List[RecognizerResult], allow_list: List[str], text: str
    ) -> List[RecognizerResult]:
        """
        Remove results which are part of the allow list.

        :param results: List of RecognizerResult
        :param allow_list: list of allowed terms
        :param text: the text to analyze
        :return: List[RecognizerResult]
        """
        new_results = []
        for result in results:
            word = text[result.start : result.end]
            # if the word is not specified to be allowed, keep in the PII entities
            if word not in allow_list:
                new_results.append(result)

        return new_results

    @staticmethod
    def __add_recognizer_id_if_not_exists(
        results: List[RecognizerResult], recognizer: EntityRecognizer
    ):
        """Ensure recognition metadata with recognizer id existence.

        Ensure recognizer result list contains recognizer id inside recognition
        metadata dictionary, and if not create it. recognizer_id is needed
        for context aware enhancement.

        :param results: List of RecognizerResult
        :param recognizer: Entity recognizer
        """
        for result in results:
            if not result.recognition_metadata:
                result.recognition_metadata = dict()
            if (
                RecognizerResult.RECOGNIZER_IDENTIFIER_KEY
                not in result.recognition_metadata
            ):
                result.recognition_metadata[
                    RecognizerResult.RECOGNIZER_IDENTIFIER_KEY
                ] = recognizer.id
            if RecognizerResult.RECOGNIZER_NAME_KEY not in result.recognition_metadata:
                result.recognition_metadata[
                    RecognizerResult.RECOGNIZER_NAME_KEY
                ] = recognizer.name

    @staticmethod
    def __remove_decision_process(
        results: List[RecognizerResult],
    ) -> List[RecognizerResult]:
        """Remove decision process / analysis explanation from response."""

        for result in results:
            result.analysis_explanation = None

        return results
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/analyzer_request.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict
|
| 2 |
+
|
| 3 |
+
from presidio_analyzer import PatternRecognizer
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class AnalyzerRequest:
    """
    Analyzer request data.

    :param req_data: A request dictionary with the following fields:
        text: the text to analyze
        language: the language of the text
        entities: List of PII entities that should be looked for in the text.
        If entities=None then all entities are looked for.
        correlation_id: cross call ID for this request
        score_threshold: A minimum value for which to return an identified entity
        log_decision_process: Should the decision points within the analysis
        be logged
        return_decision_process: Should the decision points within the analysis
        returned as part of the response
    """

    def __init__(self, req_data: Dict):
        # Missing keys simply become None, mirroring dict.get semantics.
        field = req_data.get
        self.text = field("text")
        self.language = field("language")
        self.entities = field("entities")
        self.correlation_id = field("correlation_id")
        self.score_threshold = field("score_threshold")
        self.return_decision_process = field("return_decision_process")
        # Ad-hoc recognizers arrive as dictionaries and are deserialized
        # into PatternRecognizer objects; default is an empty list.
        recognizer_dicts = field("ad_hoc_recognizers")
        self.ad_hoc_recognizers = []
        if recognizer_dicts:
            self.ad_hoc_recognizers = [
                PatternRecognizer.from_dict(entry) for entry in recognizer_dicts
            ]
        self.context = field("context")
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/app_tracer.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class AppTracer:
    """
    Allow logging/tracing the system's decisions.

    Relevant in cases where we want to know which modules were used for detection,
    which logic was utilized, what results were given and potentially why.
    This can be useful for analyzing the detection accuracy of the system.
    :param enabled: Whether tracing should be activated.
    """

    def __init__(self, enabled: bool = True):
        self.logger = logging.getLogger("decision_process")
        self.enabled = enabled

    def trace(self, request_id: str, trace_data: str) -> None:
        """
        Write a value associated with a decision for a specific request into the trace.

        Tracing for further inspection if needed.
        :param request_id: A unique ID, to correlate across calls.
        :param trace_data: A string to write to the log.
        """
        # Guard clause: a disabled tracer is a no-op.
        if not self.enabled:
            return
        self.logger.info("[%s][%s]", request_id, trace_data)
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/batch_analyzer_engine.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import List, Iterable, Dict, Union, Any, Optional, Iterator, Tuple
|
| 3 |
+
|
| 4 |
+
from presidio_analyzer import DictAnalyzerResult, RecognizerResult, AnalyzerEngine
|
| 5 |
+
from presidio_analyzer.nlp_engine import NlpArtifacts
|
| 6 |
+
|
| 7 |
+
logger = logging.getLogger("presidio_analyzer")
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class BatchAnalyzerEngine:
    """
    Batch analysis of documents (tables, lists, dicts).

    Wrapper class to run Presidio Analyzer Engine on multiple values,
    either lists/iterators of strings, or dictionaries.

    :param analyzer_engine: AnalyzerEngine instance to use
        for handling the values in those collections.
    """

    def __init__(self, analyzer_engine: Optional[AnalyzerEngine] = None):
        # Fall back to a default engine when none is supplied.
        self.analyzer_engine = analyzer_engine
        if not analyzer_engine:
            self.analyzer_engine = AnalyzerEngine()

    def analyze_iterator(
        self,
        texts: Iterable[Union[str, bool, float, int]],
        language: str,
        **kwargs,
    ) -> List[List[RecognizerResult]]:
        """
        Analyze an iterable of strings.

        :param texts: An iterable containing strings to be analyzed.
        :param language: Input language
        :param kwargs: Additional parameters for the `AnalyzerEngine.analyze` method.
        :return: One list of RecognizerResult per input value, in order.
        :raises ValueError: if a value is not a primitive type.
        """
        # validate types (lazily, while iterating)
        texts = self._validate_types(texts)

        # Process the texts as batch for improved performance
        nlp_artifacts_batch: Iterator[
            Tuple[str, NlpArtifacts]
        ] = self.analyzer_engine.nlp_engine.process_batch(
            texts=texts, language=language
        )

        list_results = []
        for text, nlp_artifacts in nlp_artifacts_batch:
            # Reuse the precomputed nlp_artifacts to avoid re-running the NLP
            # pipeline inside analyze().
            results = self.analyzer_engine.analyze(
                text=str(text), nlp_artifacts=nlp_artifacts, language=language, **kwargs
            )

            list_results.append(results)

        return list_results

    def analyze_dict(
        self,
        input_dict: Dict[str, Union[Any, Iterable[Any]]],
        language: str,
        keys_to_skip: Optional[List[str]] = None,
        **kwargs,
    ) -> Iterator[DictAnalyzerResult]:
        """
        Analyze a dictionary of keys (strings) and values/iterable of values.

        Non-string values are returned as is.

        :param input_dict: The input dictionary for analysis
        :param language: Input language
        :param keys_to_skip: Keys to ignore during analysis
        :param kwargs: Additional keyword arguments
            for the `AnalyzerEngine.analyze` method.
            Use this to pass arguments to the analyze method,
            such as `ad_hoc_recognizers`, `context`, `return_decision_process`.
            See `AnalyzerEngine.analyze` for the full list.
        :raises ValueError: if a value has an unsupported type.
        """
        # Pull `context` out of kwargs so it can be extended per key
        # without being passed twice to analyze().
        context = []
        if "context" in kwargs:
            context = kwargs["context"]
            del kwargs["context"]

        if not keys_to_skip:
            keys_to_skip = []

        for key, value in input_dict.items():
            if not value or key in keys_to_skip:
                yield DictAnalyzerResult(key=key, value=value, recognizer_results=[])
                continue  # skip this key as requested

            # Add the key as an additional context
            specific_context = context[:]
            specific_context.append(key)

            if type(value) in (str, int, bool, float):
                # Fix: pass the full inherited context plus the key
                # (previously only [key] was passed, dropping any context
                # supplied by the caller or accumulated from parent keys).
                results: List[RecognizerResult] = self.analyzer_engine.analyze(
                    text=str(value),
                    language=language,
                    context=specific_context,
                    **kwargs,
                )
            elif isinstance(value, dict):
                # Recursively iterate nested dicts
                new_keys_to_skip = self._get_nested_keys_to_skip(key, keys_to_skip)
                results = self.analyze_dict(
                    input_dict=value,
                    language=language,
                    context=specific_context,
                    keys_to_skip=new_keys_to_skip,
                    **kwargs,
                )
            elif isinstance(value, Iterable):
                results: List[List[RecognizerResult]] = self.analyze_iterator(
                    texts=value,
                    language=language,
                    context=specific_context,
                    **kwargs,
                )
            else:
                raise ValueError(f"type {type(value)} is unsupported.")

            yield DictAnalyzerResult(key=key, value=value, recognizer_results=results)

    @staticmethod
    def _validate_types(value_iterator: Iterable[Any]) -> Iterator[Any]:
        """Yield values, rejecting any non-primitive (non-empty) element."""
        for val in value_iterator:
            if val and not type(val) in (int, float, bool, str):
                err_msg = (
                    "Analyzer.analyze_iterator only works "
                    "on primitive types (int, float, bool, str). "
                    "Lists of objects are not yet supported."
                )
                logger.error(err_msg)
                raise ValueError(err_msg)
            yield val

    @staticmethod
    def _get_nested_keys_to_skip(key, keys_to_skip):
        """Strip the `key.` prefix from skip entries belonging to a nested dict."""
        new_keys_to_skip = [
            k.replace(f"{key}.", "") for k in keys_to_skip if k.startswith(key)
        ]
        return new_keys_to_skip
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/context_aware_enhancers/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Recognizer registry init."""
|
| 2 |
+
from .context_aware_enhancer import ContextAwareEnhancer
|
| 3 |
+
from .lemma_context_aware_enhancer import LemmaContextAwareEnhancer
|
| 4 |
+
|
| 5 |
+
__all__ = ["ContextAwareEnhancer", "LemmaContextAwareEnhancer"]
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/context_aware_enhancers/context_aware_enhancer.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from abc import abstractmethod
|
| 3 |
+
from typing import List, Optional
|
| 4 |
+
|
| 5 |
+
from presidio_analyzer import RecognizerResult
|
| 6 |
+
from presidio_analyzer import EntityRecognizer
|
| 7 |
+
from presidio_analyzer.nlp_engine import NlpArtifacts
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger("presidio_analyzer")
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class ContextAwareEnhancer:
    """
    Abstract base for context-aware confidence enhancement.

    Words appearing near a detected entity may raise the confidence that the
    detection is correct. Subclasses implement the concrete matching logic in
    :meth:`enhance_using_context`.

    :param context_similarity_factor: How much to enhance confidence of match entity
    :param min_score_with_context_similarity: Minimum confidence score
    :param context_prefix_count: how many words before the entity to match context
    :param context_suffix_count: how many words after the entity to match context
    """

    MIN_SCORE = 0
    MAX_SCORE = 1.0

    def __init__(
        self,
        context_similarity_factor: float,
        min_score_with_context_similarity: float,
        context_prefix_count: int,
        context_suffix_count: int,
    ):
        # Score boost applied when a supportive context word is found.
        self.context_similarity_factor = context_similarity_factor
        # Floor applied to a score that was boosted by context.
        self.min_score_with_context_similarity = min_score_with_context_similarity
        # Size of the context window before / after the matched entity.
        self.context_prefix_count = context_prefix_count
        self.context_suffix_count = context_suffix_count

    @abstractmethod
    def enhance_using_context(
        self,
        text: str,
        raw_results: List[RecognizerResult],
        nlp_artifacts: NlpArtifacts,
        recognizers: List[EntityRecognizer],
        context: Optional[List[str]] = None,
    ) -> List[RecognizerResult]:
        """
        Update results when surrounding words are relevant to the context words.

        Inspect the words around each matched entity; when specific strings are
        found they contribute to the result's score, increasing the confidence
        that the match is indeed of that PII entity type.

        :param text: The actual text that was analyzed
        :param raw_results: Recognizer results which didn't take
                            context into consideration
        :param nlp_artifacts: The nlp artifacts contains elements
                              such as lemmatized tokens for better
                              accuracy of the context enhancement process
        :param recognizers: the list of recognizers
        :param context: list of context words
        """
        return raw_results
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/context_aware_enhancers/lemma_context_aware_enhancer.py
ADDED
|
@@ -0,0 +1,334 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
import logging
|
| 3 |
+
from typing import List, Optional
|
| 4 |
+
|
| 5 |
+
from presidio_analyzer import RecognizerResult
|
| 6 |
+
from presidio_analyzer import EntityRecognizer
|
| 7 |
+
from presidio_analyzer.nlp_engine import NlpArtifacts
|
| 8 |
+
from presidio_analyzer.context_aware_enhancers import ContextAwareEnhancer
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger("presidio_analyzer")
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class LemmaContextAwareEnhancer(ContextAwareEnhancer):
    """
    A class representing a lemma based context aware enhancer logic.

    Context words might enhance confidence score of a recognized entity,
    LemmaContextAwareEnhancer is an implementation of Lemma based context aware logic,
    it compares spacy lemmas of each word in context of the matched entity to given
    context and the recognizer context words,
    if matched it enhance the recognized entity confidence score by a given factor.

    :param context_similarity_factor: How much to enhance confidence of match entity
    :param min_score_with_context_similarity: Minimum confidence score
    :param context_prefix_count: how many words before the entity to match context
    :param context_suffix_count: how many words after the entity to match context
    """

    def __init__(
        self,
        context_similarity_factor: float = 0.35,
        min_score_with_context_similarity: float = 0.4,
        context_prefix_count: int = 5,
        context_suffix_count: int = 0,
    ):
        super().__init__(
            context_similarity_factor=context_similarity_factor,
            min_score_with_context_similarity=min_score_with_context_similarity,
            context_prefix_count=context_prefix_count,
            context_suffix_count=context_suffix_count,
        )

    def enhance_using_context(
        self,
        text: str,
        raw_results: List[RecognizerResult],
        nlp_artifacts: NlpArtifacts,
        recognizers: List[EntityRecognizer],
        context: Optional[List[str]] = None,
    ) -> List[RecognizerResult]:
        """
        Update results in case the lemmas of surrounding words or input context
        words are identical to the context words.

        Using the surrounding words of the actual word matches, look
        for specific strings that if found contribute to the score
        of the result, improving the confidence that the match is
        indeed of that PII entity type

        :param text: The actual text that was analyzed
        :param raw_results: Recognizer results which didn't take
                            context into consideration
        :param nlp_artifacts: The nlp artifacts contains elements
                              such as lemmatized tokens for better
                              accuracy of the context enhancement process
        :param recognizers: the list of recognizers
        :param context: list of context words
        """  # noqa D205 D400

        # create a deep copy of the results object, so we can manipulate it
        results = copy.deepcopy(raw_results)

        # create recognizer context dictionary
        recognizers_dict = {recognizer.id: recognizer for recognizer in recognizers}

        # Create empty list in None or lowercase all context words in the list
        if not context:
            context = []
        else:
            context = [word.lower() for word in context]

        # Sanity
        if nlp_artifacts is None:
            logger.warning("NLP artifacts were not provided")
            return results

        for result in results:
            recognizer = None
            # get recognizer matching the result, if found.
            if (
                result.recognition_metadata
                and RecognizerResult.RECOGNIZER_IDENTIFIER_KEY
                in result.recognition_metadata.keys()
            ):
                recognizer = recognizers_dict.get(
                    result.recognition_metadata[
                        RecognizerResult.RECOGNIZER_IDENTIFIER_KEY
                    ]
                )

            if not recognizer:
                logger.debug(
                    "Recognizer name not found as part of the "
                    "recognition_metadata dict in the RecognizerResult. "
                )
                continue

            # skip recognizer result if the recognizer doesn't support
            # context enhancement
            if not recognizer.context:
                logger.debug(
                    "recognizer '%s' does not support context enhancement",
                    recognizer.name,
                )
                continue

            # skip context enhancement if already boosted by recognizer level
            if result.recognition_metadata.get(
                RecognizerResult.IS_SCORE_ENHANCED_BY_CONTEXT_KEY
            ):
                logger.debug("result score already boosted, skipping")
                continue

            # extract lemmatized context from the surrounding of the match
            word = text[result.start : result.end]

            surrounding_words = self._extract_surrounding_words(
                nlp_artifacts=nlp_artifacts, word=word, start=result.start
            )

            # combine other sources of context with surrounding words
            surrounding_words.extend(context)

            supportive_context_word = self._find_supportive_word_in_context(
                surrounding_words, recognizer.context
            )
            if supportive_context_word != "":
                # Boost, then raise to at least the configured minimum,
                # and finally cap at MAX_SCORE (order matters here).
                result.score += self.context_similarity_factor
                result.score = max(result.score, self.min_score_with_context_similarity)
                result.score = min(result.score, ContextAwareEnhancer.MAX_SCORE)

                # Update the explainability object with context information
                # helped to improve the score
                result.analysis_explanation.set_supportive_context_word(
                    supportive_context_word
                )
                result.analysis_explanation.set_improved_score(result.score)
        return results

    @staticmethod
    def _find_supportive_word_in_context(
        context_list: List[str], recognizer_context_list: List[str]
    ) -> str:
        """
        Find words in the text which are relevant for context evaluation.

        A word is considered a supportive context word if there's exact match
        between a keyword in context_text and any keyword in context_list.

        Returns the first matching predefined context word, or "" when
        no match (or no input) exists.

        :param context_list words before and after the matched entity within
               a specified window size
        :param recognizer_context_list a list of words considered as
                context keywords manually specified by the recognizer's author
        """
        word = ""
        # If the context list is empty, no need to continue
        if context_list is None or recognizer_context_list is None:
            return word

        for predefined_context_word in recognizer_context_list:
            # result == true only if any of the predefined context words
            # is found exactly or as a substring in any of the collected
            # context words
            result = next(
                (
                    True
                    for keyword in context_list
                    if predefined_context_word in keyword
                ),
                False,
            )
            if result:
                logger.debug("Found context keyword '%s'", predefined_context_word)
                word = predefined_context_word
                break

        return word

    def _extract_surrounding_words(
        self, nlp_artifacts: NlpArtifacts, word: str, start: int
    ) -> List[str]:
        """Extract words surrounding another given word.

        The text from which the context is extracted is given in the nlp
        doc.

        :param nlp_artifacts: An abstraction layer which holds different
                              items which are the result of a NLP pipeline
                              execution on a given text
        :param word: The word to look for context around
        :param start: The start index of the word in the original text
        """
        if not nlp_artifacts.tokens:
            logger.info("Skipping context extraction due to lack of NLP artifacts")
            # if there are no nlp artifacts, this is ok, we can
            # extract context and we return a valid, yet empty
            # context
            return [""]

        # Get the already prepared words in the given text, in their
        # LEMMATIZED version
        lemmatized_keywords = nlp_artifacts.keywords

        # since the list of tokens is not necessarily aligned
        # with the actual index of the match, we look for the
        # token index which corresponds to the match
        token_index = self._find_index_of_match_token(
            word, start, nlp_artifacts.tokens, nlp_artifacts.tokens_indices
        )

        # index i belongs to the PII entity, take the preceding n words
        # and the successing m words into a context list

        backward_context = self._add_n_words_backward(
            token_index,
            self.context_prefix_count,
            nlp_artifacts.lemmas,
            lemmatized_keywords,
        )
        forward_context = self._add_n_words_forward(
            token_index,
            self.context_suffix_count,
            nlp_artifacts.lemmas,
            lemmatized_keywords,
        )

        # Deduplicate; note the resulting order is unspecified (set-based).
        context_list = []
        context_list.extend(backward_context)
        context_list.extend(forward_context)
        context_list = list(set(context_list))
        logger.debug("Context list is: %s", " ".join(context_list))
        return context_list

    @staticmethod
    def _find_index_of_match_token(
        word: str, start: int, tokens, tokens_indices: List[int]  # noqa ANN001
    ) -> int:
        """
        Find the index of the token that covers the match's start position.

        :param word: The matched word (used only for the error message)
        :param start: Character start index of the match in the original text
        :param tokens: The original (non-lemmatized) tokens
        :param tokens_indices: Character start index of each token
        :raises ValueError: If no token covers the given start position
        """
        found = False
        # we use the known start index of the original word to find the actual
        # token at that index, we are not checking for equivilance since the
        # token might be just a substring of that word (e.g. for phone number
        # 555-124564 the first token might be just '555' or for a match like '
        # rocket' the actual token will just be 'rocket' hence the misalignment
        # of indices)
        # Note: we are iterating over the original tokens (not the lemmatized)
        i = -1
        for i, token in enumerate(tokens, 0):
            # Either we found a token with the exact location, or
            # we take a token which its characters indices covers
            # the index we are looking for.
            if (tokens_indices[i] == start) or (start < tokens_indices[i] + len(token)):
                # found the interesting token, the one that around it
                # we take n words, we save the matching lemma
                found = True
                break

        if not found:
            raise ValueError(
                "Did not find word '" + word + "' "
                "in the list of tokens although it "
                "is expected to be found"
            )
        return i

    @staticmethod
    def _add_n_words(
        index: int,
        n_words: int,
        lemmas: List[str],
        lemmatized_filtered_keywords: List[str],
        is_backward: bool,
    ) -> List[str]:
        """
        Prepare a string of context words.

        Return a list of words which surrounds a lemma at a given index.
        The words will be collected only if exist in the filtered array

        :param index: index of the lemma that its surrounding words we want
        :param n_words: number of words to take
        :param lemmas: array of lemmas
        :param lemmatized_filtered_keywords: the array of filtered
               lemmas from the original sentence,
        :param is_backward: if true take the preceeding words, if false,
                            take the successing words
        """
        i = index
        context_words = []
        # The entity itself is no interest to us...however we want to
        # consider it anyway for cases were it is attached with no spaces
        # to an interesting context word, so we allow it and add 1 to
        # the number of collected words

        # collect at most n words (in lower case)
        remaining = n_words + 1
        while 0 <= i < len(lemmas) and remaining > 0:
            lower_lemma = lemmas[i].lower()
            if lower_lemma in lemmatized_filtered_keywords:
                context_words.append(lower_lemma)
                remaining -= 1
            i = i - 1 if is_backward else i + 1
        return context_words

    def _add_n_words_forward(
        self,
        index: int,
        n_words: int,
        lemmas: List[str],
        lemmatized_filtered_keywords: List[str],
    ) -> List[str]:
        """Collect up to ``n_words`` context words following ``index``."""
        return self._add_n_words(
            index, n_words, lemmas, lemmatized_filtered_keywords, False
        )

    def _add_n_words_backward(
        self,
        index: int,
        n_words: int,
        lemmas: List[str],
        lemmatized_filtered_keywords: List[str],
    ) -> List[str]:
        """Collect up to ``n_words`` context words preceding ``index``."""
        return self._add_n_words(
            index, n_words, lemmas, lemmatized_filtered_keywords, True
        )
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/dict_analyzer_result.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from typing import List, Union, Iterator
|
| 3 |
+
|
| 4 |
+
from presidio_analyzer import RecognizerResult
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@dataclass
class DictAnalyzerResult:
    """
    Hold the Presidio Analyzer output for one key/value pair of a dictionary.

    :param key: key in dictionary
    :param value: value to run analysis on (either string or list of strings)
    :param recognizer_results: Analyzer output for one value.
        Shape depends on the value's type:
        - A list of recognizer results if the input is one string
        - A list of lists of recognizer results, if the input is a list of strings.
        - An iterator of a DictAnalyzerResult, if the input is a dictionary;
          the iterator yields the DictAnalyzerResults of the next level
          in the dictionary.
    """

    key: str
    value: Union[str, List[str], dict]
    recognizer_results: Union[
        List[RecognizerResult],
        List[List[RecognizerResult]],
        Iterator["DictAnalyzerResult"],
    ]
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/entity_recognizer.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from abc import abstractmethod
|
| 3 |
+
from typing import List, Dict, Optional
|
| 4 |
+
|
| 5 |
+
from presidio_analyzer import RecognizerResult
|
| 6 |
+
from presidio_analyzer.nlp_engine import NlpArtifacts
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger("presidio_analyzer")
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class EntityRecognizer:
    """
    Abstract base class for PII entity recognizers.

    Subclasses hold the logic for recognizing specific PII entities and
    implement :meth:`load` and :meth:`analyze`. The ``enhance_using_context``
    hook may be overridden when a recognizer needs custom context-aware
    score enhancement.

    :param supported_entities: the entities supported by this recognizer
        (for example, phone number, address, etc.)
    :param supported_language: the language supported by this recognizer.
        The supported langauge code is iso6391Name
    :param name: the name of this recognizer (optional)
    :param version: the recognizer current version
    :param context: a list of words which can help boost confidence score
        when they appear in context of the matched entity
    """

    MIN_SCORE = 0
    MAX_SCORE = 1.0

    def __init__(
        self,
        supported_entities: List[str],
        name: str = None,
        supported_language: str = "en",
        version: str = "0.0.1",
        context: Optional[List[str]] = None,
    ):

        self.supported_entities = supported_entities

        # Default the recognizer name to its class name.
        self.name = self.__class__.__name__ if name is None else name

        # Unique identifier: recognizer name plus the instance's object id.
        self._id = f"{self.name}_{id(self)}"

        self.supported_language = supported_language
        self.version = version
        self.is_loaded = False
        self.context = context if context else []

        self.load()
        logger.info("Loaded recognizer: %s", self.name)
        self.is_loaded = True

    @property
    def id(self):
        """Return a unique identifier of this recognizer."""
        return self._id

    @abstractmethod
    def load(self) -> None:
        """
        Initialize the recognizer assets if needed.

        (e.g. machine learning models)
        """

    @abstractmethod
    def analyze(
        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts
    ) -> List[RecognizerResult]:
        """
        Analyze text to identify entities.

        :param text: The text to be analyzed
        :param entities: The list of entities this recognizer is able to detect
        :param nlp_artifacts: A group of attributes which are the result of
                              an NLP process over the input text.
        :return: List of results detected by this recognizer.
        """
        return None

    def enhance_using_context(
        self,
        text: str,
        raw_recognizer_results: List[RecognizerResult],
        other_raw_recognizer_results: List[RecognizerResult],
        nlp_artifacts: NlpArtifacts,
        context: Optional[List[str]] = None,
    ) -> List[RecognizerResult]:
        """Enhance confidence score using context of the entity.

        Override in a derived class when custom logic is needed; the default
        implementation returns ``raw_recognizer_results`` unchanged.

        When a result score is boosted, the derived class must update
        result.recognition_metadata[RecognizerResult.IS_SCORE_ENHANCED_BY_CONTEXT_KEY]

        :param text: The actual text that was analyzed
        :param raw_recognizer_results: This recognizer's results, to be updated
            based on recognizer specific context.
        :param other_raw_recognizer_results: Other recognizer results matched in
            the given text to allow related entity context enhancement
        :param nlp_artifacts: The nlp artifacts contains elements
                              such as lemmatized tokens for better
                              accuracy of the context enhancement process
        :param context: list of context words
        """
        return raw_recognizer_results

    def get_supported_entities(self) -> List[str]:
        """
        Return the list of entities this recognizer can identify.

        :return: A list of the supported entities by this recognizer
        """
        return self.supported_entities

    def get_supported_language(self) -> str:
        """
        Return the language this recognizer can support.

        :return: A list of the supported language by this recognizer
        """
        return self.supported_language

    def get_version(self) -> str:
        """
        Return the version of this recognizer.

        :return: The current version of this recognizer
        """
        return self.version

    def to_dict(self) -> Dict:
        """
        Serialize self to dictionary.

        :return: a dictionary
        """
        return {
            "supported_entities": self.supported_entities,
            "supported_language": self.supported_language,
            "name": self.name,
            "version": self.version,
        }

    @classmethod
    def from_dict(cls, entity_recognizer_dict: Dict) -> "EntityRecognizer":
        """
        Create EntityRecognizer from a dict input.

        :param entity_recognizer_dict: Dict containing keys and values for instantiation
        """
        return cls(**entity_recognizer_dict)

    @staticmethod
    def remove_duplicates(results: List[RecognizerResult]) -> List[RecognizerResult]:
        """
        Remove duplicate results.

        Drops zero-score results, exact duplicates (equality-based), and
        results fully contained in an already-kept result of the same
        entity type.
        :param results: List[RecognizerResult]
        :return: List[RecognizerResult]
        """
        # Highest score first; on ties, earliest start, then longest span.
        ordered = sorted(
            set(results),
            key=lambda res: (-res.score, res.start, -(res.end - res.start)),
        )
        filtered: List[RecognizerResult] = []
        for candidate in ordered:
            # Zero-confidence results are dropped outright.
            if candidate.score == 0:
                continue
            # Drop exact duplicates (equality-based comparison).
            if candidate in filtered:
                continue
            # Drop results contained in a kept result of the same entity type.
            if any(
                candidate.contained_in(kept)
                and candidate.entity_type == kept.entity_type
                for kept in filtered
            ):
                continue
            filtered.append(candidate)
        return filtered
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/local_recognizer.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC
|
| 2 |
+
|
| 3 |
+
from presidio_analyzer import EntityRecognizer
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class LocalRecognizer(ABC, EntityRecognizer):
    """
    PII entity recognizer which runs on the same process as the AnalyzerEngine.

    Marker base class: it adds no behavior of its own beyond what
    EntityRecognizer provides.
    """
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""NLP engine package. Performs text pre-processing."""
|
| 2 |
+
|
| 3 |
+
from .nlp_artifacts import NlpArtifacts
|
| 4 |
+
from .nlp_engine import NlpEngine
|
| 5 |
+
from .spacy_nlp_engine import SpacyNlpEngine
|
| 6 |
+
from .client_nlp_engine import ClientNlpEngine
|
| 7 |
+
from .stanza_nlp_engine import StanzaNlpEngine
|
| 8 |
+
from .transformers_nlp_engine import TransformersNlpEngine
|
| 9 |
+
from .nlp_engine_provider import NlpEngineProvider
|
| 10 |
+
|
| 11 |
+
__all__ = [
|
| 12 |
+
"NlpArtifacts",
|
| 13 |
+
"NlpEngine",
|
| 14 |
+
"SpacyNlpEngine",
|
| 15 |
+
"ClientNlpEngine",
|
| 16 |
+
"StanzaNlpEngine",
|
| 17 |
+
"NlpEngineProvider",
|
| 18 |
+
"TransformersNlpEngine",
|
| 19 |
+
]
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/client_nlp_engine.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
# import logging
|
| 3 |
+
|
| 4 |
+
try:
|
| 5 |
+
import client
|
| 6 |
+
import spacy_client
|
| 7 |
+
except ImportError:
|
| 8 |
+
client = None
|
| 9 |
+
|
| 10 |
+
from typing import Optional, Dict, Iterator, Tuple, Union, List
|
| 11 |
+
|
| 12 |
+
import spacy
|
| 13 |
+
from spacy.language import Language
|
| 14 |
+
from spacy.tokens import Doc
|
| 15 |
+
|
| 16 |
+
from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngine
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger("presidio_analyzer")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class ClientNlpEngine(NlpEngine):
|
| 22 |
+
"""
|
| 23 |
+
SpacyNlpEngine is an abstraction layer over the nlp module.
|
| 24 |
+
|
| 25 |
+
It provides processing functionality as well as other queries
|
| 26 |
+
on tokens.
|
| 27 |
+
The SpacyNlpEngine uses SpaCy as its NLP module
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
engine_name="spacy"
|
| 32 |
+
|
| 33 |
+
is_available = bool(spacy)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def __init__(self, models: Optional[Dict[str, str]] = None):
|
| 37 |
+
"""
|
| 38 |
+
Initialize a wrapper on spaCy functionality.
|
| 39 |
+
|
| 40 |
+
:param models: Dictionary with the name of the spaCy model per language.
|
| 41 |
+
For example: models = {"en": "en_core_web_lg"}
|
| 42 |
+
"""
|
| 43 |
+
if not models:
|
| 44 |
+
models = {"en": "en_core_web_lg"}
|
| 45 |
+
logger.debug(f"Loading SpaCy models: {models.values()}")
|
| 46 |
+
|
| 47 |
+
self.nlp = {
|
| 48 |
+
lang_code: spacy.load(model_name, disable=["parser"])
|
| 49 |
+
for lang_code, model_name in models.items()
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def process_text(self, text: str, language: str) -> NlpArtifacts:
|
| 55 |
+
"""Execute the SpaCy NLP pipeline on the given text and language."""
|
| 56 |
+
|
| 57 |
+
doc = self.nlp[language](text)
|
| 58 |
+
return self._doc_to_nlp_artifact(doc, language)
|
| 59 |
+
|
| 60 |
+
def process_batch(
|
| 61 |
+
self,
|
| 62 |
+
texts: Union[List[str], List[Tuple[str, object]]],
|
| 63 |
+
language: str,
|
| 64 |
+
as_tuples: bool = False,
|
| 65 |
+
) -> Iterator[Optional[NlpArtifacts]]:
|
| 66 |
+
"""Execute the NLP pipeline on a batch of texts using spacy pipe."""
|
| 67 |
+
texts = (str(text) for text in texts)
|
| 68 |
+
docs = self.nlp[language].pipe(texts, as_tuples=as_tuples)
|
| 69 |
+
for doc in docs:
|
| 70 |
+
yield doc.text, self._doc_to_nlp_artifact(doc, language)
|
| 71 |
+
|
| 72 |
+
def is_stopword(self, word: str, language: str) -> bool:
|
| 73 |
+
"""
|
| 74 |
+
Return true if the given word is a stop word.
|
| 75 |
+
|
| 76 |
+
(within the given language)
|
| 77 |
+
"""
|
| 78 |
+
return self.nlp[language].vocab[word].is_stop
|
| 79 |
+
|
| 80 |
+
def is_punct(self, word: str, language: str) -> bool:
|
| 81 |
+
"""
|
| 82 |
+
Return true if the given word is a punctuation word.
|
| 83 |
+
|
| 84 |
+
(within the given language).
|
| 85 |
+
"""
|
| 86 |
+
return self.nlp[language].vocab[word].is_punct
|
| 87 |
+
|
| 88 |
+
def get_nlp(self, language: str) -> Language:
|
| 89 |
+
"""
|
| 90 |
+
Return the language model loaded for a language.
|
| 91 |
+
|
| 92 |
+
:param language: Name of language
|
| 93 |
+
:return: Language model from spaCy
|
| 94 |
+
"""
|
| 95 |
+
return self.nlp[language]
|
| 96 |
+
|
| 97 |
+
def _doc_to_nlp_artifact(self, doc: Doc, language: str) -> NlpArtifacts:
|
| 98 |
+
lemmas = [token.lemma_ for token in doc]
|
| 99 |
+
tokens_indices = [token.idx for token in doc]
|
| 100 |
+
entities = doc.ents
|
| 101 |
+
return NlpArtifacts(
|
| 102 |
+
entities=entities,
|
| 103 |
+
tokens=doc,
|
| 104 |
+
tokens_indices=tokens_indices,
|
| 105 |
+
lemmas=lemmas,
|
| 106 |
+
nlp_engine=self,
|
| 107 |
+
language=language,
|
| 108 |
+
)
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from typing import List
|
| 3 |
+
|
| 4 |
+
from spacy.tokens import Doc, Span
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class NlpArtifacts:
|
| 8 |
+
"""
|
| 9 |
+
NlpArtifacts is an abstraction layer over the results of an NLP pipeline.
|
| 10 |
+
|
| 11 |
+
processing over a given text, it holds attributes such as entities,
|
| 12 |
+
tokens and lemmas which can be used by any recognizer
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
def __init__(
|
| 16 |
+
self,
|
| 17 |
+
entities: List[Span],
|
| 18 |
+
tokens: Doc,
|
| 19 |
+
tokens_indices: List[int],
|
| 20 |
+
lemmas: List[str],
|
| 21 |
+
nlp_engine, # noqa ANN001
|
| 22 |
+
language: str,
|
| 23 |
+
):
|
| 24 |
+
self.entities = entities
|
| 25 |
+
self.tokens = tokens
|
| 26 |
+
self.lemmas = lemmas
|
| 27 |
+
self.tokens_indices = tokens_indices
|
| 28 |
+
self.keywords = self.set_keywords(nlp_engine, lemmas, language)
|
| 29 |
+
self.nlp_engine = nlp_engine
|
| 30 |
+
|
| 31 |
+
@staticmethod
|
| 32 |
+
def set_keywords(
|
| 33 |
+
nlp_engine, lemmas: List[str], language: str # noqa ANN001
|
| 34 |
+
) -> List[str]:
|
| 35 |
+
"""
|
| 36 |
+
Return keywords fpr text.
|
| 37 |
+
|
| 38 |
+
Extracts lemmas with certain conditions as keywords.
|
| 39 |
+
"""
|
| 40 |
+
if not nlp_engine:
|
| 41 |
+
return []
|
| 42 |
+
keywords = [
|
| 43 |
+
k.lower()
|
| 44 |
+
for k in lemmas
|
| 45 |
+
if not nlp_engine.is_stopword(k, language)
|
| 46 |
+
and not nlp_engine.is_punct(k, language)
|
| 47 |
+
and k != "-PRON-"
|
| 48 |
+
and k != "be"
|
| 49 |
+
]
|
| 50 |
+
|
| 51 |
+
# best effort, try even further to break tokens into sub tokens,
|
| 52 |
+
# this can result in reducing false negatives
|
| 53 |
+
keywords = [i.split(":") for i in keywords]
|
| 54 |
+
|
| 55 |
+
# splitting the list can, if happened, will result in list of lists,
|
| 56 |
+
# we flatten the list
|
| 57 |
+
keywords = [item for sublist in keywords for item in sublist]
|
| 58 |
+
return keywords
|
| 59 |
+
|
| 60 |
+
def to_json(self) -> str:
|
| 61 |
+
"""Convert nlp artifacts to json."""
|
| 62 |
+
|
| 63 |
+
return_dict = self.__dict__.copy()
|
| 64 |
+
|
| 65 |
+
# Ignore NLP engine as it's not serializable currently
|
| 66 |
+
del return_dict["nlp_engine"]
|
| 67 |
+
|
| 68 |
+
# Converting spaCy tokens and spans to string as they are not serializable
|
| 69 |
+
if "tokens" in return_dict:
|
| 70 |
+
return_dict["tokens"] = [token.text for token in self.tokens]
|
| 71 |
+
if "entities" in return_dict:
|
| 72 |
+
return_dict["entities"] = [entity.text for entity in self.entities]
|
| 73 |
+
|
| 74 |
+
return json.dumps(return_dict)
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/nlp_engine.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod
|
| 2 |
+
from typing import Iterable, Iterator, Tuple
|
| 3 |
+
|
| 4 |
+
from presidio_analyzer.nlp_engine import NlpArtifacts
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class NlpEngine(ABC):
|
| 8 |
+
"""
|
| 9 |
+
NlpEngine is an abstraction layer over the nlp module.
|
| 10 |
+
|
| 11 |
+
It provides NLP preprocessing functionality as well as other queries
|
| 12 |
+
on tokens.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
@abstractmethod
|
| 16 |
+
def process_text(self, text: str, language: str) -> NlpArtifacts:
|
| 17 |
+
"""Execute the NLP pipeline on the given text and language."""
|
| 18 |
+
|
| 19 |
+
@abstractmethod
|
| 20 |
+
def process_batch(
|
| 21 |
+
self, texts: Iterable[str], language: str, **kwargs
|
| 22 |
+
) -> Iterator[Tuple[str, NlpArtifacts]]:
|
| 23 |
+
"""Execute the NLP pipeline on a batch of texts.
|
| 24 |
+
|
| 25 |
+
Returns a tuple of (text, NlpArtifacts)
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
@abstractmethod
|
| 29 |
+
def is_stopword(self, word: str, language: str) -> bool:
|
| 30 |
+
"""
|
| 31 |
+
Return true if the given word is a stop word.
|
| 32 |
+
|
| 33 |
+
(within the given language)
|
| 34 |
+
"""
|
| 35 |
+
|
| 36 |
+
@abstractmethod
|
| 37 |
+
def is_punct(self, word: str, language: str) -> bool:
|
| 38 |
+
"""
|
| 39 |
+
Return true if the given word is a punctuation word.
|
| 40 |
+
|
| 41 |
+
(within the given language)
|
| 42 |
+
"""
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Optional, Dict, Union, Tuple
|
| 4 |
+
|
| 5 |
+
import yaml
|
| 6 |
+
|
| 7 |
+
from presidio_analyzer.nlp_engine import (
|
| 8 |
+
StanzaNlpEngine,
|
| 9 |
+
SpacyNlpEngine,
|
| 10 |
+
NlpEngine,
|
| 11 |
+
ClientNlpEngine,
|
| 12 |
+
TransformersNlpEngine,
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger("presidio_analyzer")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class NlpEngineProvider:
|
| 19 |
+
"""Create different NLP engines from configuration.
|
| 20 |
+
|
| 21 |
+
:param nlp_engines: List of available NLP engines.
|
| 22 |
+
Default: (SpacyNlpEngine, StanzaNlpEngine)
|
| 23 |
+
:param nlp_configuration: Dict containing nlp configuration
|
| 24 |
+
:example: configuration:
|
| 25 |
+
{
|
| 26 |
+
"nlp_engine_name": "spacy",
|
| 27 |
+
"models": [{"lang_code": "en",
|
| 28 |
+
"model_name": "en_core_web_lg"
|
| 29 |
+
}]
|
| 30 |
+
}
|
| 31 |
+
Nlp engine names available by default: spacy, stanza.
|
| 32 |
+
:param conf_file: Path to yaml file containing nlp engine configuration.
|
| 33 |
+
"""
|
| 34 |
+
|
| 35 |
+
def __init__(
|
| 36 |
+
self,
|
| 37 |
+
nlp_engines: Optional[Tuple] = None,
|
| 38 |
+
conf_file: Optional[Union[Path, str]] = None,
|
| 39 |
+
nlp_configuration: Optional[Dict] = None,
|
| 40 |
+
):
|
| 41 |
+
|
| 42 |
+
if not nlp_engines:
|
| 43 |
+
nlp_engines = (SpacyNlpEngine, StanzaNlpEngine, TransformersNlpEngine,ClientNlpEngine)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
self.nlp_engines = {
|
| 48 |
+
engine.engine_name: engine for engine in nlp_engines if engine.is_available
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
logger.debug(
|
| 52 |
+
f"Loaded these available nlp engines: {list(self.nlp_engines.keys())}"
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
if conf_file and nlp_configuration:
|
| 56 |
+
raise ValueError(
|
| 57 |
+
"Either conf_file or nlp_configuration should be provided, not both."
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
if nlp_configuration:
|
| 61 |
+
self.nlp_configuration = nlp_configuration
|
| 62 |
+
|
| 63 |
+
if conf_file:
|
| 64 |
+
self.nlp_configuration = self._read_nlp_conf(conf_file)
|
| 65 |
+
|
| 66 |
+
if not conf_file and not nlp_configuration:
|
| 67 |
+
conf_file = self._get_full_conf_path()
|
| 68 |
+
logger.debug(f"Reading default conf file from {conf_file}")
|
| 69 |
+
self.nlp_configuration = self._read_nlp_conf(conf_file)
|
| 70 |
+
|
| 71 |
+
def create_engine(self) -> NlpEngine:
|
| 72 |
+
"""Create an NLP engine instance."""
|
| 73 |
+
if (
|
| 74 |
+
not self.nlp_configuration
|
| 75 |
+
or not self.nlp_configuration.get("models")
|
| 76 |
+
or not self.nlp_configuration.get("nlp_engine_name")
|
| 77 |
+
):
|
| 78 |
+
raise ValueError(
|
| 79 |
+
"Illegal nlp configuration. "
|
| 80 |
+
"Configuration should include nlp_engine_name and models "
|
| 81 |
+
"(list of model_name for each lang_code)."
|
| 82 |
+
)
|
| 83 |
+
nlp_engine_name = self.nlp_configuration["nlp_engine_name"]
|
| 84 |
+
if nlp_engine_name not in self.nlp_engines:
|
| 85 |
+
raise ValueError(
|
| 86 |
+
f"NLP engine '{nlp_engine_name}' is not available. "
|
| 87 |
+
"Make sure you have all required packages installed"
|
| 88 |
+
)
|
| 89 |
+
try:
|
| 90 |
+
nlp_engine_class = self.nlp_engines[nlp_engine_name]
|
| 91 |
+
nlp_engine_opts = {
|
| 92 |
+
m["lang_code"]: m["model_name"]
|
| 93 |
+
for m in self.nlp_configuration["models"]
|
| 94 |
+
}
|
| 95 |
+
engine = nlp_engine_class(nlp_engine_opts)
|
| 96 |
+
logger.info(
|
| 97 |
+
f"Created NLP engine: {engine.engine_name}. "
|
| 98 |
+
f"Loaded models: {list(engine.nlp.keys())}"
|
| 99 |
+
)
|
| 100 |
+
return engine
|
| 101 |
+
except KeyError:
|
| 102 |
+
raise ValueError("Wrong NLP engine configuration")
|
| 103 |
+
|
| 104 |
+
@staticmethod
|
| 105 |
+
def _read_nlp_conf(conf_file: Union[Path, str]) -> dict:
|
| 106 |
+
"""Read the nlp configuration from a provided yaml file."""
|
| 107 |
+
|
| 108 |
+
if not Path(conf_file).exists():
|
| 109 |
+
nlp_configuration = {
|
| 110 |
+
"nlp_engine_name": "spacy",
|
| 111 |
+
"models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
|
| 112 |
+
}
|
| 113 |
+
logger.warning(
|
| 114 |
+
f"configuration file {conf_file} not found. "
|
| 115 |
+
f"Using default config: {nlp_configuration}."
|
| 116 |
+
)
|
| 117 |
+
|
| 118 |
+
else:
|
| 119 |
+
nlp_configuration = yaml.safe_load(open(conf_file))
|
| 120 |
+
|
| 121 |
+
return nlp_configuration
|
| 122 |
+
|
| 123 |
+
@staticmethod
|
| 124 |
+
def _get_full_conf_path(
|
| 125 |
+
default_conf_file: Union[Path, str] = "default.yaml"
|
| 126 |
+
) -> Path:
|
| 127 |
+
"""Return a Path to the default conf file."""
|
| 128 |
+
return Path(Path(__file__).parent.parent.parent, "conf", default_conf_file)
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Optional, Dict, Iterator, Tuple, Union, List
|
| 3 |
+
|
| 4 |
+
import spacy
|
| 5 |
+
from spacy.language import Language
|
| 6 |
+
from spacy.tokens import Doc
|
| 7 |
+
|
| 8 |
+
from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngine
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger("presidio_analyzer")
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class SpacyNlpEngine(NlpEngine):
|
| 14 |
+
"""
|
| 15 |
+
SpacyNlpEngine is an abstraction layer over the nlp module.
|
| 16 |
+
|
| 17 |
+
It provides processing functionality as well as other queries
|
| 18 |
+
on tokens.
|
| 19 |
+
The SpacyNlpEngine uses SpaCy as its NLP module
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
engine_name = "spacy"
|
| 23 |
+
is_available = bool(spacy)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def __init__(self, models: Optional[Dict[str, str]] = None):
|
| 27 |
+
"""
|
| 28 |
+
Initialize a wrapper on spaCy functionality.
|
| 29 |
+
|
| 30 |
+
:param models: Dictionary with the name of the spaCy model per language.
|
| 31 |
+
For example: models = {"en": "en_core_web_lg"}
|
| 32 |
+
"""
|
| 33 |
+
if not models:
|
| 34 |
+
models = {"en": "en_core_web_lg"}
|
| 35 |
+
logger.debug(f"Loading SpaCy models: {models.values()}")
|
| 36 |
+
|
| 37 |
+
self.nlp = {
|
| 38 |
+
lang_code: spacy.load(model_name, disable=["parser"])
|
| 39 |
+
for lang_code, model_name in models.items()
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
def process_text(self, text: str, language: str) -> NlpArtifacts:
|
| 43 |
+
"""Execute the SpaCy NLP pipeline on the given text and language."""
|
| 44 |
+
|
| 45 |
+
doc = self.nlp[language](text)
|
| 46 |
+
return self._doc_to_nlp_artifact(doc, language)
|
| 47 |
+
|
| 48 |
+
def process_batch(
|
| 49 |
+
self,
|
| 50 |
+
texts: Union[List[str], List[Tuple[str, object]]],
|
| 51 |
+
language: str,
|
| 52 |
+
as_tuples: bool = False,
|
| 53 |
+
) -> Iterator[Optional[NlpArtifacts]]:
|
| 54 |
+
"""Execute the NLP pipeline on a batch of texts using spacy pipe."""
|
| 55 |
+
texts = (str(text) for text in texts)
|
| 56 |
+
docs = self.nlp[language].pipe(texts, as_tuples=as_tuples)
|
| 57 |
+
for doc in docs:
|
| 58 |
+
yield doc.text, self._doc_to_nlp_artifact(doc, language)
|
| 59 |
+
|
| 60 |
+
def is_stopword(self, word: str, language: str) -> bool:
|
| 61 |
+
"""
|
| 62 |
+
Return true if the given word is a stop word.
|
| 63 |
+
|
| 64 |
+
(within the given language)
|
| 65 |
+
"""
|
| 66 |
+
return self.nlp[language].vocab[word].is_stop
|
| 67 |
+
|
| 68 |
+
def is_punct(self, word: str, language: str) -> bool:
|
| 69 |
+
"""
|
| 70 |
+
Return true if the given word is a punctuation word.
|
| 71 |
+
|
| 72 |
+
(within the given language).
|
| 73 |
+
"""
|
| 74 |
+
return self.nlp[language].vocab[word].is_punct
|
| 75 |
+
|
| 76 |
+
def get_nlp(self, language: str) -> Language:
|
| 77 |
+
"""
|
| 78 |
+
Return the language model loaded for a language.
|
| 79 |
+
|
| 80 |
+
:param language: Name of language
|
| 81 |
+
:return: Language model from spaCy
|
| 82 |
+
"""
|
| 83 |
+
return self.nlp[language]
|
| 84 |
+
|
| 85 |
+
def _doc_to_nlp_artifact(self, doc: Doc, language: str) -> NlpArtifacts:
|
| 86 |
+
lemmas = [token.lemma_ for token in doc]
|
| 87 |
+
tokens_indices = [token.idx for token in doc]
|
| 88 |
+
entities = doc.ents
|
| 89 |
+
return NlpArtifacts(
|
| 90 |
+
entities=entities,
|
| 91 |
+
tokens=doc,
|
| 92 |
+
tokens_indices=tokens_indices,
|
| 93 |
+
lemmas=lemmas,
|
| 94 |
+
nlp_engine=self,
|
| 95 |
+
language=language,
|
| 96 |
+
)
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
|
| 3 |
+
try:
|
| 4 |
+
import stanza
|
| 5 |
+
import spacy_stanza
|
| 6 |
+
except ImportError:
|
| 7 |
+
stanza = None
|
| 8 |
+
|
| 9 |
+
from presidio_analyzer.nlp_engine import SpacyNlpEngine
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger("presidio_analyzer")
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class StanzaNlpEngine(SpacyNlpEngine):
|
| 15 |
+
"""
|
| 16 |
+
StanzaNlpEngine is an abstraction layer over the nlp module.
|
| 17 |
+
|
| 18 |
+
It provides processing functionality as well as other queries
|
| 19 |
+
on tokens.
|
| 20 |
+
The StanzaNlpEngine uses spacy-stanza and stanza as its NLP module
|
| 21 |
+
|
| 22 |
+
:param models: Dictionary with the name of the stanza model per language.
|
| 23 |
+
For example: models = {"en": "en"}
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
engine_name = "stanza"
|
| 27 |
+
is_available = bool(stanza)
|
| 28 |
+
def __init__(self, models=None): # noqa ANN201
|
| 29 |
+
if not models:
|
| 30 |
+
models = {"en": "en"}
|
| 31 |
+
logger.debug(f"Loading Stanza models: {models.values()}")
|
| 32 |
+
|
| 33 |
+
self.nlp = {
|
| 34 |
+
lang_code: spacy_stanza.load_pipeline(
|
| 35 |
+
model_name,
|
| 36 |
+
processors="tokenize,pos,lemma,ner",
|
| 37 |
+
)
|
| 38 |
+
for lang_code, model_name in models.items()
|
| 39 |
+
}
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Optional, Dict
|
| 3 |
+
|
| 4 |
+
import spacy
|
| 5 |
+
from spacy.language import Language
|
| 6 |
+
from spacy.tokens import Doc, Span
|
| 7 |
+
|
| 8 |
+
from presidio_analyzer.nlp_engine import SpacyNlpEngine
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
try:
|
| 12 |
+
import torch
|
| 13 |
+
import transformers
|
| 14 |
+
from transformers import (
|
| 15 |
+
AutoTokenizer,
|
| 16 |
+
AutoModelForTokenClassification,
|
| 17 |
+
pipeline,
|
| 18 |
+
)
|
| 19 |
+
except ImportError:
|
| 20 |
+
torch = None
|
| 21 |
+
transformers = None
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger("presidio_analyzer")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@Language.factory(
|
| 27 |
+
"transformers",
|
| 28 |
+
default_config={"pretrained_model_name_or_path": "dslim/bert-base-NER"},
|
| 29 |
+
)
|
| 30 |
+
def create_transformer_component(nlp, name, pretrained_model_name_or_path: str):
|
| 31 |
+
"""Spacy Language factory for creating custom component."""
|
| 32 |
+
return TransformersComponent(
|
| 33 |
+
pretrained_model_name_or_path=pretrained_model_name_or_path
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class TransformersComponent:
|
| 38 |
+
"""
|
| 39 |
+
Custom component to use in spacy pipeline.
|
| 40 |
+
|
| 41 |
+
Using HaggingFace transformers pretrained models for entity recognition.
|
| 42 |
+
:param pretrained_model_name_or_path: HaggingFace pretrained_model_name_or_path
|
| 43 |
+
"""
|
| 44 |
+
|
| 45 |
+
def __init__(self, pretrained_model_name_or_path: str) -> None:
|
| 46 |
+
Span.set_extension("confidence_score", default=1.0, force=True)
|
| 47 |
+
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
|
| 48 |
+
model = AutoModelForTokenClassification.from_pretrained(
|
| 49 |
+
pretrained_model_name_or_path
|
| 50 |
+
)
|
| 51 |
+
self.nlp = pipeline(
|
| 52 |
+
"ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple"
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
def __call__(self, doc: Doc) -> Doc:
|
| 56 |
+
"""Write transformers results to doc entities."""
|
| 57 |
+
|
| 58 |
+
res = self.nlp(doc.text)
|
| 59 |
+
ents = []
|
| 60 |
+
for d in res:
|
| 61 |
+
span = doc.char_span(
|
| 62 |
+
d["start"], d["end"], label=d["entity_group"], alignment_mode="expand"
|
| 63 |
+
)
|
| 64 |
+
if span is not None:
|
| 65 |
+
span._.confidence_score = d["score"]
|
| 66 |
+
ents.append(span)
|
| 67 |
+
else:
|
| 68 |
+
logger.warning(
|
| 69 |
+
f"Transformers model returned {d} but no valid span was found."
|
| 70 |
+
)
|
| 71 |
+
doc.ents = ents
|
| 72 |
+
return doc
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class TransformersNlpEngine(SpacyNlpEngine):
|
| 76 |
+
"""
|
| 77 |
+
|
| 78 |
+
SpacyTransformersNlpEngine is a transformers based NlpEngine.
|
| 79 |
+
|
| 80 |
+
It comprises a spacy pipeline used for tokenization,
|
| 81 |
+
lemmatization, pos, and a transformers component for NER.
|
| 82 |
+
|
| 83 |
+
Both the underlying spacy pipeline and the transformers engine could be
|
| 84 |
+
configured by the user.
|
| 85 |
+
|
| 86 |
+
:param models: a dictionary containing the model names per language.
|
| 87 |
+
:example:
|
| 88 |
+
{
|
| 89 |
+
"en": {
|
| 90 |
+
"spacy": "en_core_web_sm",
|
| 91 |
+
"transformers": "dslim/bert-base-NER"
|
| 92 |
+
}
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
Note that since the spaCy model is not used for NER,
|
| 96 |
+
we recommend using a simple model, such as en_core_web_sm for English.
|
| 97 |
+
For potential Transformers models, see a list of models here:
|
| 98 |
+
https://huggingface.co/models?pipeline_tag=token-classification
|
| 99 |
+
It is further recommended to fine-tune these models
|
| 100 |
+
to the specific scenario in hand.
|
| 101 |
+
"""
|
| 102 |
+
|
| 103 |
+
engine_name = "transformers"
|
| 104 |
+
is_available = bool(spacy) and bool(transformers)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def __init__(self, models: Optional[Dict[str, Dict[str, str]]] = None):
|
| 108 |
+
# default models if not specified
|
| 109 |
+
if not models:
|
| 110 |
+
models = {
|
| 111 |
+
"en": {"spacy": "en_core_web_sm", "transformers": "dslim/bert-base-NER"}
|
| 112 |
+
}
|
| 113 |
+
# validate models type
|
| 114 |
+
elif type(models) is not dict:
|
| 115 |
+
logger.error(f"''models' argument must be dict, not {type(models)}")
|
| 116 |
+
raise KeyError(f"Expected 'models' argument to be dict, not {type(models)}")
|
| 117 |
+
# validate models[model_lang] type is dict for all model_lang
|
| 118 |
+
elif any(
|
| 119 |
+
[type(model_dict) is not dict for model_lang, model_dict in models.items()]
|
| 120 |
+
):
|
| 121 |
+
# elif type(models["model_name"]) is not dict:
|
| 122 |
+
logger.error(
|
| 123 |
+
"'models.model_name' argument must be dict,"
|
| 124 |
+
f"not {type(models['model_name'])}"
|
| 125 |
+
)
|
| 126 |
+
raise KeyError(
|
| 127 |
+
"Expected 'models.model_name' argument to be dict,"
|
| 128 |
+
f"not {type(models['model_name'])}"
|
| 129 |
+
)
|
| 130 |
+
# chack that model_name dict includes the keys: "spacy" and "transformers"
|
| 131 |
+
elif any(
|
| 132 |
+
[
|
| 133 |
+
any([key not in model_dict for key in ("spacy", "transformers")])
|
| 134 |
+
for model_lang, model_dict in models.items()
|
| 135 |
+
]
|
| 136 |
+
):
|
| 137 |
+
logger.error(
|
| 138 |
+
"'models.model_name' must contains 'spacy' and 'transformers' keys"
|
| 139 |
+
)
|
| 140 |
+
raise KeyError(
|
| 141 |
+
"Expected keys ('spacy' and 'transformers') was not found in "
|
| 142 |
+
"models.model_name dict"
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
logger.debug(f"Loading SpaCy and transformers models: {models.values()}")
|
| 146 |
+
|
| 147 |
+
self.nlp = {}
|
| 148 |
+
for lang_code, model_name in models.items():
|
| 149 |
+
nlp = spacy.load(model_name["spacy"], disable=["parser", "ner"])
|
| 150 |
+
nlp.add_pipe(
|
| 151 |
+
"transformers",
|
| 152 |
+
config={"pretrained_model_name_or_path": model_name["transformers"]},
|
| 153 |
+
last=True,
|
| 154 |
+
)
|
| 155 |
+
self.nlp[lang_code] = nlp
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/pattern.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from typing import Dict
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class Pattern:
|
| 6 |
+
"""
|
| 7 |
+
A class that represents a regex pattern.
|
| 8 |
+
|
| 9 |
+
:param name: the name of the pattern
|
| 10 |
+
:param regex: the regex pattern to detect
|
| 11 |
+
:param score: the pattern's strength (values varies 0-1)
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
def __init__(self, name: str, regex: str, score: float):
|
| 15 |
+
|
| 16 |
+
self.name = name
|
| 17 |
+
self.regex = regex
|
| 18 |
+
self.score = score
|
| 19 |
+
|
| 20 |
+
def to_dict(self) -> Dict:
|
| 21 |
+
"""
|
| 22 |
+
Turn this instance into a dictionary.
|
| 23 |
+
|
| 24 |
+
:return: a dictionary
|
| 25 |
+
"""
|
| 26 |
+
return_dict = {"name": self.name, "score": self.score, "regex": self.regex}
|
| 27 |
+
return return_dict
|
| 28 |
+
|
| 29 |
+
@classmethod
|
| 30 |
+
def from_dict(cls, pattern_dict: Dict) -> "Pattern":
|
| 31 |
+
"""
|
| 32 |
+
Load an instance from a dictionary.
|
| 33 |
+
|
| 34 |
+
:param pattern_dict: a dictionary holding the pattern's parameters
|
| 35 |
+
:return: a Pattern instance
|
| 36 |
+
"""
|
| 37 |
+
return cls(**pattern_dict)
|
| 38 |
+
|
| 39 |
+
def __repr__(self):
|
| 40 |
+
"""Return string representation of instance."""
|
| 41 |
+
return json.dumps(self.to_dict())
|
| 42 |
+
|
| 43 |
+
def __str__(self):
|
| 44 |
+
"""Return string representation of instance."""
|
| 45 |
+
return json.dumps(self.to_dict())
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/pattern_recognizer.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import datetime
|
| 2 |
+
import logging
|
| 3 |
+
from typing import List, Optional, Dict
|
| 4 |
+
|
| 5 |
+
import regex as re
|
| 6 |
+
|
| 7 |
+
from presidio_analyzer import (
|
| 8 |
+
LocalRecognizer,
|
| 9 |
+
Pattern,
|
| 10 |
+
RecognizerResult,
|
| 11 |
+
EntityRecognizer,
|
| 12 |
+
AnalysisExplanation,
|
| 13 |
+
)
|
| 14 |
+
from presidio_analyzer.nlp_engine import NlpArtifacts
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger("presidio_analyzer")
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class PatternRecognizer(LocalRecognizer):
    """
    PII entity recognizer using regular expressions or deny-lists.

    :param patterns: A list of patterns to detect
    :param deny_list: A list of words to detect,
        in case our recognizer uses a predefined list of words (deny list)
    :param context: list of context words
    :param deny_list_score: confidence score for a term
        identified using a deny-list
    """

    def __init__(
        self,
        supported_entity: str,
        name: str = None,
        supported_language: str = "en",
        patterns: List[Pattern] = None,
        deny_list: List[str] = None,
        context: List[str] = None,
        deny_list_score: float = 1.0,
        version: str = "0.0.1",
    ):

        # A recognizer must declare exactly one entity type and at least one
        # detection mechanism (regex patterns and/or a deny-list).
        if not supported_entity:
            raise ValueError("Pattern recognizer should be initialized with entity")

        if not patterns and not deny_list:
            raise ValueError(
                "Pattern recognizer should be initialized with patterns"
                " or with deny list"
            )

        super().__init__(
            supported_entities=[supported_entity],
            supported_language=supported_language,
            name=name,
            version=version,
        )
        if patterns is None:
            self.patterns = []
        else:
            self.patterns = patterns
        self.context = context
        self.deny_list_score = deny_list_score

        if deny_list:
            # The deny-list is compiled into one extra regex pattern and
            # appended to self.patterns.
            # NOTE(review): when `patterns` was passed in, this appends to the
            # caller's list object — confirm callers do not reuse that list.
            deny_list_pattern = self._deny_list_to_regex(deny_list)
            self.patterns.append(deny_list_pattern)
            self.deny_list = deny_list
        else:
            self.deny_list = []

    def load(self):  # noqa D102
        # Regex recognizers have no model/resource to load.
        pass

    def analyze(
        self,
        text: str,
        entities: List[str],
        nlp_artifacts: NlpArtifacts = None,
        regex_flags: int = None,
    ) -> List[RecognizerResult]:
        """
        Analyzes text to detect PII using regular expressions or deny-lists.

        :param text: Text to be analyzed
        :param entities: Entities this recognizer can detect
            (unused here; pattern recognizers detect their single entity)
        :param nlp_artifacts: Output values from the NLP engine
            (unused by pure-regex recognizers)
        :param regex_flags: Optional flags passed to the regex engine
        :return: A list of RecognizerResult for every surviving match
        """
        results = []

        if self.patterns:
            pattern_result = self.__analyze_patterns(text, regex_flags)
            results.extend(pattern_result)

        return results

    def _deny_list_to_regex(self, deny_list: List[str]) -> Pattern:
        """
        Convert a list of words to a matching regex.

        To be analyzed by the analyze method as any other regex patterns.

        :param deny_list: the list of words to detect
        :return:the regex of the words for detection
        """

        # Escape deny list elements as preparation for regex
        escaped_deny_list = [re.escape(element) for element in deny_list]
        # Anchor each term at non-word boundaries (or string start/end) so
        # that terms only match as whole words.
        regex = r"(?:^|(?<=\W))(" + "|".join(escaped_deny_list) + r")(?:(?=\W)|$)"
        return Pattern(name="deny_list", regex=regex, score=self.deny_list_score)

    def validate_result(self, pattern_text: str) -> Optional[bool]:
        """
        Validate the pattern logic e.g., by running checksum on a detected pattern.

        Subclasses override this; the base implementation returns None,
        meaning "no validation performed".

        :param pattern_text: the text to validated.
            Only the part in text that was detected by the regex engine
        :return: A bool indicating whether the validation was successful.
        """
        return None

    def invalidate_result(self, pattern_text: str) -> Optional[bool]:
        """
        Logic to check for result invalidation by running pruning logic.

        For example, each SSN number group should not consist of all the same digits.

        Subclasses override this; the base implementation returns None,
        meaning "no invalidation check performed".

        :param pattern_text: the text to validated.
            Only the part in text that was detected by the regex engine
        :return: A bool indicating whether the result is invalidated
        """
        return None

    @staticmethod
    def build_regex_explanation(
        recognizer_name: str,
        pattern_name: str,
        pattern: str,
        original_score: float,
        validation_result: bool,
    ) -> AnalysisExplanation:
        """
        Construct an explanation for why this entity was detected.

        :param recognizer_name: Name of recognizer detecting the entity
        :param pattern_name: Regex pattern name which detected the entity
        :param pattern: Regex pattern logic
        :param original_score: Score given by the recognizer
        :param validation_result: Whether validation was used and its result
        :return: Analysis explanation
        """
        explanation = AnalysisExplanation(
            recognizer=recognizer_name,
            original_score=original_score,
            pattern_name=pattern_name,
            pattern=pattern,
            validation_result=validation_result,
        )
        return explanation

    def __analyze_patterns(
        self, text: str, flags: int = None
    ) -> List[RecognizerResult]:
        """
        Evaluate all patterns in the provided text.

        Including words in the provided deny-list

        :param text: text to analyze
        :param flags: regex flags
        :return: A list of RecognizerResult
        """
        flags = flags if flags else re.DOTALL | re.MULTILINE
        results = []
        for pattern in self.patterns:
            # Timing is logged per-pattern at DEBUG level to spot slow regexes.
            match_start_time = datetime.datetime.now()
            matches = re.finditer(pattern.regex, text, flags=flags)
            match_time = datetime.datetime.now() - match_start_time
            logger.debug(
                "--- match_time[%s]: %s.%s seconds",
                pattern.name,
                match_time.seconds,
                match_time.microseconds,
            )

            for match in matches:
                start, end = match.span()
                current_match = text[start:end]

                # Skip empty results
                if current_match == "":
                    continue

                score = pattern.score

                validation_result = self.validate_result(current_match)
                description = self.build_regex_explanation(
                    self.name, pattern.name, pattern.regex, score, validation_result
                )
                pattern_result = RecognizerResult(
                    entity_type=self.supported_entities[0],
                    start=start,
                    end=end,
                    score=score,
                    analysis_explanation=description,
                    recognition_metadata={
                        RecognizerResult.RECOGNIZER_NAME_KEY: self.name,
                        RecognizerResult.RECOGNIZER_IDENTIFIER_KEY: self.id,
                    },
                )

                # Validation overrides the pattern score: a confirmed match is
                # promoted to MAX_SCORE, a failed validation demoted to
                # MIN_SCORE. None means no validation was performed.
                if validation_result is not None:
                    if validation_result:
                        pattern_result.score = EntityRecognizer.MAX_SCORE
                    else:
                        pattern_result.score = EntityRecognizer.MIN_SCORE

                # Invalidation runs after validation and can only demote.
                invalidation_result = self.invalidate_result(current_match)
                if invalidation_result is not None and invalidation_result:
                    pattern_result.score = EntityRecognizer.MIN_SCORE

                # Only matches that retain a score above MIN_SCORE survive.
                if pattern_result.score > EntityRecognizer.MIN_SCORE:
                    results.append(pattern_result)

                # Update analysis explanation score following validation or invalidation
                description.score = pattern_result.score

        results = EntityRecognizer.remove_duplicates(results)
        return results

    def to_dict(self) -> Dict:
        """Serialize instance into a dictionary."""
        return_dict = super().to_dict()

        # Flatten the single-entity list into "supported_entity" so the dict
        # round-trips through from_dict (whose __init__ takes the singular key).
        return_dict["patterns"] = [pat.to_dict() for pat in self.patterns]
        return_dict["deny_list"] = self.deny_list
        return_dict["context"] = self.context
        return_dict["supported_entity"] = return_dict["supported_entities"][0]
        del return_dict["supported_entities"]

        return return_dict

    @classmethod
    def from_dict(cls, entity_recognizer_dict: Dict) -> "PatternRecognizer":
        """Create instance from a serialized dict."""
        patterns = entity_recognizer_dict.get("patterns")
        if patterns:
            patterns_list = [Pattern.from_dict(pat) for pat in patterns]
            entity_recognizer_dict["patterns"] = patterns_list

        return cls(**entity_recognizer_dict)
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/Aadhaar_Number.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, List
|
| 2 |
+
|
| 3 |
+
from presidio_analyzer import Pattern, PatternRecognizer
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class Aadhaar_Number(PatternRecognizer):
    """
    Recognizes Indian Aadhaar numbers using regex.

    An Aadhaar number is a 12-digit identifier issued by the UIDAI,
    typically written as three space-separated groups of four digits
    with a leading digit in the range 2-9.

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        # Raw string so "\s" reaches the regex engine as a whitespace class
        # instead of being a (deprecated) string escape.
        Pattern(
            name="aadhaar_number_pattern",
            regex=r"[2-9]{1}[0-9]{3}\s{1}[0-9]{4}\s{1}[0-9]{4}",
            score=0.5,
        ),
    ]

    # NOTE(review): these context words appear copy-pasted from a bank-account
    # recognizer; consider replacing with Aadhaar-specific terms such as
    # "aadhaar" or "uidai". The missing comma after "bank" (which silently
    # concatenated "bank" and "check" into "bankcheck") has been fixed.
    CONTEXT = [
        "bank",
        # Task #603: Support keyphrases: change to "checking account"
        # as part of keyphrase change
        "check",
        "account",
        "account#",
        "acct",
        "save",
        "debit",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "AADHAR_NUMBER",
    ):
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/PAN_Number.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, List
|
| 2 |
+
|
| 3 |
+
from presidio_analyzer import Pattern, PatternRecognizer
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class PAN_Number(PatternRecognizer):
    """
    Recognizes Indian Permanent Account Numbers (PAN) using regex.

    A PAN is a ten-character identifier issued by the Indian Income Tax
    Department: five uppercase letters, four digits, one uppercase letter.

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        Pattern(
            name="pan_number_pattern",
            regex=r"[A-Z]{5}[0-9]{4}[A-Z]{1}",
            score=0.5,
        ),
    ]

    # NOTE(review): these context words appear copy-pasted from a bank-account
    # recognizer; consider replacing with PAN-specific terms such as "pan" or
    # "income tax". The missing comma after "bank" (which silently
    # concatenated "bank" and "check" into "bankcheck") has been fixed.
    CONTEXT = [
        "bank",
        # Task #603: Support keyphrases: change to "checking account"
        # as part of keyphrase change
        "check",
        "account",
        "account#",
        "acct",
        "save",
        "debit",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        # Kept mixed-case for backward compatibility with existing callers,
        # although sibling recognizers use upper-case entity names.
        supported_entity: str = "PAN_Number",
    ):
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/__init__.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Predefined recognizers package. Holds all the default recognizers."""
|
| 2 |
+
|
| 3 |
+
from presidio_analyzer.predefined_recognizers.transformers_recognizer import (
|
| 4 |
+
TransformersRecognizer,
|
| 5 |
+
)
|
| 6 |
+
from .PAN_Number import PAN_Number
|
| 7 |
+
from .credit_card_recognizer import CreditCardRecognizer
|
| 8 |
+
from .crypto_recognizer import CryptoRecognizer
|
| 9 |
+
from .date_recognizer import DateRecognizer
|
| 10 |
+
from .email_recognizer import EmailRecognizer
|
| 11 |
+
from .iban_recognizer import IbanRecognizer
|
| 12 |
+
from .ip_recognizer import IpRecognizer
|
| 13 |
+
from .medical_license_recognizer import MedicalLicenseRecognizer
|
| 14 |
+
from .phone_recognizer import PhoneRecognizer
|
| 15 |
+
from .sg_fin_recognizer import SgFinRecognizer
|
| 16 |
+
from .spacy_recognizer import SpacyRecognizer
|
| 17 |
+
from .stanza_recognizer import StanzaRecognizer
|
| 18 |
+
from .uk_nhs_recognizer import NhsRecognizer
|
| 19 |
+
from .url_recognizer import UrlRecognizer
|
| 20 |
+
from .Aadhaar_Number import Aadhaar_Number
|
| 21 |
+
from .data_recognizer import ClientListRecognizer
|
| 22 |
+
from .us_driver_license_recognizer import UsLicenseRecognizer
|
| 23 |
+
from .us_itin_recognizer import UsItinRecognizer
|
| 24 |
+
from .us_passport_recognizer import UsPassportRecognizer
|
| 25 |
+
from .us_ssn_recognizer import UsSsnRecognizer
|
| 26 |
+
from .es_nif_recognizer import EsNifRecognizer
|
| 27 |
+
from .au_abn_recognizer import AuAbnRecognizer
|
| 28 |
+
from .au_acn_recognizer import AuAcnRecognizer
|
| 29 |
+
from .au_tfn_recognizer import AuTfnRecognizer
|
| 30 |
+
from .au_medicare_recognizer import AuMedicareRecognizer
|
| 31 |
+
from .it_driver_license_recognizer import ItDriverLicenseRecognizer
|
| 32 |
+
from .it_fiscal_code_recognizer import ItFiscalCodeRecognizer
|
| 33 |
+
from .it_vat_code import ItVatCodeRecognizer
|
| 34 |
+
from .it_identity_card_recognizer import ItIdentityCardRecognizer
|
| 35 |
+
from .it_passport_recognizer import ItPassportRecognizer
|
| 36 |
+
|
| 37 |
+
NLP_RECOGNIZERS = {
|
| 38 |
+
"spacy": SpacyRecognizer,
|
| 39 |
+
"stanza": StanzaRecognizer,
|
| 40 |
+
"transformers": TransformersRecognizer,
|
| 41 |
+
"client":ClientListRecognizer
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
__all__ = [
|
| 45 |
+
"AbaRoutingRecognizer",
|
| 46 |
+
"CreditCardRecognizer",
|
| 47 |
+
"CryptoRecognizer",
|
| 48 |
+
"DateRecognizer",
|
| 49 |
+
"EmailRecognizer",
|
| 50 |
+
"IbanRecognizer",
|
| 51 |
+
"IpRecognizer",
|
| 52 |
+
"NhsRecognizer",
|
| 53 |
+
"MedicalLicenseRecognizer",
|
| 54 |
+
"PhoneRecognizer",
|
| 55 |
+
"SgFinRecognizer",
|
| 56 |
+
"UrlRecognizer",
|
| 57 |
+
"UsBankRecognizer",
|
| 58 |
+
"UsItinRecognizer",
|
| 59 |
+
"UsLicenseRecognizer",
|
| 60 |
+
"UsPassportRecognizer",
|
| 61 |
+
"UsSsnRecognizer",
|
| 62 |
+
"EsNifRecognizer",
|
| 63 |
+
"SpacyRecognizer",
|
| 64 |
+
"ClientListRecognizer",
|
| 65 |
+
"StanzaRecognizer",
|
| 66 |
+
"NLP_RECOGNIZERS",
|
| 67 |
+
"AuAbnRecognizer",
|
| 68 |
+
"AuAcnRecognizer",
|
| 69 |
+
"AuTfnRecognizer",
|
| 70 |
+
"AuMedicareRecognizer",
|
| 71 |
+
"TransformersRecognizer",
|
| 72 |
+
"ItDriverLicenseRecognizer",
|
| 73 |
+
"ItFiscalCodeRecognizer",
|
| 74 |
+
"ItVatCodeRecognizer",
|
| 75 |
+
"ItIdentityCardRecognizer",
|
| 76 |
+
"ItPassportRecognizer",
|
| 77 |
+
]
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/au_abn_recognizer.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, List, Tuple
|
| 2 |
+
|
| 3 |
+
from presidio_analyzer import Pattern, PatternRecognizer
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class AuAbnRecognizer(PatternRecognizer):
    """
    Recognizes Australian Business Number ("ABN").

    The Australian Business Number (ABN) is a unique 11
    digit identifier issued to all entities registered in
    the Australian Business Register (ABR).
    The 11 digit ABN is structured as a 9 digit identifier
    with two leading check digits.
    The leading check digits are derived using a modulus 89 calculation.
    This recognizer identifies ABN using regex, context words and checksum.
    Reference: https://abr.business.gov.au/Help/AbnFormat

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
        for different strings to be used during pattern matching.
        This can allow a greater variety in input, for example by removing dashes or spaces.
    """

    PATTERNS = [
        Pattern("ABN (Medium)", r"\b\d{2}\s\d{3}\s\d{3}\s\d{3}\b", 0.1),
        Pattern("ABN (Low)", r"\b\d{11}\b", 0.01),
    ]

    CONTEXT = ["australian business number", "abn"]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "AU_ABN",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        # By default, strip dashes and spaces before running the checksum.
        self.replacement_pairs = replacement_pairs or [("-", ""), (" ", "")]
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> bool:
        """
        Run the ABR modulus-89 checksum over a candidate ABN match.

        :param pattern_text: the text to validated.
            Only the part in text that was detected by the regex engine
        :return: A bool indicating whether the validation was successful.
        """
        # Normalize the match, then keep the individual digits.
        cleaned = self.__strip(pattern_text, self.replacement_pairs)
        digits = [int(ch) for ch in cleaned if not ch.isspace()]

        # ABR algorithm step 1: subtract 1 from the leading digit,
        # wrapping 0 around to 9.
        digits[0] = 9 if digits[0] == 0 else digits[0] - 1

        # Steps 2-4: weighted sum of all 11 digits must divide by 89.
        weights = [10, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
        weighted_total = sum(digits[i] * weights[i] for i in range(11))
        return weighted_total % 89 == 0

    @staticmethod
    def __strip(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        # Apply each (search, replacement) pair in order.
        for search_string, replacement_string in replacement_pairs:
            text = text.replace(search_string, replacement_string)
        return text
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/au_acn_recognizer.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, List, Tuple
|
| 2 |
+
|
| 3 |
+
from presidio_analyzer import Pattern, PatternRecognizer
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class AuAcnRecognizer(PatternRecognizer):
    """
    Recognizes Australian Company Number ("ACN").

    The Australian Company Number (ACN) is a nine digit number
    with the last digit being a check digit calculated using a
    modified modulus 10 calculation.
    This recognizer identifies ACN using regex, context words, and checksum.
    Reference: https://asic.gov.au/

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
        for different strings to be used during pattern matching.
        This can allow a greater variety in input, for example by removing dashes or spaces.
    """

    PATTERNS = [
        Pattern(
            "ACN (Medium)",
            r"\b\d{3}\s\d{3}\s\d{3}\b",
            0.1,
        ),
        Pattern(
            "ACN (Low)",
            r"\b\d{9}\b",
            0.01,
        ),
    ]

    CONTEXT = [
        "australian company number",
        "acn",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "AU_ACN",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        self.replacement_pairs = (
            replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
        )
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> bool:
        """
        Validate the pattern logic e.g., by running checksum on a detected pattern.

        :param pattern_text: the text to validated.
            Only the part in text that was detected by the regex engine
        :return: A bool indicating whether the validation was successful.
        """
        # Pre-processing before validation checks
        text = self.__sanitize_value(pattern_text, self.replacement_pairs)
        acn_list = [int(digit) for digit in text if not digit.isspace()]

        # Set weights based on digit position
        weight = [8, 7, 6, 5, 4, 3, 2, 1]

        # Weighted sum over the first eight digits.
        sum_product = 0
        for i in range(8):
            sum_product += acn_list[i] * weight[i]
        remainder = sum_product % 10
        # ASIC rule: the check digit is (10 - remainder), with the special
        # case that a complement of 10 maps to check digit 0. The previous
        # "complement = 10 - remainder" comparison could never accept a
        # valid ACN whose remainder is 0 (10 never equals a single digit).
        return (10 - remainder) % 10 == acn_list[-1]

    @staticmethod
    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        # Apply each (search, replacement) pair in order.
        for search_string, replacement_string in replacement_pairs:
            text = text.replace(search_string, replacement_string)
        return text
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/au_medicare_recognizer.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, List, Tuple
|
| 2 |
+
|
| 3 |
+
from presidio_analyzer import Pattern, PatternRecognizer
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class AuMedicareRecognizer(PatternRecognizer):
    """
    Recognizes Australian Medicare number using regex, context words, and checksum.

    Medicare number is a unique identifier issued by Australian Government
    that enables the cardholder to receive a rebates of medical expenses
    under Australia's Medicare system.
    It uses a modulus 10 checksum scheme to validate the number.
    Reference: https://en.wikipedia.org/wiki/Medicare_card_(Australia)

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
        for different strings to be used during pattern matching.
        This can allow a greater variety in input, for example by removing dashes or spaces.
    """

    PATTERNS = [
        Pattern("Australian Medicare Number (Medium)", r"\b[2-6]\d{3}\s\d{5}\s\d\b", 0.1),
        Pattern("Australian Medicare Number (Low)", r"\b[2-6]\d{9}\b", 0.01),
    ]

    CONTEXT = ["medicare"]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "AU_MEDICARE",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        # By default, strip dashes and spaces before running the checksum.
        self.replacement_pairs = replacement_pairs or [("-", ""), (" ", "")]
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> bool:
        """
        Run the Medicare modulus-10 checksum over a candidate match.

        :param pattern_text: the text to validated.
            Only the part in text that was detected by the regex engine
        :return: A bool indicating whether the validation was successful.
        """
        # Normalize the match, then keep the individual digits.
        cleaned = self.__normalize(pattern_text, self.replacement_pairs)
        digits = [int(ch) for ch in cleaned if not ch.isspace()]

        # Weighted sum of the first eight digits, mod 10, must equal the
        # ninth digit (the check digit).
        weights = [1, 3, 7, 9, 1, 3, 7, 9]
        checksum = sum(digits[i] * weights[i] for i in range(8)) % 10
        return checksum == digits[8]

    @staticmethod
    def __normalize(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        # Apply each (search, replacement) pair in order.
        for search_string, replacement_string in replacement_pairs:
            text = text.replace(search_string, replacement_string)
        return text
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/au_tfn_recognizer.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, List, Tuple

from presidio_analyzer import Pattern, PatternRecognizer


class AuTfnRecognizer(PatternRecognizer):
    """
    Recognizes Australian Tax File Numbers ("TFN").

    The tax file number (TFN) is a unique identifier issued by the
    Australian Taxation Office to each taxpaying entity — an individual,
    company, superannuation fund, partnership, or trust.
    The TFN consists of a nine digit number, usually presented in the
    format NNN NNN NNN, and includes a check digit for detecting
    erroneous numbers based on a simple modulo-11 scheme.
    This recognizer uses regex, context words, and the checksum to
    identify TFNs.
    Reference: https://www.ato.gov.au/individuals/tax-file-number/

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
        for different strings to be used during pattern matching.
        This can allow a greater variety in input, for example by removing dashes or spaces.
    """

    PATTERNS = [
        Pattern("TFN (Medium)", r"\b\d{3}\s\d{3}\s\d{3}\b", 0.1),
        Pattern("TFN (Low)", r"\b\d{9}\b", 0.01),
    ]

    CONTEXT = [
        "tax file number",
        "tfn",
    ]

    # Position weights for the modulo-11 check over the nine digits.
    __WEIGHTS = (1, 4, 3, 7, 5, 8, 6, 9, 10)

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "AU_TFN",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        self.replacement_pairs = replacement_pairs or [("-", ""), (" ", "")]
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> bool:
        """
        Validate the pattern logic by running the TFN modulo-11 checksum.

        :param pattern_text: the text to validate.
            Only the part in text that was detected by the regex engine.
        :return: A bool indicating whether the validation was successful.
        """
        cleaned = self.__sanitize_value(pattern_text, self.replacement_pairs)
        digits = [int(ch) for ch in cleaned if not ch.isspace()]
        # A genuine TFN's weighted digit sum is divisible by 11.
        weighted_sum = sum(d * w for d, w in zip(digits, self.__WEIGHTS))
        return weighted_sum % 11 == 0

    @staticmethod
    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        """Apply each (search, replacement) pair to *text* and return the result."""
        for old, new in replacement_pairs:
            text = text.replace(old, new)
        return text
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/credit_card_recognizer.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Tuple, Optional

from presidio_analyzer import Pattern, PatternRecognizer


class CreditCardRecognizer(PatternRecognizer):
    """
    Recognize common credit card numbers using regex + checksum.

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
        for different strings to be used during pattern matching.
        This can allow a greater variety in input, for example by removing dashes or spaces.
    """

    PATTERNS = [
        Pattern(
            "All Credit Cards (weak)",
            r"\b((4\d{3})|(5[0-5]\d{2})|(6\d{3})|(1\d{3})|(3\d{3}))[- ]?(\d{3,4})[- ]?(\d{3,4})[- ]?(\d{3,5})\b",  # noqa: E501
            0.3,
        ),
    ]

    CONTEXT = [
        "credit",
        "card",
        "visa",
        "mastercard",
        "cc ",
        "amex",
        "discover",
        "jcb",
        "diners",
        "maestro",
        "instapayment",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "CREDIT_CARD",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        # Falsy argument falls back to the default separator-stripping pairs.
        self.replacement_pairs = replacement_pairs or [("-", ""), (" ", "")]
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> bool:  # noqa D102
        # Strip separators, then apply the Luhn mod-10 check.
        cleaned = self.__sanitize_value(pattern_text, self.replacement_pairs)
        return self.__luhn_checksum(cleaned)

    @staticmethod
    def __luhn_checksum(sanitized_value: str) -> bool:
        """Return True when *sanitized_value* passes the Luhn mod-10 check."""
        digits = [int(ch) for ch in sanitized_value]
        # Digits at odd positions (from the right) are taken as-is; digits at
        # even positions are doubled, and 9 is subtracted when the doubling
        # overflows one digit (equivalent to summing the doubled digits).
        total = sum(digits[-1::-2])
        for d in digits[-2::-2]:
            doubled = d * 2
            total += doubled - 9 if doubled > 9 else doubled
        return total % 10 == 0

    @staticmethod
    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        """Apply each (search, replacement) pair to *text* and return the result."""
        for old, new in replacement_pairs:
            text = text.replace(old, new)
        return text
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/crypto_recognizer.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from hashlib import sha256
from typing import List, Optional

from presidio_analyzer import Pattern, PatternRecognizer

# Base58Check validation adapted from:
# http://rosettacode.org/wiki/Bitcoin/address_validation#Python


class CryptoRecognizer(PatternRecognizer):
    """Recognize common crypto account numbers using regex + checksum.

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        Pattern("Crypto (Medium)", r"\b[13][a-km-zA-HJ-NP-Z1-9]{26,33}\b", 0.5),
    ]

    CONTEXT = ["wallet", "btc", "bitcoin", "crypto"]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "CRYPTO",
    ):
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> bool:  # noqa D102
        """Validate via Base58Check: the last 4 bytes must equal the first
        4 bytes of the double-SHA256 of the preceding 21 bytes."""
        try:
            bcbytes = self.__decode_base58(pattern_text, 25)
        except (ValueError, OverflowError):
            # ValueError: a character outside the Base58 alphabet.
            # OverflowError: the decoded integer does not fit in 25 bytes —
            # possible for the longest candidates the regex admits; this was
            # previously uncaught and crashed validation instead of rejecting.
            return False
        return bcbytes[-4:] == sha256(sha256(bcbytes[:-4]).digest()).digest()[:4]

    @staticmethod
    def __decode_base58(bc: str, length: int) -> bytes:
        """Decode *bc* from Base58 into ``length`` big-endian bytes.

        :raises ValueError: if *bc* contains a non-Base58 character.
        :raises OverflowError: if the decoded value exceeds ``length`` bytes.
        """
        digits58 = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"
        n = 0
        for char in bc:
            n = n * 58 + digits58.index(char)
        return n.to_bytes(length, "big")
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/data_recognizer.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Optional, List, Tuple, Set
|
| 3 |
+
import spacy
|
| 4 |
+
from spacy.matcher import PhraseMatcher
|
| 5 |
+
from presidio_analyzer.predefined_recognizers.spacy_recognizer import SpacyRecognizer
|
| 6 |
+
# from presidio_analyzer.predefined_recognizers import SpacyRecognizer
|
| 7 |
+
from presidio_analyzer import RecognizerResult
|
| 8 |
+
import copy
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
from presidio_analyzer import (
|
| 14 |
+
RecognizerResult,
|
| 15 |
+
LocalRecognizer,
|
| 16 |
+
AnalysisExplanation,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
logger = logging.getLogger("presidio_analyzer")
|
| 20 |
+
# terms = ["1&1 Telecommunication SE","1010 data services LLC","AMA",
|
| 21 |
+
# "A O Smith Corporations","ABBMST","Addidas India","CITI","Cisco Systems","ERICSSON","Gati Ltd","IBM",
|
| 22 |
+
# "Infosys Ltd","Intel Corporation","Johnson","JTC Corporation","NSC Global","SUZUKI MOTOR CORPORATION",
|
| 23 |
+
# "Synopsys Ltd","TIBCOO", "T-Mobile UK","Toyota Systems Corporation","TSB Bank","UBS Bank"
|
| 24 |
+
# ,"United Health Corporation","Vodafone quickcom","Voltas","VOLVO CARS","WIPRO LIMITED",
|
| 25 |
+
# "Walmart", "CVS Health", "Walgreens Boots Alliance"]
|
| 26 |
+
# Module-level deny list of client/company terms consumed by
# ClientListRecognizer.analyze(); mutated in place via DataList.
terms = []


class DataList:
    """Holds the shared client-term deny list and its entity label.

    ``setData``/``resetData`` mutate the module-level ``terms`` list in
    place; ``entity`` stores the entity label(s) used when a term matches.
    """

    # Entity labels to report for deny-list matches (set by the caller).
    entity = []

    @staticmethod
    def setData(values):
        """Append *values* (an iterable of strings) to the ``terms`` deny list."""
        # @staticmethod fixes instance calls: previously inst.setData(x)
        # passed the instance itself as ``values``.
        terms.extend(values)

    @staticmethod
    def resetData():
        """Clear the ``terms`` deny list in place."""
        terms.clear()
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
nlp = spacy.load("en_core_web_lg")
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class ClientListRecognizer(SpacyRecognizer):
    """
    Recognize client/company names from a configurable deny list.

    Builds a spaCy ``PhraseMatcher`` from the module-level ``terms`` list
    and reports every match with the entity label stored in
    ``DataList.entity``.

    :param supported_language: Language this recognizer supports
    :param supported_entities: The entities this recognizer can detect
    :param ner_strength: Default confidence for a match
    :param check_label_groups: Tuple containing Presidio entity names
        and spaCy entity names, for verifying that the right entity
        is translated into a Presidio entity.
    """

    # Entity labels come from the shared DataList configuration.
    ENTITIES = DataList.entity

    DEFAULT_EXPLANATION = "Identified as {} by Spacy's Named Entity Recognition"

    CHECK_LABEL_GROUPS = []

    def __init__(
        self,
        supported_language: str = "en",
        supported_entities: Optional[List[str]] = None,
        ner_strength: float = 0.85,
        check_label_groups: Optional[Tuple[Set, Set]] = None,
        context: Optional[List[str]] = None,
    ):
        self.ner_strength = ner_strength
        self.check_label_groups = (
            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
        )
        supported_entities = supported_entities if supported_entities else self.ENTITIES
        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            context=context,
        )

    def load(self) -> None:  # noqa D102
        # Nothing to load: the PhraseMatcher is rebuilt per analyze() call.
        pass

    def build_spacy_explanation(
        self, original_score: float, explanation: str
    ) -> AnalysisExplanation:
        """
        Create explanation for why this result was detected.

        :param original_score: Score given by this recognizer
        :param explanation: Explanation string
        :return: An AnalysisExplanation wrapping the score and text
        """
        return AnalysisExplanation(
            recognizer=self.__class__.__name__,
            original_score=original_score,
            textual_explanation=explanation,
        )

    def analyze(self, text, entities, nlp_artifacts=None):  # noqa D102
        """Return a RecognizerResult for every deny-list term found in *text*."""
        matcher = PhraseMatcher(nlp.vocab)
        # Only run nlp.make_doc for the patterns to speed things up.
        matcher.add("TerminologyList", [nlp.make_doc(term) for term in terms])

        results = []
        if not DataList.entity:
            # No entity label configured — previously this crashed with
            # IndexError on the first match; now we report nothing.
            return results
        # Hoisted out of the loop: was copy.deepcopy(DataList.entity) per match.
        entity_type = DataList.entity[0]

        doc = nlp(text)
        for _match_id, start, end in matcher(doc):
            span = doc[start:end]
            # NOTE(fix): the original gated on `doc1.find(str(span))`, which
            # is truthy for "not found" (-1) and falsy for a match at index 0,
            # silently dropping matches at the start of the text. Every
            # PhraseMatcher hit is by construction present in the text, so no
            # membership test is needed.
            results.append(
                RecognizerResult(
                    entity_type=entity_type,
                    start=span.start_char,
                    end=span.end_char,
                    score=self.ner_strength,
                    recognition_metadata={
                        RecognizerResult.RECOGNIZER_NAME_KEY: self.name,
                        RecognizerResult.RECOGNIZER_IDENTIFIER_KEY: self.id,
                    },
                )
            )
        return results

    @staticmethod
    def __check_label(
        entity: str, label: str, check_label_groups: Tuple[Set, Set]
    ) -> bool:
        """True when *entity* and *label* fall in the same configured group."""
        return any(
            entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
        )
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/date_recognizer.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, List

from presidio_analyzer import Pattern, PatternRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpArtifacts

import regex as re


class DateRecognizer(PatternRecognizer):
    """
    Recognize date using regex.

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        Pattern(
            "mm/dd/yyyy or mm/dd/yy",
            r"\b(([1-9]|0[1-9]|1[0-2])/([1-9]|0[1-9]|[1-2][0-9]|3[0-1])/(\d{4}|\d{2}))\b",  # noqa: E501
            0.6,
        ),
        Pattern(
            "dd/mm/yyyy or dd/mm/yy",
            r"\b(([1-9]|0[1-9]|[1-2][0-9]|3[0-1])/([1-9]|0[1-9]|1[0-2])/(\d{4}|\d{2}))\b",  # noqa: E501
            0.6,
        ),
        Pattern(
            "yyyy/mm/dd",
            r"\b(\d{4}/([1-9]|0[1-9]|1[0-2])/([1-9]|0[1-9]|[1-2][0-9]|3[0-1]))\b",
            0.6,
        ),
        Pattern(
            "mm-dd-yyyy",
            r"\b(([1-9]|0[1-9]|1[0-2])-([1-9]|0[1-9]|[1-2][0-9]|3[0-1])-\d{4})\b",
            0.6,
        ),
        Pattern(
            "dd-mm-yyyy",
            r"\b(([1-9]|0[1-9]|[1-2][0-9]|3[0-1])-([1-9]|0[1-9]|1[0-2])-\d{4})\b",
            0.6,
        ),
        Pattern(
            "yyyy-mm-dd",
            r"\b(\d{4}-([1-9]|0[1-9]|1[0-2])-([1-9]|0[1-9]|[1-2][0-9]|3[0-1]))\b",
            0.6,
        ),
        Pattern(
            "dd.mm.yyyy or dd.mm.yy",
            r"\b(([1-9]|0[1-9]|[1-2][0-9]|3[0-1])\.([1-9]|0[1-9]|1[0-2])\.(\d{4}|\d{2}))\b",  # noqa: E501
            0.6,
        ),
        Pattern(
            "dd-MMM-yyyy or dd-MMM-yy",
            r"\b(([1-9]|0[1-9]|[1-2][0-9]|3[0-1])-(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-(\d{4}|\d{2}))\b",  # noqa: E501
            0.6,
        ),
        Pattern(
            "MMM-yyyy or MMM-yy",
            r"\b((JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-(\d{4}|\d{2}))\b",  # noqa: E501
            0.6,
        ),
        Pattern(
            "dd-MMM or dd-MMM",
            r"\b(([1-9]|0[1-9]|[1-2][0-9]|3[0-1])-(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC))\b",  # noqa: E501
            0.6,
        ),
        Pattern(
            "mm/yyyy or m/yyyy",
            r"\b(([1-9]|0[1-9]|1[0-2])/\d{4})\b",
            0.2,
        ),
        Pattern(
            "mm/yy or m/yy",
            r"\b(([1-9]|0[1-9]|1[0-2])/\d{2})\b",
            0.1,
        ),
    ]

    CONTEXT = ["date", "birthday"]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "DATE_TIME",
    ):
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )

    def analyze(
        self,
        text: str,
        entities: List[str],
        nlp_artifacts: NlpArtifacts = None,
        regex_flags: int = None,
    ) -> List[RecognizerResult]:
        """
        Analyzes text to detect PII using regular expressions or deny-lists.

        :param text: Text to be analyzed
        :param entities: Entities this recognizer can detect
        :param nlp_artifacts: Output values from the NLP engine
        :param regex_flags: Optional regex flags; IGNORECASE is always added
        :return: Detected results from the parent PatternRecognizer
        """
        # Caller-supplied (truthy) flags get IGNORECASE OR-ed in; otherwise
        # fall back to the full default flag set.
        if regex_flags:
            effective_flags = regex_flags | re.IGNORECASE
        else:
            effective_flags = re.DOTALL | re.MULTILINE | re.IGNORECASE

        return super().analyze(
            text=text,
            entities=entities,
            nlp_artifacts=nlp_artifacts,
            regex_flags=effective_flags,
        )
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/email_recognizer.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Optional

import tldextract

from presidio_analyzer import Pattern, PatternRecognizer


class EmailRecognizer(PatternRecognizer):
    """
    Recognize email addresses using regex.

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        Pattern(
            "Email (Medium)",
            r"\b((([!#$%&'*+\-/=?^_`{|}~\w])|([!#$%&'*+\-/=?^_`{|}~\w][!#$%&'*+\-/=?^_`{|}~\.\w]{0,}[!#$%&'*+\-/=?^_`{|}~\w]))[@]\w+([-.]\w+)*\.\w+([-.]\w+)*)\b",  # noqa: E501
            0.5,
        ),
    ]

    CONTEXT = ["email"]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "EMAIL_ADDRESS",
    ):
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str):  # noqa D102
        # A candidate is only valid when tldextract resolves a fully
        # qualified domain name (i.e., a recognized public TLD).
        return tldextract.extract(pattern_text).fqdn != ""
|
presidio_analyzer/presidio_analyzer/Infosys_presidio_analyzer/presidio_analyzer/presidio_analyzer/predefined_recognizers/es_nif_recognizer.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Tuple, Optional

from presidio_analyzer import Pattern, PatternRecognizer


class EsNifRecognizer(PatternRecognizer):
    """
    Recognize NIF number using regex and checksum.

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
        for different strings to be used during pattern matching.
        This can allow a greater variety in input, for example by removing dashes or spaces.
    """

    PATTERNS = [
        Pattern(
            "NIF",
            r"\b[0-9]?[0-9]{7}[-]?[A-Z]\b",
            0.5,
        ),
    ]

    CONTEXT = ["documento nacional de identidad", "DNI", "NIF", "identificación"]

    # Spanish NIF check-letter table, indexed by (number mod 23).
    __CHECK_LETTERS = "TRWAGMYFPDXBNJZSQVHLCKE"

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "es",
        supported_entity: str = "ES_NIF",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        self.replacement_pairs = replacement_pairs or [("-", ""), (" ", "")]
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> bool:  # noqa D102
        # Strip dashes/spaces; the last character is the check letter and
        # the remaining digits determine the expected letter (mod 23).
        cleaned = self.__sanitize_value(pattern_text)
        check_letter = cleaned[-1]
        number = int("".join(filter(str.isdigit, cleaned)))
        return check_letter == self.__CHECK_LETTERS[number % 23]

    @staticmethod
    def __sanitize_value(text: str) -> str:
        """Remove dashes and spaces from *text*."""
        return text.replace("-", "").replace(" ", "")
|