Spaces:
Running
Running
langdonholmes
committed on
Commit
·
9637704
1
Parent(s):
0e58a18
custom anonymizer as class
Browse files
- .gitignore +0 -1
- app.py +4 -6
- piilo/engines/analyzer.py +21 -19
- piilo/engines/anonymizer.py +2 -1
- piilo/main.py +4 -4
.gitignore
CHANGED
|
@@ -1,3 +1,2 @@
|
|
| 1 |
-
__pycache__/*
|
| 2 |
.ipynb_checkpoints
|
| 3 |
__pycache__
|
|
|
|
|
|
|
| 1 |
.ipynb_checkpoints
|
| 2 |
__pycache__
|
app.py
CHANGED
|
@@ -9,8 +9,8 @@ import pandas as pd
|
|
| 9 |
import streamlit as st
|
| 10 |
from annotated_text import annotated_text
|
| 11 |
|
| 12 |
-
from piilo.engines.analyzer import
|
| 13 |
-
from piilo.engines.anonymizer import
|
| 14 |
|
| 15 |
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
| 16 |
warnings.filterwarnings('ignore')
|
|
@@ -26,14 +26,12 @@ def analyzer_engine():
|
|
| 26 |
{'lang_code': 'en', 'model_name': 'en_student_name_detector'}],
|
| 27 |
}
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
return analyzer
|
| 32 |
|
| 33 |
@st.cache(allow_output_mutation=True)
|
| 34 |
def anonymizer_engine():
|
| 35 |
'''Return generate surrogate anonymizer.'''
|
| 36 |
-
return
|
| 37 |
|
| 38 |
def annotate(text, st_analyze_results, st_entities):
|
| 39 |
tokens = []
|
|
|
|
| 9 |
import streamlit as st
|
| 10 |
from annotated_text import annotated_text
|
| 11 |
|
| 12 |
+
from piilo.engines.analyzer import CustomAnalyzer
|
| 13 |
+
from piilo.engines.anonymizer import SurrogateAnonymizer
|
| 14 |
|
| 15 |
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
| 16 |
warnings.filterwarnings('ignore')
|
|
|
|
| 26 |
{'lang_code': 'en', 'model_name': 'en_student_name_detector'}],
|
| 27 |
}
|
| 28 |
|
| 29 |
+
return CustomAnalyzer(configuration=configuration)
|
|
|
|
|
|
|
| 30 |
|
| 31 |
@st.cache(allow_output_mutation=True)
|
| 32 |
def anonymizer_engine():
|
| 33 |
'''Return generate surrogate anonymizer.'''
|
| 34 |
+
return SurrogateAnonymizer()
|
| 35 |
|
| 36 |
def annotate(text, st_analyze_results, st_entities):
|
| 37 |
tokens = []
|
piilo/engines/analyzer.py
CHANGED
|
@@ -117,25 +117,27 @@ class CustomSpacyRecognizer(LocalRecognizer):
|
|
| 117 |
[entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
|
| 118 |
)
|
| 119 |
|
| 120 |
-
|
| 121 |
-
'''
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
registry = RecognizerRegistry()
|
| 131 |
-
registry.load_predefined_recognizers(nlp_engine=nlp_engine)
|
| 132 |
-
registry.add_recognizer(spacy_recognizer)
|
| 133 |
|
| 134 |
-
|
| 135 |
-
|
|
|
|
|
|
|
| 136 |
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
supported_languages=['en'])
|
| 140 |
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
[entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
|
| 118 |
)
|
| 119 |
|
| 120 |
+
class CustomAnalyzer(AnalyzerEngine):
|
| 121 |
+
'''Custom Analyzer Engine for Presidio.'''
|
| 122 |
+
|
| 123 |
+
def __init__(self, configuration):
|
| 124 |
+
|
| 125 |
+
spacy_recognizer = CustomSpacyRecognizer()
|
| 126 |
+
|
| 127 |
+
# Create NLP engine based on configuration
|
| 128 |
+
provider = NlpEngineProvider(nlp_configuration=configuration)
|
| 129 |
+
nlp_engine = provider.create_engine()
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
+
# add rule-based recognizers
|
| 132 |
+
registry = RecognizerRegistry()
|
| 133 |
+
registry.load_predefined_recognizers(nlp_engine=nlp_engine)
|
| 134 |
+
registry.add_recognizer(spacy_recognizer)
|
| 135 |
|
| 136 |
+
# remove the nlp engine we passed, to use custom label mappings
|
| 137 |
+
registry.remove_recognizer('SpacyRecognizer')
|
|
|
|
| 138 |
|
| 139 |
+
super().__init__(
|
| 140 |
+
nlp_engine=nlp_engine,
|
| 141 |
+
registry=registry,
|
| 142 |
+
supported_languages=['en']
|
| 143 |
+
)
|
piilo/engines/anonymizer.py
CHANGED
|
@@ -14,6 +14,7 @@ data = Path(__file__).parent.parent.parent / 'data'
|
|
| 14 |
name_table = data / 'ascii_names.parquet'
|
| 15 |
|
| 16 |
logger = logging.getLogger('anonymizer')
|
|
|
|
| 17 |
class NameDatabase(NameDataset):
|
| 18 |
'''A wrapper around the names_dataset.NameDataset class.
|
| 19 |
'''
|
|
@@ -45,7 +46,7 @@ class NameDatabase(NameDataset):
|
|
| 45 |
country = NameWrapper(self.search(last_names)).country
|
| 46 |
return country if country else None
|
| 47 |
|
| 48 |
-
class
|
| 49 |
'''A wrapper around the presidio_anonymizer.AnonymizerEngine class.
|
| 50 |
'''
|
| 51 |
|
|
|
|
| 14 |
name_table = data / 'ascii_names.parquet'
|
| 15 |
|
| 16 |
logger = logging.getLogger('anonymizer')
|
| 17 |
+
|
| 18 |
class NameDatabase(NameDataset):
|
| 19 |
'''A wrapper around the names_dataset.NameDataset class.
|
| 20 |
'''
|
|
|
|
| 46 |
country = NameWrapper(self.search(last_names)).country
|
| 47 |
return country if country else None
|
| 48 |
|
| 49 |
+
class SurrogateAnonymizer(AnonymizerEngine):
|
| 50 |
'''A wrapper around the presidio_anonymizer.AnonymizerEngine class.
|
| 51 |
'''
|
| 52 |
|
piilo/main.py
CHANGED
|
@@ -5,8 +5,8 @@ import logging
|
|
| 5 |
from fastapi import FastAPI
|
| 6 |
from fastapi.middleware.cors import CORSMiddleware
|
| 7 |
|
| 8 |
-
from engines.analyzer import
|
| 9 |
-
from engines.anonymizer import
|
| 10 |
from models.anonymize import AnonymizeRequest, AnonymizeResponse
|
| 11 |
|
| 12 |
configuration = {
|
|
@@ -19,8 +19,8 @@ logger = logging.getLogger('api')
|
|
| 19 |
logging.basicConfig(level=logging.INFO)
|
| 20 |
|
| 21 |
logger.info("Loading Custom Presidio Analyzer and Anonymizer...")
|
| 22 |
-
analyzer =
|
| 23 |
-
anonymizer =
|
| 24 |
logger.info("Loading Successful!")
|
| 25 |
|
| 26 |
app = FastAPI()
|
|
|
|
| 5 |
from fastapi import FastAPI
|
| 6 |
from fastapi.middleware.cors import CORSMiddleware
|
| 7 |
|
| 8 |
+
from engines.analyzer import CustomAnalyzer
|
| 9 |
+
from engines.anonymizer import SurrogateAnonymizer
|
| 10 |
from models.anonymize import AnonymizeRequest, AnonymizeResponse
|
| 11 |
|
| 12 |
configuration = {
|
|
|
|
| 19 |
logging.basicConfig(level=logging.INFO)
|
| 20 |
|
| 21 |
logger.info("Loading Custom Presidio Analyzer and Anonymizer...")
|
| 22 |
+
analyzer = CustomAnalyzer(configuration)
|
| 23 |
+
anonymizer = SurrogateAnonymizer()
|
| 24 |
logger.info("Loading Successful!")
|
| 25 |
|
| 26 |
app = FastAPI()
|