Spaces:
Runtime error
Runtime error
Upload 4 files
Browse files — added more de-identification methods
- Final_file.py +17 -32
- PiiMaskingService.py +183 -0
- app.py +30 -13
- flair_recognizer.py +186 -0
Final_file.py
CHANGED
|
@@ -733,54 +733,39 @@ class FlairRecognizer2():
|
|
| 733 |
text: str,
|
| 734 |
operator: str,
|
| 735 |
# analyze_results: List[RecognizerResult],
|
| 736 |
-
mask_char: Optional[str] = None,
|
| 737 |
-
number_of_chars: Optional[str] = None,
|
| 738 |
-
encrypt_key: Optional[str] = None,
|
| 739 |
):
|
| 740 |
"""Anonymize identified input using Presidio Anonymizer.
|
| 741 |
:param text: Full text
|
| 742 |
:param operator: Operator name
|
| 743 |
-
:param mask_char: Mask char (for mask operator)
|
| 744 |
-
:param number_of_chars: Number of characters to mask (for mask operator)
|
| 745 |
-
:param encrypt_key: Encryption key (for encrypt operator)
|
| 746 |
:param analyze_results: list of results from presidio analyzer engine
|
| 747 |
"""
|
| 748 |
|
| 749 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 750 |
operator_config = {
|
| 751 |
"type": "mask",
|
| 752 |
-
"masking_char":
|
| 753 |
-
"chars_to_mask":
|
| 754 |
"from_end": False,
|
| 755 |
}
|
| 756 |
-
|
| 757 |
-
# Define operator config
|
| 758 |
elif operator == "encrypt":
|
| 759 |
operator_config = {"key": encrypt_key}
|
| 760 |
elif operator == "highlight":
|
| 761 |
operator_config = {"lambda": lambda x: x}
|
| 762 |
-
else:
|
| 763 |
-
operator_config = None
|
| 764 |
|
| 765 |
-
|
| 766 |
if operator == "highlight":
|
| 767 |
operator = "custom"
|
| 768 |
-
elif operator == "synthesize":
|
| 769 |
-
operator = "replace"
|
| 770 |
-
else:
|
| 771 |
-
operator = operator
|
| 772 |
-
|
| 773 |
-
# res = AnonymizerEngine().anonymize(
|
| 774 |
-
# text,
|
| 775 |
-
# analyze_results,
|
| 776 |
-
# operators={"DEFAULT": OperatorConfig("redact", operator_config)},
|
| 777 |
-
# )
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
entitiesToRecognize=['PHONE_NUMBER', 'PERSON', 'ID', 'LOCATION', 'EMAIL', 'URL', 'CREDIT_CARD', 'AGE', 'DATE_TIME', 'CRYPTO'
|
| 782 |
-
'IP_ADDRESS', 'US_PASSPORT', 'US_BANK_NUMBER'
|
| 783 |
-
]
|
| 784 |
|
| 785 |
analyzer = AnalyzerEngine()
|
| 786 |
|
|
@@ -794,8 +779,8 @@ class FlairRecognizer2():
|
|
| 794 |
# Operators to define the anonymization type.
|
| 795 |
result = engine.anonymize(
|
| 796 |
text=text,
|
| 797 |
-
|
| 798 |
-
|
| 799 |
)
|
| 800 |
print("res:")
|
| 801 |
print(result)
|
|
|
|
| 733 |
text: str,
|
| 734 |
operator: str,
|
| 735 |
# analyze_results: List[RecognizerResult],
|
|
|
|
|
|
|
|
|
|
| 736 |
):
|
| 737 |
"""Anonymize identified input using Presidio Anonymizer.
|
| 738 |
:param text: Full text
|
| 739 |
:param operator: Operator name
|
|
|
|
|
|
|
|
|
|
| 740 |
:param analyze_results: list of results from presidio analyzer engine
|
| 741 |
"""
|
| 742 |
|
| 743 |
+
entitiesToRecognize=['UK_NHS','EMAIL','AU_ABN','CRYPTO','ID','URL',
|
| 744 |
+
'AU_MEDICARE','IN_PAN','ORGANIZATION','IN_AADHAAR',
|
| 745 |
+
'SG_NRIC_FIN','EMAIL_ADDRESS','AU_ACN','US_DRIVER_LICENSE',
|
| 746 |
+
'IP_ADDRESS','DATE_TIME','LOCATION','PERSON','CREDIT_CARD',
|
| 747 |
+
'IBAN_CODE','US_BANK_NUMBER','PHONE_NUMBER','MEDICAL_LICENSE',
|
| 748 |
+
'US_SSN','AU_TFN','US_PASSPORT','US_ITIN','NRP','AGE','GENERIC_PII'
|
| 749 |
+
]
|
| 750 |
+
|
| 751 |
+
operator_config = None
|
| 752 |
+
encrypt_key = "WmZq4t7w!z%C&F)J"
|
| 753 |
+
|
| 754 |
+
if operator == 'mask':
|
| 755 |
operator_config = {
|
| 756 |
"type": "mask",
|
| 757 |
+
"masking_char": "*",
|
| 758 |
+
"chars_to_mask": 10,
|
| 759 |
"from_end": False,
|
| 760 |
}
|
|
|
|
|
|
|
| 761 |
elif operator == "encrypt":
|
| 762 |
operator_config = {"key": encrypt_key}
|
| 763 |
elif operator == "highlight":
|
| 764 |
operator_config = {"lambda": lambda x: x}
|
|
|
|
|
|
|
| 765 |
|
| 766 |
+
|
| 767 |
if operator == "highlight":
|
| 768 |
operator = "custom"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 769 |
|
| 770 |
analyzer = AnalyzerEngine()
|
| 771 |
|
|
|
|
| 779 |
# Operators to define the anonymization type.
|
| 780 |
result = engine.anonymize(
|
| 781 |
text=text,
|
| 782 |
+
operators={"DEFAULT": OperatorConfig(operator, operator_config)},
|
| 783 |
+
analyzer_results=results
|
| 784 |
)
|
| 785 |
print("res:")
|
| 786 |
print(result)
|
PiiMaskingService.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import List, Dict, Optional, Tuple, Type

from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import (
    NlpEngine,
    NlpEngineProvider,
)
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import (
    OperatorConfig,
)


class PiiMaskingService():
    """De-identify PII in free text with Presidio.

    ``analyze`` detects PII entities (spaCy pipeline + a Flair NER recognizer),
    ``anonymize`` rewrites the detected spans according to the requested
    operator (mask / encrypt / highlight / redact / replace / hash / ...).
    """

    # Presidio entity types the analyzer is asked to detect.
    ENTITIES_TO_RECOGNIZE = [
        'UK_NHS', 'EMAIL', 'AU_ABN', 'CRYPTO', 'ID', 'URL',
        'AU_MEDICARE', 'IN_PAN', 'ORGANIZATION', 'IN_AADHAAR',
        'SG_NRIC_FIN', 'EMAIL_ADDRESS', 'AU_ACN', 'US_DRIVER_LICENSE',
        'IP_ADDRESS', 'DATE_TIME', 'LOCATION', 'PERSON', 'CREDIT_CARD',
        'IBAN_CODE', 'US_BANK_NUMBER', 'PHONE_NUMBER', 'MEDICAL_LICENSE',
        'US_SSN', 'AU_TFN', 'US_PASSPORT', 'US_ITIN', 'NRP', 'AGE', 'GENERIC_PII',
    ]

    def analyze(self, text: str):
        """Run the Presidio analyzer over ``text``.

        :param text: Full text to scan for PII.
        :return: List of ``RecognizerResult`` describing detected spans.
        """
        nlp_engine, registry = self.create_nlp_engine_with_flair(
            "flair/ner-english-large"
        )
        # Bug fix: the engine/registry built above were previously discarded
        # and a default AnalyzerEngine() was used instead, so the (expensively
        # loaded) Flair recognizer never participated in detection. Wire them
        # in so it does.
        analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)

        results = analyzer.analyze(
            text=text, entities=self.ENTITIES_TO_RECOGNIZE, language='en'
        )
        print("analyzer results:")
        print(results)

        return results

    def anonymize(
        self,
        text: str,
        operator: str,
        encrypt_key: str = "WmZq4t7w!z%C&F)J",
    ) -> str:
        """Detect PII in ``text`` and rewrite it with the given operator.

        :param text: Full text to de-identify.
        :param operator: Presidio operator name ("mask", "encrypt",
            "highlight", "redact", "replace", "hash", ...).
        :param encrypt_key: AES key used when ``operator == "encrypt"``.
            SECURITY NOTE: the default is a hard-coded key kept only for
            backward compatibility; callers should pass their own secret.
        :return: The anonymized text.
        """
        operator_config = None

        if operator == 'mask':
            # Replace the first 10 characters of each PII span with '*'.
            operator_config = {
                "type": "mask",
                "masking_char": "*",
                "chars_to_mask": 10,
                "from_end": False,
            }
        elif operator == "encrypt":
            operator_config = {"key": encrypt_key}
        elif operator == "highlight":
            # "highlight" keeps the original text; the identity lambda is
            # consumed by the "custom" operator selected below.
            operator_config = {"lambda": lambda x: x}

        if operator == "highlight":
            operator = "custom"

        analyzer_result = self.analyze(text)

        engine = AnonymizerEngine()

        # Invoke the anonymize function with the text, analyzer results and
        # operators to define the anonymization type.
        result = engine.anonymize(
            text=text,
            operators={"DEFAULT": OperatorConfig(operator, operator_config)},
            analyzer_results=analyzer_result,
        )
        print("res:")
        print(result)
        print(result.text)
        print(type(result.text))

        return result.text

    def create_nlp_engine_with_flair(
        self,
        model_path: str,
    ) -> Tuple[NlpEngine, RecognizerRegistry]:
        """
        Instantiate an NlpEngine with a FlairRecognizer and a small spaCy model.

        The FlairRecognizer returns results from Flair models; the spaCy model
        supplies NlpArtifacts such as POS tags and lemmas.

        :param model_path: Flair model path.
        :return: (nlp_engine, registry) ready to hand to AnalyzerEngine.
        """
        # Imported lazily: flair is heavy and only needed on this path.
        from flair_recognizer import FlairRecognizer

        registry = RecognizerRegistry()
        registry.load_predefined_recognizers()

        # There is no official Flair NlpEngine, hence we load it as an
        # additional recognizer on top of a small spaCy pipeline.
        flair_recognizer = FlairRecognizer(model_path=model_path)
        nlp_configuration = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
        }
        registry.add_recognizer(flair_recognizer)
        # Flair replaces spaCy for NER, so drop the spaCy recognizer to avoid
        # duplicate/conflicting detections.
        registry.remove_recognizer("SpacyRecognizer")

        nlp_engine = NlpEngineProvider(
            nlp_configuration=nlp_configuration
        ).create_engine()

        return nlp_engine, registry

    def create_nlp_engine_with_transformers(
        self,
        model_path: str,
    ) -> Tuple[NlpEngine, RecognizerRegistry]:
        """
        Instantiate an NlpEngine with a TransformersRecognizer and a small spaCy model.

        The TransformersRecognizer returns results from Transformers models;
        the spaCy model supplies NlpArtifacts such as POS tags and lemmas.

        :param model_path: HuggingFace model path.
        :return: (nlp_engine, registry) ready to hand to AnalyzerEngine.
        """
        print(f"Loading Transformers model: {model_path} of type {type(model_path)}")

        nlp_configuration = {
            "nlp_engine_name": "transformers",
            "models": [
                {
                    "lang_code": "en",
                    "model_name": {"spacy": "en_core_web_sm", "transformers": model_path},
                }
            ],
            "ner_model_configuration": {
                # Map model-specific labels onto Presidio entity names.
                "model_to_presidio_entity_mapping": {
                    "PER": "PERSON",
                    "PERSON": "PERSON",
                    "LOC": "LOCATION",
                    "LOCATION": "LOCATION",
                    "GPE": "LOCATION",
                    "ORG": "ORGANIZATION",
                    "ORGANIZATION": "ORGANIZATION",
                    "NORP": "NRP",
                    "AGE": "AGE",
                    "ID": "ID",
                    "EMAIL": "EMAIL",
                    "PATIENT": "PERSON",
                    "STAFF": "PERSON",
                    "HOSP": "ORGANIZATION",
                    "PATORG": "ORGANIZATION",
                    "DATE": "DATE_TIME",
                    "TIME": "DATE_TIME",
                    "PHONE": "PHONE_NUMBER",
                    "HCW": "PERSON",
                    "HOSPITAL": "ORGANIZATION",
                    "FACILITY": "LOCATION",
                },
                # ID detections are noisy: scale their confidence down.
                "low_confidence_score_multiplier": 0.4,
                "low_score_entity_names": ["ID"],
                # Labels that are not PII for this application.
                "labels_to_ignore": [
                    "CARDINAL",
                    "EVENT",
                    "LANGUAGE",
                    "LAW",
                    "MONEY",
                    "ORDINAL",
                    "PERCENT",
                    "PRODUCT",
                    "QUANTITY",
                    "WORK_OF_ART",
                ],
            },
        }

        nlp_engine = NlpEngineProvider(
            nlp_configuration=nlp_configuration
        ).create_engine()

        registry = RecognizerRegistry()
        registry.load_predefined_recognizers(nlp_engine=nlp_engine)

        return nlp_engine, registry
app.py
CHANGED
|
@@ -8,6 +8,7 @@ import docx
|
|
| 8 |
from fpdf import FPDF
|
| 9 |
import io
|
| 10 |
from docx import Document
|
|
|
|
| 11 |
|
| 12 |
# Cache the model loading and prediction function
|
| 13 |
@st.cache_resource
|
|
@@ -23,6 +24,10 @@ def cached_analyze_text(text, operator):
|
|
| 23 |
def cached_anonimize_text(text, operator):
|
| 24 |
return FlairRecognizer2.anonymize(text, operator)
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
def download_masked_file(masked_text, file_extension):
|
| 27 |
|
| 28 |
# Create a temporary file to store the masked text
|
|
@@ -73,29 +78,38 @@ def main():
|
|
| 73 |
|
| 74 |
st_operator = st.sidebar.selectbox(
|
| 75 |
"De-identification approach",
|
| 76 |
-
["redact", "replace", "hash"],
|
| 77 |
index=1,
|
| 78 |
help="""
|
| 79 |
Select which manipulation to the text is requested after PII has been identified.\n
|
| 80 |
- Redact: Completely remove the PII text\n
|
| 81 |
- Replace: Replace the PII text with a constant, e.g. <PERSON>\n
|
| 82 |
-
- Synthesize: Replace with fake values (requires an OpenAI key)\n
|
| 83 |
- Highlight: Shows the original text with PII highlighted in colors\n
|
| 84 |
- Mask: Replaces a requested number of characters with an asterisk (or other mask character)\n
|
| 85 |
- Hash: Replaces with the hash of the PII string\n
|
| 86 |
- Encrypt: Replaces with an AES encryption of the PII string, allowing the process to be reversed
|
| 87 |
""",
|
| 88 |
)
|
| 89 |
-
|
| 90 |
-
# st.sidebar.
|
| 91 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
masked_text_public = ''
|
| 93 |
if upload_option == 'Text Input':
|
| 94 |
input_text = st.text_area("Enter text here:")
|
| 95 |
if st.button('Analyze'):
|
| 96 |
with st.spinner('Wait for it... the model is loading'):
|
| 97 |
-
cached_predict_ner_tags(input_text)
|
| 98 |
-
masked_text =
|
|
|
|
| 99 |
st.text_area("Masked text:", value=masked_text, height=200)
|
| 100 |
elif upload_option == 'File Upload':
|
| 101 |
uploaded_file = st.file_uploader("Upload a file", type=['txt', 'pdf', 'docx'])
|
|
@@ -106,8 +120,9 @@ def main():
|
|
| 106 |
extracted_text = extract_text_from_pdf(uploaded_file)
|
| 107 |
if st.button('Analyze'):
|
| 108 |
with st.spinner('Wait for it... the model is loading'):
|
| 109 |
-
cached_predict_ner_tags(extracted_text)
|
| 110 |
-
masked_text =
|
|
|
|
| 111 |
st.text_area("Masked text:", value=masked_text, height=200) # Display the extracted text
|
| 112 |
if extracted_text:
|
| 113 |
pdf = create_pdf(masked_text)
|
|
@@ -128,8 +143,9 @@ def main():
|
|
| 128 |
text += paragraph.text
|
| 129 |
if st.button('Analyze'):
|
| 130 |
with st.spinner('Wait for it... the model is loading'):
|
| 131 |
-
cached_predict_ner_tags(text)
|
| 132 |
-
masked_text =
|
|
|
|
| 133 |
st.text_area("Masked text:", value=masked_text, height=200)
|
| 134 |
#create word file
|
| 135 |
doc_io = create_word_file(masked_text)
|
|
@@ -138,8 +154,9 @@ def main():
|
|
| 138 |
else:
|
| 139 |
if st.button('Analyze'):
|
| 140 |
with st.spinner('Wait for it... the model is loading'):
|
| 141 |
-
cached_predict_ner_tags(file_contents.decode())
|
| 142 |
-
masked_text = cached_analyze_text(file_contents.decode())
|
|
|
|
| 143 |
st.text_area("Masked text:", value=masked_text, height=200)
|
| 144 |
st.download_button(label="Download",data = masked_text,file_name="masked_text.txt")
|
| 145 |
|
|
|
|
| 8 |
from fpdf import FPDF
|
| 9 |
import io
|
| 10 |
from docx import Document
|
| 11 |
+
from PiiMaskingService import PiiMaskingService
|
| 12 |
|
| 13 |
# Cache the model loading and prediction function
|
| 14 |
@st.cache_resource
|
|
|
|
| 24 |
def cached_anonimize_text(text, operator):
|
| 25 |
return FlairRecognizer2.anonymize(text, operator)
|
| 26 |
|
| 27 |
+
@st.cache_resource
|
| 28 |
+
def anonymize(text, operator):
|
| 29 |
+
return PiiMaskingService().anonymize(text, operator)
|
| 30 |
+
|
| 31 |
def download_masked_file(masked_text, file_extension):
|
| 32 |
|
| 33 |
# Create a temporary file to store the masked text
|
|
|
|
| 78 |
|
| 79 |
st_operator = st.sidebar.selectbox(
|
| 80 |
"De-identification approach",
|
| 81 |
+
["redact", "replace", "encrypt", "hash", "mask"],
|
| 82 |
index=1,
|
| 83 |
help="""
|
| 84 |
Select which manipulation to the text is requested after PII has been identified.\n
|
| 85 |
- Redact: Completely remove the PII text\n
|
| 86 |
- Replace: Replace the PII text with a constant, e.g. <PERSON>\n
|
|
|
|
| 87 |
- Highlight: Shows the original text with PII highlighted in colors\n
|
| 88 |
- Mask: Replaces a requested number of characters with an asterisk (or other mask character)\n
|
| 89 |
- Hash: Replaces with the hash of the PII string\n
|
| 90 |
- Encrypt: Replaces with an AES encryption of the PII string, allowing the process to be reversed
|
| 91 |
""",
|
| 92 |
)
|
| 93 |
+
|
| 94 |
+
# st_model = st.sidebar.selectbox(
|
| 95 |
+
# "NER model package",
|
| 96 |
+
# [
|
| 97 |
+
# "spaCy/en_core_web_lg",
|
| 98 |
+
# "flair/ner-english-large",
|
| 99 |
+
# "HuggingFace/obi/deid_roberta_i2b2",
|
| 100 |
+
# "HuggingFace/StanfordAIMI/stanford-deidentifier-base",
|
| 101 |
+
# ],
|
| 102 |
+
# index=2,
|
| 103 |
+
# )
|
| 104 |
+
|
| 105 |
masked_text_public = ''
|
| 106 |
if upload_option == 'Text Input':
|
| 107 |
input_text = st.text_area("Enter text here:")
|
| 108 |
if st.button('Analyze'):
|
| 109 |
with st.spinner('Wait for it... the model is loading'):
|
| 110 |
+
# cached_predict_ner_tags(input_text)
|
| 111 |
+
masked_text = anonymize(input_text, st_operator)
|
| 112 |
+
# masked_text = cached_anonimize_text(input_text, st_operator)
|
| 113 |
st.text_area("Masked text:", value=masked_text, height=200)
|
| 114 |
elif upload_option == 'File Upload':
|
| 115 |
uploaded_file = st.file_uploader("Upload a file", type=['txt', 'pdf', 'docx'])
|
|
|
|
| 120 |
extracted_text = extract_text_from_pdf(uploaded_file)
|
| 121 |
if st.button('Analyze'):
|
| 122 |
with st.spinner('Wait for it... the model is loading'):
|
| 123 |
+
# cached_predict_ner_tags(extracted_text)
|
| 124 |
+
masked_text = anonymize(extracted_text, st_operator)
|
| 125 |
+
# masked_text = cached_analyze_text(extracted_text)
|
| 126 |
st.text_area("Masked text:", value=masked_text, height=200) # Display the extracted text
|
| 127 |
if extracted_text:
|
| 128 |
pdf = create_pdf(masked_text)
|
|
|
|
| 143 |
text += paragraph.text
|
| 144 |
if st.button('Analyze'):
|
| 145 |
with st.spinner('Wait for it... the model is loading'):
|
| 146 |
+
# cached_predict_ner_tags(text)
|
| 147 |
+
masked_text = anonymize(text, st_operator)
|
| 148 |
+
# masked_text = cached_analyze_text(text)
|
| 149 |
st.text_area("Masked text:", value=masked_text, height=200)
|
| 150 |
#create word file
|
| 151 |
doc_io = create_word_file(masked_text)
|
|
|
|
| 154 |
else:
|
| 155 |
if st.button('Analyze'):
|
| 156 |
with st.spinner('Wait for it... the model is loading'):
|
| 157 |
+
# cached_predict_ner_tags(file_contents.decode())
|
| 158 |
+
# masked_text = cached_analyze_text(file_contents.decode())
|
| 159 |
+
masked_text = anonymize(file_contents.decode(), st_operator)
|
| 160 |
st.text_area("Masked text:", value=masked_text, height=200)
|
| 161 |
st.download_button(label="Download",data = masked_text,file_name="masked_text.txt")
|
| 162 |
|
flair_recognizer.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import logging
from typing import Optional, List, Tuple, Set

from presidio_analyzer import (
    RecognizerResult,
    EntityRecognizer,
    AnalysisExplanation,
)
from presidio_analyzer.nlp_engine import NlpArtifacts

from flair.data import Sentence
from flair.models import SequenceTagger


logger = logging.getLogger("presidio-analyzer")


class FlairRecognizer(EntityRecognizer):
    """
    Wrapper for a flair model, if needed to be used within Presidio Analyzer.

    :example:
    >from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
    >flair_recognizer = FlairRecognizer()
    >registry = RecognizerRegistry()
    >registry.add_recognizer(flair_recognizer)
    >analyzer = AnalyzerEngine(registry=registry)
    >results = analyzer.analyze(
    >    "My name is Christopher and I live in Irbid.",
    >    language="en",
    >    return_decision_process=True,
    >)
    >for result in results:
    >    print(result)
    >    print(result.analysis_explanation)
    """

    # Presidio entities this recognizer can emit.
    ENTITIES = [
        "LOCATION",
        "PERSON",
        "ORGANIZATION",
        # "MISCELLANEOUS" # - There is no direct correlation with Presidio entities.
    ]

    DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"

    # Pairs of (Presidio entity names, Flair label names) that count as a match.
    CHECK_LABEL_GROUPS = [
        ({"LOCATION"}, {"LOC", "LOCATION"}),
        ({"PERSON"}, {"PER", "PERSON"}),
        ({"ORGANIZATION"}, {"ORG"}),
        # ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII
    ]

    MODEL_LANGUAGES = {"en": "flair/ner-english-large"}

    # Flair tag -> Presidio entity name.
    PRESIDIO_EQUIVALENCES = {
        "PER": "PERSON",
        "LOC": "LOCATION",
        "ORG": "ORGANIZATION",
        # 'MISC': 'MISCELLANEOUS' # - Probably not PII
    }

    def __init__(
        self,
        supported_language: str = "en",
        supported_entities: Optional[List[str]] = None,
        check_label_groups: Optional[Tuple[Set, Set]] = None,
        model: SequenceTagger = None,
        model_path: Optional[str] = None,
    ):
        """Create the recognizer, loading a Flair tagger if one is not given.

        :param supported_language: Language code (keys of MODEL_LANGUAGES).
        :param supported_entities: Entities to emit; defaults to ENTITIES.
        :param check_label_groups: Entity/label match groups; defaults to
            CHECK_LABEL_GROUPS.
        :param model: A pre-loaded SequenceTagger (mutually exclusive with
            ``model_path``).
        :param model_path: Path/name of a Flair model to load.
        :raises ValueError: If both ``model`` and ``model_path`` are given.
        """
        self.check_label_groups = (
            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
        )

        supported_entities = supported_entities if supported_entities else self.ENTITIES

        if model and model_path:
            raise ValueError("Only one of model or model_path should be provided.")
        elif model and not model_path:
            self.model = model
        elif not model and model_path:
            print(f"Loading model from {model_path}")
            self.model = SequenceTagger.load(model_path)
        else:
            # Neither given: fall back to the default model for the language.
            print(f"Loading model for language {supported_language}")
            self.model = SequenceTagger.load(
                self.MODEL_LANGUAGES.get(supported_language)
            )

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="Flair Analytics",
        )

    def load(self) -> None:
        """Load the model, not used. Model is loaded during initialization."""
        pass

    def get_supported_entities(self) -> List[str]:
        """
        Return supported entities by this model.

        :return: List of the supported entities.
        """
        return self.supported_entities

    # Class to use Flair with Presidio as an external recognizer.
    def analyze(
        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None
    ) -> List[RecognizerResult]:
        """
        Analyze text using Text Analytics.

        :param text: The text for analysis.
        :param entities: Not working properly for this recognizer.
        :param nlp_artifacts: Not used by this recognizer.
        :param language: Text language. Supported languages in MODEL_LANGUAGES
        :return: The list of Presidio RecognizerResult constructed from the recognized
            Flair detections.
        """

        results = []

        sentences = Sentence(text)
        self.model.predict(sentences)

        # If there is no specific list of entities, we will look for all of them.
        if not entities:
            entities = self.supported_entities

        # Perf fix: fetch the NER spans once instead of re-calling
        # get_spans("ner") on every iteration of the entity loop.
        ner_spans = sentences.get_spans("ner")

        for entity in entities:
            if entity not in self.supported_entities:
                continue

            for ent in ner_spans:
                if not self.__check_label(
                    entity, ent.labels[0].value, self.check_label_groups
                ):
                    continue
                textual_explanation = self.DEFAULT_EXPLANATION.format(
                    ent.labels[0].value
                )
                explanation = self.build_flair_explanation(
                    round(ent.score, 2), textual_explanation
                )
                flair_result = self._convert_to_recognizer_result(ent, explanation)

                results.append(flair_result)

        return results

    def _convert_to_recognizer_result(self, entity, explanation) -> RecognizerResult:
        """Convert one Flair span into a Presidio RecognizerResult."""
        # Fall back to the raw Flair tag when no Presidio equivalent exists.
        entity_type = self.PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)
        flair_score = round(entity.score, 2)

        flair_results = RecognizerResult(
            entity_type=entity_type,
            start=entity.start_position,
            end=entity.end_position,
            score=flair_score,
            analysis_explanation=explanation,
        )

        return flair_results

    def build_flair_explanation(
        self, original_score: float, explanation: str
    ) -> AnalysisExplanation:
        """
        Create explanation for why this result was detected.

        :param original_score: Score given by this recognizer
        :param explanation: Explanation string
        :return: AnalysisExplanation carrying the score and text.
        """
        explanation = AnalysisExplanation(
            recognizer=self.__class__.__name__,
            original_score=original_score,
            textual_explanation=explanation,
        )
        return explanation

    @staticmethod
    def __check_label(
        entity: str, label: str, check_label_groups: Tuple[Set, Set]
    ) -> bool:
        # True when some group pairs this Presidio entity with this Flair label.
        return any(
            [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
        )