presidio-de-identify

Sleeping

App Files Files Community

presidio committed on May 30, 2023

Commit

3477655

1 Parent(s): 3ddc22a

Upload 10 files

Browse files

Files changed (8) hide show

flair_recognizer.py +14 -5
index.md +8 -4
openai_fake_data_generator.py +45 -13
presidio_helpers.py +120 -63
presidio_nlp_engine_config.py +137 -0
presidio_streamlit.py +243 -118
requirements.txt +5 -1
text_analytics_wrapper.py +123 -0

flair_recognizer.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import logging
 from typing import Optional, List, Tuple, Set
@@ -74,17 +76,24 @@ class FlairRecognizer(EntityRecognizer):
         supported_entities: Optional[List[str]] = None,
         check_label_groups: Optional[Tuple[Set, Set]] = None,
         model: SequenceTagger = None,
     ):
         self.check_label_groups = (
             check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
         )
         supported_entities = supported_entities if supported_entities else self.ENTITIES
-        self.model = (
-            model
-            if model
-            else SequenceTagger.load(self.MODEL_LANGUAGES.get(supported_language))
-        )
         super().__init__(
             supported_entities=supported_entities,

+## Taken from https://github.com/microsoft/presidio/blob/main/docs/samples/python/flair_recognizer.py
 import logging
 from typing import Optional, List, Tuple, Set
         supported_entities: Optional[List[str]] = None,
         check_label_groups: Optional[Tuple[Set, Set]] = None,
         model: SequenceTagger = None,
+        model_path: Optional[str] = None
     ):
         self.check_label_groups = (
             check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
         )
         supported_entities = supported_entities if supported_entities else self.ENTITIES
+        if model and model_path:
+            raise ValueError("Only one of model or model_path should be provided.")
+        elif model and not model_path:
+            self.model = model
+        elif not model and model_path:
+            print(f"Loading model from {model_path}")
+            self.model = SequenceTagger.load(model_path)
+        else:
+            print(f"Loading model for language {supported_language}")
+            self.model = SequenceTagger.load(self.MODEL_LANGUAGES.get(supported_language))
         super().__init__(
             supported_entities=supported_entities,

index.md CHANGED Viewed

@@ -2,15 +2,19 @@
 Here's a simple app, written in pure Python, to create a demo website for Presidio.
 The app is based on the [streamlit](https://streamlit.io/) package.
 ## Requirements
 1. Install dependencies (preferably in a virtual environment)
 ```sh
-pip install streamlit pandas presidio-analyzer presidio-anonymizer
 ```
-2. Download the [presidio_streamlit.py](presidio_streamlit.py) file.
-3. *Optional*: Update the `analyzer_engine` and `anonymizer_engine` functions for your specific implementation
 3. Start the app:
 ```sh
@@ -19,4 +23,4 @@ streamlit run presidio_streamlit.py
 ## Output
 Output should be similar to this screenshot:
-![image](https://user-images.githubusercontent.com/3776619/120109161-efe21080-c170-11eb-8a29-9eaf71e722ee.png)

 Here's a simple app, written in pure Python, to create a demo website for Presidio.
 The app is based on the [streamlit](https://streamlit.io/) package.
+A live version can be found here: https://huggingface.co/spaces/presidio/presidio_demo
 ## Requirements
+1. Clone the repo and move to the `docs/samples/python/streamlit` folder
 1. Install dependencies (preferably in a virtual environment)
 ```sh
+pip install -r requirements.txt
 ```
+> Note: This would install additional packages such as `transformers` and `flair` which are not mandatory for using Presidio.
+2.
+3. *Optional*: Update the `analyzer_engine` and `anonymizer_engine` functions for your specific implementation (in `presidio_helpers.py`).
 3. Start the app:
 ```sh
 ## Output
 Output should be similar to this screenshot:
+![image](https://user-images.githubusercontent.com/3776619/232289541-d59992e1-52a4-44c1-b904-b22c72c02a5b.png)

openai_fake_data_generator.py CHANGED Viewed

@@ -1,25 +1,50 @@
 import openai
-def set_openai_key(openai_key: str):
     """Set the OpenAI API key.
-    :param openai_key: the open AI key (https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key)
     """
-    openai.api_key = openai_key
 def call_completion_model(
-    prompt: str, model: str = "text-davinci-003", max_tokens: int = 512
 ) -> str:
     """Creates a request for the OpenAI Completion service and returns the response.
     :param prompt: The prompt for the completion model
     :param model: OpenAI model name
     :param max_tokens: Model's max_tokens parameter
     """
-    response = openai.Completion.create(
-        model=model, prompt=prompt, max_tokens=max_tokens
-    )
     return response["choices"][0].text
@@ -32,16 +57,23 @@ def create_prompt(anonymized_text: str) -> str:
     """
     prompt = f"""
-    Your role is to create synthetic text based on de-identified text with placeholders instead of personally identifiable information.
-    Replace the placeholders (e.g. , , {{DATE}}, {{ip_address}}) with fake values.
     Instructions:
-    Use completely random numbers, so every digit is drawn between 0 and 9.
-    Use realistic names that come from diverse genders, ethnicities and countries.
-    If there are no placeholders, return the text as is and provide an answer.
     input: How do I change the limit on my credit card {{credit_card_number}}?
     output: How do I change the limit on my credit card 2539 3519 2345 1555?
     input: {anonymized_text}
     output:
     """

+from collections import namedtuple
+from typing import Optional
 import openai
+import logging
+logger = logging.getLogger("presidio-streamlit")
+# Connection settings for the OpenAI client, passed around as a single value.
+# NOTE(review): api_base, deployment_name, api_version and api_type appear to
+# be needed only for Azure OpenAI deployments - confirm against the callers.
+OpenAIParams = namedtuple(
+    "open_ai_params",
+    ["openai_key", "model", "api_base", "deployment_name", "api_version", "api_type"],
+)
+def set_openai_params(openai_params: OpenAIParams):
     """Set the OpenAI API key.
+    :param openai_params: OpenAIParams object with the following fields: key, model, api version, deployment_name,
+    The latter only relate to Azure OpenAI deployments.
     """
+    openai.api_key = openai_params.openai_key
+    openai.api_version = openai_params.api_version
+    if openai_params.api_base:
+        openai.api_base = openai_params.api_base
+        openai.api_type = openai_params.api_type
 def call_completion_model(
+    prompt: str,
+    model: str = "text-davinci-003",
+    max_tokens: int = 512,
+    deployment_id: Optional[str] = None,
 ) -> str:
     """Creates a request for the OpenAI Completion service and returns the response.
     :param prompt: The prompt for the completion model
     :param model: OpenAI model name
     :param max_tokens: Model's max_tokens parameter
+    :param deployment_id: Azure OpenAI deployment ID
     """
+    if deployment_id:
+        response = openai.Completion.create(
+            deployment_id=deployment_id, model=model, prompt=prompt, max_tokens=max_tokens
+        )
+    else:
+        response = openai.Completion.create(
+            model=model, prompt=prompt, max_tokens=max_tokens
+        )
     return response["choices"][0].text
     """
     prompt = f"""
+    Your role is to create synthetic text based on de-identified text with placeholders instead of Personally Identifiable Information (PII).
+    Replace the placeholders (e.g. ,<PERSON>, {{DATE}}, {{ip_address}}) with fake values.
     Instructions:
+    a. Use completely random numbers, so every digit is drawn between 0 and 9.
+    b. Use realistic names that come from diverse genders, ethnicities and countries.
+    c. If there are no placeholders, return the text as is and provide an answer.
+    d. Keep the formatting as close to the original as possible.
+    e. If PII exists in the input, replace it with fake values in the output.
     input: How do I change the limit on my credit card {{credit_card_number}}?
     output: How do I change the limit on my credit card 2539 3519 2345 1555?
+    input: <PERSON> was the chief science officer at <ORGANIZATION>.
+    output: Katherine Buckjov was the chief science officer at NASA.
+    input: Cameroon lives in <LOCATION>.
+    output: Vladimir lives in Moscow.
     input: {anonymized_text}
     output:
     """

presidio_helpers.py CHANGED Viewed

@@ -1,79 +1,85 @@
 """
 Helper methods for the Presidio Streamlit app
 """
-from typing import List, Optional
-import spacy
 import streamlit as st
-from presidio_analyzer import AnalyzerEngine, RecognizerResult, RecognizerRegistry
-from presidio_analyzer.nlp_engine import NlpEngineProvider
 from presidio_anonymizer import AnonymizerEngine
 from presidio_anonymizer.entities import OperatorConfig
-from flair_recognizer import FlairRecognizer
 from openai_fake_data_generator import (
-    set_openai_key,
     call_completion_model,
     create_prompt,
 )
-from transformers_rec import (
-    STANFORD_COFIGURATION,
-    TransformersRecognizer,
-    BERT_DEID_CONFIGURATION,
 )
-@st.cache_resource
-def analyzer_engine(model_path: str):
-    """Return AnalyzerEngine.
-    :param model_path: Which model to use for NER:
         "StanfordAIMI/stanford-deidentifier-base",
         "obi/deid_roberta_i2b2",
         "en_core_web_lg"
     """
-    registry = RecognizerRegistry()
-    registry.load_predefined_recognizers()
     # Set up NLP Engine according to the model of choice
-    if model_path == "en_core_web_lg":
-        if not spacy.util.is_package("en_core_web_lg"):
-            spacy.cli.download("en_core_web_lg")
-        nlp_configuration = {
-            "nlp_engine_name": "spacy",
-            "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
-        }
-    elif model_path == "flair/ner-english-large":
-        flair_recognizer = FlairRecognizer()
-        nlp_configuration = {
-            "nlp_engine_name": "spacy",
-            "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
-        }
-        registry.add_recognizer(flair_recognizer)
-        registry.remove_recognizer("SpacyRecognizer")
     else:
-        if not spacy.util.is_package("en_core_web_sm"):
-            spacy.cli.download("en_core_web_sm")
-        # Using a small spaCy model + a HF NER model
-        transformers_recognizer = TransformersRecognizer(model_path=model_path)
-        registry.remove_recognizer("SpacyRecognizer")
-        if model_path == "StanfordAIMI/stanford-deidentifier-base":
-            transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
-        elif model_path == "obi/deid_roberta_i2b2":
-            transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
-        # Use small spaCy model, no need for both spacy and HF models
-        # The transformers model is used here as a recognizer, not as an NlpEngine
-        nlp_configuration = {
-            "nlp_engine_name": "spacy",
-            "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
-        }
-        registry.add_recognizer(transformers_recognizer)
-    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
     analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)
     return analyzer
@@ -85,17 +91,36 @@ def anonymizer_engine():
 @st.cache_data
-def get_supported_entities(st_model: str):
     """Return supported entities from the Analyzer Engine."""
-    return analyzer_engine(st_model).get_supported_entities()
 @st.cache_data
-def analyze(st_model: str, **kwargs):
     """Analyze input using Analyzer engine and input arguments (kwargs)."""
     if "entities" not in kwargs or "All" in kwargs["entities"]:
         kwargs["entities"] = None
-    return analyzer_engine(st_model).analyze(**kwargs)
 def anonymize(
@@ -184,20 +209,52 @@ def annotate(text: str, analyze_results: List[RecognizerResult]):
 def create_fake_data(
     text: str,
     analyze_results: List[RecognizerResult],
-    openai_key: str,
-    openai_model_name: str,
 ):
     """Creates a synthetic version of the text using OpenAI APIs"""
-    if not openai_key:
         return "Please provide your OpenAI key"
     results = anonymize(text=text, operator="replace", analyze_results=analyze_results)
-    set_openai_key(openai_key)
     prompt = create_prompt(results.text)
-    fake = call_openai_api(prompt, openai_model_name)
     return fake
 @st.cache_data
-def call_openai_api(prompt: str, openai_model_name: str) -> str:
-    fake_data = call_completion_model(prompt, model=openai_model_name)
     return fake_data

 """
 Helper methods for the Presidio Streamlit app
 """
+from typing import List, Optional, Tuple
+import logging
 import streamlit as st
+from presidio_analyzer import (
+    AnalyzerEngine,
+    RecognizerResult,
+    RecognizerRegistry,
+    PatternRecognizer,
+    Pattern,
+)
+from presidio_analyzer.nlp_engine import NlpEngine
 from presidio_anonymizer import AnonymizerEngine
 from presidio_anonymizer.entities import OperatorConfig
 from openai_fake_data_generator import (
+    set_openai_params,
     call_completion_model,
     create_prompt,
+    OpenAIParams,
 )
+from presidio_nlp_engine_config import (
+    create_nlp_engine_with_spacy,
+    create_nlp_engine_with_flair,
+    create_nlp_engine_with_transformers,
+    create_nlp_engine_with_azure_text_analytics,
 )
+logger = logging.getLogger("presidio-streamlit")
+@st.cache_resource
+def nlp_engine_and_registry(
+    model_family: str,
+    model_path: str,
+    ta_key: Optional[str] = None,
+    ta_endpoint: Optional[str] = None,
+) -> Tuple[NlpEngine, RecognizerRegistry]:
+    """Create the NLP Engine instance based on the requested model.
+    :param model_family: Which model package to use for NER.
+    :param model_path: Which model to use for NER. E.g.,
         "StanfordAIMI/stanford-deidentifier-base",
         "obi/deid_roberta_i2b2",
         "en_core_web_lg"
+    :param ta_key: Key to the Text Analytics endpoint (only if model_path = "Azure Text Analytics")
+    :param ta_endpoint: Endpoint of the Text Analytics instance (only if model_path = "Azure Text Analytics")
     """
     # Set up NLP Engine according to the model of choice
+    if "spaCy" in model_family:
+        return create_nlp_engine_with_spacy(model_path)
+    elif "flair" in model_family:
+        return create_nlp_engine_with_flair(model_path)
+    elif "HuggingFace" in model_family:
+        return create_nlp_engine_with_transformers(model_path)
+    elif "Azure Text Analytics" in model_family:
+        return create_nlp_engine_with_azure_text_analytics(ta_key, ta_endpoint)
     else:
+        raise ValueError(f"Model family {model_family} not supported")
+@st.cache_resource
+def analyzer_engine(
+    model_family: str,
+    model_path: str,
+    ta_key: Optional[str] = None,
+    ta_endpoint: Optional[str] = None,
+) -> AnalyzerEngine:
+    """Create the NLP Engine instance based on the requested model.
+    :param model_family: Which model package to use for NER.
+    :param model_path: Which model to use for NER:
+        "StanfordAIMI/stanford-deidentifier-base",
+        "obi/deid_roberta_i2b2",
+        "en_core_web_lg"
+    :param ta_key: Key to the Text Analytics endpoint (only if model_path = "Azure Text Analytics")
+    :param ta_endpoint: Endpoint of the Text Analytics instance (only if model_path = "Azure Text Analytics")
+    """
+    nlp_engine, registry = nlp_engine_and_registry(
+        model_family, model_path, ta_key, ta_endpoint
+    )
     analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)
     return analyzer
 @st.cache_data
+def get_supported_entities(
+    model_family: str, model_path: str, ta_key: str, ta_endpoint: str
+):
     """Return supported entities from the Analyzer Engine."""
+    return analyzer_engine(
+        model_family, model_path, ta_key, ta_endpoint
+    ).get_supported_entities() + ["GENERIC_PII"]
 @st.cache_data
+def analyze(
+    model_family: str, model_path: str, ta_key: str, ta_endpoint: str, **kwargs
+):
     """Analyze input using Analyzer engine and input arguments (kwargs)."""
     if "entities" not in kwargs or "All" in kwargs["entities"]:
         kwargs["entities"] = None
+    if "deny_list" in kwargs and kwargs["deny_list"] is not None:
+        ad_hoc_recognizer = create_ad_hoc_deny_list_recognizer(kwargs["deny_list"])
+        kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else []
+        del kwargs["deny_list"]
+    if "regex_params" in kwargs and len(kwargs["regex_params"]) > 0:
+        ad_hoc_recognizer = create_ad_hoc_regex_recognizer(*kwargs["regex_params"])
+        kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else []
+        del kwargs["regex_params"]
+    return analyzer_engine(model_family, model_path, ta_key, ta_endpoint).analyze(
+        **kwargs
+    )
 def anonymize(
 def create_fake_data(
     text: str,
     analyze_results: List[RecognizerResult],
+    openai_params: OpenAIParams,
 ):
     """Creates a synthetic version of the text using OpenAI APIs"""
+    if not openai_params.openai_key:
         return "Please provide your OpenAI key"
     results = anonymize(text=text, operator="replace", analyze_results=analyze_results)
+    set_openai_params(openai_params)
     prompt = create_prompt(results.text)
+    print(f"Prompt: {prompt}")
+    fake = call_openai_api(
+        prompt=prompt,
+        openai_model_name=openai_params.model,
+        openai_deployment_name=openai_params.deployment_name,
+    )
     return fake
 @st.cache_data
+def call_openai_api(
+    prompt: str, openai_model_name: str, openai_deployment_name: Optional[str] = None
+) -> str:
+    fake_data = call_completion_model(
+        prompt, model=openai_model_name, deployment_id=openai_deployment_name
+    )
     return fake_data
+def create_ad_hoc_deny_list_recognizer(
+    deny_list=Optional[List[str]],
+) -> Optional[PatternRecognizer]:
+    if not deny_list:
+        return None
+    deny_list_recognizer = PatternRecognizer(
+        supported_entity="GENERIC_PII", deny_list=deny_list
+    )
+    return deny_list_recognizer
+def create_ad_hoc_regex_recognizer(
+    regex: str, entity_type: str, score: float, context: Optional[List[str]] = None
+) -> Optional[PatternRecognizer]:
+    if not regex:
+        return None
+    pattern = Pattern(name="Regex pattern", regex=regex, score=score)
+    regex_recognizer = PatternRecognizer(
+        supported_entity=entity_type, patterns=[pattern], context=context
+    )
+    return regex_recognizer

presidio_nlp_engine_config.py ADDED Viewed

	@@ -0,0 +1,137 @@

+from typing import Tuple
+import logging
+import spacy
+from presidio_analyzer import RecognizerRegistry
+from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
+logger = logging.getLogger("presidio-streamlit")
+def create_nlp_engine_with_spacy(
+    model_path: str,
+) -> Tuple[NlpEngine, RecognizerRegistry]:
+    """
+    Instantiate an NlpEngine with a spaCy model
+    :param model_path: spaCy model path.
+    """
+    registry = RecognizerRegistry()
+    registry.load_predefined_recognizers()
+    if not spacy.util.is_package(model_path):
+        spacy.cli.download(model_path)
+    nlp_configuration = {
+        "nlp_engine_name": "spacy",
+        "models": [{"lang_code": "en", "model_name": model_path}],
+    }
+    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+    return nlp_engine, registry
+def create_nlp_engine_with_transformers(
+    model_path: str,
+) -> Tuple[NlpEngine, RecognizerRegistry]:
+    """
+    Instantiate an NlpEngine with a TransformersRecognizer and a small spaCy model.
+    The TransformersRecognizer would return results from Transformers models, the spaCy model
+    would return NlpArtifacts such as POS and lemmas.
+    :param model_path: HuggingFace model path.
+    """
+    from transformers_rec import (
+        STANFORD_COFIGURATION,
+        BERT_DEID_CONFIGURATION,
+        TransformersRecognizer,
+    )
+    registry = RecognizerRegistry()
+    registry.load_predefined_recognizers()
+    if not spacy.util.is_package("en_core_web_sm"):
+        spacy.cli.download("en_core_web_sm")
+    # Using a small spaCy model + a HF NER model
+    transformers_recognizer = TransformersRecognizer(model_path=model_path)
+    if model_path == "StanfordAIMI/stanford-deidentifier-base":
+        transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
+    elif model_path == "obi/deid_roberta_i2b2":
+        transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
+    else:
+        print(f"Warning: Model has no configuration, loading default.")
+        transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
+    # Use small spaCy model, no need for both spacy and HF models
+    # The transformers model is used here as a recognizer, not as an NlpEngine
+    nlp_configuration = {
+        "nlp_engine_name": "spacy",
+        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
+    }
+    registry.add_recognizer(transformers_recognizer)
+    registry.remove_recognizer("SpacyRecognizer")
+    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+    return nlp_engine, registry
+def create_nlp_engine_with_flair(
+    model_path: str,
+) -> Tuple[NlpEngine, RecognizerRegistry]:
+    """
+    Instantiate an NlpEngine with a FlairRecognizer and a small spaCy model.
+    The FlairRecognizer would return results from Flair models, the spaCy model
+    would return NlpArtifacts such as POS and lemmas.
+    :param model_path: Flair model path.
+    """
+    from flair_recognizer import FlairRecognizer
+    registry = RecognizerRegistry()
+    registry.load_predefined_recognizers()
+    if not spacy.util.is_package("en_core_web_sm"):
+        spacy.cli.download("en_core_web_sm")
+    # Using a small spaCy model + a Flair NER model
+    flair_recognizer = FlairRecognizer(model_path=model_path)
+    nlp_configuration = {
+        "nlp_engine_name": "spacy",
+        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
+    }
+    registry.add_recognizer(flair_recognizer)
+    registry.remove_recognizer("SpacyRecognizer")
+    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+    return nlp_engine, registry
+def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
+    """
+    Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
+    The TextAnalyticsWrapper would return results from calling Azure Text Analytics PII, the spaCy model
+    would return NlpArtifacts such as POS and lemmas.
+    :param ta_key: Azure Text Analytics key.
+    :param ta_endpoint: Azure Text Analytics endpoint.
+    """
+    from text_analytics_wrapper import TextAnalyticsWrapper
+    if not ta_key or not ta_endpoint:
+        raise RuntimeError("Please fill in the Text Analytics endpoint details")
+    registry = RecognizerRegistry()
+    registry.load_predefined_recognizers()
+    ta_recognizer = TextAnalyticsWrapper(ta_endpoint=ta_endpoint, ta_key=ta_key)
+    nlp_configuration = {
+        "nlp_engine_name": "spacy",
+        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
+    }
+    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+    registry.add_recognizer(ta_recognizer)
+    registry.remove_recognizer("SpacyRecognizer")
+    return nlp_engine, registry

presidio_streamlit.py CHANGED Viewed

@@ -1,13 +1,16 @@
 """Streamlit app for Presidio."""
 import os
-from json import JSONEncoder
 import pandas as pd
 import streamlit as st
 import streamlit.components.v1 as components
 from annotated_text import annotated_text
 from presidio_helpers import (
     get_supported_entities,
     analyze,
@@ -17,45 +20,86 @@ from presidio_helpers import (
     analyzer_engine,
 )
-st.set_page_config(page_title="Presidio demo", layout="wide")
 # Sidebar
 st.sidebar.header(
     """
-PII De-Identification with Microsoft Presidio
 """
 )
-st.sidebar.info(
-    "Presidio is an open source customizable framework for PII detection and de-identification\n"
-    "[Code](https://aka.ms/presidio) | "
-    "[Tutorial](https://microsoft.github.io/presidio/tutorial/) | "
-    "[Installation](https://microsoft.github.io/presidio/installation/) | "
-    "[FAQ](https://microsoft.github.io/presidio/faq/)",
-    icon="ℹ️",
-)
-st.sidebar.markdown(
-    "[![Pypi Downloads](https://img.shields.io/pypi/dm/presidio-analyzer.svg)](https://img.shields.io/pypi/dm/presidio-analyzer.svg)" # noqa
-    "[![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT)"
-    "![GitHub Repo stars](https://img.shields.io/github/stars/microsoft/presidio?style=social)"
-)
 st_model = st.sidebar.selectbox(
-    "NER model for PII detection",
-    [
-        "StanfordAIMI/stanford-deidentifier-base",
-        "obi/deid_roberta_i2b2",
-        "flair/ner-english-large",
-        "en_core_web_lg",
-    ],
-    index=1,
-    help="""
-    Select which Named Entity Recognition (NER) model to use for PII detection, in parallel to rule-based recognizers.
-    Presidio supports multiple NER packages off-the-shelf, such as spaCy, Huggingface, Stanza and Flair.
-    """,
 )
-st.sidebar.markdown("> Note: Models might take some time to download. ")
 st_operator = st.sidebar.selectbox(
     "De-identification approach",
@@ -75,8 +119,11 @@ st_operator = st.sidebar.selectbox(
 st_mask_char = "*"
 st_number_of_chars = 15
 st_encrypt_key = "WmZq4t7w!z%C&F)J"
-st_openai_key = ""
-st_openai_model = "text-davinci-003"
 if st_operator == "mask":
     st_number_of_chars = st.sidebar.number_input(
         "number of chars", value=st_number_of_chars, min_value=0, max_value=100
@@ -87,6 +134,22 @@ if st_operator == "mask":
 elif st_operator == "encrypt":
     st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
 elif st_operator == "synthesize":
     st_openai_key = st.sidebar.text_input(
         "OPENAI_KEY",
         value=os.getenv("OPENAI_KEY", default=""),
@@ -95,36 +158,87 @@ elif st_operator == "synthesize":
     )
     st_openai_model = st.sidebar.text_input(
         "OpenAI model for text synthesis",
-        value=st_openai_model,
         help="See more here: https://platform.openai.com/docs/models/",
     )
 st_threshold = st.sidebar.slider(
     label="Acceptance threshold",
     min_value=0.0,
     max_value=1.0,
     value=0.35,
-    help="Define the threshold for accepting a detection as PII.",
 )
 st_return_decision_process = st.sidebar.checkbox(
     "Add analysis explanations to findings",
     value=False,
     help="Add the decision process to the output table. "
-         "More information can be found here: https://microsoft.github.io/presidio/analyzer/decision_process/",
 )
-st_entities = st.sidebar.multiselect(
-    label="Which entities to look for?",
-    options=get_supported_entities(st_model),
-    default=list(get_supported_entities(st_model)),
-    help="Limit the list of PII entities detected. "
-         "This list is dynamic and based on the NER model and registered recognizers. "
-         "More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/",
 )
 # Main panel
 analyzer_load_state = st.info("Starting Presidio analyzer...")
-engine = analyzer_engine(model_path=st_model)
 analyzer_load_state.empty()
 # Read default text
@@ -135,92 +249,103 @@ with open("demo_text.txt") as f:
 col1, col2 = st.columns(2)
 # Before:
-col1.subheader("Input string:")
 st_text = col1.text_area(
-    label="Enter text",
-    value="".join(demo_text),
-    height=400,
 )
-st_analyze_results = analyze(
-    st_model=st_model,
-    text=st_text,
-    entities=st_entities,
-    language="en",
-    score_threshold=st_threshold,
-    return_decision_process=st_return_decision_process,
-)
-# After
-if st_operator not in ("highlight", "synthesize"):
-    with col2:
-        st.subheader(f"Output")
-        st_anonymize_results = anonymize(
-            text=st_text,
-            operator=st_operator,
-            mask_char=st_mask_char,
-            number_of_chars=st_number_of_chars,
-            encrypt_key=st_encrypt_key,
-            analyze_results=st_analyze_results,
-        )
-        st.text_area(label="De-identified", value=st_anonymize_results.text, height=400)
-elif st_operator == "synthesize":
-    with col2:
-        st.subheader(f"OpenAI Generated output")
-        fake_data = create_fake_data(
-            st_text,
-            st_analyze_results,
-            openai_key=st_openai_key,
-            openai_model_name=st_openai_model,
-        )
-        st.text_area(label="Synthetic data", value=fake_data, height=400)
-else:
-    st.subheader("Highlighted")
-    annotated_tokens = annotate(
-        text=st_text,
-        analyze_results=st_analyze_results
     )
-    # annotated_tokens
-    annotated_text(*annotated_tokens)
-# json result
-class ToDictEncoder(JSONEncoder):
-    """Encode dict to json."""
-    def default(self, o):
-        """Encode to JSON using to_dict."""
-        return o.to_dict()
-# table result
-st.subheader(
-    "Findings" if not st_return_decision_process else "Findings with decision factors"
-)
-if st_analyze_results:
-    df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
-    df["text"] = [st_text[res.start: res.end] for res in st_analyze_results]
-    df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
-        {
-            "entity_type": "Entity type",
-            "text": "Text",
-            "start": "Start",
-            "end": "End",
-            "score": "Confidence",
-        },
-        axis=1,
-    )
-    df_subset["Text"] = [st_text[res.start: res.end] for res in st_analyze_results]
-    if st_return_decision_process:
-        analysis_explanation_df = pd.DataFrame.from_records(
-            [r.analysis_explanation.to_dict() for r in st_analyze_results]
         )
-        df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
-    st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
-else:
-    st.text("No findings")
 components.html(
     """

 """Streamlit app for Presidio."""
+import logging
 import os
+import traceback
+import dotenv
 import pandas as pd
 import streamlit as st
 import streamlit.components.v1 as components
 from annotated_text import annotated_text
+from streamlit_tags import st_tags
+from openai_fake_data_generator import OpenAIParams
 from presidio_helpers import (
     get_supported_entities,
     analyze,
     analyzer_engine,
 )
+st.set_page_config(
+    page_title="Presidio demo",
+    layout="wide",
+    initial_sidebar_state="expanded",
+    menu_items={
+        "About": "https://microsoft.github.io/presidio/",
+    },
+)
+dotenv.load_dotenv()
+logger = logging.getLogger("presidio-streamlit")
+allow_other_models = os.getenv("ALLOW_OTHER_MODELS", False)
 # Sidebar
 st.sidebar.header(
     """
+PII De-Identification with [Microsoft Presidio](https://microsoft.github.io/presidio/)
 """
 )
+model_help_text = """
+    Select which Named Entity Recognition (NER) model to use for PII detection, in parallel to rule-based recognizers.
+    Presidio supports multiple NER packages off-the-shelf, such as spaCy, Huggingface, Stanza and Flair,
+    as well as service such as Azure Text Analytics PII.
+    """
+st_ta_key = st_ta_endpoint = ""
+model_list = [
+    "spaCy/en_core_web_lg",
+    "flair/ner-english-large",
+    "HuggingFace/obi/deid_roberta_i2b2",
+    "HuggingFace/StanfordAIMI/stanford-deidentifier-base",
+    "Azure Text Analytics PII",
+    "Other",
+]
+if not allow_other_models:
+    model_list.pop()
+# Select model
 st_model = st.sidebar.selectbox(
+    "NER model package",
+    model_list,
+    index=2,
+    help=model_help_text,
 )
+# Extract model package.
+st_model_package = st_model.split("/")[0]
+# Remove package prefix (if needed)
+st_model = (
+    st_model
+    if st_model_package not in ("spaCy", "HuggingFace")
+    else "/".join(st_model.split("/")[1:])
+)
+if st_model == "Other":
+    st_model_package = st.sidebar.selectbox(
+        "NER model OSS package", options=["spaCy", "Flair", "HuggingFace"]
+    )
+    st_model = st.sidebar.text_input(f"NER model name", value="")
+if st_model == "Azure Text Analytics PII":
+    st_ta_key = st.sidebar.text_input(
+        f"Text Analytics key", value=os.getenv("TA_KEY", ""), type="password"
+    )
+    st_ta_endpoint = st.sidebar.text_input(
+        f"Text Analytics endpoint",
+        value=os.getenv("TA_ENDPOINT", default=""),
+        help="For more info: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/overview",  # noqa: E501
+    )
+st.sidebar.warning("Note: Models might take some time to download. ")
+analyzer_params = (st_model_package, st_model, st_ta_key, st_ta_endpoint)
+logger.debug(f"analyzer_params: {analyzer_params}")
 st_operator = st.sidebar.selectbox(
     "De-identification approach",
 st_mask_char = "*"
 st_number_of_chars = 15
 st_encrypt_key = "WmZq4t7w!z%C&F)J"
+open_ai_params = None
+logger.debug(f"st_operator: {st_operator}")
 if st_operator == "mask":
     st_number_of_chars = st.sidebar.number_input(
         "number of chars", value=st_number_of_chars, min_value=0, max_value=100
 elif st_operator == "encrypt":
     st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
 elif st_operator == "synthesize":
+    if os.getenv("OPENAI_TYPE", default="openai") == "Azure":
+        openai_api_type = "azure"
+        st_openai_api_base = st.sidebar.text_input(
+            "Azure OpenAI base URL",
+            value=os.getenv("AZURE_OPENAI_ENDPOINT", default=""),
+        )
+        st_deployment_name = st.sidebar.text_input(
+            "Deployment name", value=os.getenv("AZURE_OPENAI_DEPLOYMENT", default="")
+        )
+        st_openai_version = st.sidebar.text_input(
+            "OpenAI version",
+            value=os.getenv("OPENAI_API_VERSION", default="2023-05-15"),
+        )
+    else:
+        st_openai_version = openai_api_type = st_openai_api_base = None
+        st_deployment_name = ""
     st_openai_key = st.sidebar.text_input(
         "OPENAI_KEY",
         value=os.getenv("OPENAI_KEY", default=""),
     )
     st_openai_model = st.sidebar.text_input(
         "OpenAI model for text synthesis",
+        value=os.getenv("OPENAI_MODEL", default="text-davinci-003"),
         help="See more here: https://platform.openai.com/docs/models/",
     )
+    open_ai_params = OpenAIParams(
+        openai_key=st_openai_key,
+        model=st_openai_model,
+        api_base=st_openai_api_base,
+        deployment_name=st_deployment_name,
+        api_version=st_openai_version,
+        api_type=openai_api_type,
+    )
 st_threshold = st.sidebar.slider(
     label="Acceptance threshold",
     min_value=0.0,
     max_value=1.0,
     value=0.35,
+    help="Define the threshold for accepting a detection as PII. See more here: ",
 )
 st_return_decision_process = st.sidebar.checkbox(
     "Add analysis explanations to findings",
     value=False,
     help="Add the decision process to the output table. "
+    "More information can be found here: https://microsoft.github.io/presidio/analyzer/decision_process/",
 )
+# Allow and deny lists
+st_deny_allow_expander = st.sidebar.expander(
+    "Allowlists and denylists",
+    expanded=False,
 )
+with st_deny_allow_expander:
+    st_allow_list = st_tags(
+        label="Add words to the allowlist", text="Enter word and press enter."
+    )
+    st.caption(
+        "Allowlists contain words that are not considered PII, but are detected as such."
+    )
+    st_deny_list = st_tags(
+        label="Add words to the denylist", text="Enter word and press enter."
+    )
+    st.caption(
+        "Denylists contain words that are considered PII, but are not detected as such."
+    )
 # Main panel
+# Collapsed-by-default panel with links, a short feature list and project badges.
+with st.expander("About this demo", expanded=False):
+    st.info(
+        """Presidio is an open source customizable framework for PII detection and de-identification.
+        \n\n[Code](https://aka.ms/presidio) |
+        [Tutorial](https://microsoft.github.io/presidio/tutorial/) |
+        [Installation](https://microsoft.github.io/presidio/installation/) |
+        [FAQ](https://microsoft.github.io/presidio/faq/) |"""
+    )
+    st.info(
+        """
+    Use this demo to:
+    - Experiment with different off-the-shelf models and NLP packages.
+    - Explore the different de-identification options, including redaction, masking, encryption and more.
+    - Generate synthetic text with Microsoft Presidio and OpenAI.
+    - Configure allow and deny lists.
+    This demo website shows some of Presidio's capabilities.
+    [Visit our website](https://microsoft.github.io/presidio) for more info,
+    samples and deployment options.
+    """
+    )
+    # Adjacent string literals are concatenated into one markdown line of badges.
+    st.markdown(
+        "[![Pypi Downloads](https://img.shields.io/pypi/dm/presidio-analyzer.svg)](https://img.shields.io/pypi/dm/presidio-analyzer.svg)"  # noqa
+        "[![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT)"
+        "![GitHub Repo stars](https://img.shields.io/github/stars/microsoft/presidio?style=social)"
+    )
 analyzer_load_state = st.info("Starting Presidio analyzer...")
 analyzer_load_state.empty()
 # Read default text
 col1, col2 = st.columns(2)
 # Before:
+col1.subheader("Input")
 st_text = col1.text_area(
+    label="Enter text", value="".join(demo_text), height=400, key="text_input"
 )
+try:
+    # Choose entities
+    st_entities_expander = st.sidebar.expander("Choose entities to look for")
+    st_entities = st_entities_expander.multiselect(
+        label="Which entities to look for?",
+        options=get_supported_entities(*analyzer_params),
+        default=list(get_supported_entities(*analyzer_params)),
+        help="Limit the list of PII entities detected. "
+        "This list is dynamic and based on the NER model and registered recognizers. "
+        "More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/",
     )
+    # Before
+    analyzer_load_state = st.info("Starting Presidio analyzer...")
+    analyzer = analyzer_engine(*analyzer_params)
+    analyzer_load_state.empty()
+    st_analyze_results = analyze(
+        *analyzer_params,
+        text=st_text,
+        entities=st_entities,
+        language="en",
+        score_threshold=st_threshold,
+        return_decision_process=st_return_decision_process,
+        allow_list=st_allow_list,
+        deny_list=st_deny_list,
+    )
+    # After
+    if st_operator not in ("highlight", "synthesize"):
+        with col2:
+            st.subheader(f"Output")
+            st_anonymize_results = anonymize(
+                text=st_text,
+                operator=st_operator,
+                mask_char=st_mask_char,
+                number_of_chars=st_number_of_chars,
+                encrypt_key=st_encrypt_key,
+                analyze_results=st_analyze_results,
+            )
+            st.text_area(
+                label="De-identified", value=st_anonymize_results.text, height=400
+            )
+    elif st_operator == "synthesize":
+        with col2:
+            st.subheader(f"OpenAI Generated output")
+            fake_data = create_fake_data(
+                st_text,
+                st_analyze_results,
+                open_ai_params,
+            )
+            st.text_area(label="Synthetic data", value=fake_data, height=400)
+    else:
+        st.subheader("Highlighted")
+        annotated_tokens = annotate(text=st_text, analyze_results=st_analyze_results)
+        # annotated_tokens
+        annotated_text(*annotated_tokens)
+    # table result
+    st.subheader(
+        "Findings"
+        if not st_return_decision_process
+        else "Findings with decision factors"
+    )
+    if st_analyze_results:
+        df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
+        df["text"] = [st_text[res.start : res.end] for res in st_analyze_results]
+        df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
+            {
+                "entity_type": "Entity type",
+                "text": "Text",
+                "start": "Start",
+                "end": "End",
+                "score": "Confidence",
+            },
+            axis=1,
         )
+        df_subset["Text"] = [st_text[res.start : res.end] for res in st_analyze_results]
+        if st_return_decision_process:
+            analysis_explanation_df = pd.DataFrame.from_records(
+                [r.analysis_explanation.to_dict() for r in st_analyze_results]
+            )
+            df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
+        st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
+    else:
+        st.text("No findings")
+except Exception as e:
+    print(e)
+    traceback.print_exc()
+    st.error(e)
 components.html(
     """

requirements.txt CHANGED Viewed

@@ -1,9 +1,13 @@
 presidio-analyzer
 presidio-anonymizer
 streamlit
 pandas
 st-annotated-text
 torch
 transformers
 flair
-openai

 presidio-analyzer
 presidio-anonymizer
 streamlit
+streamlit-tags
 pandas
+python-dotenv
 st-annotated-text
 torch
 transformers
 flair
+openai
+spacy
+azure-ai-textanalytics

text_analytics_wrapper.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import os
+from typing import List, Optional
+import logging
+import dotenv
+from azure.ai.textanalytics import TextAnalyticsClient
+from azure.core.credentials import AzureKeyCredential
+from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation
+from presidio_analyzer.nlp_engine import NlpArtifacts
+logger = logging.getLogger("presidio-streamlit")
+class TextAnalyticsWrapper(EntityRecognizer):
+    from azure.ai.textanalytics._models import PiiEntityCategory
+    TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]
+    def __init__(
+        self,
+        supported_entities: Optional[List[str]] = None,
+        supported_language: str = "en",
+        ta_client: Optional[TextAnalyticsClient] = None,
+        ta_key: Optional[str] = None,
+        ta_endpoint: Optional[str] = None,
+    ):
+        """
+        Wrapper for the Azure Text Analytics client
+        :param ta_client: object of type TextAnalyticsClient
+        :param ta_key: Azure cognitive Services for Language key
+        :param ta_endpoint: Azure cognitive Services for Language endpoint
+        """
+        if not supported_entities:
+            supported_entities = self.TA_SUPPORTED_ENTITIES
+        super().__init__(
+            supported_entities=supported_entities,
+            supported_language=supported_language,
+            name="Azure Text Analytics PII",
+        )
+        self.ta_key = ta_key
+        self.ta_endpoint = ta_endpoint
+        if not ta_client:
+            ta_client = self.__authenticate_client(ta_key, ta_endpoint)
+        self.ta_client = ta_client
+    @staticmethod
+    def __authenticate_client(key: str, endpoint: str):
+        ta_credential = AzureKeyCredential(key)
+        text_analytics_client = TextAnalyticsClient(
+            endpoint=endpoint, credential=ta_credential
+        )
+        return text_analytics_client
+    def analyze(
+        self, text: str, entities: List[str] = None, nlp_artifacts: NlpArtifacts = None
+    ) -> List[RecognizerResult]:
+        if not entities:
+            entities = []
+        response = self.ta_client.recognize_pii_entities(
+            [text], language=self.supported_language
+        )
+        results = [doc for doc in response if not doc.is_error]
+        recognizer_results = []
+        for res in results:
+            for entity in res.entities:
+                if entity.category not in self.supported_entities:
+                    continue
+                analysis_explanation = TextAnalyticsWrapper._build_explanation(
+                    original_score=entity.confidence_score,
+                    entity_type=entity.category,
+                )
+                recognizer_results.append(
+                    RecognizerResult(
+                        entity_type=entity.category,
+                        start=entity.offset,
+                        end=entity.offset + len(entity.text),
+                        score=entity.confidence_score,
+                        analysis_explanation=analysis_explanation,
+                    )
+                )
+        return recognizer_results
+    @staticmethod
+    def _build_explanation(
+        original_score: float, entity_type: str
+    ) -> AnalysisExplanation:
+        explanation = AnalysisExplanation(
+            recognizer=TextAnalyticsWrapper.__class__.__name__,
+            original_score=original_score,
+            textual_explanation=f"Identified as {entity_type} by Text Analytics",
+        )
+        return explanation
+    def load(self) -> None:
+        pass
+if __name__ == "__main__":
+    import presidio_helpers
+    dotenv.load_dotenv()
+    text = """
+    Here are a few example sentences we currently support:
+    Hello, my name is David Johnson and I live in Maine.
+    My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
+    On September 18 I visited microsoft.com and sent an email to test@presidio.site,  from the IP 192.168.0.1.
+    My passport: 191280342 and my phone number: (212) 555-1234.
+    This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
+    Kate's social security number is 078-05-1126.  Her driver license? it is 1234567A.
+    """
+    analyzer = presidio_helpers.analyzer_engine(
+        model_path="Azure Text Analytics PII",
+        ta_key=os.environ["TA_KEY"],
+        ta_endpoint=os.environ["TA_ENDPOINT"],
+    )
+    analyzer.analyze(text=text, language="en")