Spaces:

langdonholmes
/

piilo

Sleeping

App Files Files Community

langdonholmes committed on Nov 15, 2022

Commit

e9abb72

1 Parent(s): f0664d7

init

Browse files

Files changed (3) hide show

app.py +199 -2
requirements.txt +7 -0
spacy_recognizer.py +131 -0

app.py CHANGED Viewed

@@ -1,4 +1,201 @@
 import streamlit as st
-x = st.slider('Select a value')
-st.write(x, 'squared is', x * x)

+"""Streamlit app for Student Name Detection models."""
+import spacy
+from spacy_recognizer import CustomSpacyRecognizer
+from presidio_analyzer.nlp_engine import NlpEngineProvider
+from presidio_anonymizer import AnonymizerEngine
+from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
+import pandas as pd
+from annotated_text import annotated_text
+from json import JSONEncoder
+import json
+import warnings
 import streamlit as st
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+warnings.filterwarnings('ignore')
+# Helper methods
+@st.cache(allow_output_mutation=True)
+def analyzer_engine():
+    """Build the Presidio AnalyzerEngine used by the app.
+    The engine runs on a spaCy NLP engine for English, loads the predefined
+    recognizers, adds CustomSpacyRecognizer and removes the recognizer
+    registered as "SpacyRecognizer".
+    :return: The configured AnalyzerEngine.
+    """
+    custom_recognizer = CustomSpacyRecognizer()
+    # NOTE(review): "INSERT MODEL NAME" is a placeholder, a real model name is needed
+    nlp_config = {
+        "nlp_engine_name": "spacy",
+        "models": [{"lang_code": "en", "model_name": "INSERT MODEL NAME"}],
+    }
+    # Create NLP engine based on configuration
+    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_config).create_engine()
+    # Start from the predefined recognizers, then add the custom one
+    recognizers = RecognizerRegistry()
+    recognizers.load_predefined_recognizers(nlp_engine=nlp_engine)
+    recognizers.add_recognizer(custom_recognizer)
+    # Drop the stock recognizer so the custom label mappings are used
+    recognizers.remove_recognizer("SpacyRecognizer")
+    return AnalyzerEngine(
+        nlp_engine=nlp_engine,
+        registry=recognizers,
+        supported_languages=["en"],
+    )
+@st.cache(allow_output_mutation=True)
+def anonymizer_engine():
+    """Create an AnonymizerEngine with no arguments and return it."""
+    engine = AnonymizerEngine()
+    return engine
+def get_supported_entities():
+    """Ask the analyzer engine which entity types it supports."""
+    engine = analyzer_engine()
+    return engine.get_supported_entities()
+def analyze(**kwargs):
+    """Run the analyzer engine with the given keyword arguments.
+    An "entities" argument that is missing, None, or contains "All" is
+    replaced with None before the call.
+    :param kwargs: Arguments forwarded to the analyzer engine's analyze().
+    :return: Whatever the analyzer engine returns for these arguments.
+    """
+    requested = kwargs.get("entities")
+    # Test for None first: `"All" in None` would raise TypeError
+    if requested is None or "All" in requested:
+        kwargs["entities"] = None
+    return analyzer_engine().analyze(**kwargs)
+def anonymize(text, analyze_results):
+    """Anonymize text with the Presidio anonymizer engine.
+    :param text: The text to anonymize.
+    :param analyze_results: Analyzer results passed to the anonymizer.
+    :return: The anonymized text, or None when text is empty.
+    """
+    if text:
+        anonymized = anonymizer_engine().anonymize(text, analyze_results)
+        return anonymized.text
+    return None
+def annotate(text, st_analyze_results, st_entities):
+    """Split text into plain strings and (entity text, entity type) tuples.
+    The pieces alternate: the text before the first finding, then each
+    finding followed by the text up to the next finding (or to the end).
+    :param text: The analyzed text.
+    :param st_analyze_results: Findings with start, end and entity_type.
+    :param st_entities: Not used; kept so existing callers keep working.
+    :return: List of pieces; [text] when there are no findings.
+    """
+    # sort by start index
+    results = sorted(st_analyze_results, key=lambda x: x.start)
+    if not results:
+        # Nothing detected: keep the text visible instead of returning nothing
+        return [text]
+    tokens = [text[:results[0].start]]
+    for i, res in enumerate(results):
+        # append entity text and entity type
+        tokens.append((text[res.start: res.end], res.entity_type))
+        # plain text runs to the next finding, or to the end after the last one
+        next_start = results[i + 1].start if i + 1 < len(results) else None
+        tokens.append(text[res.end:next_start])
+    return tokens
+# Page setup: page title and wide layout
+st.set_page_config(page_title="Student Name Detector (English)", layout="wide")
+# Side bar
+st.sidebar.markdown(
+    """Detect and anonymize PII in text using an [NLP model](https://huggingface.co/MY_MODEL_NAME) [trained](https://github.com/aialoe/deidentification-pipeline/tree/8bea38040d36ef62e0638fec8cca3ec652539cbe) on student-generated text collected by Coursera.
+"""
+)
+# Entity types to look for; every supported entity is selected by default
+st_entities = st.sidebar.multiselect(
+    label="Which entities to look for?",
+    options=get_supported_entities(),
+    default=list(get_supported_entities()),
+)
+# Value passed to analyze() as score_threshold (0.0 to 1.0, default 0.35)
+st_threshold = st.sidebar.slider(
+    label="Acceptance threshold", min_value=0.0, max_value=1.0, value=0.35
+)
+# Passed to analyze() as return_decision_process; also enables the JSON output
+st_return_decision_process = st.sidebar.checkbox(
+    "Add analysis explanations in json")
+st.sidebar.info(
+    "This is part of a deidentification project for student-generated text."
+)
+# Main panel
+# Show a notice while the analyzer engine is created, then clear it
+analyzer_load_state = st.info(
+    "Starting Presidio analyzer and loading Longformer-based model...")
+# NOTE(review): `engine` is not referenced again in this file; the call
+# appears to be here only to build the engine up front
+engine = analyzer_engine()
+analyzer_load_state.empty()
+# Text to analyze, pre-filled with a sample
+st_text = st.text_area(
+    label="Type in some text",
+    value="Learning Reflection\n\nJohn Williams\n\nIn this course I learned many things. As Liedtke (2004) said, \"Students grow when they learn\" \n\nBy John H. Williams",
+    height=200,
+)
+button = st.button("Detect Student Names")
+# Create the first_load flag the first time it is missing from session state
+if 'first_load' not in st.session_state:
+    st.session_state['first_load'] = True
+# Results for the text entered above
+st.subheader("Analyzed")
+with st.spinner("Analyzing..."):
+    # Run on button press, or whenever the first_load flag is truthy
+    if button or st.session_state.first_load:
+        st_analyze_results = analyze(
+            text=st_text,
+            entities=st_entities,
+            language="en",
+            score_threshold=st_threshold,
+            return_decision_process=st_return_decision_process,
+        )
+        annotated_tokens = annotate(st_text, st_analyze_results, st_entities)
+        # annotated_tokens
+        annotated_text(*annotated_tokens)
+# vertical space
+st.text("")
+st.subheader("Anonymized")
+with st.spinner("Anonymizing..."):
+    # Same condition as the analysis step, whose results are reused here
+    if button or st.session_state.first_load:
+        st_anonymize_results = anonymize(st_text, st_analyze_results)
+        # Bare expression: presumably displayed through Streamlit "magic" - confirm
+        st_anonymize_results
+# table result
+st.subheader("Detailed Findings")
+# NOTE(review): st_analyze_results is only assigned inside the
+# `if button or st.session_state.first_load` branch earlier in the script;
+# if that branch were skipped this line would raise NameError
+if st_analyze_results:
+    res_dicts = [r.to_dict() for r in st_analyze_results]
+    # Add the matched substring of the input text to each finding
+    for d in res_dicts:
+        d['Value'] = st_text[d['start']:d['end']]
+    df = pd.DataFrame.from_records(res_dicts)
+    # Keep the columns to display and rename them for the table header
+    df = df[["entity_type", "Value", "score", "start", "end"]].rename(
+        {
+            "entity_type": "Entity type",
+            "start": "Start",
+            "end": "End",
+            "score": "Confidence",
+        },
+        axis=1,
+    )
+    st.dataframe(df, width=1000)
+else:
+    st.text("No findings")
+# NOTE(review): the flag is set to True again, so the first_load condition
+# holds on every run - confirm this is intended before changing it
+st.session_state['first_load'] = True
+# json result
+class ToDictListEncoder(JSONEncoder):
+    """JSON encoder for objects that provide a to_dict() method."""
+    def default(self, o):
+        """Return o.to_dict() for a truthy object, otherwise an empty list."""
+        return o.to_dict() if o else []
+# Show the analyzer results as JSON when the sidebar checkbox is ticked
+if st_return_decision_process:
+    results_json = json.dumps(st_analyze_results, cls=ToDictListEncoder)
+    st.json(results_json)

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+pandas
+streamlit
+presidio-anonymizer
+presidio-analyzer
+torch
+st-annotated-text
+#https://huggingface.co/my_model.whl

spacy_recognizer.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import logging
+from typing import Optional, List, Tuple, Set
+from presidio_analyzer import (
+    RecognizerResult,
+    LocalRecognizer,
+    AnalysisExplanation,
+)
+from presidio_analyzer.nlp_engine import NlpArtifacts
+from presidio_analyzer.predefined_recognizers.spacy_recognizer import SpacyRecognizer
+logger = logging.getLogger("presidio-analyzer")
+class CustomSpacyRecognizer(LocalRecognizer):
+    """
+    Report entities found by the NLP engine's named entity recognition.
+    The recognizer reads the entities from the NLP artifacts passed to
+    analyze() and reports those whose label maps to a requested entity type.
+    :param supported_language: Language this recognizer supports
+    :param supported_entities: Entity types to report (defaults to ENTITIES)
+    :param check_label_groups: (entity types, NER labels) pairs used to match
+        NER labels to entity types (defaults to CHECK_LABEL_GROUPS)
+    :param context: Context words, forwarded to the base recognizer
+    :param ner_strength: Score given to every result
+    """
+    ENTITIES = [
+        "LOCATION",
+        "PERSON",
+        "NRP",
+        "ORGANIZATION",
+        "DATE_TIME",
+    ]
+    DEFAULT_EXPLANATION = "Identified as {} by Spacy's Named Entity Recognition"
+    # Each pair is (entity types, NER labels that count as those types)
+    CHECK_LABEL_GROUPS = [
+        ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}),
+        ({"PERSON"}, {"PER", "PERSON"}),
+        ({"NRP"}, {"NORP", "NRP"}),
+        ({"ORGANIZATION"}, {"ORG"}),
+        ({"DATE_TIME"}, {"DATE_TIME"}),
+    ]
+    MODEL_LANGUAGES = {
+        "en": "beki/en_spacy_pii_distilbert",
+    }
+    # NER label -> entity type; "NORP" matches the label in CHECK_LABEL_GROUPS
+    PRESIDIO_EQUIVALENCES = {
+        "PER": "PERSON",
+        "LOC": "LOCATION",
+        "ORG": "ORGANIZATION",
+        "NORP": "NRP",
+        "DATE_TIME": "DATE_TIME",
+    }
+    def __init__(
+        self,
+        supported_language: str = "en",
+        supported_entities: Optional[List[str]] = None,
+        check_label_groups: Optional[List[Tuple[Set, Set]]] = None,
+        context: Optional[List[str]] = None,
+        ner_strength: float = 0.85,
+    ):
+        self.ner_strength = ner_strength
+        self.check_label_groups = (
+            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
+        )
+        supported_entities = supported_entities if supported_entities else self.ENTITIES
+        # Forward context too, so a caller-supplied value is not silently dropped
+        super().__init__(
+            supported_entities=supported_entities,
+            supported_language=supported_language,
+            context=context,
+        )
+    def load(self) -> None:
+        """Do nothing: analyze() works from the NLP artifacts it is given."""
+        pass
+    def get_supported_entities(self) -> List[str]:
+        """
+        Return supported entities by this model.
+        :return: List of the supported entities.
+        """
+        return self.supported_entities
+    def build_spacy_explanation(
+        self, original_score: float, explanation: str
+    ) -> AnalysisExplanation:
+        """
+        Create explanation for why this result was detected.
+        :param original_score: Score given by this recognizer
+        :param explanation: Explanation string
+        :return: AnalysisExplanation naming this class as the recognizer
+        """
+        explanation = AnalysisExplanation(
+            recognizer=self.__class__.__name__,
+            original_score=original_score,
+            textual_explanation=explanation,
+        )
+        return explanation
+    def analyze(
+        self,
+        text: str,
+        entities: List[str],
+        nlp_artifacts: Optional[NlpArtifacts] = None,
+    ) -> List[RecognizerResult]:
+        """
+        Build results from the NER entities in the NLP artifacts.
+        :param text: The analyzed text (not read here; spans come from the artifacts)
+        :param entities: Entity types requested by the caller
+        :param nlp_artifacts: NLP artifacts holding the NER entities
+        :return: One result per matching (entity type, NER entity) pair, or an
+            empty list when no artifacts are given
+        """
+        results = []
+        if not nlp_artifacts:
+            logger.warning("Skipping SpaCy, nlp artifacts not provided...")
+            return results
+        ner_entities = nlp_artifacts.entities
+        for entity in entities:
+            if entity not in self.supported_entities:
+                continue
+            for ent in ner_entities:
+                if not self.__check_label(entity, ent.label_, self.check_label_groups):
+                    continue
+                textual_explanation = self.DEFAULT_EXPLANATION.format(
+                    ent.label_)
+                explanation = self.build_spacy_explanation(
+                    self.ner_strength, textual_explanation
+                )
+                spacy_result = RecognizerResult(
+                    entity_type=entity,
+                    start=ent.start_char,
+                    end=ent.end_char,
+                    score=self.ner_strength,
+                    analysis_explanation=explanation,
+                    recognition_metadata={
+                        RecognizerResult.RECOGNIZER_NAME_KEY: self.name
+                    },
+                )
+                results.append(spacy_result)
+        return results
+    @staticmethod
+    def __check_label(
+        entity: str, label: str, check_label_groups: List[Tuple[Set, Set]]
+    ) -> bool:
+        """Return True when some group contains both the entity type and the label."""
+        return any(
+            entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
+        )