Spaces:
Runtime error
Runtime error
| from presidio_anonymizer import AnonymizerEngine | |
| from presidio_analyzer import AnalyzerEngine | |
| from presidio_anonymizer.entities import RecognizerResult, OperatorConfig | |
| from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline | |
| import torch | |
| import re | |
| import gradio as gr | |
# Presidio engines: the analyzer finds PII spans, the anonymizer rewrites them.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Token-classification (NER) model used by mask_names_hf to find person names.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER-uncased")
tokenizer.add_tokens('<person>')
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER-uncased")
# add_tokens() grows the tokenizer vocabulary, so the embedding matrix has to
# grow with it; otherwise the id of '<person>' points past the end of the
# embedding table and the forward pass fails on any text containing it.
model.resize_token_embeddings(len(tokenizer))

# Ready-made NER pipeline over the same model and tokenizer.
pipe = pipeline(model=model, tokenizer=tokenizer, task='ner')

# Entity types requested from the Presidio analyzer; see
# https://microsoft.github.io/presidio/supported_entities/
# 'PERSON' is deliberately left out: names are masked by mask_names_hf.
ENT_TYPES = [
    # 'PERSON',
    'CREDIT_CARD',
    'EMAIL_ADDRESS',
    'IP_ADDRESS',
    'PHONE_NUMBER',
]
def mask_names_hf(text):
    """Replace person names in *text* with a ``<PERSON>`` placeholder.

    The text is tokenized, every token is labelled by the module-level
    token-classification ``model``, and each run of tokens labelled
    ``B-PER`` / ``I-PER`` is collapsed into a single ``<PERSON>`` entry.
    The remaining tokens are joined back into a string by the tokenizer.

    NOTE: the result is rebuilt from tokens, not from the original string.
    Because ``truncation=True`` is used, input longer than the model's
    maximum sequence length is cut off and the tail is absent from the
    returned text. Casing/spacing of the output is whatever the tokenizer
    round-trip produces (presumably lower-cased for this uncased
    checkpoint -- confirm against the tokenizer config).

    Args:
        text: Raw input string.

    Returns:
        The re-assembled text with person names replaced by ``<PERSON>``.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True)
    tokens = encoded.tokens()

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**encoded).logits
    label_ids = torch.argmax(logits, dim=2)[0].tolist()

    # Drop the sequence-boundary markers by identity rather than by position,
    # so a real token is never cut off when a marker happens to be tagged
    # as a person.
    boundary_tokens = {tokenizer.cls_token, tokenizer.sep_token}
    person_labels = ('B-PER', 'I-PER')

    words = []
    for token, label_id in zip(tokens, label_ids):
        if token in boundary_tokens:
            continue
        if model.config.id2label[label_id] not in person_labels:
            words.append(token)
        elif not words or words[-1] != '<PERSON>':
            # Start of a name span. An I-PER without a preceding B-PER also
            # lands here, so the name is masked instead of silently deleted.
            # The `not words` guard covers a name at the very first position.
            words.append('<PERSON>')
        # Otherwise: continuation of a name that is already masked.

    # Convert the surviving tokens back to a string.
    return tokenizer.convert_tokens_to_string(words)
| # def mask_names_hf(text): | |
| # outputs = pipe(text) | |
| # tokens = [] | |
| # for token in outputs: | |
| # if 'PER' in token['entity']: | |
| # if tokens[-1] != '<PERSON>': | |
| # tokens.append('<PERSON>') | |
| # else: | |
| # tokens.append(token['word']) | |
| # t = tokenizer.convert_tokens_to_string(tokens) | |
| # return t | |
def _normalize_placeholders(text, entity_types):
    """Restore tokenizer-mangled placeholders such as ``< phone _ number >``.

    Only placeholders for the given entity names are touched; any other
    angle-bracketed text is left exactly as it is.
    """
    # Allow arbitrary whitespace between the characters of each entity name,
    # since the token round-trip may insert spaces around '_' and '<' / '>'.
    alternatives = '|'.join(
        r'\s*'.join(re.escape(char) for char in name) for name in entity_types
    )
    pattern = re.compile(r'<\s*(' + alternatives + r')\s*>', re.IGNORECASE)
    return pattern.sub(
        lambda match: '<' + ''.join(match.group(1).split()).upper() + '>',
        text,
    )


def anonymize(text, min_len=3):
    """Mask PII in *text* and return the masked string.

    Two passes are applied:

    1. Presidio (rule-based) detects the entity types in ``ENT_TYPES`` and
       the anonymizer rewrites them in the text.
    2. ``mask_names_hf`` (model-based) replaces person names with
       ``<PERSON>``.

    The second pass rebuilds the text from tokens, which can leave the
    placeholders from the first pass split up and lower-cased (for example
    ``< phone _ number >``), so they are normalised back to ``<ENTITY>``
    form afterwards.

    Args:
        text: Raw input string.
        min_len: Currently unused; kept so existing callers keep working.

    Returns:
        The text with detected PII replaced by placeholder tokens.
    """
    # Find and replace other stuff (Presidio NER)
    findings = analyzer.analyze(text, language='en', entities=ENT_TYPES)
    masked = anonymizer.anonymize(text, analyzer_results=findings).text

    # Find and replace names (HF NER)
    masked = mask_names_hf(masked)

    # Normalise only known placeholders. A generic '<.+?>' match would also
    # rewrite ordinary user text such as "a < b and c > d".
    masked = _normalize_placeholders(masked, [*ENT_TYPES, 'PERSON'])

    # Collapse any run of adjacent person placeholders, not just pairs.
    return re.sub(r'(?:<PERSON>){2,}', '<PERSON>', masked)
# Text shown at the top of the Gradio page.
title = "PII Masking"
description = """
In many applications, personally identifiable information (PII) is easy to remove from databases since a column may contain specific PII.
Common techniques like hashing also allow the identity of these values to be preserved without exposing the contents of the value.
However, it can be less straightforward to remove from unstructured text data, where PII may or may not be present.
Further, text may contain multiple types of PII that present an increased risk of exposure when coupled together.
For example, a name and IP address together may be used to pinpoint a specific person's location.
Hashing the data outright is not an option since consumers of these data often prefer to work with the raw text data.
Thus, preserving privacy in raw text data remains a challenge.
This space applies both rule-based and ML-based approaches to remove names, phone numbers, emails, and IP addresses from raw text.
This app accepts raw text and returns the same text, but with PII replaced with special tokens that preserve some characteristics of the masked entities without revealing their contents.
"""

# Sample inputs offered as one-click examples in the UI.
example_inputs = [
    "Hi, my name is Mike and my phone number is 1-234-567-9000",
    "Hi, my name is Mike and my email address is my_name@my_domain.com",
    "Hi, my name is Mike and my IP address is 127.0.0.1",
    # "Hi, my name is Mike and my credit card is 1200 3859 8281 0593"
]

# Text-in / text-out interface around anonymize(); launched at import time.
demo = gr.Interface(
    fn=anonymize,
    inputs='text',
    outputs='text',
    title=title,
    description=description,
    examples=example_inputs,
)
demo.launch(debug=True)