Spaces:

mgfrantz
/

pii_masking

Runtime error

App Files Files Community

pii_masking / app.py

mgfrantz

Update app.py

8640308 over 3 years ago

raw

history blame contribute delete

4.12 kB

	from presidio_anonymizer import AnonymizerEngine
	from presidio_analyzer import AnalyzerEngine
	from presidio_anonymizer.entities import RecognizerResult, OperatorConfig

	from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
	import torch
	import re

	import gradio as gr

	# Initialize the Presidio engines: `analyzer` finds the entities and
	# `anonymizer` rewrites the text using the analyzer's results.
	analyzer = AnalyzerEngine()
	anonymizer = AnonymizerEngine()

	# Create the NER tokenizer, model and pipeline.
	tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER-uncased")
	tokenizer.add_tokens('<person>')
	model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER-uncased")
	# add_tokens() above may have grown the vocabulary, so the embedding matrix
	# has to grow with it. Without this, an input containing the added token is
	# mapped to an id past the end of the embedding table and the forward pass
	# fails with an index error.
	model.resize_token_embeddings(len(tokenizer))
	# NOTE: `pipe` is only referenced by the commented-out variant of
	# mask_names_hf; it is kept so that variant can be re-enabled as-is.
	pipe = pipeline(model=model, tokenizer=tokenizer, task='ner')

	# Entity types requested from the Presidio analyzer. Supported names are
	# listed at https://microsoft.github.io/presidio/supported_entities/
	# 'PERSON' is deliberately not requested here; names are handled by the
	# Hugging Face NER step instead.
	ENT_TYPES = [
	    "CREDIT_CARD",
	    "EMAIL_ADDRESS",
	    "IP_ADDRESS",
	    "PHONE_NUMBER",
	]

	def mask_names_hf(text):
	    """Replace person names in ``text`` with a ``<PERSON>`` placeholder.

	    The text is tokenized with the module-level ``tokenizer``, each token is
	    labelled by the module-level token-classification ``model``, and every
	    run of tokens labelled ``B-PER`` / ``I-PER`` is collapsed into a single
	    ``<PERSON>`` token. The remaining tokens are joined back into a string.

	    Args:
	        text: Raw input string.

	    Returns:
	        The string rebuilt from the tokenizer's tokens with person names
	        masked. Because it is rebuilt from tokens, it is not guaranteed to
	        reproduce the input's original casing or spacing.

	    Note:
	        ``truncation=True`` cuts input that exceeds the tokenizer's maximum
	        length; the cut-off part is absent from the returned string.
	    """
	    person_labels = ('B-PER', 'I-PER')

	    # Tokenize inputs
	    inputs = tokenizer(text, return_tensors='pt', truncation=True)
	    tokens = inputs.tokens()

	    # Make inferences. Gradients are never needed for prediction.
	    with torch.no_grad():
	        logits = model(**inputs).logits
	    label_ids = torch.argmax(logits, dim=2)[0].tolist()

	    # The first and last positions are the tokenizer's sequence delimiters.
	    # Drop them here, before filtering, so a delimiter that happens to be
	    # labelled as a person cannot shift which entries get removed.
	    words = []
	    for token, label_id in zip(tokens[1:-1], label_ids[1:-1]):
	        if model.config.id2label[label_id] not in person_labels:
	            words.append(token)
	        # Any person-labelled token starts (or continues) a mask. Treating
	        # I-PER like B-PER means a name whose first piece was labelled I-PER
	        # is still replaced by a placeholder instead of silently vanishing.
	        elif not words or words[-1] != '<PERSON>':
	            words.append('<PERSON>')

	    # Convert those tokens to a string
	    return tokenizer.convert_tokens_to_string(words)

	# def mask_names_hf(text):
	# outputs = pipe(text)
	# tokens = []
	# for token in outputs:
	# if 'PER' in token['entity']:
	# if tokens[-1] != '<PERSON>':
	# tokens.append('<PERSON>')
	# else:
	# tokens.append(token['word'])

	# t = tokenizer.convert_tokens_to_string(tokens)
	# return t

	def anonymize(text, min_len=3):
	    """Mask PII in ``text`` and return the masked string.

	    Processing order:
	      1. Presidio analyzes the text for the entity types in ``ENT_TYPES``
	         and the anonymizer rewrites the detected spans.
	      2. ``mask_names_hf`` masks person names.
	      3. Every ``<...>`` span in the result is upper-cased and stripped of
	         spaces, so placeholders that were split apart and lower-cased by
	         the tokenizer round trip are restored to a compact form.
	      4. Runs of adjacent ``<PERSON>`` placeholders are collapsed to one.

	    Args:
	        text: Raw input string. ``None``, empty and whitespace-only input
	            yield an empty string.
	        min_len: Currently unused; kept so existing callers that pass it
	            keep working.

	    Returns:
	        The masked text.
	    """
	    if not text or not text.strip():
	        return ''

	    # Find and replace other stuff (Presidio NER)
	    ents = analyzer.analyze(text, language='en', entities=ENT_TYPES)
	    results = anonymizer.anonymize(text, analyzer_results=ents)

	    # Find and replace names (HF NER)
	    masked = mask_names_hf(results.text)

	    # Normalize every <...> span in one pass.
	    # NOTE(review): this also rewrites any angle-bracketed text the user
	    # typed themselves, not only placeholders -- confirm that is acceptable.
	    masked = re.sub(
	        r'<.+?>',
	        lambda match: match.group(0).upper().replace(' ', ''),
	        masked,
	    )

	    # Collapse a run of any length; a single str.replace() pass would leave
	    # '<PERSON><PERSON>' behind for a run of three.
	    return re.sub(r'(?:<PERSON>){2,}', '<PERSON>', masked)

	# Passed to gr.Interface below as the `title` argument.
	title = "PII Masking"
	# Passed to gr.Interface below as the `description` argument. This is
	# user-facing text: it explains why PII is hard to strip from free text and
	# what this app masks.
	description = """
	In many applications, personally identifiable information (PII) is easy to remove from databases since a column may contain specific PII.
	Common techniques like hashing also allow the identity of these values to be preserved without exposing the contents of the value.

	However, it can be less straightforward to remove from unstructured text data, where PII may or may not be present.
	Further, text may contain multiple types of PII that present an increased risk of exposure when coupled together.
	For example, a name and IP address together may be used to pinpoint a specific person's location.
	Hashing the data outright is not an option since consumers of these data often prefer to work with the raw text data.
	Thus, preserving privacy in raw text data remains a challenge.

	This space applies both rule-based and ML-based approaches to remove names, phone numbers, emails, and IP addresses from raw text.
	This app accepts raw text and returns the same text, but with PII replaced with special tokens that preserve some characteristics of the masked entities without revealing their contents.
	"""

	# Sample inputs offered in the UI (the credit-card sample is disabled).
	example_inputs = [
	    "Hi, my name is Mike and my phone number is 1-234-567-9000",
	    "Hi, my name is Mike and my email address is my_name@my_domain.com",
	    "Hi, my name is Mike and my IP address is 127.0.0.1",
	    # "Hi, my name is Mike and my credit card is 1200 3859 8281 0593"
	]

	# Expose `anonymize` as a text-in / text-out Gradio interface and start it.
	demo = gr.Interface(
	    fn=anonymize,
	    inputs='text',
	    outputs='text',
	    title=title,
	    description=description,
	    examples=example_inputs,
	)
	demo.launch(debug=True)