Spaces:

Kiran5
/

Privacy

Build error

App Files Files Community

Privacy / src /privacy /util /code_detect /pii_detection.py

Kiran5

Track large files and images with Git LFS

54fa0c8 11 months ago

raw

history blame contribute delete

3.47 kB

	import json

	#from utils.emails_ip_addresses_detection import detect_email_addresses
	from privacy.util.code_detect.utils.emails_ip_addresses_detection import detect_email_addresses
	from privacy.util.code_detect.utils.keys_detection import detect_keys


	def postprocess_secrets(secrets):
	    """Serialize detected secrets to JSON and flag whether any were found.

	    Args:
	        secrets: Collection of detections for one example. Any falsy value
	            (empty list, ``None``, ...) is treated as "nothing found".

	    Returns:
	        A ``(matches, has_secrets)`` tuple where ``matches`` is a JSON string
	        (``"[]"`` when nothing was found) and ``has_secrets`` is a bool.
	    """
	    found_any = bool(secrets)
	    # Falsy inputs are normalised to an empty JSON array rather than dumped
	    # as-is, so the serialized value is always a JSON list when nothing is found.
	    serialized = json.dumps(secrets if found_any else [])
	    return serialized, found_any

	## DETECTION MODIFIED FOR FILE: this variant iterates over row dicts and reads example["content"]

	def scan_pii_batch(examples, key_detector="other"):
	    """Scan a batch of examples to detect PII and secrets.

	    Args:
	        examples: Iterable of mappings; each one must provide the text to
	            scan under the ``"content"`` key.
	        key_detector: ``"regex"`` detects keys, emails and IP addresses all
	            through ``detect_email_addresses``. Any other value detects
	            emails and IP addresses through ``detect_email_addresses`` and
	            keys through ``detect_keys``.

	    Returns:
	        A dict of three parallel lists, one entry per example:
	        - secrets: (str) JSON-encoded list of secrets/PII found
	        - has_secrets: (bool) whether the example contains secrets/PII
	        - number_secrets: (int) how many secrets/PII were found
	    """
	    serialized_column = []
	    flag_column = []
	    count_column = []

	    for record in examples:
	        content = record["content"]
	        regex_only = key_detector == "regex"

	        # A fresh tag set is built for every row, exactly as the detector
	        # received it before; in regex mode keys are matched by regex too.
	        wanted_tags = (
	            {"KEY", "EMAIL", "IP_ADDRESS"} if regex_only else {"EMAIL", "IP_ADDRESS"}
	        )
	        found = [] + detect_email_addresses(content, tag_types=wanted_tags)
	        if not regex_only:
	            # Keys come from the dedicated key detector in this mode.
	            found = found + detect_keys(content)

	        # Every output column needs exactly one value per input row, so the
	        # detections are stored as a JSON string instead of a nested list.
	        encoded, any_found = postprocess_secrets(found)
	        serialized_column.append(encoded)
	        flag_column.append(any_found)
	        count_column.append(len(found))

	    return {
	        "secrets": serialized_column,
	        "has_secrets": flag_column,
	        "number_secrets": count_column,
	    }
	# def scan_pii_batch(examples, key_detector="other"):
	# """Scan a batch of examples from a dataset to detect PII
	# This add two columns to the dataset:
	# - secrets: (list) of secrets/PII found
	# - has_secrets: (bool) whether the example contains secrets/PII
	# """
	# list_secrets = []
	# list_has_secrets = []
	# number_secrets = []
	# for text in examples["content"]:
	# secrets = []
	# if key_detector == "regex":
	# # use a regex to detect keys + emails + ips
	# secrets = secrets + detect_email_addresses(
	# text, tag_types={"KEY", "EMAIL", "IP_ADDRESS"}
	# )
	# else:
	# # detect emails and ip addresses with regexes
	# secrets = secrets + detect_email_addresses(
	# text, tag_types={"EMAIL", "IP_ADDRESS"}
	# )
	# # for keys use detect-secrets tool
	# secrets = secrets + detect_keys(text)
	# # to add this as new columns to datasets we need the same number of samples in each row
	# # we save secrets as json strings instead of lists
	# matches, has_secrets = postprocess_secrets(secrets)
	# list_secrets.append(matches)
	# list_has_secrets.append(has_secrets)
	# number_secrets.append(len(secrets))
	# return {
	# "secrets": list_secrets,
	# "has_secrets": list_has_secrets,
	# "number_secrets": number_secrets,
	# }