import json
from collections.abc import Mapping

#from utils.emails_ip_addresses_detection import detect_email_addresses
from privacy.util.code_detect.utils.emails_ip_addresses_detection import detect_email_addresses
from privacy.util.code_detect.utils.keys_detection import detect_keys
def postprocess_secrets(secrets):
    """Serialize detected secrets to JSON and report whether any were found.

    Args:
        secrets: JSON-serializable collection of detections. Any falsy value
            (empty list, ``None``) is treated as "nothing found".

    Returns:
        A ``(matches, has_secrets)`` tuple. ``matches`` is the JSON string of
        ``secrets`` (``"[]"`` when nothing was found) and ``has_secrets`` is
        the matching boolean flag.
    """
    has_secrets = bool(secrets)
    # Falsy inputs are normalized to an empty JSON list so the stored value
    # is always a JSON array string.
    matches = json.dumps(secrets if has_secrets else [])
    return matches, has_secrets
# Detection modified for file input: scan_pii_batch accepts a sequence of
# records, each carrying its text under the "content" key.
def scan_pii_batch(examples, key_detector="other"):
    """Scan a batch of examples to detect secrets/PII.

    Args:
        examples: Either a sequence of records, each holding its text under
            the ``"content"`` key, or a columnar batch (a mapping whose
            ``"content"`` entry is the list of texts). A mapping whose
            ``"content"`` is a single string is treated as a batch of one.
        key_detector: With ``"regex"``, keys, emails and IP addresses are all
            detected by ``detect_email_addresses``. With any other value that
            function only handles emails and IP addresses, and keys are
            detected by ``detect_keys``.

    Returns:
        A dict of three parallel lists with one entry per example:
        - secrets: (str) JSON-encoded list of the secrets/PII found
        - has_secrets: (bool) whether the example contains secrets/PII
        - number_secrets: (int) number of secrets/PII found
    """
    if isinstance(examples, Mapping):
        # Columnar batch: iterating the mapping itself would yield its keys,
        # so read the texts from the "content" column instead.
        texts = examples["content"]
        if isinstance(texts, str):
            # A lone record; avoid scanning it character by character.
            texts = [texts]
    else:
        texts = (example["content"] for example in examples)

    regex_only = key_detector == "regex"
    list_secrets = []
    list_has_secrets = []
    number_secrets = []
    for text in texts:
        if regex_only:
            # use a regex to detect keys + emails + ips
            secrets = list(
                detect_email_addresses(
                    text, tag_types={"KEY", "EMAIL", "IP_ADDRESS"}
                )
            )
        else:
            # detect emails and ip addresses with regexes
            secrets = list(
                detect_email_addresses(text, tag_types={"EMAIL", "IP_ADDRESS"})
            )
            # for keys use the dedicated key detector
            secrets.extend(detect_keys(text))
        # to add this as new columns to datasets we need the same number of
        # samples in each row, so secrets are saved as json strings instead
        # of lists
        matches, has_secrets = postprocess_secrets(secrets)
        list_secrets.append(matches)
        list_has_secrets.append(has_secrets)
        number_secrets.append(len(secrets))
    return {
        "secrets": list_secrets,
        "has_secrets": list_has_secrets,
        "number_secrets": number_secrets,
    }
# Earlier variant, kept for reference: it reads the texts from the columnar
# batch examples["content"].
# def scan_pii_batch(examples, key_detector="other"):
#     """Scan a batch of examples from a dataset to detect PII
#     This add two columns to the dataset:
#     - secrets: (list) of secrets/PII found
#     - has_secrets: (bool) whether the example contains secrets/PII
#     """
#     list_secrets = []
#     list_has_secrets = []
#     number_secrets = []
#     for text in examples["content"]:
#         secrets = []
#         if key_detector == "regex":
#             # use a regex to detect keys + emails + ips
#             secrets = secrets + detect_email_addresses(
#                 text, tag_types={"KEY", "EMAIL", "IP_ADDRESS"}
#             )
#         else:
#             # detect emails and ip addresses with regexes
#             secrets = secrets + detect_email_addresses(
#                 text, tag_types={"EMAIL", "IP_ADDRESS"}
#             )
#             # for keys use detect-secrets tool
#             secrets = secrets + detect_keys(text)
#         # to add this as new columns to datasets we need the same number of samples in each row
#         # we save secrets as json strings instead of lists
#         matches, has_secrets = postprocess_secrets(secrets)
#         list_secrets.append(matches)
#         list_has_secrets.append(has_secrets)
#         number_secrets.append(len(secrets))
#     return {
#         "secrets": list_secrets,
#         "has_secrets": list_has_secrets,
#         "number_secrets": number_secrets,
#     }