| ## Regex-based PII detection and redaction for a code FILE (command-line version, currently commented out) | |
| # import argparse | |
| # import json | |
| # import logging | |
| # import random | |
| # from pii_detection import scan_pii_batch | |
| # from pii_redaction import redact_pii_batch, random_replacements | |
| # def parse_args(): | |
| # parser = argparse.ArgumentParser(description="PII detection and redaction for a code file") | |
| # parser.add_argument( | |
| # "--input_code_file", | |
| # required=True, | |
| # type=str, | |
| # help="Path to the input code file for PII detection and redaction", | |
| # ) | |
| # parser.add_argument( | |
| # "--output_file", | |
| # required=True, | |
| # type=str, | |
| # help="Path to save the redacted code file", | |
| # ) | |
| # parser.add_argument( | |
| # "--batch_size", | |
| # default=8, | |
| # type=int, | |
| # help="Batch size for the PII detection/redaction", | |
| # ) | |
| # parser.add_argument( | |
| # "--seed", | |
| # default=0, | |
| # type=int, | |
| # help="Seed for random", | |
| # ) | |
| # parser.add_argument( | |
| # "--num_proc", | |
| # default=8, | |
| # type=int, | |
| # help="Number of processes to use for PII detection/redaction", | |
| # ) | |
| # parser.add_argument( | |
| # "--no_redaction", | |
| # action="store_true", | |
| # help="If set, do not perform redaction", | |
| # ) | |
| # parser.add_argument( | |
| # "--load_replacements", | |
| # default=True, | |
| # help="If set, load replacements from file replacements.json", | |
| # ) | |
| # parser.add_argument( | |
| # "--add_reference_text", | |
| # default=True, | |
| # type=bool, | |
| # help="If True, add reference text with PII between delimiters in the redacted text (used for visualization)", | |
| # ) | |
| # return parser.parse_args() | |
| # def main(): | |
| # logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", level=logging.INFO) | |
| # args = parse_args() | |
| # # Read input code file | |
| # with open(args.input_code_file, "r") as input_file: | |
| # code_content = input_file.read() | |
| # # Apply PII detection | |
| # ds_pii = scan_pii_batch([{"content": code_content}]) | |
| # logging.info(f"PII detection results:\n{ds_pii}") | |
| # logging.info(f"Number of samples that contained PII: {sum(ds_pii['has_secrets'])}") | |
| # logging.info(f"Total number of secrets found: {sum(ds_pii['number_secrets'])}") | |
| # # Redact PII in the code | |
| # if not args.no_redaction: | |
| # logging.info(f" ===== Applying PII redaction =====") | |
| # random.seed(args.seed) | |
| # # Use random replacements by default | |
| # if args.load_replacements: | |
| # with open("replacements.json", "r") as f: | |
| # replacements = json.load(f) | |
| # else: | |
| # replacements = random_replacements() | |
| # with open("random_replacements.json", "w") as f: | |
| # json.dump(replacements, f) | |
| # logging.info(f"Using the following replacements:\n{replacements}") | |
| # ds_pii_redacted = redact_pii_batch( | |
| # [{"content": code_content, "secrets": ds_pii['secrets'], "has_secrets": ds_pii['has_secrets'], "number_secrets": ds_pii['number_secrets']}], | |
| # replacements=replacements, | |
| # add_references=args.add_reference_text | |
| # ) | |
| # redacted_code = ds_pii_redacted["new_content"][0] # Access the redacted code | |
| # print("Redacted Code:") | |
| # print(redacted_code) | |
| # # Save the redacted code to the output file | |
| # with open(args.output_file, "w") as output_file: | |
| # output_file.write(redacted_code[0] if isinstance(redacted_code, list) else redacted_code) | |
| # logging.info("Redacted code saved successfully.") | |
| # if __name__ == "__main__": | |
| # main() | |
| # Regex-based PII detection and redaction for in-memory TEXT | |
| import json | |
| import logging | |
| import random | |
| import os | |
| from privacy.util.code_detect.pii_detection import scan_pii_batch | |
| from privacy.util.code_detect.pii_redaction import redact_pii_batch, random_replacements | |
class code_detect:
    """Regex-based PII detection and redaction for a single piece of code text."""

    # Location of the replacement tokens, relative to the process working
    # directory. Both the "load" and the "generate" paths use this one constant
    # so that freshly generated replacements land where a later load reads them.
    _REPLACEMENTS_PATH = "privacy/util/code_detect/replacements.json"

    @staticmethod
    def codeDetectRegex(
        input_code_text,
        *,
        seed=0,
        no_redaction=False,
        load_replacements=True,
        add_reference_text=True,
    ):
        """Detect PII in ``input_code_text`` and return the redacted text.

        The text is wrapped in a one-element batch, scanned with
        ``scan_pii_batch`` and, unless ``no_redaction`` is set, rewritten with
        ``redact_pii_batch``.

        Args:
            input_code_text: The code/text to scan.
            seed: Seed applied to the global ``random`` module before
                redaction (this reseeds process-wide random state).
            no_redaction: If True, only run detection and return
                ``input_code_text`` unchanged.
            load_replacements: If True, read the replacement tokens from
                ``replacements.json``; otherwise generate them with
                ``random_replacements()`` and write them to that file.
            add_reference_text: Forwarded to ``redact_pii_batch`` as
                ``add_references``.

        Returns:
            The first entry of the ``"new_content"`` field produced by
            ``redact_pii_batch``, or ``input_code_text`` when ``no_redaction``
            is True.

        Raises:
            OSError: If the replacements file cannot be read or written.
            json.JSONDecodeError: If the replacements file is not valid JSON.
        """
        # basicConfig only has an effect the first time the root logger is
        # configured; subsequent calls are no-ops.
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
            level=logging.INFO,
        )
        print("input_code_text", input_code_text)

        # Apply PII detection on a one-element batch.
        ds_pii = scan_pii_batch([{"content": input_code_text}])
        logging.info(f"PII detection results:\n{ds_pii}")
        logging.info(f"Number of samples that contained PII: {sum(ds_pii['has_secrets'])}")
        logging.info(f"Total number of secrets found: {sum(ds_pii['number_secrets'])}")

        if no_redaction:
            # Detection-only mode: nothing was rewritten, so hand the input
            # back instead of falling through to an unbound result variable.
            return input_code_text

        logging.info(" ===== Applying PII redaction =====")
        random.seed(seed)
        replacements = code_detect._get_replacements(load_replacements)
        logging.info(f"Using the following replacements:\n{replacements}")

        ds_pii_redacted = redact_pii_batch(
            [
                {
                    "content": input_code_text,
                    "secrets": ds_pii["secrets"],
                    "has_secrets": ds_pii["has_secrets"],
                    "number_secrets": ds_pii["number_secrets"],
                }
            ],
            replacements=replacements,
            add_references=add_reference_text,
        )
        redacted_code = ds_pii_redacted["new_content"][0]  # Access the redacted code
        print("Redacted Code:")
        print(redacted_code)
        # Nothing is written to disk here, so the message reports completion
        # rather than claiming the result was saved.
        logging.info("Redaction completed successfully.")
        return redacted_code

    @staticmethod
    def _get_replacements(load_replacements):
        """Return the replacement tokens, loading or generating them."""
        path = code_detect._REPLACEMENTS_PATH
        if load_replacements:
            with open(path, "r", encoding="utf-8") as f:
                return json.load(f)

        # Generate fresh replacements and persist them to the same file the
        # load branch reads. Previously the path was joined onto this module's
        # directory, which pointed at a different location than the one that
        # is loaded from.
        replacements = random_replacements()
        logging.info(f"Writing generated replacements to {path}")
        with open(path, "w", encoding="utf-8") as f:
            json.dump(replacements, f)
        return replacements