import os
import time
from dataclasses import dataclass, field
from typing import Optional

import datasets
import pandas as pd
import torch
from accelerate import Accelerator
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForTokenClassification, AutoTokenizer, HfArgumentParser

from utils import PiiNERPipeline
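
# Distributed PII NER inference: each accelerate process loads the token-classification
# model, runs its shard of the dataset through PiiNERPipeline, and writes the annotated
# batches to parquet files under out_path.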


@dataclass
class PipelineArgs:
    model_name: Optional[str] = field(default="./", metadata={"help": "the model name"})
    process_batch_size: int = field(default=10_000, metadata={"help": "files per worker"})
    batch_size: Optional[int] = field(default=1024, metadata={"help": "batch size"})
    dataset: Optional[str] = field(default="./", metadata={"help": "dataset"})
    subset: Optional[str] = field(default="data/python/", metadata={"help": "dataset subdirectory"})
    out_path: Optional[str] = field(default="./results/", metadata={"help": "path for output"})


def main():
    """launch code
    >>>> accelerate config
    >>>> accelerate launch ner_inference.py --process_batch_size=8 --out_path=processed_dataset
    """
    parser = HfArgumentParser(PipelineArgs)
    args = parser.parse_args()
    accelerator = Accelerator()
    # name the output directory after the last component of the subset path, e.g. "data/python/" -> "python"
    out_dir = f"{args.out_path}{args.subset.strip('/').split('/')[-1]}"
    if accelerator.is_main_process:
        os.makedirs(out_dir, exist_ok=True)

    dataset = load_dataset(args.dataset, data_dir=args.subset, use_auth_token=True, split="train", num_proc=12)
    # add a string id to every sample and replace missing star counts with 0
    dataset = dataset.map(
        lambda example, idx: {
            "id": f"{idx}",
            "max_stars_count": example["max_stars_count"] if example["max_stars_count"] is not None else 0,
        },
        with_indices=True, num_proc=12)

    # choose how many samples each DataLoader batch carries, based on the rough shard
    # each of the 8 processes will see (this overrides --process_batch_size)
    shard_size = len(dataset) / 8
    if shard_size > 1_000_000:
        process_batch_size = 200_000
    elif shard_size > 100_000:
        process_batch_size = 100_000
    else:
        process_batch_size = 10_000

    model = AutoModelForTokenClassification.from_pretrained(args.model_name, use_auth_token=True)
    id_to_label = model.config.id2label
    tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_auth_token=True)

    # keep only the columns the pipeline and the output files need
    columns = dataset.column_names
    dataset = dataset.remove_columns(
        [col for col in columns if col not in ["content", "id", "max_stars_repo_name", "max_stars_repo_path", "max_stars_count"]]
    )

    dataloader = DataLoader(dataset, batch_size=process_batch_size, shuffle=False, num_workers=4)
    model, dataloader = accelerator.prepare(model, dataloader)

    pipeline = PiiNERPipeline(
        model,
        tokenizer=tokenizer,
        batch_size=args.batch_size,
        window_size=512,
        device=accelerator.local_process_index,
        num_workers=1,
        use_auth_token=True,
        id_to_label=id_to_label,
        window_overlap=False,
        bf16=True,
    )

    num_samples = 0
    for i, batch in enumerate(tqdm(dataloader)):
        # the last batch is padded by wrapping around to the first samples:
        # find where the ids stop increasing and drop the duplicated tail
        if i == len(dataloader) - 1 and int(batch["id"][0]) > int(batch["id"][-1]):
            for j in range(len(batch["id"]) - 1):
                if int(batch["id"][j]) > int(batch["id"][j + 1]):
                    stop_index = j + 1
            for key in batch:
                batch[key] = batch[key][:stop_index]

        result = list(pipeline(datasets.Dataset.from_dict(batch)))

        # copy the original columns back onto each pipeline output
        for k, element in enumerate(result):
            for key in batch:
                element[key] = batch[key][k]

        processed_dataset = Dataset.from_pandas(pd.DataFrame(result))
        processed_dataset.to_parquet(f"{out_dir}/job_{accelerator.process_index}_{i}.parquet")


if __name__ == "__main__":
    main()
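

# ---------------------------------------------------------------------------
# Standalone example: run the same NER pipeline on a single local code file,
# then redact the detected PII with the pii_redaction helpers imported below.
# ---------------------------------------------------------------------------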
import torch
from datasets import Dataset
from transformers import AutoModelForTokenClassification, AutoTokenizer

from privacy.util.code_detect.ner.ner_inference import PiiNERPipeline
from privacy.util.code_detect.ner.pii_redaction.utils import get_replacements, redact_pii_batch


def main():
    # Specify the path to your local model and input code file
    model_path = "pii_inference/nermodel"
    code_file_path = "input_code.java"

    # Load the model and tokenizer
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Create the NER pipeline
    pipeline = PiiNERPipeline(
        model,
        tokenizer=tokenizer,
        batch_size=1024,
        window_size=512,
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        num_workers=1,
        id_to_label=model.config.id2label,
        window_overlap=False,
        bf16=True,
    )

    # Read the input code file
    with open(code_file_path, "r") as file:
        code = file.read()

    # Naively split the code into "sentences" on ". " and give each chunk an id
    sentences = code.split(". ")
    print(sentences, "SENTENCES")
    ids = list(range(len(sentences)))

    # Create a Dataset object from the chunks
    dataset = Dataset.from_dict({"content": sentences, "id": ids})

    # Process the chunks with the NER pipeline
    result = pipeline(dataset)
    replacements = get_replacements()

    # Convert the generator to a list and print the results
    results = list(result)
    print(results, "RESULT")

    # Redact the PII from the results
    redacted_results = redact_pii_batch(results, replacements)
    print(redacted_results, "redacted_results")


if __name__ == "__main__":
    main()
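
# To try this example, point model_path at a local NER checkpoint (assumed above to live at
# pii_inference/nermodel) and place the file to scan (input_code.java) next to the script,
# then run it directly with python.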