| | import pandas as pd |
| | from transformers import BertTokenizer, AutoTokenizer |
| | import json |
| | from tqdm import tqdm |
| | import argparse |
| |
|
def get_impressions_from_csv(path):
    """Load the report impressions from a CSV and normalise their whitespace.

    Args:
        path: Location of the CSV file; passed straight to ``pandas.read_csv``.
            The file must contain a ``Report Impression`` column.

    Returns:
        pandas.Series: The ``Report Impression`` column with surrounding
        whitespace stripped and every run of whitespace (newlines included)
        collapsed into a single space.

    Raises:
        KeyError: If the CSV has no ``Report Impression`` column.
    """
    df = pd.read_csv(path)
    imp = df['Report Impression']
    imp = imp.str.strip()
    # Raw string on purpose: '\s' inside a plain literal is an invalid escape
    # sequence. The pattern also matches '\n', so no separate newline pass is
    # needed before collapsing whitespace.
    imp = imp.replace(r'\s+', ' ', regex=True)
    imp = imp.str.strip()
    return imp
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
def tokenize(impressions, tokenizer, max_length=512):
    """Encode a single report impression into token ids.

    Args:
        impressions: The impression text (one string). ``None`` is treated
            like an empty report.
        tokenizer: Object providing ``tokenize``, ``encode_plus``,
            ``cls_token_id`` and ``sep_token_id`` (for example a Hugging Face
            ``BertTokenizer``).
        max_length: Upper bound on the number of ids kept for the report.
            Defaults to 512, the limit that was previously hard-coded.

    Returns:
        list[list[int]]: A one-element list holding the id sequence of the
        report. An empty report yields ``[[cls_token_id, sep_token_id]]``.

    Raises:
        ValueError: If ``max_length`` is smaller than 2, which would leave no
            room for the separator appended on truncation.
    """
    if max_length < 2:
        raise ValueError("max_length must be at least 2, got %r" % (max_length,))

    tokens = tokenizer.tokenize(impressions) if impressions is not None else []
    if not tokens:
        # Empty report: emit only the classification and separator ids.
        return [[tokenizer.cls_token_id, tokenizer.sep_token_id]]

    ids = tokenizer.encode_plus(tokens)['input_ids']
    if len(ids) > max_length:
        # Cut to the limit and re-append the separator so the truncated
        # sequence still ends with it.
        ids = ids[:max_length - 1] + [tokenizer.sep_token_id]
    return [ids]
| |
|
def load_list(path):
    """Read a JSON file and return its decoded content.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The object decoded by ``json.load`` (a list when the file holds a
        JSON array).
    """
    # Decode explicitly as UTF-8 instead of depending on the platform's
    # locale encoding.
    with open(path, 'r', encoding='utf-8') as filehandle:
        return json.load(filehandle)
| |
|
if __name__ == "__main__":
    # Command-line entry point: tokenize one impression sentence with the
    # 'bert-base-uncased' tokenizer and write the resulting ids as JSON.
    parser = argparse.ArgumentParser(description='Tokenize radiology report impressions and save as a list.')
    # NOTE(review): --data is still required but its value is not read below;
    # confirm whether it should be dropped now that the text comes from
    # --sentence.
    parser.add_argument('-d', '--data', type=str, nargs='?', required=True,
                        help='path to csv containing reports. The reports should be '
                             'under the "Report Impression" column')
    parser.add_argument('-o', '--output_path', type=str, nargs='?', required=True,
                        help='path to intended output file')
    # Backslashes are escaped so the help text shows a literal \n and \s
    # rather than a real newline and an invalid escape sequence.
    parser.add_argument('-s', '--sentence', type=str, nargs='?', required=True,
                        help="A sentence containing an impression in which '\\n' and "
                             "'\\s' have been replaced with ' '")
    args = parser.parse_args()
    out_path = args.output_path
    impressions = args.sentence

    # nargs='?' lets a flag be given without a value, in which case argparse
    # stores None; reject that here, before the tokenizer is loaded.
    if out_path is None:
        parser.error('-o/--output_path requires a value')
    if impressions is None:
        parser.error('-s/--sentence requires a value')

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    new_impressions = tokenize(impressions, tokenizer)
    with open(out_path, 'w', encoding='utf-8') as filehandle:
        json.dump(new_impressions, filehandle)
| |
|