File size: 3,351 Bytes
0f8411f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import pandas as pd
from transformers import BertTokenizer, AutoTokenizer
import json
from tqdm import tqdm
import argparse

def get_impressions_from_csv(path):
        """Load the 'Report Impression' column from a CSV and normalize whitespace.

        Each impression has every run of whitespace (including newlines)
        collapsed to a single space, with leading/trailing whitespace removed.

        Args:
                path: path to a CSV file with a 'Report Impression' column.

        Returns:
                pandas.Series of cleaned impression strings.
        """
        df = pd.read_csv(path)
        imp = df['Report Impression']
        # r'\s+' matches newlines too, so one regex pass replaces the
        # previous '\n' substitution; raw string avoids the invalid
        # '\s' escape-sequence warning on modern Python.
        imp = imp.replace(r'\s+', ' ', regex=True)
        imp = imp.str.strip()
        return imp

# def tokenize(impressions, tokenizer):
#         new_impressions = []
#         print("\nTokenizing report impressions. All reports are cut off at 512 tokens.")
#         for i in tqdm(range(impressions.shape[0])):
#                 tokenized_imp = tokenizer.tokenize(impressions.iloc[i])
#                 if tokenized_imp: #not an empty report
#                         res = tokenizer.encode_plus(tokenized_imp)['input_ids']
#                         if len(res) > 512: #length exceeds maximum size
#                                 #print("report length bigger than 512")
#                                 res = res[:511] + [tokenizer.sep_token_id]
#                         new_impressions.append(res)
#                 else: #an empty report
#                         new_impressions.append([tokenizer.cls_token_id, tokenizer.sep_token_id]) 
#         return new_impressions

def tokenize(impressions, tokenizer, max_length=512):
        """Tokenize a single impression string into BERT input ids.

        Args:
                impressions: one report-impression string (may be empty).
                tokenizer: a HuggingFace-style tokenizer exposing
                        tokenize(), encode_plus(), cls_token_id and sep_token_id.
                max_length: maximum sequence length; longer encodings are
                        truncated and re-terminated with the SEP token
                        (default 512, the BERT limit).

        Returns:
                A list containing one list of input ids. An empty
                impression yields just [CLS, SEP].
        """
        new_impressions = []
        tokenized_imp = tokenizer.tokenize(impressions)
        if tokenized_imp:  # not an empty report
                res = tokenizer.encode_plus(tokenized_imp)['input_ids']
                if len(res) > max_length:  # length exceeds maximum size
                        # Keep max_length-1 ids, then close with SEP so the
                        # truncated sequence is still well-formed.
                        res = res[:max_length - 1] + [tokenizer.sep_token_id]
                new_impressions.append(res)
        else:  # an empty report
                new_impressions.append([tokenizer.cls_token_id, tokenizer.sep_token_id])
        return new_impressions

def load_list(path):
        """Deserialize and return the JSON list stored at *path*."""
        with open(path, 'r') as fp:
                return json.load(fp)

if __name__ == "__main__":
        # CLI entry point: tokenize a single impression sentence (-s) with
        # bert-base-uncased and dump the resulting id lists to a JSON file (-o).
        parser = argparse.ArgumentParser(description='Tokenize radiology report impressions and save as a list.')
        # NOTE(review): -d/--data is required but its value is never used below —
        # the CSV pipeline is commented out; consider making it optional.
        parser.add_argument('-d', '--data', type=str, nargs='?', required=True,
                            help='path to csv containing reports. The reports should be \
                            under the \"Report Impression\" column')
        parser.add_argument('-o', '--output_path', type=str, nargs='?', required=True,
                            help='path to intended output file')
        
        parser.add_argument('-s', '--sentence', type=str, nargs='?', required=True,
                            help="A sentence containing an impression which is replaced the '\n', and '\s' with ' '")
        args = parser.parse_args()
        csv_path = args.data          # currently unused (see NOTE above)
        out_path = args.output_path   # destination JSON file
        impressions = args.sentence   # raw sentence passed straight to tokenize()
        
        # Downloads/loads the pretrained vocab on first use.
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # impressions = get_impressions_from_csv(csv_path)
        new_impressions = tokenize(impressions, tokenizer)
        with open(out_path, 'w') as filehandle:
                json.dump(new_impressions, filehandle)