# Builds a text embedding for every sample caption in the configured dataset
# mapping files and writes the enriched mappings to new JSON files.
import torch
from torch.utils.data import Dataset, DataLoader
import json
import SimpleITK as sitk
import numpy as np
from skimage.transform import rescale, resize, downscale_local_mean
# from torchvision.transforms import v2
import sys
from bert_helper import *
sys.path.append('./')
from Dataloader.dataloader_utils import *
import random



# Input mapping file per dataset. Only the entries that are not commented out
# are processed by the loop at the bottom of this script; the two OAI_ZIB
# variants read the same source file.
_OAI_ZIB_SOURCE = '/home/dn-zhen2/rds/rds-airr-p51-TWhPgQVLKbA/Data/Omini3D/DATASETS_processed/OAI_ZIB/nifti_mappings.json'

mapping_files = {
    # 'MSD': '/home/jachin/data/Github/data/data_gen_def/DATASETS_processed/MSD_processed/nifti_mappings_updated.json',
    # 'TotalSegmentor': '/home/data/Github/data/data_gen_def/DATASETS_processed/TotalSegmentorCT_MRI/nifti_mappings.json',
    # 'Kaggle_osic': '/home/jachin/data/Github/data/data_gen_def/DATASETS_processed/Kaggle_osic_new/nifti_mappings.json',
    # 'CancerImageArchive': '/home/data/Github/data/data_gen_def/DATASETS_processed/CancerImageArchive_test/nifti_mappings.json',
    # 'MnMs': '/home/jachin/data/Github/data/data_gen_def/DATASETS_processed/MnMs/nifti_mappings.json',
    # 'Brats2019': '/home/jachin/data/Github/data/data_gen_def/DATASETS_processed/BRATS/BRATS2019/nifti_mappings.json',
    # 'Brats2020': '/home/jachin/data/Github/data/data_gen_def/DATASETS_processed/BRATS/BRATS2020/nifti_mappings.json',
    # 'Brats2021': '/home/jachin/data/Github/data/data_gen_def/DATASETS_processed/BRATS/BRATS2021/nifti_mappings.json',
    # 'OASIS_1': '/home/data/Github/data/data_gen_def/DATASETS_processed/OASIS/OASIS_1/CS_SECTIONAL/nifti_mappings.json',
    # 'OASIS_2': '/home/data/Github/data/data_gen_def/DATASETS_processed/OASIS/OASIS_2/RAW_V2/nifti_mappings.json',
    'OAI_ZIB_KL': _OAI_ZIB_SOURCE,
    'OAI_ZIB_WOMAC': _OAI_ZIB_SOURCE,
    # 'PSMA-FDG-PET-CT-LESION':'/home/jachin/data/Github/data/data_gen_def/DATASETS_processed/PSMA/PSMA-FDG-PET-CT-LESION/V2/nifti_mappings.json',
    # 'PSMA-CT':'/home/jachin/data/Github/data/data_gen_def/DATASETS_processed/PSMA/Longitudinal-CT/nifti_mappings.json',
    # 'AbdomenAtlas':'/home/jachin/data/Github/data/data_gen_def/DATASETS_processed/AbdomenAtlas_v2/nifti_mappings.json',
    # 'AbdomenCT1k':'/home/jachin/data/Github/data/data_gen_def/DATASETS_processed/AbdomenCT1k/nifti_mappings.json',
}

# Output locations. Every file is named '<stem>_mappings.json' and lives in
# one of three 'nifty_mappings' directories.
_SAVE_DIR_DATA = '/home/data/Github/OmniMorph/Dataloader/nifty_mappings'
_SAVE_DIR_JACHIN = '/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings'
_SAVE_DIR_RDS = '/home/dn-zhen2/rds/rds-airr-p51-TWhPgQVLKbA/Code/OmniMorph/Dataloader/nifty_mappings'

save_paths = {
    name: f'{folder}/{stem}_mappings.json'
    for name, folder, stem in (
        ('MSD', _SAVE_DIR_DATA, 'MSD'),
        ('TotalSegmentor', _SAVE_DIR_DATA, 'TotalSegmentorCT_MRI'),
        ('Kaggle_osic', _SAVE_DIR_DATA, 'Kaggle_osic'),
        ('CancerImageArchive', _SAVE_DIR_DATA, 'CIA'),
        ('MnMs', _SAVE_DIR_JACHIN, 'MnMs'),
        ('Brats2019', _SAVE_DIR_JACHIN, 'Brats2019'),
        ('Brats2020', _SAVE_DIR_JACHIN, 'Brats2020'),
        ('Brats2021', _SAVE_DIR_JACHIN, 'Brats2021'),
        ('OASIS_1', _SAVE_DIR_JACHIN, 'OASIS_1'),
        ('OASIS_2', _SAVE_DIR_JACHIN, 'OASIS_2'),
        ('PSMA-FDG-PET-CT-LESION', _SAVE_DIR_JACHIN, 'PSMA-FDG-PET-CT-LESION'),
        ('PSMA-CT', _SAVE_DIR_JACHIN, 'PSMA-CT-Longitud'),
        ('AbdomenAtlas', _SAVE_DIR_JACHIN, 'AbdomenAtlas'),
        ('AbdomenCT1k', _SAVE_DIR_JACHIN, 'AbdomenCT1k'),
        ('OAI_ZIB_KL', _SAVE_DIR_RDS, 'OAI_ZIB_KL'),
        ('OAI_ZIB_WOMAC', _SAVE_DIR_RDS, 'OAI_ZIB_WOMAC'),
    )
}

# Metadata keys copied into the caption for each dataset. Each dataset gets
# its own list object even where the field names are shared.
_BRATS_FIELDS = ('Age', 'Grade', 'Survival', 'ResectionStatus')
_OASIS_FIELDS = ('ASF', 'Educ', 'SES', 'MMSE', 'eTIV', 'CDR', 'nWBV')

query = {
    'MSD': ['description'],
    'TotalSegmentor': ['age', 'gender'],
    'Kaggle_osic': ['Age', 'Sex', 'Smoke_Status', 'Weeks', 'FVC', 'Percent'],
    'CancerImageArchive': ['Series_Description', 'Study_Description', 'Manufacturer'],
    'MnMs': ['Age', 'Sex', 'Height', 'Weight'],
    'Brats2019': list(_BRATS_FIELDS),
    'Brats2020': list(_BRATS_FIELDS),
    'Brats2021': list(_BRATS_FIELDS),
    'OASIS_1': ['Age', 'M/F', *_OASIS_FIELDS],
    'OASIS_2': ['Age', 'Group', 'M/F', *_OASIS_FIELDS],
    'PSMA-FDG-PET-CT-LESION': [
        'Study Description', 'diagnosis', 'age', 'sex',
        "pet_radionuclide", 'ct_contrast_agent',
    ],
    'PSMA-CT': [],
    'AbdomenAtlas': [],
    'AbdomenCT1k': [],
    'OAI_ZIB_KL': ['Age', 'Gender', 'KL_Grade', 'BMI'],
    'OAI_ZIB_WOMAC': [
        'Age', 'Gender', 'WOMAC_Pain', 'WOMAC_ADL', 'WOMAC_Stiffness', 'BMI',
    ],
}

# Fixed text added to every caption of a dataset, keyed by caption field.
_BRATS_NOTE = 'could include brain tumor, glioma, glioblastoma, low grade glioma, high grade glioma'
_KNEE_NOTE = 'right knee osteoarthritis'

add_text = {
    'MSD': {},
    'TotalSegmentor': {},
    'Kaggle_osic': {'description': 'pulmonary fibrosis progression'},
    'CancerImageArchive': {},
    'MnMs': {},
    'Brats2019': {'description': _BRATS_NOTE},
    'Brats2020': {'description': _BRATS_NOTE},
    'Brats2021': {'description': _BRATS_NOTE},
    'OASIS_1': {},
    'OASIS_2': {},
    'PSMA-CT': {'description': 'melanoma patients'},
    'PSMA-FDG-PET-CT-LESION': {'description': 'malignant melanoma, lymphoma, lung cancer, or healthy'},
    'AbdomenAtlas': {},
    'AbdomenCT1k': {},
    'OAI_ZIB_KL': {'description': _KNEE_NOTE},
    'OAI_ZIB_WOMAC': {'description': _KNEE_NOTE},
}


# BERT initialization
# Path of the pretrained model handed to get_frozen_embeder() below.
model_name = '/rds/project/rds-TWhPgQVLKbA/Code/OmniMorph/External/Models/bert_large_uncased'
# Reduction mode forwarded to str2emb() for every caption.
# NOTE(review): presumably averages the token embeddings - confirm in str2emb.
reduce_method = 'mean'
# Upper bound on whitespace-separated words per caption: longer captions are
# rejected by the processing loop and the value is also passed to str2emb().
max_words_num = 32  # original note: must be > 2 (reason not visible in this file)
# Alternative limit kept for reference:
# max_words_num = 64  # max number of words in the caption > 2

# Build the embedder/tokenizer pair once; both are reused for every caption.
embeder, tokenizer = get_frozen_embeder(model_name)
def embed_str_filter(str_input, filter_words=('segmentation', 'registration')):
    """Remove every occurrence of each filter word from ``str_input``.

    Matching is a plain, case-sensitive substring replacement: a filter word
    embedded in a longer word is removed as well, and the whitespace that
    surrounded a removed word is left untouched.

    Args:
        str_input: Text to clean.
        filter_words: Iterable of substrings to delete. The default is an
            immutable tuple so it cannot be altered between calls.

    Returns:
        ``str_input`` with all filter words removed.
    """
    for word in filter_words:
        str_input = str_input.replace(word, '')
    return str_input

def _collect_caption_fields(entry, query_keys, extra_text):
    """Gather the fields that make up one sample's caption.

    Args:
        entry: One record of a mapping file; must provide the keys
            ``'Modality'``, ``'ROI'`` and ``'Metadata'``.
        query_keys: Metadata keys to copy into the caption. A key missing from
            the record's metadata is filled with ``'N/A'``.
        extra_text: Mapping of caption field to fixed text. The text is
            appended (comma separated) to a field that already exists,
            otherwise it becomes a new field.

    Returns:
        Dict of caption fields in insertion order.

    Raises:
        KeyError: If ``entry`` lacks ``'Modality'``, ``'ROI'`` or
            ``'Metadata'``.
    """
    fields = {'Modality': entry['Modality'], 'ROI': entry['ROI']}
    meta_data = entry['Metadata']
    for q in query_keys:
        fields[q] = meta_data[q] if q in meta_data else 'N/A'
    for q, text in extra_text.items():
        # Format rather than ``+=`` so a non-string metadata value (e.g. a
        # number) does not raise TypeError; the result for strings is the same.
        fields[q] = f'{fields[q]}, {text}' if q in fields else text
    return fields


# For every configured dataset: build a caption per sample, embed it, store the
# embedding and the caption text on the sample, and write the enriched mapping.
for dataset, jsn_path in mapping_files.items():
    # Resolve all per-dataset settings up front so a missing entry fails
    # before any embedding work is done instead of after it.
    new_jsn_path = save_paths[dataset]
    query_key = query[dataset]
    extra_text = add_text[dataset]

    # Close the input file as soon as it is parsed; it is not needed while
    # embedding or writing.
    with open(jsn_path, 'r') as f:
        embd_json = json.load(f)

    for entry in embd_json.values():
        embd_json_temp = _collect_caption_fields(entry, query_key, extra_text)

        # [1:-1] strips the surrounding braces of the dict repr.
        embd_str = str(embd_json_temp)[1:-1].lower()
        embd_str = replace_text(embd_str, get_synonyms_dict(None))
        embd_str = embed_str_filter(embd_str)

        words_num = len(embd_str.split())
        print(f'embd_json_temp: {str(embd_json_temp)}')
        print(f'embd_str: {embd_str}')
        print(f'words_num: {words_num}')
        # Explicit raise instead of ``assert`` so the limit is still enforced
        # when Python runs with -O.
        if words_num > max_words_num:
            raise ValueError(f'Too many words in the caption: {embd_str}')

        embd = str2emb(embd_str, max_words_num, embeder, tokenizer, reduce_method=reduce_method)
        print(embd)
        # Only the first row of the result is stored.
        # NOTE(review): assumes str2emb returns a batch of size one - confirm.
        entry['embd'] = embd.tolist()[0]
        entry['embd_key'] = embd_str

    with open(new_jsn_path, 'w') as f:
        json.dump(embd_json, f, indent=4)