File size: 1,735 Bytes
2015edf c309c6f 2015edf c309c6f 2015edf c309c6f 2015edf c309c6f 2015edf c309c6f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 | from glob import glob
import pandas as pd
import json
import multiprocessing
def dicts_to_dataframe(json_data):
    """Flatten the biomarker details of one parsed JSON document into a table.

    One row is produced for every entry of each biomarker's ``attribute``
    list. Each item of that entry's ``attribute_details`` contributes a
    column named after its ``attribute_name`` holding its
    ``attribute_prediction``; the biomarker's ``prediction`` goes into the
    ``BM`` column.

    Args:
        json_data: Parsed JSON (nested dicts/lists) that contains
            ``json_data['patient_level']['biomarkers']['details']``.

    Returns:
        A ``pandas.DataFrame`` with one row per attribute entry, columns in
        order of first appearance, and missing values replaced by ``''``.
        The frame has no columns when there are no attribute entries.

    Raises:
        KeyError: If one of the expected keys is absent from ``json_data``.
    """
    rows = []
    for biomarker in json_data['patient_level']['biomarkers']['details']:
        bm = biomarker['prediction']
        for attribute in biomarker['attribute']:
            details = attribute['attribute_details']
            # An entry without any details stays a completely empty row
            # (no 'BM' value), so downstream filters on 'BM' skip it.
            row = {'BM': bm} if details else {}
            for detail in details:
                row[detail['attribute_name']] = detail['attribute_prediction']
            rows.append(row)

    # Build the frame in a single pass; concatenating one-row frames in a
    # loop re-copies the accumulated data on every iteration.
    return pd.DataFrame(rows).fillna('')
def process_json_file(json_file):
    """Extract the MSI rows from one patient-level JSON file.

    The profile key is the file's base name up to its first ``.``. The file
    is only flattened when ``'msi'`` is one of the ``;``-separated tokens of
    ``json_data['patient_level']['biomarkers']['base_prediction']``.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        A list holding at most one ``pandas.DataFrame``: the rows whose
        ``BM`` column equals ``'MSI'``, with ``profile_key`` inserted as the
        first column. The list is empty when the file is not flagged
        ``msi`` or when flattening produced no ``BM`` column at all.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an expected key is missing from the document.
    """
    # Function-scope import: the module header does not import ``os``.
    import os

    profile = os.path.basename(json_file).split('.')[0]

    # JSON text is UTF-8; do not depend on the locale's default encoding.
    with open(json_file, encoding='utf-8') as handle:
        json_data = json.load(handle)

    base_prediction = json_data['patient_level']['biomarkers']['base_prediction']
    if 'msi' not in base_prediction.split(';'):
        return []

    kdf = dicts_to_dataframe(json_data)
    # With no detail rows there is no 'BM' column to filter on; report
    # "nothing found" instead of raising KeyError inside a pool worker.
    if 'BM' not in kdf.columns:
        return []

    kdf.insert(0, 'profile_key', profile)
    return [kdf[kdf['BM'] == 'MSI']]
def main(pattern='/nlp_efs/pat_level_json_delivery_lungca_2024.tar.gz/*.json',
         processes=10):
    """Collect the MSI rows of every matching JSON file and print them.

    Args:
        pattern: Glob pattern selecting the JSON files to process.
        processes: Number of worker processes used to parse the files.

    Each file is handled by ``process_json_file`` in a worker pool; the
    per-file frames are concatenated and the result is printed. An empty
    ``DataFrame`` is printed when no file matches or no file yields a frame.
    """
    json_files = glob(pattern)

    results = []
    if json_files:
        # The context manager releases the workers even if a task raises;
        # ``map`` has already gathered every result when the block exits.
        with multiprocessing.Pool(processes=processes) as pool:
            results = pool.map(process_json_file, json_files)

    frames = [df for sublist in results for df in sublist]
    # pd.concat raises ValueError on an empty list, so fall back explicitly.
    msi_df = pd.concat(frames) if frames else pd.DataFrame()
    print(msi_df)
# Run the pipeline only when executed as a script. The guard also keeps
# multiprocessing workers that re-import this module (spawn start method)
# from launching main() again.
if __name__ == "__main__":
    main()
|