import json
import multiprocessing
import os
from glob import glob

import pandas as pd
|
|
def dicts_to_dataframe(json_data: dict) -> pd.DataFrame:
    """Flatten the biomarker attribute records of *json_data* into a DataFrame.

    Walks ``json_data['patient_level']['biomarkers']['details']``. Each entry
    of a biomarker's ``'attribute'`` list becomes one row. The row's ``'BM'``
    column holds the biomarker's ``'prediction'`` value, and every item of the
    entry's ``'attribute_details'`` contributes one column named after its
    ``'attribute_name'`` and filled with its ``'attribute_prediction'``.

    Args:
        json_data: Parsed JSON document containing the nested keys above.

    Returns:
        A DataFrame with one row per attribute entry. Cells for attributes
        that a given row does not define are filled with ``''``.

    Raises:
        KeyError: If any of the expected keys is missing.
    """
    rows = []
    for biomarker in json_data['patient_level']['biomarkers']['details']:
        label = biomarker['prediction']
        for attribute in biomarker['attribute']:
            row = {}
            for detail in attribute['attribute_details']:
                # 'BM' is (re)assigned per detail so that an entry with no
                # details yields a row without a 'BM' value, and so that 'BM'
                # is always the first column of a populated row.
                row['BM'] = label
                row[detail['attribute_name']] = detail['attribute_prediction']
            rows.append(row)

    # Build the frame in a single call; concatenating row by row copies the
    # accumulated frame on every iteration.
    return pd.DataFrame(rows).fillna('')
|
|
def process_json_file(json_file):
    """Extract the MSI rows from one patient-level JSON file.

    The profile key is the file's base name up to its first ``'.'``. The file
    is only processed further when ``'msi'`` is one of the ``';'``-separated
    tokens of ``json_data['patient_level']['biomarkers']['base_prediction']``.

    Args:
        json_file: Path to the JSON file to read.

    Returns:
        A list holding at most one DataFrame: the rows produced by
        ``dicts_to_dataframe`` whose ``'BM'`` equals ``'MSI'``, with a
        ``'profile_key'`` column inserted first. The list is empty when the
        file does not list ``'msi'`` or when no ``'BM'`` column was produced.

    Raises:
        KeyError: If the expected nested keys are missing from the JSON.
    """
    # os.path.basename honours the platform's path separator, unlike
    # splitting on '/'.
    profile = os.path.basename(json_file).split('.')[0]

    # JSON text is UTF-8; do not depend on the locale's default encoding.
    with open(json_file, encoding='utf-8') as handle:
        json_data = json.load(handle)

    base_prediction = json_data['patient_level']['biomarkers']['base_prediction']
    if 'msi' not in base_prediction.split(';'):
        return []

    kdf = dicts_to_dataframe(json_data)
    if 'BM' not in kdf.columns:
        # No attribute details were extracted, so there is nothing to filter.
        return []

    kdf.insert(0, 'profile_key', profile)
    return [kdf[kdf['BM'] == 'MSI']]
|
|
def main(pattern='/nlp_efs/pat_level_json_delivery_lungca_2024.tar.gz/*.json',
         processes=10):
    """Collect the MSI rows from every matching JSON file and print them.

    Files matching *pattern* are handed to ``process_json_file`` through a
    ``multiprocessing.Pool``; the DataFrames returned for all files are
    concatenated and the result is printed.

    Args:
        pattern: Glob pattern selecting the JSON files to process.
        processes: Number of worker processes in the pool.
    """
    json_files = glob(pattern)

    results = []
    if json_files:
        # The context manager releases the workers even when map() raises.
        with multiprocessing.Pool(processes=processes) as pool:
            results = pool.map(process_json_file, json_files)

    frames = [df for sublist in results for df in sublist]
    # pd.concat raises ValueError on an empty list, so fall back to an empty
    # frame when no file matched or none contained MSI rows.
    msi_df = pd.concat(frames) if frames else pd.DataFrame()
    print(msi_df)
|
|
# Run the pipeline only when this file is executed as a script; the guard is
# also what lets multiprocessing workers import this module without
# re-running main().
if __name__ == "__main__":
    main()
|
|