"""Collect MSI biomarker rows from patient-level JSON files.

Each input file is parsed in a worker process; rows whose ``BM`` value is
``'MSI'`` are gathered into one DataFrame and printed.
"""

import json
import multiprocessing
import os
from glob import glob

import pandas as pd

# Defaults used when ``main()`` is called without arguments.
DEFAULT_JSON_GLOB = '/nlp_efs/pat_level_json_delivery_lungca_2024.tar.gz/*.json'
DEFAULT_PROCESSES = 10


def dicts_to_dataframe(json_data):
    """Flatten the biomarker details of one parsed JSON document.

    One row is produced per entry of every biomarker's ``attribute`` list.
    Each row carries the biomarker's ``prediction`` under ``'BM'`` plus one
    column per ``attribute_name`` holding its ``attribute_prediction``.

    Args:
        json_data: Parsed JSON document; must provide
            ``json_data['patient_level']['biomarkers']['details']``.

    Returns:
        A DataFrame with one row per attribute entry. Values missing for a
        row are filled with ``''``. The frame is empty when there are no
        attribute entries.

    Raises:
        KeyError: If an expected key is missing from ``json_data``.
    """
    rows = []
    for biomarker in json_data['patient_level']['biomarkers']['details']:
        bm = biomarker['prediction']
        for attribute in biomarker['attribute']:
            row = {}
            for detail in attribute['attribute_details']:
                # 'BM' is (re)assigned on every detail, so an entry with no
                # details yields a row without 'BM' (blank after fillna).
                row['BM'] = bm
                row[detail['attribute_name']] = detail['attribute_prediction']
            rows.append(row)

    # Build the frame in one step instead of concatenating one-row frames,
    # which copies the accumulated frame on every iteration.
    return pd.DataFrame(rows).fillna('')


def process_json_file(json_file):
    """Extract the MSI rows of a single JSON file.

    The profile key is the file's base name up to its first ``'.'``.

    Args:
        json_file: Path to a JSON file.

    Returns:
        A list holding one DataFrame (the rows whose ``BM`` equals ``'MSI'``,
        with ``profile_key`` inserted as the first column), or an empty list
        when ``'msi'`` is not one of the ``;``-separated ``base_prediction``
        entries or the document has no biomarker rows.
    """
    filename = os.path.basename(json_file)
    profile = filename.split('.')[0]

    with open(json_file, encoding='utf-8') as handle:
        json_data = json.load(handle)

    base_prediction = json_data['patient_level']['biomarkers']['base_prediction']
    if 'msi' not in base_prediction.split(';'):
        return []

    kdf = dicts_to_dataframe(json_data)
    if 'BM' not in kdf.columns:
        # No usable rows: filtering on 'BM' would raise KeyError and abort
        # the whole pool.map call in main().
        return []

    kdf.insert(0, 'profile_key', profile)
    return [kdf[kdf['BM'] == 'MSI']]


def main(json_glob=DEFAULT_JSON_GLOB, processes=DEFAULT_PROCESSES):
    """Process every matching JSON file in parallel and print the MSI rows.

    Args:
        json_glob: Glob pattern selecting the JSON files to read.
        processes: Number of worker processes in the pool.
    """
    json_files = glob(json_glob)

    if json_files:
        # The context manager releases the workers even if map() raises.
        with multiprocessing.Pool(processes=processes) as pool:
            results = pool.map(process_json_file, json_files)
    else:
        results = []

    frames = [df for sublist in results for df in sublist]
    # pd.concat raises ValueError on an empty list, so fall back to an
    # empty frame when nothing matched.
    msi_df = pd.concat(frames) if frames else pd.DataFrame()
    print(msi_df)


if __name__ == "__main__":
    main()