import json
import multiprocessing
import os
from glob import glob

import pandas as pd

def dicts_to_dataframe(json_data):
    """Flatten the biomarker attribute groups of one JSON document.

    Each entry of ``json_data['patient_level']['biomarkers']['details']``
    carries a ``prediction`` (stored in the ``BM`` column) and a list of
    ``attribute`` groups.  Every group becomes one row: the
    ``attribute_name`` values of its ``attribute_details`` are the column
    names and the matching ``attribute_prediction`` values are the cells.

    Args:
        json_data: Parsed JSON document (nested dicts/lists) with the
            layout described above.

    Returns:
        A ``pandas.DataFrame`` with one row per attribute group.  When at
        least one group has details, ``BM`` and ``pif_key`` are the first
        two columns and the remaining columns follow in the order their
        names are first seen.  Missing values are ``''``.  A group without
        a ``pif_key`` detail gets an empty ``pif_key``; a group with no
        details at all yields a completely blank row.  Duplicate rows are
        kept (no de-duplication is performed).

    Raises:
        KeyError: If one of the expected keys is absent from the document.
    """
    rows = []
    for entry in json_data['patient_level']['biomarkers']['details']:
        biomarker = entry['prediction']
        for group in entry['attribute']:
            row = {}
            details = group['attribute_details']
            if details:
                # Seed the two leading columns per row.  Keeping the key in
                # a variable shared across rows would fail when 'pif_key'
                # is not the first detail of the first group, and would
                # leak the previous group's key into a group without one.
                row['BM'] = biomarker
                row['pif_key'] = None
                for detail in details:
                    # A repeated attribute name keeps its last prediction.
                    row[detail['attribute_name']] = detail['attribute_prediction']
            rows.append(row)

    # Build the frame in one step; concatenating one-row frames in a loop
    # copies the accumulated data on every iteration.
    df = pd.DataFrame(rows)
    return df.fillna('')

def process_json_file(json_file):
    """Extract the MSI biomarker rows from one patient-level JSON file.

    The profile key is the file's base name up to its first ``.``.  The
    document is only flattened when ``msi`` is one of the ``;``-separated
    tokens of ``patient_level.biomarkers.base_prediction``.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        A list holding at most one DataFrame: the rows produced by
        ``dicts_to_dataframe`` whose ``BM`` equals ``'MSI'``, with
        ``profile_key`` inserted as the first column.  The list is empty
        when the file is not flagged as MSI or when flattening produced no
        ``BM`` column.

    Raises:
        KeyError: If the expected ``patient_level`` keys are missing.
        OSError, json.JSONDecodeError: If the file cannot be read or parsed.
    """
    profile = os.path.basename(json_file).split('.')[0]

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(json_file, encoding='utf-8') as handle:
        json_data = json.load(handle)

    base_prediction = json_data['patient_level']['biomarkers']['base_prediction']
    # NOTE(review): the flag token is lower-case 'msi' while the BM value
    # matched below is upper-case 'MSI' — presumably the two fields use
    # different casing; confirm against the data.
    if 'msi' not in base_prediction.split(';'):
        return []

    kdf = dicts_to_dataframe(json_data)
    if 'BM' not in kdf.columns:
        # Flagged as MSI but no attribute rows were produced; indexing
        # kdf['BM'] would raise KeyError and abort the whole batch.
        return []

    kdf.insert(0, 'profile_key', profile)
    return [kdf[kdf['BM'] == 'MSI']]

def main(json_glob='/nlp_efs/pat_level_json_delivery_lungca_2024.tar.gz/*.json',
         processes=10):
    """Collect the MSI rows of every matching JSON file and print them.

    Args:
        json_glob: Glob pattern selecting the JSON files to process.
        processes: Number of worker processes used to parse the files.

    Each file is handled by ``process_json_file`` in a process pool; the
    per-file frames are concatenated (original row indices are kept) and
    the combined frame is printed.  When no file matches, or no file
    yields MSI rows, an empty DataFrame is printed.
    """
    json_files = glob(json_glob)

    if json_files:
        # The context manager releases the pool even if a worker raises.
        with multiprocessing.Pool(processes=processes) as pool:
            results = pool.map(process_json_file, json_files)
    else:
        results = []

    frames = [frame for per_file in results for frame in per_file]
    # pd.concat raises ValueError on an empty list, so fall back explicitly.
    msi_df = pd.concat(frames) if frames else pd.DataFrame()
    print(msi_df)

# Run the extraction only when this file is executed as a script, not when
# it is imported.  main() starts a multiprocessing pool, and on platforms
# where worker processes re-import this module the guard keeps them from
# re-running main().
if __name__ == "__main__":
    main()