Update MSI_issue_check.py
Browse files- MSI_issue_check.py +20 -6
MSI_issue_check.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
from glob import glob
|
| 2 |
import pandas as pd
|
| 3 |
import json
|
|
|
|
| 4 |
|
| 5 |
def dicts_to_dataframe(json_data):
|
| 6 |
dict_full = []
|
|
@@ -14,7 +15,7 @@ def dicts_to_dataframe(json_data):
|
|
| 14 |
pif_key = e3['attribute_prediction']
|
| 15 |
row_dict['pif_key'] = pif_key
|
| 16 |
if e3['attribute_name'] != 'pif_key':
|
| 17 |
-
row_dict[e3['attribute_name']]=e3['attribute_prediction']
|
| 18 |
dict_full.append(row_dict)
|
| 19 |
df = pd.DataFrame()
|
| 20 |
for d in dict_full:
|
|
@@ -26,15 +27,28 @@ def dicts_to_dataframe(json_data):
|
|
| 26 |
|
| 27 |
return df
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
filename =
|
| 32 |
profile = filename.split('.')[0]
|
| 33 |
-
with open(
|
| 34 |
json_data = json.load(of)
|
| 35 |
tmp = json_data['patient_level']['biomarkers']['base_prediction']
|
| 36 |
if 'msi' in tmp.split(';'):
|
| 37 |
kdf = dicts_to_dataframe(json_data)
|
| 38 |
kdf.insert(0, 'profile_key', profile)
|
| 39 |
msi_list.append(kdf[kdf['BM']=='MSI'])
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from glob import glob
|
| 2 |
import pandas as pd
|
| 3 |
import json
|
| 4 |
+
import multiprocessing
|
| 5 |
|
| 6 |
def dicts_to_dataframe(json_data):
|
| 7 |
dict_full = []
|
|
|
|
| 15 |
pif_key = e3['attribute_prediction']
|
| 16 |
row_dict['pif_key'] = pif_key
|
| 17 |
if e3['attribute_name'] != 'pif_key':
|
| 18 |
+
row_dict[e3['attribute_name']] = e3['attribute_prediction']
|
| 19 |
dict_full.append(row_dict)
|
| 20 |
df = pd.DataFrame()
|
| 21 |
for d in dict_full:
|
|
|
|
| 27 |
|
| 28 |
return df
|
| 29 |
|
| 30 |
+
def process_json_file(json_file):
    """Collect the MSI biomarker rows from one patient-level JSON file.

    The profile key is the file's base name up to its first ``.``.  The file
    is only expanded into a DataFrame (via ``dicts_to_dataframe``) when
    ``'msi'`` is one of the ``;``-separated entries of
    ``json_data['patient_level']['biomarkers']['base_prediction']``.

    Args:
        json_file: Path of the JSON file to inspect.

    Returns:
        A list holding at most one DataFrame: the rows whose ``BM`` column
        equals ``'MSI'``, with ``profile_key`` inserted as the first column.
        The list is empty when the base prediction is missing/empty or does
        not contain ``msi``.

    Raises:
        KeyError: If the ``patient_level`` / ``biomarkers`` /
            ``base_prediction`` keys are absent, or the expanded frame has
            no ``BM`` column.
    """
    # Function-scope import so the block does not depend on a file-level one.
    import os.path

    # basename() copes with the platform's path separator, unlike split('/').
    profile = os.path.basename(json_file).split('.')[0]

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(json_file, encoding='utf-8') as of:
        json_data = json.load(of)

    tmp = json_data['patient_level']['biomarkers']['base_prediction']
    # A null/empty prediction simply means "no MSI here"; it must not abort
    # the whole multiprocessing run with an AttributeError on ``.split``.
    if not tmp or 'msi' not in tmp.split(';'):
        return []

    kdf = dicts_to_dataframe(json_data)
    kdf.insert(0, 'profile_key', profile)
    return [kdf[kdf['BM'] == 'MSI']]
|
| 42 |
+
|
| 43 |
+
def main(json_glob='/nlp_efs/pat_level_json_delivery_lungca_2024.tar.gz/*.json',
         processes=10):
    """Scan JSON files in parallel and print the combined MSI rows.

    Args:
        json_glob: Glob pattern selecting the JSON files to process.
        processes: Number of worker processes in the pool.

    Every matching file is handed to ``process_json_file``; the per-file
    DataFrames are concatenated and printed.  When nothing matches, or no
    file yields MSI rows, an empty DataFrame is printed instead of letting
    ``pd.concat`` raise on an empty sequence.
    """
    # glob() returns files in arbitrary order; sort for reproducible output.
    json_files = sorted(glob(json_glob))

    results = []
    if json_files:  # do not spawn workers when there is nothing to do
        # The context manager releases the pool even if a worker raises.
        # map() blocks until every result is in, so tearing the pool down
        # on exit cannot lose work.
        with multiprocessing.Pool(processes=processes) as pool:
            results = pool.map(process_json_file, json_files)

    frames = [df for sublist in results for df in sublist]
    # pd.concat raises ValueError on an empty list, so fall back explicitly.
    msi_df = pd.concat(frames) if frames else pd.DataFrame()
    print(msi_df)
|
| 52 |
+
|
| 53 |
+
# Run the scan only when executed as a script. The guard also keeps
# multiprocessing workers that re-import this module from re-running main().
if __name__ == "__main__":
|
| 54 |
+
main()
|