Create MSI_issue_check.py
Browse files- MSI_issue_check.py +40 -0
MSI_issue_check.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from glob import glob
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
def dicts_to_dataframe(json_data: dict) -> pd.DataFrame:
    """Flatten the biomarker details of one patient-level JSON into a table.

    One row is produced per entry of every biomarker's ``attribute`` list.
    Each row carries the biomarker's ``prediction`` under ``'BM'`` plus one
    column per ``attribute_name`` (including ``'pif_key'``) holding the
    matching ``attribute_prediction``.

    Args:
        json_data: Parsed JSON document; must contain
            ``json_data['patient_level']['biomarkers']['details']``.

    Returns:
        A DataFrame with a fresh 0..n-1 index, columns in order of first
        appearance, and missing values replaced by ``''``. When there are no
        attribute entries the result is an empty DataFrame with no columns.

    Raises:
        KeyError: If any of the expected keys is missing from the document.
    """
    rows = []
    for biomarker in json_data['patient_level']['biomarkers']['details']:
        label = biomarker['prediction']
        for attribute in biomarker['attribute']:
            row = {}
            for detail in attribute['attribute_details']:
                # Assigned inside the loop on purpose: an attribute entry
                # without details still yields a completely empty row, and
                # a detail literally named 'BM' is handled as before.
                row['BM'] = label
                row[detail['attribute_name']] = detail['attribute_prediction']
            rows.append(row)

    # Build the frame in a single pass; concatenating one row at a time
    # re-copies the accumulated data on every iteration.
    return pd.DataFrame(rows).fillna('')
|
| 28 |
+
|
| 29 |
+
# Collect the MSI biomarker rows from every patient-level JSON whose
# base prediction lists 'msi'.
msi_list = []
for json_f in glob('/nlp_efs/pat_level_json_delivery_lungca_2024.tar.gz/*.json'):
    # The profile key is the file name up to its first dot.
    filename = json_f.split('/')[-1]
    profile = filename.split('.')[0]
    with open(json_f, encoding='utf-8') as of:
        json_data = json.load(of)

    # base_prediction is a ';'-separated list of biomarker tags.
    tmp = json_data['patient_level']['biomarkers']['base_prediction']
    if 'msi' not in tmp.split(';'):
        continue

    kdf = dicts_to_dataframe(json_data)
    # With no detail rows the helper returns a frame without any columns;
    # indexing 'BM' on it would raise KeyError, so skip such files.
    if 'BM' not in kdf.columns:
        continue
    kdf.insert(0, 'profile_key', profile)
    # NOTE(review): the tag checked above is lowercase 'msi' while the row
    # label compared here is uppercase 'MSI' - confirm both match the data.
    msi_list.append(kdf[kdf['BM'] == 'MSI'])

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when no file contributed any rows.
msi_df = pd.concat(msi_list) if msi_list else pd.DataFrame()
|