jkushwaha committed on
Commit
c309c6f
·
verified ·
1 Parent(s): 2015edf

Update MSI_issue_check.py

Browse files
Files changed (1) hide show
  1. MSI_issue_check.py +20 -6
MSI_issue_check.py CHANGED
@@ -1,6 +1,7 @@
1
  from glob import glob
2
  import pandas as pd
3
  import json
 
4
 
5
  def dicts_to_dataframe(json_data):
6
  dict_full = []
@@ -14,7 +15,7 @@ def dicts_to_dataframe(json_data):
14
  pif_key = e3['attribute_prediction']
15
  row_dict['pif_key'] = pif_key
16
  if e3['attribute_name'] != 'pif_key':
17
- row_dict[e3['attribute_name']]=e3['attribute_prediction']
18
  dict_full.append(row_dict)
19
  df = pd.DataFrame()
20
  for d in dict_full:
@@ -26,15 +27,28 @@ def dicts_to_dataframe(json_data):
26
 
27
  return df
28
 
29
- msi_list = []
30
- for json_f in glob('/nlp_efs/pat_level_json_delivery_lungca_2024.tar.gz/*.json'):
31
- filename = json_f.split('/')[-1]
32
  profile = filename.split('.')[0]
33
- with open(json_f) as of:
34
  json_data = json.load(of)
35
  tmp = json_data['patient_level']['biomarkers']['base_prediction']
36
  if 'msi' in tmp.split(';'):
37
  kdf = dicts_to_dataframe(json_data)
38
  kdf.insert(0, 'profile_key', profile)
39
  msi_list.append(kdf[kdf['BM']=='MSI'])
40
- msi_df = pd.concat(msi_list)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from glob import glob
2
  import pandas as pd
3
  import json
4
+ import multiprocessing
5
 
6
  def dicts_to_dataframe(json_data):
7
  dict_full = []
 
15
  pif_key = e3['attribute_prediction']
16
  row_dict['pif_key'] = pif_key
17
  if e3['attribute_name'] != 'pif_key':
18
+ row_dict[e3['attribute_name']] = e3['attribute_prediction']
19
  dict_full.append(row_dict)
20
  df = pd.DataFrame()
21
  for d in dict_full:
 
27
 
28
  return df
29
 
30
def process_json_file(json_file):
    """Extract the MSI rows from one patient-level JSON file.

    Args:
        json_file: Path to a JSON file. The text before the first ``.`` of
            its base name (the part after the last ``/``) is used as the
            profile key.

    Returns:
        A list holding at most one DataFrame: the rows of
        ``dicts_to_dataframe(json_data)`` whose ``BM`` column equals
        ``'MSI'``, with ``profile_key`` inserted as the first column.
        The list is empty when ``'msi'`` is not one of the ``;``-separated
        entries of ``patient_level.biomarkers.base_prediction``, or when the
        converted frame has no ``BM`` column.

    Raises:
        KeyError: If the ``patient_level`` / ``biomarkers`` /
            ``base_prediction`` keys are missing from the JSON document.
    """
    profile = json_file.split('/')[-1].split('.')[0]

    # JSON text is UTF-8 by specification; do not depend on the locale's
    # default encoding when reading it.
    with open(json_file, encoding='utf-8') as handle:
        json_data = json.load(handle)

    base_prediction = json_data['patient_level']['biomarkers']['base_prediction']
    if 'msi' not in base_prediction.split(';'):
        return []

    frame = dicts_to_dataframe(json_data)
    # dicts_to_dataframe builds on an empty pd.DataFrame(); if it produced no
    # rows there is no 'BM' column and the filter below would raise KeyError,
    # which would abort a whole batch run over many files.
    if 'BM' not in frame.columns:
        return []

    frame.insert(0, 'profile_key', profile)
    return [frame[frame['BM'] == 'MSI']]
42
+
43
def main():
    """Collect the MSI rows of every delivery JSON file and print them.

    Globs the delivery directory for ``*.json`` files, runs
    ``process_json_file`` on each one in a pool of 10 worker processes,
    concatenates the per-file frames and prints the result. When no file
    matches, or no file yields an MSI frame, an empty DataFrame is printed
    instead of letting ``pd.concat`` raise on an empty list.
    """
    json_files = glob('/nlp_efs/pat_level_json_delivery_lungca_2024.tar.gz/*.json')

    results = []
    if json_files:
        # The context manager releases the worker processes even when a
        # worker raises and pool.map re-raises in the parent.
        with multiprocessing.Pool(processes=10) as pool:
            results = pool.map(process_json_file, json_files)

    # Each worker returns a (possibly empty) list of frames; flatten them.
    frames = [df for sublist in results for df in sublist]

    # pd.concat raises ValueError on an empty sequence, so fall back to an
    # empty frame when there is nothing to combine.
    msi_df = pd.concat(frames) if frames else pd.DataFrame()
    print(msi_df)


if __name__ == "__main__":
    main()