jkushwaha
/

code

Model card Files Files and versions

code / MSI_issue_check.py

jkushwaha's picture

Update MSI_issue_check.py

c309c6f verified almost 2 years ago

history blame contribute delete

1.74 kB

	from glob import glob
	import pandas as pd
	import json
	import multiprocessing

	def dicts_to_dataframe(json_data):
	    """Flatten the biomarker details of one patient-level JSON into a table.

	    Walks ``json_data['patient_level']['biomarkers']['details']``. Every
	    entry of a detail's ``'attribute'`` list becomes one row: the ``'BM'``
	    column holds the detail's ``'prediction'`` and each item of
	    ``'attribute_details'`` contributes a column named after its
	    ``'attribute_name'`` holding its ``'attribute_prediction'``.

	    Args:
	        json_data: Parsed JSON document (nested dicts/lists) containing the
	            keys named above.

	    Returns:
	        A ``pandas.DataFrame`` with one row per attribute entry. Cells for
	        attributes a row does not have are filled with ``''``. When there
	        are no attribute entries at all, the frame is empty but still has
	        a ``'BM'`` column so callers can filter on it safely.

	    Raises:
	        KeyError: If any of the expected keys is missing.
	    """
	    rows = []
	    for detail in json_data['patient_level']['biomarkers']['details']:
	        biomarker = detail['prediction']
	        for attribute in detail['attribute']:
	            # Tag the row up front so it carries its biomarker even when
	            # the attribute entry has no details.
	            row = {'BM': biomarker}
	            for item in attribute['attribute_details']:
	                row[item['attribute_name']] = item['attribute_prediction']
	            rows.append(row)

	    if not rows:
	        return pd.DataFrame(columns=['BM'])

	    # Build the frame in one go: concatenating row by row re-copies the
	    # accumulated data on every iteration.
	    return pd.DataFrame(rows).fillna('')

	def process_json_file(json_file):
	    """Collect the MSI biomarker rows of one patient-level JSON file.

	    The file is only inspected further when ``'msi'`` is one of the
	    ``';'``-separated tokens of
	    ``json_data['patient_level']['biomarkers']['base_prediction']``.

	    Args:
	        json_file: Path of the JSON file, using ``'/'`` as separator. The
	            part of the file name before its first ``'.'`` is used as the
	            profile key.

	    Returns:
	        A list holding at most one ``pandas.DataFrame``: the rows produced
	        by ``dicts_to_dataframe`` whose ``'BM'`` equals ``'MSI'``, with a
	        leading ``'profile_key'`` column. The list is empty when the file
	        is not flagged as ``msi`` or yields no biomarker rows.

	    Raises:
	        KeyError: If the expected JSON keys are missing.
	    """
	    # The profile key is the file name up to its first dot.
	    profile = json_file.split('/')[-1].split('.')[0]

	    # Binary mode lets ``json`` detect the UTF-8/16/32 encoding itself
	    # instead of relying on the platform's default text encoding.
	    with open(json_file, 'rb') as handle:
	        json_data = json.load(handle)

	    base_prediction = json_data['patient_level']['biomarkers']['base_prediction']
	    if 'msi' not in base_prediction.split(';'):
	        return []

	    kdf = dicts_to_dataframe(json_data)
	    if 'BM' not in kdf.columns:
	        # No biomarker rows were produced, so there is nothing to filter;
	        # indexing kdf['BM'] here would raise KeyError and abort the run.
	        return []

	    kdf.insert(0, 'profile_key', profile)
	    return [kdf[kdf['BM'] == 'MSI']]

	def main(
	    json_glob='/nlp_efs/pat_level_json_delivery_lungca_2024.tar.gz/*.json',
	    processes=10,
	):
	    """Scan patient-level JSON files in parallel and print their MSI rows.

	    Args:
	        json_glob: Glob pattern selecting the JSON files to process.
	        processes: Number of worker processes in the pool.

	    Returns:
	        None. The combined MSI table is printed; when no file matches the
	        pattern, or no file contributes MSI rows, a short message is
	        printed instead.
	    """
	    json_files = glob(json_glob)
	    if not json_files:
	        print(f'No JSON files matched {json_glob!r}')
	        return

	    # The context manager tears the workers down even if a worker raises.
	    with multiprocessing.Pool(processes=processes) as pool:
	        results = pool.map(process_json_file, json_files)

	    frames = [frame for per_file in results for frame in per_file]
	    if not frames:
	        # pd.concat raises ValueError when given an empty list.
	        print(f'No MSI rows found in {len(json_files)} file(s)')
	        return

	    print(pd.concat(frames))

	# Run only when executed as a script: multiprocessing workers may re-import
	# this module, and the guard keeps them from re-entering main().
	if __name__ == "__main__":
	    main()