# flexpert/data/scripts/prepare_dataset.py
# Honzus24's picture
# initial commit
# 7968cb0
from data_utils import parse_PDB, align_pdb_dict_formats
import os
import re
import json
import yaml
from tqdm import tqdm
# Build chain_set.jsonl: every "<name>_<chain>.pdb" file in the configured PDB
# directory is parsed, converted with align_pdb_dict_formats, and written out
# as one JSON object per line.

# Parse the config once and close the handle promptly; both 'pdb_dir' and
# 'preprocessed_dir' must be present in it.
with open('configs/data_config.yaml', 'r') as config_file:
    data_config = yaml.load(config_file, Loader=yaml.FullLoader)
in_dir = data_config['pdb_dir']
out_dir = data_config['preprocessed_dir']

# Raw string so that "\." is a regex escape, not an invalid string escape.
pdb_pattern = re.compile(r".*\.pdb$")

# os.listdir returns entries in arbitrary order; sorting makes the record
# order in the output file reproducible across runs and machines.
fold_files = sorted(
    filename for filename in os.listdir(in_dir) if pdb_pattern.match(filename)
)

fold_list = []
for filename in tqdm(fold_files):
    # File names are expected to look like "<name>_<chain>.pdb"; only the
    # first two underscore-separated fields are used.
    name_parts = filename.split('_')
    if len(name_parts) < 2:
        raise ValueError(
            f"Cannot derive name and chain from {filename!r}: "
            "expected a file name of the form '<name>_<chain>.pdb'"
        )
    _name = name_parts[0]
    _chain = name_parts[1].split('.')[0]
    _path = os.path.join(in_dir, filename)
    # parse_PDB's result is indexed with [0]; presumably it returns one entry
    # per parsed structure -- confirm against data_utils.
    old_pdb = parse_PDB(_path, name=_name, input_chain_list=[_chain])[0]
    new_pdb = align_pdb_dict_formats(old_pdb, _chain)
    fold_list.append(new_pdb)

# Make sure the destination exists so the write below cannot fail merely
# because the preprocessed directory has not been created yet.
os.makedirs(out_dir, exist_ok=True)
with open(os.path.join(out_dir, 'chain_set.jsonl'), 'w') as f:
    for record in fold_list:
        json.dump(record, f)
        f.write('\n')