Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,822 Bytes
7968cb0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
import os
import json
import numpy as np
import torch.utils.data as data
class CASPDataset(data.Dataset):
def __init__(self, path = './', split='test'):
if not os.path.exists(path):
raise "no such file:{} !!!".format(path)
else:
with open(os.path.join(path,'casp15.jsonl')) as f:
lines = f.readlines()
# casp15_data = json.load(open(path+'casp15.json', 'r'))
alphabet='ACDEFGHIKLMNPQRSTVWY'
alphabet_set = set([a for a in alphabet])
self.data = []
for line in lines:
entry = json.loads(line)
seq = entry['seq']
for key, val in entry['coords'].items():
entry['coords'][key] = np.asarray(val)
bad_chars = set([s for s in seq]).difference(alphabet_set)
if len(bad_chars) == 0:
chain_length = len(entry['seq'])
chain_mask = np.ones(chain_length)
self.data.append({
'title':entry['name'],
'seq':entry['seq'],
'CA':entry['coords']['CA'],
'C':entry['coords']['C'],
'O':entry['coords']['O'],
'N':entry['coords']['N'],
'chain_mask': chain_mask,
'chain_encoding': 1*chain_mask,
'classification': entry['classification']
})
def __len__(self):
return len(self.data)
def get_item(self, index):
return self.data[index]
def __getitem__(self, index):
return self.data[index]
if __name__ == '__main__':
dataset = CASPDataset('/gaozhangyang/experiments/OpenCPD/data/casp15/')
for data in dataset:
print(data) |