ChromFound / src /data /tf_data_process.py
YifengJiao's picture
Upload folder using huggingface_hub
534e5a3 verified
class DataProcessorForPad:
    """Truncate or pad four parallel per-position sequences to a fixed length.

    Configuration is read from keyword arguments:

    * ``chromosome_vocab`` -- mapping from chromosome name to id; it must
      also contain the key ``"pad"``, whose id is used for padded slots.
      No default is supplied, so ``process`` fails if it is omitted.
    * ``max_length`` -- total output length (default ``1024``).
    * ``add_cls`` -- when true, one leading placeholder slot is inserted at
      index 0 of every output (default ``False``).
    * ``padding_value`` -- fill value for padded ``value_data`` slots
      (default ``-1``).
    """

    def __init__(self, **kwargs):
        self.chromosome_vocab = kwargs.get("chromosome_vocab")
        self.max_length = kwargs.get("max_length", 1024)
        self.add_cls = kwargs.get('add_cls', False)
        self.padding_value = kwargs.get('padding_value', -1)
        if self.add_cls:
            # Reserve one slot for the leading placeholder so the final
            # output length still equals the requested max_length.
            self.max_length -= 1

    def _fit(self, seq, fill):
        """Return a new list holding ``seq`` cut or padded to ``max_length``."""
        # Slicing copies, so the caller's sequence is never modified.
        fitted = list(seq[:self.max_length])
        # A non-positive deficit yields an empty pad list, i.e. no padding.
        fitted.extend([fill] * (self.max_length - len(fitted)))
        return fitted

    def process(self, value_data, chromosome, hg38_start, hg38_end):
        """Build fixed-length copies of the four input sequences.

        The inputs are left untouched; every returned list is a new object.
        Each sequence is truncated to ``self.max_length`` entries and, if
        shorter, padded on the right. When ``add_cls`` is set, a placeholder
        entry is then prepended to each list.

        Args:
            value_data: Per-position values; padded with ``padding_value``.
            chromosome: Iterable of chromosome names, each looked up in
                ``chromosome_vocab``; padded with the ``"pad"`` id.
            hg38_start: Per-position start values (presumably hg38 genomic
                coordinates -- confirm with the caller); padded with ``0``.
            hg38_end: Per-position end values; padded with ``0``.

        Returns:
            Tuple ``(value_data, chromosome_list, hg38_start, hg38_end)`` of
            lists, each of length ``max_length`` as passed to the constructor.

        Raises:
            KeyError: If a chromosome name (or ``"pad"``) is missing from
                ``chromosome_vocab``.
        """
        pad_id = self.chromosome_vocab["pad"]
        # Map every name before truncating so that an unknown chromosome is
        # reported even when it lies beyond the truncation point.
        chromosome_ids = [self.chromosome_vocab[chr_] for chr_ in chromosome]

        values = self._fit(value_data, self.padding_value)
        chromosome_list = self._fit(chromosome_ids, pad_id)
        starts = self._fit(hg38_start, 0)
        ends = self._fit(hg38_end, 0)

        if self.add_cls:
            # Leading placeholder slot: value 0, the "pad" chromosome id and
            # zeroed coordinates.
            values.insert(0, 0)
            chromosome_list.insert(0, pad_id)
            starts.insert(0, 0)
            ends.insert(0, 0)
        return values, chromosome_list, starts, ends