File size: 1,408 Bytes
534e5a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
class DataProcessorForPad:
    """Truncate or pad four parallel per-position sequences to a fixed length.

    Keyword arguments accepted by the constructor:
        chromosome_vocab: mapping from chromosome name to integer id; it must
            contain a ``"pad"`` entry, which is used for padded positions and
            for the optional leading position.
        max_length: total length of every returned sequence (default 1024).
        add_cls: when true, one extra position is prepended to every returned
            sequence; one slot of ``max_length`` is reserved for it so the
            total output length stays ``max_length`` (default False).
        padding_value: filler appended to ``value_data`` (default -1).
    """

    def __init__(self, **kwargs):
        self.chromosome_vocab = kwargs.get("chromosome_vocab")
        self.max_length = kwargs.get("max_length", 1024)
        self.add_cls = kwargs.get('add_cls', False)
        self.padding_value = kwargs.get('padding_value', -1)
        if self.add_cls:
            # Reserve one slot for the position prepended in process().
            self.max_length -= 1

    @staticmethod
    def _fit(values, length, pad_value):
        """Return a new list with ``values`` cut or right-padded to ``length``."""
        fitted = list(values)[:length]
        # A non-positive repeat count yields [], so already-full lists are
        # left as they are.
        fitted.extend([pad_value] * (length - len(fitted)))
        return fitted

    def process(self, value_data, chromosome, hg38_start, hg38_end):
        """Map chromosomes to ids and fit all four sequences to ``max_length``.

        The inputs are never modified: every returned list is a fresh object,
        so the same inputs can safely be processed more than once. Each
        sequence is truncated/padded on its own, so all four outputs have the
        same length even if the inputs do not.

        Args:
            value_data: per-position values; padded with ``padding_value``.
            chromosome: per-position chromosome names, looked up in
                ``chromosome_vocab``; padded with the ``"pad"`` id.
            hg38_start: per-position start coordinates; padded with 0.
            hg38_end: per-position end coordinates; padded with 0.

        Returns:
            Tuple ``(value_data, chromosome_ids, hg38_start, hg38_end)`` of
            new lists. With ``add_cls`` enabled each list starts with an extra
            entry: 0 for values and coordinates, the ``"pad"`` id for
            chromosomes.

        Raises:
            KeyError: if a chromosome name (or ``"pad"``) is missing from
                ``chromosome_vocab``.
        """
        vocab = self.chromosome_vocab
        pad_id = vocab["pad"]
        length = self.max_length

        values = self._fit(value_data, length, self.padding_value)
        chromosome_ids = self._fit(
            [vocab[name] for name in chromosome], length, pad_id
        )
        starts = self._fit(hg38_start, length, 0)
        ends = self._fit(hg38_end, length, 0)

        if self.add_cls:
            # Build new lists instead of insert(0, ...) to keep inputs intact.
            values = [0] + values
            chromosome_ids = [pad_id] + chromosome_ids
            starts = [0] + starts
            ends = [0] + ends
        return values, chromosome_ids, starts, ends