File size: 1,434 Bytes
45e8fda
 
 
dd2df80
 
45e8fda
dd2df80
 
 
 
 
 
 
 
45e8fda
dd2df80
 
 
 
45e8fda
dd2df80
 
 
45e8fda
 
dd2df80
45e8fda
 
 
 
 
dd2df80
 
 
 
45e8fda
dd2df80
45e8fda
 
 
 
 
 
dd2df80
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import numpy as np

def dnaseq_features(seq, n_segs=101, seq_name='seq'):
    """Split a DNA sequence into fixed-length segments and one-hot encode them.

    The input is stripped of surrounding whitespace and upper-cased, then cut
    into consecutive, non-overlapping segments of ``n_segs`` bases. Trailing
    bases that do not fill a complete segment are discarded.

    Each base is encoded over the alphabet ``ACGT`` (column order A, C, G, T).
    Any other character (e.g. ``N``) is encoded as an all-zero row so that
    every segment keeps exactly ``n_segs`` rows and positions stay aligned
    with the sequence.

    Args:
        seq: DNA sequence as a string.
        n_segs: Length of each segment in bases. Defaults to 101.
        seq_name: Prefix used when building segment identifiers.

    Returns:
        A tuple ``(matrix, index, values)`` where ``matrix`` is an integer
        array of shape ``(num_segments, n_segs, 4)``, ``index`` is the list of
        segment identifiers of the form ``"{seq_name}_{start}:{end}"``
        (``end`` exclusive), and ``values`` is the list of segment strings.

    Raises:
        ValueError: If ``n_segs`` is not positive, or if the cleaned sequence
            is shorter than ``n_segs``.
    """
    if n_segs < 1:
        raise ValueError(f"n_segs must be a positive integer, got {n_segs}.")

    seq = seq.strip().upper()

    # Reject sequences that cannot fill even a single segment.
    if len(seq) < n_segs:
        raise ValueError(f"Sequence too short ({len(seq)} bp). Must be at least {n_segs} bases long.")

    # Only whole segments are kept; a trailing partial segment is dropped.
    n_full = len(seq) // n_segs

    index = []
    values = []
    for i in range(n_full):
        a = i * n_segs
        b = a + n_segs
        index.append(f"{seq_name}_{a}:{b}")
        values.append(seq[a:b])

    # One-hot encode. Rows for characters outside the alphabet stay all-zero,
    # which keeps every segment the same shape.
    abc = 'ACGT'
    char_to_int = {c: i for i, c in enumerate(abc)}

    matrix = np.zeros((n_full, n_segs, len(abc)), dtype=int)
    for seg_no, segment in enumerate(values):
        for pos, char in enumerate(segment):
            col = char_to_int.get(char)
            if col is not None:
                matrix[seg_no, pos, col] = 1

    return matrix, index, values