Spaces:
Sleeping
Sleeping
| import numpy as np | |
def dnaseq_features(seq):
    """Split a DNA sequence into 101-base windows and one-hot encode them.

    The input is stripped of leading/trailing whitespace and upper-cased,
    then cut into consecutive, non-overlapping windows of 101 characters.
    Trailing characters that do not fill a whole window are discarded.

    Each window is encoded over the alphabet ``ACGT`` (channel order
    A, C, G, T). Any other character (for example ``N``) is encoded as an
    all-zero row, so every window keeps exactly 101 positions and the
    windows can always be stacked into one array.

    Args:
        seq: DNA sequence as a string.

    Returns:
        A tuple ``(matrix, index, values)`` where ``matrix`` is an integer
        array of shape ``(n_windows, 101, 4)``, ``index`` is a list of
        identifiers of the form ``"seq_<start>:<end>"``, and ``values`` is
        the list of window strings, in the same order.

    Raises:
        ValueError: If the cleaned sequence is shorter than 101 characters.
    """
    n_segs = 101
    seq_name = 'seq'
    abc = 'ACGT'

    seq = seq.strip().upper()

    # Reject sequences that cannot fill even a single window.
    if len(seq) < n_segs:
        raise ValueError(f"Sequence too short ({len(seq)} bp). Must be at least {n_segs} bases long.")

    # Integer division drops the trailing partial window, if any.
    n_windows = len(seq) // n_segs

    index = []
    values = []
    for i in range(n_windows):
        a = i * n_segs
        b = a + n_segs
        index.append(f"{seq_name}_{a}:{b}")
        values.append(seq[a:b])

    # One-hot encode into a pre-sized array. Characters outside the alphabet
    # leave their row at zero rather than being dropped, which would shorten
    # the window and break the fixed (n_windows, n_segs, 4) shape.
    char_to_int = {c: i for i, c in enumerate(abc)}
    matrix = np.zeros((n_windows, n_segs, len(abc)), dtype=int)
    for row, segment in enumerate(values):
        for col, char in enumerate(segment):
            channel = char_to_int.get(char)
            if channel is not None:
                matrix[row, col, channel] = 1

    return matrix, index, values