import numpy as np


def dnaseq_features(seq, *, seg_len=101, seq_name="seq"):
    """One-hot encode a DNA sequence as fixed-length, non-overlapping segments.

    The input is stripped of leading/trailing whitespace and upper-cased,
    then cut from the left into consecutive windows of ``seg_len``
    characters. Trailing characters that do not fill a complete window are
    discarded.

    Each position is encoded over the alphabet ``A, C, G, T`` (in that
    column order). Any other character (for example ``N``, an IUPAC
    ambiguity code, or whitespace inside the sequence) is encoded as an
    all-zero row, so every segment keeps exactly ``seg_len`` rows and the
    rows stay aligned with the characters in ``values``.

    Args:
        seq: The sequence as a string.
        seg_len: Length of each segment; defaults to 101.
        seq_name: Prefix used when building segment identifiers; defaults
            to ``"seq"``.

    Returns:
        A tuple ``(matrix, index, values)`` where

        * ``matrix`` is an integer NumPy array of shape
          ``(n_segments, seg_len, 4)`` holding the one-hot encoding,
        * ``index`` is a list of identifiers ``"<seq_name>_<start>:<end>"``
          with 0-based start and exclusive end offsets into the cleaned
          sequence,
        * ``values`` is the list of segment strings, in the same order.

    Raises:
        ValueError: If ``seg_len`` is smaller than 1, or if the cleaned
            sequence is shorter than ``seg_len``.
    """
    if seg_len < 1:
        raise ValueError(f"seg_len must be a positive integer, got {seg_len!r}.")

    seq = seq.strip().upper()

    # A sequence shorter than one segment cannot produce any output.
    if len(seq) < seg_len:
        raise ValueError(
            f"Sequence too short ({len(seq)} bp). "
            f"Must be at least {seg_len} bases long."
        )

    # Only complete segments are kept; when the length is an exact multiple
    # of seg_len the whole sequence is used, otherwise the tail is dropped.
    n_segments = len(seq) // seg_len

    index = []
    values = []
    for seg_idx in range(n_segments):
        begin = seg_idx * seg_len
        end = begin + seg_len
        index.append(f"{seq_name}_{begin}:{end}")
        values.append(seq[begin:end])

    # One-hot encode. The array is preallocated with zeros so that symbols
    # outside the alphabet leave an all-zero row instead of being skipped;
    # skipping them would shorten the segment and make the per-segment
    # arrays impossible to stack.
    abc = "ACGT"
    char_to_int = {base: col for col, base in enumerate(abc)}
    matrix = np.zeros((n_segments, seg_len, len(abc)), dtype=int)
    for seg_idx, segment in enumerate(values):
        for pos, char in enumerate(segment):
            col = char_to_int.get(char)
            if col is not None:
                matrix[seg_idx, pos, col] = 1

    return matrix, index, values