|
|
import pickle,os,h5py |
|
|
import numpy as np |
|
|
from scipy.sparse import load_npz,csr_matrix,save_npz |
|
|
import torch |
|
|
from sklearn import metrics |
|
|
from huggingface_hub import hf_hub_download |
|
|
def pad_seq_matrix(matrix, pad_len=300):
    """Flank each one-hot sequence segment with its neighbors' edges.

    For segment i (shape ``(4, L)``), prepend the last ``pad_len`` columns of
    segment i-1 and append the first ``pad_len`` columns of segment i+1. The
    first segment's upstream flank and the last segment's downstream flank
    are zero-filled.

    Args:
        matrix: array of shape ``(n_segments, 4, L)`` with ``L >= pad_len``.
        pad_len: number of flanking positions borrowed from each neighbor.

    Returns:
        Array of shape ``(n_segments, 4, L + 2 * pad_len)``.
    """
    zero_flank = np.zeros((1, 4, pad_len)).astype('int8')
    # Tail of the previous segment (zeros for the very first segment).
    upstream = np.concatenate((zero_flank, matrix[:, :, -pad_len:]), axis=0)[:-1, :, :]
    # Head of the next segment (zeros for the very last segment).
    downstream = np.concatenate((matrix[:, :, :pad_len], zero_flank), axis=0)[1:, :, :]
    return np.concatenate((upstream, matrix, downstream), axis=2)
|
|
|
|
|
def pad_signal_matrix(matrix, pad_len=300):
    """Flank each 1-D signal row with its neighbors' edges.

    Row i gets the last ``pad_len`` values of row i-1 prepended and the first
    ``pad_len`` values of row i+1 appended; the outermost flanks are zeros.

    Args:
        matrix: 2-D array of shape ``(n_rows, L)`` with ``L >= pad_len``.
        pad_len: number of flanking values borrowed from each neighbor.

    Returns:
        Array of shape ``(n_rows, L + 2 * pad_len)``.
    """
    zero_flank = np.zeros(pad_len).astype('float32')
    # Previous row's tail (zeros for the first row).
    upstream = np.vstack((zero_flank, matrix[:, -pad_len:]))[:-1, :]
    # Next row's head (zeros for the last row).
    downstream = np.vstack((matrix[:, :pad_len], zero_flank))[1:, :]
    return np.hstack((upstream, matrix, downstream))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_ref_genome(chr):
    """Download one chromosome's one-hot reference sequence and pad it.

    Fetches ``chr{chr}.npz`` from the ``luosanj/epcotv2_data`` dataset on the
    Hugging Face Hub, reshapes the stored 4 x N one-hot matrix into 1 kb
    segments, and adds 300 bp of neighboring context on each side via
    ``pad_seq_matrix``.

    Args:
        chr: chromosome label used to build the remote filename (e.g. 1-22).

    Returns:
        torch.Tensor of shape ``(n_segments, 4, 1600)``.
    """
    local_path = hf_hub_download(
        repo_id="luosanj/epcotv2_data",
        filename=f"chr{chr}.npz",
        repo_type="dataset",
    )
    # Stored as a sparse 4 x N matrix; split into (n_segments, 4, 1000).
    segments = load_npz(local_path).toarray().reshape(4, -1, 1000).swapaxes(0, 1)
    return torch.tensor(pad_seq_matrix(segments))
|
|
|
|
|
def normalize_seq(x, percentile):
    """Rescale a sparse signal in place so the given percentile maps to 5.

    Divides the stored (non-zero) values by their ``percentile``-th
    percentile, clips to [0, 1], then multiplies by 5 — so values at or above
    the percentile saturate at 5. The input matrix is modified in place.

    Args:
        x: scipy sparse matrix; only its ``.data`` (non-zero values) is used.
        percentile: percentile (0-100) of the non-zero values used as the
            saturation threshold.

    Returns:
        The same sparse matrix ``x``, with rescaled data.
    """
    data = x.data.copy()
    val = np.percentile(data, percentile)
    # NOTE(review): if `val` is 0 this divides by zero; upstream coverage data
    # presumably always has positive values at this percentile — confirm.
    x.data = np.clip(data / val, 0, 1) * 5
    return x
|
|
|
|
|
def load_dnase(dnase_seq, normalize=False):
    """Convert a sparse DNase/ATAC track into a padded dense tensor.

    Args:
        dnase_seq: scipy sparse matrix holding the per-bp signal; its dense
            form must reshape to ``(-1, 1000)``.
        normalize: when True, rescale first with ``normalize_seq(..., 98)``.

    Returns:
        torch.Tensor of shape ``(n_segments, 1, 1600)``.
    """
    if normalize:
        dnase_seq = normalize_seq(dnase_seq, 98)
    dense = dnase_seq.toarray().reshape(-1, 1000)
    # Add a channel axis after padding each 1 kb window with 300 bp context.
    padded = np.expand_dims(pad_signal_matrix(dense), axis=1)
    return torch.tensor(padded)
|
|
|
|
|
|
|
|
def prepare_train_data(bulk_cls):
    """Load padded reference genome and bulk ATAC signal for chr1-22.

    Args:
        bulk_cls: iterable of cell-line names; each needs a
            ``<cl>_atac.pickle`` file under ``../atac_bw/`` mapping
            chromosome number -> sparse signal matrix.

    Returns:
        Tuple ``(bulk_dnase_data, ref_data)``: the first is a nested dict
        keyed by cell line then chromosome; the second is keyed by
        chromosome.
    """
    bulk_path = '../atac_bw/'
    chroms = list(range(1, 23))
    ref_data = {chrom: load_ref_genome(chrom) for chrom in chroms}
    bulk_dnase_data = {}
    for cl in bulk_cls:
        # NOTE: pickle.load is only safe on trusted, locally generated files.
        with open(bulk_path + '%s_atac.pickle' % cl, 'rb') as f:
            bulkdnase = pickle.load(f)
        bulk_dnase_data[cl] = {chrom: load_dnase(bulkdnase[chrom]) for chrom in chroms}
    return bulk_dnase_data, ref_data
|
|
|
|
|
def prepare_train_data_1(bulk_cls):
    """Load padded reference genome and the alternate ('_1') ATAC signal.

    Same as ``prepare_train_data`` but reads ``<cl>_atac_1.pickle`` files
    from the scratch bigWig directory instead of ``../atac_bw/``.

    Args:
        bulk_cls: iterable of cell-line names.

    Returns:
        Tuple ``(bulk_dnase_data, ref_data)``: nested dict keyed by cell
        line then chromosome, and a dict keyed by chromosome.
    """
    bulk_path = '/scratch/drjieliu_root/drjieliu/zhenhaoz/ATAC-seq/bw/'
    chroms = list(range(1, 23))
    ref_data = {chrom: load_ref_genome(chrom) for chrom in chroms}
    bulk_dnase_data = {}
    for cl in bulk_cls:
        # NOTE: pickle.load is only safe on trusted, locally generated files.
        with open(bulk_path + '%s_atac_1.pickle' % cl, 'rb') as f:
            bulkdnase = pickle.load(f)
        bulk_dnase_data[cl] = {chrom: load_dnase(bulkdnase[chrom]) for chrom in chroms}
    return bulk_dnase_data, ref_data
|
|
|
|
|
def _load_bru_track(path):
    """Read an HDF5 'targets' coverage track and arcsinh-normalize it.

    The track is divided by the 95th percentile of its positive values
    before arcsinh, and a trailing channel axis is added.

    Returns:
        float32 array of shape ``targets.shape + (1,)``.
    """
    # Context manager closes the HDF5 handle (the original leaked it).
    with h5py.File(path, 'r') as f:
        arr = np.array(f['targets']).astype('float32')
    arr = np.arcsinh(arr / np.percentile(arr[arr > 0], 95))
    return np.expand_dims(arr, -1)


def prepare_bru(cls):
    """Load Bru-seq / BruUV-seq / BruChase-seq coverage per cell line.

    Args:
        cls: iterable of cell-line names; each needs the three
            ``<cl>_<assay>_seq_cov.h5`` files in the bru data directory.

    Returns:
        dict mapping cell line -> float32 torch.Tensor with the three
        assays stacked on the last axis.
    """
    base = '/scratch/drjieliu_root/drjieliu/zhenhaoz/bru/data/'
    bru = {}
    for cl in cls:
        tracks = [_load_bru_track(base + '%s_%s_seq_cov.h5' % (cl, assay))
                  for assay in ('bru', 'bruuv', 'bruchase')]
        bru[cl] = torch.tensor(np.concatenate(tracks, axis=-1)).float()
        print(cl, bru[cl].shape)
    return bru
|
|
|
|
|
def _load_rna_track(path):
    """Read an HDF5 'targets' coverage track and arcsinh-normalize it.

    The track is divided by the 95th percentile of its positive values
    before arcsinh, and a trailing channel axis is added.

    Returns:
        float32 array of shape ``targets.shape + (1,)``.
    """
    # Context manager closes the HDF5 handle (the original leaked it).
    with h5py.File(path, 'r') as f:
        arr = np.array(f['targets']).astype('float32')
    arr = np.arcsinh(arr / np.percentile(arr[arr > 0], 95))
    return np.expand_dims(arr, -1)


def prepare_rna(cls):
    """Load CAGE-seq / total-RNA / polyA-RNA coverage per cell line.

    Args:
        cls: iterable of cell-line names; each needs the three per-assay
            ``*_seq_cov.h5`` files on the NFS data directories.

    Returns:
        dict mapping cell line -> float32 torch.Tensor with the three
        tracks stacked on the last axis.
    """
    templates = (
        '/nfs/turbo/umms-drjieliu/proj/CAGE-seq/data/%s_cage_seq_cov.h5',
        '/nfs/turbo/umms-drjieliu/proj/RNA-seq/data/%s_trna_seq_cov.h5',
        '/nfs/turbo/umms-drjieliu/proj/RNA-seq/data/%s_prna_seq_cov.h5',
    )
    rna = {}
    for cl in cls:
        tracks = [_load_rna_track(tpl % cl) for tpl in templates]
        rna[cl] = torch.tensor(np.concatenate(tracks, axis=-1)).float()
        print(cl, rna[cl].shape)
    return rna
|
|
|