# EPCOTv2 / erna/util.py
# Author: Xin Luo
# Revision: 5ed9d51
import pickle,os,h5py
import numpy as np
from scipy.sparse import load_npz,csr_matrix,save_npz
import torch
from sklearn import metrics
from huggingface_hub import hf_hub_download
def pad_seq_matrix(matrix, pad_len=300):
    """Add flanking sequence from adjacent genomic windows to each sample.

    Parameters
    ----------
    matrix : np.ndarray
        One-hot sequence array of shape (n_windows, n_channels, window_len);
        consecutive windows are assumed to be adjacent on the genome.
    pad_len : int, optional
        Number of positions borrowed from each neighboring window (default 300).

    Returns
    -------
    np.ndarray
        Array of shape (n_windows, n_channels, window_len + 2 * pad_len):
        each window flanked on the left by the tail of the previous window
        and on the right by the head of the next one (zeros at the ends).
    """
    # Generalized from a hard-coded 4-channel (ACGT) zero pad so the function
    # also accepts inputs with a different channel dimension.
    paddings = np.zeros((1, matrix.shape[1], pad_len)).astype('int8')
    # Left flank of window i = last pad_len positions of window i-1 (zeros for i=0).
    dmatrix = np.concatenate((paddings, matrix[:, :, -pad_len:]), axis=0)[:-1, :, :]
    # Right flank of window i = first pad_len positions of window i+1 (zeros for the last window).
    umatrix = np.concatenate((matrix[:, :, :pad_len], paddings), axis=0)[1:, :, :]
    return np.concatenate((dmatrix, matrix, umatrix), axis=2)
def pad_signal_matrix(matrix, pad_len=300):
    """Add flanking signal from neighboring rows to each row.

    The left flank of row i is the last `pad_len` values of row i-1
    (zeros for the first row); the right flank is the first `pad_len`
    values of row i+1 (zeros for the last row).  Returns an array of
    shape (n_rows, n_cols + 2 * pad_len).
    """
    zero_row = np.zeros((1, pad_len), dtype='float32')
    tails = matrix[:, -pad_len:]
    heads = matrix[:, :pad_len]
    # Shift tails down one row: row i receives row i-1's tail.
    left = np.concatenate((zero_row, tails), axis=0)[:-1]
    # Shift heads up one row: row i receives row i+1's head.
    right = np.concatenate((heads, zero_row), axis=0)[1:]
    return np.concatenate((left, matrix, right), axis=1)
#def load_ref_genome(chr):
# ref_path = '/nfs/turbo/umms-drjieliu/usr/zzh/KGbert/3D/data/ref_genome/'
# ref_file = os.path.join(ref_path, 'chr%s.npz' % chr)
# ref_gen_data = load_npz(ref_file).toarray().reshape(4, -1, 1000).swapaxes(0, 1)
# return torch.tensor(pad_seq_matrix(ref_gen_data))
def load_ref_genome(chr):
    """Download and window the one-hot reference sequence for a chromosome.

    Fetches ``chr{chr}.npz`` from the HuggingFace dataset repository
    "luosanj/epcotv2_data", reshapes the (4, total_len) one-hot matrix
    into non-overlapping 1000-bp windows, adds 300-bp flanks from the
    neighboring windows, and returns the result as a torch tensor.
    """
    remote_name = f"chr{chr}.npz"
    # hf_hub_download caches locally and returns the on-disk path.
    local_path = hf_hub_download(
        repo_id="luosanj/epcotv2_data",
        filename=remote_name,
        repo_type="dataset",  # the file lives in a dataset repo, not a model repo
    )
    # (4, total_len) -> (n_windows, 4, 1000)
    onehot = load_npz(local_path).toarray().reshape(4, -1, 1000).swapaxes(0, 1)
    return torch.tensor(pad_seq_matrix(onehot))
def normalize_seq(x, percentile):
    """Clip-normalize a sparse signal matrix in place.

    Divides the stored (non-zero) values of ``x`` by their
    ``percentile``-th percentile, clips the ratios to [0, 1], and
    scales by 5, so the stored values end up in [0, 5].

    Parameters
    ----------
    x : scipy sparse matrix
        Sparse signal; only its stored ``.data`` values are transformed.
        Modified in place.
    percentile : float
        Percentile (0-100) of the stored values used as the denominator.

    Returns
    -------
    The same sparse matrix ``x``, with normalized data.
    """
    data = x.data.copy()
    val = np.percentile(data, percentile)
    # An unused `minv = data.min()` local from the original was removed.
    x.data = np.clip(data / val, 0, 1) * 5
    return x
def load_dnase(dnase_seq, normalize=False):
    """Convert a sparse accessibility track into a padded torch tensor.

    Parameters
    ----------
    dnase_seq : scipy sparse matrix
        Per-base-pair signal; densified and windowed into 1000-bp rows.
    normalize : bool, optional
        When True, clip-normalize the signal by its 98th percentile first.

    Returns a tensor of shape (n_windows, 1, 1000 + 2 * 300).
    """
    if normalize:
        dnase_seq = normalize_seq(dnase_seq, 98)
    dense = dnase_seq.toarray().reshape(-1, 1000)
    padded = pad_signal_matrix(dense)
    # Insert a channel axis so the signal matches the (N, C, L) convention.
    return torch.tensor(np.expand_dims(padded, axis=1))
def prepare_train_data(bulk_cls):
    """Load reference genome windows and per-cell-line ATAC signal.

    Parameters
    ----------
    bulk_cls : iterable of str
        Cell-line identifiers used to locate the pickled ATAC tracks.

    Returns
    -------
    (bulk_dnase_data, ref_data)
        ``ref_data`` maps chromosome number (1-22) to its padded one-hot
        tensor; ``bulk_dnase_data`` maps cell line -> chromosome -> the
        padded accessibility tensor for that chromosome.
    """
    chroms = list(range(1, 23))
    ref_data = {chrom: load_ref_genome(chrom) for chrom in chroms}
    bulk_path = '../atac_bw/'
    bulk_dnase_data = {}
    for cl in bulk_cls:
        # NOTE(review): pickle.load is only safe on trusted local files.
        with open(bulk_path + '%s_atac.pickle' % cl, 'rb') as f:
            per_chrom = pickle.load(f)
        bulk_dnase_data[cl] = {chrom: load_dnase(per_chrom[chrom]) for chrom in chroms}
    return bulk_dnase_data, ref_data
def prepare_train_data_1(bulk_cls):
    """Load reference genome windows and per-cell-line ATAC signal (variant).

    Same contract as ``prepare_train_data`` but reads the ``*_atac_1``
    pickles from the scratch ATAC-seq directory instead of ``../atac_bw/``.

    Returns (bulk_dnase_data, ref_data), both keyed by chromosome 1-22.
    """
    chroms = list(range(1, 23))
    ref_data = {chrom: load_ref_genome(chrom) for chrom in chroms}
    bulk_path = '/scratch/drjieliu_root/drjieliu/zhenhaoz/ATAC-seq/bw/'
    bulk_dnase_data = {}
    for cl in bulk_cls:
        # NOTE(review): pickle.load is only safe on trusted local files.
        with open(bulk_path + '%s_atac_1.pickle' % cl, 'rb') as f:
            per_chrom = pickle.load(f)
        bulk_dnase_data[cl] = {chrom: load_dnase(per_chrom[chrom]) for chrom in chroms}
    return bulk_dnase_data, ref_data
def prepare_bru(cls):
    """Load Bru-seq, BruUV-seq and BruChase-seq coverage targets per cell line.

    Parameters
    ----------
    cls : iterable of str
        Cell-line identifiers used to build the HDF5 file names.

    Returns
    -------
    dict
        cell line -> float tensor with the three assays stacked on the
        last axis in the order (bru, bruuv, bruchase).
    """
    def _load_track(path):
        # Context manager fixes the original's unclosed h5py.File handles,
        # and one helper replaces three copy-pasted loading blocks.
        with h5py.File(path, 'r') as f:
            arr = np.array(f['targets']).astype('float32')
        # arcsinh-transform after scaling by the 95th percentile of the
        # positive coverage values.
        arr = np.arcsinh(arr / np.percentile(arr[arr > 0], 95))
        return np.expand_dims(arr, -1)

    base = '/scratch/drjieliu_root/drjieliu/zhenhaoz/bru/data'
    bru = {}
    for cl in cls:
        tracks = [_load_track('%s/%s_%s_seq_cov.h5' % (base, cl, assay))
                  for assay in ('bru', 'bruuv', 'bruchase')]
        bru[cl] = torch.tensor(np.concatenate(tracks, axis=-1)).float()
        print(cl, bru[cl].shape)
    return bru
def prepare_rna(cls):
    """Load CAGE-seq, total-RNA and poly(A)-RNA coverage targets per cell line.

    Parameters
    ----------
    cls : iterable of str
        Cell-line identifiers used to build the HDF5 file names.

    Returns
    -------
    dict
        cell line -> float tensor with the three assays stacked on the
        last axis in the order (cage, trna, prna).
    """
    def _load_track(path):
        # Context manager fixes the original's unclosed h5py.File handles,
        # and one helper replaces three copy-pasted loading blocks.
        with h5py.File(path, 'r') as f:
            arr = np.array(f['targets']).astype('float32')
        # arcsinh-transform after scaling by the 95th percentile of the
        # positive coverage values.
        arr = np.arcsinh(arr / np.percentile(arr[arr > 0], 95))
        return np.expand_dims(arr, -1)

    out = {}
    for cl in cls:
        cage = _load_track('/nfs/turbo/umms-drjieliu/proj/CAGE-seq/data/%s_cage_seq_cov.h5' % cl)
        trna = _load_track('/nfs/turbo/umms-drjieliu/proj/RNA-seq/data/%s_trna_seq_cov.h5' % cl)
        prna = _load_track('/nfs/turbo/umms-drjieliu/proj/RNA-seq/data/%s_prna_seq_cov.h5' % cl)
        out[cl] = torch.tensor(np.concatenate((cage, trna, prna), axis=-1)).float()
        print(cl, out[cl].shape)
    return out