"""Data-loading helpers for sequence, accessibility and transcription tracks.

Every loader works on 1000-column bins.  Sequence and signal matrices are
padded with ``pad_len`` columns borrowed from the neighbouring bins, so a
1000-column bin becomes 1600 columns wide with the default ``pad_len=300``.
"""
import os
import pickle

import h5py
import numpy as np
import torch
from huggingface_hub import hf_hub_download
from scipy.sparse import load_npz, csr_matrix, save_npz
from sklearn import metrics

# Chromosomes loaded by the training-data helpers (1..22 inclusive).
_AUTOSOMES = tuple(range(1, 23))

# Per-cell-line coverage files; ``%s`` is replaced by the cell-line name.
_BRU_TRACK_TEMPLATES = (
    '/scratch/drjieliu_root/drjieliu/zhenhaoz/bru/data/%s_bru_seq_cov.h5',
    '/scratch/drjieliu_root/drjieliu/zhenhaoz/bru/data/%s_bruuv_seq_cov.h5',
    '/scratch/drjieliu_root/drjieliu/zhenhaoz/bru/data/%s_bruchase_seq_cov.h5',
)
_RNA_TRACK_TEMPLATES = (
    '/nfs/turbo/umms-drjieliu/proj/CAGE-seq/data/%s_cage_seq_cov.h5',
    '/nfs/turbo/umms-drjieliu/proj/RNA-seq/data/%s_trna_seq_cov.h5',
    '/nfs/turbo/umms-drjieliu/proj/RNA-seq/data/%s_prna_seq_cov.h5',
)


def pad_seq_matrix(matrix, pad_len=300):
    """Add flanking regions from the neighbouring bins to each bin.

    Args:
        matrix: 3-D array of shape ``(n_bins, 4, bin_len)`` with
            ``bin_len >= pad_len``.
        pad_len: Number of columns borrowed from each neighbour.

    Returns:
        Array of shape ``(n_bins, 4, bin_len + 2 * pad_len)``.  Bin ``i`` is
        preceded by the last ``pad_len`` columns of bin ``i - 1`` and followed
        by the first ``pad_len`` columns of bin ``i + 1``.  The first bin's
        left flank and the last bin's right flank are zeros.
    """
    paddings = np.zeros((1, 4, pad_len)).astype('int8')
    # Shift the bin tails down by one row: row i holds the tail of bin i - 1.
    dmatrix = np.concatenate((paddings, matrix[:, :, -pad_len:]), axis=0)[:-1, :, :]
    # Shift the bin heads up by one row: row i holds the head of bin i + 1.
    umatrix = np.concatenate((matrix[:, :, :pad_len], paddings), axis=0)[1:, :, :]
    return np.concatenate((dmatrix, matrix, umatrix), axis=2)


def pad_signal_matrix(matrix, pad_len=300):
    """Add flanking regions from the neighbouring bins to each signal row.

    Two-dimensional counterpart of ``pad_seq_matrix``.

    Args:
        matrix: 2-D array of shape ``(n_bins, bin_len)`` with
            ``bin_len >= pad_len``.
        pad_len: Number of columns borrowed from each neighbour.

    Returns:
        Array of shape ``(n_bins, bin_len + 2 * pad_len)``.  The first row's
        left flank and the last row's right flank are zeros.
    """
    paddings = np.zeros(pad_len).astype('float32')
    dmatrix = np.vstack((paddings, matrix[:, -pad_len:]))[:-1, :]
    umatrix = np.vstack((matrix[:, :pad_len], paddings))[1:, :]
    return np.hstack((dmatrix, matrix, umatrix))


def load_ref_genome(chr):
    """Download and load the padded reference matrix for one chromosome.

    The file ``chr{chr}.npz`` is fetched from the root of the Hugging Face
    dataset repository ``luosanj/epcotv2_data``.

    Args:
        chr: Chromosome identifier interpolated into the file name.  The name
            shadows the builtin but is kept for caller compatibility.

    Returns:
        ``torch.Tensor`` of shape ``(n_bins, 4, 1600)``: the sparse matrix is
        reshaped to ``(4, n_bins, 1000)``, transposed to
        ``(n_bins, 4, 1000)``, then padded by ``pad_seq_matrix``.
    """
    filename = f"chr{chr}.npz"
    ref_file = hf_hub_download(
        repo_id="luosanj/epcotv2_data",
        filename=filename,
        repo_type="dataset",  # the data lives in a dataset repo, not a model repo
    )
    ref_gen_data = load_npz(ref_file).toarray().reshape(4, -1, 1000).swapaxes(0, 1)
    return torch.tensor(pad_seq_matrix(ref_gen_data))


def normalize_seq(x, percentile):
    """Rescale the stored values of a sparse matrix into the range ``[0, 5]``.

    Values are divided by the given percentile of the stored entries,
    clipped to ``[0, 1]`` and multiplied by 5.

    Note:
        ``x`` is modified in place (its ``data`` attribute is replaced) and
        the same object is returned.
        NOTE(review): a percentile of 0 yields inf/NaN values; callers are
        presumably expected to pass non-degenerate signal — confirm.

    Args:
        x: Object exposing a ``data`` array (e.g. a scipy sparse matrix).
        percentile: Percentile of ``x.data`` used as the scaling constant.

    Returns:
        ``x`` itself, with normalised ``data``.
    """
    scale = np.percentile(x.data, percentile)
    x.data = np.clip(x.data / scale, 0, 1) * 5
    return x


def load_dnase(dnase_seq, normalize=False):
    """Convert a sparse signal track into a padded tensor of 1000-column bins.

    Args:
        dnase_seq: Sparse matrix whose dense form can be reshaped to
            ``(-1, 1000)``.
        normalize: If true, first apply ``normalize_seq`` with the 98th
            percentile (this mutates ``dnase_seq`` in place).

    Returns:
        ``torch.Tensor`` of shape ``(n_bins, 1, 1600)``.
    """
    if normalize:
        dnase_seq = normalize_seq(dnase_seq, 98)
    binned = dnase_seq.toarray().reshape(-1, 1000)
    return torch.tensor(np.expand_dims(pad_signal_matrix(binned), axis=1))


def _prepare_train_data(bulk_cls, bulk_path, file_template):
    """Load reference sequences and per-cell-line signal for chromosomes 1-22.

    Shared implementation of ``prepare_train_data`` and
    ``prepare_train_data_1``; they differ only in directory and file name.
    """
    ref_data = {chrom: load_ref_genome(chrom) for chrom in _AUTOSOMES}
    bulk_dnase_data = {}
    for cl in bulk_cls:
        # SECURITY: pickle.load executes arbitrary code embedded in the file;
        # only point this at trusted, locally produced pickles.
        with open(os.path.join(bulk_path, file_template % cl), 'rb') as f:
            bulkdnase = pickle.load(f)
        bulk_dnase_data[cl] = {
            chrom: load_dnase(bulkdnase[chrom]) for chrom in _AUTOSOMES
        }
    return bulk_dnase_data, ref_data


def prepare_train_data(bulk_cls):
    """Load training inputs from ``../atac_bw/<cell line>_atac.pickle``.

    Args:
        bulk_cls: Iterable of cell-line names.

    Returns:
        ``(bulk_dnase_data, ref_data)`` where ``bulk_dnase_data[cl][chrom]``
        is the tensor from ``load_dnase`` and ``ref_data[chrom]`` the tensor
        from ``load_ref_genome``, for chromosomes 1..22.
    """
    return _prepare_train_data(bulk_cls, '../atac_bw/', '%s_atac.pickle')


def prepare_train_data_1(bulk_cls):
    """Variant of ``prepare_train_data`` reading ``<cell line>_atac_1.pickle``.

    Files are read from the cluster scratch directory
    ``/scratch/drjieliu_root/drjieliu/zhenhaoz/ATAC-seq/bw/``.

    Args:
        bulk_cls: Iterable of cell-line names.

    Returns:
        ``(bulk_dnase_data, ref_data)`` with the same layout as
        ``prepare_train_data``.
    """
    return _prepare_train_data(
        bulk_cls,
        '/scratch/drjieliu_root/drjieliu/zhenhaoz/ATAC-seq/bw/',
        '%s_atac_1.pickle',
    )


def _load_cov_track(path):
    """Read the ``targets`` dataset of an HDF5 file and arcsinh-normalise it.

    Values are divided by the 95th percentile of the strictly positive
    entries before ``arcsinh``; a trailing channel axis is appended.
    """
    # Open read-only and close the handle as soon as the data is in memory.
    with h5py.File(path, 'r') as h5:
        cov = np.array(h5['targets']).astype('float32')
    cov = np.arcsinh(cov / np.percentile(cov[cov > 0], 95))
    return np.expand_dims(cov, -1)


def _load_cov_tracks(cls, path_templates):
    """Stack the normalised tracks of each cell line along a last axis."""
    tracks = {}
    for cl in cls:
        channels = [_load_cov_track(template % cl) for template in path_templates]
        tracks[cl] = torch.tensor(np.concatenate(channels, axis=-1)).float()
        print(cl, tracks[cl].shape)
    return tracks


def prepare_bru(cls):
    """Load the three Bru-seq coverage tracks for each cell line.

    The ``bru``, ``bruuv`` and ``bruchase`` files are read in that order.

    Args:
        cls: Iterable of cell-line names.

    Returns:
        Dict mapping each cell line to a float32 ``torch.Tensor`` whose last
        axis holds the three tracks.  The shape of each tensor is printed.
    """
    return _load_cov_tracks(cls, _BRU_TRACK_TEMPLATES)


def prepare_rna(cls):
    """Load the three RNA coverage tracks for each cell line.

    The ``cage``, ``trna`` and ``prna`` files are read in that order.

    Args:
        cls: Iterable of cell-line names.

    Returns:
        Dict mapping each cell line to a float32 ``torch.Tensor`` whose last
        axis holds the three tracks.  The shape of each tensor is printed.
    """
    return _load_cov_tracks(cls, _RNA_TRACK_TEMPLATES)