# NOTE: this file was scraped from a Hugging Face Space page ("Spaces: Running"
# status banner); the table formatting below is an artifact of that export.
| import numpy as np | |
| import torch | |
| import pickle | |
| from sklearn.preprocessing import OneHotEncoder | |
| from sklearn.compose import make_column_transformer | |
| from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler | |
class PreprocessLLMData:
    """
    Automate preprocessing of a label-encoded DNA sequence for large
    language model input.

    The raw sequence is split into fixed-length reads (zero-padded at the
    tail), attention masks are generated, the label is one-hot encoded and
    replicated per read, start/end tokens are added, and everything is
    packed into tensors plus a DataLoader.
    """

    def __init__(self, X, y, start_token=12, end_token=13, read_length=250):
        """
        Args:
            X (sequence): Label-encoded DNA sequence — one long sequence of
                integer codes.
            y: Label associated with X (e.g. a host-species string).
            start_token (int): Token prepended to every read.
            end_token (int): Token appended to every read.
            read_length (int): Length of each read the sequence is split
                into; a shorter final chunk is zero-padded to this length.
        """
        self.X = X
        self.y = y
        self.start_token = start_token
        self.end_token = end_token
        self.read_length = read_length

    def load_process_data(self):
        """
        Split the raw sequence into fixed-length, zero-padded reads.

        Returns:
            X (list[list[int]]): Reads, each exactly ``read_length`` long.
            y (np.ndarray): The label wrapped in a NumPy array.
        """
        # The original pickle-loading path was dead code: its except
        # handler referenced a non-existent ``self.pickle_file_path``
        # (an AttributeError waiting to happen), so it has been removed.
        y = np.array(self.y)
        size = self.read_length

        def _reads(seq):
            # Chunk the sequence; zero-pad the final chunk if it is short.
            chunks = []
            for start in range(0, len(seq), size):
                read = seq[start:start + size]
                if len(read) < size:
                    read = np.pad(read, (0, size - len(read)), mode='constant')
                chunks.append(read)
            return chunks

        X = [list(read) for read in _reads(self.X)]
        return X, y

    def shuffle_data(self, inputs, y):
        """
        Build attention masks for the reads.

        NOTE(review): despite the name, shuffling is currently disabled —
        the permutation code was commented out in the original; inputs and
        labels are returned in their original order.

        Args:
            inputs (list[list[int]]): Reads from ``load_process_data``.
            y (np.ndarray): Labels.

        Returns:
            inputs: The reads, unchanged.
            masks (torch.Tensor): All-ones int8 mask, one row per read.
                NOTE(review): zero-padded tail positions are NOT masked
                out — confirm this is intended.
            y: The labels, unchanged.
        """
        masks = torch.ones((len(inputs), self.read_length), dtype=torch.int8)
        return inputs, masks, y

    def one_hot_encode_labels(self, y):
        """
        Map a label to a two-class one-hot vector.

        'Homo sapiens' -> [0, 1]; any other label -> [1, 0].
        (The original also matched 'Other Choredate Host' [sic] explicitly,
        but that branch produced the same [1, 0] as the default, so the
        redundant branch was removed — behavior is unchanged.)

        Args:
            y: Label (string or 0-d NumPy array of a string).

        Returns:
            list[int]: Two-element one-hot vector.
        """
        if y == 'Homo sapiens':
            return [0, 1]
        return [1, 0]

    def add_tokens_and_adjust_masks(self, inputs, masks):
        """
        Prepend the start token and append the end token to every read,
        and extend the masks to cover the two added positions.

        Args:
            inputs: Reads (list of equal-length lists or array-like).
            masks: Mask tensor/array, one row per read.

        Returns:
            inputs_ (np.ndarray): Reads with start/end tokens added
                (width grows by 2).
            masks_ (np.ndarray): Masks with a 1 added at both ends.
        """
        inputs_ = np.insert(inputs, 0, self.start_token, axis=1)
        inputs_ = np.insert(inputs_, inputs_.shape[1], self.end_token, axis=1)
        masks_ = np.insert(masks, 0, 1, axis=1)               # start token
        masks_ = np.insert(masks_, masks_.shape[1], 1, axis=1)  # end token
        return inputs_, masks_

    def preprocess(self, batch_size=16):
        """
        Run the full preprocessing pipeline.

        Args:
            batch_size (int): Batch size for the returned DataLoader
                (default 16, matching the original hard-coded value).

        Returns:
            inputs_ (torch.Tensor): Token-wrapped reads,
                shape (num_reads, read_length + 2).
            masks_ (torch.Tensor): Matching attention masks.
            y_ (np.ndarray): One-hot label replicated per read,
                shape (num_reads, 2).
            dataloader (DataLoader): Batches of (inputs, masks, labels).
        """
        inputs, y = self.load_process_data()
        inputs, masks, y = self.shuffle_data(inputs, y)
        one_hot = self.one_hot_encode_labels(y)
        # Replicate the single one-hot label for every read.
        y_ = np.tile(one_hot, len(inputs)).reshape(-1, 2)
        inputs_, masks_ = self.add_tokens_and_adjust_masks(inputs, masks)

        labels_t = torch.tensor(y_).type(torch.FloatTensor)
        inputs_t = torch.tensor(inputs_).type(torch.LongTensor)
        masks_t = torch.tensor(masks_).type(torch.LongTensor)
        dataset = TensorDataset(inputs_t, masks_t, labels_t)
        dataloader = DataLoader(dataset, batch_size=batch_size)

        return (torch.from_numpy(np.array(inputs_)),
                torch.from_numpy(np.array(masks_)),
                y_,
                dataloader)
# Example usage
# if __name__ == "__main__":
#     # Provide the label-encoded DNA sequence and its label.
#     X = [...]  # one long sequence of integer codes
#     y = "Homo sapiens"
#     # Initialize the PreprocessLLMData class
#     preprocessor = PreprocessLLMData(X, y)
#     # Preprocess the data (preprocess returns FOUR values)
#     inputs_, masks_, y_, dataloader = preprocessor.preprocess()
#     # Save or use the preprocessed data
#     print(f"Processed Inputs Shape: {inputs_.shape}")
#     print(f"Processed Masks Shape: {masks_.shape}")
#     print(f"One-Hot Encoded Labels Shape: {y_.shape}")