| | |
| |
|
import os
import pickle

import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
| |
|
def load_sequences(preprocessed_dir='preprocessed_sequences'):
    """
    Loads preprocessed sequences and their labels.

    Every sub-directory of ``preprocessed_dir`` is treated as one class, and
    every ``.npy`` file inside it is loaded as one sample of that class.

    Args:
        preprocessed_dir (str): Directory containing one sub-directory per class.

    Returns:
        tuple: ``(X, y, label_map)`` where ``X`` is the list of loaded arrays,
        ``y`` is the label array returned by ``to_categorical`` and
        ``label_map`` maps each class name to its integer index.
    """
    # Only directories are classes. Stray files (e.g. .DS_Store) must not be
    # given a label index: that would inflate num_classes and leave gaps in
    # the indices of the real classes.
    classes = sorted(
        entry for entry in os.listdir(preprocessed_dir)
        if os.path.isdir(os.path.join(preprocessed_dir, entry))
    )
    label_map = {cls: idx for idx, cls in enumerate(classes)}

    X = []
    y = []
    for cls, idx in label_map.items():
        cls_path = os.path.join(preprocessed_dir, cls)
        # Sorted so the sample order (and therefore any seeded split done
        # later) does not depend on filesystem enumeration order.
        sequence_files = sorted(
            f for f in os.listdir(cls_path) if f.endswith('.npy')
        )
        for seq_file in sequence_files:
            X.append(np.load(os.path.join(cls_path, seq_file)))
            y.append(idx)

    y = np.array(y)
    y = to_categorical(y, num_classes=len(label_map))

    return X, y, label_map
| |
|
def pad_sequences_fixed(X, max_seq_length):
    """
    Pads or truncates sequences to a fixed length along the first axis.

    Shorter sequences are extended with trailing zeros of the sequence's own
    dtype; longer sequences keep only their first ``max_seq_length`` entries.

    Args:
        X (list of numpy.ndarray): Sequences whose first axis is the frame
            axis, e.g. shape (frames, height, width, channels).
        max_seq_length (int): Desired sequence length (must be >= 0).

    Returns:
        numpy.ndarray: The padded/truncated sequences stacked with ``np.array``.

    Raises:
        ValueError: If ``max_seq_length`` is negative.
    """
    # A negative length would turn seq[:max_seq_length] into "drop frames from
    # the tail", silently yielding sequences of differing lengths.
    if max_seq_length < 0:
        raise ValueError(
            f"max_seq_length must be non-negative, got {max_seq_length}"
        )

    padded_X = []
    for seq in X:
        n_frames = seq.shape[0]
        if n_frames < max_seq_length:
            # Zero block with the same trailing dimensions and dtype as seq.
            padding = np.zeros(
                (max_seq_length - n_frames, *seq.shape[1:]), dtype=seq.dtype
            )
            padded_seq = np.concatenate((seq, padding), axis=0)
        else:
            padded_seq = seq[:max_seq_length]
        padded_X.append(padded_seq)
    return np.array(padded_X)
| |
|
def save_dataset(X_train, X_test, y_train, y_test, label_map, output_path='dataset_sequences.pkl'):
    """
    Saves the dataset into a pickle file.

    The five inputs are stored in a single dictionary under the keys
    ``'X_train'``, ``'X_test'``, ``'y_train'``, ``'y_test'`` and ``'label_map'``.

    Args:
        X_train, X_test, y_train, y_test: Split data.
        label_map (dict): Mapping from class names to indices.
        output_path (str): Path to save the pickle file.
    """
    dataset = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'label_map': label_map,
    }
    with open(output_path, 'wb') as f:
        pickle.dump(dataset, f)
    # Reported only after the file has been closed, i.e. fully written.
    print(f"Dataset saved to {output_path}.")
| |
|
def load_dataset_pickle(pickle_path='dataset_sequences.pkl'):
    """
    Loads the dataset from a pickle file.

    The file is expected to hold a dictionary with the keys ``'X_train'``,
    ``'X_test'``, ``'y_train'``, ``'y_test'`` and ``'label_map'``.

    Args:
        pickle_path (str): Path to the pickle file.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, label_map)``.

    Raises:
        KeyError: If one of the expected keys is missing from the file.
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # this project produced itself; never point this at untrusted input.
    with open(pickle_path, 'rb') as f:
        data = pickle.load(f)
    return data['X_train'], data['X_test'], data['y_train'], data['y_test'], data['label_map']
| |
|
if __name__ == "__main__":
    # Load every sample found under the preprocessed root directory.
    X, y, label_map = load_sequences(preprocessed_dir='preprocessed_sequences')
    print(f"Total samples: {len(X)}")

    # Stop with a clear message instead of the opaque
    # "max() arg is an empty sequence" that would be raised just below.
    if not X:
        raise SystemExit("No sequences found in 'preprocessed_sequences'.")

    # Pad everything to the longest sequence so no frames are discarded.
    max_seq_length = max(seq.shape[0] for seq in X)
    print(f"Maximum sequence length: {max_seq_length}")

    X_padded = pad_sequences_fixed(X, max_seq_length)
    print(f"Padded sequences shape: {X_padded.shape}")

    # Fixed random_state keeps the 80/20 split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)
    print(f"Training samples: {X_train.shape[0]}")
    print(f"Testing samples: {X_test.shape[0]}")

    save_dataset(X_train, X_test, y_train, y_test, label_map, output_path='dataset_sequences.pkl')
| |
|