Teeradej Sawettraporn committed
Commit 7942e02 · verified · Parent: 314f0b8

Training Tools and Environment upload

Training Tools/CNN/valid_f1_earlystop_mastered.py ADDED
@@ -0,0 +1,302 @@
+ # After fixing the logging
+ import torch
+ import torch.optim as optim
+ import torch.nn as nn
+ from torch.utils.data import DataLoader
+ from sklearn.model_selection import train_test_split
+ from torch.utils.data import Dataset, SubsetRandomSampler
+ from sklearn.metrics import classification_report
+ import numpy as np
+ import torchvision.models as models
+ from datetime import datetime
+ import os
+
+ # Custom Dataset to load .npz data
+ class DNASequencesDataset(Dataset):
+     def __init__(self, npz_file):
+         data = np.load(npz_file)
+         self.dna_sequences = data['dna_sequences']
+         self.labels = data['labels']
+
+     def __len__(self):
+         return len(self.dna_sequences)
+
+     def __getitem__(self, idx):
+         dna_seq = torch.tensor(self.dna_sequences[idx], dtype=torch.float32)  # [4, 224, 224]
+         label = torch.tensor(self.labels[idx], dtype=torch.long)
+         return dna_seq, label
+
+ # Model definition
+ class VGG16Modified(nn.Module):
+     def __init__(self, num_classes=3):
+         super(VGG16Modified, self).__init__()
+         vgg16 = models.vgg16(pretrained=True)  # 'pretrained' is deprecated; newer torchvision uses weights=models.VGG16_Weights.DEFAULT
+
+         # Modify the input layer to accept 4 channels (one per nucleotide)
+         vgg16.features[0] = nn.Conv2d(in_channels=4, out_channels=64, kernel_size=3, stride=1, padding=1)
+
+         # Retain the rest of the model
+         self.features = vgg16.features
+         self.classifier = nn.Sequential(
+             nn.Linear(512 * 7 * 7, 4096),
+             nn.ReLU(inplace=True),
+             nn.Dropout(),
+             nn.Linear(4096, 4096),
+             nn.ReLU(inplace=True),
+             nn.Dropout(),
+             nn.Linear(4096, num_classes)  # match the number of output classes
+         )
+
+     def forward(self, x):
+         x = self.features(x)
+         x = x.view(x.size(0), -1)  # Flatten for the fully connected layers
+         x = self.classifier(x)
+         return x
+
+ # Create the log file and write the run configuration
+ def create_log(npz_file, checkpoint_interval, num_epochs, log_dir, learning_rate, batch_size, test_size, val_size, num_classes, optimizer):
+     current_date = datetime.now().strftime("%d-%m-%y")
+
+     # Ensure the log directory exists
+     if not os.path.exists(log_dir):
+         os.makedirs(log_dir)
+
+     log_filename = os.path.join(log_dir, f'log_{current_date}.txt')
+
+     with open(log_filename, 'w') as log_file:
+         log_file.write(f"File: {npz_file}\n")
+         log_file.write(f"Running date: {current_date}\n")
+         log_file.write("Hyperparameters:\n")
+         log_file.write(f" - Number of epochs: {num_epochs}\n")
+         log_file.write(f" - Checkpoint interval: {checkpoint_interval}\n")
+         log_file.write(f" - Learning rate: {learning_rate}\n")
+         log_file.write(f" - Batch size: {batch_size}\n")
+         log_file.write(f" - Test size: {test_size}\n")
+         log_file.write(f" - Validation size: {val_size}\n")
+         log_file.write(f" - Number of classes: {num_classes}\n")
+
+         # Log optimizer details
+         log_file.write("Optimizer:\n")
+         log_file.write(f" - Optimizer Type: {optimizer.__class__.__name__}\n")
+         log_file.write(f" - Learning Rate: {optimizer.param_groups[0]['lr']}\n")
+
+         # Add optimizer-specific details if available
+         if isinstance(optimizer, optim.SGD):
+             log_file.write(f" - Momentum: {optimizer.param_groups[0].get('momentum', 0)}\n")
+             log_file.write(f" - Weight Decay: {optimizer.param_groups[0].get('weight_decay', 0)}\n")
+         elif isinstance(optimizer, optim.Adam):
+             log_file.write(f" - Betas: {optimizer.param_groups[0].get('betas', (0.9, 0.999))}\n")
+             log_file.write(f" - Weight Decay: {optimizer.param_groups[0].get('weight_decay', 0)}\n")
+
+         log_file.write(f"Log directory: {log_dir}\n")
+
+     return log_filename
+
+ def log_epoch(log_filename, epoch, test_loss, test_acc, val_loss, val_acc):
+     with open(log_filename, 'a') as log_file:
+         log_file.write(f"Epoch {epoch} | test_loss = {test_loss:.4f}, test_acc = {test_acc:.4f}, ")
+         log_file.write(f"val_loss = {val_loss:.4f}, val_acc = {val_acc:.4f}\n")
+
+ # Split data into train, validation, and test sets
+ def split_data(dataset, test_size=0.2, val_size=0.1, batch_size=32):
+     dataset_size = len(dataset)
+
+     # Split into train and test sets
+     indices = list(range(dataset_size))
+     train_indices, test_indices = train_test_split(indices, test_size=test_size, random_state=42)
+
+     # Further split the train set; val_size is rescaled so it stays a fraction of the full dataset
+     train_indices, val_indices = train_test_split(train_indices, test_size=val_size / (1 - test_size), random_state=42)
+
+     # Define samplers for the training, validation, and test sets
+     train_sampler = SubsetRandomSampler(train_indices)
+     val_sampler = SubsetRandomSampler(val_indices)
+     test_sampler = SubsetRandomSampler(test_indices)
+
+     # Create DataLoaders for each split
+     train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
+     val_loader = DataLoader(dataset, batch_size=batch_size, sampler=val_sampler)
+     test_loader = DataLoader(dataset, batch_size=batch_size, sampler=test_sampler)
+
+     return train_loader, val_loader, test_loader
+
+
+ def train_model(
+     model, train_loader, val_loader, test_loader, criterion, optimizer, num_epochs,
+     device, checkpoint_interval, log_filename, checkpoint_dir, best_model_path, patience=10
+ ):
+     model.to(device)
+     best_val_acc = 0.0
+     epochs_since_improvement = 0
+
+     for epoch in range(1, num_epochs + 1):
+         model.train()
+         running_loss = 0.0
+         correct_predictions = 0
+         total_predictions = 0
+
+         for inputs, labels in train_loader:
+             inputs, labels = inputs.to(device), labels.to(device)
+             optimizer.zero_grad()
+             outputs = model(inputs)
+             loss = criterion(outputs, labels)
+             loss.backward()
+             optimizer.step()
+
+             running_loss += loss.item()
+             _, predicted = torch.max(outputs, 1)
+             correct_predictions += (predicted == labels).sum().item()
+             total_predictions += labels.size(0)
+
+         # Calculate training accuracy
+         train_accuracy = correct_predictions / total_predictions
+
+         # Validate the model and calculate metrics
+         val_loss, val_acc, val_f1_report = validate_model(model, val_loader, criterion, device)
+
+         # Evaluate on the test set
+         test_loss, test_acc = test_model(model, test_loader, criterion, device, log=False)
+
+         # Save the best model if validation accuracy improves
+         if val_acc > best_val_acc:
+             best_val_acc = val_acc
+             torch.save(model.state_dict(), best_model_path)
+             print(f"New best model saved with validation accuracy: {val_acc:.4f} at epoch {epoch}")
+             epochs_since_improvement = 0
+         else:
+             epochs_since_improvement += 1
+
+         # Log training, validation, and test metrics
+         if log_filename:
+             with open(log_filename, 'a') as log_file:
+                 log_file.write(
+                     f"Epoch {epoch} | train_loss = {running_loss / len(train_loader):.4f}, "
+                     f"train_acc = {train_accuracy:.4f}, "
+                     f"val_loss = {val_loss:.4f}, val_acc = {val_acc:.4f}, "
+                     f"test_loss = {test_loss:.4f}, test_acc = {test_acc:.4f}\n"
+                 )
+                 log_file.write(f"Classification Report:\n{val_f1_report}\n")
+
+         # Save checkpoints at specified intervals
+         if epoch % checkpoint_interval == 0:
+             checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch}.pth')
+             save_checkpoint(model, optimizer, epoch, 0, checkpoint_path)
+
+         # Handle early stopping
+         if epochs_since_improvement >= patience:
+             print(f"Early stopping triggered at epoch {epoch}. Best validation accuracy: {best_val_acc:.4f}")
+             break
+
+
+ def validate_model(model, val_loader, criterion, device):
+     model.eval()
+     running_val_loss = 0.0
+     correct_val_predictions = 0
+     total_val_predictions = 0
+     all_labels = []
+     all_predictions = []
+
+     with torch.no_grad():
+         for inputs, labels in val_loader:
+             inputs, labels = inputs.to(device), labels.to(device)
+             outputs = model(inputs)
+             loss = criterion(outputs, labels)
+             running_val_loss += loss.item()
+
+             _, predicted = torch.max(outputs, 1)
+             correct_val_predictions += (predicted == labels).sum().item()
+             total_val_predictions += labels.size(0)
+             all_labels.extend(labels.cpu().numpy())
+             all_predictions.extend(predicted.cpu().numpy())
+
+     # Calculate validation loss and accuracy
+     val_loss = running_val_loss / len(val_loader)
+     val_accuracy = correct_val_predictions / total_val_predictions
+
+     # Generate a classification report for the F1-score
+     val_f1_report = classification_report(all_labels, all_predictions, digits=4)
+
+     return val_loss, val_accuracy, val_f1_report
+
+
+ def test_model(model, test_loader, criterion, device, log=True):
+     model.eval()  # Set model to evaluation mode
+     running_test_loss = 0.0
+     correct_test_predictions = 0
+     total_test_predictions = 0
+
+     with torch.no_grad():  # Disable gradient computation
+         for inputs, labels in test_loader:
+             inputs, labels = inputs.to(device), labels.to(device)
+
+             # Forward pass
+             outputs = model(inputs)
+             loss = criterion(outputs, labels)
+
+             running_test_loss += loss.item()
+             _, predicted = torch.max(outputs, 1)
+             correct_test_predictions += (predicted == labels).sum().item()
+             total_test_predictions += labels.size(0)
+
+     # Calculate test loss and accuracy
+     test_loss = running_test_loss / len(test_loader)
+     test_accuracy = correct_test_predictions / total_test_predictions
+
+     if log:
+         print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")
+
+     return test_loss, test_accuracy
+
+
+ # Save a training checkpoint
+ def save_checkpoint(model, optimizer, epoch, step, checkpoint_path):
+     checkpoint = {
+         'epoch': epoch,
+         'step': step,
+         'model_state_dict': model.state_dict(),
+         'optimizer_state_dict': optimizer.state_dict(),
+     }
+     torch.save(checkpoint, checkpoint_path)
+     print(f"Checkpoint saved at {checkpoint_path}")
+
+ if __name__ == "__main__":
+     # Hyperparameters and configuration
+     npz_file = '/home/user/torch_shrimp/until-tools/mod/Shrimp_V1_5.npz'
+     num_classes = 3
+     learning_rate = 0.0001
+     num_epochs = 20
+     batch_size = 32
+     test_size = 0.2
+     val_size = 0.2
+     checkpoint_interval = 2
+     momentum = 0.9
+     weight_decay = 0.001
+     patience = 1
+     log_dir = '/home/user/torch_shrimp/until-tools/mod/vgg16_mod/file_tunning/tune_14/tune14_b10'
+     checkpoint_dir = log_dir  # Directory for saving checkpoints
+     model_save_path = os.path.join(log_dir, 'saved_model.pth')  # Best-model save path
+
+     # Create the dataset and split it into train, val, and test DataLoaders
+     dataset = DNASequencesDataset(npz_file)
+     train_loader, val_loader, test_loader = split_data(dataset, test_size=test_size, val_size=val_size, batch_size=batch_size)
+
+     # Initialize the model
+     model = VGG16Modified(num_classes=num_classes)
+
+     # Define the criterion and optimizer
+     criterion = nn.CrossEntropyLoss()
+
+     # Use SGD with momentum instead of Adam
+     optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, weight_decay=weight_decay)
+     # optimizer = optim.Adam(model.parameters(), lr=learning_rate)
+
+     # Create the log file
+     log_filename = create_log(npz_file, checkpoint_interval, num_epochs, log_dir, learning_rate, batch_size, test_size, val_size, num_classes, optimizer)
+
+     # Training device
+     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+     # Train the model
+     train_model(model, train_loader, val_loader, test_loader, criterion, optimizer, num_epochs, device, checkpoint_interval, log_filename, checkpoint_dir, model_save_path, patience=patience)
+
+
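Note on the expected input: DNASequencesDataset above loads an .npz archive containing a dna_sequences array of shape [N, 4, 224, 224] (one channel per nucleotide) and an integer labels array. The preprocessing that produced Shrimp_V1_5.npz is not part of this commit; the following is a minimal, hypothetical sketch of one way to build a compatible archive. The one-hot layout, the truncate/zero-pad rule, the make_onehot_image helper, and the output file name are all assumptions, not the project's actual pipeline.

# Hypothetical preprocessing sketch (not part of this commit): builds an .npz
# with the array names and shapes that DNASequencesDataset expects.
import numpy as np

BASE_TO_CHANNEL = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

def make_onehot_image(seq, size=224):
    """One-hot encode a DNA string into a [4, size, size] float32 array.
    Sequences are truncated or zero-padded to size*size bases (an assumption)."""
    img = np.zeros((4, size * size), dtype=np.float32)
    for i, base in enumerate(seq[: size * size]):
        ch = BASE_TO_CHANNEL.get(base.upper())
        if ch is not None:  # unknown bases (e.g. 'N') stay all-zero
            img[ch, i] = 1.0
    return img.reshape(4, size, size)

if __name__ == "__main__":
    # Toy example: two short sequences with integer class labels.
    seqs = ["ACGT" * 100, "TTGA" * 100]
    labels = [0, 2]
    dna_sequences = np.stack([make_onehot_image(s) for s in seqs])
    np.savez_compressed("Shrimp_example.npz",
                        dna_sequences=dna_sequences,
                        labels=np.asarray(labels, dtype=np.int64))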
Training Tools/CNN/valid_testing_sample.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Training Tools/RNN/valid_LSTM_embedded_v7.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Training Tools/RNN/valid_k-mer-score.ipynb ADDED
@@ -0,0 +1,109 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 5,
+    "id": "87acfd86-b1f7-4ec9-b1b4-01d5b63bf435",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Number of unique k-mers: 64\n",
+       "Label mapping: {'AHPND': 0, 'WSSV': 1, 'healthy': 2}\n",
+       "K-mer feature matrix and labels saved to /home/user/torch_shrimp/until-tools/mod/k-mer/test5101.csv\n"
+      ]
+     }
+    ],
+    "source": [
+     "import pandas as pd\n",
+     "import numpy as np\n",
+     "from sklearn.preprocessing import LabelEncoder\n",
+     "\n",
+     "def compute_kmer_scores_to_csv(input_csv, output_csv, kmer_column='K-mer', label_column='status'):\n",
+     "    \"\"\"\n",
+     "    Compute k-mer frequency-based scores for each DNA sequence in a CSV file\n",
+     "    and save the resulting feature matrix and labels into a new CSV file.\n",
+     "\n",
+     "    Parameters:\n",
+     "        input_csv (str): Path to the input CSV file.\n",
+     "        output_csv (str): Path to save the output CSV file.\n",
+     "        kmer_column (str): Column name containing k-mer sequences.\n",
+     "        label_column (str): Column name containing labels.\n",
+     "    \"\"\"\n",
+     "    # Load the CSV file\n",
+     "    df = pd.read_csv(input_csv)\n",
+     "\n",
+     "    # Get the unique k-mers across all sequences\n",
+     "    all_kmers = set()\n",
+     "    for seq in df[kmer_column]:\n",
+     "        all_kmers.update(seq.split())\n",
+     "    kmer_vocab = sorted(all_kmers)  # Consistent ordering\n",
+     "\n",
+     "    # Create a mapping from k-mers to indices\n",
+     "    kmer_to_index = {kmer: idx for idx, kmer in enumerate(kmer_vocab)}\n",
+     "    vocab_size = len(kmer_vocab)\n",
+     "    print(f\"Number of unique k-mers: {vocab_size}\")\n",
+     "\n",
+     "    # Compute k-mer frequency vectors\n",
+     "    feature_matrix = []\n",
+     "    for seq in df[kmer_column]:\n",
+     "        # Initialize a frequency vector for the sequence\n",
+     "        kmer_counts = np.zeros(vocab_size, dtype=np.float32)\n",
+     "        for kmer in seq.split():\n",
+     "            kmer_counts[kmer_to_index[kmer]] += 1\n",
+     "        # Normalize frequencies\n",
+     "        kmer_counts /= kmer_counts.sum()  # Ensure probabilities sum to 1\n",
+     "        feature_matrix.append(kmer_counts)\n",
+     "\n",
+     "    # Convert to DataFrame\n",
+     "    feature_df = pd.DataFrame(feature_matrix, columns=kmer_vocab)\n",
+     "\n",
+     "    # Encode the labels as integers\n",
+     "    label_encoder = LabelEncoder()\n",
+     "    feature_df[label_column] = label_encoder.fit_transform(df[label_column])\n",
+     "    label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))\n",
+     "    print(f\"Label mapping: {label_mapping}\")\n",
+     "\n",
+     "    # Save the feature matrix and labels to a new CSV file\n",
+     "    feature_df.to_csv(output_csv, index=False)\n",
+     "    print(f\"K-mer feature matrix and labels saved to {output_csv}\")\n",
+     "\n",
+     "# Example usage\n",
+     "input_csv = '/home/user/torch_shrimp/dataset/Mixed/Cleansed-kmer/kmer_test5101.csv'\n",
+     "output_csv = '/home/user/torch_shrimp/until-tools/mod/k-mer/test5101.csv'\n",
+     "compute_kmer_scores_to_csv(input_csv, output_csv)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "63208b57-0c68-4e16-9b6f-f8a733034e4d",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3 (ipykernel)",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.12"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
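Note on downstream use: the notebook writes a CSV whose columns are the 64 k-mer frequencies plus an integer-encoded status label (AHPND=0, WSSV=1, healthy=2). How this matrix is consumed is not shown in this commit; the following is a minimal, hypothetical sketch of fitting a baseline classifier on it. The RandomForest choice and the CSV path are assumptions for illustration only.

# Hypothetical follow-up (not part of this commit): load the CSV produced by
# compute_kmer_scores_to_csv and fit a quick baseline classifier on it.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

df = pd.read_csv("test5101.csv")   # path is illustrative
X = df.drop(columns=["status"])    # the 64 k-mer frequency columns
y = df["status"]                   # encoded labels: AHPND=0, WSSV=1, healthy=2

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test), digits=4))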