Teeradej Sawettraporn committed on
Training Tools and Environment upload
Training Tools/CNN/valid_f1_earlystop_mastered.py
ADDED
@@ -0,0 +1,302 @@
# After fixing the logging
import os
from datetime import datetime

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler

# Custom Dataset to load npz data
class DNASequencesDataset(Dataset):
    def __init__(self, npz_file):
        data = np.load(npz_file)
        self.dna_sequences = data['dna_sequences']
        self.labels = data['labels']

    def __len__(self):
        return len(self.dna_sequences)

    def __getitem__(self, idx):
        dna_seq = torch.tensor(self.dna_sequences[idx], dtype=torch.float32)  # [4, 224, 224]
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return dna_seq, label

# Model definition
class VGG16Modified(nn.Module):
    def __init__(self, num_classes=3):
        super(VGG16Modified, self).__init__()
        vgg16 = models.vgg16(pretrained=True)

        # Modify the input layer to accept 4 channels; the replacement conv is
        # freshly initialized, so the pretrained weights of the original first
        # layer are discarded
        vgg16.features[0] = nn.Conv2d(in_channels=4, out_channels=64, kernel_size=3, stride=1, padding=1)

        # Retain the rest of the model
        self.features = vgg16.features
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, num_classes)  # Match the number of output classes
        )

    def forward(self, x):
        x = self.features(x)       # [N, 512, 7, 7] for 224x224 inputs
        x = x.view(x.size(0), -1)  # Flatten for the fully connected layers
        x = self.classifier(x)
        return x

# Create log file and log messages
def create_log(npz_file, checkpoint_interval, num_epochs, log_dir, learning_rate, batch_size, test_size, val_size, num_classes, optimizer):
    current_date = datetime.now().strftime("%d-%m-%y")

    # Ensure the log directory exists
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_filename = os.path.join(log_dir, f'log_{current_date}.txt')

    with open(log_filename, 'w') as log_file:
        log_file.write(f"File: {npz_file}\n")
        log_file.write(f"Running date: {current_date}\n")
        log_file.write("Hyperparameters:\n")
        log_file.write(f" - Number of epochs: {num_epochs}\n")
        log_file.write(f" - Checkpoint interval: {checkpoint_interval}\n")
        log_file.write(f" - Learning rate: {learning_rate}\n")
        log_file.write(f" - Batch size: {batch_size}\n")
        log_file.write(f" - Test size: {test_size}\n")
        log_file.write(f" - Validation size: {val_size}\n")
        log_file.write(f" - Number of classes: {num_classes}\n")

        # Log optimizer details
        log_file.write("Optimizer:\n")
        log_file.write(f" - Optimizer Type: {optimizer.__class__.__name__}\n")
        log_file.write(f" - Learning Rate: {optimizer.param_groups[0]['lr']}\n")

        # Add specific optimizer details if available
        if isinstance(optimizer, optim.SGD):
            log_file.write(f" - Momentum: {optimizer.param_groups[0].get('momentum', 0)}\n")
            log_file.write(f" - Weight Decay: {optimizer.param_groups[0].get('weight_decay', 0)}\n")
        elif isinstance(optimizer, optim.Adam):
            log_file.write(f" - Betas: {optimizer.param_groups[0].get('betas', (0.9, 0.999))}\n")
            log_file.write(f" - Weight Decay: {optimizer.param_groups[0].get('weight_decay', 0)}\n")

        log_file.write(f"Log directory: {log_dir}\n")

    return log_filename

def log_epoch(log_filename, epoch, test_loss, test_acc, val_loss, val_acc):
    with open(log_filename, 'a') as log_file:
        log_file.write(f"Epoch {epoch} | test_loss = {test_loss:.4f}, test_acc = {test_acc:.4f}, ")
        log_file.write(f"val_loss = {val_loss:.4f}, val_acc = {val_acc:.4f}\n")

# Split data into train, validation, and test sets
def split_data(dataset, test_size=0.2, val_size=0.1, batch_size=32):
    dataset_size = len(dataset)

    # Split into train and test sets
    indices = list(range(dataset_size))
    train_indices, test_indices = train_test_split(indices, test_size=test_size, random_state=42)

    # Further split the train set into train and validation sets
    # (val_size is a fraction of the whole dataset, so rescale it to the train split)
    train_indices, val_indices = train_test_split(train_indices, test_size=val_size / (1 - test_size), random_state=42)

    # Define samplers for the training, validation, and test sets
    train_sampler = SubsetRandomSampler(train_indices)
    val_sampler = SubsetRandomSampler(val_indices)
    test_sampler = SubsetRandomSampler(test_indices)

    # Create DataLoaders for each split
    train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
    val_loader = DataLoader(dataset, batch_size=batch_size, sampler=val_sampler)
    test_loader = DataLoader(dataset, batch_size=batch_size, sampler=test_sampler)

    return train_loader, val_loader, test_loader

def train_model(
    model, train_loader, val_loader, test_loader, criterion, optimizer, num_epochs,
    device, checkpoint_interval, log_filename, checkpoint_dir, best_model_path, patience=10
):
    model.to(device)
    best_val_acc = 0.0
    epochs_since_improvement = 0

    for epoch in range(1, num_epochs + 1):
        model.train()
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()
            total_predictions += labels.size(0)

        # Calculate training accuracy
        train_accuracy = correct_predictions / total_predictions

        # Validate the model and calculate metrics
        val_loss, val_acc, val_f1_report = validate_model(model, val_loader, criterion, device)

        # Evaluate on the test set
        test_loss, test_acc = test_model(model, test_loader, criterion, device, log=False)

        # Save the best model if validation accuracy improves
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), best_model_path)
            print(f"New best model saved with validation accuracy: {val_acc:.4f} at epoch {epoch}")
            epochs_since_improvement = 0
        else:
            epochs_since_improvement += 1

        # Log training, validation, and test metrics
        if log_filename:
            with open(log_filename, 'a') as log_file:
                log_file.write(
                    f"Epoch {epoch} | train_loss = {running_loss / len(train_loader):.4f}, "
                    f"train_acc = {train_accuracy:.4f}, "
                    f"val_loss = {val_loss:.4f}, val_acc = {val_acc:.4f}, "
                    f"test_loss = {test_loss:.4f}, test_acc = {test_acc:.4f}\n"
                )
                log_file.write(f"Classification Report:\n{val_f1_report}\n")

        # Save checkpoints at specified intervals
        if epoch % checkpoint_interval == 0:
            checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch}.pth')
            save_checkpoint(model, optimizer, epoch, 0, checkpoint_path)

        # Handle early stopping
        if epochs_since_improvement >= patience:
            print(f"Early stopping triggered at epoch {epoch}. Best validation accuracy: {best_val_acc:.4f}")
            break


def validate_model(model, val_loader, criterion, device):
    model.eval()
    running_val_loss = 0.0
    correct_val_predictions = 0
    total_val_predictions = 0
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            correct_val_predictions += (predicted == labels).sum().item()
            total_val_predictions += labels.size(0)
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

    # Calculate validation loss and accuracy
    val_loss = running_val_loss / len(val_loader)
    val_accuracy = correct_val_predictions / total_val_predictions

    # Generate a classification report for the F1-scores
    val_f1_report = classification_report(all_labels, all_predictions, digits=4)

    return val_loss, val_accuracy, val_f1_report


def test_model(model, test_loader, criterion, device, log=True):
    model.eval()  # Set model to evaluation mode
    running_test_loss = 0.0
    correct_test_predictions = 0
    total_test_predictions = 0

    with torch.no_grad():  # Disable gradient computation
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            running_test_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct_test_predictions += (predicted == labels).sum().item()
            total_test_predictions += labels.size(0)

    # Calculate test loss and accuracy
    test_loss = running_test_loss / len(test_loader)
    test_accuracy = correct_test_predictions / total_test_predictions

    if log:
        print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

    return test_loss, test_accuracy


# Save checkpoint
def save_checkpoint(model, optimizer, epoch, step, checkpoint_path):
    checkpoint = {
        'epoch': epoch,
        'step': step,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }
    torch.save(checkpoint, checkpoint_path)
    print(f"Checkpoint saved at {checkpoint_path}")

if __name__ == "__main__":
    # Hyperparameters and configuration
    npz_file = '/home/user/torch_shrimp/until-tools/mod/Shrimp_V1_5.npz'
    num_classes = 3
    learning_rate = 0.0001
    num_epochs = 20
    batch_size = 32
    test_size = 0.2
    val_size = 0.2
    checkpoint_interval = 2
    momentum = 0.9
    weight_decay = 0.001
    patience = 1
    log_dir = '/home/user/torch_shrimp/until-tools/mod/vgg16_mod/file_tunning/tune_14/tune14_b10'
    checkpoint_dir = log_dir  # Directory for saving checkpoints
    model_save_path = os.path.join(log_dir, 'saved_model.pth')  # Final model save path

    # Create the dataset and split it into train, val, and test DataLoaders
    dataset = DNASequencesDataset(npz_file)
    train_loader, val_loader, test_loader = split_data(dataset, test_size=test_size, val_size=val_size, batch_size=batch_size)

    # Initialize the model
    model = VGG16Modified(num_classes=num_classes)

    # Define the criterion and an optimizer with momentum
    criterion = nn.CrossEntropyLoss()

    # Use SGD with momentum instead of Adam
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, weight_decay=weight_decay)
    # optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create the log file
    log_filename = create_log(npz_file, checkpoint_interval, num_epochs, log_dir, learning_rate, batch_size, test_size, val_size, num_classes, optimizer)

    # Training configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Train the model
    train_model(model, train_loader, val_loader, test_loader, criterion, optimizer, num_epochs, device, checkpoint_interval, log_filename, checkpoint_dir, model_save_path, patience=patience)
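
For reference, the script assumes the .npz file holds a dna_sequences array of shape [N, 4, 224, 224] (one-hot encoded DNA rendered as 4-channel images) and an integer labels array with values 0..2. A minimal smoke-test sketch under that assumption, using the DNASequencesDataset and VGG16Modified classes defined above; the tiny array sizes and the tiny_shrimp.npz filename are illustrative, not part of the commit:

import numpy as np
import torch

# Fabricate a tiny stand-in for the real dataset (assumed npz layout)
np.savez('tiny_shrimp.npz',
         dna_sequences=np.random.rand(8, 4, 224, 224).astype(np.float32),
         labels=np.random.randint(0, 3, size=8))

dataset = DNASequencesDataset('tiny_shrimp.npz')
model = VGG16Modified(num_classes=3)

x, y = dataset[0]
logits = model(x.unsqueeze(0))  # add a batch dimension: [1, 4, 224, 224]
print(logits.shape)             # expected: torch.Size([1, 3])
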
Training Tools/CNN/valid_testing_sample.ipynb
ADDED
The diff for this file is too large to render. See raw diff.
Training Tools/RNN/valid_LSTM_embedded_v7.ipynb
ADDED
The diff for this file is too large to render. See raw diff.
Training Tools/RNN/valid_k-mer-score.ipynb
ADDED
@@ -0,0 +1,109 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "87acfd86-b1f7-4ec9-b1b4-01d5b63bf435",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of unique k-mers: 64\n",
      "Label mapping: {'AHPND': 0, 'WSSV': 1, 'healthy': 2}\n",
      "K-mer feature matrix and labels saved to /home/user/torch_shrimp/until-tools/mod/k-mer/test5101.csv\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "def compute_kmer_scores_to_csv(input_csv, output_csv, kmer_column='K-mer', label_column='status'):\n",
    "    \"\"\"\n",
    "    Compute k-mer frequency-based scores for each DNA sequence in a CSV file\n",
    "    and save the resulting feature matrix and labels into a new CSV file.\n",
    "\n",
    "    Parameters:\n",
    "        input_csv (str): Path to the input CSV file.\n",
    "        output_csv (str): Path to save the output CSV file.\n",
    "        kmer_column (str): Column name containing k-mer sequences.\n",
    "        label_column (str): Column name containing labels.\n",
    "    \"\"\"\n",
    "    # Load the CSV file\n",
    "    df = pd.read_csv(input_csv)\n",
    "\n",
    "    # Get the unique k-mers across all sequences\n",
    "    all_kmers = set()\n",
    "    for seq in df[kmer_column]:\n",
    "        all_kmers.update(seq.split())\n",
    "    kmer_vocab = sorted(all_kmers)  # Consistent ordering\n",
    "\n",
    "    # Create a mapping from k-mers to indices\n",
    "    kmer_to_index = {kmer: idx for idx, kmer in enumerate(kmer_vocab)}\n",
    "    vocab_size = len(kmer_vocab)\n",
    "    print(f\"Number of unique k-mers: {vocab_size}\")\n",
    "\n",
    "    # Compute k-mer frequency vectors\n",
    "    feature_matrix = []\n",
    "    for seq in df[kmer_column]:\n",
    "        # Initialize a frequency vector for the sequence\n",
    "        kmer_counts = np.zeros(vocab_size, dtype=np.float32)\n",
    "        for kmer in seq.split():\n",
    "            kmer_counts[kmer_to_index[kmer]] += 1\n",
    "        # Normalize frequencies\n",
    "        kmer_counts /= kmer_counts.sum()  # Ensure probabilities sum to 1\n",
    "        feature_matrix.append(kmer_counts)\n",
    "\n",
    "    # Convert to DataFrame\n",
    "    feature_df = pd.DataFrame(feature_matrix, columns=kmer_vocab)\n",
    "\n",
    "    # Encode the labels as integers\n",
    "    label_encoder = LabelEncoder()\n",
    "    feature_df[label_column] = label_encoder.fit_transform(df[label_column])\n",
    "    label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))\n",
    "    print(f\"Label mapping: {label_mapping}\")\n",
    "\n",
    "    # Save the feature matrix and labels to a new CSV file\n",
    "    feature_df.to_csv(output_csv, index=False)\n",
    "    print(f\"K-mer feature matrix and labels saved to {output_csv}\")\n",
    "\n",
    "# Example usage\n",
    "input_csv = '/home/user/torch_shrimp/dataset/Mixed/Cleansed-kmer/kmer_test5101.csv'\n",
    "output_csv = '/home/user/torch_shrimp/until-tools/mod/k-mer/test5101.csv'\n",
    "compute_kmer_scores_to_csv(input_csv, output_csv)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "63208b57-0c68-4e16-9b6f-f8a733034e4d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
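
The notebook assumes the input CSV already contains a K-mer column of space-separated k-mers; the logged vocabulary of 64 unique k-mers is consistent with k = 3 over the DNA alphabet, since 4^3 = 64. A hypothetical sketch of how such a column could be produced from raw sequences; the column names, example rows, output filename, and k value are assumptions, not taken from this commit:

import pandas as pd

def to_kmers(seq, k=3):
    # Break a DNA sequence into overlapping k-mers, joined by spaces
    return ' '.join(seq[i:i + k] for i in range(len(seq) - k + 1))

# Hypothetical raw input: one sequence per row with a disease-status label
df = pd.DataFrame({
    'sequence': ['ACGTACGT', 'TTGACCAAG'],
    'status':   ['healthy', 'WSSV'],
})
df['K-mer'] = df['sequence'].apply(to_kmers)
df[['K-mer', 'status']].to_csv('kmer_input_example.csv', index=False)
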