jskvrna commited on
Commit
fb55e87
·
1 Parent(s): 8e0796b

Improves training efficiency and configuration

Browse files

Increases the number of workers for the data loader to improve data loading speed.

Adjusts SLURM job configuration to allocate more CPUs per task, potentially speeding up computations.

Modifies the training script to save model weights to an explicit initial checkpoint path (`initial.pth`), and passes explicit training parameters (epochs, batch size, learning rate, and score weight) instead of relying on implicit defaults, for better reproducibility and control over the training process.

Files changed (3) hide show
  1. fast_pointnet.py +1 -1
  2. hoho_gpu.batch +1 -1
  3. train_pnet_cluster.py +2 -2
fast_pointnet.py CHANGED
@@ -274,7 +274,7 @@ def train_pointnet(dataset_dir: str, model_save_path: str, epochs: int = 100, ba
274
 
275
  return patch_data, targets, valid_masks, distances
276
 
277
- dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4,
278
  collate_fn=collate_fn, drop_last=True)
279
 
280
  # Initialize model with score prediction
 
274
 
275
  return patch_data, targets, valid_masks, distances
276
 
277
+ dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=8,
278
  collate_fn=collate_fn, drop_last=True)
279
 
280
  # Initialize model with score prediction
hoho_gpu.batch CHANGED
@@ -1,7 +1,7 @@
1
  #!/bin/bash
2
  #SBATCH --nodes=1 # 1 node
3
  #SBATCH --ntasks-per-node=1 # 1 tasks per node
4
- #SBATCH --cpus-per-task=8 # 6 CPUS per task = 12 CPUS per node
5
  #SBATCH --mem-per-cpu=10G # 8GB per CPU = 96GB per node
6
  #SBATCH --time=24:00:00 # time limits: 1 hour
7
  #SBATCH --error=hoho_gpu.err # standard error file
 
1
  #!/bin/bash
2
  #SBATCH --nodes=1 # 1 node
3
  #SBATCH --ntasks-per-node=1 # 1 tasks per node
4
+ #SBATCH --cpus-per-task=16 # 6 CPUS per task = 12 CPUS per node
5
  #SBATCH --mem-per-cpu=10G # 8GB per CPU = 96GB per node
6
  #SBATCH --time=24:00:00 # time limits: 1 hour
7
  #SBATCH --error=hoho_gpu.err # standard error file
train_pnet_cluster.py CHANGED
@@ -4,7 +4,7 @@ if __name__ == "__main__":
4
 
5
  # Load the dataset
6
  dataset_path = "/mnt/personal/skvrnjan/hohocustom/"
7
- model_save_path = "/mnt/personal/skvrnjan/hoho_pnet/"
8
 
9
  # Train the model
10
- train_pointnet(dataset_path, model_save_path)
 
4
 
5
  # Load the dataset
6
  dataset_path = "/mnt/personal/skvrnjan/hohocustom/"
7
+ model_save_path = "/mnt/personal/skvrnjan/hoho_pnet/initial.pth"
8
 
9
  # Train the model
10
+ train_pointnet(dataset_path, model_save_path, epochs=100, batch_size=128, learning_rate=0.001, score_weight=0.1)