jskvrna commited on
Commit
fb55e87
·
1 Parent(s): 8e0796b

Improves training efficiency and configuration

Browse files

Increases the number of workers for the data loader to improve data loading speed.

Adjusts SLURM job configuration to allocate more CPUs per task, potentially speeding up computations.

Modifies the training script to save model weights to an explicit initial checkpoint path (`initial.pth`), and passes explicit training parameters (epochs, batch size, learning rate, and score weight) instead of relying on implicit defaults, for better reproducibility and control over the training process.

Files changed (3) hide show
  1. fast_pointnet.py +1 -1
  2. hoho_gpu.batch +1 -1
  3. train_pnet_cluster.py +2 -2
fast_pointnet.py CHANGED
@@ -274,7 +274,7 @@ def train_pointnet(dataset_dir: str, model_save_path: str, epochs: int = 100, ba
274
 
275
  return patch_data, targets, valid_masks, distances
276
 
277
- dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4,
278
  collate_fn=collate_fn, drop_last=True)
279
 
280
  # Initialize model with score prediction
 
274
 
275
  return patch_data, targets, valid_masks, distances
276
 
277
+ dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=8,
278
  collate_fn=collate_fn, drop_last=True)
279
 
280
  # Initialize model with score prediction
hoho_gpu.batch CHANGED
@@ -1,7 +1,7 @@
1
  #!/bin/bash
2
  #SBATCH --nodes=1 # 1 node
3
  #SBATCH --ntasks-per-node=1 # 1 tasks per node
4
- #SBATCH --cpus-per-task=8 # 6 CPUS per task = 12 CPUS per node
5
  #SBATCH --mem-per-cpu=10G # 8GB per CPU = 96GB per node
6
  #SBATCH --time=24:00:00 # time limits: 1 hour
7
  #SBATCH --error=hoho_gpu.err # standard error file
 
1
  #!/bin/bash
2
  #SBATCH --nodes=1 # 1 node
3
  #SBATCH --ntasks-per-node=1 # 1 tasks per node
4
+ #SBATCH --cpus-per-task=16 # 6 CPUS per task = 12 CPUS per node
5
  #SBATCH --mem-per-cpu=10G # 8GB per CPU = 96GB per node
6
  #SBATCH --time=24:00:00 # time limits: 1 hour
7
  #SBATCH --error=hoho_gpu.err # standard error file
train_pnet_cluster.py CHANGED
@@ -4,7 +4,7 @@ if __name__ == "__main__":
4
 
5
  # Load the dataset
6
  dataset_path = "/mnt/personal/skvrnjan/hohocustom/"
7
- model_save_path = "/mnt/personal/skvrnjan/hoho_pnet/"
8
 
9
  # Train the model
10
- train_pointnet(dataset_path, model_save_path)
 
4
 
5
  # Load the dataset
6
  dataset_path = "/mnt/personal/skvrnjan/hohocustom/"
7
+ model_save_path = "/mnt/personal/skvrnjan/hoho_pnet/initial.pth"
8
 
9
  # Train the model
10
+ train_pointnet(dataset_path, model_save_path, epochs=100, batch_size=128, learning_rate=0.001, score_weight=0.1)