ishanjmukherjee committed on
Commit
d7ea743
·
1 Parent(s): 3d180ae

Uncomment layer assignment

Browse files

I had commented out the entire block-initialization for loop instead of just the device-management code

Files changed (1) hide show
  1. model.py +20 -20
model.py CHANGED
@@ -641,26 +641,26 @@ class StripedHyena(nn.Module):
641
  # layers_per_gpu = math.ceil(config.num_layers / num_gpus)
642
  # self.logger.info(f"Distributing across {num_gpus} GPUs, approximately {layers_per_gpu} layers per GPU")
643
 
644
- # for layer_idx in tqdm(range(config.num_layers)):
645
- # # Determine which GPU should handle this layer
646
- # device_idx = min(layer_idx // layers_per_gpu, num_gpus - 1)
647
- # device = f"cuda:{device_idx}" if torch.cuda.is_available() else "cpu"
648
-
649
- # with torch.device(device):
650
- # # TELinear uses `device="cuda"` device to allocate empty bias
651
- # # tensor. This makes sure that the empty tensor is allocated on the
652
- # # correct device. (torch.device(), unlike torch.cuda.device(),
653
- # # doesn't override current CUDA device.)
654
- # with torch.cuda.device(device):
655
- # block = get_block(config, layer_idx, flash_fft=self.flash_fft)
656
- # move_to_device(block, device)
657
-
658
- # self.blocks.append(block)
659
- # self.block_idx_to_device[layer_idx] = device
660
- # self.logger.info(f"Assigned {layer_idx=} to {device=}")
661
- # self.logger.info(
662
- # f"Parameter count for block {layer_idx}: {sum(p.numel() for p in self.blocks[-1].parameters())}"
663
- # )
664
 
665
  # with torch.device(self.block_idx_to_device[0]):
666
  # with torch.cuda.device(self.block_idx_to_device[0]):
 
641
  # layers_per_gpu = math.ceil(config.num_layers / num_gpus)
642
  # self.logger.info(f"Distributing across {num_gpus} GPUs, approximately {layers_per_gpu} layers per GPU")
643
 
644
+ for layer_idx in tqdm(range(config.num_layers)):
645
+ # Determine which GPU should handle this layer
646
+ # device_idx = min(layer_idx // layers_per_gpu, num_gpus - 1)
647
+ # device = f"cuda:{device_idx}" if torch.cuda.is_available() else "cpu"
648
+
649
+ # with torch.device(device):
650
+ # TELinear uses `device="cuda"` device to allocate empty bias
651
+ # tensor. This makes sure that the empty tensor is allocated on the
652
+ # correct device. (torch.device(), unlike torch.cuda.device(),
653
+ # doesn't override current CUDA device.)
654
+ # with torch.cuda.device(device):
655
+ block = get_block(config, layer_idx, flash_fft=self.flash_fft)
656
+ # move_to_device(block, device)
657
+
658
+ self.blocks.append(block)
659
+ # self.block_idx_to_device[layer_idx] = device
660
+ # self.logger.info(f"Assigned {layer_idx=} to {device=}")
661
+ # self.logger.info(
662
+ # f"Parameter count for block {layer_idx}: {sum(p.numel() for p in self.blocks[-1].parameters())}"
663
+ # )
664
 
665
  # with torch.device(self.block_idx_to_device[0]):
666
  # with torch.cuda.device(self.block_idx_to_device[0]):