Commit ·
d7ea743
1
Parent(s): 3d180ae
Uncomment layer assignment
Browse files
I had commented out the entire block-initialization for loop, instead of just the device-management code
model.py
CHANGED
|
@@ -641,26 +641,26 @@ class StripedHyena(nn.Module):
|
|
| 641 |
# layers_per_gpu = math.ceil(config.num_layers / num_gpus)
|
| 642 |
# self.logger.info(f"Distributing across {num_gpus} GPUs, approximately {layers_per_gpu} layers per GPU")
|
| 643 |
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
|
| 665 |
# with torch.device(self.block_idx_to_device[0]):
|
| 666 |
# with torch.cuda.device(self.block_idx_to_device[0]):
|
|
|
|
| 641 |
# layers_per_gpu = math.ceil(config.num_layers / num_gpus)
|
| 642 |
# self.logger.info(f"Distributing across {num_gpus} GPUs, approximately {layers_per_gpu} layers per GPU")
|
| 643 |
|
| 644 |
+
for layer_idx in tqdm(range(config.num_layers)):
|
| 645 |
+
# Determine which GPU should handle this layer
|
| 646 |
+
# device_idx = min(layer_idx // layers_per_gpu, num_gpus - 1)
|
| 647 |
+
# device = f"cuda:{device_idx}" if torch.cuda.is_available() else "cpu"
|
| 648 |
+
|
| 649 |
+
# with torch.device(device):
|
| 650 |
+
# TELinear uses `device="cuda"` device to allocate empty bias
|
| 651 |
+
# tensor. This makes sure that the empty tensor is allocated on the
|
| 652 |
+
# correct device. (torch.device(), unlike torch.cuda.device(),
|
| 653 |
+
# doesn't override current CUDA device.)
|
| 654 |
+
# with torch.cuda.device(device):
|
| 655 |
+
block = get_block(config, layer_idx, flash_fft=self.flash_fft)
|
| 656 |
+
# move_to_device(block, device)
|
| 657 |
+
|
| 658 |
+
self.blocks.append(block)
|
| 659 |
+
# self.block_idx_to_device[layer_idx] = device
|
| 660 |
+
# self.logger.info(f"Assigned {layer_idx=} to {device=}")
|
| 661 |
+
# self.logger.info(
|
| 662 |
+
# f"Parameter count for block {layer_idx}: {sum(p.numel() for p in self.blocks[-1].parameters())}"
|
| 663 |
+
# )
|
| 664 |
|
| 665 |
# with torch.device(self.block_idx_to_device[0]):
|
| 666 |
# with torch.cuda.device(self.block_idx_to_device[0]):
|