Commit ·
ddd4988
1
Parent(s): c4ee29d
Fix device management bug
Browse files
AutoModel wasn't loading because moving data out of meta tensors (which from_pretrained creates) is not possible. The reason is that StripedHyena's class definition has intricate device management logic. Ripping out that logic from model.py turned out to be surprisingly painless: just remove a for loop that distributes layers across GPUs, and two instances of `with torch.device(...):`
Now we have a layer naming bug, which is fantastic progress
model.py
CHANGED
|
@@ -637,45 +637,45 @@ class StripedHyena(nn.Module):
|
|
| 637 |
self.block_idx_to_device = {}
|
| 638 |
|
| 639 |
# Calculate layers per GPU
|
| 640 |
-
num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 1
|
| 641 |
-
layers_per_gpu = math.ceil(config.num_layers / num_gpus)
|
| 642 |
-
self.logger.info(f"Distributing across {num_gpus} GPUs, approximately {layers_per_gpu} layers per GPU")
|
| 643 |
-
|
| 644 |
-
for layer_idx in tqdm(range(config.num_layers)):
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
with torch.device(self.block_idx_to_device[0]):
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
|
| 680 |
self.logger.info("Initialized model")
|
| 681 |
|
|
|
|
| 637 |
self.block_idx_to_device = {}
|
| 638 |
|
| 639 |
# Calculate layers per GPU
|
| 640 |
+
# num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 1
|
| 641 |
+
# layers_per_gpu = math.ceil(config.num_layers / num_gpus)
|
| 642 |
+
# self.logger.info(f"Distributing across {num_gpus} GPUs, approximately {layers_per_gpu} layers per GPU")
|
| 643 |
+
|
| 644 |
+
# for layer_idx in tqdm(range(config.num_layers)):
|
| 645 |
+
# # Determine which GPU should handle this layer
|
| 646 |
+
# device_idx = min(layer_idx // layers_per_gpu, num_gpus - 1)
|
| 647 |
+
# device = f"cuda:{device_idx}" if torch.cuda.is_available() else "cpu"
|
| 648 |
+
|
| 649 |
+
# with torch.device(device):
|
| 650 |
+
# # TELinear uses `device="cuda"` device to allocate empty bias
|
| 651 |
+
# # tensor. This makes sure that the empty tensor is allocated on the
|
| 652 |
+
# # correct device. (torch.device(), unlike torch.cuda.device(),
|
| 653 |
+
# # doesn't override current CUDA device.)
|
| 654 |
+
# with torch.cuda.device(device):
|
| 655 |
+
# block = get_block(config, layer_idx, flash_fft=self.flash_fft)
|
| 656 |
+
# move_to_device(block, device)
|
| 657 |
+
|
| 658 |
+
# self.blocks.append(block)
|
| 659 |
+
# self.block_idx_to_device[layer_idx] = device
|
| 660 |
+
# self.logger.info(f"Assigned {layer_idx=} to {device=}")
|
| 661 |
+
# self.logger.info(
|
| 662 |
+
# f"Parameter count for block {layer_idx}: {sum(p.numel() for p in self.blocks[-1].parameters())}"
|
| 663 |
+
# )
|
| 664 |
+
|
| 665 |
+
# with torch.device(self.block_idx_to_device[0]):
|
| 666 |
+
# with torch.cuda.device(self.block_idx_to_device[0]):
|
| 667 |
+
self.norm = RMSNorm(config) if config.get("final_norm", True) else None
|
| 668 |
+
if config.tie_embeddings:
|
| 669 |
+
# Lambda usage is to be able to use forward() on caller side, which in
|
| 670 |
+
# turn is needed for PyTorch hooks to work properly.
|
| 671 |
+
self.unembed = Lambda(self.embedding_layer.unembed)
|
| 672 |
+
else:
|
| 673 |
+
if config.tie_embeddings:
|
| 674 |
+
# Technically we can support this mode, just need to
|
| 675 |
+
# copy tensors across GPUs then. But let's implement it
|
| 676 |
+
# once/if needed.
|
| 677 |
+
self.logger.info("Ignoring tie_embeddings for now.")
|
| 678 |
+
self.unembed = VocabParallelUnembedding(config)
|
| 679 |
|
| 680 |
self.logger.info("Initialized model")
|
| 681 |
|