ishanjmukherjee committed on
Commit
ddd4988
·
1 Parent(s): c4ee29d

Fix device management bug

Browse files

AutoModel wasn't loading because moving data out of meta tensors (which from_pretrained creates) is not possible. The reason is that StripedHyena's class definition has intricate device management logic. Ripping out that logic from model.py turned out to be surprisingly painless: just remove a for loop that distributes layers across GPUs, plus two `with torch.device(...)` context-manager blocks.

Now we have a layer-naming bug instead, which is fantastic progress.

Files changed (1) hide show
  1. model.py +39 -39
model.py CHANGED
@@ -637,45 +637,45 @@ class StripedHyena(nn.Module):
637
  self.block_idx_to_device = {}
638
 
639
  # Calculate layers per GPU
640
- num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 1
641
- layers_per_gpu = math.ceil(config.num_layers / num_gpus)
642
- self.logger.info(f"Distributing across {num_gpus} GPUs, approximately {layers_per_gpu} layers per GPU")
643
-
644
- for layer_idx in tqdm(range(config.num_layers)):
645
- # Determine which GPU should handle this layer
646
- device_idx = min(layer_idx // layers_per_gpu, num_gpus - 1)
647
- device = f"cuda:{device_idx}" if torch.cuda.is_available() else "cpu"
648
-
649
- with torch.device(device):
650
- # TELinear uses `device="cuda"` device to allocate empty bias
651
- # tensor. This makes sure that the empty tensor is allocated on the
652
- # correct device. (torch.device(), unlike torch.cuda.device(),
653
- # doesn't override current CUDA device.)
654
- with torch.cuda.device(device):
655
- block = get_block(config, layer_idx, flash_fft=self.flash_fft)
656
- move_to_device(block, device)
657
-
658
- self.blocks.append(block)
659
- self.block_idx_to_device[layer_idx] = device
660
- self.logger.info(f"Assigned {layer_idx=} to {device=}")
661
- self.logger.info(
662
- f"Parameter count for block {layer_idx}: {sum(p.numel() for p in self.blocks[-1].parameters())}"
663
- )
664
-
665
- with torch.device(self.block_idx_to_device[0]):
666
- with torch.cuda.device(self.block_idx_to_device[0]):
667
- self.norm = RMSNorm(config) if config.get("final_norm", True) else None
668
- if config.tie_embeddings:
669
- # Lambda usage is to be able to use forward() on caller side, which in
670
- # turn is needed for PyTorch hooks to work properly.
671
- self.unembed = Lambda(self.embedding_layer.unembed)
672
- else:
673
- if config.tie_embeddings:
674
- # Technically we can support this mode, just need to
675
- # copy tensors across GPUs then. But let's implement it
676
- # once/if needed.
677
- self.logger.info("Ignoring tie_embeddings for now.")
678
- self.unembed = VocabParallelUnembedding(config)
679
 
680
  self.logger.info("Initialized model")
681
 
 
637
  self.block_idx_to_device = {}
638
 
639
  # Calculate layers per GPU
640
+ # num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 1
641
+ # layers_per_gpu = math.ceil(config.num_layers / num_gpus)
642
+ # self.logger.info(f"Distributing across {num_gpus} GPUs, approximately {layers_per_gpu} layers per GPU")
643
+
644
+ # for layer_idx in tqdm(range(config.num_layers)):
645
+ # # Determine which GPU should handle this layer
646
+ # device_idx = min(layer_idx // layers_per_gpu, num_gpus - 1)
647
+ # device = f"cuda:{device_idx}" if torch.cuda.is_available() else "cpu"
648
+
649
+ # with torch.device(device):
650
+ # # TELinear uses `device="cuda"` device to allocate empty bias
651
+ # # tensor. This makes sure that the empty tensor is allocated on the
652
+ # # correct device. (torch.device(), unlike torch.cuda.device(),
653
+ # # doesn't override current CUDA device.)
654
+ # with torch.cuda.device(device):
655
+ # block = get_block(config, layer_idx, flash_fft=self.flash_fft)
656
+ # move_to_device(block, device)
657
+
658
+ # self.blocks.append(block)
659
+ # self.block_idx_to_device[layer_idx] = device
660
+ # self.logger.info(f"Assigned {layer_idx=} to {device=}")
661
+ # self.logger.info(
662
+ # f"Parameter count for block {layer_idx}: {sum(p.numel() for p in self.blocks[-1].parameters())}"
663
+ # )
664
+
665
+ # with torch.device(self.block_idx_to_device[0]):
666
+ # with torch.cuda.device(self.block_idx_to_device[0]):
667
+ self.norm = RMSNorm(config) if config.get("final_norm", True) else None
668
+ if config.tie_embeddings:
669
+ # Lambda usage is to be able to use forward() on caller side, which in
670
+ # turn is needed for PyTorch hooks to work properly.
671
+ self.unembed = Lambda(self.embedding_layer.unembed)
672
+ else:
673
+ if config.tie_embeddings:
674
+ # Technically we can support this mode, just need to
675
+ # copy tensors across GPUs then. But let's implement it
676
+ # once/if needed.
677
+ self.logger.info("Ignoring tie_embeddings for now.")
678
+ self.unembed = VocabParallelUnembedding(config)
679
 
680
  self.logger.info("Initialized model")
681