petil777
/

srv1_parallel

Transformers

custom_code

Model card Files Files and versions

xet

Community

petil777 commited on Oct 11, 2023

Commit

74a5e35

1 Parent(s): 1e9783d

Upload 2 files

Browse files

Files changed (2) hide show

dist.py +80 -0
layers.py +256 -0

dist.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import os
+import torch
+from datetime import timedelta
+# Tensor Parallelism settings
+RANK = int(os.getenv("RANK", "0"))
+WORLD_SIZE = int(os.getenv("WORLD_SIZE", "1"))
+# CUDA memory fraction
+MEMORY_FRACTION = float(os.getenv("CUDA_MEMORY_FRACTION", "1.0"))
+class FakeBarrier:
+    def wait(self):
+        pass
+class FakeGroup:
+    def __init__(self, rank, size):
+        self._rank = rank
+        self._size = size
+    def allreduce(self, *args, **kwargs):
+        return FakeBarrier()
+    def allgather(self, inputs, local_tensor, **kwargs):
+        assert (
+            len(inputs[0]) == len(local_tensor) == 1
+        ), f"{len(inputs[0])} != {len(local_tensor)} != 1, and the FakeGroup is supposed to join on simple tensors"
+        for input_ in inputs:
+            input_[0].data = local_tensor[0].data
+        return FakeBarrier()
+    def barrier(self, *args, **kwargs):
+        return FakeBarrier()
+    def size(self):
+        return self._size
+    def rank(self):
+        return self._rank
+def initialize_torch_distributed():
+    if torch.cuda.is_available():
+        from torch.distributed import ProcessGroupNCCL
+        # Set the device id.
+        assert WORLD_SIZE <= torch.cuda.device_count(), "Each process is one gpu"
+        device = RANK % torch.cuda.device_count()
+        torch.cuda.set_device(device)
+        torch.cuda.set_per_process_memory_fraction(MEMORY_FRACTION, device)
+        backend = "nccl"
+        options = ProcessGroupNCCL.Options()
+        options.is_high_priority_stream = True
+        options._timeout = timedelta(seconds=60)
+    else:
+        backend = "gloo"
+        options = None
+    if WORLD_SIZE == 1:
+        return FakeGroup(RANK, WORLD_SIZE), RANK, WORLD_SIZE
+    else:
+        if os.getenv("DEBUG", None) == "1":
+            return FakeGroup(RANK, WORLD_SIZE), RANK, WORLD_SIZE
+        if not torch.distributed.is_initialized():
+            # Call the init process.
+            torch.distributed.init_process_group(
+                backend=backend,
+                world_size=WORLD_SIZE,
+                rank=RANK,
+                timeout=timedelta(seconds=60),
+                pg_options=options,
+            )
+        else:
+            print("torch.distributed is already initialized.")
+        return torch.distributed.group.WORLD, RANK, WORLD_SIZE

layers.py ADDED Viewed

	@@ -0,0 +1,256 @@

+# Copy and modify https://github.com/huggingface/text-generation-inference/blob/main/server/text_generation_server/utils/layers.py
+from typing import List
+import torch
+import torch.distributed
+from accelerate import init_empty_weights
+from torch import nn
+from torch.nn import functional as F
+# Monkey patching
+@classmethod
+def load_layer_norm(cls, prefix, weights, eps):
+    weight = weights.get_tensor(f"{prefix}.weight")
+    bias = weights.get_tensor(f"{prefix}.bias")
+    with init_empty_weights():
+        ln = cls(weight.shape, eps=eps)
+    ln.weight = nn.Parameter(weight)
+    ln.bias = nn.Parameter(bias)
+    return ln
+@classmethod
+def load_layer_norm_no_bias(cls, prefix, weights, eps):
+    weight = weights.get_tensor(f"{prefix}.weight")
+    with init_empty_weights():
+        ln = cls(weight.shape, eps=eps)
+    ln.weight = nn.Parameter(weight)
+    ln.bias = None
+    return ln
+torch.nn.LayerNorm.load = load_layer_norm
+torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias
+class FastLinear(nn.Module):
+    def __init__(
+        self,
+        weight,
+        bias,
+    ) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(weight)
+        if bias is not None:
+            self.bias = nn.Parameter(bias)
+        else:
+            self.bias = None
+    @classmethod
+    def load(cls, config, prefix: str, weights, bias: bool):
+        weight = weights.get_tensor(f"{prefix}.weight")
+        if bias:
+            bias = weights.get_tensor(f"{prefix}.bias")
+        else:
+            bias = None
+        return cls(weight, bias)
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return F.linear(input, self.weight, self.bias)
+def get_linear(weight, bias):
+    linear = FastLinear(weight, bias)
+    return linear
+class SuperLayer(nn.Module):
+    def __init__(self, linear):
+        super().__init__()
+        self.linear = linear
+    def forward(self, x):
+        return self.linear.forward(x)
+class TensorParallelHead(SuperLayer):
+    def __init__(self, linear, process_group, should_gather: bool):
+        super().__init__(linear)
+        self.process_group = process_group
+        self.should_gather = should_gather
+    @staticmethod
+    def load(config, prefix: str, weights):
+        if weights.process_group.size() > 1:
+            try:
+                weight = weights.get_sharded(f"{prefix}.weight", dim=0)
+                should_gather = True
+            except AssertionError:
+                # If the vocab size is not divisible by number of shards
+                # just load the entire thing.
+                weight = weights.get_tensor(f"{prefix}.weight")
+                should_gather = False
+        else:
+            weight = weights.get_tensor(f"{prefix}.weight")
+            should_gather = False
+        return TensorParallelHead(
+            get_linear(weight, bias=None),
+            process_group=weights.process_group,
+            should_gather=should_gather,
+        )
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        if not self.should_gather:
+            return super().forward(input)
+        world_size = self.process_group.size()
+        if len(input.shape) == 2 and isinstance(self.linear, FastLinear):
+            out_dim = self.linear.weight.shape[0]
+            if input.shape[0] == 1:
+                world_out = input.new_empty(1, out_dim * world_size)
+                local_out = input.new_empty(1, out_dim)
+                gather_input = local_out
+            else:
+                world_out = input.new_empty(out_dim * world_size, input.shape[0])
+                gather_input = input.new_empty(out_dim, input.shape[0])
+                local_out = gather_input.T
+            torch.mm(input, self.linear.weight.T, out=local_out)
+            torch.distributed.all_gather_into_tensor(world_out, gather_input, group=self.process_group)
+            if input.shape[0] == 1:
+                return world_out
+            return world_out.T
+        output = super().forward(input)
+        world_output = [torch.empty_like(output) for _ in range(self.process_group.size())]
+        torch.distributed.all_gather(world_output, output, group=self.process_group)
+        world_output = torch.cat(world_output, dim=-1)
+        return world_output
+class TensorParallelColumnLinear(SuperLayer):
+    @classmethod
+    def load(cls, config, prefix: str, weights, bias: bool):
+        return cls.load_multi(config, [prefix], weights, bias, dim=0)
+    @classmethod
+    def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int):
+        weight = weights.get_multi_weights_col(prefixes, dim=dim, quantize=config.quantize)
+        if bias:
+            b = [weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes]
+            bias = torch.cat(b, dim=dim)
+        else:
+            bias = None
+        linear = get_linear(weight, bias)
+        return cls(linear)
+class TensorParallelRowLinear(SuperLayer):
+    def __init__(self, linear, process_group):
+        super().__init__(linear)
+        self.process_group = process_group
+    @classmethod
+    def load(cls, config, prefix: str, weights, bias: bool):
+        weight = weights.get_multi_weights_row(prefix, quantize=config.quantize)
+        if bias and weights.process_group.rank() == 0:
+            # Rank is only on the first rank process
+            bias = weights.get_tensor(f"{prefix}.bias")
+        else:
+            bias = None
+        return cls(
+            get_linear(weight, bias),
+            process_group=weights.process_group,
+        )
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        out = super().forward(input)
+        if self.process_group.size() > 1:
+            torch.distributed.all_reduce(out, group=self.process_group)
+        return out
+class TensorParallelEmbedding(nn.Module):
+    def __init__(self, prefix: str, weights, reduce=True):
+        super().__init__()
+        weight = weights.get_partial_sharded(f"{prefix}.weight", dim=0)
+        num_embeddings = weights.get_shape(f"{prefix}.weight")[0]
+        process_group = weights.process_group
+        world_size = process_group.size()
+        rank = process_group.rank()
+        block_size = num_embeddings // world_size
+        self.min_id = rank * block_size
+        self.max_id = min(num_embeddings, (rank + 1) * block_size)
+        self.null_idx = block_size
+        self.process_group = weights.process_group
+        self.reduce = reduce
+        """Additional 0 entry used for masking"""
+        self.weight = nn.Parameter(F.pad(weight, (0, 0, 0, 1)))
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        # default all out of bounds values to `self.null_idx` that will then be mapped to 0
+        # translate for [0, self.max_id - self.min_id[
+        input = torch.where(
+            (self.min_id > input) | (input >= self.max_id),
+            self.null_idx,
+            input - self.min_id,
+        )
+        out = torch.nn.functional.embedding(input, self.weight)
+        if self.reduce and self.process_group.size() > 1:
+            torch.distributed.all_reduce(out, group=self.process_group)
+        return out
+try:
+    import dropout_layer_norm
+    class FastLayerNorm(nn.LayerNorm):
+        def forward(self, hidden_states, residual=None):
+            if hidden_states.shape[-1] > 8192:
+                if residual is not None:
+                    hidden_states += residual
+                residual = hidden_states
+                return super(FastLayerNorm, self).forward(hidden_states), residual
+            else:
+                (
+                    normed_hidden_states,
+                    residual,
+                    *rest,
+                ) = dropout_layer_norm.dropout_add_ln_fwd(
+                    hidden_states,
+                    residual,
+                    self.weight,
+                    self.bias,
+                    None,
+                    None,
+                    None,
+                    None,
+                    0.0,
+                    self.eps,
+                    1.0,
+                    0,
+                    None,
+                    False,
+                    False,
+                )
+                if residual is None:
+                    residual = hidden_states
+                return normed_hidden_states, residual
+except ImportError:
+    pass