Upload folder using huggingface_hub
- __pycache__/pfsq.cpython-310.pyc +0 -0
- __pycache__/plpq.cpython-310.pyc +0 -0
- pfsq.py +30 -21
- plpq.py +11 -24
__pycache__/pfsq.cpython-310.pyc CHANGED
Binary files a/__pycache__/pfsq.cpython-310.pyc and b/__pycache__/pfsq.cpython-310.pyc differ
__pycache__/plpq.cpython-310.pyc CHANGED
Binary files a/__pycache__/plpq.cpython-310.pyc and b/__pycache__/plpq.cpython-310.pyc differ
pfsq.py CHANGED
@@ -12,9 +12,7 @@ import torch
 import torch.nn as nn
 from torch.nn import Module
 from torch import Tensor, int32
-from torch.cuda.amp import autocast
-
-from einops import rearrange, pack, unpack
+from torch.amp import autocast

 # helper functions

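The import swap tracks PyTorch's AMP deprecation: `torch.cuda.amp.autocast(...)` is superseded by the device-generic `torch.amp.autocast(device_type, ...)`, which is why the decorator later in this diff gains an explicit `'cuda'` argument. A minimal sketch of the two spellings (the decorated function is illustrative, not from the repo):

```python
import torch
from torch.amp import autocast

# Old spelling, now deprecated (emits a FutureWarning on recent PyTorch):
#   from torch.cuda.amp import autocast
#   @autocast(enabled=False)
# New spelling takes the device type as the first argument:
@autocast('cuda', enabled=False)
def quantize_fp32(z: torch.Tensor) -> torch.Tensor:
    # the decorated region runs in full precision even under an outer autocast
    return z.round()
```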
@@ -35,11 +33,22 @@ def maybe(fn):
         return fn(x, *args, **kwargs)
     return inner

-def pack_one(t, pattern):
-    return pack([t], pattern)
-
-def unpack_one(t, ps, pattern):
-    return unpack(t, ps, pattern)[0]
+# einops version
+#def pack_one(t, pattern):
+#    return pack([t], pattern)
+def pack_one(t):
+    # pattern "b * d"
+    if t.ndim > 2:
+        ps = t.shape[1:-1]
+        return t.flatten(1,-2), ps
+    return t, tuple()
+
+# einops version
+#def unpack_one(t, ps, pattern):
+#    return unpack(t, ps, pattern)[0]
+def unpack_one(t, ps):
+    # pattern "b * d"
+    return t.reshape(t.shape[0], ps, t.shape[-1])

 # tensor helpers

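The commented-out einops versions are kept for reference; the new helpers hard-code the `'b * d'` pattern. One caveat: `ps` is a tuple of the collapsed middle dims, and `Tensor.reshape` does not accept a nested tuple among its size arguments, so `unpack_one` as committed would raise a `TypeError` on image or video inputs. A sketch with the `*ps` splat added (my correction, not part of the commit):

```python
import torch

def pack_one(t):
    # pattern "b * d": collapse every middle dim into one sequence dim
    if t.ndim > 2:
        ps = t.shape[1:-1]
        return t.flatten(1, -2), ps
    return t, tuple()

def unpack_one(t, ps):
    # pattern "b * d": restore the recorded middle dims (note the *ps splat)
    return t.reshape(t.shape[0], *ps, t.shape[-1])

x = torch.randn(2, 8, 8, 16)    # b h w d
flat, ps = pack_one(x)          # flat: (2, 64, 16), ps: (8, 8)
assert torch.equal(unpack_one(flat, ps), x)
```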
@@ -137,7 +146,7 @@ class PFSQ(Module):

     def indices_to_level_indices(self, indices):
         """ Converts indices to indices at each level, perhaps needed for a transformer with factorized embeddings """
-        indices = rearrange(indices, '... -> ... 1')
+        indices = indices.unsqueeze(-1)
         codes_non_centered = (indices // self._basis) % self._levels
         return codes_non_centered

@@ -152,7 +161,7 @@ class PFSQ(Module):
         codes = self._indices_to_codes(indices)

         if self.keep_num_codebooks_dim:
-            codes = rearrange(codes, '... c d -> ... (c d)')
+            codes = codes.flatten(start_dim=-2) # '... c d -> ... (c d)'

         if n_codes == 1:
             return codes
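This and the neighbouring hunks repeat a small set of einops-to-plain-torch substitutions. A quick cheat sheet of the equivalences used in this commit, with invented shapes:

```python
import torch

x = torch.randn(2, 6, 4, 5)                        # b n c d

# '... -> ... 1'               -> unsqueeze(-1)
assert x.unsqueeze(-1).shape == (2, 6, 4, 5, 1)

# '... c d -> ... (c d)'       -> flatten(start_dim=-2)
assert x.flatten(start_dim=-2).shape == (2, 6, 20)

# 'b ... d -> b d ...'         -> moveaxis(-1, 1)
assert x.moveaxis(-1, 1).shape == (2, 5, 6, 4)

# 'b n (c d) -> b n c d', c=4  -> reshape with an inferred last dim
y = torch.randn(2, 6, 20)
assert y.reshape(*y.shape[:2], 4, -1).shape == (2, 6, 4, 5)
```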
@@ -160,11 +169,11 @@ class PFSQ(Module):
         codes = self.project_out(codes)

         if is_img_or_video or self.channel_first:
-            codes = rearrange(codes, 'b ... d -> b d ...')
+            codes = codes.moveaxis(-1,1) # 'b ... d -> b d ...'

         return codes

-    @autocast(enabled = False)
+    @autocast('cuda', enabled = False)
     def forward(self, z):
         """
         einstein notation
@@ -180,19 +189,19 @@ class PFSQ(Module):
         # standardize image or video into (batch, seq, dimension)

         if need_move_channel_last:
-            z = rearrange(z, 'b d ... -> b ... d')
-            z, ps = pack_one(z, 'b * d')
+            z = z.moveaxis(1,-1) # 'b d ... -> b ... d'
+            z, ps = pack_one(z)

         assert z.shape[-1] == self.dim, f'expected dimension of {self.dim} but found dimension of {z.shape[-1]}'

         z = self.project_in(z)

-        z = rearrange(z, 'b n (c d) -> b n c d', c = self.num_codebooks)
+        z = z.reshape(*z.shape[:2], self.num_codebooks, -1) # 'b n (c d) -> b n c d', c=self.num_codebooks

         # whether to force quantization step to be full precision or not

         force_f32 = self.force_quantization_f32
-        quantization_context = partial(autocast, enabled = False) if force_f32 else nullcontext
+        quantization_context = partial(autocast, device_type = 'cuda', enabled = False) if force_f32 else nullcontext

         with quantization_context():
             orig_dtype = z.dtype
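`partial(autocast, device_type='cuda', enabled=False)` and `nullcontext` are both zero-argument callables that return a context manager, so the `with quantization_context():` call site stays identical in either branch. A standalone sketch of the pattern:

```python
from contextlib import nullcontext
from functools import partial

from torch.amp import autocast

force_f32 = True  # stands in for self.force_quantization_f32

quantization_context = (
    partial(autocast, device_type='cuda', enabled=False)
    if force_f32 else nullcontext
)

with quantization_context():
    pass  # quantization runs here, forced to full precision when the flag is set
```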
@@ -210,7 +219,7 @@ class PFSQ(Module):
             indices = self.codes_to_indices(codes)

             first_codes = codes[:, :, 0, :] # first codebook
-            codes = rearrange(codes, 'b n c d -> b n (c d)')
+            codes = codes.flatten(start_dim=-2) # 'b n c d -> b n (c d)'

             codes = codes.type(orig_dtype)
             first_codes = first_codes.type(orig_dtype)
@@ -221,13 +230,13 @@ class PFSQ(Module):
         # reconstitute image or video dimensions

         if need_move_channel_last:
-            out = unpack_one(out, ps, 'b * d')
-            out = rearrange(out, 'b ... d -> b d ...')
+            out = unpack_one(out, ps)
+            out = out.moveaxis(-1,1) # 'b ... d -> b d ...'

-            indices = maybe(unpack_one)(indices, ps, 'b * c')
+            indices = maybe(unpack_one)(indices, ps)

         if not self.keep_num_codebooks_dim and self.return_indices:
-            indices = maybe(rearrange)(indices, '... 1 -> ...')
+            indices = (indices.squeeze(-1)) if indices is not None else None

         # return quantized output and indices

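`maybe(unpack_one)` passes a `None` indices tensor through untouched (it is `None` when `return_indices` is off), and the rewritten squeeze line simply inlines the same guard instead of wrapping `Tensor.squeeze` in `maybe`. A sketch of the helper as suggested by the hunk's context lines (a reconstruction, not verbatim from the repo):

```python
def maybe(fn):
    # apply fn only when the first argument is not None
    def inner(x, *args, **kwargs):
        if x is None:
            return x
        return fn(x, *args, **kwargs)
    return inner

# equivalent inline form used by the new code:
indices = None
indices = indices.squeeze(-1) if indices is not None else None
assert indices is None
```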
plpq.py CHANGED
@@ -11,9 +11,7 @@ from .config import PLPQConfig


 class PLPQ(PreTrainedModel):
-    """
-    Pyramidal Local Patch Quantizer
-    """
+    """Pyramidal Local Patch Quantizer"""
     config_class = PLPQConfig

     def __init__(self, config):
@@ -58,12 +56,12 @@ class PLPQ(PreTrainedModel):

         # Pyramidal Quantizer
         self.quantizer = PFSQ(
-            levels = config.levels,
-            num_codebooks = config.num_quantizers,
-            dim = config.encoder_blocks[-1][2],
+            levels = config.levels, # number of levels for each codebook
+            num_codebooks = config.num_quantizers, # number of quantizers
+            dim = config.encoder_blocks[-1][2], # this is the input feature dimension, defaults to log2(codebook_size) if not defined
         )

-        #
+        # Coarse decoder output -> 32x32 supervision
         self.coarse_decoder = nn.Conv2d(len(config.levels), config.num_out_channels, kernel_size=1, stride=1)

         self.decoder = nn.Sequential(
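For orientation, a hedged sketch of what this constructor call amounts to, with invented values standing in for the `PLPQConfig` fields (the real defaults live in `.config`):

```python
from pfsq import PFSQ  # module from this repo; import path assumed

quantizer = PFSQ(
    levels = [8, 5, 5, 5],  # per-dimension quantization levels, codebook size 8*5*5*5
    num_codebooks = 4,      # stands in for config.num_quantizers
    dim = 256,              # stands in for config.encoder_blocks[-1][2]
)
```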
@@ -76,9 +74,7 @@ class PLPQ(PreTrainedModel):


     def get_num_params(self) -> int:
-        """
-        Return the number of parameters in the model.
-        """
+        """Return the number of parameters in the model."""
         return sum(p.numel() for p in self.parameters())


@@ -87,19 +83,14 @@ class PLPQ(PreTrainedModel):
         """
         Quantize the input tensor
         Parameters:
-            x (torch.Tensor): The input tensor. Size b, c, h, w
+            x (Image or torch.Tensor): The input tensor. Size b, c, h, w
         Returns:
             torch.Tensor: The indices tensor. Size b, h, w
         """
-        # encode the input
         z = self.encoder(x).permute(0, 2, 3, 1).contiguous()
-        # reshape the input
         b, h, w, c = z.shape
         z = z.view(b, h * w, -1)
-
-        # quantize the input
         quantized, coarse_quantized, all_codes = self.quantizer(z)
-
         return all_codes


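A shape walk-through of the tensor plumbing in `quantize`, with a hypothetical 32x32 feature grid standing in for the encoder output (the real grid size depends on `config.encoder_blocks`):

```python
import torch

b, c, h, w = 2, 256, 32, 32
z = torch.randn(b, c, h, w)               # what self.encoder(x) would return
z = z.permute(0, 2, 3, 1).contiguous()    # channels-last: (b, h, w, c)
z = z.view(b, h * w, -1)                  # (b, n, d) sequence layout for PFSQ
assert z.shape == (2, 1024, 256)
```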
@@ -114,25 +105,21 @@ class PLPQ(PreTrainedModel):

         ncodes = indices.shape[-1]
         emb = self.quantizer.indices_to_codes(indices).squeeze(-1)
-
         # reshape [b t c] -> [b c h w]
         b, h, w = emb.size(0), int(math.sqrt(emb.size(1))), int(math.sqrt(emb.size(1)))
         emb = emb.permute(0, 2, 1).view(b, -1, h, w).contiguous()

         if ncodes == 1:
-            pred = self.coarse_decoder(emb)
-            return pred
+            return self.coarse_decoder(emb)

         # full decoder: full image prediction
-        pred = self.decoder(emb)
-
-        return pred
+        return self.decoder(emb)



 class LayerNorm(nn.Module):
-    """
-    LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """
+    """LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False"""
+
     def __init__(self, ndim, bias):
         super().__init__()
         self.weight = nn.Parameter(torch.ones(ndim))
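`decode` recovers the spatial grid with `int(math.sqrt(t))`, which silently assumes a square token grid. A runnable sketch of that reshape, with invented shapes:

```python
import math
import torch

emb = torch.randn(2, 1024, 16)       # b t c, where t must be a perfect square
b = emb.size(0)
h = w = int(math.sqrt(emb.size(1)))  # 32; a non-square grid would be mangled here
emb = emb.permute(0, 2, 1).view(b, -1, h, w).contiguous()
assert emb.shape == (2, 16, 32, 32)
```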
|