hpoghos committed
Commit 3760daa · 1 Parent(s): d67a615
requirements.txt CHANGED
@@ -28,7 +28,7 @@ torchvision==0.15.1
 modelscope==1.13.3
 tqdm==4.65.0
 xformers==0.0.19
-open-clip-torch==2.24.0
+# open-clip-torch==2.24.0
 jsonargparse[signatures]==4.27.7
 fairscale==0.4.13
 rotary-embedding-torch==0.5.3
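With open-clip-torch no longer pinned in requirements.txt, any code path that still wants the OpenCLIP encoder has to treat the package as an optional dependency. A minimal sketch of such a guard, assuming the package is installed manually when needed (illustration only, not part of this commit):

```python
# Optional-dependency guard for open_clip (provided by the open-clip-torch package,
# which this commit removes from requirements.txt).
try:
    import open_clip  # e.g. installed manually via: pip install open-clip-torch==2.24.0
    HAS_OPEN_CLIP = True
except ImportError:
    open_clip = None
    HAS_OPEN_CLIP = False
```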
t2v_enhanced/model/diffusers_conditional/models/controlnet/image_embedder.py CHANGED
@@ -3,7 +3,7 @@ from typing import Any, Mapping
 import torch
 import torch.nn as nn
 import kornia
-import open_clip
+# import open_clip
 from transformers import AutoImageProcessor, AutoModel
 from transformers.models.bit.image_processing_bit import BitImageProcessor
 from einops import rearrange, repeat
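The commented-out import leaves the transformers route (AutoImageProcessor / AutoModel / BitImageProcessor) as the remaining way this file can obtain frozen image features. A rough sketch of that route; the checkpoint name facebook/dinov2-large is an illustrative assumption, not taken from this commit:

```python
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModel

# Hypothetical checkpoint; its processor is a BitImageProcessor, matching the import kept above.
ckpt = "facebook/dinov2-large"
processor = AutoImageProcessor.from_pretrained(ckpt)
model = AutoModel.from_pretrained(ckpt).eval()

image = Image.new("RGB", (256, 256))  # placeholder input image
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    feats = model(**inputs).last_hidden_state  # (1, num_patches + 1, hidden_dim)
```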
@@ -52,160 +52,160 @@ class AbstractEncoder(nn.Module):
 
 
 
-class FrozenOpenCLIPImageEmbedder(AbstractEncoder):
-    """
-    Uses the OpenCLIP vision transformer encoder for images
-    """
-
-    def __init__(
-        self,
-        arch="ViT-H-14",
-        version="laion2b_s32b_b79k",
-        device="cuda",
-        max_length=77,
-        freeze=True,
-        antialias=True,
-        ucg_rate=0.0,
-        unsqueeze_dim=False,
-        repeat_to_max_len=False,
-        num_image_crops=0,
-        output_tokens=False,
-    ):
-        super().__init__()
-        model, _, _ = open_clip.create_model_and_transforms(
-            arch,
-            device=torch.device("cpu"),
-            pretrained=version,
-        )
-        del model.transformer
-        self.model = model
-        self.max_crops = num_image_crops
-        self.pad_to_max_len = self.max_crops > 0
-        self.repeat_to_max_len = repeat_to_max_len and (not self.pad_to_max_len)
-        self.device = device
-        self.max_length = max_length
-        if freeze:
-            self.freeze()
-
-        self.antialias = antialias
-
-        self.register_buffer(
-            "mean", torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False
-        )
-        self.register_buffer(
-            "std", torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False
-        )
-        self.ucg_rate = ucg_rate
-        self.unsqueeze_dim = unsqueeze_dim
-        self.stored_batch = None
-        self.model.visual.output_tokens = output_tokens
-        self.output_tokens = output_tokens
-
-    def preprocess(self, x):
-        # normalize to [0,1]
-        x = kornia.geometry.resize(
-            x,
-            (224, 224),
-            interpolation="bicubic",
-            align_corners=True,
-            antialias=self.antialias,
-        )
-        x = (x + 1.0) / 2.0
-        # renormalize according to clip
-        x = kornia.enhance.normalize(x, self.mean, self.std)
-        return x
-
-    def freeze(self):
-        self.model = self.model.eval()
-        for param in self.parameters():
-            param.requires_grad = False
-
-    def forward(self, image, no_dropout=False):
-        z = self.encode_with_vision_transformer(image)
-        tokens = None
-        if self.output_tokens:
-            z, tokens = z[0], z[1]
-        z = z.to(image.dtype)
-        if self.ucg_rate > 0.0 and not no_dropout and not (self.max_crops > 0):
-            z = (
-                torch.bernoulli(
-                    (1.0 - self.ucg_rate) * torch.ones(z.shape[0], device=z.device)
-                )[:, None]
-                * z
-            )
-            if tokens is not None:
-                tokens = (
-                    expand_dims_like(
-                        torch.bernoulli(
-                            (1.0 - self.ucg_rate)
-                            * torch.ones(tokens.shape[0], device=tokens.device)
-                        ),
-                        tokens,
-                    )
-                    * tokens
-                )
-        if self.unsqueeze_dim:
-            z = z[:, None, :]
-        if self.output_tokens:
-            assert not self.repeat_to_max_len
-            assert not self.pad_to_max_len
-            return tokens, z
-        if self.repeat_to_max_len:
-            if z.dim() == 2:
-                z_ = z[:, None, :]
-            else:
-                z_ = z
-            return repeat(z_, "b 1 d -> b n d", n=self.max_length), z
-        elif self.pad_to_max_len:
-            assert z.dim() == 3
-            z_pad = torch.cat(
-                (
-                    z,
-                    torch.zeros(
-                        z.shape[0],
-                        self.max_length - z.shape[1],
-                        z.shape[2],
-                        device=z.device,
-                    ),
-                ),
-                1,
-            )
-            return z_pad, z_pad[:, 0, ...]
-        return z
-
-    def encode_with_vision_transformer(self, img):
-        # if self.max_crops > 0:
-        #     img = self.preprocess_by_cropping(img)
-        if img.dim() == 5:
-            assert self.max_crops == img.shape[1]
-            img = rearrange(img, "b n c h w -> (b n) c h w")
-        img = self.preprocess(img)
-        if not self.output_tokens:
-            assert not self.model.visual.output_tokens
-            x = self.model.visual(img)
-            tokens = None
-        else:
-            assert self.model.visual.output_tokens
-            x, tokens = self.model.visual(img)
-        if self.max_crops > 0:
-            x = rearrange(x, "(b n) d -> b n d", n=self.max_crops)
-            # drop out between 0 and all along the sequence axis
-            x = (
-                torch.bernoulli(
-                    (1.0 - self.ucg_rate)
-                    * torch.ones(x.shape[0], x.shape[1], 1, device=x.device)
-                )
-                * x
-            )
-            if tokens is not None:
-                tokens = rearrange(tokens, "(b n) t d -> b t (n d)", n=self.max_crops)
-                print(
-                    f"You are running very experimental token-concat in {self.__class__.__name__}. "
-                    f"Check what you are doing, and then remove this message."
-                )
-        if self.output_tokens:
-            return x, tokens
-        return x
-
-    def encode(self, text):
-        return self(text)
+# class FrozenOpenCLIPImageEmbedder(AbstractEncoder):
+#     """
+#     Uses the OpenCLIP vision transformer encoder for images
+#     """
+
+#     def __init__(
+#         self,
+#         arch="ViT-H-14",
+#         version="laion2b_s32b_b79k",
+#         device="cuda",
+#         max_length=77,
+#         freeze=True,
+#         antialias=True,
+#         ucg_rate=0.0,
+#         unsqueeze_dim=False,
+#         repeat_to_max_len=False,
+#         num_image_crops=0,
+#         output_tokens=False,
+#     ):
+#         super().__init__()
+#         model, _, _ = open_clip.create_model_and_transforms(
+#             arch,
+#             device=torch.device("cpu"),
+#             pretrained=version,
+#         )
+#         del model.transformer
+#         self.model = model
+#         self.max_crops = num_image_crops
+#         self.pad_to_max_len = self.max_crops > 0
+#         self.repeat_to_max_len = repeat_to_max_len and (not self.pad_to_max_len)
+#         self.device = device
+#         self.max_length = max_length
+#         if freeze:
+#             self.freeze()
+
+#         self.antialias = antialias
+
+#         self.register_buffer(
+#             "mean", torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False
+#         )
+#         self.register_buffer(
+#             "std", torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False
+#         )
+#         self.ucg_rate = ucg_rate
+#         self.unsqueeze_dim = unsqueeze_dim
+#         self.stored_batch = None
+#         self.model.visual.output_tokens = output_tokens
+#         self.output_tokens = output_tokens
+
+#     def preprocess(self, x):
+#         # normalize to [0,1]
+#         x = kornia.geometry.resize(
+#             x,
+#             (224, 224),
+#             interpolation="bicubic",
+#             align_corners=True,
+#             antialias=self.antialias,
+#         )
+#         x = (x + 1.0) / 2.0
+#         # renormalize according to clip
+#         x = kornia.enhance.normalize(x, self.mean, self.std)
+#         return x
+
+#     def freeze(self):
+#         self.model = self.model.eval()
+#         for param in self.parameters():
+#             param.requires_grad = False
+
+#     def forward(self, image, no_dropout=False):
+#         z = self.encode_with_vision_transformer(image)
+#         tokens = None
+#         if self.output_tokens:
+#             z, tokens = z[0], z[1]
+#         z = z.to(image.dtype)
+#         if self.ucg_rate > 0.0 and not no_dropout and not (self.max_crops > 0):
+#             z = (
+#                 torch.bernoulli(
+#                     (1.0 - self.ucg_rate) * torch.ones(z.shape[0], device=z.device)
+#                 )[:, None]
+#                 * z
+#             )
+#             if tokens is not None:
+#                 tokens = (
+#                     expand_dims_like(
+#                         torch.bernoulli(
+#                             (1.0 - self.ucg_rate)
+#                             * torch.ones(tokens.shape[0], device=tokens.device)
+#                         ),
+#                         tokens,
+#                     )
+#                     * tokens
+#                 )
+#         if self.unsqueeze_dim:
+#             z = z[:, None, :]
+#         if self.output_tokens:
+#             assert not self.repeat_to_max_len
+#             assert not self.pad_to_max_len
+#             return tokens, z
+#         if self.repeat_to_max_len:
+#             if z.dim() == 2:
+#                 z_ = z[:, None, :]
+#             else:
+#                 z_ = z
+#             return repeat(z_, "b 1 d -> b n d", n=self.max_length), z
+#         elif self.pad_to_max_len:
+#             assert z.dim() == 3
+#             z_pad = torch.cat(
+#                 (
+#                     z,
+#                     torch.zeros(
+#                         z.shape[0],
+#                         self.max_length - z.shape[1],
+#                         z.shape[2],
+#                         device=z.device,
+#                     ),
+#                 ),
+#                 1,
+#             )
+#             return z_pad, z_pad[:, 0, ...]
+#         return z
+
+#     def encode_with_vision_transformer(self, img):
+#         # if self.max_crops > 0:
+#         #     img = self.preprocess_by_cropping(img)
+#         if img.dim() == 5:
+#             assert self.max_crops == img.shape[1]
+#             img = rearrange(img, "b n c h w -> (b n) c h w")
+#         img = self.preprocess(img)
+#         if not self.output_tokens:
+#             assert not self.model.visual.output_tokens
+#             x = self.model.visual(img)
+#             tokens = None
+#         else:
+#             assert self.model.visual.output_tokens
+#             x, tokens = self.model.visual(img)
+#         if self.max_crops > 0:
+#             x = rearrange(x, "(b n) d -> b n d", n=self.max_crops)
+#             # drop out between 0 and all along the sequence axis
+#             x = (
+#                 torch.bernoulli(
+#                     (1.0 - self.ucg_rate)
+#                     * torch.ones(x.shape[0], x.shape[1], 1, device=x.device)
+#                 )
+#                 * x
+#             )
+#             if tokens is not None:
+#                 tokens = rearrange(tokens, "(b n) t d -> b t (n d)", n=self.max_crops)
+#                 print(
+#                     f"You are running very experimental token-concat in {self.__class__.__name__}. "
+#                     f"Check what you are doing, and then remove this message."
+#                 )
+#         if self.output_tokens:
+#             return x, tokens
+#         return x
+
+#     def encode(self, text):
+#         return self(text)
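For context, the class commented out above is the OpenCLIP-based image embedder; before this commit it could be used roughly as follows. This is a hypothetical usage sketch based on the removed code, and it only runs with open-clip-torch installed and the class restored:

```python
import torch
# Before this commit the class could be imported from this file, e.g.:
# from t2v_enhanced.model.diffusers_conditional.models.controlnet.image_embedder import (
#     FrozenOpenCLIPImageEmbedder,
# )

embedder = FrozenOpenCLIPImageEmbedder(
    arch="ViT-H-14",
    version="laion2b_s32b_b79k",
    freeze=True,  # parameters frozen, model kept in eval mode
)
# preprocess() resizes to 224x224 and maps inputs from [-1, 1] before CLIP normalization,
# so images are expected in [-1, 1].
images = torch.rand(2, 3, 256, 256) * 2.0 - 1.0
with torch.no_grad():
    z = embedder(images)  # pooled CLIP image embeddings, (2, 1024) for ViT-H-14
```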