Spaces: Runtime error
minor
app.py CHANGED

@@ -5,7 +5,7 @@ import argparse
 import datetime
 from pathlib import Path
 import torch
-import spaces
+# import spaces
 import gradio as gr
 import tempfile
 import yaml

@@ -54,7 +54,7 @@ inference_generator = torch.Generator(device="cuda")
 # -------------------------
 # ----- Functionality -----
 # -------------------------
-@spaces.GPU
+# @spaces.GPU
 def generate(prompt, num_frames, image, model_name_stage1, model_name_stage2, seed, t, image_guidance, where_to_log=result_fol):
     now = datetime.datetime.now()
     name = prompt[:100].replace(" ", "_") + "_" + str(now.time()).replace(":", "_").replace(".", "_")
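This change disables the Hugging Face ZeroGPU hooks: the `import spaces` line and the `@spaces.GPU` decorator on `generate` are commented out, so the function now runs on whatever CUDA device the Space (or a local machine) provides. A minimal alternative sketch that keeps both modes working; the try/except guard and the `gpu` name below are illustrative and not part of this commit:

    # Fall back to a no-op decorator when the `spaces` package is unavailable,
    # e.g. when running the app outside a ZeroGPU Space.
    try:
        import spaces
        gpu = spaces.GPU
    except ImportError:
        def gpu(func=None, **kwargs):
            if func is None:
                return lambda f: f   # used as @gpu(duration=...)
            return func              # used as bare @gpu

    @gpu
    def generate(prompt, num_frames, image, model_name_stage1, model_name_stage2, seed, t, image_guidance, where_to_log=result_fol):
        ...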
t2v_enhanced/model/diffusers_conditional/models/controlnet/image_embedder.py CHANGED

@@ -3,7 +3,7 @@ from typing import Any, Mapping
 import torch
 import torch.nn as nn
 import kornia
-
+from open_clip import create_model_and_transforms
 from transformers import AutoImageProcessor, AutoModel
 from transformers.models.bit.image_processing_bit import BitImageProcessor
 from einops import rearrange, repeat

@@ -52,160 +52,160 @@ class AbstractEncoder(nn.Module):
 
 
 
[old lines 55-211 were blank and bare "#" placeholder lines; this commit removes them and adds the class below]
+class FrozenOpenCLIPImageEmbedder(AbstractEncoder):
+    """
+    Uses the OpenCLIP vision transformer encoder for images
+    """
+
+    def __init__(
+        self,
+        arch="ViT-H-14",
+        version="laion2b_s32b_b79k",
+        device="cuda",
+        max_length=77,
+        freeze=True,
+        antialias=True,
+        ucg_rate=0.0,
+        unsqueeze_dim=False,
+        repeat_to_max_len=False,
+        num_image_crops=0,
+        output_tokens=False,
+    ):
+        super().__init__()
+        model, _, _ = open_clip.create_model_and_transforms(
+            arch,
+            device=torch.device("cpu"),
+            pretrained=version,
+        )
+        del model.transformer
+        self.model = model
+        self.max_crops = num_image_crops
+        self.pad_to_max_len = self.max_crops > 0
+        self.repeat_to_max_len = repeat_to_max_len and (not self.pad_to_max_len)
+        self.device = device
+        self.max_length = max_length
+        if freeze:
+            self.freeze()
+
+        self.antialias = antialias
+
+        self.register_buffer(
+            "mean", torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False
+        )
+        self.register_buffer(
+            "std", torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False
+        )
+        self.ucg_rate = ucg_rate
+        self.unsqueeze_dim = unsqueeze_dim
+        self.stored_batch = None
+        self.model.visual.output_tokens = output_tokens
+        self.output_tokens = output_tokens
+
+    def preprocess(self, x):
+        # normalize to [0,1]
+        x = kornia.geometry.resize(
+            x,
+            (224, 224),
+            interpolation="bicubic",
+            align_corners=True,
+            antialias=self.antialias,
+        )
+        x = (x + 1.0) / 2.0
+        # renormalize according to clip
+        x = kornia.enhance.normalize(x, self.mean, self.std)
+        return x
+
+    def freeze(self):
+        self.model = self.model.eval()
+        for param in self.parameters():
+            param.requires_grad = False
+
+    def forward(self, image, no_dropout=False):
+        z = self.encode_with_vision_transformer(image)
+        tokens = None
+        if self.output_tokens:
+            z, tokens = z[0], z[1]
+        z = z.to(image.dtype)
+        if self.ucg_rate > 0.0 and not no_dropout and not (self.max_crops > 0):
+            z = (
+                torch.bernoulli(
+                    (1.0 - self.ucg_rate) * torch.ones(z.shape[0], device=z.device)
+                )[:, None]
+                * z
+            )
+            if tokens is not None:
+                tokens = (
+                    expand_dims_like(
+                        torch.bernoulli(
+                            (1.0 - self.ucg_rate)
+                            * torch.ones(tokens.shape[0], device=tokens.device)
+                        ),
+                        tokens,
+                    )
+                    * tokens
+                )
+        if self.unsqueeze_dim:
+            z = z[:, None, :]
+        if self.output_tokens:
+            assert not self.repeat_to_max_len
+            assert not self.pad_to_max_len
+            return tokens, z
+        if self.repeat_to_max_len:
+            if z.dim() == 2:
+                z_ = z[:, None, :]
+            else:
+                z_ = z
+            return repeat(z_, "b 1 d -> b n d", n=self.max_length), z
+        elif self.pad_to_max_len:
+            assert z.dim() == 3
+            z_pad = torch.cat(
+                (
+                    z,
+                    torch.zeros(
+                        z.shape[0],
+                        self.max_length - z.shape[1],
+                        z.shape[2],
+                        device=z.device,
+                    ),
+                ),
+                1,
+            )
+            return z_pad, z_pad[:, 0, ...]
+        return z
+
+    def encode_with_vision_transformer(self, img):
+        # if self.max_crops > 0:
+        #     img = self.preprocess_by_cropping(img)
+        if img.dim() == 5:
+            assert self.max_crops == img.shape[1]
+            img = rearrange(img, "b n c h w -> (b n) c h w")
+        img = self.preprocess(img)
+        if not self.output_tokens:
+            assert not self.model.visual.output_tokens
+            x = self.model.visual(img)
+            tokens = None
+        else:
+            assert self.model.visual.output_tokens
+            x, tokens = self.model.visual(img)
+        if self.max_crops > 0:
+            x = rearrange(x, "(b n) d -> b n d", n=self.max_crops)
+            # drop out between 0 and all along the sequence axis
+            x = (
+                torch.bernoulli(
+                    (1.0 - self.ucg_rate)
+                    * torch.ones(x.shape[0], x.shape[1], 1, device=x.device)
+                )
+                * x
+            )
+            if tokens is not None:
+                tokens = rearrange(tokens, "(b n) t d -> b t (n d)", n=self.max_crops)
+                print(
+                    f"You are running very experimental token-concat in {self.__class__.__name__}. "
+                    f"Check what you are doing, and then remove this message."
+                )
+        if self.output_tokens:
+            return x, tokens
+        return x
+
+    def encode(self, text):
+        return self(text)
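The added class is the standard frozen OpenCLIP ViT-H/14 image encoder used for image conditioning. One thing to watch: the new import brings in `create_model_and_transforms` directly, while `__init__` calls `open_clip.create_model_and_transforms(...)`, so the module also needs `import open_clip` somewhere for the qualified call to resolve. A minimal usage sketch, not part of this commit, assuming `open_clip` is installed and the `laion2b_s32b_b79k` weights can be fetched:

    import torch
    from t2v_enhanced.model.diffusers_conditional.models.controlnet.image_embedder import (
        FrozenOpenCLIPImageEmbedder,
    )

    # freeze=True puts the encoder in eval mode and disables gradients.
    embedder = FrozenOpenCLIPImageEmbedder(freeze=True).to("cuda")

    # forward() expects images in [-1, 1]; preprocess() resizes them to 224x224
    # and applies the CLIP mean/std normalization registered as buffers.
    images = torch.rand(2, 3, 256, 256, device="cuda") * 2.0 - 1.0
    with torch.no_grad():
        z = embedder(images)
    print(z.shape)  # pooled image embeddings, (2, 1024) for ViT-H-14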