Spaces: IceClear (Running)
Commit 67c0e7b · 1 Parent(s): 25e6160
Commit message: update

Files changed:
- app.py (+8, -7)
- projects/video_diffusion_sr/infer.py (+2, -8)
app.py CHANGED

@@ -61,6 +61,8 @@ from pathlib import Path
 from urllib.parse import urlparse
 from torch.hub import download_url_to_file, get_dir
 import shlex
+import uuid
+
 
 os.environ["MASTER_ADDR"] = "127.0.0.1"
 os.environ["MASTER_PORT"] = "12355"
@@ -223,7 +225,7 @@ def generation_step(runner, text_embeds_dict, cond_latents):
 @spaces.GPU(duration=120)
 def generation_loop(video_path='./test_videos', seed=666, fps_out=12, batch_size=1, cfg_scale=1.0, cfg_rescale=0.0, sample_steps=1, res_h=1280, res_w=720, sp_size=1):
     runner = configure_runner(1)
-    output_dir = 'output/
+    output_dir = 'output/' + uuid.uuid4() + '.mp4'
     def _build_pos_and_neg_prompt():
         # read positive prompt
         positive_text = "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, \
@@ -342,12 +344,12 @@ def generation_loop(video_path='./test_videos', seed=666, fps_out=12, batch_size
         input_videos = cond_latents
         cond_latents = [cut_videos(video, sp_size) for video in cond_latents]
 
-        runner.dit.to("cpu")
+        # runner.dit.to("cpu")
         print(f"Encoding videos: {list(map(lambda x: x.size(), cond_latents))}")
-        runner.vae.to(torch.device("cuda"))
+        # runner.vae.to(torch.device("cuda"))
         cond_latents = runner.vae_encode(cond_latents)
-        runner.vae.to("cpu")
-        runner.dit.to(torch.device("cuda"))
+        # runner.vae.to("cpu")
+        # runner.dit.to(torch.device("cuda"))
 
         for i, emb in enumerate(text_embeds["texts_pos"]):
             text_embeds["texts_pos"][i] = emb.to(torch.device("cuda"))
@@ -355,7 +357,7 @@ def generation_loop(video_path='./test_videos', seed=666, fps_out=12, batch_size
             text_embeds["texts_neg"][i] = emb.to(torch.device("cuda"))
 
         samples = generation_step(runner, text_embeds, cond_latents=cond_latents)
-        runner.dit.to("cpu")
+        # runner.dit.to("cpu")
         del cond_latents
 
         # dump samples to the output directory
@@ -364,7 +366,6 @@ def generation_loop(video_path='./test_videos', seed=666, fps_out=12, batch_size
         ):
             if ori_length < sample.shape[0]:
                 sample = sample[:ori_length]
-            filename = os.path.join(tgt_path, os.path.basename(path))
            # color fix
            input = (
                rearrange(input[:, None], "c t h w -> t c h w")
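A note on the new output path in generation_loop: uuid.uuid4() returns a uuid.UUID object, so concatenating it directly with a string as in the diff raises a TypeError at runtime. Below is a minimal sketch of the presumably intended behaviour with an explicit str() conversion; the 'output/' prefix and '.mp4' suffix come from the diff, while the os.makedirs call is an added assumption.

import os
import uuid

# Unique per-request output file, e.g. "output/3f2b6c1e-....mp4".
# str() is needed because uuid.UUID does not support "+" with str.
output_dir = os.path.join("output", str(uuid.uuid4()) + ".mp4")

# Assumption: ensure the target directory exists before writing the video.
os.makedirs(os.path.dirname(output_dir), exist_ok=True)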
projects/video_diffusion_sr/infer.py CHANGED

@@ -41,6 +41,7 @@ from models.dit_v2 import na
 class VideoDiffusionInfer():
     def __init__(self, config: DictConfig):
         self.config = config
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
 
     def get_condition(self, latent: Tensor, latent_blur: Tensor, task: str) -> Tensor:
         t, h, w, c = latent.shape
@@ -75,13 +76,9 @@ class VideoDiffusionInfer():
         # For fast init & resume,
         # when training from scratch, rank0 init DiT on cpu, then sync to other ranks with FSDP.
         # otherwise, all ranks init DiT on meta device, then load_state_dict with assign=True.
-        if self.config.dit.get("init_with_meta_device", False):
-            init_device = "cpu" if get_global_rank() == 0 and checkpoint is None else "meta"
-        else:
-            init_device = "cpu"
 
         # Create dit model.
-        with torch.device(
+        with torch.device(self.device):
             self.dit = create_object(self.config.dit.model)
             self.dit.set_gradient_checkpointing(self.config.dit.gradient_checkpoint)
 
@@ -92,9 +89,6 @@ class VideoDiffusionInfer():
         print(f"Loading info: {loading_info}")
         self.dit = meta_non_persistent_buffer_init_fn(self.dit)
 
-        if device in [get_device(), "cuda"]:
-            self.dit.to(get_device())
-
         # Print model size.
         num_params = sum(p.numel() for p in self.dit.parameters() if p.requires_grad)
         print(f"DiT trainable parameters: {num_params:,}")