Spaces:

NTUST-DDRC
/

cosmos_transfer1_av

Paused

App Files Files Community

harry900000 committed on Jul 16

Commit

90101b2

1 Parent(s): 226c7c9

print latent shape

Browse files

Files changed (3) hide show

app.py +2 -2
cosmos_transfer1/diffusion/inference/world_generation_pipeline.py +7 -0
cosmos_transfer1/diffusion/model/model_t2w.py +3 -0

app.py CHANGED Viewed

@@ -28,8 +28,8 @@ except Exception as e:
 # download checkpoints
 from download_checkpoints import main as download_checkpoints
-os.makedirs(CHECKPOINTS_PATH, exist_ok=True)
-download_checkpoints(hf_token="", output_dir=CHECKPOINTS_PATH, model="7b_av")
 from test_environment import main as check_environment

 # download checkpoints
 from download_checkpoints import main as download_checkpoints
+# os.makedirs(CHECKPOINTS_PATH, exist_ok=True)
+# download_checkpoints(hf_token="", output_dir=CHECKPOINTS_PATH, model="7b_av")
 from test_environment import main as check_environment

cosmos_transfer1/diffusion/inference/world_generation_pipeline.py CHANGED Viewed

@@ -553,21 +553,28 @@ class DiffusionControl2WorldGenerationPipeline(BaseWorldGenerationPipeline):
             end_frame = num_new_generated_frames * (i_clip + 1) + self.num_input_frames
             # Prepare x_sigma_max
             if input_video is not None:
                 if is_upscale_case:
                     x_sigma_max = []
                     for b in range(B):
                         input_frames = input_video[b : b + 1, :, start_frame:end_frame].cuda()
                         x0 = self.model.encode(input_frames).contiguous()
                         x_sigma_max.append(self.model.get_x_from_clean(x0, self.sigma_max, seed=(self.seed + i_clip)))
                     x_sigma_max = torch.cat(x_sigma_max)
                 else:
                     input_frames = input_video[:, :, start_frame:end_frame].cuda()
                     x0 = self.model.encode(input_frames).contiguous()
                     x_sigma_max = self.model.get_x_from_clean(x0, self.sigma_max, seed=(self.seed + i_clip))
             else:
                 x_sigma_max = None
             data_batch_i[hint_key] = control_input[:, :, start_frame:end_frame].cuda()
             latent_hint = []

             end_frame = num_new_generated_frames * (i_clip + 1) + self.num_input_frames
             # Prepare x_sigma_max
+            print("==============================================================")
             if input_video is not None:
                 if is_upscale_case:
                     x_sigma_max = []
                     for b in range(B):
                         input_frames = input_video[b : b + 1, :, start_frame:end_frame].cuda()
                         x0 = self.model.encode(input_frames).contiguous()
+                        print("x0 shape ->", x0.shape)
                         x_sigma_max.append(self.model.get_x_from_clean(x0, self.sigma_max, seed=(self.seed + i_clip)))
+                        print("x_sigma_max shape ->", x_sigma_max.shape)
                     x_sigma_max = torch.cat(x_sigma_max)
                 else:
                     input_frames = input_video[:, :, start_frame:end_frame].cuda()
                     x0 = self.model.encode(input_frames).contiguous()
+                    print("x0 shape ->", x0.shape)
                     x_sigma_max = self.model.get_x_from_clean(x0, self.sigma_max, seed=(self.seed + i_clip))
+                    print("x_sigma_max shape ->", x_sigma_max.shape)
             else:
                 x_sigma_max = None
+            print("final ->", x_sigma_max.shape)
+            print("==============================================================")
             data_batch_i[hint_key] = control_input[:, :, start_frame:end_frame].cuda()
             latent_hint = []

cosmos_transfer1/diffusion/model/model_t2w.py CHANGED Viewed

@@ -177,6 +177,9 @@ class DiffusionT2WModel(torch.nn.Module):
                 noise prediction (eps_pred) and optional confidence (logvar).
         """
         xt = xt.to(**self.tensor_kwargs)
         sigma = sigma.to(**self.tensor_kwargs)
         # get precondition for the network

                 noise prediction (eps_pred) and optional confidence (logvar).
         """
+        print("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
+        print(xt.shape)
+        print("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
         xt = xt.to(**self.tensor_kwargs)
         sigma = sigma.to(**self.tensor_kwargs)
         # get precondition for the network