jadechoghari
/

spad

@@ -81,7 +81,7 @@ class SPADPipeline(DiffusionPipeline):
         batch_size = len(prompt) if isinstance(prompt, list) else 1
         device = self.device
-        # Generate camera batch
         if elevations is None or azimuths is None:
             elevations = [45] * 4
             azimuths = [0, 90, 180, 270]
@@ -90,23 +90,23 @@ class SPADPipeline(DiffusionPipeline):
         camera_batch = self.generate_camera_batch(elevations, azimuths, use_abs=self.use_abs_extrinsics)
         camera_batch = {k: v[None].repeat_interleave(batch_size, dim=0).to(device) for k, v in camera_batch.items()}
-        # Prepare gaussian blob initialization
         blob = self.get_gaussian_image(sigma=blob_sigma).to(device)
         camera_batch["img"] = blob.unsqueeze(0).unsqueeze(0).repeat(batch_size, n_views, 1, 1, 1)
-        # Encode text
         text_input_ids = self.tokenizer(prompt, padding="max_length", max_length=self.tokenizer.model_max_length, return_tensors="pt").input_ids.to(device)
         text_embeddings = self.text_encoder(text_input_ids)[0]
-        # Prepare unconditional embeddings for classifier-free guidance
         max_length = text_input_ids.shape[-1]
         uncond_input = self.tokenizer([""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt")
         uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
-        # Encode camera data
         camera_embeddings = self.cc_projection(camera_batch["cam"]).to(device)
-        # Prepare latents
         latent_height, latent_width = self.vae.config.sample_size // 8, self.vae.config.sample_size // 8
         latents = self.prepare_latents(
             batch_size,
@@ -119,36 +119,33 @@ class SPADPipeline(DiffusionPipeline):
             generator=None,
         )
-        # Prepare epi_constraint_masks (placeholder, replace with actual implementation)
         epi_constraint_masks = torch.ones(batch_size, n_views, latent_height, latent_width, n_views, latent_height, latent_width, dtype=torch.bool, device=device)
-        # Prepare plucker embeddings (placeholder, replace with actual implementation)
         plucker_embeds = torch.zeros(batch_size, n_views, 6, latent_height, latent_width, device=device)
         latent_height, latent_width = 64, 64  # Fixed to match the required shape [batch_size, 1, 4, 64, 64]
         n_objects = 2;
         latents = torch.randn(n_objects, n_views, 4, 64, 64, device=device, dtype=self.unet.dtype)
-        # Set up scheduler
         # self.scheduler.set_timesteps(num_inference_steps)
         self.scheduler.set_timesteps(50)
-        # Repeat text_embeddings to match the desired dimensions
-        text_embeddings = text_embeddings.repeat(n_objects, 1, 1)  # Shape: [2, max_seq_len, 512]
-        # Reshape text_embeddings to match [n_objects, n_views, max_seq_len, 512]
         text_embeddings = text_embeddings.unsqueeze(1).repeat(1, n_views, 1, 1)
         camera_embeddings = camera_embeddings.repeat(n_objects, 1, 1, 1)
-        # Denoising loop
         for t in tqdm(self.scheduler.timesteps):
-          # Expand timesteps to match shape [batch_size, 1, 1]
           # timesteps = torch.full((batch_size, 1, 1), t, device=device, dtype=torch.long)
           timesteps = torch.full((n_objects, n_views), t, device=device, dtype=torch.long)
-          # Prepare context
           context = [
-            # text_embeddings.unsqueeze(1),  # [batch_size, 1, max_seq_len, 768]
-            # camera_embeddings.unsqueeze(1) * 0.0,  # [batch_size, 1, 1280] * 0.0
-            # epi_constraint_masks  # Keep this as is for now
             text_embeddings.to(device),  # [n_objects, n_views, max_seq_len, 768]
             camera_embeddings,  # [n_objects, n_views, 1280]
             torch.ones(n_objects, n_views, 6, 32, 32).to(device)
@@ -161,22 +158,22 @@ class SPADPipeline(DiffusionPipeline):
               context=context
           )
-          # Perform guidance
           noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
           noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-          # Compute previous noisy sample
           latents = self.scheduler.step(noise_pred, t, latents).prev_sample
         # reduce latents
-        #EXPERIMENTAL
         latents_reshaped = latents[:, 0, :, :, :]  # Selecting the first view
-        # Decode latents
         images = self.vae.decode(latents_reshaped / self.vae.config.scaling_factor, return_dict=False)[0]
-        # Post-process images
         images = (images / 2 + 0.5).clamp(0, 1)
         if images.dim() == 5:

         batch_size = len(prompt) if isinstance(prompt, list) else 1
         device = self.device
+        # generate camera batch
         if elevations is None or azimuths is None:
             elevations = [45] * 4
             azimuths = [0, 90, 180, 270]
         camera_batch = self.generate_camera_batch(elevations, azimuths, use_abs=self.use_abs_extrinsics)
         camera_batch = {k: v[None].repeat_interleave(batch_size, dim=0).to(device) for k, v in camera_batch.items()}
+        # prepare gaussian blob initialization
         blob = self.get_gaussian_image(sigma=blob_sigma).to(device)
         camera_batch["img"] = blob.unsqueeze(0).unsqueeze(0).repeat(batch_size, n_views, 1, 1, 1)
+        # encode text
         text_input_ids = self.tokenizer(prompt, padding="max_length", max_length=self.tokenizer.model_max_length, return_tensors="pt").input_ids.to(device)
         text_embeddings = self.text_encoder(text_input_ids)[0]
+        # prepare unconditional embeddings for classifier-free guidance
         max_length = text_input_ids.shape[-1]
         uncond_input = self.tokenizer([""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt")
         uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
+        # encode camera data
         camera_embeddings = self.cc_projection(camera_batch["cam"]).to(device)
+        # prepare latents
         latent_height, latent_width = self.vae.config.sample_size // 8, self.vae.config.sample_size // 8
         latents = self.prepare_latents(
             batch_size,
             generator=None,
         )
+        # prepare epi_constraint_masks (placeholder; replace with the actual implementation later - MIGHT AFFECT PERFORMANCE)
         epi_constraint_masks = torch.ones(batch_size, n_views, latent_height, latent_width, n_views, latent_height, latent_width, dtype=torch.bool, device=device)
+        # prepare plucker embeddings (placeholder, replace with actual implementation - MIGHT AFFECT PERFORMANCE)
         plucker_embeds = torch.zeros(batch_size, n_views, 6, latent_height, latent_width, device=device)
         latent_height, latent_width = 64, 64  # Fixed to match the required shape [batch_size, 1, 4, 64, 64]
         n_objects = 2;
         latents = torch.randn(n_objects, n_views, 4, 64, 64, device=device, dtype=self.unet.dtype)
+        # set up scheduler
         # self.scheduler.set_timesteps(num_inference_steps)
         self.scheduler.set_timesteps(50)
+        # repeat text_embeddings to match the desired dimensions
+        text_embeddings = text_embeddings.repeat(n_objects, 1, 1)  # Shape: [2, max_seq_len, 768]
+        # expand text_embeddings to [n_objects, n_views, max_seq_len, 768] by adding a view axis and repeating it n_views times
         text_embeddings = text_embeddings.unsqueeze(1).repeat(1, n_views, 1, 1)
         camera_embeddings = camera_embeddings.repeat(n_objects, 1, 1, 1)
+        # denoising loop
         for t in tqdm(self.scheduler.timesteps):
+          # build the timestep tensor with shape [n_objects, n_views], filled with the current timestep t
           # timesteps = torch.full((batch_size, 1, 1), t, device=device, dtype=torch.long)
           timesteps = torch.full((n_objects, n_views), t, device=device, dtype=torch.long)
+          # prepare context
           context = [
             text_embeddings.to(device),  # [n_objects, n_views, max_seq_len, 768]
             camera_embeddings,  # [n_objects, n_views, 1280]
             torch.ones(n_objects, n_views, 6, 32, 32).to(device)
               context=context
           )
+          # perform guidance
           noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
           noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+          # compute previous noisy sample
           latents = self.scheduler.step(noise_pred, t, latents).prev_sample
         # reduce latents
+        #EXPERIMENTAL - MIGHT AFFECT PERFORMANCE
         latents_reshaped = latents[:, 0, :, :, :]  # Selecting the first view
+        # decode latents
         images = self.vae.decode(latents_reshaped / self.vae.config.scaling_factor, return_dict=False)[0]
+        # post-process images
         images = (images / 2 + 0.5).clamp(0, 1)
         if images.dim() == 5: