Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -166,24 +166,6 @@ class LTX23DistilledA2VPipeline:
|
|
| 166 |
|
| 167 |
video_duration = num_frames / frame_rate
|
| 168 |
|
| 169 |
-
encoded_audio_latent = vae_encode_audio(self.model_ledger.audio_encoder())
|
| 170 |
-
audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)
|
| 171 |
-
expected_frames = audio_shape.frames
|
| 172 |
-
actual_frames = encoded_audio_latent.shape[2]
|
| 173 |
-
|
| 174 |
-
if actual_frames > expected_frames:
|
| 175 |
-
encoded_audio_latent = encoded_audio_latent[:, :, :expected_frames, :]
|
| 176 |
-
elif actual_frames < expected_frames:
|
| 177 |
-
pad = torch.zeros(
|
| 178 |
-
encoded_audio_latent.shape[0],
|
| 179 |
-
encoded_audio_latent.shape[1],
|
| 180 |
-
expected_frames - actual_frames,
|
| 181 |
-
encoded_audio_latent.shape[3],
|
| 182 |
-
device=encoded_audio_latent.device,
|
| 183 |
-
dtype=encoded_audio_latent.dtype,
|
| 184 |
-
)
|
| 185 |
-
encoded_audio_latent = torch.cat([encoded_audio_latent, pad], dim=2)
|
| 186 |
-
|
| 187 |
video_encoder = self.model_ledger.video_encoder()
|
| 188 |
transformer = self.model_ledger.transformer()
|
| 189 |
stage_1_sigmas = torch.tensor(DISTILLED_SIGMA_VALUES, device=self.device)
|
|
|
|
| 166 |
|
| 167 |
video_duration = num_frames / frame_rate
|
| 168 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
video_encoder = self.model_ledger.video_encoder()
|
| 170 |
transformer = self.model_ledger.transformer()
|
| 171 |
stage_1_sigmas = torch.tensor(DISTILLED_SIGMA_VALUES, device=self.device)
|