Testing0

Running on Zero

App Files Community

dagloop5 committed about 23 hours ago

Commit

db91dcb

verified ·

1 Parent(s): 00fea93

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -60

app.py CHANGED Viewed

@@ -101,7 +101,7 @@ RESOLUTIONS = {
 }
-class LTX23DistilledA2VPipeline(DistilledPipeline):
     """DistilledPipeline with optional audio conditioning."""
     def __call__(
@@ -113,24 +113,9 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
         num_frames: int,
         frame_rate: float,
         images: list[ImageConditioningInput],
-        audio_path: str | None = None,
         tiling_config: TilingConfig | None = None,
         enhance_prompt: bool = False,
     ):
-        # Standard path when no audio input is provided.
-        print(prompt)
-        if audio_path is None:
-            return super().__call__(
-                prompt=prompt,
-                seed=seed,
-                height=height,
-                width=width,
-                num_frames=num_frames,
-                frame_rate=frame_rate,
-                images=images,
-                tiling_config=tiling_config,
-                enhance_prompt=enhance_prompt,
-            )
         generator = torch.Generator(device=self.device).manual_seed(seed)
         noiser = GaussianNoiser(generator=generator)
@@ -141,38 +126,18 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
             [prompt],
             self.model_ledger,
             enhance_first_prompt=enhance_prompt,
-            enhance_prompt_image=images[0].path if len(images) > 0 else None,
         )
         video_context, audio_context = ctx_p.video_encoding, ctx_p.audio_encoding
-        video_duration = num_frames / frame_rate
-        decoded_audio = decode_audio_from_file(audio_path, self.device, 0.0, video_duration)
-        if decoded_audio is None:
-            raise ValueError(f"Could not extract audio stream from {audio_path}")
-        encoded_audio_latent = vae_encode_audio(decoded_audio, self.model_ledger.audio_encoder())
-        audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)
-        expected_frames = audio_shape.frames
-        actual_frames = encoded_audio_latent.shape[2]
-        if actual_frames > expected_frames:
-            encoded_audio_latent = encoded_audio_latent[:, :, :expected_frames, :]
-        elif actual_frames < expected_frames:
-            pad = torch.zeros(
-                encoded_audio_latent.shape[0],
-                encoded_audio_latent.shape[1],
-                expected_frames - actual_frames,
-                encoded_audio_latent.shape[3],
-                device=encoded_audio_latent.device,
-                dtype=encoded_audio_latent.dtype,
-            )
-            encoded_audio_latent = torch.cat([encoded_audio_latent, pad], dim=2)
         video_encoder = self.model_ledger.video_encoder()
         transformer = self.model_ledger.transformer()
-        stage_1_sigmas = torch.tensor(DISTILLED_SIGMA_VALUES, device=self.device)
-        def denoising_loop(sigmas, video_state, audio_state, stepper):
             return euler_denoising_loop(
                 sigmas=sigmas,
                 video_state=video_state,
@@ -181,7 +146,7 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
                 denoise_fn=simple_denoising_func(
                     video_context=video_context,
                     audio_context=audio_context,
-                    transformer=transformer,
                 ),
             )
@@ -200,7 +165,8 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
             dtype=dtype,
             device=self.device,
         )
-        video_state = denoise_video_only(
             output_shape=stage_1_output_shape,
             conditionings=stage_1_conditionings,
             noiser=noiser,
@@ -210,7 +176,6 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
             components=self.pipeline_components,
             dtype=dtype,
             device=self.device,
-            initial_audio_latent=encoded_audio_latent,
         )
         torch.cuda.synchronize()
@@ -219,16 +184,12 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
         cleanup_memory()
         decoded_video = vae_decode_video(
-            video_state.latent,
-            self.model_ledger.video_decoder(),
-            tiling_config,
-            generator,
         )
-        original_audio = Audio(
-            waveform=decoded_audio.waveform.squeeze(0),
-            sampling_rate=decoded_audio.sampling_rate,
         )
-        return decoded_video, original_audio
 # Model repos
@@ -566,7 +527,6 @@ def on_highres_toggle(first_image, last_image, high_res):
 def get_gpu_duration(
     first_image,
     last_image,
-    input_audio,
     prompt: str,
     duration: float,
     gpu_duration: float,
@@ -598,7 +558,6 @@ def get_gpu_duration(
 def generate_video(
     first_image,
     last_image,
-    input_audio,
     prompt: str,
     duration: float,
     gpu_duration: float,
@@ -670,7 +629,6 @@ def generate_video(
             num_frames=num_frames,
             frame_rate=frame_rate,
             images=images,
-            audio_path=input_audio,
             tiling_config=tiling_config,
             enhance_prompt=enhance_prompt,
         )
@@ -705,7 +663,6 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
             with gr.Row():
                 first_image = gr.Image(label="First Frame (Optional)", type="pil")
                 last_image = gr.Image(label="Last Frame (Optional)", type="pil")
-            input_audio = gr.Audio(label="Audio Input (Optional)", type="filepath")
             prompt = gr.Textbox(
                 label="Prompt",
                 info="for best results - make it as elaborate as possible",
@@ -807,7 +764,6 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
             [
                 None,
                 "pinkknit.jpg",
-                None,
                 "The camera falls downward through darkness as if dropped into a tunnel. "
                 "As it slows, five friends wearing pink knitted hats and sunglasses lean "
                 "over and look down toward the camera with curious expressions. The lens "
@@ -838,7 +794,7 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
             ],
         ],
         inputs=[
-            first_image, last_image, input_audio, prompt, duration, gpu_duration,
             enhance_prompt, seed, randomize_seed, height, width,
             pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength, voice_strength, realism_strength, transition_strength, physics_strength, reasoning_strength,
         ],
@@ -871,7 +827,7 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
     generate_btn.click(
         fn=generate_video,
         inputs=[
-            first_image, last_image, input_audio, prompt, duration, gpu_duration, enhance_prompt,
             seed, randomize_seed, height, width,
             pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength, voice_strength, realism_strength, transition_strength, physics_strength, reasoning_strength,
         ],

 }
+class LTX23DistilledA2VPipeline:
     """DistilledPipeline with optional audio conditioning."""
     def __call__(
         num_frames: int,
         frame_rate: float,
         images: list[ImageConditioningInput],
         tiling_config: TilingConfig | None = None,
         enhance_prompt: bool = False,
     ):
         generator = torch.Generator(device=self.device).manual_seed(seed)
         noiser = GaussianNoiser(generator=generator)
             [prompt],
             self.model_ledger,
             enhance_first_prompt=enhance_prompt,
+            enhance_prompt_image=images[0][0] if len(images) > 0 else None,
         )
         video_context, audio_context = ctx_p.video_encoding, ctx_p.audio_encoding
+        # Stage 1: Initial low resolution video generation.
         video_encoder = self.model_ledger.video_encoder()
         transformer = self.model_ledger.transformer()
+        stage_1_sigmas = torch.Tensor(DISTILLED_SIGMA_VALUES).to(self.device)
+        def denoising_loop(
+            sigmas: torch.Tensor, video_state: LatentState, audio_state: LatentState, stepper: DiffusionStepProtocol
+        ) -> tuple[LatentState, LatentState]:
             return euler_denoising_loop(
                 sigmas=sigmas,
                 video_state=video_state,
                 denoise_fn=simple_denoising_func(
                     video_context=video_context,
                     audio_context=audio_context,
+                    transformer=transformer,  # noqa: F821
                 ),
             )
             dtype=dtype,
             device=self.device,
         )
+        video_state, audio_state = denoise_audio_video(
             output_shape=stage_1_output_shape,
             conditionings=stage_1_conditionings,
             noiser=noiser,
             components=self.pipeline_components,
             dtype=dtype,
             device=self.device,
         )
         torch.cuda.synchronize()
         cleanup_memory()
         decoded_video = vae_decode_video(
+            video_state.latent, self.model_ledger.video_decoder(), tiling_config, generator
         )
+        decoded_audio = vae_decode_audio(
+            audio_state.latent, self.model_ledger.audio_decoder(), self.model_ledger.vocoder()
         )
+        return decoded_video, decoded_audio
 # Model repos
 def get_gpu_duration(
     first_image,
     last_image,
     prompt: str,
     duration: float,
     gpu_duration: float,
 def generate_video(
     first_image,
     last_image,
     prompt: str,
     duration: float,
     gpu_duration: float,
             num_frames=num_frames,
             frame_rate=frame_rate,
             images=images,
             tiling_config=tiling_config,
             enhance_prompt=enhance_prompt,
         )
             with gr.Row():
                 first_image = gr.Image(label="First Frame (Optional)", type="pil")
                 last_image = gr.Image(label="Last Frame (Optional)", type="pil")
             prompt = gr.Textbox(
                 label="Prompt",
                 info="for best results - make it as elaborate as possible",
             [
                 None,
                 "pinkknit.jpg",
                 "The camera falls downward through darkness as if dropped into a tunnel. "
                 "As it slows, five friends wearing pink knitted hats and sunglasses lean "
                 "over and look down toward the camera with curious expressions. The lens "
             ],
         ],
         inputs=[
+            first_image, last_image, prompt, duration, gpu_duration,
             enhance_prompt, seed, randomize_seed, height, width,
             pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength, voice_strength, realism_strength, transition_strength, physics_strength, reasoning_strength,
         ],
     generate_btn.click(
         fn=generate_video,
         inputs=[
+            first_image, last_image, prompt, duration, gpu_duration, enhance_prompt,
             seed, randomize_seed, height, width,
             pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength, voice_strength, realism_strength, transition_strength, physics_strength, reasoning_strength,
         ],