Spaces:

vclmax
/

Element-16-Video-Max

Running on Zero

App Files Files Community

Vicente Alvarez committed 5 days ago

Commit

e162f46

1 Parent(s): ce3e28e

Sulphur dev + distill LoRA with TI2VidTwoStagesHQPipeline

Browse files

Files changed (1) hide show

app.py +78 -211

app.py CHANGED Viewed

@@ -52,24 +52,10 @@ import gradio as gr
 import numpy as np
 from huggingface_hub import hf_hub_download, snapshot_download
-from ltx_core.components.diffusion_steps import EulerDiffusionStep
-from ltx_core.components.noisers import GaussianNoiser
-from ltx_core.model.audio_vae import encode_audio as vae_encode_audio
-from ltx_core.model.upsampler import upsample_video
-from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number, decode_video as vae_decode_video
-from ltx_core.types import Audio, AudioLatentShape, VideoPixelShape
-from ltx_pipelines.distilled import DistilledPipeline
-from ltx_pipelines.utils import euler_denoising_loop
 from ltx_pipelines.utils.args import ImageConditioningInput
-from ltx_pipelines.utils.constants import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
-from ltx_pipelines.utils.helpers import (
-    cleanup_memory,
-    combined_image_conditionings,
-    denoise_video_only,
-    encode_prompts,
-    simple_denoising_func,
-)
-from ltx_pipelines.utils.media_io import decode_audio_from_file, encode_video
 # Patch attention backend into the LTX attention module.
 import torch.nn.functional as F
@@ -114,217 +100,69 @@ RESOLUTIONS = {
 }
-class LTX23DistilledA2VPipeline(DistilledPipeline):
-    """DistilledPipeline with optional audio conditioning."""
-    def __call__(
-        self,
-        prompt: str,
-        seed: int,
-        height: int,
-        width: int,
-        num_frames: int,
-        frame_rate: float,
-        images: list[ImageConditioningInput],
-        audio_path: str | None = None,
-        tiling_config: TilingConfig | None = None,
-        enhance_prompt: bool = False,
-    ):
-        # Standard path when no audio input is provided.
-        if audio_path is None:
-            return super().__call__(
-                prompt=prompt,
-                seed=seed,
-                height=height,
-                width=width,
-                num_frames=num_frames,
-                frame_rate=frame_rate,
-                images=images,
-                tiling_config=tiling_config,
-                enhance_prompt=enhance_prompt,
-            )
-        generator = torch.Generator(device=self.device).manual_seed(seed)
-        noiser = GaussianNoiser(generator=generator)
-        stepper = EulerDiffusionStep()
-        dtype = torch.bfloat16
-        (ctx_p,) = encode_prompts(
-            [prompt],
-            self.model_ledger,
-            enhance_first_prompt=enhance_prompt,
-            enhance_prompt_image=images[0].path if len(images) > 0 else None,
-        )
-        video_context, audio_context = ctx_p.video_encoding, ctx_p.audio_encoding
-        video_duration = num_frames / frame_rate
-        decoded_audio = decode_audio_from_file(audio_path, self.device, 0.0, video_duration)
-        if decoded_audio is None:
-            raise ValueError(f"Could not extract audio stream from {audio_path}")
-        encoded_audio_latent = vae_encode_audio(decoded_audio, self.model_ledger.audio_encoder())
-        audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)
-        expected_frames = audio_shape.frames
-        actual_frames = encoded_audio_latent.shape[2]
-        if actual_frames > expected_frames:
-            encoded_audio_latent = encoded_audio_latent[:, :, :expected_frames, :]
-        elif actual_frames < expected_frames:
-            pad = torch.zeros(
-                encoded_audio_latent.shape[0],
-                encoded_audio_latent.shape[1],
-                expected_frames - actual_frames,
-                encoded_audio_latent.shape[3],
-                device=encoded_audio_latent.device,
-                dtype=encoded_audio_latent.dtype,
-            )
-            encoded_audio_latent = torch.cat([encoded_audio_latent, pad], dim=2)
-        video_encoder = self.model_ledger.video_encoder()
-        transformer = self.model_ledger.transformer()
-        stage_1_sigmas = torch.tensor(DISTILLED_SIGMA_VALUES, device=self.device)
-        def denoising_loop(sigmas, video_state, audio_state, stepper):
-            return euler_denoising_loop(
-                sigmas=sigmas,
-                video_state=video_state,
-                audio_state=audio_state,
-                stepper=stepper,
-                denoise_fn=simple_denoising_func(
-                    video_context=video_context,
-                    audio_context=audio_context,
-                    transformer=transformer,
-                ),
-            )
-        stage_1_output_shape = VideoPixelShape(
-            batch=1,
-            frames=num_frames,
-            width=width // 2,
-            height=height // 2,
-            fps=frame_rate,
-        )
-        stage_1_conditionings = combined_image_conditionings(
-            images=images,
-            height=stage_1_output_shape.height,
-            width=stage_1_output_shape.width,
-            video_encoder=video_encoder,
-            dtype=dtype,
-            device=self.device,
-        )
-        video_state = denoise_video_only(
-            output_shape=stage_1_output_shape,
-            conditionings=stage_1_conditionings,
-            noiser=noiser,
-            sigmas=stage_1_sigmas,
-            stepper=stepper,
-            denoising_loop_fn=denoising_loop,
-            components=self.pipeline_components,
-            dtype=dtype,
-            device=self.device,
-            initial_audio_latent=encoded_audio_latent,
-        )
-        torch.cuda.synchronize()
-        cleanup_memory()
-        upscaled_video_latent = upsample_video(
-            latent=video_state.latent[:1],
-            video_encoder=video_encoder,
-            upsampler=self.model_ledger.spatial_upsampler(),
-        )
-        stage_2_sigmas = torch.tensor(STAGE_2_DISTILLED_SIGMA_VALUES, device=self.device)
-        stage_2_output_shape = VideoPixelShape(batch=1, frames=num_frames, width=width, height=height, fps=frame_rate)
-        stage_2_conditionings = combined_image_conditionings(
-            images=images,
-            height=stage_2_output_shape.height,
-            width=stage_2_output_shape.width,
-            video_encoder=video_encoder,
-            dtype=dtype,
-            device=self.device,
-        )
-        video_state = denoise_video_only(
-            output_shape=stage_2_output_shape,
-            conditionings=stage_2_conditionings,
-            noiser=noiser,
-            sigmas=stage_2_sigmas,
-            stepper=stepper,
-            denoising_loop_fn=denoising_loop,
-            components=self.pipeline_components,
-            dtype=dtype,
-            device=self.device,
-            noise_scale=stage_2_sigmas[0],
-            initial_video_latent=upscaled_video_latent,
-            initial_audio_latent=encoded_audio_latent,
-        )
-        torch.cuda.synchronize()
-        del transformer
-        del video_encoder
-        cleanup_memory()
-        decoded_video = vae_decode_video(
-            video_state.latent,
-            self.model_ledger.video_decoder(),
-            tiling_config,
-            generator,
-        )
-        original_audio = Audio(
-            waveform=decoded_audio.waveform.squeeze(0),
-            sampling_rate=decoded_audio.sampling_rate,
-        )
-        return decoded_video, original_audio
 # Model repos
-CHECKPOINT_REPO = "Civitai/Sulphur-2-distilled-fp8"
 LTX_MODEL_REPO = "Lightricks/LTX-2.3"
 GEMMA_REPO = "Lightricks/gemma-3-12b-it-qat-q4_0-unquantized"
 # Download model checkpoints
 print("=" * 80)
-print("Downloading Element-16 fp8 model + Gemma...")
 print("=" * 80)
-checkpoint_path = hf_hub_download(repo_id=CHECKPOINT_REPO, filename="sulphur_distil_fp8mixed.safetensors")
 spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.1.safetensors")
 gemma_root = snapshot_download(repo_id=GEMMA_REPO)
 print(f"Checkpoint: {checkpoint_path}")
 print(f"Spatial upsampler: {spatial_upsampler_path}")
 print(f"Gemma root: {gemma_root}")
-# Initialize pipeline WITH text encoder and optional audio support
-# No fp8_cast needed - checkpoint is already fp8 quantized
-pipeline = LTX23DistilledA2VPipeline(
-    distilled_checkpoint_path=checkpoint_path,
     spatial_upsampler_path=spatial_upsampler_path,
     gemma_root=gemma_root,
-    loras=[],
 )
 # Preload all models for ZeroGPU tensor packing.
 print("Preloading all models (including Gemma and audio components)...")
-ledger = pipeline.model_ledger
-_transformer = ledger.transformer()
-_video_encoder = ledger.video_encoder()
-_video_decoder = ledger.video_decoder()
-_audio_encoder = ledger.audio_encoder()
-_audio_decoder = ledger.audio_decoder()
-_vocoder = ledger.vocoder()
-_spatial_upsampler = ledger.spatial_upsampler()
-_text_encoder = ledger.text_encoder()
-_embeddings_processor = ledger.gemma_embeddings_processor()
-ledger.transformer = lambda: _transformer
-ledger.video_encoder = lambda: _video_encoder
-ledger.video_decoder = lambda: _video_decoder
-ledger.audio_encoder = lambda: _audio_encoder
-ledger.audio_decoder = lambda: _audio_decoder
-ledger.vocoder = lambda: _vocoder
-ledger.spatial_upsampler = lambda: _spatial_upsampler
-ledger.text_encoder = lambda: _text_encoder
-ledger.gemma_embeddings_processor = lambda: _embeddings_processor
 print("All models preloaded (including Gemma text encoder and audio encoder)!")
 print("=" * 80)
@@ -420,28 +258,57 @@ def generate_video(
                 temp_last_path = Path(last_image)
             images.append(ImageConditioningInput(path=str(temp_last_path), frame_idx=num_frames - 1, strength=1.0))
         tiling_config = TilingConfig.default()
         video_chunks_number = get_video_chunks_number(num_frames, tiling_config)
         log_memory("before pipeline call")
-        video, audio = pipeline(
             prompt=prompt,
             seed=current_seed,
             height=int(height),
             width=int(width),
             num_frames=num_frames,
             frame_rate=frame_rate,
             images=images,
-            tiling_config=tiling_config,
-            enhance_prompt=enhance_prompt,
         )
         log_memory("after pipeline call")
         output_path = tempfile.mktemp(suffix=".mp4")
         encode_video(
-            video=video,
             fps=frame_rate,
             audio=audio,
             output_path=output_path,

 import numpy as np
 from huggingface_hub import hf_hub_download, snapshot_download
+from ltx_core.loader import LTXV_LORA_COMFY_RENAMING_MAP, LoraPathStrengthAndSDOps
+from ltx_pipelines.ti2vid_two_stages_hq import TI2VidTwoStagesHQPipeline
 from ltx_pipelines.utils.args import ImageConditioningInput
+from ltx_pipelines.utils.media_io import encode_video
 # Patch attention backend into the LTX attention module.
 import torch.nn.functional as F
 }
 # Model repos
+CHECKPOINT_REPO = "SulphurAI/Sulphur-2-base"
+DISTILL_LORA_REPO = "SulphurAI/Sulphur-2-base"
 LTX_MODEL_REPO = "Lightricks/LTX-2.3"
 GEMMA_REPO = "Lightricks/gemma-3-12b-it-qat-q4_0-unquantized"
 # Download model checkpoints
 print("=" * 80)
+print("Downloading Element-16 dev + distill LoRA + Gemma...")
 print("=" * 80)
+checkpoint_path = hf_hub_download(repo_id=CHECKPOINT_REPO, filename="sulphur_dev_fp8mixed.safetensors")
+distilled_lora_path = hf_hub_download(repo_id=DISTILL_LORA_REPO, filename="distill_loras/ltx-2.3-22b-distilled-lora-1.1_fro90_ceil72_condsafe.safetensors")
 spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.1.safetensors")
 gemma_root = snapshot_download(repo_id=GEMMA_REPO)
 print(f"Checkpoint: {checkpoint_path}")
+print(f"Distilled LoRA: {distilled_lora_path}")
 print(f"Spatial upsampler: {spatial_upsampler_path}")
 print(f"Gemma root: {gemma_root}")
+# Create distilled LoRA entry
+distilled_lora = [
+    LoraPathStrengthAndSDOps(
+        distilled_lora_path,
+        1.0,
+        LTXV_LORA_COMFY_RENAMING_MAP,
+    ),
+]
+# Initialize pipeline with dev checkpoint + distilled LoRA
+pipeline = TI2VidTwoStagesHQPipeline(
+    checkpoint_path=checkpoint_path,
+    distilled_lora=distilled_lora,
+    distilled_lora_strength_stage_1=0.25,
+    distilled_lora_strength_stage_2=0.5,
     spatial_upsampler_path=spatial_upsampler_path,
     gemma_root=gemma_root,
+    loras=(),
 )
 # Preload all models for ZeroGPU tensor packing.
 print("Preloading all models (including Gemma and audio components)...")
+stage_1_ledger = pipeline.stage_1_model_ledger
+_transformer = stage_1_ledger.transformer()
+_video_encoder = stage_1_ledger.video_encoder()
+_video_decoder = stage_1_ledger.video_decoder()
+_audio_encoder = stage_1_ledger.audio_encoder()
+_audio_decoder = stage_1_ledger.audio_decoder()
+_vocoder = stage_1_ledger.vocoder()
+_spatial_upsampler = stage_1_ledger.spatial_upsampler()
+_text_encoder = stage_1_ledger.text_encoder()
+_embeddings_processor = stage_1_ledger.gemma_embeddings_processor()
+stage_1_ledger.transformer = lambda: _transformer
+stage_1_ledger.video_encoder = lambda: _video_encoder
+stage_1_ledger.video_decoder = lambda: _video_decoder
+stage_1_ledger.audio_encoder = lambda: _audio_encoder
+stage_1_ledger.audio_decoder = lambda: _audio_decoder
+stage_1_ledger.vocoder = lambda: _vocoder
+stage_1_ledger.spatial_upsampler = lambda: _spatial_upsampler
+stage_1_ledger.text_encoder = lambda: _text_encoder
+stage_1_ledger.gemma_embeddings_processor = lambda: _embeddings_processor
 print("All models preloaded (including Gemma text encoder and audio encoder)!")
 print("=" * 80)
                 temp_last_path = Path(last_image)
             images.append(ImageConditioningInput(path=str(temp_last_path), frame_idx=num_frames - 1, strength=1.0))
+        from ltx_core.components.guiders import MultiModalGuiderParams
+        from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number
         tiling_config = TilingConfig.default()
         video_chunks_number = get_video_chunks_number(num_frames, tiling_config)
         log_memory("before pipeline call")
+        # Configure guider params
+        video_guider_params = MultiModalGuiderParams(
+            cfg_scale=3.0,
+            stg_scale=0.0,
+            rescale_scale=0.45,
+            modality_scale=3.0,
+            skip_step=0,
+            stg_blocks=[],
+        )
+        audio_guider_params = MultiModalGuiderParams(
+            cfg_scale=7.0,
+            stg_scale=0.0,
+            rescale_scale=1.0,
+            modality_scale=3.0,
+            skip_step=0,
+            stg_blocks=[],
+        )
+        # Run inference - returns (video_frames_iter, audio)
+        video_frames_iter, audio = pipeline(
             prompt=prompt,
+            negative_prompt=negative_prompt,
             seed=current_seed,
             height=int(height),
             width=int(width),
             num_frames=num_frames,
             frame_rate=frame_rate,
+            num_inference_steps=15,
+            video_guider_params=video_guider_params,
+            audio_guider_params=audio_guider_params,
             images=images,
         )
+        # Collect video frames
+        frames = [frame for frame in video_frames_iter]
+        video_tensor = torch.cat(frames, dim=0) if len(frames) > 1 else frames[0]
         log_memory("after pipeline call")
         output_path = tempfile.mktemp(suffix=".mp4")
         encode_video(
+            video=video_tensor,
             fps=frame_rate,
             audio=audio,
             output_path=output_path,