Spaces:

vclmax
/

Element-16-Video-Max

Running on Zero

App Files Files Community

Vicente Alvarez committed on 22 days ago

Commit

b1a127d

1 Parent(s): 824f9f7

Switch to DistilledPipeline with pre-distilled sulphur_distil_bf16 checkpoint

Browse files

Files changed (1) hide show

app.py +23 -82

app.py CHANGED Viewed

@@ -61,8 +61,7 @@ import gradio as gr
 import numpy as np
 from huggingface_hub import hf_hub_download, snapshot_download
-from ltx_core.loader import LTXV_LORA_COMFY_RENAMING_MAP, LoraPathStrengthAndSDOps
-from ltx_pipelines.ti2vid_two_stages_hq import TI2VidTwoStagesHQPipeline
 from ltx_pipelines.utils.args import ImageConditioningInput
 from ltx_pipelines.utils.media_io import encode_video
@@ -111,21 +110,17 @@ RESOLUTIONS = {
 # Model repos
 CHECKPOINT_REPO = "SulphurAI/Sulphur-2-base"
-DISTILL_LORA_REPO = "SulphurAI/Sulphur-2-base"
 LTX_MODEL_REPO = "Lightricks/LTX-2.3"
 GEMMA_REPO = "Lightricks/gemma-3-12b-it-qat-q4_0-unquantized"
 # Download model checkpoints in parallel for speed
 print("=" * 80)
-print("Downloading Element-16 dev + distill LoRA + Gemma (parallel)...")
 print("=" * 80)
 def download_checkpoint():
-    return hf_hub_download(repo_id=CHECKPOINT_REPO, filename="sulphur_dev_fp8mixed.safetensors")
-def download_lora():
-    # Skip distill LoRA for fp8 - not compatible with mxfp8mixed format
-    return None
 def download_upsampler():
     return hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.0.safetensors")
@@ -145,58 +140,27 @@ with ThreadPoolExecutor(max_workers=3) as executor:
 print(f"Checkpoint: {checkpoint_path}")
 print(f"Spatial upsampler: {spatial_upsampler_path}")
 print(f"Gemma root: {gemma_root}")
-print("Note: Using fp8 without distill LoRA - will use more inference steps")
-# Initialize pipeline with fp8 checkpoint (no distill LoRA for fp8 compatibility)
-pipeline = TI2VidTwoStagesHQPipeline(
-    checkpoint_path=checkpoint_path,
-    distilled_lora=[],
-    distilled_lora_strength_stage_1=0.0,
-    distilled_lora_strength_stage_2=0.0,
     spatial_upsampler_path=spatial_upsampler_path,
     gemma_root=gemma_root,
     loras=(),
 )
-# Preload all models for ZeroGPU tensor packing (BOTH stages!)
-print("Preloading all models (including Gemma and audio components)...")
-# Stage 1 models
-stage_1_ledger = pipeline.stage_1_model_ledger
-_transformer = stage_1_ledger.transformer()
-_video_encoder = stage_1_ledger.video_encoder()
-_video_decoder = stage_1_ledger.video_decoder()
-_audio_encoder = stage_1_ledger.audio_encoder()
-_audio_decoder = stage_1_ledger.audio_decoder()
-_vocoder = stage_1_ledger.vocoder()
-_spatial_upsampler_1 = stage_1_ledger.spatial_upsampler()
-_text_encoder = stage_1_ledger.text_encoder()
-_embeddings_processor = stage_1_ledger.gemma_embeddings_processor()
-stage_1_ledger.transformer = lambda: _transformer
-stage_1_ledger.video_encoder = lambda: _video_encoder
-stage_1_ledger.video_decoder = lambda: _video_decoder
-stage_1_ledger.audio_encoder = lambda: _audio_encoder
-stage_1_ledger.audio_decoder = lambda: _audio_decoder
-stage_1_ledger.vocoder = lambda: _vocoder
-stage_1_ledger.spatial_upsampler = lambda: _spatial_upsampler_1
-stage_1_ledger.text_encoder = lambda: _text_encoder
-stage_1_ledger.gemma_embeddings_processor = lambda: _embeddings_processor
-# Stage 2 models (critical - spatial upsampler is used here!)
-print("Preloading stage 2 models...")
-stage_2_ledger = pipeline.stage_2_model_ledger
-_spatial_upsampler_2 = stage_2_ledger.spatial_upsampler()
-_transformer_2 = stage_2_ledger.transformer()
-_video_encoder_2 = stage_2_ledger.video_encoder()
-_video_decoder_2 = stage_2_ledger.video_decoder()
-stage_2_ledger.spatial_upsampler = lambda: _spatial_upsampler_2
-stage_2_ledger.transformer = lambda: _transformer_2
-stage_2_ledger.video_encoder = lambda: _video_encoder_2
-stage_2_ledger.video_decoder = lambda: _video_decoder_2
-print("All models preloaded (stage 1 + stage 2)!")
 print("=" * 80)
 print("Pipeline ready!")
@@ -244,7 +208,7 @@ def on_highres_toggle(first_image, last_image, high_res):
 DEFAULT_NEGATIVE_PROMPT = "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走, blurry, glasses, deformed, subtitles, text, captions, worst quality, low quality, inconsistent motion, jittery, distorted"
-@spaces.GPU(duration=120)  # More time needed for 30 inference steps
 @torch.inference_mode()
 def generate_video(
     first_image,
@@ -291,7 +255,6 @@ def generate_video(
                 temp_last_path = Path(last_image)
             images.append(ImageConditioningInput(path=str(temp_last_path), frame_idx=num_frames - 1, strength=1.0))
-        from ltx_core.components.guiders import MultiModalGuiderParams
         from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number
         tiling_config = TilingConfig.default()
@@ -299,38 +262,16 @@ def generate_video(
         log_memory("before pipeline call")
-        # Configure guider params
-        video_guider_params = MultiModalGuiderParams(
-            cfg_scale=3.0,
-            stg_scale=0.0,
-            rescale_scale=0.45,
-            modality_scale=3.0,
-            skip_step=0,
-            stg_blocks=[],
-        )
-        audio_guider_params = MultiModalGuiderParams(
-            cfg_scale=7.0,
-            stg_scale=0.0,
-            rescale_scale=1.0,
-            modality_scale=3.0,
-            skip_step=0,
-            stg_blocks=[],
-        )
-        # Run inference - returns (video_frames_iter, audio)
         video_frames_iter, audio = pipeline(
             prompt=prompt,
-            negative_prompt=negative_prompt,
             seed=current_seed,
             height=int(height),
             width=int(width),
             num_frames=num_frames,
             frame_rate=frame_rate,
-            num_inference_steps=30,  # More steps needed without distill LoRA
-            video_guider_params=video_guider_params,
-            audio_guider_params=audio_guider_params,
             images=images,
         )
         # Collect video frames

 import numpy as np
 from huggingface_hub import hf_hub_download, snapshot_download
+from ltx_pipelines.distilled import DistilledPipeline
 from ltx_pipelines.utils.args import ImageConditioningInput
 from ltx_pipelines.utils.media_io import encode_video
 # Model repos
 CHECKPOINT_REPO = "SulphurAI/Sulphur-2-base"
 LTX_MODEL_REPO = "Lightricks/LTX-2.3"
 GEMMA_REPO = "Lightricks/gemma-3-12b-it-qat-q4_0-unquantized"
 # Download model checkpoints in parallel for speed
 print("=" * 80)
+print("Downloading Element-16 (pre-distilled) + Gemma (parallel)...")
 print("=" * 80)
 def download_checkpoint():
+    # Use pre-distilled checkpoint - no LoRA needed
+    return hf_hub_download(repo_id=CHECKPOINT_REPO, filename="sulphur_distil_bf16.safetensors")
 def download_upsampler():
     return hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.0.safetensors")
 print(f"Checkpoint: {checkpoint_path}")
 print(f"Spatial upsampler: {spatial_upsampler_path}")
 print(f"Gemma root: {gemma_root}")
+# Initialize pipeline with pre-distilled checkpoint (no LoRA needed)
+pipeline = DistilledPipeline(
+    distilled_checkpoint_path=checkpoint_path,
     spatial_upsampler_path=spatial_upsampler_path,
     gemma_root=gemma_root,
     loras=(),
 )
+# Preload all models for ZeroGPU tensor packing
+print("Preloading all pipeline components...")
+# DistilledPipeline components are already instantiated; just access them to ensure they are loaded
+_ = pipeline.prompt_encoder
+_ = pipeline.image_conditioner
+_ = pipeline.stage
+_ = pipeline.upsampler
+_ = pipeline.video_decoder
+_ = pipeline.audio_decoder
+print("All models preloaded!")
 print("=" * 80)
 print("Pipeline ready!")
 DEFAULT_NEGATIVE_PROMPT = "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走, blurry, glasses, deformed, subtitles, text, captions, worst quality, low quality, inconsistent motion, jittery, distorted"
+@spaces.GPU(duration=90)
 @torch.inference_mode()
 def generate_video(
     first_image,
                 temp_last_path = Path(last_image)
             images.append(ImageConditioningInput(path=str(temp_last_path), frame_idx=num_frames - 1, strength=1.0))
         from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number
         tiling_config = TilingConfig.default()
         log_memory("before pipeline call")
+        # Run inference - DistilledPipeline has simpler API
         video_frames_iter, audio = pipeline(
             prompt=prompt,
             seed=current_seed,
             height=int(height),
             width=int(width),
             num_frames=num_frames,
             frame_rate=frame_rate,
             images=images,
+            enhance_prompt=enhance_prompt,
         )
         # Collect video frames