"""Quantized LTX-2 text-to-video generation.

Loads the Lightricks/LTX-2 pipeline with the text encoder and the video
transformer quantized to 4-bit NF4 (bitsandbytes), generates a video+audio
clip from a text prompt, and leaves ``video`` (uint8 torch tensor), ``audio``,
``frame_rate`` and ``pipe`` in scope for the export step below.
"""
import math  # noqa: F401  -- kept from original; currently unused

import numpy as np
import torch
from diffusers import (
    LTX2ImageToVideoPipeline,  # noqa: F401  -- unused: this script is text-to-video
    LTX2Pipeline,
    LTX2VideoTransformer3DModel,
)
from diffusers.pipelines.ltx2.export_utils import encode_video
from diffusers.utils import load_image  # noqa: F401
from transformers import (
    AutoTokenizer,  # noqa: F401  -- kept from original; currently unused
    BitsAndBytesConfig,
    Gemma3ForConditionalGeneration,
    Qwen3ForCausalLM,  # noqa: F401  -- kept from original; currently unused
)

# Speed knobs: allow TF32 matmul/conv kernels and let cuDNN autotune.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.benchmark = True

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
BNB_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

repo = "Lightricks/LTX-2"

# Text encoder, quantized to 4-bit on load.
text_encoder = Gemma3ForConditionalGeneration.from_pretrained(
    repo, subfolder="text_encoder", quantization_config=BNB_CONFIG
)

# Denoising transformer, quantized to 4-bit on load.
transformer_4bit = LTX2VideoTransformer3DModel.from_pretrained(
    repo, subfolder="transformer", quantization_config=BNB_CONFIG
)

pipe = LTX2Pipeline.from_pretrained(
    repo,
    torch_dtype=torch.bfloat16,
    transformer=transformer_4bit,
    text_encoder=text_encoder,
)

# Cast only the non-quantized submodules to bfloat16; the 4-bit modules keep
# their packed weights.
pipe.vae.to(dtype=torch.bfloat16)
pipe.connectors.to(dtype=torch.bfloat16)
pipe.audio_vae.to(dtype=torch.bfloat16)
pipe.vocoder.to(dtype=torch.bfloat16)
# FIX: move to CUDA without a dtype argument. The original passed
# dtype=torch.bfloat16, which asks diffusers to cast every module -- including
# the bitsandbytes 4-bit transformer/text encoder, whose packed weights must
# not be dtype-cast.
pipe.to("cuda")

# NOTE(review): the original loaded "./suji.jpg" but never used it (the
# image= kwarg was commented out, and LTX2Pipeline is text-to-video), so the
# dead file read is dropped. To condition on an image, switch to
# LTX2ImageToVideoPipeline and pass image=load_image(...).

prompt = "A very beautiful korean kpop young woman is walking waikiki beach"
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
frame_rate = 24.0

with torch.autocast("cuda", dtype=torch.bfloat16):
    video, audio = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=768,
        height=512,
        num_frames=121,
        frame_rate=frame_rate,
        num_inference_steps=40,
        guidance_scale=4.0,
        output_type="np",
        return_dict=False,
    )

# Sanitize and convert frames to uint8: NaN -> 0, clamp to [0, 1], scale to
# [0, 255], then hand off as a torch tensor for the encoder below.
video = np.nan_to_num(video, nan=0.0)
video = np.clip(video, 0, 1)
video = (video * 255).round().astype("uint8")
video = torch.from_numpy(video)
# Mux the first generated clip and its soundtrack into an MP4 file.
sample_rate = pipe.vocoder.config.output_sampling_rate  # should be 24000
waveform = audio[0].float().cpu()
encode_video(
    video[0],
    fps=frame_rate,
    audio=waveform,
    audio_sample_rate=sample_rate,
    output_path="video2.mp4",
)

# Persist the 4-bit-quantized pipeline so it can be reloaded without
# re-quantizing.
pipe.save_pretrained("./LTX-2-bnb-4bit")