# rippertnt's picture
# Upload 5 files
# 58c2804 verified
import torch
from diffusers import LTX2Pipeline, LTX2ImageToVideoPipeline, LTX2VideoTransformer3DModel
from diffusers.pipelines.ltx2.export_utils import encode_video
from diffusers.utils import load_image
from transformers import Qwen3ForCausalLM, BitsAndBytesConfig, AutoTokenizer
import math
import numpy as np
# NOTE(review): Qwen3ForCausalLM, AutoTokenizer, LTX2ImageToVideoPipeline and
# math appear unused in this script — confirm before removing.
# Allow TF32 matmul/conv kernels and enable cuDNN autotuning for faster
# inference on Ampere+ GPUs (TF32 trades a little precision for speed).
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.benchmark = True
# Shared 4-bit quantization settings for the text encoder and transformer:
# NF4 weight storage with double quantization, bfloat16 compute.
_quant_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
    "bnb_4bit_use_double_quant": True,
}
BNB_CONFIG = BitsAndBytesConfig(**_quant_settings)
# Gemma 3 text encoder loaded in 4-bit NF4 so it fits alongside the
# transformer in GPU memory.
# NOTE: the original re-imported LTX2Pipeline and encode_video here; both are
# already imported at the top of this file, so the duplicates were dropped.
from transformers import Gemma3ForConditionalGeneration

repo = "Lightricks/LTX-2"
text_encoder = Gemma3ForConditionalGeneration.from_pretrained(
    repo,
    subfolder="text_encoder",
    quantization_config=BNB_CONFIG,
)
### transformer
# Video transformer backbone, quantized with the same 4-bit NF4 settings as
# the text encoder.
_transformer_kwargs = {
    "subfolder": "transformer",
    "quantization_config": BNB_CONFIG,
}
transformer_4bit = LTX2VideoTransformer3DModel.from_pretrained(repo, **_transformer_kwargs)
# Assemble the pipeline around the pre-quantized components; the remaining
# submodules are created in bfloat16.
pipe = LTX2Pipeline.from_pretrained(
    repo,
    transformer=transformer_4bit,
    text_encoder=text_encoder,
    torch_dtype=torch.bfloat16,
)
# Cast the non-quantized submodules to bfloat16 explicitly, then move the
# whole pipeline to the GPU.
for _module in (pipe.vae, pipe.connectors, pipe.audio_vae, pipe.vocoder):
    _module.to(dtype=torch.bfloat16)
pipe.to("cuda", dtype=torch.bfloat16)
# Text-to-video inputs. The original loaded ./suji.jpg into `image`, but the
# pipeline call below never passes it (its image= kwarg is commented out), so
# the load was dead code that crashed whenever the file was missing; removed.
prompt = "A very beautiful korean kpop young woman is walking waikiki beach"
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
frame_rate = 24.0  # frames per second, used for both generation and the mp4
# Sampling settings: 121 frames at 768x512, 40 denoising steps, CFG scale 4.0.
# output_type="np" returns raw numpy frames for the post-processing below;
# return_dict=False unpacks directly into (video, audio).
# (An image= argument would go here for image-conditioned generation.)
generation_args = dict(
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=768,
    height=512,
    num_frames=121,
    frame_rate=frame_rate,
    num_inference_steps=40,
    guidance_scale=4.0,
    output_type="np",
    return_dict=False,
)
with torch.autocast("cuda", dtype=torch.bfloat16):
    video, audio = pipe(**generation_args)
# Sanitize the generated frames: strip NaNs, clamp to [0, 1], quantize to
# uint8, then hand back to torch for the video encoder.
frames = np.nan_to_num(video, nan=0.0)
frames = (np.clip(frames, 0, 1) * 255).round().astype("uint8")
video = torch.from_numpy(frames)
# Mux the first (and only) generated clip with its audio track into an MP4.
encode_video(
    video[0],
    fps=frame_rate,
    audio=audio[0].float().cpu(),
    audio_sample_rate=pipe.vocoder.config.output_sampling_rate, # should be 24000
    output_path="video2.mp4",
)
# Persist the quantized pipeline so later runs can skip re-quantization.
pipe.save_pretrained("./LTX-2-bnb-4bit")