import math

import numpy as np
import torch
from diffusers import LTX2Pipeline, LTX2ImageToVideoPipeline, LTX2VideoTransformer3DModel
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
from diffusers.pipelines.ltx2.export_utils import encode_video
from diffusers.utils import load_image
from transformers import Qwen3ForCausalLM, BitsAndBytesConfig, AutoTokenizer
from transformers import Gemma3ForConditionalGeneration

# Global backend switches: allow TF32 for CUDA matmuls and for cuDNN ops,
# and turn on cuDNN benchmark mode.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.benchmark = True

# bitsandbytes quantization settings passed as `quantization_config` when
# loading models in this script: 4-bit NF4 weights, double quantization
# enabled, bfloat16 as the compute dtype.
BNB_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Hub repository id that every `from_pretrained` call in this script loads
# its weights from.
repo = "Lightricks/LTX-2"

# Load the text encoder from the repo's `text_encoder` subfolder, quantized
# to 4-bit with BNB_CONFIG.
# `torch_dtype` is set explicitly so the layers bitsandbytes leaves
# un-quantized are created in bfloat16, matching
# `bnb_4bit_compute_dtype` and the dtype used for the rest of the pipeline,
# instead of whatever default the loader would otherwise pick.
text_encoder = Gemma3ForConditionalGeneration.from_pretrained(
    repo,
    subfolder="text_encoder",
    quantization_config=BNB_CONFIG,
    torch_dtype=torch.bfloat16,
)

# Load the video transformer from the repo's `transformer` subfolder in
# 4-bit NF4.
# This is a diffusers model, so it is given diffusers' own
# BitsAndBytesConfig (same settings as the transformers-side BNB_CONFIG)
# rather than the transformers config class.
# `torch_dtype` keeps the non-quantized layers in bfloat16, consistent with
# the compute dtype and with the pipeline dtype used later.
transformer_4bit = LTX2VideoTransformer3DModel.from_pretrained(
    repo,
    subfolder="transformer",
    quantization_config=DiffusersBitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    ),
    torch_dtype=torch.bfloat16,
)

# Build the pipeline from the repo, injecting the already-quantized
# transformer and text encoder in place of the repo's own copies; the
# remaining components are requested in bfloat16.
pipe = LTX2Pipeline.from_pretrained(
    repo,
    torch_dtype=torch.bfloat16,
    transformer=transformer_4bit,
    text_encoder=text_encoder,
)

# Cast the non-quantized sub-models to bfloat16 explicitly.
# NOTE(review): likely redundant with `torch_dtype` above and the
# `pipe.to(..., dtype=...)` below; kept as-is to preserve the original
# behavior.
pipe.vae.to(dtype=torch.bfloat16)
pipe.connectors.to(dtype=torch.bfloat16)
pipe.audio_vae.to(dtype=torch.bfloat16)
pipe.vocoder.to(dtype=torch.bfloat16)

# Move the whole pipeline to the GPU.
pipe.to("cuda", dtype=torch.bfloat16)

# Load the local reference image.
# NOTE(review): `image` is not passed to the `pipe(...)` call in this
# script, which uses the text-to-video LTX2Pipeline. Confirm whether
# LTX2ImageToVideoPipeline was intended, or drop this load.
image = load_image("./suji.jpg")

# Text conditioning for the generation call.
prompt = "A very beautiful korean kpop young woman is walking waikiki beach"
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

# Frames per second; used both for generation and when encoding the output
# file so the two stay in sync.
frame_rate = 24.0

# Run the pipeline under bfloat16 autocast on CUDA. With
# `return_dict=False` the result is unpacked directly into `video` and
# `audio`; `output_type="np"` requests NumPy output.
with torch.autocast("cuda", dtype=torch.bfloat16):
    video, audio = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=768,
        height=512,
        num_frames=121,
        frame_rate=frame_rate,
        num_inference_steps=40,
        guidance_scale=4.0,
        output_type="np",
        return_dict=False,
    )

# Sanitize the frames before the integer cast: replace NaNs with 0 and
# clamp to [0, 1] so the uint8 conversion cannot wrap around.
video = np.nan_to_num(video, nan=0.0)
video = np.clip(video, 0, 1)

# Scale to 8-bit and hand the result over as a torch tensor.
video = (video * 255).round().astype("uint8")
video = torch.from_numpy(video)

# Write the first item of the batch, together with its audio, to
# "video2.mp4". The audio is converted to float32 on the CPU first, and the
# sample rate is taken from the vocoder's config.
encode_video(
    video[0],
    fps=frame_rate,
    audio=audio[0].float().cpu(),
    audio_sample_rate=pipe.vocoder.config.output_sampling_rate,
    output_path="video2.mp4",
)

# Save the assembled pipeline to a local directory.
pipe.save_pretrained("./LTX-2-bnb-4bit")