"""Quantized LTX-2 text-to-video generation.

Loads the Lightricks/LTX-2 pipeline with the text encoder and the video
transformer quantized to 4-bit NF4 (bitsandbytes), generates a video+audio
clip from a text prompt, and leaves ``video`` (uint8 torch tensor), ``audio``,
``frame_rate`` and ``pipe`` in scope for the export step below.
"""
import math  # noqa: F401  -- kept from original; currently unused

import numpy as np
import torch
from diffusers import (
    LTX2ImageToVideoPipeline,  # noqa: F401  -- unused: this script is text-to-video
    LTX2Pipeline,
    LTX2VideoTransformer3DModel,
)
from diffusers.pipelines.ltx2.export_utils import encode_video
from diffusers.utils import load_image  # noqa: F401
from transformers import (
    AutoTokenizer,  # noqa: F401  -- kept from original; currently unused
    BitsAndBytesConfig,
    Gemma3ForConditionalGeneration,
    Qwen3ForCausalLM,  # noqa: F401  -- kept from original; currently unused
)

# Speed knobs: allow TF32 matmul/conv kernels and let cuDNN autotune.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.benchmark = True

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
BNB_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

repo = "Lightricks/LTX-2"

# Text encoder, quantized to 4-bit on load.
text_encoder = Gemma3ForConditionalGeneration.from_pretrained(
    repo, subfolder="text_encoder", quantization_config=BNB_CONFIG
)

# Denoising transformer, quantized to 4-bit on load.
transformer_4bit = LTX2VideoTransformer3DModel.from_pretrained(
    repo, subfolder="transformer", quantization_config=BNB_CONFIG
)

pipe = LTX2Pipeline.from_pretrained(
    repo,
    torch_dtype=torch.bfloat16,
    transformer=transformer_4bit,
    text_encoder=text_encoder,
)

# Cast only the non-quantized submodules to bfloat16; the 4-bit modules keep
# their packed weights.
pipe.vae.to(dtype=torch.bfloat16)
pipe.connectors.to(dtype=torch.bfloat16)
pipe.audio_vae.to(dtype=torch.bfloat16)
pipe.vocoder.to(dtype=torch.bfloat16)
# FIX: move to CUDA without a dtype argument. The original passed
# dtype=torch.bfloat16, which asks diffusers to cast every module -- including
# the bitsandbytes 4-bit transformer/text encoder, whose packed weights must
# not be dtype-cast.
pipe.to("cuda")

# NOTE(review): the original loaded "./suji.jpg" but never used it (the
# image= kwarg was commented out, and LTX2Pipeline is text-to-video), so the
# dead file read is dropped. To condition on an image, switch to
# LTX2ImageToVideoPipeline and pass image=load_image(...).

prompt = "A very beautiful korean kpop young woman is walking waikiki beach"
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
frame_rate = 24.0

with torch.autocast("cuda", dtype=torch.bfloat16):
    video, audio = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=768,
        height=512,
        num_frames=121,
        frame_rate=frame_rate,
        num_inference_steps=40,
        guidance_scale=4.0,
        output_type="np",
        return_dict=False,
    )

# Sanitize and convert frames to uint8: NaN -> 0, clamp to [0, 1], scale to
# [0, 255], then hand off as a torch tensor for the encoder below.
video = np.nan_to_num(video, nan=0.0)
video = np.clip(video, 0, 1)
video = (video * 255).round().astype("uint8")
video = torch.from_numpy(video)
# Mux the first generated clip and its soundtrack into an MP4 file.
sample_rate = pipe.vocoder.config.output_sampling_rate  # should be 24000
waveform = audio[0].float().cpu()
encode_video(
    video[0],
    fps=frame_rate,
    audio=waveform,
    audio_sample_rate=sample_rate,
    output_path="video2.mp4",
)

# Persist the 4-bit-quantized pipeline so it can be reloaded without
# re-quantizing.
pipe.save_pretrained("./LTX-2-bnb-4bit")