| import torch |
| from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline |
| from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition |
| from diffusers.utils import export_to_video |
|
|
# Multi-scale text-to-video generation with LTX-Video:
#   1. generate latents at a reduced resolution,
#   2. upsample the latents 2x with the latent upsampler,
#   3. run a short refinement pass on the upsampled latents,
#   4. resize the decoded frames to the requested output size and export.

# --- Model setup -------------------------------------------------------------
pipe = LTXConditionPipeline.from_pretrained(
    "Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16
)
# The upsampler is handed the base pipeline's VAE instead of loading its own.
pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained(
    "Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16
)
pipe.to("cuda")
pipe_upsample.to("cuda")
pipe.vae.enable_tiling()


def round_to_nearest_resolution_acceptable_by_vae(height, width):
    """Round ``height`` and ``width`` down to multiples of the VAE spatial ratio.

    The LTX pipelines reject resolutions that are not divisible by the VAE's
    spatial compression ratio, so every resolution derived from a fractional
    scale factor has to go through this before being passed to ``pipe``.

    Args:
        height: Requested height in pixels.
        width: Requested width in pixels.

    Returns:
        A ``(height, width)`` tuple, each rounded down to the nearest multiple
        of ``pipe.vae_spatial_compression_ratio``.
    """
    ratio = pipe.vae_spatial_compression_ratio
    return height - (height % ratio), width - (width % ratio)


# --- Generation settings -----------------------------------------------------
prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
expected_height, expected_width = 704, 512
downscale_factor = 2 / 3
num_frames = 121

# --- Part 1: generate latents at the reduced resolution ----------------------
downscaled_height = int(expected_height * downscale_factor)
downscaled_width = int(expected_width * downscale_factor)
# int(704 * 2/3) = 469 and int(512 * 2/3) = 341 are not valid pipeline
# resolutions on their own; snap them down to what the VAE accepts.
downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(
    downscaled_height, downscaled_width
)
latents = pipe(
    conditions=None,
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=downscaled_width,
    height=downscaled_height,
    num_frames=num_frames,
    num_inference_steps=30,
    generator=torch.Generator().manual_seed(0),
    output_type="latent",
).frames

# --- Part 2: upsample the latents (2x in each spatial dimension) -------------
upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
upscaled_latents = pipe_upsample(
    latents=latents,
    output_type="latent",
).frames

# --- Part 3: short refinement pass on the upsampled latents ------------------
video = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=upscaled_width,
    height=upscaled_height,
    num_frames=num_frames,
    denoise_strength=0.4,
    num_inference_steps=10,
    latents=upscaled_latents,
    decode_timestep=0.05,
    image_cond_noise_scale=0.025,
    generator=torch.Generator().manual_seed(0),
    output_type="pil",
).frames[0]

# --- Part 4: resize to the requested output size and write the file ----------
# The upscaled resolution (2x the rounded low-res size) generally differs from
# the requested one, so every frame is resized before export.
video = [frame.resize((expected_width, expected_height)) for frame in video]

export_to_video(video, "output.mp4", fps=24)
| import torch |
| import gradio as gr |
| from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline |
| from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition |
| from diffusers.utils import export_to_video |
|
|
# Lazily-filled (pipe, pipe_upsample) pair, shared by every call to
# generate_video so the weights are loaded and moved to the GPU only once.
_LTX_PIPELINES = None


def _load_ltx_pipelines():
    """Return the cached ``(pipe, pipe_upsample)`` pair, loading it on first use."""
    global _LTX_PIPELINES
    if _LTX_PIPELINES is None:
        pipe = LTXConditionPipeline.from_pretrained(
            "Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16
        )
        # The upsampler is handed the base pipeline's VAE instead of its own.
        pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained(
            "Lightricks/ltxv-spatial-upscaler-0.9.7",
            vae=pipe.vae,
            torch_dtype=torch.bfloat16,
        )
        pipe.to("cuda")
        pipe_upsample.to("cuda")
        pipe.vae.enable_tiling()
        _LTX_PIPELINES = (pipe, pipe_upsample)
    return _LTX_PIPELINES


def _floor_to_multiple(value, multiple):
    """Round ``value`` down to a multiple of ``multiple`` (at least one multiple)."""
    return max(multiple, int(value) // multiple * multiple)


def generate_video(
    prompt,
    negative_prompt,
    expected_height,
    expected_width,
    downscale_factor,
    num_frames,
    num_inference_steps,
    denoise_strength,
    seed,
    progress=gr.Progress(),
):
    """Generate a video from a text prompt with the multi-scale LTX workflow.

    The video is first generated as latents at a reduced resolution, the
    latents are upsampled 2x, refined with a short second denoising pass,
    decoded, resized to the requested output size and written to disk.

    Args:
        prompt: Text description of the video.
        negative_prompt: Text describing what to avoid.
        expected_height: Height in pixels of the exported video.
        expected_width: Width in pixels of the exported video.
        downscale_factor: Fraction of the output size used for the first pass.
        num_frames: Number of frames requested from the pipeline.
        num_inference_steps: Denoising steps for the first (low-res) pass.
        denoise_strength: Strength of the refinement pass on upsampled latents.
        seed: Seed for the random generator shared by both passes.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        Path of the exported MP4 file (``"output.mp4"``).
    """
    progress(0.1, desc="Loading models...")
    # Re-instantiating both pipelines and moving them to the GPU on every
    # request is wasteful; they are loaded once and reused.
    pipe, pipe_upsample = _load_ltx_pipelines()

    # UI components may hand numeric values over as floats; the pipeline,
    # PIL's resize and manual_seed need real integers.
    expected_height = int(expected_height)
    expected_width = int(expected_width)
    num_frames = int(num_frames)
    num_inference_steps = int(num_inference_steps)
    seed = int(seed)

    progress(0.2, desc="Generating initial video...")
    # The pipeline only accepts resolutions divisible by the VAE's spatial
    # compression ratio; a fractional downscale factor almost never lands on
    # one (e.g. 704 * 2/3 -> 469), so snap the low-res size down.
    vae_ratio = pipe.vae_spatial_compression_ratio
    downscaled_height = _floor_to_multiple(expected_height * downscale_factor, vae_ratio)
    downscaled_width = _floor_to_multiple(expected_width * downscale_factor, vae_ratio)
    generator = torch.Generator().manual_seed(seed)

    latents = pipe(
        conditions=None,
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=downscaled_width,
        height=downscaled_height,
        num_frames=num_frames,
        num_inference_steps=num_inference_steps,
        generator=generator,
        output_type="latent",
    ).frames

    progress(0.5, desc="Upscaling video...")
    upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
    upscaled_latents = pipe_upsample(
        latents=latents,
        output_type="latent",
    ).frames

    progress(0.7, desc="Refining video quality...")
    video = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=upscaled_width,
        height=upscaled_height,
        num_frames=num_frames,
        denoise_strength=denoise_strength,
        num_inference_steps=10,
        latents=upscaled_latents,
        decode_timestep=0.05,
        image_cond_noise_scale=0.025,
        generator=generator,
        output_type="pil",
    ).frames[0]

    progress(0.9, desc="Finalizing video...")
    # The upscaled size (2x the rounded low-res size) generally differs from
    # the requested one, so every frame is resized before export.
    video = [frame.resize((expected_width, expected_height)) for frame in video]

    output_path = "output.mp4"
    export_to_video(video, output_path, fps=24)

    return output_path
|
|
| |
# Gradio front end: controls in the left column, resulting video on the right.
# NOTE(review): the nesting of the layout contexts below is reconstructed
# (the original indentation was lost) -- confirm against the intended layout.
with gr.Blocks(title="LTX Video Generator") as demo:
    gr.Markdown("# LTX Video Generator")
    gr.Markdown("Generate videos from text prompts using Lightricks' LTX model")

    with gr.Row():
        # Left column: every input control plus the submit button.
        with gr.Column():
            prompt = gr.Textbox(
                lines=4,
                label="Prompt",
                value="The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region.",
            )
            negative_prompt = gr.Textbox(
                lines=2,
                label="Negative Prompt",
                value="worst quality, inconsistent motion, blurry, jittery, distorted",
            )

            # Output resolution of the exported video.
            with gr.Row():
                expected_height = gr.Slider(label="Output Height", minimum=256, maximum=1024, step=64, value=704)
                expected_width = gr.Slider(label="Output Width", minimum=256, maximum=1024, step=64, value=512)

            # First-pass scale and clip length.
            with gr.Row():
                downscale_factor = gr.Slider(label="Initial Downscale Factor", minimum=0.3, maximum=0.9, step=0.05, value=2 / 3)
                num_frames = gr.Slider(label="Number of Frames", minimum=24, maximum=240, step=1, value=121)

            # Sampling controls.
            with gr.Row():
                num_inference_steps = gr.Slider(label="Inference Steps", minimum=10, maximum=50, step=1, value=30)
                denoise_strength = gr.Slider(label="Denoise Strength", minimum=0.1, maximum=0.9, step=0.05, value=0.4)
                seed = gr.Number(label="Seed", value=0, precision=0)

            submit_btn = gr.Button("Generate Video", variant="primary")

        # Right column: the generated clip.
        with gr.Column():
            output_video = gr.Video(label="Generated Video")

    # Component order here must match generate_video's positional parameters.
    generation_inputs = [
        prompt,
        negative_prompt,
        expected_height,
        expected_width,
        downscale_factor,
        num_frames,
        num_inference_steps,
        denoise_strength,
        seed,
    ]
    submit_btn.click(fn=generate_video, inputs=generation_inputs, outputs=output_video)


# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| |