# Hugging Face Spaces ZeroGPU helper (provides the @spaces.GPU decorator).
import spaces
import gradio as gr
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import load_image, export_to_video
import random
import numpy as np
# moviepy 2.x API (ImageSequenceClip / with_audio / subclipped).
from moviepy import ImageSequenceClip, AudioFileClip, VideoFileClip
from PIL import Image, ImageOps


# ---- Global inference settings -------------------------------------------

# Allow TF32 tensor-core math for matmul/cudnn — faster on Ampere+ GPUs with
# negligible quality impact for inference.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Inference-only app: disable autograd globally.
torch.set_grad_enabled(False)

# Prefer fused flash / memory-efficient scaled-dot-product-attention kernels.
torch.backends.cuda.enable_flash_sdp(True)
torch.backends.cuda.enable_mem_efficient_sdp(True)

DEVICE = "cuda"
DTYPE = torch.bfloat16

# Sigma schedule for the distilled model — one value per denoising step
# (8 steps; passed as `sigmas=` to the pipeline call in generate()).
DISTILLED_SIGMA_VALUES = [
    1.0, 0.99375, 0.9875, 0.98125, 0.975, 0.909375, 0.725, 0.421875
]

# ---- Model loading (runs once at process startup) ------------------------

print("🚀 Loading LTX-2 Distilled on H200...")

pipe = DiffusionPipeline.from_pretrained(
    "rootonchair/LTX-2-19b-distilled",
    custom_pipeline="multimodalart/ltx2-audio-to-video",
    torch_dtype=DTYPE,
)

pipe.to(DEVICE)

# Best-effort: enable xFormers attention when installed; otherwise PyTorch's
# built-in SDPA (enabled above) is used.
try:
    pipe.enable_xformers_memory_efficient_attention()
    print("✅ xFormers enabled")
except Exception:
    print("⚠️ xFormers not available")

print("📦 Loading Camera Control LoRA...")

# NOTE(review): the repo id is the IC-LoRA *Detailer*, while the adapter name
# and the log line above say "camera control" — confirm which LoRA is
# actually intended here.
pipe.load_lora_weights(
    "Lightricks/LTX-2-19b-IC-LoRA-Detailer",
    adapter_name="camera_control"
)

# Bake the LoRA into the base weights at 0.8 strength, then drop the separate
# adapter tensors to free memory.
pipe.fuse_lora(lora_scale=0.8)
pipe.unload_lora_weights()

print("🔥 Model fully loaded on CUDA.")
|
|
| |
| |
| |
|
|
def save_video_with_audio(video_frames, audio_path, fps=24):
    """Mux video frames with the audio at ``audio_path`` into an MP4 file.

    Parameters
    ----------
    video_frames : list | str | array-like
        Either a list of PIL images (possibly nested one level, as returned
        by diffusers pipelines), a path to an existing video file, or raw
        frames accepted by ``export_to_video``.
    audio_path : str
        Path to the audio file; trimmed if longer than the video.
    fps : int, optional
        Output frame rate (default 24).

    Returns
    -------
    str
        Path of the written MP4 file.
    """
    import os
    import tempfile
    import uuid

    # uuid4 instead of randint(0, 100000): avoids output-name collisions when
    # several requests run through the Gradio queue concurrently.
    output_filename = f"output_{uuid.uuid4().hex}.mp4"
    temp_path = None

    if isinstance(video_frames, list):
        # Pipelines may return a batch: a list containing one list of frames.
        if video_frames and isinstance(video_frames[0], list):
            frames = video_frames[0]
        else:
            frames = video_frames
        np_frames = [np.array(img) for img in frames]
        clip = ImageSequenceClip(np_frames, fps=fps)
    elif isinstance(video_frames, str):
        clip = VideoFileClip(video_frames)
    else:
        # Raw frames: round-trip through a unique temp file. The previous
        # fixed "temp_video_no_audio.mp4" could be clobbered by concurrent
        # requests and was never cleaned up.
        fd, temp_path = tempfile.mkstemp(suffix=".mp4")
        os.close(fd)
        export_to_video(video_frames, temp_path, fps=fps)
        clip = VideoFileClip(temp_path)

    audio_clip = AudioFileClip(audio_path)

    # Trim the audio to the video length when it runs longer.
    if audio_clip.duration > clip.duration:
        audio_clip = audio_clip.subclipped(0, clip.duration)

    final_clip = clip.with_audio(audio_clip)

    try:
        final_clip.write_videofile(
            output_filename,
            fps=fps,
            codec="libx264",
            audio_codec="aac",
            logger=None
        )
    finally:
        # Release encoder/decoder handles even if encoding fails, and remove
        # the intermediate no-audio video if one was created.
        final_clip.close()
        audio_clip.close()
        clip.close()
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass

    return output_filename
|
|
|
|
def infer_aspect_ratio(image):
    """Pick the supported aspect ratio closest to *image*'s own ratio.

    Returns a ``(label, (width, height))`` tuple where label is one of
    ``"1:1"``, ``"16:9"`` or ``"9:16"`` and the tuple is the generation
    resolution for that ratio.
    """
    # label -> (ideal width/height ratio, generation resolution)
    candidates = {
        "1:1": (1.0, (512, 512)),
        "16:9": (16 / 9, (768, 512)),
        "9:16": (9 / 16, (512, 768)),
    }

    w, h = image.size
    actual_ratio = w / h

    # Nearest candidate by absolute ratio distance (dict order breaks ties).
    best_label = min(
        candidates,
        key=lambda label: abs(candidates[label][0] - actual_ratio)
    )
    return best_label, candidates[best_label][1]
|
|
|
|
def process_image_for_aspect_ratio(image):
    """Fit *image* to the canonical resolution for its nearest aspect ratio.

    Center-crops and LANCZOS-resizes so the result exactly fills the target
    size. Returns ``(processed_image, width, height, ratio_label)``.
    """
    ratio_label, dims = infer_aspect_ratio(image)
    width, height = dims

    # ImageOps.fit crops to the target aspect ratio around the center,
    # then resizes — no letterboxing, no distortion.
    fitted = ImageOps.fit(
        image,
        dims,
        method=Image.LANCZOS,
        centering=(0.5, 0.5),
    )

    return fitted, width, height, ratio_label
|
|
|
|
def get_audio_duration(audio_path):
    """Return a ``gr.update`` setting the duration slider from the audio length.

    The duration is capped at 12 s (the slider maximum) and rounded to the
    nearest 0.5 s (the slider step). Returns a no-op ``gr.update()`` when
    ``audio_path`` is None or the file cannot be read.
    """
    if audio_path is None:
        return gr.update()

    try:
        audio_clip = AudioFileClip(audio_path)
        try:
            duration = audio_clip.duration
        finally:
            # Close even if reading the duration raises (previously leaked).
            audio_clip.close()
    except Exception as exc:
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. Stay best-effort, but log why.
        print(f"⚠️ Could not read audio duration: {exc}")
        return gr.update()

    capped = min(duration, 12.0)
    rounded = round(capped * 2) / 2
    return gr.update(value=rounded)
|
|
| |
| |
| |
|
|
@spaces.GPU(duration=85, size='xlarge')
def generate(
    image_path,
    audio_path,
    prompt,
    negative_prompt,
    video_duration,
    seed,
    progress=gr.Progress(track_tqdm=True)
):
    """Run the LTX-2 audio-to-video pipeline and return (video_path, seed).

    Raises gr.Error when either the image or the audio input is missing.
    A seed of -1 is replaced by a fresh random seed, which is returned so
    the UI can display the value actually used.
    """
    if not image_path or not audio_path:
        raise gr.Error("Please provide both image and audio.")

    if seed == -1:
        seed = random.randint(0, 1_000_000)
    print(prompt)
    rng = torch.Generator(device="cuda").manual_seed(seed)

    # Crop/resize the conditioning image to a supported resolution.
    source_image = load_image(image_path)
    conditioning_image, width, height, ratio = process_image_for_aspect_ratio(source_image)

    fps = 24.0
    requested_frames = int(video_duration * fps)
    # Snap to the nearest multiple of 8, then +1 (the model expects 8k+1
    # frames), with a floor of 9 frames.
    num_frames = max(round(requested_frames / 8) * 8 + 1, 9)

    print(f"Seed: {seed} | {width}x{height} | Frames: {num_frames}")

    with torch.inference_mode():
        frames, _ = pipe(
            image=conditioning_image,
            audio=audio_path,
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=width,
            height=height,
            num_frames=num_frames,
            frame_rate=fps,
            num_inference_steps=8,
            sigmas=DISTILLED_SIGMA_VALUES,
            guidance_scale=1.0,
            generator=rng,
            return_dict=False,
        )

    return save_video_with_audio(frames, audio_path, fps=fps), seed
|
|
| |
| |
| |
|
|
# Constrain the main column width and center it.
css = "#col-container { max-width: 800px; margin: 0 auto; }"

with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("# ⚡ LTX-2 Distilled Audio-to-Video (H200 Optimized)")

        # Inputs on the left, generated video on the right.
        with gr.Row():
            with gr.Column():
                input_image = gr.Image(type="filepath", height=300)
                input_audio = gr.Audio(type="filepath")
            with gr.Column():
                result_video = gr.Video()

        prompt = gr.Textbox(
            value="A person speaking, lips moving in sync with the words",
            lines=2
        )

        # Duration slider; auto-synced from the uploaded audio (see below).
        video_duration = gr.Slider(1.0, 12.0, step=0.5, value=4.0)

        with gr.Accordion("Advanced", open=False):
            negative_prompt = gr.Textbox(
                value="low quality, worst quality"
            )
            # -1 means "pick a random seed" in generate().
            seed = gr.Number(value=-1, precision=0)

        run_btn = gr.Button("Generate", variant="primary")
        # Hidden output capturing the seed actually used by generate().
        used_seed = gr.Number(visible=False)

    # When audio is uploaded, preset the duration slider to its length
    # (capped/rounded inside get_audio_duration).
    input_audio.change(
        fn=get_audio_duration,
        inputs=[input_audio],
        outputs=[video_duration]
    )

    run_btn.click(
        fn=generate,
        inputs=[
            input_image,
            input_audio,
            prompt,
            negative_prompt,
            video_duration,
            seed
        ],
        outputs=[result_video, used_seed]
    )

if __name__ == "__main__":
    # queue() serializes GPU jobs across concurrent users.
    demo.queue().launch()