Spaces:
Running on Zero
Running on Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,6 +8,15 @@ import torch.nn.functional as F
|
|
| 8 |
import torchaudio
|
| 9 |
import os
|
| 10 |
from typing import Any
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
def _coerce_audio_path(audio_path: Any) -> str:
|
| 13 |
# Common Gradio case: tuple where first item is the filepath
|
|
@@ -25,6 +34,41 @@ def _coerce_audio_path(audio_path: Any) -> str:
|
|
| 25 |
|
| 26 |
return os.fspath(audio_path)
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
def match_audio_to_duration(
|
|
@@ -75,8 +119,6 @@ def match_audio_to_duration(
|
|
| 75 |
|
| 76 |
def sh(cmd):
    """Run ``cmd`` through the system shell.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
    """
    subprocess.check_call(cmd, shell=True)
|
| 77 |
|
| 78 |
-
sh("pip install --no-deps easy_dwpose")
|
| 79 |
-
|
| 80 |
# Add packages to Python path
|
| 81 |
current_dir = Path(__file__).parent
|
| 82 |
sys.path.insert(0, str(current_dir / "packages" / "ltx-pipelines" / "src"))
|
|
@@ -118,8 +160,8 @@ from ltx_pipelines.utils import ModelLedger
|
|
| 118 |
from ltx_pipelines.utils.helpers import generate_enhanced_prompt
|
| 119 |
import imageio
|
| 120 |
import cv2
|
| 121 |
-
from controlnet_aux import CannyDetector
|
| 122 |
-
from
|
| 123 |
|
| 124 |
|
| 125 |
# HuggingFace Hub defaults
|
|
@@ -166,6 +208,9 @@ model_ledger = ModelLedger(
|
|
| 166 |
)
|
| 167 |
|
| 168 |
canny_processor = CannyDetector()
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
|
| 171 |
# Load text encoder once and keep it in memory
|
|
@@ -186,7 +231,7 @@ def on_lora_change(selected: str):
|
|
| 186 |
|
| 187 |
def process_video_for_pose(frames, width: int, height: int):
|
| 188 |
|
| 189 |
-
pose_processor =
|
| 190 |
|
| 191 |
if not frames:
|
| 192 |
return []
|
|
@@ -197,7 +242,7 @@ def process_video_for_pose(frames, width: int, height: int):
|
|
| 197 |
pil = Image.fromarray(frame.astype(np.uint8)).convert("RGB")
|
| 198 |
|
| 199 |
# ✅ do NOT pass width/height here (easy_dwpose will handle drawing sizes internally)
|
| 200 |
-
pose_img = pose_processor(pil)
|
| 201 |
|
| 202 |
# Ensure it's PIL then resize to your conditioning size
|
| 203 |
if not isinstance(pose_img, Image.Image):
|
|
@@ -219,6 +264,46 @@ def preprocess_video_to_pose_mp4(video_path: str, width: int, height: int, fps:
|
|
| 219 |
tmp.close()
|
| 220 |
return write_video_mp4(pose_frames, fps=fps, out_path=tmp.name)
|
| 221 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
def load_video_frames(video_path: str):
|
| 224 |
"""Return list of frames as numpy arrays (H,W,3) uint8."""
|
|
@@ -230,7 +315,7 @@ def load_video_frames(video_path: str):
|
|
| 230 |
|
| 231 |
|
| 232 |
def process_video_for_canny(frames, width: int, height: int,
|
| 233 |
-
low_threshold=
|
| 234 |
"""
|
| 235 |
Convert RGB frames -> canny edge frames.
|
| 236 |
Returns list of np arrays (H,W,3) in float [0..1] (like controlnet_aux output).
|
|
@@ -244,6 +329,8 @@ def process_video_for_canny(frames, width: int, height: int,
|
|
| 244 |
canny_frames = []
|
| 245 |
for frame in frames:
|
| 246 |
# controlnet_aux CannyDetector returns float image in [0..1] if output_type="np"
|
|
|
|
|
|
|
| 247 |
canny = canny_processor(
|
| 248 |
frame,
|
| 249 |
low_threshold=low_threshold,
|
|
@@ -277,6 +364,158 @@ def preprocess_video_to_canny_mp4(video_path: str, width: int, height: int, fps:
|
|
| 277 |
tmp.close()
|
| 278 |
return write_video_mp4(canny_frames, fps=fps, out_path=tmp.name)
|
| 279 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
|
| 281 |
def encode_text_simple(text_encoder, prompt: str):
|
| 282 |
"""Simple text encoding without using pipeline_utils."""
|
|
@@ -420,6 +659,13 @@ detailer_lora_path = get_hub_or_local_checkpoint(
|
|
| 420 |
"ltx-2-19b-ic-lora-detailer.safetensors",
|
| 421 |
)
|
| 422 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 423 |
# Load distilled LoRA as a regular LoRA
|
| 424 |
loras = [
|
| 425 |
# --- fused / base behavior ---
|
|
@@ -436,9 +682,22 @@ loras = [
|
|
| 436 |
LoraPathStrengthAndSDOps(dolly_right_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
|
| 437 |
LoraPathStrengthAndSDOps(jib_down_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
|
| 438 |
LoraPathStrengthAndSDOps(jib_up_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
|
|
|
|
| 439 |
]
|
| 440 |
|
| 441 |
# Runtime-toggle LoRAs (exclude fused distilled at index 0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 442 |
RUNTIME_LORA_CHOICES = [
|
| 443 |
("No LoRA", -1),
|
| 444 |
("Static", 0),
|
|
@@ -449,7 +708,7 @@ RUNTIME_LORA_CHOICES = [
|
|
| 449 |
("Slide Right", 5),
|
| 450 |
("Slide Down", 6),
|
| 451 |
("Slide Up", 7),
|
| 452 |
-
|
| 453 |
]
|
| 454 |
|
| 455 |
# Initialize pipeline WITHOUT text encoder (gemma_root=None)
|
|
@@ -556,6 +815,18 @@ class RadioAnimated(gr.HTML):
|
|
| 556 |
|
| 557 |
// Recalc on resize (important in Gradio layouts)
|
| 558 |
window.addEventListener('resize', () => setHighlightByIndex(currentIdx));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 559 |
})();
|
| 560 |
|
| 561 |
"""
|
|
@@ -818,30 +1089,34 @@ class CameraDropdown(gr.HTML):
|
|
| 818 |
**kwargs
|
| 819 |
)
|
| 820 |
|
| 821 |
-
def generate_video_example(input_image, prompt, camera_lora, resolution,
|
| 822 |
|
| 823 |
w, h = apply_resolution(resolution)
|
| 824 |
|
| 825 |
-
|
| 826 |
-
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
|
|
|
|
|
|
| 839 |
return output_video
|
| 840 |
|
| 841 |
def get_duration(
|
| 842 |
input_image,
|
| 843 |
prompt,
|
| 844 |
duration,
|
|
|
|
|
|
|
| 845 |
enhance_prompt,
|
| 846 |
seed,
|
| 847 |
randomize_seed,
|
|
@@ -856,6 +1131,9 @@ def get_duration(
|
|
| 856 |
if audio_path is not None:
|
| 857 |
extra_time += 10
|
| 858 |
|
|
|
|
|
|
|
|
|
|
| 859 |
if duration <= 3:
|
| 860 |
return 60 + extra_time
|
| 861 |
elif duration <= 5:
|
|
@@ -865,11 +1143,14 @@ def get_duration(
|
|
| 865 |
else:
|
| 866 |
return 180 + extra_time
|
| 867 |
|
|
|
|
| 868 |
@spaces.GPU(duration=get_duration)
|
| 869 |
def generate_video(
|
| 870 |
input_image,
|
| 871 |
prompt: str,
|
| 872 |
duration: float,
|
|
|
|
|
|
|
| 873 |
enhance_prompt: bool = True,
|
| 874 |
seed: int = 42,
|
| 875 |
randomize_seed: bool = True,
|
|
@@ -885,6 +1166,7 @@ def generate_video(
|
|
| 885 |
input_image: Optional input image for image-to-video. If provided, it is injected at frame 0 to guide motion.
|
| 886 |
prompt: Text description of the scene, motion, and cinematic style to generate.
|
| 887 |
duration: Desired video length in seconds. Converted to frames using a fixed 24 FPS rate.
|
|
|
|
| 888 |
enhance_prompt: Whether to enhance the prompt using the prompt enhancer before encoding.
|
| 889 |
seed: Base random seed for reproducibility (ignored if randomize_seed is True).
|
| 890 |
randomize_seed: If True, a random seed is generated for each run.
|
|
@@ -920,12 +1202,42 @@ def generate_video(
|
|
| 920 |
# Calculate num_frames from duration (using fixed 24 fps)
|
| 921 |
frame_rate = 24.0
|
| 922 |
num_frames = int(duration * frame_rate) + 1 # +1 to ensure we meet the duration
|
|
|
|
| 923 |
|
| 924 |
with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
|
| 925 |
output_path = tmpfile.name
|
| 926 |
|
| 927 |
|
| 928 |
images = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 929 |
|
| 930 |
if input_image is not None:
|
| 931 |
images = [(input_image, 0, 1.0)]
|
|
@@ -955,7 +1267,9 @@ def generate_video(
|
|
| 955 |
_, n_audio_context = encode_text_simple(text_encoder, "") # returns tensors on GPU already
|
| 956 |
del audio_context
|
| 957 |
audio_context = n_audio_context
|
| 958 |
-
|
|
|
|
|
|
|
| 959 |
|
| 960 |
torch.cuda.empty_cache()
|
| 961 |
|
|
@@ -982,23 +1296,24 @@ def generate_video(
|
|
| 982 |
input_waveform = None
|
| 983 |
input_waveform_sample_rate = None
|
| 984 |
|
| 985 |
-
|
| 986 |
-
|
| 987 |
-
|
| 988 |
-
|
| 989 |
-
|
| 990 |
-
|
| 991 |
-
|
| 992 |
-
|
| 993 |
-
|
| 994 |
-
|
| 995 |
-
|
| 996 |
-
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
-
|
|
|
|
| 1002 |
del video_context, audio_context
|
| 1003 |
torch.cuda.empty_cache()
|
| 1004 |
print("successful generation")
|
|
@@ -1022,6 +1337,12 @@ def apply_duration(duration: str):
|
|
| 1022 |
duration_s = int(duration[:-1])
|
| 1023 |
return duration_s
|
| 1024 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1025 |
|
| 1026 |
css = """
|
| 1027 |
|
|
@@ -1130,6 +1451,45 @@ css = """
|
|
| 1130 |
#false:checked ~ .toggle-highlight {
|
| 1131 |
transform: translateX(100%);
|
| 1132 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1133 |
"""
|
| 1134 |
|
| 1135 |
css += """
|
|
@@ -1678,15 +2038,27 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 1678 |
"""
|
| 1679 |
)
|
| 1680 |
with gr.Column(elem_id="col-container"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1681 |
with gr.Row():
|
| 1682 |
with gr.Column(elem_id="step-column"):
|
| 1683 |
-
|
| 1684 |
input_image = gr.Image(
|
| 1685 |
label="First Frame (Optional)",
|
| 1686 |
type="filepath",
|
| 1687 |
height=256
|
| 1688 |
)
|
| 1689 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1690 |
relocate = gr.HTML(
|
| 1691 |
value="",
|
| 1692 |
html_template="<div></div>",
|
|
@@ -1757,7 +2129,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 1757 |
|
| 1758 |
with gr.Column(elem_id="step-column"):
|
| 1759 |
output_video = gr.Video(label="Generated Video", autoplay=True, height=512)
|
| 1760 |
-
|
| 1761 |
with gr.Row(elem_id="controls-row"):
|
| 1762 |
|
| 1763 |
duration_ui = CameraDropdown(
|
|
@@ -1805,7 +2177,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 1805 |
height = gr.Number(label="Height", value=DEFAULT_1_STAGE_HEIGHT, precision=0, visible=False)
|
| 1806 |
|
| 1807 |
camera_ui = CameraDropdown(
|
| 1808 |
-
choices=[name for name, _ in
|
| 1809 |
value="No LoRA",
|
| 1810 |
title="Camera LoRA",
|
| 1811 |
elem_id="camera_ui",
|
|
@@ -1814,7 +2186,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 1814 |
# Hidden real dropdown (backend value)
|
| 1815 |
camera_lora = gr.Dropdown(
|
| 1816 |
label="Camera Control LoRA",
|
| 1817 |
-
choices=[name for name, _ in
|
| 1818 |
value="No LoRA",
|
| 1819 |
visible=False
|
| 1820 |
)
|
|
@@ -1828,6 +2200,14 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 1828 |
api_visibility="private"
|
| 1829 |
)
|
| 1830 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1831 |
duration_ui.change(
|
| 1832 |
fn=apply_duration,
|
| 1833 |
inputs=duration_ui,
|
|
@@ -1854,6 +2234,8 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 1854 |
input_image,
|
| 1855 |
prompt,
|
| 1856 |
duration,
|
|
|
|
|
|
|
| 1857 |
enhance_prompt,
|
| 1858 |
seed,
|
| 1859 |
randomize_seed,
|
|
@@ -1873,6 +2255,8 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 1873 |
"A fuzzy puppet superhero character resembling a female puppet with blonde hair and a blue superhero suit sleeping in bed and just waking up, she gradually gets up, rubbing her eyes and looking at her dog that just popped on the bed. the scene feels chaotic, comedic, and emotional with expressive puppet reactions, cinematic lighting, smooth camera motion, shallow depth of field, and high-quality puppet-style animation",
|
| 1874 |
"Static",
|
| 1875 |
"16:9",
|
|
|
|
|
|
|
| 1876 |
"supergirl.m4a"
|
| 1877 |
],
|
| 1878 |
[
|
|
@@ -1880,13 +2264,35 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 1880 |
"A fuzzy puppet superhero character resembling a female puppet with blonde hair and a blue superhero suit stands inside an icy cave made of frozen walls and icicles, she looks panicked and frantic, rapidly turning her head left and right and scanning the cave while waving her arms and shouting angrily and desperately, mouthing the words “where the hell is my dog,” her movements exaggerated and puppet-like with high energy and urgency, suddenly a second puppet dog bursts into frame from the side, jumping up excitedly and tackling her affectionately while licking her face repeatedly, she freezes in surprise and then breaks into relief and laughter as the dog continues licking her, the scene feels chaotic, comedic, and emotional with expressive puppet reactions, cinematic lighting, smooth camera motion, shallow depth of field, and high-quality puppet-style animation",
|
| 1881 |
"No LoRA",
|
| 1882 |
"16:9",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1883 |
None,
|
| 1884 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1885 |
[
|
| 1886 |
"highland.png",
|
| 1887 |
"Realistic POV selfie-style video in a snowy, foggy field. Two shaggy Highland cows with long curved horns stand ahead. The camera is handheld and slightly shaky. The woman filming talks nervously and excitedly in a vlog tone: \"Oh my god guys… look how big those horns are… I’m kinda scared.\" The cow on the left walks toward the camera in a cute, bouncy, hopping way, curious and gentle. Snow crunches under its hooves, breath visible in the cold air. The horns look massive from the POV. As the cow gets very close, its wet nose with slight dripping fills part of the frame. She laughs nervously but reaches out and pets the cow. The cow makes deep, soft, interesting mooing and snorting sounds, calm and friendly. Ultra-realistic, natural lighting, immersive audio, documentary-style realism.",
|
| 1888 |
"No LoRA",
|
| 1889 |
"16:9",
|
|
|
|
|
|
|
| 1890 |
None,
|
| 1891 |
],
|
| 1892 |
[
|
|
@@ -1894,6 +2300,8 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 1894 |
"A cinematic dolly out of Wednesday Addams frozen mid-dance on a dark, blue-lit ballroom floor as students move indistinctly behind her, their footsteps and muffled music reduced to a distant, underwater thrum; the audio foregrounds her steady breathing and the faint rustle of fabric as she slowly raises one arm, never breaking eye contact with the camera, then after a deliberately long silence she speaks in a flat, dry, perfectly controlled voice, “I don’t dance… I vibe code,” each word crisp and unemotional, followed by an abrupt cutoff of her voice as the background sound swells slightly, reinforcing the deadpan humor, with precise lip sync, minimal facial movement, stark gothic lighting, and cinematic realism.",
|
| 1895 |
"Zoom Out",
|
| 1896 |
"16:9",
|
|
|
|
|
|
|
| 1897 |
None,
|
| 1898 |
],
|
| 1899 |
[
|
|
@@ -1901,15 +2309,15 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 1901 |
"An astronaut hatches from a fragile egg on the surface of the Moon, the shell cracking and peeling apart in gentle low-gravity motion. Fine lunar dust lifts and drifts outward with each movement, floating in slow arcs before settling back onto the ground. The astronaut pushes free in a deliberate, weightless motion, small fragments of the egg tumbling and spinning through the air. In the background, the deep darkness of space subtly shifts as stars glide with the camera's movement, emphasizing vast depth and scale. The camera performs a smooth, cinematic slow push-in, with natural parallax between the foreground dust, the astronaut, and the distant starfield. Ultra-realistic detail, physically accurate low-gravity motion, cinematic lighting, and a breath-taking, movie-like shot.",
|
| 1902 |
"Static",
|
| 1903 |
"1:1",
|
|
|
|
|
|
|
| 1904 |
None,
|
| 1905 |
],
|
| 1906 |
-
|
| 1907 |
-
|
| 1908 |
],
|
| 1909 |
fn=generate_video_example,
|
| 1910 |
-
inputs=[input_image, prompt_ui, camera_ui, resolution_ui, audio_input],
|
| 1911 |
outputs = [output_video],
|
| 1912 |
-
label="
|
| 1913 |
cache_examples=True,
|
| 1914 |
)
|
| 1915 |
|
|
|
|
| 8 |
import torchaudio
|
| 9 |
import os
|
| 10 |
from typing import Any
|
| 11 |
+
import time
|
| 12 |
+
from contextlib import contextmanager
|
| 13 |
+
|
| 14 |
+
@contextmanager
def timer(name: str):
    """Context manager that logs how long the wrapped block takes.

    Prints ``"{name}..."`` on entry and, when the block finishes without
    raising, a completion line with the elapsed seconds (two decimals).
    If the block raises, the exception propagates and no completion line
    is printed.

    Args:
        name: Label used in both log lines.
    """
    # perf_counter() is monotonic, so the measured interval cannot go
    # negative or jump when the system clock is adjusted mid-run, which
    # time.time() does not guarantee.
    start = time.perf_counter()
    print(f"{name}...")
    yield
    print(f" -> {name} completed in {time.perf_counter() - start:.2f} sec")
|
| 20 |
|
| 21 |
def _coerce_audio_path(audio_path: Any) -> str:
|
| 22 |
# Common Gradio case: tuple where first item is the filepath
|
|
|
|
| 34 |
|
| 35 |
return os.fspath(audio_path)
|
| 36 |
|
| 37 |
+
def extract_audio_wav_ffmpeg(video_path: str, target_sr: int = 48000) -> str | None:
    """Extract the first audio stream of a video into a temporary WAV file.

    The output is mono, 16-bit PCM, resampled to ``target_sr``.

    Args:
        video_path: Path of the source video.
        target_sr: Output sample rate in Hz.

    Returns:
        Path of the new temp WAV, or ``None`` when ffprobe reports no audio
        stream (or fails on the input).

    Raises:
        subprocess.CalledProcessError: if the ffmpeg extraction itself fails.
    """
    # Probe first: only allocate a temp file once we know there is audio,
    # so the "no audio" path does not leave an empty .wav behind.
    probe_cmd = [
        "ffprobe", "-v", "error",
        "-select_streams", "a:0",
        "-show_entries", "stream=codec_type",
        "-of", "default=nw=1:nk=1",
        video_path,
    ]
    try:
        probe_out = subprocess.check_output(probe_cmd).decode("utf-8").strip()
    except subprocess.CalledProcessError:
        return None
    if not probe_out:
        return None

    # Close our handle right away; ffmpeg reopens the path itself (-y).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name

    # Extract + resample + mono
    cmd = [
        "ffmpeg", "-y", "-v", "error",
        "-i", video_path,
        "-vn",
        "-ac", "1",
        "-ar", str(int(target_sr)),
        "-c:a", "pcm_s16le",
        out_path,
    ]
    try:
        subprocess.check_call(cmd)
    except Exception:
        # Do not leave a partial or empty WAV behind when ffmpeg fails.
        try:
            os.remove(out_path)
        except OSError:
            pass
        raise
    return out_path
|
| 71 |
+
|
| 72 |
|
| 73 |
|
| 74 |
def match_audio_to_duration(
|
|
|
|
| 119 |
|
| 120 |
def sh(cmd):
    """Run ``cmd`` through the system shell.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
    """
    subprocess.check_call(cmd, shell=True)
|
| 121 |
|
|
|
|
|
|
|
| 122 |
# Add packages to Python path
|
| 123 |
current_dir = Path(__file__).parent
|
| 124 |
sys.path.insert(0, str(current_dir / "packages" / "ltx-pipelines" / "src"))
|
|
|
|
| 160 |
from ltx_pipelines.utils.helpers import generate_enhanced_prompt
|
| 161 |
import imageio
|
| 162 |
import cv2
|
| 163 |
+
from controlnet_aux import CannyDetector, MidasDetector
|
| 164 |
+
from dwpose import DwposeDetector
|
| 165 |
|
| 166 |
|
| 167 |
# HuggingFace Hub defaults
|
|
|
|
| 208 |
)
|
| 209 |
|
| 210 |
canny_processor = CannyDetector()
|
| 211 |
+
# Depth (MiDaS) processor
|
| 212 |
+
# Downloads annotator weights automatically the first time.
|
| 213 |
+
depth_processor = MidasDetector.from_pretrained("lllyasviel/Annotators").to("cuda")
|
| 214 |
|
| 215 |
|
| 216 |
# Load text encoder once and keep it in memory
|
|
|
|
| 231 |
|
| 232 |
def process_video_for_pose(frames, width: int, height: int):
|
| 233 |
|
| 234 |
+
pose_processor = DwposeDetector.from_pretrained_default()
|
| 235 |
|
| 236 |
if not frames:
|
| 237 |
return []
|
|
|
|
| 242 |
pil = Image.fromarray(frame.astype(np.uint8)).convert("RGB")
|
| 243 |
|
| 244 |
# ✅ do NOT pass width/height here (easy_dwpose will handle drawing sizes internally)
|
| 245 |
+
pose_img = pose_processor(pil, include_body=True, include_hand=True, include_face=True)
|
| 246 |
|
| 247 |
# Ensure it's PIL then resize to your conditioning size
|
| 248 |
if not isinstance(pose_img, Image.Image):
|
|
|
|
| 264 |
tmp.close()
|
| 265 |
return write_video_mp4(pose_frames, fps=fps, out_path=tmp.name)
|
| 266 |
|
| 267 |
+
def process_video_for_depth(frames, width: int, height: int):
    """Turn RGB frames into depth-map frames via the module-level depth processor.

    Args:
        frames: Sequence of frame arrays; the first frame's height/width
            set the detection resolution used for every frame.
        width: Target conditioning width.
        height: Target conditioning height.

    Returns:
        A list with one 3-channel array per input frame, or ``[]`` when
        ``frames`` is empty.
        NOTE(review): the original author expects float values in [0..1]
        from ``output_type="np"`` — confirm against the installed
        controlnet_aux version.
    """
    if not frames:
        return []

    first_h, first_w = frames[0].shape[0], frames[0].shape[1]
    processor_kwargs = dict(
        detect_resolution=max(first_h, first_w),
        image_resolution=max(width, height),
        output_type="np",
    )

    def _to_three_channels(depth_map):
        # Safety net: some processor versions may emit a single channel.
        if depth_map.ndim == 2:
            return np.stack([depth_map] * 3, axis=-1)
        if depth_map.shape[-1] == 1:
            return np.repeat(depth_map, 3, axis=-1)
        return depth_map

    return [
        _to_three_channels(depth_processor(frame, **processor_kwargs))
        for frame in frames
    ]
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
def preprocess_video_to_depth_mp4(video_path: str, width: int, height: int, fps: float):
    """Decode a video, convert it to depth frames and write them to a temp mp4.

    Returns whatever ``write_video_mp4`` returns for the new temp file.
    """
    depth_frames = process_video_for_depth(
        load_video_frames(video_path), width=width, height=height
    )
    # Reserve a unique .mp4 path; the handle is closed before the writer runs.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as handle:
        out_path = handle.name
    return write_video_mp4(depth_frames, fps=fps, out_path=out_path)
|
| 306 |
+
|
| 307 |
|
| 308 |
def load_video_frames(video_path: str):
|
| 309 |
"""Return list of frames as numpy arrays (H,W,3) uint8."""
|
|
|
|
| 315 |
|
| 316 |
|
| 317 |
def process_video_for_canny(frames, width: int, height: int,
|
| 318 |
+
low_threshold=20, high_threshold=60):
|
| 319 |
"""
|
| 320 |
Convert RGB frames -> canny edge frames.
|
| 321 |
Returns list of np arrays (H,W,3) in float [0..1] (like controlnet_aux output).
|
|
|
|
| 329 |
canny_frames = []
|
| 330 |
for frame in frames:
|
| 331 |
# controlnet_aux CannyDetector returns float image in [0..1] if output_type="np"
|
| 332 |
+
# frame_blur = cv2.GaussianBlur(frame, (3, 3), 0)
|
| 333 |
+
|
| 334 |
canny = canny_processor(
|
| 335 |
frame,
|
| 336 |
low_threshold=low_threshold,
|
|
|
|
| 364 |
tmp.close()
|
| 365 |
return write_video_mp4(canny_frames, fps=fps, out_path=tmp.name)
|
| 366 |
|
| 367 |
+
import json
|
| 368 |
+
|
| 369 |
+
def probe_video_duration_seconds(video_path: str) -> float:
    """Ask ffprobe for the container duration of ``video_path``, in seconds.

    Raises:
        subprocess.CalledProcessError: if ffprobe exits non-zero.
    """
    ffprobe_args = (
        "ffprobe", "-v", "error",
        "-select_streams", "v:0",
        "-show_entries", "format=duration",
        "-of", "json",
        video_path,
    )
    raw_json = subprocess.check_output(list(ffprobe_args)).decode("utf-8")
    return float(json.loads(raw_json)["format"]["duration"])
|
| 382 |
+
|
| 383 |
+
def trim_video_to_seconds_ffmpeg(video_path: str, target_seconds: float, fps: float | None = None) -> str:
    """Trim a video to ``[0, target_seconds]`` and re-encode it as H.264.

    Re-encoding (rather than stream copy) is used for an accurate cut and
    a widely compatible output. Audio is dropped.

    Args:
        video_path: Source video path.
        target_seconds: Desired length; clamped to at least 0.01 s.
        fps: When given, the output frame rate is normalized to this value.

    Returns:
        Path of the new temp mp4.

    Raises:
        subprocess.CalledProcessError: if ffmpeg fails.
    """
    target_seconds = max(0.01, float(target_seconds))

    # Close our handle right away; ffmpeg reopens the path itself (-y).
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        out_path = tmp.name

    cmd = ["ffmpeg", "-y", "-v", "error"]

    # Accurate trim: -t after -i applies to the output, combined with re-encode.
    cmd += ["-i", video_path, "-t", f"{target_seconds:.6f}"]

    if fps is not None:
        cmd += ["-vf", f"fps={float(fps)}"]

    # Safe default encode
    cmd += [
        "-c:v", "libx264", "-pix_fmt", "yuv420p", "-preset", "veryfast", "-crf", "18",
        "-an",  # conditioning video doesn't need audio
        out_path,
    ]

    try:
        subprocess.check_call(cmd)
    except Exception:
        # Do not leave a partial or empty mp4 behind when ffmpeg fails.
        try:
            os.remove(out_path)
        except OSError:
            pass
        raise
    return out_path
|
| 415 |
+
|
| 416 |
+
def extract_first_frame_png(video_path: str) -> str:
    """Write the first video frame of ``video_path`` to a temp PNG.

    Returns:
        Path of the new PNG file.

    Raises:
        subprocess.CalledProcessError: if ffmpeg fails.
    """
    # Close our handle right away; ffmpeg reopens the path itself (-y).
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
        out_path = tmp.name

    cmd = [
        "ffmpeg", "-y", "-v", "error",
        "-i", video_path,
        "-frames:v", "1",
        out_path,
    ]
    try:
        subprocess.check_call(cmd)
    except Exception:
        # Do not leave an empty PNG behind when ffmpeg fails.
        try:
            os.remove(out_path)
        except OSError:
            pass
        raise
    return out_path
|
| 427 |
+
|
| 428 |
+
def _coerce_video_path(video_path: Any) -> str:
|
| 429 |
+
if isinstance(video_path, tuple) and len(video_path) > 0:
|
| 430 |
+
video_path = video_path[0]
|
| 431 |
+
if isinstance(video_path, dict):
|
| 432 |
+
video_path = video_path.get("name") or video_path.get("path")
|
| 433 |
+
if not isinstance(video_path, (str, bytes, os.PathLike)):
|
| 434 |
+
raise TypeError(f"video_path must be a path-like, got {type(video_path)}: {video_path}")
|
| 435 |
+
return os.fspath(video_path)
|
| 436 |
+
|
| 437 |
+
|
| 438 |
+
def prepare_conditioning_video_mp4(
    video_path: Any,
    target_num_frames: int,
    target_fps: float,
) -> tuple[str, str]:
    """Build a conditioning mp4 with exactly ``target_num_frames`` frames.

    If the source has more frames it is truncated; if it has fewer, the
    last frame is repeated to pad it out.

    Args:
        video_path: Anything ``_coerce_video_path`` accepts.
        target_num_frames: Exact number of frames to write; must be >= 1.
        target_fps: Frame rate passed to ``write_video_mp4``.

    Returns:
        ``(conditioning_mp4_path, first_frame_png_path)``, both temp files.

    Raises:
        ValueError: if ``target_num_frames`` is below 1 or no frames decode.
    """
    # Fail early with a clear message instead of an IndexError on frames[0].
    if target_num_frames < 1:
        raise ValueError(f"target_num_frames must be >= 1, got {target_num_frames}")

    video_path = _coerce_video_path(video_path)

    # Decode frames (robust / deterministic)
    frames = load_video_frames(video_path)  # list of HWC uint8 frames
    if not frames:
        raise ValueError("No frames decoded from input video")

    # Truncate or pad to exact length
    if len(frames) >= target_num_frames:
        frames = frames[:target_num_frames]
    else:
        frames = frames + [frames[-1]] * (target_num_frames - len(frames))

    # Save first frame as PNG (for input_image). The temp handle is closed
    # before PIL writes to the path, so no file descriptor is left open.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as png_tmp:
        first_png = png_tmp.name
    Image.fromarray(frames[0]).save(first_png)

    # Write conditioning mp4; write_video_mp4 expects float [0..1].
    frames_float = [f.astype(np.float32) / 255.0 for f in frames]
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as mp4_tmp:
        cond_mp4 = mp4_tmp.name
    write_video_mp4(frames_float, fps=target_fps, out_path=cond_mp4)

    return cond_mp4, first_png
|
| 475 |
+
|
| 476 |
+
def valid_1_plus_8k(n: int) -> int:
    """Round ``n`` down to the nearest value of the form ``1 + 8*k`` (k >= 0).

    Non-positive ``n`` has no such value and yields 0.
    """
    if n > 0:
        whole_groups = (n - 1) // 8
        return 8 * whole_groups + 1
    return 0
|
| 481 |
+
|
| 482 |
+
def prepare_conditioning_video_mp4_no_pad(
    video_path: Any,
    duration_frames: int,
    target_fps: float,
) -> tuple[str, str, int]:
    """Build a conditioning mp4 by trimming only; never pads or loops.

    The number of frames written is the largest value of the form
    ``1 + 8*k`` that does not exceed ``min(source_frame_count, duration_frames)``.

    Args:
        video_path: Anything ``_coerce_video_path`` accepts.
        duration_frames: Upper bound on the number of frames to keep.
        target_fps: Frame rate passed to ``write_video_mp4``.

    Returns:
        ``(conditioning_mp4_path, first_frame_png_path, used_num_frames)``.

    Raises:
        ValueError: if no frames decode, or the capped length is below 1.
    """
    video_path = _coerce_video_path(video_path)

    frames = load_video_frames(video_path)  # list of HWC uint8
    if not frames:
        raise ValueError("No frames decoded from input video")

    n_src = len(frames)
    n_capped = min(n_src, duration_frames)
    n_used = valid_1_plus_8k(n_capped)

    # A single usable frame is valid (1 + 8*0); only a non-positive cap
    # leaves nothing to write.
    if n_used == 0:
        raise ValueError(
            f"Video too short: {n_capped} usable frames "
            f"(source has {n_src}, requested at most {duration_frames})"
        )

    frames = frames[:n_used]

    # Temp handles are closed before the writers reopen the paths, so no
    # file descriptors are left dangling.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as png_tmp:
        first_png = png_tmp.name
    Image.fromarray(frames[0]).save(first_png)

    # write_video_mp4 is fed float frames scaled to [0..1].
    frames_float = [f.astype(np.float32) / 255.0 for f in frames]
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as mp4_tmp:
        cond_mp4 = mp4_tmp.name
    write_video_mp4(frames_float, fps=target_fps, out_path=cond_mp4)

    return cond_mp4, first_png, n_used
|
| 518 |
+
|
| 519 |
|
| 520 |
def encode_text_simple(text_encoder, prompt: str):
|
| 521 |
"""Simple text encoding without using pipeline_utils."""
|
|
|
|
| 659 |
"ltx-2-19b-ic-lora-detailer.safetensors",
|
| 660 |
)
|
| 661 |
|
| 662 |
+
pose_lora_path = get_hub_or_local_checkpoint(
|
| 663 |
+
"Lightricks/LTX-2-19b-IC-LoRA-Pose-Control",
|
| 664 |
+
"ltx-2-19b-ic-lora-pose-control.safetensors",
|
| 665 |
+
)
|
| 666 |
+
|
| 667 |
+
|
| 668 |
+
|
| 669 |
# Load distilled LoRA as a regular LoRA
|
| 670 |
loras = [
|
| 671 |
# --- fused / base behavior ---
|
|
|
|
| 682 |
LoraPathStrengthAndSDOps(dolly_right_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
|
| 683 |
LoraPathStrengthAndSDOps(jib_down_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
|
| 684 |
LoraPathStrengthAndSDOps(jib_up_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
|
| 685 |
+
LoraPathStrengthAndSDOps(pose_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
|
| 686 |
]
|
| 687 |
|
| 688 |
# Runtime-toggle LoRAs (exclude fused distilled at index 0)
|
| 689 |
+
VISIBLE_RUNTIME_LORA_CHOICES = [
|
| 690 |
+
("No LoRA", -1),
|
| 691 |
+
("Static", 0),
|
| 692 |
+
("Detailer", 1),
|
| 693 |
+
("Zoom In", 2),
|
| 694 |
+
("Zoom Out", 3),
|
| 695 |
+
("Slide Left", 4),
|
| 696 |
+
("Slide Right", 5),
|
| 697 |
+
("Slide Down", 6),
|
| 698 |
+
("Slide Up", 7),
|
| 699 |
+
]
|
| 700 |
+
|
| 701 |
RUNTIME_LORA_CHOICES = [
|
| 702 |
("No LoRA", -1),
|
| 703 |
("Static", 0),
|
|
|
|
| 708 |
("Slide Right", 5),
|
| 709 |
("Slide Down", 6),
|
| 710 |
("Slide Up", 7),
|
| 711 |
+
("Pose", 8),
|
| 712 |
]
|
| 713 |
|
| 714 |
# Initialize pipeline WITHOUT text encoder (gemma_root=None)
|
|
|
|
| 815 |
|
| 816 |
// Recalc on resize (important in Gradio layouts)
|
| 817 |
window.addEventListener('resize', () => setHighlightByIndex(currentIdx));
|
| 818 |
+
|
| 819 |
+
// sync from Python (Examples / backend updates)
|
| 820 |
+
let last = props.value;
|
| 821 |
+
const syncFromProps = () => {
|
| 822 |
+
if (props.value !== last) {
|
| 823 |
+
last = props.value;
|
| 824 |
+
setCheckedByValue(last, false);
|
| 825 |
+
}
|
| 826 |
+
requestAnimationFrame(syncFromProps);
|
| 827 |
+
};
|
| 828 |
+
requestAnimationFrame(syncFromProps);
|
| 829 |
+
|
| 830 |
})();
|
| 831 |
|
| 832 |
"""
|
|
|
|
| 1089 |
**kwargs
|
| 1090 |
)
|
| 1091 |
|
| 1092 |
+
def generate_video_example(input_image, prompt, camera_lora, resolution, radioanimated_mode, input_video, input_audio, progress=gr.Progress(track_tqdm=True)):
    """Generate a video for one row of the Examples gallery using fixed settings.

    This is the callback wired to the Examples component. It forwards the
    example's inputs to `generate_video` with a fixed clip duration, prompt
    enhancement turned on, and seed randomization turned on.

    Args:
        input_image: First-frame image for the example (may be None).
        prompt: Text prompt for the example.
        camera_lora: Camera LoRA label for the example.
        resolution: Aspect-ratio label; converted to (width, height) by
            `apply_resolution`.
        radioanimated_mode: Generation mode label, forwarded as
            `generation_mode`.
        input_video: Optional motion reference video (may be None).
        input_audio: Optional audio input (may be None).
        progress: Gradio progress tracker forwarded to `generate_video`.

    Returns:
        Whatever `generate_video` returns for these inputs.
    """
    # Bind the example clip length locally. The original interpolated a name
    # `duration` that is neither a parameter nor a local of this function, so
    # the log line either raised NameError or printed an unrelated
    # module-level object instead of the value actually passed below.
    duration = 10

    w, h = apply_resolution(resolution)

    with timer(f'generating with video path:{input_video} with duration:{duration} and LoRA:{camera_lora} in {w}x{h}'):
        # Arguments stay positional, in the same order as the original call.
        # NOTE(review): the tail of generate_video's signature is not visible
        # here; h, w, camera_lora, input_audio and progress are assumed to
        # line up with height, width, camera_lora, audio_path and progress.
        output_video = generate_video(
            input_image,
            prompt,
            duration,
            input_video,
            radioanimated_mode,  # generation_mode
            True,                # enhance_prompt
            42,                  # seed
            True,                # randomize_seed
            h,
            w,
            camera_lora,
            input_audio,
            progress,
        )
    return output_video
|
| 1113 |
|
| 1114 |
def get_duration(
|
| 1115 |
input_image,
|
| 1116 |
prompt,
|
| 1117 |
duration,
|
| 1118 |
+
input_video,
|
| 1119 |
+
radioanimated_mode,
|
| 1120 |
enhance_prompt,
|
| 1121 |
seed,
|
| 1122 |
randomize_seed,
|
|
|
|
| 1131 |
if audio_path is not None:
|
| 1132 |
extra_time += 10
|
| 1133 |
|
| 1134 |
+
if input_video is not None:
|
| 1135 |
+
extra_time += 60
|
| 1136 |
+
|
| 1137 |
if duration <= 3:
|
| 1138 |
return 60 + extra_time
|
| 1139 |
elif duration <= 5:
|
|
|
|
| 1143 |
else:
|
| 1144 |
return 180 + extra_time
|
| 1145 |
|
| 1146 |
+
|
| 1147 |
@spaces.GPU(duration=get_duration)
|
| 1148 |
def generate_video(
|
| 1149 |
input_image,
|
| 1150 |
prompt: str,
|
| 1151 |
duration: float,
|
| 1152 |
+
input_video = None,
|
| 1153 |
+
generation_mode = "Image-to-Video",
|
| 1154 |
enhance_prompt: bool = True,
|
| 1155 |
seed: int = 42,
|
| 1156 |
randomize_seed: bool = True,
|
|
|
|
| 1166 |
input_image: Optional input image for image-to-video. If provided, it is injected at frame 0 to guide motion.
|
| 1167 |
prompt: Text description of the scene, motion, and cinematic style to generate.
|
| 1168 |
duration: Desired video length in seconds. Converted to frames using a fixed 24 FPS rate.
|
| 1169 |
+
input_video: Optional conditioning video path (mp4). If provided, motion is guided by this video.
|
| 1170 |
enhance_prompt: Whether to enhance the prompt using the prompt enhancer before encoding.
|
| 1171 |
seed: Base random seed for reproducibility (ignored if randomize_seed is True).
|
| 1172 |
randomize_seed: If True, a random seed is generated for each run.
|
|
|
|
| 1202 |
# Calculate num_frames from duration (using fixed 24 fps)
|
| 1203 |
frame_rate = 24.0
|
| 1204 |
num_frames = int(duration * frame_rate) + 1 # +1 to ensure we meet the duration
|
| 1205 |
+
video_seconds = int(duration)
|
| 1206 |
|
| 1207 |
with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
|
| 1208 |
output_path = tmpfile.name
|
| 1209 |
|
| 1210 |
|
| 1211 |
images = []
|
| 1212 |
+
videos = []
|
| 1213 |
+
|
| 1214 |
+
if generation_mode == "Motion Control":
|
| 1215 |
+
if input_video is not None:
|
| 1216 |
+
cond_mp4, first_png, used_frames = prepare_conditioning_video_mp4_no_pad(
|
| 1217 |
+
video_path=input_video,
|
| 1218 |
+
duration_frames=num_frames,
|
| 1219 |
+
target_fps=frame_rate,
|
| 1220 |
+
)
|
| 1221 |
+
|
| 1222 |
+
if input_image is None:
|
| 1223 |
+
images = [(first_png, 0, 1.0)]
|
| 1224 |
+
|
| 1225 |
+
if audio_path is None:
|
| 1226 |
+
src_video_path = _coerce_video_path(input_video)
|
| 1227 |
+
extracted_audio_tmp = extract_audio_wav_ffmpeg(src_video_path, target_sr=48000)
|
| 1228 |
+
|
| 1229 |
+
if extracted_audio_tmp is not None:
|
| 1230 |
+
audio_path = extracted_audio_tmp
|
| 1231 |
+
|
| 1232 |
+
with timer("Pose selected: preprocessing conditioning video to pose..."):
|
| 1233 |
+
cond_path = preprocess_video_to_pose_mp4(
|
| 1234 |
+
video_path=cond_mp4,
|
| 1235 |
+
width=width,
|
| 1236 |
+
height=height,
|
| 1237 |
+
fps=frame_rate,
|
| 1238 |
+
)
|
| 1239 |
+
videos = [(cond_path, 1.0)]
|
| 1240 |
+
camera_lora = "Pose"
|
| 1241 |
|
| 1242 |
if input_image is not None:
|
| 1243 |
images = [(input_image, 0, 1.0)]
|
|
|
|
| 1267 |
_, n_audio_context = encode_text_simple(text_encoder, "") # returns tensors on GPU already
|
| 1268 |
del audio_context
|
| 1269 |
audio_context = n_audio_context
|
| 1270 |
+
|
| 1271 |
+
if len(videos) == 0:
|
| 1272 |
+
camera_lora = "Static"
|
| 1273 |
|
| 1274 |
torch.cuda.empty_cache()
|
| 1275 |
|
|
|
|
| 1296 |
input_waveform = None
|
| 1297 |
input_waveform_sample_rate = None
|
| 1298 |
|
| 1299 |
+
with timer(f'generating with video path:{input_video} and LoRA:{camera_lora} in {width}x{height}'):
|
| 1300 |
+
with torch.inference_mode():
|
| 1301 |
+
pipeline(
|
| 1302 |
+
prompt=prompt,
|
| 1303 |
+
output_path=str(output_path),
|
| 1304 |
+
seed=current_seed,
|
| 1305 |
+
height=height,
|
| 1306 |
+
width=width,
|
| 1307 |
+
num_frames=num_frames,
|
| 1308 |
+
frame_rate=frame_rate,
|
| 1309 |
+
images=images,
|
| 1310 |
+
video_conditioning=videos,
|
| 1311 |
+
tiling_config=TilingConfig.default(),
|
| 1312 |
+
video_context=video_context,
|
| 1313 |
+
audio_context=audio_context,
|
| 1314 |
+
input_waveform=input_waveform,
|
| 1315 |
+
input_waveform_sample_rate=input_waveform_sample_rate,
|
| 1316 |
+
)
|
| 1317 |
del video_context, audio_context
|
| 1318 |
torch.cuda.empty_cache()
|
| 1319 |
print("successful generation")
|
|
|
|
| 1337 |
duration_s = int(duration[:-1])
|
| 1338 |
return duration_s
|
| 1339 |
|
| 1340 |
+
def on_mode_change(selected: str):
    """Return a visibility update for the motion reference video input.

    Args:
        selected: Label of the currently selected generation mode.

    Returns:
        A Gradio update that hides the component when the mode is
        "Image-to-Video" and shows it for any other mode.
    """
    show_reference_video = selected != "Image-to-Video"
    return gr.update(visible=show_reference_video)
|
| 1344 |
+
|
| 1345 |
+
|
| 1346 |
|
| 1347 |
css = """
|
| 1348 |
|
|
|
|
| 1451 |
#false:checked ~ .toggle-highlight {
|
| 1452 |
transform: translateX(100%);
|
| 1453 |
}
|
| 1454 |
+
|
| 1455 |
+
/* Center items inside that row */
|
| 1456 |
+
#mode-row{
|
| 1457 |
+
justify-content: center !important;
|
| 1458 |
+
align-items: center !important;
|
| 1459 |
+
}
|
| 1460 |
+
|
| 1461 |
+
/* Center the mode row contents */
|
| 1462 |
+
#mode-row {
|
| 1463 |
+
display: flex !important;
|
| 1464 |
+
justify-content: center !important;
|
| 1465 |
+
align-items: center !important;
|
| 1466 |
+
width: 100% !important;
|
| 1467 |
+
}
|
| 1468 |
+
|
| 1469 |
+
/* Stop Gradio from making children stretch */
|
| 1470 |
+
#mode-row > * {
|
| 1471 |
+
flex: 0 0 auto !important;
|
| 1472 |
+
width: auto !important;
|
| 1473 |
+
min-width: 0 !important;
|
| 1474 |
+
}
|
| 1475 |
+
|
| 1476 |
+
/* Specifically ensure the HTML component wrapper doesn't take full width */
|
| 1477 |
+
#mode-row .gr-html,
|
| 1478 |
+
#mode-row .gradio-html,
|
| 1479 |
+
#mode-row .prose,
|
| 1480 |
+
#mode-row .block {
|
| 1481 |
+
width: auto !important;
|
| 1482 |
+
flex: 0 0 auto !important;
|
| 1483 |
+
display: inline-block !important;
|
| 1484 |
+
}
|
| 1485 |
+
|
| 1486 |
+
/* Center the pill itself */
|
| 1487 |
+
#radioanimated_mode {
|
| 1488 |
+
display: inline-flex !important;
|
| 1489 |
+
justify-content: center !important;
|
| 1490 |
+
width: auto !important;
|
| 1491 |
+
}
|
| 1492 |
+
|
| 1493 |
"""
|
| 1494 |
|
| 1495 |
css += """
|
|
|
|
| 2038 |
"""
|
| 2039 |
)
|
| 2040 |
with gr.Column(elem_id="col-container"):
|
| 2041 |
+
with gr.Row(elem_id="mode-row"):
|
| 2042 |
+
radioanimated_mode = RadioAnimated(
|
| 2043 |
+
choices=["Image-to-Video", "Motion Control"],
|
| 2044 |
+
value="Image-to-Video",
|
| 2045 |
+
elem_id="radioanimated_mode"
|
| 2046 |
+
)
|
| 2047 |
with gr.Row():
|
| 2048 |
with gr.Column(elem_id="step-column"):
|
| 2049 |
+
|
| 2050 |
input_image = gr.Image(
|
| 2051 |
label="First Frame (Optional)",
|
| 2052 |
type="filepath",
|
| 2053 |
height=256
|
| 2054 |
)
|
| 2055 |
|
| 2056 |
+
input_video = gr.Video(
|
| 2057 |
+
label="Motion Reference Video",
|
| 2058 |
+
height=256,
|
| 2059 |
+
visible=False,
|
| 2060 |
+
)
|
| 2061 |
+
|
| 2062 |
relocate = gr.HTML(
|
| 2063 |
value="",
|
| 2064 |
html_template="<div></div>",
|
|
|
|
| 2129 |
|
| 2130 |
with gr.Column(elem_id="step-column"):
|
| 2131 |
output_video = gr.Video(label="Generated Video", autoplay=True, height=512)
|
| 2132 |
+
|
| 2133 |
with gr.Row(elem_id="controls-row"):
|
| 2134 |
|
| 2135 |
duration_ui = CameraDropdown(
|
|
|
|
| 2177 |
height = gr.Number(label="Height", value=DEFAULT_1_STAGE_HEIGHT, precision=0, visible=False)
|
| 2178 |
|
| 2179 |
camera_ui = CameraDropdown(
|
| 2180 |
+
choices=[name for name, _ in VISIBLE_RUNTIME_LORA_CHOICES],
|
| 2181 |
value="No LoRA",
|
| 2182 |
title="Camera LoRA",
|
| 2183 |
elem_id="camera_ui",
|
|
|
|
| 2186 |
# Hidden real dropdown (backend value)
|
| 2187 |
camera_lora = gr.Dropdown(
|
| 2188 |
label="Camera Control LoRA",
|
| 2189 |
+
choices=[name for name, _ in VISIBLE_RUNTIME_LORA_CHOICES],
|
| 2190 |
value="No LoRA",
|
| 2191 |
visible=False
|
| 2192 |
)
|
|
|
|
| 2200 |
api_visibility="private"
|
| 2201 |
)
|
| 2202 |
|
| 2203 |
+
radioanimated_mode.change(
|
| 2204 |
+
fn=on_mode_change,
|
| 2205 |
+
inputs=radioanimated_mode,
|
| 2206 |
+
outputs=[input_video],
|
| 2207 |
+
api_visibility="private",
|
| 2208 |
+
)
|
| 2209 |
+
|
| 2210 |
+
|
| 2211 |
duration_ui.change(
|
| 2212 |
fn=apply_duration,
|
| 2213 |
inputs=duration_ui,
|
|
|
|
| 2234 |
input_image,
|
| 2235 |
prompt,
|
| 2236 |
duration,
|
| 2237 |
+
input_video,
|
| 2238 |
+
radioanimated_mode,
|
| 2239 |
enhance_prompt,
|
| 2240 |
seed,
|
| 2241 |
randomize_seed,
|
|
|
|
| 2255 |
"A fuzzy puppet superhero character resembling a female puppet with blonde hair and a blue superhero suit sleeping in bed and just waking up, she gradually gets up, rubbing her eyes and looking at her dog that just popped on the bed. the scene feels chaotic, comedic, and emotional with expressive puppet reactions, cinematic lighting, smooth camera motion, shallow depth of field, and high-quality puppet-style animation",
|
| 2256 |
"Static",
|
| 2257 |
"16:9",
|
| 2258 |
+
"Image-to-Video",
|
| 2259 |
+
None,
|
| 2260 |
"supergirl.m4a"
|
| 2261 |
],
|
| 2262 |
[
|
|
|
|
| 2264 |
"A fuzzy puppet superhero character resembling a female puppet with blonde hair and a blue superhero suit stands inside an icy cave made of frozen walls and icicles, she looks panicked and frantic, rapidly turning her head left and right and scanning the cave while waving her arms and shouting angrily and desperately, mouthing the words “where the hell is my dog,” her movements exaggerated and puppet-like with high energy and urgency, suddenly a second puppet dog bursts into frame from the side, jumping up excitedly and tackling her affectionately while licking her face repeatedly, she freezes in surprise and then breaks into relief and laughter as the dog continues licking her, the scene feels chaotic, comedic, and emotional with expressive puppet reactions, cinematic lighting, smooth camera motion, shallow depth of field, and high-quality puppet-style animation",
|
| 2265 |
"No LoRA",
|
| 2266 |
"16:9",
|
| 2267 |
+
"Image-to-Video",
|
| 2268 |
+
None,
|
| 2269 |
+
None,
|
| 2270 |
+
],
|
| 2271 |
+
[
|
| 2272 |
+
"clay.png",
|
| 2273 |
+
"a character doing a tiktok dance by moving their heads side to side with dramatic lighting and cinematic effects and singing",
|
| 2274 |
+
"Pose",
|
| 2275 |
+
"9:16",
|
| 2276 |
+
"Motion Control",
|
| 2277 |
+
"tiktok.mp4",
|
| 2278 |
None,
|
| 2279 |
],
|
| 2280 |
+
[
|
| 2281 |
+
"paint.png",
|
| 2282 |
+
"a character doing a tiktok dance by moving their heads side to side with dramatic lighting and cinematic effects and singing",
|
| 2283 |
+
"Pose",
|
| 2284 |
+
"9:16",
|
| 2285 |
+
"Motion Control",
|
| 2286 |
+
"tiktok.mp4",
|
| 2287 |
+
None,
|
| 2288 |
+
],
|
| 2289 |
[
|
| 2290 |
"highland.png",
|
| 2291 |
"Realistic POV selfie-style video in a snowy, foggy field. Two shaggy Highland cows with long curved horns stand ahead. The camera is handheld and slightly shaky. The woman filming talks nervously and excitedly in a vlog tone: \"Oh my god guys… look how big those horns are… I’m kinda scared.\" The cow on the left walks toward the camera in a cute, bouncy, hopping way, curious and gentle. Snow crunches under its hooves, breath visible in the cold air. The horns look massive from the POV. As the cow gets very close, its wet nose with slight dripping fills part of the frame. She laughs nervously but reaches out and pets the cow. The cow makes deep, soft, interesting mooing and snorting sounds, calm and friendly. Ultra-realistic, natural lighting, immersive audio, documentary-style realism.",
|
| 2292 |
"No LoRA",
|
| 2293 |
"16:9",
|
| 2294 |
+
"Image-to-Video",
|
| 2295 |
+
None,
|
| 2296 |
None,
|
| 2297 |
],
|
| 2298 |
[
|
|
|
|
| 2300 |
"A cinematic dolly out of Wednesday Addams frozen mid-dance on a dark, blue-lit ballroom floor as students move indistinctly behind her, their footsteps and muffled music reduced to a distant, underwater thrum; the audio foregrounds her steady breathing and the faint rustle of fabric as she slowly raises one arm, never breaking eye contact with the camera, then after a deliberately long silence she speaks in a flat, dry, perfectly controlled voice, “I don’t dance… I vibe code,” each word crisp and unemotional, followed by an abrupt cutoff of her voice as the background sound swells slightly, reinforcing the deadpan humor, with precise lip sync, minimal facial movement, stark gothic lighting, and cinematic realism.",
|
| 2301 |
"Zoom Out",
|
| 2302 |
"16:9",
|
| 2303 |
+
"Image-to-Video",
|
| 2304 |
+
None,
|
| 2305 |
None,
|
| 2306 |
],
|
| 2307 |
[
|
|
|
|
| 2309 |
"An astronaut hatches from a fragile egg on the surface of the Moon, the shell cracking and peeling apart in gentle low-gravity motion. Fine lunar dust lifts and drifts outward with each movement, floating in slow arcs before settling back onto the ground. The astronaut pushes free in a deliberate, weightless motion, small fragments of the egg tumbling and spinning through the air. In the background, the deep darkness of space subtly shifts as stars glide with the camera's movement, emphasizing vast depth and scale. The camera performs a smooth, cinematic slow push-in, with natural parallax between the foreground dust, the astronaut, and the distant starfield. Ultra-realistic detail, physically accurate low-gravity motion, cinematic lighting, and a breath-taking, movie-like shot.",
|
| 2310 |
"Static",
|
| 2311 |
"1:1",
|
| 2312 |
+
"Image-to-Video",
|
| 2313 |
+
None,
|
| 2314 |
None,
|
| 2315 |
],
|
|
|
|
|
|
|
| 2316 |
],
|
| 2317 |
fn=generate_video_example,
|
| 2318 |
+
inputs=[input_image, prompt_ui, camera_ui, resolution_ui, radioanimated_mode, input_video, audio_input],
|
| 2319 |
outputs = [output_video],
|
| 2320 |
+
label="Examples",
|
| 2321 |
cache_examples=True,
|
| 2322 |
)
|
| 2323 |
|