Spaces:
Sleeping
Sleeping
alex committed on
Commit ·
17fc24b
1
Parent(s): b7c34d7
guide frame
Browse files
- app.py +69 -29
- packages/ltx-pipelines/src/ltx_pipelines/distilled.py +50 -25
app.py
CHANGED
|
@@ -1404,13 +1404,14 @@ class AudioDropUpload(gr.HTML):
|
|
| 1404 |
|
| 1405 |
|
| 1406 |
|
| 1407 |
-
def generate_video_example(
|
| 1408 |
|
| 1409 |
w, h = apply_resolution(resolution)
|
| 1410 |
|
| 1411 |
with timer(f'generating with video path:{input_video} with duration:{duration} and LoRA:{camera_lora} in {w}x{h}'):
|
| 1412 |
output_video = generate_video(
|
| 1413 |
-
|
|
|
|
| 1414 |
prompt,
|
| 1415 |
10,
|
| 1416 |
input_video,
|
|
@@ -1427,7 +1428,8 @@ def generate_video_example(input_image, prompt, camera_lora, resolution, radioan
|
|
| 1427 |
return output_video
|
| 1428 |
|
| 1429 |
def get_duration(
|
| 1430 |
-
|
|
|
|
| 1431 |
prompt,
|
| 1432 |
duration,
|
| 1433 |
input_video,
|
|
@@ -1457,11 +1459,11 @@ def get_duration(
|
|
| 1457 |
return 120 + extra_time
|
| 1458 |
else:
|
| 1459 |
return 180 + extra_time
|
| 1460 |
-
|
| 1461 |
-
|
| 1462 |
@spaces.GPU(duration=get_duration)
|
| 1463 |
def generate_video(
|
| 1464 |
-
|
|
|
|
| 1465 |
prompt: str,
|
| 1466 |
duration: float,
|
| 1467 |
input_video = None,
|
|
@@ -1478,7 +1480,8 @@ def generate_video(
|
|
| 1478 |
"""
|
| 1479 |
Generate a short cinematic video from a text prompt and optional input image using the LTX-2 distilled pipeline.
|
| 1480 |
Args:
|
| 1481 |
-
|
|
|
|
| 1482 |
prompt: Text description of the scene, motion, and cinematic style to generate.
|
| 1483 |
duration: Desired video length in seconds. Converted to frames using a fixed 24 FPS rate.
|
| 1484 |
input_video: Optional conditioning video path (mp4). If provided, motion is guided by this video.
|
|
@@ -1534,7 +1537,7 @@ def generate_video(
|
|
| 1534 |
target_fps=frame_rate,
|
| 1535 |
)
|
| 1536 |
|
| 1537 |
-
if
|
| 1538 |
images = [(first_png, 0, 1.0)]
|
| 1539 |
|
| 1540 |
if audio_path is None:
|
|
@@ -1554,13 +1557,19 @@ def generate_video(
|
|
| 1554 |
videos = [(cond_path, 1.0)]
|
| 1555 |
camera_lora = "Pose"
|
| 1556 |
|
| 1557 |
-
if
|
| 1558 |
-
images = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1559 |
|
| 1560 |
embeddings, final_prompt, status = encode_prompt(
|
| 1561 |
prompt=prompt,
|
| 1562 |
enhance_prompt=enhance_prompt,
|
| 1563 |
-
input_image=
|
| 1564 |
seed=current_seed,
|
| 1565 |
negative_prompt="",
|
| 1566 |
)
|
|
@@ -1653,9 +1662,10 @@ def apply_duration(duration: str):
|
|
| 1653 |
return duration_s
|
| 1654 |
|
| 1655 |
def on_mode_change(selected: str):
|
| 1656 |
-
|
|
|
|
| 1657 |
|
| 1658 |
-
return gr.update(visible=
|
| 1659 |
|
| 1660 |
|
| 1661 |
|
|
@@ -2468,7 +2478,7 @@ def apply_example(idx: str):
|
|
| 2468 |
idx = int(idx)
|
| 2469 |
|
| 2470 |
# Read the example row from your list
|
| 2471 |
-
img, prompt_txt, cam, res, mode, vid, aud = examples_list[idx]
|
| 2472 |
|
| 2473 |
img_path = img if img else None
|
| 2474 |
vid_path = vid if vid else None
|
|
@@ -2481,6 +2491,7 @@ def apply_example(idx: str):
|
|
| 2481 |
mode_update = mode
|
| 2482 |
video_update = gr.update(value=vid_path, visible=(mode == "Motion Control"))
|
| 2483 |
audio_update = aud_path
|
|
|
|
| 2484 |
|
| 2485 |
return (
|
| 2486 |
input_image_update,
|
|
@@ -2491,6 +2502,7 @@ def apply_example(idx: str):
|
|
| 2491 |
video_update,
|
| 2492 |
audio_update,
|
| 2493 |
audio_update,
|
|
|
|
| 2494 |
)
|
| 2495 |
|
| 2496 |
|
|
@@ -2522,7 +2534,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 2522 |
with gr.Column(elem_id="col-container"):
|
| 2523 |
with gr.Row(elem_id="mode-row"):
|
| 2524 |
radioanimated_mode = RadioAnimated(
|
| 2525 |
-
choices=["Image-to-Video", "Motion Control"],
|
| 2526 |
value="Image-to-Video",
|
| 2527 |
elem_id="radioanimated_mode"
|
| 2528 |
)
|
|
@@ -2531,12 +2543,19 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 2531 |
|
| 2532 |
with gr.Row():
|
| 2533 |
|
| 2534 |
-
|
| 2535 |
label="First Frame (Optional)",
|
| 2536 |
type="filepath",
|
| 2537 |
height=256
|
| 2538 |
)
|
| 2539 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2540 |
input_video = gr.Video(
|
| 2541 |
label="Motion Reference Video",
|
| 2542 |
height=256,
|
|
@@ -2700,7 +2719,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 2700 |
radioanimated_mode.change(
|
| 2701 |
fn=on_mode_change,
|
| 2702 |
inputs=radioanimated_mode,
|
| 2703 |
-
outputs=[input_video],
|
| 2704 |
api_visibility="private",
|
| 2705 |
)
|
| 2706 |
|
|
@@ -2728,7 +2747,8 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 2728 |
generate_btn.click(
|
| 2729 |
fn=generate_video,
|
| 2730 |
inputs=[
|
| 2731 |
-
|
|
|
|
| 2732 |
prompt,
|
| 2733 |
duration,
|
| 2734 |
input_video,
|
|
@@ -2752,7 +2772,18 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 2752 |
"16:9",
|
| 2753 |
"Image-to-Video",
|
| 2754 |
None,
|
| 2755 |
-
"examples/supergirl.m4a"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2756 |
],
|
| 2757 |
[
|
| 2758 |
"examples/supergirl.png",
|
|
@@ -2762,6 +2793,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 2762 |
"Image-to-Video",
|
| 2763 |
None,
|
| 2764 |
None,
|
|
|
|
| 2765 |
],
|
| 2766 |
[
|
| 2767 |
"examples/clay.png",
|
|
@@ -2771,6 +2803,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 2771 |
"Motion Control",
|
| 2772 |
"examples/tiktok.mp4",
|
| 2773 |
None,
|
|
|
|
| 2774 |
],
|
| 2775 |
[
|
| 2776 |
"examples/paint.png",
|
|
@@ -2780,6 +2813,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 2780 |
"Motion Control",
|
| 2781 |
"examples/tiktok.mp4",
|
| 2782 |
None,
|
|
|
|
| 2783 |
],
|
| 2784 |
[
|
| 2785 |
"examples/highland.png",
|
|
@@ -2789,6 +2823,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 2789 |
"Image-to-Video",
|
| 2790 |
None,
|
| 2791 |
None,
|
|
|
|
| 2792 |
],
|
| 2793 |
[
|
| 2794 |
"examples/wednesday.png",
|
|
@@ -2798,6 +2833,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 2798 |
"Image-to-Video",
|
| 2799 |
None,
|
| 2800 |
None,
|
|
|
|
| 2801 |
],
|
| 2802 |
[
|
| 2803 |
"examples/astronaut.png",
|
|
@@ -2807,13 +2843,14 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 2807 |
"Image-to-Video",
|
| 2808 |
None,
|
| 2809 |
None,
|
|
|
|
| 2810 |
],
|
| 2811 |
]
|
| 2812 |
|
| 2813 |
examples_obj = create_examples(
|
| 2814 |
examples=examples_list,
|
| 2815 |
fn=generate_video_example,
|
| 2816 |
-
inputs=[
|
| 2817 |
outputs = [output_video],
|
| 2818 |
label="Examples",
|
| 2819 |
cache_examples=True,
|
|
@@ -2822,13 +2859,15 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 2822 |
|
| 2823 |
preset_gallery = PresetGallery(
|
| 2824 |
items=[
|
| 2825 |
-
{"thumb": "examples/supergirl-2.png", "label": "Example 1", "title": "Image+Audio to Video" },
|
| 2826 |
-
{"thumb": "examples/
|
| 2827 |
-
{"thumb": "examples/
|
| 2828 |
-
{"thumb": "examples/
|
| 2829 |
-
{"thumb": "examples/
|
| 2830 |
-
{"thumb": "examples/
|
| 2831 |
-
{"thumb": "examples/
|
|
|
|
|
|
|
| 2832 |
],
|
| 2833 |
title="Click on Our Examples",
|
| 2834 |
)
|
|
@@ -2858,7 +2897,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 2858 |
fn=apply_example,
|
| 2859 |
inputs=preset_gallery,
|
| 2860 |
outputs=[
|
| 2861 |
-
|
| 2862 |
prompt_ui,
|
| 2863 |
camera_ui,
|
| 2864 |
resolution_ui,
|
|
@@ -2866,6 +2905,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 2866 |
input_video,
|
| 2867 |
audio_input,
|
| 2868 |
audio_ui,
|
|
|
|
| 2869 |
],
|
| 2870 |
api_visibility="private",
|
| 2871 |
).then(
|
|
|
|
| 1404 |
|
| 1405 |
|
| 1406 |
|
| 1407 |
+
def generate_video_example(first_frame, prompt, camera_lora, resolution, radioanimated_mode, input_video, input_audio, end_frame, progress=gr.Progress(track_tqdm=True)):
|
| 1408 |
|
| 1409 |
w, h = apply_resolution(resolution)
|
| 1410 |
|
| 1411 |
with timer(f'generating with video path:{input_video} with duration:{duration} and LoRA:{camera_lora} in {w}x{h}'):
|
| 1412 |
output_video = generate_video(
|
| 1413 |
+
first_frame,
|
| 1414 |
+
end_frame,
|
| 1415 |
prompt,
|
| 1416 |
10,
|
| 1417 |
input_video,
|
|
|
|
| 1428 |
return output_video
|
| 1429 |
|
| 1430 |
def get_duration(
|
| 1431 |
+
first_frame,
|
| 1432 |
+
end_frame,
|
| 1433 |
prompt,
|
| 1434 |
duration,
|
| 1435 |
input_video,
|
|
|
|
| 1459 |
return 120 + extra_time
|
| 1460 |
else:
|
| 1461 |
return 180 + extra_time
|
| 1462 |
+
|
|
|
|
| 1463 |
@spaces.GPU(duration=get_duration)
|
| 1464 |
def generate_video(
|
| 1465 |
+
first_frame,
|
| 1466 |
+
end_frame,
|
| 1467 |
prompt: str,
|
| 1468 |
duration: float,
|
| 1469 |
input_video = None,
|
|
|
|
| 1480 |
"""
|
| 1481 |
Generate a short cinematic video from a text prompt and optional input image using the LTX-2 distilled pipeline.
|
| 1482 |
Args:
|
| 1483 |
+
first_frame: Optional first frame for image-to-video. If provided, it is injected at frame 0 to guide motion.
|
| 1484 |
+
end_frame: Optional last frame for image-to-video. If provided, it is injected at last frame to guide motion.
|
| 1485 |
prompt: Text description of the scene, motion, and cinematic style to generate.
|
| 1486 |
duration: Desired video length in seconds. Converted to frames using a fixed 24 FPS rate.
|
| 1487 |
input_video: Optional conditioning video path (mp4). If provided, motion is guided by this video.
|
|
|
|
| 1537 |
target_fps=frame_rate,
|
| 1538 |
)
|
| 1539 |
|
| 1540 |
+
if first_frame is None:
|
| 1541 |
images = [(first_png, 0, 1.0)]
|
| 1542 |
|
| 1543 |
if audio_path is None:
|
|
|
|
| 1557 |
videos = [(cond_path, 1.0)]
|
| 1558 |
camera_lora = "Pose"
|
| 1559 |
|
| 1560 |
+
if first_frame is not None:
|
| 1561 |
+
images = []
|
| 1562 |
+
images.append((first_frame, 0, 1.0))
|
| 1563 |
+
|
| 1564 |
+
if generation_mode == "Interpolate":
|
| 1565 |
+
if end_frame is not None:
|
| 1566 |
+
end_idx = max(0, num_frames - 1)
|
| 1567 |
+
images.append((end_frame, end_idx, 0.5))
|
| 1568 |
|
| 1569 |
embeddings, final_prompt, status = encode_prompt(
|
| 1570 |
prompt=prompt,
|
| 1571 |
enhance_prompt=enhance_prompt,
|
| 1572 |
+
input_image=first_frame,
|
| 1573 |
seed=current_seed,
|
| 1574 |
negative_prompt="",
|
| 1575 |
)
|
|
|
|
| 1662 |
return duration_s
|
| 1663 |
|
| 1664 |
def on_mode_change(selected: str):
|
| 1665 |
+
is_motion = (selected == "Motion Control")
|
| 1666 |
+
is_interpolate = (selected == "Interpolate")
|
| 1667 |
|
| 1668 |
+
return (gr.update(visible=is_motion), gr.update(visible=is_interpolate))
|
| 1669 |
|
| 1670 |
|
| 1671 |
|
|
|
|
| 2478 |
idx = int(idx)
|
| 2479 |
|
| 2480 |
# Read the example row from your list
|
| 2481 |
+
img, prompt_txt, cam, res, mode, vid, aud, end_img = examples_list[idx]
|
| 2482 |
|
| 2483 |
img_path = img if img else None
|
| 2484 |
vid_path = vid if vid else None
|
|
|
|
| 2491 |
mode_update = mode
|
| 2492 |
video_update = gr.update(value=vid_path, visible=(mode == "Motion Control"))
|
| 2493 |
audio_update = aud_path
|
| 2494 |
+
end_image = end_img
|
| 2495 |
|
| 2496 |
return (
|
| 2497 |
input_image_update,
|
|
|
|
| 2502 |
video_update,
|
| 2503 |
audio_update,
|
| 2504 |
audio_update,
|
| 2505 |
+
end_image,
|
| 2506 |
)
|
| 2507 |
|
| 2508 |
|
|
|
|
| 2534 |
with gr.Column(elem_id="col-container"):
|
| 2535 |
with gr.Row(elem_id="mode-row"):
|
| 2536 |
radioanimated_mode = RadioAnimated(
|
| 2537 |
+
choices=["Image-to-Video", "Interpolate", "Motion Control"],
|
| 2538 |
value="Image-to-Video",
|
| 2539 |
elem_id="radioanimated_mode"
|
| 2540 |
)
|
|
|
|
| 2543 |
|
| 2544 |
with gr.Row():
|
| 2545 |
|
| 2546 |
+
first_frame = gr.Image(
|
| 2547 |
label="First Frame (Optional)",
|
| 2548 |
type="filepath",
|
| 2549 |
height=256
|
| 2550 |
)
|
| 2551 |
+
|
| 2552 |
+
end_frame = gr.Image(
|
| 2553 |
+
label="Last Frame (Optional)",
|
| 2554 |
+
type="filepath",
|
| 2555 |
+
height=256,
|
| 2556 |
+
visible=False,
|
| 2557 |
+
)
|
| 2558 |
+
|
| 2559 |
input_video = gr.Video(
|
| 2560 |
label="Motion Reference Video",
|
| 2561 |
height=256,
|
|
|
|
| 2719 |
radioanimated_mode.change(
|
| 2720 |
fn=on_mode_change,
|
| 2721 |
inputs=radioanimated_mode,
|
| 2722 |
+
outputs=[input_video, end_frame],
|
| 2723 |
api_visibility="private",
|
| 2724 |
)
|
| 2725 |
|
|
|
|
| 2747 |
generate_btn.click(
|
| 2748 |
fn=generate_video,
|
| 2749 |
inputs=[
|
| 2750 |
+
first_frame,
|
| 2751 |
+
end_frame,
|
| 2752 |
prompt,
|
| 2753 |
duration,
|
| 2754 |
input_video,
|
|
|
|
| 2772 |
"16:9",
|
| 2773 |
"Image-to-Video",
|
| 2774 |
None,
|
| 2775 |
+
"examples/supergirl.m4a",
|
| 2776 |
+
None,
|
| 2777 |
+
],
|
| 2778 |
+
[
|
| 2779 |
+
"examples/frame3.png",
|
| 2780 |
+
"a woman in a white dress standing in a supermarket, looking at a stack of pomegranates, she picks one and takes a bite, the camera zooms in to a close up of the pomegranate seeds. A calm music is playing in the supermarket and you can hear her taking a bite.",
|
| 2781 |
+
"Zoom In",
|
| 2782 |
+
"16:9",
|
| 2783 |
+
"Interpolate",
|
| 2784 |
+
None,
|
| 2785 |
+
None,
|
| 2786 |
+
"examples/frame4.png",
|
| 2787 |
],
|
| 2788 |
[
|
| 2789 |
"examples/supergirl.png",
|
|
|
|
| 2793 |
"Image-to-Video",
|
| 2794 |
None,
|
| 2795 |
None,
|
| 2796 |
+
None,
|
| 2797 |
],
|
| 2798 |
[
|
| 2799 |
"examples/clay.png",
|
|
|
|
| 2803 |
"Motion Control",
|
| 2804 |
"examples/tiktok.mp4",
|
| 2805 |
None,
|
| 2806 |
+
None,
|
| 2807 |
],
|
| 2808 |
[
|
| 2809 |
"examples/paint.png",
|
|
|
|
| 2813 |
"Motion Control",
|
| 2814 |
"examples/tiktok.mp4",
|
| 2815 |
None,
|
| 2816 |
+
None,
|
| 2817 |
],
|
| 2818 |
[
|
| 2819 |
"examples/highland.png",
|
|
|
|
| 2823 |
"Image-to-Video",
|
| 2824 |
None,
|
| 2825 |
None,
|
| 2826 |
+
None,
|
| 2827 |
],
|
| 2828 |
[
|
| 2829 |
"examples/wednesday.png",
|
|
|
|
| 2833 |
"Image-to-Video",
|
| 2834 |
None,
|
| 2835 |
None,
|
| 2836 |
+
None,
|
| 2837 |
],
|
| 2838 |
[
|
| 2839 |
"examples/astronaut.png",
|
|
|
|
| 2843 |
"Image-to-Video",
|
| 2844 |
None,
|
| 2845 |
None,
|
| 2846 |
+
None,
|
| 2847 |
],
|
| 2848 |
]
|
| 2849 |
|
| 2850 |
examples_obj = create_examples(
|
| 2851 |
examples=examples_list,
|
| 2852 |
fn=generate_video_example,
|
| 2853 |
+
inputs=[first_frame, prompt_ui, camera_ui, resolution_ui, radioanimated_mode, input_video, audio_input, end_frame],
|
| 2854 |
outputs = [output_video],
|
| 2855 |
label="Examples",
|
| 2856 |
cache_examples=True,
|
|
|
|
| 2859 |
|
| 2860 |
preset_gallery = PresetGallery(
|
| 2861 |
items=[
|
| 2862 |
+
{"thumb": "examples/supergirl-2.png", "label": "Example 1", "title": "Image + Audio to Video" },
|
| 2863 |
+
{"thumb": "examples/frame3.png" , "label": "Example 2", "title": "First and Last Frame" },
|
| 2864 |
+
{"thumb": "examples/supergirl.png" , "label": "Example 3", "title": "Image to Video" },
|
| 2865 |
+
{"thumb": "examples/clay.png" , "label": "Example 4", "title": "Pose to Video" },
|
| 2866 |
+
{"thumb": "examples/paint.png" , "label": "Example 5", "title": "Pose to Video" },
|
| 2867 |
+
{"thumb": "examples/highland.png" , "label": "Example 6", "title": "Image to Video" },
|
| 2868 |
+
{"thumb": "examples/wednesday.png" , "label": "Example 7", "title": "Image to Video" },
|
| 2869 |
+
{"thumb": "examples/astronaut.png" , "label": "Example 8", "title": "Image to Video" },
|
| 2870 |
+
|
| 2871 |
],
|
| 2872 |
title="Click on Our Examples",
|
| 2873 |
)
|
|
|
|
| 2897 |
fn=apply_example,
|
| 2898 |
inputs=preset_gallery,
|
| 2899 |
outputs=[
|
| 2900 |
+
first_frame,
|
| 2901 |
prompt_ui,
|
| 2902 |
camera_ui,
|
| 2903 |
resolution_ui,
|
|
|
|
| 2905 |
input_video,
|
| 2906 |
audio_input,
|
| 2907 |
audio_ui,
|
| 2908 |
+
end_frame,
|
| 2909 |
],
|
| 2910 |
api_visibility="private",
|
| 2911 |
).then(
|
packages/ltx-pipelines/src/ltx_pipelines/distilled.py
CHANGED
|
@@ -6,6 +6,7 @@ import torch
|
|
| 6 |
|
| 7 |
from ltx_core.components.diffusion_steps import EulerDiffusionStep
|
| 8 |
from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
|
|
|
|
| 9 |
from ltx_core.components.noisers import GaussianNoiser
|
| 10 |
from ltx_core.components.protocols import DiffusionStepProtocol
|
| 11 |
from ltx_core.conditioning import ConditioningItem, VideoConditionByKeyframeIndex, ConditioningError
|
|
@@ -34,6 +35,7 @@ from ltx_pipelines.utils.helpers import (
|
|
| 34 |
generate_enhanced_prompt,
|
| 35 |
get_device,
|
| 36 |
image_conditionings_by_replacing_latent,
|
|
|
|
| 37 |
simple_denoising_func,
|
| 38 |
)
|
| 39 |
from ltx_pipelines.utils.media_io import encode_video, load_video_conditioning
|
|
@@ -295,19 +297,8 @@ class DistilledPipeline:
|
|
| 295 |
strength=audio_strength,
|
| 296 |
)
|
| 297 |
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
text_encoder = self.model_ledger.text_encoder()
|
| 301 |
-
context_p = encode_text(text_encoder, prompts=[prompt])[0]
|
| 302 |
-
video_context, audio_context = context_p
|
| 303 |
-
|
| 304 |
-
torch.cuda.synchronize()
|
| 305 |
-
del text_encoder
|
| 306 |
-
utils.cleanup_memory()
|
| 307 |
-
else:
|
| 308 |
-
# Move pre-computed embeddings to device if needed
|
| 309 |
-
video_context = video_context.to(self.device)
|
| 310 |
-
audio_context = audio_context.to(self.device)
|
| 311 |
|
| 312 |
# Stage 1: Initial low resolution video generation.
|
| 313 |
# Load models only if not already cached
|
|
@@ -319,6 +310,7 @@ class DistilledPipeline:
|
|
| 319 |
self._transformer = self.model_ledger.transformer()
|
| 320 |
transformer = self._transformer
|
| 321 |
stage_1_sigmas = torch.Tensor(DISTILLED_SIGMA_VALUES).to(self.device)
|
|
|
|
| 322 |
|
| 323 |
def denoising_loop(
|
| 324 |
sigmas: torch.Tensor, video_state: LatentState, audio_state: LatentState, stepper: DiffusionStepProtocol
|
|
@@ -335,7 +327,13 @@ class DistilledPipeline:
|
|
| 335 |
),
|
| 336 |
)
|
| 337 |
|
| 338 |
-
stage_1_output_shape = VideoPixelShape(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
stage_1_conditionings = self._create_conditionings(
|
| 340 |
images=images,
|
| 341 |
video_conditioning=video_conditioning,
|
|
@@ -370,8 +368,13 @@ class DistilledPipeline:
|
|
| 370 |
|
| 371 |
stage_2_sigmas = torch.Tensor(STAGE_2_DISTILLED_SIGMA_VALUES).to(self.device)
|
| 372 |
stage_2_output_shape = VideoPixelShape(
|
| 373 |
-
batch=1,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
)
|
|
|
|
| 375 |
if apply_video_conditioning_to_stage2:
|
| 376 |
stage_2_conditionings = self._create_conditionings(
|
| 377 |
images=images,
|
|
@@ -383,7 +386,7 @@ class DistilledPipeline:
|
|
| 383 |
video_conditioning_frame_idx=video_conditioning_frame_idx,
|
| 384 |
)
|
| 385 |
else:
|
| 386 |
-
stage_2_conditionings =
|
| 387 |
images=images,
|
| 388 |
height=stage_2_output_shape.height,
|
| 389 |
width=stage_2_output_shape.width,
|
|
@@ -437,15 +440,37 @@ class DistilledPipeline:
|
|
| 437 |
video_conditioning_frame_idx: int,
|
| 438 |
dtype: torch.dtype,
|
| 439 |
):
|
| 440 |
-
#
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 449 |
|
| 450 |
# 2) Optional: add video conditioning (IC-LoRA style)
|
| 451 |
if not video_conditioning:
|
|
|
|
| 6 |
|
| 7 |
from ltx_core.components.diffusion_steps import EulerDiffusionStep
|
| 8 |
from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
|
| 9 |
+
from ltx_core.components.schedulers import LTX2Scheduler
|
| 10 |
from ltx_core.components.noisers import GaussianNoiser
|
| 11 |
from ltx_core.components.protocols import DiffusionStepProtocol
|
| 12 |
from ltx_core.conditioning import ConditioningItem, VideoConditionByKeyframeIndex, ConditioningError
|
|
|
|
| 35 |
generate_enhanced_prompt,
|
| 36 |
get_device,
|
| 37 |
image_conditionings_by_replacing_latent,
|
| 38 |
+
image_conditionings_by_adding_guiding_latent,
|
| 39 |
simple_denoising_func,
|
| 40 |
)
|
| 41 |
from ltx_pipelines.utils.media_io import encode_video, load_video_conditioning
|
|
|
|
| 297 |
strength=audio_strength,
|
| 298 |
)
|
| 299 |
|
| 300 |
+
video_context = video_context.to(self.device)
|
| 301 |
+
audio_context = audio_context.to(self.device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
|
| 303 |
# Stage 1: Initial low resolution video generation.
|
| 304 |
# Load models only if not already cached
|
|
|
|
| 310 |
self._transformer = self.model_ledger.transformer()
|
| 311 |
transformer = self._transformer
|
| 312 |
stage_1_sigmas = torch.Tensor(DISTILLED_SIGMA_VALUES).to(self.device)
|
| 313 |
+
# stage_1_sigmas = LTX2Scheduler().execute(steps=40).to(dtype=torch.float32, device=self.device)
|
| 314 |
|
| 315 |
def denoising_loop(
|
| 316 |
sigmas: torch.Tensor, video_state: LatentState, audio_state: LatentState, stepper: DiffusionStepProtocol
|
|
|
|
| 327 |
),
|
| 328 |
)
|
| 329 |
|
| 330 |
+
stage_1_output_shape = VideoPixelShape(
|
| 331 |
+
batch=1,
|
| 332 |
+
frames=num_frames,
|
| 333 |
+
width=width,
|
| 334 |
+
height=height,
|
| 335 |
+
fps=frame_rate,
|
| 336 |
+
)
|
| 337 |
stage_1_conditionings = self._create_conditionings(
|
| 338 |
images=images,
|
| 339 |
video_conditioning=video_conditioning,
|
|
|
|
| 368 |
|
| 369 |
stage_2_sigmas = torch.Tensor(STAGE_2_DISTILLED_SIGMA_VALUES).to(self.device)
|
| 370 |
stage_2_output_shape = VideoPixelShape(
|
| 371 |
+
batch=1,
|
| 372 |
+
frames=num_frames,
|
| 373 |
+
width=width * 2,
|
| 374 |
+
height=height * 2,
|
| 375 |
+
fps=frame_rate,
|
| 376 |
)
|
| 377 |
+
|
| 378 |
if apply_video_conditioning_to_stage2:
|
| 379 |
stage_2_conditionings = self._create_conditionings(
|
| 380 |
images=images,
|
|
|
|
| 386 |
video_conditioning_frame_idx=video_conditioning_frame_idx,
|
| 387 |
)
|
| 388 |
else:
|
| 389 |
+
stage_2_conditionings = image_conditionings_by_adding_guiding_latent(
|
| 390 |
images=images,
|
| 391 |
height=stage_2_output_shape.height,
|
| 392 |
width=stage_2_output_shape.width,
|
|
|
|
| 440 |
video_conditioning_frame_idx: int,
|
| 441 |
dtype: torch.dtype,
|
| 442 |
):
|
| 443 |
+
# First frame: replace-latent (strong anchor)
|
| 444 |
+
replace_imgs = []
|
| 445 |
+
# End frame: guiding-latent (avoid empty slice)
|
| 446 |
+
guide_imgs = []
|
| 447 |
+
|
| 448 |
+
for path, frame_idx, strength in images:
|
| 449 |
+
if frame_idx == 0:
|
| 450 |
+
replace_imgs.append((path, frame_idx, strength))
|
| 451 |
+
else:
|
| 452 |
+
guide_imgs.append((path, frame_idx, strength))
|
| 453 |
+
|
| 454 |
+
conditionings = []
|
| 455 |
+
if replace_imgs:
|
| 456 |
+
conditionings += image_conditionings_by_replacing_latent(
|
| 457 |
+
images=replace_imgs,
|
| 458 |
+
height=height,
|
| 459 |
+
width=width,
|
| 460 |
+
video_encoder=video_encoder,
|
| 461 |
+
dtype=dtype,
|
| 462 |
+
device=self.device,
|
| 463 |
+
)
|
| 464 |
+
if guide_imgs:
|
| 465 |
+
conditionings += image_conditionings_by_adding_guiding_latent(
|
| 466 |
+
images=guide_imgs,
|
| 467 |
+
height=height,
|
| 468 |
+
width=width,
|
| 469 |
+
video_encoder=video_encoder,
|
| 470 |
+
dtype=dtype,
|
| 471 |
+
device=self.device,
|
| 472 |
+
)
|
| 473 |
+
|
| 474 |
|
| 475 |
# 2) Optional: add video conditioning (IC-LoRA style)
|
| 476 |
if not video_conditioning:
|