Spaces:
Running on Zero
Running on Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -214,7 +214,7 @@ class LTX23DistilledA2VPipeline:
|
|
| 214 |
|
| 215 |
# Stage 1: Generate sigmas using LTX2Scheduler with user-specified steps
|
| 216 |
empty_latent = torch.empty(VideoLatentShape.from_pixel_shape(
|
| 217 |
-
VideoPixelShape(batch=1, frames=num_frames, width=width
|
| 218 |
).to_torch_shape())
|
| 219 |
stage_1_sigmas = (
|
| 220 |
LTX2Scheduler()
|
|
@@ -243,25 +243,12 @@ class LTX23DistilledA2VPipeline:
|
|
| 243 |
),
|
| 244 |
)
|
| 245 |
|
| 246 |
-
def stage2_denoising_loop(sigmas: torch.Tensor, video_state, audio_state, stepper: DiffusionStepProtocol):
|
| 247 |
-
return res2s_audio_video_denoising_loop(
|
| 248 |
-
sigmas=sigmas,
|
| 249 |
-
video_state=video_state,
|
| 250 |
-
audio_state=audio_state,
|
| 251 |
-
stepper=stepper,
|
| 252 |
-
denoise_fn=simple_denoising_func(
|
| 253 |
-
video_context=v_context_p,
|
| 254 |
-
audio_context=a_context_p,
|
| 255 |
-
transformer=transformer, # noqa: F821
|
| 256 |
-
),
|
| 257 |
-
)
|
| 258 |
-
|
| 259 |
# ── Stage 1: Half resolution ──
|
| 260 |
stage_1_output_shape = VideoPixelShape(
|
| 261 |
batch=1,
|
| 262 |
frames=num_frames,
|
| 263 |
-
width=width
|
| 264 |
-
height=height
|
| 265 |
fps=frame_rate,
|
| 266 |
)
|
| 267 |
stage_1_conditionings = combined_image_conditionings(
|
|
@@ -291,42 +278,6 @@ class LTX23DistilledA2VPipeline:
|
|
| 291 |
torch.cuda.synchronize()
|
| 292 |
cleanup_memory()
|
| 293 |
|
| 294 |
-
# ── Upscaling ──
|
| 295 |
-
upscaled_video_latent = upsample_video(
|
| 296 |
-
latent=video_state.latent[:1],
|
| 297 |
-
video_encoder=video_encoder,
|
| 298 |
-
upsampler=self.model_ledger.spatial_upsampler(),
|
| 299 |
-
)
|
| 300 |
-
|
| 301 |
-
# ── Stage 2: Full resolution ──
|
| 302 |
-
stage_2_sigmas = torch.tensor(STAGE_2_DISTILLED_SIGMA_VALUES, device=self.device)
|
| 303 |
-
stage_2_output_shape = VideoPixelShape(batch=1, frames=num_frames, width=width, height=height, fps=frame_rate)
|
| 304 |
-
stage_2_conditionings = combined_image_conditionings(
|
| 305 |
-
images=images,
|
| 306 |
-
height=stage_2_output_shape.height,
|
| 307 |
-
width=stage_2_output_shape.width,
|
| 308 |
-
video_encoder=video_encoder,
|
| 309 |
-
dtype=dtype,
|
| 310 |
-
device=self.device,
|
| 311 |
-
)
|
| 312 |
-
video_state, audio_state = denoise_audio_video(
|
| 313 |
-
output_shape=stage_2_output_shape,
|
| 314 |
-
conditionings=stage_2_conditionings,
|
| 315 |
-
noiser=noiser,
|
| 316 |
-
sigmas=stage_2_sigmas,
|
| 317 |
-
stepper=stepper,
|
| 318 |
-
denoising_loop_fn=stage2_denoising_loop,
|
| 319 |
-
components=self.pipeline_components,
|
| 320 |
-
dtype=dtype,
|
| 321 |
-
device=self.device,
|
| 322 |
-
noise_scale=stage_2_sigmas[0],
|
| 323 |
-
initial_video_latent=upscaled_video_latent,
|
| 324 |
-
initial_audio_latent=audio_state.latent,
|
| 325 |
-
)
|
| 326 |
-
|
| 327 |
-
torch.cuda.synchronize()
|
| 328 |
-
cleanup_memory()
|
| 329 |
-
|
| 330 |
# ── Decode both video and audio ──
|
| 331 |
decoded_video = vae_decode_video(
|
| 332 |
video_state.latent,
|
|
@@ -856,7 +807,7 @@ css = """
|
|
| 856 |
"""
|
| 857 |
|
| 858 |
with gr.Blocks(title="LTX-2.3 Distilled with LoRAs, Negative Prompting, and Advanced Settings") as demo:
|
| 859 |
-
gr.Markdown("# LTX-2.3
|
| 860 |
gr.Markdown(
|
| 861 |
"High-quality text/image-to-video with cached LoRA state + CFG guidance. "
|
| 862 |
"[[Model]](https://huggingface.co/Lightricks/LTX-2.3)"
|
|
|
|
| 214 |
|
| 215 |
# Stage 1: Generate sigmas using LTX2Scheduler with user-specified steps
|
| 216 |
empty_latent = torch.empty(VideoLatentShape.from_pixel_shape(
|
| 217 |
+
VideoPixelShape(batch=1, frames=num_frames, width=width, height=height, fps=frame_rate)
|
| 218 |
).to_torch_shape())
|
| 219 |
stage_1_sigmas = (
|
| 220 |
LTX2Scheduler()
|
|
|
|
| 243 |
),
|
| 244 |
)
|
| 245 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
# ── Stage 1: Half resolution ──
|
| 247 |
stage_1_output_shape = VideoPixelShape(
|
| 248 |
batch=1,
|
| 249 |
frames=num_frames,
|
| 250 |
+
width=width,
|
| 251 |
+
height=height,
|
| 252 |
fps=frame_rate,
|
| 253 |
)
|
| 254 |
stage_1_conditionings = combined_image_conditionings(
|
|
|
|
| 278 |
torch.cuda.synchronize()
|
| 279 |
cleanup_memory()
|
| 280 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
# ── Decode both video and audio ──
|
| 282 |
decoded_video = vae_decode_video(
|
| 283 |
video_state.latent,
|
|
|
|
| 807 |
"""
|
| 808 |
|
| 809 |
with gr.Blocks(title="LTX-2.3 Distilled with LoRAs, Negative Prompting, and Advanced Settings") as demo:
|
| 810 |
+
gr.Markdown("# LTX-2.3 One-Stage HQ Video Generation")
|
| 811 |
gr.Markdown(
|
| 812 |
"High-quality text/image-to-video with cached LoRA state + CFG guidance. "
|
| 813 |
"[[Model]](https://huggingface.co/Lightricks/LTX-2.3)"
|