Spaces:
Runtime error
Runtime error
Merge
Browse files- app_start_end.py +354 -13
app_start_end.py
CHANGED
|
@@ -313,7 +313,277 @@ def set_mp4_comments_imageio_ffmpeg(input_file, comments):
|
|
| 313 |
return False
|
| 314 |
|
| 315 |
@torch.no_grad()
|
| 316 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
def encode_prompt(prompt, n_prompt):
|
| 318 |
llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
|
| 319 |
|
|
@@ -855,18 +1125,18 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
|
|
| 855 |
stream.output_queue.push(('end', None))
|
| 856 |
return
|
| 857 |
|
| 858 |
-
def
|
| 859 |
return allocation_time
|
| 860 |
|
| 861 |
# Remove this decorator if you run locally
|
| 862 |
-
@spaces.GPU(duration=
|
| 863 |
-
def
|
| 864 |
):
|
| 865 |
start = time.time()
|
| 866 |
global stream
|
| 867 |
stream = AsyncStream()
|
| 868 |
|
| 869 |
-
async_run(worker_start_end, input_image, end_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, fps_number)
|
| 870 |
|
| 871 |
output_filename = None
|
| 872 |
|
|
@@ -895,7 +1165,7 @@ def process_start_end_on_gpu(input_image, image_position, end_image, prompts, ge
|
|
| 895 |
"You can upscale the result with RIFE. To make all your generated scenes consistent, you can then apply a face swap on the main character. If you do not see the generated video above, the process may have failed. See the logs for more information. If you see an error like ''NVML_SUCCESS == r INTERNAL ASSERT FAILED'', you probably haven't enough VRAM. Test an example or other options to compare. You can share your inputs to the original space or set your space in public for a peer review.", gr.update(interactive=True), gr.update(interactive=False), gr.update(visible = False)
|
| 896 |
break
|
| 897 |
|
| 898 |
-
def
|
| 899 |
image_position=0,
|
| 900 |
end_image=None,
|
| 901 |
prompt="",
|
|
@@ -947,7 +1217,7 @@ def process_start_end(input_image,
|
|
| 947 |
|
| 948 |
yield gr.update(label="Previewed Frames"), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
|
| 949 |
|
| 950 |
-
yield from
|
| 951 |
image_position,
|
| 952 |
end_image,
|
| 953 |
prompts,
|
|
@@ -1120,7 +1390,7 @@ with block:
|
|
| 1120 |
local_storage = gr.BrowserState(default_local_storage)
|
| 1121 |
with gr.Row():
|
| 1122 |
with gr.Column():
|
| 1123 |
-
generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Video Extension", "video"]], elem_id="generation-mode", label="Generation mode", value = "image")
|
| 1124 |
text_to_video_hint = gr.HTML("Text-to-Video badly works with a flash effect at the start. I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
|
| 1125 |
input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
|
| 1126 |
end_image = gr.Image(sources='upload', type="numpy", label="End Frame (Optional)", height=320)
|
|
@@ -1244,7 +1514,7 @@ with block:
|
|
| 1244 |
]
|
| 1245 |
],
|
| 1246 |
run_on_click = True,
|
| 1247 |
-
fn =
|
| 1248 |
inputs = ips,
|
| 1249 |
outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
|
| 1250 |
cache_examples = torch.cuda.device_count() > 0,
|
|
@@ -1376,7 +1646,43 @@ with block:
|
|
| 1376 |
],
|
| 1377 |
],
|
| 1378 |
run_on_click = True,
|
| 1379 |
-
fn =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1380 |
inputs = ips,
|
| 1381 |
outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
|
| 1382 |
cache_examples = torch.cuda.device_count() > 0,
|
|
@@ -1471,7 +1777,7 @@ with block:
|
|
| 1471 |
]
|
| 1472 |
],
|
| 1473 |
run_on_click = True,
|
| 1474 |
-
fn =
|
| 1475 |
inputs = ips,
|
| 1476 |
outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
|
| 1477 |
cache_examples = False,
|
|
@@ -1602,7 +1908,42 @@ with block:
|
|
| 1602 |
]
|
| 1603 |
],
|
| 1604 |
run_on_click = True,
|
| 1605 |
-
fn =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1606 |
inputs = ips,
|
| 1607 |
outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
|
| 1608 |
cache_examples = False,
|
|
@@ -1714,7 +2055,7 @@ with block:
|
|
| 1714 |
timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
|
| 1715 |
start_button.click(fn = check_parameters, inputs = [
|
| 1716 |
generation_mode, input_image, input_video
|
| 1717 |
-
], outputs = [end_button, warning], queue = False, show_progress = False).success(fn=
|
| 1718 |
start_button_video.click(fn = check_parameters, inputs = [
|
| 1719 |
generation_mode, input_image, input_video
|
| 1720 |
], outputs = [end_button, warning], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button, warning], scroll_to_output = True)
|
|
|
|
| 313 |
return False
|
| 314 |
|
| 315 |
@torch.no_grad()
def worker(input_image, image_position, end_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, fps_number, resolution=640, enable_preview=True):
    """Generate a video from one input image, section by section, and report through ``stream``.

    The function communicates only through the module-level ``stream``:
    it pushes ``('progress', (preview, description, html))``,
    ``('file', path)`` and finally ``('end', None)`` onto
    ``stream.output_queue``, and stops early when
    ``stream.input_queue.top()`` is ``'end'``. It always returns ``None``.

    Args:
        input_image: Image array unpacked as ``(H, W, C)``.
        image_position: ``0`` generates forward only. Any other value first
            generates backward from the section at that percentage of the
            video down to section 0, then forward from that section.
        end_image: Not used here. Accepted so this function can be called
            with the same positional arguments as ``worker_start_end``.
        prompts: Sequence of prompts, one per section; the last one is
            reused when there are more sections than prompts.
        n_prompt: Negative prompt; only encoded when ``cfg != 1``.
        seed: Seed of the CPU ``torch.Generator`` used for sampling.
        total_second_length: Requested video length, combined with
            ``fps_number`` and ``latent_window_size`` to get the section count.
        latent_window_size: Number of latent frames generated per section.
        steps: Number of inference steps per section.
        cfg: Passed as ``real_guidance_scale``.
        gs: Passed as ``distilled_guidance_scale``.
        rs: Passed as ``guidance_rescale``.
        gpu_memory_preservation: Passed as ``preserved_memory_gb`` when the
            transformer is moved to the GPU in low-VRAM mode.
        use_teacache: Whether to enable TeaCache on the transformer.
        mp4_crf: CRF value passed to ``save_bcthw_as_mp4``.
        fps_number: Frame rate of the written video.
        resolution: Passed to ``find_nearest_bucket``. The body always read
            this name but it was never a parameter, which raised
            ``NameError``. 640 is an assumed fallback - TODO confirm and
            forward the user's value from the caller.
        enable_preview: When true, a preview is pushed at every sampling step
            and an MP4 is written after every section; otherwise only the
            final MP4 is written. Previously read without being defined.
            The default ``True`` is an assumed fallback - TODO confirm.
    """
    def encode_prompt(prompt, n_prompt):
        """Encode one prompt and the negative prompt into transformer-dtype tensors."""
        llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

        if cfg == 1:
            # The negative branch is replaced by zeros, so the encoders are not run for it.
            llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
        else:
            llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

        llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
        llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)

        llama_vec = llama_vec.to(transformer.dtype)
        llama_vec_n = llama_vec_n.to(transformer.dtype)
        clip_l_pooler = clip_l_pooler.to(transformer.dtype)
        clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
        return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]

    total_latent_sections = (total_second_length * fps_number) / (latent_window_size * 4)
    total_latent_sections = int(max(round(total_latent_sections), 1))

    # image_position is a percentage; clamp the resulting section into [0, total - 1].
    first_section_index = max(min(math.floor(image_position * (total_latent_sections - 1) / 100), (total_latent_sections - 1)), 0)
    section_index = first_section_index
    forward = (image_position == 0)

    job_id = generate_timestamp()

    stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))

    try:
        # Clean GPU
        if not high_vram:
            unload_complete_models(
                text_encoder, text_encoder_2, image_encoder, vae, transformer
            )

        # Text encoding

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))

        if not high_vram:
            # Since we only encode one text - that is one model move and one encode -
            # offload is the same time consumption since it is also one load and one encode.
            fake_diffusers_current_device(text_encoder, gpu)
            load_model_as_complete(text_encoder_2, target_device=gpu)

        # Prompts beyond the number of sections are never used, so they are not encoded.
        prompt_parameters = [
            encode_prompt(prompt_part, n_prompt)
            for prompt_part in prompts[:total_latent_sections]
        ]

        # Clean GPU
        if not high_vram:
            unload_complete_models(
                text_encoder, text_encoder_2
            )

        # Processing input image

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))

        H, W, _ = input_image.shape
        height, width = find_nearest_bucket(H, W, resolution=resolution)

        def get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram):
            """Return ``[start_latent, image_encoder_last_hidden_state]`` for the input image."""
            input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)

            # Scale pixel values from [0, 255] to [-1, 1], then reorder the axes
            # and add two singleton axes for the VAE.
            input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
            input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]

            # VAE encoding

            stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))

            if not high_vram:
                load_model_as_complete(vae, target_device=gpu)

            start_latent = vae_encode(input_image_pt, vae)

            # CLIP Vision

            stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))

            if not high_vram:
                unload_complete_models(vae)
                load_model_as_complete(image_encoder, target_device=gpu)

            image_encoder_last_hidden_state = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder).last_hidden_state

            if not high_vram:
                unload_complete_models(image_encoder)

            return [start_latent, image_encoder_last_hidden_state]

        [start_latent, image_encoder_last_hidden_state] = get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram)

        # Dtype

        image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)

        # Sampling

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))

        rnd = torch.Generator("cpu").manual_seed(seed)

        # 16 + 2 + 1 zero frames provide the 4x / 2x / 1x context slots before anything is generated.
        history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32, device=cpu)
        start_latent = start_latent.to(history_latents)
        history_pixels = None

        # Real frames sit at the end of the history when going forward, at the start when going backward.
        history_latents = torch.cat([history_latents, start_latent] if forward else [start_latent, history_latents], dim=2)
        total_generated_latent_frames = 1

        if enable_preview:
            def callback(d):
                """Push a decoded preview and progress text; abort if the user asked to stop."""
                preview = d['denoised']
                preview = vae_decode_fake(preview)

                preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
                preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')

                if stream.input_queue.top() == 'end':
                    stream.output_queue.push(('end', None))
                    # Raised to break out of the sampler; caught by the handler at the end of worker().
                    raise KeyboardInterrupt('User ends the task.')

                current_step = d['i'] + 1
                percentage = int(100.0 * current_step / steps)
                hint = f'Sampling {current_step}/{steps}'
                desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps_number) :.2f} seconds (FPS-{fps_number}), Resolution: {height}px * {width}px. The video is being extended now ...'
                stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
                return
        else:
            def callback(d):
                """Do nothing: previews are disabled."""
                return

        indices = torch.arange(0, 1 + 16 + 2 + 1 + latent_window_size).unsqueeze(0)
        if forward:
            clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
            clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
        else:
            # Mirrored layout: the generated window comes first and the start frame last.
            latent_indices, clean_latent_1x_indices, clean_latent_2x_indices, clean_latent_4x_indices, clean_latent_indices_start = indices.split([latent_window_size, 1, 2, 16, 1], dim=1)
            clean_latent_indices = torch.cat([clean_latent_1x_indices, clean_latent_indices_start], dim=1)

        def post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream):
            """Merge a generated section into the history, decode it and optionally write an MP4.

            Returns ``[total_generated_latent_frames, history_latents, history_pixels]``
            with the updated values.
            """
            total_generated_latent_frames += int(generated_latents.shape[2])
            history_latents = torch.cat([history_latents, generated_latents.to(history_latents)] if forward else [generated_latents.to(history_latents), history_latents], dim=2)

            if not high_vram:
                offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
                load_model_as_complete(vae, target_device=gpu)

            if history_pixels is None:
                # First decode: decode every real latent frame generated so far.
                real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :] if forward else history_latents[:, :, :total_generated_latent_frames, :, :]
                history_pixels = vae_decode(real_history_latents, vae).cpu()
            else:
                # Later decodes: decode only the newest latents and blend them into the existing pixels.
                section_latent_frames = latent_window_size * 2
                overlapped_frames = latent_window_size * 4 - 3

                if forward:
                    real_history_latents = history_latents[:, :, -min(section_latent_frames, total_generated_latent_frames):, :, :]
                    history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
                else:
                    real_history_latents = history_latents[:, :, :min(section_latent_frames, total_generated_latent_frames), :, :]
                    history_pixels = soft_append_bcthw(vae_decode(real_history_latents, vae).cpu(), history_pixels, overlapped_frames)

            if not high_vram:
                unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)

            # The last section to be generated is section 0 when the whole video is built
            # backward from the final section, and the final section otherwise.
            if enable_preview or section_index == (0 if first_section_index == (total_latent_sections - 1) else (total_latent_sections - 1)):
                output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')

                save_bcthw_as_mp4(history_pixels, output_filename, fps=fps_number, crf=mp4_crf)

                print(f'Decoded. Current latent shape pixel shape {history_pixels.shape}')

                stream.output_queue.push(('file', output_filename))
            return [total_generated_latent_frames, history_latents, history_pixels]

        while section_index < total_latent_sections:
            if stream.input_queue.top() == 'end':
                stream.output_queue.push(('end', None))
                return

            print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')

            prompt_index = min(section_index, len(prompt_parameters) - 1)

            [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters[prompt_index]

            # Drop the stored reference once this entry cannot be selected again;
            # the last entry is kept while later sections may still reuse it.
            if prompt_index < len(prompt_parameters) - 1 or (prompt_index == total_latent_sections - 1):
                prompt_parameters[prompt_index] = None

            if not high_vram:
                unload_complete_models()
                move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)

            if use_teacache:
                transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
            else:
                transformer.initialize_teacache(enable_teacache=False)

            if forward:
                clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -(16 + 2 + 1):, :, :].split([16, 2, 1], dim=2)
                clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
            else:
                clean_latents_1x, clean_latents_2x, clean_latents_4x = history_latents[:, :, :(1 + 2 + 16), :, :].split([1, 2, 16], dim=2)
                clean_latents = torch.cat([clean_latents_1x, start_latent], dim=2)

            generated_latents = sample_hunyuan(
                transformer=transformer,
                sampler='unipc',
                width=width,
                height=height,
                frames=latent_window_size * 4 - 3,
                real_guidance_scale=cfg,
                distilled_guidance_scale=gs,
                guidance_rescale=rs,
                # shift=3.0,
                num_inference_steps=steps,
                generator=rnd,
                prompt_embeds=llama_vec,
                prompt_embeds_mask=llama_attention_mask,
                prompt_poolers=clip_l_pooler,
                negative_prompt_embeds=llama_vec_n,
                negative_prompt_embeds_mask=llama_attention_mask_n,
                negative_prompt_poolers=clip_l_pooler_n,
                device=gpu,
                dtype=torch.bfloat16,
                image_embeddings=image_encoder_last_hidden_state,
                latent_indices=latent_indices,
                clean_latents=clean_latents,
                clean_latent_indices=clean_latent_indices,
                clean_latents_2x=clean_latents_2x,
                clean_latent_2x_indices=clean_latent_2x_indices,
                clean_latents_4x=clean_latents_4x,
                clean_latent_4x_indices=clean_latent_4x_indices,
                callback=callback,
            )

            [total_generated_latent_frames, history_latents, history_pixels] = post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)

            if not forward:
                if section_index > 0:
                    section_index -= 1
                else:
                    # Backward pass finished: switch to the forward index layout ...
                    clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
                    clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)

                    # ... and move the zero padding in front of the real frames,
                    # which is where the forward branch expects it.
                    real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
                    zero_latents = history_latents[:, :, total_generated_latent_frames:, :, :]
                    history_latents = torch.cat([zero_latents, real_history_latents], dim=2)
                    real_history_latents = zero_latents = None

                    forward = True
                    section_index = first_section_index

            if forward:
                section_index += 1
    except BaseException:
        # Deliberately as broad as the original bare ``except``: the preview callback
        # aborts sampling by raising KeyboardInterrupt, which ``except Exception``
        # would not catch.
        traceback.print_exc()

        if not high_vram:
            unload_complete_models(
                text_encoder, text_encoder_2, image_encoder, vae, transformer
            )

    stream.output_queue.push(('end', None))
    return
|
| 584 |
+
|
| 585 |
+
@torch.no_grad()
|
| 586 |
+
def worker_start_end(input_image, image_position, end_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, fps_number):
|
| 587 |
def encode_prompt(prompt, n_prompt):
|
| 588 |
llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
|
| 589 |
|
|
|
|
| 1125 |
stream.output_queue.push(('end', None))
|
| 1126 |
return
|
| 1127 |
|
| 1128 |
+
def get_duration(input_image, image_position, end_image, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
    """Return the GPU allocation time requested for one generation.

    Passed as ``duration=`` to the ``@spaces.GPU`` decorator on
    ``process_on_gpu``, so the parameter list mirrors that function's.
    Every argument except ``allocation_time`` is ignored.

    Returns:
        ``allocation_time`` unchanged (presumably a number of seconds -
        TODO confirm against the ``spaces.GPU`` documentation).
    """
    return allocation_time
|
| 1130 |
|
| 1131 |
# Remove this decorator if you run locally
|
| 1132 |
+
@spaces.GPU(duration=get_duration)
|
| 1133 |
+
def process_on_gpu(input_image, image_position, end_image, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number
|
| 1134 |
):
|
| 1135 |
start = time.time()
|
| 1136 |
global stream
|
| 1137 |
stream = AsyncStream()
|
| 1138 |
|
| 1139 |
+
async_run(worker_start_end if generation_mode == "start_end" else worker, input_image, image_position, end_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, fps_number)
|
| 1140 |
|
| 1141 |
output_filename = None
|
| 1142 |
|
|
|
|
| 1165 |
"You can upscale the result with RIFE. To make all your generated scenes consistent, you can then apply a face swap on the main character. If you do not see the generated video above, the process may have failed. See the logs for more information. If you see an error like ''NVML_SUCCESS == r INTERNAL ASSERT FAILED'', you probably haven't enough VRAM. Test an example or other options to compare. You can share your inputs to the original space or set your space in public for a peer review.", gr.update(interactive=True), gr.update(interactive=False), gr.update(visible = False)
|
| 1166 |
break
|
| 1167 |
|
| 1168 |
+
def process(input_image,
|
| 1169 |
image_position=0,
|
| 1170 |
end_image=None,
|
| 1171 |
prompt="",
|
|
|
|
| 1217 |
|
| 1218 |
yield gr.update(label="Previewed Frames"), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
|
| 1219 |
|
| 1220 |
+
yield from process_on_gpu(input_image,
|
| 1221 |
image_position,
|
| 1222 |
end_image,
|
| 1223 |
prompts,
|
|
|
|
| 1390 |
local_storage = gr.BrowserState(default_local_storage)
|
| 1391 |
with gr.Row():
|
| 1392 |
with gr.Column():
|
| 1393 |
+
generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Start frame & End frame", "start_end"], ["Video Extension", "video"]], elem_id="generation-mode", label="Generation mode", value = "image")
|
| 1394 |
text_to_video_hint = gr.HTML("Text-to-Video badly works with a flash effect at the start. I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
|
| 1395 |
input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
|
| 1396 |
end_image = gr.Image(sources='upload', type="numpy", label="End Frame (Optional)", height=320)
|
|
|
|
| 1514 |
]
|
| 1515 |
],
|
| 1516 |
run_on_click = True,
|
| 1517 |
+
fn = process,
|
| 1518 |
inputs = ips,
|
| 1519 |
outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
|
| 1520 |
cache_examples = torch.cuda.device_count() > 0,
|
|
|
|
| 1646 |
],
|
| 1647 |
],
|
| 1648 |
run_on_click = True,
|
| 1649 |
+
fn = process,
|
| 1650 |
+
inputs = ips,
|
| 1651 |
+
outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
|
| 1652 |
+
cache_examples = torch.cuda.device_count() > 0,
|
| 1653 |
+
)
|
| 1654 |
+
|
| 1655 |
+
with gr.Row(elem_id="start_end_examples", visible=False):
|
| 1656 |
+
gr.Examples(
|
| 1657 |
+
label = "Examples from start and end frames",
|
| 1658 |
+
examples = [
|
| 1659 |
+
[
|
| 1660 |
+
"./img_examples/Example2.webp", # input_image
|
| 1661 |
+
0, # image_position
|
| 1662 |
+
None, # end_image
|
| 1663 |
+
"A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
|
| 1664 |
+
"start_end", # generation_mode
|
| 1665 |
+
"Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
|
| 1666 |
+
True, # randomize_seed
|
| 1667 |
+
42, # seed
|
| 1668 |
+
True, # auto_allocation
|
| 1669 |
+
180, # allocation_time
|
| 1670 |
+
672, # resolution
|
| 1671 |
+
1, # total_second_length
|
| 1672 |
+
9, # latent_window_size
|
| 1673 |
+
30, # steps
|
| 1674 |
+
1.0, # cfg
|
| 1675 |
+
10.0, # gs
|
| 1676 |
+
0.0, # rs
|
| 1677 |
+
6, # gpu_memory_preservation
|
| 1678 |
+
False, # enable_preview
|
| 1679 |
+
False, # use_teacache
|
| 1680 |
+
16, # mp4_crf
|
| 1681 |
+
30 # fps_number
|
| 1682 |
+
],
|
| 1683 |
+
],
|
| 1684 |
+
run_on_click = True,
|
| 1685 |
+
fn = process,
|
| 1686 |
inputs = ips,
|
| 1687 |
outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
|
| 1688 |
cache_examples = torch.cuda.device_count() > 0,
|
|
|
|
| 1777 |
]
|
| 1778 |
],
|
| 1779 |
run_on_click = True,
|
| 1780 |
+
fn = process,
|
| 1781 |
inputs = ips,
|
| 1782 |
outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
|
| 1783 |
cache_examples = False,
|
|
|
|
| 1908 |
]
|
| 1909 |
],
|
| 1910 |
run_on_click = True,
|
| 1911 |
+
fn = process,
|
| 1912 |
+
inputs = ips,
|
| 1913 |
+
outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
|
| 1914 |
+
cache_examples = False,
|
| 1915 |
+
)
|
| 1916 |
+
|
| 1917 |
+
gr.Examples(
|
| 1918 |
+
label = "🖼️ Examples from start and end frames",
|
| 1919 |
+
examples = [
|
| 1920 |
+
[
|
| 1921 |
+
"./img_examples/Example1.png", # input_image
|
| 1922 |
+
0, # image_position
|
| 1923 |
+
None, # end_image
|
| 1924 |
+
"A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
|
| 1925 |
+
"start_end", # generation_mode
|
| 1926 |
+
"Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
|
| 1927 |
+
True, # randomize_seed
|
| 1928 |
+
42, # seed
|
| 1929 |
+
True, # auto_allocation
|
| 1930 |
+
180, # allocation_time
|
| 1931 |
+
672, # resolution
|
| 1932 |
+
1, # total_second_length
|
| 1933 |
+
9, # latent_window_size
|
| 1934 |
+
30, # steps
|
| 1935 |
+
1.0, # cfg
|
| 1936 |
+
10.0, # gs
|
| 1937 |
+
0.0, # rs
|
| 1938 |
+
6, # gpu_memory_preservation
|
| 1939 |
+
False, # enable_preview
|
| 1940 |
+
True, # use_teacache
|
| 1941 |
+
16, # mp4_crf
|
| 1942 |
+
30 # fps_number
|
| 1943 |
+
],
|
| 1944 |
+
],
|
| 1945 |
+
run_on_click = True,
|
| 1946 |
+
fn = process,
|
| 1947 |
inputs = ips,
|
| 1948 |
outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
|
| 1949 |
cache_examples = False,
|
|
|
|
| 2055 |
timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
|
| 2056 |
start_button.click(fn = check_parameters, inputs = [
|
| 2057 |
generation_mode, input_image, input_video
|
| 2058 |
+
], outputs = [end_button, warning], queue = False, show_progress = False).success(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning], scroll_to_output = True)
|
| 2059 |
start_button_video.click(fn = check_parameters, inputs = [
|
| 2060 |
generation_mode, input_image, input_video
|
| 2061 |
], outputs = [end_button, warning], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button, warning], scroll_to_output = True)
|