Fabrice-TIERCELIN committed on
Commit
27fea43
·
verified ·
1 Parent(s): 6ef4b25
Files changed (1) hide show
  1. app_start_end.py +354 -13
app_start_end.py CHANGED
@@ -313,7 +313,277 @@ def set_mp4_comments_imageio_ffmpeg(input_file, comments):
313
  return False
314
 
315
  @torch.no_grad()
316
- def worker_start_end(input_image, end_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, fps_number):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
  def encode_prompt(prompt, n_prompt):
318
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
319
 
@@ -855,18 +1125,18 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
855
  stream.output_queue.push(('end', None))
856
  return
857
 
858
- def get_duration_start_end(input_image, image_position, end_image, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
859
  return allocation_time
860
 
861
  # Remove this decorator if you run on local
862
- @spaces.GPU(duration=get_duration_start_end)
863
- def process_start_end_on_gpu(input_image, image_position, end_image, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number
864
  ):
865
  start = time.time()
866
  global stream
867
  stream = AsyncStream()
868
 
869
- async_run(worker_start_end, input_image, end_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, fps_number)
870
 
871
  output_filename = None
872
 
@@ -895,7 +1165,7 @@ def process_start_end_on_gpu(input_image, image_position, end_image, prompts, ge
895
  "You can upscale the result with RIFE. To make all your generated scenes consistent, you can then apply a face swap on the main character. If you do not see the generated video above, the process may have failed. See the logs for more information. If you see an error like ''NVML_SUCCESS == r INTERNAL ASSERT FAILED'', you probably haven't enough VRAM. Test an example or other options to compare. You can share your inputs to the original space or set your space in public for a peer review.", gr.update(interactive=True), gr.update(interactive=False), gr.update(visible = False)
896
  break
897
 
898
- def process_start_end(input_image,
899
  image_position=0,
900
  end_image=None,
901
  prompt="",
@@ -947,7 +1217,7 @@ def process_start_end(input_image,
947
 
948
  yield gr.update(label="Previewed Frames"), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
949
 
950
- yield from process_start_end_on_gpu(input_image,
951
  image_position,
952
  end_image,
953
  prompts,
@@ -1120,7 +1390,7 @@ with block:
1120
  local_storage = gr.BrowserState(default_local_storage)
1121
  with gr.Row():
1122
  with gr.Column():
1123
- generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Video Extension", "video"]], elem_id="generation-mode", label="Generation mode", value = "image")
1124
  text_to_video_hint = gr.HTML("Text-to-Video badly works with a flash effect at the start. I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
1125
  input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
1126
  end_image = gr.Image(sources='upload', type="numpy", label="End Frame (Optional)", height=320)
@@ -1244,7 +1514,7 @@ with block:
1244
  ]
1245
  ],
1246
  run_on_click = True,
1247
- fn = process_start_end,
1248
  inputs = ips,
1249
  outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1250
  cache_examples = torch.cuda.device_count() > 0,
@@ -1376,7 +1646,43 @@ with block:
1376
  ],
1377
  ],
1378
  run_on_click = True,
1379
- fn = process_start_end,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1380
  inputs = ips,
1381
  outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1382
  cache_examples = torch.cuda.device_count() > 0,
@@ -1471,7 +1777,7 @@ with block:
1471
  ]
1472
  ],
1473
  run_on_click = True,
1474
- fn = process_start_end,
1475
  inputs = ips,
1476
  outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1477
  cache_examples = False,
@@ -1602,7 +1908,42 @@ with block:
1602
  ]
1603
  ],
1604
  run_on_click = True,
1605
- fn = process_start_end,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1606
  inputs = ips,
1607
  outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1608
  cache_examples = False,
@@ -1714,7 +2055,7 @@ with block:
1714
  timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
1715
  start_button.click(fn = check_parameters, inputs = [
1716
  generation_mode, input_image, input_video
1717
- ], outputs = [end_button, warning], queue = False, show_progress = False).success(fn=process_start_end, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning], scroll_to_output = True)
1718
  start_button_video.click(fn = check_parameters, inputs = [
1719
  generation_mode, input_image, input_video
1720
  ], outputs = [end_button, warning], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button, warning], scroll_to_output = True)
 
313
  return False
314
 
315
  @torch.no_grad()
316
+ def worker(input_image, image_position, end_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, fps_number):
317
+ def encode_prompt(prompt, n_prompt):
318
+ llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
319
+
320
+ if cfg == 1:
321
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
322
+ else:
323
+ llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
324
+
325
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
326
+ llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
327
+
328
+ llama_vec = llama_vec.to(transformer.dtype)
329
+ llama_vec_n = llama_vec_n.to(transformer.dtype)
330
+ clip_l_pooler = clip_l_pooler.to(transformer.dtype)
331
+ clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
332
+ return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]
333
+
334
+ total_latent_sections = (total_second_length * fps_number) / (latent_window_size * 4)
335
+ total_latent_sections = int(max(round(total_latent_sections), 1))
336
+
337
+ first_section_index = max(min(math.floor(image_position * (total_latent_sections - 1) / 100), (total_latent_sections - 1)), 0)
338
+ section_index = first_section_index
339
+ forward = (image_position == 0)
340
+
341
+ job_id = generate_timestamp()
342
+
343
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
344
+
345
+ try:
346
+ # Clean GPU
347
+ if not high_vram:
348
+ unload_complete_models(
349
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
350
+ )
351
+
352
+ # Text encoding
353
+
354
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
355
+
356
+ if not high_vram:
357
+ fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
358
+ load_model_as_complete(text_encoder_2, target_device=gpu)
359
+
360
+ prompt_parameters = []
361
+
362
+ for prompt_part in prompts[:total_latent_sections]:
363
+ prompt_parameters.append(encode_prompt(prompt_part, n_prompt))
364
+
365
+ # Clean GPU
366
+ if not high_vram:
367
+ unload_complete_models(
368
+ text_encoder, text_encoder_2
369
+ )
370
+
371
+ # Processing input image
372
+
373
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))
374
+
375
+ H, W, C = input_image.shape
376
+ height, width = find_nearest_bucket(H, W, resolution=resolution)
377
+
378
+ def get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram):
379
+ input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
380
+
381
+ #Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
382
+
383
+ input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
384
+ input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
385
+
386
+ # VAE encoding
387
+
388
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
389
+
390
+ if not high_vram:
391
+ load_model_as_complete(vae, target_device=gpu)
392
+
393
+ start_latent = vae_encode(input_image_pt, vae)
394
+
395
+ # CLIP Vision
396
+
397
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
398
+
399
+ if not high_vram:
400
+ unload_complete_models(vae)
401
+ load_model_as_complete(image_encoder, target_device=gpu)
402
+
403
+ image_encoder_last_hidden_state = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder).last_hidden_state
404
+
405
+ if not high_vram:
406
+ unload_complete_models(image_encoder)
407
+
408
+ return [start_latent, image_encoder_last_hidden_state]
409
+
410
+ [start_latent, image_encoder_last_hidden_state] = get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram)
411
+
412
+ # Dtype
413
+
414
+ image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
415
+
416
+ # Sampling
417
+
418
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
419
+
420
+ rnd = torch.Generator("cpu").manual_seed(seed)
421
+
422
+ history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32, device=cpu)
423
+ start_latent = start_latent.to(history_latents)
424
+ history_pixels = None
425
+
426
+ history_latents = torch.cat([history_latents, start_latent] if forward else [start_latent, history_latents], dim=2)
427
+ total_generated_latent_frames = 1
428
+
429
+ if enable_preview:
430
+ def callback(d):
431
+ preview = d['denoised']
432
+ preview = vae_decode_fake(preview)
433
+
434
+ preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
435
+ preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
436
+
437
+ if stream.input_queue.top() == 'end':
438
+ stream.output_queue.push(('end', None))
439
+ raise KeyboardInterrupt('User ends the task.')
440
+
441
+ current_step = d['i'] + 1
442
+ percentage = int(100.0 * current_step / steps)
443
+ hint = f'Sampling {current_step}/{steps}'
444
+ desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps_number) :.2f} seconds (FPS-30), Resolution: {height}px * {width}px. The video is being extended now ...'
445
+ stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
446
+ return
447
+ else:
448
+ def callback(d):
449
+ return
450
+
451
+ indices = torch.arange(0, 1 + 16 + 2 + 1 + latent_window_size).unsqueeze(0)
452
+ if forward:
453
+ clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
454
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
455
+ else:
456
+ latent_indices, clean_latent_1x_indices, clean_latent_2x_indices, clean_latent_4x_indices, clean_latent_indices_start = indices.split([latent_window_size, 1, 2, 16, 1], dim=1)
457
+ clean_latent_indices = torch.cat([clean_latent_1x_indices, clean_latent_indices_start], dim=1)
458
+
459
+ def post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream):
460
+ total_generated_latent_frames += int(generated_latents.shape[2])
461
+ history_latents = torch.cat([history_latents, generated_latents.to(history_latents)] if forward else [generated_latents.to(history_latents), history_latents], dim=2)
462
+
463
+ if not high_vram:
464
+ offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
465
+ load_model_as_complete(vae, target_device=gpu)
466
+
467
+ if history_pixels is None:
468
+ real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :] if forward else history_latents[:, :, :total_generated_latent_frames, :, :]
469
+ history_pixels = vae_decode(real_history_latents, vae).cpu()
470
+ else:
471
+ section_latent_frames = latent_window_size * 2
472
+ overlapped_frames = latent_window_size * 4 - 3
473
+
474
+ if forward:
475
+ real_history_latents = history_latents[:, :, -min(section_latent_frames, total_generated_latent_frames):, :, :]
476
+ history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
477
+ else:
478
+ real_history_latents = history_latents[:, :, :min(section_latent_frames, total_generated_latent_frames), :, :]
479
+ history_pixels = soft_append_bcthw(vae_decode(real_history_latents, vae).cpu(), history_pixels, overlapped_frames)
480
+
481
+ if not high_vram:
482
+ unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
483
+
484
+ if enable_preview or section_index == (0 if first_section_index == (total_latent_sections - 1) else (total_latent_sections - 1)):
485
+ output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
486
+
487
+ save_bcthw_as_mp4(history_pixels, output_filename, fps=fps_number, crf=mp4_crf)
488
+
489
+ print(f'Decoded. Current latent shape pixel shape {history_pixels.shape}')
490
+
491
+ stream.output_queue.push(('file', output_filename))
492
+ return [total_generated_latent_frames, history_latents, history_pixels]
493
+
494
+ while section_index < total_latent_sections:
495
+ if stream.input_queue.top() == 'end':
496
+ stream.output_queue.push(('end', None))
497
+ return
498
+
499
+ print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
500
+
501
+ prompt_index = min(section_index, len(prompt_parameters) - 1)
502
+
503
+ [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters[prompt_index]
504
+
505
+ if prompt_index < len(prompt_parameters) - 1 or (prompt_index == total_latent_sections - 1):
506
+ prompt_parameters[prompt_index] = None
507
+
508
+ if not high_vram:
509
+ unload_complete_models()
510
+ move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
511
+
512
+ if use_teacache:
513
+ transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
514
+ else:
515
+ transformer.initialize_teacache(enable_teacache=False)
516
+
517
+ if forward:
518
+ clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -(16 + 2 + 1):, :, :].split([16, 2, 1], dim=2)
519
+ clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
520
+ else:
521
+ clean_latents_1x, clean_latents_2x, clean_latents_4x = history_latents[:, :, :(1 + 2 + 16), :, :].split([1, 2, 16], dim=2)
522
+ clean_latents = torch.cat([clean_latents_1x, start_latent], dim=2)
523
+
524
+ generated_latents = sample_hunyuan(
525
+ transformer=transformer,
526
+ sampler='unipc',
527
+ width=width,
528
+ height=height,
529
+ frames=latent_window_size * 4 - 3,
530
+ real_guidance_scale=cfg,
531
+ distilled_guidance_scale=gs,
532
+ guidance_rescale=rs,
533
+ # shift=3.0,
534
+ num_inference_steps=steps,
535
+ generator=rnd,
536
+ prompt_embeds=llama_vec,
537
+ prompt_embeds_mask=llama_attention_mask,
538
+ prompt_poolers=clip_l_pooler,
539
+ negative_prompt_embeds=llama_vec_n,
540
+ negative_prompt_embeds_mask=llama_attention_mask_n,
541
+ negative_prompt_poolers=clip_l_pooler_n,
542
+ device=gpu,
543
+ dtype=torch.bfloat16,
544
+ image_embeddings=image_encoder_last_hidden_state,
545
+ latent_indices=latent_indices,
546
+ clean_latents=clean_latents,
547
+ clean_latent_indices=clean_latent_indices,
548
+ clean_latents_2x=clean_latents_2x,
549
+ clean_latent_2x_indices=clean_latent_2x_indices,
550
+ clean_latents_4x=clean_latents_4x,
551
+ clean_latent_4x_indices=clean_latent_4x_indices,
552
+ callback=callback,
553
+ )
554
+
555
+ [total_generated_latent_frames, history_latents, history_pixels] = post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
556
+
557
+ if not forward:
558
+ if section_index > 0:
559
+ section_index -= 1
560
+ else:
561
+ clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
562
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
563
+
564
+ real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
565
+ zero_latents = history_latents[:, :, total_generated_latent_frames:, :, :]
566
+ history_latents = torch.cat([zero_latents, real_history_latents], dim=2)
567
+ real_history_latents = zero_latents = None
568
+
569
+ forward = True
570
+ section_index = first_section_index
571
+
572
+ if forward:
573
+ section_index += 1
574
+ except:
575
+ traceback.print_exc()
576
+
577
+ if not high_vram:
578
+ unload_complete_models(
579
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
580
+ )
581
+
582
+ stream.output_queue.push(('end', None))
583
+ return
584
+
585
+ @torch.no_grad()
586
+ def worker_start_end(input_image, image_position, end_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, fps_number):
587
  def encode_prompt(prompt, n_prompt):
588
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
589
 
 
1125
  stream.output_queue.push(('end', None))
1126
  return
1127
 
1128
def get_duration(input_image, image_position, end_image, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
    """Return the GPU time (in seconds) to reserve for one generation.

    Passed to ``@spaces.GPU(duration=...)``, which calls it with the same
    arguments as the decorated ``process_on_gpu`` — hence the wide signature,
    of which only ``allocation_time`` is actually consumed.
    """
    requested_seconds = allocation_time
    return requested_seconds
1130
 
1131
  # Remove this decorator if you run on local
1132
+ @spaces.GPU(duration=get_duration)
1133
+ def process_on_gpu(input_image, image_position, end_image, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number
1134
  ):
1135
  start = time.time()
1136
  global stream
1137
  stream = AsyncStream()
1138
 
1139
+ async_run(worker_start_end if generation_mode == "start_end" else worker, input_image, image_position, end_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, fps_number)
1140
 
1141
  output_filename = None
1142
 
 
1165
  "You can upscale the result with RIFE. To make all your generated scenes consistent, you can then apply a face swap on the main character. If you do not see the generated video above, the process may have failed. See the logs for more information. If you see an error like ''NVML_SUCCESS == r INTERNAL ASSERT FAILED'', you probably haven't enough VRAM. Test an example or other options to compare. You can share your inputs to the original space or set your space in public for a peer review.", gr.update(interactive=True), gr.update(interactive=False), gr.update(visible = False)
1166
  break
1167
 
1168
+ def process(input_image,
1169
  image_position=0,
1170
  end_image=None,
1171
  prompt="",
 
1217
 
1218
  yield gr.update(label="Previewed Frames"), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
1219
 
1220
+ yield from process_on_gpu(input_image,
1221
  image_position,
1222
  end_image,
1223
  prompts,
 
1390
  local_storage = gr.BrowserState(default_local_storage)
1391
  with gr.Row():
1392
  with gr.Column():
1393
+ generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Start frame & End frame", "start_end"], ["Video Extension", "video"]], elem_id="generation-mode", label="Generation mode", value = "image")
1394
  text_to_video_hint = gr.HTML("Text-to-Video badly works with a flash effect at the start. I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
1395
  input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
1396
  end_image = gr.Image(sources='upload', type="numpy", label="End Frame (Optional)", height=320)
 
1514
  ]
1515
  ],
1516
  run_on_click = True,
1517
+ fn = process,
1518
  inputs = ips,
1519
  outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1520
  cache_examples = torch.cuda.device_count() > 0,
 
1646
  ],
1647
  ],
1648
  run_on_click = True,
1649
+ fn = process,
1650
+ inputs = ips,
1651
+ outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1652
+ cache_examples = torch.cuda.device_count() > 0,
1653
+ )
1654
+
1655
+ with gr.Row(elem_id="start_end_examples", visible=False):
1656
+ gr.Examples(
1657
+ label = "Examples from start and end frames",
1658
+ examples = [
1659
+ [
1660
+ "./img_examples/Example2.webp", # input_image
1661
+ 0, # image_position
1662
+ None, # end_image
1663
+ "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
1664
+ "start_end", # generation_mode
1665
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1666
+ True, # randomize_seed
1667
+ 42, # seed
1668
+ True, # auto_allocation
1669
+ 180, # allocation_time
1670
+ 672, # resolution
1671
+ 1, # total_second_length
1672
+ 9, # latent_window_size
1673
+ 30, # steps
1674
+ 1.0, # cfg
1675
+ 10.0, # gs
1676
+ 0.0, # rs
1677
+ 6, # gpu_memory_preservation
1678
+ False, # enable_preview
1679
+ False, # use_teacache
1680
+ 16, # mp4_crf
1681
+ 30 # fps_number
1682
+ ],
1683
+ ],
1684
+ run_on_click = True,
1685
+ fn = process,
1686
  inputs = ips,
1687
  outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1688
  cache_examples = torch.cuda.device_count() > 0,
 
1777
  ]
1778
  ],
1779
  run_on_click = True,
1780
+ fn = process,
1781
  inputs = ips,
1782
  outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1783
  cache_examples = False,
 
1908
  ]
1909
  ],
1910
  run_on_click = True,
1911
+ fn = process,
1912
+ inputs = ips,
1913
+ outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1914
+ cache_examples = False,
1915
+ )
1916
+
1917
+ gr.Examples(
1918
+ label = "🖼️ Examples from start and end frames",
1919
+ examples = [
1920
+ [
1921
+ "./img_examples/Example1.png", # input_image
1922
+ 0, # image_position
1923
+ None, # end_image
1924
+ "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1925
+ "start_end", # generation_mode
1926
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1927
+ True, # randomize_seed
1928
+ 42, # seed
1929
+ True, # auto_allocation
1930
+ 180, # allocation_time
1931
+ 672, # resolution
1932
+ 1, # total_second_length
1933
+ 9, # latent_window_size
1934
+ 30, # steps
1935
+ 1.0, # cfg
1936
+ 10.0, # gs
1937
+ 0.0, # rs
1938
+ 6, # gpu_memory_preservation
1939
+ False, # enable_preview
1940
+ True, # use_teacache
1941
+ 16, # mp4_crf
1942
+ 30 # fps_number
1943
+ ],
1944
+ ],
1945
+ run_on_click = True,
1946
+ fn = process,
1947
  inputs = ips,
1948
  outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1949
  cache_examples = False,
 
2055
  timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
2056
  start_button.click(fn = check_parameters, inputs = [
2057
  generation_mode, input_image, input_video
2058
+ ], outputs = [end_button, warning], queue = False, show_progress = False).success(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning], scroll_to_output = True)
2059
  start_button_video.click(fn = check_parameters, inputs = [
2060
  generation_mode, input_image, input_video
2061
  ], outputs = [end_button, warning], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button, warning], scroll_to_output = True)