Optimization

app.py
CHANGED
@@ -357,31 +357,36 @@ def worker(input_image, prompts, n_prompt, seed, resolution, total_second_length
 
     H, W, C = input_image.shape
     height, width = find_nearest_bucket(H, W, resolution=resolution)
-    input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
-
-    #Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
-
-    input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
-    input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
-
-    # VAE encoding
-
-    stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
-
-    if not high_vram:
-        load_model_as_complete(vae, target_device=gpu)
-
-    start_latent = vae_encode(input_image_pt, vae)
-
-    # CLIP Vision
-
-    stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
-
-    if not high_vram:
-        load_model_as_complete(image_encoder, target_device=gpu)
+
+    def get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram):
+        input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
+
+        #Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
+
+        input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
+        input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
+
+        # VAE encoding
+
+        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
+
+        if not high_vram:
+            load_model_as_complete(vae, target_device=gpu)
+
+        start_latent = vae_encode(input_image_pt, vae)
+
+        # CLIP Vision
+
+        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
+
+        if not high_vram:
+            load_model_as_complete(image_encoder, target_device=gpu)
+
+        return start_latent
+
+    start_latent = get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram)
 
-    image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
-    image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
+    image_encoder_last_hidden_state = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder).last_hidden_state
 
     # Dtype
 
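Both hunks make the same change: the image preprocessing and the VAE/CLIP-Vision model staging are pulled out into a nested get_start_latent helper, so worker and worker_last_frame no longer carry the block inline. The densest lines are the pixel conversion: dividing a uint8 image by 127.5 and subtracting 1 rescales [0, 255] to [-1, 1], and permute(2, 0, 1)[None, :, None] turns an (H, W, C) frame into the (batch, channels, frames, height, width) layout the video VAE expects. A minimal standalone sketch of just that conversion (plain numpy/torch, no FramePack imports; the 480x832 size is only an illustrative bucket):

    import numpy as np
    import torch

    # A stand-in RGB frame: uint8, shape (H, W, C) = (480, 832, 3).
    input_image_np = np.zeros((480, 832, 3), dtype=np.uint8)

    # uint8 [0, 255] -> float [-1, 1], as in the `/ 127.5 - 1` line above.
    input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1

    # (H, W, C) -> (C, H, W), then insert batch and frame axes -> (1, C, 1, H, W).
    input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]

    print(input_image_pt.shape)         # torch.Size([1, 3, 1, 480, 832])
    print(input_image_pt.min().item())  # -1.0 for an all-black frame
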
@@ -573,31 +578,36 @@ def worker_last_frame(input_image, prompts, n_prompt, seed, resolution, total_second_length
 
     H, W, C = input_image.shape
     height, width = find_nearest_bucket(H, W, resolution=resolution)
-    input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
-
-    #Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
-
-    input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
-    input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
-
-    # VAE encoding
-
-    stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
-
-    if not high_vram:
-        load_model_as_complete(vae, target_device=gpu)
-
-    start_latent = vae_encode(input_image_pt, vae)
-
-    # CLIP Vision
-
-    stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
-
-    if not high_vram:
-        load_model_as_complete(image_encoder, target_device=gpu)
+
+    def get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram):
+        input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
+
+        #Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
+
+        input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
+        input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
+
+        # VAE encoding
+
+        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
+
+        if not high_vram:
+            load_model_as_complete(vae, target_device=gpu)
+
+        start_latent = vae_encode(input_image_pt, vae)
+
+        # CLIP Vision
+
+        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
+
+        if not high_vram:
+            load_model_as_complete(image_encoder, target_device=gpu)
+
+        return start_latent
+
+    start_latent = get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram)
 
-    image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
-    image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
+    image_encoder_last_hidden_state = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder).last_hidden_state
 
     # Dtype
 
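One caveat in the refactor as committed: input_image_np is now assigned only inside the nested get_start_latent, so it is local to that helper, yet the added line

    image_encoder_last_hidden_state = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder).last_hidden_state

still reads input_image_np from the enclosing scope. Unless the name is bound elsewhere in worker / worker_last_frame, that line raises NameError at runtime. A sketch of one possible follow-up patch (not part of this commit) that returns the preprocessed frame alongside the latent:

    -        return start_latent
    +        return start_latent, input_image_np
     
    -    start_latent = get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram)
    +    start_latent, input_image_np = get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram)

The same fix would need to be applied in both worker and worker_last_frame, since each defines its own copy of the helper.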