Update app.py
app.py CHANGED
@@ -56,8 +56,14 @@ text_encoder_2.requires_grad_(False)
 image_encoder.requires_grad_(False)
 transformer.requires_grad_(False)
 
-DynamicSwapInstaller.install_model(transformer, device=gpu)
-DynamicSwapInstaller.install_model(text_encoder, device=gpu)
+# DynamicSwapInstaller.install_model(transformer, device=gpu)
+# DynamicSwapInstaller.install_model(text_encoder, device=gpu)
+
+text_encoder.to(gpu)
+text_encoder_2.to(gpu)
+image_encoder.to(gpu)
+vae.to(gpu)
+transformer.to(gpu)
 
 stream = AsyncStream()
 
@@ -75,16 +81,16 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
     stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
 
     try:
-        unload_complete_models(
-            text_encoder, text_encoder_2, image_encoder, vae, transformer
-        )
+        # unload_complete_models(
+        #     text_encoder, text_encoder_2, image_encoder, vae, transformer
+        # )
 
         # Text encoding
 
         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
 
-        fake_diffusers_current_device(text_encoder, gpu)  # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
-        load_model_as_complete(text_encoder_2, target_device=gpu)
+        # fake_diffusers_current_device(text_encoder, gpu)  # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
+        # load_model_as_complete(text_encoder_2, target_device=gpu)
 
         llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
 
@@ -113,7 +119,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
 
         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
 
-        load_model_as_complete(vae, target_device=gpu)
+        # load_model_as_complete(vae, target_device=gpu)
 
         start_latent = vae_encode(input_image_pt, vae)
 
@@ -121,7 +127,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
 
         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
 
-        load_model_as_complete(image_encoder, target_device=gpu)
+        # load_model_as_complete(image_encoder, target_device=gpu)
 
         image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
         image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
@@ -172,8 +178,8 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
             clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
             clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
 
-            unload_complete_models()
-            move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
+            # unload_complete_models()
+            # move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
 
             if use_teacache:
                 transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
@@ -235,8 +241,8 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
             total_generated_latent_frames += int(generated_latents.shape[2])
             history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
 
-            offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
-            load_model_as_complete(vae, target_device=gpu)
+            # offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
+            # load_model_as_complete(vae, target_device=gpu)
 
             real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
 
@@ -249,7 +255,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
                 current_pixels = vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu()
                 history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
 
-            unload_complete_models()
+            # unload_complete_models()
 
             output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
 
@@ -264,9 +270,9 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
     except:
         traceback.print_exc()
 
-        unload_complete_models(
-            text_encoder, text_encoder_2, image_encoder, vae, transformer
-        )
+        # unload_complete_models(
+        #     text_encoder, text_encoder_2, image_encoder, vae, transformer
+        # )
 
     stream.output_queue.push(('end', None))
     return
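Taken together, these hunks disable the dynamic CPU/GPU swapping helpers (DynamicSwapInstaller.install_model, unload_complete_models, load_model_as_complete, fake_diffusers_current_device, and the memory-preservation move/offload calls) and instead pin all five models on the GPU once at startup. That trades VRAM for speed: there are no per-stage weight transfers anymore, but text_encoder, text_encoder_2, image_encoder, vae, and transformer must now fit on the device simultaneously. A minimal sketch of making that choice at runtime instead of hardcoding it; the place_models helper and the 60 GB threshold are illustrative assumptions, not part of app.py:

import torch

def place_models(models, threshold_gb=60.0):
    # torch.cuda.mem_get_info() returns (free_bytes, total_bytes) for the current device.
    free_bytes, _total_bytes = torch.cuda.mem_get_info()
    high_vram = free_bytes / (1024 ** 3) > threshold_gb
    # Keep everything resident when VRAM allows; otherwise leave weights on CPU
    # so per-stage loading (the commented-out helpers above) can manage them.
    target = torch.device('cuda' if high_vram else 'cpu')
    for m in models:
        m.to(target)  # nn.Module.to(...) moves parameters and buffers in place
    return high_vram

# Hypothetical usage with the five models from app.py:
# high_vram = place_models([text_encoder, text_encoder_2, image_encoder, vae, transformer])

If the low-VRAM branch is taken, the per-stage load/offload calls that this commit comments out would need to remain active; pinning everything only works when the combined weights fit in memory.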
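The unchanged context in the later hunks also shows how each newly generated section is merged into the running video: the section is decoded with vae_decode, then blended into history_pixels via soft_append_bcthw over overlapped_frames. Below is a hedged sketch of that kind of soft append, a linear crossfade along the time axis of (batch, channels, time, height, width) tensors; it illustrates the idea and is not necessarily FramePack's exact implementation:

import torch

def soft_append_bcthw_sketch(current, history, overlap):
    # Assumes the last `overlap` frames of `current` depict the same content as
    # the first `overlap` frames of `history` (new sections are prepended here).
    if overlap <= 0:
        return torch.cat([current, history], dim=2)
    # Linear blend weights shaped for broadcasting over B, C, H, W.
    w = torch.linspace(1.0, 0.0, overlap, device=current.device).view(1, 1, overlap, 1, 1)
    blended = current[:, :, -overlap:] * w + history[:, :, :overlap] * (1.0 - w)
    return torch.cat([current[:, :, :-overlap], blended, history[:, :, overlap:]], dim=2)

Blending instead of hard concatenation hides the seam where consecutive sections overlap, which matters here because each section is sampled with a few frames of shared context.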