Spaces:
Build error
Build error
Better allocation estimation
Browse files
app.py
CHANGED
|
@@ -390,9 +390,13 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
|
|
| 390 |
stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
|
| 391 |
|
| 392 |
if not high_vram:
|
|
|
|
| 393 |
load_model_as_complete(image_encoder, target_device=gpu)
|
| 394 |
|
| 395 |
image_encoder_last_hidden_state = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder).last_hidden_state
|
|
|
|
|
|
|
|
|
|
| 396 |
|
| 397 |
return [start_latent, image_encoder_last_hidden_state]
|
| 398 |
|
|
@@ -468,7 +472,7 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
|
|
| 468 |
history_pixels = soft_append_bcthw(vae_decode(real_history_latents, vae).cpu(), history_pixels, overlapped_frames)
|
| 469 |
|
| 470 |
if not high_vram:
|
| 471 |
-
unload_complete_models()
|
| 472 |
|
| 473 |
if enable_preview or section_index == (0 if first_section_index == (total_latent_sections - 1) else (total_latent_sections - 1)):
|
| 474 |
output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
|
|
@@ -636,6 +640,11 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
|
|
| 636 |
load_model_as_complete(image_encoder, target_device=gpu)
|
| 637 |
|
| 638 |
image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 639 |
image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
|
| 640 |
|
| 641 |
# Dtype
|
|
@@ -808,7 +817,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
|
|
| 808 |
history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
|
| 809 |
|
| 810 |
if not high_vram:
|
| 811 |
-
unload_complete_models()
|
| 812 |
|
| 813 |
if enable_preview or section_index == total_latent_sections - 1:
|
| 814 |
output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
|
|
@@ -909,7 +918,7 @@ def process(input_image,
|
|
| 909 |
fps_number=30
|
| 910 |
):
|
| 911 |
if auto_allocation:
|
| 912 |
-
allocation_time = min(total_second_length * 60 * (
|
| 913 |
|
| 914 |
if torch.cuda.device_count() == 0:
|
| 915 |
gr.Warning('Set this space to GPU config to make it work.')
|
|
@@ -994,7 +1003,7 @@ def process_video_on_gpu(input_video, prompts, n_prompt, seed, batch, resolution
|
|
| 994 |
def process_video(input_video, prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
|
| 995 |
global high_vram
|
| 996 |
if auto_allocation:
|
| 997 |
-
allocation_time = min(total_second_length * 60 * (
|
| 998 |
|
| 999 |
if torch.cuda.device_count() == 0:
|
| 1000 |
gr.Warning('Set this space to GPU config to make it work.')
|
|
@@ -1066,7 +1075,7 @@ title_html = """
|
|
| 1066 |
|
| 1067 |
js = """
|
| 1068 |
function createGradioAnimation() {
|
| 1069 |
-
window.addEventListener("beforeunload", function
|
| 1070 |
if (document.getElementById('end-button') && !document.getElementById('end-button').disabled) {
|
| 1071 |
var confirmationMessage = 'A process is still running. '
|
| 1072 |
+ 'If you leave before saving, your changes will be lost.';
|
|
@@ -1095,7 +1104,7 @@ with block:
|
|
| 1095 |
with gr.Row():
|
| 1096 |
with gr.Column():
|
| 1097 |
generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Video Extension", "video"]], elem_id="generation-mode", label="Generation mode", value = "image")
|
| 1098 |
-
text_to_video_hint = gr.HTML("Text-to-Video badly works. I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
|
| 1099 |
input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
|
| 1100 |
image_position = gr.Slider(label="Image position", minimum=0, maximum=100, value=0, step=1, info='0=Video start; 100=Video end (lower quality)')
|
| 1101 |
input_video = gr.Video(sources='upload', label="Input Video", height=320)
|
|
@@ -1122,7 +1131,7 @@ with block:
|
|
| 1122 |
enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview around each second generated but it costs 2 sec. for each second generated.')
|
| 1123 |
use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed and no break in brightness, but often makes hands and fingers slightly worse.')
|
| 1124 |
|
| 1125 |
-
n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
|
| 1126 |
|
| 1127 |
fps_number = gr.Slider(label="Frame per seconds", info="The model is trained for 30 fps so other fps may generate weird results", minimum=10, maximum=60, value=30, step=1)
|
| 1128 |
|
|
@@ -1171,7 +1180,7 @@ with block:
|
|
| 1171 |
allocation_time = gr.Slider(label="GPU allocation time (in seconds)", info='lower=May abort run, higher=Quota penalty for next runs; only useful for ZeroGPU; for instance set to 88 when you have the message "You have exceeded your GPU quota (180s requested vs. 89s left)."', value=180, minimum=60, maximum=320, step=1)
|
| 1172 |
|
| 1173 |
with gr.Column():
|
| 1174 |
-
warning = gr.HTML(value = "<center><big>Your computer must <u>not</u> enter into standby mode.</big><br/>On Chrome, you can force to keep a tab alive in <code>chrome://discards/</code></center>", visible = False)
|
| 1175 |
result_video = gr.Video(label="Generated Frames", autoplay=True, show_share_button=False, height=512, loop=True)
|
| 1176 |
preview_image = gr.Image(label="Next Latents", height=200, visible=False)
|
| 1177 |
progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
|
|
@@ -1189,7 +1198,7 @@ with block:
|
|
| 1189 |
0, # image_position
|
| 1190 |
"Overcrowed street in Japan, photorealistic, realistic, intricate details, 8k, insanely detailed",
|
| 1191 |
"text", # generation_mode
|
| 1192 |
-
"Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
|
| 1193 |
True, # randomize_seed
|
| 1194 |
42, # seed
|
| 1195 |
True, # auto_allocation
|
|
@@ -1223,7 +1232,7 @@ with block:
|
|
| 1223 |
0, # image_position
|
| 1224 |
"A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
|
| 1225 |
"image", # generation_mode
|
| 1226 |
-
"Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
|
| 1227 |
True, # randomize_seed
|
| 1228 |
42, # seed
|
| 1229 |
True, # auto_allocation
|
|
@@ -1246,7 +1255,7 @@ with block:
|
|
| 1246 |
0, # image_position
|
| 1247 |
"A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
|
| 1248 |
"image", # generation_mode
|
| 1249 |
-
"Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
|
| 1250 |
True, # randomize_seed
|
| 1251 |
42, # seed
|
| 1252 |
True, # auto_allocation
|
|
@@ -1269,7 +1278,7 @@ with block:
|
|
| 1269 |
0, # image_position
|
| 1270 |
"A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks, the woman stops talking and the woman listens A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
|
| 1271 |
"image", # generation_mode
|
| 1272 |
-
"Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
|
| 1273 |
True, # randomize_seed
|
| 1274 |
42, # seed
|
| 1275 |
True, # auto_allocation
|
|
@@ -1292,7 +1301,7 @@ with block:
|
|
| 1292 |
0, # image_position
|
| 1293 |
"A boy is walking to the right, full view, full-length view, cartoon",
|
| 1294 |
"image", # generation_mode
|
| 1295 |
-
"Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
|
| 1296 |
True, # randomize_seed
|
| 1297 |
42, # seed
|
| 1298 |
True, # auto_allocation
|
|
@@ -1315,7 +1324,7 @@ with block:
|
|
| 1315 |
100, # image_position
|
| 1316 |
"A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
|
| 1317 |
"image", # generation_mode
|
| 1318 |
-
"Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
|
| 1319 |
True, # randomize_seed
|
| 1320 |
42, # seed
|
| 1321 |
True, # auto_allocation
|
|
@@ -1347,7 +1356,7 @@ with block:
|
|
| 1347 |
[
|
| 1348 |
"./img_examples/Example1.mp4", # input_video
|
| 1349 |
"View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
|
| 1350 |
-
"Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
|
| 1351 |
True, # randomize_seed
|
| 1352 |
42, # seed
|
| 1353 |
True, # auto_allocation
|
|
|
|
| 390 |
stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
|
| 391 |
|
| 392 |
if not high_vram:
|
| 393 |
+
unload_complete_models(vae)
|
| 394 |
load_model_as_complete(image_encoder, target_device=gpu)
|
| 395 |
|
| 396 |
image_encoder_last_hidden_state = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder).last_hidden_state
|
| 397 |
+
|
| 398 |
+
if not high_vram:
|
| 399 |
+
unload_complete_models(image_encoder)
|
| 400 |
|
| 401 |
return [start_latent, image_encoder_last_hidden_state]
|
| 402 |
|
|
|
|
| 472 |
history_pixels = soft_append_bcthw(vae_decode(real_history_latents, vae).cpu(), history_pixels, overlapped_frames)
|
| 473 |
|
| 474 |
if not high_vram:
|
| 475 |
+
unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
|
| 476 |
|
| 477 |
if enable_preview or section_index == (0 if first_section_index == (total_latent_sections - 1) else (total_latent_sections - 1)):
|
| 478 |
output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
|
|
|
|
| 640 |
load_model_as_complete(image_encoder, target_device=gpu)
|
| 641 |
|
| 642 |
image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
|
| 643 |
+
|
| 644 |
+
# Clean GPU
|
| 645 |
+
if not high_vram:
|
| 646 |
+
unload_complete_models(image_encoder)
|
| 647 |
+
|
| 648 |
image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
|
| 649 |
|
| 650 |
# Dtype
|
|
|
|
| 817 |
history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
|
| 818 |
|
| 819 |
if not high_vram:
|
| 820 |
+
unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
|
| 821 |
|
| 822 |
if enable_preview or section_index == total_latent_sections - 1:
|
| 823 |
output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
|
|
|
|
| 918 |
fps_number=30
|
| 919 |
):
|
| 920 |
if auto_allocation:
|
| 921 |
+
allocation_time = min(total_second_length * 60 * (1.5 if use_teacache else 3.0) * (1 + ((steps - 25) / 25)), 600)
|
| 922 |
|
| 923 |
if torch.cuda.device_count() == 0:
|
| 924 |
gr.Warning('Set this space to GPU config to make it work.')
|
|
|
|
| 1003 |
def process_video(input_video, prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
|
| 1004 |
global high_vram
|
| 1005 |
if auto_allocation:
|
| 1006 |
+
allocation_time = min(total_second_length * 60 * (2.5 if use_teacache else 3.5) * (1 + ((steps - 25) / 25)), 600)
|
| 1007 |
|
| 1008 |
if torch.cuda.device_count() == 0:
|
| 1009 |
gr.Warning('Set this space to GPU config to make it work.')
|
|
|
|
| 1075 |
|
| 1076 |
js = """
|
| 1077 |
function createGradioAnimation() {
|
| 1078 |
+
window.addEventListener("beforeunload", function(e) {
|
| 1079 |
if (document.getElementById('end-button') && !document.getElementById('end-button').disabled) {
|
| 1080 |
var confirmationMessage = 'A process is still running. '
|
| 1081 |
+ 'If you leave before saving, your changes will be lost.';
|
|
|
|
| 1104 |
with gr.Row():
|
| 1105 |
with gr.Column():
|
| 1106 |
generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Video Extension", "video"]], elem_id="generation-mode", label="Generation mode", value = "image")
|
| 1107 |
+
text_to_video_hint = gr.HTML("Text-to-Video badly works with a flash effect at the start. I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
|
| 1108 |
input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
|
| 1109 |
image_position = gr.Slider(label="Image position", minimum=0, maximum=100, value=0, step=1, info='0=Video start; 100=Video end (lower quality)')
|
| 1110 |
input_video = gr.Video(sources='upload', label="Input Video", height=320)
|
|
|
|
| 1131 |
enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview around each second generated but it costs 2 sec. for each second generated.')
|
| 1132 |
use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed and no break in brightness, but often makes hands and fingers slightly worse.')
|
| 1133 |
|
| 1134 |
+
n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
|
| 1135 |
|
| 1136 |
fps_number = gr.Slider(label="Frame per seconds", info="The model is trained for 30 fps so other fps may generate weird results", minimum=10, maximum=60, value=30, step=1)
|
| 1137 |
|
|
|
|
| 1180 |
allocation_time = gr.Slider(label="GPU allocation time (in seconds)", info='lower=May abort run, higher=Quota penalty for next runs; only useful for ZeroGPU; for instance set to 88 when you have the message "You have exceeded your GPU quota (180s requested vs. 89s left)."', value=180, minimum=60, maximum=320, step=1)
|
| 1181 |
|
| 1182 |
with gr.Column():
|
| 1183 |
+
warning = gr.HTML(elem_id="warning", value = "<center><big>Your computer must <u>not</u> enter into standby mode.</big><br/>On Chrome, you can force to keep a tab alive in <code>chrome://discards/</code></center>", visible = False)
|
| 1184 |
result_video = gr.Video(label="Generated Frames", autoplay=True, show_share_button=False, height=512, loop=True)
|
| 1185 |
preview_image = gr.Image(label="Next Latents", height=200, visible=False)
|
| 1186 |
progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
|
|
|
|
| 1198 |
0, # image_position
|
| 1199 |
"Overcrowed street in Japan, photorealistic, realistic, intricate details, 8k, insanely detailed",
|
| 1200 |
"text", # generation_mode
|
| 1201 |
+
"Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
|
| 1202 |
True, # randomize_seed
|
| 1203 |
42, # seed
|
| 1204 |
True, # auto_allocation
|
|
|
|
| 1232 |
0, # image_position
|
| 1233 |
"A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
|
| 1234 |
"image", # generation_mode
|
| 1235 |
+
"Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
|
| 1236 |
True, # randomize_seed
|
| 1237 |
42, # seed
|
| 1238 |
True, # auto_allocation
|
|
|
|
| 1255 |
0, # image_position
|
| 1256 |
"A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
|
| 1257 |
"image", # generation_mode
|
| 1258 |
+
"Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
|
| 1259 |
True, # randomize_seed
|
| 1260 |
42, # seed
|
| 1261 |
True, # auto_allocation
|
|
|
|
| 1278 |
0, # image_position
|
| 1279 |
"A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks, the woman stops talking and the woman listens A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
|
| 1280 |
"image", # generation_mode
|
| 1281 |
+
"Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
|
| 1282 |
True, # randomize_seed
|
| 1283 |
42, # seed
|
| 1284 |
True, # auto_allocation
|
|
|
|
| 1301 |
0, # image_position
|
| 1302 |
"A boy is walking to the right, full view, full-length view, cartoon",
|
| 1303 |
"image", # generation_mode
|
| 1304 |
+
"Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
|
| 1305 |
True, # randomize_seed
|
| 1306 |
42, # seed
|
| 1307 |
True, # auto_allocation
|
|
|
|
| 1324 |
100, # image_position
|
| 1325 |
"A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
|
| 1326 |
"image", # generation_mode
|
| 1327 |
+
"Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
|
| 1328 |
True, # randomize_seed
|
| 1329 |
42, # seed
|
| 1330 |
True, # auto_allocation
|
|
|
|
| 1356 |
[
|
| 1357 |
"./img_examples/Example1.mp4", # input_video
|
| 1358 |
"View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
|
| 1359 |
+
"Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
|
| 1360 |
True, # randomize_seed
|
| 1361 |
42, # seed
|
| 1362 |
True, # auto_allocation
|