Fabrice-TIERCELIN committed
Commit 76e6c32 (verified) · Parent(s): d4d8d62

Quick examples

Files changed (1)
  1. app.py +539 -541
app.py CHANGED
@@ -1,541 +1,539 @@
1
- from diffusers_helper.hf_login import login
2
-
3
- import os
4
-
5
- os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
6
-
7
- import gradio as gr
8
- import torch
9
- import traceback
10
- import einops
11
- import safetensors.torch as sf
12
- import numpy as np
13
- import math
14
- import spaces
15
-
16
- from PIL import Image
17
- from diffusers import AutoencoderKLHunyuanVideo
18
- from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
19
- from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
20
- from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
21
- from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
22
- from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
23
- from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
24
- from diffusers_helper.thread_utils import AsyncStream, async_run
25
- from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
26
- from transformers import SiglipImageProcessor, SiglipVisionModel
27
- from diffusers_helper.clip_vision import hf_clip_vision_encode
28
- from diffusers_helper.bucket_tools import find_nearest_bucket
29
-
30
-
31
- free_mem_gb = get_cuda_free_memory_gb(gpu)
32
- high_vram = free_mem_gb > 80
33
-
34
- print(f'Free VRAM {free_mem_gb} GB')
35
- print(f'High-VRAM Mode: {high_vram}')
36
-
37
- text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
38
- text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
39
- tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
40
- tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
41
- vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()
42
-
43
- feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
44
- image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()
45
-
46
- # quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True)
47
- # transformer = HunyuanVideoTransformer3DModelPacked.from_single_file("https://huggingface.co/sirolim/FramePack_F1_I2V_FP8/resolve/main/FramePack_F1_I2V_HY_fp8_e4m3fn.safetensors", torch_dtype=torch.bfloat16)
48
- # transformer = HunyuanVideoTransformer3DModelPacked.from_single_file('sirolim/FramePack_F1_I2V_FP8', "FramePack_F1_I2V_HY_fp8_e4m3fn.safetensors", use_safetensors=True, torch_dtype=torch.bfloat16).cpu()
49
- transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu()
50
-
51
- vae.eval()
52
- text_encoder.eval()
53
- text_encoder_2.eval()
54
- image_encoder.eval()
55
- transformer.eval()
56
-
57
- if not high_vram:
58
- vae.enable_slicing()
59
- vae.enable_tiling()
60
-
61
- transformer.high_quality_fp32_output_for_inference = True
62
- print('transformer.high_quality_fp32_output_for_inference = True')
63
-
64
- transformer.to(dtype=torch.bfloat16)
65
- vae.to(dtype=torch.float16)
66
- image_encoder.to(dtype=torch.float16)
67
- text_encoder.to(dtype=torch.float16)
68
- text_encoder_2.to(dtype=torch.float16)
69
-
70
- vae.requires_grad_(False)
71
- text_encoder.requires_grad_(False)
72
- text_encoder_2.requires_grad_(False)
73
- image_encoder.requires_grad_(False)
74
- transformer.requires_grad_(False)
75
-
76
- if not high_vram:
77
- # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
78
- DynamicSwapInstaller.install_model(transformer, device=gpu)
79
- DynamicSwapInstaller.install_model(text_encoder, device=gpu)
80
- else:
81
- text_encoder.to(gpu)
82
- text_encoder_2.to(gpu)
83
- image_encoder.to(gpu)
84
- vae.to(gpu)
85
- transformer.to(gpu)
86
-
87
- stream = AsyncStream()
88
-
89
- outputs_folder = './outputs/'
90
- os.makedirs(outputs_folder, exist_ok=True)
91
-
92
- examples = [
93
- ["img_examples/1.png", "The girl dances gracefully, with clear movements, full of charm.",],
94
- ["img_examples/2.jpg", "The man dances flamboyantly, swinging his hips and striking bold poses with dramatic flair."],
95
- ["img_examples/3.png", "The woman dances elegantly among the blossoms, spinning slowly with flowing sleeves and graceful hand movements."],
96
- ]
97
-
98
- input_image_debug_value = None
99
- prompt_debug_value = None
100
- total_second_length_debug_value = None
101
-
102
- def generate_examples(input_image, prompt):
103
-
104
- t2v=False
105
- n_prompt=""
106
- seed=31337
107
- total_second_length=5
108
- latent_window_size=9
109
- steps=25
110
- cfg=1.0
111
- gs=10.0
112
- rs=0.0
113
- gpu_memory_preservation=6
114
- use_teacache=True
115
- mp4_crf=16
116
-
117
- global stream
118
-
119
- # assert input_image is not None, 'No input image!'
120
- if t2v:
121
- default_height, default_width = 640, 640
122
- input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
123
- print("No input image provided. Using a blank white image.")
124
-
125
- yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
126
-
127
- stream = AsyncStream()
128
-
129
- async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf)
130
-
131
- output_filename = None
132
-
133
- while True:
134
- flag, data = stream.output_queue.next()
135
-
136
- if flag == 'file':
137
- output_filename = data
138
- yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
139
-
140
- if flag == 'progress':
141
- preview, desc, html = data
142
- yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
143
-
144
- if flag == 'end':
145
- yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
146
- break
147
-
148
-
149
-
150
- @torch.no_grad()
151
- def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
152
- total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
153
- total_latent_sections = int(max(round(total_latent_sections), 1))
154
-
155
- job_id = generate_timestamp()
156
-
157
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
158
-
159
- try:
160
- # Clean GPU
161
- if not high_vram:
162
- unload_complete_models(
163
- text_encoder, text_encoder_2, image_encoder, vae, transformer
164
- )
165
-
166
- # Text encoding
167
-
168
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
169
-
170
- if not high_vram:
171
- fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
172
- load_model_as_complete(text_encoder_2, target_device=gpu)
173
-
174
- llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
175
-
176
- if cfg == 1:
177
- llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
178
- else:
179
- llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
180
-
181
- llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
182
- llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
183
-
184
- # Processing input image
185
-
186
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))
187
-
188
- H, W, C = input_image.shape
189
- height, width = find_nearest_bucket(H, W, resolution=640)
190
- input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
191
-
192
- Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
193
-
194
- input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
195
- input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
196
-
197
- # VAE encoding
198
-
199
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
200
-
201
- if not high_vram:
202
- load_model_as_complete(vae, target_device=gpu)
203
-
204
- start_latent = vae_encode(input_image_pt, vae)
205
-
206
- # CLIP Vision
207
-
208
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
209
-
210
- if not high_vram:
211
- load_model_as_complete(image_encoder, target_device=gpu)
212
-
213
- image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
214
- image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
215
-
216
- # Dtype
217
-
218
- llama_vec = llama_vec.to(transformer.dtype)
219
- llama_vec_n = llama_vec_n.to(transformer.dtype)
220
- clip_l_pooler = clip_l_pooler.to(transformer.dtype)
221
- clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
222
- image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
223
-
224
- # Sampling
225
-
226
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
227
-
228
- rnd = torch.Generator("cpu").manual_seed(seed)
229
-
230
- history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
231
- history_pixels = None
232
-
233
- history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
234
- total_generated_latent_frames = 1
235
-
236
- for section_index in range(total_latent_sections):
237
- if stream.input_queue.top() == 'end':
238
- stream.output_queue.push(('end', None))
239
- return
240
-
241
- print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
242
-
243
- if not high_vram:
244
- unload_complete_models()
245
- move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
246
-
247
- if use_teacache:
248
- transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
249
- else:
250
- transformer.initialize_teacache(enable_teacache=False)
251
-
252
- def callback(d):
253
- preview = d['denoised']
254
- preview = vae_decode_fake(preview)
255
-
256
- preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
257
- preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
258
-
259
- if stream.input_queue.top() == 'end':
260
- stream.output_queue.push(('end', None))
261
- raise KeyboardInterrupt('User ends the task.')
262
-
263
- current_step = d['i'] + 1
264
- percentage = int(100.0 * current_step / steps)
265
- hint = f'Sampling {current_step}/{steps}'
266
- desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...'
267
- stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
268
- return
269
-
270
- indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
271
- clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
272
- clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
273
-
274
- clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2)
275
- clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
276
-
277
- generated_latents = sample_hunyuan(
278
- transformer=transformer,
279
- sampler='unipc',
280
- width=width,
281
- height=height,
282
- frames=latent_window_size * 4 - 3,
283
- real_guidance_scale=cfg,
284
- distilled_guidance_scale=gs,
285
- guidance_rescale=rs,
286
- # shift=3.0,
287
- num_inference_steps=steps,
288
- generator=rnd,
289
- prompt_embeds=llama_vec,
290
- prompt_embeds_mask=llama_attention_mask,
291
- prompt_poolers=clip_l_pooler,
292
- negative_prompt_embeds=llama_vec_n,
293
- negative_prompt_embeds_mask=llama_attention_mask_n,
294
- negative_prompt_poolers=clip_l_pooler_n,
295
- device=gpu,
296
- dtype=torch.bfloat16,
297
- image_embeddings=image_encoder_last_hidden_state,
298
- latent_indices=latent_indices,
299
- clean_latents=clean_latents,
300
- clean_latent_indices=clean_latent_indices,
301
- clean_latents_2x=clean_latents_2x,
302
- clean_latent_2x_indices=clean_latent_2x_indices,
303
- clean_latents_4x=clean_latents_4x,
304
- clean_latent_4x_indices=clean_latent_4x_indices,
305
- callback=callback,
306
- )
307
-
308
- total_generated_latent_frames += int(generated_latents.shape[2])
309
- history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
310
-
311
- if not high_vram:
312
- offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
313
- load_model_as_complete(vae, target_device=gpu)
314
-
315
- real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
316
-
317
- if history_pixels is None:
318
- history_pixels = vae_decode(real_history_latents, vae).cpu()
319
- else:
320
- section_latent_frames = latent_window_size * 2
321
- overlapped_frames = latent_window_size * 4 - 3
322
-
323
- current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
324
- history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
325
-
326
- if not high_vram:
327
- unload_complete_models()
328
-
329
- output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
330
-
331
- save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)
332
-
333
- print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
334
-
335
- stream.output_queue.push(('file', output_filename))
336
- except:
337
- traceback.print_exc()
338
-
339
- if not high_vram:
340
- unload_complete_models(
341
- text_encoder, text_encoder_2, image_encoder, vae, transformer
342
- )
343
-
344
- stream.output_queue.push(('end', None))
345
- return
346
-
347
- def get_duration(input_image, prompt, t2v, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
348
- global total_second_length_debug_value
349
-
350
- if total_second_length_debug_value is not None:
351
- return total_second_length_debug_value * 60
352
- return total_second_length * 60
353
-
354
- @spaces.GPU(duration=get_duration)
355
- def process(input_image, prompt,
356
- t2v=False,
357
- n_prompt="",
358
- seed=31337,
359
- total_second_length=5,
360
- latent_window_size=9,
361
- steps=25,
362
- cfg=1.0,
363
- gs=10.0,
364
- rs=0.0,
365
- gpu_memory_preservation=6,
366
- use_teacache=True,
367
- mp4_crf=16
368
- ):
369
- global stream, input_image_debug_value, prompt_debug_value, total_second_length_debug_value
370
-
371
- if input_image_debug_value is not None:
372
- input_image = input_image_debug_value
373
- input_image_debug_value = None
374
-
375
- if prompt_debug_value is not None:
376
- prompt = prompt_debug_value
377
- prompt_debug_value = None
378
-
379
- if total_second_length_debug_value is not None:
380
- total_second_length = total_second_length_debug_value
381
- total_second_length_debug_value = None
382
-
383
- # assert input_image is not None, 'No input image!'
384
- if t2v:
385
- default_height, default_width = 640, 640
386
- input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
387
- print("No input image provided. Using a blank white image.")
388
-
389
- yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
390
-
391
- stream = AsyncStream()
392
-
393
- async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf)
394
-
395
- output_filename = None
396
-
397
- while True:
398
- flag, data = stream.output_queue.next()
399
-
400
- if flag == 'file':
401
- output_filename = data
402
- yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
403
-
404
- if flag == 'progress':
405
- preview, desc, html = data
406
- yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
407
-
408
- if flag == 'end':
409
- yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
410
- break
411
-
412
-
413
- def end_process():
414
- stream.input_queue.push('end')
415
-
416
-
417
- quick_prompts = [
418
- 'The girl dances gracefully, with clear movements, full of charm.',
419
- 'A character doing some simple body movements.',
420
- ]
421
- quick_prompts = [[x] for x in quick_prompts]
422
-
423
-
424
- css = make_progress_bar_css()
425
- block = gr.Blocks(css=css).queue()
426
- with block:
427
- gr.Markdown('# FramePack Essentials | Experimentation in Progress')
428
- gr.Markdown(f"""### Space is constantly being tinkered with, expect downtime and errors.
429
- """)
430
- with gr.Row():
431
- with gr.Column():
432
- input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
433
- prompt = gr.Textbox(label="Prompt", value='')
434
- t2v = gr.Checkbox(label="do text-to-video", value=False)
435
- example_quick_prompts = gr.Dataset(samples=quick_prompts, label='Quick List', samples_per_page=1000, components=[prompt])
436
- example_quick_prompts.click(lambda x: x[0], inputs=[example_quick_prompts], outputs=prompt, show_progress=False, queue=False)
437
-
438
- with gr.Row():
439
- start_button = gr.Button(value="Start Generation")
440
- end_button = gr.Button(value="End Generation", interactive=False)
441
-
442
- total_second_length = gr.Slider(label="Total Video Length (Seconds)", minimum=1, maximum=5, value=2, step=0.1)
443
- with gr.Group():
444
- with gr.Accordion("Advanced settings", open=False):
445
- use_teacache = gr.Checkbox(label='Use TeaCache', value=True, info='Faster speed, but often makes hands and fingers slightly worse.')
446
-
447
- n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=False) # Not used
448
- seed = gr.Number(label="Seed", value=31337, precision=0)
449
-
450
-
451
- latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, visible=False) # Should not change
452
- steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Changing this value is not recommended.')
453
-
454
- cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=False) # Should not change
455
- gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Changing this value is not recommended.')
456
- rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False) # Should not change
457
-
458
- gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
459
-
460
- mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")
461
-
462
- with gr.Accordion("Debug", open=False):
463
- input_image_debug = gr.Image(type="numpy", label="Image Debug", height=320)
464
- prompt_debug = gr.Textbox(label="Prompt Debug", value='')
465
- total_second_length_debug = gr.Slider(label="Additional Video Length to Generate (Seconds) Debug", minimum=1, maximum=120, value=5, step=0.1)
466
-
467
- with gr.Column():
468
- preview_image = gr.Image(label="Next Latents", height=200, visible=False)
469
- result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
470
- progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
471
- progress_bar = gr.HTML('', elem_classes='no-generating-animation')
472
-
473
- gr.HTML('<div style="text-align:center; margin-top:20px;">Share your results and find ideas at the <a href="https://x.com/search?q=framepack&f=live" target="_blank">FramePack Twitter (X) thread</a></div>')
474
-
475
- ips = [input_image, prompt, t2v, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf]
476
- start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
477
- end_button.click(fn=end_process)
478
-
479
- # gr.Examples(
480
- # examples,
481
- # inputs=[input_image, prompt],
482
- # outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
483
- # fn=generate_examples,
484
- # cache_examples=True
485
- # )
486
-
487
- with gr.Row(visible=False):
488
- gr.Examples(
489
- examples = [
490
- [
491
- "./img_examples/Example1.png", # input_image
492
- "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
493
- False, # t2v
494
- "", # n_prompt
495
- 42, # seed
496
- 1, # total_second_length
497
- 9, # latent_window_size
498
- 25, # steps
499
- 1.0, # cfg
500
- 10.0, # gs
501
- 0.0, # rs
502
- 6, # gpu_memory_preservation
503
- True, # use_teacache
504
- 16 # mp4_crf
505
- ],
506
- ],
507
- run_on_click = True,
508
- fn = process,
509
- inputs = ips,
510
- outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
511
- cache_examples = True,
512
- )
513
-
514
-
515
- def handle_field_debug_change(input_image_debug_data, prompt_debug_data, total_second_length_debug_data):
516
- global input_image_debug_value, prompt_debug_value, total_second_length_debug_value
517
- input_image_debug_value = input_image_debug_data
518
- prompt_debug_value = prompt_debug_data
519
- total_second_length_debug_value = total_second_length_debug_data
520
- return []
521
-
522
- input_image_debug.upload(
523
- fn=handle_field_debug_change,
524
- inputs=[input_image_debug, prompt_debug, total_second_length_debug],
525
- outputs=[]
526
- )
527
-
528
- prompt_debug.change(
529
- fn=handle_field_debug_change,
530
- inputs=[input_image_debug, prompt_debug, total_second_length_debug],
531
- outputs=[]
532
- )
533
-
534
- total_second_length_debug.change(
535
- fn=handle_field_debug_change,
536
- inputs=[input_image_debug, prompt_debug, total_second_length_debug],
537
- outputs=[]
538
- )
539
-
540
-
541
- block.launch(ssr_mode=False)
 
1
+ from diffusers_helper.hf_login import login
2
+
3
+ import os
4
+
5
+ os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
6
+
7
+ import gradio as gr
8
+ import torch
9
+ import traceback
10
+ import einops
11
+ import safetensors.torch as sf
12
+ import numpy as np
13
+ import math
14
+ import spaces
15
+
16
+ from PIL import Image
17
+ from diffusers import AutoencoderKLHunyuanVideo
18
+ from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
19
+ from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
20
+ from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
21
+ from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
22
+ from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
23
+ from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
24
+ from diffusers_helper.thread_utils import AsyncStream, async_run
25
+ from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
26
+ from transformers import SiglipImageProcessor, SiglipVisionModel
27
+ from diffusers_helper.clip_vision import hf_clip_vision_encode
28
+ from diffusers_helper.bucket_tools import find_nearest_bucket
29
+
30
+
31
+ free_mem_gb = get_cuda_free_memory_gb(gpu)
32
+ high_vram = free_mem_gb > 80
33
+
34
+ print(f'Free VRAM {free_mem_gb} GB')
35
+ print(f'High-VRAM Mode: {high_vram}')
36
+
37
+ text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
38
+ text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
39
+ tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
40
+ tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
41
+ vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()
42
+
43
+ feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
44
+ image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()
45
+
46
+ # quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True)
47
+ # transformer = HunyuanVideoTransformer3DModelPacked.from_single_file("https://huggingface.co/sirolim/FramePack_F1_I2V_FP8/resolve/main/FramePack_F1_I2V_HY_fp8_e4m3fn.safetensors", torch_dtype=torch.bfloat16)
48
+ # transformer = HunyuanVideoTransformer3DModelPacked.from_single_file('sirolim/FramePack_F1_I2V_FP8', "FramePack_F1_I2V_HY_fp8_e4m3fn.safetensors", use_safetensors=True, torch_dtype=torch.bfloat16).cpu()
49
+ transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu()
50
+
51
+ vae.eval()
52
+ text_encoder.eval()
53
+ text_encoder_2.eval()
54
+ image_encoder.eval()
55
+ transformer.eval()
56
+
57
+ if not high_vram:
58
+ vae.enable_slicing()
59
+ vae.enable_tiling()
60
+
61
+ transformer.high_quality_fp32_output_for_inference = True
62
+ print('transformer.high_quality_fp32_output_for_inference = True')
63
+
64
+ transformer.to(dtype=torch.bfloat16)
65
+ vae.to(dtype=torch.float16)
66
+ image_encoder.to(dtype=torch.float16)
67
+ text_encoder.to(dtype=torch.float16)
68
+ text_encoder_2.to(dtype=torch.float16)
69
+
70
+ vae.requires_grad_(False)
71
+ text_encoder.requires_grad_(False)
72
+ text_encoder_2.requires_grad_(False)
73
+ image_encoder.requires_grad_(False)
74
+ transformer.requires_grad_(False)
75
+
76
+ if not high_vram:
77
+ # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
78
+ DynamicSwapInstaller.install_model(transformer, device=gpu)
79
+ DynamicSwapInstaller.install_model(text_encoder, device=gpu)
80
+ else:
81
+ text_encoder.to(gpu)
82
+ text_encoder_2.to(gpu)
83
+ image_encoder.to(gpu)
84
+ vae.to(gpu)
85
+ transformer.to(gpu)
86
+
87
+ stream = AsyncStream()
88
+
89
+ outputs_folder = './outputs/'
90
+ os.makedirs(outputs_folder, exist_ok=True)
91
+
92
+ examples = [
93
+ ["img_examples/1.png", "The girl dances gracefully, with clear movements, full of charm.",],
94
+ ["img_examples/2.jpg", "The man dances flamboyantly, swinging his hips and striking bold poses with dramatic flair."],
95
+ ["img_examples/3.png", "The woman dances elegantly among the blossoms, spinning slowly with flowing sleeves and graceful hand movements."],
96
+ ]
97
+
98
+ input_image_debug_value = None
99
+ prompt_debug_value = None
100
+ total_second_length_debug_value = None
101
+
102
+ def generate_examples(input_image, prompt):
103
+
104
+ t2v=False
105
+ n_prompt=""
106
+ seed=31337
107
+ total_second_length=5
108
+ latent_window_size=9
109
+ steps=25
110
+ cfg=1.0
111
+ gs=10.0
112
+ rs=0.0
113
+ gpu_memory_preservation=6
114
+ use_teacache=True
115
+ mp4_crf=16
116
+
117
+ global stream
118
+
119
+ # assert input_image is not None, 'No input image!'
120
+ if t2v:
121
+ default_height, default_width = 640, 640
122
+ input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
123
+ print("No input image provided. Using a blank white image.")
124
+
125
+ yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
126
+
127
+ stream = AsyncStream()
128
+
129
+ async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf)
130
+
131
+ output_filename = None
132
+
133
+ while True:
134
+ flag, data = stream.output_queue.next()
135
+
136
+ if flag == 'file':
137
+ output_filename = data
138
+ yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
139
+
140
+ if flag == 'progress':
141
+ preview, desc, html = data
142
+ yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
143
+
144
+ if flag == 'end':
145
+ yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
146
+ break
147
+
148
+
149
+
150
+ @torch.no_grad()
151
+ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
152
+ total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
153
+ total_latent_sections = int(max(round(total_latent_sections), 1))
154
+
155
+ job_id = generate_timestamp()
156
+
157
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
158
+
159
+ try:
160
+ # Clean GPU
161
+ if not high_vram:
162
+ unload_complete_models(
163
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
164
+ )
165
+
166
+ # Text encoding
167
+
168
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
169
+
170
+ if not high_vram:
171
+ fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
172
+ load_model_as_complete(text_encoder_2, target_device=gpu)
173
+
174
+ llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
175
+
176
+ if cfg == 1:
177
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
178
+ else:
179
+ llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
180
+
181
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
182
+ llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
183
+
184
+ # Processing input image
185
+
186
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))
187
+
188
+ H, W, C = input_image.shape
189
+ height, width = find_nearest_bucket(H, W, resolution=640)
190
+ input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
191
+
192
+ Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
193
+
194
+ input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
195
+ input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
196
+
197
+ # VAE encoding
198
+
199
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
200
+
201
+ if not high_vram:
202
+ load_model_as_complete(vae, target_device=gpu)
203
+
204
+ start_latent = vae_encode(input_image_pt, vae)
205
+
206
+ # CLIP Vision
207
+
208
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
209
+
210
+ if not high_vram:
211
+ load_model_as_complete(image_encoder, target_device=gpu)
212
+
213
+ image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
214
+ image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
215
+
216
+ # Dtype
217
+
218
+ llama_vec = llama_vec.to(transformer.dtype)
219
+ llama_vec_n = llama_vec_n.to(transformer.dtype)
220
+ clip_l_pooler = clip_l_pooler.to(transformer.dtype)
221
+ clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
222
+ image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
223
+
224
+ # Sampling
225
+
226
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
227
+
228
+ rnd = torch.Generator("cpu").manual_seed(seed)
229
+
230
+ history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
231
+ history_pixels = None
232
+
233
+ history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
234
+ total_generated_latent_frames = 1
235
+
236
+ for section_index in range(total_latent_sections):
237
+ if stream.input_queue.top() == 'end':
238
+ stream.output_queue.push(('end', None))
239
+ return
240
+
241
+ print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
242
+
243
+ if not high_vram:
244
+ unload_complete_models()
245
+ move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
246
+
247
+ if use_teacache:
248
+ transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
249
+ else:
250
+ transformer.initialize_teacache(enable_teacache=False)
251
+
252
+ def callback(d):
253
+ preview = d['denoised']
254
+ preview = vae_decode_fake(preview)
255
+
256
+ preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
257
+ preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
258
+
259
+ if stream.input_queue.top() == 'end':
260
+ stream.output_queue.push(('end', None))
261
+ raise KeyboardInterrupt('User ends the task.')
262
+
263
+ current_step = d['i'] + 1
264
+ percentage = int(100.0 * current_step / steps)
265
+ hint = f'Sampling {current_step}/{steps}'
266
+ desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...'
267
+ stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
268
+ return
269
+
270
+ indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
271
+ clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
272
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
273
+
274
+ clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2)
275
+ clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
276
+
277
+ generated_latents = sample_hunyuan(
278
+ transformer=transformer,
279
+ sampler='unipc',
280
+ width=width,
281
+ height=height,
282
+ frames=latent_window_size * 4 - 3,
283
+ real_guidance_scale=cfg,
284
+ distilled_guidance_scale=gs,
285
+ guidance_rescale=rs,
286
+ # shift=3.0,
287
+ num_inference_steps=steps,
288
+ generator=rnd,
289
+ prompt_embeds=llama_vec,
290
+ prompt_embeds_mask=llama_attention_mask,
291
+ prompt_poolers=clip_l_pooler,
292
+ negative_prompt_embeds=llama_vec_n,
293
+ negative_prompt_embeds_mask=llama_attention_mask_n,
294
+ negative_prompt_poolers=clip_l_pooler_n,
295
+ device=gpu,
296
+ dtype=torch.bfloat16,
297
+ image_embeddings=image_encoder_last_hidden_state,
298
+ latent_indices=latent_indices,
299
+ clean_latents=clean_latents,
300
+ clean_latent_indices=clean_latent_indices,
301
+ clean_latents_2x=clean_latents_2x,
302
+ clean_latent_2x_indices=clean_latent_2x_indices,
303
+ clean_latents_4x=clean_latents_4x,
304
+ clean_latent_4x_indices=clean_latent_4x_indices,
305
+ callback=callback,
306
+ )
307
+
308
+ total_generated_latent_frames += int(generated_latents.shape[2])
309
+ history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
310
+
311
+ if not high_vram:
312
+ offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
313
+ load_model_as_complete(vae, target_device=gpu)
314
+
315
+ real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
316
+
317
+ if history_pixels is None:
318
+ history_pixels = vae_decode(real_history_latents, vae).cpu()
319
+ else:
320
+ section_latent_frames = latent_window_size * 2
321
+ overlapped_frames = latent_window_size * 4 - 3
322
+
323
+ current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
324
+ history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
325
+
326
+ if not high_vram:
327
+ unload_complete_models()
328
+
329
+ output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
330
+
331
+ save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)
332
+
333
+ print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
334
+
335
+ stream.output_queue.push(('file', output_filename))
336
+ except:
337
+ traceback.print_exc()
338
+
339
+ if not high_vram:
340
+ unload_complete_models(
341
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
342
+ )
343
+
344
+ stream.output_queue.push(('end', None))
345
+ return
346
+
347
+ def get_duration(input_image, prompt, t2v, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
348
+ global total_second_length_debug_value
349
+
350
+ if total_second_length_debug_value is not None:
351
+ return total_second_length_debug_value * 60
352
+ return total_second_length * 60
353
+
354
+ @spaces.GPU(duration=get_duration)
355
+ def process(input_image, prompt,
356
+ t2v=False,
357
+ n_prompt="",
358
+ seed=31337,
359
+ total_second_length=5,
360
+ latent_window_size=9,
361
+ steps=25,
362
+ cfg=1.0,
363
+ gs=10.0,
364
+ rs=0.0,
365
+ gpu_memory_preservation=6,
366
+ use_teacache=True,
367
+ mp4_crf=16
368
+ ):
369
+ global stream, input_image_debug_value, prompt_debug_value, total_second_length_debug_value
370
+
371
+ if input_image_debug_value is not None:
372
+ input_image = input_image_debug_value
373
+ input_image_debug_value = None
374
+
375
+ if prompt_debug_value is not None:
376
+ prompt = prompt_debug_value
377
+ prompt_debug_value = None
378
+
379
+ if total_second_length_debug_value is not None:
380
+ total_second_length = total_second_length_debug_value
381
+ total_second_length_debug_value = None
382
+
383
+ # assert input_image is not None, 'No input image!'
384
+ if t2v:
385
+ default_height, default_width = 640, 640
386
+ input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
387
+ print("No input image provided. Using a blank white image.")
388
+
389
+ yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
390
+
391
+ stream = AsyncStream()
392
+
393
+ async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf)
394
+
395
+ output_filename = None
396
+
397
+ while True:
398
+ flag, data = stream.output_queue.next()
399
+
400
+ if flag == 'file':
401
+ output_filename = data
402
+ yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
403
+
404
+ if flag == 'progress':
405
+ preview, desc, html = data
406
+ yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
407
+
408
+ if flag == 'end':
409
+ yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
410
+ break
411
+
412
+
413
+ def end_process():
414
+ stream.input_queue.push('end')
415
+
416
+
417
+ quick_prompts = [
418
+ 'The girl dances gracefully, with clear movements, full of charm.',
419
+ 'A character doing some simple body movements.',
420
+ ]
421
+ quick_prompts = [[x] for x in quick_prompts]
422
+
423
+
424
+ css = make_progress_bar_css()
425
+ block = gr.Blocks(css=css).queue()
426
+ with block:
427
+ gr.Markdown('# FramePack Essentials | Experimentation in Progress')
428
+ gr.Markdown(f"""### Space is constantly being tinkered with, expect downtime and errors.
429
+ """)
430
+ with gr.Row():
431
+ with gr.Column():
432
+ input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
433
+ prompt = gr.Textbox(label="Prompt", value='')
434
+ t2v = gr.Checkbox(label="do text-to-video", value=False)
435
+
436
+ with gr.Row():
437
+ start_button = gr.Button(value="Start Generation")
438
+ end_button = gr.Button(value="End Generation", interactive=False)
439
+
440
+ total_second_length = gr.Slider(label="Total Video Length (Seconds)", minimum=1, maximum=5, value=2, step=0.1)
441
+ with gr.Group():
442
+ with gr.Accordion("Advanced settings", open=False):
443
+ use_teacache = gr.Checkbox(label='Use TeaCache', value=True, info='Faster speed, but often makes hands and fingers slightly worse.')
444
+
445
+ n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=False) # Not used
446
+ seed = gr.Number(label="Seed", value=31337, precision=0)
447
+
448
+
449
+ latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, visible=False) # Should not change
450
+ steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Changing this value is not recommended.')
451
+
452
+ cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=False) # Should not change
453
+ gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Changing this value is not recommended.')
454
+ rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False) # Should not change
455
+
456
+ gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
457
+
458
+ mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")
459
+
460
+ with gr.Accordion("Debug", open=False):
461
+ input_image_debug = gr.Image(type="numpy", label="Image Debug", height=320)
462
+ prompt_debug = gr.Textbox(label="Prompt Debug", value='')
463
+ total_second_length_debug = gr.Slider(label="Additional Video Length to Generate (Seconds) Debug", minimum=1, maximum=120, value=5, step=0.1)
464
+
465
+ with gr.Column():
466
+ preview_image = gr.Image(label="Next Latents", height=200, visible=False)
467
+ result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
468
+ progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
469
+ progress_bar = gr.HTML('', elem_classes='no-generating-animation')
470
+
471
+ gr.HTML('<div style="text-align:center; margin-top:20px;">Share your results and find ideas at the <a href="https://x.com/search?q=framepack&f=live" target="_blank">FramePack Twitter (X) thread</a></div>')
472
+
473
+ ips = [input_image, prompt, t2v, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf]
474
+ start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
475
+ end_button.click(fn=end_process)
476
+
477
+ # gr.Examples(
478
+ # examples,
479
+ # inputs=[input_image, prompt],
480
+ # outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
481
+ # fn=generate_examples,
482
+ # cache_examples=True
483
+ # )
484
+
485
+ with gr.Row(visible=False):
486
+ gr.Examples(
487
+ examples = [
488
+ [
489
+ "./img_examples/Example1.png", # input_image
490
+ "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
491
+ False, # t2v
492
+ "", # n_prompt
493
+ 42, # seed
494
+ 1, # total_second_length
495
+ 9, # latent_window_size
496
+ 25, # steps
497
+ 1.0, # cfg
498
+ 10.0, # gs
499
+ 0.0, # rs
500
+ 6, # gpu_memory_preservation
501
+ True, # use_teacache
502
+ 16 # mp4_crf
503
+ ],
504
+ ],
505
+ run_on_click = True,
506
+ fn = process,
507
+ inputs = ips,
508
+ outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
509
+ cache_examples = True,
510
+ )
511
+
512
+
513
+ def handle_field_debug_change(input_image_debug_data, prompt_debug_data, total_second_length_debug_data):
514
+ global input_image_debug_value, prompt_debug_value, total_second_length_debug_value
515
+ input_image_debug_value = input_image_debug_data
516
+ prompt_debug_value = prompt_debug_data
517
+ total_second_length_debug_value = total_second_length_debug_data
518
+ return []
519
+
520
+ input_image_debug.upload(
521
+ fn=handle_field_debug_change,
522
+ inputs=[input_image_debug, prompt_debug, total_second_length_debug],
523
+ outputs=[]
524
+ )
525
+
526
+ prompt_debug.change(
527
+ fn=handle_field_debug_change,
528
+ inputs=[input_image_debug, prompt_debug, total_second_length_debug],
529
+ outputs=[]
530
+ )
531
+
532
+ total_second_length_debug.change(
533
+ fn=handle_field_debug_change,
534
+ inputs=[input_image_debug, prompt_debug, total_second_length_debug],
535
+ outputs=[]
536
+ )
537
+
538
+
539
+ block.launch(ssr_mode=False)
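
For reference, the change above keeps the hidden gr.Examples row that drives process() when an example is clicked, and drops the example_quick_prompts gr.Dataset and its click handler from the left column. Below is a minimal, self-contained sketch of both wiring patterns, assuming a toy single-argument stand-in for process so it runs without the model weights or example images:

import gradio as gr

def process(prompt):
    # Toy stand-in for the app's generator-based process(input_image, prompt, ...).
    return f"Would generate a video for: {prompt!r}"

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    result = gr.Textbox(label="Result")

    # Pattern removed by this commit: a clickable list that only fills the prompt textbox.
    quick_prompts = gr.Dataset(
        samples=[["The girl dances gracefully, with clear movements, full of charm."]],
        components=[prompt],
        label="Quick List",
    )
    quick_prompts.click(lambda x: x[0], inputs=[quick_prompts], outputs=prompt, queue=False)

    # Pattern kept in app.py (inside a hidden gr.Row): an example that also runs the
    # function when it is clicked.
    gr.Examples(
        examples=[["A character doing some simple body movements."]],
        inputs=[prompt],
        outputs=[result],
        fn=process,
        run_on_click=True,
    )

if __name__ == "__main__":
    demo.launch()

In app.py the gr.Examples call additionally passes cache_examples=True, so the example output is pre-computed once at startup instead of being generated on each click.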