CT-OI committed
Commit de00994 · verified · 1 Parent(s): 7370131

Update demo_gradio.py

Files changed (1)
  1. demo_gradio.py +71 -281
demo_gradio.py CHANGED
@@ -1,89 +1,85 @@
- from diffusers_helper.hf_login import login
-
  import os
-
- os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
-
- import gradio as gr
  import torch
- import traceback
- import einops
- import safetensors.torch as sf
  import numpy as np
- import argparse
- import math

  from PIL import Image
  from diffusers import AutoencoderKLHunyuanVideo
- from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
- from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
- from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
  from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
  from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
- from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
- from diffusers_helper.thread_utils import AsyncStream, async_run
- from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
- from transformers import SiglipImageProcessor, SiglipVisionModel
  from diffusers_helper.clip_vision import hf_clip_vision_encode
- from diffusers_helper.bucket_tools import find_nearest_bucket


  parser = argparse.ArgumentParser()
  parser.add_argument('--share', action='store_true')
- parser.add_argument("--server", type=str, default='0.0.0.0')
- parser.add_argument("--port", type=int, required=False)
- parser.add_argument("--inbrowser", action='store_true')
  args = parser.parse_args()

- # for win desktop probably use --server 127.0.0.1 --inbrowser
- # For linux server probably use --server 127.0.0.1 or do not use any cmd flags

  print(args)

  free_mem_gb = get_cuda_free_memory_gb(gpu)
  high_vram = free_mem_gb > 60
-
  print(f'Free VRAM {free_mem_gb} GB')
  print(f'High-VRAM Mode: {high_vram}')

  text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
  text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
  tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
  tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
  vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()
-
  feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
  image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()
-
  transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePackI2V_HY', torch_dtype=torch.bfloat16).cpu()

- vae.eval()
- text_encoder.eval()
- text_encoder_2.eval()
- image_encoder.eval()
- transformer.eval()

  if not high_vram:
      vae.enable_slicing()
      vae.enable_tiling()

  transformer.high_quality_fp32_output_for_inference = True
- print('transformer.high_quality_fp32_output_for_inference = True')
-
  transformer.to(dtype=torch.bfloat16)
  vae.to(dtype=torch.float16)
  image_encoder.to(dtype=torch.float16)
  text_encoder.to(dtype=torch.float16)
  text_encoder_2.to(dtype=torch.float16)

- vae.requires_grad_(False)
- text_encoder.requires_grad_(False)
- text_encoder_2.requires_grad_(False)
- image_encoder.requires_grad_(False)
- transformer.requires_grad_(False)

  if not high_vram:
-     # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
      DynamicSwapInstaller.install_model(transformer, device=gpu)
      DynamicSwapInstaller.install_model(text_encoder, device=gpu)
  else:
@@ -94,267 +90,65 @@ else:
      transformer.to(gpu)

  stream = AsyncStream()
-
  outputs_folder = './outputs/'
  os.makedirs(outputs_folder, exist_ok=True)


- @torch.no_grad()
- def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache):
-     total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
-     total_latent_sections = int(max(round(total_latent_sections), 1))
-
-     job_id = generate_timestamp()
-
-     stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
-
-     try:
-         # Clean GPU
-         if not high_vram:
-             unload_complete_models(
-                 text_encoder, text_encoder_2, image_encoder, vae, transformer
-             )
-
-         # Text encoding
-
-         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
-
-         if not high_vram:
-             fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
-             load_model_as_complete(text_encoder_2, target_device=gpu)
-
-         llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
-
-         if cfg == 1:
-             llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
-         else:
-             llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
-
-         llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
-         llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
-
-         # Processing input image
-
-         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))
-
-         H, W, C = input_image.shape
-         height, width = find_nearest_bucket(H, W, resolution=640)
-         input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
-
-         Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
-
-         input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
-         input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
-
-         # VAE encoding
-
-         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
-
-         if not high_vram:
-             load_model_as_complete(vae, target_device=gpu)
-
-         start_latent = vae_encode(input_image_pt, vae)
-
-         # CLIP Vision
-
-         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
-
-         if not high_vram:
-             load_model_as_complete(image_encoder, target_device=gpu)
-
-         image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
-         image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
-
-         # Dtype
-
-         llama_vec = llama_vec.to(transformer.dtype)
-         llama_vec_n = llama_vec_n.to(transformer.dtype)
-         clip_l_pooler = clip_l_pooler.to(transformer.dtype)
-         clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
-         image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
-
-         # Sampling
-
-         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
-
-         rnd = torch.Generator("cpu").manual_seed(seed)
-         num_frames = latent_window_size * 4 - 3
-
-         history_latents = torch.zeros(size=(1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32).cpu()
-         history_pixels = None
-         total_generated_latent_frames = 0
-
-         latent_paddings = reversed(range(total_latent_sections))
-
-         if total_latent_sections > 4:
-             # In theory the latent_paddings should follow the above sequence, but it seems that duplicating some
-             # items looks better than expanding it when total_latent_sections > 4
-             # One can try to remove below trick and just
-             # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
-             latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
-
-         for latent_padding in latent_paddings:
-             is_last_section = latent_padding == 0
-             latent_padding_size = latent_padding * latent_window_size
-
-             if stream.input_queue.top() == 'end':
-                 stream.output_queue.push(('end', None))
-                 return
-
-             print(f'latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}')
-
-             indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
-             clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
-             clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
-
-             clean_latents_pre = start_latent.to(history_latents)
-             clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
-             clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
-
-             if not high_vram:
-                 unload_complete_models()
-                 move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
-
-             if use_teacache:
-                 transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
-             else:
-                 transformer.initialize_teacache(enable_teacache=False)
-
-             def callback(d):
-                 preview = d['denoised']
-                 preview = vae_decode_fake(preview)
-
-                 preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
-                 preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
-
-                 if stream.input_queue.top() == 'end':
-                     stream.output_queue.push(('end', None))
-                     raise KeyboardInterrupt('User ends the task.')
-
-                 current_step = d['i'] + 1
-                 percentage = int(100.0 * current_step / steps)
-                 hint = f'Sampling {current_step}/{steps}'
-                 desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...'
-                 stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
-                 return
-
-             generated_latents = sample_hunyuan(
-                 transformer=transformer,
-                 sampler='unipc',
-                 width=width,
-                 height=height,
-                 frames=num_frames,
-                 real_guidance_scale=cfg,
-                 distilled_guidance_scale=gs,
-                 guidance_rescale=rs,
-                 # shift=3.0,
-                 num_inference_steps=steps,
-                 generator=rnd,
-                 prompt_embeds=llama_vec,
-                 prompt_embeds_mask=llama_attention_mask,
-                 prompt_poolers=clip_l_pooler,
-                 negative_prompt_embeds=llama_vec_n,
-                 negative_prompt_embeds_mask=llama_attention_mask_n,
-                 negative_prompt_poolers=clip_l_pooler_n,
-                 device=gpu,
-                 dtype=torch.bfloat16,
-                 image_embeddings=image_encoder_last_hidden_state,
-                 latent_indices=latent_indices,
-                 clean_latents=clean_latents,
-                 clean_latent_indices=clean_latent_indices,
-                 clean_latents_2x=clean_latents_2x,
-                 clean_latent_2x_indices=clean_latent_2x_indices,
-                 clean_latents_4x=clean_latents_4x,
-                 clean_latent_4x_indices=clean_latent_4x_indices,
-                 callback=callback,
-             )
-
-             if is_last_section:
-                 generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
-
-             total_generated_latent_frames += int(generated_latents.shape[2])
-             history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
-
-             if not high_vram:
-                 offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
-                 load_model_as_complete(vae, target_device=gpu)
-
-             real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
-
-             if history_pixels is None:
-                 history_pixels = vae_decode(real_history_latents, vae).cpu()
-             else:
-                 section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
-                 overlapped_frames = latent_window_size * 4 - 3
-
-                 current_pixels = vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu()
-                 history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
-
-             if not high_vram:
-                 unload_complete_models()
-
-             output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
-
-             save_bcthw_as_mp4(history_pixels, output_filename, fps=30)
-
-             print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
-
-             stream.output_queue.push(('file', output_filename))
-
-             if is_last_section:
-                 break
-     except:
-         traceback.print_exc()
-
-         if not high_vram:
-             unload_complete_models(
-                 text_encoder, text_encoder_2, image_encoder, vae, transformer
-             )
-
-     stream.output_queue.push(('end', None))
-     return
-

  def process(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache):
      global stream
      assert input_image is not None, 'No input image!'
-
      yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)

      stream = AsyncStream()
-
      async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache)

      output_filename = None

      while True:
          flag, data = stream.output_queue.next()
-
          if flag == 'file':
              output_filename = data
              yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
-
          if flag == 'progress':
              preview, desc, html = data
              yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
-
          if flag == 'end':
              yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
              break

-
- def end_process():
-     stream.input_queue.push('end')
-
-
  quick_prompts = [
      'The girl dances gracefully, with clear movements, full of charm.',
      'A character doing some simple body movements.',
  ]
  quick_prompts = [[x] for x in quick_prompts]

-
  css = make_progress_bar_css()
  block = gr.Blocks(css=css).queue()
  with block:
      gr.Markdown('# FramePack')
@@ -370,32 +164,28 @@ with block:
                end_button = gr.Button(value="End Generation", interactive=False)

            with gr.Group():
-                 use_teacache = gr.Checkbox(label='Use TeaCache', value=True, info='Faster speed, but often makes hands and fingers slightly worse.')
-
-                 n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=False) # Not used
                seed = gr.Number(label="Seed", value=31337, precision=0)
-
                total_second_length = gr.Slider(label="Total Video Length (Seconds)", minimum=1, maximum=120, value=5, step=0.1)
-                 latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, visible=False) # Should not change
-                 steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Changing this value is not recommended.')
-
-                 cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=False) # Should not change
-                 gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Changing this value is not recommended.')
-                 rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False) # Should not change
-
-                 gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")

        with gr.Column():
            preview_image = gr.Image(label="Next Latents", height=200, visible=False)
            result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
-             gr.Markdown('Note that the ending actions will be generated before the starting actions due to the inverted sampling. If the starting action is not in the video, you just need to wait, and it will be generated later.')
            progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
            progress_bar = gr.HTML('', elem_classes='no-generating-animation')

    ips = [input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache]
    start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
    end_button.click(fn=end_process)

-
  block.launch(
      server_name=args.server,
      server_port=args.port,
 
 
 
  import os
+ import argparse
  import torch
+ import gradio as gr
  import numpy as np
+ import einops
+ import traceback

  from PIL import Image
  from diffusers import AutoencoderKLHunyuanVideo
+ from transformers import (
+     LlamaModel, CLIPTextModel,
+     LlamaTokenizerFast, CLIPTokenizer,
+     SiglipImageProcessor, SiglipVisionModel
+ )
+
+ from diffusers_helper.hf_login import login
+ from diffusers_helper.hunyuan import (
+     encode_prompt_conds, vae_decode, vae_encode,
+     vae_decode_fake
+ )
+ from diffusers_helper.utils import (
+     save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw,
+     resize_and_center_crop, generate_timestamp
+ )
  from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
  from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
+ from diffusers_helper.memory import (
+     gpu, get_cuda_free_memory_gb, unload_complete_models, load_model_as_complete,
+     DynamicSwapInstaller, move_model_to_device_with_memory_preservation,
+     offload_model_from_device_for_memory_preservation, fake_diffusers_current_device
+ )
  from diffusers_helper.clip_vision import hf_clip_vision_encode
+ from diffusers_helper.thread_utils import AsyncStream, async_run


+ # --- Args and config ---
  parser = argparse.ArgumentParser()
  parser.add_argument('--share', action='store_true')
+ parser.add_argument('--server', type=str, default='0.0.0.0')
+ parser.add_argument('--port', type=int, required=False)
+ parser.add_argument('--inbrowser', action='store_true')
  args = parser.parse_args()

+ os.environ['HF_HOME'] = os.path.abspath(
+     os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download'))
+ )

  print(args)

  free_mem_gb = get_cuda_free_memory_gb(gpu)
  high_vram = free_mem_gb > 60
  print(f'Free VRAM {free_mem_gb} GB')
  print(f'High-VRAM Mode: {high_vram}')

+ # --- Load models ---
  text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
  text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
  tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
  tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
  vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()
  feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
  image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()
  transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePackI2V_HY', torch_dtype=torch.bfloat16).cpu()

+ vae.eval(), text_encoder.eval(), text_encoder_2.eval(), image_encoder.eval(), transformer.eval()

  if not high_vram:
      vae.enable_slicing()
      vae.enable_tiling()

  transformer.high_quality_fp32_output_for_inference = True
  transformer.to(dtype=torch.bfloat16)
  vae.to(dtype=torch.float16)
  image_encoder.to(dtype=torch.float16)
  text_encoder.to(dtype=torch.float16)
  text_encoder_2.to(dtype=torch.float16)

+ for model in [vae, text_encoder, text_encoder_2, image_encoder, transformer]:
+     model.requires_grad_(False)

  if not high_vram:
      DynamicSwapInstaller.install_model(transformer, device=gpu)
      DynamicSwapInstaller.install_model(text_encoder, device=gpu)
  else:
      transformer.to(gpu)

  stream = AsyncStream()
  outputs_folder = './outputs/'
  os.makedirs(outputs_folder, exist_ok=True)

+ # --- UI + CSS ---
+ def make_progress_bar_css():
+     return """
+     body, .gradio-container {
+         background-color: #000000 !important;
+         color: #FFFFFF !important;
+     }
+     .gr-button, .gr-input, .gr-textbox, .gr-slider, .gr-checkbox {
+         background-color: #1a1a1a !important;
+         color: #ffffff !important;
+         border-color: #444 !important;
+     }
+     .gr-button:hover {
+         background-color: #333 !important;
+     }
+     .gr-markdown {
+         color: #ddd !important;
+     }
+     .gr-image-preview, .gr-video {
+         background-color: #111 !important;
+     }
+     """

+ def end_process():
+     stream.input_queue.push('end')

  def process(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache):
      global stream
      assert input_image is not None, 'No input image!'
      yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)

      stream = AsyncStream()
      async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache)

      output_filename = None

      while True:
          flag, data = stream.output_queue.next()
          if flag == 'file':
              output_filename = data
              yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
          if flag == 'progress':
              preview, desc, html = data
              yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
          if flag == 'end':
              yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
              break

  quick_prompts = [
      'The girl dances gracefully, with clear movements, full of charm.',
      'A character doing some simple body movements.',
  ]
  quick_prompts = [[x] for x in quick_prompts]

  css = make_progress_bar_css()
+
  block = gr.Blocks(css=css).queue()
  with block:
      gr.Markdown('# FramePack')
                end_button = gr.Button(value="End Generation", interactive=False)

            with gr.Group():
+                 use_teacache = gr.Checkbox(label='Use TeaCache', value=True)
+                 n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=False)
                seed = gr.Number(label="Seed", value=31337, precision=0)
                total_second_length = gr.Slider(label="Total Video Length (Seconds)", minimum=1, maximum=120, value=5, step=0.1)
+                 latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, visible=False)
+                 steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1)
+                 cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=False)
+                 gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01)
+                 rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False)
+                 gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB)", minimum=6, maximum=128, value=6, step=0.1)

        with gr.Column():
            preview_image = gr.Image(label="Next Latents", height=200, visible=False)
            result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
+             gr.Markdown('Note: The ending actions are generated before the start. Wait for full video.')
            progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
            progress_bar = gr.HTML('', elem_classes='no-generating-animation')
+
    ips = [input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache]
    start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
    end_button.click(fn=end_process)

  block.launch(
      server_name=args.server,
      server_port=args.port,