Fabrice-TIERCELIN commited on
Commit
fa7bcd2
·
verified ·
1 Parent(s): 28b55f1

Upload 5 files

Browse files
Files changed (4) hide show
  1. README.md +12 -19
  2. app.py +698 -979
  3. app_v2v.py +774 -779
  4. requirements.txt +22 -40
README.md CHANGED
@@ -1,21 +1,14 @@
1
  ---
2
- title: SUPIR Image Upscaler
 
 
 
3
  sdk: gradio
4
- emoji: 📷
5
- sdk_version: 4.38.1
6
- app_file: app.py
7
- license: mit
8
- colorFrom: blue
9
- colorTo: pink
10
- tags:
11
- - Upscaling
12
- - Restoring
13
- - Image-to-Image
14
- - Image-2-Image
15
- - Img-to-Img
16
- - Img-2-Img
17
- - language models
18
- - LLMs
19
- short_description: Restore blurred or small images with prompt
20
- suggested_hardware: zero-a10g
21
- ---
 
1
  ---
2
+ title: FramePack F1 + V2V + EF
3
+ emoji: 👽
4
+ colorFrom: pink
5
+ colorTo: gray
6
  sdk: gradio
7
+ sdk_version: 5.29.1
8
+ app_file: app_v2v.py
9
+ pinned: true
10
+ license: apache-2.0
11
+ short_description: fast video generation from images & text
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,1036 +1,755 @@
 
 
1
  import os
 
 
 
 
2
  import gradio as gr
3
- import argparse
4
- import numpy as np
5
  import torch
 
6
  import einops
7
- import copy
8
- import math
9
- import time
10
  import random
11
- import spaces
12
- import re
13
- import uuid
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- from gradio_imageslider import ImageSlider
16
  from PIL import Image
17
- from SUPIR.util import HWC3, upscale_image, fix_resize, convert_dtype, create_SUPIR_model, load_QF_ckpt
18
- from huggingface_hub import hf_hub_download
19
- from pillow_heif import register_heif_opener
20
-
21
- register_heif_opener()
22
-
23
- max_64_bit_int = np.iinfo(np.int32).max
24
-
25
- hf_hub_download(repo_id="laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", filename="open_clip_pytorch_model.bin", local_dir="laion_CLIP-ViT-bigG-14-laion2B-39B-b160k")
26
- hf_hub_download(repo_id="camenduru/SUPIR", filename="sd_xl_base_1.0_0.9vae.safetensors", local_dir="yushan777_SUPIR")
27
- hf_hub_download(repo_id="camenduru/SUPIR", filename="SUPIR-v0F.ckpt", local_dir="yushan777_SUPIR")
28
- hf_hub_download(repo_id="camenduru/SUPIR", filename="SUPIR-v0Q.ckpt", local_dir="yushan777_SUPIR")
29
- hf_hub_download(repo_id="RunDiffusion/Juggernaut-XL-Lightning", filename="Juggernaut_RunDiffusionPhoto2_Lightning_4Steps.safetensors", local_dir="RunDiffusion_Juggernaut-XL-Lightning")
30
-
31
- parser = argparse.ArgumentParser()
32
- parser.add_argument("--opt", type=str, default='options/SUPIR_v0.yaml')
33
- parser.add_argument("--ip", type=str, default='127.0.0.1')
34
- parser.add_argument("--port", type=int, default='6688')
35
- parser.add_argument("--no_llava", action='store_true', default=True)#False
36
- parser.add_argument("--use_image_slider", action='store_true', default=False)#False
37
- parser.add_argument("--log_history", action='store_true', default=False)
38
- parser.add_argument("--loading_half_params", action='store_true', default=False)#False
39
- parser.add_argument("--use_tile_vae", action='store_true', default=True)#False
40
- parser.add_argument("--encoder_tile_size", type=int, default=512)
41
- parser.add_argument("--decoder_tile_size", type=int, default=64)
42
- parser.add_argument("--load_8bit_llava", action='store_true', default=False)
43
- args = parser.parse_args()
44
 
45
  if torch.cuda.device_count() > 0:
46
- SUPIR_device = 'cuda:0'
47
-
48
- # Load SUPIR
49
- model, default_setting = create_SUPIR_model(args.opt, SUPIR_sign='Q', load_default_setting=True)
50
- if args.loading_half_params:
51
- model = model.half()
52
- if args.use_tile_vae:
53
- model.init_tile_vae(encoder_tile_size=args.encoder_tile_size, decoder_tile_size=args.decoder_tile_size)
54
- model = model.to(SUPIR_device)
55
- model.first_stage_model.denoise_encoder_s1 = copy.deepcopy(model.first_stage_model.denoise_encoder)
56
- model.current_model = 'v0-Q'
57
- ckpt_Q, ckpt_F = load_QF_ckpt(args.opt)
58
-
59
- def check_upload(input_image):
60
- if input_image is None:
61
- raise gr.Error("Please provide an image to restore.")
62
- return gr.update(visible = True)
63
-
64
- def update_seed(is_randomize_seed, seed):
65
- if is_randomize_seed:
66
- return random.randint(0, max_64_bit_int)
67
- return seed
68
-
69
- def reset():
70
- return [
71
- None,
72
- 0,
73
- None,
74
- None,
75
- "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, hyper detailed photo - realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, skin pore detailing, hyper sharpness, perfect without deformations.",
76
- "painting, oil painting, illustration, drawing, art, sketch, anime, cartoon, CG Style, 3D render, unreal engine, blurring, aliasing, pixel, unsharp, weird textures, ugly, dirty, messy, worst quality, low quality, frames, watermark, signature, jpeg artifacts, deformed, lowres, over-smooth",
77
- 1,
78
- 1024,
79
- 1,
80
- 2,
81
- 50,
82
- -1.0,
83
- 1.,
84
- default_setting.s_cfg_Quality if torch.cuda.device_count() > 0 else 1.0,
85
- True,
86
- random.randint(0, max_64_bit_int),
87
- 5,
88
- 1.003,
89
- "Wavelet",
90
- "fp32",
91
- "fp32",
92
- 1.0,
93
- True,
94
- False,
95
- default_setting.spt_linear_CFG_Quality if torch.cuda.device_count() > 0 else 1.0,
96
- 0.,
97
- "v0-Q",
98
- "input",
99
- 179
100
- ]
101
-
102
- def check_and_update(input_image):
103
- if input_image is None:
104
- raise gr.Error("Please provide an image to restore.")
105
- return gr.update(visible = True)
106
-
107
- @spaces.GPU(duration=420)
108
- def stage1_process(
109
- input_image,
110
- gamma_correction,
111
- diff_dtype,
112
- ae_dtype
113
- ):
114
- print('stage1_process ==>>')
115
- if torch.cuda.device_count() == 0:
116
- gr.Warning('Set this space to GPU config to make it work.')
117
- return None, None
118
- torch.cuda.set_device(SUPIR_device)
119
- LQ = HWC3(np.array(Image.open(input_image)))
120
- LQ = fix_resize(LQ, 512)
121
- # stage1
122
- LQ = np.array(LQ) / 255 * 2 - 1
123
- LQ = torch.tensor(LQ, dtype=torch.float32).permute(2, 0, 1).unsqueeze(0).to(SUPIR_device)[:, :3, :, :]
124
-
125
- model.ae_dtype = convert_dtype(ae_dtype)
126
- model.model.dtype = convert_dtype(diff_dtype)
127
-
128
- LQ = model.batchify_denoise(LQ, is_stage1=True)
129
- LQ = (LQ[0].permute(1, 2, 0) * 127.5 + 127.5).cpu().numpy().round().clip(0, 255).astype(np.uint8)
130
- # gamma correction
131
- LQ = LQ / 255.0
132
- LQ = np.power(LQ, gamma_correction)
133
- LQ *= 255.0
134
- LQ = LQ.round().clip(0, 255).astype(np.uint8)
135
- print('<<== stage1_process')
136
- return LQ, gr.update(visible = True)
137
-
138
- def stage2_process_example(*args, **kwargs):
139
- [result_slider, result_gallery, restore_information, reset_btn] = restore_in_Xmin(*args, **kwargs)
140
- return [result_slider, restore_information, reset_btn]
141
-
142
- def stage2_process(*args, **kwargs):
143
  try:
144
- return restore_in_Xmin(*args, **kwargs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  except Exception as e:
146
- # NO_GPU_MESSAGE_INQUEUE
147
- print("gradio.exceptions.Error 'No GPU is currently available for you after 60s'")
148
- print('str(type(e)): ' + str(type(e))) # <class 'gradio.exceptions.Error'>
149
- print('str(e): ' + str(e)) # You have exceeded your GPU quota...
150
- try:
151
- print('e.message: ' + e.message) # No GPU is currently available for you after 60s
152
- except Exception as e2:
153
- print('Failure')
154
- if str(e).startswith("No GPU is currently available for you after 60s"):
155
- print('Exception identified!!!')
156
- #if str(type(e)) == "<class 'gradio.exceptions.Error'>":
157
- #print('Exception of name ' + type(e).__name__)
158
- raise e
159
-
160
- def restore_in_Xmin(
161
- noisy_image,
162
- rotation,
163
- denoise_image,
164
- prompt,
165
- a_prompt,
166
- n_prompt,
167
- num_samples,
168
- min_size,
169
- downscale,
170
- upscale,
171
- edm_steps,
172
- s_stage1,
173
- s_stage2,
174
- s_cfg,
175
- randomize_seed,
176
- seed,
177
- s_churn,
178
- s_noise,
179
- color_fix_type,
180
- diff_dtype,
181
- ae_dtype,
182
- gamma_correction,
183
- linear_CFG,
184
- linear_s_stage2,
185
- spt_linear_CFG,
186
- spt_linear_s_stage2,
187
- model_select,
188
- output_format,
189
- allocation
190
- ):
191
- print("noisy_image:\n" + str(noisy_image))
192
- print("denoise_image:\n" + str(denoise_image))
193
- print("rotation: " + str(rotation))
194
- print("prompt: " + str(prompt))
195
- print("a_prompt: " + str(a_prompt))
196
- print("n_prompt: " + str(n_prompt))
197
- print("num_samples: " + str(num_samples))
198
- print("min_size: " + str(min_size))
199
- print("downscale: " + str(downscale))
200
- print("upscale: " + str(upscale))
201
- print("edm_steps: " + str(edm_steps))
202
- print("s_stage1: " + str(s_stage1))
203
- print("s_stage2: " + str(s_stage2))
204
- print("s_cfg: " + str(s_cfg))
205
- print("randomize_seed: " + str(randomize_seed))
206
- print("seed: " + str(seed))
207
- print("s_churn: " + str(s_churn))
208
- print("s_noise: " + str(s_noise))
209
- print("color_fix_type: " + str(color_fix_type))
210
- print("diff_dtype: " + str(diff_dtype))
211
- print("ae_dtype: " + str(ae_dtype))
212
- print("gamma_correction: " + str(gamma_correction))
213
- print("linear_CFG: " + str(linear_CFG))
214
- print("linear_s_stage2: " + str(linear_s_stage2))
215
- print("spt_linear_CFG: " + str(spt_linear_CFG))
216
- print("spt_linear_s_stage2: " + str(spt_linear_s_stage2))
217
- print("model_select: " + str(model_select))
218
- print("GPU time allocation: " + str(allocation) + " min")
219
- print("output_format: " + str(output_format))
220
-
221
- input_format = re.sub(r"^.*\.([^\.]+)$", r"\1", noisy_image)
222
-
223
- if input_format not in ['png', 'webp', 'jpg', 'jpeg', 'gif', 'bmp', 'heic']:
224
- gr.Warning('Invalid image format. Please first convert into *.png, *.webp, *.jpg, *.jpeg, *.gif, *.bmp or *.heic.')
225
- return None, None, None, None
226
-
227
- if output_format == "input":
228
- if noisy_image is None:
229
- output_format = "png"
230
  else:
231
- output_format = input_format
232
- print("final output_format: " + str(output_format))
 
 
 
233
 
234
- if prompt is None:
235
- prompt = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
- if a_prompt is None:
238
- a_prompt = ""
239
 
240
- if n_prompt is None:
241
- n_prompt = ""
 
 
 
242
 
243
- if prompt != "" and a_prompt != "":
244
- a_prompt = prompt + ", " + a_prompt
245
- else:
246
- a_prompt = prompt + a_prompt
247
- print("Final prompt: " + str(a_prompt))
248
 
249
- denoise_image = np.array(Image.open(noisy_image if denoise_image is None else denoise_image))
250
 
251
- if rotation == 90:
252
- denoise_image = np.array(list(zip(*denoise_image[::-1])))
253
- elif rotation == 180:
254
- denoise_image = np.array(list(zip(*denoise_image[::-1])))
255
- denoise_image = np.array(list(zip(*denoise_image[::-1])))
256
- elif rotation == -90:
257
- denoise_image = np.array(list(zip(*denoise_image))[::-1])
258
 
259
- if 1 < downscale:
260
- input_height, input_width, input_channel = denoise_image.shape
261
- denoise_image = np.array(Image.fromarray(denoise_image).resize((input_width // downscale, input_height // downscale), Image.LANCZOS))
 
 
 
262
 
263
- denoise_image = HWC3(denoise_image)
264
 
265
- if torch.cuda.device_count() == 0:
266
- gr.Warning('Set this space to GPU config to make it work.')
267
- return [noisy_image, denoise_image], gr.update(label="Downloadable results in *." + output_format + " format", format = output_format, value = [denoise_image]), None, gr.update(visible=True)
268
 
269
- if model_select != model.current_model:
270
- print('load ' + model_select)
271
- if model_select == 'v0-Q':
272
- model.load_state_dict(ckpt_Q, strict=False)
273
- elif model_select == 'v0-F':
274
- model.load_state_dict(ckpt_F, strict=False)
275
- model.current_model = model_select
276
 
277
- model.ae_dtype = convert_dtype(ae_dtype)
278
- model.model.dtype = convert_dtype(diff_dtype)
 
 
279
 
280
- return restore_on_gpu(
281
- noisy_image, denoise_image, prompt, a_prompt, n_prompt, num_samples, min_size, downscale, upscale, edm_steps, s_stage1, s_stage2, s_cfg, randomize_seed, seed, s_churn, s_noise, color_fix_type, diff_dtype, ae_dtype, gamma_correction, linear_CFG, linear_s_stage2, spt_linear_CFG, spt_linear_s_stage2, model_select, output_format, allocation
282
- )
283
 
284
- def get_duration(
285
- noisy_image,
286
- input_image,
287
- prompt,
288
- a_prompt,
289
- n_prompt,
290
- num_samples,
291
- min_size,
292
- downscale,
293
- upscale,
294
- edm_steps,
295
- s_stage1,
296
- s_stage2,
297
- s_cfg,
298
- randomize_seed,
299
- seed,
300
- s_churn,
301
- s_noise,
302
- color_fix_type,
303
- diff_dtype,
304
- ae_dtype,
305
- gamma_correction,
306
- linear_CFG,
307
- linear_s_stage2,
308
- spt_linear_CFG,
309
- spt_linear_s_stage2,
310
- model_select,
311
- output_format,
312
- allocation
313
- ):
314
- return allocation
315
 
316
- @spaces.GPU(duration=get_duration)
317
- def restore_on_gpu(
318
- noisy_image,
319
- input_image,
320
- prompt,
321
- a_prompt,
322
- n_prompt,
323
- num_samples,
324
- min_size,
325
- downscale,
326
- upscale,
327
- edm_steps,
328
- s_stage1,
329
- s_stage2,
330
- s_cfg,
331
- randomize_seed,
332
- seed,
333
- s_churn,
334
- s_noise,
335
- color_fix_type,
336
- diff_dtype,
337
- ae_dtype,
338
- gamma_correction,
339
- linear_CFG,
340
- linear_s_stage2,
341
- spt_linear_CFG,
342
- spt_linear_s_stage2,
343
- model_select,
344
- output_format,
345
- allocation
346
- ):
347
- start = time.time()
348
- print('restore ==>>')
349
-
350
- torch.cuda.set_device(SUPIR_device)
351
-
352
- with torch.no_grad():
353
- input_image = upscale_image(input_image, upscale, unit_resolution=32, min_size=min_size)
354
- LQ = np.array(input_image) / 255.0
355
- LQ = np.power(LQ, gamma_correction)
356
- LQ *= 255.0
357
- LQ = LQ.round().clip(0, 255).astype(np.uint8)
358
- LQ = LQ / 255 * 2 - 1
359
- LQ = torch.tensor(LQ, dtype=torch.float32).permute(2, 0, 1).unsqueeze(0).to(SUPIR_device)[:, :3, :, :]
360
- captions = ['']
361
-
362
- samples = model.batchify_sample(LQ, captions, num_steps=edm_steps, restoration_scale=s_stage1, s_churn=s_churn,
363
- s_noise=s_noise, cfg_scale=s_cfg, control_scale=s_stage2, seed=seed,
364
- num_samples=num_samples, p_p=a_prompt, n_p=n_prompt, color_fix_type=color_fix_type,
365
- use_linear_CFG=linear_CFG, use_linear_control_scale=linear_s_stage2,
366
- cfg_scale_start=spt_linear_CFG, control_scale_start=spt_linear_s_stage2)
367
-
368
- x_samples = (einops.rearrange(samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().round().clip(
369
- 0, 255).astype(np.uint8)
370
- results = [x_samples[i] for i in range(num_samples)]
371
- torch.cuda.empty_cache()
372
-
373
- # All the results have the same size
374
- input_height, input_width, input_channel = np.array(input_image).shape
375
- result_height, result_width, result_channel = np.array(results[0]).shape
376
-
377
- print('<<== restore')
378
- end = time.time()
379
- secondes = int(end - start)
380
- minutes = math.floor(secondes / 60)
381
- secondes = secondes - (minutes * 60)
382
- hours = math.floor(minutes / 60)
383
- minutes = minutes - (hours * 60)
384
- information = ("Start the process again if you want a different result. " if randomize_seed else "") + \
385
- "If you don't get the image you wanted, add more details in the « Image description ». " + \
386
- "The image" + (" has" if len(results) == 1 else "s have") + " been generated in " + \
387
- ((str(hours) + " h, ") if hours != 0 else "") + \
388
- ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
389
- str(secondes) + " sec. " + \
390
- "The new image resolution is " + str(result_width) + \
391
- " pixels large and " + str(result_height) + \
392
- " pixels high, so a resolution of " + f'{result_width * result_height:,}' + " pixels."
393
- print(information)
394
- try:
395
- print("Initial resolution: " + f'{input_width * input_height:,}')
396
- print("Final resolution: " + f'{result_width * result_height:,}')
397
- print("edm_steps: " + str(edm_steps))
398
- print("num_samples: " + str(num_samples))
399
- print("downscale: " + str(downscale))
400
- print("Estimated minutes: " + f'{(((result_width * result_height**(1/1.75)) * input_width * input_height * (edm_steps**(1/2)) * (num_samples**(1/2.5)))**(1/2.5)) / 25000:,}')
401
- except Exception as e:
402
- print('Exception of Estimation')
403
 
404
- # Only one image can be shown in the slider
405
- return [noisy_image] + [results[0]], gr.update(label="Downloadable results in *." + output_format + " format", format = output_format, value = results), gr.update(value = information, visible = True), gr.update(visible=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
 
407
- def load_and_reset(param_setting):
408
- print('load_and_reset ==>>')
409
  if torch.cuda.device_count() == 0:
410
  gr.Warning('Set this space to GPU config to make it work.')
411
- return None, None, None, None, None, None, None, None, None, None, None, None, None, None
412
- edm_steps = default_setting.edm_steps
413
- s_stage2 = 1.0
414
- s_stage1 = -1.0
415
- s_churn = 5
416
- s_noise = 1.003
417
- a_prompt = 'Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, hyper detailed photo - ' \
418
- 'realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, skin pore ' \
419
- 'detailing, hyper sharpness, perfect without deformations.'
420
- n_prompt = 'painting, oil painting, illustration, drawing, art, sketch, anime, cartoon, CG Style, ' \
421
- '3D render, unreal engine, blurring, dirty, messy, worst quality, low quality, frames, watermark, ' \
422
- 'signature, jpeg artifacts, deformed, lowres, over-smooth'
423
- color_fix_type = 'Wavelet'
424
- spt_linear_s_stage2 = 0.0
425
- linear_s_stage2 = False
426
- linear_CFG = True
427
- if param_setting == "Quality":
428
- s_cfg = default_setting.s_cfg_Quality
429
- spt_linear_CFG = default_setting.spt_linear_CFG_Quality
430
- model_select = "v0-Q"
431
- elif param_setting == "Fidelity":
432
- s_cfg = default_setting.s_cfg_Fidelity
433
- spt_linear_CFG = default_setting.spt_linear_CFG_Fidelity
434
- model_select = "v0-F"
435
- else:
436
- raise NotImplementedError
437
- gr.Info('The parameters are reset.')
438
- print('<<== load_and_reset')
439
- return edm_steps, s_cfg, s_stage2, s_stage1, s_churn, s_noise, a_prompt, n_prompt, color_fix_type, linear_CFG, \
440
- linear_s_stage2, spt_linear_CFG, spt_linear_s_stage2, model_select
441
-
442
- def log_information(result_gallery):
443
- print('log_information')
444
- if result_gallery is not None:
445
- for i, result in enumerate(result_gallery):
446
- print(result[0])
447
-
448
- def on_select_result(result_slider, result_gallery, evt: gr.SelectData):
449
- print('on_select_result')
450
- if result_gallery is not None:
451
- for i, result in enumerate(result_gallery):
452
- print(result[0])
453
- return [result_slider[0], result_gallery[evt.index][0]]
454
-
455
- title_html = """
456
- <h1><center>SUPIR</center></h1>
457
- <big><center>Upscale your images up to x10 freely, without account, without watermark and download it</center></big>
458
- <center><big><big>🤸<big><big><big><big><big><big>🤸</big></big></big></big></big></big></big></big></center>
459
 
460
- <p>This is an online demo of SUPIR, a practicing model scaling for photo-realistic image restoration.
461
- The content added by SUPIR is <b><u>imagination, not real-world information</u></b>.
462
- SUPIR is for beauty and illustration only.
463
- Most of the processes last few minutes.
464
- If you want to upscale AI-generated images, be noticed that <i>PixArt Sigma</i> space can directly generate 5984x5984 images.
465
- Due to Gradio issues, the generated image is slightly less satured than the original.
466
- Please leave a <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR/discussions/new">message in discussion</a> if you encounter issues.
467
- You can also use <a href="https://huggingface.co/spaces/gokaygokay/AuraSR">AuraSR</a> to upscale x4.
468
 
469
- <p><center><a href="https://arxiv.org/abs/2401.13627">Paper</a> &emsp; <a href="http://supir.xpixel.group/">Project Page</a> &emsp; <a href="https://huggingface.co/blog/MonsterMMORPG/supir-sota-image-upscale-better-than-magnific-ai">Local Install Guide</a></center></p>
470
- <p><center><a style="display:inline-block" href='https://github.com/Fanghua-Yu/SUPIR'><img alt="GitHub Repo stars" src="https://img.shields.io/github/stars/Fanghua-Yu/SUPIR?style=social"></a></center></p>
471
- """
 
 
 
 
472
 
 
 
473
 
474
- claim_md = """
475
- ## **Piracy**
476
- The images are not stored but the logs are saved during a month.
477
- ## **How to get SUPIR**
478
- You can get SUPIR on HuggingFace by [duplicating this space](https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR?duplicate=true) and set GPU.
479
- You can also install SUPIR on your computer following [this tutorial](https://huggingface.co/blog/MonsterMMORPG/supir-sota-image-upscale-better-than-magnific-ai).
480
- You can install _Pinokio_ on your computer and then install _SUPIR_ into it. It should be quite easy if you have an Nvidia GPU.
481
- ## **Terms of use**
482
- By using this service, users are required to agree to the following terms: The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research. Please submit a feedback to us if you get any inappropriate answer! We will collect those to keep improving our models. For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
483
- ## **License**
484
- The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/Fanghua-Yu/SUPIR) of SUPIR.
485
- """
486
-
487
- # Gradio interface
488
- with gr.Blocks() as interface:
 
 
 
 
 
489
  if torch.cuda.device_count() == 0:
490
  with gr.Row():
491
  gr.HTML("""
492
- <p style="background-color: red;"><big><big><big><b>⚠️To use SUPIR, <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR?duplicate=true">duplicate this space</a> and set a GPU with 30 GB VRAM.</b>
493
 
494
- You can't use SUPIR directly here because this space runs on a CPU, which is not enough for SUPIR. Please provide <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR/discussions/new">feedback</a> if you have issues.
495
  </big></big></big></p>
496
  """)
497
- gr.HTML(title_html)
498
-
499
- input_image = gr.Image(label="Input (*.png, *.webp, *.jpeg, *.jpg, *.gif, *.bmp, *.heic)", show_label=True, type="filepath", height=600, elem_id="image-input")
500
- rotation = gr.Radio([["No rotation", 0], ["⤵ Rotate +90°", 90], ["↩ Return 180°", 180], ["⤴ Rotate -90°", -90]], label="Orientation correction", info="Will apply the following rotation before restoring the image; the AI needs a good orientation to understand the content", value=0, interactive=True, visible=False)
501
- with gr.Group():
502
- prompt = gr.Textbox(label="Image description", info="Help the AI understand what the image represents; describe as much as possible, especially the details we can't see on the original image; you can write in any language", value="", placeholder="A 33 years old man, walking, in the street, Santiago, morning, Summer, photorealistic", lines=3)
503
- prompt_hint = gr.HTML("You can use a <a href='"'https://huggingface.co/spaces/badayvedat/LLaVA'"'>LlaVa space</a> to auto-generate the description of your image.")
504
- upscale = gr.Radio([["x1", 1], ["x2", 2], ["x3", 3], ["x4", 4], ["x5", 5], ["x6", 6], ["x7", 7], ["x8", 8], ["x9", 9], ["x10", 10]], label="Upscale factor", info="Resolution x1 to x10", value=2, interactive=True)
505
- output_format = gr.Radio([["As input", "input"], ["*.png", "png"], ["*.webp", "webp"], ["*.jpeg", "jpeg"], ["*.gif", "gif"], ["*.bmp", "bmp"]], label="Image format for result", info="File extention", value="input", interactive=True)
506
- allocation = gr.Slider(label="GPU allocation time (in seconds)", info="lower=May abort run, higher=Quota penalty for next runs; only useful for ZeroGPU", value=179, minimum=59, maximum=320, step=1)
507
-
508
- with gr.Accordion("Pre-denoising (optional)", open=False):
509
- gamma_correction = gr.Slider(label="Gamma Correction", info = "lower=lighter, higher=darker", minimum=0.1, maximum=2.0, value=1.0, step=0.1)
510
- denoise_button = gr.Button(value="Pre-denoise")
511
- denoise_image = gr.Image(label="Denoised image", show_label=True, type="filepath", sources=[], interactive = False, height=600, elem_id="image-s1")
512
- denoise_information = gr.HTML(value="If present, the denoised image will be used for the restoration instead of the input image.", visible=False)
513
-
514
- with gr.Accordion("Advanced options", open=False):
515
- a_prompt = gr.Textbox(label="Additional image description",
516
- info="Completes the main image description",
517
- value='Cinematic, High Contrast, highly detailed, taken using a Canon EOS R '
518
- 'camera, hyper detailed photo - realistic maximum detail, 32k, Color '
519
- 'Grading, ultra HD, extreme meticulous detailing, skin pore detailing, clothing fabric detailing, '
520
- 'hyper sharpness, perfect without deformations.',
521
- lines=3)
522
- n_prompt = gr.Textbox(label="Negative image description",
523
- info="Disambiguate by listing what the image does NOT represent",
524
- value='painting, oil painting, illustration, drawing, art, sketch, anime, '
525
- 'cartoon, CG Style, 3D render, unreal engine, blurring, aliasing, pixel, unsharp, weird textures, ugly, dirty, messy, '
526
- 'worst quality, low quality, frames, watermark, signature, jpeg artifacts, '
527
- 'deformed, lowres, over-smooth',
528
- lines=3)
529
- edm_steps = gr.Slider(label="Steps", info="lower=faster, higher=more details; too many steps create a checker effect", minimum=1, maximum=200, value=default_setting.edm_steps if torch.cuda.device_count() > 0 else 1, step=1)
530
- num_samples = gr.Slider(label="Num Samples", info="Number of generated results", minimum=1, maximum=4 if not args.use_image_slider else 1
531
- , value=1, step=1)
532
- min_size = gr.Slider(label="Minimum size", info="Minimum height, minimum width of the result", minimum=32, maximum=4096, value=1024, step=32)
533
- downscale = gr.Radio([["/1", 1], ["/2", 2], ["/3", 3], ["/4", 4], ["/5", 5], ["/6", 6], ["/7", 7], ["/8", 8], ["/9", 9], ["/10", 10]], label="Pre-downscale factor", info="Reducing blurred image reduce the process time", value=1, interactive=True)
534
- with gr.Row():
535
- with gr.Column():
536
- model_select = gr.Radio([["💃 Quality (v0-Q)", "v0-Q"], ["🎯 Fidelity (v0-F)", "v0-F"]], label="Model Selection", info="Pretrained model", value="v0-Q",
537
- interactive=True)
538
- with gr.Column():
539
- color_fix_type = gr.Radio([["None", "None"], ["AdaIn (improve as a photo)", "AdaIn"], ["Wavelet (for JPEG artifacts)", "Wavelet"]], label="Color-Fix Type", info="AdaIn=Improve following a style, Wavelet=For JPEG artifacts", value="AdaIn",
540
- interactive=True)
541
- s_cfg = gr.Slider(label="Text Guidance Scale", info="lower=follow the image, higher=follow the prompt", minimum=1.0, maximum=15.0,
542
- value=default_setting.s_cfg_Quality if torch.cuda.device_count() > 0 else 1.0, step=0.1)
543
- s_stage2 = gr.Slider(label="Restoring Guidance Strength", minimum=0., maximum=1., value=1., step=0.05)
544
- s_stage1 = gr.Slider(label="Pre-denoising Guidance Strength", minimum=-1.0, maximum=6.0, value=-1.0, step=1.0)
545
- s_churn = gr.Slider(label="S-Churn", minimum=0, maximum=40, value=5, step=1)
546
- s_noise = gr.Slider(label="S-Noise", minimum=1.0, maximum=1.1, value=1.003, step=0.001)
547
- with gr.Row():
548
- with gr.Column():
549
- linear_CFG = gr.Checkbox(label="Linear CFG", value=True)
550
- spt_linear_CFG = gr.Slider(label="CFG Start", minimum=1.0,
551
- maximum=9.0, value=default_setting.spt_linear_CFG_Quality if torch.cuda.device_count() > 0 else 1.0, step=0.5)
552
- with gr.Column():
553
- linear_s_stage2 = gr.Checkbox(label="Linear Restoring Guidance", value=False)
554
- spt_linear_s_stage2 = gr.Slider(label="Guidance Start", minimum=0.,
555
- maximum=1., value=0., step=0.05)
556
- with gr.Column():
557
- diff_dtype = gr.Radio([["fp32 (precision)", "fp32"], ["fp16 (medium)", "fp16"], ["bf16 (speed)", "bf16"]], label="Diffusion Data Type", value="fp32",
558
- interactive=True)
559
- with gr.Column():
560
- ae_dtype = gr.Radio([["fp32 (precision)", "fp32"], ["bf16 (speed)", "bf16"]], label="Auto-Encoder Data Type", value="fp32",
561
- interactive=True)
562
- randomize_seed = gr.Checkbox(label = "\U0001F3B2 Randomize seed", value = True, info = "If checked, result is always different")
563
- seed = gr.Slider(label="Seed", minimum=0, maximum=max_64_bit_int, step=1, randomize=True)
564
- with gr.Group():
565
- param_setting = gr.Radio(["Quality", "Fidelity"], interactive=True, label="Presetting", value = "Quality")
566
- restart_button = gr.Button(value="Apply presetting")
567
-
568
- with gr.Column():
569
- diffusion_button = gr.Button(value="🚀 Upscale/Restore", variant = "primary", elem_id = "process_button")
570
- reset_btn = gr.Button(value="🧹 Reinit page", variant="stop", elem_id="reset_button", visible = False)
571
-
572
- warning = gr.HTML(value = "<center><big>Your computer must <u>not</u> enter into standby mode.</big><br/>On Chrome, you can force to keep a tab alive in <code>chrome://discards/</code></center>", visible = False)
573
- restore_information = gr.HTML(value = "Restart the process to get another result.", visible = False)
574
- result_slider = ImageSlider(label = 'Comparator', show_label = False, interactive = False, elem_id = "slider1", show_download_button = False)
575
- result_gallery = gr.Gallery(label = 'Downloadable results', show_label = True, interactive = False, elem_id = "gallery1")
576
-
577
- gr.Examples(
578
- examples = [
579
- [
580
- "./Examples/Example1.png",
581
- 0,
582
- None,
583
- "Group of people, walking, happy, in the street, photorealistic, 8k, extremely detailled",
584
- "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, hyper detailed photo - realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, skin pore detailing, hyper sharpness, perfect without deformations.",
585
- "painting, oil painting, illustration, drawing, art, sketch, anime, cartoon, CG Style, 3D render, unreal engine, blurring, aliasing, pixel, unsharp, weird textures, ugly, dirty, messy, worst quality, low quality, frames, watermark, signature, jpeg artifacts, deformed, lowres, over-smooth",
586
- 2,
587
- 1024,
588
- 1,
589
- 8,
590
- 100,
591
- -1,
592
- 1,
593
- 7.5,
594
- False,
595
- 42,
596
- 5,
597
- 1.003,
598
- "AdaIn",
599
- "fp16",
600
- "bf16",
601
- 1.0,
602
- True,
603
- 4,
604
- False,
605
- 0.,
606
- "v0-Q",
607
- "input",
608
- 179
609
- ],
610
- [
611
- "./Examples/Example2.jpeg",
612
- 0,
613
- None,
614
- "La cabeza de un gato atigrado, en una casa, fotorrealista, 8k, extremadamente detallada",
615
- "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, hyper detailed photo - realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, skin pore detailing, hyper sharpness, perfect without deformations.",
616
- "painting, oil painting, illustration, drawing, art, sketch, anime, cartoon, CG Style, 3D render, unreal engine, blurring, aliasing, pixel, unsharp, weird textures, ugly, dirty, messy, worst quality, low quality, frames, watermark, signature, jpeg artifacts, deformed, lowres, over-smooth",
617
- 1,
618
- 1024,
619
- 1,
620
- 1,
621
- 200,
622
- -1,
623
- 1,
624
- 7.5,
625
- False,
626
- 42,
627
- 5,
628
- 1.003,
629
- "Wavelet",
630
- "fp16",
631
- "bf16",
632
- 1.0,
633
- True,
634
- 4,
635
- False,
636
- 0.,
637
- "v0-Q",
638
- "input",
639
- 179
640
- ],
641
- [
642
- "./Examples/Example3.webp",
643
- 0,
644
- None,
645
- "A red apple",
646
- "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, hyper detailed photo - realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, skin pore detailing, hyper sharpness, perfect without deformations.",
647
- "painting, oil painting, illustration, drawing, art, sketch, anime, cartoon, CG Style, 3D render, unreal engine, blurring, aliasing, pixel, unsharp, weird textures, ugly, dirty, messy, worst quality, low quality, frames, watermark, signature, jpeg artifacts, deformed, lowres, over-smooth",
648
- 1,
649
- 1024,
650
- 1,
651
- 1,
652
- 200,
653
- -1,
654
- 1,
655
- 7.5,
656
- False,
657
- 42,
658
- 5,
659
- 1.003,
660
- "Wavelet",
661
- "fp16",
662
- "bf16",
663
- 1.0,
664
- True,
665
- 4,
666
- False,
667
- 0.,
668
- "v0-Q",
669
- "input",
670
- 179
671
- ],
672
- [
673
- "./Examples/Example3.webp",
674
- 0,
675
- None,
676
- "A red marble",
677
- "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, hyper detailed photo - realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, skin pore detailing, hyper sharpness, perfect without deformations.",
678
- "painting, oil painting, illustration, drawing, art, sketch, anime, cartoon, CG Style, 3D render, unreal engine, blurring, aliasing, pixel, unsharp, weird textures, ugly, dirty, messy, worst quality, low quality, frames, watermark, signature, jpeg artifacts, deformed, lowres, over-smooth",
679
- 1,
680
- 1024,
681
- 1,
682
- 1,
683
- 200,
684
- -1,
685
- 1,
686
- 7.5,
687
- False,
688
- 42,
689
- 5,
690
- 1.003,
691
- "Wavelet",
692
- "fp16",
693
- "bf16",
694
- 1.0,
695
- True,
696
- 4,
697
- False,
698
- 0.,
699
- "v0-Q",
700
- "input",
701
- 179
702
- ],
703
- ],
704
- run_on_click = True,
705
- fn = stage2_process,
706
- inputs = [
707
- input_image,
708
- rotation,
709
- denoise_image,
710
- prompt,
711
- a_prompt,
712
- n_prompt,
713
- num_samples,
714
- min_size,
715
- downscale,
716
- upscale,
717
- edm_steps,
718
- s_stage1,
719
- s_stage2,
720
- s_cfg,
721
- randomize_seed,
722
- seed,
723
- s_churn,
724
- s_noise,
725
- color_fix_type,
726
- diff_dtype,
727
- ae_dtype,
728
- gamma_correction,
729
- linear_CFG,
730
- linear_s_stage2,
731
- spt_linear_CFG,
732
- spt_linear_s_stage2,
733
- model_select,
734
- output_format,
735
- allocation
736
- ],
737
- outputs = [
738
- result_slider,
739
- result_gallery,
740
- restore_information,
741
- reset_btn
742
- ],
743
- cache_examples = False,
744
- )
745
 
746
  with gr.Row(visible=False):
747
  gr.Examples(
748
  examples = [
749
  [
750
- "./Examples/Example1.png",
751
- 0,
752
- None,
753
- "Group of people, walking, happy, in the street, photorealistic, 8k, extremely detailled",
754
- "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, hyper detailed photo - realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, skin pore detailing, hyper sharpness, perfect without deformations.",
755
- "painting, oil painting, illustration, drawing, art, sketch, anime, cartoon, CG Style, 3D render, unreal engine, blurring, aliasing, pixel, unsharp, weird textures, ugly, dirty, messy, worst quality, low quality, frames, watermark, signature, jpeg artifacts, deformed, lowres, over-smooth",
756
- 2,
757
- 1024,
758
- 1,
759
- 8,
760
- 100,
761
- -1,
762
- 1,
763
- 7.5,
764
- False,
765
- 42,
766
- 5,
767
- 1.003,
768
- "AdaIn",
769
- "fp16",
770
- "bf16",
771
- 1.0,
772
- True,
773
- 4,
774
- False,
775
- 0.,
776
- "v0-Q",
777
- "input",
778
- 179
779
  ],
780
  [
781
- "./Examples/Example2.jpeg",
782
- 0,
783
- None,
784
- "La cabeza de un gato atigrado, en una casa, fotorrealista, 8k, extremadamente detallada",
785
- "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, hyper detailed photo - realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, skin pore detailing, hyper sharpness, perfect without deformations.",
786
- "painting, oil painting, illustration, drawing, art, sketch, anime, cartoon, CG Style, 3D render, unreal engine, blurring, aliasing, pixel, unsharp, weird textures, ugly, dirty, messy, worst quality, low quality, frames, watermark, signature, jpeg artifacts, deformed, lowres, over-smooth",
787
- 1,
788
- 1024,
789
- 1,
790
- 1,
791
- 200,
792
- -1,
793
- 1,
794
- 7.5,
795
- False,
796
- 42,
797
- 5,
798
- 1.003,
799
- "Wavelet",
800
- "fp16",
801
- "bf16",
802
- 1.0,
803
- True,
804
- 4,
805
- False,
806
- 0.,
807
- "v0-Q",
808
- "input",
809
- 179
810
  ],
811
  [
812
- "./Examples/Example3.webp",
813
- 0,
814
- None,
815
- "A red apple",
816
- "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, hyper detailed photo - realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, skin pore detailing, hyper sharpness, perfect without deformations.",
817
- "painting, oil painting, illustration, drawing, art, sketch, anime, cartoon, CG Style, 3D render, unreal engine, blurring, aliasing, pixel, unsharp, weird textures, ugly, dirty, messy, worst quality, low quality, frames, watermark, signature, jpeg artifacts, deformed, lowres, over-smooth",
818
- 1,
819
- 1024,
820
- 1,
821
- 1,
822
- 200,
823
- -1,
824
- 1,
825
- 7.5,
826
- False,
827
- 42,
828
- 5,
829
- 1.003,
830
- "Wavelet",
831
- "fp16",
832
- "bf16",
833
- 1.0,
834
- True,
835
- 4,
836
- False,
837
- 0.,
838
- "v0-Q",
839
- "input",
840
- 179
841
  ],
842
  [
843
- "./Examples/Example3.webp",
844
- 0,
845
- None,
846
- "A red marble",
847
- "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, hyper detailed photo - realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, skin pore detailing, hyper sharpness, perfect without deformations.",
848
- "painting, oil painting, illustration, drawing, art, sketch, anime, cartoon, CG Style, 3D render, unreal engine, blurring, aliasing, pixel, unsharp, weird textures, ugly, dirty, messy, worst quality, low quality, frames, watermark, signature, jpeg artifacts, deformed, lowres, over-smooth",
849
- 1,
850
- 1024,
851
- 1,
852
- 1,
853
- 200,
854
- -1,
855
- 1,
856
- 7.5,
857
- False,
858
- 42,
859
- 5,
860
- 1.003,
861
- "Wavelet",
862
- "fp16",
863
- "bf16",
864
- 1.0,
865
- True,
866
- 4,
867
- False,
868
- 0.,
869
- "v0-Q",
870
- "input",
871
- 179
872
  ],
873
  ],
874
  run_on_click = True,
875
- fn = stage2_process_example,
876
- inputs = [
877
- input_image,
878
- rotation,
879
- denoise_image,
880
- prompt,
881
- a_prompt,
882
- n_prompt,
883
- num_samples,
884
- min_size,
885
- downscale,
886
- upscale,
887
- edm_steps,
888
- s_stage1,
889
- s_stage2,
890
- s_cfg,
891
- randomize_seed,
892
- seed,
893
- s_churn,
894
- s_noise,
895
- color_fix_type,
896
- diff_dtype,
897
- ae_dtype,
898
- gamma_correction,
899
- linear_CFG,
900
- linear_s_stage2,
901
- spt_linear_CFG,
902
- spt_linear_s_stage2,
903
- model_select,
904
- output_format,
905
- allocation
906
- ],
907
- outputs = [
908
- result_slider,
909
- restore_information,
910
- reset_btn
911
- ],
912
- cache_examples = "lazy",
913
  )
914
 
915
- with gr.Row():
916
- gr.Markdown(claim_md)
 
917
 
918
- input_image.upload(fn = check_upload, inputs = [
919
- input_image
920
- ], outputs = [
921
- rotation
922
- ], queue = False, show_progress = False)
923
-
924
- denoise_button.click(fn = check_and_update, inputs = [
925
- input_image
926
- ], outputs = [warning], queue = False, show_progress = False).success(fn = stage1_process, inputs = [
927
- input_image,
928
- gamma_correction,
929
- diff_dtype,
930
- ae_dtype
931
- ], outputs=[
932
- denoise_image,
933
- denoise_information
934
- ])
935
-
936
- diffusion_button.click(fn = update_seed, inputs = [
937
- randomize_seed,
938
- seed
939
- ], outputs = [
940
- seed
941
- ], queue = False, show_progress = False).then(fn = check_and_update, inputs = [
942
- input_image
943
- ], outputs = [warning], queue = False, show_progress = False).success(fn=stage2_process, inputs = [
944
- input_image,
945
- rotation,
946
- denoise_image,
947
- prompt,
948
- a_prompt,
949
- n_prompt,
950
- num_samples,
951
- min_size,
952
- downscale,
953
- upscale,
954
- edm_steps,
955
- s_stage1,
956
- s_stage2,
957
- s_cfg,
958
- randomize_seed,
959
- seed,
960
- s_churn,
961
- s_noise,
962
- color_fix_type,
963
- diff_dtype,
964
- ae_dtype,
965
- gamma_correction,
966
- linear_CFG,
967
- linear_s_stage2,
968
- spt_linear_CFG,
969
- spt_linear_s_stage2,
970
- model_select,
971
- output_format,
972
- allocation
973
- ], outputs = [
974
- result_slider,
975
- result_gallery,
976
- restore_information,
977
- reset_btn
978
- ]).success(fn = log_information, inputs = [
979
- result_gallery
980
- ], outputs = [], queue = False, show_progress = False)
981
-
982
- result_gallery.change(on_select_result, [result_slider, result_gallery], result_slider)
983
- result_gallery.select(on_select_result, [result_slider, result_gallery], result_slider)
984
-
985
- restart_button.click(fn = load_and_reset, inputs = [
986
- param_setting
987
- ], outputs = [
988
- edm_steps,
989
- s_cfg,
990
- s_stage2,
991
- s_stage1,
992
- s_churn,
993
- s_noise,
994
- a_prompt,
995
- n_prompt,
996
- color_fix_type,
997
- linear_CFG,
998
- linear_s_stage2,
999
- spt_linear_CFG,
1000
- spt_linear_s_stage2,
1001
- model_select
1002
- ])
1003
-
1004
- reset_btn.click(fn = reset, inputs = [], outputs = [
1005
- input_image,
1006
- rotation,
1007
- denoise_image,
1008
- prompt,
1009
- a_prompt,
1010
- n_prompt,
1011
- num_samples,
1012
- min_size,
1013
- downscale,
1014
- upscale,
1015
- edm_steps,
1016
- s_stage1,
1017
- s_stage2,
1018
- s_cfg,
1019
- randomize_seed,
1020
- seed,
1021
- s_churn,
1022
- s_noise,
1023
- color_fix_type,
1024
- diff_dtype,
1025
- ae_dtype,
1026
- gamma_correction,
1027
- linear_CFG,
1028
- linear_s_stage2,
1029
- spt_linear_CFG,
1030
- spt_linear_s_stage2,
1031
- model_select,
1032
- output_format,
1033
- allocation
1034
- ], queue = False, show_progress = False)
1035
-
1036
- interface.queue(10).launch()
 
1
+ from diffusers_helper.hf_login import login
2
+
3
  import os
4
+
5
+ os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
6
+
7
+ import spaces
8
  import gradio as gr
 
 
9
  import torch
10
+ import traceback
11
  import einops
12
+ import safetensors.torch as sf
13
+ import numpy as np
14
+ import argparse
15
  import random
16
+ import math
17
+ # 20250506 pftq: Added for video input loading
18
+ import decord
19
+ # 20250506 pftq: Added for progress bars in video_encode
20
+ from tqdm import tqdm
21
+ # 20250506 pftq: Normalize file paths for Windows compatibility
22
+ import pathlib
23
+ # 20250506 pftq: for easier to read timestamp
24
+ from datetime import datetime
25
+ # 20250508 pftq: for saving prompt to mp4 comments metadata
26
+ import imageio_ffmpeg
27
+ import tempfile
28
+ import shutil
29
+ import subprocess
30
 
 
31
  from PIL import Image
32
+ from diffusers import AutoencoderKLHunyuanVideo
33
+ from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
34
+ from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
35
+ from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
36
+ from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
37
+ from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
38
+ from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
39
+ from diffusers_helper.thread_utils import AsyncStream, async_run
40
+ from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
41
+ from transformers import SiglipImageProcessor, SiglipVisionModel
42
+ from diffusers_helper.clip_vision import hf_clip_vision_encode
43
+ from diffusers_helper.bucket_tools import find_nearest_bucket
44
+ from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
45
+
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
if torch.cuda.device_count() > 0:
    free_mem_gb = get_cuda_free_memory_gb(gpu)
    # Heuristic: with more than 80 GB of free VRAM every model stays resident on
    # the GPU; otherwise models are swapped in/out on demand per pipeline stage.
    high_vram = free_mem_gb > 80

    print(f'Free VRAM {free_mem_gb} GB')
    print(f'High-VRAM Mode: {high_vram}')

    # HunyuanVideo text encoders / tokenizers / VAE, loaded to CPU in fp16 first.
    text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
    text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
    tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
    tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
    vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()

    # SigLIP image encoder used for CLIP-vision conditioning of the transformer.
    feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
    image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()

    # FramePack F1 packed video transformer (bf16).
    transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu()

    # Inference only: eval mode everywhere.
    vae.eval()
    text_encoder.eval()
    text_encoder_2.eval()
    image_encoder.eval()
    transformer.eval()

    if not high_vram:
        # Reduce VAE peak memory on low-VRAM machines.
        vae.enable_slicing()
        vae.enable_tiling()

    transformer.high_quality_fp32_output_for_inference = True
    print('transformer.high_quality_fp32_output_for_inference = True')

    transformer.to(dtype=torch.bfloat16)
    vae.to(dtype=torch.float16)
    image_encoder.to(dtype=torch.float16)
    text_encoder.to(dtype=torch.float16)
    text_encoder_2.to(dtype=torch.float16)

    # Freeze all weights — nothing here is trained.
    vae.requires_grad_(False)
    text_encoder.requires_grad_(False)
    text_encoder_2.requires_grad_(False)
    image_encoder.requires_grad_(False)
    transformer.requires_grad_(False)

    if not high_vram:
        # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
        DynamicSwapInstaller.install_model(transformer, device=gpu)
        DynamicSwapInstaller.install_model(text_encoder, device=gpu)
    else:
        text_encoder.to(gpu)
        text_encoder_2.to(gpu)
        image_encoder.to(gpu)
        vae.to(gpu)
        transformer.to(gpu)

# NOTE(review): indentation was lost in this source dump; the statements below are
# assumed to be module-level (the UI needs them even without a GPU) — confirm
# against the original file.
stream = AsyncStream()

outputs_folder = './outputs/'
os.makedirs(outputs_folder, exist_ok=True)

# Debug overrides set by the UI wiring; None unless a debug run assigns them.
input_image_debug_value = prompt_debug_value = total_second_length_debug_value = None
107
+
108
@spaces.GPU()
@torch.no_grad()
def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, device="cuda", width=None, height=None):
    """
    Encode a video into latent representations using the VAE.

    Args:
        video_path: Path to the input video file.
        resolution: Bucket resolution used when resizing is enabled.
        no_resize: If True, keep the native resolution instead of snapping to a bucket.
        vae: AutoencoderKLHunyuanVideo model.
        vae_batch_size: Number of frames to process per VAE batch.
        device: Device for computation (e.g., "cuda"); falls back to CPU when CUDA is unavailable.
        width, height: Optional target resolution; defaults to the native video size.

    Returns:
        start_latent: Latent of the first frame (for compatibility with original code).
        input_image_np: First frame as numpy array (for CLIP vision encoding).
        history_latents: Latents of all frames (shape: [1, channels, frames, height//8, width//8]).
        fps: Frames per second of the input video.
        target_height, target_width: Resolution actually used for encoding.
        input_video_pixels: All preprocessed frames as a CPU tensor in [-1, 1].

    Raises:
        ValueError: If the video holds fewer than 4 usable frames.
    """
    # Normalize video path for Windows compatibility.
    video_path = str(pathlib.Path(video_path).resolve())
    print(f"Processing video: {video_path}")

    # Check CUDA availability and fall back to CPU if needed.
    if device == "cuda" and not torch.cuda.is_available():
        print("CUDA is not available, falling back to CPU")
        device = "cpu"

    try:
        # Load video and get FPS.
        print("Initializing VideoReader...")
        vr = decord.VideoReader(video_path)
        fps = vr.get_avg_fps()  # Get input video FPS
        num_real_frames = len(vr)
        print(f"Video loaded: {num_real_frames} frames, FPS: {fps}")

        # Truncate to nearest latent size (multiple of 4).
        latent_size_factor = 4
        num_frames = (num_real_frames // latent_size_factor) * latent_size_factor
        if num_frames == 0:
            # Guard: with zero usable frames the torch.cat over batch latents
            # below would raise a cryptic RuntimeError instead of this message.
            raise ValueError(f"Video {video_path} has fewer than {latent_size_factor} frames and cannot be encoded")
        if num_frames != num_real_frames:
            print(f"Truncating video from {num_real_frames} to {num_frames} frames for latent size compatibility")
            num_real_frames = num_frames

        # Read frames.
        print("Reading video frames...")
        frames = vr.get_batch(range(num_real_frames)).asnumpy()  # Shape: (num_real_frames, height, width, channels)
        print(f"Frames read: {frames.shape}")

        # Get native video resolution.
        native_height, native_width = frames.shape[1], frames.shape[2]
        print(f"Native video resolution: {native_width}x{native_height}")

        # Use native resolution if height/width not specified, otherwise use provided values.
        target_height = native_height if height is None else height
        target_width = native_width if width is None else width

        # Adjust to nearest bucket for model compatibility.
        if not no_resize:
            target_height, target_width = find_nearest_bucket(target_height, target_width, resolution=resolution)
            print(f"Adjusted resolution: {target_width}x{target_height}")
        else:
            print(f"Using native resolution without resizing: {target_width}x{target_height}")

        # Preprocess frames to match original image processing.
        processed_frames = []
        for frame in frames:
            frame_np = resize_and_center_crop(frame, target_width=target_width, target_height=target_height)
            processed_frames.append(frame_np)
        processed_frames = np.stack(processed_frames)  # Shape: (num_real_frames, height, width, channels)
        print(f"Frames preprocessed: {processed_frames.shape}")

        # Save first frame for CLIP vision encoding.
        input_image_np = processed_frames[0]

        # Convert to tensor and normalize to [-1, 1].
        print("Converting frames to tensor...")
        frames_pt = torch.from_numpy(processed_frames).float() / 127.5 - 1
        frames_pt = frames_pt.permute(0, 3, 1, 2)  # Shape: (num_real_frames, channels, height, width)
        frames_pt = frames_pt.unsqueeze(0)  # Shape: (1, num_real_frames, channels, height, width)
        frames_pt = frames_pt.permute(0, 2, 1, 3, 4)  # Shape: (1, channels, num_real_frames, height, width)
        print(f"Tensor shape: {frames_pt.shape}")

        # Save pixel frames for use in worker.
        input_video_pixels = frames_pt.cpu()

        # Move tensor and VAE to the target device.
        print(f"Moving tensor to device: {device}")
        frames_pt = frames_pt.to(device)
        print("Tensor moved to device")

        print(f"Moving VAE to device: {device}")
        vae.to(device)
        print("VAE moved to device")

        # Encode frames in batches to bound peak memory.
        print(f"Encoding input video frames in VAE batch size {vae_batch_size} (reduce if memory issues here or if forcing video resolution)")
        latents = []
        vae.eval()
        with torch.no_grad():
            for i in tqdm(range(0, frames_pt.shape[2], vae_batch_size), desc="Encoding video frames", mininterval=0.1):
                batch = frames_pt[:, :, i:i + vae_batch_size]  # Shape: (1, channels, batch_size, height, width)
                try:
                    batch_latent = vae_encode(batch, vae)
                    # Synchronize CUDA so out-of-memory errors surface here, on the failing batch.
                    if device == "cuda":
                        torch.cuda.synchronize()
                    latents.append(batch_latent)
                except RuntimeError as e:
                    print(f"Error during VAE encoding: {str(e)}")
                    if device == "cuda" and "out of memory" in str(e).lower():
                        print("CUDA out of memory, try reducing vae_batch_size or using CPU")
                    raise

        # Concatenate latents along the frame axis.
        print("Concatenating latents...")
        history_latents = torch.cat(latents, dim=2)  # Shape: (1, channels, frames, height//8, width//8)
        print(f"History latents shape: {history_latents.shape}")

        # First frame's latent, kept separately for conditioning.
        start_latent = history_latents[:, :, :1]  # Shape: (1, channels, 1, height//8, width//8)
        print(f"Start latent shape: {start_latent.shape}")

        # Move VAE back to CPU to free GPU memory.
        if device == "cuda":
            vae.to(cpu)
            torch.cuda.empty_cache()
            print("VAE moved back to CPU, CUDA cache cleared")

        return start_latent, input_image_np, history_latents, fps, target_height, target_width, input_video_pixels

    except Exception as e:
        print(f"Error in video_encode: {str(e)}")
        raise
250
+
251
# 20250508 pftq: for saving prompt to mp4 metadata comments
def set_mp4_comments_imageio_ffmpeg(input_file, comments):
    """
    Write *comments* into the MP4 `comment` metadata tag of *input_file* in place.

    Uses the FFmpeg binary bundled with imageio-ffmpeg and stream-copies both
    video and audio (no re-encoding). Best-effort: returns True on success,
    False on any failure (missing file, missing ffmpeg, FFmpeg error).
    """
    temp_file = None
    try:
        # Get the path to the bundled FFmpeg binary from imageio-ffmpeg.
        ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()

        # Check if input file exists.
        if not os.path.exists(input_file):
            print(f"Error: Input file {input_file} does not exist")
            return False

        # Create a temporary output path. Close the handle immediately: the
        # original left it open, which leaks the descriptor and can prevent
        # FFmpeg from overwriting the file on Windows.
        tmp = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
        temp_file = tmp.name
        tmp.close()

        # FFmpeg command using the bundled binary.
        command = [
            ffmpeg_path,                          # use imageio-ffmpeg's FFmpeg
            '-i', input_file,                     # input file
            '-metadata', f'comment={comments}',   # set comment metadata
            '-c:v', 'copy',                       # copy video stream without re-encoding
            '-c:a', 'copy',                       # copy audio stream without re-encoding
            '-y',                                 # overwrite output file if it exists
            temp_file,                            # temporary output file
        ]

        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

        if result.returncode == 0:
            # Replace the original file with the modified one.
            shutil.move(temp_file, input_file)
            print(f"Successfully added comments to {input_file}")
            return True
        # Clean up temp file if FFmpeg fails.
        if os.path.exists(temp_file):
            os.remove(temp_file)
        print(f"Error: FFmpeg failed with message:\n{result.stderr}")
        return False

    except Exception as e:
        # Clean up temp file in case of other errors.
        if temp_file and os.path.exists(temp_file):
            os.remove(temp_file)
        print(f"Error saving prompt to video metadata, ffmpeg may be required: "+str(e))
        return False
297
+
298
@torch.no_grad()
def worker(input_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
    """
    Background generation job: turns a start image plus a list of prompts into
    an MP4, extending the video section by section and streaming preview and
    progress updates through the module-level `stream` queues.

    Args:
        input_image: HxWxC numpy array used as the first frame.
        prompts: List of prompt strings; one entry is consumed per section, and
            the last encoded prompt keeps being reused once the list runs out.
        n_prompt: Negative prompt (only encoded when cfg != 1).
        seed: Seed for the CPU torch.Generator driving sampling.
        total_second_length: Requested video length in seconds (at 30 FPS).
        latent_window_size: Latent frames generated per section.
        steps: Sampling steps per section.
        cfg, gs, rs: Real CFG scale, distilled guidance scale, guidance rescale.
        gpu_memory_preservation: GB of VRAM to keep free when moving the transformer.
        use_teacache: Enables TeaCache acceleration in the transformer.
        mp4_crf: CRF quality passed to the MP4 writer.

    Communicates via stream.output_queue: ('progress', ...) updates, a
    ('file', path) after every completed section, and a final ('end', None).
    Returns None.
    """
    def encode_prompt(prompt, n_prompt):
        # Encode one positive/negative prompt pair with both text encoders.
        llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

        if cfg == 1:
            # CFG disabled: negative embeddings are never used, so zeros suffice.
            llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
        else:
            llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

        llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
        llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)

        llama_vec = llama_vec.to(transformer.dtype)
        llama_vec_n = llama_vec_n.to(transformer.dtype)
        clip_l_pooler = clip_l_pooler.to(transformer.dtype)
        clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
        return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]

    # Each section yields latent_window_size * 4 - 3 pixel frames at 30 FPS.
    total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
    total_latent_sections = int(max(round(total_latent_sections), 1))

    job_id = generate_timestamp()

    stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))

    try:
        # Clean GPU
        if not high_vram:
            unload_complete_models(
                text_encoder, text_encoder_2, image_encoder, vae, transformer
            )

        # Text encoding

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))

        if not high_vram:
            fake_diffusers_current_device(text_encoder, gpu)  # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
            load_model_as_complete(text_encoder_2, target_device=gpu)

        # Pre-encode every prompt; one parameter set is popped per section below.
        prompt_parameters = []

        for prompt_part in prompts:
            prompt_parameters.append(encode_prompt(prompt_part, n_prompt))

        # Processing input image

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))

        H, W, C = input_image.shape
        height, width = find_nearest_bucket(H, W, resolution=640)
        input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)

        Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))

        # Normalize to [-1, 1] and reshape to (1, C, 1, H, W) for the VAE.
        input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
        input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]

        # VAE encoding

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))

        if not high_vram:
            load_model_as_complete(vae, target_device=gpu)

        start_latent = vae_encode(input_image_pt, vae)

        # CLIP Vision

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))

        if not high_vram:
            load_model_as_complete(image_encoder, target_device=gpu)

        image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
        image_encoder_last_hidden_state = image_encoder_output.last_hidden_state

        # Dtype

        image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)

        # Sampling

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))

        rnd = torch.Generator("cpu").manual_seed(seed)

        # Zero-filled 16 + 2 + 1 context latents (4x / 2x / 1x history) seed
        # the first section; the start-image latent is appended as frame one.
        history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
        history_pixels = None

        history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
        total_generated_latent_frames = 1

        for section_index in range(total_latent_sections):
            # Cooperative cancellation: the UI pushes 'end' into input_queue.
            if stream.input_queue.top() == 'end':
                stream.output_queue.push(('end', None))
                return

            print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')

            # Advance to this section's prompt; keep the last one when exhausted.
            if len(prompt_parameters) > 0:
                [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(0)

            if not high_vram:
                unload_complete_models()
                move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)

            if use_teacache:
                transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
            else:
                transformer.initialize_teacache(enable_teacache=False)

            def callback(d):
                # Per-step sampler callback: pushes a decoded preview plus
                # progress HTML, and aborts the task on a user 'end' request.
                preview = d['denoised']
                preview = vae_decode_fake(preview)

                preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
                preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')

                if stream.input_queue.top() == 'end':
                    stream.output_queue.push(('end', None))
                    raise KeyboardInterrupt('User ends the task.')

                current_step = d['i'] + 1
                percentage = int(100.0 * current_step / steps)
                hint = f'Sampling {current_step}/{steps}'
                desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...'
                stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
                return

            # Index layout: [start frame | 16 x 4x-history | 2 x 2x-history | 1 x 1x-history | new window].
            indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
            clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
            clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)

            # Take the most recent 16 + 2 + 1 latents from history as conditioning.
            clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2)
            clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)

            generated_latents = sample_hunyuan(
                transformer=transformer,
                sampler='unipc',
                width=width,
                height=height,
                frames=latent_window_size * 4 - 3,
                real_guidance_scale=cfg,
                distilled_guidance_scale=gs,
                guidance_rescale=rs,
                # shift=3.0,
                num_inference_steps=steps,
                generator=rnd,
                prompt_embeds=llama_vec,
                prompt_embeds_mask=llama_attention_mask,
                prompt_poolers=clip_l_pooler,
                negative_prompt_embeds=llama_vec_n,
                negative_prompt_embeds_mask=llama_attention_mask_n,
                negative_prompt_poolers=clip_l_pooler_n,
                device=gpu,
                dtype=torch.bfloat16,
                image_embeddings=image_encoder_last_hidden_state,
                latent_indices=latent_indices,
                clean_latents=clean_latents,
                clean_latent_indices=clean_latent_indices,
                clean_latents_2x=clean_latents_2x,
                clean_latent_2x_indices=clean_latent_2x_indices,
                clean_latents_4x=clean_latents_4x,
                clean_latent_4x_indices=clean_latent_4x_indices,
                callback=callback,
            )

            total_generated_latent_frames += int(generated_latents.shape[2])
            history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)

            if not high_vram:
                offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
                load_model_as_complete(vae, target_device=gpu)

            real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]

            if history_pixels is None:
                history_pixels = vae_decode(real_history_latents, vae).cpu()
            else:
                # Decode only the tail section and soft-blend it onto the
                # already-decoded pixels across the overlapping frames.
                section_latent_frames = latent_window_size * 2
                overlapped_frames = latent_window_size * 4 - 3

                current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
                history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)

            if not high_vram:
                unload_complete_models()

            output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')

            save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)

            print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')

            # Every completed section is published so the UI can show partial results.
            stream.output_queue.push(('file', output_filename))
    except:
        # NOTE(review): bare except (also catches KeyboardInterrupt raised by the
        # cancel path in callback); the job always ends with an ('end', None) push.
        traceback.print_exc()

        if not high_vram:
            unload_complete_models(
                text_encoder, text_encoder_2, image_encoder, vae, transformer
            )

    stream.output_queue.push(('end', None))
    return
505
+
506
+ def get_duration(input_image, prompt, t2v, n_prompt, randomize_seed, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
507
+ global total_second_length_debug_value
508
+
509
+ if total_second_length_debug_value is not None:
510
+ return min(total_second_length_debug_value * 60, 600)
511
+ return total_second_length * 60
512
+
513
+
514
+ @spaces.GPU(duration=get_duration)
515
+ def process(input_image, prompt,
516
+ t2v=False,
517
+ n_prompt="",
518
+ randomize_seed=True,
519
+ seed=31337,
520
+ total_second_length=5,
521
+ latent_window_size=9,
522
+ steps=25,
523
+ cfg=1.0,
524
+ gs=10.0,
525
+ rs=0.0,
526
+ gpu_memory_preservation=6,
527
+ use_teacache=True,
528
+ mp4_crf=16
529
+ ):
530
+ global stream, input_image_debug_value, prompt_debug_value, total_second_length_debug_value
531
 
 
 
532
  if torch.cuda.device_count() == 0:
533
  gr.Warning('Set this space to GPU config to make it work.')
534
+ return None, None, None, None, None, None
535
+
536
+ if input_image_debug_value is not None or prompt_debug_value is not None or total_second_length_debug_value is not None:
537
+ print("Debug mode")
538
+ input_image = input_image_debug_value
539
+ prompt = prompt_debug_value
540
+ total_second_length = total_second_length_debug_value
541
+ input_image_debug_value = prompt_debug_value = total_second_length_debug_value = None
542
+
543
+ if randomize_seed:
544
+ seed = random.randint(0, np.iinfo(np.int32).max)
545
+
546
+ prompts = prompt.split(";")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
547
 
548
+ # assert input_image is not None, 'No input image!'
549
+ if t2v:
550
+ default_height, default_width = 640, 640
551
+ input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
552
+ print("No input image provided. Using a blank white image.")
 
 
 
553
 
554
+ yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
555
+
556
+ stream = AsyncStream()
557
+
558
+ async_run(worker, input_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf)
559
+
560
+ output_filename = None
561
 
562
+ while True:
563
+ flag, data = stream.output_queue.next()
564
 
565
+ if flag == 'file':
566
+ output_filename = data
567
+ yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
568
+
569
+ if flag == 'progress':
570
+ preview, desc, html = data
571
+ yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
572
+
573
+ if flag == 'end':
574
+ yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
575
+ break
576
+
577
+
578
+ def end_process():
579
+ stream.input_queue.push('end')
580
+
581
+
582
+ css = make_progress_bar_css()
583
+ block = gr.Blocks(css=css).queue()
584
+ with block:
585
  if torch.cuda.device_count() == 0:
586
  with gr.Row():
587
  gr.HTML("""
588
+ <p style="background-color: red;"><big><big><big><b>⚠️To use FramePack, <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR?duplicate=true">duplicate this space</a> and set a GPU with 30 GB VRAM.</b>
589
 
590
+ You can't use FramePack directly here because this space runs on a CPU, which is not enough for FramePack. Please provide <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR/discussions/new">feedback</a> if you have issues.
591
  </big></big></big></p>
592
  """)
593
+ # 20250506 pftq: Updated title to reflect video input functionality
594
+ gr.Markdown('# Framepack F1 with Image Input or with Video Input (Video Extension)')
595
+ gr.Markdown(f"""### Video diffusion, but feels like image diffusion
596
+ *FramePack F1 - a FramePack model that only predicts future frames from history frames*
597
+ ### *beta* FramePack Fill 🖋️- draw a mask over the input image to inpaint the video output
598
+ adapted from the officical code repo [FramePack](https://github.com/lllyasviel/FramePack) by [lllyasviel](lllyasviel/FramePack_F1_I2V_HY_20250503) and [FramePack Studio](https://github.com/colinurbs/FramePack-Studio) 🙌🏻
599
+ """)
600
+ with gr.Row():
601
+ with gr.Column():
602
+ input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
603
+ prompt = gr.Textbox(label="Prompt", value='')
604
+ t2v = gr.Checkbox(label="do text-to-video", value=False)
605
+
606
+ with gr.Row():
607
+ start_button = gr.Button(value="Start Generation", variant="primary")
608
+ end_button = gr.Button(value="End Generation", variant="stop", interactive=False)
609
+
610
+ total_second_length = gr.Slider(label="Generated Video Length (Seconds)", minimum=1, maximum=5, value=2, step=0.1)
611
+ with gr.Accordion("Advanced settings", open=False):
612
+ use_teacache = gr.Checkbox(label='Use TeaCache', value=True, info='Faster speed, but often makes hands and fingers slightly worse.')
613
+
614
+ n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, blurred, blurry") # Not used
615
+ randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
616
+ seed = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, randomize=True)
617
+
618
+
619
+ latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1) # Should not change
620
+ steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Changing this value is not recommended.')
621
+
622
+ cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01) # Should not change
623
+ gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Changing this value is not recommended; 3=blurred motions& & unsharped; 10 focus motion')
624
+ rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01) # Should not change
625
+
626
+ gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
627
+
628
+ mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")
629
+
630
+ with gr.Accordion("Debug", open=False):
631
+ input_image_debug = gr.Image(type="numpy", label="Image Debug", height=320)
632
+ prompt_debug = gr.Textbox(label="Prompt Debug", value='')
633
+ total_second_length_debug = gr.Slider(label="Additional Video Length to Generate (Seconds) Debug", minimum=1, maximum=120, value=5, step=0.1)
634
+
635
+ with gr.Column():
636
+ preview_image = gr.Image(label="Next Latents", height=200, visible=False)
637
+ result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
638
+ progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
639
+ progress_bar = gr.HTML('', elem_classes='no-generating-animation')
640
+
641
+ gr.HTML('<div style="text-align:center; margin-top:20px;">Share your results and find ideas at the <a href="https://x.com/search?q=framepack&f=live" target="_blank">FramePack Twitter (X) thread</a></div>')
642
+
643
+ ips = [input_image, prompt, t2v, n_prompt, randomize_seed, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf]
644
+ start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
645
+ end_button.click(fn=end_process)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
646
 
647
  with gr.Row(visible=False):
648
  gr.Examples(
649
  examples = [
650
  [
651
+ "./img_examples/Example1.png", # input_image
652
+ "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
653
+ False, # t2v
654
+ "Missing arm, unrealistic position, blurred, blurry", # n_prompt
655
+ True, # randomize_seed
656
+ 42, # seed
657
+ 1, # total_second_length
658
+ 9, # latent_window_size
659
+ 25, # steps
660
+ 1.0, # cfg
661
+ 10.0, # gs
662
+ 0.0, # rs
663
+ 6, # gpu_memory_preservation
664
+ True, # use_teacache
665
+ 16 # mp4_crf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
666
  ],
667
  [
668
+ "./img_examples/Example1.png", # input_image
669
+ "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
670
+ False, # t2v
671
+ "Missing arm, unrealistic position, blurred, blurry", # n_prompt
672
+ True, # randomize_seed
673
+ 42, # seed
674
+ 1, # total_second_length
675
+ 9, # latent_window_size
676
+ 25, # steps
677
+ 1.0, # cfg
678
+ 10.0, # gs
679
+ 0.0, # rs
680
+ 6, # gpu_memory_preservation
681
+ True, # use_teacache
682
+ 16 # mp4_crf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
683
  ],
684
  [
685
+ "./img_examples/Example1.png", # input_image
686
+ "We are sinking, photorealistic, realistic, intricate details, 8k, insanely detailed",
687
+ False, # t2v
688
+ "Missing arm, unrealistic position, blurred, blurry", # n_prompt
689
+ True, # randomize_seed
690
+ 42, # seed
691
+ 1, # total_second_length
692
+ 9, # latent_window_size
693
+ 25, # steps
694
+ 1.0, # cfg
695
+ 10.0, # gs
696
+ 0.0, # rs
697
+ 6, # gpu_memory_preservation
698
+ True, # use_teacache
699
+ 16 # mp4_crf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
700
  ],
701
  [
702
+ "./img_examples/Example1.png", # input_image
703
+ "A boat is passing, photorealistic, realistic, intricate details, 8k, insanely detailed",
704
+ False, # t2v
705
+ "Missing arm, unrealistic position, blurred, blurry", # n_prompt
706
+ True, # randomize_seed
707
+ 42, # seed
708
+ 1, # total_second_length
709
+ 9, # latent_window_size
710
+ 25, # steps
711
+ 1.0, # cfg
712
+ 10.0, # gs
713
+ 0.0, # rs
714
+ 6, # gpu_memory_preservation
715
+ True, # use_teacache
716
+ 16 # mp4_crf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
717
  ],
718
  ],
719
  run_on_click = True,
720
+ fn = process,
721
+ inputs = ips,
722
+ outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
723
+ cache_examples = True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
724
  )
725
 
726
+ gr.Markdown('## Guide')
727
+ gr.Markdown("I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
728
+
729
 
730
+ def handle_field_debug_change(input_image_debug_data, prompt_debug_data, total_second_length_debug_data):
731
+ global input_image_debug_value, prompt_debug_value, total_second_length_debug_value
732
+ input_image_debug_value = input_image_debug_data
733
+ prompt_debug_value = prompt_debug_data
734
+ total_second_length_debug_value = total_second_length_debug_data
735
+ return []
736
+
737
+ input_image_debug.upload(
738
+ fn=handle_field_debug_change,
739
+ inputs=[input_image_debug, prompt_debug, total_second_length_debug],
740
+ outputs=[]
741
+ )
742
+
743
+ prompt_debug.change(
744
+ fn=handle_field_debug_change,
745
+ inputs=[input_image_debug, prompt_debug, total_second_length_debug],
746
+ outputs=[]
747
+ )
748
+
749
+ total_second_length_debug.change(
750
+ fn=handle_field_debug_change,
751
+ inputs=[input_image_debug, prompt_debug, total_second_length_debug],
752
+ outputs=[]
753
+ )
754
+
755
+ block.launch(mcp_server=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app_v2v.py CHANGED
@@ -1,779 +1,774 @@
1
- from diffusers_helper.hf_login import login
2
-
3
- import os
4
-
5
- os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
6
-
7
- import spaces
8
- import gradio as gr
9
- import torch
10
- import traceback
11
- import einops
12
- import safetensors.torch as sf
13
- import numpy as np
14
- import argparse
15
- import random
16
- import math
17
- # 20250506 pftq: Added for video input loading
18
- import decord
19
- # 20250506 pftq: Added for progress bars in video_encode
20
- from tqdm import tqdm
21
- # 20250506 pftq: Normalize file paths for Windows compatibility
22
- import pathlib
23
- # 20250506 pftq: for easier to read timestamp
24
- from datetime import datetime
25
- # 20250508 pftq: for saving prompt to mp4 comments metadata
26
- import imageio_ffmpeg
27
- import tempfile
28
- import shutil
29
- import subprocess
30
-
31
- from PIL import Image
32
- from diffusers import AutoencoderKLHunyuanVideo
33
- from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
34
- from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
35
- from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
36
- from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
37
- from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
38
- from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
39
- from diffusers_helper.thread_utils import AsyncStream, async_run
40
- from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
41
- from transformers import SiglipImageProcessor, SiglipVisionModel
42
- from diffusers_helper.clip_vision import hf_clip_vision_encode
43
- from diffusers_helper.bucket_tools import find_nearest_bucket
44
- from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
45
-
46
- if torch.cuda.device_count() > 0:
47
- free_mem_gb = get_cuda_free_memory_gb(gpu)
48
- high_vram = free_mem_gb > 60
49
-
50
- print(f'Free VRAM {free_mem_gb} GB')
51
- print(f'High-VRAM Mode: {high_vram}')
52
-
53
-
54
-
55
- text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
56
- text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
57
- tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
58
- tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
59
- vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()
60
-
61
- feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
62
- image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()
63
-
64
- transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu()
65
-
66
- vae.eval()
67
- text_encoder.eval()
68
- text_encoder_2.eval()
69
- image_encoder.eval()
70
- transformer.eval()
71
-
72
- if not high_vram:
73
- vae.enable_slicing()
74
- vae.enable_tiling()
75
-
76
- transformer.high_quality_fp32_output_for_inference = True
77
- print('transformer.high_quality_fp32_output_for_inference = True')
78
-
79
- transformer.to(dtype=torch.bfloat16)
80
- vae.to(dtype=torch.float16)
81
- image_encoder.to(dtype=torch.float16)
82
- text_encoder.to(dtype=torch.float16)
83
- text_encoder_2.to(dtype=torch.float16)
84
-
85
- vae.requires_grad_(False)
86
- text_encoder.requires_grad_(False)
87
- text_encoder_2.requires_grad_(False)
88
- image_encoder.requires_grad_(False)
89
- transformer.requires_grad_(False)
90
-
91
- if not high_vram:
92
- # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
93
- DynamicSwapInstaller.install_model(transformer, device=gpu)
94
- DynamicSwapInstaller.install_model(text_encoder, device=gpu)
95
- else:
96
- text_encoder.to(gpu)
97
- text_encoder_2.to(gpu)
98
- image_encoder.to(gpu)
99
- vae.to(gpu)
100
- transformer.to(gpu)
101
-
102
- stream = AsyncStream()
103
-
104
- outputs_folder = './outputs/'
105
- os.makedirs(outputs_folder, exist_ok=True)
106
-
107
- input_video_debug_value = None
108
- prompt_debug_value = None
109
- total_second_length_debug_value = None
110
-
111
- @spaces.GPU()
112
- @torch.no_grad()
113
- def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, device="cuda", width=None, height=None):
114
- """
115
- Encode a video into latent representations using the VAE.
116
-
117
- Args:
118
- video_path: Path to the input video file.
119
- vae: AutoencoderKLHunyuanVideo model.
120
- height, width: Target resolution for resizing frames.
121
- vae_batch_size: Number of frames to process per batch.
122
- device: Device for computation (e.g., "cuda").
123
-
124
- Returns:
125
- start_latent: Latent of the first frame (for compatibility with original code).
126
- input_image_np: First frame as numpy array (for CLIP vision encoding).
127
- history_latents: Latents of all frames (shape: [1, channels, frames, height//8, width//8]).
128
- fps: Frames per second of the input video.
129
- """
130
- # 20250506 pftq: Normalize video path for Windows compatibility
131
- video_path = str(pathlib.Path(video_path).resolve())
132
- print(f"Processing video: {video_path}")
133
-
134
- # 20250506 pftq: Check CUDA availability and fallback to CPU if needed
135
- if device == "cuda" and not torch.cuda.is_available():
136
- print("CUDA is not available, falling back to CPU")
137
- device = "cpu"
138
-
139
- try:
140
- # 20250506 pftq: Load video and get FPS
141
- print("Initializing VideoReader...")
142
- vr = decord.VideoReader(video_path)
143
- fps = vr.get_avg_fps() # Get input video FPS
144
- num_real_frames = len(vr)
145
- print(f"Video loaded: {num_real_frames} frames, FPS: {fps}")
146
-
147
- # Truncate to nearest latent size (multiple of 4)
148
- latent_size_factor = 4
149
- num_frames = (num_real_frames // latent_size_factor) * latent_size_factor
150
- if num_frames != num_real_frames:
151
- print(f"Truncating video from {num_real_frames} to {num_frames} frames for latent size compatibility")
152
- num_real_frames = num_frames
153
-
154
- # 20250506 pftq: Read frames
155
- print("Reading video frames...")
156
- frames = vr.get_batch(range(num_real_frames)).asnumpy() # Shape: (num_real_frames, height, width, channels)
157
- print(f"Frames read: {frames.shape}")
158
-
159
- # 20250506 pftq: Get native video resolution
160
- native_height, native_width = frames.shape[1], frames.shape[2]
161
- print(f"Native video resolution: {native_width}x{native_height}")
162
-
163
- # 20250506 pftq: Use native resolution if height/width not specified, otherwise use provided values
164
- target_height = native_height if height is None else height
165
- target_width = native_width if width is None else width
166
-
167
- # 20250506 pftq: Adjust to nearest bucket for model compatibility
168
- if not no_resize:
169
- target_height, target_width = find_nearest_bucket(target_height, target_width, resolution=resolution)
170
- print(f"Adjusted resolution: {target_width}x{target_height}")
171
- else:
172
- print(f"Using native resolution without resizing: {target_width}x{target_height}")
173
-
174
- # 20250506 pftq: Preprocess frames to match original image processing
175
- processed_frames = []
176
- for i, frame in enumerate(frames):
177
- #print(f"Preprocessing frame {i+1}/{num_frames}")
178
- frame_np = resize_and_center_crop(frame, target_width=target_width, target_height=target_height)
179
- processed_frames.append(frame_np)
180
- processed_frames = np.stack(processed_frames) # Shape: (num_real_frames, height, width, channels)
181
- print(f"Frames preprocessed: {processed_frames.shape}")
182
-
183
- # 20250506 pftq: Save first frame for CLIP vision encoding
184
- input_image_np = processed_frames[0]
185
-
186
- # 20250506 pftq: Convert to tensor and normalize to [-1, 1]
187
- print("Converting frames to tensor...")
188
- frames_pt = torch.from_numpy(processed_frames).float() / 127.5 - 1
189
- frames_pt = frames_pt.permute(0, 3, 1, 2) # Shape: (num_real_frames, channels, height, width)
190
- frames_pt = frames_pt.unsqueeze(0) # Shape: (1, num_real_frames, channels, height, width)
191
- frames_pt = frames_pt.permute(0, 2, 1, 3, 4) # Shape: (1, channels, num_real_frames, height, width)
192
- print(f"Tensor shape: {frames_pt.shape}")
193
-
194
- # 20250507 pftq: Save pixel frames for use in worker
195
- input_video_pixels = frames_pt.cpu()
196
-
197
- # 20250506 pftq: Move to device
198
- print(f"Moving tensor to device: {device}")
199
- frames_pt = frames_pt.to(device)
200
- print("Tensor moved to device")
201
-
202
- # 20250506 pftq: Move VAE to device
203
- print(f"Moving VAE to device: {device}")
204
- vae.to(device)
205
- print("VAE moved to device")
206
-
207
- # 20250506 pftq: Encode frames in batches
208
- print(f"Encoding input video frames in VAE batch size {vae_batch_size} (reduce if memory issues here or if forcing video resolution)")
209
- latents = []
210
- vae.eval()
211
- with torch.no_grad():
212
- for i in tqdm(range(0, frames_pt.shape[2], vae_batch_size), desc="Encoding video frames", mininterval=0.1):
213
- #print(f"Encoding batch {i//vae_batch_size + 1}: frames {i} to {min(i + vae_batch_size, frames_pt.shape[2])}")
214
- batch = frames_pt[:, :, i:i + vae_batch_size] # Shape: (1, channels, batch_size, height, width)
215
- try:
216
- # 20250506 pftq: Log GPU memory before encoding
217
- if device == "cuda":
218
- free_mem = torch.cuda.memory_allocated() / 1024**3
219
- #print(f"GPU memory before encoding: {free_mem:.2f} GB")
220
- batch_latent = vae_encode(batch, vae)
221
- # 20250506 pftq: Synchronize CUDA to catch issues
222
- if device == "cuda":
223
- torch.cuda.synchronize()
224
- #print(f"GPU memory after encoding: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
225
- latents.append(batch_latent)
226
- #print(f"Batch encoded, latent shape: {batch_latent.shape}")
227
- except RuntimeError as e:
228
- print(f"Error during VAE encoding: {str(e)}")
229
- if device == "cuda" and "out of memory" in str(e).lower():
230
- print("CUDA out of memory, try reducing vae_batch_size or using CPU")
231
- raise
232
-
233
- # 20250506 pftq: Concatenate latents
234
- print("Concatenating latents...")
235
- history_latents = torch.cat(latents, dim=2) # Shape: (1, channels, frames, height//8, width//8)
236
- print(f"History latents shape: {history_latents.shape}")
237
-
238
- # 20250506 pftq: Get first frame's latent
239
- start_latent = history_latents[:, :, :1] # Shape: (1, channels, 1, height//8, width//8)
240
- print(f"Start latent shape: {start_latent.shape}")
241
-
242
- # 20250506 pftq: Move VAE back to CPU to free GPU memory
243
- if device == "cuda":
244
- vae.to(cpu)
245
- torch.cuda.empty_cache()
246
- print("VAE moved back to CPU, CUDA cache cleared")
247
-
248
- return start_latent, input_image_np, history_latents, fps, target_height, target_width, input_video_pixels
249
-
250
- except Exception as e:
251
- print(f"Error in video_encode: {str(e)}")
252
- raise
253
-
254
- # 20250508 pftq: for saving prompt to mp4 metadata comments
255
- def set_mp4_comments_imageio_ffmpeg(input_file, comments):
256
- try:
257
- # Get the path to the bundled FFmpeg binary from imageio-ffmpeg
258
- ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
259
-
260
- # Check if input file exists
261
- if not os.path.exists(input_file):
262
- print(f"Error: Input file {input_file} does not exist")
263
- return False
264
-
265
- # Create a temporary file path
266
- temp_file = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False).name
267
-
268
- # FFmpeg command using the bundled binary
269
- command = [
270
- ffmpeg_path, # Use imageio-ffmpeg's FFmpeg
271
- '-i', input_file, # input file
272
- '-metadata', f'comment={comments}', # set comment metadata
273
- '-c:v', 'copy', # copy video stream without re-encoding
274
- '-c:a', 'copy', # copy audio stream without re-encoding
275
- '-y', # overwrite output file if it exists
276
- temp_file # temporary output file
277
- ]
278
-
279
- # Run the FFmpeg command
280
- result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
281
-
282
- if result.returncode == 0:
283
- # Replace the original file with the modified one
284
- shutil.move(temp_file, input_file)
285
- print(f"Successfully added comments to {input_file}")
286
- return True
287
- else:
288
- # Clean up temp file if FFmpeg fails
289
- if os.path.exists(temp_file):
290
- os.remove(temp_file)
291
- print(f"Error: FFmpeg failed with message:\n{result.stderr}")
292
- return False
293
-
294
- except Exception as e:
295
- # Clean up temp file in case of other errors
296
- if 'temp_file' in locals() and os.path.exists(temp_file):
297
- os.remove(temp_file)
298
- print(f"Error saving prompt to video metadata, ffmpeg may be required: "+str(e))
299
- return False
300
-
301
- # 20250506 pftq: Modified worker to accept video input and clean frame count
302
- @spaces.GPU()
303
- @torch.no_grad()
304
- def worker(input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
305
-
306
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
307
-
308
- try:
309
- # Clean GPU
310
- if not high_vram:
311
- unload_complete_models(
312
- text_encoder, text_encoder_2, image_encoder, vae, transformer
313
- )
314
-
315
- # Text encoding
316
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
317
-
318
- if not high_vram:
319
- fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
320
- load_model_as_complete(text_encoder_2, target_device=gpu)
321
-
322
- llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
323
-
324
- if cfg == 1:
325
- llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
326
- else:
327
- llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
328
-
329
- llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
330
- llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
331
-
332
- # 20250506 pftq: Processing input video instead of image
333
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))
334
-
335
- # 20250506 pftq: Encode video
336
- #H, W = 640, 640 # Default resolution, will be adjusted
337
- #height, width = find_nearest_bucket(H, W, resolution=640)
338
- #start_latent, input_image_np, history_latents, fps = video_encode(input_video, vae, height, width, vae_batch_size=16, device=gpu)
339
- start_latent, input_image_np, video_latents, fps, height, width, input_video_pixels = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)
340
-
341
- #Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
342
-
343
- # CLIP Vision
344
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
345
-
346
- if not high_vram:
347
- load_model_as_complete(image_encoder, target_device=gpu)
348
-
349
- image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
350
- image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
351
-
352
- # Dtype
353
- llama_vec = llama_vec.to(transformer.dtype)
354
- llama_vec_n = llama_vec_n.to(transformer.dtype)
355
- clip_l_pooler = clip_l_pooler.to(transformer.dtype)
356
- clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
357
- image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
358
-
359
- total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
360
- total_latent_sections = int(max(round(total_latent_sections), 1))
361
-
362
- for idx in range(batch):
363
- if batch > 1:
364
- print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
365
-
366
- #job_id = generate_timestamp()
367
- job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepackf1-videoinput_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}" # 20250506 pftq: easier to read timestamp and filename
368
-
369
- # Sampling
370
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
371
-
372
- rnd = torch.Generator("cpu").manual_seed(seed)
373
-
374
- # 20250506 pftq: Initialize history_latents with video latents
375
- history_latents = video_latents.cpu()
376
- total_generated_latent_frames = history_latents.shape[2]
377
- # 20250506 pftq: Initialize history_pixels to fix UnboundLocalError
378
- history_pixels = None
379
- previous_video = None
380
-
381
- # 20250507 pftq: hot fix for initial video being corrupted by vae encoding, issue with ghosting because of slight differences
382
- #history_pixels = input_video_pixels
383
- #save_bcthw_as_mp4(vae_decode(video_latents, vae).cpu(), os.path.join(outputs_folder, f'{job_id}_input_video.mp4'), fps=fps, crf=mp4_crf) # 20250507 pftq: test fast movement corrupted by vae encoding if vae batch size too low
384
-
385
- for section_index in range(total_latent_sections):
386
- if stream.input_queue.top() == 'end':
387
- stream.output_queue.push(('end', None))
388
- return
389
-
390
- print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
391
-
392
- if not high_vram:
393
- unload_complete_models()
394
- move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
395
-
396
- if use_teacache:
397
- transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
398
- else:
399
- transformer.initialize_teacache(enable_teacache=False)
400
-
401
- def callback(d):
402
- preview = d['denoised']
403
- preview = vae_decode_fake(preview)
404
-
405
- preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
406
- preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
407
-
408
- if stream.input_queue.top() == 'end':
409
- stream.output_queue.push(('end', None))
410
- raise KeyboardInterrupt('User ends the task.')
411
-
412
- current_step = d['i'] + 1
413
- percentage = int(100.0 * current_step / steps)
414
- hint = f'Sampling {current_step}/{steps}'
415
- desc = f'Total frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps) :.2f} seconds (FPS-{fps}), Seed: {seed}, Video {idx+1} of {batch}. The video is generating part {section_index+1} of {total_latent_sections}...'
416
- stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
417
- return
418
-
419
- # 20250506 pftq: Use user-specified number of context frames, matching original allocation for num_clean_frames=2
420
- available_frames = history_latents.shape[2] # Number of latent frames
421
- max_pixel_frames = min(latent_window_size * 4 - 3, available_frames * 4) # Cap at available pixel frames
422
- adjusted_latent_frames = max(1, (max_pixel_frames + 3) // 4) # Convert back to latent frames
423
- # Adjust num_clean_frames to match original behavior: num_clean_frames=2 means 1 frame for clean_latents_1x
424
- effective_clean_frames = max(0, num_clean_frames - 1) if num_clean_frames > 1 else 0
425
- effective_clean_frames = min(effective_clean_frames, available_frames - 2) if available_frames > 2 else 0 # 20250507 pftq: changed 1 to 2 for edge case for <=1 sec videos
426
- num_2x_frames = min(2, max(1, available_frames - effective_clean_frames - 1)) if available_frames > effective_clean_frames + 1 else 0 # 20250507 pftq: subtracted 1 for edge case for <=1 sec videos
427
- num_4x_frames = min(16, max(1, available_frames - effective_clean_frames - num_2x_frames)) if available_frames > effective_clean_frames + num_2x_frames else 0 # 20250507 pftq: Edge case for <=1 sec
428
-
429
- total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
430
- total_context_frames = min(total_context_frames, available_frames) # 20250507 pftq: Edge case for <=1 sec videos
431
-
432
- indices = torch.arange(0, sum([1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames])).unsqueeze(0) # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
433
- clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split(
434
- [1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames], dim=1 # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
435
- )
436
- clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
437
-
438
- # 20250506 pftq: Split history_latents dynamically based on available frames
439
- fallback_frame_count = 2 # 20250507 pftq: Changed 0 to 2 Edge case for <=1 sec videos
440
- context_frames = history_latents[:, :, -total_context_frames:, :, :] if total_context_frames > 0 else history_latents[:, :, :fallback_frame_count, :, :]
441
- if total_context_frames > 0:
442
- split_sizes = [num_4x_frames, num_2x_frames, effective_clean_frames]
443
- split_sizes = [s for s in split_sizes if s > 0] # Remove zero sizes
444
- if split_sizes:
445
- splits = context_frames.split(split_sizes, dim=2)
446
- split_idx = 0
447
- clean_latents_4x = splits[split_idx] if num_4x_frames > 0 else history_latents[:, :, :fallback_frame_count, :, :]
448
- if clean_latents_4x.shape[2] < 2: # 20250507 pftq: edge case for <=1 sec videos
449
- clean_latents_4x = torch.cat([clean_latents_4x, clean_latents_4x[:, :, -1:, :, :]], dim=2)[:, :, :2, :, :]
450
- split_idx += 1 if num_4x_frames > 0 else 0
451
- clean_latents_2x = splits[split_idx] if num_2x_frames > 0 and split_idx < len(splits) else history_latents[:, :, :fallback_frame_count, :, :]
452
- if clean_latents_2x.shape[2] < 2: # 20250507 pftq: edge case for <=1 sec videos
453
- clean_latents_2x = torch.cat([clean_latents_2x, clean_latents_2x[:, :, -1:, :, :]], dim=2)[:, :, :2, :, :]
454
- split_idx += 1 if num_2x_frames > 0 else 0
455
- clean_latents_1x = splits[split_idx] if effective_clean_frames > 0 and split_idx < len(splits) else history_latents[:, :, :fallback_frame_count, :, :]
456
- else:
457
- clean_latents_4x = clean_latents_2x = clean_latents_1x = history_latents[:, :, :fallback_frame_count, :, :]
458
- else:
459
- clean_latents_4x = clean_latents_2x = clean_latents_1x = history_latents[:, :, :fallback_frame_count, :, :]
460
-
461
- clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
462
-
463
- # 20250507 pftq: Fix for <=1 sec videos.
464
- max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)
465
-
466
- generated_latents = sample_hunyuan(
467
- transformer=transformer,
468
- sampler='unipc',
469
- width=width,
470
- height=height,
471
- frames=max_frames,
472
- real_guidance_scale=cfg,
473
- distilled_guidance_scale=gs,
474
- guidance_rescale=rs,
475
- num_inference_steps=steps,
476
- generator=rnd,
477
- prompt_embeds=llama_vec,
478
- prompt_embeds_mask=llama_attention_mask,
479
- prompt_poolers=clip_l_pooler,
480
- negative_prompt_embeds=llama_vec_n,
481
- negative_prompt_embeds_mask=llama_attention_mask_n,
482
- negative_prompt_poolers=clip_l_pooler_n,
483
- device=gpu,
484
- dtype=torch.bfloat16,
485
- image_embeddings=image_encoder_last_hidden_state,
486
- latent_indices=latent_indices,
487
- clean_latents=clean_latents,
488
- clean_latent_indices=clean_latent_indices,
489
- clean_latents_2x=clean_latents_2x,
490
- clean_latent_2x_indices=clean_latent_2x_indices,
491
- clean_latents_4x=clean_latents_4x,
492
- clean_latent_4x_indices=clean_latent_4x_indices,
493
- callback=callback,
494
- )
495
-
496
- total_generated_latent_frames += int(generated_latents.shape[2])
497
- history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
498
-
499
- if not high_vram:
500
- offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
501
- load_model_as_complete(vae, target_device=gpu)
502
-
503
- real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
504
-
505
- if history_pixels is None:
506
- history_pixels = vae_decode(real_history_latents, vae).cpu()
507
- else:
508
- section_latent_frames = latent_window_size * 2
509
- overlapped_frames = min(latent_window_size * 4 - 3, history_pixels.shape[2])
510
-
511
- #if section_index == 0:
512
- #extra_latents = 1 # Add up to 2 extra latent frames for smoother overlap to initial video
513
- #extra_pixel_frames = extra_latents * 4 # Approx. 4 pixel frames per latent
514
- #overlapped_frames = min(overlapped_frames + extra_pixel_frames, history_pixels.shape[2], section_latent_frames * 4)
515
-
516
- current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
517
- history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
518
-
519
- if not high_vram:
520
- unload_complete_models()
521
-
522
- output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
523
-
524
- # 20250506 pftq: Use input video FPS for output
525
- save_bcthw_as_mp4(history_pixels, output_filename, fps=fps, crf=mp4_crf)
526
- print(f"Latest video saved: {output_filename}")
527
- # 20250508 pftq: Save prompt to mp4 metadata comments
528
- set_mp4_comments_imageio_ffmpeg(output_filename, f"Prompt: {prompt} | Negative Prompt: {n_prompt}");
529
- print(f"Prompt saved to mp4 metadata comments: {output_filename}")
530
-
531
- # 20250506 pftq: Clean up previous partial files
532
- if previous_video is not None and os.path.exists(previous_video):
533
- try:
534
- os.remove(previous_video)
535
- print(f"Previous partial video deleted: {previous_video}")
536
- except Exception as e:
537
- print(f"Error deleting previous partial video {previous_video}: {e}")
538
- previous_video = output_filename
539
-
540
- print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
541
-
542
- stream.output_queue.push(('file', output_filename))
543
-
544
- seed = (seed + 1) % np.iinfo(np.int32).max
545
-
546
- except:
547
- traceback.print_exc()
548
-
549
- if not high_vram:
550
- unload_complete_models(
551
- text_encoder, text_encoder_2, image_encoder, vae, transformer
552
- )
553
-
554
- stream.output_queue.push(('end', None))
555
- return
556
-
557
def get_duration(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
    """Estimate the GPU-seconds budget to reserve for one generation request.

    The budget is two minutes of compute per second of requested video.  When
    the debug length override is active, the same formula is applied to the
    override but capped at 600 seconds.  Only ``total_second_length`` (or its
    debug override) influences the result; the other parameters exist so the
    signature mirrors ``process`` for the ``spaces.GPU`` decorator.
    """
    override = total_second_length_debug_value
    if override is None:
        return total_second_length * 60 * 2
    return min(override * 60 * 2, 600)
562
-
563
# 20250506 pftq: Modified process to pass clean frame count, etc from video_encode
@spaces.GPU(duration=get_duration)
def process(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
    """Gradio generator: launch the background worker and stream its output.

    Yields 6-tuples of updates for (result_video, preview_image,
    progress_desc, progress_bar, start_button, end_button).  On CPU-only
    machines it warns and bails out immediately.
    """
    global stream, high_vram, input_video_debug_value, prompt_debug_value, total_second_length_debug_value

    if torch.cuda.device_count() == 0:
        gr.Warning('Set this space to GPU config to make it work.')
        return None, None, None, None, None, None

    # Debug widgets override the regular inputs once, then reset themselves.
    if input_video_debug_value is not None:
        input_video = input_video_debug_value
        input_video_debug_value = None

    if prompt_debug_value is not None:
        prompt = prompt_debug_value
        prompt_debug_value = None

    if total_second_length_debug_value is not None:
        total_second_length = total_second_length_debug_value
        total_second_length_debug_value = None

    if randomize_seed:
        seed = random.randint(0, np.iinfo(np.int32).max)

    # 20250506 pftq: Updated assertion for video input
    assert input_video is not None, 'No input video!'

    yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)

    # 20250507 pftq: Even the H100 needs offloading if the video dimensions are 720p or higher
    if high_vram and (no_resize or resolution > 640):
        print("Disabling high vram mode due to no resize and/or potentially higher resolution...")
        high_vram = False
        vae.enable_slicing()
        vae.enable_tiling()
        DynamicSwapInstaller.install_model(transformer, device=gpu)
        DynamicSwapInstaller.install_model(text_encoder, device=gpu)

    # 20250508 pftq: automatically set distilled cfg to 1 if cfg is used
    if cfg > 1:
        gs = 1

    stream = AsyncStream()

    # 20250506 pftq: Pass num_clean_frames, vae_batch, etc
    async_run(worker, input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)

    output_filename = None
    # Bug fix: 'end' can arrive before any 'progress' message (e.g. the worker
    # fails during encoding), in which case `desc` was previously unbound and
    # the final yield raised UnboundLocalError.
    desc = ''

    while True:
        flag, data = stream.output_queue.next()

        if flag == 'file':
            output_filename = data
            yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)

        if flag == 'progress':
            preview, desc, html = data
            # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
            yield output_filename, gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)

        if flag == 'end':
            yield output_filename, gr.update(visible=False), desc + ' Video complete.', '', gr.update(interactive=True), gr.update(interactive=False)
            break
628
def end_process():
    """Ask the background worker to stop at its next cancellation checkpoint."""
    queue = stream.input_queue
    queue.push('end')
630
-
631
# Preset prompt suggestions; each entry is wrapped in a one-element list
# because gr.Dataset expects one sample (row) per component.
quick_prompts = [
    ['The girl dances gracefully, with clear movements, full of charm.'],
    ['A character doing some simple body movements.'],
]
636
-
637
css = make_progress_bar_css()
block = gr.Blocks(css=css).queue()
with block:
    # On CPU-only spaces, surface a prominent warning instead of failing later.
    if torch.cuda.device_count() == 0:
        with gr.Row():
            gr.HTML("""
            <p style="background-color: red;"><big><big><big><b>⚠️To use FramePack, <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR?duplicate=true">duplicate this space</a> and set a GPU with 30 GB VRAM.</b>

            You can't use FramePack directly here because this space runs on a CPU, which is not enough for FramePack. Please provide <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR/discussions/new">feedback</a> if you have issues.
            </big></big></big></p>
            """)
    # 20250506 pftq: title reflects the video-input (video-extension) capability.
    gr.Markdown('# Framepack F1 with Image Input or with Video Input (Video Extension)')
    with gr.Row():
        with gr.Column():
            # 20250506 pftq: a video (not an image) drives the generation.
            input_video = gr.Video(sources='upload', label="Input Video", height=320)
            prompt = gr.Textbox(label="Prompt", value='')

            with gr.Row():
                start_button = gr.Button(value="Start Generation")
                end_button = gr.Button(value="End Generation", interactive=False)

            with gr.Accordion("Advanced settings", open=False):
                with gr.Row():
                    use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed, but often makes hands and fingers slightly worse.')
                    no_resize = gr.Checkbox(label='Force Original Video Resolution (No Resizing)', value=False, info='Might run out of VRAM (720p requires > 24GB VRAM).')

                randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
                seed = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, randomize=True)

                batch = gr.Slider(label="Batch Size (Number of Videos)", minimum=1, maximum=1000, value=1, step=1, info='Generate multiple videos each with a different seed.')

                resolution = gr.Number(label="Resolution (max width or height)", value=640, precision=0, visible=False)

                total_second_length = gr.Slider(label="Additional Video Length to Generate (Seconds)", minimum=1, maximum=120, value=5, step=0.1)

                # 20250506 pftq: lower default distilled guidance improves adherence to the input video.
                gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=3.0, step=0.01, info='Prompt adherence at the cost of less details from the input video, but to a lesser extent than Context Frames.')
                cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=True, info='Use this instead of Distilled for more detail/control + Negative Prompt (make sure Distilled set to 1). Doubles render time.')  # Should not change
                rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False)  # Should not change

                n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=True, info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
                steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Increase for more quality, especially if using high non-distilled CFG.')

                # 20250506 pftq: renamed to "Number of Context Frames".
                num_clean_frames = gr.Slider(label="Number of Context Frames", minimum=2, maximum=10, value=5, step=1, info="Retain more video details but increase memory use. Reduce to 2 if memory issues.")

                # Pick a VAE batch default that scales with available VRAM.
                default_vae = 32
                if high_vram:
                    default_vae = 128
                elif free_mem_gb >= 20:
                    default_vae = 64

                vae_batch = gr.Slider(label="VAE Batch Size for Input Video", minimum=4, maximum=256, value=default_vae, step=4, info="Reduce if running out of memory. Increase for better quality frames during fast motion.")

                latent_window_size = gr.Slider(label="Latent Window Size", minimum=9, maximum=33, value=9, step=1, visible=True, info='Generate more frames at a time (larger chunks). Less degradation and better blending but higher VRAM cost.')

                gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")

                mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")

            with gr.Row():
                # Hidden-ish debug inputs that override the regular ones once.
                input_video_debug = gr.Video(sources='upload', label="Input Video Debug", height=320)
                prompt_debug = gr.Textbox(label="Prompt Debug", value='')
                total_second_length_debug = gr.Slider(label="Additional Video Length to Generate (Seconds) Debug", minimum=1, maximum=120, value=1, step=0.1)

        with gr.Column():
            preview_image = gr.Image(label="Next Latents", height=200, visible=False)
            result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
            progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
            progress_bar = gr.HTML('', elem_classes='no-generating-animation')

    with gr.Row(visible=False):
        gr.Examples(
            examples=[
                [
                    "./img_examples/Example1.mp4",  # input_video
                    "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
                    "",  # n_prompt
                    True,  # randomize_seed
                    42,  # seed
                    1,  # batch
                    640,  # resolution
                    1,  # total_second_length
                    9,  # latent_window_size
                    25,  # steps
                    1.0,  # cfg
                    10.0,  # gs
                    0.0,  # rs
                    6,  # gpu_memory_preservation
                    False,  # use_teacache
                    False,  # no_resize
                    16,  # mp4_crf
                    5,  # num_clean_frames
                    default_vae,
                ],
            ],
            run_on_click=True,
            fn=process,
            inputs=[input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch],
            outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
            cache_examples=True,
        )

    gr.HTML("""
    <div style="text-align:center; margin-top:20px;">Share your results and find ideas at the <a href="https://x.com/search?q=framepack&f=live" target="_blank">FramePack Twitter (X) thread</a></div>
    """)

    # 20250506 pftq: inputs list includes num_clean_frames and vae_batch.
    ips = [input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
    start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
    end_button.click(fn=end_process)

    def handle_field_debug_change(input_video_debug_data, prompt_debug_data, total_second_length_debug_data):
        """Mirror the debug widgets into the module-level override globals."""
        global input_video_debug_value, prompt_debug_value, total_second_length_debug_value
        input_video_debug_value = input_video_debug_data
        prompt_debug_value = prompt_debug_data
        total_second_length_debug_value = total_second_length_debug_data
        return []

    # Any change to a debug widget refreshes all three overrides at once.
    input_video_debug.upload(
        fn=handle_field_debug_change,
        inputs=[input_video_debug, prompt_debug, total_second_length_debug],
        outputs=[]
    )

    prompt_debug.change(
        fn=handle_field_debug_change,
        inputs=[input_video_debug, prompt_debug, total_second_length_debug],
        outputs=[]
    )

    total_second_length_debug.change(
        fn=handle_field_debug_change,
        inputs=[input_video_debug, prompt_debug, total_second_length_debug],
        outputs=[]
    )

block.launch(ssr_mode=False)
 
1
+ from diffusers_helper.hf_login import login
2
+
3
+ import os
4
+
5
+ os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
6
+
7
+ import spaces
8
+ import gradio as gr
9
+ import torch
10
+ import traceback
11
+ import einops
12
+ import safetensors.torch as sf
13
+ import numpy as np
14
+ import argparse
15
+ import random
16
+ import math
17
+ # 20250506 pftq: Added for video input loading
18
+ import decord
19
+ # 20250506 pftq: Added for progress bars in video_encode
20
+ from tqdm import tqdm
21
+ # 20250506 pftq: Normalize file paths for Windows compatibility
22
+ import pathlib
23
+ # 20250506 pftq: for easier to read timestamp
24
+ from datetime import datetime
25
+ # 20250508 pftq: for saving prompt to mp4 comments metadata
26
+ import imageio_ffmpeg
27
+ import tempfile
28
+ import shutil
29
+ import subprocess
30
+
31
+ from PIL import Image
32
+ from diffusers import AutoencoderKLHunyuanVideo
33
+ from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
34
+ from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
35
+ from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
36
+ from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
37
+ from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
38
+ from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
39
+ from diffusers_helper.thread_utils import AsyncStream, async_run
40
+ from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
41
+ from transformers import SiglipImageProcessor, SiglipVisionModel
42
+ from diffusers_helper.clip_vision import hf_clip_vision_encode
43
+ from diffusers_helper.bucket_tools import find_nearest_bucket
44
+ from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
45
+
46
# Detect available VRAM.  Bug fix: define CPU-safe defaults first — the
# original only assigned `free_mem_gb`/`high_vram` inside the CUDA guard,
# so the unconditional `if not high_vram:` checks below raised NameError
# on CPU-only machines (exactly the case the UI warning targets).
free_mem_gb = 0
high_vram = False
if torch.cuda.device_count() > 0:
    free_mem_gb = get_cuda_free_memory_gb(gpu)
    high_vram = free_mem_gb > 80

    print(f'Free VRAM {free_mem_gb} GB')
    print(f'High-VRAM Mode: {high_vram}')


# HunyuanVideo text encoders, tokenizers and VAE; kept on CPU until needed.
text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()

# SigLIP vision tower used for the CLIP-vision conditioning step.
feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()

# FramePack-F1 packed video transformer.
transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu()

vae.eval()
text_encoder.eval()
text_encoder_2.eval()
image_encoder.eval()
transformer.eval()

if not high_vram:
    vae.enable_slicing()
    vae.enable_tiling()

transformer.high_quality_fp32_output_for_inference = True
print('transformer.high_quality_fp32_output_for_inference = True')

transformer.to(dtype=torch.bfloat16)
vae.to(dtype=torch.float16)
image_encoder.to(dtype=torch.float16)
text_encoder.to(dtype=torch.float16)
text_encoder_2.to(dtype=torch.float16)

vae.requires_grad_(False)
text_encoder.requires_grad_(False)
text_encoder_2.requires_grad_(False)
image_encoder.requires_grad_(False)
transformer.requires_grad_(False)

if not high_vram:
    # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
    DynamicSwapInstaller.install_model(transformer, device=gpu)
    DynamicSwapInstaller.install_model(text_encoder, device=gpu)
else:
    text_encoder.to(gpu)
    text_encoder_2.to(gpu)
    image_encoder.to(gpu)
    vae.to(gpu)
    transformer.to(gpu)

stream = AsyncStream()

outputs_folder = './outputs/'
os.makedirs(outputs_folder, exist_ok=True)

# Debug overrides populated by the hidden debug widgets in the UI.
input_image_debug_value = prompt_debug_value = total_second_length_debug_value = None
108
+
109
+ @spaces.GPU()
110
+ @torch.no_grad()
111
+ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, device="cuda", width=None, height=None):
112
+ """
113
+ Encode a video into latent representations using the VAE.
114
+
115
+ Args:
116
+ video_path: Path to the input video file.
117
+ vae: AutoencoderKLHunyuanVideo model.
118
+ height, width: Target resolution for resizing frames.
119
+ vae_batch_size: Number of frames to process per batch.
120
+ device: Device for computation (e.g., "cuda").
121
+
122
+ Returns:
123
+ start_latent: Latent of the first frame (for compatibility with original code).
124
+ input_image_np: First frame as numpy array (for CLIP vision encoding).
125
+ history_latents: Latents of all frames (shape: [1, channels, frames, height//8, width//8]).
126
+ fps: Frames per second of the input video.
127
+ """
128
+ # 20250506 pftq: Normalize video path for Windows compatibility
129
+ video_path = str(pathlib.Path(video_path).resolve())
130
+ print(f"Processing video: {video_path}")
131
+
132
+ # 20250506 pftq: Check CUDA availability and fallback to CPU if needed
133
+ if device == "cuda" and not torch.cuda.is_available():
134
+ print("CUDA is not available, falling back to CPU")
135
+ device = "cpu"
136
+
137
+ try:
138
+ # 20250506 pftq: Load video and get FPS
139
+ print("Initializing VideoReader...")
140
+ vr = decord.VideoReader(video_path)
141
+ fps = vr.get_avg_fps() # Get input video FPS
142
+ num_real_frames = len(vr)
143
+ print(f"Video loaded: {num_real_frames} frames, FPS: {fps}")
144
+
145
+ # Truncate to nearest latent size (multiple of 4)
146
+ latent_size_factor = 4
147
+ num_frames = (num_real_frames // latent_size_factor) * latent_size_factor
148
+ if num_frames != num_real_frames:
149
+ print(f"Truncating video from {num_real_frames} to {num_frames} frames for latent size compatibility")
150
+ num_real_frames = num_frames
151
+
152
+ # 20250506 pftq: Read frames
153
+ print("Reading video frames...")
154
+ frames = vr.get_batch(range(num_real_frames)).asnumpy() # Shape: (num_real_frames, height, width, channels)
155
+ print(f"Frames read: {frames.shape}")
156
+
157
+ # 20250506 pftq: Get native video resolution
158
+ native_height, native_width = frames.shape[1], frames.shape[2]
159
+ print(f"Native video resolution: {native_width}x{native_height}")
160
+
161
+ # 20250506 pftq: Use native resolution if height/width not specified, otherwise use provided values
162
+ target_height = native_height if height is None else height
163
+ target_width = native_width if width is None else width
164
+
165
+ # 20250506 pftq: Adjust to nearest bucket for model compatibility
166
+ if not no_resize:
167
+ target_height, target_width = find_nearest_bucket(target_height, target_width, resolution=resolution)
168
+ print(f"Adjusted resolution: {target_width}x{target_height}")
169
+ else:
170
+ print(f"Using native resolution without resizing: {target_width}x{target_height}")
171
+
172
+ # 20250506 pftq: Preprocess frames to match original image processing
173
+ processed_frames = []
174
+ for i, frame in enumerate(frames):
175
+ #print(f"Preprocessing frame {i+1}/{num_frames}")
176
+ frame_np = resize_and_center_crop(frame, target_width=target_width, target_height=target_height)
177
+ processed_frames.append(frame_np)
178
+ processed_frames = np.stack(processed_frames) # Shape: (num_real_frames, height, width, channels)
179
+ print(f"Frames preprocessed: {processed_frames.shape}")
180
+
181
+ # 20250506 pftq: Save first frame for CLIP vision encoding
182
+ input_image_np = processed_frames[0]
183
+
184
+ # 20250506 pftq: Convert to tensor and normalize to [-1, 1]
185
+ print("Converting frames to tensor...")
186
+ frames_pt = torch.from_numpy(processed_frames).float() / 127.5 - 1
187
+ frames_pt = frames_pt.permute(0, 3, 1, 2) # Shape: (num_real_frames, channels, height, width)
188
+ frames_pt = frames_pt.unsqueeze(0) # Shape: (1, num_real_frames, channels, height, width)
189
+ frames_pt = frames_pt.permute(0, 2, 1, 3, 4) # Shape: (1, channels, num_real_frames, height, width)
190
+ print(f"Tensor shape: {frames_pt.shape}")
191
+
192
+ # 20250507 pftq: Save pixel frames for use in worker
193
+ input_video_pixels = frames_pt.cpu()
194
+
195
+ # 20250506 pftq: Move to device
196
+ print(f"Moving tensor to device: {device}")
197
+ frames_pt = frames_pt.to(device)
198
+ print("Tensor moved to device")
199
+
200
+ # 20250506 pftq: Move VAE to device
201
+ print(f"Moving VAE to device: {device}")
202
+ vae.to(device)
203
+ print("VAE moved to device")
204
+
205
+ # 20250506 pftq: Encode frames in batches
206
+ print(f"Encoding input video frames in VAE batch size {vae_batch_size} (reduce if memory issues here or if forcing video resolution)")
207
+ latents = []
208
+ vae.eval()
209
+ with torch.no_grad():
210
+ for i in tqdm(range(0, frames_pt.shape[2], vae_batch_size), desc="Encoding video frames", mininterval=0.1):
211
+ #print(f"Encoding batch {i//vae_batch_size + 1}: frames {i} to {min(i + vae_batch_size, frames_pt.shape[2])}")
212
+ batch = frames_pt[:, :, i:i + vae_batch_size] # Shape: (1, channels, batch_size, height, width)
213
+ try:
214
+ # 20250506 pftq: Log GPU memory before encoding
215
+ if device == "cuda":
216
+ free_mem = torch.cuda.memory_allocated() / 1024**3
217
+ #print(f"GPU memory before encoding: {free_mem:.2f} GB")
218
+ batch_latent = vae_encode(batch, vae)
219
+ # 20250506 pftq: Synchronize CUDA to catch issues
220
+ if device == "cuda":
221
+ torch.cuda.synchronize()
222
+ #print(f"GPU memory after encoding: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
223
+ latents.append(batch_latent)
224
+ #print(f"Batch encoded, latent shape: {batch_latent.shape}")
225
+ except RuntimeError as e:
226
+ print(f"Error during VAE encoding: {str(e)}")
227
+ if device == "cuda" and "out of memory" in str(e).lower():
228
+ print("CUDA out of memory, try reducing vae_batch_size or using CPU")
229
+ raise
230
+
231
+ # 20250506 pftq: Concatenate latents
232
+ print("Concatenating latents...")
233
+ history_latents = torch.cat(latents, dim=2) # Shape: (1, channels, frames, height//8, width//8)
234
+ print(f"History latents shape: {history_latents.shape}")
235
+
236
+ # 20250506 pftq: Get first frame's latent
237
+ start_latent = history_latents[:, :, :1] # Shape: (1, channels, 1, height//8, width//8)
238
+ print(f"Start latent shape: {start_latent.shape}")
239
+
240
+ # 20250506 pftq: Move VAE back to CPU to free GPU memory
241
+ if device == "cuda":
242
+ vae.to(cpu)
243
+ torch.cuda.empty_cache()
244
+ print("VAE moved back to CPU, CUDA cache cleared")
245
+
246
+ return start_latent, input_image_np, history_latents, fps, target_height, target_width, input_video_pixels
247
+
248
+ except Exception as e:
249
+ print(f"Error in video_encode: {str(e)}")
250
+ raise
251
+
252
# 20250508 pftq: for saving prompt to mp4 metadata comments
def set_mp4_comments_imageio_ffmpeg(input_file, comments):
    """Embed *comments* into the MP4 metadata 'comment' tag of *input_file* in place.

    Uses the FFmpeg binary bundled with imageio-ffmpeg to stream-copy
    (no re-encoding) into a temporary file with the metadata set, then
    replaces the original file.

    Returns:
        bool: True on success, False on any failure (missing input file,
        FFmpeg error, or unexpected exception).
    """
    temp_file = None
    try:
        # Get the path to the bundled FFmpeg binary from imageio-ffmpeg
        ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()

        # Check if input file exists
        if not os.path.exists(input_file):
            print(f"Error: Input file {input_file} does not exist")
            return False

        # Create a temporary file path. Close the handle immediately so
        # FFmpeg can overwrite the file: the previous code kept the
        # NamedTemporaryFile object open (handle leak), which locks the
        # file on Windows and makes the '-y' overwrite fail.
        with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tf:
            temp_file = tf.name

        # FFmpeg command using the bundled binary
        command = [
            ffmpeg_path,                         # use imageio-ffmpeg's FFmpeg
            '-i', input_file,                    # input file
            '-metadata', f'comment={comments}',  # set comment metadata
            '-c:v', 'copy',                      # copy video stream without re-encoding
            '-c:a', 'copy',                      # copy audio stream without re-encoding
            '-y',                                # overwrite output file if it exists
            temp_file,                           # temporary output file
        ]

        # Run the FFmpeg command
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

        if result.returncode == 0:
            # Replace the original file with the modified one
            shutil.move(temp_file, input_file)
            print(f"Successfully added comments to {input_file}")
            return True

        # Clean up temp file if FFmpeg fails
        if os.path.exists(temp_file):
            os.remove(temp_file)
        print(f"Error: FFmpeg failed with message:\n{result.stderr}")
        return False

    except Exception as e:
        # Clean up temp file in case of other errors
        if temp_file is not None and os.path.exists(temp_file):
            os.remove(temp_file)
        print(f"Error saving prompt to video metadata, ffmpeg may be required: "+str(e))
        return False
298
+
299
# 20250506 pftq: Modified worker to accept video input and clean frame count
@spaces.GPU()
@torch.no_grad()
def worker(input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
    """Background generation job: extend *input_video* by *total_second_length*
    seconds per batch item, streaming progress/preview/file events through the
    global ``stream`` output queue until an ('end', None) sentinel is pushed.

    Runs entirely for side effects (MP4 files in ``outputs_folder`` and queue
    messages); communicates with the UI only via ``stream``. Cooperatively
    cancellable: it polls ``stream.input_queue`` for the string 'end'.
    NOTE(review): relies on module globals (models, ``high_vram``, ``gpu``,
    ``stream``, ``outputs_folder``) defined earlier in the file.
    """

    stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))

    try:
        # Clean GPU: in low-VRAM mode all models start offloaded and are
        # loaded on demand per stage.
        if not high_vram:
            unload_complete_models(
                text_encoder, text_encoder_2, image_encoder, vae, transformer
            )

        # Text encoding
        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))

        if not high_vram:
            fake_diffusers_current_device(text_encoder, gpu)  # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
            load_model_as_complete(text_encoder_2, target_device=gpu)

        llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

        # cfg == 1 means no classifier-free guidance: the negative embedding
        # is never used, so a zero tensor avoids a second encode.
        if cfg == 1:
            llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
        else:
            llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

        llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
        llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)

        # 20250506 pftq: Processing input video instead of image
        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))

        # 20250506 pftq: Encode video
        #H, W = 640, 640 # Default resolution, will be adjusted
        #height, width = find_nearest_bucket(H, W, resolution=640)
        #start_latent, input_image_np, history_latents, fps = video_encode(input_video, vae, height, width, vae_batch_size=16, device=gpu)
        start_latent, input_image_np, video_latents, fps, height, width, input_video_pixels = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)

        #Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))

        # CLIP Vision: image conditioning comes from the first input frame.
        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))

        if not high_vram:
            load_model_as_complete(image_encoder, target_device=gpu)

        image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
        image_encoder_last_hidden_state = image_encoder_output.last_hidden_state

        # Dtype: align all conditioning tensors with the transformer dtype.
        llama_vec = llama_vec.to(transformer.dtype)
        llama_vec_n = llama_vec_n.to(transformer.dtype)
        clip_l_pooler = clip_l_pooler.to(transformer.dtype)
        clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
        image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)

        # Each latent section decodes to latent_window_size * 4 pixel frames;
        # round to at least one section.
        total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
        total_latent_sections = int(max(round(total_latent_sections), 1))

        for idx in range(batch):
            if batch > 1:
                print(f"Beginning video {idx+1} of {batch} with seed {seed} ")

            #job_id = generate_timestamp()
            job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepackf1-videoinput_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}"  # 20250506 pftq: easier to read timestamp and filename

            # Sampling
            stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))

            rnd = torch.Generator("cpu").manual_seed(seed)

            # 20250506 pftq: Initialize history_latents with video latents
            history_latents = video_latents.cpu()
            total_generated_latent_frames = history_latents.shape[2]
            # 20250506 pftq: Initialize history_pixels to fix UnboundLocalError
            history_pixels = None
            previous_video = None

            # 20250507 pftq: hot fix for initial video being corrupted by vae encoding, issue with ghosting because of slight differences
            #history_pixels = input_video_pixels
            #save_bcthw_as_mp4(vae_decode(video_latents, vae).cpu(), os.path.join(outputs_folder, f'{job_id}_input_video.mp4'), fps=fps, crf=mp4_crf) # 20250507 pftq: test fast movement corrupted by vae encoding if vae batch size too low

            for section_index in range(total_latent_sections):
                # Cooperative cancellation between sections.
                if stream.input_queue.top() == 'end':
                    stream.output_queue.push(('end', None))
                    return

                print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')

                if not high_vram:
                    unload_complete_models()
                    move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)

                if use_teacache:
                    transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
                else:
                    transformer.initialize_teacache(enable_teacache=False)

                def callback(d):
                    # Per-step sampler callback: push a tiled preview image and
                    # progress text; also the per-step cancellation point.
                    preview = d['denoised']
                    preview = vae_decode_fake(preview)

                    preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
                    preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')

                    if stream.input_queue.top() == 'end':
                        stream.output_queue.push(('end', None))
                        raise KeyboardInterrupt('User ends the task.')

                    current_step = d['i'] + 1
                    percentage = int(100.0 * current_step / steps)
                    hint = f'Sampling {current_step}/{steps}'
                    desc = f'Total frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps) :.2f} seconds (FPS-{fps}), Seed: {seed}, Video {idx+1} of {batch}. The video is generating part {section_index+1} of {total_latent_sections}...'
                    stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
                    return

                # 20250506 pftq: Use user-specified number of context frames, matching original allocation for num_clean_frames=2
                available_frames = history_latents.shape[2]  # Number of latent frames
                max_pixel_frames = min(latent_window_size * 4 - 3, available_frames * 4)  # Cap at available pixel frames
                adjusted_latent_frames = max(1, (max_pixel_frames + 3) // 4)  # Convert back to latent frames
                # Adjust num_clean_frames to match original behavior: num_clean_frames=2 means 1 frame for clean_latents_1x
                effective_clean_frames = max(0, num_clean_frames - 1) if num_clean_frames > 1 else 0
                effective_clean_frames = min(effective_clean_frames, available_frames - 2) if available_frames > 2 else 0  # 20250507 pftq: changed 1 to 2 for edge case for <=1 sec videos
                num_2x_frames = min(2, max(1, available_frames - effective_clean_frames - 1)) if available_frames > effective_clean_frames + 1 else 0  # 20250507 pftq: subtracted 1 for edge case for <=1 sec videos
                num_4x_frames = min(16, max(1, available_frames - effective_clean_frames - num_2x_frames)) if available_frames > effective_clean_frames + num_2x_frames else 0  # 20250507 pftq: Edge case for <=1 sec

                total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
                total_context_frames = min(total_context_frames, available_frames)  # 20250507 pftq: Edge case for <=1 sec videos

                indices = torch.arange(0, sum([1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames])).unsqueeze(0)  # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
                clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split(
                    [1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames], dim=1  # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
                )
                clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)

                # 20250506 pftq: Split history_latents dynamically based on available frames
                fallback_frame_count = 2  # 20250507 pftq: Changed 0 to 2 Edge case for <=1 sec videos
                context_frames = history_latents[:, :, -total_context_frames:, :, :] if total_context_frames > 0 else history_latents[:, :, :fallback_frame_count, :, :]
                if total_context_frames > 0:
                    split_sizes = [num_4x_frames, num_2x_frames, effective_clean_frames]
                    split_sizes = [s for s in split_sizes if s > 0]  # Remove zero sizes
                    if split_sizes:
                        splits = context_frames.split(split_sizes, dim=2)
                        split_idx = 0
                        clean_latents_4x = splits[split_idx] if num_4x_frames > 0 else history_latents[:, :, :fallback_frame_count, :, :]
                        if clean_latents_4x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
                            clean_latents_4x = torch.cat([clean_latents_4x, clean_latents_4x[:, :, -1:, :, :]], dim=2)[:, :, :2, :, :]
                        split_idx += 1 if num_4x_frames > 0 else 0
                        clean_latents_2x = splits[split_idx] if num_2x_frames > 0 and split_idx < len(splits) else history_latents[:, :, :fallback_frame_count, :, :]
                        if clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
                            clean_latents_2x = torch.cat([clean_latents_2x, clean_latents_2x[:, :, -1:, :, :]], dim=2)[:, :, :2, :, :]
                        split_idx += 1 if num_2x_frames > 0 else 0
                        clean_latents_1x = splits[split_idx] if effective_clean_frames > 0 and split_idx < len(splits) else history_latents[:, :, :fallback_frame_count, :, :]
                    else:
                        clean_latents_4x = clean_latents_2x = clean_latents_1x = history_latents[:, :, :fallback_frame_count, :, :]
                else:
                    clean_latents_4x = clean_latents_2x = clean_latents_1x = history_latents[:, :, :fallback_frame_count, :, :]

                clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)

                # 20250507 pftq: Fix for <=1 sec videos.
                max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)

                generated_latents = sample_hunyuan(
                    transformer=transformer,
                    sampler='unipc',
                    width=width,
                    height=height,
                    frames=max_frames,
                    real_guidance_scale=cfg,
                    distilled_guidance_scale=gs,
                    guidance_rescale=rs,
                    num_inference_steps=steps,
                    generator=rnd,
                    prompt_embeds=llama_vec,
                    prompt_embeds_mask=llama_attention_mask,
                    prompt_poolers=clip_l_pooler,
                    negative_prompt_embeds=llama_vec_n,
                    negative_prompt_embeds_mask=llama_attention_mask_n,
                    negative_prompt_poolers=clip_l_pooler_n,
                    device=gpu,
                    dtype=torch.bfloat16,
                    image_embeddings=image_encoder_last_hidden_state,
                    latent_indices=latent_indices,
                    clean_latents=clean_latents,
                    clean_latent_indices=clean_latent_indices,
                    clean_latents_2x=clean_latents_2x,
                    clean_latent_2x_indices=clean_latent_2x_indices,
                    clean_latents_4x=clean_latents_4x,
                    clean_latent_4x_indices=clean_latent_4x_indices,
                    callback=callback,
                )

                total_generated_latent_frames += int(generated_latents.shape[2])
                history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)

                if not high_vram:
                    offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
                    load_model_as_complete(vae, target_device=gpu)

                real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]

                if history_pixels is None:
                    history_pixels = vae_decode(real_history_latents, vae).cpu()
                else:
                    # Only decode the tail section and soft-blend it onto the
                    # already-decoded pixels to avoid re-decoding everything.
                    section_latent_frames = latent_window_size * 2
                    overlapped_frames = min(latent_window_size * 4 - 3, history_pixels.shape[2])

                    #if section_index == 0:
                        #extra_latents = 1  # Add up to 2 extra latent frames for smoother overlap to initial video
                        #extra_pixel_frames = extra_latents * 4  # Approx. 4 pixel frames per latent
                        #overlapped_frames = min(overlapped_frames + extra_pixel_frames, history_pixels.shape[2], section_latent_frames * 4)

                    current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
                    history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)

                if not high_vram:
                    unload_complete_models()

                output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')

                # 20250506 pftq: Use input video FPS for output
                save_bcthw_as_mp4(history_pixels, output_filename, fps=fps, crf=mp4_crf)
                print(f"Latest video saved: {output_filename}")
                # 20250508 pftq: Save prompt to mp4 metadata comments
                set_mp4_comments_imageio_ffmpeg(output_filename, f"Prompt: {prompt} | Negative Prompt: {n_prompt}");
                print(f"Prompt saved to mp4 metadata comments: {output_filename}")

                # 20250506 pftq: Clean up previous partial files
                if previous_video is not None and os.path.exists(previous_video):
                    try:
                        os.remove(previous_video)
                        print(f"Previous partial video deleted: {previous_video}")
                    except Exception as e:
                        print(f"Error deleting previous partial video {previous_video}: {e}")
                previous_video = output_filename

                print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')

                stream.output_queue.push(('file', output_filename))

            # Next batch item gets a fresh (deterministically derived) seed.
            seed = (seed + 1) % np.iinfo(np.int32).max

    except:
        # NOTE(review): bare except is deliberate here so the queue is always
        # terminated with 'end' below, but it also swallows KeyboardInterrupt
        # raised by the cancel path in callback() — confirm this is intended.
        traceback.print_exc()

        if not high_vram:
            unload_complete_models(
                text_encoder, text_encoder_2, image_encoder, vae, transformer
            )

    # Always tell the consumer loop in process() that we are done.
    stream.output_queue.push(('end', None))
    return
554
+
555
def get_duration(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
    """Return the GPU time budget in seconds: two minutes of compute per
    requested second of video.

    When the debug override ``total_second_length_debug_value`` is set, it
    replaces the requested length and the budget is capped at 600 seconds.
    Only ``total_second_length`` is consulted; the remaining parameters exist
    to mirror the ``process`` signature required by ``spaces.GPU(duration=...)``.
    """
    global total_second_length_debug_value
    override = total_second_length_debug_value
    if override is None:
        return total_second_length * 60 * 2
    return min(override * 60 * 2, 600)
560
+
561
# 20250506 pftq: Modified process to pass clean frame count, etc from video_encode
@spaces.GPU(duration=get_duration)
def process(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
    """Gradio event handler: launch the worker asynchronously and relay its
    queue messages as UI updates.

    Generator yielding 6-tuples matching
    [result_video, preview_image, progress_desc, progress_bar, start_button, end_button].
    Consumes debug overrides (input video / prompt / length) once, then clears
    them. On a CPU-only Space it warns and returns immediately.
    """
    global stream, high_vram, input_video_debug_value, prompt_debug_value, total_second_length_debug_value

    if torch.cuda.device_count() == 0:
        gr.Warning('Set this space to GPU config to make it work.')
        # NOTE(review): `return` with values inside a generator sets the
        # StopIteration value; the UI receives no update tuple in this case.
        return None, None, None, None, None, None

    # One-shot debug overrides: each is applied then reset to None.
    if input_video_debug_value is not None:
        input_video = input_video_debug_value
        input_video_debug_value = None

    if prompt_debug_value is not None:
        prompt = prompt_debug_value
        prompt_debug_value = None

    if total_second_length_debug_value is not None:
        total_second_length = total_second_length_debug_value
        total_second_length_debug_value = None

    if randomize_seed:
        seed = random.randint(0, np.iinfo(np.int32).max)

    # 20250506 pftq: Updated assertion for video input
    assert input_video is not None, 'No input video!'

    # Initial UI state: disable Start, enable End while generating.
    yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)

    # 20250507 pftq: Even the H100 needs offloading if the video dimensions are 720p or higher
    if high_vram and (no_resize or resolution>640):
        print("Disabling high vram mode due to no resize and/or potentially higher resolution...")
        high_vram = False
        vae.enable_slicing()
        vae.enable_tiling()
        DynamicSwapInstaller.install_model(transformer, device=gpu)
        DynamicSwapInstaller.install_model(text_encoder, device=gpu)

    # 20250508 pftq: automatically set distilled cfg to 1 if cfg is used
    if cfg > 1:
        gs = 1

    stream = AsyncStream()

    # 20250506 pftq: Pass num_clean_frames, vae_batch, etc
    async_run(worker, input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)

    output_filename = None

    # Relay loop: block on the worker's output queue until the 'end' sentinel.
    while True:
        flag, data = stream.output_queue.next()

        if flag == 'file':
            output_filename = data
            yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)

        if flag == 'progress':
            preview, desc, html = data
            #yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
            yield output_filename, gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)  # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
 
        if flag == 'end':
            # NOTE(review): `desc` is only bound by a prior 'progress' message;
            # if 'end' arrives first this raises UnboundLocalError — confirm
            # the worker always emits at least one progress update.
            yield output_filename, gr.update(visible=False), desc+' Video complete.', '', gr.update(interactive=True), gr.update(interactive=False)
            break
625
+
626
def end_process():
    """Request cancellation: push the 'end' sentinel onto the worker's input
    queue (polled between sections and on every sampling step)."""
    stream.input_queue.push('end')
628
+
629
+
630
# --- Module-level Gradio UI construction and launch -------------------------
css = make_progress_bar_css()
block = gr.Blocks(css=css).queue()
with block:
    # On a CPU-only Space, show a prominent warning banner instead of failing.
    if torch.cuda.device_count() == 0:
        with gr.Row():
            gr.HTML("""
            <p style="background-color: red;"><big><big><big><b>⚠️To use FramePack, <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR?duplicate=true">duplicate this space</a> and set a GPU with 30 GB VRAM.</b>

            You can't use FramePack directly here because this space runs on a CPU, which is not enough for FramePack. Please provide <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR/discussions/new">feedback</a> if you have issues.
            </big></big></big></p>
            """)
    # 20250506 pftq: Updated title to reflect video input functionality
    gr.Markdown('# Framepack F1 with Image Input or with Video Input (Video Extension)')
    gr.Markdown(f"""### Video diffusion, but feels like image diffusion
    *FramePack F1 - a FramePack model that only predicts future frames from history frames*
    ### *beta* FramePack Fill 🖋️- draw a mask over the input image to inpaint the video output
    adapted from the officical code repo [FramePack](https://github.com/lllyasviel/FramePack) by [lllyasviel](lllyasviel/FramePack_F1_I2V_HY_20250503) and [FramePack Studio](https://github.com/colinurbs/FramePack-Studio) 🙌🏻
    """)
    with gr.Row():
        # Left column: inputs and generation controls.
        with gr.Column():
            input_video = gr.Video(sources='upload', label="Input Video", height=320)
            prompt = gr.Textbox(label="Prompt", value='')

            with gr.Row():
                start_button = gr.Button(value="Start Generation")
                end_button = gr.Button(value="End Generation", interactive=False)

            with gr.Accordion("Advanced settings", open=False):
                with gr.Row():
                    use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed, but often makes hands and fingers slightly worse.')
                    no_resize = gr.Checkbox(label='Force Original Video Resolution (No Resizing)', value=False, info='Might run out of VRAM (720p requires > 24GB VRAM).')

                randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
                seed = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, randomize=True)

                batch = gr.Slider(label="Batch Size (Number of Videos)", minimum=1, maximum=1000, value=1, step=1, info='Generate multiple videos each with a different seed.')

                resolution = gr.Number(label="Resolution (max width or height)", value=640, precision=0, visible=False)

                total_second_length = gr.Slider(label="Additional Video Length to Generate (Seconds)", minimum=1, maximum=120, value=5, step=0.1)

                # 20250506 pftq: Reduced default distilled guidance scale to improve adherence to input video
                gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=3.0, step=0.01, info='Prompt adherence at the cost of less details from the input video, but to a lesser extent than Context Frames.')
                cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=True, info='Use this instead of Distilled for more detail/control + Negative Prompt (make sure Distilled set to 1). Doubles render time.')  # Should not change
                rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False)  # Should not change

                n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=True, info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
                steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Increase for more quality, especially if using high non-distilled CFG.')

                # 20250506 pftq: Renamed slider to Number of Context Frames and updated description
                num_clean_frames = gr.Slider(label="Number of Context Frames", minimum=2, maximum=10, value=5, step=1, info="Retain more video details but increase memory use. Reduce to 2 if memory issues.")

                # Default VAE batch scales with available memory (high_vram and
                # free_mem_gb are module globals set earlier in the file).
                default_vae = 32
                if high_vram:
                    default_vae = 128
                elif free_mem_gb>=20:
                    default_vae = 64

                vae_batch = gr.Slider(label="VAE Batch Size for Input Video", minimum=4, maximum=256, value=default_vae, step=4, info="Reduce if running out of memory. Increase for better quality frames during fast motion.")

                latent_window_size = gr.Slider(label="Latent Window Size", minimum=9, maximum=33, value=9, step=1, visible=True, info='Generate more frames at a time (larger chunks). Less degradation and better blending but higher VRAM cost.')

                gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")

                mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")

            # Debug inputs: values are captured into module globals by the
            # change/upload handlers below and consumed once by process().
            with gr.Row():
                input_video_debug = gr.Video(sources='upload', label="Input Video Debug", height=320)
                prompt_debug = gr.Textbox(label="Prompt Debug", value='')
                total_second_length_debug = gr.Slider(label="Additional Video Length to Generate (Seconds) Debug", minimum=1, maximum=120, value=1, step=0.1)

        # Right column: outputs and progress display.
        with gr.Column():
            preview_image = gr.Image(label="Next Latents", height=200, visible=False)
            result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
            progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
            progress_bar = gr.HTML('', elem_classes='no-generating-animation')

    # Hidden examples row (kept for example caching); each inner list matches
    # the `inputs` order below.
    with gr.Row(visible=False):
        gr.Examples(
            examples = [
                [
                    "./img_examples/Example1.mp4",  # input_video
                    "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
                    "",  # n_prompt
                    True,  # randomize_seed
                    42,  # seed
                    1,  # batch
                    640,  # resolution
                    1,  # total_second_length
                    9,  # latent_window_size
                    25,  # steps
                    1.0,  # cfg
                    10.0,  # gs
                    0.0,  # rs
                    6,  # gpu_memory_preservation
                    False,  # use_teacache
                    False,  # no_resize
                    16,  # mp4_crf
                    5,  # num_clean_frames
                    default_vae
                ],
            ],
            run_on_click = True,
            fn = process,
            inputs = [input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch],
            outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
            cache_examples = True,
        )

    gr.Markdown('## Guide')
    gr.Markdown("I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")


    # 20250506 pftq: Updated inputs to include num_clean_frames
    ips = [input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
    start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
    end_button.click(fn=end_process)


    def handle_field_debug_change(input_video_debug_data, prompt_debug_data, total_second_length_debug_data):
        """Copy the three debug widgets into module globals for process() to
        consume (one-shot) on its next run."""
        global input_video_debug_value, prompt_debug_value, total_second_length_debug_value
        input_video_debug_value = input_video_debug_data
        prompt_debug_value = prompt_debug_data
        total_second_length_debug_value = total_second_length_debug_data
        return []

    # Keep the globals in sync whenever any debug field changes.
    input_video_debug.upload(
        fn=handle_field_debug_change,
        inputs=[input_video_debug, prompt_debug, total_second_length_debug],
        outputs=[]
    )

    prompt_debug.change(
        fn=handle_field_debug_change,
        inputs=[input_video_debug, prompt_debug, total_second_length_debug],
        outputs=[]
    )

    total_second_length_debug.change(
        fn=handle_field_debug_change,
        inputs=[input_video_debug, prompt_debug, total_second_length_debug],
        outputs=[]
    )

# ssr_mode=False avoids the Gradio 5 server-side-rendering path.
block.launch(ssr_mode=False)
 
 
 
 
 
requirements.txt CHANGED
@@ -1,41 +1,23 @@
1
- pydantic==2.10.6
2
- fastapi==0.115.8
3
- gradio_imageslider==0.0.20
4
- gradio_client==1.7.0
5
- numpy==1.26.4
6
- requests==2.32.3
7
- sentencepiece==0.2.0
8
- tokenizers==0.19.1
9
- torchvision==0.22.0
10
- uvicorn==0.30.1
11
- wandb==0.17.4
12
- httpx==0.27.0
13
- transformers==4.42.4
14
- accelerate==0.32.1
15
- scikit-learn==1.5.1
16
- einops==0.8.0
17
- einops-exts==0.0.4
18
- timm==1.0.7
19
- openai-clip==1.0.1
20
- fsspec==2024.6.1
21
- kornia==0.7.3
22
- matplotlib==3.9.1
23
- ninja==1.11.1.1
24
- omegaconf==2.3.0
25
- opencv-python==4.10.0.84
26
- pandas==2.2.2
27
- pillow==10.4.0
28
- pytorch-lightning==2.3.3
29
- PyYAML==6.0.1
30
- scipy==1.14.0
31
- tqdm==4.66.4
32
- triton==3.3.0
33
- urllib3==2.2.2
34
- webdataset==0.2.111
35
- xformers==0.0.30
36
- facexlib==0.3.0
37
- k-diffusion==0.1.1.post1
38
  diffusers==0.33.1
39
- pillow-heif==0.22.0
40
-
41
- open-clip-torch==2.24.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==1.6.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  diffusers==0.33.1
3
+ transformers==4.46.2
4
+ sentencepiece==0.2.0
5
+ pillow==11.1.0
6
+ av==12.1.0
7
+ numpy==1.26.2
8
+ scipy==1.12.0
9
+ requests==2.31.0
10
+ torchsde==0.2.6
11
+ torch>=2.0.0
12
+ torchvision
13
+ torchaudio
14
+ einops
15
+ opencv-contrib-python
16
+ safetensors
17
+ huggingface_hub
18
+ spaces
19
+ decord
20
+ imageio_ffmpeg
21
+ sageattention
22
+ xformers
23
+ bitsandbytes