Fabrice-TIERCELIN commited on
Commit
8a8021a
·
verified ·
1 Parent(s): 61d345b

block.launch(mcp_server=False, ssr_mode=False)

Browse files
Files changed (1) hide show
  1. app.py +753 -755
app.py CHANGED
@@ -1,755 +1,753 @@
1
- from diffusers_helper.hf_login import login
2
-
3
- import os
4
-
5
- os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
6
-
7
- import spaces
8
- import gradio as gr
9
- import torch
10
- import traceback
11
- import einops
12
- import safetensors.torch as sf
13
- import numpy as np
14
- import argparse
15
- import random
16
- import math
17
- # 20250506 pftq: Added for video input loading
18
- import decord
19
- # 20250506 pftq: Added for progress bars in video_encode
20
- from tqdm import tqdm
21
- # 20250506 pftq: Normalize file paths for Windows compatibility
22
- import pathlib
23
- # 20250506 pftq: for easier to read timestamp
24
- from datetime import datetime
25
- # 20250508 pftq: for saving prompt to mp4 comments metadata
26
- import imageio_ffmpeg
27
- import tempfile
28
- import shutil
29
- import subprocess
30
-
31
- from PIL import Image
32
- from diffusers import AutoencoderKLHunyuanVideo
33
- from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
34
- from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
35
- from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
36
- from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
37
- from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
38
- from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
39
- from diffusers_helper.thread_utils import AsyncStream, async_run
40
- from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
41
- from transformers import SiglipImageProcessor, SiglipVisionModel
42
- from diffusers_helper.clip_vision import hf_clip_vision_encode
43
- from diffusers_helper.bucket_tools import find_nearest_bucket
44
- from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
45
-
46
-
47
# Model loading and placement. NOTE(review): reconstructed from a mangled
# paste — the guard below appears to wrap the model setup so that a CPU-only
# Space still renders the UI (process() refuses to run without a GPU); confirm
# the guard's exact extent against the original file.
if torch.cuda.device_count() > 0:
    free_mem_gb = get_cuda_free_memory_gb(gpu)
    # Above 80 GB free VRAM every model stays resident on the GPU; below that
    # models are swapped in and out per pipeline stage.
    high_vram = free_mem_gb > 80

    print(f'Free VRAM {free_mem_gb} GB')
    print(f'High-VRAM Mode: {high_vram}')

    # HunyuanVideo text encoders, tokenizers and VAE; loaded to CPU first,
    # device placement happens further down.
    text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
    text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
    tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
    tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
    vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()

    # SigLIP vision tower used to embed the conditioning image.
    feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
    image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()

    # FramePack F1 packed video transformer (predicts future frames from history).
    transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu()

    # Inference only: eval mode everywhere, gradients disabled below.
    vae.eval()
    text_encoder.eval()
    text_encoder_2.eval()
    image_encoder.eval()
    transformer.eval()

    if not high_vram:
        # Reduce VAE peak memory on low-VRAM machines.
        vae.enable_slicing()
        vae.enable_tiling()

    transformer.high_quality_fp32_output_for_inference = True
    print('transformer.high_quality_fp32_output_for_inference = True')

    # Pin working dtypes: bf16 for the transformer, fp16 for everything else.
    transformer.to(dtype=torch.bfloat16)
    vae.to(dtype=torch.float16)
    image_encoder.to(dtype=torch.float16)
    text_encoder.to(dtype=torch.float16)
    text_encoder_2.to(dtype=torch.float16)

    vae.requires_grad_(False)
    text_encoder.requires_grad_(False)
    text_encoder_2.requires_grad_(False)
    image_encoder.requires_grad_(False)
    transformer.requires_grad_(False)

    if not high_vram:
        # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
        DynamicSwapInstaller.install_model(transformer, device=gpu)
        DynamicSwapInstaller.install_model(text_encoder, device=gpu)
    else:
        # Enough VRAM: keep everything on the GPU permanently.
        text_encoder.to(gpu)
        text_encoder_2.to(gpu)
        image_encoder.to(gpu)
        vae.to(gpu)
        transformer.to(gpu)

# Queue pair used to talk to the background worker (replaced per job in process()).
stream = AsyncStream()

outputs_folder = './outputs/'
os.makedirs(outputs_folder, exist_ok=True)

# Debug overrides filled by handle_field_debug_change(); consumed once (and
# reset to None) by process().
input_image_debug_value = prompt_debug_value = total_second_length_debug_value = None
107
-
108
@spaces.GPU()
@torch.no_grad()
def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, device="cuda", width=None, height=None):
    """
    Encode a video into latent representations using the VAE.

    Args:
        video_path: Path to the input video file.
        resolution: Bucket resolution handed to find_nearest_bucket when resizing.
        no_resize: If True, keep the native resolution instead of snapping to a bucket.
        vae: AutoencoderKLHunyuanVideo model.
        vae_batch_size: Number of frames to process per batch.
        device: Device for computation (e.g., "cuda"); falls back to CPU if CUDA is unavailable.
        width, height: Target resolution for resizing frames; None means use the native size.

    Returns:
        start_latent: Latent of the first frame (for compatibility with original code).
        input_image_np: First frame as numpy array (for CLIP vision encoding).
        history_latents: Latents of all frames (shape: [1, channels, frames, height//8, width//8]).
        fps: Frames per second of the input video.
        target_height, target_width: Resolution the frames were actually resized to.
        input_video_pixels: Processed frames as a CPU tensor in [-1, 1], shape (1, C, T, H, W).

    Raises:
        Re-raises any decoding/encoding error after logging it.
    """
    # 20250506 pftq: Normalize video path for Windows compatibility
    video_path = str(pathlib.Path(video_path).resolve())
    print(f"Processing video: {video_path}")

    # 20250506 pftq: Check CUDA availability and fallback to CPU if needed
    if device == "cuda" and not torch.cuda.is_available():
        print("CUDA is not available, falling back to CPU")
        device = "cpu"

    try:
        # 20250506 pftq: Load video and get FPS
        print("Initializing VideoReader...")
        vr = decord.VideoReader(video_path)
        fps = vr.get_avg_fps()  # Get input video FPS
        num_real_frames = len(vr)
        print(f"Video loaded: {num_real_frames} frames, FPS: {fps}")

        # Truncate to nearest latent size (multiple of 4)
        latent_size_factor = 4
        num_frames = (num_real_frames // latent_size_factor) * latent_size_factor
        if num_frames != num_real_frames:
            print(f"Truncating video from {num_real_frames} to {num_frames} frames for latent size compatibility")
        num_real_frames = num_frames

        # 20250506 pftq: Read frames
        print("Reading video frames...")
        frames = vr.get_batch(range(num_real_frames)).asnumpy()  # Shape: (num_real_frames, height, width, channels)
        print(f"Frames read: {frames.shape}")

        # 20250506 pftq: Get native video resolution
        native_height, native_width = frames.shape[1], frames.shape[2]
        print(f"Native video resolution: {native_width}x{native_height}")

        # 20250506 pftq: Use native resolution if height/width not specified, otherwise use provided values
        target_height = native_height if height is None else height
        target_width = native_width if width is None else width

        # 20250506 pftq: Adjust to nearest bucket for model compatibility
        if not no_resize:
            target_height, target_width = find_nearest_bucket(target_height, target_width, resolution=resolution)
            print(f"Adjusted resolution: {target_width}x{target_height}")
        else:
            print(f"Using native resolution without resizing: {target_width}x{target_height}")

        # 20250506 pftq: Preprocess frames to match original image processing
        processed_frames = []
        for i, frame in enumerate(frames):
            # print(f"Preprocessing frame {i+1}/{num_frames}")
            frame_np = resize_and_center_crop(frame, target_width=target_width, target_height=target_height)
            processed_frames.append(frame_np)
        processed_frames = np.stack(processed_frames)  # Shape: (num_real_frames, height, width, channels)
        print(f"Frames preprocessed: {processed_frames.shape}")

        # 20250506 pftq: Save first frame for CLIP vision encoding
        input_image_np = processed_frames[0]

        # 20250506 pftq: Convert to tensor and normalize to [-1, 1]
        print("Converting frames to tensor...")
        frames_pt = torch.from_numpy(processed_frames).float() / 127.5 - 1
        frames_pt = frames_pt.permute(0, 3, 1, 2)  # Shape: (num_real_frames, channels, height, width)
        frames_pt = frames_pt.unsqueeze(0)  # Shape: (1, num_real_frames, channels, height, width)
        frames_pt = frames_pt.permute(0, 2, 1, 3, 4)  # Shape: (1, channels, num_real_frames, height, width)
        print(f"Tensor shape: {frames_pt.shape}")

        # 20250507 pftq: Save pixel frames for use in worker
        input_video_pixels = frames_pt.cpu()

        # 20250506 pftq: Move to device
        print(f"Moving tensor to device: {device}")
        frames_pt = frames_pt.to(device)
        print("Tensor moved to device")

        # 20250506 pftq: Move VAE to device
        print(f"Moving VAE to device: {device}")
        vae.to(device)
        print("VAE moved to device")

        # 20250506 pftq: Encode frames in batches
        print(f"Encoding input video frames in VAE batch size {vae_batch_size} (reduce if memory issues here or if forcing video resolution)")
        latents = []
        vae.eval()
        with torch.no_grad():
            for i in tqdm(range(0, frames_pt.shape[2], vae_batch_size), desc="Encoding video frames", mininterval=0.1):
                # print(f"Encoding batch {i//vae_batch_size + 1}: frames {i} to {min(i + vae_batch_size, frames_pt.shape[2])}")
                batch = frames_pt[:, :, i:i + vae_batch_size]  # Shape: (1, channels, batch_size, height, width)
                try:
                    # 20250506 pftq: Log GPU memory before encoding
                    if device == "cuda":
                        free_mem = torch.cuda.memory_allocated() / 1024**3
                        # print(f"GPU memory before encoding: {free_mem:.2f} GB")
                    batch_latent = vae_encode(batch, vae)
                    # 20250506 pftq: Synchronize CUDA to catch issues
                    if device == "cuda":
                        torch.cuda.synchronize()
                        # print(f"GPU memory after encoding: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
                    latents.append(batch_latent)
                    # print(f"Batch encoded, latent shape: {batch_latent.shape}")
                except RuntimeError as e:
                    print(f"Error during VAE encoding: {str(e)}")
                    if device == "cuda" and "out of memory" in str(e).lower():
                        print("CUDA out of memory, try reducing vae_batch_size or using CPU")
                    raise

        # 20250506 pftq: Concatenate latents
        print("Concatenating latents...")
        history_latents = torch.cat(latents, dim=2)  # Shape: (1, channels, frames, height//8, width//8)
        print(f"History latents shape: {history_latents.shape}")

        # 20250506 pftq: Get first frame's latent
        start_latent = history_latents[:, :, :1]  # Shape: (1, channels, 1, height//8, width//8)
        print(f"Start latent shape: {start_latent.shape}")

        # 20250506 pftq: Move VAE back to CPU to free GPU memory
        if device == "cuda":
            vae.to(cpu)
            torch.cuda.empty_cache()
            print("VAE moved back to CPU, CUDA cache cleared")

        return start_latent, input_image_np, history_latents, fps, target_height, target_width, input_video_pixels

    except Exception as e:
        print(f"Error in video_encode: {str(e)}")
        raise
250
-
251
# 20250508 pftq: for saving prompt to mp4 metadata comments
def set_mp4_comments_imageio_ffmpeg(input_file, comments):
    """Write *comments* into the MP4 'comment' metadata tag of *input_file*.

    Uses the FFmpeg binary bundled with imageio-ffmpeg, so no system FFmpeg
    is needed. Streams are stream-copied (no re-encode) into a temp file,
    which replaces the original on success.

    Args:
        input_file: Path to the MP4 to tag in place.
        comments: Text to store in the 'comment' metadata field.

    Returns:
        True on success, False on any failure (missing file, FFmpeg error,
        missing FFmpeg binary). Never raises: tagging is best-effort.
    """
    # Pre-initialize so the except-block cleanup doesn't need the fragile
    # "'temp_file' in locals()" check of the original implementation.
    temp_file = None
    try:
        # Get the path to the bundled FFmpeg binary from imageio-ffmpeg
        ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()

        # Check if input file exists
        if not os.path.exists(input_file):
            print(f"Error: Input file {input_file} does not exist")
            return False

        # Create a temporary file path. Close the handle right away: the
        # original kept it open, which leaks the descriptor and blocks
        # FFmpeg from writing to the path on Windows.
        with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tf:
            temp_file = tf.name

        # FFmpeg command using the bundled binary
        command = [
            ffmpeg_path,                          # Use imageio-ffmpeg's FFmpeg
            '-i', input_file,                     # input file
            '-metadata', f'comment={comments}',   # set comment metadata
            '-c:v', 'copy',                       # copy video stream without re-encoding
            '-c:a', 'copy',                       # copy audio stream without re-encoding
            '-y',                                 # overwrite output file if it exists
            temp_file,                            # temporary output file
        ]

        # Run the FFmpeg command
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

        if result.returncode == 0:
            # Replace the original file with the modified one
            shutil.move(temp_file, input_file)
            print(f"Successfully added comments to {input_file}")
            return True

        # Clean up temp file if FFmpeg fails
        if os.path.exists(temp_file):
            os.remove(temp_file)
        print(f"Error: FFmpeg failed with message:\n{result.stderr}")
        return False

    except Exception as e:
        # Clean up temp file in case of other errors
        if temp_file is not None and os.path.exists(temp_file):
            os.remove(temp_file)
        print(f"Error saving prompt to video metadata, ffmpeg may be required: "+str(e))
        return False
297
-
298
@torch.no_grad()
def worker(input_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
    """Background generation job launched by process() via async_run.

    Encodes the prompt list and conditioning image, then samples the video
    one latent section at a time, decoding and saving an MP4 after every
    section. Communicates exclusively through the module-level `stream`:
    pushes ('progress', (preview, desc, html)), ('file', path) and finally
    ('end', None) onto stream.output_queue; polls stream.input_queue for an
    'end' cancel request. Returns nothing.

    Args:
        input_image: HWC uint8-style numpy image used as the start frame.
        prompts: List of prompt strings, one consumed per section (the last
            one persists for the remaining sections).
        n_prompt: Negative prompt (only encoded when cfg != 1).
        seed: RNG seed for the CPU generator driving sampling.
        total_second_length: Target video length in seconds (30 fps assumed).
        latent_window_size: Latents generated per section.
        steps, cfg, gs, rs: Sampler steps / CFG / distilled CFG / rescale.
        gpu_memory_preservation: GB of VRAM to keep free when moving the
            transformer to the GPU in low-VRAM mode.
        use_teacache: Enable TeaCache acceleration in the transformer.
        mp4_crf: CRF quality for the saved MP4.
    """
    def encode_prompt(prompt, n_prompt):
        # Encode one positive/negative prompt pair into transformer-dtype
        # embeddings with fixed-length (512) llama masks.
        llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

        if cfg == 1:
            # Real CFG disabled: negative side is never used, so zeros suffice.
            llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
        else:
            llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

        llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
        llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)

        llama_vec = llama_vec.to(transformer.dtype)
        llama_vec_n = llama_vec_n.to(transformer.dtype)
        clip_l_pooler = clip_l_pooler.to(transformer.dtype)
        clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
        return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]

    # Sections needed: seconds * 30 fps, with latent_window_size*4 frames per section.
    total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
    total_latent_sections = int(max(round(total_latent_sections), 1))

    job_id = generate_timestamp()

    stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))

    try:
        # Clean GPU
        if not high_vram:
            unload_complete_models(
                text_encoder, text_encoder_2, image_encoder, vae, transformer
            )

        # Text encoding

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))

        if not high_vram:
            fake_diffusers_current_device(text_encoder, gpu)  # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
            load_model_as_complete(text_encoder_2, target_device=gpu)

        # Pre-encode every prompt; the sampling loop pops one set per section.
        prompt_parameters = []

        for prompt_part in prompts:
            prompt_parameters.append(encode_prompt(prompt_part, n_prompt))

        # Processing input image

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))

        H, W, C = input_image.shape
        height, width = find_nearest_bucket(H, W, resolution=640)
        input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)

        # Keep the processed start frame next to the outputs for reference.
        Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))

        # Normalize to [-1, 1] and reshape to (1, C, 1, H, W) for the VAE.
        input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
        input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]

        # VAE encoding

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))

        if not high_vram:
            load_model_as_complete(vae, target_device=gpu)

        start_latent = vae_encode(input_image_pt, vae)

        # CLIP Vision

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))

        if not high_vram:
            load_model_as_complete(image_encoder, target_device=gpu)

        image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
        image_encoder_last_hidden_state = image_encoder_output.last_hidden_state

        # Dtype

        image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)

        # Sampling

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))

        rnd = torch.Generator("cpu").manual_seed(seed)

        # 16+2+1 zero-padded history slots feed the 4x/2x/1x clean-latent
        # context; the start-frame latent is appended as the first real frame.
        history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
        history_pixels = None

        history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
        total_generated_latent_frames = 1

        for section_index in range(total_latent_sections):
            # Cooperative cancel between sections.
            if stream.input_queue.top() == 'end':
                stream.output_queue.push(('end', None))
                return

            print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')

            # Advance to the next prompt's embeddings if any remain; otherwise
            # keep the previous section's bindings.
            if len(prompt_parameters) > 0:
                [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(0)

            if not high_vram:
                unload_complete_models()
                move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)

            if use_teacache:
                transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
            else:
                transformer.initialize_teacache(enable_teacache=False)

            def callback(d):
                # Per-step sampler callback: build a preview grid, check for
                # cancellation, and report progress to the UI.
                preview = d['denoised']
                preview = vae_decode_fake(preview)

                preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
                preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')

                if stream.input_queue.top() == 'end':
                    stream.output_queue.push(('end', None))
                    # Aborts sampling mid-section; caught by the outer except.
                    raise KeyboardInterrupt('User ends the task.')

                current_step = d['i'] + 1
                percentage = int(100.0 * current_step / steps)
                hint = f'Sampling {current_step}/{steps}'
                desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...'
                stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
                return

            # Index layout: [start(1) | 4x(16) | 2x(2) | 1x(1) | new window].
            indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
            clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
            clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)

            # Most recent 16+2+1 history frames provide multi-scale context.
            clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2)
            clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)

            generated_latents = sample_hunyuan(
                transformer=transformer,
                sampler='unipc',
                width=width,
                height=height,
                frames=latent_window_size * 4 - 3,
                real_guidance_scale=cfg,
                distilled_guidance_scale=gs,
                guidance_rescale=rs,
                # shift=3.0,
                num_inference_steps=steps,
                generator=rnd,
                prompt_embeds=llama_vec,
                prompt_embeds_mask=llama_attention_mask,
                prompt_poolers=clip_l_pooler,
                negative_prompt_embeds=llama_vec_n,
                negative_prompt_embeds_mask=llama_attention_mask_n,
                negative_prompt_poolers=clip_l_pooler_n,
                device=gpu,
                dtype=torch.bfloat16,
                image_embeddings=image_encoder_last_hidden_state,
                latent_indices=latent_indices,
                clean_latents=clean_latents,
                clean_latent_indices=clean_latent_indices,
                clean_latents_2x=clean_latents_2x,
                clean_latent_2x_indices=clean_latent_2x_indices,
                clean_latents_4x=clean_latents_4x,
                clean_latent_4x_indices=clean_latent_4x_indices,
                callback=callback,
            )

            total_generated_latent_frames += int(generated_latents.shape[2])
            history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)

            if not high_vram:
                offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
                load_model_as_complete(vae, target_device=gpu)

            real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]

            if history_pixels is None:
                # First section: decode everything generated so far.
                history_pixels = vae_decode(real_history_latents, vae).cpu()
            else:
                # Later sections: decode only the tail and soft-blend it onto
                # the existing pixels over the overlapping frames.
                section_latent_frames = latent_window_size * 2
                overlapped_frames = latent_window_size * 4 - 3

                current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
                history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)

            if not high_vram:
                unload_complete_models()

            # Save an incremental MP4 after every section so the UI can show
            # partial results immediately.
            output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')

            save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)

            print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')

            stream.output_queue.push(('file', output_filename))
    except:
        # Broad on purpose: also catches the KeyboardInterrupt raised by the
        # cancel path inside callback(). The UI still gets its 'end' below.
        traceback.print_exc()

        if not high_vram:
            unload_complete_models(
                text_encoder, text_encoder_2, image_encoder, vae, transformer
            )

    stream.output_queue.push(('end', None))
    return
505
-
506
def get_duration(input_image, prompt, t2v, n_prompt, randomize_seed, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
    """Estimate the GPU-seconds budget for @spaces.GPU scheduling of process().

    Allows 60 seconds of GPU time per requested second of video. When the
    debug override is set, it drives the estimate instead, capped at 600 s.
    Only `total_second_length` (and the override) influence the result; the
    other parameters are accepted to mirror process()'s signature.
    """
    global total_second_length_debug_value

    override = total_second_length_debug_value
    if override is None:
        return total_second_length * 60
    return min(override * 60, 600)
512
-
513
-
514
@spaces.GPU(duration=get_duration)
def process(input_image, prompt,
            t2v=False,
            n_prompt="",
            randomize_seed=True,
            seed=31337,
            total_second_length=5,
            latent_window_size=9,
            steps=25,
            cfg=1.0,
            gs=10.0,
            rs=0.0,
            gpu_memory_preservation=6,
            use_teacache=True,
            mp4_crf=16
            ):
    """Gradio generator driving one generation job.

    Launches worker() in the background and yields UI updates as tuples of
    (result_video, preview_image, progress_desc, progress_bar, start_button,
    end_button) while draining stream.output_queue. Prompts are split on ';'
    so each segment conditions one sampling section.
    """
    global stream, input_image_debug_value, prompt_debug_value, total_second_length_debug_value

    if torch.cuda.device_count() == 0:
        # CPU-only Space: refuse to run (early return ends the generator).
        gr.Warning('Set this space to GPU config to make it work.')
        return None, None, None, None, None, None

    # One-shot debug overrides set from the hidden Debug accordion; consumed
    # and cleared here so the next run uses the normal inputs again.
    if input_image_debug_value is not None or prompt_debug_value is not None or total_second_length_debug_value is not None:
        print("Debug mode")
        input_image = input_image_debug_value
        prompt = prompt_debug_value
        total_second_length = total_second_length_debug_value
        input_image_debug_value = prompt_debug_value = total_second_length_debug_value = None

    if randomize_seed:
        seed = random.randint(0, np.iinfo(np.int32).max)

    # One prompt per sampling section, separated by ';'.
    prompts = prompt.split(";")

    # assert input_image is not None, 'No input image!'
    if t2v:
        # Text-to-video: substitute a blank white start frame.
        default_height, default_width = 640, 640
        input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
        print("No input image provided. Using a blank white image.")

    # Initial UI state: disable Start, enable End.
    yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)

    # Fresh queue pair per job; end_process() pushes 'end' onto its input side.
    stream = AsyncStream()

    async_run(worker, input_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf)

    output_filename = None

    # Drain worker events until 'end' arrives.
    while True:
        flag, data = stream.output_queue.next()

        if flag == 'file':
            output_filename = data
            yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)

        if flag == 'progress':
            preview, desc, html = data
            yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)

        if flag == 'end':
            yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
            break
576
-
577
-
578
def end_process():
    """Request cancellation: push 'end' onto the running job's input queue.

    worker() polls for it between sections and inside its sampling callback.
    """
    stream.input_queue.push('end')
580
-
581
-
582
# Gradio UI construction. NOTE(review): nesting reconstructed from a mangled
# paste — confirm component placement against the original file.
css = make_progress_bar_css()
block = gr.Blocks(css=css).queue()
with block:
    if torch.cuda.device_count() == 0:
        # CPU-only Space: show a prominent warning banner instead of working UI.
        with gr.Row():
            gr.HTML("""
            <p style="background-color: red;"><big><big><big><b>⚠️To use FramePack, <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR?duplicate=true">duplicate this space</a> and set a GPU with 30 GB VRAM.</b>

            You can't use FramePack directly here because this space runs on a CPU, which is not enough for FramePack. Please provide <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR/discussions/new">feedback</a> if you have issues.
            </big></big></big></p>
            """)
    # 20250506 pftq: Updated title to reflect video input functionality
    gr.Markdown('# Framepack F1 with Image Input or with Video Input (Video Extension)')
    gr.Markdown(f"""### Video diffusion, but feels like image diffusion
    *FramePack F1 - a FramePack model that only predicts future frames from history frames*
    ### *beta* FramePack Fill 🖋️- draw a mask over the input image to inpaint the video output
    adapted from the officical code repo [FramePack](https://github.com/lllyasviel/FramePack) by [lllyasviel](lllyasviel/FramePack_F1_I2V_HY_20250503) and [FramePack Studio](https://github.com/colinurbs/FramePack-Studio) 🙌🏻
    """)
    with gr.Row():
        with gr.Column():
            # Main inputs.
            input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
            prompt = gr.Textbox(label="Prompt", value='')
            t2v = gr.Checkbox(label="do text-to-video", value=False)

            with gr.Row():
                start_button = gr.Button(value="Start Generation", variant="primary")
                end_button = gr.Button(value="End Generation", variant="stop", interactive=False)

            total_second_length = gr.Slider(label="Generated Video Length (Seconds)", minimum=1, maximum=5, value=2, step=0.1)
            with gr.Accordion("Advanced settings", open=False):
                use_teacache = gr.Checkbox(label='Use TeaCache', value=True, info='Faster speed, but often makes hands and fingers slightly worse.')

                n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, blurred, blurry")  # Not used
                randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
                seed = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, randomize=True)

                latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1)  # Should not change
                steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Changing this value is not recommended.')

                cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01)  # Should not change
                gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Changing this value is not recommended; 3=blurred motions& & unsharped; 10 focus motion')
                rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01)  # Should not change

                gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")

                mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")

            # Hidden debug inputs; their values override the main inputs for
            # one run via handle_field_debug_change() / process().
            with gr.Accordion("Debug", open=False):
                input_image_debug = gr.Image(type="numpy", label="Image Debug", height=320)
                prompt_debug = gr.Textbox(label="Prompt Debug", value='')
                total_second_length_debug = gr.Slider(label="Additional Video Length to Generate (Seconds) Debug", minimum=1, maximum=120, value=5, step=0.1)

        with gr.Column():
            # Outputs: live latent preview + growing result video + progress.
            preview_image = gr.Image(label="Next Latents", height=200, visible=False)
            result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
            progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
            progress_bar = gr.HTML('', elem_classes='no-generating-animation')

    gr.HTML('<div style="text-align:center; margin-top:20px;">Share your results and find ideas at the <a href="https://x.com/search?q=framepack&f=live" target="_blank">FramePack Twitter (X) thread</a></div>')

    # Input order must match process()'s positional parameters.
    ips = [input_image, prompt, t2v, n_prompt, randomize_seed, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf]
    start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
    end_button.click(fn=end_process)

    # Hidden cached examples (row is invisible but examples still pre-compute).
    with gr.Row(visible=False):
        gr.Examples(
            examples = [
                [
                    "./img_examples/Example1.png",  # input_image
                    "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
                    False,  # t2v
                    "Missing arm, unrealistic position, blurred, blurry",  # n_prompt
                    True,  # randomize_seed
                    42,  # seed
                    1,  # total_second_length
                    9,  # latent_window_size
                    25,  # steps
                    1.0,  # cfg
                    10.0,  # gs
                    0.0,  # rs
                    6,  # gpu_memory_preservation
                    True,  # use_teacache
                    16  # mp4_crf
                ],
                [
                    "./img_examples/Example1.png",  # input_image
                    "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
                    False,  # t2v
                    "Missing arm, unrealistic position, blurred, blurry",  # n_prompt
                    True,  # randomize_seed
                    42,  # seed
                    1,  # total_second_length
                    9,  # latent_window_size
                    25,  # steps
                    1.0,  # cfg
                    10.0,  # gs
                    0.0,  # rs
                    6,  # gpu_memory_preservation
                    True,  # use_teacache
                    16  # mp4_crf
                ],
                [
                    "./img_examples/Example1.png",  # input_image
                    "We are sinking, photorealistic, realistic, intricate details, 8k, insanely detailed",
                    False,  # t2v
                    "Missing arm, unrealistic position, blurred, blurry",  # n_prompt
                    True,  # randomize_seed
                    42,  # seed
                    1,  # total_second_length
                    9,  # latent_window_size
                    25,  # steps
                    1.0,  # cfg
                    10.0,  # gs
                    0.0,  # rs
                    6,  # gpu_memory_preservation
                    True,  # use_teacache
                    16  # mp4_crf
                ],
                [
                    "./img_examples/Example1.png",  # input_image
                    "A boat is passing, photorealistic, realistic, intricate details, 8k, insanely detailed",
                    False,  # t2v
                    "Missing arm, unrealistic position, blurred, blurry",  # n_prompt
                    True,  # randomize_seed
                    42,  # seed
                    1,  # total_second_length
                    9,  # latent_window_size
                    25,  # steps
                    1.0,  # cfg
                    10.0,  # gs
                    0.0,  # rs
                    6,  # gpu_memory_preservation
                    True,  # use_teacache
                    16  # mp4_crf
                ],
            ],
            run_on_click = True,
            fn = process,
            inputs = ips,
            outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
            cache_examples = True,
        )

    gr.Markdown('## Guide')
    gr.Markdown("I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
728
-
729
-
730
- def handle_field_debug_change(input_image_debug_data, prompt_debug_data, total_second_length_debug_data):
731
- global input_image_debug_value, prompt_debug_value, total_second_length_debug_value
732
- input_image_debug_value = input_image_debug_data
733
- prompt_debug_value = prompt_debug_data
734
- total_second_length_debug_value = total_second_length_debug_data
735
- return []
736
-
737
- input_image_debug.upload(
738
- fn=handle_field_debug_change,
739
- inputs=[input_image_debug, prompt_debug, total_second_length_debug],
740
- outputs=[]
741
- )
742
-
743
- prompt_debug.change(
744
- fn=handle_field_debug_change,
745
- inputs=[input_image_debug, prompt_debug, total_second_length_debug],
746
- outputs=[]
747
- )
748
-
749
- total_second_length_debug.change(
750
- fn=handle_field_debug_change,
751
- inputs=[input_image_debug, prompt_debug, total_second_length_debug],
752
- outputs=[]
753
- )
754
-
755
- block.launch(mcp_server=False)
 
1
+ from diffusers_helper.hf_login import login
2
+
3
+ import os
4
+
5
+ os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
6
+
7
+ import spaces
8
+ import gradio as gr
9
+ import torch
10
+ import traceback
11
+ import einops
12
+ import safetensors.torch as sf
13
+ import numpy as np
14
+ import argparse
15
+ import random
16
+ import math
17
+ # 20250506 pftq: Added for video input loading
18
+ import decord
19
+ # 20250506 pftq: Added for progress bars in video_encode
20
+ from tqdm import tqdm
21
+ # 20250506 pftq: Normalize file paths for Windows compatibility
22
+ import pathlib
23
+ # 20250506 pftq: for easier to read timestamp
24
+ from datetime import datetime
25
+ # 20250508 pftq: for saving prompt to mp4 comments metadata
26
+ import imageio_ffmpeg
27
+ import tempfile
28
+ import shutil
29
+ import subprocess
30
+
31
+ from PIL import Image
32
+ from diffusers import AutoencoderKLHunyuanVideo
33
+ from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
34
+ from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
35
+ from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
36
+ from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
37
+ from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
38
+ from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
39
+ from diffusers_helper.thread_utils import AsyncStream, async_run
40
+ from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
41
+ from transformers import SiglipImageProcessor, SiglipVisionModel
42
+ from diffusers_helper.clip_vision import hf_clip_vision_encode
43
+ from diffusers_helper.bucket_tools import find_nearest_bucket
44
+ from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
45
+
46
+
47
# All heavy model setup happens only when a GPU is present; on a CPU-only
# Space the UI still loads and shows a "duplicate this space" warning instead.
if torch.cuda.device_count() > 0:
    free_mem_gb = get_cuda_free_memory_gb(gpu)
    # More than 80 GB free VRAM enables "high_vram" mode: every model stays
    # resident on the GPU. Otherwise models are swapped on/off per stage.
    high_vram = free_mem_gb > 80

    print(f'Free VRAM {free_mem_gb} GB')
    print(f'High-VRAM Mode: {high_vram}')

    # Text encoders / tokenizers for HunyuanVideo prompt conditioning,
    # loaded to CPU first; device placement is decided below.
    text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
    text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
    tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
    tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
    vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()

    # SigLIP image encoder used for the CLIP-Vision conditioning path.
    feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
    image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()

    # FramePack F1 packed video transformer — the diffusion model itself.
    transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu()

    vae.eval()
    text_encoder.eval()
    text_encoder_2.eval()
    image_encoder.eval()
    transformer.eval()

    if not high_vram:
        # Trade speed for lower VAE peak memory on small GPUs.
        vae.enable_slicing()
        vae.enable_tiling()

    transformer.high_quality_fp32_output_for_inference = True
    print('transformer.high_quality_fp32_output_for_inference = True')

    # Fix inference dtypes: bf16 for the transformer, fp16 for everything else.
    transformer.to(dtype=torch.bfloat16)
    vae.to(dtype=torch.float16)
    image_encoder.to(dtype=torch.float16)
    text_encoder.to(dtype=torch.float16)
    text_encoder_2.to(dtype=torch.float16)

    # Inference only — no gradients anywhere.
    vae.requires_grad_(False)
    text_encoder.requires_grad_(False)
    text_encoder_2.requires_grad_(False)
    image_encoder.requires_grad_(False)
    transformer.requires_grad_(False)

    if not high_vram:
        # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
        DynamicSwapInstaller.install_model(transformer, device=gpu)
        DynamicSwapInstaller.install_model(text_encoder, device=gpu)
    else:
        text_encoder.to(gpu)
        text_encoder_2.to(gpu)
        image_encoder.to(gpu)
        vae.to(gpu)
        transformer.to(gpu)

# Queue pair used by the Gradio callbacks to talk to the worker thread.
# NOTE(review): indentation reconstructed from a diff — these assignments
# appear to sit OUTSIDE the GPU-only branch (end_process needs `stream`
# even on a CPU Space); confirm against the deployed file.
stream = AsyncStream()

outputs_folder = './outputs/'
os.makedirs(outputs_folder, exist_ok=True)

# Debug overrides set from the hidden "Debug" accordion; consumed (and then
# reset to None) by `process` and read by `get_duration`.
input_image_debug_value = prompt_debug_value = total_second_length_debug_value = None
@spaces.GPU()
@torch.no_grad()
def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, device="cuda", width=None, height=None):
    """
    Encode a video into latent representations using the VAE.

    Args:
        video_path: Path to the input video file.
        resolution: Bucket resolution used when `no_resize` is False.
        no_resize: If True, keep the native video resolution instead of
            snapping to the nearest model bucket.
        vae: AutoencoderKLHunyuanVideo model.
        vae_batch_size: Number of frames to encode per VAE batch (reduce on OOM).
        device: Device for computation (e.g., "cuda"); falls back to CPU
            automatically if CUDA is unavailable.
        width, height: Optional target resolution; defaults to the native size.

    Returns:
        start_latent: Latent of the first frame (shape: [1, C, 1, H//8, W//8]).
        input_image_np: First frame as a numpy array (for CLIP vision encoding).
        history_latents: Latents of all frames (shape: [1, C, frames, H//8, W//8]).
        fps: Frames per second of the input video.
        target_height, target_width: Resolution the frames were processed at.
        input_video_pixels: The preprocessed pixel frames as a CPU tensor
            (shape: [1, C, frames, H, W], values in [-1, 1]).

    Raises:
        Re-raises any exception from decoding/encoding after logging it.
    """
    # 20250506 pftq: Normalize video path for Windows compatibility
    video_path = str(pathlib.Path(video_path).resolve())
    print(f"Processing video: {video_path}")

    # 20250506 pftq: Check CUDA availability and fallback to CPU if needed
    if device == "cuda" and not torch.cuda.is_available():
        print("CUDA is not available, falling back to CPU")
        device = "cpu"

    try:
        # 20250506 pftq: Load video and get FPS
        print("Initializing VideoReader...")
        vr = decord.VideoReader(video_path)
        fps = vr.get_avg_fps()  # Get input video FPS
        num_real_frames = len(vr)
        print(f"Video loaded: {num_real_frames} frames, FPS: {fps}")

        # Truncate to nearest latent size (multiple of 4) so the frame count
        # maps cleanly onto the VAE's temporal compression.
        latent_size_factor = 4
        num_frames = (num_real_frames // latent_size_factor) * latent_size_factor
        if num_frames != num_real_frames:
            print(f"Truncating video from {num_real_frames} to {num_frames} frames for latent size compatibility")
        num_real_frames = num_frames

        # 20250506 pftq: Read frames
        print("Reading video frames...")
        frames = vr.get_batch(range(num_real_frames)).asnumpy()  # Shape: (num_real_frames, height, width, channels)
        print(f"Frames read: {frames.shape}")

        # 20250506 pftq: Get native video resolution
        native_height, native_width = frames.shape[1], frames.shape[2]
        print(f"Native video resolution: {native_width}x{native_height}")

        # 20250506 pftq: Use native resolution if height/width not specified, otherwise use provided values
        target_height = native_height if height is None else height
        target_width = native_width if width is None else width

        # 20250506 pftq: Adjust to nearest bucket for model compatibility
        if not no_resize:
            target_height, target_width = find_nearest_bucket(target_height, target_width, resolution=resolution)
            print(f"Adjusted resolution: {target_width}x{target_height}")
        else:
            print(f"Using native resolution without resizing: {target_width}x{target_height}")

        # 20250506 pftq: Preprocess frames to match original image processing
        processed_frames = []
        for i, frame in enumerate(frames):
            frame_np = resize_and_center_crop(frame, target_width=target_width, target_height=target_height)
            processed_frames.append(frame_np)
        processed_frames = np.stack(processed_frames)  # Shape: (num_real_frames, height, width, channels)
        print(f"Frames preprocessed: {processed_frames.shape}")

        # 20250506 pftq: Save first frame for CLIP vision encoding
        input_image_np = processed_frames[0]

        # 20250506 pftq: Convert to tensor and normalize to [-1, 1]
        print("Converting frames to tensor...")
        frames_pt = torch.from_numpy(processed_frames).float() / 127.5 - 1
        frames_pt = frames_pt.permute(0, 3, 1, 2)  # Shape: (num_real_frames, channels, height, width)
        frames_pt = frames_pt.unsqueeze(0)  # Shape: (1, num_real_frames, channels, height, width)
        frames_pt = frames_pt.permute(0, 2, 1, 3, 4)  # Shape: (1, channels, num_real_frames, height, width)
        print(f"Tensor shape: {frames_pt.shape}")

        # 20250507 pftq: Save pixel frames for use in worker (kept on CPU
        # before the working copy is moved to the compute device).
        input_video_pixels = frames_pt.cpu()

        # 20250506 pftq: Move to device
        print(f"Moving tensor to device: {device}")
        frames_pt = frames_pt.to(device)
        print("Tensor moved to device")

        # 20250506 pftq: Move VAE to device
        print(f"Moving VAE to device: {device}")
        vae.to(device)
        print("VAE moved to device")

        # 20250506 pftq: Encode frames in batches to bound peak memory.
        print(f"Encoding input video frames in VAE batch size {vae_batch_size} (reduce if memory issues here or if forcing video resolution)")
        latents = []
        vae.eval()
        with torch.no_grad():
            for i in tqdm(range(0, frames_pt.shape[2], vae_batch_size), desc="Encoding video frames", mininterval=0.1):
                batch = frames_pt[:, :, i:i + vae_batch_size]  # Shape: (1, channels, batch_size, height, width)
                try:
                    # 20250506 pftq: Log GPU memory before encoding
                    if device == "cuda":
                        free_mem = torch.cuda.memory_allocated() / 1024**3
                    batch_latent = vae_encode(batch, vae)
                    # 20250506 pftq: Synchronize CUDA to surface async errors here
                    if device == "cuda":
                        torch.cuda.synchronize()
                    latents.append(batch_latent)
                except RuntimeError as e:
                    print(f"Error during VAE encoding: {str(e)}")
                    if device == "cuda" and "out of memory" in str(e).lower():
                        print("CUDA out of memory, try reducing vae_batch_size or using CPU")
                    raise

        # 20250506 pftq: Concatenate latents along the temporal axis
        print("Concatenating latents...")
        history_latents = torch.cat(latents, dim=2)  # Shape: (1, channels, frames, height//8, width//8)
        print(f"History latents shape: {history_latents.shape}")

        # 20250506 pftq: Get first frame's latent
        start_latent = history_latents[:, :, :1]  # Shape: (1, channels, 1, height//8, width//8)
        print(f"Start latent shape: {start_latent.shape}")

        # 20250506 pftq: Move VAE back to CPU to free GPU memory
        if device == "cuda":
            vae.to(cpu)
            torch.cuda.empty_cache()
            print("VAE moved back to CPU, CUDA cache cleared")

        return start_latent, input_image_np, history_latents, fps, target_height, target_width, input_video_pixels

    except Exception as e:
        print(f"Error in video_encode: {str(e)}")
        raise
+
251
# 20250508 pftq: for saving prompt to mp4 metadata comments
def set_mp4_comments_imageio_ffmpeg(input_file, comments):
    """Write *comments* into the MP4 `comment` metadata tag of *input_file*.

    Uses the FFmpeg binary bundled with imageio-ffmpeg to rewrite the
    container (streams are copied, not re-encoded), then atomically replaces
    the original file.

    Args:
        input_file: Path to an existing MP4 file.
        comments: Text to store in the container's `comment` metadata field.

    Returns:
        True on success, False on any failure (missing file, FFmpeg error,
        or unexpected exception). Never raises.
    """
    temp_file = None  # initialized up front so the except-cleanup is always safe
    try:
        # Get the path to the bundled FFmpeg binary from imageio-ffmpeg
        ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()

        # Check if input file exists
        if not os.path.exists(input_file):
            print(f"Error: Input file {input_file} does not exist")
            return False

        # Create a temporary output path. mkstemp returns an OPEN descriptor,
        # which we close immediately: the previous NamedTemporaryFile(delete=False)
        # version leaked the descriptor and kept the file handle open, which
        # also prevents FFmpeg from writing to the path on Windows.
        fd, temp_file = tempfile.mkstemp(suffix='.mp4')
        os.close(fd)

        # FFmpeg command using the bundled binary
        command = [
            ffmpeg_path,  # Use imageio-ffmpeg's FFmpeg
            '-i', input_file,  # input file
            '-metadata', f'comment={comments}',  # set comment metadata
            '-c:v', 'copy',  # copy video stream without re-encoding
            '-c:a', 'copy',  # copy audio stream without re-encoding
            '-y',  # overwrite output file if it exists
            temp_file  # temporary output file
        ]

        # Run the FFmpeg command; shell=False (list argv) so *comments* is
        # never interpreted by a shell.
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

        if result.returncode == 0:
            # Replace the original file with the modified one
            shutil.move(temp_file, input_file)
            print(f"Successfully added comments to {input_file}")
            return True
        else:
            # Clean up temp file if FFmpeg fails
            if os.path.exists(temp_file):
                os.remove(temp_file)
            print(f"Error: FFmpeg failed with message:\n{result.stderr}")
            return False

    except Exception as e:
        # Clean up temp file in case of other errors
        if temp_file is not None and os.path.exists(temp_file):
            os.remove(temp_file)
        print(f"Error saving prompt to video metadata, ffmpeg may be required: "+str(e))
        return False
+
298
@torch.no_grad()
def worker(input_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
    """Background generation job driving the FramePack F1 sampling loop.

    Runs on a worker thread (started via async_run) and communicates with the
    UI exclusively through the module-level `stream` queues:
    pushes ('progress', (preview, desc, html)), ('file', mp4_path) after each
    section, and ('end', None) when finished or cancelled.

    Args:
        input_image: HxWxC uint8 numpy image used as the first frame.
        prompts: List of prompt strings; one is consumed per latent section
            (the last one persists once the list is exhausted).
        n_prompt: Negative prompt (ignored when cfg == 1).
        seed: RNG seed for the CPU generator.
        total_second_length: Target video length in seconds (30 fps).
        latent_window_size: Latents generated per section (9 ≈ 1 second).
        steps, cfg, gs, rs: Sampler steps and guidance scales.
        gpu_memory_preservation: GB of VRAM to keep free in low-VRAM mode.
        use_teacache: Enable the TeaCache transformer speed-up.
        mp4_crf: H.264 CRF used when saving intermediate/final MP4s.
    """
    def encode_prompt(prompt, n_prompt):
        # Encode one positive/negative prompt pair into transformer-dtype
        # embeddings plus their attention masks.
        llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

        if cfg == 1:
            # True CFG disabled: the negative branch is never used, so zeros suffice.
            llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
        else:
            llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

        llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
        llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)

        llama_vec = llama_vec.to(transformer.dtype)
        llama_vec_n = llama_vec_n.to(transformer.dtype)
        clip_l_pooler = clip_l_pooler.to(transformer.dtype)
        clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
        return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]

    # One section yields latent_window_size*4-3 frames at 30 fps.
    total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
    total_latent_sections = int(max(round(total_latent_sections), 1))

    job_id = generate_timestamp()

    stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))

    try:
        # Clean GPU
        if not high_vram:
            unload_complete_models(
                text_encoder, text_encoder_2, image_encoder, vae, transformer
            )

        # Text encoding

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))

        if not high_vram:
            fake_diffusers_current_device(text_encoder, gpu)  # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
            load_model_as_complete(text_encoder_2, target_device=gpu)

        # Pre-encode every section prompt while the text encoders are loaded.
        prompt_parameters = []

        for prompt_part in prompts:
            prompt_parameters.append(encode_prompt(prompt_part, n_prompt))

        # Processing input image

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))

        H, W, C = input_image.shape
        height, width = find_nearest_bucket(H, W, resolution=640)
        input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)

        Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))

        # Normalize to [-1, 1] and shape (1, C, 1, H, W) for the VAE.
        input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
        input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]

        # VAE encoding

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))

        if not high_vram:
            load_model_as_complete(vae, target_device=gpu)

        start_latent = vae_encode(input_image_pt, vae)

        # CLIP Vision

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))

        if not high_vram:
            load_model_as_complete(image_encoder, target_device=gpu)

        image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
        image_encoder_last_hidden_state = image_encoder_output.last_hidden_state

        # Dtype

        image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)

        # Sampling

        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))

        rnd = torch.Generator("cpu").manual_seed(seed)

        # History buffer starts with 16+2+1 zero "context" latents (the 4x/2x/1x
        # clean-latent windows the F1 model conditions on), then the start frame.
        history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
        history_pixels = None

        history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
        total_generated_latent_frames = 1

        for section_index in range(total_latent_sections):
            # Cooperative cancellation: the UI pushes 'end' on the input queue.
            if stream.input_queue.top() == 'end':
                stream.output_queue.push(('end', None))
                return

            print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')

            # Switch to the next prompt's embeddings, if one remains; otherwise
            # keep using the previous section's embeddings.
            if len(prompt_parameters) > 0:
                [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(0)

            if not high_vram:
                unload_complete_models()
                move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)

            if use_teacache:
                transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
            else:
                transformer.initialize_teacache(enable_teacache=False)

            def callback(d):
                # Per-step sampler callback: render a fast preview, report
                # progress, and honor cancellation mid-sampling.
                preview = d['denoised']
                preview = vae_decode_fake(preview)

                preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
                preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')

                if stream.input_queue.top() == 'end':
                    stream.output_queue.push(('end', None))
                    # KeyboardInterrupt aborts sample_hunyuan; caught by the
                    # enclosing bare except below.
                    raise KeyboardInterrupt('User ends the task.')

                current_step = d['i'] + 1
                percentage = int(100.0 * current_step / steps)
                hint = f'Sampling {current_step}/{steps}'
                desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...'
                stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
                return

            # Index layout: [start frame | 16x 4x-context | 2x 2x-context |
            # 1x 1x-context | latent_window_size frames to generate].
            indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
            clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
            clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)

            # Condition on the most recent history latents at three temporal scales.
            clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2)
            clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)

            generated_latents = sample_hunyuan(
                transformer=transformer,
                sampler='unipc',
                width=width,
                height=height,
                frames=latent_window_size * 4 - 3,
                real_guidance_scale=cfg,
                distilled_guidance_scale=gs,
                guidance_rescale=rs,
                # shift=3.0,
                num_inference_steps=steps,
                generator=rnd,
                prompt_embeds=llama_vec,
                prompt_embeds_mask=llama_attention_mask,
                prompt_poolers=clip_l_pooler,
                negative_prompt_embeds=llama_vec_n,
                negative_prompt_embeds_mask=llama_attention_mask_n,
                negative_prompt_poolers=clip_l_pooler_n,
                device=gpu,
                dtype=torch.bfloat16,
                image_embeddings=image_encoder_last_hidden_state,
                latent_indices=latent_indices,
                clean_latents=clean_latents,
                clean_latent_indices=clean_latent_indices,
                clean_latents_2x=clean_latents_2x,
                clean_latent_2x_indices=clean_latent_2x_indices,
                clean_latents_4x=clean_latents_4x,
                clean_latent_4x_indices=clean_latent_4x_indices,
                callback=callback,
            )

            total_generated_latent_frames += int(generated_latents.shape[2])
            history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)

            if not high_vram:
                offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
                load_model_as_complete(vae, target_device=gpu)

            real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]

            if history_pixels is None:
                history_pixels = vae_decode(real_history_latents, vae).cpu()
            else:
                # Only decode the tail of the history and soft-blend it onto the
                # already-decoded pixels to avoid seams between sections.
                section_latent_frames = latent_window_size * 2
                overlapped_frames = latent_window_size * 4 - 3

                current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
                history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)

            if not high_vram:
                unload_complete_models()

            # Save an incrementally longer MP4 after every section so the UI
            # can show partial results immediately.
            output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')

            save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)

            print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')

            stream.output_queue.push(('file', output_filename))
    except:
        # Bare except is deliberate: it also catches the KeyboardInterrupt
        # raised by the callback on user cancellation.
        traceback.print_exc()

        if not high_vram:
            unload_complete_models(
                text_encoder, text_encoder_2, image_encoder, vae, transformer
            )

    stream.output_queue.push(('end', None))
    return
+
506
def get_duration(input_image, prompt, t2v, n_prompt, randomize_seed, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
    """Return the ZeroGPU allocation budget, in seconds, for one `process` call.

    Receives the same arguments as `process` (most are unused here) and
    budgets one minute of GPU time per requested output second. When the
    debug override is active, its value is used instead, capped at 600 s.
    """
    global total_second_length_debug_value

    debug_length = total_second_length_debug_value
    if debug_length is None:
        # Normal path: 60 s of GPU budget per second of generated video.
        return total_second_length * 60
    # Debug override path: same scaling, but never more than 10 minutes.
    return min(debug_length * 60, 600)
+
513
+
514
@spaces.GPU(duration=get_duration)
def process(input_image, prompt,
            t2v=False,
            n_prompt="",
            randomize_seed=True,
            seed=31337,
            total_second_length=5,
            latent_window_size=9,
            steps=25,
            cfg=1.0,
            gs=10.0,
            rs=0.0,
            gpu_memory_preservation=6,
            use_teacache=True,
            mp4_crf=16
            ):
    """Gradio click handler: launch `worker` and stream its progress to the UI.

    Generator yielding 6-tuples of updates for
    (result_video, preview_image, progress_desc, progress_bar,
     start_button, end_button). Semicolons in *prompt* split it into
    per-section prompts. Hidden debug-accordion values, when set, override
    the image/prompt/length once and are then cleared.
    """
    global stream, input_image_debug_value, prompt_debug_value, total_second_length_debug_value

    if torch.cuda.device_count() == 0:
        gr.Warning('Set this space to GPU config to make it work.')
        # NOTE(review): `return <value>` inside a generator only sets
        # StopIteration.value — these Nones never reach the UI outputs;
        # a `yield` followed by bare `return` is likely what was intended.
        return None, None, None, None, None, None

    # Consume (and reset) one-shot debug overrides from the hidden accordion.
    if input_image_debug_value is not None or prompt_debug_value is not None or total_second_length_debug_value is not None:
        print("Debug mode")
        input_image = input_image_debug_value
        prompt = prompt_debug_value
        total_second_length = total_second_length_debug_value
        input_image_debug_value = prompt_debug_value = total_second_length_debug_value = None

    if randomize_seed:
        seed = random.randint(0, np.iinfo(np.int32).max)

    # One prompt per latent section, separated by ';'.
    prompts = prompt.split(";")

    # assert input_image is not None, 'No input image!'
    if t2v:
        # Text-to-video: substitute a blank white canvas for the start frame.
        default_height, default_width = 640, 640
        input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
        print("No input image provided. Using a blank white image.")

    # Initial UI state: clear outputs, disable Start, enable End.
    yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)

    stream = AsyncStream()

    async_run(worker, input_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf)

    output_filename = None

    # Relay worker events to the UI until the worker signals 'end'.
    while True:
        flag, data = stream.output_queue.next()

        if flag == 'file':
            output_filename = data
            yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)

        if flag == 'progress':
            preview, desc, html = data
            yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)

        if flag == 'end':
            yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
            break
+
577
+
578
def end_process():
    """Request cancellation: push 'end' so the worker (and its per-step
    sampler callback) stops at the next check of the input queue."""
    stream.input_queue.push('end')
+
581
+
582
+ css = make_progress_bar_css()
583
+ block = gr.Blocks(css=css).queue()
584
+ with block:
585
+ if torch.cuda.device_count() == 0:
586
+ with gr.Row():
587
+ gr.HTML("""
588
+ <p style="background-color: red;"><big><big><big><b>⚠️To use FramePack, <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR?duplicate=true">duplicate this space</a> and set a GPU with 30 GB VRAM.</b>
589
+
590
+ You can't use FramePack directly here because this space runs on a CPU, which is not enough for FramePack. Please provide <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR/discussions/new">feedback</a> if you have issues.
591
+ </big></big></big></p>
592
+ """)
593
+ # 20250506 pftq: Updated title to reflect video input functionality
594
+ gr.Markdown('# Framepack F1 with Image Input or with Video Input (Video Extension)')
595
+ gr.Markdown(f"""### Video diffusion, but feels like image diffusion
596
+ *FramePack F1 - a FramePack model that only predicts future frames from history frames*
597
+ ### *beta* FramePack Fill 🖋️- draw a mask over the input image to inpaint the video output
598
+ adapted from the officical code repo [FramePack](https://github.com/lllyasviel/FramePack) by [lllyasviel](lllyasviel/FramePack_F1_I2V_HY_20250503) and [FramePack Studio](https://github.com/colinurbs/FramePack-Studio) 🙌🏻
599
+ """)
600
+ with gr.Row():
601
+ with gr.Column():
602
+ input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
603
+ prompt = gr.Textbox(label="Prompt", value='')
604
+ t2v = gr.Checkbox(label="do text-to-video", value=False)
605
+
606
+ with gr.Row():
607
+ start_button = gr.Button(value="Start Generation", variant="primary")
608
+ end_button = gr.Button(value="End Generation", variant="stop", interactive=False)
609
+
610
+ total_second_length = gr.Slider(label="Generated Video Length (Seconds)", minimum=1, maximum=5, value=2, step=0.1)
611
+ with gr.Accordion("Advanced settings", open=False):
612
+ use_teacache = gr.Checkbox(label='Use TeaCache', value=True, info='Faster speed, but often makes hands and fingers slightly worse.')
613
+
614
+ n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, blurred, blurry") # Not used
615
+ randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
616
+ seed = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, randomize=True)
617
+
618
+
619
+ latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1) # Should not change
620
+ steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Changing this value is not recommended.')
621
+
622
+ cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01) # Should not change
623
+ gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Changing this value is not recommended; 3=blurred motions& & unsharped; 10 focus motion')
624
+ rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01) # Should not change
625
+
626
+ gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
627
+
628
+ mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")
629
+
630
+ with gr.Accordion("Debug", open=False):
631
+ input_image_debug = gr.Image(type="numpy", label="Image Debug", height=320)
632
+ prompt_debug = gr.Textbox(label="Prompt Debug", value='')
633
+ total_second_length_debug = gr.Slider(label="Additional Video Length to Generate (Seconds) Debug", minimum=1, maximum=120, value=5, step=0.1)
634
+
635
with gr.Column():
    # Right-hand column: live latent preview plus the growing output video.
    preview_image = gr.Image(
        label="Next Latents",
        height=200,
        visible=False,
    )
    result_video = gr.Video(
        label="Finished Frames",
        autoplay=True,
        show_share_button=False,
        height=512,
        loop=True,
    )
    # Textual progress description and HTML progress bar, updated by the worker.
    progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
    progress_bar = gr.HTML('', elem_classes='no-generating-animation')
+
641
# Inputs forwarded to process(), in the exact order its signature expects.
ips = [
    input_image,
    prompt,
    t2v,
    n_prompt,
    randomize_seed,
    seed,
    total_second_length,
    latent_window_size,
    steps,
    cfg,
    gs,
    rs,
    gpu_memory_preservation,
    use_teacache,
    mp4_crf,
]
generation_outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button]
start_button.click(fn=process, inputs=ips, outputs=generation_outputs)
end_button.click(fn=end_process)
+
645
# The four example rows share every setting except the prompt, so build them
# from a single template instead of repeating the same 14 values four times.
def _example_row(prompt_text):
    """Return one gr.Examples row matching the `ips` input order."""
    return [
        "./img_examples/Example1.png",  # input_image
        prompt_text,  # prompt
        False,  # t2v
        "Missing arm, unrealistic position, blurred, blurry",  # n_prompt
        True,  # randomize_seed
        42,  # seed
        1,  # total_second_length
        9,  # latent_window_size
        25,  # steps
        1.0,  # cfg
        10.0,  # gs
        0.0,  # rs
        6,  # gpu_memory_preservation
        True,  # use_teacache
        16,  # mp4_crf
    ]

with gr.Row(visible=False):
    gr.Examples(
        examples=[
            _example_row("View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed"),
            _example_row("A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed"),
            _example_row("We are sinking, photorealistic, realistic, intricate details, 8k, insanely detailed"),
            _example_row("A boat is passing, photorealistic, realistic, intricate details, 8k, insanely detailed"),
        ],
        run_on_click=True,
        fn=process,
        inputs=ips,
        outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
        cache_examples=True,
    )
+
724
gr.Markdown('## Guide')
# Fixed grammar in the user-facing tip ("I discourage to use" -> "I discourage using").
gr.Markdown("I discourage using the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
+
727
+
728
def handle_field_debug_change(input_image_debug_data, prompt_debug_data, total_second_length_debug_data):
    """Mirror the three debug widgets into module-level globals.

    Registered on the debug widgets' upload/change events; returns an empty
    list because the event declares no outputs.
    """
    global input_image_debug_value, prompt_debug_value, total_second_length_debug_value
    input_image_debug_value, prompt_debug_value, total_second_length_debug_value = (
        input_image_debug_data,
        prompt_debug_data,
        total_second_length_debug_data,
    )
    return []
+
735
# All three debug events share the same callback and input list; register
# them in one loop instead of three copy-pasted blocks.
_debug_inputs = [input_image_debug, prompt_debug, total_second_length_debug]
for _debug_event in (
    input_image_debug.upload,  # fires when a debug image is uploaded
    prompt_debug.change,
    total_second_length_debug.change,
):
    _debug_event(
        fn=handle_field_debug_change,
        inputs=_debug_inputs,
        outputs=[],
    )

block.launch(mcp_server=False, ssr_mode=False)