Fabrice-TIERCELIN committed on
Commit
9612e4b
·
verified ·
1 Parent(s): 5051259

Change model

Browse files
Files changed (1) hide show
  1. app_v2v.py +653 -653
app_v2v.py CHANGED
@@ -1,653 +1,653 @@
1
- from diffusers_helper.hf_login import login
2
-
3
- import os
4
-
5
- os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
6
- import spaces
7
- import gradio as gr
8
- import torch
9
- import traceback
10
- import einops
11
- import safetensors.torch as sf
12
- import numpy as np
13
- import argparse
14
- import math
15
- import decord
16
- from tqdm import tqdm
17
- import pathlib
18
- from datetime import datetime
19
- import imageio_ffmpeg
20
- import tempfile
21
- import shutil
22
- import subprocess
23
-
24
- from PIL import Image
25
- from diffusers import AutoencoderKLHunyuanVideo
26
- from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
27
- from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
28
- from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
29
- from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
30
- from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
31
- from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
32
- from diffusers_helper.thread_utils import AsyncStream, async_run
33
- from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
34
- from transformers import SiglipImageProcessor, SiglipVisionModel
35
- from diffusers_helper.clip_vision import hf_clip_vision_encode
36
- from diffusers_helper.bucket_tools import find_nearest_bucket
37
- from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
38
-
39
# ---- Command-line arguments for the Gradio server ----
parser = argparse.ArgumentParser()
parser.add_argument('--share', action='store_true')
parser.add_argument("--server", type=str, default='0.0.0.0')
parser.add_argument("--port", type=int, required=False)
parser.add_argument("--inbrowser", action='store_true')
args = parser.parse_args()

print(args)

# High-VRAM mode (> 80 GB free) keeps every model resident on the GPU;
# otherwise models are swapped on/off the device around each stage.
free_mem_gb = get_cuda_free_memory_gb(gpu)
high_vram = free_mem_gb > 80

print(f'Free VRAM {free_mem_gb} GB')
print(f'High-VRAM Mode: {high_vram}')



# All models are loaded to CPU first; device placement is decided below
# depending on the VRAM mode.
text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()

feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()

# The transformer is loaded 8-bit quantized (bitsandbytes) to reduce memory.
quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True)
transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained(
    "lllyasviel/FramePack_F1_I2V_HY_20250503",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
).cpu()

# transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu()

# Inference only: eval mode and no gradients anywhere.
vae.eval()
text_encoder.eval()
text_encoder_2.eval()
image_encoder.eval()
transformer.eval()

if not high_vram:
    # Slicing/tiling trades speed for lower peak VRAM during VAE decode.
    vae.enable_slicing()
    vae.enable_tiling()

transformer.high_quality_fp32_output_for_inference = True
print('transformer.high_quality_fp32_output_for_inference = True')

# transformer.to(dtype=torch.bfloat16)
vae.to(dtype=torch.float16)
image_encoder.to(dtype=torch.float16)
text_encoder.to(dtype=torch.float16)
text_encoder_2.to(dtype=torch.float16)

vae.requires_grad_(False)
text_encoder.requires_grad_(False)
text_encoder_2.requires_grad_(False)
image_encoder.requires_grad_(False)
transformer.requires_grad_(False)

if not high_vram:
    # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
    DynamicSwapInstaller.install_model(transformer, device=gpu)
    DynamicSwapInstaller.install_model(text_encoder, device=gpu)
else:
    text_encoder.to(gpu)
    text_encoder_2.to(gpu)
    image_encoder.to(gpu)
    vae.to(gpu)
    # transformer.to(gpu)

# Shared queue pair used by worker (producer) and process (consumer).
stream = AsyncStream()

outputs_folder = './outputs/'
os.makedirs(outputs_folder, exist_ok=True)
-
115
@spaces.GPU()
@torch.no_grad()
def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, device="cuda", width=None, height=None):
    """
    Encode a video into latent representations using the VAE.

    Args:
        video_path: Path to the input video file.
        resolution: Bucket size used when resizing is enabled (see find_nearest_bucket).
        no_resize: If True, keep the native resolution instead of bucket-resizing.
        vae: AutoencoderKLHunyuanVideo model.
        vae_batch_size: Number of frames to process per batch.
        device: Device for computation (e.g., "cuda").
        width, height: Optional target resolution; defaults to the native frame size.

    Returns:
        start_latent: Latent of the first frame (for compatibility with original code).
        input_image_np: First frame as numpy array (for CLIP vision encoding).
        history_latents: Latents of all frames (shape: [1, channels, frames, height//8, width//8]).
        fps: Frames per second of the input video.
        target_height, target_width: Resolution the frames were actually processed at.
        input_video_pixels: Preprocessed frames as a CPU tensor (1, c, t, h, w) in [-1, 1].
    """
    video_path = str(pathlib.Path(video_path).resolve())
    print(f"Processing video: {video_path}")

    if device == "cuda" and not torch.cuda.is_available():
        print("CUDA is not available, falling back to CPU")
        device = "cpu"

    try:
        print("Initializing VideoReader...")
        vr = decord.VideoReader(video_path)
        fps = vr.get_avg_fps()  # Get input video FPS
        num_real_frames = len(vr)
        print(f"Video loaded: {num_real_frames} frames, FPS: {fps}")

        # Truncate to nearest latent size (multiple of 4)
        latent_size_factor = 4
        num_frames = (num_real_frames // latent_size_factor) * latent_size_factor
        if num_frames != num_real_frames:
            print(f"Truncating video from {num_real_frames} to {num_frames} frames for latent size compatibility")
        num_real_frames = num_frames

        print("Reading video frames...")
        frames = vr.get_batch(range(num_real_frames)).asnumpy()  # Shape: (num_real_frames, height, width, channels)
        print(f"Frames read: {frames.shape}")

        native_height, native_width = frames.shape[1], frames.shape[2]
        print(f"Native video resolution: {native_width}x{native_height}")

        target_height = native_height if height is None else height
        target_width = native_width if width is None else width

        if not no_resize:
            # Snap to the nearest supported bucket so latent sizes stay valid.
            target_height, target_width = find_nearest_bucket(target_height, target_width, resolution=resolution)
            print(f"Adjusted resolution: {target_width}x{target_height}")
        else:
            print(f"Using native resolution without resizing: {target_width}x{target_height}")

        processed_frames = []
        for i, frame in enumerate(frames):
            #print(f"Preprocessing frame {i+1}/{num_frames}")
            frame_np = resize_and_center_crop(frame, target_width=target_width, target_height=target_height)
            processed_frames.append(frame_np)
        processed_frames = np.stack(processed_frames)  # Shape: (num_real_frames, height, width, channels)
        print(f"Frames preprocessed: {processed_frames.shape}")

        input_image_np = processed_frames[0]

        print("Converting frames to tensor...")
        # uint8 [0, 255] -> float [-1, 1], then rearrange to (1, c, t, h, w).
        frames_pt = torch.from_numpy(processed_frames).float() / 127.5 - 1
        frames_pt = frames_pt.permute(0, 3, 1, 2)  # Shape: (num_real_frames, channels, height, width)
        frames_pt = frames_pt.unsqueeze(0)  # Shape: (1, num_real_frames, channels, height, width)
        frames_pt = frames_pt.permute(0, 2, 1, 3, 4)  # Shape: (1, channels, num_real_frames, height, width)
        print(f"Tensor shape: {frames_pt.shape}")

        # Keep a CPU copy of the pixel frames for the caller.
        input_video_pixels = frames_pt.cpu()

        print(f"Moving tensor to device: {device}")
        frames_pt = frames_pt.to(device)
        print("Tensor moved to device")

        print(f"Moving VAE to device: {device}")
        vae.to(device)
        print("VAE moved to device")

        print(f"Encoding input video frames in VAE batch size {vae_batch_size} (reduce if memory issues here or if forcing video resolution)")
        latents = []
        vae.eval()
        with torch.no_grad():
            for i in tqdm(range(0, frames_pt.shape[2], vae_batch_size), desc="Encoding video frames", mininterval=0.1):
                #print(f"Encoding batch {i//vae_batch_size + 1}: frames {i} to {min(i + vae_batch_size, frames_pt.shape[2])}")
                batch = frames_pt[:, :, i:i + vae_batch_size]  # Shape: (1, channels, batch_size, height, width)
                try:
                    if device == "cuda":
                        free_mem = torch.cuda.memory_allocated() / 1024**3
                        print(f"GPU memory before encoding: {free_mem:.2f} GB")
                    batch_latent = vae_encode(batch, vae)
                    if device == "cuda":
                        torch.cuda.synchronize()
                        print(f"GPU memory after encoding: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
                    latents.append(batch_latent)
                    #print(f"Batch encoded, latent shape: {batch_latent.shape}")
                except RuntimeError as e:
                    print(f"Error during VAE encoding: {str(e)}")
                    if device == "cuda" and "out of memory" in str(e).lower():
                        print("CUDA out of memory, try reducing vae_batch_size or using CPU")
                    raise

        print("Concatenating latents...")
        history_latents = torch.cat(latents, dim=2)  # Shape: (1, channels, frames, height//8, width//8)
        print(f"History latents shape: {history_latents.shape}")

        start_latent = history_latents[:, :, :1]  # Shape: (1, channels, 1, height//8, width//8)
        print(f"Start latent shape: {start_latent.shape}")

        if device == "cuda":
            # Free GPU memory for the sampling stage.
            vae.to(cpu)
            torch.cuda.empty_cache()
            print("VAE moved back to CPU, CUDA cache cleared")

        return start_latent, input_image_np, history_latents, fps, target_height, target_width, input_video_pixels

    except Exception as e:
        print(f"Error in video_encode: {str(e)}")
        raise
238
-
239
def set_mp4_comments_imageio_ffmpeg(input_file, comments):
    """Write *comments* into the MP4 'comment' metadata tag of *input_file* in place.

    Uses the FFmpeg binary bundled with imageio-ffmpeg and stream-copies the audio
    and video (no re-encoding), writing to a temporary file that then replaces the
    original on success.

    Args:
        input_file: Path to an existing .mp4 file to tag.
        comments: Text to store in the 'comment' metadata field.

    Returns:
        True on success, False on any failure (missing input, FFmpeg error, ...).
    """
    temp_file = None
    try:
        # Validate the input before resolving the ffmpeg binary — cheaper,
        # and it makes the missing-file path independent of imageio-ffmpeg.
        if not os.path.exists(input_file):
            print(f"Error: Input file {input_file} does not exist")
            return False

        ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()

        # mkstemp instead of NamedTemporaryFile(delete=False).name: the latter
        # leaks an open file object (never closed) and holding the handle can
        # break the move on Windows. Close the fd immediately; ffmpeg writes it.
        fd, temp_file = tempfile.mkstemp(suffix='.mp4')
        os.close(fd)

        # FFmpeg command using the bundled binary; list form avoids shell quoting issues.
        command = [
            ffmpeg_path,                         # Use imageio-ffmpeg's FFmpeg
            '-i', input_file,                    # input file
            '-metadata', f'comment={comments}',  # set comment metadata
            '-c:v', 'copy',                      # copy video stream without re-encoding
            '-c:a', 'copy',                      # copy audio stream without re-encoding
            '-y',                                # overwrite output file if it exists
            temp_file,                           # temporary output file
        ]

        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

        if result.returncode == 0:
            # Replace the original file with the modified one
            shutil.move(temp_file, input_file)
            print(f"Successfully added comments to {input_file}")
            return True

        # Clean up temp file if FFmpeg fails
        if os.path.exists(temp_file):
            os.remove(temp_file)
        print(f"Error: FFmpeg failed with message:\n{result.stderr}")
        return False

    except Exception as e:
        # Clean up temp file in case of other errors; temp_file is None if we
        # failed before creating it, so no 'in locals()' check is needed.
        if temp_file is not None and os.path.exists(temp_file):
            os.remove(temp_file)
        print(f"Error saving prompt to video metadata, ffmpeg may be required: "+str(e))
        return False
281
-
282
@spaces.GPU()
@torch.no_grad()
def worker(input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
    """
    Background generation job: encodes the input video to latents, then extends it
    section by section with the FramePack-F1 sampler, saving a progressively longer
    MP4 after each section. Communicates with the UI by pushing
    ('progress' | 'file' | 'end', payload) events onto the module-level AsyncStream.
    Runs the whole pipeline once per seed when batch > 1 (seed incremented each run).
    """

    stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))

    try:
        if not high_vram:
            unload_complete_models(
                text_encoder, text_encoder_2, image_encoder, vae
            )

        # Text encoding
        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))

        if not high_vram:
            fake_diffusers_current_device(text_encoder, gpu)  # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
            load_model_as_complete(text_encoder_2, target_device=gpu)

        llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

        # cfg == 1 disables real CFG, so the negative conditioning is never used: zeros suffice.
        if cfg == 1:
            llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
        else:
            llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

        llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
        llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)

        # 20250506 pftq: Processing input video instead of image
        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))

        # 20250506 pftq: Encode video
        #H, W = 640, 640 # Default resolution, will be adjusted
        #height, width = find_nearest_bucket(H, W, resolution=640)
        #start_latent, input_image_np, history_latents, fps = video_encode(input_video, vae, height, width, vae_batch_size=16, device=gpu)
        start_latent, input_image_np, video_latents, fps, height, width, input_video_pixels = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)

        #Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))

        # CLIP Vision
        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))

        if not high_vram:
            load_model_as_complete(image_encoder, target_device=gpu)

        image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
        image_encoder_last_hidden_state = image_encoder_output.last_hidden_state

        # Dtype: cast all conditioning tensors to the transformer's dtype.
        llama_vec = llama_vec.to(transformer.dtype)
        llama_vec_n = llama_vec_n.to(transformer.dtype)
        clip_l_pooler = clip_l_pooler.to(transformer.dtype)
        clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
        image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)

        # Each section generates latent_window_size * 4 pixel frames at the input FPS.
        total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
        total_latent_sections = int(max(round(total_latent_sections), 1))

        for idx in range(batch):
            if idx>0:
                seed = seed + 1

            if batch > 1:
                print(f"Beginning video {idx+1} of {batch} with seed {seed} ")

            #job_id = generate_timestamp()
            job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepackf1-videoinput_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}" # 20250506 pftq: easier to read timestamp and filename

            # Sampling
            stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))

            rnd = torch.Generator("cpu").manual_seed(seed)

            # History starts from the encoded input video and grows each section.
            history_latents = video_latents.cpu()
            total_generated_latent_frames = history_latents.shape[2]
            history_pixels = None
            previous_video = None

            # 20250507 pftq: hot fix for initial video being corrupted by vae encoding, issue with ghosting because of slight differences
            #history_pixels = input_video_pixels
            #save_bcthw_as_mp4(vae_decode(video_latents, vae).cpu(), os.path.join(outputs_folder, f'{job_id}_input_video.mp4'), fps=fps, crf=mp4_crf) # 20250507 pftq: test fast movement corrupted by vae encoding if vae batch size too low

            for section_index in range(total_latent_sections):
                if stream.input_queue.top() == 'end':
                    stream.output_queue.push(('end', None))
                    return

                print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')

                if not high_vram:
                    unload_complete_models()
                    # move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)

                if use_teacache:
                    transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
                else:
                    transformer.initialize_teacache(enable_teacache=False)

                def callback(d):
                    # Per-step sampler callback: pushes a decoded preview and
                    # progress text; raises KeyboardInterrupt on user abort.
                    preview = d['denoised']
                    preview = vae_decode_fake(preview)

                    preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
                    preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')

                    if stream.input_queue.top() == 'end':
                        stream.output_queue.push(('end', None))
                        raise KeyboardInterrupt('User ends the task.')

                    current_step = d['i'] + 1
                    percentage = int(100.0 * current_step / steps)
                    hint = f'Sampling {current_step}/{steps}'
                    desc = f'Total frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps) :.2f} seconds (FPS-{fps}), Seed: {seed}, Video {idx+1} of {batch}. The video is generating part {section_index+1} of {total_latent_sections}...'
                    stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
                    return

                # 20250506 pftq: Use user-specified number of context frames, matching original allocation for num_clean_frames=2
                available_frames = history_latents.shape[2]  # Number of latent frames
                max_pixel_frames = min(latent_window_size * 4 - 3, available_frames * 4)  # Cap at available pixel frames
                adjusted_latent_frames = max(1, (max_pixel_frames + 3) // 4)  # Convert back to latent frames
                # Adjust num_clean_frames to match original behavior: num_clean_frames=2 means 1 frame for clean_latents_1x
                effective_clean_frames = max(0, num_clean_frames - 1) if num_clean_frames > 1 else 0
                effective_clean_frames = min(effective_clean_frames, available_frames - 2) if available_frames > 2 else 0 # 20250507 pftq: changed 1 to 2 for edge case for <=1 sec videos
                num_2x_frames = min(2, max(1, available_frames - effective_clean_frames - 1)) if available_frames > effective_clean_frames + 1 else 0 # 20250507 pftq: subtracted 1 for edge case for <=1 sec videos
                num_4x_frames = min(16, max(1, available_frames - effective_clean_frames - num_2x_frames)) if available_frames > effective_clean_frames + num_2x_frames else 0 # 20250507 pftq: Edge case for <=1 sec

                total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
                total_context_frames = min(total_context_frames, available_frames) # 20250507 pftq: Edge case for <=1 sec videos

                indices = torch.arange(0, sum([1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames])).unsqueeze(0) # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
                clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split(
                    [1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames], dim=1 # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
                )
                clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)

                # 20250506 pftq: Split history_latents dynamically based on available frames
                fallback_frame_count = 2 # 20250507 pftq: Changed 0 to 2 Edge case for <=1 sec videos
                context_frames = history_latents[:, :, -total_context_frames:, :, :] if total_context_frames > 0 else history_latents[:, :, :fallback_frame_count, :, :]
                if total_context_frames > 0:
                    split_sizes = [num_4x_frames, num_2x_frames, effective_clean_frames]
                    split_sizes = [s for s in split_sizes if s > 0]  # Remove zero sizes
                    if split_sizes:
                        splits = context_frames.split(split_sizes, dim=2)
                        split_idx = 0
                        clean_latents_4x = splits[split_idx] if num_4x_frames > 0 else history_latents[:, :, :fallback_frame_count, :, :]
                        if clean_latents_4x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
                            clean_latents_4x = torch.cat([clean_latents_4x, clean_latents_4x[:, :, -1:, :, :]], dim=2)[:, :, :2, :, :]
                        split_idx += 1 if num_4x_frames > 0 else 0
                        clean_latents_2x = splits[split_idx] if num_2x_frames > 0 and split_idx < len(splits) else history_latents[:, :, :fallback_frame_count, :, :]
                        if clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
                            clean_latents_2x = torch.cat([clean_latents_2x, clean_latents_2x[:, :, -1:, :, :]], dim=2)[:, :, :2, :, :]
                        split_idx += 1 if num_2x_frames > 0 else 0
                        clean_latents_1x = splits[split_idx] if effective_clean_frames > 0 and split_idx < len(splits) else history_latents[:, :, :fallback_frame_count, :, :]
                    else:
                        clean_latents_4x = clean_latents_2x = clean_latents_1x = history_latents[:, :, :fallback_frame_count, :, :]
                else:
                    clean_latents_4x = clean_latents_2x = clean_latents_1x = history_latents[:, :, :fallback_frame_count, :, :]

                clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)

                # 20250507 pftq: Fix for <=1 sec videos.
                max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)

                generated_latents = sample_hunyuan(
                    transformer=transformer,
                    sampler='unipc',
                    width=width,
                    height=height,
                    frames=max_frames,
                    real_guidance_scale=cfg,
                    distilled_guidance_scale=gs,
                    guidance_rescale=rs,
                    num_inference_steps=steps,
                    generator=rnd,
                    prompt_embeds=llama_vec,
                    prompt_embeds_mask=llama_attention_mask,
                    prompt_poolers=clip_l_pooler,
                    negative_prompt_embeds=llama_vec_n,
                    negative_prompt_embeds_mask=llama_attention_mask_n,
                    negative_prompt_poolers=clip_l_pooler_n,
                    device=gpu,
                    dtype=torch.bfloat16,
                    image_embeddings=image_encoder_last_hidden_state,
                    latent_indices=latent_indices,
                    clean_latents=clean_latents,
                    clean_latent_indices=clean_latent_indices,
                    clean_latents_2x=clean_latents_2x,
                    clean_latent_2x_indices=clean_latent_2x_indices,
                    clean_latents_4x=clean_latents_4x,
                    clean_latent_4x_indices=clean_latent_4x_indices,
                    callback=callback,
                )

                total_generated_latent_frames += int(generated_latents.shape[2])
                history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)

                if not high_vram:
                    offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
                    load_model_as_complete(vae, target_device=gpu)

                real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]

                # First section decodes everything; later sections decode only the tail
                # and soft-blend it onto the existing pixels to avoid seams.
                if history_pixels is None:
                    history_pixels = vae_decode(real_history_latents, vae).cpu()
                else:
                    section_latent_frames = latent_window_size * 2
                    overlapped_frames = min(latent_window_size * 4 - 3, history_pixels.shape[2])

                    #if section_index == 0:
                        #extra_latents = 1 # Add up to 2 extra latent frames for smoother overlap to initial video
                        #extra_pixel_frames = extra_latents * 4 # Approx. 4 pixel frames per latent
                        #overlapped_frames = min(overlapped_frames + extra_pixel_frames, history_pixels.shape[2], section_latent_frames * 4)

                    current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
                    history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)

                if not high_vram:
                    unload_complete_models()

                output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')

                # 20250506 pftq: Use input video FPS for output
                save_bcthw_as_mp4(history_pixels, output_filename, fps=fps, crf=mp4_crf)
                print(f"Latest video saved: {output_filename}")
                # 20250508 pftq: Save prompt to mp4 metadata comments
                set_mp4_comments_imageio_ffmpeg(output_filename, f"Prompt: {prompt} | Negative Prompt: {n_prompt}");
                print(f"Prompt saved to mp4 metadata comments: {output_filename}")

                # 20250506 pftq: Clean up previous partial files
                if previous_video is not None and os.path.exists(previous_video):
                    try:
                        os.remove(previous_video)
                        print(f"Previous partial video deleted: {previous_video}")
                    except Exception as e:
                        print(f"Error deleting previous partial video {previous_video}: {e}")
                previous_video = output_filename

                print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')

                stream.output_queue.push(('file', output_filename))
    except:
        traceback.print_exc()

        if not high_vram:
            unload_complete_models(
                text_encoder, text_encoder_2, image_encoder, vae
            )

    stream.output_queue.push(('end', None))
    return
533
-
534
@spaces.GPU()
def process(input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
    """
    Gradio generator bound to the Start button: launches the background worker and
    streams its ('file' | 'progress' | 'end') events back to the UI as component
    updates (result video, preview image, description, progress bar, button states).
    """
    global stream, high_vram
    # 20250506 pftq: Updated assertion for video input
    assert input_video is not None, 'No input video!'

    yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)

    # 20250507 pftq: Even the H100 needs offloading if the video dimensions are 720p or higher
    if high_vram and (no_resize or resolution>640):
        print("Disabling high vram mode due to no resize and/or potentially higher resolution...")
        high_vram = False
        vae.enable_slicing()
        vae.enable_tiling()
        DynamicSwapInstaller.install_model(transformer, device=gpu)
        DynamicSwapInstaller.install_model(text_encoder, device=gpu)

    # 20250508 pftq: automatically set distilled cfg to 1 if cfg is used
    if cfg > 1:
        gs = 1

    stream = AsyncStream()

    # 20250506 pftq: Pass num_clean_frames, vae_batch, etc
    async_run(worker, input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)

    output_filename = None
    # Bug fix: the worker can push 'end' without any prior 'progress' (e.g. it
    # fails immediately), and the 'end' branch reads `desc` — initialize it so
    # that path cannot raise NameError.
    desc = ''

    while True:
        flag, data = stream.output_queue.next()

        if flag == 'file':
            output_filename = data
            yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)

        if flag == 'progress':
            preview, desc, html = data
            #yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
            yield output_filename, gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True) # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background

        if flag == 'end':
            yield output_filename, gr.update(visible=False), desc+' Video complete.', '', gr.update(interactive=True), gr.update(interactive=False)
            break
577
-
578
def end_process():
    # Bound to the End button: signals the running worker (via the shared
    # AsyncStream input queue) to stop at its next check point.
    stream.input_queue.push('end')
580
-
581
# ---- Gradio UI ----
quick_prompts = [
    'The girl dances gracefully, with clear movements, full of charm.',
    'A character doing some simple body movements.',
]
# gr.Dataset expects one row (list) per sample.
quick_prompts = [[x] for x in quick_prompts]

css = make_progress_bar_css()
block = gr.Blocks(css=css).queue()
with block:
    gr.Markdown('# Framepack F1 (Video Extender)')
    with gr.Row():
        with gr.Column():
            # 20250506 pftq: Changed to Video input from Image
            input_video = gr.Video(sources='upload', label="Input Video", height=320)
            prompt = gr.Textbox(label="Prompt", value='')
            #example_quick_prompts = gr.Dataset(samples=quick_prompts, label='Quick List', samples_per_page=1000, components=[prompt])
            #example_quick_prompts.click(lambda x: x[0], inputs=[example_quick_prompts], outputs=prompt, show_progress=False, queue=False)

            with gr.Row():
                start_button = gr.Button(value="Start Generation")
                end_button = gr.Button(value="End Generation", interactive=False)

            with gr.Group():
                with gr.Row():
                    use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed, but often makes hands and fingers slightly worse.')
                    no_resize = gr.Checkbox(label='Force Original Video Resolution (No Resizing)', value=False, info='Might run out of VRAM (720p requires > 24GB VRAM).')

                seed = gr.Number(label="Seed", value=31337, precision=0)

                batch = gr.Slider(label="Batch Size (Number of Videos)", minimum=1, maximum=1000, value=1, step=1, info='Generate multiple videos each with a different seed.')

                resolution = gr.Number(label="Resolution (max width or height)", value=640, precision=0, visible=False)

                total_second_length = gr.Slider(label="Additional Video Length to Generate (Seconds)", minimum=1, maximum=120, value=5, step=0.1)

                gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=3.0, step=0.01, info='Prompt adherence at the cost of less details from the input video, but to a lesser extent than Context Frames.')
                cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=True, info='Use this instead of Distilled for more detail/control + Negative Prompt (make sure Distilled set to 1). Doubles render time.')  # Should not change
                rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False)  # Should not change

                n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=True, info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
                steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Increase for more quality, especially if using high non-distilled CFG.')

                num_clean_frames = gr.Slider(label="Number of Context Frames", minimum=2, maximum=10, value=5, step=1, info="Retain more video details but increase memory use. Reduce to 2 if memory issues.")

                # Default VAE batch size scales with available VRAM.
                default_vae = 32
                if high_vram:
                    default_vae = 128
                elif free_mem_gb>=20:
                    default_vae = 64

                vae_batch = gr.Slider(label="VAE Batch Size for Input Video", minimum=4, maximum=256, value=default_vae, step=4, info="Reduce if running out of memory. Increase for better quality frames during fast motion.")

                latent_window_size = gr.Slider(label="Latent Window Size", minimum=9, maximum=33, value=9, step=1, visible=True, info='Generate more frames at a time (larger chunks). Less degradation and better blending but higher VRAM cost.')

                gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")

                mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")

        with gr.Column():
            preview_image = gr.Image(label="Next Latents", height=200, visible=False)
            result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
            progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
            progress_bar = gr.HTML('', elem_classes='no-generating-animation')

    gr.HTML("""
    <div style="text-align:center; margin-top:20px;">Share your results and find ideas at the <a href="https://x.com/search?q=framepack&f=live" target="_blank">FramePack Twitter (X) thread</a></div>
    """)

    # Inputs must stay in the exact order expected by process()/worker().
    ips = [input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
    start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
    end_button.click(fn=end_process)
652
-
653
- block.launch(ssr_mode=False)
 
1
+ from diffusers_helper.hf_login import login
2
+
3
+ import os
4
+
5
+ os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
6
+ import spaces
7
+ import gradio as gr
8
+ import torch
9
+ import traceback
10
+ import einops
11
+ import safetensors.torch as sf
12
+ import numpy as np
13
+ import argparse
14
+ import math
15
+ import decord
16
+ from tqdm import tqdm
17
+ import pathlib
18
+ from datetime import datetime
19
+ import imageio_ffmpeg
20
+ import tempfile
21
+ import shutil
22
+ import subprocess
23
+
24
+ from PIL import Image
25
+ from diffusers import AutoencoderKLHunyuanVideo
26
+ from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
27
+ from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
28
+ from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
29
+ from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
30
+ from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
31
+ from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
32
+ from diffusers_helper.thread_utils import AsyncStream, async_run
33
+ from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
34
+ from transformers import SiglipImageProcessor, SiglipVisionModel
35
+ from diffusers_helper.clip_vision import hf_clip_vision_encode
36
+ from diffusers_helper.bucket_tools import find_nearest_bucket
37
+ from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
38
+
39
# ---------------------------------------------------------------------------
# Module-level setup: CLI flags, VRAM probing, model loading and placement.
# Runs once at import time; everything below is shared by the Gradio handlers.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument('--share', action='store_true')
parser.add_argument("--server", type=str, default='0.0.0.0')
parser.add_argument("--port", type=int, required=False)
parser.add_argument("--inbrowser", action='store_true')
args = parser.parse_args()

print(args)

# Heuristic: with more than 80 GB free VRAM all models can stay resident on
# the GPU; otherwise they are swapped in/out per stage (see DynamicSwapInstaller).
free_mem_gb = get_cuda_free_memory_gb(gpu)
high_vram = free_mem_gb > 80

print(f'Free VRAM {free_mem_gb} GB')
print(f'High-VRAM Mode: {high_vram}')


# Text encoders / tokenizers / VAE for HunyuanVideo; loaded on CPU first and
# moved to the GPU on demand (or permanently, in high-VRAM mode) below.
text_encoder = LlamaModel.from_pretrained("Fabrice-TIERCELIN/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
text_encoder_2 = CLIPTextModel.from_pretrained("Fabrice-TIERCELIN/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
tokenizer = LlamaTokenizerFast.from_pretrained("Fabrice-TIERCELIN/HunyuanVideo", subfolder='tokenizer')
tokenizer_2 = CLIPTokenizer.from_pretrained("Fabrice-TIERCELIN/HunyuanVideo", subfolder='tokenizer_2')
vae = AutoencoderKLHunyuanVideo.from_pretrained("Fabrice-TIERCELIN/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()

# SigLIP image encoder used for the CLIP-vision conditioning of FramePack.
feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()

# Main video transformer, loaded 8-bit quantized to reduce VRAM pressure.
quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True)
transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained(
    "lllyasviel/FramePack_F1_I2V_HY_20250503",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
).cpu()

# transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu()

# Inference-only: eval mode, no gradients anywhere.
vae.eval()
text_encoder.eval()
text_encoder_2.eval()
image_encoder.eval()
transformer.eval()

if not high_vram:
    # Slicing/tiling trade speed for a much smaller VAE memory footprint.
    vae.enable_slicing()
    vae.enable_tiling()

transformer.high_quality_fp32_output_for_inference = True
print('transformer.high_quality_fp32_output_for_inference = True')

# transformer.to(dtype=torch.bfloat16)
vae.to(dtype=torch.float16)
image_encoder.to(dtype=torch.float16)
text_encoder.to(dtype=torch.float16)
text_encoder_2.to(dtype=torch.float16)

vae.requires_grad_(False)
text_encoder.requires_grad_(False)
text_encoder_2.requires_grad_(False)
image_encoder.requires_grad_(False)
transformer.requires_grad_(False)

if not high_vram:
    # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
    DynamicSwapInstaller.install_model(transformer, device=gpu)
    DynamicSwapInstaller.install_model(text_encoder, device=gpu)
else:
    # Plenty of VRAM: keep everything except the transformer resident on GPU.
    text_encoder.to(gpu)
    text_encoder_2.to(gpu)
    image_encoder.to(gpu)
    vae.to(gpu)
    # transformer.to(gpu)

# Cross-thread queue pair connecting the Gradio handler (`process`) with the
# background generation loop (`worker`); re-created per job in `process`.
stream = AsyncStream()

outputs_folder = './outputs/'
os.makedirs(outputs_folder, exist_ok=True)
114
+
115
@spaces.GPU()
@torch.no_grad()
def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, device="cuda", width=None, height=None):
    """
    Encode a video into latent representations using the VAE.

    Args:
        video_path: Path to the input video file.
        resolution: Max width/height used for bucket resizing (ignored when
            no_resize is True).
        no_resize: If True, keep the native video resolution.
        vae: AutoencoderKLHunyuanVideo model.
        vae_batch_size: Number of frames to process per batch.
        device: Device for computation (e.g., "cuda").
            NOTE(review): callers pass a torch.device here; the `== "cuda"`
            comparisons rely on torch.device('cuda') == "cuda" being True —
            confirm this holds for indexed devices like 'cuda:0'.
        width, height: Optional explicit target size; defaults to the
            video's native size before bucket adjustment.

    Returns:
        start_latent: Latent of the first frame (for compatibility with original code).
        input_image_np: First frame as numpy array (for CLIP vision encoding).
        history_latents: Latents of all frames (shape: [1, channels, frames, height//8, width//8]).
        fps: Frames per second of the input video.
        target_height, target_width: Final pixel resolution actually used.
        input_video_pixels: Preprocessed frames as a CPU tensor in [-1, 1],
            shape (1, channels, frames, height, width).

    Raises:
        Re-raises any decord/VAE failure after logging it.
    """
    video_path = str(pathlib.Path(video_path).resolve())
    print(f"Processing video: {video_path}")

    if device == "cuda" and not torch.cuda.is_available():
        print("CUDA is not available, falling back to CPU")
        device = "cpu"

    try:
        print("Initializing VideoReader...")
        vr = decord.VideoReader(video_path)
        fps = vr.get_avg_fps()  # Get input video FPS
        num_real_frames = len(vr)
        print(f"Video loaded: {num_real_frames} frames, FPS: {fps}")

        # Truncate to nearest latent size (multiple of 4) so the frame count
        # maps cleanly onto latent frames (4 pixel frames per latent frame).
        latent_size_factor = 4
        num_frames = (num_real_frames // latent_size_factor) * latent_size_factor
        if num_frames != num_real_frames:
            print(f"Truncating video from {num_real_frames} to {num_frames} frames for latent size compatibility")
        num_real_frames = num_frames

        print("Reading video frames...")
        frames = vr.get_batch(range(num_real_frames)).asnumpy()  # Shape: (num_real_frames, height, width, channels)
        print(f"Frames read: {frames.shape}")

        native_height, native_width = frames.shape[1], frames.shape[2]
        print(f"Native video resolution: {native_width}x{native_height}")

        target_height = native_height if height is None else height
        target_width = native_width if width is None else width

        if not no_resize:
            # Snap to the nearest supported bucket size for the model.
            target_height, target_width = find_nearest_bucket(target_height, target_width, resolution=resolution)
            print(f"Adjusted resolution: {target_width}x{target_height}")
        else:
            print(f"Using native resolution without resizing: {target_width}x{target_height}")

        processed_frames = []
        for i, frame in enumerate(frames):
            #print(f"Preprocessing frame {i+1}/{num_frames}")
            frame_np = resize_and_center_crop(frame, target_width=target_width, target_height=target_height)
            processed_frames.append(frame_np)
        processed_frames = np.stack(processed_frames)  # Shape: (num_real_frames, height, width, channels)
        print(f"Frames preprocessed: {processed_frames.shape}")

        # First frame kept as uint8 HWC for the CLIP-vision encoder.
        input_image_np = processed_frames[0]

        print("Converting frames to tensor...")
        # Map uint8 [0, 255] to float [-1, 1] as expected by the VAE.
        frames_pt = torch.from_numpy(processed_frames).float() / 127.5 - 1
        frames_pt = frames_pt.permute(0, 3, 1, 2)  # Shape: (num_real_frames, channels, height, width)
        frames_pt = frames_pt.unsqueeze(0)  # Shape: (1, num_real_frames, channels, height, width)
        frames_pt = frames_pt.permute(0, 2, 1, 3, 4)  # Shape: (1, channels, num_real_frames, height, width)
        print(f"Tensor shape: {frames_pt.shape}")

        # CPU copy of the preprocessed pixels, returned for downstream use.
        input_video_pixels = frames_pt.cpu()

        print(f"Moving tensor to device: {device}")
        frames_pt = frames_pt.to(device)
        print("Tensor moved to device")

        print(f"Moving VAE to device: {device}")
        vae.to(device)
        print("VAE moved to device")

        print(f"Encoding input video frames in VAE batch size {vae_batch_size} (reduce if memory issues here or if forcing video resolution)")
        latents = []
        vae.eval()
        with torch.no_grad():
            # Encode in temporal chunks to bound peak memory.
            for i in tqdm(range(0, frames_pt.shape[2], vae_batch_size), desc="Encoding video frames", mininterval=0.1):
                #print(f"Encoding batch {i//vae_batch_size + 1}: frames {i} to {min(i + vae_batch_size, frames_pt.shape[2])}")
                batch = frames_pt[:, :, i:i + vae_batch_size]  # Shape: (1, channels, batch_size, height, width)
                try:
                    if device == "cuda":
                        free_mem = torch.cuda.memory_allocated() / 1024**3
                        print(f"GPU memory before encoding: {free_mem:.2f} GB")
                    batch_latent = vae_encode(batch, vae)
                    if device == "cuda":
                        torch.cuda.synchronize()
                        print(f"GPU memory after encoding: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
                    latents.append(batch_latent)
                    #print(f"Batch encoded, latent shape: {batch_latent.shape}")
                except RuntimeError as e:
                    # Log a hint for OOM, then always re-raise — encoding
                    # cannot continue with a missing chunk.
                    print(f"Error during VAE encoding: {str(e)}")
                    if device == "cuda" and "out of memory" in str(e).lower():
                        print("CUDA out of memory, try reducing vae_batch_size or using CPU")
                    raise

        print("Concatenating latents...")
        history_latents = torch.cat(latents, dim=2)  # Shape: (1, channels, frames, height//8, width//8)
        print(f"History latents shape: {history_latents.shape}")

        start_latent = history_latents[:, :, :1]  # Shape: (1, channels, 1, height//8, width//8)
        print(f"Start latent shape: {start_latent.shape}")

        if device == "cuda":
            # Free GPU memory for the sampling stage that follows.
            vae.to(cpu)
            torch.cuda.empty_cache()
            print("VAE moved back to CPU, CUDA cache cleared")

        return start_latent, input_image_np, history_latents, fps, target_height, target_width, input_video_pixels

    except Exception as e:
        print(f"Error in video_encode: {str(e)}")
        raise
238
+
239
def set_mp4_comments_imageio_ffmpeg(input_file, comments):
    """Embed *comments* into the MP4 metadata of *input_file* in place.

    Uses the FFmpeg binary bundled with imageio-ffmpeg to rewrite the
    container-level 'comment' tag without re-encoding the streams, writing
    to a temporary file and then replacing the original on success.

    Args:
        input_file: Path to an existing .mp4 file.
        comments: Text to store in the 'comment' metadata tag.

    Returns:
        True on success, False on any failure (missing input file, FFmpeg
        error, or FFmpeg/imageio-ffmpeg unavailable). Never raises — the
        caller treats metadata tagging as best-effort.
    """
    temp_file = None
    try:
        ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()

        if not os.path.exists(input_file):
            print(f"Error: Input file {input_file} does not exist")
            return False

        # mkstemp + close instead of NamedTemporaryFile(delete=False): the
        # original left the file descriptor open, which leaks it and can
        # prevent FFmpeg from overwriting the file on Windows.
        fd, temp_file = tempfile.mkstemp(suffix='.mp4')
        os.close(fd)

        # FFmpeg command using the bundled binary.
        command = [
            ffmpeg_path,                         # Use imageio-ffmpeg's FFmpeg
            '-i', input_file,                    # input file
            '-metadata', f'comment={comments}',  # set comment metadata
            '-c:v', 'copy',                      # copy video stream without re-encoding
            '-c:a', 'copy',                      # copy audio stream without re-encoding
            '-y',                                # overwrite output file if it exists
            temp_file,                           # temporary output file
        ]

        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

        if result.returncode == 0:
            # Replace the original file with the tagged copy.
            shutil.move(temp_file, input_file)
            print(f"Successfully added comments to {input_file}")
            return True

        print(f"Error: FFmpeg failed with message:\n{result.stderr}")
        return False

    except Exception as e:
        print(f"Error saving prompt to video metadata, ffmpeg may be required: "+str(e))
        return False
    finally:
        # Single cleanup point: on success the temp file was moved away, so
        # this only removes leftovers from failure paths.
        if temp_file is not None and os.path.exists(temp_file):
            os.remove(temp_file)
281
+
282
@spaces.GPU()
@torch.no_grad()
def worker(input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
    """
    Background generation loop: extends the input video section by section.

    Encodes the prompt(s) and the input video, then repeatedly samples new
    latent sections with the FramePack-F1 transformer, decoding and saving a
    partial MP4 after every section (deleting the previous partial file).
    Communicates with the UI exclusively via the module-level `stream`:
    pushes ('progress', ...), ('file', path) and ('end', None) events on the
    output queue, and stops when 'end' appears on the input queue.
    Generates `batch` videos, incrementing the seed for each.

    NOTE(review): `gpu_memory_preservation` is currently unused here — the
    offload call below uses a fixed 8 GB. Confirm whether it should be wired
    through.
    """

    stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))

    try:
        if not high_vram:
            unload_complete_models(
                text_encoder, text_encoder_2, image_encoder, vae
            )

        # Text encoding
        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))

        if not high_vram:
            fake_diffusers_current_device(text_encoder, gpu)  # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
            load_model_as_complete(text_encoder_2, target_device=gpu)

        llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

        # With distilled-only guidance (cfg == 1) the negative embedding is
        # never used by the sampler, so a zero tensor avoids a second encode.
        if cfg == 1:
            llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
        else:
            llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

        llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
        llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)

        # 20250506 pftq: Processing input video instead of image
        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))

        # 20250506 pftq: Encode video
        #H, W = 640, 640 # Default resolution, will be adjusted
        #height, width = find_nearest_bucket(H, W, resolution=640)
        #start_latent, input_image_np, history_latents, fps = video_encode(input_video, vae, height, width, vae_batch_size=16, device=gpu)
        start_latent, input_image_np, video_latents, fps, height, width, input_video_pixels = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)

        #Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))

        # CLIP Vision
        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))

        if not high_vram:
            load_model_as_complete(image_encoder, target_device=gpu)

        image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
        image_encoder_last_hidden_state = image_encoder_output.last_hidden_state

        # Dtype: align all conditioning tensors with the transformer.
        llama_vec = llama_vec.to(transformer.dtype)
        llama_vec_n = llama_vec_n.to(transformer.dtype)
        clip_l_pooler = clip_l_pooler.to(transformer.dtype)
        clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
        image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)

        # One section produces latent_window_size*4 pixel frames; derive how
        # many sections are needed to cover the requested extra seconds.
        total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
        total_latent_sections = int(max(round(total_latent_sections), 1))

        for idx in range(batch):
            # Each video in the batch gets a distinct, reproducible seed.
            if idx>0:
                seed = seed + 1

            if batch > 1:
                print(f"Beginning video {idx+1} of {batch} with seed {seed} ")

            #job_id = generate_timestamp()
            job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepackf1-videoinput_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}"  # 20250506 pftq: easier to read timestamp and filename

            # Sampling
            stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))

            rnd = torch.Generator("cpu").manual_seed(seed)

            # The encoded input video seeds the history; new sections are
            # appended to it as they are generated.
            history_latents = video_latents.cpu()
            total_generated_latent_frames = history_latents.shape[2]
            history_pixels = None
            previous_video = None

            # 20250507 pftq: hot fix for initial video being corrupted by vae encoding, issue with ghosting because of slight differences
            #history_pixels = input_video_pixels
            #save_bcthw_as_mp4(vae_decode(video_latents, vae).cpu(), os.path.join(outputs_folder, f'{job_id}_input_video.mp4'), fps=fps, crf=mp4_crf) # 20250507 pftq: test fast movement corrupted by vae encoding if vae batch size too low

            for section_index in range(total_latent_sections):
                # Cooperative cancellation: UI pushes 'end' on the input queue.
                if stream.input_queue.top() == 'end':
                    stream.output_queue.push(('end', None))
                    return

                print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')

                if not high_vram:
                    unload_complete_models()
                    # move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)

                if use_teacache:
                    transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
                else:
                    transformer.initialize_teacache(enable_teacache=False)

                def callback(d):
                    # Per-step sampler callback: emits a latent preview image
                    # and progress text; raising aborts the sampler on cancel.
                    preview = d['denoised']
                    preview = vae_decode_fake(preview)

                    preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
                    preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')

                    if stream.input_queue.top() == 'end':
                        stream.output_queue.push(('end', None))
                        raise KeyboardInterrupt('User ends the task.')

                    current_step = d['i'] + 1
                    percentage = int(100.0 * current_step / steps)
                    hint = f'Sampling {current_step}/{steps}'
                    desc = f'Total frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps) :.2f} seconds (FPS-{fps}), Seed: {seed}, Video {idx+1} of {batch}. The video is generating part {section_index+1} of {total_latent_sections}...'
                    stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
                    return

                # 20250506 pftq: Use user-specified number of context frames, matching original allocation for num_clean_frames=2
                available_frames = history_latents.shape[2]  # Number of latent frames
                max_pixel_frames = min(latent_window_size * 4 - 3, available_frames * 4)  # Cap at available pixel frames
                adjusted_latent_frames = max(1, (max_pixel_frames + 3) // 4)  # Convert back to latent frames
                # Adjust num_clean_frames to match original behavior: num_clean_frames=2 means 1 frame for clean_latents_1x
                effective_clean_frames = max(0, num_clean_frames - 1) if num_clean_frames > 1 else 0
                effective_clean_frames = min(effective_clean_frames, available_frames - 2) if available_frames > 2 else 0  # 20250507 pftq: changed 1 to 2 for edge case for <=1 sec videos
                num_2x_frames = min(2, max(1, available_frames - effective_clean_frames - 1)) if available_frames > effective_clean_frames + 1 else 0  # 20250507 pftq: subtracted 1 for edge case for <=1 sec videos
                num_4x_frames = min(16, max(1, available_frames - effective_clean_frames - num_2x_frames)) if available_frames > effective_clean_frames + num_2x_frames else 0  # 20250507 pftq: Edge case for <=1 sec

                total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
                total_context_frames = min(total_context_frames, available_frames)  # 20250507 pftq: Edge case for <=1 sec videos

                # Index layout expected by sample_hunyuan: [start, 4x, 2x, 1x, new].
                indices = torch.arange(0, sum([1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames])).unsqueeze(0)  # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
                clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split(
                    [1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames], dim=1  # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
                )
                clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)

                # 20250506 pftq: Split history_latents dynamically based on available frames
                fallback_frame_count = 2  # 20250507 pftq: Changed 0 to 2 Edge case for <=1 sec videos
                context_frames = history_latents[:, :, -total_context_frames:, :, :] if total_context_frames > 0 else history_latents[:, :, :fallback_frame_count, :, :]
                if total_context_frames > 0:
                    split_sizes = [num_4x_frames, num_2x_frames, effective_clean_frames]
                    split_sizes = [s for s in split_sizes if s > 0]  # Remove zero sizes
                    if split_sizes:
                        splits = context_frames.split(split_sizes, dim=2)
                        split_idx = 0
                        clean_latents_4x = splits[split_idx] if num_4x_frames > 0 else history_latents[:, :, :fallback_frame_count, :, :]
                        if clean_latents_4x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
                            clean_latents_4x = torch.cat([clean_latents_4x, clean_latents_4x[:, :, -1:, :, :]], dim=2)[:, :, :2, :, :]
                        split_idx += 1 if num_4x_frames > 0 else 0
                        clean_latents_2x = splits[split_idx] if num_2x_frames > 0 and split_idx < len(splits) else history_latents[:, :, :fallback_frame_count, :, :]
                        if clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
                            clean_latents_2x = torch.cat([clean_latents_2x, clean_latents_2x[:, :, -1:, :, :]], dim=2)[:, :, :2, :, :]
                        split_idx += 1 if num_2x_frames > 0 else 0
                        clean_latents_1x = splits[split_idx] if effective_clean_frames > 0 and split_idx < len(splits) else history_latents[:, :, :fallback_frame_count, :, :]
                    else:
                        clean_latents_4x = clean_latents_2x = clean_latents_1x = history_latents[:, :, :fallback_frame_count, :, :]
                else:
                    clean_latents_4x = clean_latents_2x = clean_latents_1x = history_latents[:, :, :fallback_frame_count, :, :]

                clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)

                # 20250507 pftq: Fix for <=1 sec videos.
                max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)

                generated_latents = sample_hunyuan(
                    transformer=transformer,
                    sampler='unipc',
                    width=width,
                    height=height,
                    frames=max_frames,
                    real_guidance_scale=cfg,
                    distilled_guidance_scale=gs,
                    guidance_rescale=rs,
                    num_inference_steps=steps,
                    generator=rnd,
                    prompt_embeds=llama_vec,
                    prompt_embeds_mask=llama_attention_mask,
                    prompt_poolers=clip_l_pooler,
                    negative_prompt_embeds=llama_vec_n,
                    negative_prompt_embeds_mask=llama_attention_mask_n,
                    negative_prompt_poolers=clip_l_pooler_n,
                    device=gpu,
                    dtype=torch.bfloat16,
                    image_embeddings=image_encoder_last_hidden_state,
                    latent_indices=latent_indices,
                    clean_latents=clean_latents,
                    clean_latent_indices=clean_latent_indices,
                    clean_latents_2x=clean_latents_2x,
                    clean_latent_2x_indices=clean_latent_2x_indices,
                    clean_latents_4x=clean_latents_4x,
                    clean_latent_4x_indices=clean_latent_4x_indices,
                    callback=callback,
                )

                total_generated_latent_frames += int(generated_latents.shape[2])
                history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)

                if not high_vram:
                    offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
                    load_model_as_complete(vae, target_device=gpu)

                real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]

                if history_pixels is None:
                    # First section: decode the whole history in one shot.
                    history_pixels = vae_decode(real_history_latents, vae).cpu()
                else:
                    # Later sections: decode only the tail and blend it onto
                    # the existing pixels over `overlapped_frames`.
                    section_latent_frames = latent_window_size * 2
                    overlapped_frames = min(latent_window_size * 4 - 3, history_pixels.shape[2])

                    #if section_index == 0:
                        #extra_latents = 1  # Add up to 2 extra latent frames for smoother overlap to initial video
                        #extra_pixel_frames = extra_latents * 4  # Approx. 4 pixel frames per latent
                        #overlapped_frames = min(overlapped_frames + extra_pixel_frames, history_pixels.shape[2], section_latent_frames * 4)

                    current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
                    history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)

                if not high_vram:
                    unload_complete_models()

                output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')

                # 20250506 pftq: Use input video FPS for output
                save_bcthw_as_mp4(history_pixels, output_filename, fps=fps, crf=mp4_crf)
                print(f"Latest video saved: {output_filename}")
                # 20250508 pftq: Save prompt to mp4 metadata comments
                set_mp4_comments_imageio_ffmpeg(output_filename, f"Prompt: {prompt} | Negative Prompt: {n_prompt}");
                print(f"Prompt saved to mp4 metadata comments: {output_filename}")

                # 20250506 pftq: Clean up previous partial files
                if previous_video is not None and os.path.exists(previous_video):
                    try:
                        os.remove(previous_video)
                        print(f"Previous partial video deleted: {previous_video}")
                    except Exception as e:
                        print(f"Error deleting previous partial video {previous_video}: {e}")
                previous_video = output_filename

                print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')

                stream.output_queue.push(('file', output_filename))
    except:
        # Broad catch is deliberate: it also absorbs the KeyboardInterrupt
        # raised by `callback` on user cancellation, so the finally-style
        # cleanup and the 'end' event below always run.
        traceback.print_exc()

        if not high_vram:
            unload_complete_models(
                text_encoder, text_encoder_2, image_encoder, vae
            )

    stream.output_queue.push(('end', None))
    return
533
+
534
@spaces.GPU()
def process(input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
    """
    Gradio event handler: launch `worker` in the background and stream its
    progress back to the UI.

    Yields 6-tuples of (result_video, preview_image, progress_desc,
    progress_bar, start_button_update, end_button_update) until the worker
    pushes 'end'. Mutates the module-level `stream` (fresh queue pair per
    job) and may permanently disable `high_vram` mode for large inputs.
    """
    global stream, high_vram
    # 20250506 pftq: Updated assertion for video input
    assert input_video is not None, 'No input video!'

    # Initial yield: clear outputs, disable Start, enable End.
    yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)

    # 20250507 pftq: Even the H100 needs offloading if the video dimensions are 720p or higher
    if high_vram and (no_resize or resolution > 640):
        print("Disabling high vram mode due to no resize and/or potentially higher resolution...")
        high_vram = False
        vae.enable_slicing()
        vae.enable_tiling()
        DynamicSwapInstaller.install_model(transformer, device=gpu)
        DynamicSwapInstaller.install_model(text_encoder, device=gpu)

    # 20250508 pftq: automatically set distilled cfg to 1 if cfg is used
    if cfg > 1:
        gs = 1

    stream = AsyncStream()

    # 20250506 pftq: Pass num_clean_frames, vae_batch, etc
    async_run(worker, input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)

    output_filename = None
    # Bug fix: `desc` was only assigned in the 'progress' branch, so an
    # 'end' event arriving before any progress (e.g. the worker crashed
    # immediately) raised UnboundLocalError on the final yield.
    desc = ''

    while True:
        flag, data = stream.output_queue.next()

        if flag == 'file':
            output_filename = data
            yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)

        if flag == 'progress':
            preview, desc, html = data
            #yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
            yield output_filename, gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)  # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background

        if flag == 'end':
            yield output_filename, gr.update(visible=False), desc + ' Video complete.', '', gr.update(interactive=True), gr.update(interactive=False)
            break
577
+
578
def end_process():
    # Cancel hook for the "End Generation" button: push the 'end' sentinel
    # onto the worker's input queue; `worker` polls it between sections and
    # inside the sampler callback and shuts down cooperatively.
    stream.input_queue.push('end')
580
+
581
# Example prompts offered in the UI. gr.Dataset expects one row per sample,
# so every prompt string is wrapped in its own single-element list.
quick_prompts = [
    [text]
    for text in (
        'The girl dances gracefully, with clear movements, full of charm.',
        'A character doing some simple body movements.',
    )
]
586
+
587
# Gradio app shell: progress-bar CSS plus a queued Blocks context.
# .queue() is required so the generator-based handlers above can stream
# incremental updates to the browser.
css = make_progress_bar_css()
block = gr.Blocks(css=css).queue()
589
+ with block:
590
+ gr.Markdown('# Framepack F1 (Video Extender)')
591
+ with gr.Row():
592
+ with gr.Column():
593
+ # 20250506 pftq: Changed to Video input from Image
594
+ input_video = gr.Video(sources='upload', label="Input Video", height=320)
595
+ prompt = gr.Textbox(label="Prompt", value='')
596
+ #example_quick_prompts = gr.Dataset(samples=quick_prompts, label='Quick List', samples_per_page=1000, components=[prompt])
597
+ #example_quick_prompts.click(lambda x: x[0], inputs=[example_quick_prompts], outputs=prompt, show_progress=False, queue=False)
598
+
599
+ with gr.Row():
600
+ start_button = gr.Button(value="Start Generation")
601
+ end_button = gr.Button(value="End Generation", interactive=False)
602
+
603
+ with gr.Group():
604
+ with gr.Row():
605
+ use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed, but often makes hands and fingers slightly worse.')
606
+ no_resize = gr.Checkbox(label='Force Original Video Resolution (No Resizing)', value=False, info='Might run out of VRAM (720p requires > 24GB VRAM).')
607
+
608
+ seed = gr.Number(label="Seed", value=31337, precision=0)
609
+
610
+ batch = gr.Slider(label="Batch Size (Number of Videos)", minimum=1, maximum=1000, value=1, step=1, info='Generate multiple videos each with a different seed.')
611
+
612
+ resolution = gr.Number(label="Resolution (max width or height)", value=640, precision=0, visible=False)
613
+
614
+ total_second_length = gr.Slider(label="Additional Video Length to Generate (Seconds)", minimum=1, maximum=120, value=5, step=0.1)
615
+
616
+ gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=3.0, step=0.01, info='Prompt adherence at the cost of less details from the input video, but to a lesser extent than Context Frames.')
617
+ cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=True, info='Use this instead of Distilled for more detail/control + Negative Prompt (make sure Distilled set to 1). Doubles render time.') # Should not change
618
+ rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False) # Should not change
619
+
620
+ n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=True, info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
621
+ steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Increase for more quality, especially if using high non-distilled CFG.')
622
+
623
+ num_clean_frames = gr.Slider(label="Number of Context Frames", minimum=2, maximum=10, value=5, step=1, info="Retain more video details but increase memory use. Reduce to 2 if memory issues.")
624
+
625
+ default_vae = 32
626
+ if high_vram:
627
+ default_vae = 128
628
+ elif free_mem_gb>=20:
629
+ default_vae = 64
630
+
631
+ vae_batch = gr.Slider(label="VAE Batch Size for Input Video", minimum=4, maximum=256, value=default_vae, step=4, info="Reduce if running out of memory. Increase for better quality frames during fast motion.")
632
+
633
+ latent_window_size = gr.Slider(label="Latent Window Size", minimum=9, maximum=33, value=9, step=1, visible=True, info='Generate more frames at a time (larger chunks). Less degradation and better blending but higher VRAM cost.')
634
+
635
+ gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
636
+
637
+ mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")
638
+
639
+ with gr.Column():
640
+ preview_image = gr.Image(label="Next Latents", height=200, visible=False)
641
+ result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
642
+ progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
643
+ progress_bar = gr.HTML('', elem_classes='no-generating-animation')
644
+
645
+ gr.HTML("""
646
+ <div style="text-align:center; margin-top:20px;">Share your results and find ideas at the <a href="https://x.com/search?q=framepack&f=live" target="_blank">FramePack Twitter (X) thread</a></div>
647
+ """)
648
+
649
+ ips = [input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
650
+ start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
651
+ end_button.click(fn=end_process)
652
+
653
+ block.launch(ssr_mode=False)