Fabrice-TIERCELIN committed on
Commit
19aa965
·
verified ·
1 Parent(s): 6774fdd

Merge code

Browse files
Files changed (1) hide show
  1. app.py +1 -281
app.py CHANGED
@@ -939,286 +939,6 @@ def worker_video(input_video, end_frame, prompts, n_prompt, seed, batch, resolut
939
 
940
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
941
 
942
- try:
943
- # 20250506 pftq: Processing input video instead of image
944
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))
945
-
946
- # 20250506 pftq: Encode video
947
- start_latent, input_image_np, video_latents, fps, height, width = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)
948
- start_latent = start_latent.to(dtype=torch.float32, device=cpu)
949
- video_latents = video_latents.cpu()
950
-
951
- total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
952
- total_latent_sections = int(max(round(total_latent_sections), 1))
953
-
954
- # Clean GPU
955
- if not high_vram:
956
- unload_complete_models(
957
- text_encoder, text_encoder_2, image_encoder, vae, transformer
958
- )
959
-
960
- # Text encoding
961
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
962
-
963
- if not high_vram:
964
- fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
965
- load_model_as_complete(text_encoder_2, target_device=gpu)
966
-
967
- prompt_parameters = []
968
-
969
- for prompt_part in prompts[:total_latent_sections]:
970
- prompt_parameters.append(encode_prompt(prompt_part, n_prompt))
971
-
972
- # Clean GPU
973
- if not high_vram:
974
- unload_complete_models(
975
- text_encoder, text_encoder_2
976
- )
977
-
978
- # CLIP Vision
979
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
980
-
981
- if not high_vram:
982
- load_model_as_complete(image_encoder, target_device=gpu)
983
-
984
- image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
985
-
986
- # Clean GPU
987
- if not high_vram:
988
- unload_complete_models(image_encoder)
989
-
990
- image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
991
-
992
- # Dtype
993
- image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
994
-
995
- if enable_preview:
996
- def callback(d):
997
- preview = d['denoised']
998
- preview = vae_decode_fake(preview)
999
-
1000
- preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
1001
- preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
1002
-
1003
- if stream.input_queue.top() == 'end':
1004
- stream.output_queue.push(('end', None))
1005
- raise KeyboardInterrupt('User ends the task.')
1006
-
1007
- current_step = d['i'] + 1
1008
- percentage = int(100.0 * current_step / steps)
1009
- hint = f'Sampling {current_step}/{steps}'
1010
- desc = f'Total frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps) :.2f} seconds (FPS-{fps}), Resolution: {height}px * {width}px, Seed: {seed}, Video {idx+1} of {batch}. The video is generating part {section_index+1} of {total_latent_sections}...'
1011
- stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
1012
- return
1013
- else:
1014
- def callback(d):
1015
- return
1016
-
1017
- def compute_latent(history_latents, latent_window_size, num_clean_frames, start_latent):
1018
- # 20250506 pftq: Use user-specified number of context frames, matching original allocation for num_clean_frames=2
1019
- available_frames = history_latents.shape[2] # Number of latent frames
1020
- max_pixel_frames = min(latent_window_size * 4 - 3, available_frames * 4) # Cap at available pixel frames
1021
- adjusted_latent_frames = max(1, (max_pixel_frames + 3) // 4) # Convert back to latent frames
1022
- # Adjust num_clean_frames to match original behavior: num_clean_frames=2 means 1 frame for clean_latents_1x
1023
- effective_clean_frames = max(0, num_clean_frames - 1)
1024
- effective_clean_frames = min(effective_clean_frames, available_frames - 2) if available_frames > 2 else 0 # 20250507 pftq: changed 1 to 2 for edge case for <=1 sec videos
1025
- num_2x_frames = min(2, max(1, available_frames - effective_clean_frames - 1)) if available_frames > effective_clean_frames + 1 else 0 # 20250507 pftq: subtracted 1 for edge case for <=1 sec videos
1026
- num_4x_frames = min(16, max(1, available_frames - effective_clean_frames - num_2x_frames)) if available_frames > effective_clean_frames + num_2x_frames else 0 # 20250507 pftq: Edge case for <=1 sec
1027
-
1028
- total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
1029
- total_context_frames = min(total_context_frames, available_frames) # 20250507 pftq: Edge case for <=1 sec videos
1030
-
1031
- indices = torch.arange(0, 1 + num_4x_frames + num_2x_frames + effective_clean_frames + adjusted_latent_frames).unsqueeze(0) # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
1032
- clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split(
1033
- [1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames], dim=1 # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
1034
- )
1035
- clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
1036
-
1037
- # 20250506 pftq: Split history_latents dynamically based on available frames
1038
- fallback_frame_count = 2 # 20250507 pftq: Changed 0 to 2 Edge case for <=1 sec videos
1039
- context_frames = clean_latents_4x = clean_latents_2x = clean_latents_1x = history_latents[:, :, :fallback_frame_count, :, :]
1040
-
1041
- if total_context_frames > 0:
1042
- context_frames = history_latents[:, :, -total_context_frames:, :, :]
1043
- split_sizes = [num_4x_frames, num_2x_frames, effective_clean_frames]
1044
- split_sizes = [s for s in split_sizes if s > 0] # Remove zero sizes
1045
- if split_sizes:
1046
- splits = context_frames.split(split_sizes, dim=2)
1047
- split_idx = 0
1048
-
1049
- if num_4x_frames > 0:
1050
- clean_latents_4x = splits[split_idx]
1051
- split_idx = 1
1052
- if clean_latents_4x.shape[2] < 2: # 20250507 pftq: edge case for <=1 sec videos
1053
- print("Edge case for <=1 sec videos 4x")
1054
- clean_latents_4x = clean_latents_4x.expand(-1, -1, 2, -1, -1)
1055
-
1056
- if num_2x_frames > 0 and split_idx < len(splits):
1057
- clean_latents_2x = splits[split_idx]
1058
- if clean_latents_2x.shape[2] < 2: # 20250507 pftq: edge case for <=1 sec videos
1059
- print("Edge case for <=1 sec videos 2x")
1060
- clean_latents_2x = clean_latents_2x.expand(-1, -1, 2, -1, -1)
1061
- split_idx += 1
1062
- elif clean_latents_2x.shape[2] < 2: # 20250507 pftq: edge case for <=1 sec videos
1063
- clean_latents_2x = clean_latents_4x
1064
-
1065
- if effective_clean_frames > 0 and split_idx < len(splits):
1066
- clean_latents_1x = splits[split_idx]
1067
-
1068
- clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
1069
-
1070
- # 20250507 pftq: Fix for <=1 sec videos.
1071
- max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)
1072
- return [max_frames, clean_latents, clean_latents_2x, clean_latents_4x, latent_indices, clean_latents, clean_latent_indices, clean_latent_2x_indices, clean_latent_4x_indices]
1073
-
1074
- for idx in range(batch):
1075
- if batch > 1:
1076
- print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
1077
-
1078
- #job_id = generate_timestamp()
1079
- job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepackf1-videoinput_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}" # 20250506 pftq: easier to read timestamp and filename
1080
-
1081
- # Sampling
1082
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
1083
-
1084
- rnd = torch.Generator("cpu").manual_seed(seed)
1085
-
1086
- # 20250506 pftq: Initialize history_latents with video latents
1087
- history_latents = video_latents
1088
- total_generated_latent_frames = history_latents.shape[2]
1089
- # 20250506 pftq: Initialize history_pixels to fix UnboundLocalError
1090
- history_pixels = None
1091
- previous_video = None
1092
-
1093
- for section_index in range(total_latent_sections):
1094
- if stream.input_queue.top() == 'end':
1095
- stream.output_queue.push(('end', None))
1096
- return
1097
-
1098
- print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
1099
-
1100
- if len(prompt_parameters) > 0:
1101
- [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(0)
1102
-
1103
- if not high_vram:
1104
- unload_complete_models()
1105
- move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
1106
-
1107
- if use_teacache:
1108
- transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
1109
- else:
1110
- transformer.initialize_teacache(enable_teacache=False)
1111
-
1112
- [max_frames, clean_latents, clean_latents_2x, clean_latents_4x, latent_indices, clean_latents, clean_latent_indices, clean_latent_2x_indices, clean_latent_4x_indices] = compute_latent(history_latents, latent_window_size, num_clean_frames, start_latent)
1113
-
1114
- generated_latents = sample_hunyuan(
1115
- transformer=transformer,
1116
- sampler='unipc',
1117
- width=width,
1118
- height=height,
1119
- frames=max_frames,
1120
- real_guidance_scale=cfg,
1121
- distilled_guidance_scale=gs,
1122
- guidance_rescale=rs,
1123
- num_inference_steps=steps,
1124
- generator=rnd,
1125
- prompt_embeds=llama_vec,
1126
- prompt_embeds_mask=llama_attention_mask,
1127
- prompt_poolers=clip_l_pooler,
1128
- negative_prompt_embeds=llama_vec_n,
1129
- negative_prompt_embeds_mask=llama_attention_mask_n,
1130
- negative_prompt_poolers=clip_l_pooler_n,
1131
- device=gpu,
1132
- dtype=torch.bfloat16,
1133
- image_embeddings=image_encoder_last_hidden_state,
1134
- latent_indices=latent_indices,
1135
- clean_latents=clean_latents,
1136
- clean_latent_indices=clean_latent_indices,
1137
- clean_latents_2x=clean_latents_2x,
1138
- clean_latent_2x_indices=clean_latent_2x_indices,
1139
- clean_latents_4x=clean_latents_4x,
1140
- clean_latent_4x_indices=clean_latent_4x_indices,
1141
- callback=callback,
1142
- )
1143
-
1144
- total_generated_latent_frames += int(generated_latents.shape[2])
1145
- history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
1146
-
1147
- if not high_vram:
1148
- offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
1149
- load_model_as_complete(vae, target_device=gpu)
1150
-
1151
- if history_pixels is None:
1152
- real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
1153
- history_pixels = vae_decode(real_history_latents, vae).cpu()
1154
- else:
1155
- section_latent_frames = latent_window_size * 2
1156
- overlapped_frames = min(latent_window_size * 4 - 3, history_pixels.shape[2])
1157
-
1158
- real_history_latents = history_latents[:, :, -min(total_generated_latent_frames, section_latent_frames):, :, :]
1159
- history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
1160
-
1161
- if not high_vram:
1162
- unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
1163
-
1164
- if enable_preview or section_index == total_latent_sections - 1:
1165
- output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
1166
-
1167
- # 20250506 pftq: Use input video FPS for output
1168
- save_bcthw_as_mp4(history_pixels, output_filename, fps=fps, crf=mp4_crf)
1169
- print(f"Latest video saved: {output_filename}")
1170
- # 20250508 pftq: Save prompt to mp4 metadata comments
1171
- set_mp4_comments_imageio_ffmpeg(output_filename, f"Prompt: {prompts} | Negative Prompt: {n_prompt}");
1172
- print(f"Prompt saved to mp4 metadata comments: {output_filename}")
1173
-
1174
- # 20250506 pftq: Clean up previous partial files
1175
- if previous_video is not None and os.path.exists(previous_video):
1176
- try:
1177
- os.remove(previous_video)
1178
- print(f"Previous partial video deleted: {previous_video}")
1179
- except Exception as e:
1180
- print(f"Error deleting previous partial video {previous_video}: {e}")
1181
- previous_video = output_filename
1182
-
1183
- print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
1184
-
1185
- stream.output_queue.push(('file', output_filename))
1186
-
1187
- seed = (seed + 1) % np.iinfo(np.int32).max
1188
-
1189
- except:
1190
- traceback.print_exc()
1191
-
1192
- if not high_vram:
1193
- unload_complete_models(
1194
- text_encoder, text_encoder_2, image_encoder, vae, transformer
1195
- )
1196
-
1197
- stream.output_queue.push(('end', None))
1198
- return
1199
-
1200
- # 20250506 pftq: Modified worker to accept video input and clean frame count
1201
- @torch.no_grad()
1202
- def worker_video_end(input_video, end_frame, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
1203
- def encode_prompt(prompt, n_prompt):
1204
- llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
1205
-
1206
- if cfg == 1:
1207
- llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
1208
- else:
1209
- llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
1210
-
1211
- llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
1212
- llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
1213
-
1214
- llama_vec = llama_vec.to(transformer.dtype)
1215
- llama_vec_n = llama_vec_n.to(transformer.dtype)
1216
- clip_l_pooler = clip_l_pooler.to(transformer.dtype)
1217
- clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
1218
- return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]
1219
-
1220
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
1221
-
1222
  try:
1223
  # 20250506 pftq: Processing input video instead of image
1224
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))
@@ -1626,7 +1346,7 @@ def process_video_on_gpu(input_video, end_frame, prompts, generation_mode, n_pro
1626
  stream = AsyncStream()
1627
 
1628
  # 20250506 pftq: Pass num_clean_frames, vae_batch, etc
1629
- async_run(worker_video_end if generation_mode == "video_end" else worker_video, input_video, end_frame, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
1630
 
1631
  output_filename = None
1632
 
 
939
 
940
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
941
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
942
  try:
943
  # 20250506 pftq: Processing input video instead of image
944
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))
 
1346
  stream = AsyncStream()
1347
 
1348
  # 20250506 pftq: Pass num_clean_frames, vae_batch, etc
1349
+ async_run(worker_video, input_video, end_frame if generation_mode == "video_end" else None, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
1350
 
1351
  output_filename = None
1352