SUPIR

Runtime error

App Files Files Community

Fabrice-TIERCELIN committed on Jun 21, 2025

Commit

0684df1

verified ·

1 Parent(s): 3049af2

Original code

Browse files

Files changed (1) hide show

app.py +1 -279

app.py CHANGED Viewed

@@ -806,284 +806,6 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
     stream.output_queue.push(('end', None))
     return
-# 20250506 pftq: Modified worker to accept video input and clean frame count
-@spaces.GPU()
-@torch.no_grad()
-def worker_video_experimental(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
-    def encode_prompt(prompt, n_prompt):
-        llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
-        if cfg == 1:
-            llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
-        else:
-            llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
-        llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
-        llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
-        llama_vec = llama_vec.to(transformer.dtype)
-        llama_vec_n = llama_vec_n.to(transformer.dtype)
-        clip_l_pooler = clip_l_pooler.to(transformer.dtype)
-        clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
-        return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]
-    stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
-    try:
-        # Clean GPU
-        if not high_vram:
-            unload_complete_models(
-                text_encoder, text_encoder_2, image_encoder, vae, transformer
-            )
-        # Text encoding
-        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
-        if not high_vram:
-            fake_diffusers_current_device(text_encoder, gpu)  # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
-            load_model_as_complete(text_encoder_2, target_device=gpu)
-        prompt_parameters = []
-        for prompt_part in prompts:
-            prompt_parameters.append(encode_prompt(prompt_part, n_prompt))
-        # 20250506 pftq: Processing input video instead of image
-        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))
-        # 20250506 pftq: Encode video
-        start_latent, input_image_np, video_latents, fps, height, width = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)[:6]
-        start_latent = start_latent.to(dtype=torch.float32).cpu()
-        video_latents = video_latents.cpu()
-        # CLIP Vision
-        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
-        if not high_vram:
-            load_model_as_complete(image_encoder, target_device=gpu)
-        image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
-        image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
-        # Dtype
-        image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
-        total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
-        total_latent_sections = int(max(round(total_latent_sections), 1))
-        if enable_preview:
-            def callback(d):
-                preview = d['denoised']
-                preview = vae_decode_fake(preview)
-                preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
-                preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
-                if stream.input_queue.top() == 'end':
-                    stream.output_queue.push(('end', None))
-                    raise KeyboardInterrupt('User ends the task.')
-                current_step = d['i'] + 1
-                percentage = int(100.0 * current_step / steps)
-                hint = f'Sampling {current_step}/{steps}'
-                desc = f'Total frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps) :.2f} seconds (FPS-{fps}), Resolution: {height}px * {width}px, Seed: {seed}, Video {idx+1} of {batch}. The video is generating part {section_index+1} of {total_latent_sections}...'
-                stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
-                return
-        else:
-            def callback(d):
-                return
-        def compute_latent(history_latents, latent_window_size, num_clean_frames, start_latent):
-            # 20250506 pftq: Use user-specified number of context frames, matching original allocation for num_clean_frames=2
-            available_frames = history_latents.shape[2]  # Number of latent frames
-            max_pixel_frames = min(latent_window_size * 4 - 3, available_frames * 4)  # Cap at available pixel frames
-            adjusted_latent_frames = max(1, (max_pixel_frames + 3) // 4)  # Convert back to latent frames
-            # Adjust num_clean_frames to match original behavior: num_clean_frames=2 means 1 frame for clean_latents_1x
-            effective_clean_frames = max(0, num_clean_frames - 1)
-            effective_clean_frames = min(effective_clean_frames, available_frames - 2) if available_frames > 2 else 0 # 20250507 pftq: changed 1 to 2 for edge case for <=1 sec videos
-            num_2x_frames = min(2, max(1, available_frames - effective_clean_frames - 1)) if available_frames > effective_clean_frames + 1 else 0 # 20250507 pftq: subtracted 1 for edge case for <=1 sec videos
-            num_4x_frames = min(16, max(1, available_frames - effective_clean_frames - num_2x_frames)) if available_frames > effective_clean_frames + num_2x_frames else 0 # 20250507 pftq: Edge case for <=1 sec
-            total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
-            total_context_frames = min(total_context_frames, available_frames)  # 20250507 pftq: Edge case for <=1 sec videos
-            indices = torch.arange(0, sum([1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames])).unsqueeze(0) # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
-            clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split(
-                [1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames], dim=1 # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
-            )
-            clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
-            # 20250506 pftq: Split history_latents dynamically based on available frames
-            fallback_frame_count = 2 # 20250507 pftq: Changed 0 to 2 Edge case for <=1 sec videos
-            context_frames = clean_latents_4x = clean_latents_2x = clean_latents_1x = history_latents[:, :, :fallback_frame_count, :, :]
-            if total_context_frames > 0:
-                context_frames = history_latents[:, :, -total_context_frames:, :, :]
-                split_sizes = [num_4x_frames, num_2x_frames, effective_clean_frames]
-                split_sizes = [s for s in split_sizes if s > 0]  # Remove zero sizes
-                if split_sizes:
-                    splits = context_frames.split(split_sizes, dim=2)
-                    split_idx = 0
-                    if num_4x_frames > 0:
-                        clean_latents_4x = splits[split_idx]
-                        split_idx = 1
-                    if clean_latents_4x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
-                        print("Edge case for <=1 sec videos 4x")
-                        clean_latents_4x = clean_latents_4x.expand(-1, -1, 2, -1, -1)
-                    if num_2x_frames > 0 and split_idx < len(splits):
-                        clean_latents_2x = splits[split_idx]
-                        if clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
-                            print("Edge case for <=1 sec videos 2x")
-                            clean_latents_2x = clean_latents_2x.expand(-1, -1, 2, -1, -1)
-                        split_idx += 1
-                    elif clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
-                        clean_latents_2x = clean_latents_4x
-                    if effective_clean_frames > 0 and split_idx < len(splits):
-                        clean_latents_1x = splits[split_idx]
-            indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
-            clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
-            clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
-            clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2)
-            clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
-            # 20250507 pftq: Fix for <=1 sec videos.
-            max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)
-            return [max_frames, clean_latents, clean_latents_2x, clean_latents_4x, latent_indices, clean_latents, clean_latent_indices, clean_latent_2x_indices, clean_latent_4x_indices]
-        for idx in range(batch):
-            if batch > 1:
-                print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
-            #job_id = generate_timestamp()
-            job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepackf1-videoinput_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}" # 20250506 pftq: easier to read timestamp and filename
-            # Sampling
-            stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
-            rnd = torch.Generator("cpu").manual_seed(seed)
-            # 20250506 pftq: Initialize history_latents with video latents
-            history_latents = video_latents
-            total_generated_latent_frames = history_latents.shape[2]
-            history_latents = torch.cat([history_latents, start_latent], dim=2)
-            total_generated_latent_frames = 1
-            # 20250506 pftq: Initialize history_pixels to fix UnboundLocalError
-            history_pixels = None
-            previous_video = None
-            for section_index in range(total_latent_sections):
-                if stream.input_queue.top() == 'end':
-                    stream.output_queue.push(('end', None))
-                    return
-                print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
-                if len(prompt_parameters) > 0:
-                    [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(0)
-                if not high_vram:
-                    unload_complete_models()
-                    move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
-                if use_teacache:
-                    transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
-                else:
-                    transformer.initialize_teacache(enable_teacache=False)
-                [max_frames, clean_latents, clean_latents_2x, clean_latents_4x, latent_indices, clean_latents, clean_latent_indices, clean_latent_2x_indices, clean_latent_4x_indices] = compute_latent(history_latents, latent_window_size, num_clean_frames, start_latent)
-                generated_latents = sample_hunyuan(
-                    transformer=transformer,
-                    sampler='unipc',
-                    width=width,
-                    height=height,
-                    frames=max_frames,
-                    real_guidance_scale=cfg,
-                    distilled_guidance_scale=gs,
-                    guidance_rescale=rs,
-                    num_inference_steps=steps,
-                    generator=rnd,
-                    prompt_embeds=llama_vec,
-                    prompt_embeds_mask=llama_attention_mask,
-                    prompt_poolers=clip_l_pooler,
-                    negative_prompt_embeds=llama_vec_n,
-                    negative_prompt_embeds_mask=llama_attention_mask_n,
-                    negative_prompt_poolers=clip_l_pooler_n,
-                    device=gpu,
-                    dtype=torch.bfloat16,
-                    image_embeddings=image_encoder_last_hidden_state,
-                    latent_indices=latent_indices,
-                    clean_latents=clean_latents,
-                    clean_latent_indices=clean_latent_indices,
-                    clean_latents_2x=clean_latents_2x,
-                    clean_latent_2x_indices=clean_latent_2x_indices,
-                    clean_latents_4x=clean_latents_4x,
-                    clean_latent_4x_indices=clean_latent_4x_indices,
-                    callback=callback,
-                )
-                total_generated_latent_frames += int(generated_latents.shape[2])
-                history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
-                if not high_vram:
-                    offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
-                    load_model_as_complete(vae, target_device=gpu)
-                if history_pixels is None:
-                    real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
-                    history_pixels = vae_decode(real_history_latents, vae).cpu()
-                else:
-                    section_latent_frames = latent_window_size * 2
-                    overlapped_frames = min(latent_window_size * 4 - 3, history_pixels.shape[2])
-                    real_history_latents = history_latents[:, :, -min(total_generated_latent_frames, section_latent_frames):, :, :]
-                    history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
-                if not high_vram:
-                    unload_complete_models()
-                if enable_preview or section_index == total_latent_sections - 1:
-                    output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
-                    # 20250506 pftq: Use input video FPS for output
-                    save_bcthw_as_mp4(history_pixels, output_filename, fps=fps, crf=mp4_crf)
-                    print(f"Latest video saved: {output_filename}")
-                    # 20250508 pftq: Save prompt to mp4 metadata comments
-                    set_mp4_comments_imageio_ffmpeg(output_filename, f"Prompt: {prompts} | Negative Prompt: {n_prompt}");
-                    print(f"Prompt saved to mp4 metadata comments: {output_filename}")
-                    # 20250506 pftq: Clean up previous partial files
-                    if previous_video is not None and os.path.exists(previous_video):
-                        try:
-                            os.remove(previous_video)
-                            print(f"Previous partial video deleted: {previous_video}")
-                        except Exception as e:
-                            print(f"Error deleting previous partial video {previous_video}: {e}")
-                    previous_video = output_filename
-                    print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
-                    stream.output_queue.push(('file', output_filename))
-            seed = (seed + 1) % np.iinfo(np.int32).max
-    except:
-        traceback.print_exc()
-        if not high_vram:
-            unload_complete_models(
-                text_encoder, text_encoder_2, image_encoder, vae, transformer
-            )
-    stream.output_queue.push(('end', None))
-    return
 def get_duration(input_image, image_position, prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf):
     global total_second_length_debug_value
@@ -1218,7 +940,7 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, re
     stream = AsyncStream()
     # 20250506 pftq: Pass num_clean_frames, vae_batch, etc
-    async_run(worker_video_experimental, input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
     output_filename = None

     stream.output_queue.push(('end', None))
     return
 def get_duration(input_image, image_position, prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf):
     global total_second_length_debug_value
     stream = AsyncStream()
     # 20250506 pftq: Pass num_clean_frames, vae_batch, etc
+    async_run(worker_video, input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
     output_filename = None