Test4

Paused

App Files Files Community

EuuIia committed on Oct 4

Commit

cd93d5b

verified ·

1 Parent(s): eee1c29

Update api/ltx_server.py

Browse files

Files changed (1) hide show

api/ltx_server.py +95 -404

api/ltx_server.py CHANGED Viewed

@@ -396,55 +396,22 @@ class VideoService:
         return out
-    def _dividir_latentes_em_partes(self, latents_brutos, quantidade: int):
-        """
-        Divide um tensor de latentes em `quantidade` partes e retorna uma lista de clones.
-        Args:
-            latents_brutos: tensor [B, C, T, H, W]
-            quantidade: número de partes que queremos dividir
-        Returns:
-            List[Tensor]: lista de `quantidade` partes, cada uma cloneada
-        """
-        total = latents_brutos.shape[2]  # dimensão temporal
-        partes = []
-        if quantidade <= 1 or quantidade > total:
-            return [latents_brutos.clone()]
-        # calcular tamanho aproximado de cada parte
-        step = total // quantidade
-        overlap = 0  # sobreposição mínima de 1 frame entre partes
-        for i in range(quantidade):
-            start = i * step
-            end = start + step
-            if i == quantidade - 1:
-                end = total  # última parte vai até o final
-            else:
-                end += overlap  # sobreposição
-            parte = latents_brutos[:, :, start-1:end+1, :, :].clone()
-            partes.append(parte)
-        return partes
-    def dividir_latentes(latents_brutos):
-        total = latents_brutos.shape[2]  # dimensão temporal (latentes)
         if total % 2 == 1:  # ÍMPAR
-           cut = total // 2
-           primeira = latents_brutos[:, :, :cut+1, :, :].clone()
-           segunda  = latents_brutos[:, :, cut:, :, :].clone()
         else:  # PAR
-           cut = total // 2
-           # primeira parte até o meio, mas o último frame deve ser ajustado
-           primeira = latents_brutos[:, :, :cut+1, :, :].clone()
-           segunda  = latents_brutos[:, :, cut:, :, :].clone()
         return primeira, segunda
     def _concat_mp4s_no_reencode(self, mp4_list: List[str], out_path: str):
         """
@@ -472,7 +439,6 @@ class VideoService:
                 pass
     def generate(
         self,
         prompt,
@@ -575,41 +541,50 @@ class VideoService:
             print(f"[DEBUG] media_items shape={tuple(media.shape)}")
         latents = None
-        try:
-            ctx = torch.autocast(device_type="cuda", dtype=self.runtime_autocast_dtype) if self.device == "cuda" else contextlib.nullcontext()
             if improve_texture:
                 if not self.latent_upsampler:
                     raise ValueError("Upscaler espacial não carregado.")
                 # --- PASSO 1: GERAÇÃO DE LATENTES EM BAIXA RESOLUÇÃO ---
                 print("[DEBUG] Multi-escala: Iniciando Passo 1 (geração de latentes base).")
                 first_pass_args = self.config.get("first_pass", {}).copy()
                 first_pass_kwargs = call_kwargs.copy()
                 first_pass_kwargs.update({
-                    "guidance_scale": float(guidance_scale),
                     "stg_scale": first_pass_args.get("stg_scale"),
                     "rescaling_scale": first_pass_args.get("rescaling_scale"),
                     "skip_block_list": first_pass_args.get("skip_block_list"),
                 })
-                schedule = first_pass_args.get("timesteps") or first_pass_args.get("guidance_timesteps")
-                if schedule:
-                    first_pass_kwargs["timesteps"] = schedule
-                    first_pass_kwargs["guidance_timesteps"] = schedule
                 downscale_factor = self.config.get("downscale_factor", 2)
                 original_height = first_pass_kwargs["height"]
                 original_width = first_pass_kwargs["width"]
                 divisor = 24
-                target_height_p1 = original_height // downscale_factor
                 height_p1 = round(target_height_p1 / divisor) * divisor
                 if height_p1 == 0: height_p1 = divisor
                 first_pass_kwargs["height"] = height_p1
-                target_width_p1 = original_width // downscale_factor
                 width_p1 = round(target_width_p1 / divisor) * divisor
                 if width_p1 == 0: width_p1 = divisor
                 first_pass_kwargs["width"] = width_p1
@@ -622,7 +597,7 @@ class VideoService:
                 latents_low_res = first_pass_result.images
                 log_tensor_info(latents_low_res, "Latentes (Passo 1)")
-                del first_pass_result
                 gc.collect()
                 if self.device == "cuda": torch.cuda.empty_cache()
@@ -641,48 +616,40 @@ class VideoService:
                 second_pass_args = self.config.get("second_pass", {}).copy()
                 second_pass_kwargs = call_kwargs.copy()
-                height_p2 = height_p1 * 2
-                width_p2 = width_p1 * 2
-                second_pass_kwargs["height"] = height_p2
-                second_pass_kwargs["width"] = width_p2
-                print(f"[DEBUG] Passo 2: Dimensões definidas para {height_p2}x{width_p2} para corresponder ao upscale.")
                 second_pass_kwargs.update({
-                    "guidance_scale": float(guidance_scale),
                     "stg_scale": second_pass_args.get("stg_scale"),
                     "rescaling_scale": second_pass_args.get("rescaling_scale"),
                     "skip_block_list": second_pass_args.get("skip_block_list"),
                 })
-                schedule_p2 = second_pass_args.get("timesteps") or second_pass_args.get("guidance_timesteps")
-                if schedule_p2:
-                    timesteps_para_refinamento = schedule_p2
-                    print(f"[DEBUG] Passo 2: Usando {len(timesteps_para_refinamento)} timesteps pré-definidos do config para refinamento.")
-                else:
-                    strength_p2 = second_pass_args.get("strength", second_pass_args.get("denoising_strength", 0.4))
-                    num_steps_passo2_total = second_pass_args.get("num_inference_steps", 20)
-                    self.pipeline.scheduler.set_timesteps(num_steps_passo2_total, device=self.device)
-                    todos_os_timesteps_p2 = self.pipeline.scheduler.timesteps
-                    ponto_de_corte = int(len(todos_os_timesteps_p2) * (1.0 - strength_p2))
-                    timesteps_para_refinamento = todos_os_timesteps_p2[ponto_de_corte:]
-                    print(f"[DEBUG] Passo 2: Calculando {len(timesteps_para_refinamento)} timesteps manuais (strength ≈ {strength_p2})")
-                second_pass_kwargs["timesteps"] = timesteps_para_refinamento
-                if "strength" in second_pass_kwargs: del second_pass_kwargs["strength"]
                 second_pass_kwargs["latents"] = latents_high_res
                 with ctx:
                     second_pass_result = self.pipeline(**second_pass_kwargs)
                 latents = second_pass_result.images
                 log_tensor_info(latents, "Latentes Finais (Passo 2)")
-            else:
-                # --- PASSO ÚNICO (SINGLE-PASS) ---
                 single_pass_kwargs = call_kwargs.copy()
                 first_pass_config = self.config.get("first_pass", {})
                 single_pass_kwargs.update(
@@ -702,22 +669,30 @@ class VideoService:
                 print(f"[DEBUG] Single-pass: timesteps_len={len(schedule) if schedule else 0}")
                 print("\n[INFO] Executando pipeline de etapa única...")
                 with ctx:
                     result = self.pipeline(**single_pass_kwargs)
-                latents = result.images
                 print(f"[DEBUG] Latentes (single-pass): shape={tuple(latents.shape)}")
-            # --- DECODIFICAÇÃO E CODIFICAÇÃO DE VÍDEO FINAL ---
             latents_cpu = latents.detach().to("cpu", non_blocking=True)
-            if self.device == "cuda":
-                torch.cuda.empty_cache()
-                try:
-                    torch.cuda.ipc_collect()
-                except Exception:
-                    pass
             lat_a, lat_b = self._dividir_latentes(latents_cpu)
             lat_a1, lat_a2 = self._dividir_latentes(lat_a)
             lat_b1, lat_b2 = self._dividir_latentes(lat_b)
@@ -730,19 +705,22 @@ class VideoService:
             partes_mp4 = []
             par = 0
-            for part in latents_parts:
-                print(f"[DEBUG] Partição {par}: {tuple(part.shape)}")
                 par = par + 1
                 output_video_path = os.path.join(temp_dir, f"output_{used_seed}_{par}.mp4")
                 print("[DEBUG] Decodificando bloco de latentes com VAE → tensor de pixels...")
                 pixel_tensor = vae_manager_singleton.decode(
-                    part.to(self.device, non_blocking=True),
                     decode_timestep=float(self.config.get("decode_timestep", 0.05))
                 )
                 log_tensor_info(pixel_tensor, "Pixel tensor (VAE saída)")
-                print("[DEBUG] Codificando MP4 a partir do tensor de pixels...")
                 video_encode_tool_singleton.save_video_from_tensor(
                     pixel_tensor,
                     output_video_path,
@@ -756,7 +734,9 @@ class VideoService:
                     final_output_path = candidate
                     print(f"[DEBUG] MP4 parte {par} movido para {final_output_path}")
                     partes_mp4.append(final_output_path)
                 except Exception as e:
                     print(f"[DEBUG] Falha no move; usando tmp como final: {e}")
             final_concat = os.path.join(results_dir, f"concat_fim_{used_seed}.mp4")
@@ -765,320 +745,31 @@ class VideoService:
             self._log_gpu_memory("Fim da Geração")
             return final_concat, used_seed
-        except Exception as e:
-            print("[DEBUG] EXCEÇÃO NA GERAÇÃO:")
-            print("".join(traceback.format_exception(type(e), e, e.__traceback__)))
-            raise
-# ltx_server.py
-    def generate(
-        self,
-        prompt,
-        negative_prompt,
-        mode="text-to-video",
-        start_image_filepath=None,
-        middle_image_filepath=None,
-        middle_frame_number=None,
-        middle_image_weight=1.0,
-        end_image_filepath=None,
-        end_image_weight=1.0,
-        input_video_filepath=None,
-        height=512,
-        width=704,
-        duration=2.0,
-        frames_to_use=9,
-        seed=42,
-        randomize_seed=True,
-        guidance_scale=3.0, # Valor de referência/fallback
-        improve_texture=True,
-        progress_callback=None,
-        external_decode=True,
-    ):
-        t_all = time.perf_counter()
-        print(f"[DEBUG] generate() begin mode={mode} external_decode={external_decode} improve_texture={improve_texture}")
-        if self.device == "cuda":
-            torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
-        self._log_gpu_memory("Início da Geração")
-        if mode == "image-to-video" and not start_image_filepath:
-            raise ValueError("A imagem de início é obrigatória para o modo image-to-video")
-        if mode == "video-to-video" and not input_video_filepath:
-            raise ValueError("O vídeo de entrada é obrigatório para o modo video-to-video")
-        used_seed = random.randint(0, 2**32 - 1) if randomize_seed else int(seed)
-        seed_everething(used_seed); print(f"[DEBUG] Seed usado: {used_seed}")
-        FPS = 24.0; MAX_NUM_FRAMES = 2570
-        target_frames_rounded = round(duration * FPS)
-        n_val = round((float(target_frames_rounded) - 1.0) / 8.0)
-        actual_num_frames = max(9, min(MAX_NUM_FRAMES, int(n_val * 8 + 1)))
-        print(f"[DEBUG] Frames alvo: {actual_num_frames} (dur={duration}s @ {FPS}fps)")
-        height_padded = ((height - 1) // 32 + 1) * 32
-        width_padded = ((width - 1) // 32 + 1) * 32
-        padding_values = calculate_padding(height, width, height_padded, width_padded)
-        print(f"[DEBUG] Dimensões: ({height},{width}) -> pad ({height_padded},{width_padded}); padding={padding_values}")
-        generator = torch.Generator(device=self.device).manual_seed(used_seed)
-        conditioning_items = []
-        if mode == "image-to-video":
-            start_tensor = self._prepare_conditioning_tensor(start_image_filepath, height, width, padding_values)
-            conditioning_items.append(ConditioningItem(start_tensor, 0, 1.0))
-            if middle_image_filepath and middle_frame_number is not None:
-                middle_tensor = self._prepare_conditioning_tensor(middle_image_filepath, height, width, padding_values)
-                safe_middle_frame = max(0, min(int(middle_frame_number), actual_num_frames - 1))
-                conditioning_items.append(ConditioningItem(middle_tensor, safe_middle_frame, float(middle_image_weight)))
-            if end_image_filepath:
-                end_tensor = self._prepare_conditioning_tensor(end_image_filepath, height, width, padding_values)
-                last_frame_index = actual_num_frames - 1
-                conditioning_items.append(ConditioningItem(end_tensor, last_frame_index, float(end_image_weight)))
-            print(f"[DEBUG] Conditioning items: {len(conditioning_items)}")
-        # --- LÓGICA DE CONVERSÃO DO STG_MODE ---
-        stg_mode_str = self.config.get("stg_mode", "attention_values")
-        stg_mode_map = {
-            "attention_values": "AttentionValues",
-            "attention_skip": "AttentionSkip",
-            "residual": "Residual",
-            "transformer_block": "TransformerBlock"
-        }
-        stg_mode_enum_key = stg_mode_map.get(stg_mode_str.lower(), "AttentionValues")
-        # --- FIM DA LÓGICA DE CONVERSÃO ---
-        call_kwargs = {
-            "prompt": prompt,
-            "negative_prompt": negative_prompt,
-            "height": height_padded,
-            "width": width_padded,
-            "num_frames": actual_num_frames,
-            "frame_rate": int(FPS),
-            "generator": generator,
-            "output_type": "latent",
-            "conditioning_items": conditioning_items if conditioning_items else None,
-            "media_items": None,
-            "decode_timestep": self.config.get("decode_timestep"),
-            "decode_noise_scale": self.config.get("decode_noise_scale"),
-            "stochastic_sampling": self.config.get("stochastic_sampling"),
-            "image_cond_noise_scale": self.config.get("image_cond_noise_scale", 0.01),
-            "is_video": True,
-            "vae_per_channel_normalize": self.config.get("vae_per_channel_normalize", True),
-            "mixed_precision": (self.config.get("precision") == "mixed_precision"),
-            "offload_to_cpu": False,
-            "enhance_prompt": False,
-            "skip_layer_strategy": SkipLayerStrategy[stg_mode_enum_key],
-        }
-        print(f"[DEBUG] output_type={call_kwargs['output_type']} skip_layer_strategy={call_kwargs['skip_layer_strategy']}")
-        if mode == "video-to-video":
-            media = load_media_file(
-                media_path=input_video_filepath,
-                height=height,
-                width=width,
-                max_frames=int(frames_to_use),
-                padding=padding_values,
-            ).to(self.device)
-            call_kwargs["media_items"] = media
-            print(f"[DEBUG] media_items shape={tuple(media.shape)}")
-        latents = None
-        try:
-            ctx = torch.autocast(device_type="cuda", dtype=self.runtime_autocast_dtype) if self.device == "cuda" else contextlib.nullcontext()
-            if improve_texture:
-                if not self.latent_upsampler:
-                    raise ValueError("Upscaler espacial não carregado.")
-                # --- PASSO 1: GERAÇÃO DE LATENTES EM BAIXA RESOLUÇÃO ---
-                print("[DEBUG] Multi-escala: Iniciando Passo 1 (geração de latentes base).")
-                first_pass_args = self.config.get("first_pass", {}).copy()
-                first_pass_kwargs = call_kwargs.copy()
-                first_pass_kwargs.update({
-                    "guidance_scale": first_pass_args.get("guidance_scale", guidance_scale),
-                    "stg_scale": first_pass_args.get("stg_scale"),
-                    "rescaling_scale": first_pass_args.get("rescaling_scale"),
-                    "skip_block_list": first_pass_args.get("skip_block_list"),
-                    "guidance_timesteps": first_pass_args.get("guidance_timesteps"),
-                    "timesteps": first_pass_args.get("timesteps")
-                })
-                print(f"[DEBUG] Passo 1: Parâmetros do config carregados: guidance_scale={first_pass_kwargs['guidance_scale']}, stg_scale={first_pass_kwargs['stg_scale']}")
-                downscale_factor = self.config.get("downscale_factor", 2)
-                original_height = first_pass_kwargs["height"]
-                original_width = first_pass_kwargs["width"]
-                divisor = 24
-                target_height_p1 = original_height // downscale_factor
-                height_p1 = round(target_height_p1 / divisor) * divisor
-                if height_p1 == 0: height_p1 = divisor
-                first_pass_kwargs["height"] = height_p1
-                target_width_p1 = original_width // downscale_factor
-                width_p1 = round(target_width_p1 / divisor) * divisor
-                if width_p1 == 0: width_p1 = divisor
-                first_pass_kwargs["width"] = width_p1
-                print(f"[DEBUG] Passo 1: Dimensões reduzidas e ajustadas para {height_p1}x{width_p1}")
-                with ctx:
-                    first_pass_result = self.pipeline(**first_pass_kwargs)
-                latents_low_res = first_pass_result.images
-                log_tensor_info(latents_low_res, "Latentes (Passo 1)")
-                del first_pass_result
-                gc.collect()
-                if self.device == "cuda": torch.cuda.empty_cache()
-                # --- PASSO INTERMEDIÁRIO: UPSCALE DOS LATENTES ---
-                print("[DEBUG] Multi-escala: Fazendo upscale dos latentes com latent_upsampler.")
-                with ctx:
-                    latents_high_res = self.latent_upsampler(latents_low_res)
-                log_tensor_info(latents_high_res, "Latentes (Pós-Upscale)")
-                del latents_low_res
-                gc.collect()
-                if self.device == "cuda": torch.cuda.empty_cache()
-                # --- PASSO 2: REFINAMENTO EM ALTA RESOLUÇÃO ---
-                print("[DEBUG] Multi-escala: Iniciando Passo 2 (refinamento em alta resolução).")
-                second_pass_args = self.config.get("second_pass", {}).copy()
-                second_pass_kwargs = call_kwargs.copy()
-                second_pass_kwargs.update({
-                    "guidance_scale": second_pass_args.get("guidance_scale", guidance_scale),
-                    "stg_scale": second_pass_args.get("stg_scale"),
-                    "rescaling_scale": second_pass_args.get("rescaling_scale"),
-                    "skip_block_list": second_pass_args.get("skip_block_list"),
-                    "guidance_timesteps": second_pass_args.get("guidance_timesteps"),
-                    "timesteps": second_pass_args.get("timesteps")
-                })
-                print(f"[DEBUG] Passo 2: Parâmetros do config carregados: guidance_scale={second_pass_kwargs['guidance_scale']}, stg_scale={second_pass_kwargs['stg_scale']}")
-                height_p2 = height_p1 * 2
-                width_p2 = width_p1 * 2
-                second_pass_kwargs["height"] = height_p2
-                second_pass_kwargs["width"] = width_p2
-                print(f"[DEBUG] Passo 2: Dimensões definidas para {height_p2}x{width_p2}")
-                second_pass_kwargs["latents"] = latents_high_res
-                with ctx:
-                    second_pass_result = self.pipeline(**second_pass_kwargs)
-                latents = second_pass_result.images
-                log_tensor_info(latents, "Latentes Finais (Passo 2)")
-            else:
-                # --- PASSO ÚNICO (SINGLE-PASS) ---
-                single_pass_kwargs = call_kwargs.copy()
-                single_pass_kwargs.update({
-                    "guidance_scale": self.config.get("guidance_scale", guidance_scale),
-                    "stg_scale": self.config.get("stg_scale"),
-                    "rescaling_scale": self.config.get("rescaling_scale"),
-                    "skip_block_list": self.config.get("skip_block_list"),
-                    "guidance_timesteps": self.config.get("guidance_timesteps"),
-                    "timesteps": self.config.get("timesteps"),
-                    "num_inference_steps": self.config.get("num_inference_steps", 20)
-                })
-                print("\n[INFO] Executando pipeline de etapa única...")
-                with ctx:
-                    result = self.pipeline(**single_pass_kwargs)
-                latents = result.images
-                print(f"[DEBUG] Latentes (single-pass): shape={tuple(latents.shape)}")
-            # --- DECODIFICAÇÃO E CODIFICAÇÃO DE VÍDEO FINAL ---
-            latents_cpu = latents.detach().to("cpu", non_blocking=True)
-            if self.device == "cuda":
-                torch.cuda.empty_cache()
-                try: torch.cuda.ipc_collect()
-                except Exception: pass
-            lat_a, lat_b = self._dividir_latentes(latents_cpu)
-            if lat_a is not None:
-                lat_a1, lat_a2 = self._dividir_latentes(lat_a)
-            else:
-                lat_a1, lat_a2 = None, None
-            if lat_b is not None:
-                lat_b1, lat_b2 = self._dividir_latentes(lat_b)
-            else:
-                lat_b1, lat_b2 = None, None
-            latents_parts = [p for p in [lat_a1, lat_a2, lat_b1, lat_b2] if p is not None and p.shape[2] > 1]
-            if not latents_parts:
-                latents_parts = [latents_cpu]
-            temp_dir = tempfile.mkdtemp(prefix="ltxv_"); self._register_tmp_dir(temp_dir)
-            results_dir = "/app/output"; os.makedirs(results_dir, exist_ok=True)
-            partes_mp4 = []
-            par = 0
-            for part in latents_parts:
-                par += 1
-                print(f"[DEBUG] Partição {par}: {tuple(part.shape)}")
-                output_video_path = os.path.join(temp_dir, f"output_{used_seed}_{par}.mp4")
-                print("[DEBUG] Decodificando bloco de latentes com VAE → tensor de pixels...")
-                pixel_tensor = vae_manager_singleton.decode(
-                    part.to(self.device, non_blocking=True),
-                    decode_timestep=float(self.config.get("decode_timestep", 0.05))
-                )
-                log_tensor_info(pixel_tensor, "Pixel tensor (VAE saída)")
-                print("[DEBUG] Codificando MP4 a partir do tensor de pixels...")
-                video_encode_tool_singleton.save_video_from_tensor(
-                    pixel_tensor,
-                    output_video_path,
-                    fps=call_kwargs["frame_rate"],
-                    progress_callback=progress_callback
-                )
-                candidate = os.path.join(results_dir, f"output_par_{par}.mp4")
-                try:
-                    shutil.move(output_video_path, candidate)
-                    print(f"[DEBUG] MP4 parte {par} movido para {candidate}")
-                    partes_mp4.append(candidate)
-                except Exception as e:
-                    print(f"[DEBUG] Falha no move; usando tmp como final: {e}")
-                    partes_mp4.append(output_video_path)
-            final_concat = os.path.join(results_dir, f"concat_fim_{used_seed}.mp4")
-            if partes_mp4:
-                if len(partes_mp4) == 1:
-                    shutil.move(partes_mp4[0], final_concat)
-                    print(f"[DEBUG] Apenas uma parte, movida para {final_concat}")
-                else:
-                    self._concat_mp4s_no_reencode(partes_mp4, final_concat)
-            else:
-                print("[WARN] Nenhuma parte de vídeo foi gerada para concatenar.")
-                return None, used_seed
-            self._log_gpu_memory("Fim da Geração")
-            return final_concat, used_seed
         except Exception as e:
             print("[DEBUG] EXCEÇÃO NA GERAÇÃO:")
             print("".join(traceback.format_exception(type(e), e, e.__traceback__)))
             raise
         finally:
             try:
-                del latents, latents_low_res, latents_high_res, second_pass_result, first_pass_result, result
-            except NameError:
                 pass
-            except Exception as e:
-                print(f"[DEBUG] Erro na limpeza de variáveis: {e}")
             gc.collect()
-            if self.device == "cuda":
-                try:
                     torch.cuda.empty_cache()
-                    torch.cuda.ipc_collect()
-                except Exception as e:
-                    print(f"[DEBUG] Limpeza GPU no finally falhou: {e}")
             try:
                 self.finalize(keep_paths=[])
@@ -1086,4 +777,4 @@ class VideoService:
                 print(f"[DEBUG] finalize() no finally falhou: {e}")
 print("Criando instância do VideoService. O carregamento do modelo começará agora...")
-video_generation_service = VideoService()

         return out
+    def _dividir_latentes(self, latents_brutos):
+        total = latents_brutos.shape[2]  # dimensão temporal (número de latentes)
         if total % 2 == 1:  # ÍMPAR
+            # Ex: 11 → primeira 0..5, segunda 5..10
+            cut = total // 2
+            primeira = latents_brutos[:, :, :cut+1, :, :].clone()
+            segunda  = latents_brutos[:, :, cut:, :, :].clone()
         else:  # PAR
+            # Ex: 12 → primeira 0..5, segunda 5..11
+            cut = total // 2
+            primeira = latents_brutos[:, :, :cut, :, :].clone()
+            segunda  = latents_brutos[:, :, cut-1:, :, :].clone()
         return primeira, segunda
     def _concat_mp4s_no_reencode(self, mp4_list: List[str], out_path: str):
         """
                 pass
     def generate(
         self,
         prompt,
             print(f"[DEBUG] media_items shape={tuple(media.shape)}")
         latents = None
+        multi_scale_pipeline = None
+        try:
             if improve_texture:
                 if not self.latent_upsampler:
                     raise ValueError("Upscaler espacial não carregado.")
+                # --- INÍCIO DA IMPLEMENTAÇÃO LIMPA DOS 3 PASSOS ---
                 # --- PASSO 1: GERAÇÃO DE LATENTES EM BAIXA RESOLUÇÃO ---
                 print("[DEBUG] Multi-escala: Iniciando Passo 1 (geração de latentes base).")
                 first_pass_args = self.config.get("first_pass", {}).copy()
                 first_pass_kwargs = call_kwargs.copy()
+                # Carrega os parâmetros do config, incluindo listas de timesteps e guidance
                 first_pass_kwargs.update({
+                    "guidance_scale": first_pass_args.get("guidance_scale", guidance_scale),
                     "stg_scale": first_pass_args.get("stg_scale"),
                     "rescaling_scale": first_pass_args.get("rescaling_scale"),
                     "skip_block_list": first_pass_args.get("skip_block_list"),
+                    "guidance_timesteps": first_pass_args.get("guidance_timesteps"),
+                    "timesteps": first_pass_args.get("timesteps")
                 })
+                print(f"[DEBUG] Passo 1: Parâmetros do config carregados.")
+                # Calcula as dimensões de baixa resolução
                 downscale_factor = self.config.get("downscale_factor", 2)
                 original_height = first_pass_kwargs["height"]
                 original_width = first_pass_kwargs["width"]
                 divisor = 24
+                # Para downscale_factor < 1 (ex: 0.666), a lógica é multiplicar
+                if downscale_factor < 1.0:
+                    target_height_p1 = original_height * downscale_factor
+                    target_width_p1 = original_width * downscale_factor
+                else: # Para downscale_factor >= 1, a lógica é dividir
+                    target_height_p1 = original_height // downscale_factor
+                    target_width_p1 = original_width // downscale_factor
                 height_p1 = round(target_height_p1 / divisor) * divisor
                 if height_p1 == 0: height_p1 = divisor
                 first_pass_kwargs["height"] = height_p1
                 width_p1 = round(target_width_p1 / divisor) * divisor
                 if width_p1 == 0: width_p1 = divisor
                 first_pass_kwargs["width"] = width_p1
                 latents_low_res = first_pass_result.images
                 log_tensor_info(latents_low_res, "Latentes (Passo 1)")
+                del first_pass_result, first_pass_kwargs
                 gc.collect()
                 if self.device == "cuda": torch.cuda.empty_cache()
                 second_pass_args = self.config.get("second_pass", {}).copy()
                 second_pass_kwargs = call_kwargs.copy()
+                # Carrega os parâmetros do config para o segundo passo
                 second_pass_kwargs.update({
+                    "guidance_scale": second_pass_args.get("guidance_scale", guidance_scale),
                     "stg_scale": second_pass_args.get("stg_scale"),
                     "rescaling_scale": second_pass_args.get("rescaling_scale"),
                     "skip_block_list": second_pass_args.get("skip_block_list"),
+                    "guidance_timesteps": second_pass_args.get("guidance_timesteps"),
+                    "timesteps": second_pass_args.get("timesteps")
                 })
+                print(f"[DEBUG] Passo 2: Parâmetros do config carregados.")
+                # Define as dimensões de alta resolução com base no upscale
+                # O upsampler espacial dobra a resolução, então multiplicamos por 2
+                height_p2 = height_p1 * 2
+                width_p2 = width_p1 * 2
+                second_pass_kwargs["height"] = height_p2
+                second_pass_kwargs["width"] = width_p2
+                print(f"[DEBUG] Passo 2: Dimensões definidas para {height_p2}x{width_p2}")
+                # A entrada para o refinamento são os latentes que sofreram upscale
                 second_pass_kwargs["latents"] = latents_high_res
+                # Garante que 'strength' não seja passado, pois estamos controlando via timesteps
+                if "strength" in second_pass_kwargs:
+                    del second_pass_kwargs["strength"]
                 with ctx:
                     second_pass_result = self.pipeline(**second_pass_kwargs)
                 latents = second_pass_result.images
                 log_tensor_info(latents, "Latentes Finais (Passo 2)")
+        else:
                 single_pass_kwargs = call_kwargs.copy()
                 first_pass_config = self.config.get("first_pass", {})
                 single_pass_kwargs.update(
                 print(f"[DEBUG] Single-pass: timesteps_len={len(schedule) if schedule else 0}")
                 print("\n[INFO] Executando pipeline de etapa única...")
+                t_sp = time.perf_counter()
+                ctx = torch.autocast(device_type="cuda", dtype=self.runtime_autocast_dtype) if self.device == "cuda" else contextlib.nullcontext()
                 with ctx:
                     result = self.pipeline(**single_pass_kwargs)
+                print(f"[DEBUG] single-pass tempo={time.perf_counter()-t_sp:.3f}s")
+                if hasattr(result, "latents"):
+                    latents = result.latents
+                elif hasattr(result, "images") and isinstance(result.images, torch.Tensor):
+                    latents = result.images
+                else:
+                    latents = result
                 print(f"[DEBUG] Latentes (single-pass): shape={tuple(latents.shape)}")
+            # Staging e escrita MP4 (simples: VAE → pixels → MP4)
             latents_cpu = latents.detach().to("cpu", non_blocking=True)
+            torch.cuda.empty_cache()
+            try:
+                torch.cuda.ipc_collect()
+            except Exception:
+                pass
+            # 2) Divide em duas partes
             lat_a, lat_b = self._dividir_latentes(latents_cpu)
             lat_a1, lat_a2 = self._dividir_latentes(lat_a)
             lat_b1, lat_b2 = self._dividir_latentes(lat_b)
             partes_mp4 = []
             par = 0
+            for latents in latents_parts:
+                print(f"[DEBUG] Partição {par}: {tuple(latents.shape)}")
                 par = par + 1
                 output_video_path = os.path.join(temp_dir, f"output_{used_seed}_{par}.mp4")
+                final_output_path = None
                 print("[DEBUG] Decodificando bloco de latentes com VAE → tensor de pixels...")
+                # Usar manager com timestep por item; previne target_shape e rota NoneType.decode
                 pixel_tensor = vae_manager_singleton.decode(
+                    latents.to(self.device, non_blocking=True),
                     decode_timestep=float(self.config.get("decode_timestep", 0.05))
                 )
                 log_tensor_info(pixel_tensor, "Pixel tensor (VAE saída)")
+                print("[DEBUG] Codificando MP4 a partir do tensor de pixels (bloco inteiro)...")
                 video_encode_tool_singleton.save_video_from_tensor(
                     pixel_tensor,
                     output_video_path,
                     final_output_path = candidate
                     print(f"[DEBUG] MP4 parte {par} movido para {final_output_path}")
                     partes_mp4.append(final_output_path)
                 except Exception as e:
+                    final_output_path = output_video_path
                     print(f"[DEBUG] Falha no move; usando tmp como final: {e}")
             final_concat = os.path.join(results_dir, f"concat_fim_{used_seed}.mp4")
             self._log_gpu_memory("Fim da Geração")
             return final_concat, used_seed
         except Exception as e:
             print("[DEBUG] EXCEÇÃO NA GERAÇÃO:")
             print("".join(traceback.format_exception(type(e), e, e.__traceback__)))
             raise
         finally:
             try:
+                del latents
+            except Exception:
+                pass
+            try:
+                del multi_scale_pipeline
+            except Exception:
                 pass
             gc.collect()
+            try:
+                if self.device == "cuda":
                     torch.cuda.empty_cache()
+                    try:
+                        torch.cuda.ipc_collect()
+                    except Exception:
+                        pass
+            except Exception as e:
+                print(f"[DEBUG] Limpeza GPU no finally falhou: {e}")
             try:
                 self.finalize(keep_paths=[])
                 print(f"[DEBUG] finalize() no finally falhou: {e}")
 print("Criando instância do VideoService. O carregamento do modelo começará agora...")
+video_generation_service = VideoService()