Update api/ltx_server_refactored.py
api/ltx_server_refactored.py (+14 -14)
@@ -590,7 +590,7 @@ class VideoService:
        print(f"[DEBUG] Loading conditioning: {filepath}")
        tensor = self._load_image_to_tensor_with_resize_and_crop(filepath, height, width)
        tensor = torch.nn.functional.pad(tensor, padding_values)
-       out = tensor.to(self.device, dtype=self.runtime_autocast_dtype) if self.device == "cuda" else tensor.to(self.device)
+       out = tensor.to(self.transformer_devices[0], dtype=self.runtime_autocast_dtype) if self.transformer_devices[0] == "cuda" else tensor.to(self.transformer_devices[0])
        print(f"[DEBUG] Cond shape={tuple(out.shape)} dtype={out.dtype} device={out.device}")
        return out
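Like every other hunk in this commit, the change swaps `self.device` for `self.transformer_devices[0]` as the compute target, keeping the existing pattern of casting to the autocast dtype only on CUDA. One caveat: the literal comparison `self.transformer_devices[0] == "cuda"` is false for an indexed entry such as `"cuda:0"`, in which case the dtype cast is silently skipped. A minimal sketch of the same move with the check normalized through `torch.device` (the `to_compute` helper is hypothetical, not part of the file):

```python
import torch

def to_compute(tensor: torch.Tensor, device: str, dtype: torch.dtype) -> torch.Tensor:
    """Hypothetical helper mirroring the hunk above: cast only on CUDA,
    but normalize through torch.device so "cuda:0" is also recognized."""
    if torch.device(device).type == "cuda":
        return tensor.to(device, dtype=dtype)
    return tensor.to(device)
```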
@@ -642,7 +642,7 @@ class VideoService:
        print(f" - Output dimensions: {downscaled_height}x{downscaled_width}")

        # --- Pipeline execution ---
-       with torch.autocast(device_type=self.device.split(':')[0], dtype=self.runtime_autocast_dtype, enabled=(self.device == 'cuda')):
+       with torch.autocast(device_type=self.transformer_devices[0].split(':')[0], dtype=self.runtime_autocast_dtype, enabled=(self.transformer_devices[0] == 'cuda')):

            first_pass_kwargs = {
                "prompt": prompt,
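`torch.autocast` expects a device *type* (`"cuda"`, `"cpu"`), not an indexed device string, which is what the `split(':')[0]` is for. A self-contained sketch of the pattern, with `torch.bfloat16` standing in for `runtime_autocast_dtype`:

```python
import torch

# Assumed shape of transformer_devices[0]: a device string, possibly indexed.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# autocast wants the device type ("cuda"), not "cuda:0" -- hence the split.
with torch.autocast(device_type=device.split(':')[0],
                    dtype=torch.bfloat16,
                    enabled=device.startswith("cuda")):
    x = torch.randn(2, 2, device=device) @ torch.randn(2, 2, device=device)
```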
@@ -651,7 +651,7 @@ class VideoService:
                "width": downscaled_width,
                "num_frames": (actual_num_frames // 8) + 1,
                "frame_rate": int(DEFAULT_FPS),
-               "generator": torch.Generator(device=self.device).manual_seed(used_seed),
+               "generator": torch.Generator(device=self.transformer_devices[0]).manual_seed(used_seed),
                "output_type": "latent",
                "conditioning_items": conditioning_items,
                "guidance_scale": float(guidance_scale),
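A `torch.Generator` is bound to one device and has to live where the sampling happens; building it on `transformer_devices[0]` and seeding it with `used_seed` keeps the first pass reproducible. A minimal illustration:

```python
import torch

# The generator's device must match the device of the tensors it seeds.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
gen = torch.Generator(device=device).manual_seed(42)
noise = torch.randn(1, 4, 8, 8, generator=gen, device=device)
```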
@@ -874,17 +874,17 @@ class VideoService:
        print("[LOG] PHASE 1: Latent generation (Transformer on GPU)")
        self._set_generation_environment()

-       latents_to_refine = torch.load(latents_path).to(self.device)
+       latents_to_refine = torch.load(latents_path).to(self.transformer_devices[0])
        print(f" [LOG] Latents loaded onto the GPU. Shape: {latents_to_refine.shape}")

-       with torch.autocast(device_type=self.device.split(':')[0], dtype=self.runtime_autocast_dtype, enabled=(self.device == 'cuda')):
+       with torch.autocast(device_type=self.transformer_devices[0].split(':')[0], dtype=self.runtime_autocast_dtype, enabled=(self.device == 'cuda')):
            refine_height = latents_to_refine.shape[3] * self.pipeline.vae_scale_factor
            refine_width = latents_to_refine.shape[4] * self.pipeline.vae_scale_factor
            second_pass_kwargs = {
                "prompt": prompt, "negative_prompt": negative_prompt, "height": refine_height, "width": refine_width,
                "frame_rate": int(DEFAULT_FPS), "num_frames": latents_to_refine.shape[2],
                "latents": latents_to_refine, "guidance_scale": float(guidance_scale), "output_type": "latent",
-               "generator": torch.Generator(device=self.device).manual_seed(used_seed),
+               "generator": torch.Generator(device=self.transformer_devices[0]).manual_seed(used_seed),
                "conditioning_items": conditioning_items, **(self.config.get("second_pass", {}))
            }
            print(" [LOG] Sending to the refinement pipeline (Transformer)...")
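One inconsistency worth flagging: on the new line 880 the `enabled=` flag still reads `self.device`, while `device_type` now derives from `self.transformer_devices[0]`. If the two attributes ever point at different devices, autocast is toggled by the wrong one. A sketch deriving both from the same source:

```python
import torch

# Sketch: take device_type and enabled from the same device string, so the
# autocast toggle cannot drift from the device actually used for compute.
transformer_devices = ["cuda:0"] if torch.cuda.is_available() else ["cpu"]
dev = transformer_devices[0]
with torch.autocast(device_type=dev.split(':')[0],
                    dtype=torch.bfloat16,
                    enabled=dev.startswith("cuda")):
    pass  # pipeline call would go here
```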
@@ -907,7 +907,7 @@ class VideoService:

        if len(pontos_de_corte) == 1:
            pixel_tensor = vae_manager_singleton.decode(
-               final_latents_cpu.to(self.device),
+               final_latents_cpu.to(self.transformer_devices[0]),
                decode_timestep=float(self.config.get("decode_timestep", 0.05))
            ).cpu()
        else:
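The single-group path keeps the latents on the CPU, moves them to the compute device only for the decode call, and pulls the pixels straight back to host memory, so GPU residency stays bounded by one decode. The round trip as a hypothetical wrapper:

```python
import torch

def decode_on_device(decode_fn, latents_cpu: torch.Tensor, device: str) -> torch.Tensor:
    """Hypothetical wrapper for the round trip above: latents live on the CPU,
    visit `device` only for the duration of the decode, and the resulting
    pixels come straight back to host memory."""
    return decode_fn(latents_cpu.to(device)).cpu()
```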
@@ -922,7 +922,7 @@ class VideoService:
                print(f" -> Decoding group {i+1}/{len(pontos_de_corte)} (latents {start} to {end-1}), shape: {latent_chunk.shape}")

                pixel_chunk = vae_manager_singleton.decode(
-                   latent_chunk.to(self.device),
+                   latent_chunk.to(self.transformer_devices[0]),
                    decode_timestep=float(self.config.get("decode_timestep", 0.05))
                )
                pixel_chunks_list.append(pixel_chunk.cpu())
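The multi-group path applies the same offload per group: only one latent group is on the device at a time, and each pixel chunk is moved to the CPU as soon as it is produced. A sketch of the loop's shape, with `decode_fn` and the frame-axis concatenation as assumptions (the real code stitches groups via `pontos_de_corte` bookkeeping not shown in this hunk):

```python
import torch

def decode_in_groups(decode_fn, latents: torch.Tensor, cut_points, device: str) -> torch.Tensor:
    """Sketch of the grouped decode: one group resident on `device` at a
    time, pixels offloaded to the CPU immediately. `cut_points` are frame
    indices delimiting the groups; concatenating on dim 2 assumes the
    [B, C, F, H, W] latent layout used elsewhere in the file."""
    bounds = [0, *cut_points]
    pixel_chunks = []
    for start, end in zip(bounds[:-1], bounds[1:]):
        chunk = latents[:, :, start:end]
        pixel_chunks.append(decode_fn(chunk.to(device)).cpu())
    return torch.cat(pixel_chunks, dim=2)
```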
@@ -960,10 +960,10 @@ class VideoService:
        chunks = self._split_latents_with_overlap(latents)
        pixel_chunks = []

-       with torch.autocast(device_type=self.device.split(':')[0], dtype=self.runtime_autocast_dtype, enabled=(self.device == 'cuda')):
+       with torch.autocast(device_type=self.transformer_devices[0].split(':')[0], dtype=self.runtime_autocast_dtype, enabled=(self.transformer_devices[0] == 'cuda')):
            for chunk in chunks:
                if chunk.shape[2] == 0: continue
-               pixel_chunk = vae_manager_singleton.decode(chunk.to(self.device), decode_timestep=float(self.config.get("decode_timestep", 0.05)))
+               pixel_chunk = vae_manager_singleton.decode(chunk.to(self.transformer_devices[0]), decode_timestep=float(self.config.get("decode_timestep", 0.05)))
                pixel_chunks.append(pixel_chunk)

        final_pixel_tensor = self._merge_chunks_with_overlap(pixel_chunks)
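Here the chunks come from `_split_latents_with_overlap`, so neighbouring chunks share frames and the final merge can blend the seams away. A hypothetical stand-in for the splitter, assuming frames on dim 2 as elsewhere in the file:

```python
import torch

def split_with_overlap(latents: torch.Tensor, chunk: int = 16, overlap: int = 2):
    """Hypothetical stand-in for _split_latents_with_overlap: windows along
    the frame axis (dim 2) that share `overlap` frames with their neighbour,
    so the merge step can cross-fade the seams away."""
    frames = latents.shape[2]
    step = chunk - overlap
    return [latents[:, :, s:s + chunk] for s in range(0, frames, step)]
```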
@@ -1041,10 +1041,10 @@ class VideoService:

    def _move_models_to_device(self):
        """Move the loaded models to the compute device (GPU/CPU)."""
-       print(f"[INFO] Moving models to device: {self.device}")
-       self.pipeline.to(self.device)
+       print(f"[INFO] Moving models to device: {self.transformer_devices[0]}")
+       self.pipeline.to(self.transformer_devices[0])
        if self.latent_upsampler:
-           self.latent_upsampler.to(self.device)
+           self.latent_upsampler.to(self.transformer_devices[0])

    def _get_precision_dtype(self) -> torch.dtype:
        """Determine the dtype for autocast based on the precision setting."""
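The commit never shows how `self.transformer_devices` is populated; every hunk only reads index 0. Purely as an assumption about its shape, it behaves like a list of device strings whose first entry is the primary compute device:

```python
import torch

# Assumed initialization (not shown in this commit): a list of device
# strings whose first entry is the primary compute device.
transformer_devices = (
    [f"cuda:{i}" for i in range(torch.cuda.device_count())]
    if torch.cuda.is_available()
    else ["cpu"]
)
```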
@@ -1072,7 +1072,7 @@ class VideoService:
        """Load an image, resize, apply padding, and move it to the device."""
        tensor = self._load_image_to_tensor_with_resize_and_crop(filepath, height, width)
        tensor = F.pad(tensor, padding)
-       return tensor.to(self.device, dtype=self.runtime_autocast_dtype)
+       return tensor.to(self.transformer_devices[0], dtype=self.runtime_autocast_dtype)

    def _calculate_downscaled_dims(self, height: int, width: int) -> Tuple[int, int]:
        """Compute the dimensions for the first pass (low resolution)."""
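Unlike the conditioning loader at line 593, this helper casts to `self.runtime_autocast_dtype` unconditionally, even when the target is the CPU. For reference, `F.pad` consumes padding pairs from the last dimension inward, so for an image tensor the tuple reads (left, right, top, bottom):

```python
import torch
import torch.nn.functional as F

# F.pad pads from the last dimension inward: (left, right, top, bottom)
# for an image tensor shaped [..., H, W].
img = torch.zeros(3, 224, 224)
padded = F.pad(img, (8, 8, 4, 4))   # W: 224 -> 240, H: 224 -> 232
assert padded.shape == (3, 232, 240)
```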