Carlos s committed on
Commit f70421a · verified · 1 Parent(s): 79febce

Update api/ltx_server.py

Files changed (1):
  1. api/ltx_server.py  +308 -4
api/ltx_server.py CHANGED
@@ -765,15 +765,319 @@ class VideoService:
             self._log_gpu_memory("Fim da Geração")
             return final_concat, used_seed
 
+        except Exception as e:
+            print("[DEBUG] EXCEÇÃO NA GERAÇÃO:")
+            print("".join(traceback.format_exception(type(e), e, e.__traceback__)))
+            raise
+    # ltx_server.py
+
+    def generate(
+        self,
+        prompt,
+        negative_prompt,
+        mode="text-to-video",
+        start_image_filepath=None,
+        middle_image_filepath=None,
+        middle_frame_number=None,
+        middle_image_weight=1.0,
+        end_image_filepath=None,
+        end_image_weight=1.0,
+        input_video_filepath=None,
+        height=512,
+        width=704,
+        duration=2.0,
+        frames_to_use=9,
+        seed=42,
+        randomize_seed=True,
+        guidance_scale=3.0,
+        improve_texture=True,
+        progress_callback=None,
+        external_decode=True,
+    ):
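+        # Single entry point for text-to-video, image-to-video and video-to-video.
+        # The body runs in three broad phases: input validation/setup, denoising
+        # (single- or multi-scale), and chunked VAE decode + MP4 encode.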
+        t_all = time.perf_counter()
+        print(f"[DEBUG] generate() begin mode={mode} external_decode={external_decode} improve_texture={improve_texture}")
+        if self.device == "cuda":
+            torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
+        self._log_gpu_memory("Início da Geração")
+
+        if mode == "image-to-video" and not start_image_filepath:
+            raise ValueError("A imagem de início é obrigatória para o modo image-to-video")
+        if mode == "video-to-video" and not input_video_filepath:
+            raise ValueError("O vídeo de entrada é obrigatório para o modo video-to-video")
+
+        used_seed = random.randint(0, 2**32 - 1) if randomize_seed else int(seed)
+        seed_everething(used_seed); print(f"[DEBUG] Seed usado: {used_seed}")
+
+        FPS = 24.0; MAX_NUM_FRAMES = 2570
+        target_frames_rounded = round(duration * FPS)
+        n_val = round((float(target_frames_rounded) - 1.0) / 8.0)
+        actual_num_frames = max(9, min(MAX_NUM_FRAMES, int(n_val * 8 + 1)))
+        print(f"[DEBUG] Frames alvo: {actual_num_frames} (dur={duration}s @ {FPS}fps)")
+
+        height_padded = ((height - 1) // 32 + 1) * 32
+        width_padded = ((width - 1) // 32 + 1) * 32
+        padding_values = calculate_padding(height, width, height_padded, width_padded)
+        print(f"[DEBUG] Dimensões: ({height},{width}) -> pad ({height_padded},{width_padded}); padding={padding_values}")
+
+        generator = torch.Generator(device=self.device).manual_seed(used_seed)
+        conditioning_items = []
+
+        if mode == "image-to-video":
+            start_tensor = self._prepare_conditioning_tensor(start_image_filepath, height, width, padding_values)
+            conditioning_items.append(ConditioningItem(start_tensor, 0, 1.0))
+            if middle_image_filepath and middle_frame_number is not None:
+                middle_tensor = self._prepare_conditioning_tensor(middle_image_filepath, height, width, padding_values)
+                safe_middle_frame = max(0, min(int(middle_frame_number), actual_num_frames - 1))
+                conditioning_items.append(ConditioningItem(middle_tensor, safe_middle_frame, float(middle_image_weight)))
+            if end_image_filepath:
+                end_tensor = self._prepare_conditioning_tensor(end_image_filepath, height, width, padding_values)
+                last_frame_index = actual_num_frames - 1
+                conditioning_items.append(ConditioningItem(end_tensor, last_frame_index, float(end_image_weight)))
+        print(f"[DEBUG] Conditioning items: {len(conditioning_items)}")
+
+        call_kwargs = {
+            "prompt": prompt,
+            "negative_prompt": negative_prompt,
+            "height": height_padded,
+            "width": width_padded,
+            "num_frames": actual_num_frames,
+            "frame_rate": int(FPS),
+            "generator": generator,
+            "output_type": "latent",
+            "conditioning_items": conditioning_items if conditioning_items else None,
+            "media_items": None,
+            "decode_timestep": self.config["decode_timestep"],
+            "decode_noise_scale": self.config["decode_noise_scale"],
+            "stochastic_sampling": self.config["stochastic_sampling"],
+            "image_cond_noise_scale": 0.01,
+            "is_video": True,
+            "vae_per_channel_normalize": True,
+            "mixed_precision": (self.config["precision"] == "mixed_precision"),
+            "offload_to_cpu": False,
+            "enhance_prompt": False,
+            "skip_layer_strategy": SkipLayerStrategy.AttentionValues,
+        }
+        print(f"[DEBUG] output_type={call_kwargs['output_type']} skip_layer_strategy={call_kwargs['skip_layer_strategy']}")
+
+        if mode == "video-to-video":
+            media = load_media_file(
+                media_path=input_video_filepath,
+                height=height,
+                width=width,
+                max_frames=int(frames_to_use),
+                padding=padding_values,
+            ).to(self.device)
+            call_kwargs["media_items"] = media
+            print(f"[DEBUG] media_items shape={tuple(media.shape)}")
+
+        latents = None
+
+        try:
+            ctx = torch.autocast(device_type="cuda", dtype=self.runtime_autocast_dtype) if self.device == "cuda" else contextlib.nullcontext()
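+            # One autocast context is reused for every pipeline/upsampler call
+            # below; on CPU it degrades to a no-op nullcontext.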
+
+            if improve_texture:
+                if not self.latent_upsampler:
+                    raise ValueError("Upscaler espacial não carregado.")
+
+                # --- STEP 1: LOW-RESOLUTION LATENT GENERATION ---
+                print("[DEBUG] Multi-escala: Iniciando Passo 1 (geração de latentes base).")
+
+                first_pass_args = self.config.get("first_pass", {}).copy()
+                first_pass_kwargs = call_kwargs.copy()
+                first_pass_kwargs.update({
+                    "guidance_scale": float(guidance_scale),
+                    "stg_scale": first_pass_args.get("stg_scale"),
+                    "rescaling_scale": first_pass_args.get("rescaling_scale"),
+                    "skip_block_list": first_pass_args.get("skip_block_list"),
+                })
+                schedule = first_pass_args.get("timesteps") or first_pass_args.get("guidance_timesteps")
+                if schedule:
+                    first_pass_kwargs["timesteps"] = schedule
+                    first_pass_kwargs["guidance_timesteps"] = schedule
+
+                downscale_factor = self.config.get("downscale_factor", 2)
+                original_height = first_pass_kwargs["height"]
+                original_width = first_pass_kwargs["width"]
+                divisor = 24
+
+                target_height_p1 = original_height // downscale_factor
+                height_p1 = round(target_height_p1 / divisor) * divisor
+                if height_p1 == 0: height_p1 = divisor
+                first_pass_kwargs["height"] = height_p1
+
+                target_width_p1 = original_width // downscale_factor
+                width_p1 = round(target_width_p1 / divisor) * divisor
+                if width_p1 == 0: width_p1 = divisor
+                first_pass_kwargs["width"] = width_p1
+
+                print(f"[DEBUG] Passo 1: Dimensões reduzidas e ajustadas para {height_p1}x{width_p1}")
+
+                with ctx:
+                    first_pass_result = self.pipeline(**first_pass_kwargs)
+
+                latents_low_res = first_pass_result.images
+                log_tensor_info(latents_low_res, "Latentes (Passo 1)")
+
+                del first_pass_result
+                gc.collect()
+                if self.device == "cuda": torch.cuda.empty_cache()
+
+                # --- INTERMEDIATE STEP: LATENT UPSCALING ---
+                print("[DEBUG] Multi-escala: Fazendo upscale dos latentes com latent_upsampler.")
+                with ctx:
+                    latents_high_res = self.latent_upsampler(latents_low_res)
+
+                log_tensor_info(latents_high_res, "Latentes (Pós-Upscale)")
+                del latents_low_res
+                gc.collect()
+                if self.device == "cuda": torch.cuda.empty_cache()
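+                # The latent upsampler appears to double the latents' spatial size,
+                # which is why pass 2 below sets height/width to height_p1*2 x
+                # width_p1*2 "to match the upscale".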
+
+                # --- STEP 2: HIGH-RESOLUTION REFINEMENT ---
+                print("[DEBUG] Multi-escala: Iniciando Passo 2 (refinamento em alta resolução).")
+                second_pass_args = self.config.get("second_pass", {}).copy()
+                second_pass_kwargs = call_kwargs.copy()
+
+                height_p2 = height_p1 * 2
+                width_p2 = width_p1 * 2
+                second_pass_kwargs["height"] = height_p2
+                second_pass_kwargs["width"] = width_p2
+                print(f"[DEBUG] Passo 2: Dimensões definidas para {height_p2}x{width_p2} para corresponder ao upscale.")
+
+                second_pass_kwargs.update({
+                    "guidance_scale": float(guidance_scale),
+                    "stg_scale": second_pass_args.get("stg_scale"),
+                    "rescaling_scale": second_pass_args.get("rescaling_scale"),
+                    "skip_block_list": second_pass_args.get("skip_block_list"),
+                })
+
+                schedule_p2 = second_pass_args.get("timesteps") or second_pass_args.get("guidance_timesteps")
+                if schedule_p2:
+                    timesteps_para_refinamento = schedule_p2
+                    print(f"[DEBUG] Passo 2: Usando {len(timesteps_para_refinamento)} timesteps pré-definidos do config para refinamento.")
+                else:
+                    strength_p2 = second_pass_args.get("strength", second_pass_args.get("denoising_strength", 0.4))
+                    num_steps_passo2_total = second_pass_args.get("num_inference_steps", 20)
+
+                    self.pipeline.scheduler.set_timesteps(num_steps_passo2_total, device=self.device)
+                    todos_os_timesteps_p2 = self.pipeline.scheduler.timesteps
+
+                    ponto_de_corte = int(len(todos_os_timesteps_p2) * (1.0 - strength_p2))
+                    timesteps_para_refinamento = todos_os_timesteps_p2[ponto_de_corte:]
+                    print(f"[DEBUG] Passo 2: Calculando {len(timesteps_para_refinamento)} timesteps manuais (strength ≈ {strength_p2})")
+
+                second_pass_kwargs["timesteps"] = timesteps_para_refinamento
+                if "strength" in second_pass_kwargs: del second_pass_kwargs["strength"]
+
+                second_pass_kwargs["latents"] = latents_high_res
+
+                num_timesteps_p2 = len(timesteps_para_refinamento)
+                if 'guidance_mapping' not in second_pass_kwargs:
+                    second_pass_kwargs['guidance_mapping'] = list(range(num_timesteps_p2))
+                    print(f"[DEBUG] Passo 2: Injetando 'guidance_mapping' de identidade com {num_timesteps_p2} passos.")
+
+                with ctx:
+                    second_pass_result = self.pipeline(**second_pass_kwargs)
+
+                latents = second_pass_result.images
+                log_tensor_info(latents, "Latentes Finais (Passo 2)")
+
+            else:
+                # --- SINGLE-PASS GENERATION ---
+                single_pass_kwargs = call_kwargs.copy()
+                first_pass_config = self.config.get("first_pass", {})
+                single_pass_kwargs.update({
+                    "guidance_scale": float(guidance_scale),
+                    "stg_scale": first_pass_config.get("stg_scale"),
+                    "rescaling_scale": first_pass_config.get("rescaling_scale"),
+                    "skip_block_list": first_pass_config.get("skip_block_list"),
+                })
+                schedule = first_pass_config.get("timesteps") or first_pass_config.get("guidance_timesteps")
+                if mode == "video-to-video":
+                    schedule = [0.7]; print("[INFO] Modo video-to-video (etapa única): timesteps=[0.7]")
+                if isinstance(schedule, (list, tuple)) and len(schedule) > 0:
+                    single_pass_kwargs["timesteps"] = schedule
+                    single_pass_kwargs["guidance_timesteps"] = schedule
+                print(f"[DEBUG] Single-pass: timesteps_len={len(schedule) if schedule else 0}")
+
+                print("\n[INFO] Executando pipeline de etapa única...")
+                with ctx:
+                    result = self.pipeline(**single_pass_kwargs)
+
+                latents = result.images
+                print(f"[DEBUG] Latentes (single-pass): shape={tuple(latents.shape)}")
+
+            # --- FINAL VIDEO DECODE AND ENCODE ---
+            latents_cpu = latents.detach().to("cpu", non_blocking=True)
+            if self.device == "cuda":
+                torch.cuda.empty_cache()
+                try: torch.cuda.ipc_collect()
+                except Exception: pass
+
+            lat_a, lat_b = self._dividir_latentes(latents_cpu)
+            lat_a1, lat_a2 = self._dividir_latentes(lat_a)
+            lat_b1, lat_b2 = self._dividir_latentes(lat_b)
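+            # Two rounds of halving yield four temporal chunks; decoding them one at
+            # a time (each chunk round-trips CPU -> GPU -> CPU below) presumably keeps
+            # peak VAE memory bounded.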
+
+            latents_parts = [lat_a1, lat_a2, lat_b1, lat_b2]
+
+            temp_dir = tempfile.mkdtemp(prefix="ltxv_"); self._register_tmp_dir(temp_dir)
+            results_dir = "/app/output"; os.makedirs(results_dir, exist_ok=True)
+
+            partes_mp4 = []
+            par = 0
+
+            for part in latents_parts:
+                par += 1
+                if part is None: continue
+                print(f"[DEBUG] Partição {par}: {tuple(part.shape)}")
+                output_video_path = os.path.join(temp_dir, f"output_{used_seed}_{par}.mp4")
+
+                print("[DEBUG] Decodificando bloco de latentes com VAE → tensor de pixels...")
+                pixel_tensor = vae_manager_singleton.decode(
+                    part.to(self.device, non_blocking=True),
+                    decode_timestep=float(self.config.get("decode_timestep", 0.05))
+                )
+                log_tensor_info(pixel_tensor, "Pixel tensor (VAE saída)")
+
+                print("[DEBUG] Codificando MP4 a partir do tensor de pixels...")
+                video_encode_tool_singleton.save_video_from_tensor(
+                    pixel_tensor,
+                    output_video_path,
+                    fps=call_kwargs["frame_rate"],
+                    progress_callback=progress_callback
+                )
+
+                candidate = os.path.join(results_dir, f"output_par_{par}.mp4")
+                try:
+                    shutil.move(output_video_path, candidate)
+                    print(f"[DEBUG] MP4 parte {par} movido para {candidate}")
+                    partes_mp4.append(candidate)
+                except Exception as e:
+                    print(f"[DEBUG] Falha no move; usando tmp como final: {e}")
+                    partes_mp4.append(output_video_path)
+
+            final_concat = os.path.join(results_dir, f"concat_fim_{used_seed}.mp4")
+            if partes_mp4:
+                if len(partes_mp4) == 1:
+                    shutil.move(partes_mp4[0], final_concat)
+                    print(f"[DEBUG] Apenas uma parte, movida para {final_concat}")
+                else:
+                    self._concat_mp4s_no_reencode(partes_mp4, final_concat)
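+                    # The helper's name suggests an ffmpeg-style stream-copy concat
+                    # (no re-encode); the parts share resolution and fps, so a
+                    # lossless join should be safe.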
+            else:
+                print("[WARN] Nenhuma parte de vídeo foi gerada para concatenar.")
+                return None, used_seed
+
+            self._log_gpu_memory("Fim da Geração")
+            return final_concat, used_seed
+
         except Exception as e:
             print("[DEBUG] EXCEÇÃO NA GERAÇÃO:")
             print("".join(traceback.format_exception(type(e), e, e.__traceback__)))
             raise
         finally:
-            try:
-                del latents, latents_low_res, latents_high_res, second_pass_result
-            except Exception:
-                pass
+            # Clean up locals to release memory
+            try: del latents, latents_low_res, latents_high_res, second_pass_result
+            except NameError: pass
+            except Exception as e: print(f"[DEBUG] Erro na limpeza de variáveis: {e}")
 
             gc.collect()
             if self.device == "cuda":