Spaces:
Paused
Paused
Update api/gpu_manager.py
Browse files- api/gpu_manager.py +72 -42
api/gpu_manager.py
CHANGED
|
@@ -1,15 +1,22 @@
|
|
| 1 |
-
# api/gpu_manager.py
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import os
|
| 4 |
import torch
|
| 5 |
import math
|
|
|
|
| 6 |
|
| 7 |
class GPUManager:
|
| 8 |
"""
|
| 9 |
-
|
| 10 |
-
|
|
|
|
| 11 |
"""
|
| 12 |
def __init__(self):
|
|
|
|
| 13 |
self.total_gpus = torch.cuda.device_count()
|
| 14 |
self.ltx_main_gpus = []
|
| 15 |
self.ltx_vae_gpu = []
|
|
@@ -18,77 +25,100 @@ class GPUManager:
|
|
| 18 |
self._allocate_gpus()
|
| 19 |
|
| 20 |
def _allocate_gpus(self):
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
all_indices = list(range(self.total_gpus))
|
| 26 |
|
| 27 |
if self.total_gpus == 0:
|
| 28 |
-
|
| 29 |
elif self.total_gpus == 1:
|
| 30 |
-
|
| 31 |
self.ltx_main_gpus = [0]
|
| 32 |
-
self.ltx_vae_gpu = [0] #
|
| 33 |
self.seedvr_gpus = [0]
|
| 34 |
self.vincie_gpus = [0]
|
| 35 |
elif self.total_gpus == 2:
|
| 36 |
-
|
| 37 |
self.ltx_main_gpus = [0]
|
| 38 |
-
self.ltx_vae_gpu = [1] # VAE
|
| 39 |
-
self.seedvr_gpus = [0] #
|
| 40 |
-
self.vincie_gpus = [0] #
|
| 41 |
-
else: # 3
|
| 42 |
-
|
| 43 |
-
# LTX
|
| 44 |
self.ltx_main_gpus = [0]
|
| 45 |
self.ltx_vae_gpu = [1]
|
| 46 |
|
| 47 |
remaining_gpus = all_indices[2:]
|
| 48 |
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
self.vincie_gpus = remaining_gpus[:vincie_count]
|
| 58 |
self.seedvr_gpus = remaining_gpus[vincie_count:]
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
|
| 67 |
def get_ltx_device(self) -> torch.device:
|
| 68 |
-
"""
|
| 69 |
if not self.ltx_main_gpus:
|
| 70 |
return torch.device("cpu")
|
| 71 |
return torch.device(f"cuda:{self.ltx_main_gpus[0]}")
|
| 72 |
|
| 73 |
def get_ltx_vae_device(self) -> torch.device:
|
| 74 |
-
"""
|
| 75 |
if not self.ltx_vae_gpu:
|
| 76 |
return torch.device("cpu")
|
| 77 |
return torch.device(f"cuda:{self.ltx_vae_gpu[0]}")
|
| 78 |
|
| 79 |
-
def get_seedvr_devices(self) ->
|
| 80 |
-
"""
|
| 81 |
return self.seedvr_gpus
|
| 82 |
|
| 83 |
-
def get_vincie_devices(self) ->
|
| 84 |
-
"""
|
| 85 |
return self.vincie_gpus
|
| 86 |
|
| 87 |
def requires_memory_swap(self) -> bool:
|
| 88 |
-
"""
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
-
#
|
|
|
|
| 94 |
gpu_manager = GPUManager()
|
|
|
|
| 1 |
+
# FILE: api/gpu_manager.py
|
| 2 |
+
# DESCRIPTION: A hardware-aware, service-agnostic GPU allocator for the ADUC-SDR suite.
|
| 3 |
+
# This module inspects available GPUs and partitions them according to a predefined
|
| 4 |
+
# strategy for LTX, SeedVR, and VINCIE services without importing them, thus
|
| 5 |
+
# preventing circular dependencies.
|
| 6 |
|
import logging
import math
import os
from collections import Counter
from typing import List

import torch
|
| 11 |
|
| 12 |
class GPUManager:
|
| 13 |
"""
|
| 14 |
+
Manages and allocates available GPUs among different services.
|
| 15 |
+
It operates agnostically, providing device information without knowing
|
| 16 |
+
the specifics of the services that will use them.
|
| 17 |
"""
|
| 18 |
def __init__(self):
|
| 19 |
+
"""Initializes the manager, detects GPUs, and runs the allocation logic."""
|
| 20 |
self.total_gpus = torch.cuda.device_count()
|
| 21 |
self.ltx_main_gpus = []
|
| 22 |
self.ltx_vae_gpu = []
|
|
|
|
| 25 |
self._allocate_gpus()
|
| 26 |
|
| 27 |
def _allocate_gpus(self):
|
| 28 |
+
"""
|
| 29 |
+
Implements the GPU allocation strategy based on the total number of detected GPUs.
|
| 30 |
+
"""
|
| 31 |
+
logging.info("="*60)
|
| 32 |
+
logging.info("🤖 Initializing GPU Manager (LTX, SeedVR, VINCIE)")
|
| 33 |
+
logging.info(f" > Total GPUs detected: {self.total_gpus}")
|
| 34 |
|
| 35 |
all_indices = list(range(self.total_gpus))
|
| 36 |
|
| 37 |
if self.total_gpus == 0:
|
| 38 |
+
logging.warning(" > No GPUs detected. All services will operate in CPU mode.")
|
| 39 |
elif self.total_gpus == 1:
|
| 40 |
+
logging.warning(" > 1 GPU detected. All services will share GPU 0. Memory swapping will be active.")
|
| 41 |
self.ltx_main_gpus = [0]
|
| 42 |
+
self.ltx_vae_gpu = [0] # Shares with the main LTX pipeline
|
| 43 |
self.seedvr_gpus = [0]
|
| 44 |
self.vincie_gpus = [0]
|
| 45 |
elif self.total_gpus == 2:
|
| 46 |
+
logging.info(" > 2 GPUs detected. LTX will use a dedicated VAE device.")
|
| 47 |
self.ltx_main_gpus = [0]
|
| 48 |
+
self.ltx_vae_gpu = [1] # VAE gets the second GPU
|
| 49 |
+
self.seedvr_gpus = [0] # Shares with main LTX
|
| 50 |
+
self.vincie_gpus = [0] # Shares with main LTX
|
| 51 |
+
else: # 3 or more GPUs
|
| 52 |
+
logging.info(f" > {self.total_gpus} GPUs detected. Distributing allocation.")
|
| 53 |
+
# LTX always gets the first two GPUs if available for optimal performance
|
| 54 |
self.ltx_main_gpus = [0]
|
| 55 |
self.ltx_vae_gpu = [1]
|
| 56 |
|
| 57 |
remaining_gpus = all_indices[2:]
|
| 58 |
|
| 59 |
+
# The rest are divided between SeedVR and VINCIE
|
| 60 |
+
# VINCIE gets priority as it can scale well with more GPUs
|
| 61 |
+
vincie_count = max(1, math.ceil(len(remaining_gpus) / 2))
|
| 62 |
+
seedvr_count = len(remaining_gpus) - vincie_count
|
| 63 |
+
|
| 64 |
+
self.vincie_gpus = remaining_gpus[:vincie_count]
|
| 65 |
+
# If there are GPUs left, assign them to SeedVR
|
| 66 |
+
if seedvr_count > 0:
|
|
|
|
| 67 |
self.seedvr_gpus = remaining_gpus[vincie_count:]
|
| 68 |
+
else:
|
| 69 |
+
# If no GPUs are left for SeedVR, it shares with the main LTX GPU
|
| 70 |
+
self.seedvr_gpus = [0]
|
| 71 |
|
| 72 |
+
logging.info(f" > Final Allocation:")
|
| 73 |
+
logging.info(f" - LTX (Transformer): GPUs {self.ltx_main_gpus}")
|
| 74 |
+
logging.info(f" - LTX (VAE): GPU {self.ltx_vae_gpu[0] if self.ltx_vae_gpu else 'N/A'}")
|
| 75 |
+
logging.info(f" - SeedVR: GPUs {self.seedvr_gpus}")
|
| 76 |
+
logging.info(f" - VINCIE: GPUs {self.vincie_gpus}")
|
| 77 |
+
logging.info("="*60)
|
| 78 |
|
| 79 |
def get_ltx_device(self) -> torch.device:
|
| 80 |
+
"""Returns the primary device for the LTX Transformer pipeline."""
|
| 81 |
if not self.ltx_main_gpus:
|
| 82 |
return torch.device("cpu")
|
| 83 |
return torch.device(f"cuda:{self.ltx_main_gpus[0]}")
|
| 84 |
|
| 85 |
def get_ltx_vae_device(self) -> torch.device:
|
| 86 |
+
"""Returns the dedicated device for the LTX VAE."""
|
| 87 |
if not self.ltx_vae_gpu:
|
| 88 |
return torch.device("cpu")
|
| 89 |
return torch.device(f"cuda:{self.ltx_vae_gpu[0]}")
|
| 90 |
|
| 91 |
+
def get_seedvr_devices(self) -> List[int]:
|
| 92 |
+
"""Returns the list of GPU indices for the SeedVR service."""
|
| 93 |
return self.seedvr_gpus
|
| 94 |
|
| 95 |
+
def get_vincie_devices(self) -> List[int]:
|
| 96 |
+
"""Returns the list of GPU indices for the VINCIE service."""
|
| 97 |
return self.vincie_gpus
|
| 98 |
|
| 99 |
def requires_memory_swap(self) -> bool:
|
| 100 |
+
"""
|
| 101 |
+
Determines if memory swapping is necessary because multiple services
|
| 102 |
+
are sharing the same primary GPU.
|
| 103 |
+
The dedicated VAE GPU is not considered for swapping logic.
|
| 104 |
+
"""
|
| 105 |
+
# Collect all GPUs used by the main, memory-intensive parts of the services
|
| 106 |
+
all_main_allocations = self.ltx_main_gpus + self.seedvr_gpus + self.vincie_gpus
|
| 107 |
+
|
| 108 |
+
# Count how many services are allocated to each unique GPU
|
| 109 |
+
gpu_usage_count = {}
|
| 110 |
+
for gpu_idx in all_main_allocations:
|
| 111 |
+
gpu_usage_count[gpu_idx] = gpu_usage_count.get(gpu_idx, 0) + 1
|
| 112 |
+
|
| 113 |
+
# Swapping is required if any GPU is used by more than one service
|
| 114 |
+
for gpu_idx in gpu_usage_count:
|
| 115 |
+
if gpu_usage_count[gpu_idx] > 1:
|
| 116 |
+
logging.warning(f"Memory swapping is ACTIVE because GPU {gpu_idx} is shared by multiple services.")
|
| 117 |
+
return True
|
| 118 |
+
|
| 119 |
+
logging.info("Memory swapping is INACTIVE. Each service has dedicated primary GPUs.")
|
| 120 |
+
return False
|
| 121 |
|
| 122 |
+
# --- Singleton Instantiation ---
# This global instance is created once and imported by all other modules.
# NOTE(review): constructing it at import time calls torch.cuda.device_count()
# and emits the allocation log — confirm import-time side effects are acceptable.
gpu_manager = GPUManager()
|