eeuuia committed
Commit 60bd16c · verified · 1 Parent(s): 83047a1

Update api/gpu_manager.py

Files changed (1)
  1. api/gpu_manager.py +72 -42
api/gpu_manager.py CHANGED
@@ -1,15 +1,22 @@
- # api/gpu_manager.py (Version with Dedicated VAE)
+ # FILE: api/gpu_manager.py
+ # DESCRIPTION: A hardware-aware, service-agnostic GPU allocator for the ADUC-SDR suite.
+ # This module inspects available GPUs and partitions them according to a predefined
+ # strategy for LTX, SeedVR, and VINCIE services without importing them, thus
+ # preventing circular dependencies.

  import os
  import torch
  import math
+ import logging

  class GPUManager:
      """
-     Manages and intelligently allocates GPUs among LTX (with a dedicated VAE),
-     SeedVR, and VINCIE.
+     Manages and allocates available GPUs among different services.
+     It operates agnostically, providing device information without knowing
+     the specifics of the services that will use them.
      """
      def __init__(self):
+         """Initializes the manager, detects GPUs, and runs the allocation logic."""
          self.total_gpus = torch.cuda.device_count()
          self.ltx_main_gpus = []
          self.ltx_vae_gpu = []
@@ -18,77 +25,100 @@ class GPUManager:
          self._allocate_gpus()

      def _allocate_gpus(self):
-         print("="*60)
-         print("🤖 GPU Manager (with dedicated VAE for LTX)")
-         print(f" > Total GPUs detected: {self.total_gpus}")
+         """
+         Implements the GPU allocation strategy based on the total number of detected GPUs.
+         """
+         logging.info("="*60)
+         logging.info("🤖 Initializing GPU Manager (LTX, SeedVR, VINCIE)")
+         logging.info(f" > Total GPUs detected: {self.total_gpus}")

          all_indices = list(range(self.total_gpus))

          if self.total_gpus == 0:
-             print(" > No GPU detected. Operating in CPU mode.")
+             logging.warning(" > No GPUs detected. All services will operate in CPU mode.")
          elif self.total_gpus == 1:
-             print(" > 1 GPU: full sharing mode.")
+             logging.warning(" > 1 GPU detected. All services will share GPU 0. Memory swapping will be active.")
              self.ltx_main_gpus = [0]
-             self.ltx_vae_gpu = [0]  # Shares with the main GPU
+             self.ltx_vae_gpu = [0]  # Shares with the main LTX pipeline
              self.seedvr_gpus = [0]
              self.vincie_gpus = [0]
          elif self.total_gpus == 2:
-             print(" > 2 GPUs: LTX with dedicated VAE, others share the main GPU.")
+             logging.info(" > 2 GPUs detected. LTX will use a dedicated VAE device.")
              self.ltx_main_gpus = [0]
-             self.ltx_vae_gpu = [1]  # VAE takes the second GPU
-             self.seedvr_gpus = [0]  # Shares with main LTX
-             self.vincie_gpus = [0]  # Shares with main LTX
-         else:  # 3 or more GPUs
-             print(f" > {self.total_gpus} GPUs: Distributed allocation.")
-             # LTX always takes the first two GPUs when available
+             self.ltx_vae_gpu = [1]  # VAE gets the second GPU
+             self.seedvr_gpus = [0]  # Shares with main LTX
+             self.vincie_gpus = [0]  # Shares with main LTX
+         else:  # 3 or more GPUs
+             logging.info(f" > {self.total_gpus} GPUs detected. Distributing allocation.")
+             # LTX always gets the first two GPUs if available for optimal performance
              self.ltx_main_gpus = [0]
              self.ltx_vae_gpu = [1]

              remaining_gpus = all_indices[2:]

-             if not remaining_gpus:  # The exactly-2-GPU case is covered above, kept as a safeguard
-                 self.seedvr_gpus = [0]
-                 self.vincie_gpus = [0]
-             else:
-                 # The rest is divided between SeedVR and VINCIE
-                 vincie_count = max(1, math.ceil(len(remaining_gpus) / 2))
-                 seedvr_count = len(remaining_gpus) - vincie_count
-
-                 self.vincie_gpus = remaining_gpus[:vincie_count]
+             # The rest are divided between SeedVR and VINCIE
+             # VINCIE gets priority as it can scale well with more GPUs
+             vincie_count = max(1, math.ceil(len(remaining_gpus) / 2))
+             seedvr_count = len(remaining_gpus) - vincie_count
+
+             self.vincie_gpus = remaining_gpus[:vincie_count]
+             # If there are GPUs left, assign them to SeedVR
+             if seedvr_count > 0:
                  self.seedvr_gpus = remaining_gpus[vincie_count:]
+             else:
+                 # If no GPUs are left for SeedVR, it shares with the main LTX GPU
+                 self.seedvr_gpus = [0]

-         print(f" > Final Allocation:")
-         print(f" - LTX (Transformer): GPUs {self.ltx_main_gpus}")
-         print(f" - LTX (VAE): GPUs {self.ltx_vae_gpu}")
-         print(f" - SeedVR: GPUs {self.seedvr_gpus}")
-         print(f" - VINCIE: GPUs {self.vincie_gpus}")
-         print("="*60)
+         logging.info(f" > Final Allocation:")
+         logging.info(f" - LTX (Transformer): GPUs {self.ltx_main_gpus}")
+         logging.info(f" - LTX (VAE): GPU {self.ltx_vae_gpu[0] if self.ltx_vae_gpu else 'N/A'}")
+         logging.info(f" - SeedVR: GPUs {self.seedvr_gpus}")
+         logging.info(f" - VINCIE: GPUs {self.vincie_gpus}")
+         logging.info("="*60)

      def get_ltx_device(self) -> torch.device:
-         """Returns the main device for the LTX Transformer."""
+         """Returns the primary device for the LTX Transformer pipeline."""
          if not self.ltx_main_gpus:
              return torch.device("cpu")
          return torch.device(f"cuda:{self.ltx_main_gpus[0]}")

      def get_ltx_vae_device(self) -> torch.device:
-         """Returns the device dedicated to the LTX VAE."""
+         """Returns the dedicated device for the LTX VAE."""
          if not self.ltx_vae_gpu:
              return torch.device("cpu")
          return torch.device(f"cuda:{self.ltx_vae_gpu[0]}")

-     def get_seedvr_devices(self) -> list:
-         """Returns the list of GPU IDs for SeedVR."""
+     def get_seedvr_devices(self) -> list[int]:
+         """Returns the list of GPU indices for the SeedVR service."""
          return self.seedvr_gpus

-     def get_vincie_devices(self) -> list:
-         """Returns the list of GPU IDs for VINCIE."""
+     def get_vincie_devices(self) -> list[int]:
+         """Returns the list of GPU indices for the VINCIE service."""
          return self.vincie_gpus

      def requires_memory_swap(self) -> bool:
-         """Checks whether multiple services are sharing the same GPU."""
-         all_allocations = self.ltx_main_gpus + self.seedvr_gpus + self.vincie_gpus
-         # Swapping is needed if there are more allocations than unique GPUs
-         return len(all_allocations) > len(set(all_allocations))
+         """
+         Determines if memory swapping is necessary because multiple services
+         are sharing the same primary GPU.
+         The dedicated VAE GPU is not considered for swapping logic.
+         """
+         # Collect all GPUs used by the main, memory-intensive parts of the services
+         all_main_allocations = self.ltx_main_gpus + self.seedvr_gpus + self.vincie_gpus
+
+         # Count how many services are allocated to each unique GPU
+         gpu_usage_count = {}
+         for gpu_idx in all_main_allocations:
+             gpu_usage_count[gpu_idx] = gpu_usage_count.get(gpu_idx, 0) + 1
+
+         # Swapping is required if any GPU is used by more than one service
+         for gpu_idx in gpu_usage_count:
+             if gpu_usage_count[gpu_idx] > 1:
+                 logging.warning(f"Memory swapping is ACTIVE because GPU {gpu_idx} is shared by multiple services.")
+                 return True
+
+         logging.info("Memory swapping is INACTIVE. Each service has dedicated primary GPUs.")
+         return False

-     # Global instance
+ # --- Singleton Instantiation ---
+ # This global instance is created once and imported by all other modules.
  gpu_manager = GPUManager()
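
For reference, here is a minimal sketch of how a service module would consume this allocator. Only the `api.gpu_manager` import path, the `gpu_manager` singleton, and its methods come from this commit; the consumer function and the `transformer`/`vae` model objects are hypothetical placeholders.

# Hypothetical consumer sketch; illustrative only, not part of this commit.
import torch

from api.gpu_manager import gpu_manager

def place_ltx(transformer: torch.nn.Module, vae: torch.nn.Module) -> None:
    """Moves the two LTX components onto the devices chosen by the allocator."""
    transformer.to(gpu_manager.get_ltx_device())  # cuda:0, or cpu when no GPU exists
    vae.to(gpu_manager.get_ltx_vae_device())      # cuda:1 on hosts with 2+ GPUs

    if gpu_manager.requires_memory_swap():
        # On 1- and 2-GPU hosts the services share cuda:0, so a real service
        # would offload idle models (e.g. model.to("cpu")) before another runs.
        pass

As a worked example of the 3-or-more branch: on a 5-GPU host, remaining_gpus is [2, 3, 4], so vincie_count = max(1, ceil(3/2)) = 2, leaving VINCIE with GPUs [2, 3] and SeedVR with [4]; requires_memory_swap() then returns False, since the primary allocations [0], [4], and [2, 3] never overlap.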