Spaces:
Paused
Paused
Update api/gpu_manager.py
Browse files- api/gpu_manager.py +72 -42
api/gpu_manager.py
CHANGED
|
@@ -1,15 +1,22 @@
|
|
| 1 |
-
# api/gpu_manager.py
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import os
|
| 4 |
import torch
|
| 5 |
import math
|
|
|
|
| 6 |
|
| 7 |
class GPUManager:
|
| 8 |
"""
|
| 9 |
-
|
| 10 |
-
|
|
|
|
| 11 |
"""
|
| 12 |
def __init__(self):
|
|
|
|
| 13 |
self.total_gpus = torch.cuda.device_count()
|
| 14 |
self.ltx_main_gpus = []
|
| 15 |
self.ltx_vae_gpu = []
|
|
@@ -18,77 +25,100 @@ class GPUManager:
|
|
| 18 |
self._allocate_gpus()
|
| 19 |
|
| 20 |
def _allocate_gpus(self):
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
all_indices = list(range(self.total_gpus))
|
| 26 |
|
| 27 |
if self.total_gpus == 0:
|
| 28 |
-
|
| 29 |
elif self.total_gpus == 1:
|
| 30 |
-
|
| 31 |
self.ltx_main_gpus = [0]
|
| 32 |
-
self.ltx_vae_gpu = [0] #
|
| 33 |
self.seedvr_gpus = [0]
|
| 34 |
self.vincie_gpus = [0]
|
| 35 |
elif self.total_gpus == 2:
|
| 36 |
-
|
| 37 |
self.ltx_main_gpus = [0]
|
| 38 |
-
self.ltx_vae_gpu = [1] # VAE
|
| 39 |
-
self.seedvr_gpus = [0] #
|
| 40 |
-
self.vincie_gpus = [0] #
|
| 41 |
-
else: # 3
|
| 42 |
-
|
| 43 |
-
# LTX
|
| 44 |
self.ltx_main_gpus = [0]
|
| 45 |
self.ltx_vae_gpu = [1]
|
| 46 |
|
| 47 |
remaining_gpus = all_indices[2:]
|
| 48 |
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
self.vincie_gpus = remaining_gpus[:vincie_count]
|
| 58 |
self.seedvr_gpus = remaining_gpus[vincie_count:]
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
|
| 67 |
def get_ltx_device(self) -> torch.device:
|
| 68 |
-
"""
|
| 69 |
if not self.ltx_main_gpus:
|
| 70 |
return torch.device("cpu")
|
| 71 |
return torch.device(f"cuda:{self.ltx_main_gpus[0]}")
|
| 72 |
|
| 73 |
def get_ltx_vae_device(self) -> torch.device:
|
| 74 |
-
"""
|
| 75 |
if not self.ltx_vae_gpu:
|
| 76 |
return torch.device("cpu")
|
| 77 |
return torch.device(f"cuda:{self.ltx_vae_gpu[0]}")
|
| 78 |
|
| 79 |
-
def get_seedvr_devices(self) ->
|
| 80 |
-
"""
|
| 81 |
return self.seedvr_gpus
|
| 82 |
|
| 83 |
-
def get_vincie_devices(self) ->
|
| 84 |
-
"""
|
| 85 |
return self.vincie_gpus
|
| 86 |
|
| 87 |
def requires_memory_swap(self) -> bool:
|
| 88 |
-
"""
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
-
#
|
|
|
|
| 94 |
gpu_manager = GPUManager()
|
|
|
|
| 1 |
+
# FILE: api/gpu_manager.py
|
| 2 |
+
# DESCRIPTION: A hardware-aware, service-agnostic GPU allocator for the ADUC-SDR suite.
|
| 3 |
+
# This module inspects available GPUs and partitions them according to a predefined
|
| 4 |
+
# strategy for LTX, SeedVR, and VINCIE services without importing them, thus
|
| 5 |
+
# preventing circular dependencies.
|
| 6 |
|
import logging
import math
import os
from collections import Counter
from typing import List

import torch
|
| 11 |
|
| 12 |
class GPUManager:
|
| 13 |
"""
|
| 14 |
+
Manages and allocates available GPUs among different services.
|
| 15 |
+
It operates agnostically, providing device information without knowing
|
| 16 |
+
the specifics of the services that will use them.
|
| 17 |
"""
|
| 18 |
def __init__(self):
|
| 19 |
+
"""Initializes the manager, detects GPUs, and runs the allocation logic."""
|
| 20 |
self.total_gpus = torch.cuda.device_count()
|
| 21 |
self.ltx_main_gpus = []
|
| 22 |
self.ltx_vae_gpu = []
|
|
|
|
| 25 |
self._allocate_gpus()
|
| 26 |
|
| 27 |
def _allocate_gpus(self):
|
| 28 |
+
"""
|
| 29 |
+
Implements the GPU allocation strategy based on the total number of detected GPUs.
|
| 30 |
+
"""
|
| 31 |
+
logging.info("="*60)
|
| 32 |
+
logging.info("🤖 Initializing GPU Manager (LTX, SeedVR, VINCIE)")
|
| 33 |
+
logging.info(f" > Total GPUs detected: {self.total_gpus}")
|
| 34 |
|
| 35 |
all_indices = list(range(self.total_gpus))
|
| 36 |
|
| 37 |
if self.total_gpus == 0:
|
| 38 |
+
logging.warning(" > No GPUs detected. All services will operate in CPU mode.")
|
| 39 |
elif self.total_gpus == 1:
|
| 40 |
+
logging.warning(" > 1 GPU detected. All services will share GPU 0. Memory swapping will be active.")
|
| 41 |
self.ltx_main_gpus = [0]
|
| 42 |
+
self.ltx_vae_gpu = [0] # Shares with the main LTX pipeline
|
| 43 |
self.seedvr_gpus = [0]
|
| 44 |
self.vincie_gpus = [0]
|
| 45 |
elif self.total_gpus == 2:
|
| 46 |
+
logging.info(" > 2 GPUs detected. LTX will use a dedicated VAE device.")
|
| 47 |
self.ltx_main_gpus = [0]
|
| 48 |
+
self.ltx_vae_gpu = [1] # VAE gets the second GPU
|
| 49 |
+
self.seedvr_gpus = [0] # Shares with main LTX
|
| 50 |
+
self.vincie_gpus = [0] # Shares with main LTX
|
| 51 |
+
else: # 3 or more GPUs
|
| 52 |
+
logging.info(f" > {self.total_gpus} GPUs detected. Distributing allocation.")
|
| 53 |
+
# LTX always gets the first two GPUs if available for optimal performance
|
| 54 |
self.ltx_main_gpus = [0]
|
| 55 |
self.ltx_vae_gpu = [1]
|
| 56 |
|
| 57 |
remaining_gpus = all_indices[2:]
|
| 58 |
|
| 59 |
+
# The rest are divided between SeedVR and VINCIE
|
| 60 |
+
# VINCIE gets priority as it can scale well with more GPUs
|
| 61 |
+
vincie_count = max(1, math.ceil(len(remaining_gpus) / 2))
|
| 62 |
+
seedvr_count = len(remaining_gpus) - vincie_count
|
| 63 |
+
|
| 64 |
+
self.vincie_gpus = remaining_gpus[:vincie_count]
|
| 65 |
+
# If there are GPUs left, assign them to SeedVR
|
| 66 |
+
if seedvr_count > 0:
|
|
|
|
| 67 |
self.seedvr_gpus = remaining_gpus[vincie_count:]
|
| 68 |
+
else:
|
| 69 |
+
# If no GPUs are left for SeedVR, it shares with the main LTX GPU
|
| 70 |
+
self.seedvr_gpus = [0]
|
| 71 |
|
| 72 |
+
logging.info(f" > Final Allocation:")
|
| 73 |
+
logging.info(f" - LTX (Transformer): GPUs {self.ltx_main_gpus}")
|
| 74 |
+
logging.info(f" - LTX (VAE): GPU {self.ltx_vae_gpu[0] if self.ltx_vae_gpu else 'N/A'}")
|
| 75 |
+
logging.info(f" - SeedVR: GPUs {self.seedvr_gpus}")
|
| 76 |
+
logging.info(f" - VINCIE: GPUs {self.vincie_gpus}")
|
| 77 |
+
logging.info("="*60)
|
| 78 |
|
| 79 |
def get_ltx_device(self) -> torch.device:
|
| 80 |
+
"""Returns the primary device for the LTX Transformer pipeline."""
|
| 81 |
if not self.ltx_main_gpus:
|
| 82 |
return torch.device("cpu")
|
| 83 |
return torch.device(f"cuda:{self.ltx_main_gpus[0]}")
|
| 84 |
|
| 85 |
def get_ltx_vae_device(self) -> torch.device:
|
| 86 |
+
"""Returns the dedicated device for the LTX VAE."""
|
| 87 |
if not self.ltx_vae_gpu:
|
| 88 |
return torch.device("cpu")
|
| 89 |
return torch.device(f"cuda:{self.ltx_vae_gpu[0]}")
|
| 90 |
|
| 91 |
+
def get_seedvr_devices(self) -> List[int]:
|
| 92 |
+
"""Returns the list of GPU indices for the SeedVR service."""
|
| 93 |
return self.seedvr_gpus
|
| 94 |
|
| 95 |
+
def get_vincie_devices(self) -> List[int]:
|
| 96 |
+
"""Returns the list of GPU indices for the VINCIE service."""
|
| 97 |
return self.vincie_gpus
|
| 98 |
|
| 99 |
def requires_memory_swap(self) -> bool:
|
| 100 |
+
"""
|
| 101 |
+
Determines if memory swapping is necessary because multiple services
|
| 102 |
+
are sharing the same primary GPU.
|
| 103 |
+
The dedicated VAE GPU is not considered for swapping logic.
|
| 104 |
+
"""
|
| 105 |
+
# Collect all GPUs used by the main, memory-intensive parts of the services
|
| 106 |
+
all_main_allocations = self.ltx_main_gpus + self.seedvr_gpus + self.vincie_gpus
|
| 107 |
+
|
| 108 |
+
# Count how many services are allocated to each unique GPU
|
| 109 |
+
gpu_usage_count = {}
|
| 110 |
+
for gpu_idx in all_main_allocations:
|
| 111 |
+
gpu_usage_count[gpu_idx] = gpu_usage_count.get(gpu_idx, 0) + 1
|
| 112 |
+
|
| 113 |
+
# Swapping is required if any GPU is used by more than one service
|
| 114 |
+
for gpu_idx in gpu_usage_count:
|
| 115 |
+
if gpu_usage_count[gpu_idx] > 1:
|
| 116 |
+
logging.warning(f"Memory swapping is ACTIVE because GPU {gpu_idx} is shared by multiple services.")
|
| 117 |
+
return True
|
| 118 |
+
|
| 119 |
+
logging.info("Memory swapping is INACTIVE. Each service has dedicated primary GPUs.")
|
| 120 |
+
return False
|
| 121 |
|
| 122 |
+
# --- Singleton Instantiation ---
# This global instance is created once and imported by all other modules.
# NOTE(review): constructing it at import time calls torch.cuda.device_count()
# and emits the allocation log — confirm import-time side effects are acceptable.
gpu_manager = GPUManager()
|