File size: 6,596 Bytes
35bb6f4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 | from __future__ import annotations
import platform
import subprocess
import sys
import psutil
from fastapi import APIRouter
from api.src.inference.model_manager import ModelManager
from api.src.inference.voice_manager import VoiceManager
from api.src.structures.schemas import SystemDebugResponse
router = APIRouter(prefix="/debug", tags=["Debug"])
def _nvidia_smi_info() -> dict | None:
"""Run nvidia-smi and return GPU name + driver version, or None."""
try:
result = subprocess.run(
["nvidia-smi", "--query-gpu=name,driver_version", "--format=csv,noheader,nounits"],
capture_output=True, text=True, timeout=5,
)
if result.returncode == 0 and result.stdout.strip():
line = result.stdout.strip().split("\n")[0]
parts = [p.strip() for p in line.split(",")]
return {"gpu_name": parts[0], "driver_version": parts[1] if len(parts) > 1 else "unknown"}
except (FileNotFoundError, subprocess.TimeoutExpired):
pass
return None
def _version_tuple(version: str) -> tuple[int, ...]:
    """Parse the leading numeric components of a dotted version string.

    ``"12.8"`` becomes ``(12, 8)`` and ``"12.10"`` becomes ``(12, 10)``, so
    versions compare numerically instead of lexicographically. Parsing stops
    at the first component that does not start with a digit; an unparseable
    string yields ``()``.
    """
    numbers: list[int] = []
    for piece in version.strip().split("."):
        digits = ""
        for ch in piece:
            if ch not in "0123456789":
                break
            digits += ch
        if not digits:
            break
        numbers.append(int(digits))
    return tuple(numbers)


def _build_gpu_fix_instructions(
    gpu_name: str,
    driver_version: str,
    torch_ver: str | None,
    cuda_ver: str | None,
) -> str:
    """Build human-readable, OS-specific instructions for fixing an unusable GPU.

    Args:
        gpu_name: GPU name as reported by ``nvidia-smi``.
        driver_version: NVIDIA driver version as reported by ``nvidia-smi``.
        torch_ver: Installed PyTorch version, or ``None`` if torch is missing.
        cuda_ver: CUDA version torch was built against, or ``None`` for a
            build without CUDA.

    Returns:
        A newline-joined message: a diagnosis section followed by install
        commands chosen for Docker, Windows, Linux, or a generic fallback.
    """
    os_name = platform.system()  # "Windows", "Linux", "Darwin"
    is_docker = _is_running_in_docker()
    python_ver = f"{sys.version_info.major}.{sys.version_info.minor}"

    # Detect the GPU generation from its marketing name.
    gpu_upper = gpu_name.upper()
    is_blackwell = any(tag in gpu_upper for tag in ("RTX 50", "RTX50", "BLACKWELL", "GB2"))

    min_torch = "2.6.0" if is_blackwell else "2.1.0"
    cu_tag = "cu128" if is_blackwell else "cu124"
    # Keep the conda CUDA pin in step with the pip wheel index chosen above.
    conda_cuda = "12.8" if is_blackwell else "12.4"

    # Compare CUDA versions numerically: as strings, "12.10" < "12.8" and
    # "9.0" > "12.8", both of which are wrong.
    parsed_cuda = _version_tuple(cuda_ver) if cuda_ver else ()
    cuda_too_old = bool(parsed_cuda) and parsed_cuda < (12, 8)

    # The requirement is quoted so the shell does not treat ">" as an output
    # redirection (which would silently drop the version constraint).
    pip_cmd = (
        f'pip install "torch>={min_torch}" '
        f"--index-url https://download.pytorch.org/whl/{cu_tag}"
    )

    # Diagnosis
    lines = [
        f"GPU '{gpu_name}' detected (driver {driver_version}) but PyTorch cannot use it.",
        f"Installed: torch=={torch_ver or 'not installed'}, CUDA=={cuda_ver or 'none'}.",
    ]
    if torch_ver is None:
        lines.append("PyTorch is not installed at all.")
    elif cuda_ver is None:
        lines.append("PyTorch is installed but was built without CUDA (CPU-only build).")
    elif is_blackwell and cuda_too_old:
        lines.append(f"RTX 50xx (Blackwell) requires CUDA >= 12.8, but torch has CUDA {cuda_ver}.")
    else:
        lines.append("PyTorch CUDA version may not match your GPU architecture.")
    lines.append("")

    # OS-specific fix
    if is_docker:
        lines.append("Fix (Docker): Rebuild with the updated Dockerfile (CUDA 12.8.0 base image):")
        lines.append("  docker build -f docker/gpu/Dockerfile -t neutts-gpu .")
    elif os_name == "Windows":
        lines.append(f"Fix (Windows, Python {python_ver}):")
        lines.append(f"  {pip_cmd}")
        lines.append("")
        lines.append("Make sure you have the latest NVIDIA driver installed:")
        lines.append("  https://www.nvidia.com/Download/index.aspx")
        if is_blackwell:
            lines.append("  RTX 50xx requires driver >= 572.16")
    elif os_name == "Linux":
        lines.append(f"Fix (Linux, Python {python_ver}):")
        lines.append(f"  {pip_cmd}")
        lines.append("")
        lines.append("Or with conda:")
        lines.append(
            f'  conda install "pytorch>={min_torch}" pytorch-cuda={conda_cuda} -c pytorch -c nvidia'
        )
        lines.append("")
        lines.append("Verify NVIDIA driver: nvidia-smi")
        if is_blackwell:
            lines.append("  RTX 50xx requires driver >= 572.16")
    else:
        lines.append(f"Fix: {pip_cmd}")

    lines.append("")
    lines.append("After installing, restart NeuTTS-FastAPI.")
    return "\n".join(lines)
def _is_running_in_docker() -> bool:
"""Check if we're running inside a Docker container."""
try:
with open("/proc/1/cgroup", "r") as f:
return "docker" in f.read()
except (FileNotFoundError, PermissionError):
pass
try:
from pathlib import Path
return Path("/.dockerenv").exists()
except Exception:
pass
return False
@router.get("/system", response_model=SystemDebugResponse)
async def system_info() -> SystemDebugResponse:
    """Return system resource usage, GPU/torch status, and loaded model info.

    The response combines:
      * CPU and memory figures from ``psutil``;
      * torch / CUDA versions and per-device memory when torch can use CUDA;
      * the driver version from ``nvidia-smi``, plus fix instructions when a
        GPU is visible to ``nvidia-smi`` but not usable by torch;
      * the names of loaded models and the number of available voices.
    """
    # Local import: only needed to push the blocking nvidia-smi call off the
    # event loop.
    import asyncio

    model_manager = ModelManager.get_instance()
    voice_manager = VoiceManager.get_instance()
    mem = psutil.virtual_memory()

    gpu_available = False
    gpu_info = None
    torch_version = None
    cuda_version = None
    cuda_driver_version = None
    gpu_detected_but_unusable = False
    gpu_fix_instructions = None

    gib = 1024**3

    try:
        import torch
    except ImportError:
        # torch is optional for this endpoint; leave the torch fields as None.
        torch = None

    if torch is not None:
        torch_version = torch.__version__
        cuda_version = torch.version.cuda
        if torch.cuda.is_available():
            gpu_available = True
            gpu_info = []
            for i in range(torch.cuda.device_count()):
                props = torch.cuda.get_device_properties(i)
                gpu_info.append({
                    "index": i,
                    "name": props.name,
                    # The device-properties attribute is ``total_memory``
                    # (bytes); ``total_mem`` does not exist.
                    "total_gb": round(props.total_memory / gib, 2),
                    "allocated_gb": round(torch.cuda.memory_allocated(i) / gib, 2),
                })

    # Check nvidia-smi for GPU detection even if torch can't use it. The call
    # spawns a subprocess with a 5 s timeout, so run it in the default
    # executor instead of blocking the event loop.
    loop = asyncio.get_running_loop()
    smi = await loop.run_in_executor(None, _nvidia_smi_info)
    if smi:
        cuda_driver_version = smi["driver_version"]
        if not gpu_available:
            gpu_detected_but_unusable = True
            gpu_fix_instructions = _build_gpu_fix_instructions(
                smi["gpu_name"], smi["driver_version"], torch_version, cuda_version,
            )

    return SystemDebugResponse(
        cpu_count=psutil.cpu_count() or 0,
        cpu_percent=psutil.cpu_percent(),
        memory_total_gb=round(mem.total / gib, 2),
        memory_used_gb=round(mem.used / gib, 2),
        memory_percent=mem.percent,
        gpu_available=gpu_available,
        gpu_info=gpu_info,
        torch_version=torch_version,
        cuda_version=cuda_version,
        cuda_driver_version=cuda_driver_version,
        gpu_detected_but_unusable=gpu_detected_but_unusable,
        gpu_fix_instructions=gpu_fix_instructions,
        models_loaded=list(model_manager.loaded_models.keys()),
        voices_available=len(voice_manager.voices),
    )
|