| from __future__ import annotations |
|
|
| import platform |
| import subprocess |
| import sys |
|
|
| import psutil |
| from fastapi import APIRouter |
|
|
| from api.src.inference.model_manager import ModelManager |
| from api.src.inference.voice_manager import VoiceManager |
| from api.src.structures.schemas import SystemDebugResponse |
|
|
| router = APIRouter(prefix="/debug", tags=["Debug"]) |
|
|
|
|
def _nvidia_smi_info() -> dict | None:
    """Query ``nvidia-smi`` for the first GPU's name and driver version.

    Returns:
        A dict with the keys ``"gpu_name"`` and ``"driver_version"`` parsed
        from the first line of ``nvidia-smi`` CSV output, or ``None`` when
        the tool cannot be launched, times out after 5 seconds, exits with a
        non-zero status, or prints nothing. ``"driver_version"`` is
        ``"unknown"`` when the output line carries no (or an empty) second
        field.
    """
    command = [
        "nvidia-smi",
        "--query-gpu=name,driver_version",
        "--format=csv,noheader,nounits",
    ]
    try:
        result = subprocess.run(command, capture_output=True, text=True, timeout=5)
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers FileNotFoundError (tool not installed) as well as
        # PermissionError and other launch failures; this probe is
        # best-effort, so none of them should propagate to the caller.
        return None

    output = result.stdout.strip()
    if result.returncode != 0 or not output:
        return None

    # Only the first reported GPU is used.
    first_line = output.split("\n", 1)[0]
    fields = [field.strip() for field in first_line.split(",")]
    driver_version = fields[1] if len(fields) > 1 and fields[1] else "unknown"
    return {"gpu_name": fields[0], "driver_version": driver_version}
|
|
|
|
def _cuda_version_tuple(version: str) -> tuple[int, ...]:
    """Parse the leading numeric components of a dotted version, e.g. ``"12.8"`` -> ``(12, 8)``."""
    numbers: list[int] = []
    for piece in version.strip().split("."):
        if not piece.isdecimal():
            break
        numbers.append(int(piece))
    return tuple(numbers)


def _build_gpu_fix_instructions(
    gpu_name: str,
    driver_version: str,
    torch_ver: str | None,
    cuda_ver: str | None,
) -> str:
    """Build OS-specific instructions for making a detected GPU usable by PyTorch.

    Args:
        gpu_name: GPU name as reported by ``nvidia-smi``.
        driver_version: NVIDIA driver version as reported by ``nvidia-smi``.
        torch_ver: Installed torch version, or ``None`` if torch is missing.
        cuda_ver: CUDA version torch was built with, or ``None`` for a
            CPU-only build.

    Returns:
        A newline-joined, human-readable message containing a diagnosis
        followed by a fix tailored to Docker, Windows, Linux, or a generic
        fallback for any other platform.
    """
    os_name = platform.system()
    is_docker = _is_running_in_docker()
    python_ver = f"{sys.version_info.major}.{sys.version_info.minor}"

    gpu_upper = gpu_name.upper()
    is_blackwell = any(
        marker in gpu_upper for marker in ("RTX 50", "RTX50", "BLACKWELL", "GB2")
    )
    # NOTE(review): minimum torch versions are carried over unchanged; confirm
    # them against the PyTorch release notes.
    min_torch = "2.6.0" if is_blackwell else "2.1.0"
    cuda_target = "12.8" if is_blackwell else "12.4"
    cu_tag = "cu" + cuda_target.replace(".", "")

    # The requirement is quoted because an unquoted ">" is interpreted by the
    # shell as output redirection, which would silently drop the version
    # constraint and create a stray file.
    pip_command = (
        f'pip install "torch>={min_torch}" '
        f"--index-url https://download.pytorch.org/whl/{cu_tag}"
    )

    lines = [
        f"GPU '{gpu_name}' detected (driver {driver_version}) but PyTorch cannot use it.",
        f"Installed: torch=={torch_ver or 'not installed'}, CUDA=={cuda_ver or 'none'}.",
    ]

    # Compare versions numerically; comparing the raw strings would order
    # "12.10" before "12.8" and "9.2" after it.
    installed_cuda = _cuda_version_tuple(cuda_ver) if cuda_ver else ()

    if torch_ver is None:
        lines.append("PyTorch is not installed at all.")
    elif cuda_ver is None:
        lines.append("PyTorch is installed but was built without CUDA (CPU-only build).")
    elif is_blackwell and installed_cuda and installed_cuda < (12, 8):
        lines.append(f"RTX 50xx (Blackwell) requires CUDA >= 12.8, but torch has CUDA {cuda_ver}.")
    else:
        lines.append("PyTorch CUDA version may not match your GPU architecture.")

    lines.append("")

    if is_docker:
        lines.append("Fix (Docker): Rebuild with the updated Dockerfile (CUDA 12.8.0 base image):")
        lines.append("  docker build -f docker/gpu/Dockerfile -t neutts-gpu .")
    elif os_name == "Windows":
        lines.append(f"Fix (Windows, Python {python_ver}):")
        lines.append(f"  {pip_command}")
        lines.append("")
        lines.append("Make sure you have the latest NVIDIA driver installed:")
        lines.append("  https://www.nvidia.com/Download/index.aspx")
        if is_blackwell:
            lines.append("  RTX 50xx requires driver >= 572.16")
    elif os_name == "Linux":
        lines.append(f"Fix (Linux, Python {python_ver}):")
        lines.append(f"  {pip_command}")
        lines.append("")
        lines.append("Or with conda:")
        # Keep the conda CUDA version in step with the pip wheel tag above.
        # NOTE(review): confirm the pytorch conda channel publishes this
        # pytorch-cuda version.
        lines.append(
            f'  conda install "pytorch>={min_torch}" pytorch-cuda={cuda_target} -c pytorch -c nvidia'
        )
        lines.append("")
        lines.append("Verify NVIDIA driver: nvidia-smi")
        if is_blackwell:
            lines.append("  RTX 50xx requires driver >= 572.16")
    else:
        lines.append(f"Fix: {pip_command}")

    lines.append("")
    lines.append("After installing, restart NeuTTS-FastAPI.")

    return "\n".join(lines)
|
|
|
|
def _is_running_in_docker() -> bool:
    """Report whether this process appears to be running inside a Docker container.

    Two heuristics are tried in order, and either one matching is enough:

    1. ``/proc/1/cgroup`` contains the text ``"docker"``.
    2. The marker file ``/.dockerenv`` exists.

    Returns:
        ``True`` if either heuristic matches, otherwise ``False``. Files that
        are missing or unreadable are treated as "no match" rather than
        raising.
    """
    try:
        with open("/proc/1/cgroup", "r", encoding="utf-8", errors="replace") as cgroup_file:
            if "docker" in cgroup_file.read():
                return True
    except OSError:
        # Missing or unreadable (e.g. non-Linux host); try the next heuristic.
        pass

    # A readable cgroup file without the marker must not short-circuit the
    # check: fall through to the /.dockerenv probe instead of returning False.
    from pathlib import Path

    try:
        return Path("/.dockerenv").exists()
    except OSError:
        return False
|
|
|
|
@router.get("/system", response_model=SystemDebugResponse)
async def system_info() -> SystemDebugResponse:
    """Return system resource usage, GPU/PyTorch status and loaded model info.

    CPU and memory figures come from ``psutil``. GPU details come from torch
    when it can be imported and reports CUDA as available. ``nvidia-smi`` is
    then consulted for the driver version; if it sees a GPU that torch cannot
    use, the response is flagged ``gpu_detected_but_unusable`` and carries
    fix instructions.

    Returns:
        A populated ``SystemDebugResponse``.
    """
    # Local import: only needed to push the blocking nvidia-smi call off the
    # event loop.
    import asyncio

    model_manager = ModelManager.get_instance()
    voice_manager = VoiceManager.get_instance()

    mem = psutil.virtual_memory()
    bytes_per_gb = 1024**3

    gpu_available = False
    gpu_info = None
    torch_version = None
    cuda_version = None
    cuda_driver_version = None
    gpu_detected_but_unusable = False
    gpu_fix_instructions = None

    try:
        import torch
    except (ImportError, OSError):
        # ImportError: torch is not installed. OSError: torch is installed but
        # one of its native libraries failed to load. Either way, report the
        # system without torch details instead of failing the endpoint.
        torch = None

    if torch is not None:
        torch_version = torch.__version__
        cuda_version = torch.version.cuda

        if torch.cuda.is_available():
            gpu_available = True
            gpu_info = []
            for index in range(torch.cuda.device_count()):
                props = torch.cuda.get_device_properties(index)
                gpu_info.append({
                    "index": index,
                    "name": props.name,
                    # The device-properties attribute is `total_memory`.
                    "total_gb": round(props.total_memory / bytes_per_gb, 2),
                    "allocated_gb": round(
                        torch.cuda.memory_allocated(index) / bytes_per_gb, 2
                    ),
                })

    # nvidia-smi runs as a subprocess with a 5 second timeout; run it in the
    # default executor so this coroutine does not stall the event loop.
    loop = asyncio.get_running_loop()
    smi = await loop.run_in_executor(None, _nvidia_smi_info)
    if smi:
        cuda_driver_version = smi["driver_version"]
        if not gpu_available:
            gpu_detected_but_unusable = True
            gpu_fix_instructions = _build_gpu_fix_instructions(
                smi["gpu_name"], smi["driver_version"], torch_version, cuda_version,
            )

    return SystemDebugResponse(
        cpu_count=psutil.cpu_count() or 0,
        cpu_percent=psutil.cpu_percent(),
        memory_total_gb=round(mem.total / bytes_per_gb, 2),
        memory_used_gb=round(mem.used / bytes_per_gb, 2),
        memory_percent=mem.percent,
        gpu_available=gpu_available,
        gpu_info=gpu_info,
        torch_version=torch_version,
        cuda_version=cuda_version,
        cuda_driver_version=cuda_driver_version,
        gpu_detected_but_unusable=gpu_detected_but_unusable,
        gpu_fix_instructions=gpu_fix_instructions,
        models_loaded=list(model_manager.loaded_models.keys()),
        voices_available=len(voice_manager.voices),
    )
|
|