Spaces:

grimshaw
/

neuapi

Running

App Files Files Community

neuapi / api /src /routers /debug.py

grimshaw

Upload folder using huggingface_hub

35bb6f4 verified about 1 month ago

Raw

History Blame Contribute Delete

6.6 kB

	from __future__ import annotations

	import platform
	import subprocess
	import sys

	import psutil
	from fastapi import APIRouter

	from api.src.inference.model_manager import ModelManager
	from api.src.inference.voice_manager import VoiceManager
	from api.src.structures.schemas import SystemDebugResponse

	# Router for this module: routes registered on it are served under the
	# "/debug" prefix and tagged "Debug".
	router = APIRouter(prefix="/debug", tags=["Debug"])


	def _nvidia_smi_info() -> dict \| None:
	"""Run nvidia-smi and return GPU name + driver version, or None."""
	try:
	result = subprocess.run(
	["nvidia-smi", "--query-gpu=name,driver_version", "--format=csv,noheader,nounits"],
	capture_output=True, text=True, timeout=5,
	)
	if result.returncode == 0 and result.stdout.strip():
	line = result.stdout.strip().split("\n")[0]
	parts = [p.strip() for p in line.split(",")]
	return {"gpu_name": parts[0], "driver_version": parts[1] if len(parts) > 1 else "unknown"}
	except (FileNotFoundError, subprocess.TimeoutExpired):
	pass
	return None


	def _cuda_version_tuple(version: str) -> tuple[int, ...]:
	    """Parse the leading numeric components of a version string like "12.8".

	    Parsing stops at the first component that is not purely decimal, so
	    "12.4" gives ``(12, 4)`` and an unparseable string gives ``()``.
	    """
	    numbers: list[int] = []
	    for piece in version.strip().split("."):
	        if not piece.isdecimal():
	            break
	        numbers.append(int(piece))
	    return tuple(numbers)


	def _build_gpu_fix_instructions(
	    gpu_name: str,
	    driver_version: str,
	    torch_ver: str | None,
	    cuda_ver: str | None,
	) -> str:
	    """Build OS-specific instructions for making a detected GPU usable by PyTorch.

	    Args:
	        gpu_name: GPU name; used in the message and to detect RTX 50xx /
	            Blackwell cards by substring.
	        driver_version: NVIDIA driver version; only echoed in the message.
	        torch_ver: Installed torch version, or ``None`` if torch is missing.
	        cuda_ver: CUDA version torch was built with, or ``None`` for a
	            CPU-only build.

	    Returns:
	        A newline-joined, human-readable text: a diagnosis followed by a fix
	        for Docker, Windows, Linux, or a generic fallback.
	    """
	    os_name = platform.system()  # "Windows", "Linux", "Darwin"
	    is_docker = _is_running_in_docker()
	    python_ver = f"{sys.version_info.major}.{sys.version_info.minor}"

	    # Detect the GPU generation from its name.
	    name_upper = gpu_name.upper()
	    is_blackwell = any(tag in name_upper for tag in ("RTX 50", "RTX50", "BLACKWELL", "GB2"))
	    min_torch = "2.6.0" if is_blackwell else "2.1.0"
	    # One source of truth for the CUDA target so the pip wheel tag and the
	    # conda hint cannot disagree.
	    target_cuda = "12.8" if is_blackwell else "12.4"
	    cu_tag = "cu" + target_cuda.replace(".", "")

	    # The requirement is double-quoted because an unquoted ">" is treated as
	    # output redirection by common shells, which would drop the version
	    # constraint and create a stray file.
	    pip_cmd = (
	        f'pip install "torch>={min_torch}" '
	        f"--index-url https://download.pytorch.org/whl/{cu_tag}"
	    )

	    # Diagnosis
	    lines = [
	        f"GPU '{gpu_name}' detected (driver {driver_version}) but PyTorch cannot use it.",
	        f"Installed: torch=={torch_ver or 'not installed'}, CUDA=={cuda_ver or 'none'}.",
	    ]

	    # Compare versions numerically: as strings, "12.10" sorts before "12.8"
	    # and "9.0" sorts after it.
	    parsed_cuda = _cuda_version_tuple(cuda_ver) if cuda_ver else ()

	    if torch_ver is None:
	        lines.append("PyTorch is not installed at all.")
	    elif cuda_ver is None:
	        lines.append("PyTorch is installed but was built without CUDA (CPU-only build).")
	    elif is_blackwell and parsed_cuda and parsed_cuda < (12, 8):
	        lines.append(f"RTX 50xx (Blackwell) requires CUDA >= 12.8, but torch has CUDA {cuda_ver}.")
	    else:
	        lines.append("PyTorch CUDA version may not match your GPU architecture.")

	    lines.append("")

	    # OS-specific fix
	    if is_docker:
	        lines.append("Fix (Docker): Rebuild with the updated Dockerfile (CUDA 12.8.0 base image):")
	        lines.append(" docker build -f docker/gpu/Dockerfile -t neutts-gpu .")
	    elif os_name == "Windows":
	        lines.append(f"Fix (Windows, Python {python_ver}):")
	        lines.append(f" {pip_cmd}")
	        lines.append("")
	        lines.append("Make sure you have the latest NVIDIA driver installed:")
	        lines.append(" https://www.nvidia.com/Download/index.aspx")
	        if is_blackwell:
	            lines.append(" RTX 50xx requires driver >= 572.16")
	    elif os_name == "Linux":
	        lines.append(f"Fix (Linux, Python {python_ver}):")
	        lines.append(f" {pip_cmd}")
	        lines.append("")
	        lines.append("Or with conda:")
	        lines.append(
	            f' conda install "pytorch>={min_torch}" pytorch-cuda={target_cuda} -c pytorch -c nvidia'
	        )
	        lines.append("")
	        lines.append("Verify NVIDIA driver: nvidia-smi")
	        if is_blackwell:
	            lines.append(" RTX 50xx requires driver >= 572.16")
	    else:
	        lines.append(f"Fix: {pip_cmd}")

	    lines.append("")
	    lines.append("After installing, restart NeuTTS-FastAPI.")

	    return "\n".join(lines)


	def _is_running_in_docker() -> bool:
	"""Check if we're running inside a Docker container."""
	try:
	with open("/proc/1/cgroup", "r") as f:
	return "docker" in f.read()
	except (FileNotFoundError, PermissionError):
	pass
	try:
	from pathlib import Path
	return Path("/.dockerenv").exists()
	except Exception:
	pass
	return False


	@router.get("/system", response_model=SystemDebugResponse)
	async def system_info() -> SystemDebugResponse:
	    """Return system resource usage, GPU/torch diagnostics and loaded model info.

	    CPU and memory figures come from ``psutil``. If torch imports and CUDA
	    is available, one entry per CUDA device is reported (index, name,
	    total and allocated memory in GiB). ``nvidia-smi`` is queried
	    regardless; when it reports a GPU that torch cannot use, the response
	    flags it and carries fix instructions.
	    """
	    import asyncio

	    model_manager = ModelManager.get_instance()
	    voice_manager = VoiceManager.get_instance()

	    mem = psutil.virtual_memory()
	    bytes_per_gb = 1024**3

	    gpu_available = False
	    gpu_info = None
	    torch_version = None
	    cuda_version = None
	    cuda_driver_version = None
	    gpu_detected_but_unusable = False
	    gpu_fix_instructions = None

	    try:
	        import torch
	    except (ImportError, OSError):
	        # ImportError: torch is not installed. OSError: presumably a broken
	        # native install failing to load its shared libraries. Either way
	        # the endpoint keeps working and reports torch as unavailable.
	        torch = None

	    if torch is not None:
	        torch_version = torch.__version__
	        cuda_version = torch.version.cuda

	        if torch.cuda.is_available():
	            gpu_available = True
	            gpu_info = []
	            for index in range(torch.cuda.device_count()):
	                props = torch.cuda.get_device_properties(index)
	                allocated = torch.cuda.memory_allocated(index) / bytes_per_gb
	                # The device-properties attribute is ``total_memory``;
	                # ``total_mem`` does not exist and raised AttributeError.
	                total = props.total_memory / bytes_per_gb
	                gpu_info.append({
	                    "index": index,
	                    "name": props.name,
	                    "total_gb": round(total, 2),
	                    "allocated_gb": round(allocated, 2),
	                })

	    # Check nvidia-smi for GPU detection even if torch can't use it. The
	    # probe is a blocking subprocess call (up to 5 s), so it runs in a
	    # worker thread to keep the event loop responsive.
	    smi = await asyncio.to_thread(_nvidia_smi_info)
	    if smi:
	        cuda_driver_version = smi["driver_version"]
	        if not gpu_available:
	            gpu_detected_but_unusable = True
	            gpu_fix_instructions = _build_gpu_fix_instructions(
	                smi["gpu_name"], smi["driver_version"], torch_version, cuda_version,
	            )

	    return SystemDebugResponse(
	        cpu_count=psutil.cpu_count() or 0,
	        cpu_percent=psutil.cpu_percent(),
	        memory_total_gb=round(mem.total / bytes_per_gb, 2),
	        memory_used_gb=round(mem.used / bytes_per_gb, 2),
	        memory_percent=mem.percent,
	        gpu_available=gpu_available,
	        gpu_info=gpu_info,
	        torch_version=torch_version,
	        cuda_version=cuda_version,
	        cuda_driver_version=cuda_driver_version,
	        gpu_detected_but_unusable=gpu_detected_but_unusable,
	        gpu_fix_instructions=gpu_fix_instructions,
	        models_loaded=list(model_manager.loaded_models.keys()),
	        voices_available=len(voice_manager.voices),
	    )