File size: 9,310 Bytes
3959f31 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 | //go:build linux
package agent
import (
"bufio"
"fmt"
"log/slog"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/internal/entities/system"
)
var amdgpuNameCache = struct {
sync.RWMutex
hits map[string]string
misses map[string]struct{}
}{
hits: make(map[string]string),
misses: make(map[string]struct{}),
}
// hasAmdSysfs returns true if any AMD GPU sysfs nodes are found
func (gm *GPUManager) hasAmdSysfs() bool {
cards, err := filepath.Glob("/sys/class/drm/card*/device/vendor")
if err != nil {
return false
}
for _, vendorPath := range cards {
vendor, err := utils.ReadStringFileLimited(vendorPath, 64)
if err == nil && vendor == "0x1002" {
return true
}
}
return false
}
// collectAmdStats collects AMD GPU metrics directly from sysfs to avoid the overhead of rocm-smi
func (gm *GPUManager) collectAmdStats() error {
sysfsPollInterval := 3000 * time.Millisecond
cards, err := filepath.Glob("/sys/class/drm/card*")
if err != nil {
return err
}
var amdGpuPaths []string
for _, card := range cards {
// Ignore symbolic links and non-main card directories
if strings.Contains(filepath.Base(card), "-") || !isAmdGpu(card) {
continue
}
amdGpuPaths = append(amdGpuPaths, card)
}
if len(amdGpuPaths) == 0 {
return errNoValidData
}
slog.Debug("Using sysfs for AMD GPU data collection")
failures := 0
for {
hasData := false
for _, cardPath := range amdGpuPaths {
if gm.updateAmdGpuData(cardPath) {
hasData = true
}
}
if !hasData {
failures++
if failures > maxFailureRetries {
return errNoValidData
}
slog.Warn("No AMD GPU data from sysfs", "failures", failures)
time.Sleep(retryWaitTime)
continue
}
failures = 0
time.Sleep(sysfsPollInterval)
}
}
// isAmdGpu checks whether a DRM card path belongs to AMD vendor ID 0x1002.
func isAmdGpu(cardPath string) bool {
vendor, err := utils.ReadStringFileLimited(filepath.Join(cardPath, "device/vendor"), 64)
if err != nil {
return false
}
return vendor == "0x1002"
}
// updateAmdGpuData reads GPU metrics from sysfs and updates the GPU data map.
// Returns true if at least some data was successfully read.
func (gm *GPUManager) updateAmdGpuData(cardPath string) bool {
devicePath := filepath.Join(cardPath, "device")
id := filepath.Base(cardPath)
// Read all sysfs values first (no lock needed - these can be slow)
usage, usageErr := readSysfsFloat(filepath.Join(devicePath, "gpu_busy_percent"))
memUsed, memUsedErr := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_used"))
memTotal, _ := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_total"))
// if gtt is present, add it to the memory used and total (https://github.com/henrygd/beszel/issues/1569#issuecomment-3837640484)
if gttUsed, err := readSysfsFloat(filepath.Join(devicePath, "mem_info_gtt_used")); err == nil && gttUsed > 0 {
if gttTotal, err := readSysfsFloat(filepath.Join(devicePath, "mem_info_gtt_total")); err == nil {
memUsed += gttUsed
memTotal += gttTotal
}
}
var temp, power float64
hwmons, _ := filepath.Glob(filepath.Join(devicePath, "hwmon/hwmon*"))
for _, hwmonDir := range hwmons {
if t, err := readSysfsFloat(filepath.Join(hwmonDir, "temp1_input")); err == nil {
temp = t / 1000.0
}
if p, err := readSysfsFloat(filepath.Join(hwmonDir, "power1_average")); err == nil {
power += p / 1000000.0
} else if p, err := readSysfsFloat(filepath.Join(hwmonDir, "power1_input")); err == nil {
power += p / 1000000.0
}
}
// Check if we got any meaningful data
if usageErr != nil && memUsedErr != nil && temp == 0 {
return false
}
// Single lock to update all values atomically
gm.Lock()
defer gm.Unlock()
gpu, ok := gm.GpuDataMap[id]
if !ok {
gpu = &system.GPUData{Name: getAmdGpuName(devicePath)}
gm.GpuDataMap[id] = gpu
}
if usageErr == nil {
gpu.Usage += usage
}
gpu.MemoryUsed = utils.BytesToMegabytes(memUsed)
gpu.MemoryTotal = utils.BytesToMegabytes(memTotal)
gpu.Temperature = temp
gpu.Power += power
gpu.Count++
return true
}
// readSysfsFloat reads and parses a numeric value from a sysfs file.
func readSysfsFloat(path string) (float64, error) {
val, err := utils.ReadStringFileLimited(path, 64)
if err != nil {
slog.Debug("Failed to read sysfs value", "path", path, "error", err)
return 0, err
}
return strconv.ParseFloat(val, 64)
}
// normalizeHexID normalizes hex IDs by trimming spaces, lowercasing, and dropping 0x.
func normalizeHexID(id string) string {
return strings.TrimPrefix(strings.ToLower(strings.TrimSpace(id)), "0x")
}
// cacheKeyForAmdgpu builds the cache key for a device and optional revision.
func cacheKeyForAmdgpu(deviceID, revisionID string) string {
if revisionID != "" {
return deviceID + ":" + revisionID
}
return deviceID
}
// lookupAmdgpuNameInFile resolves an AMDGPU name from amdgpu.ids by device/revision.
func lookupAmdgpuNameInFile(deviceID, revisionID, filePath string) (name string, exact bool, found bool) {
file, err := os.Open(filePath)
if err != nil {
return "", false, false
}
defer file.Close()
var byDevice string
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if line == "" || strings.HasPrefix(line, "#") {
continue
}
parts := strings.SplitN(line, ",", 3)
if len(parts) != 3 {
continue
}
dev := normalizeHexID(parts[0])
rev := normalizeHexID(parts[1])
productName := strings.TrimSpace(parts[2])
if dev == "" || productName == "" || dev != deviceID {
continue
}
if byDevice == "" {
byDevice = productName
}
if revisionID != "" && rev == revisionID {
return productName, true, true
}
}
if byDevice != "" {
return byDevice, false, true
}
return "", false, false
}
// getCachedAmdgpuName returns cached hit/miss status for the given device/revision.
func getCachedAmdgpuName(deviceID, revisionID string) (name string, found bool, done bool) {
// Build the list of cache keys to check. We always look up the exact device+revision key.
// When revisionID is set, we also look up deviceID alone, since the cache may store a
// device-only fallback when we couldn't resolve the exact revision.
keys := []string{cacheKeyForAmdgpu(deviceID, revisionID)}
if revisionID != "" {
keys = append(keys, deviceID)
}
knownMisses := 0
amdgpuNameCache.RLock()
defer amdgpuNameCache.RUnlock()
for _, key := range keys {
if name, ok := amdgpuNameCache.hits[key]; ok {
return name, true, true
}
if _, ok := amdgpuNameCache.misses[key]; ok {
knownMisses++
}
}
// done=true means "don't bother doing slow lookup": we either found a name (above) or
// every key we checked was already a known miss, so we've tried before and failed.
return "", false, knownMisses == len(keys)
}
// normalizeAmdgpuName trims standard suffixes from AMDGPU product names.
func normalizeAmdgpuName(name string) string {
for _, suffix := range []string{" Graphics", " Series"} {
name = strings.TrimSuffix(name, suffix)
}
return name
}
// cacheAmdgpuName stores a resolved AMDGPU name in the lookup cache.
func cacheAmdgpuName(deviceID, revisionID, name string, exact bool) {
name = normalizeAmdgpuName(name)
amdgpuNameCache.Lock()
defer amdgpuNameCache.Unlock()
if exact && revisionID != "" {
amdgpuNameCache.hits[cacheKeyForAmdgpu(deviceID, revisionID)] = name
}
amdgpuNameCache.hits[deviceID] = name
}
// cacheMissingAmdgpuName records unresolved device/revision lookups.
func cacheMissingAmdgpuName(deviceID, revisionID string) {
amdgpuNameCache.Lock()
defer amdgpuNameCache.Unlock()
amdgpuNameCache.misses[deviceID] = struct{}{}
if revisionID != "" {
amdgpuNameCache.misses[cacheKeyForAmdgpu(deviceID, revisionID)] = struct{}{}
}
}
// getAmdGpuName attempts to get a descriptive GPU name.
// First tries product_name (rarely available), then looks up the PCI device ID.
// Falls back to showing the raw device ID if not found in the lookup table.
func getAmdGpuName(devicePath string) string {
// Try product_name first (works for some enterprise GPUs)
if prod, err := utils.ReadStringFileLimited(filepath.Join(devicePath, "product_name"), 128); err == nil {
return prod
}
// Read PCI device ID and look it up
if deviceID, err := utils.ReadStringFileLimited(filepath.Join(devicePath, "device"), 64); err == nil {
id := normalizeHexID(deviceID)
revision := ""
if rev, revErr := utils.ReadStringFileLimited(filepath.Join(devicePath, "revision"), 64); revErr == nil {
revision = normalizeHexID(rev)
}
if name, found, done := getCachedAmdgpuName(id, revision); found {
return name
} else if !done {
if name, exact, ok := lookupAmdgpuNameInFile(id, revision, "/usr/share/libdrm/amdgpu.ids"); ok {
cacheAmdgpuName(id, revision, name, exact)
return normalizeAmdgpuName(name)
}
cacheMissingAmdgpuName(id, revision)
}
return fmt.Sprintf("AMD GPU (%s)", id)
}
return "AMD GPU"
}
|