|
|
|
|
| package agent
|
|
|
| import (
|
| "bufio"
|
| "fmt"
|
| "log/slog"
|
| "os"
|
| "path/filepath"
|
| "strconv"
|
| "strings"
|
| "sync"
|
| "time"
|
|
|
| "github.com/henrygd/beszel/agent/utils"
|
| "github.com/henrygd/beszel/internal/entities/system"
|
| )
|
|
|
// amdgpuNameCache is the package-level memo for AMD GPU name lookups.
// hits maps a cache key (device ID, or "device:revision" as built by
// cacheKeyForAmdgpu) to a resolved name; misses is the set of keys whose
// lookup has already failed. The embedded RWMutex guards both maps.
var amdgpuNameCache = struct {
	sync.RWMutex
	hits   map[string]string
	misses map[string]struct{}
}{
	hits:   map[string]string{},
	misses: map[string]struct{}{},
}
|
|
|
|
|
| func (gm *GPUManager) hasAmdSysfs() bool {
|
| cards, err := filepath.Glob("/sys/class/drm/card*/device/vendor")
|
| if err != nil {
|
| return false
|
| }
|
| for _, vendorPath := range cards {
|
| vendor, err := utils.ReadStringFileLimited(vendorPath, 64)
|
| if err == nil && vendor == "0x1002" {
|
| return true
|
| }
|
| }
|
| return false
|
| }
|
|
|
|
|
| func (gm *GPUManager) collectAmdStats() error {
|
| sysfsPollInterval := 3000 * time.Millisecond
|
| cards, err := filepath.Glob("/sys/class/drm/card*")
|
| if err != nil {
|
| return err
|
| }
|
|
|
| var amdGpuPaths []string
|
| for _, card := range cards {
|
|
|
| if strings.Contains(filepath.Base(card), "-") || !isAmdGpu(card) {
|
| continue
|
| }
|
| amdGpuPaths = append(amdGpuPaths, card)
|
| }
|
|
|
| if len(amdGpuPaths) == 0 {
|
| return errNoValidData
|
| }
|
|
|
| slog.Debug("Using sysfs for AMD GPU data collection")
|
|
|
| failures := 0
|
| for {
|
| hasData := false
|
| for _, cardPath := range amdGpuPaths {
|
| if gm.updateAmdGpuData(cardPath) {
|
| hasData = true
|
| }
|
| }
|
| if !hasData {
|
| failures++
|
| if failures > maxFailureRetries {
|
| return errNoValidData
|
| }
|
| slog.Warn("No AMD GPU data from sysfs", "failures", failures)
|
| time.Sleep(retryWaitTime)
|
| continue
|
| }
|
| failures = 0
|
| time.Sleep(sysfsPollInterval)
|
| }
|
| }
|
|
|
|
|
| func isAmdGpu(cardPath string) bool {
|
| vendor, err := utils.ReadStringFileLimited(filepath.Join(cardPath, "device/vendor"), 64)
|
| if err != nil {
|
| return false
|
| }
|
| return vendor == "0x1002"
|
| }
|
|
|
|
|
|
|
| func (gm *GPUManager) updateAmdGpuData(cardPath string) bool {
|
| devicePath := filepath.Join(cardPath, "device")
|
| id := filepath.Base(cardPath)
|
|
|
|
|
| usage, usageErr := readSysfsFloat(filepath.Join(devicePath, "gpu_busy_percent"))
|
| memUsed, memUsedErr := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_used"))
|
| memTotal, _ := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_total"))
|
|
|
| if gttUsed, err := readSysfsFloat(filepath.Join(devicePath, "mem_info_gtt_used")); err == nil && gttUsed > 0 {
|
| if gttTotal, err := readSysfsFloat(filepath.Join(devicePath, "mem_info_gtt_total")); err == nil {
|
| memUsed += gttUsed
|
| memTotal += gttTotal
|
| }
|
| }
|
|
|
| var temp, power float64
|
| hwmons, _ := filepath.Glob(filepath.Join(devicePath, "hwmon/hwmon*"))
|
| for _, hwmonDir := range hwmons {
|
| if t, err := readSysfsFloat(filepath.Join(hwmonDir, "temp1_input")); err == nil {
|
| temp = t / 1000.0
|
| }
|
| if p, err := readSysfsFloat(filepath.Join(hwmonDir, "power1_average")); err == nil {
|
| power += p / 1000000.0
|
| } else if p, err := readSysfsFloat(filepath.Join(hwmonDir, "power1_input")); err == nil {
|
| power += p / 1000000.0
|
| }
|
| }
|
|
|
|
|
| if usageErr != nil && memUsedErr != nil && temp == 0 {
|
| return false
|
| }
|
|
|
|
|
| gm.Lock()
|
| defer gm.Unlock()
|
|
|
| gpu, ok := gm.GpuDataMap[id]
|
| if !ok {
|
| gpu = &system.GPUData{Name: getAmdGpuName(devicePath)}
|
| gm.GpuDataMap[id] = gpu
|
| }
|
|
|
| if usageErr == nil {
|
| gpu.Usage += usage
|
| }
|
| gpu.MemoryUsed = utils.BytesToMegabytes(memUsed)
|
| gpu.MemoryTotal = utils.BytesToMegabytes(memTotal)
|
| gpu.Temperature = temp
|
| gpu.Power += power
|
| gpu.Count++
|
| return true
|
| }
|
|
|
|
|
| func readSysfsFloat(path string) (float64, error) {
|
| val, err := utils.ReadStringFileLimited(path, 64)
|
| if err != nil {
|
| slog.Debug("Failed to read sysfs value", "path", path, "error", err)
|
| return 0, err
|
| }
|
| return strconv.ParseFloat(val, 64)
|
| }
|
|
|
|
|
// normalizeHexID canonicalizes a hexadecimal identifier: surrounding
// whitespace is removed, letters are lower-cased, and a single leading "0x"
// is dropped.
func normalizeHexID(id string) string {
	trimmed := strings.TrimSpace(id)
	lowered := strings.ToLower(trimmed)
	return strings.TrimPrefix(lowered, "0x")
}
|
|
|
|
|
// cacheKeyForAmdgpu builds the name-cache key for a device: "device:revision"
// when revisionID is non-empty, otherwise just the device ID.
func cacheKeyForAmdgpu(deviceID, revisionID string) string {
	if revisionID == "" {
		return deviceID
	}
	return deviceID + ":" + revisionID
}
|
|
|
|
|
| func lookupAmdgpuNameInFile(deviceID, revisionID, filePath string) (name string, exact bool, found bool) {
|
| file, err := os.Open(filePath)
|
| if err != nil {
|
| return "", false, false
|
| }
|
| defer file.Close()
|
|
|
| var byDevice string
|
| scanner := bufio.NewScanner(file)
|
| for scanner.Scan() {
|
| line := strings.TrimSpace(scanner.Text())
|
| if line == "" || strings.HasPrefix(line, "#") {
|
| continue
|
| }
|
| parts := strings.SplitN(line, ",", 3)
|
| if len(parts) != 3 {
|
| continue
|
| }
|
|
|
| dev := normalizeHexID(parts[0])
|
| rev := normalizeHexID(parts[1])
|
| productName := strings.TrimSpace(parts[2])
|
| if dev == "" || productName == "" || dev != deviceID {
|
| continue
|
| }
|
| if byDevice == "" {
|
| byDevice = productName
|
| }
|
| if revisionID != "" && rev == revisionID {
|
| return productName, true, true
|
| }
|
| }
|
| if byDevice != "" {
|
| return byDevice, false, true
|
| }
|
| return "", false, false
|
| }
|
|
|
|
|
| func getCachedAmdgpuName(deviceID, revisionID string) (name string, found bool, done bool) {
|
|
|
|
|
|
|
| keys := []string{cacheKeyForAmdgpu(deviceID, revisionID)}
|
| if revisionID != "" {
|
| keys = append(keys, deviceID)
|
| }
|
|
|
| knownMisses := 0
|
| amdgpuNameCache.RLock()
|
| defer amdgpuNameCache.RUnlock()
|
| for _, key := range keys {
|
| if name, ok := amdgpuNameCache.hits[key]; ok {
|
| return name, true, true
|
| }
|
| if _, ok := amdgpuNameCache.misses[key]; ok {
|
| knownMisses++
|
| }
|
| }
|
|
|
|
|
| return "", false, knownMisses == len(keys)
|
| }
|
|
|
|
|
// normalizeAmdgpuName shortens a product name by removing a trailing
// " Graphics" and then a trailing " Series", in that order, each at most once.
func normalizeAmdgpuName(name string) string {
	withoutGraphics := strings.TrimSuffix(name, " Graphics")
	return strings.TrimSuffix(withoutGraphics, " Series")
}
|
|
|
|
|
| func cacheAmdgpuName(deviceID, revisionID, name string, exact bool) {
|
| name = normalizeAmdgpuName(name)
|
| amdgpuNameCache.Lock()
|
| defer amdgpuNameCache.Unlock()
|
| if exact && revisionID != "" {
|
| amdgpuNameCache.hits[cacheKeyForAmdgpu(deviceID, revisionID)] = name
|
| }
|
| amdgpuNameCache.hits[deviceID] = name
|
| }
|
|
|
|
|
| func cacheMissingAmdgpuName(deviceID, revisionID string) {
|
| amdgpuNameCache.Lock()
|
| defer amdgpuNameCache.Unlock()
|
| amdgpuNameCache.misses[deviceID] = struct{}{}
|
| if revisionID != "" {
|
| amdgpuNameCache.misses[cacheKeyForAmdgpu(deviceID, revisionID)] = struct{}{}
|
| }
|
| }
|
|
|
|
|
|
|
|
|
| func getAmdGpuName(devicePath string) string {
|
|
|
| if prod, err := utils.ReadStringFileLimited(filepath.Join(devicePath, "product_name"), 128); err == nil {
|
| return prod
|
| }
|
|
|
|
|
| if deviceID, err := utils.ReadStringFileLimited(filepath.Join(devicePath, "device"), 64); err == nil {
|
| id := normalizeHexID(deviceID)
|
| revision := ""
|
| if rev, revErr := utils.ReadStringFileLimited(filepath.Join(devicePath, "revision"), 64); revErr == nil {
|
| revision = normalizeHexID(rev)
|
| }
|
|
|
| if name, found, done := getCachedAmdgpuName(id, revision); found {
|
| return name
|
| } else if !done {
|
| if name, exact, ok := lookupAmdgpuNameInFile(id, revision, "/usr/share/libdrm/amdgpu.ids"); ok {
|
| cacheAmdgpuName(id, revision, name, exact)
|
| return normalizeAmdgpuName(name)
|
| }
|
| cacheMissingAmdgpuName(id, revision)
|
| }
|
|
|
| return fmt.Sprintf("AMD GPU (%s)", id)
|
| }
|
|
|
| return "AMD GPU"
|
| }
|
|
|