| package agent
|
|
|
| import (
|
| "bufio"
|
| "bytes"
|
| "encoding/json"
|
| "fmt"
|
| "log/slog"
|
| "maps"
|
| "os/exec"
|
| "regexp"
|
| "runtime"
|
| "strconv"
|
| "strings"
|
| "sync"
|
| "time"
|
|
|
| "github.com/henrygd/beszel/agent/utils"
|
| "github.com/henrygd/beszel/internal/entities/system"
|
| )
|
|
|
// Shared constants for the GPU collectors in this file.
const (
	// Names of the external tools that can be used as GPU data sources.
	nvidiaSmiCmd    string = "nvidia-smi"
	rocmSmiCmd      string = "rocm-smi"
	tegraStatsCmd   string = "tegrastats"
	nvtopCmd        string = "nvtop"
	powermetricsCmd string = "powermetrics"
	macmonCmd       string = "macmon"

	// noGPUFoundMsg is the error text used when no GPU collector could be started.
	noGPUFoundMsg string = "no GPU found - see https://beszel.dev/guide/gpu"

	// retryWaitTime is how long a collector sleeps after a failure before retrying.
	retryWaitTime time.Duration = 5 * time.Second
	// maxFailureRetries is the number of failures a retry loop tolerates before giving up.
	maxFailureRetries int = 5

	// mebibytesInAMegabyte is the divisor applied to the memory values parsed from nvidia-smi.
	mebibytesInAMegabyte float64 = 1.024
	// milliwattsInAWatt is the divisor applied to the power values parsed from tegrastats.
	milliwattsInAWatt float64 = 1000.0
)
|
|
|
|
|
| type GPUManager struct {
|
| sync.Mutex
|
| GpuDataMap map[string]*system.GPUData
|
|
|
|
|
| lastAvgData map[string]system.GPUData
|
|
|
|
|
| lastSnapshots map[uint16]map[string]*gpuSnapshot
|
| }
|
|
|
|
|
// gpuSnapshot is a copy of a GPU's accumulated counters taken at read time.
// The next read subtracts these values to get the totals for the interval.
type gpuSnapshot struct {
	count    uint32             // number of samples accumulated so far
	usage    float64            // accumulated usage
	power    float64            // accumulated power
	powerPkg float64            // accumulated package power
	engines  map[string]float64 // accumulated per-engine values, nil when the GPU has no engines
}
|
|
|
|
|
// RocmSmiJson mirrors one GPU entry of the JSON document printed by rocm-smi.
// Every value is kept as a string and converted to a number by the parser.
type RocmSmiJson struct {
	ID          string `json:"GUID"`
	Name        string `json:"Card series"`
	Temperature string `json:"Temperature (Sensor edge) (C)"`
	MemoryUsed  string `json:"VRAM Total Used Memory (B)"`
	MemoryTotal string `json:"VRAM Total Memory (B)"`
	Usage       string `json:"GPU use (%)"`
	// Two alternative power fields; the parser prefers PowerPackage when it is set.
	PowerPackage string `json:"Average Graphics Package Power (W)"`
	PowerSocket  string `json:"Current Socket Graphics Package Power (W)"`
}
|
|
|
|
|
// gpuCollector describes an external command whose stdout is read line by
// line and handed to a parser.
type gpuCollector struct {
	name    string            // command to execute
	cmdArgs []string          // arguments passed to the command
	parse   func([]byte) bool // called for every output line; false means the line held no valid data
	buf     []byte            // scanner buffer, allocated on first use and reused afterwards
	bufSize uint16            // initial capacity of buf
}
|
|
|
// errNoValidData is the sentinel returned by a collector when its parser
// rejects the command output; the collector loop stops instead of retrying.
var errNoValidData = fmt.Errorf("no valid GPU data found")
|
|
|
|
|
// collectorSource is the name of a GPU data source as it can appear in the
// collector priority list.
type collectorSource string
|
|
|
| const (
|
| collectorSourceNVTop collectorSource = collectorSource(nvtopCmd)
|
| collectorSourceNVML collectorSource = "nvml"
|
| collectorSourceNvidiaSMI collectorSource = collectorSource(nvidiaSmiCmd)
|
| collectorSourceIntelGpuTop collectorSource = collectorSource(intelGpuStatsCmd)
|
| collectorSourceAmdSysfs collectorSource = "amd_sysfs"
|
| collectorSourceRocmSMI collectorSource = collectorSource(rocmSmiCmd)
|
| collectorSourceMacmon collectorSource = collectorSource(macmonCmd)
|
| collectorSourcePowermetrics collectorSource = collectorSource(powermetricsCmd)
|
| collectorGroupNvidia string = "nvidia"
|
| collectorGroupIntel string = "intel"
|
| collectorGroupAmd string = "amd"
|
| collectorGroupApple string = "apple"
|
| )
|
|
|
| func isValidCollectorSource(source collectorSource) bool {
|
| switch source {
|
| case collectorSourceNVTop,
|
| collectorSourceNVML,
|
| collectorSourceNvidiaSMI,
|
| collectorSourceIntelGpuTop,
|
| collectorSourceAmdSysfs,
|
| collectorSourceRocmSMI,
|
| collectorSourceMacmon,
|
| collectorSourcePowermetrics:
|
| return true
|
| }
|
| return false
|
| }
|
|
|
|
|
// gpuCapabilities records which GPU data sources were detected on this host.
// Each flag is set during capability discovery.
type gpuCapabilities struct {
	hasNvidiaSmi    bool // nvidia-smi found
	hasRocmSmi      bool // rocm-smi found
	hasAmdSysfs     bool // AMD sysfs data available
	hasTegrastats   bool // tegrastats found
	hasIntelGpuTop  bool // Intel GPU stats command found
	hasNvtop        bool // nvtop found
	hasMacmon       bool // macmon found (only checked on darwin)
	hasPowermetrics bool // powermetrics found (only checked on darwin)
}
|
|
|
// collectorDefinition describes how one collector source is started.
type collectorDefinition struct {
	// group is the vendor group of the collector; empty means ungrouped.
	group string
	// available reports whether the collector can be used on this host.
	available bool
	// start launches the collector and reports whether it was started.
	// onFailure may be nil; collectors that support it call it to request a fallback.
	start func(onFailure func()) bool
	// deprecationWarning, when non-empty, is logged before the collector is started.
	deprecationWarning string
}
|
|
|
|
|
| func (c *gpuCollector) start() {
|
| for {
|
| err := c.collect()
|
| if err != nil {
|
| if err == errNoValidData {
|
| slog.Warn(c.name + " found no valid GPU data, stopping")
|
| break
|
| }
|
| slog.Warn(c.name+" failed, restarting", "err", err)
|
| time.Sleep(retryWaitTime)
|
| continue
|
| }
|
| }
|
| }
|
|
|
|
|
| func (c *gpuCollector) collect() error {
|
| cmd := exec.Command(c.name, c.cmdArgs...)
|
| stdout, err := cmd.StdoutPipe()
|
| if err != nil {
|
| return err
|
| }
|
| if err := cmd.Start(); err != nil {
|
| return err
|
| }
|
|
|
| scanner := bufio.NewScanner(stdout)
|
| if c.buf == nil {
|
| c.buf = make([]byte, 0, c.bufSize)
|
| }
|
| scanner.Buffer(c.buf, bufio.MaxScanTokenSize)
|
|
|
| for scanner.Scan() {
|
| hasValidData := c.parse(scanner.Bytes())
|
| if !hasValidData {
|
| return errNoValidData
|
| }
|
| }
|
|
|
| if err := scanner.Err(); err != nil {
|
| return fmt.Errorf("scanner error: %w", err)
|
| }
|
| return cmd.Wait()
|
| }
|
|
|
|
|
| func (gm *GPUManager) getJetsonParser() func(output []byte) bool {
|
|
|
| ramPattern := regexp.MustCompile(`RAM (\d+)/(\d+)MB`)
|
| gr3dPattern := regexp.MustCompile(`GR3D_FREQ (\d+)%`)
|
| tempPattern := regexp.MustCompile(`(?:tj|GPU)@(\d+\.?\d*)C`)
|
|
|
|
|
| powerPattern := regexp.MustCompile(`(GPU_SOC|CPU_GPU_CV)\s+(\d+)mW|VDD_SYS_GPU\s+(\d+)/\d+`)
|
|
|
|
|
| gpuData := &system.GPUData{Name: "GPU"}
|
| gm.GpuDataMap["0"] = gpuData
|
|
|
| return func(output []byte) bool {
|
| gm.Lock()
|
| defer gm.Unlock()
|
|
|
| ramMatches := ramPattern.FindSubmatch(output)
|
| if ramMatches != nil {
|
| gpuData.MemoryUsed, _ = strconv.ParseFloat(string(ramMatches[1]), 64)
|
| gpuData.MemoryTotal, _ = strconv.ParseFloat(string(ramMatches[2]), 64)
|
| }
|
|
|
| gr3dMatches := gr3dPattern.FindSubmatch(output)
|
| if gr3dMatches != nil {
|
| gr3dUsage, _ := strconv.ParseFloat(string(gr3dMatches[1]), 64)
|
| gpuData.Usage += gr3dUsage
|
| }
|
|
|
| tempMatches := tempPattern.FindSubmatch(output)
|
| if tempMatches != nil {
|
| gpuData.Temperature, _ = strconv.ParseFloat(string(tempMatches[1]), 64)
|
| }
|
|
|
| powerMatches := powerPattern.FindSubmatch(output)
|
| if powerMatches != nil {
|
|
|
|
|
| powerStr := string(powerMatches[2])
|
| if powerStr == "" {
|
| powerStr = string(powerMatches[3])
|
| }
|
| power, _ := strconv.ParseFloat(powerStr, 64)
|
| gpuData.Power += power / milliwattsInAWatt
|
| }
|
| gpuData.Count++
|
| return true
|
| }
|
| }
|
|
|
|
|
| func (gm *GPUManager) parseNvidiaData(output []byte) bool {
|
| gm.Lock()
|
| defer gm.Unlock()
|
| scanner := bufio.NewScanner(bytes.NewReader(output))
|
| var valid bool
|
| for scanner.Scan() {
|
| line := scanner.Text()
|
| fields := strings.Split(strings.TrimSpace(line), ", ")
|
| if len(fields) < 7 {
|
| continue
|
| }
|
| valid = true
|
| id := fields[0]
|
| temp, _ := strconv.ParseFloat(fields[2], 64)
|
| memoryUsage, _ := strconv.ParseFloat(fields[3], 64)
|
| totalMemory, _ := strconv.ParseFloat(fields[4], 64)
|
| usage, _ := strconv.ParseFloat(fields[5], 64)
|
| power, _ := strconv.ParseFloat(fields[6], 64)
|
|
|
| if _, ok := gm.GpuDataMap[id]; !ok {
|
| name := strings.TrimPrefix(fields[1], "NVIDIA ")
|
| gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")}
|
| }
|
|
|
| gpu := gm.GpuDataMap[id]
|
| gpu.Temperature = temp
|
| gpu.MemoryUsed = memoryUsage / mebibytesInAMegabyte
|
| gpu.MemoryTotal = totalMemory / mebibytesInAMegabyte
|
| gpu.Usage += usage
|
| gpu.Power += power
|
| gpu.Count++
|
| }
|
| return valid
|
| }
|
|
|
|
|
| func (gm *GPUManager) parseAmdData(output []byte) bool {
|
| var rocmSmiInfo map[string]RocmSmiJson
|
| if err := json.Unmarshal(output, &rocmSmiInfo); err != nil || len(rocmSmiInfo) == 0 {
|
| return false
|
| }
|
| gm.Lock()
|
| defer gm.Unlock()
|
| for _, v := range rocmSmiInfo {
|
| var power float64
|
| if v.PowerPackage != "" {
|
| power, _ = strconv.ParseFloat(v.PowerPackage, 64)
|
| } else {
|
| power, _ = strconv.ParseFloat(v.PowerSocket, 64)
|
| }
|
| memoryUsage, _ := strconv.ParseFloat(v.MemoryUsed, 64)
|
| totalMemory, _ := strconv.ParseFloat(v.MemoryTotal, 64)
|
| usage, _ := strconv.ParseFloat(v.Usage, 64)
|
|
|
| id := v.ID
|
| if _, ok := gm.GpuDataMap[id]; !ok {
|
| gm.GpuDataMap[id] = &system.GPUData{Name: v.Name}
|
| }
|
| gpu := gm.GpuDataMap[id]
|
| gpu.Temperature, _ = strconv.ParseFloat(v.Temperature, 64)
|
| gpu.MemoryUsed = utils.BytesToMegabytes(memoryUsage)
|
| gpu.MemoryTotal = utils.BytesToMegabytes(totalMemory)
|
| gpu.Usage += usage
|
| gpu.Power += power
|
| gpu.Count++
|
| }
|
| return true
|
| }
|
|
|
|
|
| func (gm *GPUManager) GetCurrentData(cacheKey uint16) map[string]system.GPUData {
|
| gm.Lock()
|
| defer gm.Unlock()
|
|
|
| gm.initializeSnapshots(cacheKey)
|
| nameCounts := gm.countGPUNames()
|
|
|
| gpuData := make(map[string]system.GPUData, len(gm.GpuDataMap))
|
| for id, gpu := range gm.GpuDataMap {
|
| gpuAvg := gm.calculateGPUAverage(id, gpu, cacheKey)
|
| gm.updateInstantaneousValues(&gpuAvg, gpu)
|
| gm.storeSnapshot(id, gpu, cacheKey)
|
|
|
|
|
| if nameCounts[gpu.Name] > 1 {
|
| gpuAvg.Name = fmt.Sprintf("%s %s", gpu.Name, id)
|
| }
|
| gpuData[id] = gpuAvg
|
| }
|
| slog.Debug("GPU", "data", gpuData)
|
| return gpuData
|
| }
|
|
|
|
|
| func (gm *GPUManager) initializeSnapshots(cacheKey uint16) {
|
| if gm.lastAvgData == nil {
|
| gm.lastAvgData = make(map[string]system.GPUData)
|
| }
|
| if gm.lastSnapshots == nil {
|
| gm.lastSnapshots = make(map[uint16]map[string]*gpuSnapshot)
|
| }
|
| if gm.lastSnapshots[cacheKey] == nil {
|
| gm.lastSnapshots[cacheKey] = make(map[string]*gpuSnapshot)
|
| }
|
| }
|
|
|
|
|
| func (gm *GPUManager) countGPUNames() map[string]int {
|
| nameCounts := make(map[string]int)
|
| for _, gpu := range gm.GpuDataMap {
|
| nameCounts[gpu.Name]++
|
| }
|
| return nameCounts
|
| }
|
|
|
|
|
| func (gm *GPUManager) calculateGPUAverage(id string, gpu *system.GPUData, cacheKey uint16) system.GPUData {
|
| lastSnapshot := gm.lastSnapshots[cacheKey][id]
|
| currentCount := uint32(gpu.Count)
|
| deltaCount := gm.calculateDeltaCount(currentCount, lastSnapshot)
|
|
|
|
|
| if deltaCount == 0 {
|
|
|
|
|
| if gpu.Temperature == 0 && gpu.MemoryUsed == 0 {
|
| return system.GPUData{Name: gpu.Name}
|
| }
|
| return gm.lastAvgData[id]
|
| }
|
|
|
|
|
| gpuAvg := *gpu
|
| deltaUsage, deltaPower, deltaPowerPkg := gm.calculateDeltas(gpu, lastSnapshot)
|
|
|
| gpuAvg.Power = utils.TwoDecimals(deltaPower / float64(deltaCount))
|
|
|
| if gpu.Engines != nil {
|
|
|
|
|
| gpuAvg.Engines = make(map[string]float64, len(gpu.Engines))
|
| gpuAvg.Usage = gm.calculateIntelGPUUsage(&gpuAvg, gpu, lastSnapshot, deltaCount)
|
| gpuAvg.PowerPkg = utils.TwoDecimals(deltaPowerPkg / float64(deltaCount))
|
| } else {
|
| gpuAvg.Usage = utils.TwoDecimals(deltaUsage / float64(deltaCount))
|
| }
|
|
|
| gm.lastAvgData[id] = gpuAvg
|
| return gpuAvg
|
| }
|
|
|
|
|
| func (gm *GPUManager) calculateDeltaCount(currentCount uint32, lastSnapshot *gpuSnapshot) uint32 {
|
| if lastSnapshot != nil {
|
| return currentCount - lastSnapshot.count
|
| }
|
| return currentCount
|
| }
|
|
|
|
|
| func (gm *GPUManager) calculateDeltas(gpu *system.GPUData, lastSnapshot *gpuSnapshot) (deltaUsage, deltaPower, deltaPowerPkg float64) {
|
| if lastSnapshot != nil {
|
| return gpu.Usage - lastSnapshot.usage,
|
| gpu.Power - lastSnapshot.power,
|
| gpu.PowerPkg - lastSnapshot.powerPkg
|
| }
|
| return gpu.Usage, gpu.Power, gpu.PowerPkg
|
| }
|
|
|
|
|
| func (gm *GPUManager) calculateIntelGPUUsage(gpuAvg, gpu *system.GPUData, lastSnapshot *gpuSnapshot, deltaCount uint32) float64 {
|
| maxEngineUsage := 0.0
|
| for name, engine := range gpu.Engines {
|
| var deltaEngine float64
|
| if lastSnapshot != nil && lastSnapshot.engines != nil {
|
| deltaEngine = engine - lastSnapshot.engines[name]
|
| } else {
|
| deltaEngine = engine
|
| }
|
| gpuAvg.Engines[name] = utils.TwoDecimals(deltaEngine / float64(deltaCount))
|
| maxEngineUsage = max(maxEngineUsage, deltaEngine/float64(deltaCount))
|
| }
|
| return utils.TwoDecimals(maxEngineUsage)
|
| }
|
|
|
|
|
| func (gm *GPUManager) updateInstantaneousValues(gpuAvg *system.GPUData, gpu *system.GPUData) {
|
| gpuAvg.Temperature = utils.TwoDecimals(gpu.Temperature)
|
| gpuAvg.MemoryUsed = utils.TwoDecimals(gpu.MemoryUsed)
|
| gpuAvg.MemoryTotal = utils.TwoDecimals(gpu.MemoryTotal)
|
| }
|
|
|
|
|
| func (gm *GPUManager) storeSnapshot(id string, gpu *system.GPUData, cacheKey uint16) {
|
| snapshot := &gpuSnapshot{
|
| count: uint32(gpu.Count),
|
| usage: gpu.Usage,
|
| power: gpu.Power,
|
| powerPkg: gpu.PowerPkg,
|
| }
|
| if gpu.Engines != nil {
|
| snapshot.engines = make(map[string]float64, len(gpu.Engines))
|
| maps.Copy(snapshot.engines, gpu.Engines)
|
| }
|
| gm.lastSnapshots[cacheKey][id] = snapshot
|
| }
|
|
|
|
|
|
|
| func (gm *GPUManager) discoverGpuCapabilities() gpuCapabilities {
|
| caps := gpuCapabilities{
|
| hasAmdSysfs: gm.hasAmdSysfs(),
|
| }
|
| if _, err := exec.LookPath(nvidiaSmiCmd); err == nil {
|
| caps.hasNvidiaSmi = true
|
| }
|
| if _, err := exec.LookPath(rocmSmiCmd); err == nil {
|
| caps.hasRocmSmi = true
|
| }
|
| if _, err := exec.LookPath(tegraStatsCmd); err == nil {
|
| caps.hasTegrastats = true
|
| }
|
| if _, err := exec.LookPath(intelGpuStatsCmd); err == nil {
|
| caps.hasIntelGpuTop = true
|
| }
|
| if _, err := exec.LookPath(nvtopCmd); err == nil {
|
| caps.hasNvtop = true
|
| }
|
| if runtime.GOOS == "darwin" {
|
| if _, err := utils.LookPathHomebrew(macmonCmd); err == nil {
|
| caps.hasMacmon = true
|
| }
|
| if _, err := exec.LookPath(powermetricsCmd); err == nil {
|
| caps.hasPowermetrics = true
|
| }
|
| }
|
| return caps
|
| }
|
|
|
| func hasAnyGpuCollector(caps gpuCapabilities) bool {
|
| return caps.hasNvidiaSmi || caps.hasRocmSmi || caps.hasAmdSysfs || caps.hasTegrastats || caps.hasIntelGpuTop || caps.hasNvtop || caps.hasMacmon || caps.hasPowermetrics
|
| }
|
|
|
| func (gm *GPUManager) startIntelCollector() {
|
| go func() {
|
| failures := 0
|
| for {
|
| if err := gm.collectIntelStats(); err != nil {
|
| failures++
|
| if failures > maxFailureRetries {
|
| break
|
| }
|
| slog.Warn("Error collecting Intel GPU data; see https://beszel.dev/guide/gpu", "err", err)
|
| time.Sleep(retryWaitTime)
|
| continue
|
| }
|
| }
|
| }()
|
| }
|
|
|
| func (gm *GPUManager) startNvidiaSmiCollector(intervalSeconds string) {
|
| collector := gpuCollector{
|
| name: nvidiaSmiCmd,
|
| bufSize: 10 * 1024,
|
| cmdArgs: []string{
|
| "-l", intervalSeconds,
|
| "--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw",
|
| "--format=csv,noheader,nounits",
|
| },
|
| parse: gm.parseNvidiaData,
|
| }
|
| go collector.start()
|
| }
|
|
|
| func (gm *GPUManager) startTegraStatsCollector(intervalMilliseconds string) {
|
| collector := gpuCollector{
|
| name: tegraStatsCmd,
|
| bufSize: 10 * 1024,
|
| cmdArgs: []string{"--interval", intervalMilliseconds},
|
| parse: gm.getJetsonParser(),
|
| }
|
| go collector.start()
|
| }
|
|
|
| func (gm *GPUManager) startRocmSmiCollector(pollInterval time.Duration) {
|
| collector := gpuCollector{
|
| name: rocmSmiCmd,
|
| bufSize: 10 * 1024,
|
| cmdArgs: []string{"--showid", "--showtemp", "--showuse", "--showpower", "--showproductname", "--showmeminfo", "vram", "--json"},
|
| parse: gm.parseAmdData,
|
| }
|
| go func() {
|
| failures := 0
|
| for {
|
| if err := collector.collect(); err != nil {
|
| failures++
|
| if failures > maxFailureRetries {
|
| break
|
| }
|
| slog.Warn("Error collecting AMD GPU data via rocm-smi", "err", err)
|
| }
|
| time.Sleep(pollInterval)
|
| }
|
| }()
|
| }
|
|
|
| func (gm *GPUManager) collectorDefinitions(caps gpuCapabilities) map[collectorSource]collectorDefinition {
|
| return map[collectorSource]collectorDefinition{
|
| collectorSourceNVML: {
|
| group: collectorGroupNvidia,
|
| available: true,
|
| start: func(_ func()) bool {
|
| return gm.startNvmlCollector()
|
| },
|
| },
|
| collectorSourceNvidiaSMI: {
|
| group: collectorGroupNvidia,
|
| available: caps.hasNvidiaSmi,
|
| start: func(_ func()) bool {
|
| gm.startNvidiaSmiCollector("4")
|
| return true
|
| },
|
| },
|
| collectorSourceIntelGpuTop: {
|
| group: collectorGroupIntel,
|
| available: caps.hasIntelGpuTop,
|
| start: func(_ func()) bool {
|
| gm.startIntelCollector()
|
| return true
|
| },
|
| },
|
| collectorSourceAmdSysfs: {
|
| group: collectorGroupAmd,
|
| available: caps.hasAmdSysfs,
|
| start: func(_ func()) bool {
|
| return gm.startAmdSysfsCollector()
|
| },
|
| },
|
| collectorSourceRocmSMI: {
|
| group: collectorGroupAmd,
|
| available: caps.hasRocmSmi,
|
| deprecationWarning: "rocm-smi is deprecated and may be removed in a future release",
|
| start: func(_ func()) bool {
|
| gm.startRocmSmiCollector(4300 * time.Millisecond)
|
| return true
|
| },
|
| },
|
| collectorSourceNVTop: {
|
| available: caps.hasNvtop,
|
| start: func(onFailure func()) bool {
|
| gm.startNvtopCollector("30", onFailure)
|
| return true
|
| },
|
| },
|
| collectorSourceMacmon: {
|
| group: collectorGroupApple,
|
| available: caps.hasMacmon,
|
| start: func(_ func()) bool {
|
| gm.startMacmonCollector()
|
| return true
|
| },
|
| },
|
| collectorSourcePowermetrics: {
|
| group: collectorGroupApple,
|
| available: caps.hasPowermetrics,
|
| start: func(_ func()) bool {
|
| gm.startPowermetricsCollector()
|
| return true
|
| },
|
| },
|
| }
|
| }
|
|
|
|
|
| func parseCollectorPriority(value string) []collectorSource {
|
| parts := strings.Split(value, ",")
|
| priorities := make([]collectorSource, 0, len(parts))
|
| for _, raw := range parts {
|
| name := collectorSource(strings.TrimSpace(strings.ToLower(raw)))
|
| if !isValidCollectorSource(name) {
|
| if name != "" {
|
| slog.Warn("Ignoring unknown GPU collector", "collector", name)
|
| }
|
| continue
|
| }
|
| priorities = append(priorities, name)
|
| }
|
| return priorities
|
| }
|
|
|
|
|
| func (gm *GPUManager) startNvmlCollector() bool {
|
| collector := &nvmlCollector{gm: gm}
|
| if err := collector.init(); err != nil {
|
| slog.Warn("Failed to initialize NVML", "err", err)
|
| return false
|
| }
|
| go collector.start()
|
| return true
|
| }
|
|
|
|
|
| func (gm *GPUManager) startAmdSysfsCollector() bool {
|
| go func() {
|
| if err := gm.collectAmdStats(); err != nil {
|
| slog.Warn("Error collecting AMD GPU data via sysfs", "err", err)
|
| }
|
| }()
|
| return true
|
| }
|
|
|
|
|
| func (gm *GPUManager) startCollectorsByPriority(priorities []collectorSource, caps gpuCapabilities) int {
|
| definitions := gm.collectorDefinitions(caps)
|
| selectedGroups := make(map[string]bool, 3)
|
| started := 0
|
| for i, source := range priorities {
|
| definition, ok := definitions[source]
|
| if !ok || !definition.available {
|
| continue
|
| }
|
|
|
| if source == collectorSourceNVTop {
|
| if len(selectedGroups) > 0 {
|
| slog.Warn("Skipping nvtop because other collectors are selected")
|
| continue
|
| }
|
|
|
| remaining := append([]collectorSource(nil), priorities[i+1:]...)
|
| if definition.start(func() {
|
| gm.startCollectorsByPriority(remaining, caps)
|
| }) {
|
| started++
|
| return started
|
| }
|
| }
|
| group := definition.group
|
| if group == "" || selectedGroups[group] {
|
| continue
|
| }
|
| if definition.deprecationWarning != "" {
|
| slog.Warn(definition.deprecationWarning)
|
| }
|
| if definition.start(nil) {
|
| selectedGroups[group] = true
|
| started++
|
| }
|
| }
|
| return started
|
| }
|
|
|
|
|
| func (gm *GPUManager) resolveLegacyCollectorPriority(caps gpuCapabilities) []collectorSource {
|
| priorities := make([]collectorSource, 0, 4)
|
|
|
| if caps.hasNvidiaSmi && !caps.hasTegrastats {
|
| if nvml, _ := utils.GetEnv("NVML"); nvml == "true" {
|
| priorities = append(priorities, collectorSourceNVML, collectorSourceNvidiaSMI)
|
| } else {
|
| priorities = append(priorities, collectorSourceNvidiaSMI)
|
| }
|
| }
|
|
|
| if caps.hasRocmSmi {
|
| if val, _ := utils.GetEnv("AMD_SYSFS"); val == "true" {
|
| priorities = append(priorities, collectorSourceAmdSysfs)
|
| } else {
|
| priorities = append(priorities, collectorSourceRocmSMI)
|
| }
|
| } else if caps.hasAmdSysfs {
|
| priorities = append(priorities, collectorSourceAmdSysfs)
|
| }
|
|
|
| if caps.hasIntelGpuTop {
|
| priorities = append(priorities, collectorSourceIntelGpuTop)
|
| }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| if len(priorities) == 0 && caps.hasNvtop {
|
| priorities = append(priorities, collectorSourceNVTop)
|
| }
|
| return priorities
|
| }
|
|
|
|
|
| func NewGPUManager() (*GPUManager, error) {
|
| if skipGPU, _ := utils.GetEnv("SKIP_GPU"); skipGPU == "true" {
|
| return nil, nil
|
| }
|
| var gm GPUManager
|
| caps := gm.discoverGpuCapabilities()
|
| gm.GpuDataMap = make(map[string]*system.GPUData)
|
|
|
|
|
| if caps.hasTegrastats {
|
| gm.startTegraStatsCollector("3700")
|
| return &gm, nil
|
| }
|
|
|
|
|
| if collectorConfig, ok := utils.GetEnv("GPU_COLLECTOR"); ok && strings.TrimSpace(collectorConfig) != "" {
|
| priorities := parseCollectorPriority(collectorConfig)
|
| if gm.startCollectorsByPriority(priorities, caps) == 0 {
|
| return nil, fmt.Errorf("no configured GPU collectors are available")
|
| }
|
| return &gm, nil
|
| }
|
|
|
| if !hasAnyGpuCollector(caps) {
|
| return nil, fmt.Errorf(noGPUFoundMsg)
|
| }
|
|
|
|
|
| if gm.startCollectorsByPriority(gm.resolveLegacyCollectorPriority(caps), caps) == 0 {
|
| return nil, fmt.Errorf(noGPUFoundMsg)
|
| }
|
|
|
| return &gm, nil
|
| }
|
|
|