test_409 / trackio /apple_gpu.py
abidlabs's picture
abidlabs HF Staff
Upload folder using huggingface_hub
43e7a09 verified
import platform
import subprocess
import sys
import threading
import warnings
from typing import TYPE_CHECKING, Any
if TYPE_CHECKING:
from trackio.run import Run
# Handle to the psutil module. Stays None until _ensure_psutil() imports it
# successfully, so importing this file never requires psutil to be installed.
psutil: Any = None
# Flipped to True by _ensure_psutil() once the import above has succeeded.
PSUTIL_AVAILABLE = False
# Module-level lock. Nothing in the visible part of this file acquires it.
# NOTE(review): presumably intended to guard monitor state — confirm or remove.
_monitor_lock = threading.Lock()
def _ensure_psutil():
    """
    Import psutil on first use and cache it in the module-level globals.

    Returns:
        The psutil module (the cached one on every call after the first).

    Raises:
        ImportError: If psutil is not installed.
    """
    global PSUTIL_AVAILABLE, psutil
    if not PSUTIL_AVAILABLE:
        try:
            import psutil as module
        except ImportError:
            # Re-raise with an actionable message for the user.
            raise ImportError(
                "psutil is required for Apple Silicon monitoring. "
                "Install it with: pip install psutil"
            )
        psutil = module
        PSUTIL_AVAILABLE = True
    return psutil
def is_apple_silicon() -> bool:
    """
    Report whether this process is running on an Apple Silicon Mac.

    Returns:
        True when the platform is Darwin and the CPU brand string reported by
        `sysctl` contains "Apple"; False otherwise, including when the probe
        fails or times out.
    """
    on_macos = platform.system() == "Darwin"
    if on_macos:
        try:
            probe = subprocess.run(
                ["sysctl", "-n", "machdep.cpu.brand_string"],
                capture_output=True,
                text=True,
                timeout=1,
            )
            return "Apple" in probe.stdout.strip()
        except Exception:
            # Any probe failure is treated as "not Apple Silicon".
            pass
    return False
def get_gpu_info() -> dict[str, Any]:
    """
    Detect an Apple GPU, first via `ioreg` and then via `system_profiler`.

    Returns:
        `{"detected": True, "type": <str>}` when either tool reports a GPU
        (the type is "Apple GPU" for the ioreg path, or the "Chipset Model"
        value for the system_profiler path); `{"detected": False}` otherwise,
        including when either command raises.
    """
    try:
        ioreg = subprocess.run(
            ["ioreg", "-r", "-d", "1", "-w", "0", "-c", "IOAccelerator"],
            capture_output=True,
            text=True,
            timeout=2,
        )
        if ioreg.returncode != 0 or not ioreg.stdout:
            # ioreg failed or printed nothing: dump what it produced to stderr
            # and fall through to the system_profiler probe below.
            for chunk in (
                "Error collecting Apple GPU info. ioreg stdout was:",
                ioreg.stdout,
                "ioreg stderr was:",
                ioreg.stderr,
            ):
                print(chunk, file=sys.stderr)
        elif any(
            "IOAccelerator" in row and "class" in row
            for row in ioreg.stdout.strip().split("\n")
        ):
            return {"detected": True, "type": "Apple GPU"}

        profiler = subprocess.run(
            ["system_profiler", "SPDisplaysDataType"],
            capture_output=True,
            text=True,
            timeout=3,
        )
        if profiler.returncode == 0 and "Apple" in profiler.stdout:
            for row in profiler.stdout.split("\n"):
                if "Chipset Model:" in row:
                    # Keep the text after the last colon as the model name.
                    chipset = row.rpartition(":")[2].strip()
                    return {"detected": True, "type": chipset}
    except Exception:
        # Best-effort probe: a missing tool or timeout means "not detected".
        pass
    return {"detected": False}
def apple_gpu_available() -> bool:
    """
    Tell whether Apple GPU monitoring can be used in this process.

    Returns:
        True only when psutil can be imported and the host is Apple Silicon
        (M-series chips). Any error raised while checking yields False.
    """
    try:
        _ensure_psutil()
        on_apple_silicon = is_apple_silicon()
    except Exception:
        # Covers the ImportError from a missing psutil as well as any
        # unexpected failure; both mean monitoring is unavailable.
        return False
    return on_apple_silicon
def collect_apple_metrics() -> dict[str, Any]:
    """
    Collect a snapshot of system metrics for Apple Silicon.

    Every metric group is gathered independently and best-effort: if one
    psutil call fails (or is not offered on this platform), that group is
    simply omitted and the remaining groups are still collected.

    Returns:
        Dictionary mapping metric names to values. Possible keys:
        `cpu/utilization`, `cpu/{i}/utilization`, `cpu/frequency`,
        `cpu/frequency_max`, `memory/{used,total,available,percent}`,
        `swap/{used,total,percent}`, `temp/{label}` and `gpu/detected`.
        Memory and swap sizes are the psutil values divided by 1024**3.
        An empty dict is returned when psutil is not installed.
    """
    if not PSUTIL_AVAILABLE:
        try:
            _ensure_psutil()
        except ImportError:
            return {}

    metrics: dict[str, Any] = {}

    # Overall CPU utilization, sampled with interval=0.1.
    try:
        metrics["cpu/utilization"] = psutil.cpu_percent(interval=0.1, percpu=False)
    except Exception:
        pass

    # Per-core CPU utilization, sampled separately with the same interval.
    try:
        per_core = psutil.cpu_percent(interval=0.1, percpu=True)
        for i, percent in enumerate(per_core):
            metrics[f"cpu/{i}/utilization"] = percent
    except Exception:
        pass

    # CPU frequency; the max is only recorded when it is a positive value.
    try:
        cpu_freq = psutil.cpu_freq()
        if cpu_freq:
            metrics["cpu/frequency"] = cpu_freq.current
            if cpu_freq.max > 0:
                metrics["cpu/frequency_max"] = cpu_freq.max
    except Exception:
        pass

    try:
        mem = psutil.virtual_memory()
        metrics["memory/used"] = mem.used / (1024**3)
        metrics["memory/total"] = mem.total / (1024**3)
        metrics["memory/available"] = mem.available / (1024**3)
        metrics["memory/percent"] = mem.percent
    except Exception:
        pass

    try:
        swap = psutil.swap_memory()
        metrics["swap/used"] = swap.used / (1024**3)
        metrics["swap/total"] = swap.total / (1024**3)
        metrics["swap/percent"] = swap.percent
    except Exception:
        pass

    # Temperature sensors; unlabeled entries get a "<sensor>_<index>" name.
    try:
        sensors_temps = psutil.sensors_temperatures()
        if sensors_temps:
            for name, entries in sensors_temps.items():
                for i, entry in enumerate(entries):
                    label = entry.label or f"{name}_{i}"
                    metrics[f"temp/{label}"] = entry.current
    except Exception:
        pass

    # Only the detection flag is logged; the GPU "type" string returned by
    # get_gpu_info() is intentionally not added to the metrics.
    if get_gpu_info().get("detected"):
        metrics["gpu/detected"] = 1

    return metrics
class AppleGpuMonitor:
    """
    Background sampler that periodically logs Apple Silicon metrics to a run.

    A daemon thread calls `collect_apple_metrics()` every `interval` seconds
    and forwards non-empty results to `run.log_system()`. The stop flag is
    never cleared, so a monitor that has been stopped does not resume
    sampling if `start()` is called again.
    """

    def __init__(self, run: "Run", interval: float = 10.0):
        """
        Args:
            run: Run whose `log_system()` method receives the metrics.
            interval: Seconds to wait between two collections.
        """
        self._run = run
        self._interval = interval
        self._stop_flag = threading.Event()
        self._thread: "threading.Thread | None" = None

    def start(self):
        """
        Start the monitoring thread.

        Does nothing if a monitoring thread is already running, so repeated
        calls cannot spawn duplicate samplers. Emits a warning and returns
        without starting when the host is not Apple Silicon or psutil is
        not installed.
        """
        # Guard against double start: a second thread would log duplicate
        # metrics and the first thread's handle would be lost to stop().
        if self._thread is not None and self._thread.is_alive():
            return

        if not is_apple_silicon():
            warnings.warn(
                "auto_log_gpu=True but not running on Apple Silicon. "
                "Apple GPU logging disabled."
            )
            return

        if not PSUTIL_AVAILABLE:
            try:
                _ensure_psutil()
            except ImportError:
                warnings.warn(
                    "auto_log_gpu=True but psutil not installed. "
                    "Install with: pip install psutil"
                )
                return

        self._thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self._thread.start()

    def stop(self):
        """Signal the monitoring thread to exit and wait up to 2 seconds for it."""
        self._stop_flag.set()
        if self._thread is not None:
            self._thread.join(timeout=2.0)

    def _monitor_loop(self):
        """Collect and log metrics until the stop flag is set."""
        while not self._stop_flag.is_set():
            try:
                metrics = collect_apple_metrics()
                if metrics:
                    self._run.log_system(metrics)
            except Exception:
                # Best-effort: a failed sample must not kill the monitor thread.
                pass

            # Event.wait doubles as an interruptible sleep, so stop() takes
            # effect without waiting out the full interval.
            self._stop_flag.wait(timeout=self._interval)
def log_apple_gpu(run: "Run | None" = None) -> dict:
    """
    Take one snapshot of Apple Silicon system metrics and log it to a run.

    Args:
        run: Run to log to. When omitted, the current run from the trackio
            context is used.

    Returns:
        dict: The collected system metrics. They are sent to the run only
        when the dict is non-empty.

    Raises:
        RuntimeError: If no run is given and there is no current run.

    Example:
        ```python
        import trackio
        run = trackio.init(project="my-project")
        trackio.log({"loss": 0.5})
        trackio.log_apple_gpu()
        ```
    """
    from trackio import context_vars

    target = run
    if target is None:
        target = context_vars.current_run.get()
        if target is None:
            raise RuntimeError("Call trackio.init() before trackio.log_apple_gpu().")

    snapshot = collect_apple_metrics()
    if snapshot:
        target.log_system(snapshot)
    return snapshot