# world_model/wm/test/check_gpus.py
# t1an's picture
# Upload folder using huggingface_hub
# f17ae24 verified
import subprocess
import re
from collections import defaultdict
# Partitions included in the report, in the comma-separated form `sinfo -p` takes.
_GPU_PARTITIONS = "ice-gpu,coe-gpu,pace-gpu,coc-gpu"

# One '|'-delimited record per node: partition|hostname|GRES|state|features.
_SINFO_FORMAT = "--format=%P|%n|%G|%T|%f"

# Extracts the GPU count from a GRES string such as "gpu:4" or "gpu:a100:2(S:0-1)".
_GPU_COUNT_RE = re.compile(r'gpu:([^:]+:)?(\d+)')

# (label, keyword, also_search_features), checked in order; first hit wins.
# 'rtx_6000' is only looked for in the GRES string, never in the feature list.
_GPU_TYPES = (
    ('H200', 'h200', True),
    ('H100', 'h100', True),
    ('A100', 'a100', True),
    ('V100', 'v100', True),
    ('RTX6k', 'rtx_6000', False),
    ('L40S', 'l40s', True),
    ('A40', 'a40', True),
)

# Flag characters sinfo may append to a node state (e.g. "allocated*", "mixed-").
_STATE_FLAGS = '*~#!%$@^-+'

# A node whose state contains any of these substrings is left out of the totals.
_UNAVAILABLE_MARKERS = ('drain', 'maint', 'down', 'reserved')


def _classify_gpu_type(gres, features):
    """Return the display label for a node's GPU model, or 'Other' if unknown.

    `features` is expected to be lowercased already; `gres` is lowercased here.
    """
    gres_lower = gres.lower()
    for label, keyword, check_features in _GPU_TYPES:
        if keyword in gres_lower or (check_features and keyword in features):
            return label
    return 'Other'


def _estimate_node_usage(state, gpu_count):
    """Return the (total, used) GPU counts to book for a node in `state`.

    `state` must already be lowercased with trailing flag characters removed.
    """
    if state == 'allocated':
        return gpu_count, gpu_count
    if state == 'mixed':
        # sinfo's state alone does not say how many GPUs are busy on a
        # partially allocated node, so assume half (rounded down).
        return gpu_count, gpu_count // 2
    if any(marker in state for marker in _UNAVAILABLE_MARKERS):
        # Unavailable nodes contribute nothing to either column.
        return 0, 0
    return gpu_count, 0


def check_gpus():
    """Print a per-partition and overall GPU usage summary based on `sinfo`.

    Runs `sinfo -N` once over the configured GPU partitions, parses one record
    per node, and prints two tables to stdout: usage per (partition, GPU type)
    and a grand total per GPU type. In the grand total each node is counted
    once, even when it is listed under several partitions.

    Used/available figures are estimates derived from node states only:
    allocated nodes count as fully used, mixed nodes as half used, and
    drained/maintenance/down/reserved nodes are excluded entirely.

    Returns:
        None. If `sinfo` cannot be run, an error line is printed and the
        function returns early.
    """
    print("=" * 80)
    print("Aggregate PACE GPU Status Summary (via sinfo)")
    print("=" * 80)

    # -N yields one line per node, so no node-list expansion is needed.
    cmd = ["sinfo", "-p", _GPU_PARTITIONS, "-N", _SINFO_FORMAT, "-h"]
    try:
        result = subprocess.check_output(cmd, universal_newlines=True)
    except (OSError, subprocess.SubprocessError, UnicodeError) as e:
        print(f"Error running sinfo: {e}")
        return

    # stats[partition][gpu_type] = {used, total, nodes}
    stats = defaultdict(lambda: defaultdict(lambda: {'used': 0, 'total': 0, 'nodes': 0}))
    # grand_stats[gpu_type] = {used, total}, with every node counted once.
    grand_stats = defaultdict(lambda: {'used': 0, 'total': 0})
    counted_nodes = set()

    for line in result.splitlines():
        parts = line.split('|')
        if len(parts) < 4:
            continue

        # %P marks the default partition with a trailing '*'; drop it so the
        # report shows the plain partition name.
        partition = parts[0].strip().rstrip('*')
        node = parts[1].strip()
        gres = parts[2].strip()
        # Strip state flag suffixes so "allocated*" / "mixed-" are recognised.
        state = parts[3].strip().lower().rstrip(_STATE_FLAGS)
        features = parts[4].strip().lower() if len(parts) > 4 else ""

        gpu_match = _GPU_COUNT_RE.search(gres)
        if not gpu_match:
            continue  # node advertises no GPUs

        gpu_type = _classify_gpu_type(gres, features)
        total_on_node, used_on_node = _estimate_node_usage(
            state, int(gpu_match.group(2))
        )
        if total_on_node <= 0:
            continue

        bucket = stats[partition][gpu_type]
        bucket['total'] += total_on_node
        bucket['used'] += used_on_node
        bucket['nodes'] += 1

        # A node listed under several partitions appears once per partition;
        # only its first appearance feeds the grand total.
        if not node or node not in counted_nodes:
            counted_nodes.add(node)
            grand_stats[gpu_type]['total'] += total_on_node
            grand_stats[gpu_type]['used'] += used_on_node

    print(f"{'Partition':<15} {'GPU Type':<12} {'Used*':<10} {'Avail*':<10} {'Total':<10} {'Nodes':<8}")
    print("-" * 80)
    for part in sorted(stats):
        for gtype in sorted(stats[part]):
            s = stats[part][gtype]
            avail = s['total'] - s['used']
            print(f"{part:<15} {gtype:<12} {s['used']:<10} {avail:<10} {s['total']:<10} {s['nodes']:<8}")
    print("-" * 80)

    # Grand total per GPU type
    print(f"{'TOTAL':<15} {'GPU Type':<12} {'Used*':<10} {'Avail*':<10} {'Total':<10}")
    print("-" * 80)
    for gtype in sorted(grand_stats):
        s = grand_stats[gtype]
        print(f"{'':<15} {gtype:<12} {s['used']:<10} {s['total'] - s['used']:<10} {s['total']:<10}")
    print("=" * 80)
    print("* Used/Avail are estimates based on node states (Allocated/Mixed).")
# Print the report only when run as a script, not when imported as a module.
if __name__ == '__main__':
    check_gpus()