# File size: 4,673 Bytes

# f17ae24

import subprocess
import re
from collections import defaultdict

# sinfo query: one line per node (-N), no header (-h), '|'-delimited fields
# partition | hostname | GRES | extended state | features.
_SINFO_CMD = [
    "sinfo", "-p", "ice-gpu,coe-gpu,pace-gpu,coc-gpu",
    "-N", "--format=%P|%n|%G|%T|%f", "-h",
]

# Matches "gpu:<count>" or "gpu:<type>:<count>".  The type segment must not
# contain ':' ',' or '(' so that an untyped entry carrying socket info, such
# as "gpu:2(S:0-1)", is read as 2 GPUs instead of type "2(S" / count 0.
_GPU_COUNT_RE = re.compile(r'gpu:(?:[^:,(]+:)?(\d+)')

# (substring, display label, also search the feature list?) in priority order.
# 'rtx_6000' is only looked up in the GRES string.
_GPU_TYPE_RULES = (
    ('h200', 'H200', True),
    ('h100', 'H100', True),
    ('a100', 'A100', True),
    ('v100', 'V100', True),
    ('rtx_6000', 'RTX6k', False),
    ('l40s', 'L40S', True),
    ('a40', 'A40', True),
)

# Single-character flags sinfo may append to a node state (e.g. "mixed-",
# "down*"); see NODE STATE CODES in the sinfo man page.
_STATE_FLAG_CHARS = '*~#!%$@^-'

# A node whose state contains any of these substrings is left out entirely.
_UNAVAILABLE_STATE_MARKERS = ('drain', 'maint', 'down', 'reserved')


def _classify_gpu(gres, features):
    """Return the display label for a node's GPU model, or 'Other'.

    ``features`` is expected to be lower-cased already; ``gres`` is
    lower-cased here before the substring search.
    """
    gres_lower = gres.lower()
    for needle, label, check_features in _GPU_TYPE_RULES:
        if needle in gres_lower or (check_features and needle in features):
            return label
    return 'Other'


def _estimate_node_usage(state, gpu_count):
    """Return ``(used, total)`` GPUs for one node from its flag-free state.

    sinfo's node state does not say how many GPUs a "mixed" node has handed
    out, so half of them (rounded down) are assumed busy.
    """
    if state == 'allocated':
        return gpu_count, gpu_count
    if state == 'mixed':
        return gpu_count // 2, gpu_count
    if any(marker in state for marker in _UNAVAILABLE_STATE_MARKERS):
        # Drained / down / maintenance / reserved nodes offer no capacity.
        return 0, 0
    return 0, gpu_count


def _collect_gpu_stats(sinfo_output):
    """Aggregate per-node sinfo lines into stats[partition][gpu_type].

    Each leaf is a dict with the keys 'used', 'total' and 'nodes'.  Lines with
    fewer than four fields, nodes without a GPU GRES entry, and nodes that
    contribute zero GPUs are skipped.
    """
    stats = defaultdict(lambda: defaultdict(lambda: {'used': 0, 'total': 0, 'nodes': 0}))

    for line in sinfo_output.splitlines():
        parts = [field.strip() for field in line.split('|')]
        if len(parts) < 4:
            continue

        partition, _node, gres, raw_state = parts[:4]
        features = parts[4].lower() if len(parts) > 4 else ""

        gpu_match = _GPU_COUNT_RE.search(gres)
        if not gpu_match:
            continue

        # Drop trailing state flags so "mixed-" / "allocated*" are still
        # recognised as mixed / allocated rather than counted as idle.
        state = raw_state.lower().rstrip(_STATE_FLAG_CHARS)
        used, total = _estimate_node_usage(state, int(gpu_match.group(1)))
        if total <= 0:
            continue

        bucket = stats[partition][_classify_gpu(gres, features)]
        bucket['total'] += total
        bucket['used'] += used
        bucket['nodes'] += 1

    return stats


def _print_report(stats):
    """Print the per-partition table followed by per-GPU-type grand totals."""
    print(f"{'Partition':<15} {'GPU Type':<12} {'Used*':<10} {'Avail*':<10} {'Total':<10} {'Nodes':<8}")
    print("-" * 80)

    for part in sorted(stats):
        for gtype in sorted(stats[part]):
            s = stats[part][gtype]
            avail = s['total'] - s['used']
            print(f"{part:<15} {gtype:<12} {s['used']:<10} {avail:<10} {s['total']:<10} {s['nodes']:<8}")
        print("-" * 80)

    # Grand total per GPU type, summed across partitions.
    grand_stats = defaultdict(lambda: {'used': 0, 'total': 0})
    for per_type in stats.values():
        for gtype, s in per_type.items():
            grand_stats[gtype]['used'] += s['used']
            grand_stats[gtype]['total'] += s['total']

    print(f"{'TOTAL':<15} {'GPU Type':<12} {'Used*':<10} {'Avail*':<10} {'Total':<10}")
    print("-" * 80)
    for gtype in sorted(grand_stats):
        s = grand_stats[gtype]
        print(f"{'':<15} {gtype:<12} {s['used']:<10} {s['total'] - s['used']:<10} {s['total']:<10}")

    print("="*80)
    print("* Used/Avail are estimates based on node states (Allocated/Mixed).")


def check_gpus(sinfo_output=None):
    """Print an estimated GPU usage summary per partition and GPU type.

    Args:
        sinfo_output: Optional text in the ``%P|%n|%G|%T|%f`` per-node format
            (one node per line).  When omitted, ``sinfo`` is invoked once to
            obtain it.  Supplying it allows summarising captured output
            without access to the scheduler.

    Returns:
        None.  The report is written to stdout.  If ``sinfo`` cannot be run,
        an error line is printed instead of the tables.

    Used/available figures are estimates: "allocated" nodes count as fully
    used, "mixed" nodes as half used, and drained/down/maintenance/reserved
    nodes are excluded from the totals.
    """
    print("="*80)
    print("Aggregate PACE GPU Status Summary (via sinfo)")
    print("="*80)

    if sinfo_output is None:
        try:
            sinfo_output = subprocess.check_output(_SINFO_CMD, universal_newlines=True)
        except (OSError, subprocess.SubprocessError, UnicodeError) as e:
            # Missing binary, non-zero exit status, or undecodable output.
            print(f"Error running sinfo: {e}")
            return

    _print_report(_collect_gpu_stats(sinfo_output))

# Script entry point: print the GPU summary only when executed directly,
# not when this module is imported.
if "__main__" == __name__:
    check_gpus()