File size: 4,673 Bytes
f17ae24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import subprocess
import re
from collections import defaultdict

# Partitions inspected by check_gpus(), in the comma-separated form `sinfo -p` expects.
_GPU_PARTITIONS = "ice-gpu,coe-gpu,pace-gpu,coc-gpu"

# One GPU entry of a GRES list: "gpu:<count>" or "gpu:<type>:<count>".
# The optional type must not contain "," so it cannot run into the next
# GRES entry (e.g. "gpu:4,mps:400" has to yield 4, not 400).
_GPU_GRES_RE = re.compile(r'(?:^|,)gpu:(?:[^:,()]+:)?(\d+)')

# Flag characters sinfo may append to a node state ("mixed-", "allocated+",
# "idle*", ...). They are stripped before comparing against state names.
_STATE_FLAG_CHARS = "*~#!%$@^-+"

# Substrings identifying states whose GPUs are not counted at all.
_UNAVAILABLE_STATE_MARKERS = ('drain', 'maint', 'down', 'reserved')

# (label, token, also_search_features) in priority order; the first hit wins.
# 'rtx_6000' is only looked up in the GRES string, never in the features.
_GPU_MODELS = (
    ('H200', 'h200', True),
    ('H100', 'h100', True),
    ('A100', 'a100', True),
    ('V100', 'v100', True),
    ('RTX6k', 'rtx_6000', False),
    ('L40S', 'l40s', True),
    ('A40', 'a40', True),
)


def _classify_gpu(gres, features):
    """Return the display label of a node's GPU model, or 'Other' if unknown.

    `features` is expected to be lower-cased already; `gres` is lower-cased here.
    """
    gres = gres.lower()
    for label, token, search_features in _GPU_MODELS:
        if token in gres or (search_features and token in features):
            return label
    return 'Other'


def _estimate_usage(state, gpus_on_node):
    """Return the estimated ``(used, total)`` GPU counts for a single node.

    Fully allocated nodes count every GPU as used, mixed nodes are
    approximated as half used, and drained/maintenance/down/reserved nodes
    contribute nothing. Any other state counts as fully free.
    """
    base_state = state.lower().rstrip(_STATE_FLAG_CHARS)
    if base_state == 'allocated':
        return gpus_on_node, gpus_on_node
    if base_state == 'mixed':
        # sinfo's state alone does not say how many GPUs are busy.
        return gpus_on_node // 2, gpus_on_node
    if any(marker in base_state for marker in _UNAVAILABLE_STATE_MARKERS):
        return 0, 0
    return 0, gpus_on_node


def check_gpus(sinfo_output=None):
    """Print a per-partition and overall GPU usage summary built from sinfo.

    Args:
        sinfo_output: Optional pre-captured text in the
            ``%P|%n|%G|%T|%f`` format (one node per line, no header).
            When ``None`` (the default) ``sinfo`` is executed to obtain it.

    Returns:
        None. The report is written to stdout. If ``sinfo`` cannot be run,
        an error line is printed and the function returns early.

    Used/available figures are estimates derived from node states only.
    The TOTAL table counts each node once even when it belongs to several
    partitions; the per-partition table lists it under each of them.
    """
    rule = "=" * 80
    print(rule)
    print("Aggregate PACE GPU Status Summary (via sinfo)")
    print(rule)

    if sinfo_output is None:
        # -N yields one line per node (per partition), so no nodelist
        # expansion is needed; -h suppresses the header row.
        cmd = ["sinfo", "-p", _GPU_PARTITIONS, "-N", "--format=%P|%n|%G|%T|%f", "-h"]
        try:
            sinfo_output = subprocess.check_output(cmd, universal_newlines=True)
        except (OSError, subprocess.SubprocessError) as e:
            print(f"Error running sinfo: {e}")
            return

    # stats[partition][gpu_type] = {used, total, nodes}
    stats = defaultdict(lambda: defaultdict(lambda: {'used': 0, 'total': 0, 'nodes': 0}))
    # grand_stats[gpu_type] = {used, total}, de-duplicated by node name
    grand_stats = defaultdict(lambda: {'used': 0, 'total': 0})
    counted_nodes = set()

    for line in sinfo_output.splitlines():
        parts = [field.strip() for field in line.split('|')]
        if len(parts) < 4:
            continue

        # sinfo marks the default partition with a trailing "*".
        partition = parts[0].rstrip('*')
        node, gres, state = parts[1], parts[2], parts[3]
        features = parts[4].lower() if len(parts) > 4 else ""

        gpu_match = _GPU_GRES_RE.search(gres)
        if not gpu_match:
            continue

        used, total = _estimate_usage(state, int(gpu_match.group(1)))
        if total <= 0:
            continue

        gpu_type = _classify_gpu(gres, features)

        bucket = stats[partition][gpu_type]
        bucket['total'] += total
        bucket['used'] += used
        bucket['nodes'] += 1

        # A node shared by several partitions appears once per partition;
        # its physical GPUs must enter the grand total only once.
        if not node or node not in counted_nodes:
            counted_nodes.add(node)
            grand_stats[gpu_type]['total'] += total
            grand_stats[gpu_type]['used'] += used

    print(f"{'Partition':<15} {'GPU Type':<12} {'Used*':<10} {'Avail*':<10} {'Total':<10} {'Nodes':<8}")
    print("-" * 80)

    for part in sorted(stats):
        for gtype in sorted(stats[part]):
            s = stats[part][gtype]
            avail = s['total'] - s['used']
            print(f"{part:<15} {gtype:<12} {s['used']:<10} {avail:<10} {s['total']:<10} {s['nodes']:<8}")
        print("-" * 80)

    print(f"{'TOTAL':<15} {'GPU Type':<12} {'Used*':<10} {'Avail*':<10} {'Total':<10}")
    print("-" * 80)
    for gtype in sorted(grand_stats):
        s = grand_stats[gtype]
        print(f"{'':<15} {gtype:<12} {s['used']:<10} {s['total'] - s['used']:<10} {s['total']:<10}")

    print(rule)
    print("* Used/Avail are estimates based on node states (Allocated/Mixed).")

# Script entry point: emit the GPU summary only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    check_gpus()