| | import subprocess |
| | import re |
| | from collections import defaultdict |
| |
|
# Comma-separated Slurm partitions queried when check_gpus() runs sinfo itself.
_GPU_PARTITIONS = "ice-gpu,coe-gpu,pace-gpu,coc-gpu"

# Pulls the GPU count out of a GRES string such as "gpu:4" or "gpu:a100:8".
_GPU_COUNT_RE = re.compile(r'gpu:([^:]+:)?(\d+)')

# (substring, label) pairs, tried in order against the lowercased
# features + GRES text of a node. The first hit wins.
_GPU_MODELS = (
    ('h200', 'H200'),
    ('h100', 'H100'),
    ('a100', 'A100'),
    ('v100', 'V100'),
    ('rtx_6000', 'RTX6k'),
    ('l40s', 'L40S'),
    ('a40', 'A40'),
)

# Flag characters sinfo may append to a node state (e.g. "allocated*",
# "mixed-", "idle~"); they must be removed before comparing state names.
_STATE_FLAG_CHARS = '*~#!%$@^-+'

# A node whose state contains any of these substrings contributes no GPUs.
_UNAVAILABLE_STATE_MARKERS = ('drain', 'maint', 'down', 'reserved')


def _classify_gpu(gres, features):
    """Return the display label for a node's GPU model, or 'Other'."""
    haystack = f"{features} {gres}".lower()
    for needle, label in _GPU_MODELS:
        if needle in haystack:
            return label
    return 'Other'


def _estimate_usage(state, gpu_count):
    """Return ``(total, used)`` GPUs to count for a node in *state*.

    *state* must already be lowercased and stripped of sinfo flag suffixes.
    sinfo's node state does not say how many GPUs are busy, so a "mixed"
    node is estimated as half used.
    """
    if state == 'allocated':
        return gpu_count, gpu_count
    if state == 'mixed':
        return gpu_count, gpu_count // 2
    if any(marker in state for marker in _UNAVAILABLE_STATE_MARKERS):
        return 0, 0
    return gpu_count, 0


def _collect_stats(sinfo_output):
    """Aggregate ``partition|node|gres|state|features`` lines.

    Returns ``(per_partition, overall)``:

    * ``per_partition[partition][gpu_type]`` holds 'used', 'total' and
      'nodes' counters.
    * ``overall[gpu_type]`` holds 'used' and 'total' counters in which each
      node name is counted once, even if it is listed under several
      partitions.
    """
    per_partition = defaultdict(
        lambda: defaultdict(lambda: {'used': 0, 'total': 0, 'nodes': 0})
    )
    overall = defaultdict(lambda: {'used': 0, 'total': 0})
    seen_nodes = set()

    for line in sinfo_output.splitlines():
        fields = [field.strip() for field in line.split('|')]
        if len(fields) < 4:
            continue

        # sinfo marks the default partition with a trailing '*'.
        partition = fields[0].rstrip('*')
        node = fields[1]
        gres = fields[2]
        state = fields[3].lower().rstrip(_STATE_FLAG_CHARS)
        features = fields[4].lower() if len(fields) > 4 else ""

        count_match = _GPU_COUNT_RE.search(gres)
        if not count_match:
            continue

        total, used = _estimate_usage(state, int(count_match.group(2)))
        if total <= 0:
            # Unavailable node, or a GRES entry advertising zero GPUs.
            continue

        gpu_type = _classify_gpu(gres, features)

        bucket = per_partition[partition][gpu_type]
        bucket['total'] += total
        bucket['used'] += used
        bucket['nodes'] += 1

        # Count each physical node only once in the grand totals.
        if node and node in seen_nodes:
            continue
        seen_nodes.add(node)
        overall[gpu_type]['total'] += total
        overall[gpu_type]['used'] += used

    return per_partition, overall


def check_gpus(sinfo_output=None):
    """Print a per-partition and overall summary of GPU usage.

    Args:
        sinfo_output: Optional text in the format produced by
            ``sinfo -N --format=%P|%n|%G|%T|%f -h`` (one
            ``partition|node|gres|state|features`` line per node). When
            omitted, ``sinfo`` is executed for the partitions listed in
            ``_GPU_PARTITIONS``.

    Returns:
        None. The report is written to standard output. If ``sinfo`` cannot
        be run, an error line is printed and the function returns early.

    Used/available figures are estimates derived from node states only.
    """
    print("=" * 80)
    print("Aggregate PACE GPU Status Summary (via sinfo)")
    print("=" * 80)

    if sinfo_output is None:
        cmd = [
            "sinfo", "-p", _GPU_PARTITIONS, "-N",
            "--format=%P|%n|%G|%T|%f", "-h",
        ]
        try:
            sinfo_output = subprocess.check_output(cmd, universal_newlines=True)
        except (OSError, subprocess.SubprocessError) as e:
            print(f"Error running sinfo: {e}")
            return

    stats, grand_stats = _collect_stats(sinfo_output)

    print(f"{'Partition':<15} {'GPU Type':<12} {'Used*':<10} {'Avail*':<10} {'Total':<10} {'Nodes':<8}")
    print("-" * 80)

    for part in sorted(stats):
        for gtype in sorted(stats[part]):
            s = stats[part][gtype]
            avail = s['total'] - s['used']
            print(f"{part:<15} {gtype:<12} {s['used']:<10} {avail:<10} {s['total']:<10} {s['nodes']:<8}")
        print("-" * 80)

    print(f"{'TOTAL':<15} {'GPU Type':<12} {'Used*':<10} {'Avail*':<10} {'Total':<10}")
    print("-" * 80)
    for gtype in sorted(grand_stats):
        s = grand_stats[gtype]
        print(f"{'':<15} {gtype:<12} {s['used']:<10} {s['total'] - s['used']:<10} {s['total']:<10}")

    print("=" * 80)
    print("* Used/Avail are estimates based on node states (Allocated/Mixed).")
| |
|
# Script entry point: print the GPU summary when executed directly.
if __name__ == "__main__":
    check_gpus()
| |
|