import re
import subprocess
from collections import defaultdict

# Partitions queried by check_gpus().
_PARTITIONS = "ice-gpu,coe-gpu,pace-gpu,coc-gpu"

# Extracts the GPU count of the first "gpu" entry in a GRES string.
# The optional type name may not contain ':', '(' or ',' so that untyped
# entries such as "gpu:1(S:0)" or "gpu:2,mps:200" yield 1 and 2 instead of
# picking up a number from the socket list or from the next GRES entry.
_GPU_GRES_RE = re.compile(r'gpu:(?:[^:(,]+:)?(\d+)')

# (substring, label, also_search_features), tested in this order.
# 'rtx_6000' is only looked up in the GRES string, never in the features.
_GPU_TYPES = (
    ('h200', 'H200', True),
    ('h100', 'H100', True),
    ('a100', 'A100', True),
    ('v100', 'V100', True),
    ('rtx_6000', 'RTX6k', False),
    ('l40s', 'L40S', True),
    ('a40', 'A40', True),
)

# Flag characters sinfo may append to a node state (e.g. "mixed-", "idle~").
_STATE_FLAG_CHARS = '*~#!%$@^-'

# A node whose state contains any of these is not counted at all.
_UNAVAILABLE_MARKERS = ('drain', 'maint', 'down', 'reserved')


def _run_sinfo():
    """Run sinfo with one line per node and return its raw text output."""
    cmd = ["sinfo", "-p", _PARTITIONS, "-N",
           "--format=%P|%n|%G|%T|%f", "-h"]
    return subprocess.check_output(cmd, universal_newlines=True)


def _gpu_count(gres):
    """Return the GPU count found in a GRES string, or None if absent."""
    match = _GPU_GRES_RE.search(gres)
    return int(match.group(1)) if match else None


def _gpu_type(gres, features):
    """Map a node's GRES/feature strings to a display label ('Other' if unknown)."""
    gres_lower = gres.lower()
    for needle, label, use_features in _GPU_TYPES:
        if needle in gres_lower or (use_features and needle in features):
            return label
    return 'Other'


def _estimate_usage(state, total):
    """Return (countable_total, estimated_used) GPUs for a node in ``state``."""
    # Drop trailing flag characters so "allocated$" or "mixed-" are still
    # recognised as allocated / mixed instead of being treated as idle.
    base = state.lower().rstrip(_STATE_FLAG_CHARS)
    if base == 'allocated':
        return total, total
    if base == 'mixed':
        # sinfo's node state does not say how many GPUs are busy on a
        # partially allocated node; assume half as an approximation.
        return total, total // 2
    if any(marker in base for marker in _UNAVAILABLE_MARKERS):
        return 0, 0
    return total, 0


def _collect_stats(sinfo_output):
    """Aggregate per-node sinfo lines into per-partition and overall totals.

    Returns ``(stats, grand)`` where ``stats[partition][gpu_type]`` holds
    ``used``/``total``/``nodes`` and ``grand[gpu_type]`` holds ``used``/``total``.
    """
    stats = defaultdict(
        lambda: defaultdict(lambda: {'used': 0, 'total': 0, 'nodes': 0}))
    grand = defaultdict(lambda: {'used': 0, 'total': 0})
    seen_nodes = set()

    for line in sinfo_output.splitlines():
        fields = [field.strip() for field in line.split('|')]
        if len(fields) < 4:
            continue
        partition, node, gres, state = fields[:4]
        features = fields[4].lower() if len(fields) > 4 else ""

        gpus_on_node = _gpu_count(gres)
        if gpus_on_node is None:
            continue

        total, used = _estimate_usage(state, gpus_on_node)
        if total <= 0:
            continue

        gpu_type = _gpu_type(gres, features)
        bucket = stats[partition][gpu_type]
        bucket['total'] += total
        bucket['used'] += used
        bucket['nodes'] += 1

        # `sinfo -N` prints a node once per partition it belongs to; count
        # each physical node only once in the overall totals.
        if not node or node not in seen_nodes:
            seen_nodes.add(node)
            grand[gpu_type]['total'] += total
            grand[gpu_type]['used'] += used

    return stats, grand


def _print_report(stats, grand):
    """Print the per-partition table followed by the per-GPU-type totals."""
    print(f"{'Partition':<15} {'GPU Type':<12} {'Used*':<10} {'Avail*':<10} {'Total':<10} {'Nodes':<8}")
    print("-" * 80)
    for part in sorted(stats):
        for gtype in sorted(stats[part]):
            s = stats[part][gtype]
            avail = s['total'] - s['used']
            print(f"{part:<15} {gtype:<12} {s['used']:<10} {avail:<10} {s['total']:<10} {s['nodes']:<8}")
    print("-" * 80)

    print(f"{'TOTAL':<15} {'GPU Type':<12} {'Used*':<10} {'Avail*':<10} {'Total':<10}")
    print("-" * 80)
    for gtype in sorted(grand):
        s = grand[gtype]
        print(f"{'':<15} {gtype:<12} {s['used']:<10} {s['total'] - s['used']:<10} {s['total']:<10}")
    print("="*80)
    print("* Used/Avail are estimates based on node states (Allocated/Mixed).")


def check_gpus(sinfo_output=None):
    """Print a GPU usage summary for the configured Slurm partitions.

    Args:
        sinfo_output: Optional pre-captured text in the
            ``partition|node|gres|state|features`` layout, one node per
            line. When ``None`` (the default) ``sinfo`` is executed.

    Usage is an estimate derived from node states only: allocated nodes
    count as fully used, mixed nodes as half used, and drained, down,
    maintenance or reserved nodes are left out entirely. If ``sinfo``
    cannot be run, an error message is printed and nothing else is shown.
    """
    print("="*80)
    print("Aggregate PACE GPU Status Summary (via sinfo)")
    print("="*80)

    if sinfo_output is None:
        try:
            sinfo_output = _run_sinfo()
        except (OSError, subprocess.SubprocessError, UnicodeError) as e:
            print(f"Error running sinfo: {e}")
            return

    stats, grand = _collect_stats(sinfo_output)
    _print_report(stats, grand)


if __name__ == '__main__':
    check_gpus()