Upload folder using huggingface_hub

f17ae24 verified 9 days ago

4.67 kB

	import subprocess
	import re
	from collections import defaultdict

	# Partitions queried when check_gpus() runs sinfo itself.
	_SINFO_PARTITIONS = "ice-gpu,coe-gpu,pace-gpu,coc-gpu"

	# One GPU entry of a GRES string, e.g. "gpu:4", "gpu:v100:4(S:0-1)".
	# The optional type token may not cross ',' or '(' so that "gpu:4(S:0-1)"
	# yields 4 (not the socket index) and "gpu:2,mps:200" yields 2 (not 200).
	_GPU_COUNT_RE = re.compile(r'gpu:(?:[^:,(]+:)?(\d+)')

	# (label, needle, also_search_features), evaluated in order; first hit wins.
	_GPU_TYPE_RULES = (
	    ('H200', 'h200', True),
	    ('H100', 'h100', True),
	    ('A100', 'a100', True),
	    ('V100', 'v100', True),
	    ('RTX6k', 'rtx_6000', False),
	    ('L40S', 'l40s', True),
	    ('A40', 'a40', True),
	)

	# Single-character flags sinfo may append to a node state (e.g. "mixed-").
	_STATE_FLAG_CHARS = '*~#!%$@^-'

	# Substrings marking a node whose GPUs are not counted at all.
	_UNAVAILABLE_MARKERS = ('drain', 'maint', 'down', 'reserved')


	def _classify_gpu_type(gres, features):
	    """Return the display label for a node's GPU model, or 'Other'."""
	    gres_lc = gres.lower()
	    for label, needle, use_features in _GPU_TYPE_RULES:
	        if needle in gres_lc or (use_features and needle in features):
	            return label
	    return 'Other'


	def _estimate_usage(state, gpus_on_node):
	    """Return (total, used) GPUs to count for a node in the given state.

	    sinfo only reports node-level state, so usage is an estimate: an
	    allocated node counts as fully used, a mixed node as half used, and
	    drained/maintenance/down/reserved nodes are excluded entirely.
	    """
	    base = state.lower().rstrip(_STATE_FLAG_CHARS)
	    if base.startswith('allocated'):  # also covers "allocated+"
	        return gpus_on_node, gpus_on_node
	    if base == 'mixed':
	        return gpus_on_node, gpus_on_node // 2
	    if any(marker in base for marker in _UNAVAILABLE_MARKERS):
	        return 0, 0
	    return gpus_on_node, 0


	def check_gpus(sinfo_output=None) -> None:
	    """Print a per-partition and overall GPU usage summary from sinfo.

	    Args:
	        sinfo_output: Optional text in the format produced by
	            ``sinfo -N -h --format=%P|%n|%G|%T|%f`` (one node per line,
	            pipe-delimited: partition, hostname, GRES, state, features).
	            When omitted, sinfo is executed for the PACE GPU partitions.

	    Returns:
	        None. The report is written to stdout. If sinfo cannot be run, an
	        error line is printed after the banner and the function returns.
	    """
	    print("="*80)
	    print("Aggregate PACE GPU Status Summary (via sinfo)")
	    print("="*80)

	    if sinfo_output is None:
	        # -N gives one line per node, so no node-list expansion is needed.
	        cmd = ["sinfo", "-p", _SINFO_PARTITIONS, "-N",
	               "--format=%P|%n|%G|%T|%f", "-h"]
	        try:
	            sinfo_output = subprocess.check_output(cmd, universal_newlines=True)
	        except (OSError, subprocess.SubprocessError, UnicodeDecodeError) as e:
	            print(f"Error running sinfo: {e}")
	            return

	    # stats[partition][gpu_type] = {used, total, nodes}
	    stats = defaultdict(lambda: defaultdict(lambda: {'used': 0, 'total': 0, 'nodes': 0}))
	    # Grand totals count every node once, even when it belongs to several
	    # partitions (sinfo -N repeats such a node once per partition).
	    grand_stats = defaultdict(lambda: {'used': 0, 'total': 0})
	    seen_nodes = set()

	    for line in sinfo_output.splitlines():
	        # maxsplit keeps any stray '|' inside the trailing features field.
	        parts = line.split('|', 4)
	        if len(parts) < 4:
	            continue

	        # sinfo marks the default partition with a trailing '*'.
	        partition = parts[0].strip().rstrip('*')
	        node = parts[1].strip()
	        gres = parts[2].strip()
	        state = parts[3].strip()
	        features = parts[4].strip().lower() if len(parts) > 4 else ""

	        gpu_match = _GPU_COUNT_RE.search(gres)
	        if not gpu_match:
	            continue

	        total_gpus_on_node, used_on_node = _estimate_usage(
	            state, int(gpu_match.group(1)))
	        if total_gpus_on_node <= 0:
	            # Unavailable node or zero GPUs: nothing to report.
	            continue

	        gpu_type = _classify_gpu_type(gres, features)

	        bucket = stats[partition][gpu_type]
	        bucket['total'] += total_gpus_on_node
	        bucket['used'] += used_on_node
	        bucket['nodes'] += 1

	        if node not in seen_nodes:
	            seen_nodes.add(node)
	            grand_stats[gpu_type]['total'] += total_gpus_on_node
	            grand_stats[gpu_type]['used'] += used_on_node

	    print(f"{'Partition':<15} {'GPU Type':<12} {'Used':<10} {'Avail':<10} {'Total':<10} {'Nodes':<8}")
	    print("-" * 80)

	    for part in sorted(stats):
	        for gtype in sorted(stats[part]):
	            s = stats[part][gtype]
	            avail = s['total'] - s['used']
	            print(f"{part:<15} {gtype:<12} {s['used']:<10} {avail:<10} {s['total']:<10} {s['nodes']:<8}")
	        print("-" * 80)

	    # Grand total per GPU type
	    print(f"{'TOTAL':<15} {'GPU Type':<12} {'Used':<10} {'Avail':<10} {'Total':<10}")
	    print("-" * 80)
	    for gtype in sorted(grand_stats):
	        s = grand_stats[gtype]
	        print(f"{'':<15} {gtype:<12} {s['used']:<10} {s['total'] - s['used']:<10} {s['total']:<10}")

	    print("="*80)
	    print("* Used/Avail are estimates based on node states (Allocated/Mixed).")

	# Script entry point: print the GPU summary when run directly, not on import.
	if __name__ == "__main__":
	    check_gpus()