gpu_benchmark / benchmark_fp16_full.py

Upload benchmark_fp16_full.py with huggingface_hub

3b4d89e verified 4 months ago

14.8 kB

	#!/usr/bin/env python3
	import torch
	import matplotlib.pyplot as plt
	import matplotlib.animation as animation
	from datetime import datetime
	import subprocess
	import time
	import psutil
	import re
	from collections import deque
	import threading

	class FP16BenchmarkFull:
	    """Adaptive FP16 matrix-multiply stress benchmark with live plots.

	    A background thread (``stress_gpu``) runs half-precision ``torch.mm``
	    chains on the first CUDA/ROCm device while polling temperature through
	    the ``sensors`` command and power/voltage through ``rocm-smi``.  Load is
	    raised while the GPU stays cool and lowered (or the run aborted) as the
	    temperature approaches ``max_temp``.  The main thread (``run``) animates
	    four matplotlib panels from the rolling history and prints a summary
	    after the plot window is closed.
	    """

	    # Reference figures this script reports as the theoretical FP16
	    # throughput and the specified TDP of the card under test.
	    THEORETICAL_FP16_TFLOPS = 26.88
	    SPEC_TDP_WATTS = 300

	    # "<number><optional spaces><unit>".  The quantifiers matter: a reading
	    # such as "+45.0°C" has no space before the unit and may have any number
	    # of fractional digits.
	    _TEMP_RE = re.compile(r'([+-]?\d+\.?\d*)\s*°C')
	    # The second alternative accepts a "(W): 21.0" / "(mV): 737" layout --
	    # presumably what some rocm-smi releases print; verify against the
	    # installed version.
	    _POWER_RE = re.compile(r'(\d+\.?\d*)\s*W|\(W\):\s*(\d+\.?\d*)')
	    _VOLTAGE_RE = re.compile(r'(\d+\.?\d*)\s*mV|\(mV\):\s*(\d+\.?\d*)')

	    def __init__(self):
	        """Create the rolling histories, load parameters and the 2x2 figure."""
	        self.max_temp = 85  # abort threshold, °C

	        # Rolling windows (last 200 samples) that feed the plots and summary.
	        self.temperatures = deque(maxlen=200)
	        self.tflops_history = deque(maxlen=200)
	        self.load_level = deque(maxlen=200)
	        self.power_watts = deque(maxlen=200)
	        self.voltage_volts = deque(maxlen=200)
	        self.current_amps = deque(maxlen=200)
	        self.peak_tflops = 0
	        self.peak_power = 0
	        self.running = True  # cleared to ask the worker thread to stop

	        # Current workload shape; adjusted by increase_load/decrease_load.
	        self.current_load = 1
	        self.matrix_size = 10240
	        self.num_operations = 1
	        self.num_streams = 1

	        self.fig, ((self.ax1, self.ax2), (self.ax3, self.ax4)) = plt.subplots(2, 2, figsize=(16, 10))
	        self.fig.suptitle('BENCHMARK FP16 COMPLETO - Radeon Pro VII', fontsize=16, weight='bold')
	        # Secondary y-axis of panel 4; rebuilt on each redraw (see update_plot).
	        self._ax4_twin = None

	        self.last_temp_check = time.monotonic()
	        self.temp_rising_fast = False

	    @classmethod
	    def _parse_edge_temp(cls, sensors_output):
	        """Return the last 'edge' temperature (°C) found in *sensors_output*, else 0."""
	        temp = 0
	        for line in sensors_output.splitlines():
	            if 'edge:' in line.lower():
	                match = cls._TEMP_RE.search(line)
	                if match:
	                    temp = float(match.group(1))
	        return temp

	    @classmethod
	    def _parse_power_voltage(cls, smi_output):
	        """Return ``(power_watts, voltage_volts)`` parsed from *smi_output*; 0 when absent."""
	        power = 0
	        voltage = 0
	        for line in smi_output.splitlines():
	            if 'Power' in line:
	                match = cls._POWER_RE.search(line)
	                if match:
	                    power = float(match.group(1) or match.group(2))
	            if 'volt' in line.lower():
	                match = cls._VOLTAGE_RE.search(line)
	                if match:
	                    # Reported in millivolts; convert to volts.
	                    voltage = float(match.group(1) or match.group(2)) / 1000
	        return power, voltage

	    def get_gpu_metrics(self):
	        """Read temperature, power and voltage via ``sensors`` and ``rocm-smi``.

	        Returns:
	            ``(temp_celsius, power_watts, voltage_volts)``.  Each value is 0
	            when the corresponding tool is missing, times out (0.5 s) or
	            prints nothing recognisable -- sampling is best effort.
	        """
	        temp = 0
	        power = 0
	        voltage = 0

	        # Temperature via sensors.
	        try:
	            result = subprocess.run(['sensors'], capture_output=True, text=True, timeout=0.5)
	            temp = self._parse_edge_temp(result.stdout)
	        except (OSError, subprocess.SubprocessError, ValueError):
	            pass  # tool unavailable / timed out / undecodable output

	        # Power and voltage via rocm-smi.
	        try:
	            result = subprocess.run(['rocm-smi', '--showpower', '--showvolt'],
	                                    capture_output=True, text=True, timeout=0.5)
	            power, voltage = self._parse_power_voltage(result.stdout)
	        except (OSError, subprocess.SubprocessError, ValueError):
	            pass  # tool unavailable / timed out / undecodable output

	        return temp, power, voltage

	    def check_system_health(self):
	        """Return False when the host looks overloaded or cannot be probed.

	        "Overloaded" means the 50 ms CPU sample took longer than 0.4 s to
	        return or reported more than 95 % CPU utilisation.
	        """
	        try:
	            start = time.monotonic()
	            cpu = psutil.cpu_percent(interval=0.05)
	            response = time.monotonic() - start
	        except Exception:
	            # A failing probe is treated as an unhealthy system.
	            return False
	        return not (response > 0.4 or cpu > 95)

	    def calculate_tflops(self, matrix_size, elapsed_time, num_ops, num_streams):
	        """Return achieved TFLOPS for square matmuls of side *matrix_size*.

	        Each matmul is counted as ``2 * n**3`` floating-point operations,
	        repeated *num_ops* times on each of *num_streams* streams.  Returns
	        0.0 when *elapsed_time* is not positive instead of dividing by zero.
	        """
	        if elapsed_time <= 0:
	            return 0.0
	        operations = 2 * (matrix_size ** 3) * num_ops * num_streams
	        return (operations / elapsed_time) / 1e12

	    def increase_load(self):
	        """Step the workload up one notch.

	        The load level is capped at 10.  From level 2 a stream is added (max
	        4), from level 4 five matmuls are added per round (until 30 is
	        reached), and from level 6 the matrix grows by 1024 up to 16384.
	        """
	        if self.current_load < 10:
	            self.current_load += 1

	        if self.current_load >= 2 and self.num_streams < 4:
	            self.num_streams += 1

	        if self.current_load >= 4 and self.num_operations < 30:
	            self.num_operations += 5

	        if self.current_load >= 6 and self.matrix_size < 16384:
	            self.matrix_size = min(self.matrix_size + 1024, 16384)

	    def decrease_load(self):
	        """Step the workload down one notch.

	        Lowers the load level (min 1), shrinks the matrix by 512 (min 8192)
	        and removes five matmuls per round (min 1).  The stream count is left
	        unchanged.
	        """
	        if self.current_load > 1:
	            self.current_load -= 1

	        if self.matrix_size > 8192:
	            self.matrix_size = max(self.matrix_size - 512, 8192)

	        if self.num_operations > 5:
	            self.num_operations = max(self.num_operations - 5, 1)

	    def _print_banner(self):
	        """Print device information and the limits used for this run."""
	        props = torch.cuda.get_device_properties(0)
	        print(f"🎯 GPU: {torch.cuda.get_device_name(0)}")
	        print(f"💾 VRAM: {props.total_memory / 1e9:.1f} GB")
	        print("🔥 Modo: FP16 (Half Precision)")
	        print(f"📊 TFLOPS Teórico FP16: ~{self.THEORETICAL_FP16_TFLOPS}")
	        print(f"⚡ TDP Especificado: {self.SPEC_TDP_WATTS}W")
	        print(f"⚠️ Limite: {self.max_temp}°C\n")

	    def _apply_thermal_policy(self, temp, last_temp):
	        """Throttle or abort based on a fresh temperature sample.

	        Returns False (after clearing ``self.running``) when *temp* has
	        reached ``max_temp``; True otherwise.
	        """
	        if self.temperatures:
	            # More than 2 °C between consecutive samples counts as a fast rise.
	            self.temp_rising_fast = (temp - last_temp) > 2

	        if temp >= self.max_temp:
	            print(f"\n🚨 TEMPERATURA: {temp}°C - ABORTANDO!")
	            self.running = False
	            return False

	        if temp >= self.max_temp - 3:
	            print(f"\n⚠️ ALERTA! Temp: {temp}°C - Reduzindo...")
	            self.decrease_load()
	            self.decrease_load()

	        if self.temp_rising_fast and temp > 75:
	            self.decrease_load()

	        return True

	    def _timed_matmul_round(self, device, streams):
	        """Run one round of chained FP16 matmuls and return the elapsed seconds.

	        The timed region includes generating the random input matrices.
	        """
	        size = self.matrix_size
	        torch.cuda.synchronize()
	        start = time.perf_counter()

	        for stream in streams[:self.num_streams]:
	            with torch.cuda.stream(stream):
	                a = torch.randn(size, size, device=device, dtype=torch.float16)
	                b = torch.randn(size, size, device=device, dtype=torch.float16)

	                # Feed each product back in so every matmul depends on the last.
	                for _ in range(self.num_operations):
	                    c = torch.mm(a, b)
	                    a = b
	                    b = c

	        torch.cuda.synchronize()
	        return time.perf_counter() - start

	    def stress_gpu(self):
	        """Worker loop: sample sensors, run matmul rounds, adapt the load.

	        Runs until ``self.running`` is cleared -- by this method itself on
	        over-temperature, an unhealthy host or an unexpected error, or by
	        ``run`` once the plot window is closed.  Returns immediately (and
	        clears ``self.running``) when no CUDA device is available.
	        """
	        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	        if device.type == 'cpu':
	            print("❌ ERRO: GPU não detectada!")
	            self.running = False
	            return

	        self._print_banner()

	        streams = [torch.cuda.Stream() for _ in range(4)]
	        last_temp = 0
	        stable_cycles = 0

	        while self.running:
	            now = time.monotonic()
	            if now - self.last_temp_check > 0.05:
	                temp, power, voltage = self.get_gpu_metrics()
	                self.last_temp_check = now
	                if not self._apply_thermal_policy(temp, last_temp):
	                    break
	                last_temp = temp
	            else:
	                # Too soon to re-sample: reuse the most recent readings.
	                temp = last_temp
	                power = self.power_watts[-1] if self.power_watts else 0
	                voltage = self.voltage_volts[-1] if self.voltage_volts else 0

	            # Current from I = P / V.
	            current = power / voltage if voltage > 0 else 0

	            if not self.check_system_health():
	                print("\n🚨 SISTEMA INSTÁVEL!")
	                self.running = False
	                break

	            try:
	                elapsed = self._timed_matmul_round(device, streams)
	                tflops = self.calculate_tflops(self.matrix_size, elapsed,
	                                               self.num_operations, self.num_streams)

	                self.temperatures.append(temp)
	                self.tflops_history.append(tflops)
	                self.load_level.append(self.current_load)
	                self.power_watts.append(power)
	                self.voltage_volts.append(voltage)
	                self.current_amps.append(current)

	                self.peak_tflops = max(self.peak_tflops, tflops)
	                self.peak_power = max(self.peak_power, power)

	                print(f"TFLOPS: {tflops:6.2f} | Temp: {temp:5.1f}°C | {power:6.1f}W | "
	                      f"{voltage:5.3f}V | {current:6.2f}A | Peak: {self.peak_tflops:.2f}",
	                      end='\r')

	                # Ramp up only after more than 10 consecutive rounds below 80 °C.
	                if temp < 75 and stable_cycles > 10:
	                    self.increase_load()
	                    stable_cycles = 0
	                elif temp < 80:
	                    stable_cycles += 1
	                else:
	                    stable_cycles = 0

	                time.sleep(0.02)

	            except RuntimeError as e:
	                if "out of memory" in str(e):
	                    print("\n⚠️ VRAM cheia - Reduzindo...")
	                    self.decrease_load()
	                    torch.cuda.empty_cache()
	                else:
	                    print(f"\n🚨 ERRO: {e}")
	                    self.running = False
	                    break
	            except Exception as e:
	                print(f"\n🚨 ERRO: {e}")
	                self.running = False
	                break

	    def update_plot(self, frame):
	        """Animation callback: redraw all four panels from the histories.

	        *frame* is supplied by ``FuncAnimation`` and is not used.
	        """
	        if not self.tflops_history:
	            return

	        # Drop the previous frame's secondary axis before drawing a new one;
	        # otherwise one extra axes object piles up in the figure per frame.
	        if self._ax4_twin is not None:
	            self._ax4_twin.remove()
	            self._ax4_twin = None

	        for ax in (self.ax1, self.ax2, self.ax3, self.ax4):
	            ax.clear()

	        # Panel 1: TFLOPS.
	        self.ax1.plot(list(self.tflops_history), 'b-', linewidth=2.5, label='TFLOPS FP16')
	        self.ax1.axhline(y=self.peak_tflops, color='g', linestyle='--', linewidth=2,
	                         label=f'Peak: {self.peak_tflops:.2f}')
	        self.ax1.axhline(y=self.THEORETICAL_FP16_TFLOPS, color='orange', linestyle=':',
	                         linewidth=2, label=f'Teórico: {self.THEORETICAL_FP16_TFLOPS}')
	        self.ax1.set_ylabel('TFLOPS', fontsize=11, weight='bold')
	        self.ax1.set_title('Performance FP16', fontsize=11, weight='bold')
	        self.ax1.legend(loc='upper left', fontsize=9)
	        self.ax1.grid(True, alpha=0.3)
	        self.ax1.set_ylim(0, 30)

	        # Panel 2: temperature, shaded where within 5 °C of the limit.
	        if self.temperatures:
	            temps = list(self.temperatures)
	            self.ax2.plot(temps, 'r-', linewidth=2.5, label='Temperatura')
	            self.ax2.axhline(y=self.max_temp, color='red', linestyle='--', linewidth=2,
	                             label=f'Limite: {self.max_temp}°C')
	            self.ax2.fill_between(range(len(temps)), temps, self.max_temp,
	                                  where=[t >= self.max_temp - 5 for t in temps],
	                                  alpha=0.3, color='orange')
	            self.ax2.set_ylabel('Temperatura (°C)', fontsize=11, weight='bold')
	            self.ax2.set_title('Monitoramento Térmico', fontsize=11, weight='bold')
	            self.ax2.legend(loc='upper left', fontsize=9)
	            self.ax2.grid(True, alpha=0.3)
	            self.ax2.set_ylim(30, 95)

	        # Panel 3: power.
	        if self.power_watts:
	            powers = list(self.power_watts)
	            self.ax3.plot(powers, 'green', linewidth=2.5, label='Potência')
	            self.ax3.axhline(y=self.SPEC_TDP_WATTS, color='red', linestyle='--', linewidth=2,
	                             label=f'TDP: {self.SPEC_TDP_WATTS}W')
	            self.ax3.axhline(y=self.peak_power, color='orange', linestyle=':', linewidth=2,
	                             label=f'Peak: {self.peak_power:.1f}W')
	            self.ax3.fill_between(range(len(powers)), powers, alpha=0.3, color='green')
	            self.ax3.set_ylabel('Potência (W)', fontsize=11, weight='bold')
	            self.ax3.set_xlabel('Amostras', fontsize=11, weight='bold')
	            self.ax3.set_title('Consumo Elétrico', fontsize=11, weight='bold')
	            self.ax3.legend(loc='upper left', fontsize=9)
	            self.ax3.grid(True, alpha=0.3)
	            self.ax3.set_ylim(0, 350)

	        # Panel 4: voltage (left axis) and current (right axis).
	        if self.voltage_volts and self.current_amps:
	            self._ax4_twin = self.ax4.twinx()

	            volts = list(self.voltage_volts)
	            amps = list(self.current_amps)

	            volt_lines = self.ax4.plot(volts, 'blue', linewidth=2.5, label='Tensão (V)')
	            self.ax4.set_ylabel('Tensão (V)', fontsize=11, weight='bold', color='blue')
	            self.ax4.tick_params(axis='y', labelcolor='blue')

	            amp_lines = self._ax4_twin.plot(amps, 'red', linewidth=2.5, label='Corrente (A)')
	            self._ax4_twin.set_ylabel('Corrente (A)', fontsize=11, weight='bold', color='red')
	            self._ax4_twin.tick_params(axis='y', labelcolor='red')

	            self.ax4.set_xlabel('Amostras', fontsize=11, weight='bold')
	            self.ax4.set_title('Tensão e Corrente', fontsize=11, weight='bold')
	            self.ax4.grid(True, alpha=0.3)

	            # One legend covering the lines of both y-axes.
	            lines = volt_lines + amp_lines
	            labels = [line.get_label() for line in lines]
	            self.ax4.legend(lines, labels, loc='upper left', fontsize=9)

	        # Once the worker has stopped, overlay the final result on panel 1.
	        if not self.running:
	            efficiency = (self.peak_tflops / self.THEORETICAL_FP16_TFLOPS) * 100
	            self.ax1.text(0.5, 0.5,
	                          f'🏆 {self.peak_tflops:.2f} TFLOPS\n{efficiency:.1f}%',
	                          transform=self.ax1.transAxes, fontsize=16,
	                          ha='center', va='center', color='darkgreen', weight='bold',
	                          bbox=dict(boxstyle='round,pad=0.8', facecolor='lightgreen', alpha=0.9))

	    def run(self):
	        """Start the worker, show the live plots, then print the summary.

	        Blocks in ``plt.show()`` until the plot window is closed.
	        """
	        stress_thread = threading.Thread(target=self.stress_gpu)
	        stress_thread.daemon = True
	        stress_thread.start()

	        # Keep a reference so the animation is not garbage-collected.
	        self._animation = animation.FuncAnimation(self.fig, self.update_plot,
	                                                  interval=300, cache_frame_data=False)
	        plt.tight_layout()
	        plt.show()

	        # The window is closed: ask the worker to stop before waiting for it,
	        # so it does not keep loading the GPU while the summary is printed.
	        self.running = False
	        stress_thread.join(timeout=2)

	        avg_power = sum(self.power_watts) / len(self.power_watts) if self.power_watts else 0
	        avg_voltage = sum(self.voltage_volts) / len(self.voltage_volts) if self.voltage_volts else 0
	        avg_current = sum(self.current_amps) / len(self.current_amps) if self.current_amps else 0
	        efficiency = (self.peak_tflops / self.THEORETICAL_FP16_TFLOPS) * 100

	        print(f"\n\n{'='*70}")
	        print(f"{'BENCHMARK FP16 COMPLETO - RESULTADO':^70}")
	        print(f"{'='*70}")
	        print(f"🏆 PEAK TFLOPS (FP16): {self.peak_tflops:.2f}")
	        print(f"📊 Teórico FP16: {self.THEORETICAL_FP16_TFLOPS} TFLOPS")
	        print(f"📈 Eficiência: {efficiency:.1f}%")
	        print(f"🌡️ Temp Máxima: {max(self.temperatures) if self.temperatures else 0:.1f}°C")
	        print(f"⚡ Potência Peak: {self.peak_power:.1f}W")
	        print(f"⚡ Potência Média: {avg_power:.1f}W")
	        print(f"🔌 Tensão Média: {avg_voltage:.3f}V")
	        print(f"🔌 Corrente Média: {avg_current:.2f}A")
	        print(f"🔥 Carga Máxima: {max(self.load_level) if self.load_level else 0}/10")
	        print(f"{'='*70}\n")

	if __name__ == "__main__":
	    # Script entry point: build the benchmark and run it until the plot
	    # window is closed.
	    FP16BenchmarkFull().run()