# gpu_benchmark / benchmark_fp16_full.py
# Uploaded by meccatronis with huggingface_hub ("Upload benchmark_fp16_full.py with huggingface_hub").
# Commit 3b4d89e (verified).
#!/usr/bin/env python3
import torch
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from datetime import datetime
import subprocess
import time
import psutil
import re
from collections import deque
import threading
class FP16BenchmarkFull:
    """Adaptive FP16 matrix-multiply stress benchmark with live telemetry plots.

    A background thread runs half-precision ``torch.mm`` batches on the GPU,
    sampling temperature through ``sensors`` and power/voltage through
    ``rocm-smi``.  The load is raised while the card stays cool and lowered
    (or the run aborted) as it approaches ``max_temp``.  The main thread
    animates four matplotlib panels and prints a summary once the window is
    closed.
    """

    # Reference figures used for the plots and the final report.
    THEORETICAL_TFLOPS = 26.88
    TDP_WATTS = 300

    # Number of samples kept per telemetry series.
    HISTORY_LEN = 200

    # Bounds of the adaptive load controller.
    MAX_LOAD = 10
    MAX_STREAMS = 4
    MAX_OPERATIONS = 30
    MAX_MATRIX_SIZE = 16384
    MIN_MATRIX_SIZE = 8192

    def __init__(self):
        """Create the telemetry buffers, the initial load and the figure."""
        self.max_temp = 85
        self.temperatures = deque(maxlen=self.HISTORY_LEN)
        self.tflops_history = deque(maxlen=self.HISTORY_LEN)
        self.load_level = deque(maxlen=self.HISTORY_LEN)
        self.power_watts = deque(maxlen=self.HISTORY_LEN)
        self.voltage_volts = deque(maxlen=self.HISTORY_LEN)
        self.current_amps = deque(maxlen=self.HISTORY_LEN)
        self.peak_tflops = 0
        self.peak_power = 0
        self.running = True
        self.current_load = 1
        self.matrix_size = 10240
        self.num_operations = 1
        self.num_streams = 1
        self.fig, ((self.ax1, self.ax2), (self.ax3, self.ax4)) = plt.subplots(
            2, 2, figsize=(16, 10))
        self.fig.suptitle('BENCHMARK FP16 COMPLETO - Radeon Pro VII',
                          fontsize=16, weight='bold')
        # Secondary y-axis of panel 4; recreated on every redraw.
        self._ax4_twin = None
        # Monotonic timestamp of the last sensor poll.
        self.last_temp_check = time.monotonic()
        self.temp_rising_fast = False

    # ------------------------------------------------------------------
    # Telemetry
    # ------------------------------------------------------------------
    @staticmethod
    def _run_tool(command):
        """Return the stdout of *command*, or ``''`` if it cannot be read.

        Sampling is best-effort: a missing binary, a timeout or undecodable
        output must not stop the benchmark, so those cases yield an empty
        string instead of raising.
        """
        try:
            completed = subprocess.run(command, capture_output=True,
                                       text=True, timeout=0.5)
        except (OSError, subprocess.SubprocessError, UnicodeError):
            return ""
        return completed.stdout

    @staticmethod
    def _parse_edge_temp(text):
        """Return the last ``edge:`` temperature (°C) in *text*, or 0."""
        temp = 0
        for line in text.split('\n'):
            if 'edge:' not in line.lower():
                continue
            match = re.search(r'([+-]?\d+\.?\d*)\s*°C', line)
            if match:
                temp = float(match.group(1))
        return temp

    @staticmethod
    def _parse_power_voltage(text):
        """Return ``(watts, volts)`` parsed from *text*; 0 for missing values.

        Power is read from lines containing ``Power`` as ``<number> W`` and
        voltage from lines containing ``volt`` (any case) as ``<number> mV``,
        converted to volts.  The last matching line wins.
        """
        power = 0
        voltage = 0
        for line in text.split('\n'):
            if 'Power' in line:
                match = re.search(r'(\d+\.?\d*)\s*W', line)
                if match:
                    power = float(match.group(1))
            if 'volt' in line.lower():
                match = re.search(r'(\d+\.?\d*)\s*mV', line)
                if match:
                    voltage = float(match.group(1)) / 1000  # mV -> V
        return power, voltage

    def get_gpu_metrics(self):
        """Read temperature, power and voltage via ``sensors`` and ``rocm-smi``.

        Returns:
            tuple: ``(temp_celsius, power_watts, voltage_volts)``.  A value
            is 0 when the corresponding tool is unavailable or its output
            could not be parsed.
        """
        temp = self._parse_edge_temp(self._run_tool(['sensors']))
        power, voltage = self._parse_power_voltage(
            self._run_tool(['rocm-smi', '--showpower', '--showvolt']))
        return temp, power, voltage

    def check_system_health(self):
        """Return True while the host still looks responsive.

        The system is considered unhealthy when the 50 ms CPU sample takes
        more than 0.4 s to return, when CPU usage exceeds 95 %, or when the
        sample itself fails.
        """
        try:
            started = time.perf_counter()
            cpu = psutil.cpu_percent(interval=0.05)
            latency = time.perf_counter() - started
        except Exception:
            # Fail safe: if the host cannot be probed, stop the stress run.
            return False
        return not (latency > 0.4 or cpu > 95)

    def calculate_tflops(self, matrix_size, elapsed_time, num_ops, num_streams):
        """Convert a timed batch of square matmuls into TFLOPS.

        Each ``n x n`` matmul is counted as ``2 * n**3`` operations.

        Returns:
            float: achieved TFLOPS, or 0.0 when *elapsed_time* is not
            positive (avoids a division by zero on a degenerate timing).
        """
        if elapsed_time <= 0:
            return 0.0
        operations = 2 * (matrix_size ** 3) * num_ops * num_streams
        return (operations / elapsed_time) / 1e12

    # ------------------------------------------------------------------
    # Load controller
    # ------------------------------------------------------------------
    def increase_load(self):
        """Raise the load level by one step and scale the workload with it.

        Streams grow from level 2, matmuls per stream from level 4 and the
        matrix size from level 6; every knob is clamped to its maximum.
        """
        if self.current_load >= self.MAX_LOAD:
            return
        self.current_load += 1
        if self.current_load >= 2 and self.num_streams < self.MAX_STREAMS:
            self.num_streams += 1
        if self.current_load >= 4 and self.num_operations < self.MAX_OPERATIONS:
            # Clamp so the step of 5 cannot overshoot the bound.
            self.num_operations = min(self.num_operations + 5,
                                      self.MAX_OPERATIONS)
        if self.current_load >= 6 and self.matrix_size < self.MAX_MATRIX_SIZE:
            self.matrix_size = min(self.matrix_size + 1024,
                                   self.MAX_MATRIX_SIZE)

    def decrease_load(self):
        """Lower the load level by one step, shrinking matrix size and op count.

        The number of streams is left untouched.
        """
        if self.current_load <= 1:
            return
        self.current_load -= 1
        if self.matrix_size > self.MIN_MATRIX_SIZE:
            self.matrix_size = max(self.matrix_size - 512,
                                   self.MIN_MATRIX_SIZE)
        if self.num_operations > 5:
            self.num_operations = max(self.num_operations - 5, 1)

    # ------------------------------------------------------------------
    # Stress loop (runs in the background thread)
    # ------------------------------------------------------------------
    def _apply_thermal_policy(self, temp, last_temp):
        """React to a fresh temperature sample; return False to abort the run."""
        if self.temperatures:
            # More than 2 °C between consecutive polls counts as a fast rise.
            self.temp_rising_fast = (temp - last_temp) > 2
        if temp >= self.max_temp:
            print(f"\n🚨 TEMPERATURA: {temp}°C - ABORTANDO!")
            self.running = False
            return False
        if temp >= self.max_temp - 3:
            print(f"\n⚠️ ALERTA! Temp: {temp}°C - Reduzindo...")
            self.decrease_load()
            self.decrease_load()
        if self.temp_rising_fast and temp > 75:
            self.decrease_load()
        return True

    def _run_matmul_batch(self, device, streams):
        """Run one batch of chained FP16 matmuls and return its duration (s)."""
        torch.cuda.synchronize()
        started = time.perf_counter()
        size = self.matrix_size
        for stream in streams[:self.num_streams]:
            with torch.cuda.stream(stream):
                a = torch.randn(size, size, device=device, dtype=torch.float16)
                b = torch.randn(size, size, device=device, dtype=torch.float16)
                for _ in range(self.num_operations):
                    # Feed each product back in as the next operand.
                    a, b = b, torch.mm(a, b)
        torch.cuda.synchronize()
        return time.perf_counter() - started

    def stress_gpu(self):
        """Drive the GPU until ``self.running`` is cleared or a limit is hit.

        Each iteration polls the sensors (at most every 50 ms), applies the
        thermal policy, checks host health, times one matmul batch and
        appends the sample to the telemetry buffers.  Clears
        ``self.running`` on every abort path.
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if device.type == 'cpu':
            print("❌ ERRO: GPU não detectada!")
            self.running = False
            return

        props = torch.cuda.get_device_properties(0)
        print(f"🎯 GPU: {torch.cuda.get_device_name(0)}")
        print(f"💾 VRAM: {props.total_memory / 1e9:.1f} GB")
        print("🔥 Modo: FP16 (Half Precision)")
        print(f"📊 TFLOPS Teórico FP16: ~{self.THEORETICAL_TFLOPS:.2f}")
        print(f"⚡ TDP Especificado: {self.TDP_WATTS}W")
        print(f"⚠️ Limite: {self.max_temp}°C\n")

        streams = [torch.cuda.Stream() for _ in range(self.MAX_STREAMS)]
        last_temp = 0
        stable_cycles = 0

        while self.running:
            now = time.monotonic()
            if now - self.last_temp_check > 0.05:
                temp, power, voltage = self.get_gpu_metrics()
                self.last_temp_check = now
                if not self._apply_thermal_policy(temp, last_temp):
                    break
                last_temp = temp
            else:
                # Between polls, reuse the most recent readings.
                temp = last_temp
                power = self.power_watts[-1] if self.power_watts else 0
                voltage = self.voltage_volts[-1] if self.voltage_volts else 0
            # Current derived from I = P / V.
            current = power / voltage if voltage > 0 else 0

            if not self.check_system_health():
                print("\n🚨 SISTEMA INSTÁVEL!")
                self.running = False
                break

            try:
                elapsed = self._run_matmul_batch(device, streams)
                tflops = self.calculate_tflops(self.matrix_size, elapsed,
                                               self.num_operations,
                                               self.num_streams)
                self.temperatures.append(temp)
                self.tflops_history.append(tflops)
                self.load_level.append(self.current_load)
                self.power_watts.append(power)
                self.voltage_volts.append(voltage)
                self.current_amps.append(current)
                self.peak_tflops = max(self.peak_tflops, tflops)
                self.peak_power = max(self.peak_power, power)
                print(f"TFLOPS: {tflops:6.2f} | Temp: {temp:5.1f}°C | {power:6.1f}W | {voltage:5.3f}V | {current:6.2f}A | Peak: {self.peak_tflops:.2f}", end='\r')

                # Ramp up only after more than 10 consecutive cool cycles.
                if temp < 75 and stable_cycles > 10:
                    self.increase_load()
                    stable_cycles = 0
                elif temp < 80:
                    stable_cycles += 1
                else:
                    stable_cycles = 0
                time.sleep(0.02)
            except RuntimeError as e:
                if "out of memory" in str(e):
                    print("\n⚠️ VRAM cheia - Reduzindo...")
                    self.decrease_load()
                    torch.cuda.empty_cache()
                else:
                    print(f"\n🚨 ERRO: {e}")
                    self.running = False
                    break
            except Exception as e:
                print(f"\n🚨 ERRO: {e}")
                self.running = False
                break

    # ------------------------------------------------------------------
    # Plotting
    # ------------------------------------------------------------------
    def update_plot(self, frame):
        """Redraw the four panels from the current telemetry buffers.

        Args:
            frame: frame counter supplied by ``FuncAnimation``; unused.
        """
        if not self.tflops_history:
            return

        # Snapshot the buffers once; the stress thread keeps appending.
        tflops = list(self.tflops_history)
        temps = list(self.temperatures)
        powers = list(self.power_watts)
        volts = list(self.voltage_volts)
        amps = list(self.current_amps)

        # Drop the previous secondary axis, otherwise a new twin would be
        # stacked on the figure at every refresh.
        if self._ax4_twin is not None:
            self._ax4_twin.remove()
            self._ax4_twin = None
        for ax in (self.ax1, self.ax2, self.ax3, self.ax4):
            ax.clear()

        # Panel 1: TFLOPS
        self.ax1.plot(tflops, 'b-', linewidth=2.5, label='TFLOPS FP16')
        self.ax1.axhline(y=self.peak_tflops, color='g', linestyle='--',
                         linewidth=2, label=f'Peak: {self.peak_tflops:.2f}')
        self.ax1.axhline(y=self.THEORETICAL_TFLOPS, color='orange',
                         linestyle=':', linewidth=2,
                         label=f'Teórico: {self.THEORETICAL_TFLOPS:.2f}')
        self.ax1.set_ylabel('TFLOPS', fontsize=11, weight='bold')
        self.ax1.set_title('Performance FP16', fontsize=11, weight='bold')
        self.ax1.legend(loc='upper left', fontsize=9)
        self.ax1.grid(True, alpha=0.3)
        self.ax1.set_ylim(0, 30)

        # Panel 2: temperature
        if temps:
            self.ax2.plot(temps, 'r-', linewidth=2.5, label='Temperatura')
            self.ax2.axhline(y=self.max_temp, color='red', linestyle='--',
                             linewidth=2, label=f'Limite: {self.max_temp}°C')
            # Shade samples within 5 °C of the limit.
            self.ax2.fill_between(range(len(temps)), temps, self.max_temp,
                                  where=[t >= self.max_temp - 5 for t in temps],
                                  alpha=0.3, color='orange')
            self.ax2.set_ylabel('Temperatura (°C)', fontsize=11, weight='bold')
            self.ax2.set_title('Monitoramento Térmico', fontsize=11, weight='bold')
            self.ax2.legend(loc='upper left', fontsize=9)
            self.ax2.grid(True, alpha=0.3)
            self.ax2.set_ylim(30, 95)

        # Panel 3: power
        if powers:
            self.ax3.plot(powers, 'green', linewidth=2.5, label='Potência')
            self.ax3.axhline(y=self.TDP_WATTS, color='red', linestyle='--',
                             linewidth=2, label=f'TDP: {self.TDP_WATTS}W')
            self.ax3.axhline(y=self.peak_power, color='orange', linestyle=':',
                             linewidth=2, label=f'Peak: {self.peak_power:.1f}W')
            self.ax3.fill_between(range(len(powers)), powers,
                                  alpha=0.3, color='green')
            self.ax3.set_ylabel('Potência (W)', fontsize=11, weight='bold')
            self.ax3.set_xlabel('Amostras', fontsize=11, weight='bold')
            self.ax3.set_title('Consumo Elétrico', fontsize=11, weight='bold')
            self.ax3.legend(loc='upper left', fontsize=9)
            self.ax3.grid(True, alpha=0.3)
            self.ax3.set_ylim(0, 350)

        # Panel 4: voltage (left axis) and current (right axis)
        if volts and amps:
            twin = self.ax4.twinx()
            self._ax4_twin = twin
            volt_lines = self.ax4.plot(volts, 'blue', linewidth=2.5,
                                       label='Tensão (V)')
            self.ax4.set_ylabel('Tensão (V)', fontsize=11, weight='bold',
                                color='blue')
            self.ax4.tick_params(axis='y', labelcolor='blue')
            amp_lines = twin.plot(amps, 'red', linewidth=2.5,
                                  label='Corrente (A)')
            twin.set_ylabel('Corrente (A)', fontsize=11, weight='bold',
                            color='red')
            twin.tick_params(axis='y', labelcolor='red')
            self.ax4.set_xlabel('Amostras', fontsize=11, weight='bold')
            self.ax4.set_title('Tensão e Corrente', fontsize=11, weight='bold')
            self.ax4.grid(True, alpha=0.3)
            # One legend covering the lines of both y-axes.
            lines = volt_lines + amp_lines
            self.ax4.legend(lines, [ln.get_label() for ln in lines],
                            loc='upper left', fontsize=9)

        # Once the stress loop has stopped, overlay the final score.
        if not self.running:
            efficiency = (self.peak_tflops / self.THEORETICAL_TFLOPS) * 100
            self.ax1.text(0.5, 0.5,
                          f'🏆 {self.peak_tflops:.2f} TFLOPS\n{efficiency:.1f}%',
                          transform=self.ax1.transAxes, fontsize=16,
                          ha='center', va='center', color='darkgreen',
                          weight='bold',
                          bbox=dict(boxstyle='round,pad=0.8',
                                    facecolor='lightgreen', alpha=0.9))

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------
    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of *values*, or 0 when empty."""
        return sum(values) / len(values) if values else 0

    def run(self):
        """Start the stress thread, show the live plots, then print a report.

        Blocks in ``plt.show()`` until the window is closed.  The stress
        thread is then told to stop and given up to 2 s to finish before the
        summary is printed.
        """
        stress_thread = threading.Thread(target=self.stress_gpu, daemon=True)
        stress_thread.start()
        # Keep a reference: the animation stops if it is garbage-collected.
        ani = animation.FuncAnimation(self.fig, self.update_plot,
                                      interval=300, cache_frame_data=False)
        plt.tight_layout()
        try:
            plt.show()
        finally:
            # Closing the window ends the benchmark; without this the stress
            # loop would keep running while the report is computed.
            self.running = False
            stress_thread.join(timeout=2)
        del ani

        temps = list(self.temperatures)
        loads = list(self.load_level)
        avg_power = self._mean(list(self.power_watts))
        avg_voltage = self._mean(list(self.voltage_volts))
        avg_current = self._mean(list(self.current_amps))

        print(f"\n\n{'='*70}")
        print(f"{'BENCHMARK FP16 COMPLETO - RESULTADO':^70}")
        print(f"{'='*70}")
        print(f"🏆 PEAK TFLOPS (FP16): {self.peak_tflops:.2f}")
        print(f"📊 Teórico FP16: {self.THEORETICAL_TFLOPS:.2f} TFLOPS")
        print(f"📈 Eficiência: {(self.peak_tflops / self.THEORETICAL_TFLOPS) * 100:.1f}%")
        print(f"🌡️ Temp Máxima: {max(temps) if temps else 0:.1f}°C")
        print(f"⚡ Potência Peak: {self.peak_power:.1f}W")
        print(f"⚡ Potência Média: {avg_power:.1f}W")
        print(f"🔌 Tensão Média: {avg_voltage:.3f}V")
        print(f"🔌 Corrente Média: {avg_current:.2f}A")
        print(f"🔥 Carga Máxima: {max(loads) if loads else 0}/10")
        print(f"{'='*70}\n")
if __name__ == "__main__":
    # Script entry point: build the benchmark and hand control to its run loop.
    FP16BenchmarkFull().run()