# gpu_benchmark / benchmark_extreme.py
# meccatronis's picture
# Upload benchmark_extreme.py with huggingface_hub
# 6fe8c45 verified
#!/usr/bin/env python3
import torch
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from datetime import datetime
import subprocess
import time
import psutil
import re
from collections import deque
import threading
import numpy as np
class ExtremeBenchmark:
    """Incremental GPU stress benchmark with a live matplotlib dashboard.

    A background thread (``stress_gpu``) runs chains of ``torch.mm`` on up to
    ``MAX_STREAMS`` CUDA streams and raises or lowers the workload depending
    on the GPU temperature reported by the ``sensors`` command.  The main
    thread (``run``) animates three panels (TFLOPS, temperature, load level)
    from rolling 200-sample histories and prints a summary when the plot
    window is closed.
    """

    # FP32 reference throughput (TFLOPS) used for the efficiency figures.
    THEORETICAL_TFLOPS = 13.44
    # CUDA streams allocated up front; ``num_streams`` never exceeds this.
    MAX_STREAMS = 4

    def __init__(self):
        """Set up the initial load configuration, histories and the figure."""
        self.max_temp = 85  # °C; a reading at or above this aborts the test
        self.temperatures = deque(maxlen=200)
        self.tflops_history = deque(maxlen=200)
        self.load_level = deque(maxlen=200)
        self.peak_tflops = 0
        # Whole-run maxima: the deques above only keep the last 200 samples.
        self.peak_temp = 0
        self.peak_load = 0
        self.running = True

        # Incremental configuration
        self.current_load = 1  # 1 to 10
        self.matrix_size = 8192
        self.num_operations = 1
        self.num_streams = 1

        self.fig, (self.ax1, self.ax2, self.ax3) = plt.subplots(3, 1, figsize=(14, 10))
        self.fig.suptitle('BENCHMARK EXTREMO - Radeon Pro VII', fontsize=16, weight='bold')

        self.last_temp_check = time.monotonic()
        self.temp_rising_fast = False
        # Monotonic timestamps bracketing the stress loop (None until it runs).
        self.started_at = None
        self.finished_at = None

    def get_gpu_temp(self):
        """Return the GPU ``edge`` temperature in °C parsed from ``sensors``.

        Returns 0 when the command is missing, fails, times out (0.5 s) or
        prints no ``edge:`` line with a ``°C`` value.
        """
        try:
            result = subprocess.run(
                ['sensors'], capture_output=True, text=True, timeout=0.5
            )
        except (OSError, subprocess.SubprocessError, ValueError):
            # Missing binary, timeout or undecodable output: best-effort 0.
            return 0
        for line in result.stdout.splitlines():
            if 'edge:' in line.lower():
                match = re.search(r'([+-]?\d+\.?\d*)\s*°C', line)
                if match:
                    return float(match.group(1))
        return 0

    def check_system_health(self):
        """Return False when the host looks overloaded or cannot be probed.

        The system is considered unhealthy if a 50 ms CPU sample takes more
        than 0.4 s to come back or reports more than 95 % utilisation.
        """
        try:
            start = time.perf_counter()
            cpu = psutil.cpu_percent(interval=0.05)
            response = time.perf_counter() - start
        except Exception:
            # Fail safe: if the probe itself breaks, treat it as unhealthy.
            return False
        return response <= 0.4 and cpu <= 95

    def calculate_tflops(self, matrix_size, elapsed_time, num_ops, num_streams):
        """Return the TFLOPS achieved by square matmuls.

        Each ``matrix_size`` x ``matrix_size`` product is counted as
        ``2 * matrix_size ** 3`` floating-point operations, repeated
        ``num_ops`` times on each of ``num_streams`` streams.  A
        non-positive ``elapsed_time`` yields 0.0 instead of dividing by zero.
        """
        if elapsed_time <= 0:
            return 0.0
        operations = 2 * (matrix_size ** 3) * num_ops * num_streams
        return (operations / elapsed_time) / 1e12

    def increase_load(self):
        """Raise the load level by one step (up to 10) and scale the workload.

        From level 2 a stream is added (up to ``MAX_STREAMS``), from level 4
        five more matmuls per burst are added while below 20, and from
        level 6 the matrix grows by 1024 up to 14336.
        """
        if self.current_load >= 10:
            return
        self.current_load += 1
        if self.current_load >= 2 and self.num_streams < self.MAX_STREAMS:
            self.num_streams += 1
        if self.current_load >= 4 and self.num_operations < 20:
            self.num_operations += 5
        if self.current_load >= 6 and self.matrix_size < 14336:
            self.matrix_size = min(self.matrix_size + 1024, 14336)

    def decrease_load(self):
        """Lower the load level by one step (down to 1) for safety.

        The matrix shrinks by 512 (not below 8192) and the matmul count by 5
        (only while above 5).  The number of streams is left untouched.
        """
        if self.current_load <= 1:
            return
        self.current_load -= 1
        if self.matrix_size > 8192:
            self.matrix_size = max(self.matrix_size - 512, 8192)
        if self.num_operations > 5:
            self.num_operations = max(self.num_operations - 5, 1)

    def _apply_thermal_policy(self, temp, last_temp):
        """React to a fresh temperature reading; return False to abort."""
        # Track the maximum here so the reading that triggers an abort counts.
        self.peak_temp = max(self.peak_temp, temp)

        # Rapid-rise detection only once at least one sample was recorded.
        if self.temperatures:
            self.temp_rising_fast = (temp - last_temp) > 2

        # EMERGENCY: dangerous temperature
        if temp >= self.max_temp:
            print(f"\n🚨 EMERGÊNCIA! Temperatura: {temp}°C - ABORTANDO!")
            self.running = False
            return False

        # WARNING: close to the limit, shed two load steps
        if temp >= self.max_temp - 3:
            print(f"\n⚠️ ALERTA! Temperatura: {temp}°C - Reduzindo carga...")
            self.decrease_load()
            self.decrease_load()

        # Temperature climbing quickly while already warm
        if self.temp_rising_fast and temp > 75:
            self.decrease_load()
        return True

    def _run_workload(self, device, streams, matrix_size, num_operations):
        """Run one timed burst of chained matmuls; return elapsed seconds.

        Kept as a separate function so its tensors go out of scope as soon
        as it returns or raises.
        """
        torch.cuda.synchronize()
        start = time.perf_counter()
        for stream in streams:
            with torch.cuda.stream(stream):
                a = torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32)
                b = torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32)
                for _ in range(num_operations):
                    a, b = b, torch.mm(a, b)
        torch.cuda.synchronize()
        return time.perf_counter() - start

    def stress_gpu(self):
        """Worker loop: load the GPU and adapt the load until stopped.

        Stops (clearing ``self.running``) when no GPU is available, the
        temperature reaches ``max_temp``, the host looks unstable or an
        unexpected error occurs.  It also exits when another thread clears
        ``self.running``.
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if device.type == 'cpu':
            print("❌ ERRO: GPU não detectada!")
            self.running = False
            return

        props = torch.cuda.get_device_properties(0)
        print(f"🎯 GPU: {torch.cuda.get_device_name(0)}")
        print(f"💾 VRAM: {props.total_memory / 1e9:.1f} GB")
        print("🔥 Iniciando teste EXTREMO com aumento gradual...")
        print(f"⚠️ Limite de temperatura: {self.max_temp}°C")
        print("⚠️ Monitoramento de estabilidade: ATIVO\n")

        streams = [torch.cuda.Stream() for _ in range(self.MAX_STREAMS)]
        last_temp = 0
        stable_cycles = 0
        sensor_warned = False
        self.started_at = time.monotonic()

        try:
            while self.running:
                # Re-read the temperature at most every 50 ms
                now = time.monotonic()
                if now - self.last_temp_check > 0.05:
                    temp = self.get_gpu_temp()
                    self.last_temp_check = now
                    if temp == 0 and not sensor_warned:
                        # get_gpu_temp() reports failure as 0, which would
                        # otherwise silently disable every thermal check.
                        print("\n⚠️ Sensor de temperatura indisponível - "
                              "proteção térmica INATIVA!")
                        sensor_warned = True
                    if not self._apply_thermal_policy(temp, last_temp):
                        break
                    last_temp = temp
                else:
                    temp = last_temp

                # Check host health
                if not self.check_system_health():
                    print("\n🚨 SISTEMA INSTÁVEL - ABORTANDO!")
                    self.running = False
                    break

                out_of_memory = False
                try:
                    # Snapshot the configuration so the TFLOPS figure matches
                    # exactly what was executed.
                    size = self.matrix_size
                    ops = self.num_operations
                    n_streams = self.num_streams

                    elapsed = self._run_workload(device, streams[:n_streams], size, ops)
                    tflops = self.calculate_tflops(size, elapsed, ops, n_streams)

                    self.temperatures.append(temp)
                    self.tflops_history.append(tflops)
                    self.load_level.append(self.current_load)
                    self.peak_load = max(self.peak_load, self.current_load)
                    if tflops > self.peak_tflops:
                        self.peak_tflops = tflops

                    print(f"TFLOPS: {tflops:6.2f} | Temp: {temp:5.1f}°C | Load: {self.current_load}/10 | "
                          f"Matrix: {self.matrix_size} | Ops: {self.num_operations} | Streams: {self.num_streams} | "
                          f"Peak: {self.peak_tflops:.2f}", end='\r')

                    # Gradual ramp-up: more than 10 stable cycles below 75 °C
                    if temp < 75 and stable_cycles > 10:
                        self.increase_load()
                        stable_cycles = 0
                    elif temp < 80:
                        stable_cycles += 1
                    else:
                        stable_cycles = 0

                    time.sleep(0.02)  # 20 ms
                except RuntimeError as e:
                    if "out of memory" in str(e):
                        out_of_memory = True
                    else:
                        print(f"\n🚨 ERRO: {e}")
                        self.running = False
                        break
                except Exception as e:
                    print(f"\n🚨 ERRO CRÍTICO: {e}")
                    self.running = False
                    break

                if out_of_memory:
                    # Handled after the except block: by now the exception,
                    # its traceback and the failed burst's tensors have been
                    # released, so empty_cache() can return that memory.
                    print("\n⚠️ VRAM cheia - Reduzindo carga...")
                    self.decrease_load()
                    torch.cuda.empty_cache()
        finally:
            self.finished_at = time.monotonic()

    def update_plot(self, frame):
        """Redraw the three panels from the rolling histories.

        ``frame`` is the animation frame argument and is not used.  Nothing
        is drawn until at least one TFLOPS sample exists.
        """
        if not self.tflops_history:
            return

        for axis in (self.ax1, self.ax2, self.ax3):
            axis.clear()

        reference = self.THEORETICAL_TFLOPS

        # Panel 1: TFLOPS
        self.ax1.plot(list(self.tflops_history), 'b-', linewidth=2.5, label='TFLOPS Real')
        self.ax1.axhline(y=self.peak_tflops, color='g', linestyle='--', linewidth=2,
                         label=f'Peak: {self.peak_tflops:.2f}')
        self.ax1.axhline(y=reference, color='orange', linestyle=':', linewidth=2,
                         label=f'Teórico: {reference}')
        self.ax1.set_ylabel('TFLOPS', fontsize=12, weight='bold')
        self.ax1.set_title('Performance Computacional', fontsize=12, weight='bold')
        self.ax1.legend(loc='upper left')
        self.ax1.grid(True, alpha=0.3)
        self.ax1.set_ylim(0, 15)

        # Panel 2: temperature
        temps = list(self.temperatures)
        if temps:
            alert_level = self.max_temp - 5
            self.ax2.plot(temps, 'r-', linewidth=2.5, label='Temperatura')
            self.ax2.axhline(y=self.max_temp, color='red', linestyle='--', linewidth=2,
                             label=f'LIMITE: {self.max_temp}°C')
            # Label derived from the drawn level instead of a fixed "80°C".
            self.ax2.axhline(y=alert_level, color='orange', linestyle=':', linewidth=1.5,
                             label=f'Alerta: {alert_level}°C')
            self.ax2.fill_between(range(len(temps)), temps, self.max_temp,
                                  where=[t >= alert_level for t in temps],
                                  alpha=0.3, color='orange')
            self.ax2.set_ylabel('Temperatura (°C)', fontsize=12, weight='bold')
            self.ax2.set_title('Monitoramento Térmico', fontsize=12, weight='bold')
            self.ax2.legend(loc='upper left')
            self.ax2.grid(True, alpha=0.3)
            self.ax2.set_ylim(30, 95)

        # Panel 3: load level
        loads = list(self.load_level)
        if loads:
            self.ax3.plot(loads, 'purple', linewidth=2.5, label='Nível de Stress')
            self.ax3.fill_between(range(len(loads)), loads, alpha=0.3, color='purple')
            self.ax3.set_ylabel('Carga (1-10)', fontsize=12, weight='bold')
            self.ax3.set_xlabel('Amostras', fontsize=12, weight='bold')
            self.ax3.set_title('Intensidade do Teste', fontsize=12, weight='bold')
            self.ax3.legend(loc='upper left')
            self.ax3.grid(True, alpha=0.3)
            self.ax3.set_ylim(0, 11)

        # Final overlay once the worker has stopped
        if not self.running:
            efficiency = (self.peak_tflops / reference) * 100
            self.ax1.text(0.5, 0.5,
                          f'🏆 PEAK: {self.peak_tflops:.2f} TFLOPS\n'
                          f'📊 Eficiência: {efficiency:.1f}%',
                          transform=self.ax1.transAxes, fontsize=20,
                          ha='center', va='center', color='darkgreen', weight='bold',
                          bbox=dict(boxstyle='round,pad=1', facecolor='lightgreen', alpha=0.9))

    def run(self):
        """Start the stress thread, show the live plot, then print a summary.

        Blocks in ``plt.show()`` until the window is closed.  Closing the
        window asks the worker to stop and waits up to 2 s for it.
        """
        stress_thread = threading.Thread(target=self.stress_gpu, daemon=True)
        stress_thread.start()

        # Keep a reference on the instance so the animation stays alive.
        self._animation = animation.FuncAnimation(self.fig, self.update_plot,
                                                  interval=300, cache_frame_data=False)
        plt.tight_layout()
        plt.show()

        # The window is gone: stop loading the GPU instead of letting the
        # worker run on while the summary is printed.
        self.running = False
        stress_thread.join(timeout=2)

        # Real elapsed time of the stress loop (0 if it never started).  If
        # the worker is still finishing a long burst, measure up to now.
        if self.started_at is None:
            duration = 0.0
        else:
            end = self.finished_at if self.finished_at is not None else time.monotonic()
            duration = end - self.started_at

        reference = self.THEORETICAL_TFLOPS
        print(f"\n\n{'='*70}")
        print(f"{'RESULTADO FINAL DO BENCHMARK EXTREMO':^70}")
        print(f"{'='*70}")
        print(f"🏆 PEAK TFLOPS ALCANÇADO: {self.peak_tflops:.2f}")
        print(f"📊 TFLOPS Teórico (FP32): {reference}")
        print(f"📈 Eficiência Real: {(self.peak_tflops / reference) * 100:.1f}%")
        print(f"🌡️ Temperatura Máxima: {self.peak_temp:.1f}°C")
        print(f"🔥 Nível de Carga Máximo: {self.peak_load}/10")
        print(f"⏱️ Duração do Teste: {duration:.1f}s")
        print(f"{'='*70}\n")
# Script entry point: build the benchmark and run it until the plot window
# is closed.
if __name__ == "__main__":
    ExtremeBenchmark().run()