gpu_benchmark / benchmark_extreme.py

Upload benchmark_extreme.py with huggingface_hub

6fe8c45 verified 4 months ago

12.1 kB

	#!/usr/bin/env python3
	import torch
	import matplotlib.pyplot as plt
	import matplotlib.animation as animation
	from datetime import datetime
	import subprocess
	import time
	import psutil
	import re
	from collections import deque
	import threading
	import numpy as np

	class ExtremeBenchmark:
	    """Incrementally stress a GPU with matrix multiplications and plot live metrics.

	    A background thread runs the workload and records throughput, temperature
	    and load level; the main thread renders them with matplotlib. The load is
	    raised step by step while the GPU stays cool, and lowered (or the run
	    aborted) when the temperature or host responsiveness degrade.

	    NOTE: temperature and host health are sampled once per workload round, so
	    a long round delays the reaction to overheating.
	    """

	    # Reference FP32 peak drawn on the plot and used for the efficiency figure.
	    THEORETICAL_TFLOPS = 13.44
	    # Upper/lower bounds of the load ramp.
	    MAX_LOAD = 10
	    MAX_STREAMS = 4
	    MAX_OPERATIONS = 20
	    MIN_MATRIX_SIZE = 8192
	    MAX_MATRIX_SIZE = 14336
	    # `sensors` prints readings such as "+45.0°C" with no space before the
	    # unit, so the whitespace between number and unit must be optional.
	    _TEMP_PATTERN = re.compile(r'([+-]?\d+(?:\.\d+)?)\s*°C')

	    def __init__(self):
	        """Initialise limits, rolling histories and the three-panel figure."""
	        self.max_temp = 85
	        # Rolling windows: only the last 200 samples are kept for plotting.
	        self.temperatures = deque(maxlen=200)
	        self.tflops_history = deque(maxlen=200)
	        self.load_level = deque(maxlen=200)
	        self.peak_tflops = 0
	        # Whole-run maxima, tracked separately because the deques are bounded.
	        self.peak_temp = 0.0
	        self.peak_load = 0
	        self.running = True

	        # Incremental configuration
	        self.current_load = 1  # 1 to 10
	        self.matrix_size = 8192
	        self.num_operations = 1
	        self.num_streams = 1

	        self.fig, (self.ax1, self.ax2, self.ax3) = plt.subplots(3, 1, figsize=(14, 10))
	        self.fig.suptitle('BENCHMARK EXTREMO - Radeon Pro VII', fontsize=16, weight='bold')

	        self.last_temp_check = time.monotonic()
	        self.temp_rising_fast = False
	        # Monotonic timestamps bracketing the stress loop (None until set).
	        self.started_at = None
	        self.finished_at = None

	    @classmethod
	    def _parse_edge_temp(cls, sensors_output):
	        """Return the first temperature on an "edge:" line of `sensors` output.

	        Returns None when no "edge:" line carries a parsable "<number>°C" value.
	        """
	        for line in sensors_output.splitlines():
	            if 'edge:' in line.lower():
	                match = cls._TEMP_PATTERN.search(line)
	                if match:
	                    return float(match.group(1))
	        return None

	    def get_gpu_temp(self) -> float:
	        """Return the "edge" temperature reported by `sensors`, in °C.

	        Returns 0.0 when the command is missing, fails, times out (0.5 s) or
	        prints no parsable "edge:" reading, so callers see a missing reading
	        as a cold GPU (best-effort behaviour kept from the original).
	        """
	        try:
	            result = subprocess.run(['sensors'], capture_output=True, text=True, timeout=0.5)
	        except (OSError, subprocess.SubprocessError, ValueError):
	            # Missing binary, timeout or undecodable output.
	            return 0.0
	        reading = self._parse_edge_temp(result.stdout)
	        return reading if reading is not None else 0.0

	    def check_system_health(self) -> bool:
	        """Return False when the host looks overloaded or cannot be probed.

	        The probe is a 50 ms `psutil.cpu_percent` sample; the host is deemed
	        unhealthy if that call takes longer than 0.4 s or CPU usage exceeds 95 %.
	        """
	        try:
	            start = time.perf_counter()
	            cpu = psutil.cpu_percent(interval=0.05)
	            response = time.perf_counter() - start
	        except Exception:
	            # Fail safe: if the probe itself breaks, report "unhealthy" so the
	            # stress loop stops instead of running blind.
	            return False
	        return response <= 0.4 and cpu <= 95

	    def calculate_tflops(self, matrix_size, elapsed_time, num_ops, num_streams) -> float:
	        """Return the throughput in TFLOPS for one workload round.

	        Each square matmul is counted as 2 * matrix_size**3 floating-point
	        operations, times `num_ops` multiplies on each of `num_streams` streams.
	        Returns 0.0 when `elapsed_time` is not positive instead of dividing by
	        zero.
	        """
	        if elapsed_time <= 0:
	            return 0.0
	        operations = 2 * (matrix_size ** 3) * num_ops * num_streams
	        return (operations / elapsed_time) / 1e12

	    def increase_load(self) -> None:
	        """Raise the load one level (up to MAX_LOAD) and grow the workload.

	        Streams grow from level 2, multiplies per stream from level 4 and the
	        matrix size from level 6, each clamped to its class-level maximum.
	        Once the top level is reached the call is a no-op.
	        """
	        if self.current_load >= self.MAX_LOAD:
	            return
	        self.current_load += 1

	        if self.current_load >= 2 and self.num_streams < self.MAX_STREAMS:
	            self.num_streams += 1

	        if self.current_load >= 4 and self.num_operations < self.MAX_OPERATIONS:
	            # Clamp so the +5 step cannot overshoot the cap (16 -> 20, not 21).
	            self.num_operations = min(self.num_operations + 5, self.MAX_OPERATIONS)

	        if self.current_load >= 6 and self.matrix_size < self.MAX_MATRIX_SIZE:
	            self.matrix_size = min(self.matrix_size + 1024, self.MAX_MATRIX_SIZE)

	    def decrease_load(self) -> None:
	        """Lower the load for safety.

	        The level drops by one (never below 1); the matrix shrinks by 512 down
	        to MIN_MATRIX_SIZE and the multiplies per stream drop by 5 while above
	        5. The workload is shrunk even when the level is already 1.
	        """
	        if self.current_load > 1:
	            self.current_load -= 1

	        if self.matrix_size > self.MIN_MATRIX_SIZE:
	            self.matrix_size = max(self.matrix_size - 512, self.MIN_MATRIX_SIZE)

	        if self.num_operations > 5:
	            self.num_operations = max(self.num_operations - 5, 1)

	    def stress_gpu(self) -> None:
	        """Worker-thread entry point: run the stress loop until stopped.

	        Clears `self.running` and returns immediately when no CUDA device is
	        available. Otherwise prints the device banner, runs the loop, and
	        always clears `self.running` and records `finished_at` on exit, even
	        if the loop raises.
	        """
	        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	        if device.type == 'cpu':
	            print("❌ ERRO: GPU não detectada!")
	            self.running = False
	            return

	        props = torch.cuda.get_device_properties(0)
	        print(f"🎯 GPU: {torch.cuda.get_device_name(0)}")
	        print(f"💾 VRAM: {props.total_memory / 1e9:.1f} GB")
	        print("🔥 Iniciando teste EXTREMO com aumento gradual...")
	        print(f"⚠️ Limite de temperatura: {self.max_temp}°C")
	        print("⚠️ Monitoramento de estabilidade: ATIVO\n")

	        streams = [torch.cuda.Stream() for _ in range(self.MAX_STREAMS)]
	        self.started_at = time.monotonic()
	        try:
	            self._stress_loop(device, streams)
	        finally:
	            self.finished_at = time.monotonic()
	            self.running = False

	    def _apply_thermal_policy(self, temp, last_temp) -> bool:
	        """React to a fresh temperature reading; return False to abort the run."""
	        # Detect fast heating (needs at least one recorded sample).
	        if self.temperatures:
	            self.temp_rising_fast = (temp - last_temp) > 2

	        # EMERGENCY: dangerous temperature.
	        if temp >= self.max_temp:
	            print(f"\n🚨 EMERGÊNCIA! Temperatura: {temp}°C - ABORTANDO!")
	            self.running = False
	            return False

	        # WARNING: close to the limit, back off two steps.
	        if temp >= self.max_temp - 3:
	            print(f"\n⚠️ ALERTA! Temperatura: {temp}°C - Reduzindo carga...")
	            self.decrease_load()
	            self.decrease_load()

	        # Temperature climbing quickly while already warm.
	        if self.temp_rising_fast and temp > 75:
	            self.decrease_load()
	        return True

	    def _timed_workload(self, device, streams) -> float:
	        """Run one round of chained matmuls on the active streams.

	        Returns the wall-clock seconds between the two device synchronisations;
	        the measured span includes generating the random input matrices.
	        """
	        size = self.matrix_size
	        torch.cuda.synchronize()
	        start = time.perf_counter()

	        # Spread the stress across the active streams.
	        for stream in streams[:self.num_streams]:
	            with torch.cuda.stream(stream):
	                a = torch.randn(size, size, device=device, dtype=torch.float32)
	                b = torch.randn(size, size, device=device, dtype=torch.float32)
	                for _ in range(self.num_operations):
	                    # Feed each product into the next multiply.
	                    a, b = b, torch.mm(a, b)

	        torch.cuda.synchronize()
	        return time.perf_counter() - start

	    def _stress_loop(self, device, streams) -> None:
	        """Repeat monitor -> workload -> record -> adjust until `running` clears."""
	        last_temp = 0
	        stable_cycles = 0

	        while self.running:
	            # Refresh the temperature at most every 50 ms.
	            now = time.monotonic()
	            if now - self.last_temp_check > 0.05:
	                temp = self.get_gpu_temp()
	                self.last_temp_check = now
	                if not self._apply_thermal_policy(temp, last_temp):
	                    break
	                last_temp = temp
	            else:
	                temp = last_temp

	            # Check host health.
	            if not self.check_system_health():
	                print("\n🚨 SISTEMA INSTÁVEL - ABORTANDO!")
	                self.running = False
	                break

	            out_of_memory = False
	            try:
	                elapsed = self._timed_workload(device, streams)
	            except RuntimeError as exc:
	                if "out of memory" not in str(exc):
	                    print(f"\n🚨 ERRO: {exc}")
	                    self.running = False
	                    break
	                out_of_memory = True
	            except Exception as exc:
	                print(f"\n🚨 ERRO CRÍTICO: {exc}")
	                self.running = False
	                break

	            if out_of_memory:
	                # Recover outside the except clause: while the handler is
	                # active the traceback keeps the failed round's tensors alive,
	                # so empty_cache() could not release their memory.
	                print("\n⚠️ VRAM cheia - Reduzindo carga...")
	                self.decrease_load()
	                torch.cuda.empty_cache()
	                continue

	            tflops = self.calculate_tflops(self.matrix_size, elapsed,
	                                           self.num_operations, self.num_streams)

	            self.temperatures.append(temp)
	            self.tflops_history.append(tflops)
	            self.load_level.append(self.current_load)

	            self.peak_tflops = max(self.peak_tflops, tflops)
	            self.peak_temp = max(self.peak_temp, temp)
	            self.peak_load = max(self.peak_load, self.current_load)

	            print(f"TFLOPS: {tflops:6.2f} | Temp: {temp:5.1f}°C | Load: {self.current_load}/10 | "
	                  f"Matrix: {self.matrix_size} | Ops: {self.num_operations} | Streams: {self.num_streams} | "
	                  f"Peak: {self.peak_tflops:.2f}", end='\r')

	            # Gradual ramp-up: step the load after more than 10 stable cycles
	            # below 75°C; at 80°C or above the stability counter restarts.
	            if temp < 75 and stable_cycles > 10:
	                self.increase_load()
	                stable_cycles = 0
	            elif temp < 80:
	                stable_cycles += 1
	            else:
	                stable_cycles = 0

	            time.sleep(0.02)  # 20 ms

	    def update_plot(self, frame):
	        """Redraw the three panels from the recorded histories.

	        Used as the FuncAnimation callback; `frame` is unused. Does nothing
	        until at least one throughput sample exists.
	        """
	        if not self.tflops_history:
	            return

	        for axis in (self.ax1, self.ax2, self.ax3):
	            axis.clear()

	        # Panel 1: TFLOPS
	        self.ax1.plot(list(self.tflops_history), 'b-', linewidth=2.5, label='TFLOPS Real')
	        self.ax1.axhline(y=self.peak_tflops, color='g', linestyle='--', linewidth=2,
	                         label=f'Peak: {self.peak_tflops:.2f}')
	        self.ax1.axhline(y=self.THEORETICAL_TFLOPS, color='orange', linestyle=':', linewidth=2,
	                         label=f'Teórico: {self.THEORETICAL_TFLOPS}')
	        self.ax1.set_ylabel('TFLOPS', fontsize=12, weight='bold')
	        self.ax1.set_title('Performance Computacional', fontsize=12, weight='bold')
	        self.ax1.legend(loc='upper left')
	        self.ax1.grid(True, alpha=0.3)
	        self.ax1.set_ylim(0, 15)

	        # Panel 2: temperature
	        temps = list(self.temperatures)
	        if temps:
	            alert_temp = self.max_temp - 5
	            self.ax2.plot(temps, 'r-', linewidth=2.5, label='Temperatura')
	            self.ax2.axhline(y=self.max_temp, color='red', linestyle='--', linewidth=2,
	                             label=f'LIMITE: {self.max_temp}°C')
	            self.ax2.axhline(y=alert_temp, color='orange', linestyle=':', linewidth=1.5,
	                             label=f'Alerta: {alert_temp}°C')
	            # Shade the gap to the limit wherever the reading is in the alert band.
	            self.ax2.fill_between(range(len(temps)), temps, self.max_temp,
	                                  where=[t >= alert_temp for t in temps],
	                                  alpha=0.3, color='orange')
	            self.ax2.set_ylabel('Temperatura (°C)', fontsize=12, weight='bold')
	            self.ax2.set_title('Monitoramento Térmico', fontsize=12, weight='bold')
	            self.ax2.legend(loc='upper left')
	            self.ax2.grid(True, alpha=0.3)
	            self.ax2.set_ylim(30, 95)

	        # Panel 3: load level
	        loads = list(self.load_level)
	        if loads:
	            self.ax3.plot(loads, 'purple', linewidth=2.5, label='Nível de Stress')
	            self.ax3.fill_between(range(len(loads)), loads, alpha=0.3, color='purple')
	            self.ax3.set_ylabel('Carga (1-10)', fontsize=12, weight='bold')
	            self.ax3.set_xlabel('Amostras', fontsize=12, weight='bold')
	            self.ax3.set_title('Intensidade do Teste', fontsize=12, weight='bold')
	            self.ax3.legend(loc='upper left')
	            self.ax3.grid(True, alpha=0.3)
	            self.ax3.set_ylim(0, 11)

	        # Once the run has stopped, overlay the final summary on the first panel.
	        if not self.running:
	            efficiency = (self.peak_tflops / self.THEORETICAL_TFLOPS) * 100
	            self.ax1.text(0.5, 0.5,
	                          f'🏆 PEAK: {self.peak_tflops:.2f} TFLOPS\n'
	                          f'📊 Eficiência: {efficiency:.1f}%',
	                          transform=self.ax1.transAxes, fontsize=20,
	                          ha='center', va='center', color='darkgreen', weight='bold',
	                          bbox=dict(boxstyle='round,pad=1', facecolor='lightgreen', alpha=0.9))

	    def run(self) -> None:
	        """Start the stress thread, show the live plot, then print the summary.

	        Blocks in `plt.show()` until the window is closed. Closing the window
	        asks the worker to stop and waits up to 2 s for it before reporting.
	        """
	        stress_thread = threading.Thread(target=self.stress_gpu)
	        stress_thread.daemon = True
	        stress_thread.start()

	        # The animation object must stay referenced while the window is open.
	        ani = animation.FuncAnimation(self.fig, self.update_plot,
	                                      interval=300, cache_frame_data=False)
	        plt.tight_layout()
	        plt.show()

	        # Window closed: tell the worker to stop instead of letting the join
	        # time out on a loop that is still running.
	        self.running = False
	        stress_thread.join(timeout=2)

	        if self.started_at is None:
	            duration = 0.0
	        else:
	            # If the worker is still finishing a round, measure up to now.
	            end = self.finished_at if self.finished_at is not None else time.monotonic()
	            duration = end - self.started_at
	        efficiency = (self.peak_tflops / self.THEORETICAL_TFLOPS) * 100

	        print(f"\n\n{'='*70}")
	        print(f"{'RESULTADO FINAL DO BENCHMARK EXTREMO':^70}")
	        print(f"{'='*70}")
	        print(f"🏆 PEAK TFLOPS ALCANÇADO: {self.peak_tflops:.2f}")
	        print(f"📊 TFLOPS Teórico (FP32): {self.THEORETICAL_TFLOPS}")
	        print(f"📈 Eficiência Real: {efficiency:.1f}%")
	        print(f"🌡️ Temperatura Máxima: {self.peak_temp:.1f}°C")
	        print(f"🔥 Nível de Carga Máximo: {self.peak_load}/10")
	        print(f"⏱️ Duração do Teste: {duration:.1f}s")
	        print(f"{'='*70}\n")

	if __name__ == "__main__":
	    # Script entry point: build the benchmark and run it until the plot
	    # window is closed.
	    ExtremeBenchmark().run()