| |
| import torch |
| import matplotlib.pyplot as plt |
| import matplotlib.animation as animation |
| from datetime import datetime |
| import subprocess |
| import time |
| import psutil |
| import re |
| from collections import deque |
| import threading |
|
|
class FP16BenchmarkFull:
    """Adaptive FP16 matrix-multiply stress benchmark with a live dashboard.

    ``run`` starts ``stress_gpu`` in a background thread and shows a 2x2
    matplotlib figure (TFLOPS, temperature, power, voltage/current) that is
    redrawn from the most recent samples. The worker raises the load while
    the GPU stays cool and lowers it as the temperature approaches
    ``max_temp``; it stops when ``running`` becomes False.
    """

    # Reference figures used for plot guides and the efficiency percentage.
    THEORETICAL_TFLOPS = 26.88
    TDP_WATTS = 300
    # Number of samples kept per telemetry series.
    HISTORY_LEN = 200
    # Upper bound for ``num_streams``; also the number of streams allocated.
    MAX_STREAMS = 4

    # Value patterns. The first entry of each tuple expects the unit after
    # the number ("21.0 W", "737 mV"); the second accepts the unit inside the
    # label ("... (W): 21.0", "... (mV): 737").
    # NOTE(review): label-style lines are what some rocm-smi releases appear
    # to print - confirm against the installed version.
    _TEMP_PATTERN = r'([+-]?\d+\.?\d*)\s*°C'
    _POWER_PATTERNS = (r'(\d+\.?\d*)\s*W', r'\(W\)\s*:\s*(\d+\.?\d*)')
    _MILLIVOLT_PATTERNS = (r'(\d+\.?\d*)\s*mV', r'\(mV\)\s*:\s*(\d+\.?\d*)')

    def __init__(self):
        """Set limits, empty sample buffers, the initial load and the figure."""
        self.max_temp = 85  # the worker aborts at or above this reading (°C)

        # Rolling telemetry, appended by the worker and read by the plot.
        self.temperatures = deque(maxlen=self.HISTORY_LEN)
        self.tflops_history = deque(maxlen=self.HISTORY_LEN)
        self.load_level = deque(maxlen=self.HISTORY_LEN)
        self.power_watts = deque(maxlen=self.HISTORY_LEN)
        self.voltage_volts = deque(maxlen=self.HISTORY_LEN)
        self.current_amps = deque(maxlen=self.HISTORY_LEN)
        self.peak_tflops = 0
        self.peak_power = 0
        self.running = True

        # Current workload shape; adjusted by increase_load / decrease_load.
        self.current_load = 1
        self.matrix_size = 10240
        self.num_operations = 1
        self.num_streams = 1

        self.fig, ((self.ax1, self.ax2), (self.ax3, self.ax4)) = plt.subplots(2, 2, figsize=(16, 10))
        self.fig.suptitle('BENCHMARK FP16 COMPLETO - Radeon Pro VII', fontsize=16, weight='bold')
        # Secondary y-axis of ax4; rebuilt on every redraw by update_plot.
        self._ax4_twin = None

        self.last_temp_check = time.time()
        self.temp_rising_fast = False

    @staticmethod
    def _tool_output(argv, timeout=0.5):
        """Run *argv* and return its stdout split into lines, or [] on failure."""
        try:
            result = subprocess.run(argv, capture_output=True, text=True, timeout=timeout)
        except (OSError, subprocess.SubprocessError, ValueError):
            # Tool missing, timed out, or produced undecodable output:
            # telemetry is best-effort, so report "no data".
            return []
        return result.stdout.split('\n')

    @staticmethod
    def _first_number(patterns, line):
        """Return group 1 of the first pattern that matches *line*, as float."""
        for pattern in patterns:
            match = re.search(pattern, line)
            if match:
                return float(match.group(1))
        return None

    @classmethod
    def _parse_edge_temp(cls, lines):
        """Return the temperature from the last ``edge:`` line, or 0."""
        temp = 0
        for line in lines:
            if 'edge:' in line.lower():
                match = re.search(cls._TEMP_PATTERN, line)
                if match:
                    temp = float(match.group(1))
        return temp

    @classmethod
    def _parse_power_voltage(cls, lines):
        """Return ``(watts, volts)`` parsed from *lines*; 0 for a missing value.

        When several lines match, the last one wins. Voltage is read in
        millivolts and converted to volts.
        """
        power = 0
        voltage = 0
        for line in lines:
            if 'Power' in line:
                watts = cls._first_number(cls._POWER_PATTERNS, line)
                if watts is not None:
                    power = watts
            if 'volt' in line.lower():
                millivolts = cls._first_number(cls._MILLIVOLT_PATTERNS, line)
                if millivolts is not None:
                    voltage = millivolts / 1000
        return power, voltage

    def get_gpu_metrics(self):
        """Read temperature, power and voltage via ``sensors`` and ``rocm-smi``.

        Returns:
            ``(temp, power, voltage)`` in °C, W and V. Any value that could
            not be obtained is 0.

        NOTE(review): a failed temperature read is indistinguishable from a
        cool GPU for the caller's thermal checks - consider surfacing it.
        """
        temp = self._parse_edge_temp(self._tool_output(['sensors']))
        power, voltage = self._parse_power_voltage(
            self._tool_output(['rocm-smi', '--showpower', '--showvolt']))
        return temp, power, voltage

    def check_system_health(self):
        """Return False when the host looks overloaded or cannot be probed.

        The probe is a 50 ms ``psutil.cpu_percent`` sample; it fails when the
        call takes longer than 0.4 s or CPU usage exceeds 95 %.
        """
        try:
            start = time.perf_counter()
            cpu = psutil.cpu_percent(interval=0.05)
            response = time.perf_counter() - start
        except Exception:
            return False
        return not (response > 0.4 or cpu > 95)

    def calculate_tflops(self, matrix_size, elapsed_time, num_ops, num_streams):
        """Return the throughput in TFLOPS for one timed pass.

        Counts ``2 * matrix_size**3`` floating-point operations per multiply,
        times ``num_ops`` multiplies on each of ``num_streams`` streams.
        Returns 0.0 when ``elapsed_time`` is not positive instead of dividing
        by zero.
        """
        if elapsed_time <= 0:
            return 0.0
        operations = 2 * (matrix_size ** 3) * num_ops * num_streams
        return (operations / elapsed_time) / 1e12

    def increase_load(self):
        """Move one load level up (max 10) and grow the workload accordingly."""
        if self.current_load >= 10:
            return
        self.current_load += 1

        # Streams first, then multiplies per pass, then matrix size.
        if self.current_load >= 2 and self.num_streams < self.MAX_STREAMS:
            self.num_streams += 1

        if self.current_load >= 4 and self.num_operations < 30:
            self.num_operations += 5

        if self.current_load >= 6 and self.matrix_size < 16384:
            self.matrix_size = min(self.matrix_size + 1024, 16384)

    def decrease_load(self):
        """Move one load level down (min 1) and shrink the workload.

        NOTE(review): ``num_streams`` is never reduced here - confirm that
        this is intended.
        """
        if self.current_load <= 1:
            return
        self.current_load -= 1

        if self.matrix_size > 8192:
            self.matrix_size = max(self.matrix_size - 512, 8192)

        if self.num_operations > 5:
            self.num_operations = max(self.num_operations - 5, 1)

    def _timed_matmul_pass(self, device, streams):
        """Run one workload pass and return its wall-clock duration in seconds.

        On each of the first ``num_streams`` streams, two random FP16 square
        matrices are created and multiplied ``num_operations`` times, feeding
        each product into the next multiply. The timed span includes the
        creation of the random inputs.
        """
        size = self.matrix_size
        torch.cuda.synchronize()
        start = time.perf_counter()

        for stream in streams[:self.num_streams]:
            with torch.cuda.stream(stream):
                a = torch.randn(size, size, device=device, dtype=torch.float16)
                b = torch.randn(size, size, device=device, dtype=torch.float16)
                for _ in range(self.num_operations):
                    c = torch.mm(a, b)
                    a, b = b, c

        # Wait for every stream before stopping the clock.
        torch.cuda.synchronize()
        return time.perf_counter() - start

    def stress_gpu(self):
        """Worker loop: load the GPU, record telemetry and adapt the load.

        Sets ``running`` to False and returns when no GPU is available, the
        temperature reaches ``max_temp``, the host fails
        ``check_system_health`` or an unexpected error occurs. An
        out-of-memory error only lowers the load and retries.
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if device.type == 'cpu':
            print("❌ ERRO: GPU não detectada!")
            self.running = False
            return

        props = torch.cuda.get_device_properties(0)
        print(f"🎯 GPU: {torch.cuda.get_device_name(0)}")
        print(f"💾 VRAM: {props.total_memory / 1e9:.1f} GB")
        print(f"🔥 Modo: FP16 (Half Precision)")
        print(f"📊 TFLOPS Teórico FP16: ~{self.THEORETICAL_TFLOPS}")
        print(f"⚡ TDP Especificado: {self.TDP_WATTS}W")
        print(f"⚠️ Limite: {self.max_temp}°C\n")

        streams = [torch.cuda.Stream() for _ in range(self.MAX_STREAMS)]
        last_temp = 0
        stable_cycles = 0

        while self.running:
            now = time.time()
            if now - self.last_temp_check > 0.05:
                temp, power, voltage = self.get_gpu_metrics()
                self.last_temp_check = now

                # A jump of more than 2 °C since the previous reading counts
                # as "rising fast"; skipped until a first sample exists.
                if self.temperatures:
                    self.temp_rising_fast = (temp - last_temp) > 2

                if temp >= self.max_temp:
                    print(f"\n🚨 TEMPERATURA: {temp}°C - ABORTANDO!")
                    self.running = False
                    break

                # Within 3 °C of the limit: back off two levels at once.
                if temp >= self.max_temp - 3:
                    print(f"\n⚠️ ALERTA! Temp: {temp}°C - Reduzindo...")
                    self.decrease_load()
                    self.decrease_load()

                if self.temp_rising_fast and temp > 75:
                    self.decrease_load()

                last_temp = temp
            else:
                # Too soon for a new reading: reuse the latest known values.
                temp = last_temp
                power = self.power_watts[-1] if self.power_watts else 0
                voltage = self.voltage_volts[-1] if self.voltage_volts else 0

            current = power / voltage if voltage > 0 else 0

            if not self.check_system_health():
                print(f"\n🚨 SISTEMA INSTÁVEL!")
                self.running = False
                break

            out_of_memory = False
            try:
                elapsed = self._timed_matmul_pass(device, streams)
            except RuntimeError as e:
                if "out of memory" not in str(e):
                    print(f"\n🚨 ERRO: {e}")
                    self.running = False
                    break
                out_of_memory = True
            except Exception as e:
                print(f"\n🚨 ERRO: {e}")
                self.running = False
                break

            if out_of_memory:
                # Handled outside the except clause: while the exception is
                # alive its traceback keeps the failed pass's tensors
                # referenced, so the cache could not release their memory.
                print(f"\n⚠️ VRAM cheia - Reduzindo...")
                self.decrease_load()
                torch.cuda.empty_cache()
                continue

            tflops = self.calculate_tflops(self.matrix_size, elapsed,
                                           self.num_operations, self.num_streams)

            self.temperatures.append(temp)
            self.tflops_history.append(tflops)
            self.load_level.append(self.current_load)
            self.power_watts.append(power)
            self.voltage_volts.append(voltage)
            self.current_amps.append(current)

            self.peak_tflops = max(self.peak_tflops, tflops)
            self.peak_power = max(self.peak_power, power)

            print(f"TFLOPS: {tflops:6.2f} | Temp: {temp:5.1f}°C | {power:6.1f}W | {voltage:5.3f}V | {current:6.2f}A | Peak: {self.peak_tflops:.2f}", end='\r')

            # Ramp up only after more than 10 consecutive cycles below 80 °C
            # and while currently below 75 °C.
            if temp < 75 and stable_cycles > 10:
                self.increase_load()
                stable_cycles = 0
            elif temp < 80:
                stable_cycles += 1
            else:
                stable_cycles = 0

            time.sleep(0.02)

    def update_plot(self, frame):
        """Redraw the four panels from the current sample buffers.

        Used as the ``FuncAnimation`` callback; ``frame`` is unused. Does
        nothing until at least one TFLOPS sample exists.
        """
        if not self.tflops_history:
            return

        for ax in (self.ax1, self.ax2, self.ax3, self.ax4):
            ax.clear()

        # The secondary axis of the previous frame is not affected by
        # ax4.clear(); remove it so axes and old curves do not accumulate.
        previous_twin = getattr(self, '_ax4_twin', None)
        if previous_twin is not None:
            previous_twin.remove()
            self._ax4_twin = None

        # Panel 1: throughput.
        self.ax1.plot(list(self.tflops_history), 'b-', linewidth=2.5, label='TFLOPS FP16')
        self.ax1.axhline(y=self.peak_tflops, color='g', linestyle='--', linewidth=2,
                         label=f'Peak: {self.peak_tflops:.2f}')
        self.ax1.axhline(y=self.THEORETICAL_TFLOPS, color='orange', linestyle=':', linewidth=2,
                         label=f'Teórico: {self.THEORETICAL_TFLOPS}')
        self.ax1.set_ylabel('TFLOPS', fontsize=11, weight='bold')
        self.ax1.set_title('Performance FP16', fontsize=11, weight='bold')
        self.ax1.legend(loc='upper left', fontsize=9)
        self.ax1.grid(True, alpha=0.3)
        self.ax1.set_ylim(0, 30)

        # Panel 2: temperature, shaded where within 5 °C of the limit.
        if self.temperatures:
            temps = list(self.temperatures)
            self.ax2.plot(temps, 'r-', linewidth=2.5, label='Temperatura')
            self.ax2.axhline(y=self.max_temp, color='red', linestyle='--', linewidth=2,
                             label=f'Limite: {self.max_temp}°C')
            self.ax2.fill_between(range(len(temps)), temps, self.max_temp,
                                  where=[t >= self.max_temp - 5 for t in temps],
                                  alpha=0.3, color='orange')
            self.ax2.set_ylabel('Temperatura (°C)', fontsize=11, weight='bold')
            self.ax2.set_title('Monitoramento Térmico', fontsize=11, weight='bold')
            self.ax2.legend(loc='upper left', fontsize=9)
            self.ax2.grid(True, alpha=0.3)
            self.ax2.set_ylim(30, 95)

        # Panel 3: power draw.
        if self.power_watts:
            powers = list(self.power_watts)
            self.ax3.plot(powers, 'green', linewidth=2.5, label='Potência')
            self.ax3.axhline(y=self.TDP_WATTS, color='red', linestyle='--', linewidth=2,
                             label=f'TDP: {self.TDP_WATTS}W')
            self.ax3.axhline(y=self.peak_power, color='orange', linestyle=':', linewidth=2,
                             label=f'Peak: {self.peak_power:.1f}W')
            self.ax3.fill_between(range(len(powers)), powers, alpha=0.3, color='green')
            self.ax3.set_ylabel('Potência (W)', fontsize=11, weight='bold')
            self.ax3.set_xlabel('Amostras', fontsize=11, weight='bold')
            self.ax3.set_title('Consumo Elétrico', fontsize=11, weight='bold')
            self.ax3.legend(loc='upper left', fontsize=9)
            self.ax3.grid(True, alpha=0.3)
            self.ax3.set_ylim(0, 350)

        # Panel 4: voltage on the left axis, current on a twin right axis.
        if self.voltage_volts and self.current_amps:
            twin = self.ax4.twinx()
            self._ax4_twin = twin

            volts = list(self.voltage_volts)
            amps = list(self.current_amps)

            volt_lines = self.ax4.plot(volts, 'blue', linewidth=2.5, label='Tensão (V)')
            self.ax4.set_ylabel('Tensão (V)', fontsize=11, weight='bold', color='blue')
            self.ax4.tick_params(axis='y', labelcolor='blue')

            amp_lines = twin.plot(amps, 'red', linewidth=2.5, label='Corrente (A)')
            twin.set_ylabel('Corrente (A)', fontsize=11, weight='bold', color='red')
            twin.tick_params(axis='y', labelcolor='red')

            self.ax4.set_xlabel('Amostras', fontsize=11, weight='bold')
            self.ax4.set_title('Tensão e Corrente', fontsize=11, weight='bold')
            self.ax4.grid(True, alpha=0.3)

            # One legend covering the curves of both axes.
            lines = volt_lines + amp_lines
            labels = [line.get_label() for line in lines]
            self.ax4.legend(lines, labels, loc='upper left', fontsize=9)

        # Once the worker has stopped, overlay the final result on panel 1.
        if not self.running:
            efficiency = (self.peak_tflops / self.THEORETICAL_TFLOPS) * 100
            self.ax1.text(0.5, 0.5,
                          f'🏆 {self.peak_tflops:.2f} TFLOPS\n{efficiency:.1f}%',
                          transform=self.ax1.transAxes, fontsize=16,
                          ha='center', va='center', color='darkgreen', weight='bold',
                          bbox=dict(boxstyle='round,pad=0.8', facecolor='lightgreen', alpha=0.9))

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of *values*, or 0 when empty."""
        return sum(values) / len(values) if values else 0

    def run(self):
        """Start the worker, show the dashboard, then print the summary.

        Blocks in ``plt.show()``. When it returns, the worker is asked to
        stop and given up to 2 s to finish before the summary is printed.
        """
        stress_thread = threading.Thread(target=self.stress_gpu)
        stress_thread.daemon = True
        stress_thread.start()

        # Keep a reference: the animation stops if the object is collected.
        ani = animation.FuncAnimation(self.fig, self.update_plot,
                                      interval=300, cache_frame_data=False)
        plt.tight_layout()
        plt.show()

        # Without this the worker keeps looping and the join below can only
        # time out while the worker is still printing.
        self.running = False
        stress_thread.join(timeout=2)

        avg_power = self._mean(self.power_watts)
        avg_voltage = self._mean(self.voltage_volts)
        avg_current = self._mean(self.current_amps)
        hottest = max(self.temperatures, default=0)
        highest_load = max(self.load_level, default=0)

        print(f"\n\n{'='*70}")
        print(f"{'BENCHMARK FP16 COMPLETO - RESULTADO':^70}")
        print(f"{'='*70}")
        print(f"🏆 PEAK TFLOPS (FP16): {self.peak_tflops:.2f}")
        print(f"📊 Teórico FP16: {self.THEORETICAL_TFLOPS} TFLOPS")
        print(f"📈 Eficiência: {(self.peak_tflops / self.THEORETICAL_TFLOPS) * 100:.1f}%")
        print(f"🌡️ Temp Máxima: {hottest:.1f}°C")
        print(f"⚡ Potência Peak: {self.peak_power:.1f}W")
        print(f"⚡ Potência Média: {avg_power:.1f}W")
        print(f"🔌 Tensão Média: {avg_voltage:.3f}V")
        print(f"🔌 Corrente Média: {avg_current:.2f}A")
        print(f"🔥 Carga Máxima: {highest_load}/10")
        print(f"{'='*70}\n")
|
|
if __name__ == "__main__":
    # Script entry point: build the benchmark and hand control to run().
    FP16BenchmarkFull().run()
|
|