meccatronis
/

gpu_benchmark

Model card · Files and versions

xet

Community

meccatronis committed on Feb 11

Commit

3b4d89e

verified ·

1 Parent(s): 6d6f4dd

Upload benchmark_fp16_full.py with huggingface_hub

Browse files

Files changed (1) hide show

benchmark_fp16_full.py +343 -0

benchmark_fp16_full.py ADDED Viewed

	@@ -0,0 +1,343 @@

#!/usr/bin/env python3
import re
import subprocess
import threading
import time
from collections import deque
from datetime import datetime
from typing import Tuple

import matplotlib.animation as animation
import matplotlib.pyplot as plt
import psutil
import torch


class FP16BenchmarkFull:
    """Adaptive FP16 matmul stress benchmark with live matplotlib telemetry.

    A worker thread (``stress_gpu``) runs half-precision matrix
    multiplications on the first CUDA/ROCm device visible to PyTorch,
    ramping the load up while the GPU stays cool and backing off when it
    gets hot. The calling thread (``run``) animates four plots: TFLOPS,
    temperature, power, and voltage/current, then prints a summary.
    """

    # Reference figures drawn on the plots and printed in the summary.
    THEORETICAL_TFLOPS = 26.88
    TDP_WATTS = 300
    # Upper bound for concurrent CUDA streams; also the size of the stream pool.
    MAX_STREAMS = 4
    # Number of samples kept per telemetry series.
    HISTORY_LEN = 200
    # Minimum seconds between two sensor polls.
    METRICS_INTERVAL = 0.05
    # Seconds an external monitoring tool may take before it is abandoned.
    TOOL_TIMEOUT = 0.5

    _TEMP_RE = re.compile(r'([+-]?\d+\.?\d*)\s*°C')
    # First pattern: value followed by its unit ("34.0 W").
    # Second pattern: unit in parentheses before the value ("Power (W): 34.0").
    # NOTE(review): the second form is believed to be what newer rocm-smi
    # releases print -- confirm against the installed version.
    _POWER_RES = (
        re.compile(r'(\d+\.?\d*)\s*W'),
        re.compile(r'\(W\):\s*(\d+\.?\d*)'),
    )
    _VOLTAGE_RES = (
        re.compile(r'(\d+\.?\d*)\s*mV'),
        re.compile(r'\(mV\):\s*(\d+\.?\d*)'),
    )

    def __init__(self) -> None:
        """Initialise telemetry buffers, load parameters and the 2x2 figure."""
        self.max_temp = 85
        self.temperatures = deque(maxlen=self.HISTORY_LEN)
        self.tflops_history = deque(maxlen=self.HISTORY_LEN)
        self.load_level = deque(maxlen=self.HISTORY_LEN)
        self.power_watts = deque(maxlen=self.HISTORY_LEN)
        self.voltage_volts = deque(maxlen=self.HISTORY_LEN)
        self.current_amps = deque(maxlen=self.HISTORY_LEN)
        self.peak_tflops = 0
        self.peak_power = 0
        self.running = True
        self.current_load = 1
        self.matrix_size = 10240
        self.num_operations = 1
        self.num_streams = 1
        self.fig, ((self.ax1, self.ax2), (self.ax3, self.ax4)) = plt.subplots(
            2, 2, figsize=(16, 10))
        self.fig.suptitle('BENCHMARK FP16 COMPLETO - Radeon Pro VII',
                          fontsize=16, weight='bold')
        # Secondary y-axis of the voltage/current plot; rebuilt on each redraw.
        self._ax4_twin = None
        # Monotonic timestamp of the last sensor poll.
        self.last_temp_check = time.monotonic()
        self.temp_rising_fast = False

    # ------------------------------------------------------------------
    # Sensor access
    # ------------------------------------------------------------------
    @classmethod
    def _run_tool(cls, command) -> str:
        """Return stdout of *command*, or '' if it cannot be run in time."""
        try:
            result = subprocess.run(command, capture_output=True, text=True,
                                    timeout=cls.TOOL_TIMEOUT)
        except (OSError, subprocess.SubprocessError, ValueError):
            # Missing binary, timeout or undecodable output: metrics are
            # best-effort, so report "no data" instead of failing.
            return ''
        return result.stdout

    @staticmethod
    def _first_number(patterns, line):
        """Return the first number captured by any of *patterns*, else None."""
        for pattern in patterns:
            match = pattern.search(line)
            if match:
                return float(match.group(1))
        return None

    @classmethod
    def _parse_edge_temp(cls, output: str) -> float:
        """Extract the 'edge' temperature in °C from ``sensors`` output.

        If several lines mention ``edge:``, the last one that parses wins.
        Returns 0.0 when no reading is found.
        """
        temp = 0.0
        for line in output.split('\n'):
            if 'edge:' in line.lower():
                match = cls._TEMP_RE.search(line)
                if match:
                    temp = float(match.group(1))
        return temp

    @classmethod
    def _parse_power_voltage(cls, output: str) -> Tuple[float, float]:
        """Extract (power in W, voltage in V) from ``rocm-smi`` output.

        Either value is 0.0 when it cannot be found; the last matching line
        of each kind wins.
        """
        power = 0.0
        voltage = 0.0
        for line in output.split('\n'):
            if 'Power' in line:
                watts = cls._first_number(cls._POWER_RES, line)
                if watts is not None:
                    power = watts
            if 'volt' in line.lower():
                millivolts = cls._first_number(cls._VOLTAGE_RES, line)
                if millivolts is not None:
                    voltage = millivolts / 1000  # mV -> V
        return power, voltage

    def get_gpu_metrics(self) -> Tuple[float, float, float]:
        """Read temperature, power and voltage via ``sensors`` and ``rocm-smi``.

        Returns:
            ``(temp_celsius, power_watts, voltage_volts)``. A value is 0 when
            the corresponding tool is unavailable, times out, or prints
            nothing recognisable.
        """
        # NOTE: a reading of 0 also means the thermal limits in stress_gpu
        # never trigger, so a missing `sensors` binary disables protection.
        temp = self._parse_edge_temp(self._run_tool(['sensors']))
        power, voltage = self._parse_power_voltage(
            self._run_tool(['rocm-smi', '--showpower', '--showvolt']))
        return temp, power, voltage

    def check_system_health(self) -> bool:
        """Return False when the host looks overloaded or cannot be probed.

        The probe samples CPU usage over 50 ms; the system is considered
        unhealthy if that call takes longer than 0.4 s or usage exceeds 95 %.
        """
        try:
            start = time.perf_counter()
            cpu = psutil.cpu_percent(interval=0.05)
            response = time.perf_counter() - start
        except (psutil.Error, OSError):
            return False
        return not (response > 0.4 or cpu > 95)

    # ------------------------------------------------------------------
    # Load control
    # ------------------------------------------------------------------
    def calculate_tflops(self, matrix_size, elapsed_time, num_ops, num_streams):
        """Return the achieved TFLOPS for a batch of square matmuls.

        Each ``matrix_size`` x ``matrix_size`` product is counted as
        ``2 * matrix_size**3`` operations. A non-positive ``elapsed_time``
        yields 0.0 instead of raising ``ZeroDivisionError``.
        """
        if elapsed_time <= 0:
            return 0.0
        operations = 2 * (matrix_size ** 3) * num_ops * num_streams
        return (operations / elapsed_time) / 1e12

    def increase_load(self) -> None:
        """Raise the load level by one step and scale the workload with it.

        Streams grow from level 2, operations per stream from level 4 and
        the matrix size from level 6 (capped at 16384).
        """
        if self.current_load < 10:
            self.current_load += 1
        if self.current_load >= 2 and self.num_streams < self.MAX_STREAMS:
            self.num_streams += 1
        if self.current_load >= 4 and self.num_operations < 30:
            self.num_operations += 5
        if self.current_load >= 6 and self.matrix_size < 16384:
            self.matrix_size = min(self.matrix_size + 1024, 16384)

    def decrease_load(self) -> None:
        """Lower the load level by one step and shrink the workload.

        The matrix size never drops below 8192 and the operation count never
        below 1; the number of streams is left unchanged.
        """
        if self.current_load > 1:
            self.current_load -= 1
        if self.matrix_size > 8192:
            self.matrix_size = max(self.matrix_size - 512, 8192)
        if self.num_operations > 5:
            self.num_operations = max(self.num_operations - 5, 1)

    # ------------------------------------------------------------------
    # Worker thread
    # ------------------------------------------------------------------
    def _apply_thermal_policy(self, temp, last_temp) -> bool:
        """React to a fresh temperature reading.

        Returns False (after clearing ``self.running``) when the hard limit
        is reached, True otherwise.
        """
        if len(self.temperatures) > 0:
            self.temp_rising_fast = (temp - last_temp) > 2
        if temp >= self.max_temp:
            print(f"\n🚨 TEMPERATURA: {temp}°C - ABORTANDO!")
            self.running = False
            return False
        if temp >= self.max_temp - 3:
            print(f"\n⚠️  ALERTA! Temp: {temp}°C - Reduzindo...")
            self.decrease_load()
            self.decrease_load()
        if self.temp_rising_fast and temp > 75:
            self.decrease_load()
        return True

    def _run_matmul_batch(self, device, streams) -> float:
        """Run one batch of FP16 matmuls and return its duration in seconds."""
        torch.cuda.synchronize()
        # NOTE: the timed region includes operand generation, as before.
        start = time.perf_counter()
        for stream in streams[:self.num_streams]:
            with torch.cuda.stream(stream):
                a = torch.randn(self.matrix_size, self.matrix_size,
                                device=device, dtype=torch.float16)
                b = torch.randn(self.matrix_size, self.matrix_size,
                                device=device, dtype=torch.float16)
                for _ in range(self.num_operations):
                    # Chain the products so every matmul depends on the last.
                    a, b = b, torch.mm(a, b)
        torch.cuda.synchronize()
        return time.perf_counter() - start

    def _record_sample(self, tflops, temp, power, voltage, current) -> None:
        """Append one telemetry sample and update the running peaks."""
        self.temperatures.append(temp)
        self.tflops_history.append(tflops)
        self.load_level.append(self.current_load)
        self.power_watts.append(power)
        self.voltage_volts.append(voltage)
        self.current_amps.append(current)
        if tflops > self.peak_tflops:
            self.peak_tflops = tflops
        if power > self.peak_power:
            self.peak_power = power

    def stress_gpu(self) -> None:
        """Worker loop: stress the GPU until ``self.running`` is cleared.

        Each iteration polls the sensors (rate-limited), applies the thermal
        policy, checks host health, runs one timed matmul batch, records the
        sample and adjusts the load. The loop ends on overheating, an
        unhealthy host, a non-OOM error, or when another thread clears
        ``self.running``.
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if device.type == 'cpu':
            print("❌ ERRO: GPU não detectada!")
            self.running = False
            return
        props = torch.cuda.get_device_properties(0)
        print(f"🎯 GPU: {torch.cuda.get_device_name(0)}")
        print(f"💾 VRAM: {props.total_memory / 1e9:.1f} GB")
        print(f"🔥 Modo: FP16 (Half Precision)")
        print(f"📊 TFLOPS Teórico FP16: ~{self.THEORETICAL_TFLOPS}")
        print(f"⚡ TDP Especificado: {self.TDP_WATTS}W")
        print(f"⚠️  Limite: {self.max_temp}°C\n")
        streams = [torch.cuda.Stream() for _ in range(self.MAX_STREAMS)]
        last_temp = 0
        stable_cycles = 0
        while self.running:
            now = time.monotonic()
            if now - self.last_temp_check > self.METRICS_INTERVAL:
                temp, power, voltage = self.get_gpu_metrics()
                self.last_temp_check = now
                if not self._apply_thermal_policy(temp, last_temp):
                    break
                last_temp = temp
            else:
                # Too soon to poll again: reuse the most recent readings.
                temp = last_temp
                power = self.power_watts[-1] if self.power_watts else 0
                voltage = self.voltage_volts[-1] if self.voltage_volts else 0
            # Derived current: I = P / V.
            current = power / voltage if voltage > 0 else 0
            if not self.check_system_health():
                print(f"\n🚨 SISTEMA INSTÁVEL!")
                self.running = False
                break
            try:
                elapsed = self._run_matmul_batch(device, streams)
                tflops = self.calculate_tflops(self.matrix_size, elapsed,
                                               self.num_operations, self.num_streams)
                self._record_sample(tflops, temp, power, voltage, current)
                print(f"TFLOPS: {tflops:6.2f} | Temp: {temp:5.1f}°C | {power:6.1f}W | {voltage:5.3f}V | {current:6.2f}A | Peak: {self.peak_tflops:.2f}", end='\r')
                if temp < 75 and stable_cycles > 10:
                    self.increase_load()
                    stable_cycles = 0
                elif temp < 80:
                    stable_cycles += 1
                else:
                    stable_cycles = 0
                time.sleep(0.02)
            except RuntimeError as e:
                if "out of memory" in str(e):
                    print(f"\n⚠️  VRAM cheia - Reduzindo...")
                    self.decrease_load()
                    torch.cuda.empty_cache()
                else:
                    print(f"\n🚨 ERRO: {e}")
                    self.running = False
                    break
            except Exception as e:
                print(f"\n🚨 ERRO: {e}")
                self.running = False
                break

    # ------------------------------------------------------------------
    # Plotting
    # ------------------------------------------------------------------
    def _draw_tflops(self) -> None:
        """Plot 1: TFLOPS history with peak and theoretical reference lines."""
        if not self.tflops_history:
            return
        self.ax1.plot(list(self.tflops_history), 'b-', linewidth=2.5, label='TFLOPS FP16')
        self.ax1.axhline(y=self.peak_tflops, color='g', linestyle='--', linewidth=2,
                         label=f'Peak: {self.peak_tflops:.2f}')
        self.ax1.axhline(y=self.THEORETICAL_TFLOPS, color='orange', linestyle=':',
                         linewidth=2, label=f'Teórico: {self.THEORETICAL_TFLOPS}')
        self.ax1.set_ylabel('TFLOPS', fontsize=11, weight='bold')
        self.ax1.set_title('Performance FP16', fontsize=11, weight='bold')
        self.ax1.legend(loc='upper left', fontsize=9)
        self.ax1.grid(True, alpha=0.3)
        self.ax1.set_ylim(0, 30)

    def _draw_temperature(self) -> None:
        """Plot 2: temperature history, shading samples within 5 °C of the limit."""
        if not self.temperatures:
            return
        temps = list(self.temperatures)
        self.ax2.plot(temps, 'r-', linewidth=2.5, label='Temperatura')
        self.ax2.axhline(y=self.max_temp, color='red', linestyle='--', linewidth=2,
                         label=f'Limite: {self.max_temp}°C')
        self.ax2.fill_between(range(len(temps)), temps, self.max_temp,
                              where=[t >= self.max_temp - 5 for t in temps],
                              alpha=0.3, color='orange')
        self.ax2.set_ylabel('Temperatura (°C)', fontsize=11, weight='bold')
        self.ax2.set_title('Monitoramento Térmico', fontsize=11, weight='bold')
        self.ax2.legend(loc='upper left', fontsize=9)
        self.ax2.grid(True, alpha=0.3)
        self.ax2.set_ylim(30, 95)

    def _draw_power(self) -> None:
        """Plot 3: power history with TDP and peak reference lines."""
        if not self.power_watts:
            return
        powers = list(self.power_watts)
        self.ax3.plot(powers, 'green', linewidth=2.5, label='Potência')
        self.ax3.axhline(y=self.TDP_WATTS, color='red', linestyle='--', linewidth=2,
                         label=f'TDP: {self.TDP_WATTS}W')
        self.ax3.axhline(y=self.peak_power, color='orange', linestyle=':', linewidth=2,
                         label=f'Peak: {self.peak_power:.1f}W')
        self.ax3.fill_between(range(len(powers)), powers, alpha=0.3, color='green')
        self.ax3.set_ylabel('Potência (W)', fontsize=11, weight='bold')
        self.ax3.set_xlabel('Amostras', fontsize=11, weight='bold')
        self.ax3.set_title('Consumo Elétrico', fontsize=11, weight='bold')
        self.ax3.legend(loc='upper left', fontsize=9)
        self.ax3.grid(True, alpha=0.3)
        self.ax3.set_ylim(0, 350)

    def _draw_voltage_current(self) -> None:
        """Plot 4: voltage (left axis) and derived current (right axis)."""
        if not (self.voltage_volts and self.current_amps):
            return
        self._ax4_twin = self.ax4.twinx()
        volts = list(self.voltage_volts)
        amps = list(self.current_amps)
        volt_line = self.ax4.plot(volts, 'blue', linewidth=2.5, label='Tensão (V)')
        self.ax4.set_ylabel('Tensão (V)', fontsize=11, weight='bold', color='blue')
        self.ax4.tick_params(axis='y', labelcolor='blue')
        amp_line = self._ax4_twin.plot(amps, 'red', linewidth=2.5, label='Corrente (A)')
        self._ax4_twin.set_ylabel('Corrente (A)', fontsize=11, weight='bold', color='red')
        self._ax4_twin.tick_params(axis='y', labelcolor='red')
        self.ax4.set_xlabel('Amostras', fontsize=11, weight='bold')
        self.ax4.set_title('Tensão e Corrente', fontsize=11, weight='bold')
        self.ax4.grid(True, alpha=0.3)
        # One combined legend for the lines living on both y-axes.
        lines = volt_line + amp_line
        self.ax4.legend(lines, [line.get_label() for line in lines],
                        loc='upper left', fontsize=9)

    def update_plot(self, frame):
        """Animation callback: redraw all four plots from the telemetry buffers.

        ``frame`` is supplied by ``FuncAnimation`` and is not used. Nothing is
        drawn until the first TFLOPS sample exists.
        """
        if len(self.tflops_history) == 0:
            return
        for ax in (self.ax1, self.ax2, self.ax3, self.ax4):
            ax.clear()
        # ax4.clear() does not touch the twin axis created on the previous
        # frame; drop it explicitly so twins (and their stale curves) do not
        # accumulate in the figure.
        if self._ax4_twin is not None:
            self._ax4_twin.remove()
            self._ax4_twin = None
        self._draw_tflops()
        self._draw_temperature()
        self._draw_power()
        self._draw_voltage_current()
        if not self.running and len(self.tflops_history) > 0:
            # Benchmark finished: overlay the final result on the TFLOPS plot.
            efficiency = (self.peak_tflops / self.THEORETICAL_TFLOPS) * 100
            self.ax1.text(0.5, 0.5,
                          f'🏆 {self.peak_tflops:.2f} TFLOPS\n{efficiency:.1f}%',
                          transform=self.ax1.transAxes, fontsize=16,
                          ha='center', va='center', color='darkgreen', weight='bold',
                          bbox=dict(boxstyle='round,pad=0.8', facecolor='lightgreen', alpha=0.9))

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------
    def _print_summary(self) -> None:
        """Print peak and average figures gathered during the run."""
        avg_power = sum(self.power_watts) / len(self.power_watts) if self.power_watts else 0
        avg_voltage = sum(self.voltage_volts) / len(self.voltage_volts) if self.voltage_volts else 0
        avg_current = sum(self.current_amps) / len(self.current_amps) if self.current_amps else 0
        max_temp_seen = max(self.temperatures) if self.temperatures else 0
        max_load_seen = max(self.load_level) if self.load_level else 0
        print(f"\n\n{'='*70}")
        print(f"{'BENCHMARK FP16 COMPLETO - RESULTADO':^70}")
        print(f"{'='*70}")
        print(f"🏆 PEAK TFLOPS (FP16): {self.peak_tflops:.2f}")
        print(f"📊 Teórico FP16: {self.THEORETICAL_TFLOPS} TFLOPS")
        print(f"📈 Eficiência: {(self.peak_tflops / self.THEORETICAL_TFLOPS) * 100:.1f}%")
        print(f"🌡️  Temp Máxima: {max_temp_seen:.1f}°C")
        print(f"⚡ Potência Peak: {self.peak_power:.1f}W")
        print(f"⚡ Potência Média: {avg_power:.1f}W")
        print(f"🔌 Tensão Média: {avg_voltage:.3f}V")
        print(f"🔌 Corrente Média: {avg_current:.2f}A")
        print(f"🔥 Carga Máxima: {max_load_seen}/10")
        print(f"{'='*70}\n")

    def run(self) -> None:
        """Start the stress thread, show the live plots, then print a summary.

        Blocks in ``plt.show()`` until the window is closed. The worker is
        then told to stop and given up to two seconds to finish its current
        batch before the summary is printed.
        """
        stress_thread = threading.Thread(target=self.stress_gpu, daemon=True)
        stress_thread.start()
        # The animation must stay referenced or it is garbage-collected.
        ani = animation.FuncAnimation(self.fig, self.update_plot,
                                      interval=300, cache_frame_data=False)
        plt.tight_layout()
        try:
            plt.show()
        finally:
            # Without this the worker keeps loading the GPU after the window
            # is closed and join() merely times out.
            self.running = False
            stress_thread.join(timeout=2)
        self._print_summary()
# Script entry point: build the benchmark and hand control to its run loop.
if __name__ == "__main__":
    FP16BenchmarkFull().run()