meccatronis committed on
Commit
3b4d89e
·
verified ·
1 Parent(s): 6d6f4dd

Upload benchmark_fp16_full.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. benchmark_fp16_full.py +343 -0
benchmark_fp16_full.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import torch
3
+ import matplotlib.pyplot as plt
4
+ import matplotlib.animation as animation
5
+ from datetime import datetime
6
+ import subprocess
7
+ import time
8
+ import psutil
9
+ import re
10
+ from collections import deque
11
+ import threading
12
+
13
class FP16BenchmarkFull:
    """Adaptive FP16 matrix-multiplication stress benchmark with live plots.

    A background thread (``stress_gpu``) repeatedly multiplies half-precision
    matrices on the GPU, polls temperature via ``sensors`` and power/voltage
    via ``rocm-smi``, and raises or lowers the workload depending on the
    temperature.  The main thread (``run``) animates four matplotlib panels
    (TFLOPS, temperature, power, voltage/current) from the recorded history
    and prints a summary once the plot window is closed.
    """

    # Reference figures drawn on the plots and printed in the summary.
    THEORETICAL_TFLOPS = 26.88
    TDP_WATTS = 300

    # Bounds of the adaptive workload.
    MAX_LOAD_LEVEL = 10
    MAX_STREAMS = 4
    MAX_OPERATIONS = 30
    MIN_MATRIX_SIZE = 8192
    MAX_MATRIX_SIZE = 16384

    # Number of samples kept per metric and minimum seconds between polls.
    HISTORY_LENGTH = 200
    METRICS_INTERVAL_S = 0.05

    # Compiled once; these are used on every metrics poll.
    _TEMP_RE = re.compile(r'([+-]?\d+\.?\d*)\s*°C')
    # Accept both "<value> W" (the layout the original pattern expected) and
    # "(W): <value>" (layout assumed for newer rocm-smi releases -- verify
    # against the installed version).  Same idea for millivolts.
    _POWER_RE = re.compile(r'\(W\)\s*:\s*(\d+\.?\d*)|(\d+\.?\d*)\s*W')
    _VOLTAGE_RE = re.compile(r'\(mV\)\s*:\s*(\d+\.?\d*)|(\d+\.?\d*)\s*mV')

    def __init__(self):
        """Initialise limits, metric histories, workload state and the figure."""
        self.max_temp = 85  # abort threshold in degrees Celsius

        # Rolling histories shared between the worker and the plot callback.
        self.temperatures = deque(maxlen=self.HISTORY_LENGTH)
        self.tflops_history = deque(maxlen=self.HISTORY_LENGTH)
        self.load_level = deque(maxlen=self.HISTORY_LENGTH)
        self.power_watts = deque(maxlen=self.HISTORY_LENGTH)
        self.voltage_volts = deque(maxlen=self.HISTORY_LENGTH)
        self.current_amps = deque(maxlen=self.HISTORY_LENGTH)
        self.peak_tflops = 0
        self.peak_power = 0
        self.running = True  # cleared to ask the worker thread to stop

        # Current workload shape.
        self.current_load = 1
        self.matrix_size = 10240
        self.num_operations = 1
        self.num_streams = 1

        self.fig, ((self.ax1, self.ax2), (self.ax3, self.ax4)) = plt.subplots(2, 2, figsize=(16, 10))
        self.fig.suptitle('BENCHMARK FP16 COMPLETO - Radeon Pro VII', fontsize=16, weight='bold')
        # Secondary y-axis of panel 4; recreated on every frame by update_plot.
        self.ax4_twin = None

        self.last_temp_check = time.time()
        self.temp_rising_fast = False

    @classmethod
    def _parse_edge_temp(cls, sensors_output):
        """Return the last ``edge:`` temperature in *sensors_output*, or 0 if none."""
        temp = 0
        for line in sensors_output.splitlines():
            if 'edge:' in line.lower():
                match = cls._TEMP_RE.search(line)
                if match:
                    temp = float(match.group(1))
        return temp

    @classmethod
    def _parse_power_voltage(cls, smi_output):
        """Return ``(watts, volts)`` parsed from rocm-smi text; 0 where not found."""
        power = 0
        voltage = 0
        for line in smi_output.splitlines():
            if 'Power' in line:
                match = cls._POWER_RE.search(line)
                if match:
                    power = float(match.group(1) or match.group(2))
            if 'volt' in line.lower():
                match = cls._VOLTAGE_RE.search(line)
                if match:
                    # rocm-smi reports millivolts; convert to volts.
                    voltage = float(match.group(1) or match.group(2)) / 1000
        return power, voltage

    def get_gpu_metrics(self):
        """Read temperature, power and voltage via ``sensors`` and ``rocm-smi``.

        Returns:
            A ``(temp_celsius, power_watts, voltage_volts)`` tuple.  Any value
            that could not be read is reported as 0; the tools being missing,
            slow or unparsable never raises.
        """
        temp = 0
        power = 0
        voltage = 0

        # Temperature via sensors (best effort).
        try:
            result = subprocess.run(['sensors'], capture_output=True, text=True, timeout=0.5)
            temp = self._parse_edge_temp(result.stdout)
        except (OSError, subprocess.SubprocessError, ValueError):
            # Missing binary, timeout or undecodable output: keep 0.
            pass

        # Power and voltage via rocm-smi (best effort).  The full option name
        # is used so the call does not depend on argparse prefix matching.
        try:
            result = subprocess.run(['rocm-smi', '--showpower', '--showvoltage'],
                                    capture_output=True, text=True, timeout=0.5)
            power, voltage = self._parse_power_voltage(result.stdout)
        except (OSError, subprocess.SubprocessError, ValueError):
            pass

        return temp, power, voltage

    def check_system_health(self) -> bool:
        """Return False when the host looks overloaded or cannot be probed.

        The probe samples CPU usage for 50 ms; the system is considered
        unhealthy if that call took longer than 0.4 s or CPU usage is above
        95 %.  A failing probe is also reported as unhealthy.
        """
        try:
            start = time.perf_counter()
            cpu = psutil.cpu_percent(interval=0.05)
            response = time.perf_counter() - start
        except Exception:
            return False
        return not (response > 0.4 or cpu > 95)

    def calculate_tflops(self, matrix_size, elapsed_time, num_ops, num_streams) -> float:
        """Convert a timed batch of square matmuls into TFLOPS.

        Each ``matrix_size`` x ``matrix_size`` product is counted as
        ``2 * matrix_size ** 3`` floating-point operations.  A non-positive
        ``elapsed_time`` yields 0.0 instead of dividing by zero.
        """
        if elapsed_time <= 0:
            return 0.0
        operations = 2 * (matrix_size ** 3) * num_ops * num_streams
        return (operations / elapsed_time) / 1e12

    def increase_load(self) -> None:
        """Step the workload up one level, widening it as the level grows.

        Streams are added from level 2, operations per stream from level 4 and
        the matrix size from level 6; every dimension is clamped to its cap.
        """
        if self.current_load < self.MAX_LOAD_LEVEL:
            self.current_load += 1

        if self.current_load >= 2 and self.num_streams < self.MAX_STREAMS:
            self.num_streams += 1

        if self.current_load >= 4 and self.num_operations < self.MAX_OPERATIONS:
            # Clamp so the +5 step cannot overshoot the cap (26 -> 31).
            self.num_operations = min(self.num_operations + 5, self.MAX_OPERATIONS)

        if self.current_load >= 6 and self.matrix_size < self.MAX_MATRIX_SIZE:
            self.matrix_size = min(self.matrix_size + 1024, self.MAX_MATRIX_SIZE)

    def decrease_load(self) -> None:
        """Step the workload down one level, shrinking matrices and op count."""
        if self.current_load > 1:
            self.current_load -= 1

        if self.matrix_size > self.MIN_MATRIX_SIZE:
            self.matrix_size = max(self.matrix_size - 512, self.MIN_MATRIX_SIZE)

        # Allow the documented floor of 1 to be reached from any value.
        if self.num_operations > 1:
            self.num_operations = max(self.num_operations - 5, 1)

    def stress_gpu(self) -> None:
        """Worker loop: run FP16 matmuls, record metrics and adapt the load.

        Stops (clearing ``self.running``) when no GPU is available, the
        temperature reaches ``max_temp``, the host health probe fails, or an
        unexpected error occurs.  Out-of-memory errors only reduce the load.
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if device.type == 'cpu':
            print("❌ ERRO: GPU não detectada!")
            self.running = False
            return

        props = torch.cuda.get_device_properties(0)
        print(f"🎯 GPU: {torch.cuda.get_device_name(0)}")
        print(f"💾 VRAM: {props.total_memory / 1e9:.1f} GB")
        print(f"🔥 Modo: FP16 (Half Precision)")
        print(f"📊 TFLOPS Teórico FP16: ~{self.THEORETICAL_TFLOPS}")
        print(f"⚡ TDP Especificado: {self.TDP_WATTS}W")
        print(f"⚠️  Limite: {self.max_temp}°C\n")

        streams = [torch.cuda.Stream() for _ in range(self.MAX_STREAMS)]
        last_temp = 0
        stable_cycles = 0
        warned_no_temp = False

        while self.running:
            current_time = time.time()
            if current_time - self.last_temp_check > self.METRICS_INTERVAL_S:
                temp, power, voltage = self.get_gpu_metrics()
                self.last_temp_check = current_time

                # A reading of 0 means the tool failed this time.  Carry the
                # last valid value forward so one failed poll neither records
                # a bogus 0 nor fakes a temperature jump on the next poll.
                if temp <= 0:
                    if last_temp <= 0 and not warned_no_temp:
                        print("\n⚠️  Temperatura indisponível - proteção térmica inativa!")
                        warned_no_temp = True
                    temp = last_temp
                if power <= 0 and self.power_watts:
                    power = self.power_watts[-1]
                if voltage <= 0 and self.voltage_volts:
                    voltage = self.voltage_volts[-1]

                # Current from I = P / V.
                current = power / voltage if voltage > 0 else 0

                if len(self.temperatures) > 0:
                    self.temp_rising_fast = (temp - last_temp) > 2

                if temp >= self.max_temp:
                    print(f"\n🚨 TEMPERATURA: {temp}°C - ABORTANDO!")
                    self.running = False
                    break

                if temp >= self.max_temp - 3:
                    print(f"\n⚠️  ALERTA! Temp: {temp}°C - Reduzindo...")
                    self.decrease_load()
                    self.decrease_load()

                if self.temp_rising_fast and temp > 75:
                    self.decrease_load()

                last_temp = temp
            else:
                # Too soon to poll again: reuse the most recent values.
                temp = last_temp
                power = self.power_watts[-1] if self.power_watts else 0
                voltage = self.voltage_volts[-1] if self.voltage_volts else 0
                current = power / voltage if voltage > 0 else 0

            if not self.check_system_health():
                print(f"\n🚨 SISTEMA INSTÁVEL!")
                self.running = False
                break

            try:
                torch.cuda.synchronize()
                # perf_counter is monotonic, unlike the wall clock.  The timed
                # region also covers generating the inputs.
                start = time.perf_counter()

                for stream in streams[:self.num_streams]:
                    with torch.cuda.stream(stream):
                        a = torch.randn(self.matrix_size, self.matrix_size, device=device, dtype=torch.float16)
                        b = torch.randn(self.matrix_size, self.matrix_size, device=device, dtype=torch.float16)

                        # Chain the products so each result feeds the next.
                        for _ in range(self.num_operations):
                            c = torch.mm(a, b)
                            a = b
                            b = c

                torch.cuda.synchronize()
                elapsed = time.perf_counter() - start

                tflops = self.calculate_tflops(self.matrix_size, elapsed,
                                               self.num_operations, self.num_streams)

                self.temperatures.append(temp)
                self.tflops_history.append(tflops)
                self.load_level.append(self.current_load)
                self.power_watts.append(power)
                self.voltage_volts.append(voltage)
                self.current_amps.append(current)

                if tflops > self.peak_tflops:
                    self.peak_tflops = tflops

                if power > self.peak_power:
                    self.peak_power = power

                print(f"TFLOPS: {tflops:6.2f} | Temp: {temp:5.1f}°C | {power:6.1f}W | {voltage:5.3f}V | {current:6.2f}A | Peak: {self.peak_tflops:.2f}", end='\r')

                # Ramp up only after more than 10 consecutive cool cycles.
                if temp < 75 and stable_cycles > 10:
                    self.increase_load()
                    stable_cycles = 0
                elif temp < 80:
                    stable_cycles += 1
                else:
                    stable_cycles = 0

                time.sleep(0.02)

            except RuntimeError as e:
                if "out of memory" in str(e):
                    print(f"\n⚠️  VRAM cheia - Reduzindo...")
                    self.decrease_load()
                    torch.cuda.empty_cache()
                else:
                    print(f"\n🚨 ERRO: {e}")
                    self.running = False
                    break
            except Exception as e:
                print(f"\n🚨 ERRO: {e}")
                self.running = False
                break

    def update_plot(self, frame) -> None:
        """Animation callback: redraw all four panels from the histories.

        Args:
            frame: Frame value supplied by ``FuncAnimation``; unused.
        """
        if not self.tflops_history:
            return

        for ax in (self.ax1, self.ax2, self.ax3, self.ax4):
            ax.clear()

        # ax4.clear() does not remove the twin axis created on the previous
        # frame; remove it explicitly so twins do not accumulate on the figure.
        if self.ax4_twin is not None:
            self.ax4_twin.remove()
            self.ax4_twin = None

        # Panel 1: TFLOPS
        self.ax1.plot(list(self.tflops_history), 'b-', linewidth=2.5, label='TFLOPS FP16')
        self.ax1.axhline(y=self.peak_tflops, color='g', linestyle='--', linewidth=2,
                         label=f'Peak: {self.peak_tflops:.2f}')
        self.ax1.axhline(y=self.THEORETICAL_TFLOPS, color='orange', linestyle=':', linewidth=2,
                         label=f'Teórico: {self.THEORETICAL_TFLOPS}')
        self.ax1.set_ylabel('TFLOPS', fontsize=11, weight='bold')
        self.ax1.set_title('Performance FP16', fontsize=11, weight='bold')
        self.ax1.legend(loc='upper left', fontsize=9)
        self.ax1.grid(True, alpha=0.3)
        self.ax1.set_ylim(0, 30)

        # Panel 2: temperature
        temps = list(self.temperatures)
        if temps:
            self.ax2.plot(temps, 'r-', linewidth=2.5, label='Temperatura')
            self.ax2.axhline(y=self.max_temp, color='red', linestyle='--', linewidth=2,
                             label=f'Limite: {self.max_temp}°C')
            # Shade samples within 5 degrees of the limit.
            self.ax2.fill_between(range(len(temps)), temps, self.max_temp,
                                  where=[t >= self.max_temp - 5 for t in temps],
                                  alpha=0.3, color='orange')
            self.ax2.set_ylabel('Temperatura (°C)', fontsize=11, weight='bold')
            self.ax2.set_title('Monitoramento Térmico', fontsize=11, weight='bold')
            self.ax2.legend(loc='upper left', fontsize=9)
            self.ax2.grid(True, alpha=0.3)
            self.ax2.set_ylim(30, 95)

        # Panel 3: power
        powers = list(self.power_watts)
        if powers:
            self.ax3.plot(powers, 'green', linewidth=2.5, label='Potência')
            self.ax3.axhline(y=self.TDP_WATTS, color='red', linestyle='--', linewidth=2,
                             label=f'TDP: {self.TDP_WATTS}W')
            self.ax3.axhline(y=self.peak_power, color='orange', linestyle=':', linewidth=2,
                             label=f'Peak: {self.peak_power:.1f}W')
            self.ax3.fill_between(range(len(powers)), powers, alpha=0.3, color='green')
            self.ax3.set_ylabel('Potência (W)', fontsize=11, weight='bold')
            self.ax3.set_xlabel('Amostras', fontsize=11, weight='bold')
            self.ax3.set_title('Consumo Elétrico', fontsize=11, weight='bold')
            self.ax3.legend(loc='upper left', fontsize=9)
            self.ax3.grid(True, alpha=0.3)
            self.ax3.set_ylim(0, 350)

        # Panel 4: voltage (left axis) and current (right axis)
        volts = list(self.voltage_volts)
        amps = list(self.current_amps)
        if volts and amps:
            self.ax4_twin = self.ax4.twinx()

            line1 = self.ax4.plot(volts, 'blue', linewidth=2.5, label='Tensão (V)')
            self.ax4.set_ylabel('Tensão (V)', fontsize=11, weight='bold', color='blue')
            self.ax4.tick_params(axis='y', labelcolor='blue')

            line2 = self.ax4_twin.plot(amps, 'red', linewidth=2.5, label='Corrente (A)')
            self.ax4_twin.set_ylabel('Corrente (A)', fontsize=11, weight='bold', color='red')
            self.ax4_twin.tick_params(axis='y', labelcolor='red')

            self.ax4.set_xlabel('Amostras', fontsize=11, weight='bold')
            self.ax4.set_title('Tensão e Corrente', fontsize=11, weight='bold')
            self.ax4.grid(True, alpha=0.3)

            # One legend covering the lines of both y-axes.
            lines = line1 + line2
            labels = [line.get_label() for line in lines]
            self.ax4.legend(lines, labels, loc='upper left', fontsize=9)

        # Once the worker has stopped, overlay the final result on panel 1.
        if not self.running:
            efficiency = (self.peak_tflops / self.THEORETICAL_TFLOPS) * 100
            self.ax1.text(0.5, 0.5,
                          f'🏆 {self.peak_tflops:.2f} TFLOPS\n{efficiency:.1f}%',
                          transform=self.ax1.transAxes, fontsize=16,
                          ha='center', va='center', color='darkgreen', weight='bold',
                          bbox=dict(boxstyle='round,pad=0.8', facecolor='lightgreen', alpha=0.9))

    def run(self) -> None:
        """Start the worker thread, show the live plots, then print a summary."""
        stress_thread = threading.Thread(target=self.stress_gpu)
        stress_thread.daemon = True
        stress_thread.start()

        # The animation object must stay referenced while the window is open.
        ani = animation.FuncAnimation(self.fig, self.update_plot,
                                      interval=300, cache_frame_data=False)
        plt.tight_layout()
        plt.show()

        # Ask the worker to stop before waiting for it; otherwise the join
        # below only times out while the GPU keeps being stressed.
        self.running = False
        stress_thread.join(timeout=2)

        # Snapshot the histories: the worker may still be finishing a cycle.
        temps = list(self.temperatures)
        powers = list(self.power_watts)
        volts = list(self.voltage_volts)
        amps = list(self.current_amps)
        loads = list(self.load_level)

        avg_power = sum(powers) / len(powers) if powers else 0
        avg_voltage = sum(volts) / len(volts) if volts else 0
        avg_current = sum(amps) / len(amps) if amps else 0

        print(f"\n\n{'='*70}")
        print(f"{'BENCHMARK FP16 COMPLETO - RESULTADO':^70}")
        print(f"{'='*70}")
        print(f"🏆 PEAK TFLOPS (FP16): {self.peak_tflops:.2f}")
        print(f"📊 Teórico FP16: {self.THEORETICAL_TFLOPS} TFLOPS")
        print(f"📈 Eficiência: {(self.peak_tflops / self.THEORETICAL_TFLOPS) * 100:.1f}%")
        print(f"🌡️  Temp Máxima: {max(temps) if temps else 0:.1f}°C")
        print(f"⚡ Potência Peak: {self.peak_power:.1f}W")
        print(f"⚡ Potência Média: {avg_power:.1f}W")
        print(f"🔌 Tensão Média: {avg_voltage:.3f}V")
        print(f"🔌 Corrente Média: {avg_current:.2f}A")
        print(f"🔥 Carga Máxima: {max(loads) if loads else 0}/10")
        print(f"{'='*70}\n")
340
+
341
if __name__ == "__main__":
    # Script entry point: build the benchmark and start it.
    FP16BenchmarkFull().run()