meccatronis committed on
Commit
6fe8c45
·
verified ·
1 Parent(s): 3b4d89e

Upload benchmark_extreme.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. benchmark_extreme.py +288 -0
benchmark_extreme.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import torch
3
+ import matplotlib.pyplot as plt
4
+ import matplotlib.animation as animation
5
+ from datetime import datetime
6
+ import subprocess
7
+ import time
8
+ import psutil
9
+ import re
10
+ from collections import deque
11
+ import threading
12
+ import numpy as np
13
+
14
class ExtremeBenchmark:
    """GPU stress benchmark that ramps the load step by step with live plots.

    A worker thread (``stress_gpu``) runs chained float32 matrix
    multiplications on up to ``MAX_STREAMS`` CUDA streams. It raises the load
    while the temperature parsed from the ``sensors`` command stays low, and
    backs off or aborts as the reading approaches ``max_temp``.
    ``update_plot`` redraws TFLOPS, temperature and load level from the
    rolling histories, and ``run`` wires both together and prints a summary.
    """

    # Reference FP32 figure used for the dotted plot line and the efficiency.
    THEORETICAL_TFLOPS = 13.44
    # Number of CUDA streams created up front; also the cap for num_streams.
    MAX_STREAMS = 4
    # Matches a reading such as "+45.0°C" inside a line of `sensors` output.
    _TEMP_PATTERN = re.compile(r'([+-]?\d+\.?\d*)\s*°C')

    def __init__(self):
        """Set limits, rolling histories, the initial load and the figure."""
        self.max_temp = 85
        self.temperatures = deque(maxlen=200)
        self.tflops_history = deque(maxlen=200)
        self.load_level = deque(maxlen=200)
        self.peak_tflops = 0
        self.running = True

        # Whole-run extremes and timing. They are tracked separately because
        # the deques above only retain the most recent 200 samples.
        self.max_temp_seen = 0
        self.max_load_seen = 0
        self._test_started = None
        self._test_ended = None

        # Incremental load configuration
        self.current_load = 1  # 1 to 10
        self.matrix_size = 8192
        self.num_operations = 1
        self.num_streams = 1

        self.fig, (self.ax1, self.ax2, self.ax3) = plt.subplots(3, 1, figsize=(14, 10))
        self.fig.suptitle('BENCHMARK EXTREMO - Radeon Pro VII', fontsize=16, weight='bold')

        self.last_temp_check = time.time()
        self.temp_rising_fast = False

    def get_gpu_temp(self):
        """Return the first ``edge`` temperature printed by ``sensors``.

        Returns:
            The reading as a float, or 0 when the command cannot be run,
            exceeds its 0.5 s timeout, or prints no matching line.
        """
        try:
            result = subprocess.run(['sensors'], capture_output=True, text=True, timeout=0.5)
        except (OSError, subprocess.SubprocessError, ValueError):
            # Missing binary, timeout or undecodable output: report "no reading".
            return 0

        for line in result.stdout.splitlines():
            if 'edge:' in line.lower():
                match = self._TEMP_PATTERN.search(line)
                if match:
                    return float(match.group(1))
        return 0

    def check_system_health(self):
        """Return True while the host still looks responsive.

        CPU usage is sampled over a 0.05 s interval. The system is considered
        unhealthy when that call takes longer than 0.4 s, when CPU usage is
        above 95 %, or when the probe itself raises.
        """
        try:
            started = time.perf_counter()
            cpu = psutil.cpu_percent(interval=0.05)
            response = time.perf_counter() - started
        except Exception:
            return False

        # System is stalling if the probe is slow or the CPU is saturated.
        return response <= 0.4 and cpu <= 95

    def calculate_tflops(self, matrix_size, elapsed_time, num_ops, num_streams):
        """Convert one timed pass into TFLOPS.

        Counts ``2 * matrix_size**3`` floating-point operations per square
        matmul, times ``num_ops`` multiplications on each of ``num_streams``
        streams.

        Returns:
            Throughput in TFLOPS, or 0.0 when ``elapsed_time`` is not
            positive (avoids a ZeroDivisionError on a zero-length interval).
        """
        if elapsed_time <= 0:
            return 0.0
        operations = 2 * (matrix_size ** 3) * num_ops * num_streams
        return (operations / elapsed_time) / 1e12

    def increase_load(self):
        """Raise the load gradually.

        The level is capped at 10. From level 2 a stream is added per call
        (up to ``MAX_STREAMS``), from level 4 the operation count grows by 5
        per call while it is below 20, and from level 6 the matrix grows by
        1024 per call up to 14336.
        """
        if self.current_load < 10:
            self.current_load += 1

        if self.current_load >= 2 and self.num_streams < self.MAX_STREAMS:
            self.num_streams += 1

        if self.current_load >= 4 and self.num_operations < 20:
            self.num_operations += 5

        if self.current_load >= 6 and self.matrix_size < 14336:
            self.matrix_size = min(self.matrix_size + 1024, 14336)

    def decrease_load(self):
        """Lower the load for safety.

        Drops one level (floor 1), shrinks the matrix by 512 (floor 8192) and
        removes 5 operations (floor 1). The stream count is left unchanged.
        """
        if self.current_load > 1:
            self.current_load -= 1

        if self.matrix_size > 8192:
            self.matrix_size = max(self.matrix_size - 512, 8192)

        if self.num_operations > 5:
            self.num_operations = max(self.num_operations - 5, 1)

    def _apply_thermal_policy(self, temp, last_temp):
        """React to a fresh temperature reading; return False to abort."""
        # Recorded before any abort so the final report includes the reading
        # that triggered it.
        self.max_temp_seen = max(self.max_temp_seen, temp)

        # Detect fast heating: more than 2 °C between consecutive readings,
        # evaluated only once at least one sample has been recorded.
        if self.temperatures:
            self.temp_rising_fast = (temp - last_temp) > 2

        # EMERGENCY: dangerous temperature
        if temp >= self.max_temp:
            print(f"\n🚨 EMERGÊNCIA! Temperatura: {temp}°C - ABORTANDO!")
            self.running = False
            return False

        # WARNING: close to the limit, back off two steps
        if temp >= self.max_temp - 3:
            print(f"\n⚠️ ALERTA! Temperatura: {temp}°C - Reduzindo carga...")
            self.decrease_load()
            self.decrease_load()

        # Temperature climbing quickly while already warm
        if self.temp_rising_fast and temp > 75:
            self.decrease_load()

        return True

    def _timed_matmul_pass(self, device, streams):
        """Run one stress pass on the active streams; return elapsed seconds.

        The timed region includes generating the random inputs as well as
        the chained multiplications.
        """
        torch.cuda.synchronize()
        started = time.perf_counter()

        # Stress distributed across streams
        for stream in streams[:self.num_streams]:
            with torch.cuda.stream(stream):
                a = torch.randn(self.matrix_size, self.matrix_size, device=device, dtype=torch.float32)
                b = torch.randn(self.matrix_size, self.matrix_size, device=device, dtype=torch.float32)

                # Each product feeds the next multiplication.
                for _ in range(self.num_operations):
                    c = torch.mm(a, b)
                    a = b
                    b = c

        torch.cuda.synchronize()
        return time.perf_counter() - started

    def _test_duration(self):
        """Return seconds spent in the stress loop (0.0 if it never started)."""
        if self._test_started is None:
            return 0.0
        ended = self._test_ended
        if ended is None:
            # Worker has not left the loop yet; measure up to now.
            ended = time.perf_counter()
        return ended - self._test_started

    def stress_gpu(self):
        """Worker loop: stress the GPU and adapt the load until stopped.

        Each iteration polls the temperature (at most every 50 ms), checks
        system health, runs one timed matmul pass, records the sample, and
        raises the load after more than 10 consecutive cool cycles. The loop
        ends when ``self.running`` becomes False, which this method also sets
        on overheating, instability, or unexpected errors. Out-of-memory
        errors only reduce the load.
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if device.type == 'cpu':
            print("❌ ERRO: GPU não detectada!")
            self.running = False
            return

        props = torch.cuda.get_device_properties(0)
        print(f"🎯 GPU: {torch.cuda.get_device_name(0)}")
        print(f"💾 VRAM: {props.total_memory / 1e9:.1f} GB")
        print("🔥 Iniciando teste EXTREMO com aumento gradual...")
        print(f"⚠️ Limite de temperatura: {self.max_temp}°C")
        print("⚠️ Monitoramento de estabilidade: ATIVO\n")

        # A failed reading comes back as 0, which never trips the thermal
        # limits below, so tell the user the protection is blind.
        if self.get_gpu_temp() <= 0:
            print("⚠️ Sensor de temperatura indisponível - proteção térmica INATIVA!\n")

        streams = [torch.cuda.Stream() for _ in range(self.MAX_STREAMS)]
        last_temp = 0
        stable_cycles = 0

        self._test_started = time.perf_counter()
        try:
            while self.running:
                # Frequent temperature check
                now = time.time()
                if now - self.last_temp_check > 0.05:  # 50 ms
                    temp = self.get_gpu_temp()
                    self.last_temp_check = now
                    if not self._apply_thermal_policy(temp, last_temp):
                        break
                    last_temp = temp
                else:
                    temp = last_temp

                # Check system health
                if not self.check_system_health():
                    print("\n🚨 SISTEMA INSTÁVEL - ABORTANDO!")
                    self.running = False
                    break

                try:
                    elapsed = self._timed_matmul_pass(device, streams)

                    tflops = self.calculate_tflops(self.matrix_size, elapsed,
                                                   self.num_operations, self.num_streams)

                    self.temperatures.append(temp)
                    self.tflops_history.append(tflops)
                    self.load_level.append(self.current_load)
                    self.max_load_seen = max(self.max_load_seen, self.current_load)

                    if tflops > self.peak_tflops:
                        self.peak_tflops = tflops

                    print(f"TFLOPS: {tflops:6.2f} | Temp: {temp:5.1f}°C | Load: {self.current_load}/10 | "
                          f"Matrix: {self.matrix_size} | Ops: {self.num_operations} | Streams: {self.num_streams} | "
                          f"Peak: {self.peak_tflops:.2f}", end='\r')

                    # Gradual ramp-up logic
                    if temp < 75 and stable_cycles > 10:
                        self.increase_load()
                        stable_cycles = 0
                    elif temp < 80:
                        stable_cycles += 1
                    else:
                        stable_cycles = 0

                    time.sleep(0.02)  # 20 ms

                except RuntimeError as e:
                    if "out of memory" in str(e):
                        print("\n⚠️ VRAM cheia - Reduzindo carga...")
                        self.decrease_load()
                        torch.cuda.empty_cache()
                    else:
                        print(f"\n🚨 ERRO: {e}")
                        self.running = False
                        break
                except Exception as e:
                    print(f"\n🚨 ERRO CRÍTICO: {e}")
                    self.running = False
                    break
        finally:
            self._test_ended = time.perf_counter()

    def update_plot(self, frame):
        """Redraw the three charts from the rolling histories.

        Args:
            frame: Value supplied by ``FuncAnimation``; not used.
        """
        if not self.tflops_history:
            return

        # Snapshot once so each chart is drawn from a fixed list.
        tflops = list(self.tflops_history)
        temps = list(self.temperatures)
        loads = list(self.load_level)

        for axis in (self.ax1, self.ax2, self.ax3):
            axis.clear()

        # Chart 1: TFLOPS
        self.ax1.plot(tflops, 'b-', linewidth=2.5, label='TFLOPS Real')
        self.ax1.axhline(y=self.peak_tflops, color='g', linestyle='--', linewidth=2,
                         label=f'Peak: {self.peak_tflops:.2f}')
        self.ax1.axhline(y=self.THEORETICAL_TFLOPS, color='orange', linestyle=':', linewidth=2,
                         label=f'Teórico: {self.THEORETICAL_TFLOPS:.2f}')
        self.ax1.set_ylabel('TFLOPS', fontsize=12, weight='bold')
        self.ax1.set_title('Performance Computacional', fontsize=12, weight='bold')
        self.ax1.legend(loc='upper left')
        self.ax1.grid(True, alpha=0.3)
        self.ax1.set_ylim(0, 15)

        # Chart 2: temperature
        if temps:
            warn_level = self.max_temp - 5
            self.ax2.plot(temps, 'r-', linewidth=2.5, label='Temperatura')
            self.ax2.axhline(y=self.max_temp, color='red', linestyle='--', linewidth=2,
                             label=f'LIMITE: {self.max_temp}°C')
            # Label derived from the drawn level instead of a fixed "80".
            self.ax2.axhline(y=warn_level, color='orange', linestyle=':', linewidth=1.5,
                             label=f'Alerta: {warn_level}°C')
            self.ax2.fill_between(range(len(temps)), temps, self.max_temp,
                                  where=[t >= warn_level for t in temps],
                                  alpha=0.3, color='orange')
        self.ax2.set_ylabel('Temperatura (°C)', fontsize=12, weight='bold')
        self.ax2.set_title('Monitoramento Térmico', fontsize=12, weight='bold')
        self.ax2.legend(loc='upper left')
        self.ax2.grid(True, alpha=0.3)
        self.ax2.set_ylim(30, 95)

        # Chart 3: load level
        if loads:
            self.ax3.plot(loads, 'purple', linewidth=2.5, label='Nível de Stress')
            self.ax3.fill_between(range(len(loads)), loads, alpha=0.3, color='purple')
        self.ax3.set_ylabel('Carga (1-10)', fontsize=12, weight='bold')
        self.ax3.set_xlabel('Amostras', fontsize=12, weight='bold')
        self.ax3.set_title('Intensidade do Teste', fontsize=12, weight='bold')
        self.ax3.legend(loc='upper left')
        self.ax3.grid(True, alpha=0.3)
        self.ax3.set_ylim(0, 11)

        # Once the worker has stopped, overlay the final result on chart 1.
        if not self.running:
            efficiency = (self.peak_tflops / self.THEORETICAL_TFLOPS) * 100
            self.ax1.text(0.5, 0.5,
                          f'🏆 PEAK: {self.peak_tflops:.2f} TFLOPS\n'
                          f'📊 Eficiência: {efficiency:.1f}%',
                          transform=self.ax1.transAxes, fontsize=20,
                          ha='center', va='center', color='darkgreen', weight='bold',
                          bbox=dict(boxstyle='round,pad=1', facecolor='lightgreen', alpha=0.9))

    def run(self):
        """Start the worker, show the live plot, then print the final report."""
        stress_thread = threading.Thread(target=self.stress_gpu)
        stress_thread.daemon = True
        stress_thread.start()

        # Keep a reference so the animation is not garbage-collected.
        ani = animation.FuncAnimation(self.fig, self.update_plot,
                                      interval=300, cache_frame_data=False)
        plt.tight_layout()
        plt.show()

        # The window is gone: ask the worker to stop instead of letting it
        # keep loading the GPU while the report is printed.
        self.running = False
        stress_thread.join(timeout=2)

        efficiency = (self.peak_tflops / self.THEORETICAL_TFLOPS) * 100
        print(f"\n\n{'='*70}")
        print(f"{'RESULTADO FINAL DO BENCHMARK EXTREMO':^70}")
        print(f"{'='*70}")
        print(f"🏆 PEAK TFLOPS ALCANÇADO: {self.peak_tflops:.2f}")
        print(f"📊 TFLOPS Teórico (FP32): {self.THEORETICAL_TFLOPS:.2f}")
        print(f"📈 Eficiência Real: {efficiency:.1f}%")
        print(f"🌡️ Temperatura Máxima: {self.max_temp_seen:.1f}°C")
        print(f"🔥 Nível de Carga Máximo: {self.max_load_seen}/10")
        print(f"⏱️ Duração do Teste: {self._test_duration():.1f}s")
        print(f"{'='*70}\n")
285
+
286
if __name__ == "__main__":
    # Script entry point: build the benchmark and hand control to it.
    ExtremeBenchmark().run()