# NOTE(review): removed paste-artifact header lines ("Spaces:", "Runtime error" x2)
# that were not part of the module and made the file unparseable.
| """ | |
| Tensor Core subsystem for hyperrealistic GPU simulation. | |
| Models hardware-level matrix multiply-accumulate, scheduling, and memory integration. | |
| """ | |
| import time | |
| import sys | |
| import os | |
| sys.path.append(os.path.dirname(os.path.abspath(__file__))) | |
| try: | |
| from electron_speed import TARGET_SWITCHES_PER_SEC, TRANSISTORS_ON_CHIP | |
| except ImportError: | |
| TARGET_SWITCHES_PER_SEC = 9e20 | |
| TRANSISTORS_ON_CHIP = 6e11 | |
class TensorCore:
    """
    Simulates a hardware tensor core for matrix operations (multiply-accumulate),
    with realistic operand fetch from registers, shared memory, and VRAM/global
    memory.
    """

    def __init__(self, bits=2, memory_size=800*1024*1024*1024, bandwidth_tbps=10000, sm=None):
        """
        Args:
            bits: operand width in bits (default 2, i.e. sub-byte precision).
            memory_size: accepted for interface compatibility; currently unused
                because local memory is a sparse dict with no capacity cap.
            bandwidth_tbps: simulated operand-fetch bandwidth in TB/s.
            sm: reference to the parent SM, used for register/shared/global
                memory reads in fetch_operand (may be None if unused).
        """
        self.bits = bits
        # Sparse local memory: keys are (row, col) tuples, values are floats.
        # Unwritten cells read back as 0.0 (see read_matrix).
        self.memory = {}
        self.bandwidth_tbps = bandwidth_tbps  # Simulated fetch bandwidth (TB/s)
        self.sm = sm  # Parent SM providing the memory hierarchy

    def fetch_operand(self, source, addr, shape):
        """
        Fetch an n x m matrix operand from the given memory level.

        Args:
            source: 'register', 'shared', or 'global'.
            addr: address/index understood by the corresponding SM reader.
            shape: (n, m) dimensions of the operand.

        Returns:
            The fetched matrix (2D list), after sleeping to simulate
            latency plus bandwidth-limited transfer time.

        Raises:
            ValueError: if source is not one of the three known levels.
        """
        n, m = shape
        if source == 'register':
            # Register file: fastest path, minimal latency.
            matrix = self.sm.read_register_matrix(addr, n, m)
            latency = 1e-9  # 1ns
        elif source == 'shared':
            # On-chip shared memory.
            matrix = self.sm.shared_mem.read_matrix(addr, n, m)
            latency = 10e-9  # 10ns
        elif source == 'global':
            # VRAM / global memory: slowest path.
            matrix = self.sm.global_mem.read_matrix(addr, n, m)
            latency = 200e-9  # 200ns
        else:
            raise ValueError(f"Unknown source: {source}")
        # Bandwidth-limited transfer time. BUGFIX: the original used
        # (self.bits // 8), which truncates to 0 bytes for sub-byte widths
        # (the default bits=2), zeroing the transfer time entirely.
        data_size_bytes = n * m * self.bits / 8
        transfer_time = data_size_bytes / (self.bandwidth_tbps * 1e12)
        time.sleep(latency + transfer_time)  # Simulate the combined delay
        return matrix

    def matmul(self, A, B):
        """
        Naive dense matrix multiply: C = A @ B.

        Args:
            A: n x p matrix (2D list of floats/voltages).
            B: p x m matrix (2D list of floats/voltages).

        Returns:
            n x m result matrix as a 2D list of floats.
        """
        n = len(A)
        m = len(B[0])
        p = len(B)
        C = [[0.0 for _ in range(m)] for _ in range(n)]
        for i in range(n):
            for j in range(m):
                acc = 0.0
                # Fused multiply-accumulate along the shared dimension.
                for k in range(p):
                    acc += A[i][k] * B[k][j]
                C[i][j] = acc
        return C

    def matmul_from_memory(self, srcA, addrA, srcB, addrB, shapeA, shapeB):
        """
        Fetch both operands from the memory hierarchy, then multiply.

        Args:
            srcA/srcB: 'register', 'shared', or 'global'.
            addrA/addrB: address or index for the respective operand.
            shapeA/shapeB: (n, p) and (p, m) operand shapes.

        Returns:
            The n x m product matrix.
        """
        A = self.fetch_operand(srcA, addrA, shapeA)
        B = self.fetch_operand(srcB, addrB, shapeB)
        return self.matmul(A, B)

    def load_matrix(self, matrix, row_offset=0, col_offset=0):
        """Store a dense matrix into sparse local memory at the given offset."""
        for i, row in enumerate(matrix):
            for j, val in enumerate(row):
                self.memory[(row_offset + i, col_offset + j)] = val

    def read_matrix(self, n, m, row_offset=0, col_offset=0):
        """Read an n x m matrix from sparse local memory; missing cells are 0.0."""
        return [
            [self.memory.get((row_offset + i, col_offset + j), 0.0) for j in range(m)]
            for i in range(n)
        ]
class TensorCoreArray:
    """
    Array of tensor cores per SM, with round-robin scheduling, simulated
    compute timing, and memory integration.
    """

    def __init__(self, num_tensor_cores=8000, bits=2, memory_size=800*1024*1024*1024, bandwidth_tbps=10000, sm=None):
        """
        Args:
            num_tensor_cores: number of TensorCore instances to create.
            bits/memory_size/bandwidth_tbps: forwarded to each TensorCore.
            sm: parent SM reference, forwarded to each core and kept here.
        """
        self.tensor_cores = [
            TensorCore(bits=bits, memory_size=memory_size,
                       bandwidth_tbps=bandwidth_tbps, sm=sm)
            for _ in range(num_tensor_cores)
        ]
        self.schedule_ptr = 0  # Next core index for round-robin dispatch
        self.sm = sm
        # Deep realism: theoretical PFLOPS derived from the foundational
        # switching rate in electron_speed.py:
        #   clock_GHz = (TARGET_SWITCHES_PER_SEC / TRANSISTORS_ON_CHIP) / 1e9
        #   PFLOPS    = (num_tensor_cores * ops_per_cycle * clock_GHz) / 1e6
        self.ops_per_cycle = 1024  # Fused multiply-adds per cycle per core
        self.clock_ghz = (TARGET_SWITCHES_PER_SEC / TRANSISTORS_ON_CHIP) / 1e9
        self.pflops = (num_tensor_cores * self.ops_per_cycle * self.clock_ghz) / 1e6

    def schedule(self):
        """Pick the next tensor core, round-robin, and advance the pointer."""
        tc = self.tensor_cores[self.schedule_ptr]
        self.schedule_ptr = (self.schedule_ptr + 1) % len(self.tensor_cores)
        return tc

    def matmul(self, A, B):
        """
        Multiply A (n x p) by B (p x m) on the next scheduled core,
        sleeping for the theoretically required compute time.
        """
        tc = self.schedule()
        n = len(A)
        m = len(B[0])
        p = len(B)
        total_ops = n * m * p * 2  # 2 ops per FMA (multiply and add)
        seconds = total_ops / (self.pflops * 1e15)
        print(f"[TensorCoreArray] Matmul on {len(self.tensor_cores)} tensor cores @ {self.pflops:.1f} PFLOPS, ops={total_ops}, time={seconds:.9f}s")
        time.sleep(seconds)  # Simulate actual compute time
        return tc.matmul(A, B)

    def matmul_from_memory(self, srcA, addrA, srcB, addrB, shapeA, shapeB):
        """
        Fetch operands via the memory hierarchy on the next scheduled core
        and multiply, sleeping for the theoretical compute time.

        Raises:
            ValueError: if the inner dimensions of shapeA and shapeB disagree.
        """
        tc = self.schedule()
        n, p = shapeA
        p2, m = shapeB
        # BUGFIX: the original never checked p against p2; a mismatch
        # produced an IndexError or silently wrong products downstream.
        if p != p2:
            raise ValueError(f"Shape mismatch: {shapeA} x {shapeB}")
        total_ops = n * m * p * 2
        seconds = total_ops / (self.pflops * 1e15)
        print(f"[TensorCoreArray] Matmul from memory on {len(self.tensor_cores)} tensor cores @ {self.pflops:.1f} PFLOPS, ops={total_ops}, time={seconds:.9f}s")
        time.sleep(seconds)
        return tc.matmul_from_memory(srcA, addrA, srcB, addrB, shapeA, shapeB)

    def load_matrix(self, matrix, core_idx=0, row_offset=0, col_offset=0):
        """Load a matrix into the local memory of the core at core_idx."""
        self.tensor_cores[core_idx].load_matrix(matrix, row_offset, col_offset)

    def read_matrix(self, n, m, core_idx=0, row_offset=0, col_offset=0):
        """Read an n x m matrix from the local memory of the core at core_idx."""
        return self.tensor_cores[core_idx].read_matrix(n, m, row_offset, col_offset)