"""
Tensor Core subsystem for hyperrealistic GPU simulation.
Models hardware-level matrix multiply-accumulate, scheduling, and memory integration.
"""
import time
import sys
import os
# Make sibling modules (e.g. electron_speed.py) importable when run directly
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
try:
    from electron_speed import TARGET_SWITCHES_PER_SEC, TRANSISTORS_ON_CHIP
except ImportError:
    # Fallbacks if electron_speed.py is unavailable: aggregate transistor
    # switch rate (switches/s) and transistor count for the simulated chip.
    TARGET_SWITCHES_PER_SEC = 9e20
    TRANSISTORS_ON_CHIP = 6e11
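# With the fallback constants, the effective clock derived in
# TensorCoreArray.__init__ works out to 9e20 / 6e11 = 1.5e9 Hz, i.e. 1.5 GHz.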
class TensorCore:
"""
Simulates a hardware tensor core for matrix operations (multiply-accumulate),
with realistic operand fetch from registers, shared memory, and VRAM/global memory.
"""
    def __init__(self, bits=2, memory_size=800*1024*1024*1024, bandwidth_tbps=10000, sm=None):
        self.bits = bits  # Operand precision in bits (may be sub-byte)
        self.memory_size = memory_size  # Nominal local capacity in bytes (not enforced by the sparse dict)
        # Use a sparse dict for local memory: keys are (row, col), values are floats
        self.memory = {}
        self.bandwidth_tbps = bandwidth_tbps  # Simulated bandwidth for operand fetch (TB/s)
        self.sm = sm  # Reference to parent SM for memory access
def fetch_operand(self, source, addr, shape):
"""
Fetches a matrix operand from a given source (registers, shared, global).
Simulates bandwidth and latency.
"""
n, m = shape
if source == 'register':
# Simulate register fetch (fast, minimal latency)
matrix = self.sm.read_register_matrix(addr, n, m)
latency = 1e-9 # 1ns
elif source == 'shared':
# Simulate shared memory fetch
matrix = self.sm.shared_mem.read_matrix(addr, n, m)
latency = 10e-9 # 10ns
elif source == 'global':
# Simulate VRAM/global memory fetch
matrix = self.sm.global_mem.read_matrix(addr, n, m)
latency = 200e-9 # 200ns
else:
raise ValueError(f"Unknown source: {source}")
        # Simulate bandwidth (TB/s); float division keeps sub-byte precisions
        # (e.g. the default bits=2) from rounding the payload down to zero bytes
        data_size_bytes = n * m * self.bits / 8
        transfer_time = data_size_bytes / (self.bandwidth_tbps * 1e12)
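        # Example (assumed figures): a 16x16 tile at bits=16 is 512 bytes;
        # at 10,000 TB/s (1e16 B/s) the transfer takes ~5.12e-14 s, so the
        # fixed latency term dominates for small tiles.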
time.sleep(latency + transfer_time) # Simulate delay
return matrix
    def matmul(self, A, B):
        # A, B: 2D lists (matrices) of voltages; A is n x p, B is p x m
        n = len(A)
        m = len(B[0])
        p = len(B)
        if len(A[0]) != p:
            raise ValueError(f"Inner dimensions do not match: {len(A[0])} vs {p}")
        C = [[0.0 for _ in range(m)] for _ in range(n)]
for i in range(n):
for j in range(m):
acc = 0.0
for k in range(p):
acc += A[i][k] * B[k][j]
C[i][j] = acc
return C
def matmul_from_memory(self, srcA, addrA, srcB, addrB, shapeA, shapeB):
"""
Fetches operands from memory hierarchy and performs matmul.
srcA/srcB: 'register', 'shared', or 'global'
addrA/addrB: address or index
shapeA/shapeB: (n, p), (p, m)
"""
A = self.fetch_operand(srcA, addrA, shapeA)
B = self.fetch_operand(srcB, addrB, shapeB)
return self.matmul(A, B)
def load_matrix(self, matrix, row_offset=0, col_offset=0):
# Loads a matrix into local memory (sparse)
for i, row in enumerate(matrix):
for j, val in enumerate(row):
self.memory[(row_offset+i, col_offset+j)] = val
def read_matrix(self, n, m, row_offset=0, col_offset=0):
# Reads an n x m matrix from local memory (sparse)
return [
[self.memory.get((row_offset+i, col_offset+j), 0.0) for j in range(m)]
for i in range(n)
]
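# The SM interface that fetch_operand assumes is not defined in this file.
# The stub below is an illustrative stand-in (an assumption, not the real SM
# model): it exposes read_register_matrix(addr, n, m) plus shared_mem and
# global_mem objects with read_matrix(addr, n, m).
class _StubSM:
    """Hypothetical stand-in for the parent SM, used by the demo at the bottom."""
    def __init__(self):
        self._store = {}
        # Reuse this object as its own shared/global memory for simplicity
        self.shared_mem = self
        self.global_mem = self
    def write_matrix(self, addr, matrix):
        self._store[addr] = matrix
    def read_register_matrix(self, addr, n, m):
        return self.read_matrix(addr, n, m)
    def read_matrix(self, addr, n, m):
        # Return the stored matrix, or an n x m zero matrix if the address is empty
        return self._store.get(addr, [[0.0] * m for _ in range(n)])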
class TensorCoreArray:
"""
Array of tensor cores per SM, with scheduling and memory integration.
"""
def __init__(self, num_tensor_cores=8000, bits=2, memory_size=800*1024*1024*1024, bandwidth_tbps=10000, sm=None):
self.tensor_cores = [TensorCore(bits=bits, memory_size=memory_size, bandwidth_tbps=bandwidth_tbps, sm=sm) for _ in range(num_tensor_cores)]
self.schedule_ptr = 0
self.sm = sm
# Deep realism: calculate theoretical PFLOPS
# Use foundational switching rate from electron_speed.py
# PFLOPS = (num_tensor_cores * ops_per_cycle * clock_GHz) / 1e6
# clock_GHz = (TARGET_SWITCHES_PER_SEC / TRANSISTORS_ON_CHIP) / 1e9
        self.ops_per_cycle = 1024  # FLOPs per cycle per core (each FMA counted as 2 ops, matching total_ops below)
self.clock_ghz = (TARGET_SWITCHES_PER_SEC / TRANSISTORS_ON_CHIP) / 1e9
self.pflops = (num_tensor_cores * self.ops_per_cycle * self.clock_ghz) / 1e6
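        # With the defaults (8000 cores, 1024 ops/cycle) and the fallback
        # 1.5 GHz clock, this works out to 8000 * 1024 * 1.5 / 1e6
        # = 12.288 PFLOPS for the whole array.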
def schedule(self):
# Simple round-robin scheduling
tc = self.tensor_cores[self.schedule_ptr]
self.schedule_ptr = (self.schedule_ptr + 1) % len(self.tensor_cores)
return tc
def matmul(self, A, B):
tc = self.schedule()
# Deep realism: calculate actual compute time
n = len(A)
m = len(B[0])
p = len(B)
total_ops = n * m * p * 2 # 2 ops per FMA (multiply and add)
seconds = total_ops / (self.pflops * 1e15)
print(f"[TensorCoreArray] Matmul on {len(self.tensor_cores)} tensor cores @ {self.pflops:.1f} PFLOPS, ops={total_ops}, time={seconds:.9f}s")
time.sleep(seconds) # Simulate actual compute time
return tc.matmul(A, B)
def matmul_from_memory(self, srcA, addrA, srcB, addrB, shapeA, shapeB):
tc = self.schedule()
        n, p = shapeA
        p2, m = shapeB
        assert p == p2, f"Inner dimensions do not match: {p} vs {p2}"
        total_ops = n * m * p * 2
seconds = total_ops / (self.pflops * 1e15)
print(f"[TensorCoreArray] Matmul from memory on {len(self.tensor_cores)} tensor cores @ {self.pflops:.1f} PFLOPS, ops={total_ops}, time={seconds:.9f}s")
time.sleep(seconds)
return tc.matmul_from_memory(srcA, addrA, srcB, addrB, shapeA, shapeB)
def load_matrix(self, matrix, core_idx=0, row_offset=0, col_offset=0):
self.tensor_cores[core_idx].load_matrix(matrix, row_offset, col_offset)
def read_matrix(self, n, m, core_idx=0, row_offset=0, col_offset=0):
return self.tensor_cores[core_idx].read_matrix(n, m, row_offset, col_offset)
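# Minimal usage sketch (an assumed workflow, relying on the hypothetical
# _StubSM above): one matmul on in-place operands and one routed through the
# simulated memory hierarchy.
if __name__ == "__main__":
    sm = _StubSM()
    array = TensorCoreArray(num_tensor_cores=4, bits=16, sm=sm)
    A = [[1.0, 2.0], [3.0, 4.0]]
    B = [[5.0, 6.0], [7.0, 8.0]]
    print(array.matmul(A, B))  # [[19.0, 22.0], [43.0, 50.0]]
    sm.write_matrix('A', A)
    sm.write_matrix('B', B)
    print(array.matmul_from_memory('register', 'A', 'global', 'B', (2, 2), (2, 2)))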