# NOTE(review): removed paste-artifact header lines ("Spaces:", "Runtime error" x2)
# that were not part of the module and made the file unparseable.
| """ | |
| Tensor Core subsystem for hyperrealistic GPU simulation. | |
| Models hardware-level matrix multiply-accumulate, scheduling, and memory integration. | |
| """ | |
| import time | |
| import sys | |
| import os | |
| sys.path.append(os.path.dirname(os.path.abspath(__file__))) | |
| try: | |
| from electron_speed import TARGET_SWITCHES_PER_SEC, TRANSISTORS_ON_CHIP | |
| except ImportError: | |
| TARGET_SWITCHES_PER_SEC = 9e20 | |
| TRANSISTORS_ON_CHIP = 6e11 | |
class TensorCore:
    """
    Simulates a hardware tensor core for matrix operations (multiply-accumulate),
    with realistic operand fetch from registers, shared memory, and VRAM/global
    memory.
    """

    def __init__(self, bits=2, memory_size=800*1024*1024*1024, bandwidth_tbps=10000, sm=None):
        """
        Args:
            bits: operand width in bits (default 2, i.e. sub-byte precision).
            memory_size: accepted for interface compatibility; currently unused
                because local memory is a sparse dict with no capacity cap.
            bandwidth_tbps: simulated operand-fetch bandwidth in TB/s.
            sm: reference to the parent SM, used for register/shared/global
                memory reads in fetch_operand (may be None if unused).
        """
        self.bits = bits
        # Sparse local memory: keys are (row, col) tuples, values are floats.
        # Unwritten cells read back as 0.0 (see read_matrix).
        self.memory = {}
        self.bandwidth_tbps = bandwidth_tbps  # Simulated fetch bandwidth (TB/s)
        self.sm = sm  # Parent SM providing the memory hierarchy

    def fetch_operand(self, source, addr, shape):
        """
        Fetch an n x m matrix operand from the given memory level.

        Args:
            source: 'register', 'shared', or 'global'.
            addr: address/index understood by the corresponding SM reader.
            shape: (n, m) dimensions of the operand.

        Returns:
            The fetched matrix (2D list), after sleeping to simulate
            latency plus bandwidth-limited transfer time.

        Raises:
            ValueError: if source is not one of the three known levels.
        """
        n, m = shape
        if source == 'register':
            # Register file: fastest path, minimal latency.
            matrix = self.sm.read_register_matrix(addr, n, m)
            latency = 1e-9  # 1ns
        elif source == 'shared':
            # On-chip shared memory.
            matrix = self.sm.shared_mem.read_matrix(addr, n, m)
            latency = 10e-9  # 10ns
        elif source == 'global':
            # VRAM / global memory: slowest path.
            matrix = self.sm.global_mem.read_matrix(addr, n, m)
            latency = 200e-9  # 200ns
        else:
            raise ValueError(f"Unknown source: {source}")
        # Bandwidth-limited transfer time. BUGFIX: the original used
        # (self.bits // 8), which truncates to 0 bytes for sub-byte widths
        # (the default bits=2), zeroing the transfer time entirely.
        data_size_bytes = n * m * self.bits / 8
        transfer_time = data_size_bytes / (self.bandwidth_tbps * 1e12)
        time.sleep(latency + transfer_time)  # Simulate the combined delay
        return matrix

    def matmul(self, A, B):
        """
        Naive dense matrix multiply: C = A @ B.

        Args:
            A: n x p matrix (2D list of floats/voltages).
            B: p x m matrix (2D list of floats/voltages).

        Returns:
            n x m result matrix as a 2D list of floats.
        """
        n = len(A)
        m = len(B[0])
        p = len(B)
        C = [[0.0 for _ in range(m)] for _ in range(n)]
        for i in range(n):
            for j in range(m):
                acc = 0.0
                # Fused multiply-accumulate along the shared dimension.
                for k in range(p):
                    acc += A[i][k] * B[k][j]
                C[i][j] = acc
        return C

    def matmul_from_memory(self, srcA, addrA, srcB, addrB, shapeA, shapeB):
        """
        Fetch both operands from the memory hierarchy, then multiply.

        Args:
            srcA/srcB: 'register', 'shared', or 'global'.
            addrA/addrB: address or index for the respective operand.
            shapeA/shapeB: (n, p) and (p, m) operand shapes.

        Returns:
            The n x m product matrix.
        """
        A = self.fetch_operand(srcA, addrA, shapeA)
        B = self.fetch_operand(srcB, addrB, shapeB)
        return self.matmul(A, B)

    def load_matrix(self, matrix, row_offset=0, col_offset=0):
        """Store a dense matrix into sparse local memory at the given offset."""
        for i, row in enumerate(matrix):
            for j, val in enumerate(row):
                self.memory[(row_offset + i, col_offset + j)] = val

    def read_matrix(self, n, m, row_offset=0, col_offset=0):
        """Read an n x m matrix from sparse local memory; missing cells are 0.0."""
        return [
            [self.memory.get((row_offset + i, col_offset + j), 0.0) for j in range(m)]
            for i in range(n)
        ]
class TensorCoreArray:
    """
    Array of tensor cores per SM, with round-robin scheduling, simulated
    compute timing, and memory integration.
    """

    def __init__(self, num_tensor_cores=8000, bits=2, memory_size=800*1024*1024*1024, bandwidth_tbps=10000, sm=None):
        """
        Args:
            num_tensor_cores: number of TensorCore instances to create.
            bits/memory_size/bandwidth_tbps: forwarded to each TensorCore.
            sm: parent SM reference, forwarded to each core and kept here.
        """
        self.tensor_cores = [
            TensorCore(bits=bits, memory_size=memory_size,
                       bandwidth_tbps=bandwidth_tbps, sm=sm)
            for _ in range(num_tensor_cores)
        ]
        self.schedule_ptr = 0  # Next core index for round-robin dispatch
        self.sm = sm
        # Deep realism: theoretical PFLOPS derived from the foundational
        # switching rate in electron_speed.py:
        #   clock_GHz = (TARGET_SWITCHES_PER_SEC / TRANSISTORS_ON_CHIP) / 1e9
        #   PFLOPS    = (num_tensor_cores * ops_per_cycle * clock_GHz) / 1e6
        self.ops_per_cycle = 1024  # Fused multiply-adds per cycle per core
        self.clock_ghz = (TARGET_SWITCHES_PER_SEC / TRANSISTORS_ON_CHIP) / 1e9
        self.pflops = (num_tensor_cores * self.ops_per_cycle * self.clock_ghz) / 1e6

    def schedule(self):
        """Pick the next tensor core, round-robin, and advance the pointer."""
        tc = self.tensor_cores[self.schedule_ptr]
        self.schedule_ptr = (self.schedule_ptr + 1) % len(self.tensor_cores)
        return tc

    def matmul(self, A, B):
        """
        Multiply A (n x p) by B (p x m) on the next scheduled core,
        sleeping for the theoretically required compute time.
        """
        tc = self.schedule()
        n = len(A)
        m = len(B[0])
        p = len(B)
        total_ops = n * m * p * 2  # 2 ops per FMA (multiply and add)
        seconds = total_ops / (self.pflops * 1e15)
        print(f"[TensorCoreArray] Matmul on {len(self.tensor_cores)} tensor cores @ {self.pflops:.1f} PFLOPS, ops={total_ops}, time={seconds:.9f}s")
        time.sleep(seconds)  # Simulate actual compute time
        return tc.matmul(A, B)

    def matmul_from_memory(self, srcA, addrA, srcB, addrB, shapeA, shapeB):
        """
        Fetch operands via the memory hierarchy on the next scheduled core
        and multiply, sleeping for the theoretical compute time.

        Raises:
            ValueError: if the inner dimensions of shapeA and shapeB disagree.
        """
        tc = self.schedule()
        n, p = shapeA
        p2, m = shapeB
        # BUGFIX: the original never checked p against p2; a mismatch
        # produced an IndexError or silently wrong products downstream.
        if p != p2:
            raise ValueError(f"Shape mismatch: {shapeA} x {shapeB}")
        total_ops = n * m * p * 2
        seconds = total_ops / (self.pflops * 1e15)
        print(f"[TensorCoreArray] Matmul from memory on {len(self.tensor_cores)} tensor cores @ {self.pflops:.1f} PFLOPS, ops={total_ops}, time={seconds:.9f}s")
        time.sleep(seconds)
        return tc.matmul_from_memory(srcA, addrA, srcB, addrB, shapeA, shapeB)

    def load_matrix(self, matrix, core_idx=0, row_offset=0, col_offset=0):
        """Load a matrix into the local memory of the core at core_idx."""
        self.tensor_cores[core_idx].load_matrix(matrix, row_offset, col_offset)

    def read_matrix(self, n, m, core_idx=0, row_offset=0, col_offset=0):
        """Read an n x m matrix from the local memory of the core at core_idx."""
        return self.tensor_cores[core_idx].read_matrix(n, m, row_offset, col_offset)