from multicore import MultiCoreSystem
from vram.ram_controller import RAMController
import os
from gpu_state_db import GPUStateDB
from custom_vram import CustomVRAM
from ai import AIAccelerator


class TensorCoreDB:
    def __init__(self, tensor_core_id, sm_id, db):
        self.tensor_core_id = tensor_core_id
        self.sm_id = sm_id
        self.db = db

    def load_state(self):
        state = self.db.load_state("tensor_core", "tensor_core_id", self.tensor_core_id)
        return state or {}

    def save_state(self, state):
        self.db.save_state("tensor_core", "tensor_core_id", self.tensor_core_id, state)

    def matmul(self, A, B):
        state = self.load_state()
        # Simulate a matrix multiply (for demo, just sum all elements)
        result = sum(sum(row) for row in A) * sum(sum(row) for row in B)
        state["last_result"] = result
        self.save_state(state)
        return result


class OpticalInterconnect:
    def __init__(self, bandwidth_tbps=800, latency_ns=1):
        self.bandwidth_tbps = bandwidth_tbps  # TB/s
        self.latency_ns = latency_ns  # nanoseconds

    def transfer_time(self, data_size_bytes):
        # Time = latency + (data_size / bandwidth)
        bandwidth_bytes_per_s = self.bandwidth_tbps * 1e12
        transfer_time_s = self.latency_ns * 1e-9 + (data_size_bytes / bandwidth_bytes_per_s)
        return transfer_time_s


class Thread:
    def __init__(self, thread_id, core):
        self.thread_id = thread_id
        self.core = core
        self.active = True
        self.result = None

    def run(self, a, b, cin, opcode, reg_sel):
        if self.active:
            self.result = self.core.step(a, b, cin, opcode, reg_sel)
        return self.result


class Warp:
    def __init__(self, warp_id, threads):
        self.warp_id = warp_id
        self.threads = threads  # List of Thread objects
        self.active = True

    def run(self, a, b, cin, opcode, reg_sel):
        # All threads in a warp execute in lockstep (SIMT)
        return [thread.run(a, b, cin, opcode, reg_sel) for thread in self.threads if thread.active]


class WarpScheduler:
    def __init__(self, warps):
        self.warps = warps  # List of Warp objects
        self.schedule_ptr = 0

    def schedule(self):
        # Simple round-robin scheduler
        if not self.warps:
            return None
        warp = self.warps[self.schedule_ptr]
        self.schedule_ptr = (self.schedule_ptr + 1) % len(self.warps)
        return warp


class SharedMemory:
    def __init__(self, size):
        self.size = size
        self.mem = [0] * size

    def read(self, addr):
        return self.mem[addr % self.size]

    def write(self, addr, value):
        self.mem[addr % self.size] = value

    def read_matrix(self, addr, n, m):
        # Simulate reading an n x m matrix from shared memory
        # For simplicity, treat addr as row offset
        return [
            [self.mem[(addr + i * m + j) % self.size] for j in range(m)]
            for i in range(n)
        ]


class L1Cache:
    def __init__(self, size):
        self.size = size
        self.cache = [None] * size

    def read(self, addr):
        return self.cache[addr % self.size]

    def write(self, addr, value):
        self.cache[addr % self.size] = value


# GlobalMemory now uses RAMController and persists to .db
class GlobalMemory:
    def __init__(self, size_bytes=None, db_path=None):
        if db_path is None:
            import uuid
            db_path = os.path.join(os.path.dirname(__file__), f"global_mem_{uuid.uuid4().hex}.db")
        self.size_bytes = float('inf')  # Unlimited size
        self.ram = RAMController(size_bytes=None, db_path=db_path)  # Pass None for unlimited size
        self.allocated_address = 0  # Simple allocation pointer

    def read(self, addr, length=1):
        data = self.ram.read(addr, length)
        # Return as int for compatibility (simulate voltage)
        if length == 1:
            return int(data[0]) if data else 0
        return [int(b) for b in data]

    def write(self, addr, value):
        # Accepts int, float, or list/bytes
        if isinstance(value, (int, float)):
            data = bytes([int(value) & 0xFF])
        elif isinstance(value, (bytes, bytearray)):
            data = value
        elif isinstance(value, list):
            # Convert list of integers to bytes, assuming each integer is a byte value (0-255)
            data = bytes(value)
        else:
            raise TypeError("Unsupported value type for write")
        self.ram.write(addr, data)

    def read_matrix(self, addr, n, m):
        # Read n*m bytes and reshape
        data = self.ram.read(addr, n * m)
        return [list(data[i * m:(i + 1) * m]) for i in range(n)]

    def allocate_space(self, size_bytes: int) -> int:
        """Simulates allocating space in global memory with unlimited capacity."""
        allocated_addr = self.allocated_address
        self.allocated_address += size_bytes
        return allocated_addr  # Always succeeds due to unlimited storage
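
# Illustrative sketch (hypothetical helper, not part of the simulator's API):
# a quick sanity check of OpticalInterconnect.transfer_time. For a 1 GiB payload
# on the default 800 TB/s, 1 ns link the model gives
# 1e-9 s + (2**30 bytes / 8e14 bytes/s) ~= 1.34 microseconds.
def _example_optical_transfer_time(data_size_bytes=1024 ** 3):
    link = OpticalInterconnect(bandwidth_tbps=800, latency_ns=1)
    return link.transfer_time(data_size_bytes)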
# StreamingMultiprocessor now only loads state from DB as needed
class StreamingMultiprocessor:
    def __init__(self, sm_id, chip_id, db: GPUStateDB, num_cores_per_sm=128,
                 warps_per_sm=164, threads_per_warp=700, num_tensor_cores=8):
        self.sm_id = sm_id
        self.chip_id = chip_id
        self.db = db
        self.num_cores_per_sm = num_cores_per_sm
        self.warps_per_sm = warps_per_sm
        self.threads_per_warp = threads_per_warp
        self.num_tensor_cores = num_tensor_cores
        self.global_mem = None  # Will be set by GPUMemoryHierarchy

    def load_state(self):
        state = self.db.load_state("sm", "sm_id", self.sm_id)
        return state or {}

    def save_state(self, state):
        self.db.save_state("sm", "sm_id", self.sm_id, state)

    def attach_global_mem(self, global_mem):
        self.global_mem = global_mem

    def get_core(self, core_id):
        return Core(core_id, self.sm_id, self.db)

    def get_warp(self, warp_id):
        return WarpDB(warp_id, self.sm_id, self.db)

    def get_tensor_core(self, tensor_core_id):
        return TensorCoreDB(tensor_core_id, self.sm_id, self.db)

    def run_next_warp(self, a, b, cin, opcode, reg_sel):
        # Example: load warp 0, run, save
        warp = self.get_warp(0)
        result = warp.run(a, b, cin, opcode, reg_sel)
        return result

    def tensor_core_matmul(self, A, B, tensor_core_id=0):
        tensor_core = self.get_tensor_core(tensor_core_id)
        return tensor_core.matmul(A, B)


class Core:
    def __init__(self, core_id, sm_id, db: GPUStateDB):
        self.core_id = core_id
        self.sm_id = sm_id
        self.db = db

    def load_state(self):
        state = self.db.load_state("core", "core_id", self.core_id)
        return state or {}

    def save_state(self, state):
        self.db.save_state("core", "core_id", self.core_id, state)

    def step(self, a, b, cin, opcode, reg_sel):
        state = self.load_state()
        # Simulate a simple operation
        state["last_result"] = (a[0] + b[0] + cin) if opcode == 0b10 else 0.0
        self.save_state(state)
        return state["last_result"]


class WarpDB:
    def __init__(self, warp_id, sm_id, db: GPUStateDB, threads_per_warp=700):
        self.warp_id = warp_id
        self.sm_id = sm_id
        self.db = db
        self.threads_per_warp = threads_per_warp

    def load_state(self):
        state = self.db.load_state("warp", "warp_id", self.warp_id)
        return state or {}

    def save_state(self, state):
        self.db.save_state("warp", "warp_id", self.warp_id, state)

    def get_thread(self, thread_id):
        return ThreadDB(thread_id, self.warp_id, self.db)

    def run(self, a, b, cin, opcode, reg_sel):
        # For demo, run only first thread
        thread = self.get_thread(0)
        result = thread.run(a, b, cin, opcode, reg_sel)
        return [result]
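
# Hedged sketch: the real GPUStateDB comes from gpu_state_db. The class below is a
# hypothetical in-memory stand-in that mirrors only the load_state/save_state/close
# calls used by the DB-backed classes above, e.g. for unit tests without a .db file.
class InMemoryStateDB:
    def __init__(self):
        self._tables = {}  # {(table, key_column, key_value): state dict}

    def load_state(self, table, key_column, key_value):
        # Return the stored state dict, or None if this entity has no saved state yet
        return self._tables.get((table, key_column, key_value))

    def save_state(self, table, key_column, key_value, state):
        # Overwrite the stored state for this entity
        self._tables[(table, key_column, key_value)] = dict(state)

    def close(self):
        self._tables.clear()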
class ThreadDB:
    def __init__(self, thread_id, warp_id, db: GPUStateDB):
        self.thread_id = thread_id
        self.warp_id = warp_id
        self.db = db

    def load_state(self):
        state = self.db.load_state("thread", "thread_id", self.thread_id)
        return state or {}

    def save_state(self, state):
        self.db.save_state("thread", "thread_id", self.thread_id, state)

    def run(self, a, b, cin, opcode, reg_sel):
        state = self.load_state()
        # Simulate a simple operation
        state["result"] = (a[0] + b[0] + cin) if opcode == 0b10 else 0.0
        self.save_state(state)
        return state["result"]

    # The helpers below depend on `scheduler`, `tensor_cores`, and `register_file`
    # attributes that ThreadDB does not define; they appear to be leftovers from the
    # earlier in-memory StreamingMultiprocessor and are kept unchanged.
    def attach_global_mem(self, global_mem):
        self.global_mem = global_mem

    def run_next_warp(self, a, b, cin, opcode, reg_sel):
        warp = self.scheduler.schedule()
        if warp:
            return warp.run(a, b, cin, opcode, reg_sel)
        return None

    def tensor_core_matmul(self, A, B):
        return self.tensor_cores.matmul(A, B)

    def tensor_core_matmul_from_memory(self, srcA, addrA, srcB, addrB, shapeA, shapeB):
        return self.tensor_cores.matmul_from_memory(srcA, addrA, srcB, addrB, shapeA, shapeB)

    def read_register_matrix(self, addr, n, m):
        # Simulate reading an n x m matrix from registers
        # For simplicity, treat addr as row offset
        return [
            [self.register_file[(addr + i) % len(self.register_file)][j % len(self.register_file[0])]
             for j in range(m)]
            for i in range(n)
        ]


class GPUMemoryHierarchy:
    def __init__(self, num_sms, global_mem_size_bytes, chip_id, db: GPUStateDB):
        self.global_mem = GlobalMemory(global_mem_size_bytes)
        self.sm_ids = list(range(num_sms))
        self.chip_id = chip_id
        self.db = db
        self.num_sms = num_sms

    def add_sm(self, sm):
        sm.attach_global_mem(self.global_mem)

    def read_global(self, addr):
        return self.global_mem.read(addr)

    def write_global(self, addr, value):
        self.global_mem.write(addr, value)


class Chip:
    def __init__(self, chip_id, num_sms=1500, vram_size_gb=16, db_path="gpu_state.db", storage=None):
        self.chip_id = chip_id
        self.db = GPUStateDB(db_path)
        # Handle unlimited VRAM case (when vram_size_gb is None)
        global_mem_size_bytes = None if vram_size_gb is None else vram_size_gb * 1024 * 1024 * 1024
        self.gpu_mem = GPUMemoryHierarchy(num_sms=num_sms, global_mem_size_bytes=global_mem_size_bytes,
                                          chip_id=chip_id, db=self.db)
        self.sm_ids = list(range(num_sms))
        self.connected_chips = []
        self.storage = storage  # Store shared WebSocket storage
        self.ai_accelerator = AIAccelerator(storage=storage)  # Pass shared storage to accelerator
        self.custom_vram = CustomVRAM(self.gpu_mem.global_mem)  # Create CustomVRAM instance
        self.ai_accelerator.set_vram(self.custom_vram)  # Set VRAM for AIAccelerator

    def get_sm(self, sm_id):
        return StreamingMultiprocessor(sm_id, self.chip_id, self.db)

    def connect_chip(self, other_chip, interconnect):
        self.connected_chips.append((other_chip, interconnect))

    def send_data(self, other_chip, interconnect, data_size_bytes):
        # Minimal implementation so the demo below runs: no payload is actually moved,
        # only the modelled transfer time (in seconds) over the interconnect is returned.
        return interconnect.transfer_time(data_size_bytes)

    def close(self):
        if hasattr(self, "db") and self.db:
            self.db.close()
        if hasattr(self, "gpu_mem") and hasattr(self.gpu_mem, "global_mem") and hasattr(self.gpu_mem.global_mem, "ram"):
            self.gpu_mem.global_mem.ram.close()
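
# Illustrative sketch (hypothetical helper, not part of the Chip API): Chip.get_sm
# returns an SM without global memory attached; GPUMemoryHierarchy.add_sm performs
# the attachment. A caller that needs global memory access on an SM could combine them:
def _example_sm_with_global_mem(chip, sm_id=0):
    sm = chip.get_sm(sm_id)
    chip.gpu_mem.add_sm(sm)  # attaches chip.gpu_mem.global_mem to the SM
    return sm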

if __name__ == "__main__":
    print("\n--- Multi-Chip GPU Simulation (DB-backed) ---")
    num_chips = 10
    vram_size_gb = 16
    chips = [
        Chip(
            chip_id=i,
            num_sms=100,
            vram_size_gb=vram_size_gb,
            db_path=f"gpu_state_chip_{i}.db"
        )
        for i in range(num_chips)
    ]
    print(f"Total chips: {len(chips)}")

    # Connect the chips in a ring over a shared optical link
    optical_link = OpticalInterconnect(bandwidth_tbps=800, latency_ns=1)
    for i in range(num_chips):
        chips[i].connect_chip(chips[(i + 1) % num_chips], optical_link)

    for chip in chips:
        sm = chip.get_sm(0)
        results = sm.run_next_warp([0.7, 0.0], [0.7, 0.7], 0.0, 0b10, 0)
        print(f"Chip {chip.chip_id} SM 0 first thread result: {results[0] if results else None}")
        # Example tensor core usage: matrix multiply on SM 0, tensor core 0
        A = [[1.0, 2.0], [3.0, 4.0]]
        B = [[5.0, 6.0], [7.0, 8.0]]
        tc_result = sm.tensor_core_matmul(A, B, tensor_core_id=0)
        print(f"Chip {chip.chip_id} SM 0 tensor core 0 matmul result: {tc_result}")

    print(f"Total SMs in first chip: {len(chips[0].sm_ids)}")
    print(f"Global memory size in first chip: {chips[0].gpu_mem.global_mem.size_bytes} bytes (backed by .db)")

    # Simulate sending 10 GiB from chip 0 to chip 1 over the optical link
    transfer_s = chips[0].send_data(chips[1], optical_link, 1024 * 1024 * 1024 * 10)
    print(f"Simulated 10 GiB chip 0 -> chip 1 transfer time: {transfer_s * 1e3:.3f} ms")

    # Release DB handles for all chips
    for chip in chips:
        chip.close()