from typing import Dict, Any, Optional import numpy as np class MemoryBlock: """Base class for GPU memory blocks""" def __init__(self, size_bytes: int): self.size = size_bytes self.data = bytearray(size_bytes) self.offset = 0 def allocate(self, size_bytes: int) -> Optional[int]: """Allocate memory and return offset""" if self.offset + size_bytes > self.size: return None current_offset = self.offset self.offset += size_bytes return current_offset def write(self, offset: int, data: bytes): """Write data at specified offset""" if offset + len(data) > self.size: raise ValueError("Write operation exceeds memory block size") self.data[offset:offset + len(data)] = data def read(self, offset: int, size: int) -> bytes: """Read data from specified offset""" if offset + size > self.size: raise ValueError("Read operation exceeds memory block size") return bytes(self.data[offset:offset + size]) class SharedMemory(MemoryBlock): """Represents shared memory accessible by all threads in a block""" def __init__(self, size_bytes: int = 48*1024): # Default 48KB super().__init__(size_bytes) self.locks: Dict[int, bool] = {} # For synchronization def atomic_add(self, offset: int, value: int) -> int: """Perform atomic addition""" current = int.from_bytes(self.read(offset, 4), 'little') new_value = current + value self.write(offset, new_value.to_bytes(4, 'little')) return current class L1Cache(MemoryBlock): """Represents L1 cache memory""" def __init__(self, size_bytes: int = 32*1024): # Default 32KB super().__init__(size_bytes) self.cache_lines: Dict[int, bytes] = {} self.line_size = 128 # 128 bytes per cache line def load_line(self, address: int) -> bytes: """Load a cache line""" line_address = address - (address % self.line_size) if line_address not in self.cache_lines: # Simulate fetching from L2 self.cache_lines[line_address] = bytes(self.line_size) return self.cache_lines[line_address] class L2Cache(MemoryBlock): """Represents L2 cache memory""" def __init__(self, size_bytes: int = 1024*1024): # Default 1MB super().__init__(size_bytes) self.cache_lines: Dict[int, bytes] = {} self.line_size = 256 # 256 bytes per cache line def load_line(self, address: int) -> bytes: """Load a cache line""" line_address = address - (address % self.line_size) if line_address not in self.cache_lines: # Simulate fetching from global memory self.cache_lines[line_address] = bytes(self.line_size) return self.cache_lines[line_address] class RegisterFile: """Represents per-thread registers""" def __init__(self, num_registers: int = 255): # Maximum registers per thread self.registers = [0] * num_registers self.used_registers = 0 def allocate(self, num: int = 1) -> Optional[int]: """Allocate registers and return starting index""" if self.used_registers + num > len(self.registers): return None start = self.used_registers self.used_registers += num return start def read(self, index: int) -> int: """Read from register""" if 0 <= index < self.used_registers: return self.registers[index] raise IndexError("Register index out of bounds") def write(self, index: int, value: int): """Write to register""" if 0 <= index < self.used_registers: self.registers[index] = value else: raise IndexError("Register index out of bounds")