Factor Studios committed on
Commit 2ff82ee · verified · 1 Parent(s): 1e9fb4b

Upload 27 files
ai.py ADDED
@@ -0,0 +1,419 @@
import numpy as np
import time
from typing import Dict, Any, Optional, Tuple, Union, List
from enum import Enum


class VectorOperation(Enum):
    """Enumeration of supported vector operations."""
    ADD = "add"
    SUBTRACT = "subtract"
    MULTIPLY = "multiply"
    DIVIDE = "divide"
    DOT_PRODUCT = "dot_product"
    CROSS_PRODUCT = "cross_product"
    NORMALIZE = "normalize"
    MAGNITUDE = "magnitude"


class AIAccelerator:
    """
    AI Accelerator that simulates GPU-based AI computations.

    This class leverages NumPy's optimized operations to simulate the parallel
    processing capabilities of the vGPU for AI workloads.
    """

    def __init__(self, vram=None, num_sms: int = 800, cores_per_sm: int = 222):
        self.vram = vram
        self.num_sms = num_sms
        self.cores_per_sm = cores_per_sm
        self.total_cores = num_sms * cores_per_sm

        # AI operation statistics
        self.operations_performed = 0
        self.total_compute_time = 0.0
        self.flops_performed = 0  # Floating point operations

        # Matrix registry for storing matrices in VRAM
        self.matrix_registry: Dict[str, str] = {}  # matrix_id -> vram_address
        self.matrix_counter = 0

        # Model/tokenizer registry for full isolation
        self.model_registry: Dict[str, Any] = {}
        self.tokenizer_registry: Dict[str, Any] = {}
        self.model_loaded = False

    def set_vram(self, vram):
        """Set the VRAM reference."""
        self.vram = vram

    def allocate_matrix(self, shape: Tuple[int, ...], dtype=np.float32,
                        name: Optional[str] = None) -> str:
        """Allocate a matrix in VRAM and return its ID."""
        if not self.vram:
            raise RuntimeError("VRAM not available")

        if name is None:
            name = f"matrix_{self.matrix_counter}"
            self.matrix_counter += 1

        # Create matrix data
        matrix_data = np.zeros(shape, dtype=dtype)

        # Store in VRAM as a texture (reusing texture storage mechanism)
        matrix_id = self.vram.load_texture(matrix_data, name)
        self.matrix_registry[name] = matrix_id

        return name

    def load_matrix(self, matrix_data: np.ndarray, name: Optional[str] = None) -> str:
        """Load matrix data into VRAM and return its ID."""
        if not self.vram:
            raise RuntimeError("VRAM not available")

        if name is None:
            name = f"matrix_{self.matrix_counter}"
            self.matrix_counter += 1

        # Store in VRAM
        matrix_id = self.vram.load_texture(matrix_data, name)
        self.matrix_registry[name] = matrix_id

        return name

    def get_matrix(self, matrix_id: str) -> Optional[np.ndarray]:
        """Retrieve matrix data from VRAM."""
        if not self.vram or matrix_id not in self.matrix_registry:
            return None

        vram_id = self.matrix_registry[matrix_id]
        return self.vram.get_texture(vram_id)

    def matrix_multiply(self, matrix_a_id: str, matrix_b_id: str,
                        result_id: Optional[str] = None) -> Optional[str]:
        """Perform matrix multiplication using simulated GPU parallelism."""
        start_time = time.time()

        # Retrieve matrices from VRAM
        matrix_a = self.get_matrix(matrix_a_id)
        matrix_b = self.get_matrix(matrix_b_id)

        if matrix_a is None or matrix_b is None:
            print(f"Error: Could not retrieve matrices {matrix_a_id} or {matrix_b_id}")
            return None

        try:
            # Check if matrices can be multiplied
            if matrix_a.shape[-1] != matrix_b.shape[0]:
                print(f"Error: Matrix dimensions incompatible for multiplication: "
                      f"{matrix_a.shape} x {matrix_b.shape}")
                return None

            # Simulate parallel processing by breaking down the operation
            # In a real GPU, this would be distributed across SMs and cores
            result = self._simulate_parallel_matmul(matrix_a, matrix_b)

            # Store result in VRAM
            if result_id is None:
                result_id = f"result_{self.matrix_counter}"
                self.matrix_counter += 1

            result_matrix_id = self.load_matrix(result, result_id)

            # Update statistics
            compute_time = time.time() - start_time
            self.total_compute_time += compute_time
            self.operations_performed += 1

            # Calculate FLOPs (2 * M * N * K for matrix multiplication)
            m, k = matrix_a.shape
            k2, n = matrix_b.shape
            flops = 2 * m * n * k
            self.flops_performed += flops

            print(f"Matrix multiplication completed: {matrix_a.shape} x {matrix_b.shape} "
                  f"= {result.shape} in {compute_time:.4f}s")
            print(f"Simulated {flops:,} FLOPs across {self.total_cores} cores")

            return result_matrix_id

        except Exception as e:
            print(f"Error in matrix multiplication: {e}")
            return None

    def _simulate_parallel_matmul(self, matrix_a: np.ndarray, matrix_b: np.ndarray) -> np.ndarray:
        """Simulate parallel matrix multiplication across SMs."""
        # Use NumPy's optimized matrix multiplication
        # In a real implementation, this would be broken down into blocks
        # and distributed across the simulated SMs

        # For demonstration, we can show how the work would be distributed
        m, k = matrix_a.shape
        k2, n = matrix_b.shape

        # Calculate work distribution
        total_output_elements = m * n
        elements_per_sm = max(1, total_output_elements // self.num_sms)

        print(f"Distributing {total_output_elements:,} output elements across "
              f"{self.num_sms} SMs ({elements_per_sm} elements per SM)")

        # Perform the actual computation using NumPy
        result = np.dot(matrix_a, matrix_b)

        return result

    def vector_operation(self, operation: VectorOperation, vector_a_id: str,
                         vector_b_id: Optional[str] = None,
                         result_id: Optional[str] = None) -> Optional[str]:
        """Perform vector operations using simulated GPU parallelism."""
        start_time = time.time()

        # Retrieve vectors from VRAM
        vector_a = self.get_matrix(vector_a_id)
        if vector_a is None:
            print(f"Error: Could not retrieve vector {vector_a_id}")
            return None

        vector_b = None
        if vector_b_id:
            vector_b = self.get_matrix(vector_b_id)
            if vector_b is None:
                print(f"Error: Could not retrieve vector {vector_b_id}")
                return None

        try:
            result = None
            flops = 0

            if operation == VectorOperation.ADD:
                if vector_b is None:
                    raise ValueError("Vector B required for addition")
                result = vector_a + vector_b
                flops = vector_a.size

            elif operation == VectorOperation.SUBTRACT:
                if vector_b is None:
                    raise ValueError("Vector B required for subtraction")
                result = vector_a - vector_b
                flops = vector_a.size

            elif operation == VectorOperation.MULTIPLY:
                if vector_b is None:
                    raise ValueError("Vector B required for multiplication")
                result = vector_a * vector_b
                flops = vector_a.size

            elif operation == VectorOperation.DIVIDE:
                if vector_b is None:
                    raise ValueError("Vector B required for division")
                result = vector_a / vector_b
                flops = vector_a.size

            elif operation == VectorOperation.DOT_PRODUCT:
                if vector_b is None:
                    raise ValueError("Vector B required for dot product")
                result = np.dot(vector_a.flatten(), vector_b.flatten())
                flops = 2 * vector_a.size

            elif operation == VectorOperation.CROSS_PRODUCT:
                if vector_b is None:
                    raise ValueError("Vector B required for cross product")
                result = np.cross(vector_a, vector_b)
                flops = 6  # Approximate for 3D cross product

            elif operation == VectorOperation.NORMALIZE:
                magnitude = np.linalg.norm(vector_a)
                result = vector_a / magnitude if magnitude > 0 else vector_a
                flops = vector_a.size * 2  # Division + magnitude calculation

            elif operation == VectorOperation.MAGNITUDE:
                result = np.array([np.linalg.norm(vector_a)])
                flops = vector_a.size * 2  # Squares and sum

            else:
                raise ValueError(f"Unsupported vector operation: {operation}")

            # Store result in VRAM
            if result_id is None:
                result_id = f"vector_result_{self.matrix_counter}"
                self.matrix_counter += 1

            result_vector_id = self.load_matrix(result, result_id)

            # Update statistics
            compute_time = time.time() - start_time
            self.total_compute_time += compute_time
            self.operations_performed += 1
            self.flops_performed += flops

            print(f"Vector operation {operation.value} completed in {compute_time:.4f}s")

            return result_vector_id

        except Exception as e:
            print(f"Error in vector operation {operation.value}: {e}")
            return None

    def convolution_2d(self, input_id: str, kernel_id: str,
                       stride: int = 1, padding: int = 0,
                       result_id: Optional[str] = None) -> Optional[str]:
        """Perform 2D convolution operation."""
        start_time = time.time()

        # Retrieve input and kernel from VRAM
        input_data = self.get_matrix(input_id)
        kernel = self.get_matrix(kernel_id)

        if input_data is None or kernel is None:
            print("Error: Could not retrieve input or kernel")
            return None

        try:
            # Simple 2D convolution implementation
            # In a real GPU implementation, this would be highly optimized
            # and distributed across many cores

            if len(input_data.shape) == 2:
                input_h, input_w = input_data.shape
                channels = 1
            else:
                input_h, input_w, channels = input_data.shape

            kernel_h, kernel_w = kernel.shape[:2]

            # Calculate output dimensions
            output_h = (input_h + 2 * padding - kernel_h) // stride + 1
            output_w = (input_w + 2 * padding - kernel_w) // stride + 1

            # Initialize output
            if channels == 1:
                output = np.zeros((output_h, output_w))
            else:
                output = np.zeros((output_h, output_w, channels))

            # Pad input if necessary
            if padding > 0:
                if channels == 1:
                    padded_input = np.pad(input_data, padding, mode='constant')
                else:
                    padded_input = np.pad(input_data,
                                          ((padding, padding), (padding, padding), (0, 0)),
                                          mode='constant')
            else:
                padded_input = input_data

            # Perform convolution
            flops = 0
            for y in range(0, output_h):
                for x in range(0, output_w):
                    y_start = y * stride
                    x_start = x * stride

                    if channels == 1:
                        patch = padded_input[y_start:y_start+kernel_h, x_start:x_start+kernel_w]
                        output[y, x] = np.sum(patch * kernel)
                        flops += kernel_h * kernel_w * 2  # Multiply and add
                    else:
                        for c in range(channels):
                            patch = padded_input[y_start:y_start+kernel_h,
                                                 x_start:x_start+kernel_w, c]
                            output[y, x, c] = np.sum(patch * kernel)
                            flops += kernel_h * kernel_w * 2

            # Store result in VRAM
            if result_id is None:
                result_id = f"conv_result_{self.matrix_counter}"
                self.matrix_counter += 1

            result_conv_id = self.load_matrix(output, result_id)

            # Update statistics
            compute_time = time.time() - start_time
            self.total_compute_time += compute_time
            self.operations_performed += 1
            self.flops_performed += flops

            print(f"2D Convolution completed: {input_data.shape} * {kernel.shape} "
                  f"= {output.shape} in {compute_time:.4f}s")
            print(f"Simulated {flops:,} FLOPs")

            return result_conv_id

        except Exception as e:
            print(f"Error in 2D convolution: {e}")
            return None

    def get_stats(self) -> Dict[str, Any]:
        """Get AI accelerator statistics."""
        avg_compute_time = self.total_compute_time / max(1, self.operations_performed)
        flops_per_second = self.flops_performed / max(0.001, self.total_compute_time)

        return {
            "operations_performed": self.operations_performed,
            "total_compute_time": self.total_compute_time,
            "avg_compute_time": avg_compute_time,
            "flops_performed": self.flops_performed,
            "flops_per_second": flops_per_second,
            "matrices_in_memory": len(self.matrix_registry),
            "simulated_cores": self.total_cores,
            "simulated_sms": self.num_sms
        }

    def reset_stats(self) -> None:
        """Reset AI accelerator statistics."""
        self.operations_performed = 0
        self.total_compute_time = 0.0
        self.flops_performed = 0

    def load_model(self, model_id: str, model: Any, processor: Any):
        """Loads a model and its processor into the accelerator's registry."""
        self.model_registry[model_id] = model
        self.tokenizer_registry[model_id] = processor
        self.model_loaded = True
        print(f"Model '{model_id}' loaded into AIAccelerator.")

    def has_model(self, model_id: str) -> bool:
        """Checks if a model is loaded in the accelerator's registry."""
        return model_id in self.model_registry

    def inference(self, model_id, input_text, idx=None):
        print(f"[DEBUG] AIAccelerator.inference called for model_id={model_id}, idx={idx}")
        if not self.has_model(model_id):
            print(f"[ERROR] Model {model_id} not loaded in AIAccelerator.")
            return None
        model = self.model_registry[model_id]
        processor = self.tokenizer_registry[model_id]
        try:
            # Check if this is a dummy model for testing
            if hasattr(model, '__class__') and 'Dummy' in model.__class__.__name__:
                # Handle dummy model for testing
                return processor.decode([1, 2, 3, 4, 5], skip_special_tokens=True)

            # Try to import torch and transformers for real models
            import torch
            from transformers import BlipForConditionalGeneration, BlipProcessor

            # BLIP vision model branch
            if isinstance(model, BlipForConditionalGeneration) and isinstance(processor, BlipProcessor):
                # input_text is actually the image/frame (numpy array)
                image = input_text
                prompt = "Describe this image."
                # Accept numpy.ndarray, PIL.Image, or torch.Tensor
                if not (hasattr(image, 'shape') or hasattr(image, 'size')):
                    raise ValueError(f"Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray, but got {type(image)}.")
                inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
                with torch.no_grad():
                    out = model.generate(**inputs, max_new_tokens=64)
                caption = processor.decode(out[0], skip_special_tokens=True)
                print(f"[DEBUG] BLIP inference result for idx={idx}: {caption}")
                return caption
            else:
                print(f"[ERROR] Unsupported model type for inference: {type(model)}")
                return None
        except Exception as e:
            print(f"[ERROR] AIAccelerator.inference failed for idx={idx}: {e}")
            return None
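Editor's sketch (not part of the commit): AIAccelerator only calls load_texture/get_texture on its VRAM object, so a dict-backed stand-in is enough to run the matmul path standalone. The DictVRAM class here is hypothetical, not a file in this upload.

# Editor's sketch: exercise AIAccelerator.matrix_multiply without CustomVRAM.
import numpy as np
from ai import AIAccelerator

class DictVRAM:
    """In-memory stand-in for CustomVRAM: name -> array."""
    def __init__(self):
        self.textures = {}
    def load_texture(self, data, name=None):
        self.textures[name] = np.array(data)
        return name
    def get_texture(self, name):
        return self.textures.get(name)

acc = AIAccelerator(vram=DictVRAM())
a_id = acc.load_matrix(np.ones((4, 3), dtype=np.float32), "A")
b_id = acc.load_matrix(np.ones((3, 2), dtype=np.float32), "B")
c_id = acc.matrix_multiply(a_id, b_id)        # prints work-distribution info
print(acc.get_matrix(c_id))                   # 4x2 matrix of 3.0
print(acc.get_stats()["flops_performed"])     # 2*4*2*3 = 48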
core.py ADDED
@@ -0,0 +1,54 @@
"""
Physics-inspired digital core model for virtual GPU v2.
Contains AdvancedCore class and example usage.
"""

from logic_gates import ControlUnit, ALU2Bit, RegisterFile2x2, SimpleMMU

class AdvancedCore:
    """
    Simulates a physics-inspired digital core with:
    - Control unit
    - ALU
    - Register file
    - MMU
    - Clocking and timing at the voltage/physics level
    """
    def __init__(self, bits=2, num_registers=2):
        self.control = ControlUnit()
        self.alu = ALU2Bit()
        self.regfile = RegisterFile2x2()
        self.mmu = SimpleMMU(num_registers=num_registers, bits=bits)
        self.clk = 0.7  # High voltage for clock
        self.bits = bits

    def step(self, a, b, cin, opcode, reg_sel):
        # Set control signals
        self.control.set_opcode(opcode)
        ctrl = self.control.get_control_signals()
        # ALU operation
        (r0, r1), cout = self.alu.operate(a[0], a[1], b[0], b[1], cin, ctrl['alu_op'])
        # Write to register file
        self.regfile.write(r0, r1, self.clk, reg_sel)
        # MMU write (simulate memory-mapped register)
        self.mmu.write(reg_sel, [r0, r1], self.clk)
        # Read back
        reg_out = self.regfile.read(reg_sel)
        mmu_out = self.mmu.read(reg_sel)
        return {
            'alu_result': (r0, r1),
            'carry_out': cout,
            'regfile_out': reg_out,
            'mmu_out': mmu_out,
            'control': ctrl
        }

if __name__ == "__main__":
    print("\n--- Advanced Core Simulation ---")
    core = AdvancedCore(bits=2, num_registers=2)
    # Simulate an ADD operation between (1,0) and (1,1), store in reg0
    result = core.step([0.7, 0.0], [0.7, 0.7], 0.0, 0b10, 0)
    print("Core step (ADD):", result)
    # Simulate an OR operation between (1,0) and (1,1), store in reg1
    result = core.step([0.7, 0.0], [0.7, 0.7], 0.0, 0b01, 1)
    print("Core step (OR):", result)
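Editor's sketch: step() returns analog voltages, so a small decoder (thresholding at VTH from logic_gates, and assuming index 0 is the LSB, as in ALU2Bit's ripple carry) makes the results readable as integers.

# Editor's sketch: decode LSB-first voltage tuples into integers via thresholding.
from core import AdvancedCore
from logic_gates import VTH

def decode_bits(voltages):
    """LSB-first voltage sequence -> integer, thresholding at VTH (0.35 V)."""
    return sum((1 << i) for i, v in enumerate(voltages) if v > VTH)

core = AdvancedCore(bits=2, num_registers=2)
out = core.step([0.7, 0.0], [0.7, 0.7], 0.0, 0b10, 0)  # ADD: 1 + 3
print("ALU result:", decode_bits(out['alu_result']),
      "carry:", 1 if out['carry_out'] > VTH else 0)     # 1 + 3 = 4 -> result 0, carry 1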
custom_vram.py ADDED
@@ -0,0 +1,69 @@
import numpy as np

class CustomVRAM:
    def __init__(self, global_mem):
        self.global_mem = global_mem
        self.texture_registry = {}
        self.texture_counter = 0

    def load_texture(self, data: np.ndarray, name: str = None) -> str:
        if name is None:
            name = f"texture_{self.texture_counter}"
            self.texture_counter += 1

        # Serialize numpy array to bytes
        data_bytes = data.tobytes()
        data_shape = data.shape
        data_dtype = str(data.dtype)

        # Store metadata and data in global memory.
        # For simplicity, we store everything contiguously for now; a real
        # system would involve more sophisticated memory management.
        address = self.global_mem.allocate_space(len(data_bytes) + 100)  # +100 for metadata

        # Store shape, dtype, and then data.
        # This is a very basic serialization. For production, consider more robust methods.
        metadata = f"{data_shape};{data_dtype};{len(data_bytes)}".encode("utf-8")
        self.global_mem.write(address, list(metadata))
        self.global_mem.write(address + len(metadata), list(data_bytes))

        self.texture_registry[name] = {
            "address": address,
            "size": len(data_bytes),
            "shape": data_shape,
            "dtype": data_dtype,
            "metadata_size": len(metadata)
        }
        return name

    def get_texture(self, name: str) -> np.ndarray:
        if name not in self.texture_registry:
            return None

        texture_info = self.texture_registry[name]
        address = texture_info["address"]
        size = texture_info["size"]
        shape = texture_info["shape"]
        dtype = texture_info["dtype"]
        metadata_size = texture_info["metadata_size"]

        # Read data from global memory
        data_bytes = bytes(self.global_mem.read(address + metadata_size, size))

        # Deserialize bytes to numpy array
        return np.frombuffer(data_bytes, dtype=dtype).reshape(shape)

    def has_texture(self, name: str) -> bool:
        return name in self.texture_registry

    def delete_texture(self, name: str):
        if name in self.texture_registry:
            # In a real system, you'd deallocate the memory.
            # For this simulation, we just remove the entry.
            del self.texture_registry[name]
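Editor's sketch of the round trip: CustomVRAM only calls allocate_space/write/read on its backing store, so a tiny bytearray stand-in (the ByteArrayMemory class below is hypothetical, not part of the commit) is enough to check serialize -> store -> deserialize.

# Editor's sketch: minimal backing store exposing the three methods CustomVRAM uses.
import numpy as np
from custom_vram import CustomVRAM

class ByteArrayMemory:
    def __init__(self, size):
        self.buf = bytearray(size)
        self.next_free = 0
    def allocate_space(self, n):
        addr = self.next_free
        self.next_free += n
        return addr
    def write(self, addr, values):          # values: list of byte ints
        self.buf[addr:addr + len(values)] = bytes(values)
    def read(self, addr, length=1):
        return list(self.buf[addr:addr + length])

vram = CustomVRAM(ByteArrayMemory(1 << 20))
original = np.arange(12, dtype=np.float32).reshape(3, 4)
vram.load_texture(original, "t0")
restored = vram.get_texture("t0")
assert np.array_equal(original, restored)   # serialize -> global mem -> deserialize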
electron_speed.py ADDED
@@ -0,0 +1,68 @@
"""
Calculate electron drift speed and relate it to transistor switching (tick) rate for a modern GPU.
Assume: We want to simulate 900 quintillion (9e20) transistor switches per second (B200 scale).
"""

# Physical constants
ELEM_CHARGE = 1.602e-19  # Coulombs
ELECTRON_MASS = 9.109e-31  # kg
VACUUM_PERMITTIVITY = 8.854e-12  # F/m
SILICON_MOBILITY = 0.14  # m^2/(V·s) (typical for electrons in Si at room temp)

# Example parameters (can be tuned for realism)
VOLTAGE = 0.7  # V (typical for advanced nodes)
CHANNEL_LENGTH = 5e-9  # 5 nm process
ELECTRIC_FIELD = VOLTAGE / CHANNEL_LENGTH  # V/m

# Calculate drift velocity (v = μE)
drift_velocity = SILICON_MOBILITY * ELECTRIC_FIELD  # m/s

# Calculate time for electron to cross channel (t = L / v)
transit_time = CHANNEL_LENGTH / drift_velocity  # seconds

# Calculate max theoretical switching frequency (f = 1 / t)
max_switch_freq = 1 / transit_time  # Hz

# For 900 quintillion switches/sec, but with 600 billion transistors
TARGET_SWITCHES_PER_SEC = 9e20
TRANSISTORS_ON_CHIP = 6e11  # 600 billion
transistors_needed = TARGET_SWITCHES_PER_SEC / max_switch_freq
required_switch_freq_per_transistor = TARGET_SWITCHES_PER_SEC / TRANSISTORS_ON_CHIP

# Speed of light in silicon (c reduced by the refractive index, ~0.29c)
SPEED_OF_LIGHT_VACUUM = 3e8  # m/s
SILICON_REFRACTIVE_INDEX = 3.5
speed_of_light_silicon = SPEED_OF_LIGHT_VACUUM / SILICON_REFRACTIVE_INDEX

# Flip-flop timing derived from the transit time above
# (module-level so other modules can import it without triggering the prints)
GATE_DELAY = transit_time  # seconds, one gate delay
FF_GATE_COUNT = 4  # typical gate count for a basic flip-flop
flip_flop_delay = FF_GATE_COUNT * GATE_DELAY
flip_flop_max_freq = 1 / flip_flop_delay

if __name__ == "__main__":
    print(f"Electron drift velocity: {drift_velocity:.2e} m/s")
    print(f"Channel transit time: {transit_time:.2e} s")
    print(f"Max transistor switching frequency: {max_switch_freq:.2e} Hz")
    print(f"To achieve {TARGET_SWITCHES_PER_SEC:.1e} switches/sec:")
    print(f"- You'd need {transistors_needed:.2e} transistors switching at max speed in parallel.")
    print(f"- For a chip with 600B transistors, each must switch at {required_switch_freq_per_transistor:.2e} Hz.")
    print(f"- Electron drift speed: {drift_velocity:.2e} m/s vs. speed of light in silicon: {speed_of_light_silicon:.2e} m/s")
    print(f"- Electron drift is ~{(drift_velocity/speed_of_light_silicon)*100:.2f}% the speed of light in silicon (photon speed).")

    # --- SR, D, JK, T Flip-Flop Physics/Timing Summary ---
    print("\n--- Flip-Flop Types and Switching Physics ---")
    print("SR Flip-Flop: Set-Reset, basic memory, built from NAND/NOR gates.")
    print("D Flip-Flop: Data/Delay, synchronizes input to clock, used in registers.")
    print("JK Flip-Flop: Universal, toggles or sets/resets based on inputs.")
    print("T Flip-Flop: Toggle, divides clock, used in counters.")
    print("All flip-flops are built from logic gates, so their switching speed is limited by the gate delay (set by electron drift and channel length).\n")

    print(f"Estimated flip-flop delay: {flip_flop_delay:.2e} s (for {FF_GATE_COUNT} gates)")
    print(f"Max flip-flop switching frequency: {flip_flop_max_freq:.2e} Hz")
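Editor's note, checking the arithmetic above: E = 0.7 V / 5e-9 m = 1.4e8 V/m, so the drift velocity is v = μE = 0.14 × 1.4e8 ≈ 1.96e7 m/s, the transit time is t = 5e-9 / 1.96e7 ≈ 2.6e-16 s, and max_switch_freq ≈ 3.9e15 Hz. One caveat: the linear v = μE model ignores velocity saturation, which caps electron drift in silicon around 1e5 m/s at fields this high, so the printed frequencies are best read as upper bounds for the simulation's bookkeeping rather than physically attainable switching rates.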
flip_flops.py ADDED
@@ -0,0 +1,91 @@
"""
Hyperrealistic voltage-based flip-flops: SR, D, JK, and T.
Each flip-flop is built from voltage-based logic gates and simulates real-world behavior.
"""
from logic_gates import NANDGate, ANDGate, ORGate, NOTGate, VDD, VSS, VTH, GATE_DELAY
import time

class SRFlipFlop:
    """Set-Reset flip-flop using cross-coupled NAND gates.

    NAND-latch inputs are active-low: drive S to VSS to set, R to VSS to
    reset, and hold with both at VDD. update() evaluates the feedback loop
    once per call, so some transitions need a second update() with the same
    inputs to settle, mirroring real propagation delay through the latch.
    """
    def __init__(self):
        self.nand1 = NANDGate()
        self.nand2 = NANDGate()
        self.q = VSS
        self.q_bar = VDD

    def update(self, s, r):
        # s, r are voltages
        # Cross-coupled NANDs
        q_new = self.nand1.output(s, self.q_bar)
        q_bar_new = self.nand2.output(r, q_new)
        self.q = q_new
        self.q_bar = q_bar_new
        return self.q, self.q_bar

class DFlipFlop:
    """D (Data) flip-flop using SR flip-flop and NOT gate."""
    def __init__(self):
        self.sr = SRFlipFlop()
        self.notg = NOTGate()

    def update(self, d, clk):
        # d, clk are voltages; the gating NANDs produce the active-low
        # S/R signals that the NAND latch above expects
        s = self.nand(d, clk)
        r = self.nand(self.notg.output(d), clk)
        return self.sr.update(s, r)

    def nand(self, a, b):
        return NANDGate().output(a, b)

class JKFlipFlop:
    """JK flip-flop using NAND gates.

    The 3-input NANDs of the textbook circuit are composed here as
    NAND(AND(x, clk), feedback), since NANDGate.output takes two inputs.
    As with the SR latch, one update() is a single propagation step.
    """
    def __init__(self):
        self.q = VSS
        self.q_bar = VDD
        self.and1 = ANDGate()
        self.and2 = ANDGate()
        self.nand1 = NANDGate()
        self.nand2 = NANDGate()
        self.nand3 = NANDGate()
        self.nand4 = NANDGate()

    def update(self, j, k, clk):
        # j, k, clk are voltages
        j_in = self.nand1.output(self.and1.output(j, clk), self.q_bar)
        k_in = self.nand2.output(self.and2.output(k, clk), self.q)
        q_new = self.nand3.output(j_in, self.q_bar)
        q_bar_new = self.nand4.output(k_in, q_new)
        self.q = q_new
        self.q_bar = q_bar_new
        return self.q, self.q_bar

class TFlipFlop:
    """T (Toggle) flip-flop using JK flip-flop."""
    def __init__(self):
        self.jk = JKFlipFlop()

    def update(self, t, clk):
        # t, clk are voltages
        return self.jk.update(t, t, clk)

# Example usage
if __name__ == "__main__":
    print("SR Flip-Flop (active-low inputs):")
    sr = SRFlipFlop()
    print("Set (S=0, R=1):", sr.update(VSS, VDD))
    print("Reset (S=1, R=0):", sr.update(VDD, VSS))
    print("Reset, second pass to settle:", sr.update(VDD, VSS))
    print("Hold (S=1, R=1):", sr.update(VDD, VDD))

    print("\nD Flip-Flop:")
    dff = DFlipFlop()
    print("D=1, CLK=1:", dff.update(VDD, VDD))
    print("D=0, CLK=1:", dff.update(VSS, VDD))

    print("\nJK Flip-Flop:")
    jk = JKFlipFlop()
    print("J=1, K=0, CLK=1:", jk.update(VDD, VSS, VDD))
    print("J=0, K=1, CLK=1:", jk.update(VSS, VDD, VDD))
    print("J=1, K=1, CLK=1 (toggle):", jk.update(VDD, VDD, VDD))

    print("\nT Flip-Flop:")
    tff = TFlipFlop()
    print("T=1, CLK=1 (toggle):", tff.update(VDD, VDD))
    print("T=0, CLK=1 (hold):", tff.update(VSS, VDD))
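Editor's note: because the gate-level flip-flops above settle one propagation step per update(), repeated clocking is easier to demonstrate with the behavioral TFlipFlop in logic_gates.py (further down in this upload), which toggles deterministically. A minimal divide-by-two sketch:

# Editor's sketch: divide-by-two using the behavioral T flip-flop from logic_gates.
from logic_gates import TFlipFlop as BehavioralTFF, VDD, VTH

tff2 = BehavioralTFF()
for tick in range(4):
    q = tff2.output(VDD, VDD)  # T=1, CLK=1 -> toggle once per update
    print(f"tick {tick}: Q={1 if q > VTH else 0}")
# Q alternates 1, 0, 1, 0: the output runs at half the update rate.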
gpu_arch.py ADDED
@@ -0,0 +1,351 @@
from multicore import MultiCoreSystem
from vram.ram_controller import RAMController
import os
from gpu_state_db import GPUStateDB
from custom_vram import CustomVRAM
from ai import AIAccelerator

class TensorCoreDB:
    def __init__(self, tensor_core_id, sm_id, db):
        self.tensor_core_id = tensor_core_id
        self.sm_id = sm_id
        self.db = db

    def load_state(self):
        state = self.db.load_state("tensor_core", "tensor_core_id", self.tensor_core_id)
        return state or {}

    def save_state(self, state):
        self.db.save_state("tensor_core", "tensor_core_id", self.tensor_core_id, state)

    def matmul(self, A, B):
        state = self.load_state()
        # Simulate a matrix multiply (for demo, just sum all elements)
        result = sum(sum(row) for row in A) * sum(sum(row) for row in B)
        state["last_result"] = result
        self.save_state(state)
        return result

class OpticalInterconnect:
    def __init__(self, bandwidth_tbps=800, latency_ns=1):
        self.bandwidth_tbps = bandwidth_tbps  # TB/s
        self.latency_ns = latency_ns  # nanoseconds

    def transfer_time(self, data_size_bytes):
        # Time = latency + (data_size / bandwidth)
        bandwidth_bytes_per_s = self.bandwidth_tbps * 1e12
        transfer_time_s = self.latency_ns * 1e-9 + (data_size_bytes / bandwidth_bytes_per_s)
        return transfer_time_s

class Thread:
    def __init__(self, thread_id, core):
        self.thread_id = thread_id
        self.core = core
        self.active = True
        self.result = None

    def run(self, a, b, cin, opcode, reg_sel):
        if self.active:
            self.result = self.core.step(a, b, cin, opcode, reg_sel)
        return self.result

class Warp:
    def __init__(self, warp_id, threads):
        self.warp_id = warp_id
        self.threads = threads  # List of Thread objects
        self.active = True

    def run(self, a, b, cin, opcode, reg_sel):
        # All threads in a warp execute in lockstep (SIMT)
        return [thread.run(a, b, cin, opcode, reg_sel) for thread in self.threads if thread.active]

class WarpScheduler:
    def __init__(self, warps):
        self.warps = warps  # List of Warp objects
        self.schedule_ptr = 0

    def schedule(self):
        # Simple round-robin scheduler
        if not self.warps:
            return None
        warp = self.warps[self.schedule_ptr]
        self.schedule_ptr = (self.schedule_ptr + 1) % len(self.warps)
        return warp

class SharedMemory:
    def __init__(self, size):
        self.size = size
        self.mem = [0] * size

    def read(self, addr):
        return self.mem[addr % self.size]

    def write(self, addr, value):
        self.mem[addr % self.size] = value

    def read_matrix(self, addr, n, m):
        # Simulate reading an n x m matrix from shared memory
        # For simplicity, treat addr as row offset
        return [
            [self.mem[(addr + i * m + j) % self.size] for j in range(m)]
            for i in range(n)
        ]

class L1Cache:
    def __init__(self, size):
        self.size = size
        self.cache = [None] * size

    def read(self, addr):
        return self.cache[addr % self.size]

    def write(self, addr, value):
        self.cache[addr % self.size] = value


# GlobalMemory now uses RAMController and persists to .db
class GlobalMemory:
    def __init__(self, size_bytes, db_path=None):
        if db_path is None:
            import uuid
            db_path = os.path.join(os.path.dirname(__file__), f"global_mem_{uuid.uuid4().hex}.db")
        self.size_bytes = size_bytes
        self.ram = RAMController(size_bytes, db_path=db_path)
        self.allocated_address = 0  # Simple allocation pointer

    def read(self, addr, length=1):
        data = self.ram.read(addr, length)
        # Return as int for compatibility (simulate voltage)
        if length == 1:
            return int(data[0]) if data else 0
        return [int(b) for b in data]

    def write(self, addr, value):
        # Accepts int, float, or list/bytes
        if isinstance(value, (int, float)):
            data = bytes([int(value) & 0xFF])
        elif isinstance(value, (bytes, bytearray)):
            data = value
        elif isinstance(value, list):
            # Convert list of integers to bytes, assuming each integer is a byte value (0-255)
            data = bytes(value)
        else:
            raise TypeError("Unsupported value type for write")
        self.ram.write(addr, data)

    def read_matrix(self, addr, n, m):
        # Read n*m bytes and reshape
        data = self.ram.read(addr, n * m)
        return [list(data[i*m:(i+1)*m]) for i in range(n)]

    def allocate_space(self, size_bytes: int) -> int:
        """Simulates allocating space in global memory."""
        if self.allocated_address + size_bytes > self.size_bytes:
            raise MemoryError("Out of global memory space")
        allocated_addr = self.allocated_address
        self.allocated_address += size_bytes
        return allocated_addr


# StreamingMultiprocessor now only loads state from DB as needed
class StreamingMultiprocessor:
    def __init__(self, sm_id, chip_id, db: GPUStateDB, num_cores_per_sm=128, warps_per_sm=164, threads_per_warp=700, num_tensor_cores=8):
        self.sm_id = sm_id
        self.chip_id = chip_id
        self.db = db
        self.num_cores_per_sm = num_cores_per_sm
        self.warps_per_sm = warps_per_sm
        self.threads_per_warp = threads_per_warp
        self.num_tensor_cores = num_tensor_cores
        self.global_mem = None  # Will be set by GPUMemoryHierarchy

    def load_state(self):
        state = self.db.load_state("sm", "sm_id", self.sm_id)
        return state or {}

    def save_state(self, state):
        self.db.save_state("sm", "sm_id", self.sm_id, state)

    def attach_global_mem(self, global_mem):
        self.global_mem = global_mem

    def get_core(self, core_id):
        return Core(core_id, self.sm_id, self.db)

    def get_warp(self, warp_id):
        return WarpDB(warp_id, self.sm_id, self.db)

    def get_tensor_core(self, tensor_core_id):
        return TensorCoreDB(tensor_core_id, self.sm_id, self.db)

    def run_next_warp(self, a, b, cin, opcode, reg_sel):
        # Example: load warp 0, run, save
        warp = self.get_warp(0)
        result = warp.run(a, b, cin, opcode, reg_sel)
        return result

    def tensor_core_matmul(self, A, B, tensor_core_id=0):
        tensor_core = self.get_tensor_core(tensor_core_id)
        return tensor_core.matmul(A, B)

class Core:
    def __init__(self, core_id, sm_id, db: GPUStateDB):
        self.core_id = core_id
        self.sm_id = sm_id
        self.db = db

    def load_state(self):
        state = self.db.load_state("core", "core_id", self.core_id)
        return state or {}

    def save_state(self, state):
        self.db.save_state("core", "core_id", self.core_id, state)

    def step(self, a, b, cin, opcode, reg_sel):
        state = self.load_state()
        # Simulate a simple operation
        state["last_result"] = (a[0] + b[0] + cin) if opcode == 0b10 else 0.0
        self.save_state(state)
        return state["last_result"]

class WarpDB:
    def __init__(self, warp_id, sm_id, db: GPUStateDB, threads_per_warp=700):
        self.warp_id = warp_id
        self.sm_id = sm_id
        self.db = db
        self.threads_per_warp = threads_per_warp

    def load_state(self):
        state = self.db.load_state("warp", "warp_id", self.warp_id)
        return state or {}

    def save_state(self, state):
        self.db.save_state("warp", "warp_id", self.warp_id, state)

    def get_thread(self, thread_id):
        return ThreadDB(thread_id, self.warp_id, self.db)

    def run(self, a, b, cin, opcode, reg_sel):
        # For demo, run only first thread
        thread = self.get_thread(0)
        result = thread.run(a, b, cin, opcode, reg_sel)
        return [result]

class ThreadDB:
    def __init__(self, thread_id, warp_id, db: GPUStateDB):
        self.thread_id = thread_id
        self.warp_id = warp_id
        self.db = db

    def load_state(self):
        state = self.db.load_state("thread", "thread_id", self.thread_id)
        return state or {}

    def save_state(self, state):
        self.db.save_state("thread", "thread_id", self.thread_id, state)

    def run(self, a, b, cin, opcode, reg_sel):
        state = self.load_state()
        # Simulate a simple operation
        state["result"] = (a[0] + b[0] + cin) if opcode == 0b10 else 0.0
        self.save_state(state)
        return state["result"]

    # NOTE: the methods below appear to be leftovers from the pre-DB SM
    # implementation; they reference attributes (scheduler, tensor_cores,
    # register_file) that ThreadDB never defines, and the demo below does
    # not call them.
    def attach_global_mem(self, global_mem):
        self.global_mem = global_mem

    def run_next_warp(self, a, b, cin, opcode, reg_sel):
        warp = self.scheduler.schedule()
        if warp:
            return warp.run(a, b, cin, opcode, reg_sel)
        return None

    def tensor_core_matmul(self, A, B):
        return self.tensor_cores.matmul(A, B)

    def tensor_core_matmul_from_memory(self, srcA, addrA, srcB, addrB, shapeA, shapeB):
        return self.tensor_cores.matmul_from_memory(srcA, addrA, srcB, addrB, shapeA, shapeB)

    def read_register_matrix(self, addr, n, m):
        # Simulate reading an n x m matrix from registers
        # For simplicity, treat addr as row offset
        return [
            [self.register_file[(addr + i) % len(self.register_file)][(j) % len(self.register_file[0])] for j in range(m)]
            for i in range(n)
        ]


class GPUMemoryHierarchy:
    def __init__(self, num_sms, global_mem_size_bytes, chip_id, db: GPUStateDB):
        self.global_mem = GlobalMemory(global_mem_size_bytes)
        self.sm_ids = list(range(num_sms))
        self.chip_id = chip_id
        self.db = db
        self.num_sms = num_sms

    def add_sm(self, sm):
        sm.attach_global_mem(self.global_mem)

    def read_global(self, addr):
        return self.global_mem.read(addr)

    def write_global(self, addr, value):
        self.global_mem.write(addr, value)


class Chip:
    def __init__(self, chip_id, num_sms=1500, vram_size_gb=16, db_path="gpu_state.db"):
        self.chip_id = chip_id
        self.db = GPUStateDB(db_path)
        global_mem_size_bytes = vram_size_gb * 1024 * 1024 * 1024
        self.gpu_mem = GPUMemoryHierarchy(num_sms=num_sms, global_mem_size_bytes=global_mem_size_bytes, chip_id=chip_id, db=self.db)
        self.sm_ids = list(range(num_sms))
        self.connected_chips = []
        self.ai_accelerator = AIAccelerator()  # Instantiate AIAccelerator
        self.custom_vram = CustomVRAM(self.gpu_mem.global_mem)  # Create CustomVRAM instance
        self.ai_accelerator.set_vram(self.custom_vram)  # Set VRAM for AIAccelerator

    def get_sm(self, sm_id):
        return StreamingMultiprocessor(sm_id, self.chip_id, self.db)

    def connect_chip(self, other_chip, interconnect):
        self.connected_chips.append((other_chip, interconnect))

    def send_data(self, other_chip, interconnect, data_size_bytes):
        # Simulate a point-to-point transfer over the given interconnect.
        # (Method added so the demo below runs; it relies only on
        # interconnect.transfer_time, defined in this file.)
        transfer_s = interconnect.transfer_time(data_size_bytes)
        print(f"Chip {self.chip_id} -> Chip {other_chip.chip_id}: "
              f"{data_size_bytes:,} bytes in {transfer_s:.2e}s")
        return transfer_s

    def close(self):
        if hasattr(self, "db") and self.db:
            self.db.close()
        if hasattr(self, "gpu_mem") and hasattr(self.gpu_mem, "global_mem") and hasattr(self.gpu_mem.global_mem, "ram"):
            self.gpu_mem.global_mem.ram.close()


if __name__ == "__main__":
    print("\n--- Multi-Chip GPU Simulation (DB-backed) ---")
    num_chips = 10
    vram_size_gb = 16
    chips = [Chip(
        chip_id=i,
        num_sms=100,
        vram_size_gb=vram_size_gb,
        db_path=f"gpu_state_chip_{i}.db"
    ) for i in range(num_chips)]
    print(f"Total chips: {len(chips)}")
    optical_link = OpticalInterconnect(bandwidth_tbps=800, latency_ns=1)
    for i in range(num_chips):
        chips[i].connect_chip(chips[(i+1)%num_chips], optical_link)
    for chip in chips:
        sm = chip.get_sm(0)
        results = sm.run_next_warp([0.7, 0.0], [0.7, 0.7], 0.0, 0b10, 0)
        print(f"Chip {chip.chip_id} SM 0 first thread result: {results[0] if results else None}")
        # Example tensor core usage: matrix multiply on SM 0, tensor core 0
        A = [[1.0, 2.0], [3.0, 4.0]]
        B = [[5.0, 6.0], [7.0, 8.0]]
        tc_result = sm.tensor_core_matmul(A, B, tensor_core_id=0)
        print(f"Chip {chip.chip_id} SM 0 tensor core 0 matmul result: {tc_result}")
    print(f"Total SMs in first chip: {len(chips[0].sm_ids)}")
    print(f"Global memory size in first chip: {chips[0].gpu_mem.global_mem.size_bytes} bytes (backed by .db)")
    chips[0].send_data(chips[1], optical_link, 1024*1024*1024*10)
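Editor's sketch of the two numeric paths the demo exercises. Hedges: it assumes importing gpu_arch succeeds (i.e., the vram.ram_controller module shipped in this upload is importable), and it uses sqlite's standard ':memory:' path for a throwaway state DB.

# Editor's sketch: worked numbers for TensorCoreDB and OpticalInterconnect.
from gpu_arch import TensorCoreDB, OpticalInterconnect
from gpu_state_db import GPUStateDB

db = GPUStateDB(":memory:")                      # throwaway in-memory sqlite DB
tc = TensorCoreDB(tensor_core_id=0, sm_id=0, db=db)
print(tc.matmul([[1.0, 2.0], [3.0, 4.0]],
                [[5.0, 6.0], [7.0, 8.0]]))       # placeholder math: 10 * 26 = 260.0

link = OpticalInterconnect(bandwidth_tbps=800, latency_ns=1)
ten_gib = 10 * 1024**3                           # the demo's transfer size
print(f"{link.transfer_time(ten_gib):.2e} s")    # 1e-9 + 1.07e10/8e14 ≈ 1.34e-5 s
db.close()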
gpu_state.db ADDED
Binary file (24.6 kB).
gpu_state_db.py ADDED
@@ -0,0 +1,60 @@
import sqlite3
import json
import threading

class GPUStateDB:
    def __init__(self, db_path='gpu_state.db'):
        self.conn = sqlite3.connect(db_path, check_same_thread=False)
        self.lock = threading.Lock()
        self._init_tables()

    def _init_tables(self):
        with self.lock:
            c = self.conn.cursor()
            c.execute('''CREATE TABLE IF NOT EXISTS sm (
                sm_id INTEGER PRIMARY KEY,
                chip_id INTEGER,
                state_json TEXT
            )''')
            c.execute('''CREATE TABLE IF NOT EXISTS core (
                core_id INTEGER PRIMARY KEY,
                sm_id INTEGER,
                registers BLOB,
                state_json TEXT
            )''')
            c.execute('''CREATE TABLE IF NOT EXISTS warp (
                warp_id INTEGER PRIMARY KEY,
                sm_id INTEGER,
                thread_ids TEXT,
                state_json TEXT
            )''')
            c.execute('''CREATE TABLE IF NOT EXISTS thread (
                thread_id INTEGER PRIMARY KEY,
                warp_id INTEGER,
                core_id INTEGER,
                state_json TEXT
            )''')
            c.execute('''CREATE TABLE IF NOT EXISTS tensor_core (
                tensor_core_id INTEGER PRIMARY KEY,
                sm_id INTEGER,
                memory BLOB,
                state_json TEXT
            )''')
            self.conn.commit()

    def save_state(self, table, id_name, id_value, state):
        state_json = json.dumps(state)
        with self.lock:
            # table/id_name are fixed identifiers from internal call sites,
            # never user input, so f-string interpolation is acceptable here
            self.conn.execute(f"INSERT OR REPLACE INTO {table} ({id_name}, state_json) VALUES (?, ?)", (id_value, state_json))
            self.conn.commit()

    def load_state(self, table, id_name, id_value):
        with self.lock:
            cur = self.conn.execute(f"SELECT state_json FROM {table} WHERE {id_name}=?", (id_value,))
            row = cur.fetchone()
            return json.loads(row[0]) if row else None

    def close(self):
        if self.conn:
            self.conn.close()
            self.conn = None
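A quick persistence round trip for GPUStateDB (editor's sketch; ':memory:' is the standard sqlite3 path for a non-persistent database):

# Editor's sketch: save/load round trip against the 'sm' table created above.
from gpu_state_db import GPUStateDB

db = GPUStateDB(":memory:")
db.save_state("sm", "sm_id", 7, {"pc": 42, "active_warps": [0, 1]})
print(db.load_state("sm", "sm_id", 7))   # {'pc': 42, 'active_warps': [0, 1]}
print(db.load_state("sm", "sm_id", 99))  # None: no row for that id
db.close()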
logic_gates.py ADDED
@@ -0,0 +1,357 @@
"""
Hyperrealistic voltage-based logic gates for digital simulation.
Each gate operates on analog voltages, with digital 1/0 determined by thresholding.
Gate switching speed is parameterized to match target transistor switching rates.
"""

import random

# Constants for voltage logic
VDD = 0.7  # High voltage (V)
VSS = 0.0  # Low voltage (V)
VTH = 0.35  # Threshold voltage (V)

# Gate switching delay (in seconds) to match fastest possible switching
# This should be the minimum possible, based on electron_speed.py calculation
from electron_speed import max_switch_freq
GATE_DELAY = 1 / max_switch_freq  # seconds per switch (theoretical limit)

class LogicGate:
    def __init__(self, vdd=VDD, vss=VSS, vth=VTH, delay=GATE_DELAY):
        self.vdd = vdd
        self.vss = vss
        self.vth = vth
        self.delay = delay

    def interpret(self, voltage):
        """Return digital 1 if voltage > Vth, else 0."""
        return 1 if voltage > self.vth else 0

    def voltage(self, bit):
        """Return voltage for digital bit."""
        return self.vdd if bit else self.vss

class NANDGate(LogicGate):
    def output(self, vin1, vin2):
        # Interpret inputs as digital
        in1 = self.interpret(vin1)
        in2 = self.interpret(vin2)
        # NAND logic: output is high unless both inputs are high
        out_bit = 0 if (in1 and in2) else 1
        # Add random noise for realism
        noise = random.gauss(0, 0.01 * self.vdd)
        return self.voltage(out_bit) + noise

class ANDGate(LogicGate):
    def output(self, vin1, vin2):
        in1 = self.interpret(vin1)
        in2 = self.interpret(vin2)
        out_bit = 1 if (in1 and in2) else 0
        noise = random.gauss(0, 0.01 * self.vdd)
        return self.voltage(out_bit) + noise

class ORGate(LogicGate):
    def output(self, vin1, vin2):
        in1 = self.interpret(vin1)
        in2 = self.interpret(vin2)
        out_bit = 1 if (in1 or in2) else 0
        noise = random.gauss(0, 0.01 * self.vdd)
        return self.voltage(out_bit) + noise

class NOTGate(LogicGate):
    def output(self, vin):
        in_bit = self.interpret(vin)
        out_bit = 0 if in_bit else 1
        noise = random.gauss(0, 0.01 * self.vdd)
        return self.voltage(out_bit) + noise

# Example usage and test
if __name__ == "__main__":
    nand = NANDGate()
    andg = ANDGate()
    org = ORGate()
    notg = NOTGate()
    print("NAND(0.7, 0.7):", nand.output(0.7, 0.7))
    print("AND(0.7, 0.7):", andg.output(0.7, 0.7))
    print("OR(0.0, 0.7):", org.output(0.0, 0.7))
    print("NOT(0.7):", notg.output(0.7))
    print(f"Gate delay (s): {GATE_DELAY:.2e}")


# --- Combinational Logic ---
class XORGate(LogicGate):
    def output(self, vin1, vin2):
        in1 = self.interpret(vin1)
        in2 = self.interpret(vin2)
        out_bit = 1 if (in1 != in2) else 0
        noise = random.gauss(0, 0.01 * self.vdd)
        return self.voltage(out_bit) + noise

class NORGate(LogicGate):
    def output(self, vin1, vin2):
        in1 = self.interpret(vin1)
        in2 = self.interpret(vin2)
        out_bit = 0 if (in1 or in2) else 1
        noise = random.gauss(0, 0.01 * self.vdd)
        return self.voltage(out_bit) + noise

class XNORGate(LogicGate):
    def output(self, vin1, vin2):
        in1 = self.interpret(vin1)
        in2 = self.interpret(vin2)
        out_bit = 1 if (in1 == in2) else 0
        noise = random.gauss(0, 0.01 * self.vdd)
        return self.voltage(out_bit) + noise

# Example: 1-bit Full Adder (combinational logic)
class FullAdder:
    def __init__(self):
        self.xor1 = XORGate()
        self.xor2 = XORGate()
        self.and1 = ANDGate()
        self.and2 = ANDGate()
        self.or1 = ORGate()

    def output(self, a, b, cin):
        sum1 = self.xor1.output(a, b)
        sum_bit = self.xor2.output(sum1, cin)
        carry1 = self.and1.output(a, b)
        carry2 = self.and2.output(sum1, cin)
        cout = self.or1.output(carry1, carry2)
        return sum_bit, cout

# --- Sequential Logic ---
# SR, D, JK, T Flip-Flops (voltage-based, using gates)
class SRFlipFlop:
    def __init__(self):
        self.q = VSS
        self.nand1 = NANDGate()
        self.nand2 = NANDGate()

    def output(self, s, r):
        # s, r: voltages
        q_bar = self.nand1.output(s, self.q)
        self.q = self.nand2.output(r, q_bar)
        return self.q

class DFlipFlop:
    def __init__(self):
        self.sr = SRFlipFlop()

    def output(self, d, clk):
        # On rising clock, sample d
        s = d if clk > VTH else VSS
        r = NOTGate().output(d) if clk > VTH else VSS
        return self.sr.output(s, r)

class JKFlipFlop:
    def __init__(self):
        self.q = VSS
        self.j = None
        self.k = None
        self.nand1 = NANDGate()
        self.nand2 = NANDGate()
        self.nand3 = NANDGate()
        self.nand4 = NANDGate()

    def output(self, j, k, clk):
        # Simple JK: toggle on J=K=1, set/reset otherwise
        if clk > VTH:
            if j > VTH and k > VTH:
                self.q = VDD if self.q == VSS else VSS
            elif j > VTH:
                self.q = VDD
            elif k > VTH:
                self.q = VSS
        return self.q

class TFlipFlop:
    def __init__(self):
        self.q = VSS

    def output(self, t, clk):
        if clk > VTH and t > VTH:
            self.q = VDD if self.q == VSS else VSS
        return self.q

# Example: 2-bit Register (sequential logic)
class Register2Bit:
    def __init__(self):
        self.dff0 = DFlipFlop()
        self.dff1 = DFlipFlop()

    def output(self, d0, d1, clk):
        q0 = self.dff0.output(d0, clk)
        q1 = self.dff1.output(d1, clk)
        return q0, q1

# Example usage
if __name__ == "__main__":
    xor = XORGate()
    print("XOR(0.7, 0.0):", xor.output(0.7, 0.0))
    fa = FullAdder()
    s, c = fa.output(0.7, 0.7, 0.0)
    print("FullAdder(1,1,0): sum=", s, "carry=", c)
    sr = SRFlipFlop()
    print("SRFlipFlop S=1, R=0:", sr.output(0.7, 0.0))
    dff = DFlipFlop()
    print("DFlipFlop D=1, CLK=1:", dff.output(0.7, 0.7))
    jk = JKFlipFlop()
    print("JKFlipFlop J=1, K=1, CLK=1:", jk.output(0.7, 0.7, 0.7))
    tff = TFlipFlop()
    print("TFlipFlop T=1, CLK=1:", tff.output(0.7, 0.7))
    reg = Register2Bit()
    print("Register2Bit D0=1, D1=0, CLK=1:", reg.output(0.7, 0.0, 0.7))


# --- Functional Units and Modules ---
# Arithmetic Logic Unit (ALU) - 1-bit (can be extended to n-bit)
class ALU1Bit:
    def __init__(self):
        self.andg = ANDGate()
        self.org = ORGate()
        self.xorg = XORGate()
        self.fadd = FullAdder()

    def operate(self, a, b, cin, op):
        """
        op: 2-bit operation selector
        00 = AND, 01 = OR, 10 = ADD, 11 = XOR
        Returns (result, carry_out)
        """
        if op == 0b00:
            return self.andg.output(a, b), 0.0
        elif op == 0b01:
            return self.org.output(a, b), 0.0
        elif op == 0b10:
            s, c = self.fadd.output(a, b, cin)
            return s, c
        elif op == 0b11:
            return self.xorg.output(a, b), 0.0
        else:
            raise ValueError("Invalid ALU op")

# 2-bit ALU (example of module composition)
class ALU2Bit:
    def __init__(self):
        self.alu0 = ALU1Bit()
        self.alu1 = ALU1Bit()

    def operate(self, a0, a1, b0, b1, cin, op):
        # Least significant bit
        r0, c0 = self.alu0.operate(a0, b0, cin, op)
        # Most significant bit
        r1, c1 = self.alu1.operate(a1, b1, c0, op)
        return (r0, r1), c1

# 2-bit Counter (using T flip-flops)
class Counter2Bit:
    def __init__(self):
        self.tff0 = TFlipFlop()
        self.tff1 = TFlipFlop()

    def tick(self, clk):
        q0 = self.tff0.output(VDD, clk)
        q1 = self.tff1.output(q0, clk)
        return self.tff0.q, self.tff1.q

# 2x2-bit Register File (2 registers, 2 bits each)
class RegisterFile2x2:
    def __init__(self):
        self.reg0 = Register2Bit()
        self.reg1 = Register2Bit()
        self.sel = 0  # select register 0 or 1

    def write(self, d0, d1, clk, sel):
        if sel == 0:
            self.reg0.output(d0, d1, clk)
        else:
            self.reg1.output(d0, d1, clk)

    def read(self, sel):
        if sel == 0:
            return self.reg0.dff0.sr.q, self.reg0.dff1.sr.q
        else:
            return self.reg1.dff0.sr.q, self.reg1.dff1.sr.q

# Example usage of functional units
if __name__ == "__main__":
    alu = ALU1Bit()
    res, cout = alu.operate(0.7, 0.0, 0.0, 0b10)
    print("ALU1Bit ADD 1+0: result=", res, "carry=", cout)
    alu2 = ALU2Bit()
    (r0, r1), c = alu2.operate(0.7, 0.0, 0.7, 0.7, 0.0, 0b10)
    print("ALU2Bit ADD (10)+(11): result=", (r0, r1), "carry=", c)
    counter = Counter2Bit()
    print("Counter2Bit tick 1:", counter.tick(0.7))
    print("Counter2Bit tick 2:", counter.tick(0.7))
    regfile = RegisterFile2x2()
    regfile.write(0.7, 0.0, 0.7, 0)
    regfile.write(0.0, 0.7, 0.7, 1)
    print("RegisterFile2x2 read reg0:", regfile.read(0))
    print("RegisterFile2x2 read reg1:", regfile.read(1))


# --- Control Unit, Registers, and Memory Management Units ---

# Simple Control Unit (Finite State Machine for ALU operations)
class ControlUnit:
    def __init__(self):
        self.state = 0
        self.opcode = 0b00  # default operation

    def set_opcode(self, opcode):
        self.opcode = opcode

    def next_state(self):
        self.state = (self.state + 1) % 4
        return self.state

    def get_control_signals(self):
        # Example: output ALU op and register select
        reg_sel = self.state % 2
        return {'alu_op': self.opcode, 'reg_sel': reg_sel}

# General Purpose Register (n-bit, here 2-bit for demo)
class GeneralPurposeRegister:
    def __init__(self, bits=2):
        self.bits = bits
        self.dffs = [DFlipFlop() for _ in range(bits)]

    def write(self, data, clk):
        for i in range(self.bits):
            self.dffs[i].output(data[i], clk)

    def read(self):
        return tuple(self.dffs[i].sr.q for i in range(self.bits))

# Simple Memory Management Unit (MMU) - address decode and register file access
class SimpleMMU:
    def __init__(self, num_registers=2, bits=2):
        self.registers = [GeneralPurposeRegister(bits) for _ in range(num_registers)]

    def write(self, addr, data, clk):
        if 0 <= addr < len(self.registers):
            self.registers[addr].write(data, clk)

    def read(self, addr):
        if 0 <= addr < len(self.registers):
            return self.registers[addr].read()
        return None

# Example usage of control and memory units
if __name__ == "__main__":
    cu = ControlUnit()
    cu.set_opcode(0b10)  # ADD
    print("ControlUnit state:", cu.next_state(), cu.get_control_signals())
    gpr = GeneralPurposeRegister(bits=2)
    gpr.write([0.7, 0.0], 0.7)
    print("GeneralPurposeRegister read:", gpr.read())
    mmu = SimpleMMU(num_registers=2, bits=2)
    mmu.write(0, [0.7, 0.0], 0.7)
    mmu.write(1, [0.0, 0.7], 0.7)
    print("SimpleMMU read reg0:", mmu.read(0))
    print("SimpleMMU read reg1:", mmu.read(1))
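Editor's sketch: an exhaustive check that the noisy analog FullAdder still matches the 1-bit adder truth table after thresholding. The Gaussian noise above has sigma = 0.01 × 0.7 = 0.007 V, far from the 0.35 V threshold, so interpret() recovers clean bits.

# Editor's sketch: verify FullAdder against the binary truth table via thresholding.
from logic_gates import LogicGate, FullAdder

probe = LogicGate()   # reuse voltage()/interpret() for encode/decode
fa2 = FullAdder()
for a in (0, 1):
    for b in (0, 1):
        for cin in (0, 1):
            s_v, c_v = fa2.output(probe.voltage(a), probe.voltage(b), probe.voltage(cin))
            s, c = probe.interpret(s_v), probe.interpret(c_v)
            assert 2 * c + s == a + b + cin  # carry:sum encodes the 2-bit total
print("FullAdder matches the 1-bit adder truth table.")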
multicore.py ADDED
@@ -0,0 +1,38 @@
"""
Multicore system simulation for virtual GPU v2.
Simulates 50,000 identical AdvancedCore instances in parallel.
"""

from core import AdvancedCore

class MultiCoreSystem:
    def __init__(self, num_cores=50000, bits=2, num_registers=2):
        self.cores = [AdvancedCore(bits=bits, num_registers=num_registers) for _ in range(num_cores)]
        self.num_cores = num_cores

    def step_all(self, a, b, cin, opcode, reg_sel):
        """
        Steps all cores in parallel with the same input.
        a, b: lists of voltages (length 2)
        cin: carry in
        opcode: ALU operation
        reg_sel: register select
        Returns: list of results from all cores
        """
        return [core.step(a, b, cin, opcode, reg_sel) for core in self.cores]

    def step_all_custom(self, inputs):
        """
        Steps all cores in parallel with custom input for each core.
        inputs: list of dicts with keys 'a', 'b', 'cin', 'opcode', 'reg_sel'
        Returns: list of results from all cores
        """
        return [core.step(inp['a'], inp['b'], inp['cin'], inp['opcode'], inp['reg_sel']) for core, inp in zip(self.cores, inputs)]

if __name__ == "__main__":
    print("\n--- MultiCore System Simulation (50,000 cores) ---")
    system = MultiCoreSystem(num_cores=50000, bits=2, num_registers=2)
    # Example: Step all cores with the same ADD operation
    results = system.step_all([0.7, 0.0], [0.7, 0.7], 0.0, 0b10, 0)
    print(f"First core result: {results[0]}")
    print(f"Total cores simulated: {len(results)}")
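Editor's sketch for step_all_custom, which the demo above doesn't exercise: each core gets its own operands and opcode (even cores run ADD, odd cores run OR).

# Editor's sketch: heterogeneous per-core inputs via step_all_custom.
from multicore import MultiCoreSystem

small = MultiCoreSystem(num_cores=4, bits=2, num_registers=2)
inputs = [
    {'a': [0.7, 0.0], 'b': [0.7, 0.7], 'cin': 0.0,
     'opcode': 0b10 if i % 2 == 0 else 0b01, 'reg_sel': i % 2}
    for i in range(4)
]
for i, r in enumerate(small.step_all_custom(inputs)):
    print(f"core {i} ({'ADD' if i % 2 == 0 else 'OR'}): alu_result={r['alu_result']}")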
tensor_core.py ADDED
@@ -0,0 +1,140 @@
"""
Tensor Core subsystem for hyperrealistic GPU simulation.
Models hardware-level matrix multiply-accumulate, scheduling, and memory integration.
"""

import time
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
try:
    from electron_speed import TARGET_SWITCHES_PER_SEC, TRANSISTORS_ON_CHIP
except ImportError:
    TARGET_SWITCHES_PER_SEC = 9e20
    TRANSISTORS_ON_CHIP = 6e11

class TensorCore:
    """
    Simulates a hardware tensor core for matrix operations (multiply-accumulate),
    with realistic operand fetch from registers, shared memory, and VRAM/global memory.
    """
    def __init__(self, bits=2, memory_size=800*1024*1024*1024, bandwidth_tbps=10000, sm=None):
        self.bits = bits
        # Use a sparse dict for local memory: keys are (row, col), values are floats
        self.memory = {}
        self.bandwidth_tbps = bandwidth_tbps  # Simulated bandwidth for operand fetch (TB/s)
        self.sm = sm  # Reference to parent SM for memory access

    def fetch_operand(self, source, addr, shape):
        """
        Fetches a matrix operand from a given source (registers, shared, global).
        Simulates bandwidth and latency.
        """
        n, m = shape
        if source == 'register':
            # Simulate register fetch (fast, minimal latency)
            matrix = self.sm.read_register_matrix(addr, n, m)
            latency = 1e-9  # 1ns
        elif source == 'shared':
            # Simulate shared memory fetch
            matrix = self.sm.shared_mem.read_matrix(addr, n, m)
            latency = 10e-9  # 10ns
        elif source == 'global':
            # Simulate VRAM/global memory fetch
            matrix = self.sm.global_mem.read_matrix(addr, n, m)
            latency = 200e-9  # 200ns
        else:
            raise ValueError(f"Unknown source: {source}")
        # Simulate bandwidth (TB/s)
        data_size_bytes = n * m * (self.bits // 8)
        transfer_time = data_size_bytes / (self.bandwidth_tbps * 1e12)
        time.sleep(latency + transfer_time)  # Simulate delay
        return matrix

    def matmul(self, A, B):
        # A, B: 2D lists (matrices) of voltages
        n = len(A)
        m = len(B[0])
        p = len(B)
        C = [[0.0 for _ in range(m)] for _ in range(n)]
        for i in range(n):
            for j in range(m):
                acc = 0.0
                for k in range(p):
                    acc += A[i][k] * B[k][j]
                C[i][j] = acc
        return C

    def matmul_from_memory(self, srcA, addrA, srcB, addrB, shapeA, shapeB):
        """
        Fetches operands from memory hierarchy and performs matmul.
        srcA/srcB: 'register', 'shared', or 'global'
        addrA/addrB: address or index
        shapeA/shapeB: (n, p), (p, m)
        """
        A = self.fetch_operand(srcA, addrA, shapeA)
        B = self.fetch_operand(srcB, addrB, shapeB)
        return self.matmul(A, B)

    def load_matrix(self, matrix, row_offset=0, col_offset=0):
        # Loads a matrix into local memory (sparse)
        for i, row in enumerate(matrix):
            for j, val in enumerate(row):
                self.memory[(row_offset+i, col_offset+j)] = val

    def read_matrix(self, n, m, row_offset=0, col_offset=0):
        # Reads an n x m matrix from local memory (sparse)
        return [
            [self.memory.get((row_offset+i, col_offset+j), 0.0) for j in range(m)]
            for i in range(n)
        ]

class TensorCoreArray:
    """
    Array of tensor cores per SM, with scheduling and memory integration.
    """
    def __init__(self, num_tensor_cores=8000, bits=2, memory_size=800*1024*1024*1024, bandwidth_tbps=10000, sm=None):
        self.tensor_cores = [TensorCore(bits=bits, memory_size=memory_size, bandwidth_tbps=bandwidth_tbps, sm=sm) for _ in range(num_tensor_cores)]
        self.schedule_ptr = 0
        self.sm = sm
        # Deep realism: calculate theoretical PFLOPS
101
+ # Use foundational switching rate from electron_speed.py
102
+ # PFLOPS = (num_tensor_cores * ops_per_cycle * clock_GHz) / 1e6
103
+ # clock_GHz = (TARGET_SWITCHES_PER_SEC / TRANSISTORS_ON_CHIP) / 1e9
104
+ self.ops_per_cycle = 1024 # Example: 1024 fused-multiply-adds per cycle per core
105
+ self.clock_ghz = (TARGET_SWITCHES_PER_SEC / TRANSISTORS_ON_CHIP) / 1e9
106
+ self.pflops = (num_tensor_cores * self.ops_per_cycle * self.clock_ghz) / 1e6
107
+
108
+ def schedule(self):
109
+ # Simple round-robin scheduling
110
+ tc = self.tensor_cores[self.schedule_ptr]
111
+ self.schedule_ptr = (self.schedule_ptr + 1) % len(self.tensor_cores)
112
+ return tc
113
+
114
+ def matmul(self, A, B):
115
+ tc = self.schedule()
116
+ # Deep realism: calculate actual compute time
117
+ n = len(A)
118
+ m = len(B[0])
119
+ p = len(B)
120
+ total_ops = n * m * p * 2 # 2 ops per FMA (multiply and add)
121
+ seconds = total_ops / (self.pflops * 1e15)
122
+ print(f"[TensorCoreArray] Matmul on {len(self.tensor_cores)} tensor cores @ {self.pflops:.1f} PFLOPS, ops={total_ops}, time={seconds:.9f}s")
123
+ time.sleep(seconds) # Simulate actual compute time
124
+ return tc.matmul(A, B)
125
+
126
+ def matmul_from_memory(self, srcA, addrA, srcB, addrB, shapeA, shapeB):
127
+ tc = self.schedule()
128
+ n, p = shapeA
129
+ p2, m = shapeB
130
+ total_ops = n * m * p * 2
131
+ seconds = total_ops / (self.pflops * 1e15)
132
+ print(f"[TensorCoreArray] Matmul from memory on {len(self.tensor_cores)} tensor cores @ {self.pflops:.1f} PFLOPS, ops={total_ops}, time={seconds:.9f}s")
133
+ time.sleep(seconds)
134
+ return tc.matmul_from_memory(srcA, addrA, srcB, addrB, shapeA, shapeB)
135
+
136
+ def load_matrix(self, matrix, core_idx=0, row_offset=0, col_offset=0):
137
+ self.tensor_cores[core_idx].load_matrix(matrix, row_offset, col_offset)
138
+
139
+ def read_matrix(self, n, m, core_idx=0, row_offset=0, col_offset=0):
140
+ return self.tensor_cores[core_idx].read_matrix(n, m, row_offset, col_offset)
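Note: a worked check (not part of the commit) of the PFLOPS formula above, using the fallback constants TARGET_SWITCHES_PER_SEC = 9e20 and TRANSISTORS_ON_CHIP = 6e11:

    clock_ghz = (9e20 / 6e11) / 1e9        # = 1.5 GHz per transistor
    pflops = (8000 * 1024 * 1.5) / 1e6     # = 12.288 PFLOPS per TensorCoreArray
    ops = 2 * 2 * 2 * 2                    # a 2x2 @ 2x2 matmul = 16 FLOPs
    seconds = ops / (pflops * 1e15)        # ~1.3e-15 s of simulated compute time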
test_ai_integration.py ADDED
@@ -0,0 +1,105 @@
+ import numpy as np
+ from gpu_arch import Chip
+ from ai import AIAccelerator
+ from custom_vram import CustomVRAM
+ from PIL import Image
+
+ def test_ai_integration():
+     print("\n--- Testing AI Integration ---")
+
+     # Test 1: Model Loading (BLIP-2)
+     print("\nTest 1: Model Loading (BLIP-2)")
+     try:
+         # Initialize a Chip for model loading
+         chip_for_loading = Chip(chip_id=0, vram_size_gb=10)
+         ai_accelerator_for_loading = chip_for_loading.ai_accelerator
+
+         # Load the BLIP-2 model and processor using the Hugging Face classes
+         from transformers import Blip2ForConditionalGeneration, Blip2Processor
+         model_id = "Salesforce/blip2-flan-t5-xxl"
+         model = Blip2ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
+         processor = Blip2Processor.from_pretrained(model_id)
+
+         ai_accelerator_for_loading.load_model(model_id, model, processor)
+         print(f"Model '{model_id}' loaded successfully on chip 0.")
+         assert ai_accelerator_for_loading.has_model(model_id), "Model not found in registry after loading."
+
+     except Exception as e:
+         print(f"Model loading test failed: {e}")
+         return
+
+     # Test 2: Multi-Chip Inference (on all images in sample_task folder)
+     print("\nTest 2: Multi-Chip Inference (BLIP-2, all images in sample_task)")
+     import os
+     num_chips = 1  # You can increase this if you want to test with more chips
+     chips = []
+     ai_accelerators = []
+
+     try:
+         # Initialize multiple chips and their AI accelerators
+         for i in range(num_chips):
+             chip = Chip(chip_id=i, vram_size_gb=1)
+             chips.append(chip)
+             ai_accelerators.append(chip.ai_accelerator)
+             ai_accelerators[i].load_model(model_id, model, processor)
+             print(f"Model '{model_id}' loaded successfully on chip {i}.")
+
+         # Get all image files in sample_task folder
+         image_folder = os.path.join(os.path.dirname(__file__), '..', 'sample_task')
+         image_files = [f for f in os.listdir(image_folder) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif'))]
+         image_files.sort()
+         if not image_files:
+             print("No images found in sample_task folder.")
+             return
+
+         # Perform inference on each image using all chips
+         import time
+         for img_name in image_files:
+             img_path = os.path.join(image_folder, img_name)
+             raw_image = Image.open(img_path).convert('RGB')
+             print(f"\nRunning inference for image: {img_name}")
+             for i, accelerator in enumerate(ai_accelerators):
+                 print(f"Performing inference on chip {i}...")
+                 start_time = time.time()
+                 result = accelerator.inference(model_id, raw_image)
+                 elapsed = time.time() - start_time
+                 print(f"Inference result from chip {i} on {img_name}: {result}")
+                 print(f"Inference time for chip {i} on {img_name}: {elapsed:.3f} seconds")
+                 assert result is not None, f"Inference returned None for chip {i} on {img_name}."
+                 assert isinstance(result, str), f"Inference result from chip {i} on {img_name} is not a string."
+         print("Multi-chip inference test on all images successful.")
+
+     except Exception as e:
+         print(f"Multi-chip inference test failed: {e}")
+         return
+
+     # Test 3: Matrix Operations (using CustomVRAM) - still on a single chip
+     # print("\nTest 3: Matrix Operations (using CustomVRAM)")
+     # try:
+     #     matrix_a = np.array([[1, 2], [3, 4]], dtype=np.float32)
+     #     matrix_b = np.array([[5, 6], [7, 8]], dtype=np.float32)
+
+     #     matrix_a_id = ai_accelerator_for_loading.load_matrix(matrix_a, "matrix_A")
+     #     matrix_b_id = ai_accelerator_for_loading.load_matrix(matrix_b, "matrix_B")
+
+     #     result_matrix_id = ai_accelerator_for_loading.matrix_multiply(matrix_a_id, matrix_b_id, "result_C")
+     #     result_matrix = ai_accelerator_for_loading.get_matrix(result_matrix_id)
+
+     #     print(f"Matrix A:\n{matrix_a}")
+     #     print(f"Matrix B:\n{matrix_b}")
+     #     print(f"Result Matrix C:\n{result_matrix}")
+
+     #     expected_result = np.dot(matrix_a, matrix_b)
+     #     assert np.array_equal(result_matrix, expected_result), "Matrix multiplication result incorrect."
+     #     print("Matrix operations test successful.")
+
+     # except Exception as e:
+     #     print(f"Matrix operations test failed: {e}")
+     #     return
+
+     print("\n--- All AI Integration Tests Completed ---")
+
+ if __name__ == "__main__":
+     test_ai_integration()
test_multi_chip_gpu.py ADDED
@@ -0,0 +1,49 @@
+ """
+ Test for hyperrealistic multi-chip GPU system with full SM and tensor core realism.
+ """
+ import time
+ from gpu_arch import Chip, OpticalInterconnect
+
+ def test_multi_chip_gpu():
+     print("\n=== Multi-Chip GPU System Full Test ===")
+     num_chips = 2  # Use 2 for realism, scale up as needed
+     num_sms = 4    # Use 4 for realism, scale up as needed
+
+     chips = [Chip(
+         chip_id=i,
+         num_sms=num_sms
+     ) for i in range(num_chips)]
+     print(f"Created {num_chips} chips, each with {num_sms} SMs.")
+
+     # Connect chips in a ring topology
+     optical_link = OpticalInterconnect(bandwidth_tbps=800, latency_ns=1)
+     for i in range(num_chips):
+         chips[i].connect_chip(chips[(i+1) % num_chips], optical_link)
+
+     # Run tensor core matmul from all SMs on all chips
+     for chip in chips:
+         print(f"\n--- Chip {chip.chip_id} ---")
+         for sm in chip.sms:
+             # Fill registers, shared, and global memory for realism
+             for i in range(len(sm.register_file)):
+                 for j in range(len(sm.register_file[0])):
+                     sm.register_file[i][j] = float(i + j)
+             for addr in range(sm.shared_mem.size):
+                 sm.shared_mem.write(addr, float(addr % 10))
+             for addr in range(sm.global_mem.size_bytes if sm.global_mem else 0):
+                 sm.global_mem.write(addr, float(addr % 100))
+             # Test tensor core matmul from registers
+             reg_result = sm.tensor_core_matmul_from_memory('register', 0, 'register', 0, (2, 2), (2, 2))
+             print(f"SM {sm.sm_id} tensor core matmul from registers: {reg_result}")
+             # Test tensor core matmul from shared memory
+             shared_result = sm.tensor_core_matmul_from_memory('shared', 0, 'shared', 0, (2, 2), (2, 2))
+             print(f"SM {sm.sm_id} tensor core matmul from shared memory: {shared_result}")
+             # Test tensor core matmul from global memory
+             global_result = sm.tensor_core_matmul_from_memory('global', 0, 'global', 0, (2, 2), (2, 2))
+             print(f"SM {sm.sm_id} tensor core matmul from global memory: {global_result}")
+     print("\n=== Multi-Chip GPU System Test Complete ===")
+
+ if __name__ == "__main__":
+     start = time.time()
+     test_multi_chip_gpu()
+     print(f"Test runtime: {time.time()-start:.3f} seconds")
vram/__pycache__/ram_controller.cpython-311.pyc ADDED
Binary file (3.92 kB)

vram/__pycache__/ram_controller.cpython-312.pyc ADDED
Binary file (3.25 kB)
 
vram/dram_cache.py ADDED
@@ -0,0 +1,36 @@
+ class DRAMCache:
+     def __init__(self, size_mb=512):
+         self.size_mb = size_mb
+         self.cache = {}
+         self.access_order = []
+
+     def read(self, key):
+         if key in self.cache:
+             self.access_order.remove(key)
+             self.access_order.append(key)
+             return self.cache[key]
+         return None
+
+     def write(self, key, value):
+         if key in self.cache:
+             self.access_order.remove(key)
+         elif len(self.cache) >= self.size_mb * 256:  # Assume 4 KB per entry
+             oldest = self.access_order.pop(0)
+             del self.cache[oldest]
+         self.cache[key] = value
+         self.access_order.append(key)
+
+ class Buffer:
+     def __init__(self, size_mb=64):
+         self.size_mb = size_mb
+         self.buffer = []
+
+     def add(self, data):
+         self.buffer.append(data)
+         if len(self.buffer) > self.size_mb * 256:
+             self.buffer.pop(0)
+
+     def flush(self):
+         flushed = self.buffer[:]
+         self.buffer = []
+         return flushed
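Note: a minimal sketch (not part of the commit) of the LRU behavior in DRAMCache.write; capacity is size_mb * 256 entries (4 KB per entry):

    cache = DRAMCache(size_mb=512)   # capacity: 512 * 256 = 131072 entries
    cache.write('a', 1)
    cache.write('b', 2)
    cache.read('a')                  # refreshes 'a' at the back of access_order
    # once the entry count reaches capacity, the next write of a new key
    # evicts access_order[0], i.e. the least recently used entry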
vram/electron_speed.py ADDED
@@ -0,0 +1,113 @@
+ """
+ Calculate electron drift speed and relate it to transistor switching (tick) rate for a modern GPU.
+ Assume: We want to simulate 900 quintillion (9e20) transistor switches per second (B200 scale).
+ """
+
+ # Physical constants
+ ELEM_CHARGE = 1.602e-19          # Coulombs
+ ELECTRON_MASS = 9.109e-31        # kg
+ VACUUM_PERMITTIVITY = 8.854e-12  # F/m
+ SILICON_MOBILITY = 0.14          # m^2/(V·s) (typical for electrons in Si at room temp)
+
+ # Example parameters (can be tuned for realism)
+ VOLTAGE = 0.7                    # V (typical for advanced nodes)
+ CHANNEL_LENGTH = 5e-9            # 5 nm process
+ ELECTRIC_FIELD = VOLTAGE / CHANNEL_LENGTH  # V/m
+
+ # Speed of light in silicon (c divided by the refractive index, ~0.29 c)
+ SPEED_OF_LIGHT_VACUUM = 3e8      # m/s
+ SILICON_REFRACTIVE_INDEX = 3.5
+ speed_of_light_silicon = SPEED_OF_LIGHT_VACUUM / SILICON_REFRACTIVE_INDEX
+
+ # Idealized carrier velocity: this model deliberately takes the speed of light in
+ # silicon as the upper bound. A physical drift velocity would be v = mobility * E,
+ # which is orders of magnitude slower.
+ drift_velocity = speed_of_light_silicon  # m/s
+
+ # Calculate time for electron to cross channel (t = L / v)
+ transit_time = CHANNEL_LENGTH / drift_velocity  # seconds
+
+ # Calculate max theoretical switching frequency (f = 1 / t)
+ max_switch_freq = 1 / transit_time  # Hz
+
+ # For 900 quintillion switches/sec, but with 600 billion transistors
+ TARGET_SWITCHES_PER_SEC = 9e20
+ TRANSISTORS_ON_CHIP = 6e11  # 600 billion
+ transistors_needed = TARGET_SWITCHES_PER_SEC / max_switch_freq
+ required_switch_freq_per_transistor = TARGET_SWITCHES_PER_SEC / TRANSISTORS_ON_CHIP
+
+ # --- NAND Flash Floating Gate Transistor Model ---
+ class FloatingGateTransistor:
+     def __init__(self, channel_length, drift_velocity):
+         self.channel_length = channel_length
+         self.drift_velocity = drift_velocity
+         self.trapped_electrons = 0  # Number of electrons trapped
+         self.state = 0              # 0 or 1, representing data
+
+     def program(self, electrons):
+         self.trapped_electrons += electrons
+         self.state = 1 if self.trapped_electrons > 0 else 0
+         prog_time = self.channel_length / self.drift_velocity
+         return prog_time
+
+     def erase(self):
+         self.trapped_electrons = 0
+         self.state = 0
+         erase_time = self.channel_length / self.drift_velocity
+         return erase_time
+
+     def read(self):
+         return self.state
+
+ if __name__ == "__main__":
+     print(f"Electron drift velocity: {drift_velocity:.2e} m/s")
+     print(f"Channel transit time: {transit_time:.2e} s")
+     print(f"Max transistor switching frequency: {max_switch_freq:.2e} Hz")
+     print(f"To achieve {TARGET_SWITCHES_PER_SEC:.1e} switches/sec:")
+     print(f"- You'd need {transistors_needed:.2e} transistors switching at max speed in parallel.")
+     print(f"- For a chip with 600B transistors, each must switch at {required_switch_freq_per_transistor:.2e} Hz.")
+     print(f"- Electron drift speed: {drift_velocity:.2e} m/s vs. speed of light in silicon: {speed_of_light_silicon:.2e} m/s")
+     print(f"- Electron drift is ~{(drift_velocity/speed_of_light_silicon)*100:.2f}% the speed of light in silicon (photon speed).")
+
+     # NAND Flash Floating Gate Transistor Demo
+     print("\n--- NAND Flash Floating Gate Transistor Demo ---")
+     fgt = FloatingGateTransistor(CHANNEL_LENGTH, drift_velocity)
+     electrons_to_trap = 1000
+
+     # Real-time trapping analysis (simulated)
+     print("\nSimulating electron trapping in real time:")
+     electrons_per_step = 100
+     total_steps = electrons_to_trap // electrons_per_step
+     for step in range(1, total_steps + 1):
+         prog_time = fgt.program(electrons_per_step)
+         print(f"Step {step}: Trapped electrons = {fgt.trapped_electrons}, State = {fgt.read()}, Time for this step = {prog_time:.2e} s")
+     # Final state after all electrons trapped
+     print(f"Final: Trapped electrons = {fgt.trapped_electrons}, State = {fgt.read()}")
+     erase_time = fgt.erase()
+     print(f"Erasing: State = {fgt.read()}, Time = {erase_time:.2e} s")
+     print(f"(Operation speed is limited by electron drift velocity: {drift_velocity:.2e} m/s)")
+     print("Higher drift velocity = faster programming/erasing; lower drift velocity = slower data ops.")
+
+     # --- SR, D, JK, T Flip-Flop Physics/Timing Summary ---
+     print("\n--- Flip-Flop Types and Switching Physics ---")
+     print("SR Flip-Flop: Set-Reset, basic memory, built from NAND/NOR gates.")
+     print("D Flip-Flop: Data/Delay, synchronizes input to clock, used in registers.")
+     print("JK Flip-Flop: Universal, toggles or sets/resets based on inputs.")
+     print("T Flip-Flop: Toggle, divides clock, used in counters.")
+     print("All flip-flops are built from logic gates, so their switching speed is limited by the gate delay (set by electron drift and channel length).\n")
+
+     # Example: Calculate flip-flop switching time (assuming 4 gate delays per flip-flop)
+     GATE_DELAY = transit_time  # seconds, from above
+     FF_GATE_COUNT = 4          # typical for a basic flip-flop
+     flip_flop_delay = FF_GATE_COUNT * GATE_DELAY
+     flip_flop_max_freq = 1 / flip_flop_delay
+
+     print(f"Estimated flip-flop delay: {flip_flop_delay:.2e} s (for {FF_GATE_COUNT} gates)")
+     print(f"Max flip-flop switching frequency: {flip_flop_max_freq:.2e} Hz")
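Note: the numbers the script above prints, worked out by hand from its constants (a sanity check, not part of the commit):

    v     = 3e8 / 3.5       ~ 8.57e7 m/s    (speed of light in silicon)
    t     = 5e-9 / 8.57e7   ~ 5.83e-17 s    (channel transit time)
    f_max = 1 / t           ~ 1.71e16 Hz    (per-transistor ceiling)
    9e20 / 1.71e16          ~ 5.25e4        transistors at f_max in parallel
    9e20 / 6e11             = 1.5e9 Hz      per transistor on a 600B-transistor chip
    4 * t                   ~ 2.33e-16 s    flip-flop delay (4 gate delays)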
vram/ftl.py ADDED
@@ -0,0 +1,19 @@
+ class FTL:
+     def __init__(self):
+         self.lba_to_phys = {}
+         self.phys_to_lba = {}
+
+     def map(self, lba, phys):
+         self.lba_to_phys[lba] = phys
+         self.phys_to_lba[phys] = lba
+
+     def get_phys(self, lba):
+         return self.lba_to_phys.get(lba, None)
+
+     def get_lba(self, phys):
+         return self.phys_to_lba.get(phys, None)
+
+     def invalidate(self, lba):
+         phys = self.lba_to_phys.pop(lba, None)
+         if phys is not None:  # explicit None check: physical address 0 is valid
+             self.phys_to_lba.pop(phys, None)
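Note: a minimal sketch (not part of the commit) of the FTL mapping round trip:

    ftl = FTL()
    ftl.map(lba=7, phys=42)
    print(ftl.get_phys(7))    # -> 42
    print(ftl.get_lba(42))    # -> 7
    ftl.invalidate(7)         # drops both directions of the mapping
    print(ftl.get_phys(7))    # -> None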
vram/interface.py ADDED
@@ -0,0 +1,17 @@
+ class PCIeInterface:
+     def __init__(self, version='4.0', lanes=4, max_gbps=15):
+         self.version = version
+         self.lanes = lanes
+         self.max_gbps = max_gbps  # throughput in GB/s (despite the name, not Gbit/s)
+         self.latency_us = 2       # microseconds, typical for PCIe 4.0
+
+     def transfer_time(self, size_bytes):
+         # Calculate time to transfer size_bytes at max_gbps (in seconds)
+         gb = size_bytes / 1e9
+         time_s = gb / self.max_gbps
+         return time_s
+
+     def simulate_transfer(self, size_bytes, direction='write'):
+         t = self.transfer_time(size_bytes)
+         print(f"[PCIe] {direction.title()} {size_bytes/1e6:.2f} MB over PCIe {self.version} x{self.lanes} at {self.max_gbps} GB/s: {t*1e3:.3f} ms + {self.latency_us} us latency")
+         return t + self.latency_us / 1e6
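Note: a worked example (not part of the commit) for PCIeInterface.transfer_time with the defaults above:

    pcie = PCIeInterface()                # 15 GB/s, 2 us latency
    t = pcie.transfer_time(1_000_000)     # 0.001 GB / 15 GB/s ~ 6.67e-5 s (~0.067 ms)
    total = t + pcie.latency_us / 1e6     # + 2e-6 s latency, as simulate_transfer returns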
vram/main.py ADDED
@@ -0,0 +1,39 @@
+ from ram_controller import RAMController
+ import random
+
+ RAM_SIZE_BYTES = 1024 * 1024 * 16  # 16 MB of RAM
+
+ def demo():
+     print(f"Virtual RAM Demo: {RAM_SIZE_BYTES / (1024 * 1024):.2f} MB")
+     ram = RAMController(RAM_SIZE_BYTES)
+
+     print("\nWriting sequential data to RAM:")
+     for i in range(0, 1024, 16):
+         data = [random.randint(0, 255) for _ in range(16)]
+         ram.write(i, data)
+         if i < 64:
+             print(f"Address {i}: Data (first 16 bytes) {data}")
+
+     print("\nReading sequential data from RAM:")
+     for i in range(0, 1024, 16):
+         read_data = ram.read(i, 16)
+         if i < 64:
+             print(f"Address {i}: Read Data (first 16 bytes) {list(read_data)}")
+
+     print("\nWriting random data to RAM:")
+     for _ in range(10):
+         address = random.randint(0, RAM_SIZE_BYTES - 16)
+         data = [random.randint(0, 255) for _ in range(16)]
+         ram.write(address, data)
+         print(f"Address {address}: Data (first 16 bytes) {data}")
+
+     print("\nReading random data from RAM:")
+     for _ in range(10):
+         address = random.randint(0, RAM_SIZE_BYTES - 16)
+         read_data = ram.read(address, 16)
+         print(f"Address {address}: Read Data (first 16 bytes) {list(read_data)}")
+
+ if __name__ == "__main__":
+     demo()
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nand_page import Page
2
+
3
+ class Block:
4
+ def __init__(self, num_pages, num_cells_per_page, channel_length, drift_velocity, levels):
5
+ self.pages = [Page(num_cells_per_page, channel_length, drift_velocity, levels) for _ in range(num_pages)]
6
+ self.wear_count = 0
7
+
8
+ def erase(self):
9
+ for page in self.pages:
10
+ page.erase()
11
+ self.wear_count += 1
vram/nand_cell.py ADDED
@@ -0,0 +1,35 @@
+ import random
+
+ class MultiLevelCell:
+     def __init__(self, channel_length, drift_velocity, levels):
+         self.channel_length = channel_length
+         self.drift_velocity = drift_velocity
+         self.levels = levels
+         self.trapped_electrons = 0
+         self.value = 0
+         self.wear_count = 0
+         self.retention_loss = 0.0
+
+     def program(self, value):
+         self.value = max(0, min(self.levels - 1, value))
+         self.trapped_electrons = self.value
+         self.wear_count += 1
+         self.retention_loss = 0.0
+         prog_time = self.channel_length / self.drift_velocity
+         return prog_time
+
+     def erase(self):
+         self.trapped_electrons = 0
+         self.value = 0
+         self.wear_count += 1
+         self.retention_loss = 0.0
+         erase_time = self.channel_length / self.drift_velocity
+         return erase_time
+
+     def read(self):
+         # Reads accumulate random retention loss; once it passes 0.5,
+         # the stored level silently decays by one.
+         if self.value > 0:
+             self.retention_loss += random.uniform(0, 0.01)
+             if self.retention_loss > 0.5:
+                 self.value = max(0, self.value - 1)
+                 self.trapped_electrons = self.value
+                 self.retention_loss = 0.0
+         return self.value
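Note: a short sketch (not part of the commit) of MultiLevelCell wear and retention, using the constants from electron_speed.py as example arguments:

    cell = MultiLevelCell(channel_length=5e-9, drift_velocity=8.57e7, levels=4)
    cell.program(3)                  # store the top level; wear_count increments
    for _ in range(200):             # repeated reads accumulate retention_loss
        value = cell.read()
    print(value, cell.wear_count)    # value may have decayed below 3 by now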
vram/nand_memory.py ADDED
@@ -0,0 +1,28 @@
+ """
+ NAND Flash SSD Simulation (Modular)
+ -----------------------------------
+ This file documents the SSD architecture and usage for the modular simulation.
+
+ Components:
+ - nand_cell.py: MultiLevelCell (single cell physics/logic)
+ - nand_page.py: Page (group of cells, ECC)
+ - nand_block.py: Block (group of pages)
+ - nand_plane.py: Plane (group of blocks)
+ - dram_cache.py: DRAMCache, Buffer (cache, buffer, metadata)
+ - ftl.py: FTL (Flash Translation Layer, mapping table)
+ - ssd_controller.py: SSDController (manages all of the above: FTL, cache, buffer)
+ - main.py: Demo/entry point
+
+ Usage:
+ ------
+ Import and use the SSDController and other components in your own scripts, or run main.py for a demo.
+
+ Example:
+     from ssd_controller import SSDController
+     ssd = SSDController(...)
+     ssd.program(lba, data)
+     ssd.read(lba)
+
+ See main.py for a full demonstration of SSD features, including DRAM cache, buffer, FTL, wear leveling, garbage collection, and retention simulation.
+ """
vram/nand_page.py ADDED
@@ -0,0 +1,23 @@
+ from nand_cell import MultiLevelCell
+
+ class Page:
+     def __init__(self, num_cells, channel_length, drift_velocity, levels):
+         self.cells = [MultiLevelCell(channel_length, drift_velocity, levels) for _ in range(num_cells)]
+         self.ecc = 0  # Placeholder for ECC bits
+
+     def program(self, data):
+         for i, value in enumerate(data):
+             self.cells[i].program(value)
+         self.ecc = self.calculate_ecc(data)
+
+     def erase(self):
+         for cell in self.cells:
+             cell.erase()
+         self.ecc = 0
+
+     def read(self):
+         data = [cell.read() for cell in self.cells]
+         return data, self.ecc
+
+     def calculate_ecc(self, data):
+         # Placeholder ECC: a single parity bit over the cell values
+         return sum(data) % 2
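Note: a minimal sketch (not part of the commit) of the placeholder parity ECC; it can only detect an odd number of single-level drifts, which is why it is marked as a placeholder:

    page = Page(num_cells=4, channel_length=5e-9, drift_velocity=8.57e7, levels=4)
    page.program([1, 0, 2, 1])             # stored ecc = (1+0+2+1) % 2 = 0
    data, stored_ecc = page.read()
    if page.calculate_ecc(data) != stored_ecc:
        print("parity mismatch: a cell value drifted since programming")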
vram/nand_plane.py ADDED
@@ -0,0 +1,5 @@
+ from nand_block import Block
+
+ class Plane:
+     def __init__(self, num_blocks, num_pages, num_cells_per_page, channel_length, drift_velocity, levels):
+         self.blocks = [Block(num_pages, num_cells_per_page, channel_length, drift_velocity, levels) for _ in range(num_blocks)]
vram/nvme.py ADDED
@@ -0,0 +1,54 @@
+ from interface import PCIeInterface
+ import threading
+ import queue
+ import time
+
+ class NVMeCommand:
+     def __init__(self, cmd_type, lba, data=None):
+         self.cmd_type = cmd_type  # 'read' or 'write'
+         self.lba = lba
+         self.data = data
+         self.result = None
+         self.completed = threading.Event()
+
+ class NVMeController:
+     def __init__(self, ssd_controller, queue_depth=64):
+         self.ssd = ssd_controller
+         self.submission_queue = queue.Queue(maxsize=queue_depth)
+         self.completion_queue = queue.Queue(maxsize=queue_depth)
+         self.running = True
+         self.worker = threading.Thread(target=self.process_commands)
+         self.worker.daemon = True
+         self.worker.start()
+         self.interface = PCIeInterface()
+
+     def submit(self, cmd):
+         self.submission_queue.put(cmd)
+
+     def process_commands(self):
+         while self.running:
+             try:
+                 cmd = self.submission_queue.get(timeout=0.1)
+                 if cmd.cmd_type == 'write':
+                     self.ssd.program(cmd.lba, cmd.data)
+                     # Each value is modeled as 32 bits, hence len * 32 // 8 bytes on the wire
+                     self.interface.simulate_transfer(len(cmd.data) * 32 // 8, direction='write')
+                     cmd.result = 'write_complete'
+                 elif cmd.cmd_type == 'read':
+                     data = self.ssd.read(cmd.lba)
+                     self.interface.simulate_transfer(len(data) * 32 // 8, direction='read')
+                     cmd.result = data
+                 self.completion_queue.put(cmd)
+                 cmd.completed.set()
+             except queue.Empty:
+                 continue
+
+     def get_completion(self, timeout=1.0):
+         try:
+             cmd = self.completion_queue.get(timeout=timeout)
+             return cmd
+         except queue.Empty:
+             return None
+
+     def shutdown(self):
+         self.running = False
+         self.worker.join()
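Note: a minimal sketch (not part of the commit) of the submit/complete flow, assuming an SSDController instance named ssd with the program/read methods used above:

    nvme = NVMeController(ssd, queue_depth=64)
    cmd = NVMeCommand('write', lba=0, data=[1, 2, 3, 0])
    nvme.submit(cmd)
    cmd.completed.wait(timeout=5.0)   # block until the worker sets the event
    print(cmd.result)                 # -> 'write_complete'
    done = nvme.get_completion()      # the same command, via the completion queue
    nvme.shutdown()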
vram/ram_controller.py ADDED
@@ -0,0 +1,51 @@
+ import sqlite3
+ import threading
+
+ class RAMController:
+     def __init__(self, size_bytes, db_path='ram_storage.db'):
+         self.size_bytes = size_bytes
+         self.conn = sqlite3.connect(db_path, check_same_thread=False)
+         self.db_lock = threading.Lock()
+         with self.db_lock:
+             self.conn.execute('''CREATE TABLE IF NOT EXISTS ram_cells (
+                 address INTEGER PRIMARY KEY,
+                 data BLOB
+             )''')
+             self.conn.commit()
+
+     def read(self, address, length):
+         if address < 0 or address + length > self.size_bytes:
+             raise IndexError("Memory access out of bounds")
+         with self.db_lock:
+             cur = self.conn.execute(
+                 "SELECT address, data FROM ram_cells WHERE address >= ? AND address < ? ORDER BY address ASC",
+                 (address, address + length)
+             )
+             # Build a bytearray of the requested range; unwritten cells read back as 0
+             result = bytearray([0] * length)
+             for row in cur:
+                 addr = row[0]
+                 data = row[1]
+                 if address <= addr < address + length:
+                     result[addr - address] = data[0] if isinstance(data, (bytes, bytearray)) else data
+             return result
+
+     def write(self, address, data):
+         if address < 0 or address + len(data) > self.size_bytes:
+             raise IndexError("Memory access out of bounds")
+         with self.db_lock:
+             for offset, value in enumerate(data):
+                 self.conn.execute(
+                     "INSERT OR REPLACE INTO ram_cells (address, data) VALUES (?, ?)",
+                     (address + offset, sqlite3.Binary(bytes([value])))
+                 )
+             self.conn.commit()
+
+     def close(self):
+         with self.db_lock:
+             if self.conn:
+                 self.conn.close()
+                 self.conn = None
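Note: a minimal sketch (not part of the commit) of the SQLite-backed RAMController round trip; contents go through the ram_storage.db file, so they persist across runs:

    ram = RAMController(size_bytes=1024)
    ram.write(0, [10, 20, 30])
    print(list(ram.read(0, 3)))    # -> [10, 20, 30]
    print(list(ram.read(3, 2)))    # unwritten cells read back as 0
    ram.close()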