"""

Tensor Core subsystem for hyperrealistic GPU simulation.

Models hardware-level matrix multiply-accumulate, scheduling, and memory integration.

"""

import time
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Fall back to nominal constants if electron_speed.py is unavailable.
try:
    from electron_speed import TARGET_SWITCHES_PER_SEC, TRANSISTORS_ON_CHIP
except ImportError:
    TARGET_SWITCHES_PER_SEC = 9e20
    TRANSISTORS_ON_CHIP = 6e11

class TensorCore:
    """

    Simulates a hardware tensor core for matrix operations (multiply-accumulate),

    with realistic operand fetch from registers, shared memory, and VRAM/global memory.

    """
    def __init__(self, bits=2, memory_size=800*1024*1024*1024, bandwidth_tbps=10000, sm=None):
        self.bits = bits
        self.memory_size = memory_size  # Nominal capacity in bytes; the sparse store below is not bounds-checked
        # Use a sparse dict for local memory: keys are (row, col), values are floats
        self.memory = {}
        self.bandwidth_tbps = bandwidth_tbps  # Simulated bandwidth for operand fetch (TB/s)
        self.sm = sm  # Reference to parent SM for memory access

    def fetch_operand(self, source, addr, shape):
        """

        Fetches a matrix operand from a given source (registers, shared, global).

        Simulates bandwidth and latency.

        """
        n, m = shape
        if source == 'register':
            # Simulate register fetch (fast, minimal latency)
            matrix = self.sm.read_register_matrix(addr, n, m)
            latency = 1e-9  # 1ns
        elif source == 'shared':
            # Simulate shared memory fetch
            matrix = self.sm.shared_mem.read_matrix(addr, n, m)
            latency = 10e-9  # 10ns
        elif source == 'global':
            # Simulate VRAM/global memory fetch
            matrix = self.sm.global_mem.read_matrix(addr, n, m)
            latency = 200e-9  # 200ns
        else:
            raise ValueError(f"Unknown source: {source}")
        # Simulate bandwidth-limited transfer (bandwidth given in TB/s).
        # bits/8 stays a float so sub-byte precisions (e.g. the default
        # bits=2) do not truncate the payload size to zero bytes.
        data_size_bytes = n * m * self.bits / 8
        transfer_time = data_size_bytes / (self.bandwidth_tbps * 1e12)
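        # Worked example under the defaults (illustrative, not measured): with
        # bits=2, a 16x16 operand is 16*16*2/8 = 64 bytes; at 10,000 TB/s that
        # transfers in 6.4e-15 s, so the fixed latency term dominates.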
        time.sleep(latency + transfer_time)  # Simulate delay
        return matrix

    def matmul(self, A, B):
        # A, B: 2D lists (matrices) of voltages; A is n x p, B is p x m.
        n = len(A)
        m = len(B[0])
        p = len(B)
        assert len(A[0]) == p, "inner dimensions of A and B must match"
        C = [[0.0 for _ in range(m)] for _ in range(n)]
        for i in range(n):
            for j in range(m):
                acc = 0.0
                for k in range(p):
                    acc += A[i][k] * B[k][j]
                C[i][j] = acc
        return C

    def matmul_from_memory(self, srcA, addrA, srcB, addrB, shapeA, shapeB):
        """

        Fetches operands from memory hierarchy and performs matmul.

        srcA/srcB: 'register', 'shared', or 'global'

        addrA/addrB: address or index

        shapeA/shapeB: (n, p), (p, m)

        """
        A = self.fetch_operand(srcA, addrA, shapeA)
        B = self.fetch_operand(srcB, addrB, shapeB)
        return self.matmul(A, B)

    def load_matrix(self, matrix, row_offset=0, col_offset=0):
        # Loads a matrix into local memory (sparse)
        for i, row in enumerate(matrix):
            for j, val in enumerate(row):
                self.memory[(row_offset+i, col_offset+j)] = val

    def read_matrix(self, n, m, row_offset=0, col_offset=0):
        # Reads an n x m matrix from local memory (sparse)
        return [
            [self.memory.get((row_offset+i, col_offset+j), 0.0) for j in range(m)]
            for i in range(n)
        ]
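
# Minimal stub SM for standalone experimentation. This is a hypothetical
# stand-in, not the simulator's real SM class: it only provides the three
# things fetch_operand actually touches, namely read_register_matrix(addr, n, m)
# and shared_mem/global_mem objects exposing read_matrix(addr, n, m).
class _StubMemory:
    """Answers every read with an n x m zero matrix."""
    def read_matrix(self, addr, n, m):
        return [[0.0 for _ in range(m)] for _ in range(n)]

class _StubSM:
    def __init__(self):
        self.shared_mem = _StubMemory()
        self.global_mem = _StubMemory()

    def read_register_matrix(self, addr, n, m):
        return [[0.0 for _ in range(m)] for _ in range(n)]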

class TensorCoreArray:
    """

    Array of tensor cores per SM, with scheduling and memory integration.

    """
    def __init__(self, num_tensor_cores=8000, bits=2, memory_size=800*1024*1024*1024, bandwidth_tbps=10000, sm=None):
        self.tensor_cores = [TensorCore(bits=bits, memory_size=memory_size, bandwidth_tbps=bandwidth_tbps, sm=sm) for _ in range(num_tensor_cores)]
        self.schedule_ptr = 0
        self.sm = sm
        # Deep realism: calculate theoretical PFLOPS
        # Use foundational switching rate from electron_speed.py
        # PFLOPS = (num_tensor_cores * ops_per_cycle * clock_GHz) / 1e6
        # clock_GHz = (TARGET_SWITCHES_PER_SEC / TRANSISTORS_ON_CHIP) / 1e9
        self.ops_per_cycle = 1024  # Example: 1024 fused-multiply-adds per cycle per core
        self.clock_ghz = (TARGET_SWITCHES_PER_SEC / TRANSISTORS_ON_CHIP) / 1e9
        self.pflops = (num_tensor_cores * self.ops_per_cycle * self.clock_ghz) / 1e6
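        # Worked example with the defaults (assumes the fallback constants):
        # clock_ghz = (9e20 / 6e11) / 1e9 = 1.5 GHz, so
        # pflops = 8000 * 1024 * 1.5 / 1e6 = 12.288 PFLOPS.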

    def schedule(self):
        # Simple round-robin scheduling
        tc = self.tensor_cores[self.schedule_ptr]
        self.schedule_ptr = (self.schedule_ptr + 1) % len(self.tensor_cores)
        return tc

    def matmul(self, A, B):
        tc = self.schedule()
        # Deep realism: calculate actual compute time
        n = len(A)
        m = len(B[0])
        p = len(B)
        total_ops = n * m * p * 2  # 2 ops per FMA (multiply and add)
        seconds = total_ops / (self.pflops * 1e15)
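        # Worked example (illustrative): a 2x2 @ 2x2 matmul has total_ops =
        # 2*2*2*2 = 16; at the default 12.288 PFLOPS that models roughly
        # 1.3e-15 s of compute time.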
        print(f"[TensorCoreArray] Matmul on {len(self.tensor_cores)} tensor cores @ {self.pflops:.1f} PFLOPS, ops={total_ops}, time={seconds:.9f}s")
        time.sleep(seconds)  # Simulate actual compute time
        return tc.matmul(A, B)

    def matmul_from_memory(self, srcA, addrA, srcB, addrB, shapeA, shapeB):
        tc = self.schedule()
        n, p = shapeA
        p2, m = shapeB
        assert p == p2, "inner dimensions of shapeA and shapeB must match"
        total_ops = n * m * p * 2
        seconds = total_ops / (self.pflops * 1e15)
        print(f"[TensorCoreArray] Matmul from memory on {len(self.tensor_cores)} tensor cores @ {self.pflops:.1f} PFLOPS, ops={total_ops}, time={seconds:.9f}s")
        time.sleep(seconds)
        return tc.matmul_from_memory(srcA, addrA, srcB, addrB, shapeA, shapeB)

    def load_matrix(self, matrix, core_idx=0, row_offset=0, col_offset=0):
        self.tensor_cores[core_idx].load_matrix(matrix, row_offset, col_offset)

    def read_matrix(self, n, m, core_idx=0, row_offset=0, col_offset=0):
        return self.tensor_cores[core_idx].read_matrix(n, m, row_offset, col_offset)
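
# Smoke-test sketch, run only when the module is executed directly. This is
# illustrative rather than part of the simulator's test suite; it exercises
# the array-level matmul, the per-core sparse local memory, and the fetch
# path via the hypothetical _StubSM defined above.
if __name__ == "__main__":
    A = [[1.0, 2.0], [3.0, 4.0]]
    B = [[5.0, 6.0], [7.0, 8.0]]
    array = TensorCoreArray(num_tensor_cores=4, sm=_StubSM())

    # Direct matmul: expected [[19.0, 22.0], [43.0, 50.0]].
    print("C =", array.matmul(A, B))

    # Local sparse-memory round-trip on core 0.
    array.load_matrix(A, core_idx=0)
    print("readback =", array.read_matrix(2, 2, core_idx=0))

    # Fetch-from-memory path; the stub returns zero matrices, so the
    # product is all zeros.
    print("from memory =", array.matmul_from_memory('global', 0, 'global', 0, (2, 2), (2, 2)))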