File size: 10,434 Bytes
ecadbd9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
import torch
import torch.nn as nn
from typing import Optional, Set

from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge
import einops


from layer import SharedMonarch, SingleSharedBlockDiag


def test_single():
    # input B * c, output B * r
    B = 20
    block_size_r = 5
    block_size_c = 4
    r = 2
    uni_w = B // r
    model = SingleSharedBlockDiag(num_unique_weights=uni_w, share_factor=r, block_size_c=block_size_c, block_size_r=block_size_r)
    x = torch.rand(2, 3, B*block_size_c)
    y = model(x)
    
    print(f"Weights Shape: {model.weights.shape}, input {x.shape}")
    print(f"Output Shape: {y.shape}")
    W_group_0 = model.weights[0]

    x_blk_0 = x[0, 0, 0:block_size_c]
    y_manual = x_blk_0 @ W_group_0.T
    y_model = y[0, 0, 0:block_size_r]

    print("\n--- Correctness Check ---")
    print(f"Manual: {y_manual.detach().numpy()}")
    print(f"Model : {y_model.detach().numpy()}")
    
    # Assert close
    assert torch.allclose(y_manual, y_model, atol=1e-6)
    print(">> Logic Verified: Dense Matrix Multiplication per block is correct.")

def test_monarch():
    rL, cL = 4, 6
    rR, cR = 4, 2
    share_factor = 2
    model = SharedMonarch(share_factor, rR, cR, rL, cL)


    x = torch.arange(cL * cR, dtype=torch.float32).unsqueeze(0)
    y = model(x)

    print(f"Input shape: {x.shape}")
    print(f"Output shape: {y.shape}")


def manual_override_weights(model, val_r=1.0, val_l=1.0, method='identity'):
    """
    Helper function to force weights to known values for testing.
    """
    with torch.no_grad():
        # --- Override Layer R ---
        if method == 'identity':
            # Create identity matrices for each block
            # Weight shape: (num_unique, r, c). We want weights s.t. x @ W.T = x
            # So W must be Identity.
            for i in range(model.sama_R.num_unique_weights):
                nn.init.eye_(model.sama_R.weights[i])
                model.sama_R.weights[i] *= val_r # Optional scaling
                
        elif method == 'seq':
             # Fill with sequential integers to track position
             nn.init.constant_(model.sama_R.weights, 0)
             count = 1
             for i in range(model.sama_R.num_unique_weights):
                 for r in range(model.sama_R.block_size_r):
                     for c in range(model.sama_R.block_size_c):
                         model.sama_R.weights[i, r, c] = count
                         count += 1

        # --- Override Layer L ---
        if method == 'identity':
            for i in range(model.sama_L.num_unique_weights):
                nn.init.eye_(model.sama_L.weights[i])
                model.sama_L.weights[i] *= val_l

def test_monarch_correctness():
    print("\n=== STARTING MONARCH LAYER DEBUG/TEST ===")
    
    # 1. Setup minimal model to easy calculation
    # N = 4 (2x2 grid). 
    # This is small enough to print everything.
    rL, cL = 2, 2
    rR, cR = 2, 2
    share_factor = 1
    model = SharedMonarch(share_factor, rR, cR, rL, cL)
    
    # ==========================================
    # TEST CASE 1: The "Passthrough" (Identity) Test
    # ==========================================
    print("\n--- Test 1: Identity/Reconstruction ---")
    print("Goal: If R=Identity and L=Identity, output must equal input.")
    print("Logic: x -> I -> P -> I -> P_T -> x")
    
    # Force weights to Identity
    manual_override_weights(model, method='identity')
    
    # Create random input
    x = torch.randn(1, cL * cR)
    y = model(x)
    
    print(f"Input:  {x.detach().numpy().round(2)}")
    print(f"Output: {y.detach().numpy().round(2)}")
    
    if torch.allclose(x, y, atol=1e-6):
        print(">> [PASS] Identity Test: P and P_T are correctly inverse.")
    else:
        print(">> [FAIL] Identity Test: Output does not match Input!")
        return # Stop if basic logic fails

    # ==========================================
    # TEST CASE 2: The "Specific Path" (One-Hot) Test
    # ==========================================
    print("\n--- Test 2: Manual Calculation Trace ---")
    print("Goal: Trace a specific value through the layers.")
    
    # Setup: 
    # R will double the values (Scale = 2 * Identity)
    # L will triple the values (Scale = 3 * Identity)
    # Expected: Output = Input * 2 * 3 = Input * 6
    manual_override_weights(model, val_r=2.0, val_l=3.0, method='identity')
    
    # Input: One-hot vector at index 1 (value 1.0)
    # [0, 1, 0, 0]
    x_one_hot = torch.tensor([[0.0, 1.0, 0.0, 0.0]])
    
    y_test = model(x_one_hot)
    
    print(f"Input One-Hot: {x_one_hot.numpy()}")
    print(f"Output:        {y_test.detach().numpy()}")
    
    expected_value = 1.0 * 2.0 * 3.0 # = 6.0
    if torch.allclose(y_test[0, 1], torch.tensor(expected_value)):
        print(f">> [PASS] Value Scaling: Input 1.0 became {y_test[0,1].item()} (Expected 6.0)")
    else:
        print(f">> [FAIL] Value Scaling: Expected 6.0 but got {y_test[0,1].item()}")

    # ==========================================
    # TEST CASE 3: The "Permutation Logic" Check
    # ==========================================
    print("\n--- Test 3: Permutation Logic (Complex) ---")
    print("Goal: Ensure R acts on local blocks (neighbors) and L acts on distant blocks (strided).")
    
    # Reset model
    model = SharedMonarch(share_factor, rR, cR, rL, cL)
    
    # Configuration:
    # We want R to mix [0,1] and [2,3].
    # We want L to mix [0,2] and [1,3] (after permute).
    
    # Let's set R to be Identity (Pass through).
    manual_override_weights(model, val_r=1.0, val_l=1.0, method='identity')
    
    # Now CHANGE Layer L (The second one) manually.
    # L has 2 blocks. Block size is 2.
    # Block 0 of L acts on indices that WERE [0, 2] in original space (due to transpose).
    # Let's make Block 0 of L sum its inputs: Matrix [[1, 1], [1, 1]]
    with torch.no_grad():
        # L weights shape: (num_unique, block_r, block_c) -> (2, 2, 2)
        # Set Block 0 to all ones
        model.sama_L.weights[0] = torch.ones(2, 2) 
        # Set Block 1 to Identity (keep simple)
        nn.init.eye_(model.sama_L.weights[1])
        
    # Input: Activate index 0 and 2.
    # x = [1, 0, 1, 0]
    x_input = torch.tensor([[1.0, 0.0, 1.0, 0.0]])
    
    # Manual Trace Logic:
    # 1. x -> R (Identity) -> [1, 0, 1, 0]
    # 2. P (Permute):
    #    Original indices: 0, 1, 2, 3
    #    View (2,2): [[0,1], [2,3]]
    #    Transpose:  [[0,2], [1,3]]
    #    New Vector sequence: [val_at_0, val_at_2, val_at_1, val_at_3]
    #    Values: [1, 1, 0, 0]
    # 3. L (Block Diag):
    #    Vector is [1, 1] (Block 0 input) and [0, 0] (Block 1 input).
    #    Block 0 of L is "All Ones" matrix.
    #    Calculation: [1, 1] @ [[1,1],[1,1]].T = [2, 2]
    #    Block 1 of L is Identity.
    #    Calculation: [0, 0] @ I = [0, 0]
    #    Result after L: [2, 2, 0, 0] (This is in Permuted space!)
    # 4. P_T (Un-permute):
    #    Current View (2,2): [[2, 2], [0, 0]]
    #    Transpose back: [[2, 0], [2, 0]]
    #    Flatten: [2, 0, 2, 0]
    
    y_complex = model(x_input)
    
    print(f"Input:    {x_input.numpy()}")
    print(f"Expected: [2., 0., 2., 0.]")
    print(f"Actual:   {y_complex.detach().numpy()}")
    
    if torch.allclose(y_complex, torch.tensor([[2., 0., 2., 0.]])):
        print(">> [PASS] Permutation Mixing Logic is correct!")
    else:
        print(">> [FAIL] Permutation Mixing Logic is flawed.")

import matplotlib.pyplot as plt
import torch
import numpy as np

def visualize_monarch_structure():
    """
    Visualizes the effective weight matrices of the Monarch layers to prove
    the structural properties (Sparsity & Butterfly pattern).
    """
    
    
    # 1. Initialize Model
    rL, cL = 4, 4
    rR, cR = 4, 4
    share_factor = 1
    print(f"Visualizing Monarch Structure for N={cL*cR} (Grid {cL}x{cR})...")
    model = SharedMonarch(share_factor, rR, cR, rL, cL)
    
    # 2. Force weights to be random but significant (away from 0) for clear visualization
    with torch.no_grad():
        nn.init.uniform_(model.sama_R.weights, 0.5, 1.0)
        nn.init.uniform_(model.sama_L.weights, 0.5, 1.0)

    # 3. Create Identity Matrix to extract Effective Weights
    input_dim = cL * cR
    identity_matrix = torch.eye(input_dim)
    
    # 4. Extract Matrices
    with torch.no_grad():
        # A. Extract Matrix R (Right Layer Only)
        # We only run the first layer: x -> R
        w_r_transposed = model.sama_R(identity_matrix)
        w_r = w_r_transposed.T # Transpose back to standard (Out, In) format
        
        # B. Extract Total Effective Matrix (R -> P -> L -> P_T)
        w_total_transposed = model(identity_matrix)
        w_total = w_total_transposed.T

    # 5. Plotting
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    
    # Plot A: The Local Block Diagonal Matrix (Layer R)
    im1 = axes[0].imshow(w_r.abs().numpy(), cmap='Blues', interpolation='nearest')
    axes[0].set_title(f"Layer R (Block Diagonal)\nBlocks of size {cR}x{cR}")
    axes[0].set_xlabel("Input Index")
    axes[0].set_ylabel("Intermediate Index")
    
    # Plot B: The Full Monarch Matrix (Butterfly Structure)
    im2 = axes[1].imshow(w_total.abs().numpy(), cmap='Reds', interpolation='nearest')
    axes[1].set_title(f"Full Monarch (Butterfly)\nGlobal Mixing")
    axes[1].set_xlabel("Input Index")
    axes[1].set_ylabel("Output Index")

    plt.tight_layout()
    plt.savefig("monarch_viz1.png")
    print("Graph saved to 'monarch_viz.png'. Please check your file explorer.")
    plt.close()



def test_merge():
    model = SharedMonarch(share_factor=1, 
                          block_size_cR=3, block_size_rR=4, 
                          block_size_cL=4, block_size_rL=5)
    
    # Method 1 (Manual)
    with torch.no_grad():
        W_manual = model.get_delta_weight()

    # Method 2: Identity Trick (Forward)
    # Note: forward(I) returns là W^T. 
    I = torch.eye(12)
    with torch.no_grad():
        W_forward = model(I).T 
        
    print(f"Manual Shape: {W_manual.shape}")   # (20, 12)
    print(f"Forward Shape: {W_forward.shape}") # (20, 12)
    
    diff = (W_manual - W_forward).abs().max()
    print(f"Maximum Difference: {diff.item()}")
    
    if diff < 1e-6:
        print(">> Accurate!")

if __name__ == "__main__":
    # test_monarch_correctness()
    # visualize_monarch_structure()
    test_merge()