File size: 12,812 Bytes
26849c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
"""
Baseline attention mechanisms for comparison with H4 Polytopic Attention.

Implements standard softmax attention and linear attention (Katharopoulos et al. 2020)
with the SAME model wrapper (embeddings, FFN, LM head) so the only variable is attention.

Usage:
    model = BaselineLanguageModel(vocab_size=128, d_model=128, n_heads=8,
                                   n_layers=4, d_value=16, d_ffn=512,
                                   attention_type='softmax')  # or 'linear'
"""

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

import sys
import os
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from utils.phi_positional import PhiPositionalEncoding
from bitlinear import BitLinear


# ---------------------------------------------------------------------------
# Softmax Attention (standard transformer)
# ---------------------------------------------------------------------------

class SoftmaxAttention(nn.Module):
    """Causal multi-head attention in the usual softmax(Q K^T * scale) V form.

    Queries and keys get ``d_model // n_heads`` channels per head, values get
    ``d_value`` channels per head, and the concatenated heads are projected
    back to ``d_model``. Every projection is bias-free and is built from
    ``BitLinear`` when ``use_bitlinear`` is true, ``nn.Linear`` otherwise.
    Dropout (when ``dropout > 0``) is applied to the attention weights.
    """

    def __init__(self, d_model, n_heads, d_value, dropout=0.0, use_bitlinear=False):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_head = d_model // n_heads
        self.d_value = d_value
        # Standard 1/sqrt(d_head) scaling of the dot products.
        self.scale = 1.0 / math.sqrt(self.d_head)

        if use_bitlinear:
            proj_cls = BitLinear
        else:
            proj_cls = nn.Linear

        qk_width = self.d_head * n_heads
        v_width = d_value * n_heads

        # Creation order is kept fixed so parameter initialisation and
        # state-dict ordering do not change.
        self.W_q = proj_cls(d_model, qk_width, bias=False)
        self.W_k = proj_cls(d_model, qk_width, bias=False)
        self.W_v = proj_cls(d_model, v_width, bias=False)
        self.W_out = proj_cls(v_width, d_model, bias=False)

        if dropout > 0:
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = nn.Identity()

    def forward(self, x, **kwargs):
        """Attend over ``x`` of shape (batch, seq, d_model); extra kwargs are ignored.

        Returns a tensor of shape (batch, seq, d_model).
        """
        batch, seq_len, _ = x.shape
        heads = self.n_heads

        def split_heads(t, width):
            # (batch, seq, heads * width) -> (batch, heads, seq, width)
            return t.view(batch, seq_len, heads, width).transpose(1, 2)

        q = split_heads(self.W_q(x), self.d_head)
        k = split_heads(self.W_k(x), self.d_head)
        v = split_heads(self.W_v(x), self.d_value)

        # Pairwise query/key similarities: (batch, heads, seq, seq).
        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale

        # True strictly above the diagonal, i.e. at positions in the future.
        future = torch.triu(
            torch.ones(seq_len, seq_len, device=x.device, dtype=torch.bool),
            diagonal=1,
        )
        scores.masked_fill_(future[None, None, :, :], float('-inf'))

        weights = self.dropout(F.softmax(scores, dim=-1))

        context = torch.matmul(weights, v)  # (batch, heads, seq, d_value)
        merged = context.transpose(1, 2).contiguous().view(
            batch, seq_len, heads * self.d_value
        )
        return self.W_out(merged)


# ---------------------------------------------------------------------------
# Linear Attention (Katharopoulos et al. 2020)
# ---------------------------------------------------------------------------

def elu_feature_map(x):
    """Return ``elu(x) + 1``, the kernel feature map used by linear attention.

    Shifting ELU up by one keeps every feature non-negative, which the
    running-sum normaliser of linear attention relies on.
    """
    shifted = F.elu(x)
    return shifted + 1.0


class LinearAttention(nn.Module):
    """
    Causal linear attention (kernel trick with the ELU+1 feature map).

    Softmax attention softmax(QK^T)V is replaced by phi(Q) @ (phi(K)^T @ V),
    with phi = ``elu_feature_map``. Causality comes from prefix sums over
    the time axis:

        S_t = sum_{i<=t} phi(K_i)^T V_i        (running key/value state)
        z_t = sum_{i<=t} phi(K_i)              (running normaliser)
        out_t = (phi(Q_t) @ S_t) / (phi(Q_t) @ z_t)

    Projections are bias-free and use ``BitLinear`` when ``use_bitlinear``
    is true, ``nn.Linear`` otherwise. Dropout (when ``dropout > 0``) is
    applied to the merged head outputs before the output projection.
    """

    def __init__(self, d_model, n_heads, d_value, dropout=0.0, use_bitlinear=False):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_head = d_model // n_heads
        self.d_value = d_value

        if use_bitlinear:
            proj_cls = BitLinear
        else:
            proj_cls = nn.Linear

        qk_width = self.d_head * n_heads
        v_width = d_value * n_heads

        # Creation order is kept fixed so parameter initialisation and
        # state-dict ordering do not change.
        self.W_q = proj_cls(d_model, qk_width, bias=False)
        self.W_k = proj_cls(d_model, qk_width, bias=False)
        self.W_v = proj_cls(d_model, v_width, bias=False)
        self.W_out = proj_cls(v_width, d_model, bias=False)

        if dropout > 0:
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = nn.Identity()

    def forward(self, x, **kwargs):
        """Attend over ``x`` of shape (batch, seq, d_model); extra kwargs are ignored.

        Returns a tensor of shape (batch, seq, d_model).
        """
        batch, seq_len, _ = x.shape
        heads = self.n_heads

        # Project, split into heads, apply phi to queries/keys, then move
        # the head axis forward: (batch, heads, seq, width).
        phi_q = elu_feature_map(
            self.W_q(x).view(batch, seq_len, heads, self.d_head)
        ).permute(0, 2, 1, 3)
        phi_k = elu_feature_map(
            self.W_k(x).view(batch, seq_len, heads, self.d_head)
        ).permute(0, 2, 1, 3)
        values = self.W_v(x).view(batch, seq_len, heads, self.d_value).permute(0, 2, 1, 3)

        # Per-step outer products phi(K_t) V_t^T, prefix-summed over time:
        # (batch, heads, seq, d_head, d_value).
        kv_state = torch.cumsum(
            torch.einsum('bhti,bhtj->bhtij', phi_k, values), dim=2
        )
        # Prefix sum of phi(K): (batch, heads, seq, d_head).
        k_state = torch.cumsum(phi_k, dim=2)

        numerator = torch.einsum('bhti,bhtij->bhtj', phi_q, kv_state)
        # Clamp keeps the division finite if the normaliser gets tiny.
        denominator = (
            torch.einsum('bhti,bhti->bht', phi_q, k_state)
            .unsqueeze(-1)
            .clamp(min=1e-6)
        )

        per_head = numerator / denominator  # (batch, heads, seq, d_value)
        merged = per_head.permute(0, 2, 1, 3).contiguous().view(
            batch, seq_len, heads * self.d_value
        )
        return self.W_out(self.dropout(merged))


# ---------------------------------------------------------------------------
# Transformer Block (swappable attention)
# ---------------------------------------------------------------------------

class SoftmaxTransformerBlock(nn.Module):
    """Pre-norm residual block: softmax attention followed by a GELU FFN.

    Each sub-layer sees a LayerNorm-ed copy of the stream and its output is
    added back to the un-normalised stream. ``d_ffn`` defaults to
    ``4 * d_model``. The FFN layers are bias-free and use ``BitLinear`` when
    ``use_bitlinear`` is true, ``nn.Linear`` otherwise.
    """

    def __init__(self, d_model, n_heads, d_value, d_ffn=None, dropout=0.0,
                 use_bitlinear=False):
        super().__init__()
        hidden = d_model * 4 if d_ffn is None else d_ffn
        if use_bitlinear:
            proj_cls = BitLinear
        else:
            proj_cls = nn.Linear

        self.ln1 = nn.LayerNorm(d_model)
        self.attn = SoftmaxAttention(d_model, n_heads, d_value, dropout, use_bitlinear)
        self.ln2 = nn.LayerNorm(d_model)

        # Expand -> GELU -> contract, with an optional trailing dropout.
        ffn_layers = [
            proj_cls(d_model, hidden, bias=False),
            nn.GELU(),
            proj_cls(hidden, d_model, bias=False),
        ]
        if dropout > 0:
            ffn_layers.append(nn.Dropout(dropout))
        else:
            ffn_layers.append(nn.Identity())
        self.ffn = nn.Sequential(*ffn_layers)

    def forward(self, x, **kwargs):
        """Apply both residual sub-layers to ``x``; extra kwargs are ignored."""
        attended = self.attn(self.ln1(x))
        x = x + attended
        transformed = self.ffn(self.ln2(x))
        return x + transformed


class LinearTransformerBlock(nn.Module):
    """Pre-norm residual block: linear attention followed by a GELU FFN.

    Each sub-layer sees a LayerNorm-ed copy of the stream and its output is
    added back to the un-normalised stream. ``d_ffn`` defaults to
    ``4 * d_model``. The FFN layers are bias-free and use ``BitLinear`` when
    ``use_bitlinear`` is true, ``nn.Linear`` otherwise.
    """

    def __init__(self, d_model, n_heads, d_value, d_ffn=None, dropout=0.0,
                 use_bitlinear=False):
        super().__init__()
        hidden = d_model * 4 if d_ffn is None else d_ffn
        if use_bitlinear:
            proj_cls = BitLinear
        else:
            proj_cls = nn.Linear

        self.ln1 = nn.LayerNorm(d_model)
        self.attn = LinearAttention(d_model, n_heads, d_value, dropout, use_bitlinear)
        self.ln2 = nn.LayerNorm(d_model)

        # Expand -> GELU -> contract, with an optional trailing dropout.
        ffn_layers = [
            proj_cls(d_model, hidden, bias=False),
            nn.GELU(),
            proj_cls(hidden, d_model, bias=False),
        ]
        if dropout > 0:
            ffn_layers.append(nn.Dropout(dropout))
        else:
            ffn_layers.append(nn.Identity())
        self.ffn = nn.Sequential(*ffn_layers)

    def forward(self, x, **kwargs):
        """Apply both residual sub-layers to ``x``; extra kwargs are ignored."""
        attended = self.attn(self.ln1(x))
        x = x + attended
        transformed = self.ffn(self.ln2(x))
        return x + transformed


# ---------------------------------------------------------------------------
# Baseline Language Model
# ---------------------------------------------------------------------------

class BaselineLanguageModel(nn.Module):
    """
    Language model with swappable attention mechanism.

    Intended to mirror H4LanguageModel (same embeddings, FFN, LM head) but
    with standard softmax or linear attention instead of H4 geometric
    attention, so that the attention mechanism is the only variable in
    comparisons.

    Args:
        vocab_size: vocabulary size
        d_model: model dimension
        n_heads: number of attention heads
        n_layers: number of transformer blocks
        d_value: value dimension per head
        d_ffn: FFN hidden dimension (default: 4 * d_model)
        max_seq_len: forwarded to PhiPositionalEncoding as ``max_cached``
        dropout: dropout rate (embedding dropout and inside each block)
        attention_type: 'softmax' or 'linear'
        use_bitlinear: if True, blocks are built from BitLinear layers

    Raises:
        ValueError: if ``attention_type`` is neither 'softmax' nor 'linear'.
    """

    def __init__(
        self,
        vocab_size,
        d_model=128,
        n_heads=8,
        n_layers=4,
        d_value=16,
        d_ffn=None,
        max_seq_len=512,
        dropout=0.0,
        attention_type='softmax',
        use_bitlinear=False,
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_layers = n_layers
        self.attention_type = attention_type

        if d_ffn is None:
            d_ffn = d_model * 4

        # Token embedding (always float); scaled by sqrt(d_model) in forward().
        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.emb_scale = math.sqrt(d_model)

        # Same golden-angle positional encoding as H4LanguageModel
        self.pos_enc = PhiPositionalEncoding(d_model, max_cached=max_seq_len)

        self.emb_dropout = nn.Dropout(dropout)

        # Transformer blocks with selected attention type
        if attention_type == 'softmax':
            BlockClass = SoftmaxTransformerBlock
        elif attention_type == 'linear':
            BlockClass = LinearTransformerBlock
        else:
            raise ValueError(f"Unknown attention_type: {attention_type}")

        self.blocks = nn.ModuleList([
            BlockClass(
                d_model=d_model,
                n_heads=n_heads,
                d_value=d_value,
                d_ffn=d_ffn,
                dropout=dropout,
                use_bitlinear=use_bitlinear,
            )
            for _ in range(n_layers)
        ])

        # Final layer norm
        self.ln_f = nn.LayerNorm(d_model)

        # LM head shares its weight tensor with the token embedding.
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False)
        self.lm_head.weight = self.token_emb.weight

        self._init_weights()

    def _init_weights(self):
        """Initialize weights following GPT-2 conventions.

        Linear, BitLinear and embedding weights are drawn from N(0, 0.02^2);
        nn.Linear biases are zeroed. BitLinear is tested first, so its
        branch (which leaves any bias untouched) takes precedence.
        """
        for module in self.modules():
            if isinstance(module, BitLinear):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            elif isinstance(module, nn.Linear):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
                if module.bias is not None:
                    torch.nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input_ids, **kwargs):
        """
        Args:
            input_ids: (batch, seq_len) token indices
            **kwargs: accepted and ignored
        Returns:
            logits: (batch, seq_len, vocab_size)
        """
        B, T = input_ids.shape

        tok_emb = self.token_emb(input_ids) * self.emb_scale
        # Positional table for the first T positions, broadcast over batch.
        pos_emb = self.pos_enc(T).unsqueeze(0).to(tok_emb.device)
        x = self.emb_dropout(tok_emb + pos_emb)

        for block in self.blocks:
            x = block(x)

        x = self.ln_f(x)
        logits = self.lm_head(x)
        return logits

    def count_params(self):
        """Count trainable and frozen parameters.

        Returns:
            dict with 'trainable', 'frozen', 'buffers' and 'total' element
            counts; 'total' is trainable + frozen and excludes buffers.
        """
        trainable = sum(p.numel() for p in self.parameters() if p.requires_grad)
        frozen = sum(p.numel() for p in self.parameters() if not p.requires_grad)
        buffers = sum(b.numel() for b in self.buffers())
        return {
            'trainable': trainable,
            'frozen': frozen,
            'buffers': buffers,
            'total': trainable + frozen,
        }

    @torch.no_grad()
    def generate(self, input_ids, max_new_tokens=100, temperature=1.0, top_k_sample=0):
        """Autoregressive generation.

        The full sequence is re-run through the model at every step (no
        cache) and is never truncated. This method does not change the
        train/eval mode; call ``model.eval()`` first if dropout should be
        disabled.
        NOTE(review): whether PhiPositionalEncoding supports sequences
        longer than ``max_seq_len`` is not visible here -- confirm before
        generating past that length.

        Args:
            input_ids: (batch, seq_len) prompt token indices
            max_new_tokens: number of tokens to append
            temperature: divisor applied to the last-step logits before
                sampling; ``0`` selects greedy (argmax) decoding
            top_k_sample: if > 0, sample only among tokens whose logit is at
                least the k-th largest; ignored for greedy decoding
        Returns:
            (batch, seq_len + max_new_tokens) tensor: prompt plus new tokens
        """
        for _ in range(max_new_tokens):
            last_logits = self.forward(input_ids)[:, -1, :]

            if temperature == 0:
                # Dividing by zero would yield inf/NaN logits and make
                # torch.multinomial raise; take the limit (argmax) instead.
                next_id = last_logits.argmax(dim=-1, keepdim=True)
            else:
                logits = last_logits / temperature

                if top_k_sample > 0:
                    v, _ = torch.topk(logits, min(top_k_sample, logits.size(-1)))
                    # Drop everything below the k-th largest logit per row.
                    logits[logits < v[:, [-1]]] = float('-inf')

                probs = F.softmax(logits, dim=-1)
                next_id = torch.multinomial(probs, num_samples=1)

            input_ids = torch.cat([input_ids, next_id], dim=1)

        return input_ids