ThomasTheMaker committed on
Commit
0a38bb7
·
verified ·
1 Parent(s): 3b448ed

Delete pico-decoder-tiny-max-vram

Browse files
Files changed (27) hide show
  1. pico-decoder-tiny-max-vram/checkpoints/step_0/config.json +0 -22
  2. pico-decoder-tiny-max-vram/checkpoints/step_0/fabric_state/checkpoint.pt +0 -3
  3. pico-decoder-tiny-max-vram/checkpoints/step_0/learning_dynamics/train_activations.pt +0 -3
  4. pico-decoder-tiny-max-vram/checkpoints/step_0/learning_dynamics/train_data/data-00000-of-00001.arrow +0 -3
  5. pico-decoder-tiny-max-vram/checkpoints/step_0/learning_dynamics/train_data/dataset_info.json +0 -19
  6. pico-decoder-tiny-max-vram/checkpoints/step_0/learning_dynamics/train_data/state.json +0 -13
  7. pico-decoder-tiny-max-vram/checkpoints/step_0/learning_dynamics/train_gradients.pt +0 -3
  8. pico-decoder-tiny-max-vram/checkpoints/step_0/learning_dynamics/train_weights.pt +0 -3
  9. pico-decoder-tiny-max-vram/checkpoints/step_0/model.safetensors +0 -3
  10. pico-decoder-tiny-max-vram/checkpoints/step_0/pico_decoder.py +0 -608
  11. pico-decoder-tiny-max-vram/checkpoints/step_0/special_tokens_map.json +0 -16
  12. pico-decoder-tiny-max-vram/checkpoints/step_0/tokenizer.json +0 -0
  13. pico-decoder-tiny-max-vram/checkpoints/step_0/tokenizer_config.json +0 -239
  14. pico-decoder-tiny-max-vram/checkpoints/step_27/config.json +0 -22
  15. pico-decoder-tiny-max-vram/checkpoints/step_27/fabric_state/checkpoint.pt +0 -3
  16. pico-decoder-tiny-max-vram/checkpoints/step_27/model.safetensors +0 -3
  17. pico-decoder-tiny-max-vram/checkpoints/step_27/pico_decoder.py +0 -608
  18. pico-decoder-tiny-max-vram/checkpoints/step_27/special_tokens_map.json +0 -16
  19. pico-decoder-tiny-max-vram/checkpoints/step_27/tokenizer.json +0 -0
  20. pico-decoder-tiny-max-vram/checkpoints/step_27/tokenizer_config.json +0 -239
  21. pico-decoder-tiny-max-vram/eval_results/step_0.json +0 -1
  22. pico-decoder-tiny-max-vram/eval_results/step_27.json +0 -1
  23. pico-decoder-tiny-max-vram/logs/log_20250828_210412.log +0 -0
  24. pico-decoder-tiny-max-vram/logs/log_20250828_210457.log +0 -108
  25. pico-decoder-tiny-max-vram/logs/log_20250828_210719.log +0 -108
  26. pico-decoder-tiny-max-vram/logs/log_20250828_210922.log +0 -113
  27. pico-decoder-tiny-max-vram/training_config.yaml +0 -74
pico-decoder-tiny-max-vram/checkpoints/step_0/config.json DELETED
@@ -1,22 +0,0 @@
1
- {
2
- "activation_hidden_dim": 384,
3
- "architectures": [
4
- "PicoDecoderHF"
5
- ],
6
- "attention_n_heads": 12,
7
- "attention_n_kv_heads": 4,
8
- "auto_map": {
9
- "AutoConfig": "pico_decoder.PicoDecoderHFConfig",
10
- "AutoModelForCausalLM": "pico_decoder.PicoDecoderHF"
11
- },
12
- "batch_size": 1024,
13
- "d_model": 96,
14
- "max_seq_len": 2048,
15
- "model_type": "pico_decoder",
16
- "n_layers": 12,
17
- "norm_eps": 1e-06,
18
- "position_emb_theta": 10000.0,
19
- "torch_dtype": "float32",
20
- "transformers_version": "4.48.3",
21
- "vocab_size": 50304
22
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pico-decoder-tiny-max-vram/checkpoints/step_0/fabric_state/checkpoint.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b50a50fd67e7a1dfa214a074549428c03047ccc26357734db80084015a538b90
3
- size 45187997
 
 
 
 
pico-decoder-tiny-max-vram/checkpoints/step_0/learning_dynamics/train_activations.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6e6e181e18c36507d7cb053f37008011d6846e06f9e345baf9d0663fb288d53
3
- size 1388635
 
 
 
 
pico-decoder-tiny-max-vram/checkpoints/step_0/learning_dynamics/train_data/data-00000-of-00001.arrow DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8cf7fcdfd88a10fcfc5c173847b1b6f8926953cc585a896149edefcde9308ba8
3
- size 4121312
 
 
 
 
pico-decoder-tiny-max-vram/checkpoints/step_0/learning_dynamics/train_data/dataset_info.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "citation": "",
3
- "description": "",
4
- "features": {
5
- "input_ids": {
6
- "feature": {
7
- "dtype": "int32",
8
- "_type": "Value"
9
- },
10
- "_type": "Sequence"
11
- },
12
- "text": {
13
- "dtype": "string",
14
- "_type": "Value"
15
- }
16
- },
17
- "homepage": "",
18
- "license": ""
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pico-decoder-tiny-max-vram/checkpoints/step_0/learning_dynamics/train_data/state.json DELETED
@@ -1,13 +0,0 @@
1
- {
2
- "_data_files": [
3
- {
4
- "filename": "data-00000-of-00001.arrow"
5
- }
6
- ],
7
- "_fingerprint": "6848e6167d9ecc18",
8
- "_format_columns": null,
9
- "_format_kwargs": {},
10
- "_format_type": null,
11
- "_output_all_columns": false,
12
- "_split": null
13
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pico-decoder-tiny-max-vram/checkpoints/step_0/learning_dynamics/train_gradients.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:27e73f3bd443e10701a6786ae83543453f0ffe514be04040edb55b9ff158895d
3
- size 2371527
 
 
 
 
pico-decoder-tiny-max-vram/checkpoints/step_0/learning_dynamics/train_weights.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c029ef92a6494ae121c847e432e52e6a8ff3bf7d9fef3e61bef871c1e9a9aa02
3
- size 2371443
 
 
 
 
pico-decoder-tiny-max-vram/checkpoints/step_0/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1852515eb5c8556533445f22edf523884b9f8cc44812379a6a951668a4ffa3a3
3
- size 45143592
 
 
 
 
pico-decoder-tiny-max-vram/checkpoints/step_0/pico_decoder.py DELETED
@@ -1,608 +0,0 @@
1
- """
2
- Pico Decoder: A Lightweight Causal Transformer Language Model
3
-
4
- Pico Decoder uses a simple LLAMA-style transformer architecture, written for clarity and educational purposes.
5
-
6
- Everything is written with a modular design for easy modification and experimentation.
7
-
8
- Key features:
9
- - RMSNorm for layer normalization
10
- - Rotary Positional Embeddings (RoPE)
11
- - Multi-head attention with KV-cache support
12
- - SwiGLU activation function
13
- - Residual connections throughout
14
-
15
- - KV-cache for faster autoregressive generation
16
-
17
- References:
18
- - RoPE: https://arxiv.org/abs/2104.09864
19
- - SwiGLU: https://arxiv.org/abs/2002.05202
20
- - LLAMA: https://arxiv.org/abs/2302.13971
21
-
22
- Adapted from:
23
- - OLMO: https://github.com/allenai/OLMo
24
- - LLAMA: https://github.com/meta/llama
25
- """
26
-
27
- from dataclasses import asdict
28
- from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
29
-
30
- import torch
31
- import torch.nn as nn
32
- import torch.nn.functional as F
33
- from torch.nn.attention import SDPBackend, sdpa_kernel
34
- from transformers import PretrainedConfig, PreTrainedModel
35
- from transformers.modeling_outputs import CausalLMOutput, CausalLMOutputWithPast
36
-
37
- try:
38
- if TYPE_CHECKING:
39
- # We need to do this to avoid importing these when creating the HF-compatible models
40
- from src.config import ModelConfig
41
- except ImportError:
42
- pass
43
-
44
- ########################################################
45
- #
46
- # Layer Normalization
47
- #
48
- ########################################################
49
-
50
-
51
class RMSNorm(torch.nn.Module):
    """Root-mean-square layer normalization.

    Rescales every vector along the last dimension by the reciprocal of its
    root mean square and then multiplies by a learned per-feature gain.

    Args:
        config (Union[ModelConfig, PicoDecoderHFConfig]): Configuration providing
            - config.norm_eps: constant added under the square root for numerical stability
            - config.d_model: length of the learned gain vector

    References:
        https://arxiv.org/abs/1910.07467
    """

    def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
        super().__init__()
        self.eps = config.norm_eps
        # Gain starts at one, so the freshly built layer is a pure normalization.
        self.weight = nn.Parameter(torch.ones(config.d_model))

    def _norm(self, x: torch.Tensor) -> torch.Tensor:
        """Divide ``x`` by its RMS taken over the last dimension."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.eps)
        return x * inv_rms

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize in float32, cast back to the input dtype, then apply the gain."""
        normalized = self._norm(x.float())
        return normalized.type_as(x) * self.weight
83
-
84
-
85
- ########################################################
86
- #
87
- # Positional Embedding
88
- #
89
- ########################################################
90
-
91
-
92
class RoPE(nn.Module):
    """Rotary Positional Embeddings (RoPE).

    Rotates query and key vectors by a position-dependent angle using complex
    multiplication, so that attention scores depend on relative positions.

    Args:
        config (Union[ModelConfig, PicoDecoderHFConfig]): Model configuration containing:
            - config.position_emb_theta: Base for frequency computation
            - config.d_model: Model dimension
            - config.attention_n_heads: Number of attention heads
            - config.max_seq_len: Maximum sequence length

    References:
        https://arxiv.org/abs/2104.09864
    """

    # Process-wide cache of the complex rotation table, shared by all RoPE
    # instances, together with the (max_seq_len, theta, head_dim) it was built
    # for. The key lets a model with a different configuration rebuild the
    # table instead of silently reusing one with the wrong shape/frequencies.
    _freqs_cis_tensor: Optional[torch.Tensor] = None
    _freqs_cis_key: Optional[Tuple[int, float, int]] = None

    def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
        super().__init__()

        self.theta = config.position_emb_theta
        self.dim = config.d_model // config.attention_n_heads

        max_seq_len = config.max_seq_len

        # Build the table once per distinct configuration; layers of the same
        # model hit the cache. Resetting `_freqs_cis_tensor` to None still
        # forces a rebuild.
        cache_key = (max_seq_len, self.theta, self.dim)
        if RoPE._freqs_cis_tensor is None or RoPE._freqs_cis_key != cache_key:
            RoPE._freqs_cis_tensor = self._setup_freqs_cis(
                max_seq_len, self.theta, self.dim
            )
            RoPE._freqs_cis_key = cache_key

        # The table can be recomputed cheaply, so keep it out of the state dict.
        self.register_buffer("_freqs_cis", RoPE._freqs_cis_tensor, persistent=False)

    @classmethod
    def _setup_freqs_cis(cls, seq_len: int, theta: float, dim: int) -> torch.Tensor:
        """Build the complex frequency table used to rotate queries and keys.

        Entry ``[t, j]`` is ``e^(i * t * f_j) = cos(t * f_j) + i * sin(t * f_j)``
        with ``f_j = theta ** (-2j / dim)``.

        Returns:
            complex64 tensor of shape ``(seq_len, dim // 2)``.
        """
        _freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
        positions = torch.arange(seq_len)
        freqs = torch.outer(positions, _freqs)
        return torch.polar(torch.ones_like(freqs), freqs)  # complex64

    def get_freqs_cis(
        self, input_shape: torch.Size, start_pos: int, end_pos: int
    ) -> torch.Tensor:
        """Slice the frequency table and reshape it to broadcast against the input.

        Args:
            input_shape: Shape of the complex-viewed queries/keys; dimension 1 is
                the sequence axis and the last dimension is ``head_dim // 2``.
            start_pos: First absolute position (inclusive).
            end_pos: Last absolute position (exclusive).

        Returns:
            The table rows ``start_pos:end_pos`` viewed with size-1 axes everywhere
            except the sequence axis and the last axis.
        """
        _freqs_cis = self._freqs_cis[start_pos:end_pos]
        ndim = len(input_shape)
        assert ndim > 1, "RoPE input must have a sequence dimension at index 1"
        # A mismatch here typically means end_pos exceeds the table length
        # (max_seq_len), which silently shortens the slice above.
        assert _freqs_cis.shape == (input_shape[1], input_shape[-1]), (
            f"RoPE table slice [{start_pos}:{end_pos}] has shape "
            f"{tuple(_freqs_cis.shape)}, expected "
            f"{(input_shape[1], input_shape[-1])}"
        )

        shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(input_shape)]
        return _freqs_cis.view(*shape)

    def forward(
        self,
        queries: torch.Tensor,
        keys: torch.Tensor,
        start_pos: int = 0,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply rotary embeddings to queries and keys via complex multiplication.

        Args:
            queries: Tensor whose dimension 1 is the sequence axis and whose last
                dimension is the (even) head dimension.
            keys: Tensor laid out like ``queries``.
            start_pos: Absolute position of the first token; non-zero when earlier
                tokens are already held in a KV cache.

        Returns:
            ``(rotated_queries, rotated_keys)`` with the shapes and dtypes of the inputs.
        """
        # Pair up adjacent features and view each pair as one complex number.
        queries_ = torch.view_as_complex(
            queries.float().reshape(*queries.shape[:-1], -1, 2)
        )
        keys_ = torch.view_as_complex(keys.float().reshape(*keys.shape[:-1], -1, 2))

        input_shape = queries_.shape
        end_pos = start_pos + queries_.shape[1]

        freqs_cis = self.get_freqs_cis(input_shape, start_pos, end_pos)

        queries_rotated = torch.view_as_real(queries_ * freqs_cis).flatten(3)
        keys_rotated = torch.view_as_real(keys_ * freqs_cis).flatten(3)
        return queries_rotated.type_as(queries), keys_rotated.type_as(keys)
190
-
191
-
192
- ########################################################
193
- #
194
- # Attention
195
- #
196
- ########################################################
197
-
198
-
199
class Attention(nn.Module):
    """Multi-head Attention with Grouped Query Attention support.

    Implements scaled dot-product attention and supports:
    - Grouped Query Attention (GQA)
    - Key-Value caching for efficient inference
    - RoPE integration

    Args:
        config (Union[ModelConfig, PicoDecoderHFConfig]): Configuration containing:
            - config.attention_n_heads: Number of attention heads
            - config.attention_n_kv_heads: Number of key/value heads
            - config.d_model: Model dimension
            - config.batch_size: Maximum batch size
            - config.max_seq_len: Maximum sequence length

    Shape:
        - Input: (batch_size, seq_len, d_model)
        - Output: (batch_size, seq_len, d_model)
    """

    def __init__(
        self,
        config: Union["ModelConfig", "PicoDecoderHFConfig"],
    ):
        super().__init__()

        self.n_heads = config.attention_n_heads
        self.n_kv_heads = config.attention_n_kv_heads

        self.batch_size = config.batch_size
        self.max_seq_len = config.max_seq_len

        d_model = config.d_model
        self.head_dim = d_model // self.n_heads

        # Number of query heads that share each key/value head.
        self.n_rep = self.n_heads // self.n_kv_heads

        self.q_proj = nn.Linear(d_model, self.n_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(d_model, self.n_kv_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(d_model, self.n_kv_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.n_heads * self.head_dim, d_model, bias=False)

        self.rope = RoPE(config)

    def forward(
        self,
        input: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[torch.Tensor, ...]] = None,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
        """Project to queries/keys/values, apply RoPE, and run attention.

        Args:
            input: Tensor of shape (batch_size, seq_len, d_model).
            mask: Optional additive attention mask passed to the SDPA kernel;
                ``None`` means no masking.
            past_key_values: Optional ``(keys, values)`` cached from earlier
                forward passes for this layer, with the sequence on dimension 1.
            use_cache: Whether to return the updated keys and values.

        Returns:
            ``(output, (cached_keys, cached_values))``. The second element is
            always a 2-tuple; both entries are ``None`` when ``use_cache`` is False.
        """
        bsz, seq_len, _ = input.shape
        _queries, _keys, _values = (
            self.q_proj(input),
            self.k_proj(input),
            self.v_proj(input),
        )

        # Split the projected features into heads.
        queries = _queries.view(bsz, seq_len, self.n_heads, self.head_dim)
        keys = _keys.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
        values = _values.view(bsz, seq_len, self.n_kv_heads, self.head_dim)

        # New tokens start right after whatever is already cached, so RoPE is
        # applied with that offset and only to the new tokens.
        start_pos = past_key_values[0].shape[1] if past_key_values is not None else 0

        queries, keys = self.rope(queries, keys, start_pos)

        if past_key_values is not None:
            keys = torch.cat([past_key_values[0], keys], dim=1)
            values = torch.cat([past_key_values[1], values], dim=1)

        if use_cache:
            cached_keys = keys
            cached_values = values
        else:
            cached_keys = None
            cached_values = None

        # (batch, seq, heads, head_dim) -> (batch, heads, seq, head_dim)
        queries = queries.transpose(1, 2)
        keys = keys.transpose(1, 2)
        values = values.transpose(1, 2)

        apply_gqa = self.n_rep > 1
        if apply_gqa and queries.device.type == "mps":
            # NOTE: MPS does not support GQA in the SDPA kernel, but we can repeat the keys and values
            # outside of the kernel to get the same effect.
            # See: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
            keys = keys.repeat_interleave(self.n_rep, dim=-3)
            values = values.repeat_interleave(self.n_rep, dim=-3)
            apply_gqa = False

        backends = [SDPBackend.CUDNN_ATTENTION, SDPBackend.MATH]

        # `mask` is optional (and is None for single-token steps); only cast it
        # when present instead of calling `.to` on None.
        attn_mask = mask.to(queries.dtype) if mask is not None else None

        with sdpa_kernel(backends=backends):
            attn_output = F.scaled_dot_product_attention(
                queries.contiguous(),
                keys.contiguous(),
                values.contiguous(),
                attn_mask=attn_mask,
                enable_gqa=apply_gqa,
            )

        attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, seq_len, -1)
        output = self.o_proj(attn_output)

        return output, (cached_keys, cached_values)
325
-
326
-
327
- ########################################################
328
- #
329
- # SwiGLU (Combines MLP and Activation)
330
- #
331
- ########################################################
332
-
333
-
334
class SwiGLU(nn.Module):
    """Feed-forward network built around the SwiGLU gated activation.

    Computes ``w_2(silu(w_0(x)) * w_1(x))`` using three bias-free linear layers.

    Args:
        config (Union[ModelConfig, PicoDecoderHFConfig]): Configuration containing:
            - config.d_model: Model dimension
            - config.activation_hidden_dim: Hidden dimension of the gated projection

    References:
        https://arxiv.org/abs/2002.05202
    """

    def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
        super().__init__()

        d_in = config.d_model
        d_hidden = config.activation_hidden_dim

        # w_0 feeds the SiLU gate, w_1 is the linear branch, w_2 projects back.
        self.w_0 = nn.Linear(d_in, d_hidden, bias=False)
        self.w_1 = nn.Linear(d_in, d_hidden, bias=False)
        self.w_2 = nn.Linear(d_hidden, d_in, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the gated feed-forward transform to ``x``."""
        gate = F.silu(self.w_0(x))
        linear_branch = self.w_1(x)
        return self.w_2(gate * linear_branch)
361
-
362
-
363
- ########################################################
364
- #
365
- # PicoDecoderBlock
366
- #
367
- ########################################################
368
-
369
-
370
class PicoDecoderBlock(nn.Module):
    """One pre-norm transformer block: attention followed by a SwiGLU feed-forward.

    Each sub-layer is applied to an RMS-normalized copy of its input and the
    result is added back to the un-normalized input (residual connection).

    Args:
        config (Union[ModelConfig, PicoDecoderHFConfig]): Model configuration; either a dataclass or
            a HuggingFace PicoDecoderHFConfig
    """

    def __init__(
        self,
        config: Union["ModelConfig", "PicoDecoderHFConfig"],
    ):
        super().__init__()

        self.attention = Attention(config)
        self.swiglu = SwiGLU(config)
        self.attention_norm = RMSNorm(config)
        self.swiglu_norm = RMSNorm(config)

    def forward(
        self,
        input: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[torch.Tensor]] = None,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
        """Run the block and return ``(hidden_states, cached_key_values)``.

        ``cached_key_values`` is whatever the attention layer returns as its
        second output and is passed through unchanged.
        """
        # Attention sub-layer with residual connection.
        normed_input = self.attention_norm(input)
        attention_output, cached_key_values = self.attention(
            normed_input,
            mask=mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
        )
        residual = input + attention_output

        # Feed-forward sub-layer with residual connection.
        feed_forward_output = self.swiglu(self.swiglu_norm(residual))
        return residual + feed_forward_output, cached_key_values
411
-
412
-
413
- ########################################################
414
- #
415
- # Pico Decoder (Causal Transformer Model)
416
- #
417
- ########################################################
418
-
419
-
420
class PicoDecoder(nn.Module):
    """
    Pico Decoder: combines the embedding, causal decoder blocks, and output projection into a
    single autoregressive model.

    For more information on the model, see the classes for the modules that make up the model.
    """

    def __init__(
        self,
        model_config: Union["ModelConfig", "PicoDecoderHFConfig"],
    ):
        super().__init__()
        self.config = model_config

        self.embedding_proj = nn.Embedding(self.config.vocab_size, self.config.d_model)
        self.layers = nn.ModuleList(
            [PicoDecoderBlock(self.config) for _ in range(self.config.n_layers)]
        )
        self.output_norm = RMSNorm(self.config)
        self.de_embedding_proj = nn.Linear(
            self.config.d_model, self.config.vocab_size, bias=False
        )

    def convert_to_hf_model(self) -> "PicoDecoderHF":
        """Wrap this model's weights in a HuggingFace-compatible ``PicoDecoderHF``."""
        hf_config = PicoDecoderHFConfig.from_dataclass(self.config)
        hf_model = PicoDecoderHF(hf_config)

        # The HF wrapper stores this module under the attribute `pico_decoder`,
        # so the parameter names need that prefix.
        hf_model.load_state_dict(self.state_dict(prefix="pico_decoder."))

        return hf_model

    def forward(
        self,
        input_ids: torch.Tensor,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[Tuple[Tuple[torch.Tensor, torch.Tensor]]]]:
        """Embed the tokens, run every decoder block, and project to logits.

        Args:
            input_ids: Token ids; the last dimension is the sequence axis.
            past_key_values: Optional per-layer KV cache, a tuple with one
                ``(keys, values)`` pair per layer, as returned by an earlier call
                with ``use_cache=True``.
            use_cache: Whether to return the updated per-layer KV cache.

        Returns:
            ``(logits, cached_key_values)`` where ``logits`` is float32 and
            ``cached_key_values`` is a tuple with one entry per layer, or ``None``
            when ``use_cache`` is False.
        """
        seq_len = input_ids.shape[-1]
        h = self.embedding_proj(input_ids)

        # Number of tokens already cached; every layer caches the same length,
        # so layer 0's keys are representative.
        start_pos = 0 if past_key_values is None else past_key_values[0][0].shape[1]

        # Additive causal mask for the new tokens, created directly on the
        # model's device. It is built for single-token steps as well (where it
        # is all zeros) so the attention layers always receive a tensor.
        mask = torch.full((seq_len, seq_len), float("-inf"), device=h.device)
        mask = torch.triu(mask, diagonal=1)

        if past_key_values is not None:
            # Cached tokens are all in the past, so every new token may attend to them.
            cached_columns = torch.zeros((seq_len, start_pos), device=h.device)
            mask = torch.hstack([cached_columns, mask])

        layer_caches = []

        for idx, layer in enumerate(self.layers):
            layer_past_key_values = (
                past_key_values[idx] if past_key_values is not None else None
            )

            h, layer_cached_key_values = layer(
                h, mask=mask, past_key_values=layer_past_key_values, use_cache=use_cache
            )

            if use_cache:
                layer_caches.append(layer_cached_key_values)

        cached_key_values = tuple(layer_caches) if use_cache else None

        # Final norm and projection
        h = self.output_norm(h)
        logits = self.de_embedding_proj(h).float()

        return logits, cached_key_values
521
-
522
-
523
- ########################################################
524
- #
525
- # HuggingFace Wrapper for the Pico Decoder model.
526
- #
527
- ########################################################
528
-
529
-
530
class PicoDecoderHFConfig(PretrainedConfig):
    """Config class for the Pico Decoder HuggingFace wrapper."""

    model_type = "pico_decoder"

    @classmethod
    def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PicoDecoderHFConfig":
        """
        Build a config from ``config_dict`` alone.

        ``kwargs`` are deliberately not forwarded to the constructor. The only one
        consulted is ``return_unused_kwargs``: when true, the result is a
        ``(config, unused_kwargs)`` pair, where ``unused_kwargs`` holds the kwargs
        whose names are not attributes of the new config.
        """
        pico_config = cls(**config_dict)

        wants_unused = kwargs.pop("return_unused_kwargs", False)
        if not wants_unused:
            return pico_config

        leftover = {
            name: value
            for name, value in kwargs.items()
            if not hasattr(pico_config, name)
        }
        return pico_config, leftover

    @classmethod
    def from_dataclass(cls, model_config: "ModelConfig"):
        """Initialise from our custom config dataclass."""
        config_fields = asdict(model_config)
        return cls.from_dict(config_fields)
557
-
558
-
559
class PicoDecoderHF(PreTrainedModel):
    """
    HuggingFace wrapper for the Pico model.

    Many evaluation frameworks require a model be setup as a HuggingFace model, so we provide a simple
    wrapper that does just that. When we save checkpoints of the Pico model, we save both the normal
    Pico model as well as the model wrapped in this HuggingFace class.

    This also lets you do cool things like:

    `model = AutoModelForCausalLM.from_pretrained("path/to/checkpoint")`
    """

    config_class = PicoDecoderHFConfig
    # Class names of modules that must stay on a single device. The decoder
    # block class in this file is `PicoDecoderBlock` (there is no `PicoBlock`).
    _no_split_modules = ["PicoDecoderBlock", "Attention", "SwiGLU", "RMSNorm"]

    def __init__(self, config: PicoDecoderHFConfig):
        super().__init__(config)
        self.pico_decoder = PicoDecoder(config)

    def forward(
        self,
        input_ids: torch.Tensor,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        use_cache: bool = False,
        **kwargs,
    ) -> Union[CausalLMOutput, CausalLMOutputWithPast]:
        """Run the wrapped Pico model and package the result as a HuggingFace output.

        Args:
            input_ids: Token ids forwarded to the Pico model.
            past_key_values: Optional per-layer KV cache from an earlier call.
            use_cache: Whether to return the updated KV cache.
            **kwargs: Accepted and ignored; no other argument is forwarded to the model.

        Returns:
            ``CausalLMOutputWithPast`` (logits and cache) when ``use_cache`` is
            true, otherwise ``CausalLMOutput`` (logits only).
        """
        logits, past_key_values = self.pico_decoder(
            input_ids, past_key_values, use_cache
        )
        if use_cache:
            return CausalLMOutputWithPast(
                logits=logits,
                past_key_values=past_key_values,
            )
        else:
            return CausalLMOutput(
                logits=logits,
            )
603
-
604
-
605
# Register the custom config and model so checkpoints saved from them can be
# loaded through the transformers Auto* classes.
PicoDecoderHFConfig.register_for_auto_class()
for _auto_class_name in ("AutoModel", "AutoModelForCausalLM"):
    # NOTE(review): each call may replace the previous registration rather than
    # add to it — confirm against the transformers documentation.
    PicoDecoderHF.register_for_auto_class(_auto_class_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pico-decoder-tiny-max-vram/checkpoints/step_0/special_tokens_map.json DELETED
@@ -1,16 +0,0 @@
1
- {
2
- "eos_token": {
3
- "content": "<|endoftext|>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "pad_token": {
10
- "content": "<|padding|>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- }
16
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pico-decoder-tiny-max-vram/checkpoints/step_0/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
pico-decoder-tiny-max-vram/checkpoints/step_0/tokenizer_config.json DELETED
@@ -1,239 +0,0 @@
1
- {
2
- "add_bos_token": false,
3
- "add_eos_token": false,
4
- "add_prefix_space": false,
5
- "added_tokens_decoder": {
6
- "0": {
7
- "content": "|||IP_ADDRESS|||",
8
- "lstrip": false,
9
- "normalized": true,
10
- "rstrip": false,
11
- "single_word": false,
12
- "special": false
13
- },
14
- "1": {
15
- "content": "<|padding|>",
16
- "lstrip": false,
17
- "normalized": false,
18
- "rstrip": false,
19
- "single_word": false,
20
- "special": true
21
- },
22
- "50254": {
23
- "content": " ",
24
- "lstrip": false,
25
- "normalized": true,
26
- "rstrip": false,
27
- "single_word": false,
28
- "special": false
29
- },
30
- "50255": {
31
- "content": " ",
32
- "lstrip": false,
33
- "normalized": true,
34
- "rstrip": false,
35
- "single_word": false,
36
- "special": false
37
- },
38
- "50256": {
39
- "content": " ",
40
- "lstrip": false,
41
- "normalized": true,
42
- "rstrip": false,
43
- "single_word": false,
44
- "special": false
45
- },
46
- "50257": {
47
- "content": " ",
48
- "lstrip": false,
49
- "normalized": true,
50
- "rstrip": false,
51
- "single_word": false,
52
- "special": false
53
- },
54
- "50258": {
55
- "content": " ",
56
- "lstrip": false,
57
- "normalized": true,
58
- "rstrip": false,
59
- "single_word": false,
60
- "special": false
61
- },
62
- "50259": {
63
- "content": " ",
64
- "lstrip": false,
65
- "normalized": true,
66
- "rstrip": false,
67
- "single_word": false,
68
- "special": false
69
- },
70
- "50260": {
71
- "content": " ",
72
- "lstrip": false,
73
- "normalized": true,
74
- "rstrip": false,
75
- "single_word": false,
76
- "special": false
77
- },
78
- "50261": {
79
- "content": " ",
80
- "lstrip": false,
81
- "normalized": true,
82
- "rstrip": false,
83
- "single_word": false,
84
- "special": false
85
- },
86
- "50262": {
87
- "content": " ",
88
- "lstrip": false,
89
- "normalized": true,
90
- "rstrip": false,
91
- "single_word": false,
92
- "special": false
93
- },
94
- "50263": {
95
- "content": " ",
96
- "lstrip": false,
97
- "normalized": true,
98
- "rstrip": false,
99
- "single_word": false,
100
- "special": false
101
- },
102
- "50264": {
103
- "content": " ",
104
- "lstrip": false,
105
- "normalized": true,
106
- "rstrip": false,
107
- "single_word": false,
108
- "special": false
109
- },
110
- "50265": {
111
- "content": " ",
112
- "lstrip": false,
113
- "normalized": true,
114
- "rstrip": false,
115
- "single_word": false,
116
- "special": false
117
- },
118
- "50266": {
119
- "content": " ",
120
- "lstrip": false,
121
- "normalized": true,
122
- "rstrip": false,
123
- "single_word": false,
124
- "special": false
125
- },
126
- "50267": {
127
- "content": " ",
128
- "lstrip": false,
129
- "normalized": true,
130
- "rstrip": false,
131
- "single_word": false,
132
- "special": false
133
- },
134
- "50268": {
135
- "content": " ",
136
- "lstrip": false,
137
- "normalized": true,
138
- "rstrip": false,
139
- "single_word": false,
140
- "special": false
141
- },
142
- "50269": {
143
- "content": " ",
144
- "lstrip": false,
145
- "normalized": true,
146
- "rstrip": false,
147
- "single_word": false,
148
- "special": false
149
- },
150
- "50270": {
151
- "content": " ",
152
- "lstrip": false,
153
- "normalized": true,
154
- "rstrip": false,
155
- "single_word": false,
156
- "special": false
157
- },
158
- "50271": {
159
- "content": " ",
160
- "lstrip": false,
161
- "normalized": true,
162
- "rstrip": false,
163
- "single_word": false,
164
- "special": false
165
- },
166
- "50272": {
167
- "content": " ",
168
- "lstrip": false,
169
- "normalized": true,
170
- "rstrip": false,
171
- "single_word": false,
172
- "special": false
173
- },
174
- "50273": {
175
- "content": " ",
176
- "lstrip": false,
177
- "normalized": true,
178
- "rstrip": false,
179
- "single_word": false,
180
- "special": false
181
- },
182
- "50274": {
183
- "content": " ",
184
- "lstrip": false,
185
- "normalized": true,
186
- "rstrip": false,
187
- "single_word": false,
188
- "special": false
189
- },
190
- "50275": {
191
- "content": " ",
192
- "lstrip": false,
193
- "normalized": true,
194
- "rstrip": false,
195
- "single_word": false,
196
- "special": false
197
- },
198
- "50276": {
199
- "content": " ",
200
- "lstrip": false,
201
- "normalized": true,
202
- "rstrip": false,
203
- "single_word": false,
204
- "special": false
205
- },
206
- "50277": {
207
- "content": "|||EMAIL_ADDRESS|||",
208
- "lstrip": false,
209
- "normalized": true,
210
- "rstrip": false,
211
- "single_word": false,
212
- "special": false
213
- },
214
- "50278": {
215
- "content": "|||PHONE_NUMBER|||",
216
- "lstrip": false,
217
- "normalized": true,
218
- "rstrip": false,
219
- "single_word": false,
220
- "special": false
221
- },
222
- "50279": {
223
- "content": "<|endoftext|>",
224
- "lstrip": false,
225
- "normalized": false,
226
- "rstrip": false,
227
- "single_word": false,
228
- "special": true
229
- }
230
- },
231
- "bos_token": null,
232
- "clean_up_tokenization_spaces": true,
233
- "eos_token": "<|endoftext|>",
234
- "extra_special_tokens": {},
235
- "model_max_length": 1000000000000000019884624838656,
236
- "pad_token": "<|padding|>",
237
- "tokenizer_class": "GPTNeoXTokenizer",
238
- "unk_token": null
239
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pico-decoder-tiny-max-vram/checkpoints/step_27/config.json DELETED
@@ -1,22 +0,0 @@
1
- {
2
- "activation_hidden_dim": 384,
3
- "architectures": [
4
- "PicoDecoderHF"
5
- ],
6
- "attention_n_heads": 12,
7
- "attention_n_kv_heads": 4,
8
- "auto_map": {
9
- "AutoConfig": "pico_decoder.PicoDecoderHFConfig",
10
- "AutoModelForCausalLM": "pico_decoder.PicoDecoderHF"
11
- },
12
- "batch_size": 1024,
13
- "d_model": 96,
14
- "max_seq_len": 2048,
15
- "model_type": "pico_decoder",
16
- "n_layers": 12,
17
- "norm_eps": 1e-06,
18
- "position_emb_theta": 10000.0,
19
- "torch_dtype": "float32",
20
- "transformers_version": "4.48.3",
21
- "vocab_size": 50304
22
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pico-decoder-tiny-max-vram/checkpoints/step_27/fabric_state/checkpoint.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e42d749796c6550ffb318da21c493f94df7f0c48120ac9ecbbd0eb6402fc67ff
3
- size 135543171
 
 
 
 
pico-decoder-tiny-max-vram/checkpoints/step_27/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d7be9a1e9b585a92821668324e20d977c23d51c04b2ade7610f764f62efe829
3
- size 45143592
 
 
 
 
pico-decoder-tiny-max-vram/checkpoints/step_27/pico_decoder.py DELETED
@@ -1,608 +0,0 @@
1
- """
2
- Pico Decoder: A Lightweight Causal Transformer Language Model
3
-
4
- Pico Decoder uses a simple LLAMA-style transformer architecture, written for clarity and educational purposes.
5
-
6
- Everything is written with a modular design for easy modification and experimentation.
7
-
8
- Key features:
9
- - RMSNorm for layer normalization
10
- - Rotary Positional Embeddings (RoPE)
11
- - Multi-head attention with KV-cache support
12
- - SwiGLU activation function
13
- - Residual connections throughout
14
-
15
- - KV-cache for faster autoregressive generation
16
-
17
- References:
18
- - RoPE: https://arxiv.org/abs/2104.09864
19
- - SwiGLU: https://arxiv.org/abs/2002.05202
20
- - LLAMA: https://arxiv.org/abs/2302.13971
21
-
22
- Adapted from:
23
- - OLMO: https://github.com/allenai/OLMo
24
- - LLAMA: https://github.com/meta/llama
25
- """
26
-
27
- from dataclasses import asdict
28
- from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
29
-
30
- import torch
31
- import torch.nn as nn
32
- import torch.nn.functional as F
33
- from torch.nn.attention import SDPBackend, sdpa_kernel
34
- from transformers import PretrainedConfig, PreTrainedModel
35
- from transformers.modeling_outputs import CausalLMOutput, CausalLMOutputWithPast
36
-
37
- try:
38
- if TYPE_CHECKING:
39
- # We need to do this to avoid importing these when creating the HF-compatible models
40
- from src.config import ModelConfig
41
- except ImportError:
42
- pass
43
-
44
- ########################################################
45
- #
46
- # Layer Normalization
47
- #
48
- ########################################################
49
-
50
-
51
- class RMSNorm(torch.nn.Module):
52
- """Root Mean Square Layer Normalization.
53
-
54
- A variant of Layer Normalization that uses RMS statistics instead of mean/variance,
55
- resulting in improved stability and performance.
56
-
57
- Args:
58
- config (Union[ModelConfig, PicoHFConfig]): Configuration object containing normalization parameters
59
- - config.norm_eps: Small constant for numerical stability
60
- - config.d_model: Model dimension for the weight parameter
61
-
62
- References:
63
- https://arxiv.org/abs/1910.07467
64
- """
65
-
66
- def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
67
- super().__init__()
68
- self.eps = config.norm_eps
69
- self.weight = nn.Parameter(torch.ones(config.d_model))
70
-
71
- def _norm(self, x: torch.Tensor) -> torch.Tensor:
72
- """
73
- Normalizes the input tensor by its RMS value.
74
- """
75
- return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
76
-
77
- def forward(self, x: torch.Tensor) -> torch.Tensor:
78
- """
79
- Applies RMS normalization to the input tensor and scales it by the weight parameter.
80
- """
81
- output = self._norm(x.float()).type_as(x)
82
- return output * self.weight
83
-
84
-
85
- ########################################################
86
- #
87
- # Positional Embedding
88
- #
89
- ########################################################
90
-
91
-
92
- class RoPE(nn.Module):
93
- """Rotary Positional Embeddings (RoPE).
94
-
95
- Implements position-dependent rotation of keys and queries in attention mechanism,
96
- allowing better modeling of relative positions in sequences. Uses complex number
97
- operations for efficient rotation.
98
-
99
- Args:
100
- config (Union[ModelConfig, PicoHFConfig]): Model configuration containing:
101
- - config.position_emb_theta: Base for frequency computation
102
- - config.d_model: Model dimension
103
- - config.attention_n_heads: Number of attention heads
104
- - config.max_seq_len: Maximum sequence length
105
-
106
- References:
107
- https://arxiv.org/abs/2104.09864
108
- """
109
-
110
- _freqs_cis_tensor: torch.Tensor | None = None
111
-
112
- def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
113
- super().__init__()
114
-
115
- self.theta = config.position_emb_theta
116
- self.dim = config.d_model // config.attention_n_heads
117
-
118
- max_seq_len = config.max_seq_len
119
-
120
- # only gets set once, and then reused for all RoPE instances
121
- if RoPE._freqs_cis_tensor is None:
122
- RoPE._freqs_cis_tensor = self._setup_freqs_cis(
123
- max_seq_len, self.theta, self.dim
124
- )
125
-
126
- # register _freqs_cis buffer
127
- # can be easily recomputed so persistent=False
128
- self.register_buffer("_freqs_cis", self._freqs_cis_tensor, persistent=False)
129
-
130
- @classmethod
131
- def _setup_freqs_cis(cls, seq_len: int, theta: float, dim: int) -> torch.Tensor:
132
- """Setup Frequency Tensor for RoPE Embeddings
133
-
134
- Initializes the complex frequency tensor that is used to compute the RoPE embeddings.
135
-
136
- Note other implementations will use cos and sin directly, but using the complex
137
- number representation is (probably?) more efficient:
138
-
139
- e^(theta * i * t) = cos(theta * t) + i * sin(theta * t) [Euler's formula]
140
- """
141
- _freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
142
- positions = torch.arange(seq_len)
143
- freqs = torch.outer(positions, _freqs)
144
- return torch.polar(torch.ones_like(freqs), freqs) # complex64
145
-
146
- def get_freqs_cis(
147
- self, input_shape: torch.Size, start_pos: int, end_pos: int
148
- ) -> torch.Tensor:
149
- """Reshape Frequency Tensor for RoPE Embeddings
150
-
151
- Makes the frequency tensor broadcastable with the input tensor.
152
- """
153
- _freqs_cis = self._freqs_cis[start_pos:end_pos]
154
- ndim = len(input_shape)
155
- assert 0 <= 1 < ndim
156
- assert _freqs_cis.shape == (input_shape[1], input_shape[-1])
157
-
158
- # TODO: Check whether this is correct (might be able to remove this)
159
- shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(input_shape)]
160
- return _freqs_cis.view(*shape)
161
-
162
- def forward(
163
- self,
164
- queries: torch.Tensor,
165
- keys: torch.Tensor,
166
- start_pos: int = 0,
167
- ) -> Tuple[torch.Tensor, torch.Tensor]:
168
- """Apply RoPE Embeddings to Queries and Keys
169
-
170
- Applies the rotary positional embeddings to the input tensors via complex num multiplication
171
-
172
- NOTE: The start_pos is used if we want to use the kv_cache in the attention mechanism.
173
- """
174
- queries_ = torch.view_as_complex(
175
- queries.float().reshape(*queries.shape[:-1], -1, 2)
176
- )
177
- keys_ = torch.view_as_complex(keys.float().reshape(*keys.shape[:-1], -1, 2))
178
-
179
- input_shape = (
180
- queries_.shape
181
- ) # same as keys: (batch_size, seq_len, n_heads, head_dim/2)
182
- freqs_start_pos = start_pos
183
- freqs_end_pos = freqs_start_pos + queries_.shape[1]
184
-
185
- freqs_cis = self.get_freqs_cis(input_shape, freqs_start_pos, freqs_end_pos)
186
-
187
- queries_rotated = torch.view_as_real(queries_ * freqs_cis).flatten(3)
188
- keys_rotated = torch.view_as_real(keys_ * freqs_cis).flatten(3)
189
- return queries_rotated.type_as(queries), keys_rotated.type_as(keys)
190
-
191
-
192
- ########################################################
193
- #
194
- # Attention
195
- #
196
- ########################################################
197
-
198
-
199
- class Attention(nn.Module):
200
- """Multi-head Attention with Group Query Attention support.
201
-
202
- Implements scaled dot-product attention and supports:
203
- - Grouped Query Attention (GQA)
204
- - Key-Value caching for efficient inference
205
- - RoPE integration
206
-
207
- Args:
208
- config (Union[ModelConfig, PretrainedConfig]): Configuration containing:
209
- - config.attention_n_heads: Number of attention heads
210
- - config.attention_n_kv_heads: Number of key/value heads
211
- - config.d_model: Model dimension
212
- - config.batch_size: Maximum batch size
213
- - config.max_seq_len: Maximum sequence length
214
-
215
- Shape:
216
- - Input: (batch_size, seq_len, d_model)
217
- - Output: (batch_size, seq_len, d_model)
218
- """
219
-
220
- def __init__(
221
- self,
222
- config: Union["ModelConfig", "PicoDecoderHFConfig"],
223
- ):
224
- super().__init__()
225
-
226
- self.n_heads = config.attention_n_heads
227
- self.n_kv_heads = config.attention_n_kv_heads
228
-
229
- self.batch_size = config.batch_size
230
- self.max_seq_len = config.max_seq_len
231
-
232
- d_model = config.d_model
233
- self.head_dim = d_model // self.n_heads
234
-
235
- self.n_rep = self.n_heads // self.n_kv_heads
236
-
237
- self.q_proj = nn.Linear(d_model, self.n_heads * self.head_dim, bias=False)
238
- self.k_proj = nn.Linear(d_model, self.n_kv_heads * self.head_dim, bias=False)
239
- self.v_proj = nn.Linear(d_model, self.n_kv_heads * self.head_dim, bias=False)
240
- self.o_proj = nn.Linear(self.n_heads * self.head_dim, d_model, bias=False)
241
-
242
- self.rope = RoPE(config)
243
-
244
- def forward(
245
- self,
246
- input: torch.Tensor,
247
- mask: Optional[torch.Tensor] = None,
248
- past_key_values: Optional[Tuple[torch.Tensor, ...]] = None,
249
- use_cache: bool = False,
250
- ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
251
- """Forward pass for the attention mechanism.
252
-
253
- Computes queries, keys, and values for the attention mechanism. Applies rotary positional
254
- embeddings to the queries and keys, and then computes attention scores and outputs.
255
-
256
- For an introduction to the attention mechanism, see:
257
- https://arxiv.org/abs/1706.03762
258
-
259
- A few things to note:
260
- - The past_key_values is used to implement the KV cache, which is used to speed up
261
- generation by caching the KV pairs from previous forward passes. This is useful when doing
262
- tasks that require generating multiple tokens conditioned on previous tokens (e.g. language
263
- modeling, text generation, etc.). The way the KV cache is implemented is that each layer has
264
- its own KV cache - this KV cache is implemented as a tuple.
265
- """
266
- bsz, seq_len, _ = input.shape
267
- _queries, _keys, _values = (
268
- self.q_proj(input),
269
- self.k_proj(input),
270
- self.v_proj(input),
271
- )
272
-
273
- # Reshaping for multi-head attention
274
- queries = _queries.view(bsz, seq_len, self.n_heads, self.head_dim)
275
- keys = _keys.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
276
- values = _values.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
277
-
278
- # The start position is used to apply the RoPE embeddings to only the new tokens
279
- # when using the kv_cache in the attention mechanism.
280
- # We want to start from the last position in the cache.
281
- start_pos = past_key_values[0].shape[1] if past_key_values is not None else 0
282
-
283
- # apply rotary positional embeddings
284
- queries, keys = self.rope(queries, keys, start_pos)
285
-
286
- if past_key_values is not None:
287
- keys = torch.cat([past_key_values[0], keys], dim=1)
288
- values = torch.cat([past_key_values[1], values], dim=1)
289
-
290
- if use_cache:
291
- cached_keys = keys
292
- cached_values = values
293
- else:
294
- cached_keys = None
295
- cached_values = None
296
-
297
- queries = queries.transpose(1, 2)
298
- keys = keys.transpose(1, 2)
299
- values = values.transpose(1, 2)
300
-
301
- apply_gqa = self.n_rep > 1
302
- if apply_gqa and queries.device.type == "mps":
303
- # NOTE: MPS does not support GQA in the SDPA kernel, but we can repeat the keys and values
304
- # outside of the kernel to get the same effect.
305
- # See: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
306
- keys = keys.repeat_interleave(self.n_rep, dim=-3)
307
- values = values.repeat_interleave(self.n_rep, dim=-3)
308
- apply_gqa = False
309
-
310
- backends = [SDPBackend.CUDNN_ATTENTION, SDPBackend.MATH]
311
-
312
- with sdpa_kernel(backends=backends):
313
- attn_output = F.scaled_dot_product_attention(
314
- queries.contiguous(),
315
- keys.contiguous(),
316
- values.contiguous(),
317
- attn_mask=mask.to(queries.dtype),
318
- enable_gqa=apply_gqa,
319
- )
320
-
321
- attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, seq_len, -1)
322
- output = self.o_proj(attn_output)
323
-
324
- return output, (cached_keys, cached_values)
325
-
326
-
327
- ########################################################
328
- #
329
- # SwiGLU (Combines MLP and Activation)
330
- #
331
- ########################################################
332
-
333
-
334
- class SwiGLU(nn.Module):
335
- """SwiGLU Activation Function with Linear Projections.
336
-
337
- Implements the SwiGLU activation function combined with linear transformations,
338
- serving as the feed-forward network in transformer blocks.
339
-
340
- Args:
341
- config (Union[ModelConfig, PicoDecoderHFConfig]): Configuration containing:
342
- - config.d_model: Model dimension
343
- - config.activation_hidden_dim: Hidden dimension (typically 4 * d_model)
344
-
345
- References:
346
- https://arxiv.org/abs/2002.05202
347
- """
348
-
349
- def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
350
- super().__init__()
351
-
352
- model_dim = config.d_model
353
- act_hidden_dim = config.activation_hidden_dim # usually 4 * d_model
354
-
355
- self.w_0 = nn.Linear(model_dim, act_hidden_dim, bias=False)
356
- self.w_1 = nn.Linear(model_dim, act_hidden_dim, bias=False)
357
- self.w_2 = nn.Linear(act_hidden_dim, model_dim, bias=False)
358
-
359
- def forward(self, x: torch.Tensor) -> torch.Tensor:
360
- return self.w_2(F.silu(self.w_0(x)) * self.w_1(x))
361
-
362
-
363
- ########################################################
364
- #
365
- # PicoDecoderBlock
366
- #
367
- ########################################################
368
-
369
-
370
- class PicoDecoderBlock(nn.Module):
371
- """Single Transformer Block with Attention and Feed-forward layers.
372
-
373
- Implements a standard transformer block with:
374
- - Multi-head attention with normalization and residual connection
375
- - SwiGLU feed-forward network with normalization and residual connection
376
-
377
- Args:
378
- config (Union[ModelConfig, PicoDecoderHFConfig]): Model configuration; either a dataclass or
379
- a HuggingFace PicoDecoderHFConfig
380
- """
381
-
382
- def __init__(
383
- self,
384
- config: Union["ModelConfig", "PicoDecoderHFConfig"],
385
- ):
386
- super().__init__()
387
-
388
- self.attention = Attention(config)
389
- self.swiglu = SwiGLU(config)
390
- self.attention_norm = RMSNorm(config)
391
- self.swiglu_norm = RMSNorm(config)
392
-
393
- def forward(
394
- self,
395
- input: torch.Tensor,
396
- mask: Optional[torch.Tensor] = None,
397
- past_key_values: Optional[Tuple[torch.Tensor]] = None,
398
- use_cache: bool = False,
399
- ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
400
- attention_output, cached_key_values = self.attention(
401
- self.attention_norm(input),
402
- mask=mask,
403
- past_key_values=past_key_values,
404
- use_cache=use_cache,
405
- )
406
- # NOTE: cached_key_values is None if use_cache is False
407
-
408
- h = input + attention_output
409
- out = h + self.swiglu(self.swiglu_norm(h))
410
- return out, cached_key_values
411
-
412
-
413
- ########################################################
414
- #
415
- # Pico Decoder (Causal Transformer Model)
416
- #
417
- ########################################################
418
-
419
-
420
- class PicoDecoder(nn.Module):
421
- """
422
- Pico Decoder: combines the embedding, causal decoder blocks, and output projection into a
423
- single autoregressive model.
424
-
425
- For more information on the model, see the classes for the modules that make up the model.
426
- """
427
-
428
- def __init__(
429
- self,
430
- model_config: Union["ModelConfig", "PicoDecoderHFConfig"],
431
- ):
432
- super().__init__()
433
- self.config = model_config
434
-
435
- self.embedding_proj = nn.Embedding(self.config.vocab_size, self.config.d_model)
436
- self.layers = nn.ModuleList(
437
- [PicoDecoderBlock(self.config) for _ in range(self.config.n_layers)]
438
- )
439
- self.output_norm = RMSNorm(self.config)
440
- self.de_embedding_proj = nn.Linear(
441
- self.config.d_model, self.config.vocab_size, bias=False
442
- )
443
-
444
- def convert_to_hf_model(self) -> "PicoDecoderHF":
445
- """Convert the Lightning model to a HuggingFace model."""
446
- # Create HF config without fabric-specific settings
447
- hf_config = PicoDecoderHFConfig.from_dataclass(self.config)
448
-
449
- # Create new HF model
450
- hf_model = PicoDecoderHF(hf_config)
451
-
452
- # Copy state dict, excluding fabric-specific keys
453
- hf_model.load_state_dict(self.state_dict(prefix="pico_decoder."))
454
-
455
- return hf_model
456
-
457
- def forward(
458
- self,
459
- input_ids: torch.Tensor,
460
- past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
461
- use_cache: bool = False,
462
- ) -> Tuple[torch.Tensor, Optional[Tuple[Tuple[torch.Tensor, torch.Tensor]]]]:
463
- """
464
- This is the forward pass for the entire Pico model. It boils down to:
465
- - Embedding the input ids
466
- - Creating a causal mask
467
- - Processing through the pico layers
468
- - Projecting the output to logits
469
-
470
- NOTE: One feature that might be confusing is the KV cache. The KV cache is used to speed up
471
- generation by caching the KV pairs from previous forward passes. This is useful when doing
472
- tasks that require generating multiple tokens conditioned on previous tokens (e.g. language
473
- modeling, text generation, etc.). The way the KV cache is implemented is that each layer has
474
- its own KV cache which is stored as a tuple. The whole model then stores a tuple of these
475
- KV caches (so a tuple of tuples).
476
- """
477
-
478
- seq_len = input_ids.shape[-1]
479
- h = self.embedding_proj(input_ids)
480
-
481
- # Calculate start position from past cached KV pairs. Remember that each layer has its
482
- # own KV Cache. So when we index past_key_values, we need to index into the KV pairs for the
483
- # correct layer and then for either the keys or values.
484
- start_pos = 0 if past_key_values is None else past_key_values[0][0].shape[1]
485
-
486
- # Create causal mask for current sequence
487
- mask = None
488
- if seq_len > 1:
489
- mask = torch.full((seq_len, seq_len), float("-inf"))
490
- mask = torch.triu(mask, diagonal=1)
491
-
492
- # If using KV cache, extend mask to cover cached sequence length
493
- if past_key_values is not None:
494
- # Add zeros for cached tokens (we can attend to all of them)
495
- mask = torch.hstack([torch.zeros((seq_len, start_pos)), mask])
496
-
497
- mask = mask.to(h.device)
498
-
499
- # NOTE: If we are using the cache, we need to store the cached KV pairs for each layer
500
- # in a tuple. Each layer will have its own cached KV pair which we aggregate in a tuple.
501
- cached_key_values = () if use_cache else None
502
-
503
- # Process through transformer blocks
504
- for idx, layer in enumerate(self.layers):
505
- layer_past_key_values = (
506
- past_key_values[idx] if past_key_values is not None else None
507
- )
508
-
509
- h, layer_cached_key_values = layer(
510
- h, mask=mask, past_key_values=layer_past_key_values, use_cache=use_cache
511
- )
512
-
513
- if use_cache:
514
- cached_key_values += (layer_cached_key_values,)
515
-
516
- # Final norm and projection
517
- h = self.output_norm(h)
518
- logits = self.de_embedding_proj(h).float()
519
-
520
- return logits, cached_key_values
521
-
522
-
523
- ########################################################
524
- #
525
- # HuggingFace Wrapper for the Pico Decoder model.
526
- #
527
- ########################################################
528
-
529
-
530
- class PicoDecoderHFConfig(PretrainedConfig):
531
- """Config class for the Pico Decoder HuggingFace wrapper."""
532
-
533
- model_type = "pico_decoder"
534
-
535
- @classmethod
536
- def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PicoDecoderHFConfig":
537
- """
538
- Initialize config from a dictionary. Note that no kwargs are passed to the constructor --
539
- this is because with some kwargs special handling is required and can make this class
540
- brittle.
541
- """
542
- pico_config = cls(**config_dict)
543
-
544
- return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
545
- unused_kwargs = {
546
- key: value for key, value in kwargs.items() if not hasattr(pico_config, key)
547
- }
548
-
549
- if return_unused_kwargs:
550
- return pico_config, unused_kwargs
551
- return pico_config
552
-
553
- @classmethod
554
- def from_dataclass(cls, model_config: "ModelConfig"):
555
- """Initialise from our custom config dataclass."""
556
- return cls.from_dict(asdict(model_config))
557
-
558
-
559
- class PicoDecoderHF(PreTrainedModel):
560
- """
561
- HuggingFace wrapper for the Pico model.
562
-
563
- Many evaluation frameworks require a model be setup as a HuggingFace model, so we provide a simple
564
- wrapper that does just that. When we save checkpoints of the Pico model, we save both the normal
565
- Pico model as well as the model wrapped in this HuggingFace class.
566
-
567
- This also lets you do cool things like:
568
-
569
- `model = AutoModelForCausalLM.from_pretrained("path/to/checkpoint")`
570
- """
571
-
572
- config_class = PicoDecoderHFConfig
573
- _no_split_modules = ["PicoBlock", "Attention", "SwiGLU", "RMSNorm"]
574
-
575
- def __init__(self, config: PicoDecoderHFConfig):
576
- super().__init__(config)
577
- self.pico_decoder = PicoDecoder(config)
578
-
579
- def forward(
580
- self,
581
- input_ids: torch.Tensor,
582
- past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
583
- use_cache: bool = False,
584
- **kwargs,
585
- ) -> Union[CausalLMOutput, CausalLMOutputWithPast]:
586
- """HuggingFace forward pass wrapper.
587
-
588
- Forwards pass for the HuggingFace version of the Pico Model. Basic wrapper around the
589
- Pico model's forward pass, and returns the output as a HuggingFace CausalLMOutput.
590
- """
591
- logits, past_key_values = self.pico_decoder(
592
- input_ids, past_key_values, use_cache
593
- )
594
- if use_cache:
595
- return CausalLMOutputWithPast(
596
- logits=logits,
597
- past_key_values=past_key_values,
598
- )
599
- else:
600
- return CausalLMOutput(
601
- logits=logits,
602
- )
603
-
604
-
605
- # Register for auto classes
606
- PicoDecoderHFConfig.register_for_auto_class()
607
- PicoDecoderHF.register_for_auto_class("AutoModel")
608
- PicoDecoderHF.register_for_auto_class("AutoModelForCausalLM")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pico-decoder-tiny-max-vram/checkpoints/step_27/special_tokens_map.json DELETED
@@ -1,16 +0,0 @@
1
- {
2
- "eos_token": {
3
- "content": "<|endoftext|>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "pad_token": {
10
- "content": "<|padding|>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- }
16
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pico-decoder-tiny-max-vram/checkpoints/step_27/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
pico-decoder-tiny-max-vram/checkpoints/step_27/tokenizer_config.json DELETED
@@ -1,239 +0,0 @@
1
- {
2
- "add_bos_token": false,
3
- "add_eos_token": false,
4
- "add_prefix_space": false,
5
- "added_tokens_decoder": {
6
- "0": {
7
- "content": "|||IP_ADDRESS|||",
8
- "lstrip": false,
9
- "normalized": true,
10
- "rstrip": false,
11
- "single_word": false,
12
- "special": false
13
- },
14
- "1": {
15
- "content": "<|padding|>",
16
- "lstrip": false,
17
- "normalized": false,
18
- "rstrip": false,
19
- "single_word": false,
20
- "special": true
21
- },
22
- "50254": {
23
- "content": " ",
24
- "lstrip": false,
25
- "normalized": true,
26
- "rstrip": false,
27
- "single_word": false,
28
- "special": false
29
- },
30
- "50255": {
31
- "content": " ",
32
- "lstrip": false,
33
- "normalized": true,
34
- "rstrip": false,
35
- "single_word": false,
36
- "special": false
37
- },
38
- "50256": {
39
- "content": " ",
40
- "lstrip": false,
41
- "normalized": true,
42
- "rstrip": false,
43
- "single_word": false,
44
- "special": false
45
- },
46
- "50257": {
47
- "content": " ",
48
- "lstrip": false,
49
- "normalized": true,
50
- "rstrip": false,
51
- "single_word": false,
52
- "special": false
53
- },
54
- "50258": {
55
- "content": " ",
56
- "lstrip": false,
57
- "normalized": true,
58
- "rstrip": false,
59
- "single_word": false,
60
- "special": false
61
- },
62
- "50259": {
63
- "content": " ",
64
- "lstrip": false,
65
- "normalized": true,
66
- "rstrip": false,
67
- "single_word": false,
68
- "special": false
69
- },
70
- "50260": {
71
- "content": " ",
72
- "lstrip": false,
73
- "normalized": true,
74
- "rstrip": false,
75
- "single_word": false,
76
- "special": false
77
- },
78
- "50261": {
79
- "content": " ",
80
- "lstrip": false,
81
- "normalized": true,
82
- "rstrip": false,
83
- "single_word": false,
84
- "special": false
85
- },
86
- "50262": {
87
- "content": " ",
88
- "lstrip": false,
89
- "normalized": true,
90
- "rstrip": false,
91
- "single_word": false,
92
- "special": false
93
- },
94
- "50263": {
95
- "content": " ",
96
- "lstrip": false,
97
- "normalized": true,
98
- "rstrip": false,
99
- "single_word": false,
100
- "special": false
101
- },
102
- "50264": {
103
- "content": " ",
104
- "lstrip": false,
105
- "normalized": true,
106
- "rstrip": false,
107
- "single_word": false,
108
- "special": false
109
- },
110
- "50265": {
111
- "content": " ",
112
- "lstrip": false,
113
- "normalized": true,
114
- "rstrip": false,
115
- "single_word": false,
116
- "special": false
117
- },
118
- "50266": {
119
- "content": " ",
120
- "lstrip": false,
121
- "normalized": true,
122
- "rstrip": false,
123
- "single_word": false,
124
- "special": false
125
- },
126
- "50267": {
127
- "content": " ",
128
- "lstrip": false,
129
- "normalized": true,
130
- "rstrip": false,
131
- "single_word": false,
132
- "special": false
133
- },
134
- "50268": {
135
- "content": " ",
136
- "lstrip": false,
137
- "normalized": true,
138
- "rstrip": false,
139
- "single_word": false,
140
- "special": false
141
- },
142
- "50269": {
143
- "content": " ",
144
- "lstrip": false,
145
- "normalized": true,
146
- "rstrip": false,
147
- "single_word": false,
148
- "special": false
149
- },
150
- "50270": {
151
- "content": " ",
152
- "lstrip": false,
153
- "normalized": true,
154
- "rstrip": false,
155
- "single_word": false,
156
- "special": false
157
- },
158
- "50271": {
159
- "content": " ",
160
- "lstrip": false,
161
- "normalized": true,
162
- "rstrip": false,
163
- "single_word": false,
164
- "special": false
165
- },
166
- "50272": {
167
- "content": " ",
168
- "lstrip": false,
169
- "normalized": true,
170
- "rstrip": false,
171
- "single_word": false,
172
- "special": false
173
- },
174
- "50273": {
175
- "content": " ",
176
- "lstrip": false,
177
- "normalized": true,
178
- "rstrip": false,
179
- "single_word": false,
180
- "special": false
181
- },
182
- "50274": {
183
- "content": " ",
184
- "lstrip": false,
185
- "normalized": true,
186
- "rstrip": false,
187
- "single_word": false,
188
- "special": false
189
- },
190
- "50275": {
191
- "content": " ",
192
- "lstrip": false,
193
- "normalized": true,
194
- "rstrip": false,
195
- "single_word": false,
196
- "special": false
197
- },
198
- "50276": {
199
- "content": " ",
200
- "lstrip": false,
201
- "normalized": true,
202
- "rstrip": false,
203
- "single_word": false,
204
- "special": false
205
- },
206
- "50277": {
207
- "content": "|||EMAIL_ADDRESS|||",
208
- "lstrip": false,
209
- "normalized": true,
210
- "rstrip": false,
211
- "single_word": false,
212
- "special": false
213
- },
214
- "50278": {
215
- "content": "|||PHONE_NUMBER|||",
216
- "lstrip": false,
217
- "normalized": true,
218
- "rstrip": false,
219
- "single_word": false,
220
- "special": false
221
- },
222
- "50279": {
223
- "content": "<|endoftext|>",
224
- "lstrip": false,
225
- "normalized": false,
226
- "rstrip": false,
227
- "single_word": false,
228
- "special": true
229
- }
230
- },
231
- "bos_token": null,
232
- "clean_up_tokenization_spaces": true,
233
- "eos_token": "<|endoftext|>",
234
- "extra_special_tokens": {},
235
- "model_max_length": 1000000000000000019884624838656,
236
- "pad_token": "<|padding|>",
237
- "tokenizer_class": "GPTNeoXTokenizer",
238
- "unk_token": null
239
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pico-decoder-tiny-max-vram/eval_results/step_0.json DELETED
@@ -1 +0,0 @@
1
- {"paloma": 59434.76600609756}
 
 
pico-decoder-tiny-max-vram/eval_results/step_27.json DELETED
@@ -1 +0,0 @@
1
- {"paloma": 59120.39268292683}
 
 
pico-decoder-tiny-max-vram/logs/log_20250828_210412.log DELETED
File without changes
pico-decoder-tiny-max-vram/logs/log_20250828_210457.log DELETED
@@ -1,108 +0,0 @@
1
- 2025-08-28 21:06:12 - pico-train - INFO - Step 0 -- 📊 Evaluation Results
2
- 2025-08-28 21:06:12 - pico-train - INFO - └── paloma: 59435.04994555749
3
- 2025-08-28 21:06:13 - pico-train - INFO - ==================================================
4
- 2025-08-28 21:06:13 - pico-train - INFO - ✨ Training Configuration
5
- 2025-08-28 21:06:13 - pico-train - INFO - ==================================================
6
- 2025-08-28 21:06:13 - pico-train - INFO - ╭─────────────────────────────────────────────────────╮
7
- 2025-08-28 21:06:13 - pico-train - INFO - │ checkpointing: │
8
- 2025-08-28 21:06:13 - pico-train - INFO - │ checkpoints_dir: checkpoints │
9
- 2025-08-28 21:06:13 - pico-train - INFO - │ evaluation: │
10
- 2025-08-28 21:06:13 - pico-train - INFO - │ eval_results_dir: eval_results │
11
- 2025-08-28 21:06:13 - pico-train - INFO - │ fabric_checkpoint_dir: fabric_state │
12
- 2025-08-28 21:06:13 - pico-train - INFO - │ fabric_checkpoint_filename: checkpoint.pt │
13
- 2025-08-28 21:06:13 - pico-train - INFO - │ hf_checkpoint: │
14
- 2025-08-28 21:06:13 - pico-train - INFO - │ collection_slug: null │
15
- 2025-08-28 21:06:13 - pico-train - INFO - │ repo_id: ThomasTheMaker/pico-decoder-tiny │
16
- 2025-08-28 21:06:13 - pico-train - INFO - │ learning_dynamics: │
17
- 2025-08-28 21:06:13 - pico-train - INFO - │ batch_size: 8 │
18
- 2025-08-28 21:06:13 - pico-train - INFO - │ eval_data: null │
19
- 2025-08-28 21:06:13 - pico-train - INFO - │ layer_suffixes: │
20
- 2025-08-28 21:06:13 - pico-train - INFO - │ - attention.v_proj │
21
- 2025-08-28 21:06:13 - pico-train - INFO - │ - attention.o_proj │
22
- 2025-08-28 21:06:13 - pico-train - INFO - │ - swiglu.w_2 │
23
- 2025-08-28 21:06:13 - pico-train - INFO - │ sequence_idx: -1 │
24
- 2025-08-28 21:06:13 - pico-train - INFO - │ learning_dynamics_dir: learning_dynamics │
25
- 2025-08-28 21:06:13 - pico-train - INFO - │ logs_dir: logs │
26
- 2025-08-28 21:06:13 - pico-train - INFO - │ run_name: pico-decoder-tiny-max-vram │
27
- 2025-08-28 21:06:13 - pico-train - INFO - │ runs_dir: runs │
28
- 2025-08-28 21:06:13 - pico-train - INFO - │ save_every_n_steps: 1000 │
29
- 2025-08-28 21:06:13 - pico-train - INFO - │ save_to_hf: true │
30
- 2025-08-28 21:06:13 - pico-train - INFO - │ training: │
31
- 2025-08-28 21:06:13 - pico-train - INFO - │ auto_resume: true │
32
- 2025-08-28 21:06:13 - pico-train - INFO - │ data: │
33
- 2025-08-28 21:06:13 - pico-train - INFO - │ dataloader: │
34
- 2025-08-28 21:06:13 - pico-train - INFO - │ batch_size: 64 │
35
- 2025-08-28 21:06:13 - pico-train - INFO - │ dataset: │
36
- 2025-08-28 21:06:13 - pico-train - INFO - │ name: pico-lm/pretokenized-dolma-tinsy │
37
- 2025-08-28 21:06:13 - pico-train - INFO - │ tokenizer: │
38
- 2025-08-28 21:06:13 - pico-train - INFO - │ name: allenai/OLMo-7B-0724-hf │
39
- 2025-08-28 21:06:13 - pico-train - INFO - │ vocab_size: 50304 │
40
- 2025-08-28 21:06:13 - pico-train - INFO - │ evaluation: │
41
- 2025-08-28 21:06:13 - pico-train - INFO - │ metrics: │
42
- 2025-08-28 21:06:13 - pico-train - INFO - │ - paloma │
43
- 2025-08-28 21:06:13 - pico-train - INFO - │ paloma: │
44
- 2025-08-28 21:06:13 - pico-train - INFO - │ batch_size: 2 │
45
- 2025-08-28 21:06:13 - pico-train - INFO - │ dataset_name: pico-lm/pretokenized-paloma-tinsy │
46
- 2025-08-28 21:06:13 - pico-train - INFO - │ dataset_split: val │
47
- 2025-08-28 21:06:13 - pico-train - INFO - │ max_length: 2048 │
48
- 2025-08-28 21:06:13 - pico-train - INFO - │ model: │
49
- 2025-08-28 21:06:13 - pico-train - INFO - │ activation_hidden_dim: 384 │
50
- 2025-08-28 21:06:13 - pico-train - INFO - │ attention_n_heads: 12 │
51
- 2025-08-28 21:06:13 - pico-train - INFO - │ attention_n_kv_heads: 4 │
52
- 2025-08-28 21:06:13 - pico-train - INFO - │ batch_size: 1024 │
53
- 2025-08-28 21:06:13 - pico-train - INFO - │ d_model: 96 │
54
- 2025-08-28 21:06:13 - pico-train - INFO - │ max_seq_len: 2048 │
55
- 2025-08-28 21:06:13 - pico-train - INFO - │ model_type: pico_decoder │
56
- 2025-08-28 21:06:13 - pico-train - INFO - │ n_layers: 12 │
57
- 2025-08-28 21:06:13 - pico-train - INFO - │ norm_eps: 1.0e-06 │
58
- 2025-08-28 21:06:13 - pico-train - INFO - │ position_emb_theta: 10000.0 │
59
- 2025-08-28 21:06:13 - pico-train - INFO - │ vocab_size: 50304 │
60
- 2025-08-28 21:06:13 - pico-train - INFO - │ monitoring: │
61
- 2025-08-28 21:06:13 - pico-train - INFO - │ logging: │
62
- 2025-08-28 21:06:13 - pico-train - INFO - │ log_every_n_steps: 100 │
63
- 2025-08-28 21:06:13 - pico-train - INFO - │ log_level: INFO │
64
- 2025-08-28 21:06:13 - pico-train - INFO - │ save_to_wandb: false │
65
- 2025-08-28 21:06:13 - pico-train - INFO - │ wandb: │
66
- 2025-08-28 21:06:13 - pico-train - INFO - │ entity: boymyc │
67
- 2025-08-28 21:06:13 - pico-train - INFO - │ project: pico-decoder-tiny │
68
- 2025-08-28 21:06:13 - pico-train - INFO - │ training: │
69
- 2025-08-28 21:06:13 - pico-train - INFO - │ fabric: │
70
- 2025-08-28 21:06:13 - pico-train - INFO - │ accelerator: cuda │
71
- 2025-08-28 21:06:13 - pico-train - INFO - │ num_devices: 1 │
72
- 2025-08-28 21:06:13 - pico-train - INFO - │ num_nodes: 1 │
73
- 2025-08-28 21:06:13 - pico-train - INFO - │ precision: 16-mixed │
74
- 2025-08-28 21:06:13 - pico-train - INFO - │ max_steps: 200000 │
75
- 2025-08-28 21:06:13 - pico-train - INFO - │ optimization: │
76
- 2025-08-28 21:06:13 - pico-train - INFO - │ gradient_accumulation_steps: 64 │
77
- 2025-08-28 21:06:13 - pico-train - INFO - │ lr: 0.0003 │
78
- 2025-08-28 21:06:13 - pico-train - INFO - │ lr_scheduler: linear_with_warmup │
79
- 2025-08-28 21:06:13 - pico-train - INFO - │ lr_warmup_steps: 2500 │
80
- 2025-08-28 21:06:13 - pico-train - INFO - │ optimizer: adamw │
81
- 2025-08-28 21:06:13 - pico-train - INFO - │ │
82
- 2025-08-28 21:06:13 - pico-train - INFO - ╰─────────────────────────────────────────────────────╯
83
- 2025-08-28 21:06:13 - pico-train - INFO - ==================================================
84
- 2025-08-28 21:06:13 - pico-train - INFO - ⛭ Runtime Summary:
85
- 2025-08-28 21:06:13 - pico-train - INFO - ==================================================
86
- 2025-08-28 21:06:13 - pico-train - INFO - Starting from step: 0
87
- 2025-08-28 21:06:13 - pico-train - INFO - Model Setup:
88
- 2025-08-28 21:06:13 - pico-train - INFO - └─ Total Parameters: 11,282,784
89
- 2025-08-28 21:06:13 - pico-train - INFO - └─ Trainable Parameters: 11,282,784
90
- 2025-08-28 21:06:13 - pico-train - INFO - Distributed Setup:
91
- 2025-08-28 21:06:13 - pico-train - INFO - └─ Number of Devices: 1
92
- 2025-08-28 21:06:13 - pico-train - INFO - └─ Device Type: NVIDIA GeForce RTX 5090
93
- 2025-08-28 21:06:13 - pico-train - INFO - └─ Available Memory: 33.68 GB
94
- 2025-08-28 21:06:13 - pico-train - INFO - Software Setup:
95
- 2025-08-28 21:06:13 - pico-train - INFO - └─ Python Version: 3.10.12
96
- 2025-08-28 21:06:13 - pico-train - INFO - └─ PyTorch Version: 2.8.0+cu128
97
- 2025-08-28 21:06:13 - pico-train - INFO - └─ CUDA Version: 12.8
98
- 2025-08-28 21:06:13 - pico-train - INFO - └─ Operating System: Linux 6.8.0-63-generic
99
- 2025-08-28 21:06:13 - pico-train - INFO - Batch Size Configuration:
100
- 2025-08-28 21:06:13 - pico-train - INFO - └─ Global Batch Size: 64
101
- 2025-08-28 21:06:13 - pico-train - INFO - └─ Per Device Batch Size: 1
102
- 2025-08-28 21:06:13 - pico-train - INFO - └─ Gradient Accumulation Steps: 64
103
- 2025-08-28 21:06:13 - pico-train - INFO - ==================================================
104
- 2025-08-28 21:06:22 - pico-train - INFO - Step 0 -- 🔄 Training Metrics
105
- 2025-08-28 21:06:22 - pico-train - INFO - ├── Loss: 10.9947
106
- 2025-08-28 21:06:22 - pico-train - INFO - ├── Learning Rate: 0.00e+00
107
- 2025-08-28 21:06:22 - pico-train - INFO - └── Inf/NaN count: 0
108
- 2025-08-28 21:06:22 - pico-train - INFO - Step 0 -- 📈 Saving Learning Dynamics
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pico-decoder-tiny-max-vram/logs/log_20250828_210719.log DELETED
@@ -1,108 +0,0 @@
1
- 2025-08-28 21:08:29 - pico-train - INFO - Step 0 -- 📊 Evaluation Results
2
- 2025-08-28 21:08:29 - pico-train - INFO - └── paloma: 59434.76580466028
3
- 2025-08-28 21:08:30 - pico-train - INFO - ==================================================
4
- 2025-08-28 21:08:30 - pico-train - INFO - ✨ Training Configuration
5
- 2025-08-28 21:08:30 - pico-train - INFO - ==================================================
6
- 2025-08-28 21:08:30 - pico-train - INFO - ╭─────────────────────────────────────────────────────╮
7
- 2025-08-28 21:08:30 - pico-train - INFO - │ checkpointing: │
8
- 2025-08-28 21:08:30 - pico-train - INFO - │ checkpoints_dir: checkpoints │
9
- 2025-08-28 21:08:30 - pico-train - INFO - │ evaluation: │
10
- 2025-08-28 21:08:30 - pico-train - INFO - │ eval_results_dir: eval_results │
11
- 2025-08-28 21:08:30 - pico-train - INFO - │ fabric_checkpoint_dir: fabric_state │
12
- 2025-08-28 21:08:30 - pico-train - INFO - │ fabric_checkpoint_filename: checkpoint.pt │
13
- 2025-08-28 21:08:30 - pico-train - INFO - │ hf_checkpoint: │
14
- 2025-08-28 21:08:30 - pico-train - INFO - │ collection_slug: null │
15
- 2025-08-28 21:08:30 - pico-train - INFO - │ repo_id: ThomasTheMaker/pico-decoder-tiny │
16
- 2025-08-28 21:08:30 - pico-train - INFO - │ learning_dynamics: │
17
- 2025-08-28 21:08:30 - pico-train - INFO - │ batch_size: 8 │
18
- 2025-08-28 21:08:30 - pico-train - INFO - │ eval_data: null │
19
- 2025-08-28 21:08:30 - pico-train - INFO - │ layer_suffixes: │
20
- 2025-08-28 21:08:30 - pico-train - INFO - │ - attention.v_proj │
21
- 2025-08-28 21:08:30 - pico-train - INFO - │ - attention.o_proj │
22
- 2025-08-28 21:08:30 - pico-train - INFO - │ - swiglu.w_2 │
23
- 2025-08-28 21:08:30 - pico-train - INFO - │ sequence_idx: -1 │
24
- 2025-08-28 21:08:30 - pico-train - INFO - │ learning_dynamics_dir: learning_dynamics │
25
- 2025-08-28 21:08:30 - pico-train - INFO - │ logs_dir: logs │
26
- 2025-08-28 21:08:30 - pico-train - INFO - │ run_name: pico-decoder-tiny-max-vram │
27
- 2025-08-28 21:08:30 - pico-train - INFO - │ runs_dir: runs │
28
- 2025-08-28 21:08:30 - pico-train - INFO - │ save_every_n_steps: 1000 │
29
- 2025-08-28 21:08:30 - pico-train - INFO - │ save_to_hf: true │
30
- 2025-08-28 21:08:30 - pico-train - INFO - │ training: │
31
- 2025-08-28 21:08:30 - pico-train - INFO - │ auto_resume: true │
32
- 2025-08-28 21:08:30 - pico-train - INFO - │ data: │
33
- 2025-08-28 21:08:30 - pico-train - INFO - │ dataloader: │
34
- 2025-08-28 21:08:30 - pico-train - INFO - │ batch_size: 64 │
35
- 2025-08-28 21:08:30 - pico-train - INFO - │ dataset: │
36
- 2025-08-28 21:08:30 - pico-train - INFO - │ name: pico-lm/pretokenized-dolma-tinsy │
37
- 2025-08-28 21:08:30 - pico-train - INFO - │ tokenizer: │
38
- 2025-08-28 21:08:30 - pico-train - INFO - │ name: allenai/OLMo-7B-0724-hf │
39
- 2025-08-28 21:08:30 - pico-train - INFO - │ vocab_size: 50304 │
40
- 2025-08-28 21:08:30 - pico-train - INFO - │ evaluation: │
41
- 2025-08-28 21:08:30 - pico-train - INFO - │ metrics: │
42
- 2025-08-28 21:08:30 - pico-train - INFO - │ - paloma │
43
- 2025-08-28 21:08:30 - pico-train - INFO - │ paloma: │
44
- 2025-08-28 21:08:30 - pico-train - INFO - │ batch_size: 2 │
45
- 2025-08-28 21:08:30 - pico-train - INFO - │ dataset_name: pico-lm/pretokenized-paloma-tinsy │
46
- 2025-08-28 21:08:30 - pico-train - INFO - │ dataset_split: val │
47
- 2025-08-28 21:08:30 - pico-train - INFO - │ max_length: 2048 │
48
- 2025-08-28 21:08:30 - pico-train - INFO - │ model: │
49
- 2025-08-28 21:08:30 - pico-train - INFO - │ activation_hidden_dim: 384 │
50
- 2025-08-28 21:08:30 - pico-train - INFO - │ attention_n_heads: 12 │
51
- 2025-08-28 21:08:30 - pico-train - INFO - │ attention_n_kv_heads: 4 │
52
- 2025-08-28 21:08:30 - pico-train - INFO - │ batch_size: 1024 │
53
- 2025-08-28 21:08:30 - pico-train - INFO - │ d_model: 96 │
54
- 2025-08-28 21:08:30 - pico-train - INFO - │ max_seq_len: 2048 │
55
- 2025-08-28 21:08:30 - pico-train - INFO - │ model_type: pico_decoder │
56
- 2025-08-28 21:08:30 - pico-train - INFO - │ n_layers: 12 │
57
- 2025-08-28 21:08:30 - pico-train - INFO - │ norm_eps: 1.0e-06 │
58
- 2025-08-28 21:08:30 - pico-train - INFO - │ position_emb_theta: 10000.0 │
59
- 2025-08-28 21:08:30 - pico-train - INFO - │ vocab_size: 50304 │
60
- 2025-08-28 21:08:30 - pico-train - INFO - │ monitoring: │
61
- 2025-08-28 21:08:30 - pico-train - INFO - │ logging: │
62
- 2025-08-28 21:08:30 - pico-train - INFO - │ log_every_n_steps: 100 │
63
- 2025-08-28 21:08:30 - pico-train - INFO - │ log_level: INFO │
64
- 2025-08-28 21:08:30 - pico-train - INFO - │ save_to_wandb: false │
65
- 2025-08-28 21:08:30 - pico-train - INFO - │ wandb: │
66
- 2025-08-28 21:08:30 - pico-train - INFO - │ entity: boymyc │
67
- 2025-08-28 21:08:30 - pico-train - INFO - │ project: pico-decoder-tiny │
68
- 2025-08-28 21:08:30 - pico-train - INFO - │ training: │
69
- 2025-08-28 21:08:30 - pico-train - INFO - │ fabric: │
70
- 2025-08-28 21:08:30 - pico-train - INFO - │ accelerator: cuda │
71
- 2025-08-28 21:08:30 - pico-train - INFO - │ num_devices: 1 │
72
- 2025-08-28 21:08:30 - pico-train - INFO - │ num_nodes: 1 │
73
- 2025-08-28 21:08:30 - pico-train - INFO - │ precision: 16-mixed │
74
- 2025-08-28 21:08:30 - pico-train - INFO - │ max_steps: 200000 │
75
- 2025-08-28 21:08:30 - pico-train - INFO - │ optimization: │
76
- 2025-08-28 21:08:30 - pico-train - INFO - │ gradient_accumulation_steps: 64 │
77
- 2025-08-28 21:08:30 - pico-train - INFO - │ lr: 0.0003 │
78
- 2025-08-28 21:08:30 - pico-train - INFO - │ lr_scheduler: linear_with_warmup │
79
- 2025-08-28 21:08:30 - pico-train - INFO - │ lr_warmup_steps: 2500 │
80
- 2025-08-28 21:08:30 - pico-train - INFO - │ optimizer: adamw │
81
- 2025-08-28 21:08:30 - pico-train - INFO - │ │
82
- 2025-08-28 21:08:30 - pico-train - INFO - ╰─────────────────────────────────────────────────────╯
83
- 2025-08-28 21:08:30 - pico-train - INFO - ==================================================
84
- 2025-08-28 21:08:30 - pico-train - INFO - ⛭ Runtime Summary:
85
- 2025-08-28 21:08:30 - pico-train - INFO - ==================================================
86
- 2025-08-28 21:08:30 - pico-train - INFO - Starting from step: 0
87
- 2025-08-28 21:08:30 - pico-train - INFO - Model Setup:
88
- 2025-08-28 21:08:30 - pico-train - INFO - └─ Total Parameters: 11,282,784
89
- 2025-08-28 21:08:30 - pico-train - INFO - └─ Trainable Parameters: 11,282,784
90
- 2025-08-28 21:08:30 - pico-train - INFO - Distributed Setup:
91
- 2025-08-28 21:08:30 - pico-train - INFO - └─ Number of Devices: 1
92
- 2025-08-28 21:08:30 - pico-train - INFO - └─ Device Type: NVIDIA GeForce RTX 5090
93
- 2025-08-28 21:08:30 - pico-train - INFO - └─ Available Memory: 33.68 GB
94
- 2025-08-28 21:08:30 - pico-train - INFO - Software Setup:
95
- 2025-08-28 21:08:30 - pico-train - INFO - └─ Python Version: 3.10.12
96
- 2025-08-28 21:08:30 - pico-train - INFO - └─ PyTorch Version: 2.8.0+cu128
97
- 2025-08-28 21:08:30 - pico-train - INFO - └─ CUDA Version: 12.8
98
- 2025-08-28 21:08:30 - pico-train - INFO - └─ Operating System: Linux 6.8.0-63-generic
99
- 2025-08-28 21:08:30 - pico-train - INFO - Batch Size Configuration:
100
- 2025-08-28 21:08:30 - pico-train - INFO - └─ Global Batch Size: 64
101
- 2025-08-28 21:08:30 - pico-train - INFO - └─ Per Device Batch Size: 1
102
- 2025-08-28 21:08:30 - pico-train - INFO - └─ Gradient Accumulation Steps: 64
103
- 2025-08-28 21:08:30 - pico-train - INFO - ==================================================
104
- 2025-08-28 21:08:39 - pico-train - INFO - Step 0 -- 🔄 Training Metrics
105
- 2025-08-28 21:08:39 - pico-train - INFO - ├── Loss: 10.9948
106
- 2025-08-28 21:08:39 - pico-train - INFO - ├── Learning Rate: 0.00e+00
107
- 2025-08-28 21:08:39 - pico-train - INFO - └── Inf/NaN count: 0
108
- 2025-08-28 21:08:39 - pico-train - INFO - Step 0 -- 📈 Saving Learning Dynamics
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pico-decoder-tiny-max-vram/logs/log_20250828_210922.log DELETED
@@ -1,113 +0,0 @@
1
- 2025-08-28 21:11:16 - pico-train - INFO - Step 0 -- 📊 Evaluation Results
2
- 2025-08-28 21:11:16 - pico-train - INFO - └── paloma: 59434.76600609756
3
- 2025-08-28 21:11:16 - pico-train - INFO - ==================================================
4
- 2025-08-28 21:11:16 - pico-train - INFO - ✨ Training Configuration
5
- 2025-08-28 21:11:16 - pico-train - INFO - ==================================================
6
- 2025-08-28 21:11:16 - pico-train - INFO - ╭─────────────────────────────────────────────────────╮
7
- 2025-08-28 21:11:16 - pico-train - INFO - │ checkpointing: │
8
- 2025-08-28 21:11:16 - pico-train - INFO - │ checkpoints_dir: checkpoints │
9
- 2025-08-28 21:11:16 - pico-train - INFO - │ evaluation: │
10
- 2025-08-28 21:11:16 - pico-train - INFO - │ eval_results_dir: eval_results │
11
- 2025-08-28 21:11:16 - pico-train - INFO - │ fabric_checkpoint_dir: fabric_state │
12
- 2025-08-28 21:11:16 - pico-train - INFO - │ fabric_checkpoint_filename: checkpoint.pt │
13
- 2025-08-28 21:11:16 - pico-train - INFO - │ hf_checkpoint: │
14
- 2025-08-28 21:11:16 - pico-train - INFO - │ collection_slug: null │
15
- 2025-08-28 21:11:16 - pico-train - INFO - │ repo_id: ThomasTheMaker/pico-decoder-tiny │
16
- 2025-08-28 21:11:16 - pico-train - INFO - │ learning_dynamics: │
17
- 2025-08-28 21:11:16 - pico-train - INFO - │ batch_size: 8 │
18
- 2025-08-28 21:11:16 - pico-train - INFO - │ eval_data: null │
19
- 2025-08-28 21:11:16 - pico-train - INFO - │ layer_suffixes: │
20
- 2025-08-28 21:11:16 - pico-train - INFO - │ - attention.v_proj │
21
- 2025-08-28 21:11:16 - pico-train - INFO - │ - attention.o_proj │
22
- 2025-08-28 21:11:16 - pico-train - INFO - │ - swiglu.w_2 │
23
- 2025-08-28 21:11:16 - pico-train - INFO - │ sequence_idx: -1 │
24
- 2025-08-28 21:11:16 - pico-train - INFO - │ learning_dynamics_dir: learning_dynamics │
25
- 2025-08-28 21:11:16 - pico-train - INFO - │ logs_dir: logs │
26
- 2025-08-28 21:11:16 - pico-train - INFO - │ run_name: pico-decoder-tiny-max-vram │
27
- 2025-08-28 21:11:16 - pico-train - INFO - │ runs_dir: runs │
28
- 2025-08-28 21:11:16 - pico-train - INFO - │ save_every_n_steps: 1000 │
29
- 2025-08-28 21:11:16 - pico-train - INFO - │ save_to_hf: true │
30
- 2025-08-28 21:11:16 - pico-train - INFO - │ training: │
31
- 2025-08-28 21:11:16 - pico-train - INFO - │ auto_resume: true │
32
- 2025-08-28 21:11:16 - pico-train - INFO - │ data: │
33
- 2025-08-28 21:11:16 - pico-train - INFO - │ dataloader: │
34
- 2025-08-28 21:11:16 - pico-train - INFO - │ batch_size: 64 │
35
- 2025-08-28 21:11:16 - pico-train - INFO - │ dataset: │
36
- 2025-08-28 21:11:16 - pico-train - INFO - │ name: pico-lm/pretokenized-dolma-tinsy │
37
- 2025-08-28 21:11:16 - pico-train - INFO - │ tokenizer: │
38
- 2025-08-28 21:11:16 - pico-train - INFO - │ name: allenai/OLMo-7B-0724-hf │
39
- 2025-08-28 21:11:16 - pico-train - INFO - │ vocab_size: 50304 │
40
- 2025-08-28 21:11:16 - pico-train - INFO - │ evaluation: │
41
- 2025-08-28 21:11:16 - pico-train - INFO - │ metrics: │
42
- 2025-08-28 21:11:16 - pico-train - INFO - │ - paloma │
43
- 2025-08-28 21:11:16 - pico-train - INFO - │ paloma: │
44
- 2025-08-28 21:11:16 - pico-train - INFO - │ batch_size: 2 │
45
- 2025-08-28 21:11:16 - pico-train - INFO - │ dataset_name: pico-lm/pretokenized-paloma-tinsy │
46
- 2025-08-28 21:11:16 - pico-train - INFO - │ dataset_split: val │
47
- 2025-08-28 21:11:16 - pico-train - INFO - │ max_length: 2048 │
48
- 2025-08-28 21:11:16 - pico-train - INFO - │ model: │
49
- 2025-08-28 21:11:16 - pico-train - INFO - │ activation_hidden_dim: 384 │
50
- 2025-08-28 21:11:16 - pico-train - INFO - │ attention_n_heads: 12 │
51
- 2025-08-28 21:11:16 - pico-train - INFO - │ attention_n_kv_heads: 4 │
52
- 2025-08-28 21:11:16 - pico-train - INFO - │ batch_size: 1024 │
53
- 2025-08-28 21:11:16 - pico-train - INFO - │ d_model: 96 │
54
- 2025-08-28 21:11:16 - pico-train - INFO - │ max_seq_len: 2048 │
55
- 2025-08-28 21:11:16 - pico-train - INFO - │ model_type: pico_decoder │
56
- 2025-08-28 21:11:16 - pico-train - INFO - │ n_layers: 12 │
57
- 2025-08-28 21:11:16 - pico-train - INFO - │ norm_eps: 1.0e-06 │
58
- 2025-08-28 21:11:16 - pico-train - INFO - │ position_emb_theta: 10000.0 │
59
- 2025-08-28 21:11:16 - pico-train - INFO - │ vocab_size: 50304 │
60
- 2025-08-28 21:11:16 - pico-train - INFO - │ monitoring: │
61
- 2025-08-28 21:11:16 - pico-train - INFO - │ logging: │
62
- 2025-08-28 21:11:16 - pico-train - INFO - │ log_every_n_steps: 100 │
63
- 2025-08-28 21:11:16 - pico-train - INFO - │ log_level: INFO │
64
- 2025-08-28 21:11:16 - pico-train - INFO - │ save_to_wandb: false │
65
- 2025-08-28 21:11:16 - pico-train - INFO - │ wandb: │
66
- 2025-08-28 21:11:16 - pico-train - INFO - │ entity: boymyc │
67
- 2025-08-28 21:11:16 - pico-train - INFO - │ project: pico-decoder-tiny │
68
- 2025-08-28 21:11:16 - pico-train - INFO - │ training: │
69
- 2025-08-28 21:11:16 - pico-train - INFO - │ fabric: │
70
- 2025-08-28 21:11:16 - pico-train - INFO - │ accelerator: cuda │
71
- 2025-08-28 21:11:16 - pico-train - INFO - │ num_devices: 1 │
72
- 2025-08-28 21:11:16 - pico-train - INFO - │ num_nodes: 1 │
73
- 2025-08-28 21:11:16 - pico-train - INFO - │ precision: 16-mixed │
74
- 2025-08-28 21:11:16 - pico-train - INFO - │ max_steps: 200000 │
75
- 2025-08-28 21:11:16 - pico-train - INFO - │ optimization: │
76
- 2025-08-28 21:11:16 - pico-train - INFO - │ gradient_accumulation_steps: 64 │
77
- 2025-08-28 21:11:16 - pico-train - INFO - │ lr: 0.0003 │
78
- 2025-08-28 21:11:16 - pico-train - INFO - │ lr_scheduler: linear_with_warmup │
79
- 2025-08-28 21:11:16 - pico-train - INFO - │ lr_warmup_steps: 2500 │
80
- 2025-08-28 21:11:16 - pico-train - INFO - │ optimizer: adamw │
81
- 2025-08-28 21:11:16 - pico-train - INFO - │ │
82
- 2025-08-28 21:11:16 - pico-train - INFO - ╰─────────────────────────────────────────────────────╯
83
- 2025-08-28 21:11:16 - pico-train - INFO - ==================================================
84
- 2025-08-28 21:11:16 - pico-train - INFO - ⛭ Runtime Summary:
85
- 2025-08-28 21:11:16 - pico-train - INFO - ==================================================
86
- 2025-08-28 21:11:16 - pico-train - INFO - Starting from step: 0
87
- 2025-08-28 21:11:16 - pico-train - INFO - Model Setup:
88
- 2025-08-28 21:11:16 - pico-train - INFO - └─ Total Parameters: 11,282,784
89
- 2025-08-28 21:11:16 - pico-train - INFO - └─ Trainable Parameters: 11,282,784
90
- 2025-08-28 21:11:16 - pico-train - INFO - Distributed Setup:
91
- 2025-08-28 21:11:16 - pico-train - INFO - └─ Number of Devices: 1
92
- 2025-08-28 21:11:16 - pico-train - INFO - └─ Device Type: NVIDIA GeForce RTX 5090
93
- 2025-08-28 21:11:16 - pico-train - INFO - └─ Available Memory: 33.68 GB
94
- 2025-08-28 21:11:16 - pico-train - INFO - Software Setup:
95
- 2025-08-28 21:11:16 - pico-train - INFO - └─ Python Version: 3.10.12
96
- 2025-08-28 21:11:16 - pico-train - INFO - └─ PyTorch Version: 2.8.0+cu128
97
- 2025-08-28 21:11:16 - pico-train - INFO - └─ CUDA Version: 12.8
98
- 2025-08-28 21:11:16 - pico-train - INFO - └─ Operating System: Linux 6.8.0-63-generic
99
- 2025-08-28 21:11:16 - pico-train - INFO - Batch Size Configuration:
100
- 2025-08-28 21:11:16 - pico-train - INFO - └─ Global Batch Size: 256
101
- 2025-08-28 21:11:16 - pico-train - INFO - └─ Per Device Batch Size: 1
102
- 2025-08-28 21:11:16 - pico-train - INFO - └─ Gradient Accumulation Steps: 256
103
- 2025-08-28 21:11:16 - pico-train - INFO - ==================================================
104
- 2025-08-28 21:11:49 - pico-train - INFO - Step 0 -- 🔄 Training Metrics
105
- 2025-08-28 21:11:49 - pico-train - INFO - ├── Loss: 10.9914
106
- 2025-08-28 21:11:49 - pico-train - INFO - ├── Learning Rate: 0.00e+00
107
- 2025-08-28 21:11:49 - pico-train - INFO - └── Inf/NaN count: 0
108
- 2025-08-28 21:11:49 - pico-train - INFO - Step 0 -- 📈 Saving Learning Dynamics
109
- 2025-08-28 21:26:36 - pico-train - INFO - Step 27 -- 💾 Saving Final Checkpoint
110
- 2025-08-28 21:28:36 - pico-train - INFO - Step 27 -- 📊 Evaluation Results
111
- 2025-08-28 21:28:36 - pico-train - INFO - └── paloma: 59120.39268292683
112
- 2025-08-28 21:28:37 - pico-train - INFO - 🎉 Training complete! Final step: 27
113
- 2025-08-28 21:28:37 - pico-train - WARNING - Note: Training stopped before max steps (200000)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pico-decoder-tiny-max-vram/training_config.yaml DELETED
@@ -1,74 +0,0 @@
1
- checkpointing:
2
- checkpoints_dir: checkpoints
3
- evaluation:
4
- eval_results_dir: eval_results
5
- fabric_checkpoint_dir: fabric_state
6
- fabric_checkpoint_filename: checkpoint.pt
7
- hf_checkpoint:
8
- collection_slug: null
9
- repo_id: ThomasTheMaker/pico-decoder-tiny
10
- learning_dynamics:
11
- batch_size: 8
12
- eval_data: null
13
- layer_suffixes:
14
- - attention.v_proj
15
- - attention.o_proj
16
- - swiglu.w_2
17
- sequence_idx: -1
18
- learning_dynamics_dir: learning_dynamics
19
- logs_dir: logs
20
- run_name: pico-decoder-tiny-max-vram
21
- runs_dir: runs
22
- save_every_n_steps: 1000
23
- save_to_hf: true
24
- training:
25
- auto_resume: true
26
- data:
27
- dataloader:
28
- batch_size: 64
29
- dataset:
30
- name: pico-lm/pretokenized-dolma-tinsy
31
- tokenizer:
32
- name: allenai/OLMo-7B-0724-hf
33
- vocab_size: 50304
34
- evaluation:
35
- metrics:
36
- - paloma
37
- paloma:
38
- batch_size: 2
39
- dataset_name: pico-lm/pretokenized-paloma-tinsy
40
- dataset_split: val
41
- max_length: 2048
42
- model:
43
- activation_hidden_dim: 384
44
- attention_n_heads: 12
45
- attention_n_kv_heads: 4
46
- batch_size: 1024
47
- d_model: 96
48
- max_seq_len: 2048
49
- model_type: pico_decoder
50
- n_layers: 12
51
- norm_eps: 1.0e-06
52
- position_emb_theta: 10000.0
53
- vocab_size: 50304
54
- monitoring:
55
- logging:
56
- log_every_n_steps: 100
57
- log_level: INFO
58
- save_to_wandb: false
59
- wandb:
60
- entity: boymyc
61
- project: pico-decoder-tiny
62
- training:
63
- fabric:
64
- accelerator: cuda
65
- num_devices: 1
66
- num_nodes: 1
67
- precision: 16-mixed
68
- max_steps: 200000
69
- optimization:
70
- gradient_accumulation_steps: 64
71
- lr: 0.0003
72
- lr_scheduler: linear_with_warmup
73
- lr_warmup_steps: 2500
74
- optimizer: adamw