Text Generation
Transformers
Safetensors
English
gpt-sdprelu
causal-lm
gpt2
tinystories
llm-kittens
thunderkittens
bf16
cuda
sd-prelu
custom-activation
custom_code
Instructions to use adamroberts/tinystories-5090-sdprelu with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use adamroberts/tinystories-5090-sdprelu with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="adamroberts/tinystories-5090-sdprelu", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("adamroberts/tinystories-5090-sdprelu", trust_remote_code=True, dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use adamroberts/tinystories-5090-sdprelu with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "adamroberts/tinystories-5090-sdprelu" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "adamroberts/tinystories-5090-sdprelu", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'Use Docker
docker model run hf.co/adamroberts/tinystories-5090-sdprelu
- SGLang
How to use adamroberts/tinystories-5090-sdprelu with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "adamroberts/tinystories-5090-sdprelu" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "adamroberts/tinystories-5090-sdprelu", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "adamroberts/tinystories-5090-sdprelu" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "adamroberts/tinystories-5090-sdprelu", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }' - Docker Model Runner
How to use adamroberts/tinystories-5090-sdprelu with Docker Model Runner:
docker model run hf.co/adamroberts/tinystories-5090-sdprelu
Upload folder using huggingface_hub
Browse files- .gitattributes +0 -34
- README.md +185 -0
- config.json +41 -0
- generation_config.json +12 -0
- merges.txt +0 -0
- model.safetensors +3 -0
- modeling_gpt_sdprelu.py +102 -0
- tokenizer.json +0 -0
- tokenizer_config.json +13 -0
- vocab.json +0 -0
.gitattributes
CHANGED
|
@@ -1,35 +1 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
language:
|
| 4 |
+
- en
|
| 5 |
+
library_name: transformers
|
| 6 |
+
tags:
|
| 7 |
+
- text-generation
|
| 8 |
+
- causal-lm
|
| 9 |
+
- gpt2
|
| 10 |
+
- tinystories
|
| 11 |
+
- llm-kittens
|
| 12 |
+
- thunderkittens
|
| 13 |
+
- bf16
|
| 14 |
+
- cuda
|
| 15 |
+
- sd-prelu
|
| 16 |
+
- custom-activation
|
| 17 |
+
- custom_code
|
| 18 |
+
datasets:
|
| 19 |
+
- roneneldan/TinyStories
|
| 20 |
+
---
|
| 21 |
+
|
| 22 |
+
# llm.kittens TinyStories 124M BF16 — SD-PReLU
|
| 23 |
+
|
| 24 |
+
This is a 124M-parameter GPT-2-style causal language model trained from scratch on TinyStories with the `llm.kittens` C++/CUDA trainer, a fork of Karpathy's llm.c that adds SM120-specific and multi-stack kernel optimisations.
|
| 25 |
+
|
| 26 |
+
Unlike the GELU baseline ([`tinystories-5090`](https://huggingface.co/adamroberts/tinystories-5090)), this checkpoint replaces the MLP's GELU nonlinearity with a learnable **SD-PReLU** activation (the llm.kittens `-af sd-prelu` activation). Because SD-PReLU is not part of stock Transformers or llama.cpp, the repo ships a small custom module (`modeling_gpt_sdprelu.py`) and must be loaded with `trust_remote_code=True`. See [Activation: SD-PReLU](#activation-sd-prelu) below.
|
| 27 |
+
|
| 28 |
+
The model is published as a Hugging Face Transformers checkpoint with BF16 `safetensors` weights plus custom modeling code.
|
| 29 |
+
It was trained on a single RTX 5090 in roughly 14 hours (2595.7 ms average iteration over 20,000 steps).
|
| 30 |
+
|
| 31 |
+
## Result
|
| 32 |
+
|
| 33 |
+
- Model weights: `model.safetensors`
|
| 34 |
+
- Training step: `20000 / 20000`
|
| 35 |
+
- Final train loss: `0.781594`
|
| 36 |
+
- Final validation loss: `0.870315`
|
| 37 |
+
- Final throughput: `198871 tokens/s`
|
| 38 |
+
- Final step time: `2642.72 ms`
|
| 39 |
+
- Final reported BF16 MFU: `38.0%`
|
| 40 |
+
- Average iteration time: `2595.669013 ms`
|
| 41 |
+
- Safetensors size: `248,896,984` bytes
|
| 42 |
+
- Parameter count: `124,475,904` base + `24` learnable SD-PReLU scalars (`theta_a`/`theta_b`, 2 per layer × 12 layers)
|
| 43 |
+
|
| 44 |
+
For reference, the GELU baseline reached `0.785740` train / `0.875080` validation loss on the same setup; SD-PReLU lands marginally lower on both (`0.781594` / `0.870315`) at near-identical throughput, with the activation adding only 24 scalar parameters.
|
| 45 |
+
|
| 46 |
+
The TinyStories paper reports eval losses of `1.33` to `1.58` for the 768-hidden-size 1- and 2-layer attention-head ablations in Figure 24. This run's `0.870315` validation loss is lower, but the comparison is not apples-to-apples: this model is a 12-layer GPT-2-style model using GPT-2 tokenization, a 1024-token context, and a different implementation/training setup.
|
| 47 |
+
|
| 48 |
+
## Activation: SD-PReLU
|
| 49 |
+
|
| 50 |
+
This is the key difference from the GELU baseline. The MLP's GELU nonlinearity (applied between the `c_fc` up-projection and the `c_proj` down-projection) is replaced by **SD-PReLU**, a self-gated, damped PReLU. Each transformer block learns two scalars (`theta_a`, `theta_b`) that are mapped through a bounded reparameterization into `a` and `b`:
|
| 51 |
+
|
| 52 |
+
```text
|
| 53 |
+
a = alpha_max * sigmoid(theta_a) # in (0, alpha_max), alpha_max = 0.30
|
| 54 |
+
b = beta_min + softplus(theta_b) # in (beta_min, inf), beta_min = 0.50
|
| 55 |
+
phi(x) = x * (a + (1 - a) * sigmoid(b * x))
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
- `a` is a learnable leak/floor (PReLU-like): the gate output never drops below `a`, so negative inputs are not fully zeroed.
|
| 59 |
+
- `b` controls the sharpness of the sigmoid gate. In the limit `a → 0`, `phi` reduces to a Swish/SiLU-style `x * sigmoid(b * x)`.
|
| 60 |
+
- For numerical parity with the CUDA kernel, the activation is computed in float32 and the arguments of both sigmoids (the one that produces `a` and the gate itself) are clamped to `[-20, 20]`; inputs and outputs stay in the surrounding model dtype (BF16).
|
| 61 |
+
|
| 62 |
+
The activation is configured via two config fields, `sdprelu_alpha_max` (`0.30`) and `sdprelu_beta_min` (`0.50`). Only 24 extra scalar parameters are introduced over the GELU baseline, so parameter count and on-disk size are essentially unchanged.
|
| 63 |
+
|
| 64 |
+
Note: the config still carries `"activation_function": "gelu_new"` for GPT-2 compatibility, but it is inert — the custom MLP overrides the activation with SD-PReLU regardless of that field.
|
| 65 |
+
|
| 66 |
+
## Custom activation inference code
|
| 67 |
+
|
| 68 |
+
The repo ships `modeling_gpt_sdprelu.py`, which defines:
|
| 69 |
+
|
| 70 |
+
- `GPTSDPReLUConfig` (`model_type = "gpt-sdprelu"`), a `GPT2Config` with the two extra `sdprelu_*` fields.
|
| 71 |
+
- `GPTSDPReLUMLP`, which reuses GPT-2's `c_fc` / `c_proj` / dropout submodules (so weight names stay `mlp.c_fc.*` / `mlp.c_proj.*`) and swaps GELU for SD-PReLU, adding `theta_a` / `theta_b` per layer.
|
| 72 |
+
- `GPTSDPReLULMHeadModel`, a `GPT2LMHeadModel` that installs the SD-PReLU MLP into every block.
|
| 73 |
+
|
| 74 |
+
These are wired into `config.json` via `auto_map`, so `trust_remote_code=True` is required to load the model. Everything else (attention, layernorms, embeddings, tied head, tokenizer) is standard GPT-2.
|
| 75 |
+
|
| 76 |
+
## Architecture
|
| 77 |
+
|
| 78 |
+
- Family: GPT-2-style decoder-only Transformer
|
| 79 |
+
- Descriptor: `d12`
|
| 80 |
+
- Layers: `12`
|
| 81 |
+
- Attention heads: `12`
|
| 82 |
+
- Hidden size: `768`
|
| 83 |
+
- Context length: `1024`
|
| 84 |
+
- Vocabulary size: `50,257`
|
| 85 |
+
- MLP activation: **SD-PReLU** (learnable, per-layer) — replaces GELU
|
| 86 |
+
- `model_type`: `gpt-sdprelu` (custom code, `trust_remote_code=True`)
|
| 87 |
+
- Precision: BF16 weights
|
| 88 |
+
|
| 89 |
+
## Training
|
| 90 |
+
|
| 91 |
+
The run used the TinyStories GPT-2 dataset files generated by `dev/data/tinystories.py` in `llm.kittens`. The only change from the GELU baseline is the `-af sd-prelu` activation flag.
|
| 92 |
+
|
| 93 |
+
```bash
|
| 94 |
+
./train_gpt2cu \
|
| 95 |
+
-i "dev/data/tinystories/TinyStories_train.bin" \
|
| 96 |
+
-j "dev/data/tinystories/TinyStories_val.bin" \
|
| 97 |
+
-o "log124M/5090_S" \
|
| 98 |
+
-v 250 -s 20000 -g 144 \
|
| 99 |
+
-h 0 \
|
| 100 |
+
-b 64 -t 1024 -d 524288 \
|
| 101 |
+
-r 0 \
|
| 102 |
+
-z 1 \
|
| 103 |
+
-c 0.1 \
|
| 104 |
+
-l 0.0006 -q 0.0 -u 700 -n 5000 \
|
| 105 |
+
-y 0 \
|
| 106 |
+
-e "d12" \
|
| 107 |
+
-af sd-prelu \
|
| 108 |
+
-x 20000
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
Key settings:
|
| 112 |
+
|
| 113 |
+
- Hardware target: RTX 5090 / SM120
|
| 114 |
+
- MLP activation: `sd-prelu` (`-af sd-prelu`)
|
| 115 |
+
- Micro batch: `64`
|
| 116 |
+
- Sequence length: `1024`
|
| 117 |
+
- Total desired batch size: `524,288` tokens
|
| 118 |
+
- Max steps: `20,000`
|
| 119 |
+
- Optimizer: AdamW as implemented in `llm.kittens`
|
| 120 |
+
- Peak learning rate: `6e-4`
|
| 121 |
+
- Scheduler: cosine
|
| 122 |
+
- Warmup: `700` steps
|
| 123 |
+
- Final LR fraction: `0.0`
|
| 124 |
+
- Weight decay: `0.1`
|
| 125 |
+
- Recompute: off
|
| 126 |
+
- ZeRO stage: `1`
|
| 127 |
+
- Checkpoint interval: `5000` steps
|
| 128 |
+
|
| 129 |
+
## Sample
|
| 130 |
+
|
| 131 |
+
Prompt/sample emitted at the final checkpoint (step 20000):
|
| 132 |
+
|
| 133 |
+
```text
|
| 134 |
+
Once upon a time, there was a little boy named Timmy. Timmy loved his soft couch and he kept it in his room. One day, Timmy's mom told him to clean his room. Timmy didn't want to clean his room because he loved playing with his toys. But his mom said he couldn't play with his toys until he cleaned his room.
|
| 135 |
+
Timmy felt sad and he started to hate cleaning his room. Then, he had an idea. He asked his mom if he could put away his toys in a different way and make a game out of it. His mom thought it was a good idea too and they both started to work together.
|
| 136 |
+
In the end, Tim
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
## Files
|
| 140 |
+
|
| 141 |
+
- `model.safetensors`: BF16 Transformers weights (including the per-layer `theta_a` / `theta_b` SD-PReLU scalars).
|
| 142 |
+
- `modeling_gpt_sdprelu.py`: custom SD-PReLU model/config code (required, loaded via `trust_remote_code=True`).
|
| 143 |
+
- `config.json`: model configuration, including `sdprelu_alpha_max` / `sdprelu_beta_min` and the `auto_map` wiring.
|
| 144 |
+
- `generation_config.json`: default generation settings.
|
| 145 |
+
- `tokenizer.json`: GPT-2 tokenizer.
|
| 146 |
+
- `vocab.json` and `merges.txt`: GPT-2 BPE vocabulary files.
|
| 147 |
+
|
| 148 |
+
## Loading
|
| 149 |
+
|
| 150 |
+
Because the SD-PReLU activation lives in custom code, you must pass `trust_remote_code=True`:
|
| 151 |
+
|
| 152 |
+
```python
|
| 153 |
+
import torch
|
| 154 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 155 |
+
|
| 156 |
+
model_id = "adamroberts/tinystories-5090-sdprelu"
|
| 157 |
+
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
| 158 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 159 |
+
model_id, trust_remote_code=True, dtype=torch.bfloat16
|
| 160 |
+
)
|
| 161 |
+
|
| 162 |
+
inputs = tokenizer("Once upon a time", return_tensors="pt")
|
| 163 |
+
with torch.inference_mode():
|
| 164 |
+
    outputs = model.generate(**inputs, max_new_tokens=80, do_sample=True, temperature=0.8)
|
| 165 |
+
|
| 166 |
+
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
| 167 |
+
```
|
| 168 |
+
|
| 169 |
+
## GGUF / llama.cpp
|
| 170 |
+
|
| 171 |
+
GGUF export is **not available** for this checkpoint. The SD-PReLU activation is a custom, learnable nonlinearity with no equivalent in llama.cpp's GPT-2 graph, so the model cannot be quantized to GGUF or run with `llama.cpp` / LM Studio without implementing the activation there. Use the Transformers loading path above instead. The GELU baseline ([`tinystories-5090`](https://huggingface.co/adamroberts/tinystories-5090)) is available if you need a llama.cpp-compatible variant.
|
| 172 |
+
|
| 173 |
+
Recommended sampling settings (Transformers `generate`):
|
| 174 |
+
|
| 175 |
+
- Temperature: `0.8`
|
| 176 |
+
- Top-p: `0.95`
|
| 177 |
+
- Top-k: `50`
|
| 178 |
+
- Repetition penalty: `1.05`
|
| 179 |
+
- Stop/EOS token: `<|endoftext|>` / token id `50256`
|
| 180 |
+
|
| 181 |
+
This is a completion model, not a chat/instruction model: prompt it with the start of a story and always set a finite `max_new_tokens`, since it was trained for continuation and may not emit `<|endoftext|>` during normal generation.
|
| 182 |
+
|
| 183 |
+
Source implementation: `https://github.com/adamdroberts/llm.kittens`
|
| 184 |
+
|
| 185 |
+
TinyStories reference paper: `https://arxiv.org/abs/2305.07759`
|
config.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"activation_function": "gelu_new",
|
| 3 |
+
"add_cross_attention": false,
|
| 4 |
+
"architectures": [
|
| 5 |
+
"GPTSDPReLULMHeadModel"
|
| 6 |
+
],
|
| 7 |
+
"attn_pdrop": 0.0,
|
| 8 |
+
"auto_map": {
|
| 9 |
+
"AutoConfig": "modeling_gpt_sdprelu.GPTSDPReLUConfig",
|
| 10 |
+
"AutoModelForCausalLM": "modeling_gpt_sdprelu.GPTSDPReLULMHeadModel"
|
| 11 |
+
},
|
| 12 |
+
"bos_token_id": 50256,
|
| 13 |
+
"dtype": "bfloat16",
|
| 14 |
+
"embd_pdrop": 0.0,
|
| 15 |
+
"eos_token_id": 50256,
|
| 16 |
+
"initializer_range": 0.02,
|
| 17 |
+
"layer_norm_epsilon": 1e-05,
|
| 18 |
+
"model_type": "gpt-sdprelu",
|
| 19 |
+
"n_ctx": 1024,
|
| 20 |
+
"n_embd": 768,
|
| 21 |
+
"n_head": 12,
|
| 22 |
+
"n_inner": null,
|
| 23 |
+
"n_layer": 12,
|
| 24 |
+
"n_positions": 1024,
|
| 25 |
+
"pad_token_id": 50256,
|
| 26 |
+
"reorder_and_upcast_attn": false,
|
| 27 |
+
"resid_pdrop": 0.0,
|
| 28 |
+
"scale_attn_by_inverse_layer_idx": false,
|
| 29 |
+
"scale_attn_weights": true,
|
| 30 |
+
"sdprelu_alpha_max": 0.3,
|
| 31 |
+
"sdprelu_beta_min": 0.5,
|
| 32 |
+
"summary_activation": null,
|
| 33 |
+
"summary_first_dropout": 0.1,
|
| 34 |
+
"summary_proj_to_labels": true,
|
| 35 |
+
"summary_type": "cls_index",
|
| 36 |
+
"summary_use_proj": true,
|
| 37 |
+
"tie_word_embeddings": true,
|
| 38 |
+
"transformers_version": "5.9.0",
|
| 39 |
+
"use_cache": true,
|
| 40 |
+
"vocab_size": 50257
|
| 41 |
+
}
|
generation_config.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 50256,
|
| 3 |
+
"do_sample": true,
|
| 4 |
+
"eos_token_id": 50256,
|
| 5 |
+
"max_new_tokens": 144,
|
| 6 |
+
"pad_token_id": 50256,
|
| 7 |
+
"repetition_penalty": 1.05,
|
| 8 |
+
"temperature": 0.8,
|
| 9 |
+
"top_k": 50,
|
| 10 |
+
"top_p": 0.95,
|
| 11 |
+
"transformers_version": "5.9.0"
|
| 12 |
+
}
|
merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bd518a7bbe8bd5bffdf06f9bd5c3dabe97dd8dff19af18a39e707b53f00d3d9f
|
| 3 |
+
size 248896984
|
modeling_gpt_sdprelu.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""A GPT model with the SD-PReLU (DTSG-PReLU) MLP activation, for HF Transformers.
|
| 2 |
+
|
| 3 |
+
The backbone is GPT-2's (attention, layernorms, embeddings, tied head), but the
|
| 4 |
+
MLP activation is no longer GELU, so these classes drop the "GPT2" name. This
|
| 5 |
+
mirrors the llm.kittens `-af sd-prelu` activation exactly: per layer there are two
|
| 6 |
+
learnable scalars (theta_a, theta_b), materialized through a bounded
|
| 7 |
+
reparameterization and applied elementwise between the up- and down-projections:
|
| 8 |
+
|
| 9 |
+
a = alpha_max * sigmoid(theta_a) # in (0, alpha_max)
|
| 10 |
+
b = beta_min + softplus(theta_b) # in (beta_min, inf)
|
| 11 |
+
phi(x) = x * (a + (1 - a) * sigmoid(b * x))
|
| 12 |
+
|
| 13 |
+
To mirror the CUDA kernel's numerics, the sigmoid that materializes `a` and the gate
|
| 14 |
+
sigmoid clamp their argument to [-20, 20] and the activation is computed in float32
|
| 15 |
+
(inputs/outputs stay in the surrounding model dtype, typically bfloat16).
|
| 16 |
+
|
| 17 |
+
Load with:
|
| 18 |
+
|
| 19 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 20 |
+
model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True)
|
| 21 |
+
tok = AutoTokenizer.from_pretrained(path)
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
from __future__ import annotations
|
| 25 |
+
|
| 26 |
+
import torch
|
| 27 |
+
import torch.nn as nn
|
| 28 |
+
import torch.nn.functional as F
|
| 29 |
+
from transformers import GPT2Config, GPT2LMHeadModel
|
| 30 |
+
from transformers.models.gpt2.modeling_gpt2 import GPT2MLP
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def sdprelu(
    x: torch.Tensor,
    theta_a: torch.Tensor,
    theta_b: torch.Tensor,
    alpha_max: float,
    beta_min: float,
) -> torch.Tensor:
    """Apply the SD-PReLU activation elementwise.

    Evaluates ``x * (a + (1 - a) * sigmoid(b * x))`` with

        a = alpha_max * sigmoid(theta_a)
        b = beta_min + softplus(theta_b)

    Args:
        x: Input tensor; its dtype is restored on the returned tensor.
        theta_a: Raw (unbounded) parameter that is mapped to the leak ``a``.
        theta_b: Raw (unbounded) parameter that is mapped to the gate
            sharpness ``b``.
        alpha_max: Scale applied to ``sigmoid(theta_a)``.
        beta_min: Offset added to ``softplus(theta_b)``.

    Returns:
        The activated tensor, cast back to the dtype ``x`` arrived in.
    """
    input_dtype = x.dtype

    # All intermediate math runs in float32 regardless of the input dtype.
    x32 = x.to(torch.float32)

    # Leak coefficient: the sigmoid argument is clamped to [-20, 20].
    leak_logit = torch.clamp(theta_a.to(torch.float32), min=-20.0, max=20.0)
    leak = alpha_max * torch.sigmoid(leak_logit)

    # Gate sharpness: softplus output shifted up by beta_min.
    sharpness = beta_min + F.softplus(theta_b.to(torch.float32))

    # Gate: the scaled input is clamped to [-20, 20] before the sigmoid.
    gate_logit = torch.clamp(sharpness * x32, min=-20.0, max=20.0)
    gate = torch.sigmoid(gate_logit)

    # Blend the constant leak with the gate, then scale the input by it.
    blended = leak + (1.0 - leak) * gate
    return (x32 * blended).to(input_dtype)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class GPTSDPReLUConfig(GPT2Config):
    """``GPT2Config`` extended with the two SD-PReLU hyperparameters.

    The extra fields are stored as plain attributes on the config:

    * ``sdprelu_alpha_max`` -- scale applied to ``sigmoid(theta_a)`` when the
      leak coefficient is materialized.
    * ``sdprelu_beta_min`` -- offset added to ``softplus(theta_b)`` when the
      gate sharpness is materialized.
    """

    model_type = "gpt-sdprelu"

    def __init__(
        self,
        sdprelu_alpha_max: float = 0.30,
        sdprelu_beta_min: float = 0.50,
        **kwargs,
    ):
        """Store the SD-PReLU fields, then forward ``kwargs`` to ``GPT2Config``."""
        # The SD-PReLU attributes are assigned before the parent initializer
        # runs; every remaining keyword is handed to GPT2Config untouched.
        self.sdprelu_alpha_max = sdprelu_alpha_max
        self.sdprelu_beta_min = sdprelu_beta_min
        super().__init__(**kwargs)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class GPTSDPReLUMLP(nn.Module):
    """Feed-forward block that applies SD-PReLU between ``c_fc`` and ``c_proj``.

    The ``c_fc``, ``c_proj`` and ``dropout`` submodules are taken from the
    ``base_mlp`` passed in (they are shared, not copied) and re-registered
    under the same attribute names, so their weights keep the
    ``mlp.c_fc.*`` / ``mlp.c_proj.*`` naming. Two one-element parameters,
    ``theta_a`` and ``theta_b``, are added on top; both start at zero.
    """

    def __init__(self, base_mlp: GPT2MLP, config: GPTSDPReLUConfig):
        super().__init__()

        # Adopt the existing projection and dropout modules as-is.
        self.c_fc = base_mlp.c_fc
        self.c_proj = base_mlp.c_proj
        self.dropout = base_mlp.dropout

        # Fixed bounds of the reparameterization, read once from the config.
        self.alpha_max = float(config.sdprelu_alpha_max)
        self.beta_min = float(config.sdprelu_beta_min)

        # Raw learnable scalars consumed by `sdprelu`.
        self.theta_a = nn.Parameter(torch.zeros(1))
        self.theta_b = nn.Parameter(torch.zeros(1))

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Run up-projection, SD-PReLU, down-projection and dropout in order."""
        projected = self.c_fc(hidden_states)
        activated = sdprelu(
            projected,
            theta_a=self.theta_a,
            theta_b=self.theta_b,
            alpha_max=self.alpha_max,
            beta_min=self.beta_min,
        )
        return self.dropout(self.c_proj(activated))
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
class GPTSDPReLULMHeadModel(GPT2LMHeadModel):
    """``GPT2LMHeadModel`` whose per-block MLP is replaced by ``GPTSDPReLUMLP``."""

    config_class = GPTSDPReLUConfig

    def __init__(self, config: GPTSDPReLUConfig):
        """Build the parent model, then swap the MLP in every block.

        Args:
            config: Configuration carrying the ``sdprelu_*`` fields read by
                ``GPTSDPReLUMLP``.
        """
        super().__init__(config)

        # Wrap each block's existing MLP so its c_fc / c_proj / dropout
        # submodules are reused by the SD-PReLU replacement.
        for layer in self.transformer.h:
            stock_mlp = layer.mlp
            layer.mlp = GPTSDPReLUMLP(stock_mlp, config)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# Register both custom classes with the Transformers Auto* machinery so that
# they can be resolved from this module when loading with trust_remote_code.
GPTSDPReLUConfig.register_for_auto_class()
GPTSDPReLULMHeadModel.register_for_auto_class(auto_class="AutoModelForCausalLM")
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": "<|endoftext|>",
|
| 5 |
+
"eos_token": "<|endoftext|>",
|
| 6 |
+
"errors": "replace",
|
| 7 |
+
"is_local": false,
|
| 8 |
+
"local_files_only": false,
|
| 9 |
+
"model_max_length": 1024,
|
| 10 |
+
"pad_token": "<|endoftext|>",
|
| 11 |
+
"tokenizer_class": "GPT2Tokenizer",
|
| 12 |
+
"unk_token": "<|endoftext|>"
|
| 13 |
+
}
|
vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|