Upload README.md with huggingface_hub
Browse files
README.md
CHANGED
|
@@ -8,11 +8,15 @@ tags:
|
|
| 8 |
- pytorch
|
| 9 |
- from-scratch
|
| 10 |
- foundation-model
|
|
|
|
|
|
|
|
|
|
| 11 |
datasets:
|
| 12 |
- allenai/c4
|
| 13 |
pipeline_tag: text-generation
|
|
|
|
| 14 |
model-index:
|
| 15 |
-
- name:
|
| 16 |
results:
|
| 17 |
- task:
|
| 18 |
type: text-generation
|
|
@@ -24,59 +28,373 @@ model-index:
|
|
| 24 |
- type: perplexity
|
| 25 |
value: 29.19
|
| 26 |
name: Perplexity
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
---
|
| 28 |
|
| 29 |
-
|
| 30 |
|
| 31 |
-
|
|
|
|
| 32 |
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
| Metric | Value |
|
| 36 |
-
|--------|-----
|
| 37 |
-
| **Architecture** | Decoder-only Transformer (GPT-style) |
|
| 38 |
-
| **Parameters** | 109.53M (84.95M non-embedding) |
|
| 39 |
-
| **Layers** | 12 |
|
| 40 |
-
| **Dimensions** | 768 |
|
| 41 |
-
| **Attention Heads** | 12 |
|
| 42 |
-
| **Context Window** | 1,024 tokens |
|
| 43 |
-
| **Vocabulary** | 32,000 BPE |
|
| 44 |
-
| **Training Data** | C4 (2B tokens, streamed) |
|
| 45 |
-
| **Training Hardware** | NVIDIA H100 NVL (96 GB) |
|
| 46 |
-
| **Training Time** | ~2 hours |
|
| 47 |
| **Val Perplexity** | **29.19** |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
-
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
## Usage
|
| 72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
```python
|
| 74 |
import torch
|
| 75 |
from model.config import BUVNConfig
|
| 76 |
from model.model import BUVNModel
|
| 77 |
|
| 78 |
# Load checkpoint
|
| 79 |
-
ckpt = torch.load('buvn_2.0_best.pt', map_location='cuda', weights_only=False)
|
| 80 |
|
| 81 |
# Handle torch.compile prefix
|
| 82 |
state_dict = ckpt['model']
|
|
@@ -86,32 +404,148 @@ for k in list(state_dict.keys()):
|
|
| 86 |
|
| 87 |
# Build model
|
| 88 |
config = BUVNConfig.from_dict(ckpt['model_args'])
|
| 89 |
-
model = BUVNModel(config)
|
| 90 |
model.load_state_dict(state_dict)
|
| 91 |
model.eval()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
```
|
| 93 |
|
| 94 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
## The Beuvian Ecosystem
|
| 105 |
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
## Links
|
| 111 |
|
| 112 |
-
|
| 113 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
-
|
| 116 |
|
| 117 |
-
|
|
|
|
| 8 |
- pytorch
|
| 9 |
- from-scratch
|
| 10 |
- foundation-model
|
| 11 |
+
- language-model
|
| 12 |
+
- gpt
|
| 13 |
+
- llm
|
| 14 |
datasets:
|
| 15 |
- allenai/c4
|
| 16 |
pipeline_tag: text-generation
|
| 17 |
+
library_name: pytorch
|
| 18 |
model-index:
|
| 19 |
+
- name: BUVN-2.0
|
| 20 |
results:
|
| 21 |
- task:
|
| 22 |
type: text-generation
|
|
|
|
| 28 |
- type: perplexity
|
| 29 |
value: 29.19
|
| 30 |
name: Perplexity
|
| 31 |
+
- type: accuracy
|
| 32 |
+
value: 37.88
|
| 33 |
+
name: Top-1 Accuracy
|
| 34 |
+
- type: accuracy
|
| 35 |
+
value: 60.34
|
| 36 |
+
name: Top-5 Accuracy
|
| 37 |
---
|
| 38 |
|
| 39 |
+
<div align="center">
|
| 40 |
|
| 41 |
+
<!-- Header Banner -->
|
| 42 |
+
<img src="https://capsule-render.vercel.app/api?type=waving&color=gradient&customColorList=2,6,11,20,30&height=220§ion=header&text=BUVN-2.0&fontSize=80&fontColor=fff&animation=fadeIn&fontAlignY=30&desc=Foundation%20Language%20Model%20%7C%20109.5M%20Params%20%7C%20Beats%20GPT-2%20Small&descAlignY=55&descSize=18" width="100%"/>
|
| 43 |
|
| 44 |
+
<!-- Typing Animation -->
|
| 45 |
+
<a href="https://github.com/bhuvan0808/beuvian">
|
| 46 |
+
<img src="https://readme-typing-svg.demolab.com?font=Fira+Code&weight=600&size=20&pause=1000&color=58A6FF¢er=true&vCenter=true&multiline=true&repeat=true&width=700&height=100&lines=Perplexity+29.19+%E2%80%94+Beats+GPT-2+Small+(29.41)!;109.5M+Parameters+%7C+Trained+on+2B+Tokens+from+C4;Built+From+Scratch+on+NVIDIA+H100+in+~2+Hours" alt="Typing SVG" />
|
| 47 |
+
</a>
|
| 48 |
+
|
| 49 |
+
<br/>
|
| 50 |
+
|
| 51 |
+
<!-- Badges Row 1 -->
|
| 52 |
+
[](https://github.com/bhuvan0808/beuvian)
|
| 53 |
+
[](https://github.com/bhuvan0808/beuvian)
|
| 54 |
+
[](https://github.com/bhuvan0808/beuvian)
|
| 55 |
+
|
| 56 |
+
<!-- Badges Row 2 -->
|
| 57 |
+
[](https://pytorch.org)
|
| 58 |
+
[](https://www.nvidia.com)
|
| 59 |
+
[](https://github.com/bhuvan0808/beuvian/blob/main/LICENSE)
|
| 60 |
+
[](https://github.com/bhuvan0808/beuvian)
|
| 61 |
+
|
| 62 |
+
<br/>
|
| 63 |
+
|
| 64 |
+
<!-- Status Badges -->
|
| 65 |
+
<img src="https://img.shields.io/badge/Status-Production%20Ready-brightgreen?style=flat-square&labelColor=0d1117&color=16c79a" />
|
| 66 |
+
<img src="https://img.shields.io/badge/Beats-GPT--2%20Small%20%E2%9C%93-blue?style=flat-square&labelColor=0d1117&color=58a6ff" />
|
| 67 |
+
<img src="https://img.shields.io/badge/Beats-Pythia--160M%20%E2%9C%93-blue?style=flat-square&labelColor=0d1117&color=58a6ff" />
|
| 68 |
+
<img src="https://img.shields.io/badge/Beats-GPT--Neo%20125M%20%E2%9C%93-blue?style=flat-square&labelColor=0d1117&color=58a6ff" />
|
| 69 |
+
<img src="https://img.shields.io/badge/Trained%20on-2B%20Tokens-purple?style=flat-square&labelColor=0d1117&color=bc6ff1" />
|
| 70 |
+
|
| 71 |
+
</div>
|
| 72 |
+
|
| 73 |
+
<br/>
|
| 74 |
+
|
| 75 |
+
<!-- Divider -->
|
| 76 |
+
<img src="https://user-images.githubusercontent.com/73097560/115834477-dbab4500-a447-11eb-908a-139a6edaec5c.gif" width="100%">
|
| 77 |
+
|
| 78 |
+
## What is BUVN-2.0?
|
| 79 |
+
|
| 80 |
+
**BUVN-2.0** is a **109.5 million parameter** GPT-style decoder-only transformer language model, **built entirely from scratch** — no pretrained weights, no fine-tuning shortcuts. Trained on **2 billion tokens** from the C4 dataset on a single **NVIDIA H100 NVL GPU** in approximately **2 hours**.
|
| 81 |
+
|
| 82 |
+
It is the foundation model of the **[Beuvian AI Ecosystem](https://github.com/bhuvan0808/beuvian)** — a family of three specialized models:
|
| 83 |
+
|
| 84 |
+
<div align="center">
|
| 85 |
+
|
| 86 |
+
```
|
| 87 |
+
╔═══════════════════════════════════╗
|
| 88 |
+
║ 🧠 BUVN-2.0 (Foundation Model) ║
|
| 89 |
+
║ 109.5M params | PPL 29.19 ║
|
| 90 |
+
╚════════════╦════════════╦════════╝
|
| 91 |
+
║ ║
|
| 92 |
+
╔═══════╩═══╗ ╔════╩════════╗
|
| 93 |
+
║ 💻 SRVN ║ ║ 📈 MNI ║
|
| 94 |
+
║ Code Agent ║ ║ Finance ║
|
| 95 |
+
║ (Planned) ║ ║ (Planned) ║
|
| 96 |
+
╚═══════════╝ ╚═════════════╝
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
</div>
|
| 100 |
+
|
| 101 |
+
> *"Don't just use AI. Understand it. Build it. Own it."*
|
| 102 |
+
|
| 103 |
+
<br/>
|
| 104 |
+
|
| 105 |
+
<!-- Divider -->
|
| 106 |
+
<img src="https://user-images.githubusercontent.com/73097560/115834477-dbab4500-a447-11eb-908a-139a6edaec5c.gif" width="100%">
|
| 107 |
+
|
| 108 |
+
## Model Performance
|
| 109 |
+
|
| 110 |
+
<div align="center">
|
| 111 |
+
|
| 112 |
+
### 🏆 WikiText-103 Perplexity Leaderboard
|
| 113 |
+
|
| 114 |
+
</div>
|
| 115 |
+
|
| 116 |
+
| Rank | Model | Organization | Parameters | PPL (↓) | Training Tokens |
|
| 117 |
+
|:----:|-------|:------------:|:----------:|:-------:|:---------------:|
|
| 118 |
+
| 1 | LLaMA-2 7B | Meta | 7B | 5.47 | 2T |
|
| 119 |
+
| 2 | LLaMA 7B | Meta | 7B | 7.73 | 1T |
|
| 120 |
+
| 3 | Pythia-1B | EleutherAI | 1B | 16.71 | 300B |
|
| 121 |
+
| 4 | GPT-2 Large | OpenAI | 774M | 19.93 | ~40B |
|
| 122 |
+
| 5 | GPT-2 Medium | OpenAI | 355M | 22.76 | ~40B |
|
| 123 |
+
| 6 | OPT-125M | Meta | 125M | 27.65 | 300B |
|
| 124 |
+
| 7 | RWKV-169M | RWKV | 169M | 29.01 | 300B |
|
| 125 |
+
| **8** | **🟢 BUVN-2.0 (this model)** | **Bhuvan** | **109.5M** | **29.19** | **2B** |
|
| 126 |
+
| 9 | Pythia-160M | EleutherAI | 160M | 29.33 | 300B |
|
| 127 |
+
| 10 | GPT-2 Small | OpenAI | 124M | 29.41 | ~40B |
|
| 128 |
+
| 11 | GPT-Neo 125M | EleutherAI | 125M | 32.43 | 300B |
|
| 129 |
+
|
| 130 |
+
<div align="center">
|
| 131 |
+
|
| 132 |
+
> **BUVN-2.0 beats GPT-2 Small** with **~12% fewer parameters** and **~20x less training data**.
|
| 133 |
+
> The architecture is competitive — the gap to higher ranks is purely about scale.
|
| 134 |
+
|
| 135 |
+
</div>
|
| 136 |
+
|
| 137 |
+
<br/>
|
| 138 |
+
|
| 139 |
+
### 📊 Full Benchmark Results
|
| 140 |
+
|
| 141 |
+
<table>
|
| 142 |
+
<tr>
|
| 143 |
+
<td width="50%">
|
| 144 |
+
|
| 145 |
+
#### Quality Metrics
|
| 146 |
|
| 147 |
| Metric | Value |
|
| 148 |
+
|--------|:-----:|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
| **Val Perplexity** | **29.19** |
|
| 150 |
+
| Train Perplexity | 28.33 |
|
| 151 |
+
| Bits Per Character | 4.87 |
|
| 152 |
+
| Top-1 Accuracy | 37.88% |
|
| 153 |
+
| Top-5 Accuracy | 60.34% |
|
| 154 |
+
| Overfit Gap | 0.03 (healthy) |
|
| 155 |
+
| vs Random (32K) | 99.9% better |
|
| 156 |
+
|
| 157 |
+
</td>
|
| 158 |
+
<td width="50%">
|
| 159 |
+
|
| 160 |
+
#### Speed Metrics
|
| 161 |
+
|
| 162 |
+
| Metric | Value |
|
| 163 |
+
|--------|:-----:|
|
| 164 |
+
| Training Throughput | 320,000 tok/s |
|
| 165 |
+
| Forward Throughput | 126,976 tok/s |
|
| 166 |
+
| Generation Speed | 204 tok/s |
|
| 167 |
+
| Generation Latency | 4.9 ms/token |
|
| 168 |
+
| MFU (Training) | 24% |
|
| 169 |
+
| Peak VRAM | 8.14 GB |
|
| 170 |
+
| Training Time | ~2 hours |
|
| 171 |
+
|
| 172 |
+
</td>
|
| 173 |
+
</tr>
|
| 174 |
+
</table>
|
| 175 |
+
|
| 176 |
+
<br/>
|
| 177 |
+
|
| 178 |
+
### 📈 Training Progress
|
| 179 |
+
|
| 180 |
+
```
|
| 181 |
+
Perplexity over Training Steps:
|
| 182 |
+
|
| 183 |
+
37,600 ┤●
|
| 184 |
+
│ ╲
|
| 185 |
+
10,000 ┤ ╲
|
| 186 |
+
│ ╲
|
| 187 |
+
142 ┤ ●
|
| 188 |
+
│ ╲
|
| 189 |
+
78 ┤ ●
|
| 190 |
+
│ ╲──╲
|
| 191 |
+
55 ┤ ●───╲
|
| 192 |
+
│ ╲───╲
|
| 193 |
+
42 ┤ ●───╲
|
| 194 |
+
│ ╲───╲
|
| 195 |
+
36 ┤ ●───╲
|
| 196 |
+
│ ╲───●── 29.19 ✅
|
| 197 |
+
29 ┤ Beats GPT-2!
|
| 198 |
+
└──────────────────────────────────────────────────
|
| 199 |
+
0 250 1K 2K 4K 6K 8K 10K 15K
|
| 200 |
+
Training Steps →
|
| 201 |
+
```
|
| 202 |
+
|
| 203 |
+
<br/>
|
| 204 |
+
|
| 205 |
+
<!-- Divider -->
|
| 206 |
+
<img src="https://user-images.githubusercontent.com/73097560/115834477-dbab4500-a447-11eb-908a-139a6edaec5c.gif" width="100%">
|
| 207 |
+
|
| 208 |
+
## Architecture
|
| 209 |
|
| 210 |
+
<div align="center">
|
| 211 |
|
| 212 |
+
```mermaid
|
| 213 |
+
graph TB
|
| 214 |
+
INPUT["📝 Input Tokens"] --> EMB["Token Embedding<br/>(weight-tied with output)"]
|
| 215 |
+
EMB --> DROP["Dropout"]
|
| 216 |
+
DROP --> TB1["🔲 Transformer Block 1"]
|
| 217 |
+
TB1 --> TB2["🔲 Transformer Block 2"]
|
| 218 |
+
TB2 --> DOTS["⋮ (12 blocks total)"]
|
| 219 |
+
DOTS --> TBN["🔲 Transformer Block 12"]
|
| 220 |
+
TBN --> NORM["RMSNorm (final)"]
|
| 221 |
+
NORM --> OUT["📤 Output Projection → 32K Logits"]
|
| 222 |
|
| 223 |
+
subgraph TB["Each Transformer Block"]
|
| 224 |
+
direction TB
|
| 225 |
+
A1["RMSNorm"] --> A2["Multi-Head Attention<br/>12 heads × 64 dims + RoPE"]
|
| 226 |
+
A2 --> A3["+ Residual"]
|
| 227 |
+
A3 --> A4["RMSNorm"]
|
| 228 |
+
A4 --> A5["SwiGLU FFN<br/>768 → 2048 → 768"]
|
| 229 |
+
A5 --> A6["+ Residual"]
|
| 230 |
+
end
|
| 231 |
|
| 232 |
+
style INPUT fill:#0d1117,stroke:#58a6ff,color:#fff
|
| 233 |
+
style OUT fill:#0d1117,stroke:#16c79a,color:#fff
|
| 234 |
+
style TB1 fill:#161b22,stroke:#58a6ff,color:#fff
|
| 235 |
+
style TB2 fill:#161b22,stroke:#58a6ff,color:#fff
|
| 236 |
+
style TBN fill:#161b22,stroke:#58a6ff,color:#fff
|
| 237 |
+
style EMB fill:#161b22,stroke:#bc6ff1,color:#fff
|
| 238 |
+
style NORM fill:#161b22,stroke:#f39c12,color:#fff
|
| 239 |
+
```
|
| 240 |
+
|
| 241 |
+
</div>
|
| 242 |
+
|
| 243 |
+
### Model Configuration
|
| 244 |
+
|
| 245 |
+
| Parameter | Value | Description |
|
| 246 |
+
|-----------|:-----:|-------------|
|
| 247 |
+
| `d_model` | 768 | Embedding dimension |
|
| 248 |
+
| `n_layers` | 12 | Transformer blocks |
|
| 249 |
+
| `n_heads` | 12 | Attention heads |
|
| 250 |
+
| `head_dim` | 64 | Per-head dimension |
|
| 251 |
+
| `vocab_size` | 32,000 | BPE vocabulary |
|
| 252 |
+
| `max_seq_len` | 1,024 | Context window |
|
| 253 |
+
| `ffn_hidden` | 2,048 | SwiGLU hidden dim |
|
| 254 |
+
| `dropout` | 0.0 | No dropout (pre-training) |
|
| 255 |
+
| `bias` | False | No bias terms (LLaMA-style) |
|
| 256 |
+
| **Total Params** | **109.53M** | |
|
| 257 |
+
| Non-Embedding | 84.95M | Excluding shared embeddings |
|
| 258 |
+
|
| 259 |
+
### Architecture Highlights
|
| 260 |
+
|
| 261 |
+
<table>
|
| 262 |
+
<tr>
|
| 263 |
+
<td width="50%">
|
| 264 |
+
|
| 265 |
+
| Component | Choice |
|
| 266 |
+
|-----------|--------|
|
| 267 |
+
| **Position Encoding** | RoPE (Rotary) |
|
| 268 |
+
| **Normalization** | RMSNorm (pre-norm) |
|
| 269 |
+
| **Feedforward** | SwiGLU |
|
| 270 |
+
| **Attention** | Flash (SDPA) |
|
| 271 |
+
| **Weight Tying** | Yes (emb = output) |
|
| 272 |
+
| **Initialization** | Depth-scaled residual |
|
| 273 |
+
|
| 274 |
+
</td>
|
| 275 |
+
<td width="50%">
|
| 276 |
+
|
| 277 |
+
| Design Choice | Why |
|
| 278 |
+
|--------------|-----|
|
| 279 |
+
| RoPE over absolute | Better generalization, relative positions |
|
| 280 |
+
| RMSNorm over LayerNorm | 10-15% faster, same quality |
|
| 281 |
+
| SwiGLU over ReLU | 2-3% better PPL via gating |
|
| 282 |
+
| No bias | Standard in LLaMA, PaLM |
|
| 283 |
+
| Weight tying | Saves 24.6M parameters |
|
| 284 |
+
| Pre-norm | More stable training |
|
| 285 |
+
|
| 286 |
+
</td>
|
| 287 |
+
</tr>
|
| 288 |
+
</table>
|
| 289 |
+
|
| 290 |
+
### Parameter Breakdown
|
| 291 |
+
|
| 292 |
+
```
|
| 293 |
+
╔══════════════════════════════════════════════════╗
|
| 294 |
+
║ BUVN-2.0 Parameter Distribution ║
|
| 295 |
+
╠══════════════════════════════════════════════════╣
|
| 296 |
+
║ ║
|
| 297 |
+
║ Token Embedding ████████░░░░ 24.6M (22%) ║
|
| 298 |
+
║ (weight-tied) ║
|
| 299 |
+
║ ║
|
| 300 |
+
║ 12× Attention ██████████░░ 28.3M (26%) ║
|
| 301 |
+
║ (Wq, Wk, Wv, Wo) ║
|
| 302 |
+
║ ║
|
| 303 |
+
║ 12× SwiGLU FFN ████████████ 56.6M (52%) ║
|
| 304 |
+
║ (W1, W2, W3) ← Most "knowledge" here ║
|
| 305 |
+
║ ║
|
| 306 |
+
║ Norms + Other ░░░░░░░░░░░░ 18K (<1%) ║
|
| 307 |
+
║ ║
|
| 308 |
+
║ TOTAL ████████████ 109.5M (100%) ║
|
| 309 |
+
╚══════════════════════════════════════════════════╝
|
| 310 |
+
```
|
| 311 |
|
| 312 |
+
<br/>
|
| 313 |
+
|
| 314 |
+
<!-- Divider -->
|
| 315 |
+
<img src="https://user-images.githubusercontent.com/73097560/115834477-dbab4500-a447-11eb-908a-139a6edaec5c.gif" width="100%">
|
| 316 |
+
|
| 317 |
+
## Training Details
|
| 318 |
+
|
| 319 |
+
### Data Pipeline
|
| 320 |
+
|
| 321 |
+
```
|
| 322 |
+
C4 Dataset (HuggingFace)
|
| 323 |
+
│ 8 parallel stream workers (no download, 1.48M tok/s)
|
| 324 |
+
↓
|
| 325 |
+
BPE Tokenizer (32K vocab, trained on 100K samples in 14s)
|
| 326 |
+
│ tokenize in memory
|
| 327 |
+
↓
|
| 328 |
+
Binary files: train.bin (3.8 GB) + val.bin (20 MB)
|
| 329 |
+
│ 2.0 billion tokens total
|
| 330 |
+
↓
|
| 331 |
+
Memory-mapped DataLoader → GPU (zero-copy I/O)
|
| 332 |
+
```
|
| 333 |
+
|
| 334 |
+
### Training Configuration
|
| 335 |
+
|
| 336 |
+
| Setting | Value |
|
| 337 |
+
|---------|:-----:|
|
| 338 |
+
| **Optimizer** | AdamW |
|
| 339 |
+
| **Peak LR** | 6×10⁻⁴ |
|
| 340 |
+
| **Min LR** | 6×10⁻⁵ |
|
| 341 |
+
| **Schedule** | Cosine decay with 500-step warmup |
|
| 342 |
+
| **Batch Size** | 64 × 2 gradient accumulation = 128 |
|
| 343 |
+
| **Tokens/Iteration** | 131,072 |
|
| 344 |
+
| **Total Steps** | 15,000 |
|
| 345 |
+
| **Total Tokens** | ~2 billion |
|
| 346 |
+
| **Precision** | bfloat16 |
|
| 347 |
+
| **Compiler** | torch.compile (1.5x speedup) |
|
| 348 |
+
| **Weight Decay** | 0.1 |
|
| 349 |
+
| **Grad Clip** | 1.0 |
|
| 350 |
+
| **Beta1 / Beta2** | 0.9 / 0.95 |
|
| 351 |
+
|
| 352 |
+
### Hardware
|
| 353 |
+
|
| 354 |
+
| Component | Spec |
|
| 355 |
+
|-----------|------|
|
| 356 |
+
| **GPU** | NVIDIA H100 NVL (96 GB VRAM) |
|
| 357 |
+
| **CPU** | AMD EPYC 9V84 96-Core (40 vCPUs) |
|
| 358 |
+
| **RAM** | 314 GB |
|
| 359 |
+
| **PyTorch** | 2.9.1 + CUDA 12.8 |
|
| 360 |
+
|
| 361 |
+
<br/>
|
| 362 |
+
|
| 363 |
+
<!-- Divider -->
|
| 364 |
+
<img src="https://user-images.githubusercontent.com/73097560/115834477-dbab4500-a447-11eb-908a-139a6edaec5c.gif" width="100%">
|
| 365 |
|
| 366 |
## Usage
|
| 367 |
|
| 368 |
+
### Download and Run
|
| 369 |
+
|
| 370 |
+
```python
|
| 371 |
+
# 1. Clone the repo
|
| 372 |
+
# git clone https://github.com/bhuvan0808/beuvian.git
|
| 373 |
+
# cd beuvian/BUVN-1.1
|
| 374 |
+
# pip install -r requirements.txt
|
| 375 |
+
|
| 376 |
+
# 2. Download weights from this HuggingFace repo
|
| 377 |
+
python scripts/load_from_hub.py
|
| 378 |
+
|
| 379 |
+
# 3. Generate text
|
| 380 |
+
python inference/generate.py \
|
| 381 |
+
--prompt "The future of artificial intelligence" \
|
| 382 |
+
--checkpoint checkpoints/buvn_2.0_best.pt \
|
| 383 |
+
--tokenizer tokenizer/tokenizer_32k.json \
|
| 384 |
+
--max_new_tokens 150 \
|
| 385 |
+
--temperature 0.7 \
|
| 386 |
+
--top_k 50
|
| 387 |
+
```
|
| 388 |
+
|
| 389 |
+
### Load in Python
|
| 390 |
+
|
| 391 |
```python
|
| 392 |
import torch
|
| 393 |
from model.config import BUVNConfig
|
| 394 |
from model.model import BUVNModel
|
| 395 |
|
| 396 |
# Load checkpoint
|
| 397 |
+
ckpt = torch.load('buvn_2.0_best.pt', map_location='cuda', weights_only=False)
|
| 398 |
|
| 399 |
# Handle torch.compile prefix
|
| 400 |
state_dict = ckpt['model']
|
|
|
|
| 404 |
|
| 405 |
# Build model
|
| 406 |
config = BUVNConfig.from_dict(ckpt['model_args'])
|
| 407 |
+
model = BUVNModel(config).cuda()
|
| 408 |
model.load_state_dict(state_dict)
|
| 409 |
model.eval()
|
| 410 |
+
|
| 411 |
+
# Generate
|
| 412 |
+
from inference.sample import generate
|
| 413 |
+
text, usage = generate(model, tokenizer, "Your prompt here",
|
| 414 |
+
max_new_tokens=100, temperature=0.7, top_k=50, device='cuda')
|
| 415 |
+
print(text)
|
| 416 |
```
|
| 417 |
|
| 418 |
+
### API Server
|
| 419 |
+
|
| 420 |
+
```bash
|
| 421 |
+
python api/app.py \
|
| 422 |
+
--checkpoint checkpoints/buvn_2.0_best.pt \
|
| 423 |
+
--tokenizer tokenizer/tokenizer_32k.json \
|
| 424 |
+
--port 8000
|
| 425 |
+
|
| 426 |
+
# Test with curl:
|
| 427 |
+
curl -X POST http://localhost:8000/generate \
|
| 428 |
+
-H "Content-Type: application/json" \
|
| 429 |
+
-d '{"prompt": "The history of science", "max_tokens": 100, "temperature": 0.7}'
|
| 430 |
+
```
|
| 431 |
+
|
| 432 |
+
### Sampling Parameters
|
| 433 |
+
|
| 434 |
+
| Parameter | Range | Default | Effect |
|
| 435 |
+
|-----------|:-----:|:-------:|--------|
|
| 436 |
+
| `temperature` | 0.0 – 2.0 | 0.7 | 0 = deterministic, higher = more creative |
|
| 437 |
+
| `top_k` | 0 – 32000 | 50 | Only sample from top K most likely tokens |
|
| 438 |
+
| `top_p` | 0.0 – 1.0 | — | Nucleus sampling (adaptive token filtering) |
|
| 439 |
+
| `max_tokens` | 1 – 1024 | 100 | Maximum generation length |
|
| 440 |
+
|
| 441 |
+
<br/>
|
| 442 |
+
|
| 443 |
+
<!-- Divider -->
|
| 444 |
+
<img src="https://user-images.githubusercontent.com/73097560/115834477-dbab4500-a447-11eb-908a-139a6edaec5c.gif" width="100%">
|
| 445 |
|
| 446 |
+
## Sample Outputs
|
| 447 |
+
|
| 448 |
+
### Prompt: "The history of artificial intelligence began"
|
| 449 |
+
|
| 450 |
+
> The number of people living with heart disease in the United States is projected to increase by nearly 20 million every year, according to the Centers for Disease Control and Prevention. The Centers for Disease Control and Prevention (CDC) created the National Heart Disease Prevention and Control Program in 2007, the American Heart Association (AHA) released its findings on March 25, 2018...
|
| 451 |
+
|
| 452 |
+
### Prompt: "The president of the United States announced"
|
| 453 |
+
|
| 454 |
+
> Here at The Ritz and Suites, we are proud to offer a variety of unique and unique packages. Our experienced staff is here to help you find the perfect vacation, getaway or special event. Treat yourself to a luxurious vacation in the comfort of your own home!
|
| 455 |
+
|
| 456 |
+
### Prompt: "In a groundbreaking study published today"
|
| 457 |
+
|
| 458 |
+
> If you are having a dental emergency, you may be wondering how to get the most out of your dental treatment, right? Well, that's where the dental implant comes in. The dental implant is the most extensive prosthetic bone in the world...
|
| 459 |
+
|
| 460 |
+
> **Note:** The model generates fluent, grammatically correct web-text. It does **not** follow prompt topics because it has not been instruction-tuned yet. This is expected behavior for a foundation model. Instruction tuning (SFT) is the planned next step.
|
| 461 |
+
|
| 462 |
+
<br/>
|
| 463 |
+
|
| 464 |
+
<!-- Divider -->
|
| 465 |
+
<img src="https://user-images.githubusercontent.com/73097560/115834477-dbab4500-a447-11eb-908a-139a6edaec5c.gif" width="100%">
|
| 466 |
|
| 467 |
## The Beuvian Ecosystem
|
| 468 |
|
| 469 |
+
<div align="center">
|
| 470 |
+
|
| 471 |
+
```mermaid
|
| 472 |
+
graph LR
|
| 473 |
+
A["📚 Raw Text<br/>C4 (2B tokens)"] -->|Pre-training| B["🧠 BUVN-2.0<br/>Foundation"]
|
| 474 |
+
B -->|Fine-tune on Code| C["💻 SRVN<br/>Code Agent"]
|
| 475 |
+
B -->|Train on Markets| D["📈 MNI<br/>Finance"]
|
| 476 |
+
|
| 477 |
+
style A fill:#1a1a2e,stroke:#16c79a,color:#fff
|
| 478 |
+
style B fill:#0d1117,stroke:#58a6ff,color:#fff,stroke-width:3px
|
| 479 |
+
style C fill:#0d1117,stroke:#f39c12,color:#fff,stroke-width:2px
|
| 480 |
+
style D fill:#0d1117,stroke:#bc6ff1,color:#fff,stroke-width:2px
|
| 481 |
+
```
|
| 482 |
+
|
| 483 |
+
</div>
|
| 484 |
+
|
| 485 |
+
| Model | Role | Status | Description |
|
| 486 |
+
|:-----:|------|:------:|-------------|
|
| 487 |
+
| 🧠 **BUVN** | Foundation | ✅ **Released** | General language model — the base for everything |
|
| 488 |
+
| 💻 **SRVN** | Code Agent | 🔜 Planned | Fine-tuned on code (The Stack v2), agentic workflows |
|
| 489 |
+
| 📈 **MNI** | Finance | 🔜 Planned | Trained on market data, SEC filings, sentiment analysis |
|
| 490 |
+
|
| 491 |
+
<br/>
|
| 492 |
+
|
| 493 |
+
## Roadmap
|
| 494 |
+
|
| 495 |
+
- [x] ✅ BUVN-1.1 — 13.7M params, WikiText-103, PPL 35.87
|
| 496 |
+
- [x] ✅ **BUVN-2.0 — 109.5M params, C4 2B tokens, PPL 29.19 (beats GPT-2 Small!)**
|
| 497 |
+
- [ ] 🔜 Instruction Tuning (SFT) on OpenAssistant + Alpaca
|
| 498 |
+
- [ ] 🔜 SRVN — Code agent fine-tuning
|
| 499 |
+
- [ ] 🔜 MNI — Finance model training
|
| 500 |
+
- [ ] 📋 RLHF / DPO alignment
|
| 501 |
+
- [ ] 📋 Chat UI deployment
|
| 502 |
+
- [ ] 📋 HuggingFace Spaces demo
|
| 503 |
+
|
| 504 |
+
<br/>
|
| 505 |
+
|
| 506 |
+
<!-- Divider -->
|
| 507 |
+
<img src="https://user-images.githubusercontent.com/73097560/115834477-dbab4500-a447-11eb-908a-139a6edaec5c.gif" width="100%">
|
| 508 |
+
|
| 509 |
+
## Files in This Repository
|
| 510 |
+
|
| 511 |
+
| File | Size | Description |
|
| 512 |
+
|------|:----:|-------------|
|
| 513 |
+
| `buvn_2.0_best.pt` | 1.31 GB | Model checkpoint (109.5M params, trained 15K steps) |
|
| 514 |
+
| `tokenizer_32k.json` | 2.2 MB | 32K BPE tokenizer (Byte-Level, trained on C4) |
|
| 515 |
+
| `config.json` | ~200 B | Model hyperparameters |
|
| 516 |
+
| `README.md` | — | This model card |
|
| 517 |
+
|
| 518 |
+
## Citation
|
| 519 |
+
|
| 520 |
+
```bibtex
|
| 521 |
+
@misc{buvn2026,
|
| 522 |
+
title={BUVN-2.0: A Foundation Language Model Built From Scratch},
|
| 523 |
+
author={Bhuvan},
|
| 524 |
+
year={2026},
|
| 525 |
+
url={https://huggingface.co/bhuvan0808/buvn-2.0},
|
| 526 |
+
note={109.5M parameter decoder-only transformer, PPL 29.19 on WikiText-103}
|
| 527 |
+
}
|
| 528 |
+
```
|
| 529 |
|
| 530 |
## Links
|
| 531 |
|
| 532 |
+
| Resource | URL |
|
| 533 |
+
|----------|-----|
|
| 534 |
+
| 🐙 **GitHub** | [bhuvan0808/beuvian](https://github.com/bhuvan0808/beuvian) |
|
| 535 |
+
| 📘 **Documentation** | [docs/](https://github.com/bhuvan0808/beuvian/tree/main/BUVN-1.1/docs) |
|
| 536 |
+
| 🤗 **HuggingFace** | [bhuvan0808/buvn-2.0](https://huggingface.co/bhuvan0808/buvn-2.0) |
|
| 537 |
+
|
| 538 |
+
<br/>
|
| 539 |
+
|
| 540 |
+
<div align="center">
|
| 541 |
+
|
| 542 |
+
<!-- Footer -->
|
| 543 |
+
<img src="https://capsule-render.vercel.app/api?type=waving&color=gradient&customColorList=2,6,11,20,30&height=100§ion=footer&animation=twinkling" width="100%"/>
|
| 544 |
+
|
| 545 |
+
**Built with ❤️ by Bhuvan**
|
| 546 |
+
|
| 547 |
+
*BUVN-2.0 — Part of the [Beuvian AI Ecosystem](https://github.com/bhuvan0808/beuvian)*
|
| 548 |
|
| 549 |
+
<img src="https://readme-typing-svg.demolab.com?font=Fira+Code&size=14&pause=2000&color=58A6FF¢er=true&vCenter=true&width=500&lines=From+Scratch+→+Trained+→+Deployed+→+Beats+GPT-2+🔁" alt="footer" />
|
| 550 |
|
| 551 |
+
</div>
|