Upload 9 files
Browse files- README.md +109 -19
- config.json +1 -1
- model.safetensors +2 -2
- special_tokens_map.json +30 -0
- tokenizer.json +2 -2
- tokenizer_config.json +38 -0
- training_args.bin +1 -1
README.md
CHANGED
|
@@ -24,7 +24,7 @@ model-index:
|
|
| 24 |
type: arc_challenge
|
| 25 |
metrics:
|
| 26 |
- type: acc_norm
|
| 27 |
-
value:
|
| 28 |
name: normalized accuracy
|
| 29 |
- task:
|
| 30 |
type: text-generation
|
|
@@ -34,7 +34,7 @@ model-index:
|
|
| 34 |
type: arc_easy
|
| 35 |
metrics:
|
| 36 |
- type: acc
|
| 37 |
-
value:
|
| 38 |
name: accuracy
|
| 39 |
- task:
|
| 40 |
type: text-generation
|
|
@@ -44,7 +44,7 @@ model-index:
|
|
| 44 |
type: hellaswag
|
| 45 |
metrics:
|
| 46 |
- type: acc_norm
|
| 47 |
-
value:
|
| 48 |
name: normalized accuracy
|
| 49 |
- task:
|
| 50 |
type: text-generation
|
|
@@ -54,7 +54,7 @@ model-index:
|
|
| 54 |
type: piqa
|
| 55 |
metrics:
|
| 56 |
- type: acc
|
| 57 |
-
value:
|
| 58 |
name: accuracy
|
| 59 |
- task:
|
| 60 |
type: text-generation
|
|
@@ -64,7 +64,7 @@ model-index:
|
|
| 64 |
type: winogrande
|
| 65 |
metrics:
|
| 66 |
- type: acc
|
| 67 |
-
value:
|
| 68 |
name: accuracy
|
| 69 |
---
|
| 70 |
|
|
@@ -264,6 +264,82 @@ state_token = Linear(state_hidden_size=512 → hidden_size=2048)
|
|
| 264 |
|
| 265 |
---
|
| 266 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
## ⚡ Performance Characteristics
|
| 268 |
|
| 269 |
### Computational Complexity
|
|
@@ -310,29 +386,43 @@ NanoHammer has been evaluated on standard language understanding benchmarks usin
|
|
| 310 |
|
| 311 |
| Task | Version | Metric | Value | Stderr |
|
| 312 |
|------|---------|--------|-------|--------|
|
| 313 |
-
| **ARC-Challenge** | 1 | acc |
|
| 314 |
-
| | | acc_norm | **
|
| 315 |
-
| **ARC-Easy** | 1 | acc | **
|
| 316 |
-
| | | acc_norm |
|
| 317 |
-
| **HellaSwag** | 1 | acc |
|
| 318 |
-
| | | acc_norm | **
|
| 319 |
-
| **PIQA** | 1 | acc | **
|
| 320 |
-
| | | acc_norm |
|
| 321 |
-
| **WinoGrande** | 1 | acc | **
|
| 322 |
|
| 323 |
### Performance Summary
|
| 324 |
|
| 325 |
```
|
| 326 |
-
Average Accuracy (normalized):
|
| 327 |
-
- Strong performance on physical reasoning (PIQA:
|
| 328 |
-
- Competitive commonsense reasoning (HellaSwag:
|
| 329 |
-
-
|
| 330 |
```
|
| 331 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
**Observations:**
|
| 333 |
- Performance is comparable to other 1-2B parameter models
|
| 334 |
- The causal state mechanism does not degrade standard benchmark performance
|
| 335 |
-
- Strong physical reasoning (PIQA:
|
| 336 |
- Note: These benchmarks don't specifically test long-range causal reasoning where the architecture may have advantages
|
| 337 |
|
| 338 |
### Evaluation Details
|
|
|
|
| 24 |
type: arc_challenge
|
| 25 |
metrics:
|
| 26 |
- type: acc_norm
|
| 27 |
+
value: 35.67
|
| 28 |
name: normalized accuracy
|
| 29 |
- task:
|
| 30 |
type: text-generation
|
|
|
|
| 34 |
type: arc_easy
|
| 35 |
metrics:
|
| 36 |
- type: acc
|
| 37 |
+
value: 65.66
|
| 38 |
name: accuracy
|
| 39 |
- task:
|
| 40 |
type: text-generation
|
|
|
|
| 44 |
type: hellaswag
|
| 45 |
metrics:
|
| 46 |
- type: acc_norm
|
| 47 |
+
value: 57.24
|
| 48 |
name: normalized accuracy
|
| 49 |
- task:
|
| 50 |
type: text-generation
|
|
|
|
| 54 |
type: piqa
|
| 55 |
metrics:
|
| 56 |
- type: acc
|
| 57 |
+
value: 72.80
|
| 58 |
name: accuracy
|
| 59 |
- task:
|
| 60 |
type: text-generation
|
|
|
|
| 64 |
type: winogrande
|
| 65 |
metrics:
|
| 66 |
- type: acc
|
| 67 |
+
value: 59.91
|
| 68 |
name: accuracy
|
| 69 |
---
|
| 70 |
|
|
|
|
| 264 |
|
| 265 |
---
|
| 266 |
|
| 267 |
+
## 🧠 O(1) Incremental Inference: The Core Logic
|
| 268 |
+
|
| 269 |
+
This is the heart of how NanoHammer achieves O(1) state recurrence. In traditional Transformers, generating the $t$-th token typically requires looking back at all $t-1$ previous tokens via the KV Cache. In NanoHammer, we compress "history" into a fixed-dimensional state vector $S$.
|
| 270 |
+
|
| 271 |
+
The essence of `_forward_incremental` is that it's not "reviewing" history—it's **updating the current state snapshot**.
|
| 272 |
+
|
| 273 |
+
### Algorithm: NanoHammer Incremental Inference (O(1) State Recurrence)
|
| 274 |
+
|
| 275 |
+
**Inputs:**
|
| 276 |
+
- $x_t$: Current token's hidden state
|
| 277 |
+
- $S_t$: Cumulative integral state entering this layer
|
| 278 |
+
- $S_{prev\_out}$: Previous timestep's output state from this layer (this is key—represents the fully evolved history at $t-1$)
|
| 279 |
+
- $Cache_{KV}$: Historical Key-Value cache
|
| 280 |
+
|
| 281 |
+
**Outputs:**
|
| 282 |
+
- $y_t$: Current layer's output hidden state
|
| 283 |
+
- $S_{updated}$: Updated state (passed to next timestep as $S_{prev\_out}$)
|
| 284 |
+
|
| 285 |
+
```python
|
| 286 |
+
def forward_incremental(x_t, S_t, S_prev_out, Cache_KV):
|
| 287 |
+
"""
|
| 288 |
+
NanoHammer's O(1) State Recurrence Step
|
| 289 |
+
Complexity: Regardless of sequence length, state S has fixed dimensions,
|
| 290 |
+
so computation remains constant.
|
| 291 |
+
"""
|
| 292 |
+
|
| 293 |
+
# 1. State Evolution (The Euler Step)
|
| 294 |
+
# Physics: Evolve the system state forward one step based on current input S_t
|
| 295 |
+
# S_{updated} = S_t + alpha * f(S_t)
|
| 296 |
+
S_updated = StateUpdateCell(S_t)
|
| 297 |
+
|
| 298 |
+
# 2. Holographic Inverse Rotation
|
| 299 |
+
# Physics: Project previous "absolute state" S_prev_out into current timestep t's
|
| 300 |
+
# "relative coordinate system"
|
| 301 |
+
# This step decompresses position information encoded in S
|
| 302 |
+
# R^{-1}(S, t) = S * e^{-i * theta * t}
|
| 303 |
+
S_relative = InverseHolographicRoPE(S_prev_out, position_id=t)
|
| 304 |
+
|
| 305 |
+
# 3. State Materialization
|
| 306 |
+
# Project abstract physics state vector into Transformer-readable token space
|
| 307 |
+
Token_State = Project(S_relative)
|
| 308 |
+
|
| 309 |
+
# 4. Dual-Token Query Construction
|
| 310 |
+
# We don't just query x_t; we query [Global State, Current Input]
|
| 311 |
+
# Query = [Token_State, x_t]
|
| 312 |
+
Q_pair = Concat([Token_State, x_t])
|
| 313 |
+
|
| 314 |
+
# 5. Hybrid Attention
|
| 315 |
+
# Token_State handles "recalling" global history (Long-term Memory)
|
| 316 |
+
# x_t handles "attending to" local details (Local Context)
|
| 317 |
+
# Note: While attention still occurs, deeper layers gradually ignore Cache_KV,
|
| 318 |
+
# relying primarily on Token_State
|
| 319 |
+
y_pair = LlamaAttention(
|
| 320 |
+
query=Q_pair,
|
| 321 |
+
key_value=Cache_KV + Current_KV
|
| 322 |
+
)
|
| 323 |
+
|
| 324 |
+
# 6. Extract Output
|
| 325 |
+
# We only need the output corresponding to x_t; Token_State's output is discarded
|
| 326 |
+
# (it only serves as guidance)
|
| 327 |
+
y_t = y_pair[1]
|
| 328 |
+
|
| 329 |
+
return y_t, S_updated
|
| 330 |
+
```
|
| 331 |
+
|
| 332 |
+
### Key Insight
|
| 333 |
+
|
| 334 |
+
The state update (`StateUpdateCell`) is **O(1)** regardless of sequence length because:
|
| 335 |
+
1. State dimension is fixed at 512
|
| 336 |
+
2. The Euler step operates only on the current state, not on historical tokens
|
| 337 |
+
3. Position information is encoded holographically, not through explicit sequence traversal
|
| 338 |
+
|
| 339 |
+
This contrasts with standard KV-cache attention where attending to history costs O(T).
|
| 340 |
+
|
| 341 |
+
---
|
| 342 |
+
|
| 343 |
## ⚡ Performance Characteristics
|
| 344 |
|
| 345 |
### Computational Complexity
|
|
|
|
| 386 |
|
| 387 |
| Task | Version | Metric | Value | Stderr |
|
| 388 |
|------|---------|--------|-------|--------|
|
| 389 |
+
| **ARC-Challenge** | 1 | acc | 32.42% | ±1.37% |
|
| 390 |
+
| | | acc_norm | **35.67%** | ±1.40% |
|
| 391 |
+
| **ARC-Easy** | 1 | acc | **65.66%** | ±0.97% |
|
| 392 |
+
| | | acc_norm | 62.67% | ±0.99% |
|
| 393 |
+
| **HellaSwag** | 1 | acc | 43.54% | ±0.49% |
|
| 394 |
+
| | | acc_norm | **57.24%** | ±0.49% |
|
| 395 |
+
| **PIQA** | 1 | acc | **72.80%** | ±1.04% |
|
| 396 |
+
| | | acc_norm | 72.47% | ±1.04% |
|
| 397 |
+
| **WinoGrande** | 1 | acc | **59.91%** | ±1.38% |
|
| 398 |
|
| 399 |
### Performance Summary
|
| 400 |
|
| 401 |
```
|
| 402 |
+
Average Accuracy (normalized): 57.59%
|
| 403 |
+
- Strong performance on physical reasoning (PIQA: 72.80%)
|
| 404 |
+
- Competitive commonsense reasoning (HellaSwag: 57.24%, WinoGrande: 59.91%)
|
| 405 |
+
- Solid performance on knowledge tasks (ARC-Easy: 65.66%, ARC-Challenge: 35.67%)
|
| 406 |
```
|
| 407 |
|
| 408 |
+
### Comparison with Similar-Scale Models (OpenLLM Leaderboard)
|
| 409 |
+
|
| 410 |
+
| Metric | NanoHammer (1.5B, 16K Data) | Llama 3.2 1B (Instruct) | Qwen 2.5 1.5B (Instruct) | TinyLlama 1.1B (3T Tokens) |
|
| 411 |
+
|--------|----------------------------|-------------------------|--------------------------|---------------------------|
|
| 412 |
+
| **WinoGrande** | **59.91%** 🏆 | 59.70% | ~60.2% | 59.1% |
|
| 413 |
+
| **PIQA** | 72.80% ⚔️ | 74.40% | ~75.0% | 73.3% |
|
| 414 |
+
| **ARC-Challenge** | 35.67% | 38.10% | ~40.5% | 30.1% |
|
| 415 |
+
| **HellaSwag** | 57.24% | 60.80% | ~65.0% | 59.2% |
|
| 416 |
+
| **ARC-Easy** | 65.66% | 68.50% | ~70.0% | 55.2% |
|
| 417 |
+
|
| 418 |
+
> 🏆 **WinoGrande**: Outperforms Llama 3.2 1B with only 16K training samples!
|
| 419 |
+
> ⚔️ **PIQA**: Competitive physical reasoning, close to fully-trained baselines
|
| 420 |
+
> 📊 **Data Efficiency**: Achieves comparable results with **16K samples** vs **3T tokens** (TinyLlama)
|
| 421 |
+
|
| 422 |
**Observations:**
|
| 423 |
- Performance is comparable to other 1-2B parameter models
|
| 424 |
- The causal state mechanism does not degrade standard benchmark performance
|
| 425 |
+
- Strong physical reasoning (PIQA: 72.80%) suggests the state captures useful semantic information
|
| 426 |
- Note: These benchmarks don't specifically test long-range causal reasoning where the architecture may have advantages
|
| 427 |
|
| 428 |
### Evaluation Details
|
config.json
CHANGED
|
@@ -30,5 +30,5 @@
|
|
| 30 |
"tie_word_embeddings": false,
|
| 31 |
"transformers_version": "4.57.6",
|
| 32 |
"use_cache": true,
|
| 33 |
-
"vocab_size":
|
| 34 |
}
|
|
|
|
| 30 |
"tie_word_embeddings": false,
|
| 31 |
"transformers_version": "4.57.6",
|
| 32 |
"use_cache": true,
|
| 33 |
+
"vocab_size": 128260
|
| 34 |
}
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0bc1367dcb11e79d389929690e62ca72e2a6d8f1c2496e15485214a95e32c3bd
|
| 3 |
+
size 3099887600
|
special_tokens_map.json
CHANGED
|
@@ -1,4 +1,34 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"bos_token": {
|
| 3 |
"content": "<|begin_of_text|>",
|
| 4 |
"lstrip": false,
|
|
|
|
| 1 |
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
{
|
| 4 |
+
"content": "<|begin_of_thought|>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false
|
| 9 |
+
},
|
| 10 |
+
{
|
| 11 |
+
"content": "<|end_of_thought|>",
|
| 12 |
+
"lstrip": false,
|
| 13 |
+
"normalized": false,
|
| 14 |
+
"rstrip": false,
|
| 15 |
+
"single_word": false
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"content": "<|begin_of_solution|>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
"content": "<|end_of_solution|>",
|
| 26 |
+
"lstrip": false,
|
| 27 |
+
"normalized": false,
|
| 28 |
+
"rstrip": false,
|
| 29 |
+
"single_word": false
|
| 30 |
+
}
|
| 31 |
+
],
|
| 32 |
"bos_token": {
|
| 33 |
"content": "<|begin_of_text|>",
|
| 34 |
"lstrip": false,
|
tokenizer.json
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1a7490b61d01accdadfff5738bded9597f29a70294dd6ecb1cf7da2383dbf663
|
| 3 |
+
size 17210706
|
tokenizer_config.json
CHANGED
|
@@ -2047,8 +2047,46 @@
|
|
| 2047 |
"rstrip": false,
|
| 2048 |
"single_word": false,
|
| 2049 |
"special": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2050 |
}
|
| 2051 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2052 |
"bos_token": "<|begin_of_text|>",
|
| 2053 |
"clean_up_tokenization_spaces": true,
|
| 2054 |
"eos_token": "<|eot_id|>",
|
|
|
|
| 2047 |
"rstrip": false,
|
| 2048 |
"single_word": false,
|
| 2049 |
"special": true
|
| 2050 |
+
},
|
| 2051 |
+
"128256": {
|
| 2052 |
+
"content": "<|begin_of_thought|>",
|
| 2053 |
+
"lstrip": false,
|
| 2054 |
+
"normalized": false,
|
| 2055 |
+
"rstrip": false,
|
| 2056 |
+
"single_word": false,
|
| 2057 |
+
"special": true
|
| 2058 |
+
},
|
| 2059 |
+
"128257": {
|
| 2060 |
+
"content": "<|end_of_thought|>",
|
| 2061 |
+
"lstrip": false,
|
| 2062 |
+
"normalized": false,
|
| 2063 |
+
"rstrip": false,
|
| 2064 |
+
"single_word": false,
|
| 2065 |
+
"special": true
|
| 2066 |
+
},
|
| 2067 |
+
"128258": {
|
| 2068 |
+
"content": "<|begin_of_solution|>",
|
| 2069 |
+
"lstrip": false,
|
| 2070 |
+
"normalized": false,
|
| 2071 |
+
"rstrip": false,
|
| 2072 |
+
"single_word": false,
|
| 2073 |
+
"special": true
|
| 2074 |
+
},
|
| 2075 |
+
"128259": {
|
| 2076 |
+
"content": "<|end_of_solution|>",
|
| 2077 |
+
"lstrip": false,
|
| 2078 |
+
"normalized": false,
|
| 2079 |
+
"rstrip": false,
|
| 2080 |
+
"single_word": false,
|
| 2081 |
+
"special": true
|
| 2082 |
}
|
| 2083 |
},
|
| 2084 |
+
"additional_special_tokens": [
|
| 2085 |
+
"<|begin_of_thought|>",
|
| 2086 |
+
"<|end_of_thought|>",
|
| 2087 |
+
"<|begin_of_solution|>",
|
| 2088 |
+
"<|end_of_solution|>"
|
| 2089 |
+
],
|
| 2090 |
"bos_token": "<|begin_of_text|>",
|
| 2091 |
"clean_up_tokenization_spaces": true,
|
| 2092 |
"eos_token": "<|eot_id|>",
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 6289
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:120f235cb54fc8b650658ee1f6b63c25c7cddb8840b68c1b889aed22347713d3
|
| 3 |
size 6289
|