CharlesCNorton committed on
Commit ·
3615a51
1
Parent(s): 05b1aea
Promote 32-bit/64KB build to canonical, rewrite README
Browse files
Root canonical neural_computer.safetensors is the most capable variant
(32-bit data path, 64 KB memory, ~8.47 M params). The legacy 8-bit/1KB
file at the same path and the redundant root neural_alu32.safetensors are
removed; equivalent configurations remain in variants/.
play.py is now manifest-aware and uses eval_all.py's GenericThresholdCPU,
defaulting to the small (1 KB) profile for fast interactive demos.
README rewritten as a self-contained overview: what the repo is, the file
layout, quick-start code, the architecture (ISA, state layout, instruction
encoding, circuit categories, naming), bit widths and memory profiles,
verification, threshold-logic background and history, neuromorphic
hardware compatibility, and the SmolLM2 LLM integration.
- README.md +480 -793
- neural_alu32.safetensors +0 -3
- neural_computer.safetensors +2 -2
- play.py +217 -457
README.md
CHANGED
|
@@ -1,793 +1,480 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: mit
|
| 3 |
-
tags:
|
| 4 |
-
- threshold-logic
|
| 5 |
-
- neuromorphic
|
| 6 |
-
- computer-architecture
|
| 7 |
-
- turing-complete
|
| 8 |
-
- loihi
|
| 9 |
-
- truenorth
|
| 10 |
-
- akida
|
| 11 |
-
---
|
| 12 |
-
|
| 13 |
-
# 8bit-threshold-computer
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
```
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
##
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
|
| 166 |
-
|
|
| 167 |
-
|
|
| 168 |
-
|
|
| 169 |
-
|
|
| 170 |
-
|
|
| 171 |
-
|
|
| 172 |
-
|
| 173 |
-
---
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
|
| 213 |
-
|
|
| 214 |
-
|
|
| 215 |
-
|
|
| 216 |
-
|
|
| 217 |
-
|
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
```
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
---
|
| 237 |
-
|
| 238 |
-
## Verification
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
|
| 248 |
-
|
|
| 249 |
-
|
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
#
|
| 348 |
-
mlp_out
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
```
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
```
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
```
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
2. **Loss**: Multi-component BCE + CE
|
| 482 |
-
- `result_loss`: BCE on output bits vs expected
|
| 483 |
-
- `a_loss`, `b_loss`: BCE on extracted bits vs ground truth (2× weight)
|
| 484 |
-
- `op_loss`: CE on operation classification
|
| 485 |
-
3. **Optimizer**: AdamW, lr=3e-4, gradient clipping at 1.0
|
| 486 |
-
4. **Curriculum**: Epoch-based range expansion (0-9 → 0-99 → 0-255)
|
| 487 |
-
5. **Batching**: 256-4096 samples per batch (VRAM-dependent)
|
| 488 |
-
|
| 489 |
-
```bash
|
| 490 |
-
# Example training commands
|
| 491 |
-
python train.py --mode router --epochs 100 # Sanity check
|
| 492 |
-
python train.py --mode llm --epochs 100 --batch_size 256 # Frozen LLM
|
| 493 |
-
python train.py --mode llm --unfreeze_layers 4 --batch_size 4096 # Fine-tune top layers
|
| 494 |
-
```
|
| 495 |
-
|
| 496 |
-
### Inference
|
| 497 |
-
|
| 498 |
-
At inference, Heaviside is a true step function—no approximation. If the Extractor correctly identifies operands, the circuit **will** output the correct result.
|
| 499 |
-
|
| 500 |
-
### Target Performance
|
| 501 |
-
|
| 502 |
-
| Condition | Configuration | Accuracy |
|
| 503 |
-
|-----------|---------------|----------|
|
| 504 |
-
| Control | Vanilla SmolLM2-360M | 11.90% |
|
| 505 |
-
| Circuits only | Ground truth bits | 100.00% |
|
| 506 |
-
| Experimental | LLM + Extractor + Circuits | **Target: 100%** |
|
| 507 |
-
|
| 508 |
-
The interface generalizes to **all** 65,536 8-bit additions once trained—no memorization, the circuits compute.
|
| 509 |
-
|
| 510 |
-
### LLM Integration: Proof of Concept (In Progress)
|
| 511 |
-
|
| 512 |
-
Before proceeding with architectural extensions, we are validating the core thesis: that frozen threshold circuits can provide exact arithmetic capability to language models that otherwise fail at computation.
|
| 513 |
-
|
| 514 |
-
#### Baseline Evaluation
|
| 515 |
-
|
| 516 |
-
We evaluated SmolLM2-360M-Instruct on randomized 8-bit arithmetic using a generous answer extraction protocol. The model was prompted with a system message instructing it to output only numeric answers, and we accepted any correct number found in the output (first number, last number, or word-to-number conversion).
|
| 517 |
-
|
| 518 |
-
| Operation | SmolLM2-360M Accuracy | Notes |
|
| 519 |
-
|-----------|----------------------|-------|
|
| 520 |
-
| Addition (A + B) | 35.92% | Best performance, still fails 2/3 |
|
| 521 |
-
| Subtraction (A - B) | 17.72% | Poor handling of borrowing |
|
| 522 |
-
| Multiplication (A × B) | **1.25%** | Near-total failure |
|
| 523 |
-
| Greater Than (A > B) | 14.37% | Often echoes expression |
|
| 524 |
-
| Less Than (A < B) | 4.31% | Often echoes expression |
|
| 525 |
-
| Equality (A == B) | 0.28% | Near-total failure |
|
| 526 |
-
| **Overall Fitness** | **11.90%** | 238/2000 correct |
|
| 527 |
-
|
| 528 |
-
**Methodology**: 2000 randomized test cases with operands uniformly sampled from [0, 255]. Ground truth computed as 8-bit arithmetic (matching the threshold circuit specification). Batch size 64, greedy decoding (temperature=0).
|
| 529 |
-
|
| 530 |
-
**Key Observations**:
|
| 531 |
-
- Multiplication accuracy (1.25%) is essentially random guessing over the output space
|
| 532 |
-
- Comparison operations fail because the model often echoes the expression rather than evaluating it
|
| 533 |
-
- Even addition—the simplest operation—fails nearly two-thirds of the time on 8-bit operands
|
| 534 |
-
- Performance degrades sharply as operand magnitude increases (edge cases like 127+128 are almost never correct)
|
| 535 |
-
|
| 536 |
-
These results establish the **control condition** for our experiment.
|
| 537 |
-
|
| 538 |
-
#### Experimental Design
|
| 539 |
-
|
| 540 |
-
| Condition | Model Configuration | Target Fitness |
|
| 541 |
-
|-----------|---------------------|----------------|
|
| 542 |
-
| **Control** | Vanilla SmolLM2-360M-Instruct | 11.90% (measured) |
|
| 543 |
-
| **Experimental** | SmolLM2-360M + Frozen ThresholdALU + Trained Interface | **100%** |
|
| 544 |
-
|
| 545 |
-
The experimental condition adds:
|
| 546 |
-
1. **BitEncoder** (trainable): Projects hidden states → 24 bits (3 × 8-bit operands)
|
| 547 |
-
2. **OpRouter** (trainable): Selects which circuit to activate based on context
|
| 548 |
-
3. **BitDecoder** (trainable): Projects 8-bit result → hidden state delta
|
| 549 |
-
4. **ThresholdALU** (frozen): The verified circuits from this repository
|
| 550 |
-
|
| 551 |
-
**Training Signal**: The fitness function itself. We do not provide answer supervision—the model must learn to correctly encode operands and route to circuits such that the frozen circuits produce correct outputs. This is possible because the circuits are proven correct; the interface layers need only learn the encoding/routing protocol.
|
| 552 |
-
|
| 553 |
-
**Success Criterion**: If the experimental condition achieves 100% fitness on randomized arithmetic while the control remains at ~12%, this demonstrates:
|
| 554 |
-
1. The frozen threshold circuits provide exact computation
|
| 555 |
-
2. Neural interface layers can learn to use discrete computational substrates
|
| 556 |
-
3. Small language models can achieve perfect arithmetic via architectural augmentation rather than scale
|
| 557 |
-
|
| 558 |
-
#### Progress
|
| 559 |
-
|
| 560 |
-
**Stage 1: Circuit Validation — COMPLETE**
|
| 561 |
-
|
| 562 |
-
The frozen threshold circuits achieve 100% accuracy when given correctly formatted bit inputs:
|
| 563 |
-
|
| 564 |
-
| Test | Result |
|
| 565 |
-
|------|--------|
|
| 566 |
-
| DirectCircuitModel (ground truth bits) | 100.00% on 10,000 random cases |
|
| 567 |
-
| All operations (ADD, SUB, MUL, GT, LT, EQ) | 100.00% each |
|
| 568 |
-
|
| 569 |
-
This confirms the circuits compute correctly. However, this was already established by `eval.py`.
|
| 570 |
-
|
| 571 |
-
**Stage 2: LLM Baseline — COMPLETE**
|
| 572 |
-
|
| 573 |
-
SmolLM2-360M-Instruct baseline on randomized 8-bit arithmetic:
|
| 574 |
-
|
| 575 |
-
| Operation | Accuracy |
|
| 576 |
-
|-----------|----------|
|
| 577 |
-
| Addition | 35.92% |
|
| 578 |
-
| Subtraction | 17.72% |
|
| 579 |
-
| Multiplication | 1.25% |
|
| 580 |
-
| Comparisons | 0.28–14.37% |
|
| 581 |
-
| **Overall** | **11.90%** |
|
| 582 |
-
|
| 583 |
-
Head-to-head on 50 random cases: SmolLM2 got 7/50 (14%), circuits got 50/50 (100%).
|
| 584 |
-
|
| 585 |
-
**Stage 3: LLM Integration — IN PROGRESS**
|
| 586 |
-
|
| 587 |
-
The challenge: train an interface that extracts operands and operations from natural language (not from pre-formatted bit inputs).
|
| 588 |
-
|
| 589 |
-
```
|
| 590 |
-
"47 + 86"
|
| 591 |
-
↓
|
| 592 |
-
[Language Model / Extractor]
|
| 593 |
-
↓
|
| 594 |
-
[a_bits, b_bits, op_logits]
|
| 595 |
-
↓
|
| 596 |
-
[Frozen threshold circuits]
|
| 597 |
-
↓
|
| 598 |
-
[Result bits] → 133
|
| 599 |
-
```
|
| 600 |
-
|
| 601 |
-
**SmolLM2 Approach** (`llm_integration/`):
|
| 602 |
-
|
| 603 |
-
Initial experiments used SmolLM2-360M-Instruct as the language understanding backbone.
|
| 604 |
-
|
| 605 |
-
| Mode | Description | Status |
|
| 606 |
-
|------|-------------|--------|
|
| 607 |
-
| `--mode router` | Train OpRouter with ground truth bits | 100% achieved |
|
| 608 |
-
| `--mode interface` | Train BitEncoder + OpRouter | Ready |
|
| 609 |
-
| `--mode llm` | Train from LLM hidden states | Explored |
|
| 610 |
-
|
| 611 |
-
**LLM Mode Options**:
|
| 612 |
-
- `--unfreeze_layers N`: Fine-tune top N transformer layers
|
| 613 |
-
- `--extract_layer N`: Extract from intermediate layer (-1 = final)
|
| 614 |
-
- `--position_extract`: Position-specific extraction (uses token positions)
|
| 615 |
-
- `--digit_pred`: Predict digits (0-9) instead of bits
|
| 616 |
-
|
| 617 |
-
**Extraction Architectures** (`model.py`):
|
| 618 |
-
- `Extractor`: Attention pooling + per-bit MLPs
|
| 619 |
-
- `PositionExtractor`: Position-aware (operand A from positions 0-2, B from 5-7)
|
| 620 |
-
- `DigitExtractor`: Predicts 3 digits per operand, converts to bits
|
| 621 |
-
- `HybridExtractor`: Digit lookup + MLP fallback for word inputs
|
| 622 |
-
|
| 623 |
-
**Curriculum Learning**: Training progresses 0-9 → 0-99 → 0-255 over epochs.
|
| 624 |
-
|
| 625 |
-
**Observations**: SmolLM2 integration proved challenging: the 360M parameters of pre-trained representations are largely irrelevant to arithmetic parsing, VRAM requirements are high, and there are gradient conflicts between the frozen circuits and the pre-trained weights.
|
| 626 |
-
|
| 627 |
-
**Pivot: From-Scratch Extractor**
|
| 628 |
-
|
| 629 |
-
Given that the task is fundamentally simple—parse `(a, b, op)` from structured text—a lightweight purpose-built model may be more appropriate than adapting a general LLM.
|
| 630 |
-
|
| 631 |
-
```
|
| 632 |
-
"one thousand plus two thousand"
|
| 633 |
-
↓
|
| 634 |
-
[Char-level tokenizer: ~40 tokens]
|
| 635 |
-
↓
|
| 636 |
-
[Small transformer: ~1-5M params]
|
| 637 |
-
↓
|
| 638 |
-
[3 heads: a_value, b_value, op_idx]
|
| 639 |
-
↓
|
| 640 |
-
[Frozen 32-bit threshold circuits]
|
| 641 |
-
↓
|
| 642 |
-
3000
|
| 643 |
-
```
|
| 644 |
-
|
| 645 |
-
**Design principles**:
|
| 646 |
-
- **Minimal Python**: All parsing logic learned in weights, not hardcoded
|
| 647 |
-
- **Character-level input**: No word tokenization; model learns "forty seven" = 47
|
| 648 |
-
- **From-scratch training**: No pre-trained weights to conflict with
|
| 649 |
-
- **32-bit target**: Practical arithmetic range (0–4,294,967,295)
|
| 650 |
-
|
| 651 |
-
**Planned architecture**:
|
| 652 |
-
- Vocab: ~40 chars (a-z, 0-9, space, operators)
|
| 653 |
-
- Embedding: 40 × 128d
|
| 654 |
-
- Encoder: 2-3 transformer layers
|
| 655 |
-
- Output heads: `a_classifier`, `b_classifier`, `op_classifier`
|
| 656 |
-
- Total: ~1-5M params (vs 360M for SmolLM2)
|
| 657 |
-
|
| 658 |
-
This approach treats the problem as what it is: a structured parsing task where the frozen circuits handle all computation. The extractor need only learn the mapping from text to operands—no world knowledge required.
|
| 659 |
-
|
| 660 |
-
#### Proof of Concept Scope
|
| 661 |
-
|
| 662 |
-
- **32-bit operands** (0–4,294,967,295)
|
| 663 |
-
- **Six operations**: ADD, SUB, MUL, GT, LT, EQ
|
| 664 |
-
- **Structured input**: Digits ("1000 + 2000") and number words ("one thousand plus two thousand")
|
| 665 |
-
|
| 666 |
-
**Current Status**:
|
| 667 |
-
- Circuit validation: Complete (100% on 8-bit operations)
|
| 668 |
-
- 32-bit circuits: Built and tested (adder verified on 1M+2M=3M, etc.)
|
| 669 |
-
- LLM baseline: Measured (11.90% - establishes control condition)
|
| 670 |
-
- SmolLM2 integration: Infrastructure complete, training explored
|
| 671 |
-
- From-scratch extractor: Design phase
|
| 672 |
-
|
| 673 |
-
### Extension Roadmap
|
| 674 |
-
|
| 675 |
-
#### Completed
|
| 676 |
-
|
| 677 |
-
1. **32-bit operations (0–4,294,967,295)** — Full 32-bit ALU implemented via `--bits 32` flag:
|
| 678 |
-
- 32-bit ripple carry adder (32 chained full adders) — **verified**
|
| 679 |
-
- 32-bit subtractor (NOT + adder with carry-in)
|
| 680 |
-
- 32-bit multiplication (1024 partial product ANDs)
|
| 681 |
-
- 32-bit division (32 restoring stages)
|
| 682 |
-
- 32-bit comparators (GT, LT, GE, LE, EQ)
|
| 683 |
-
- 32-bit bitwise ops (AND, OR, XOR, NOT)
|
| 684 |
-
- 32-bit shifts (SHL, SHR), INC, DEC, NEG
|
| 685 |
-
|
| 686 |
-
**Known issue**: Single-layer 32-bit comparators use weights up to 2³¹, which exceeds float32 mantissa precision (24 bits). Comparisons between large numbers differing only in low bits may fail. Fix planned: cascaded byte-wise comparison (compare MSB first, if equal compare next byte, etc.).
|
| 687 |
-
|
| 688 |
-
2. **3-operand addition (15 + 27 + 33 = 75)** — `arithmetic.add3_8bit` chains two 8-bit ripple carry stages. 16 full adders, 144 gates, 240 test cases verified.
|
| 689 |
-
|
| 690 |
-
3. **Order of operations (5 + 3 × 2 = 11)** — `arithmetic.expr_add_mul` computes A + (B × C) using shift-add multiplication then addition. 64 AND gates + 64 full adders, 73 test cases verified.
|
| 691 |
-
|
| 692 |
-
#### Planned
|
| 693 |
-
|
| 694 |
-
1. **Cascaded 32-bit comparators** — Replace single-layer weighted comparison with multi-layer byte-wise cascade. Each byte comparison uses 8-bit weights (max 128), well within float32 precision. Hardware-accurate and extensible to 64-bit, 128-bit, etc.
|
| 695 |
-
|
| 696 |
-
2. **Parenthetical expressions ((5 + 3) × 2 = 16)** — Explicit grouping overrides precedence. Parser must recognize parens and build correct tree. Evaluation proceeds innermost-out.
|
| 697 |
-
|
| 698 |
-
3. **Multi-operation chains (a + b - c × d)** — Sequential dispatch through multiple circuits with intermediate result routing. Requires state management in interface layers.
|
| 699 |
-
|
| 700 |
-
4. **Floating point arithmetic** — IEEE 754-style with separate circuits for mantissa and exponent. ADD: align exponents, add mantissas, renormalize. MUL: add exponents, multiply mantissas.
|
| 701 |
-
|
| 702 |
-
5. **Full CPU integration** — Enable memory access circuits for stateful computation. Allows multi-step algorithms executed entirely within threshold logic.
|
| 703 |
-
|
| 704 |
-
---
|
| 705 |
-
|
| 706 |
-
## Build Tool
|
| 707 |
-
|
| 708 |
-
Output filenames are auto-generated from configuration:
|
| 709 |
-
|
| 710 |
-
```
|
| 711 |
-
Format: neural_{alu|computer}{BITS}[_{MEMORY}].safetensors
|
| 712 |
-
|
| 713 |
-
Examples:
|
| 714 |
-
neural_alu8.safetensors # 8-bit, no memory
|
| 715 |
-
neural_alu32.safetensors # 32-bit, no memory
|
| 716 |
-
neural_computer8.safetensors # 8-bit, full memory (default)
|
| 717 |
-
neural_computer32.safetensors # 32-bit, full memory
|
| 718 |
-
neural_computer8_small.safetensors # 8-bit, 1KB memory
|
| 719 |
-
neural_computer32_small.safetensors # 32-bit, 1KB memory
|
| 720 |
-
neural_computer8_addr12.safetensors # 8-bit, custom 4KB (2^12 bytes)
|
| 721 |
-
```
|
| 722 |
-
|
| 723 |
-
```bash
|
| 724 |
-
# 8-bit CPU (default)
|
| 725 |
-
python build.py --apply all # -> neural_computer8.safetensors
|
| 726 |
-
python build.py -m none --apply all # -> neural_alu8.safetensors
|
| 727 |
-
python build.py -m scratchpad --apply all # -> neural_computer8_scratchpad.safetensors
|
| 728 |
-
|
| 729 |
-
# 16-bit ALU
|
| 730 |
-
python build.py --bits 16 --apply all # -> neural_computer16.safetensors
|
| 731 |
-
python build.py --bits 16 -m none --apply all # -> neural_alu16.safetensors
|
| 732 |
-
|
| 733 |
-
# 32-bit ALU
|
| 734 |
-
python build.py --bits 32 -m small --apply all # -> neural_computer32_small.safetensors
|
| 735 |
-
python build.py --bits 32 -m none --apply all # -> neural_alu32.safetensors
|
| 736 |
-
|
| 737 |
-
# Custom address width
|
| 738 |
-
python build.py --bits 16 -a 6 --apply all # -> neural_computer16_addr6.safetensors
|
| 739 |
-
```
|
| 740 |
-
|
| 741 |
-
**Bit widths** (`--bits`):
|
| 742 |
-
|
| 743 |
-
| Width | Range | Use Case |
|
| 744 |
-
|-------|-------|----------|
|
| 745 |
-
| 8 | 0–255 | Full CPU, legacy |
|
| 746 |
-
| 16 | 0–65,535 | Extended arithmetic |
|
| 747 |
-
| 32 | 0–4,294,967,295 | Practical arithmetic |
|
| 748 |
-
|
| 749 |
-
**Memory profiles** (`-m`):
|
| 750 |
-
|
| 751 |
-
| Profile | Size | Addr Bits | Filename Suffix | Params | Use Case |
|
| 752 |
-
|---------|------|-----------|-----------------|--------|----------|
|
| 753 |
-
| `none` | 0B | — | (uses `alu`) | ~32K | Pure ALU |
|
| 754 |
-
| `registers` | 16B | 4 | `_registers` | ~34K | Minimal state |
|
| 755 |
-
| `scratchpad` | 256B | 8 | `_scratchpad` | ~63K | 8-bit scratch |
|
| 756 |
-
| `small` | 1KB | 10 | `_small` | ~123K | 32-bit scratch |
|
| 757 |
-
| `reduced` | 4KB | 12 | `_reduced` | ~549K | Small programs |
|
| 758 |
-
| `full` | 64KB | 16 | (none) | ~8.29M | Full CPU |
|
| 759 |
-
|
| 760 |
-
**Custom address width** (`-a N`): Memory size = 2^N bytes, suffix = `_addrN`
|
| 761 |
-
|
| 762 |
-
---
|
| 763 |
-
|
| 764 |
-
## Citation
|
| 765 |
-
|
| 766 |
-
```bibtex
|
| 767 |
-
@misc{8bit-threshold-computer,
|
| 768 |
-
title={8bit-threshold-computer: A Turing-Complete Threshold Logic CPU},
|
| 769 |
-
author={Norton, Charles},
|
| 770 |
-
year={2026},
|
| 771 |
-
howpublished={Hugging Face},
|
| 772 |
-
url={https://huggingface.co/phanerozoic/8bit-threshold-computer}
|
| 773 |
-
}
|
| 774 |
-
```
|
| 775 |
-
|
| 776 |
-
---
|
| 777 |
-
|
| 778 |
-
## License
|
| 779 |
-
|
| 780 |
-
MIT
|
| 781 |
-
|
| 782 |
-
---
|
| 783 |
-
|
| 784 |
-
## References
|
| 785 |
-
|
| 786 |
-
1. McCulloch & Pitts (1943). "A Logical Calculus of Ideas Immanent in Nervous Activity"
|
| 787 |
-
2. Muroga (1971). "Threshold Logic and Its Applications"
|
| 788 |
-
3. Siegelmann & Sontag (1995). "On the Computational Power of Neural Nets"
|
| 789 |
-
4. Bengio et al. (2013). "Estimating or Propagating Gradients Through Stochastic Neurons"
|
| 790 |
-
5. Ma et al. (2024). "The Era of 1-bit LLMs" (BitNet b1.58)
|
| 791 |
-
6. HuggingFace (2024). "SmolLM2: Small Language Models" — [Model Card](https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct)
|
| 792 |
-
7. Vaswani et al. (2017). "Attention Is All You Need" — Transformer architecture
|
| 793 |
-
8. Su et al. (2021). "RoFormer: Enhanced Transformer with Rotary Position Embedding" — RoPE
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
tags:
|
| 4 |
+
- threshold-logic
|
| 5 |
+
- neuromorphic
|
| 6 |
+
- computer-architecture
|
| 7 |
+
- turing-complete
|
| 8 |
+
- loihi
|
| 9 |
+
- truenorth
|
| 10 |
+
- akida
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# 8bit-threshold-computer
|
| 14 |
+
|
| 15 |
+
A Turing-complete CPU implemented entirely as threshold logic gates. Every gate, from Boolean primitives to arithmetic to control flow, is a single threshold neuron of the form:
|
| 16 |
+
|
| 17 |
+
```
|
| 18 |
+
output = 1 if (Σ wᵢ·xᵢ + b) ≥ 0 else 0
|
| 19 |
+
```
|
| 20 |
+
|
| 21 |
+
Weights and biases are integers; activations are the Heaviside step. Nothing else.
|
| 22 |
+
|
| 23 |
+
The repository ships eighteen prebuilt configurations spanning three data-path widths (8, 16, 32 bits) and six memory sizes (0 B to 64 KB). The canonical file at the repo root is the largest of these: a 32-bit data path with a 64 KB address space and ~8.47 M parameters.
|
| 24 |
+
|
| 25 |
+
```
|
| 26 |
+
neural_computer.safetensors 32-bit data, 64 KB memory, ~8.47M params (canonical)
|
| 27 |
+
variants/neural_computer{8,16,32}.safetensors full memory (64 KB)
|
| 28 |
+
variants/neural_computer{8,16,32}_reduced.safetensors 4 KB memory
|
| 29 |
+
variants/neural_computer{8,16,32}_small.safetensors 1 KB memory
|
| 30 |
+
variants/neural_computer{8,16,32}_scratchpad.safetensors 256 B memory
|
| 31 |
+
variants/neural_computer{8,16,32}_registers.safetensors 16 B memory
|
| 32 |
+
variants/neural_alu{8,16,32}.safetensors pure ALU, no memory
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
---
|
| 36 |
+
|
| 37 |
+
## Quick start
|
| 38 |
+
|
| 39 |
+
```python
|
| 40 |
+
import torch
|
| 41 |
+
from safetensors.torch import load_file
|
| 42 |
+
|
| 43 |
+
tensors = load_file("neural_computer.safetensors")
|
| 44 |
+
|
| 45 |
+
def heaviside(x):
|
| 46 |
+
return (x >= 0).float()
|
| 47 |
+
|
| 48 |
+
# AND gate: fires when both inputs are 1
|
| 49 |
+
w = tensors['boolean.and.weight'] # [2]
|
| 50 |
+
b = tensors['boolean.and.bias'] # [1]
|
| 51 |
+
for a, c in [(0, 0), (0, 1), (1, 0), (1, 1)]:
|
| 52 |
+
out = heaviside((torch.tensor([a, c], dtype=torch.float32) * w).sum() + b)
|
| 53 |
+
print(f"AND({a}, {c}) = {int(out.item())}")
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
Run the full circuit verification suite against any variant:
|
| 57 |
+
|
| 58 |
+
```bash
|
| 59 |
+
python eval_all.py variants/ # all 18 in one pass
|
| 60 |
+
python eval_all.py neural_computer.safetensors # the canonical file
|
| 61 |
+
python eval_all.py --cpu-program variants/ # also run an assembled
|
| 62 |
+
# program through the
|
| 63 |
+
# threshold-gated CPU
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
`eval_all.py` reads each variant's manifest, runs a gate-level fitness suite (5,900–7,800 tests per variant covering Boolean, arithmetic, ALU, control, modular, error-detection, threshold, and IEEE 754 float circuits), and optionally executes a small assembled program through a manifest-sized threshold CPU plus a chained 16- or 32-bit ALU sequence on wider variants.
|
| 67 |
+
|
| 68 |
+
For an interactive walkthrough that exercises Boolean gates, the 8-bit ALU, mod-5 divisibility, and a CPU loop end-to-end:
|
| 69 |
+
|
| 70 |
+
```bash
|
| 71 |
+
python play.py # 1 KB demo, runs in seconds
|
| 72 |
+
python play.py --model neural_computer.safetensors # 64 KB, slower
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
---
|
| 76 |
+
|
| 77 |
+
## Execution model
|
| 78 |
+
|
| 79 |
+
A self-contained machine. State goes in, state comes out:
|
| 80 |
+
|
| 81 |
+
- **Pure tensor computation**: state in, state out
|
| 82 |
+
- **Frozen circuits**: integer weights, Heaviside activation
|
| 83 |
+
- **ACT execution**: internal loop until `HALT`
|
| 84 |
+
- **No external orchestration**: one forward pass equals one complete program execution
|
| 85 |
+
|
| 86 |
+
```
|
| 87 |
+
┌─────────────────────────────┐
|
| 88 |
+
│ Initial State │
|
| 89 |
+
│ [PC|Regs|Flags|Memory...] │
|
| 90 |
+
└─────────────┬───────────────┘
|
| 91 |
+
▼
|
| 92 |
+
┌─────────────────────────────┐
|
| 93 |
+
│ Threshold Circuit Layer │
|
| 94 |
+
│ ┌───────────────────────┐ │
|
| 95 |
+
│ │ Fetch: PC → Instr │ │
|
| 96 |
+
│ ├───────────────────────┤ │
|
| 97 |
+
│ │ Decode: Opcode/Ops │ │
|
| 98 |
+
│ ├───────────────────────┤ │
|
| 99 |
+
│ │ Execute: ALU/Mem │ │
|
| 100 |
+
│ ├───────────────────────┤ │
|
| 101 |
+
│ │ Writeback: Results │ │
|
| 102 |
+
│ ├───────────────────────┤ │
|
| 103 |
+
│ │ PC Update │ │
|
| 104 |
+
│ └───────────┬───────────┘ │
|
| 105 |
+
│ │ │
|
| 106 |
+
│ ┌────▼────┐ │
|
| 107 |
+
│ │ HALTED? │ │
|
| 108 |
+
│ └────┬────┘ │
|
| 109 |
+
│ no ────┴──── yes │
|
| 110 |
+
│ │ │ │
|
| 111 |
+
│ ▼ ▼ │
|
| 112 |
+
│ [loop] [exit] │
|
| 113 |
+
└─────────────┬───────────────┘
|
| 114 |
+
▼
|
| 115 |
+
┌─────────────────────────────┐
|
| 116 |
+
│ Final State │
|
| 117 |
+
└─────────────────────────────┘
|
| 118 |
+
```
|
| 119 |
+
|
| 120 |
+
### Instruction set
|
| 121 |
+
|
| 122 |
+
| Opcode | Mnemonic | Operation |
|
| 123 |
+
|--------|----------|-----------|
|
| 124 |
+
| 0x0 | ADD | R[d] = R[a] + R[b] |
|
| 125 |
+
| 0x1 | SUB | R[d] = R[a] - R[b] |
|
| 126 |
+
| 0x2 | AND | R[d] = R[a] & R[b] |
|
| 127 |
+
| 0x3 | OR | R[d] = R[a] \| R[b] |
|
| 128 |
+
| 0x4 | XOR | R[d] = R[a] ^ R[b] |
|
| 129 |
+
| 0x5 | SHL | R[d] = R[a] << 1 |
|
| 130 |
+
| 0x6 | SHR | R[d] = R[a] >> 1 |
|
| 131 |
+
| 0x7 | MUL | R[d] = R[a] * R[b] |
|
| 132 |
+
| 0x8 | DIV | R[d] = R[a] / R[b] |
|
| 133 |
+
| 0x9 | CMP | flags = R[a] - R[b] |
|
| 134 |
+
| 0xA | LOAD | R[d] = M[addr] |
|
| 135 |
+
| 0xB | STORE | M[addr] = R[s] |
|
| 136 |
+
| 0xC | JMP | PC = addr |
|
| 137 |
+
| 0xD | Jcc | PC = addr if cond (imm8[2:0]: 0=Z, 1=NZ, 2=C, 3=NC, 4=N, 5=P, 6=V, 7=NV) |
|
| 138 |
+
| 0xE | CALL | push PC; PC = addr |
|
| 139 |
+
| 0xF | HALT | stop execution |
|
| 140 |
+
|
| 141 |
+
### State tensor layout
|
| 142 |
+
|
| 143 |
+
All multi-bit fields are MSB-first (index 0 is the most-significant bit).
|
| 144 |
+
|
| 145 |
+
```
|
| 146 |
+
[ PC[N] | IR[16] | R0[8] R1[8] R2[8] R3[8] | FLAGS[4] | SP[N] | CTRL[4] | MEM[2^N][8] ]
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
`N` is the address width (configurable, 0–16). Flags are ordered `Z, N, C, V`. Control bits are ordered `HALT, MEM_WE, MEM_RE, RESERVED`.
|
| 150 |
+
|
| 151 |
+
### Instruction encoding (16-bit, MSB-first)
|
| 152 |
+
|
| 153 |
+
```
|
| 154 |
+
15..12 11..10 9..8 7..0
|
| 155 |
+
opcode rd rs imm8
|
| 156 |
+
```
|
| 157 |
+
|
| 158 |
+
Interpretation:
|
| 159 |
+
- **R-type**: `rd = rd op rs` (imm8 ignored)
|
| 160 |
+
- **I-type**: `rd = op rd, imm8` (rs ignored)
|
| 161 |
+
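- **Address-extended**: `LOAD`, `STORE`, `JMP`, `Jcc`, `CALL` consume the next word as a 16-bit address (big-endian). `imm8` is reserved, except for `Jcc`, which encodes its condition code in `imm8[2:0]`. When a conditional jump is not taken, the PC skips 4 bytes (the instruction word plus the address word).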
|
| 162 |
+
|
| 163 |
+
### Circuit categories
|
| 164 |
+
|
| 165 |
+
| Category | Circuits | Examples |
|
| 166 |
+
|----------|----------|----------|
|
| 167 |
+
| Boolean | 9 | AND, OR, NOT, NAND, NOR, XOR, XNOR, IMPLIES, BIIMPLIES |
|
| 168 |
+
| Arithmetic | 18+ | half/full adder, ripple-carry (8/16/32-bit), comparators (8/16/32-bit), 3-operand adder, A+B×C and (A+B)×C expressions |
|
| 169 |
+
| ALU | 8/16/32-bit | shifts, multiply, divide, INC/DEC, NEG, ROL/ROR, bitwise |
|
| 170 |
+
| Combinational | 10+ | MUX (2:1, 4:1, 8:1), DEMUX, 3-to-8 decoder, 8-to-3 encoder, barrel shifter, priority encoder |
|
| 171 |
+
| Control flow | 16 | JMP, conditional jumps (JZ/JNZ/JC/JNC/JN/JP/JV/JNV), CALL, RET, PUSH, POP |
|
| 172 |
+
| Memory | 3 | N-bit address decoder, read mux, write cells (packed) |
|
| 173 |
+
| Modular | 11 | divisibility by 2–12 (multi-layer for non-powers-of-2) |
|
| 174 |
+
| Threshold | 13 | k-of-n gates, majority, minority, exactly-k |
|
| 175 |
+
| Pattern | 10 | popcount, leading/trailing ones, symmetry |
|
| 176 |
+
| Error detection | 11 | parity (XOR tree), checksum, CRC, Hamming |
|
| 177 |
+
| Float (IEEE 754) | half + single | pack/unpack, classify, normalize, ADD, MUL, DIV, EQ/LT/LE/GT/GE |
|
| 178 |
+
|
| 179 |
+
### Tensor naming
|
| 180 |
+
|
| 181 |
+
```
|
| 182 |
+
{category}.{circuit}[.{layer}][.{component}].{weight|bias}
|
| 183 |
+
|
| 184 |
+
Examples:
|
| 185 |
+
boolean.and.weight
|
| 186 |
+
boolean.xor.layer1.neuron1.weight
|
| 187 |
+
arithmetic.ripplecarry8bit.fa7.ha2.sum.layer1.or.weight
|
| 188 |
+
modular.mod5.layer2.eq3.weight
|
| 189 |
+
error_detection.paritychecker8bit.stage2.xor1.layer1.nand.bias
|
| 190 |
+
```
|
| 191 |
+
|
| 192 |
+
Memory circuits are stored as packed tensors so the safetensors header stays manageable (`memory.addr_decode.weight`, `memory.read.and.weight`, `memory.write.and_old.weight`, etc.).
|
| 193 |
+
|
| 194 |
+
---
|
| 195 |
+
|
| 196 |
+
## Bit widths and memory profiles
|
| 197 |
+
|
| 198 |
+
The build tool emits one of 51 functionally distinct configurations: three data-path widths × seventeen address widths (0–16, where 0 means no memory).
|
| 199 |
+
|
| 200 |
+
**Bit widths** (`--bits`):
|
| 201 |
+
|
| 202 |
+
| Width | Range | Use case |
|
| 203 |
+
|-------|-------|----------|
|
| 204 |
+
| 8 | 0–255 | full CPU, legacy compatibility |
|
| 205 |
+
| 16 | 0–65,535 | extended arithmetic |
|
| 206 |
+
| 32 | 0–4,294,967,295 | practical arithmetic ranges |
|
| 207 |
+
|
| 208 |
+
**Memory profiles** (`-m`):
|
| 209 |
+
|
| 210 |
+
| Profile | Size | Addr bits | Filename suffix |
|
| 211 |
+
|---------|------|-----------|-----------------|
|
| 212 |
+
| `none` | 0 B | 0 | (uses `alu` instead of `computer`) |
|
| 213 |
+
| `registers` | 16 B | 4 | `_registers` |
|
| 214 |
+
| `scratchpad` | 256 B | 8 | `_scratchpad` |
|
| 215 |
+
| `small` | 1 KB | 10 | `_small` |
|
| 216 |
+
| `reduced` | 4 KB | 12 | `_reduced` |
|
| 217 |
+
| `full` | 64 KB | 16 | (none) |
|
| 218 |
+
|
| 219 |
+
Auto-generated filename: `neural_{alu|computer}{BITS}[_{MEMORY}].safetensors`. Custom address widths via `-a N` produce `_addrN`.
|
| 220 |
+
|
| 221 |
+
```bash
|
| 222 |
+
python build.py --bits 32 --apply all # neural_computer32.safetensors
|
| 223 |
+
python build.py --bits 8 -m none --apply all # neural_alu8.safetensors
|
| 224 |
+
python build.py --bits 16 -m small --apply all # neural_computer16_small.safetensors
|
| 225 |
+
python build.py --bits 32 -a 6 --apply all # neural_computer32_addr6.safetensors
|
| 226 |
+
```
|
| 227 |
+
|
| 228 |
+
To regenerate every named variant in one pass:
|
| 229 |
+
|
| 230 |
+
```bash
|
| 231 |
+
python build_all.py
|
| 232 |
+
```
|
| 233 |
+
|
| 234 |
+
This populates `variants/` with all 18 builds and runs `eval.py` on each as a sanity check.
|
| 235 |
+
|
| 236 |
+
---
|
| 237 |
+
|
| 238 |
+
## Verification
|
| 239 |
+
|
| 240 |
+
| Category | Status | Notes |
|
| 241 |
+
|----------|--------|-------|
|
| 242 |
+
| Boolean gates | exhaustive | all 2^n input combinations |
|
| 243 |
+
| Arithmetic | exhaustive | full 8-bit range; strategic sampling at 16/32-bit |
|
| 244 |
+
| ALU | exhaustive | every operation, every input |
|
| 245 |
+
| Control flow | exhaustive | branch and jump conditions |
|
| 246 |
+
| Threshold | exhaustive | k-of-n, majority, etc. |
|
| 247 |
+
| Modular (mod 3, 5, 6, 7, 9, 10, 11, 12) | exhaustive | multi-layer, hand-constructed |
|
| 248 |
+
| Parity | exhaustive | XOR tree, hand-constructed |
|
| 249 |
+
| Modular (mod 2, 4, 8) | exhaustive | single-layer, trivial |
|
| 250 |
+
|
| 251 |
+
Divisibility by non-powers-of-2 (3, 5, 7, ...) is not linearly separable in binary, so those circuits are multi-layer. Eight-bit parity (XOR of all bits) is likewise not linearly separable and requires a tree of XOR gates. All 8-bit circuits pass exhaustive testing over their full input domains; 16- and 32-bit arithmetic is verified by strategic sampling.
|
| 252 |
+
|
| 253 |
+
`eval_all.py` runs the unified suite. Exit code is the number of failing variants (0 means all pass).
|
| 254 |
+
|
| 255 |
+
---
|
| 256 |
+
|
| 257 |
+
## Threshold logic
|
| 258 |
+
|
| 259 |
+
A threshold gate computes a Boolean function by taking a weighted sum of binary inputs and comparing the result to a threshold; the output is 1 when the sum meets or exceeds the threshold and 0 otherwise. Equivalently, it is a neuron with Heaviside step activation, integer weights, and an integer bias.
|
| 260 |
+
|
| 261 |
+
Threshold gates are strictly more powerful than standard Boolean gates. A single threshold gate can compute any linearly separable Boolean function. This includes AND, OR, NAND, NOR, and IMPLIES, as well as functions such as majority and k-of-n that require multiple levels of conventional gates. Functions that are not linearly separable (XOR, parity, mod-k for k not a power of two) require multiple threshold layers.
|
| 262 |
+
|
| 263 |
+
Example gates:
|
| 264 |
+
|
| 265 |
+
```
|
| 266 |
+
AND: w=[1, 1], b=-2
|
| 267 |
+
H(0+0-2) = 0 H(1+1-2) = 1
|
| 268 |
+
|
| 269 |
+
OR: w=[1, 1], b=-1
|
| 270 |
+
H(0+0-1) = 0 H(1+0-1) = 1
|
| 271 |
+
|
| 272 |
+
XOR: two layers (not linearly separable)
|
| 273 |
+
layer 1: OR + NAND
|
| 274 |
+
layer 2: AND of the two
|
| 275 |
+
```
|
| 276 |
+
|
| 277 |
+
A full adder is two half-adders plus a carry OR, around four threshold layers. An 8-bit ripple-carry adder is eight chained full adders, around 32 layers.
|
| 278 |
+
|
| 279 |
+
### History
|
| 280 |
+
|
| 281 |
+
Warren McCulloch and Walter Pitts introduced the threshold neuron in 1943, proving that networks of such neurons can compute any Boolean function. Their work preceded both the perceptron and modern neural networks and established the theoretical foundation for neural computation.
|
| 282 |
+
|
| 283 |
+
The 1960s saw substantial work on threshold logic synthesis. Saburo Muroga, Robert McNaughton, and Michael Dertouzos developed algebraic methods for determining whether a Boolean function can be implemented as a single threshold gate, and if so, how to compute the appropriate weights. The focus was on individual gates rather than complete systems.
|
| 284 |
+
|
| 285 |
+
Frank Rosenblatt's Mark I Perceptron (1957–1960) implemented threshold neurons in hardware using potentiometers for weights, but it was a pattern classifier that learned its weights through training; the final configurations were not published. Bernard Widrow's ADALINE and MADALINE (1960–1963) similarly used adaptive threshold elements with weights learned via the LMS algorithm.
|
| 286 |
+
|
| 287 |
+
Hava Siegelmann and Eduardo Sontag proved in the 1990s that recurrent neural networks are Turing-complete. The construction relied on continuous sigmoid activations with infinite precision, not the discrete step function used in threshold logic. Other theoretical work on neural Turing machines and differentiable computers followed similar patterns: universality with continuous activations chosen to support gradient-based training.
|
| 288 |
+
|
| 289 |
+
### Neuromorphic hardware
|
| 290 |
+
|
| 291 |
+
Modern neuromorphic processors implement large arrays of configurable threshold-like neurons in silicon:
|
| 292 |
+
|
| 293 |
+
- **Intel Loihi** (2017): 128 neuromorphic cores with programmable synaptic weights, spike-based communication, and on-chip learning. Supports integer weights and configurable neuron dynamics.
|
| 294 |
+
- **IBM TrueNorth** (2014): one million neurons and 256 million synapses across a 4096-core array. Each neurosynaptic core implements 256 neurons with configurable weights and thresholds.
|
| 295 |
+
- **BrainChip Akida** (2021): edge-oriented event-based processing with integer weights.
|
| 296 |
+
- **SpiNNaker** (University of Manchester): ARM cores simulating spiking networks at scale.
|
| 297 |
+
|
| 298 |
+
Published work on these platforms has focused on neural network inference, sensory processing, and pattern recognition. A 2024 paper demonstrated basic logic gates, adders, and decoders on SpiNNaker and Dynap-SE1 and described that work as "a first step toward the construction of a spiking computer"; that implementation lacked instruction fetch, a program counter, memory, and control logic.
|
| 299 |
+
|
| 300 |
+
The weights in this repository implement a complete CPU: registers, ALU with 16 operations, status flags, conditional branching, subroutine calls, stack operations, and memory access. Every component is a threshold neuron with integer weights.
|
| 301 |
+
|
| 302 |
+
---
|
| 303 |
+
|
| 304 |
+
## Hardware compatibility
|
| 305 |
+
|
| 306 |
+
All weights are integers, all activations are Heaviside step, and every gate is a single weighted sum. The circuits are intended to deploy directly on:
|
| 307 |
+
|
| 308 |
+
- **Intel Loihi**
|
| 309 |
+
- **IBM TrueNorth**
|
| 310 |
+
- **BrainChip Akida**
|
| 311 |
+
|
| 312 |
+
---
|
| 313 |
+
|
| 314 |
+
## LLM integration
|
| 315 |
+
|
| 316 |
+
Threshold circuits can be embedded into transformer MLP layers to give a language model exact arithmetic. Standard LLMs fail at arithmetic because they interpolate over the training distribution rather than compute, so a 360M-parameter model trained on web text has seen `127 + 128 = 255` few times if at all and guesses based on pattern matching.
|
| 317 |
+
|
| 318 |
+
The integration freezes the circuits and trains only the interface layers that:
|
| 319 |
+
|
| 320 |
+
1. Extract operands from token embeddings.
|
| 321 |
+
2. Route computation through the appropriate circuit.
|
| 322 |
+
3. Inject the result back into the residual stream.
|
| 323 |
+
|
| 324 |
+
The model learns *call dispatch*; the arithmetic is already solved.
|
| 325 |
+
|
| 326 |
+
### Architecture
|
| 327 |
+
|
| 328 |
+
```
|
| 329 |
+
x ──┬── MLP path ─────────────────┬── + ── output
|
| 330 |
+
│ │
|
| 331 |
+
└── BitExtractor ── Circuit ──┴── BitInjector
|
| 332 |
+
│
|
| 333 |
+
Router (learned weighting)
|
| 334 |
+
```
|
| 335 |
+
|
| 336 |
+
Augmented MLP forward pass:
|
| 337 |
+
|
| 338 |
+
```python
|
| 339 |
+
def forward(self, x):  # x: [batch, seq, d_model=960]
|
| 340 |
+
mlp_out = self.down_proj(silu(self.gate_proj(x)) * self.up_proj(x))
|
| 341 |
+
|
| 342 |
+
a_bits, b_bits = self.bit_extractor(x) # [batch, seq, 8] each
|
| 343 |
+
result_bits, carry = self.circuits.add_8bit(a_bits, b_bits)
|
| 344 |
+
flags = self.compute_flags(result_bits, carry)
|
| 345 |
+
circuit_delta = self.bit_injector(result_bits, flags)
|
| 346 |
+
|
| 347 |
+
route_weights = self.router(x) # [batch, seq, 2] softmax
|
| 348 |
+
return mlp_out + route_weights[..., 1:2] * circuit_delta
|
| 349 |
+
```
|
| 350 |
+
|
| 351 |
+
### Target model
|
| 352 |
+
|
| 353 |
+
The reference integration uses HuggingFace's [SmolLM2-360M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct). See [`llm_integration/smollm2/SMOLLM2_ARCHITECTURE.md`](llm_integration/smollm2/SMOLLM2_ARCHITECTURE.md) for the full technical analysis.
|
| 354 |
+
|
| 355 |
+
| Property | Value |
|
| 356 |
+
|----------|-------|
|
| 357 |
+
| Parameters | 361.82 M |
|
| 358 |
+
| Hidden dimension | 960 (matches the extractor input) |
|
| 359 |
+
| Layers | 32 transformer blocks |
|
| 360 |
+
| Attention | 15 query heads, 5 KV heads (GQA) |
|
| 361 |
+
| MLP | SwiGLU (960 → 2560 → 960) |
|
| 362 |
+
| Position encoding | RoPE (theta = 100k, max 8192) |
|
| 363 |
+
|
| 364 |
+
Digits tokenize individually (`"47 + 86"` → `['4', '7', ' +', ' ', '8', '6']`, with digit token IDs `32 + digit_value`), which makes position-based operand extraction practical.
|
| 365 |
+
|
| 366 |
+
### Gradient flow
|
| 367 |
+
|
| 368 |
+
Heaviside has zero gradient almost everywhere. The implementation uses a straight-through estimator:
|
| 369 |
+
|
| 370 |
+
```python
|
| 371 |
+
class HeavisideSTE(torch.autograd.Function):
|
| 372 |
+
@staticmethod
|
| 373 |
+
def forward(ctx, x):
|
| 374 |
+
return (x >= 0).float()
|
| 375 |
+
|
| 376 |
+
@staticmethod
|
| 377 |
+
def backward(ctx, grad_output):
|
| 378 |
+
return grad_output
|
| 379 |
+
```
|
| 380 |
+
|
| 381 |
+
At inference, Heaviside is the true step function; if the extractor identifies operands correctly, the circuit produces the correct result by construction.
|
| 382 |
+
|
| 383 |
+
### Baseline
|
| 384 |
+
|
| 385 |
+
SmolLM2-360M-Instruct on randomized 8-bit arithmetic (2,000 cases, operands uniform on [0, 255], generous answer extraction):
|
| 386 |
+
|
| 387 |
+
| Operation | Accuracy |
|
| 388 |
+
|-----------|----------|
|
| 389 |
+
| Addition | 35.92% |
|
| 390 |
+
| Subtraction | 17.72% |
|
| 391 |
+
| Multiplication | 1.25% |
|
| 392 |
+
| Greater than | 14.37% |
|
| 393 |
+
| Less than | 4.31% |
|
| 394 |
+
| Equality | 0.28% |
|
| 395 |
+
| **Overall** | **11.90%** (238/2000) |
|
| 396 |
+
|
| 397 |
+
Multiplication accuracy at 1.25% is essentially random over the output space. Comparison operations often echo the expression rather than evaluate it. Even addition fails roughly two-thirds of the time on full 8-bit operands. Performance degrades further as operand magnitude increases: edge cases like `127 + 128` are almost never correct.
|
| 398 |
+
|
| 399 |
+
The frozen threshold circuits reach 100% on the same task when given correctly formatted bit inputs (10,000 random cases, every operation). The integration challenge is therefore the extractor, not the arithmetic.
|
| 400 |
+
|
| 401 |
+
### Trainable parameters (SmolLM2, hidden_dim = 960)
|
| 402 |
+
|
| 403 |
+
| Component | Parameters | Description |
|
| 404 |
+
|-----------|------------|-------------|
|
| 405 |
+
| AttentionPooling | ~3.7 M | 4-head attention over the sequence |
|
| 406 |
+
| MultiHeadBitExtractor (× 2) | ~245 K each | 8 per-bit MLPs for A and B |
|
| 407 |
+
| OpRouter | ~246 K | 960 → 256 → 6 MLP |
|
| 408 |
+
| **Extractor total** | **~4.4 M** | full extraction module |
|
| 409 |
+
|
| 410 |
+
Alternative architectures: `PositionExtractor` (~1.5 M, position-specific, no attention), `DigitExtractor` (~1.2 M, predicts digits 0–9 instead of bits), `HybridExtractor` (digit lookup with MLP fallback for word numerals). With `--unfreeze_layers 4` an additional ~39.3 M trainable parameters open up in the top four transformer layers.
|
| 411 |
+
|
| 412 |
+
### Training
|
| 413 |
+
|
| 414 |
+
```bash
|
| 415 |
+
python train.py --mode router --epochs 100 # sanity check
|
| 416 |
+
python train.py --mode llm --epochs 100 --batch_size 256 # frozen LLM
|
| 417 |
+
python train.py --mode llm --unfreeze_layers 4 --batch_size 4096 # fine-tune top layers
|
| 418 |
+
```
|
| 419 |
+
|
| 420 |
+
Loss components: BCE on output bits, BCE on extracted A and B bits (2× weight), and CE on operation classification. Curriculum runs 0–9 → 0–99 → 0–255. Optimizer is AdamW, lr 3e-4, gradient clipping 1.0.
|
| 421 |
+
|
| 422 |
+
---
|
| 423 |
+
|
| 424 |
+
## Repository layout
|
| 425 |
+
|
| 426 |
+
```
|
| 427 |
+
neural_computer.safetensors canonical model (32-bit, 64 KB, ~8.47M params)
|
| 428 |
+
variants/ 18 prebuilt configurations
|
| 429 |
+
build.py generator (one safetensors per invocation)
|
| 430 |
+
build_all.py builds and verifies every named profile
|
| 431 |
+
eval.py gate-level fitness suite + reference CPU runtime
|
| 432 |
+
eval_all.py variant-agnostic harness (recommended)
|
| 433 |
+
play.py interactive demo
|
| 434 |
+
prune_weights.py GPU-batched weight reduction with conflict resolution
|
| 435 |
+
llm_integration/ SmolLM2 extractor + circuit wrapper + training code
|
| 436 |
+
├── circuits.py FrozenThresholdCircuits (loads safetensors, exposes
|
| 437 |
+
│ add_8bit / sub_8bit / mul_8bit / compare_*)
|
| 438 |
+
├── model.py Extractor variants, ArithmeticModel
|
| 439 |
+
├── train.py router / interface / llm training modes
|
| 440 |
+
├── fitness.py randomized fitness function
|
| 441 |
+
├── baseline.py vanilla SmolLM2 baseline measurement
|
| 442 |
+
├── trained/ checkpointed extractor weights
|
| 443 |
+
└── smollm2/
|
| 444 |
+
├── SMOLLM2_ARCHITECTURE.md architecture analysis
|
| 445 |
+
├── analyze_smollm2.py analysis script
|
| 446 |
+
└── smollm2_analysis.json analysis output
|
| 447 |
+
```
|
| 448 |
+
|
| 449 |
+
---
|
| 450 |
+
|
| 451 |
+
## Citation
|
| 452 |
+
|
| 453 |
+
```bibtex
|
| 454 |
+
@misc{8bit-threshold-computer,
|
| 455 |
+
title={8bit-threshold-computer: A Turing-Complete Threshold Logic CPU},
|
| 456 |
+
author={Norton, Charles},
|
| 457 |
+
year={2026},
|
| 458 |
+
howpublished={Hugging Face},
|
| 459 |
+
url={https://huggingface.co/phanerozoic/8bit-threshold-computer}
|
| 460 |
+
}
|
| 461 |
+
```
|
| 462 |
+
|
| 463 |
+
---
|
| 464 |
+
|
| 465 |
+
## License
|
| 466 |
+
|
| 467 |
+
MIT
|
| 468 |
+
|
| 469 |
+
---
|
| 470 |
+
|
| 471 |
+
## References
|
| 472 |
+
|
| 473 |
+
1. McCulloch & Pitts (1943). *A Logical Calculus of Ideas Immanent in Nervous Activity.*
|
| 474 |
+
2. Muroga (1971). *Threshold Logic and Its Applications.*
|
| 475 |
+
3. Siegelmann & Sontag (1995). *On the Computational Power of Neural Nets.*
|
| 476 |
+
4. Bengio et al. (2013). *Estimating or Propagating Gradients Through Stochastic Neurons.*
|
| 477 |
+
5. Ma et al. (2024). *The Era of 1-bit LLMs* (BitNet b1.58).
|
| 478 |
+
6. HuggingFace (2024). *SmolLM2: Small Language Models* — [model card](https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct).
|
| 479 |
+
7. Vaswani et al. (2017). *Attention Is All You Need.*
|
| 480 |
+
8. Su et al. (2021). *RoFormer: Enhanced Transformer with Rotary Position Embedding.*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
neural_alu32.safetensors
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:2277b9b7ca05aeca4b84da8f8cf48c8ceba9d81ea926a2a1f6be46462fdc9944
|
| 3 |
-
size 10082208
|
|
|
|
|
|
|
|
|
|
|
|
neural_computer.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:18f4f3420fb307d90ea7a8fe356c196a59d7a0f2ed4ec57679d87b209a7fec22
|
| 3 |
+
size 47693920
|
play.py
CHANGED
|
@@ -1,25 +1,34 @@
|
|
| 1 |
"""
|
| 2 |
Hands-on playground for the 8bit-threshold-computer.
|
| 3 |
|
| 4 |
-
Loads
|
| 5 |
-
|
| 6 |
-
comparators,
|
| 7 |
-
a small assembled program end-to-end through the
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
"""
|
| 9 |
|
| 10 |
from __future__ import annotations
|
|
|
|
| 11 |
import os
|
| 12 |
import sys
|
|
|
|
| 13 |
import torch
|
| 14 |
from safetensors import safe_open
|
| 15 |
|
| 16 |
-
sys.path.insert(0, os.path.dirname(__file__))
|
| 17 |
-
|
| 18 |
-
# ---------------------------------------------------------------------------
|
| 19 |
-
# Load model + manifest
|
| 20 |
-
# ---------------------------------------------------------------------------
|
| 21 |
|
| 22 |
-
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
def heaviside(x):
|
|
@@ -34,451 +43,202 @@ def load_tensors(path):
|
|
| 34 |
return out
|
| 35 |
|
| 36 |
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
def
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
def
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
# --------------------
|
| 88 |
-
|
| 89 |
-
print("
|
| 90 |
-
print("
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
print(
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
print()
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
c1 = gate(f"{prefix}.ha1.carry", [a, b])
|
| 117 |
-
s2 = xor(f"{prefix}.ha2.sum", [s1, cin])
|
| 118 |
-
c2 = gate(f"{prefix}.ha2.carry", [s1, cin])
|
| 119 |
-
cout = gate(f"{prefix}.carry_or", [c1, c2])
|
| 120 |
-
return s2, cout
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
def alu_add(a, b):
|
| 124 |
-
"""8-bit ripple carry add via threshold full-adders."""
|
| 125 |
-
a_bits = int_to_bits_msb(a, 8)
|
| 126 |
-
b_bits = int_to_bits_msb(b, 8)
|
| 127 |
-
a_lsb_first = list(reversed(a_bits))
|
| 128 |
-
b_lsb_first = list(reversed(b_bits))
|
| 129 |
-
carry = 0
|
| 130 |
-
sum_lsb_first = []
|
| 131 |
-
for i in range(8):
|
| 132 |
-
s, carry = fa(f"arithmetic.ripplecarry8bit.fa{i}", a_lsb_first[i], b_lsb_first[i], carry)
|
| 133 |
-
sum_lsb_first.append(s)
|
| 134 |
-
return bits_msb_to_int(list(reversed(sum_lsb_first))), carry
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
def alu_sub(a, b):
|
| 138 |
-
"""A - B via two's complement; uses sub8bit circuit family."""
|
| 139 |
-
a_lsb = list(reversed(int_to_bits_msb(a, 8)))
|
| 140 |
-
b_lsb = list(reversed(int_to_bits_msb(b, 8)))
|
| 141 |
-
carry = 1
|
| 142 |
-
diff_lsb = []
|
| 143 |
-
for i in range(8):
|
| 144 |
-
notb = gate(f"arithmetic.sub8bit.notb{i}", [b_lsb[i]])
|
| 145 |
-
x1 = xor(f"arithmetic.sub8bit.fa{i}.xor1", [a_lsb[i], notb])
|
| 146 |
-
x2 = xor(f"arithmetic.sub8bit.fa{i}.xor2", [x1, carry])
|
| 147 |
-
and1 = gate(f"arithmetic.sub8bit.fa{i}.and1", [a_lsb[i], notb])
|
| 148 |
-
and2 = gate(f"arithmetic.sub8bit.fa{i}.and2", [x1, carry])
|
| 149 |
-
carry = gate(f"arithmetic.sub8bit.fa{i}.or_carry", [and1, and2])
|
| 150 |
-
diff_lsb.append(x2)
|
| 151 |
-
return bits_msb_to_int(list(reversed(diff_lsb))), carry
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
def alu_compare(a, b, kind):
|
| 155 |
-
"""8-bit comparators (single-layer GT/LT, two-layer EQ)."""
|
| 156 |
-
a_bits = int_to_bits_msb(a, 8)
|
| 157 |
-
b_bits = int_to_bits_msb(b, 8)
|
| 158 |
-
inp = a_bits + b_bits
|
| 159 |
-
if kind == "eq":
|
| 160 |
-
h_geq = gate("arithmetic.equality8bit.layer1.geq", inp)
|
| 161 |
-
h_leq = gate("arithmetic.equality8bit.layer1.leq", inp)
|
| 162 |
-
return gate("arithmetic.equality8bit.layer2", [h_geq, h_leq])
|
| 163 |
-
return gate(f"arithmetic.{kind}8bit", inp)
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
def alu_mul(a, b):
|
| 167 |
-
"""Shift-add multiply via partial-product threshold AND gates + repeated add."""
|
| 168 |
-
a_bits = int_to_bits_msb(a, 8)
|
| 169 |
-
b_bits = int_to_bits_msb(b, 8)
|
| 170 |
-
pp = [[0] * 8 for _ in range(8)]
|
| 171 |
-
for i in range(8):
|
| 172 |
-
for j in range(8):
|
| 173 |
-
pp[i][j] = gate(f"alu.alu8bit.mul.pp.a{i}b{j}", [a_bits[i], b_bits[j]])
|
| 174 |
-
# accumulate weighted partial products in 8 bits (drop overflow above bit 7)
|
| 175 |
-
result = 0
|
| 176 |
-
for j in range(8): # j=0 is MSB of b -> weight 7-j
|
| 177 |
-
if b_bits[j] == 0:
|
| 178 |
-
continue
|
| 179 |
-
row = 0
|
| 180 |
-
for i in range(8):
|
| 181 |
-
row |= (pp[i][j] << (7 - i))
|
| 182 |
-
shift = 7 - j
|
| 183 |
-
result, _ = alu_add(result & 0xFF, (row << shift) & 0xFF)
|
| 184 |
-
return result & 0xFF
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
cases_arith = [(5, 3), (37, 100), (200, 99), (255, 1), (127, 128), (15, 17)]
|
| 188 |
-
print("ADD:")
|
| 189 |
-
for a, b in cases_arith:
|
| 190 |
-
r, c = alu_add(a, b)
|
| 191 |
-
expect = (a + b) & 0xFF
|
| 192 |
-
ok = "OK" if r == expect else "FAIL"
|
| 193 |
-
print(f" {a:3} + {b:3} = {r:3} (carry={c}) expected {expect:3} [{ok}]")
|
| 194 |
-
|
| 195 |
-
print("SUB:")
|
| 196 |
-
for a, b in cases_arith:
|
| 197 |
-
r, c = alu_sub(a, b)
|
| 198 |
-
expect = (a - b) & 0xFF
|
| 199 |
-
ok = "OK" if r == expect else "FAIL"
|
| 200 |
-
print(f" {a:3} - {b:3} = {r:3} (no_borrow={c}) expected {expect:3} [{ok}]")
|
| 201 |
-
|
| 202 |
-
print("CMP:")
|
| 203 |
-
cmp_cases = [(50, 30), (30, 50), (77, 77), (255, 0), (0, 255), (128, 127)]
|
| 204 |
-
for a, b in cmp_cases:
|
| 205 |
-
gt = alu_compare(a, b, "greaterthan")
|
| 206 |
-
lt = alu_compare(a, b, "lessthan")
|
| 207 |
-
eq = alu_compare(a, b, "eq")
|
| 208 |
-
print(f" {a:3} vs {b:3} -> GT={gt} LT={lt} EQ={eq}")
|
| 209 |
-
|
| 210 |
-
print("MUL (low 8 bits):")
|
| 211 |
-
for a, b in [(12, 11), (15, 17), (8, 32), (200, 3), (0, 99), (1, 255)]:
|
| 212 |
-
r = alu_mul(a, b)
|
| 213 |
-
expect = (a * b) & 0xFF
|
| 214 |
-
ok = "OK" if r == expect else "FAIL"
|
| 215 |
-
print(f" {a:3} * {b:3} = {r:3} expected {expect:3} [{ok}]")
|
| 216 |
-
print()
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
# ---------------------------------------------------------------------------
|
| 220 |
-
# Demo 3: A 4-bit divisibility test (mod 5) - non-linearly-separable
|
| 221 |
-
# ---------------------------------------------------------------------------
|
| 222 |
-
|
| 223 |
-
print("=" * 64)
print(" Demo 3: mod-5 divisibility (multi-layer, hand-constructed)")
print("=" * 64)

# Circuit shape: layer1 per-residue geq/leq -> layer2 eq -> layer3 OR of all eq's.
# The number of geq/leq neuron pairs is discovered from the tensor names.
# It does not depend on the input value, so count it once here instead of
# on every mod5() call (the driver below makes 256 calls).
_MOD5_NEURONS = 0
while f"modular.mod5.layer1.geq{_MOD5_NEURONS}.weight" in T:
    _MOD5_NEURONS += 1


def mod5(v):
    """Evaluate the three-layer mod-5 circuit on the 8-bit value ``v``.

    Args:
        v: Integer whose low 8 bits (MSB first) feed the layer-1 neurons.

    Returns:
        The 0/1 output of the final ``modular.mod5.layer3.or`` gate.
    """
    bits = int_to_bits_msb(v, 8)
    eqs = []
    for i in range(_MOD5_NEURONS):
        h_geq = gate(f"modular.mod5.layer1.geq{i}", bits)
        h_leq = gate(f"modular.mod5.layer1.leq{i}", bits)
        # Layer 2 combines the geq/leq pair into a single "equals" signal.
        eqs.append(gate(f"modular.mod5.layer2.eq{i}", [h_geq, h_leq]))
    return gate("modular.mod5.layer3.or", eqs)


# Sweep every 8-bit value; the sanity line prints each hit modulo 5 so a
# reader can check the circuit against ordinary arithmetic.
hits = [v for v in range(256) if mod5(v)]
print(f" v in [0,255] with mod5(v)==1: {len(hits)} hits, first 12: {hits[:12]}")
print(f" Sanity: {[h % 5 for h in hits[:12]]}")
print()
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
# ---------------------------------------------------------------------------
|
| 247 |
-
# Demo 4: Manifest-aware threshold CPU - run a real program
|
| 248 |
-
# ---------------------------------------------------------------------------
|
| 249 |
-
|
| 250 |
-
print("=" * 64)
|
| 251 |
-
print(" Demo 4: Threshold CPU running an assembled program")
|
| 252 |
-
print("=" * 64)
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
class ThresholdCPU10:
|
| 256 |
-
"""CPU runtime matching the bundled small-profile manifest (10-bit addr)."""
|
| 257 |
-
|
| 258 |
-
def __init__(self, addr_bits, mem_bytes):
    """Record the address width and memory size used by the memory helpers.

    Args:
        addr_bits: Number of address bits fed to the address decoder.
        mem_bytes: Number of memory cells handled by ``mem_write``.
    """
    self.addr_bits, self.mem_bytes = addr_bits, mem_bytes
|
| 261 |
-
|
| 262 |
-
# --- memory primitives, fully through threshold weights ---
|
| 263 |
-
def addr_decode(self, addr):
    """Run ``addr`` through the ``memory.addr_decode`` threshold layer.

    Args:
        addr: Integer address; its low ``self.addr_bits`` bits are used,
            MSB first.

    Returns:
        The tensor produced by ``heaviside`` over the per-row weighted sums.
        The original comment gives its shape as ``[mem_bytes]``, i.e. one
        select line per memory cell.
    """
    address_bits = int_to_bits_msb(addr, self.addr_bits)
    x = torch.tensor(address_bits, dtype=torch.float32)
    weighted = (T["memory.addr_decode.weight"] * x).sum(dim=1)
    return heaviside(weighted + T["memory.addr_decode.bias"])
|
| 268 |
-
|
| 269 |
-
def mem_read(self, mem, addr):
    """Read one byte from ``mem`` using the ``memory.read.*`` weights.

    Args:
        mem: Sequence of byte values (one integer per memory cell).
        addr: Integer address passed to ``addr_decode``.

    Returns:
        The byte assembled (MSB first) from the eight per-bit circuit outputs.
    """
    select = self.addr_decode(addr)
    cells = torch.tensor(
        [int_to_bits_msb(cell, 8) for cell in mem], dtype=torch.float32
    )
    w_and, b_and = T["memory.read.and.weight"], T["memory.read.and.bias"]
    w_or, b_or = T["memory.read.or.weight"], T["memory.read.or.bias"]

    def read_bit(k):
        # Pair bit k of every cell with that cell's select line, gate the
        # pairs through the "and" layer, then reduce all cells to a single
        # output through the "or" layer.
        pair = torch.stack([cells[:, k], select], dim=1)
        gated = heaviside((pair * w_and[k]).sum(dim=1) + b_and[k])
        return int(heaviside((gated * w_or[k]).sum() + b_or[k]).item())

    return bits_msb_to_int([read_bit(k) for k in range(8)])
|
| 284 |
-
|
| 285 |
-
def mem_write(self, mem, addr, value):
    """Return a new memory image with ``value`` written through the
    ``memory.write.*`` threshold weights.

    The input ``mem`` is not modified; a fresh list of integers is returned.

    Args:
        mem: Sequence of byte values (one integer per memory cell).
            NOTE(review): the stacking below appears to require
            ``len(mem) == self.mem_bytes`` - confirm against callers.
        addr: Integer address passed to ``addr_decode``.
        value: Byte to store; its low 8 bits are used, MSB first.

    Returns:
        List of ``self.mem_bytes`` integers rebuilt from the updated bits.
    """
    sel = self.addr_decode(addr)
    data_bits = torch.tensor(int_to_bits_msb(value, 8), dtype=torch.float32)
    mem_bits = torch.tensor(
        [int_to_bits_msb(byte, 8) for byte in mem], dtype=torch.float32
    )
    sel_w = T["memory.write.sel.weight"]
    sel_b = T["memory.write.sel.bias"]
    nsel_w = T["memory.write.nsel.weight"].squeeze(1)
    nsel_b = T["memory.write.nsel.bias"]
    and_old_w = T["memory.write.and_old.weight"]
    and_old_b = T["memory.write.and_old.bias"]
    and_new_w = T["memory.write.and_new.weight"]
    and_new_b = T["memory.write.and_new.bias"]
    or_w = T["memory.write.or.weight"]
    or_b = T["memory.write.or.bias"]

    # Write-enable is held at 1 for every cell; the "sel" layer combines it
    # with the decoded select line, and "nsel" derives the complementary
    # signal from the result.
    we = torch.ones_like(sel)
    sel_inp = torch.stack([sel, we], dim=1)
    write_sel = heaviside((sel_inp * sel_w).sum(dim=1) + sel_b)
    nsel = heaviside(write_sel * nsel_w + nsel_b)

    # Per bit: (old AND nsel) OR (data AND write_sel), all evaluated through
    # the stored weights. mem_bits is a private tensor built above, so
    # updating it in place does not touch the caller's ``mem``.
    for bit in range(8):
        old = mem_bits[:, bit]
        data_bit = data_bits[bit].expand(self.mem_bytes)
        inp_old = torch.stack([old, nsel], dim=1)
        inp_new = torch.stack([data_bit, write_sel], dim=1)
        and_old = heaviside((inp_old * and_old_w[:, bit]).sum(dim=1) + and_old_b[:, bit])
        and_new = heaviside((inp_new * and_new_w[:, bit]).sum(dim=1) + and_new_b[:, bit])
        or_inp = torch.stack([and_old, and_new], dim=1)
        new_bit = heaviside((or_inp * or_w[:, bit]).sum(dim=1) + or_b[:, bit])
        mem_bits[:, bit] = new_bit

    # Convert the tensor to nested Python lists once rather than once per row.
    rows = mem_bits.tolist()
    return [bits_msb_to_int([int(b) for b in rows[i]]) for i in range(self.mem_bytes)]
|
| 319 |
-
|
| 320 |
-
# --- helper to use threshold ALU functions defined above ---
|
| 321 |
-
def step(self, state):
|
| 322 |
-
if state["halted"]:
|
| 323 |
-
return state
|
| 324 |
-
s = dict(state)
|
| 325 |
-
s["mem"] = state["mem"][:]
|
| 326 |
-
s["regs"] = state["regs"][:]
|
| 327 |
-
s["flags"] = state["flags"][:]
|
| 328 |
-
|
| 329 |
-
pc = s["pc"]
|
| 330 |
-
addr_mask = (1 << self.addr_bits) - 1
|
| 331 |
-
hi = self.mem_read(s["mem"], pc & addr_mask)
|
| 332 |
-
lo = self.mem_read(s["mem"], (pc + 1) & addr_mask)
|
| 333 |
-
ir = ((hi & 0xFF) << 8) | (lo & 0xFF)
|
| 334 |
-
opcode = (ir >> 12) & 0xF
|
| 335 |
-
rd = (ir >> 10) & 0x3
|
| 336 |
-
rs = (ir >> 8) & 0x3
|
| 337 |
-
imm = ir & 0xFF
|
| 338 |
-
|
| 339 |
-
next_pc = (pc + 2) & addr_mask
|
| 340 |
-
addr16 = None
|
| 341 |
-
if opcode in (0xA, 0xB, 0xC, 0xD, 0xE):
|
| 342 |
-
ah = self.mem_read(s["mem"], next_pc)
|
| 343 |
-
al = self.mem_read(s["mem"], (next_pc + 1) & addr_mask)
|
| 344 |
-
addr16 = ((ah & 0xFF) << 8) | (al & 0xFF)
|
| 345 |
-
next_pc = (next_pc + 2) & addr_mask
|
| 346 |
-
addr10 = (addr16 & addr_mask) if addr16 is not None else None
|
| 347 |
-
|
| 348 |
-
a = s["regs"][rd]
|
| 349 |
-
b = s["regs"][rs]
|
| 350 |
-
write = True
|
| 351 |
-
result = a
|
| 352 |
carry = 0
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
#
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
"regs": [0, 0, 0, 0],
|
| 473 |
-
"flags": [0, 0, 0, 0],
|
| 474 |
-
"mem": mem,
|
| 475 |
-
"halted": False,
|
| 476 |
-
}
|
| 477 |
-
print(" Program: sum 5+4+3+2+1 via loop (uses ADD/SUB/CMP/Jcc/LOAD/STORE/HALT, all threshold-gated)")
|
| 478 |
-
print(" Running ...")
|
| 479 |
-
final, cycles = cpu.run(state, max_cycles=200)
|
| 480 |
-
print(f" Halted after {cycles} cycles")
|
| 481 |
-
print(f" R0={final['regs'][0]} R1={final['regs'][1]} R2={final['regs'][2]} R3={final['regs'][3]}")
|
| 482 |
-
print(f" M[0x0103] = {final['mem'][0x103]} (expected 15)")
|
| 483 |
-
print()
|
| 484 |
-
print("Done.")
|
|
|
|
| 1 |
"""
|
| 2 |
Hands-on playground for the 8bit-threshold-computer.
|
| 3 |
|
| 4 |
+
Loads a safetensors model, reads its manifest, and exercises threshold
|
| 5 |
+
circuits at every level: raw Boolean gates, 8-bit ALU arithmetic and
|
| 6 |
+
comparators, multi-layer modular arithmetic, and a manifest-sized CPU
|
| 7 |
+
runtime running a small assembled program end-to-end through the
|
| 8 |
+
threshold weights.
|
| 9 |
+
|
| 10 |
+
The CPU demo defaults to the small (1 KB) profile so the run finishes in
|
| 11 |
+
a fraction of a second. Larger profiles (4 KB, 64 KB) take proportionally
|
| 12 |
+
longer because every memory access decodes against every address line.
|
| 13 |
+
|
| 14 |
+
Usage:
|
| 15 |
+
python play.py # fast 1KB demo
|
| 16 |
+
python play.py --model neural_computer.safetensors # full 64KB
|
| 17 |
+
python play.py --model variants/neural_alu8.safetensors --skip-cpu # ALU only
|
| 18 |
"""
|
| 19 |
|
| 20 |
from __future__ import annotations
|
| 21 |
+
import argparse
|
| 22 |
import os
|
| 23 |
import sys
|
| 24 |
+
|
| 25 |
import torch
|
| 26 |
from safetensors import safe_open
|
| 27 |
|
| 28 |
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
+
# Reuse the variant-aware CPU runtime from eval_all.py
|
| 31 |
+
from eval_all import GenericThresholdCPU, builtin_program
|
| 32 |
|
| 33 |
|
| 34 |
def heaviside(x):
|
|
|
|
| 43 |
return out
|
| 44 |
|
| 45 |
|
| 46 |
+
def main() -> int:
    """Run the playground demos against a threshold-logic model file.

    Parses ``--model`` / ``--skip-cpu``, loads the tensors, prints the
    manifest, then runs four demos in order: Boolean gates, 8-bit ALU
    arithmetic and comparators, the mod-5 circuit, and (unless skipped) a
    small program on ``GenericThresholdCPU``.

    NOTE(review): demos 1-3 look up 8-bit circuit names unconditionally and
    ignore ``manifest.data_bits``; a model without those tensors will fail
    with a lookup error. Confirm which variants ship them.

    Returns:
        0 when the CPU demo is skipped or the program's result matches the
        expected value, 1 when the CPU result is wrong. Mismatches in the
        ALU demos are only printed as ``FAIL``; they do not affect the
        return value.
    """
    parser = argparse.ArgumentParser(description="Threshold computer playground")
    parser.add_argument(
        "--model", type=str,
        default=os.path.join(os.path.dirname(__file__),
                             "variants", "neural_computer8_small.safetensors"),
        help="Path to a .safetensors variant"
    )
    parser.add_argument("--skip-cpu", action="store_true",
                        help="Skip the CPU program demo (useful for pure-ALU files)")
    args = parser.parse_args()

    print("Loading", args.model)
    T = load_tensors(args.model)

    DATA_BITS = int(T["manifest.data_bits"].item())
    ADDR_BITS = int(T["manifest.addr_bits"].item())
    MEM_BYTES = int(T["manifest.memory_bytes"].item())
    REGISTERS = int(T["manifest.registers"].item())
    print(f"Manifest: data={DATA_BITS}-bit, addr={ADDR_BITS}-bit, mem={MEM_BYTES}B, regs={REGISTERS}")
    print(f"Tensors: {len(T):,}")
    print(f"Total params: {sum(t.numel() for t in T.values()):,}")
    print()

    def gate(name, inputs):
        """Evaluate one threshold neuron: heaviside(sum(inputs * w) + b) as an int."""
        w = T[name + ".weight"].view(-1)
        b = T[name + ".bias"].view(-1)
        return int(heaviside((torch.tensor(inputs, dtype=torch.float32) * w).sum() + b).item())

    def xor(prefix, inputs):
        """Two-layer XOR built from ``layer1.or`` / ``layer1.nand`` feeding ``layer2``."""
        a, b_ = inputs
        h_or = gate(f"{prefix}.layer1.or", [a, b_])
        h_nand = gate(f"{prefix}.layer1.nand", [a, b_])
        return gate(f"{prefix}.layer2", [h_or, h_nand])

    def xor_neuron(prefix, inputs):
        """Two-layer gate whose hidden units are named ``neuron1`` / ``neuron2``."""
        a, b_ = inputs
        h1 = gate(f"{prefix}.layer1.neuron1", [a, b_])
        h2 = gate(f"{prefix}.layer1.neuron2", [a, b_])
        return gate(f"{prefix}.layer2", [h1, h2])

    def int_to_bits_msb(v, n):
        """Return the low ``n`` bits of ``v`` as a list, most significant bit first."""
        return [(v >> (n - 1 - i)) & 1 for i in range(n)]

    def bits_msb_to_int(bits):
        """Fold an MSB-first bit sequence back into an integer."""
        out = 0
        for b in bits:
            out = (out << 1) | int(b)
        return out

    # ---------- Demo 1: Boolean gates ----------
    print("=" * 64)
    print(" Demo 1: Boolean threshold gates")
    print("=" * 64)
    truth_2 = [(0, 0), (0, 1), (1, 0), (1, 1)]
    # Single-neuron gates.
    for gname in ["and", "or", "nand", "nor", "implies"]:
        row = " ".join(f"{a}{b}->{gate(f'boolean.{gname}', [a, b])}" for a, b in truth_2)
        print(f" {gname:8} {row}")
    # Gates that need a hidden layer.
    for gname in ["xor", "xnor", "biimplies"]:
        row = " ".join(f"{a}{b}->{xor_neuron(f'boolean.{gname}', [a, b])}" for a, b in truth_2)
        print(f" {gname:8} {row}")
    print(f" not 0->{gate('boolean.not', [0])} 1->{gate('boolean.not', [1])}")
    print()

    # ---------- Demo 2: 8-bit ALU arithmetic ----------
    print("=" * 64)
    print(" Demo 2: 8-bit ALU arithmetic (every gate is threshold logic)")
    print("=" * 64)

    def fa(prefix, a, b, cin):
        """Full adder from two half adders plus a carry OR; returns (sum, carry_out)."""
        s1 = xor(f"{prefix}.ha1.sum", [a, b])
        c1 = gate(f"{prefix}.ha1.carry", [a, b])
        s2 = xor(f"{prefix}.ha2.sum", [s1, cin])
        c2 = gate(f"{prefix}.ha2.carry", [s1, cin])
        return s2, gate(f"{prefix}.carry_or", [c1, c2])

    def alu_add(a, b):
        """8-bit ripple-carry add; returns (sum & 0xFF, carry_out)."""
        # The adder stages are indexed from the least significant bit.
        a_lsb = list(reversed(int_to_bits_msb(a, 8)))
        b_lsb = list(reversed(int_to_bits_msb(b, 8)))
        carry = 0
        sum_lsb = []
        for i in range(8):
            s, carry = fa(f"arithmetic.ripplecarry8bit.fa{i}", a_lsb[i], b_lsb[i], carry)
            sum_lsb.append(s)
        return bits_msb_to_int(list(reversed(sum_lsb))), carry

    def alu_sub(a, b):
        """8-bit subtract via the ``sub8bit`` circuit; returns (difference, carry_out).

        The carry chain starts at 1 and each ``b`` bit passes through its
        ``notb`` gate first (the usual a + ~b + 1 arrangement). The demo
        output labels the final carry ``no_borrow``.
        """
        a_lsb = list(reversed(int_to_bits_msb(a, 8)))
        b_lsb = list(reversed(int_to_bits_msb(b, 8)))
        carry = 1
        diff_lsb = []
        for i in range(8):
            notb = gate(f"arithmetic.sub8bit.notb{i}", [b_lsb[i]])
            x1 = xor(f"arithmetic.sub8bit.fa{i}.xor1", [a_lsb[i], notb])
            x2 = xor(f"arithmetic.sub8bit.fa{i}.xor2", [x1, carry])
            and1 = gate(f"arithmetic.sub8bit.fa{i}.and1", [a_lsb[i], notb])
            and2 = gate(f"arithmetic.sub8bit.fa{i}.and2", [x1, carry])
            carry = gate(f"arithmetic.sub8bit.fa{i}.or_carry", [and1, and2])
            diff_lsb.append(x2)
        return bits_msb_to_int(list(reversed(diff_lsb))), carry

    def alu_compare(a, b, kind):
        """Run an 8-bit comparator; ``kind`` is "eq" or a single-neuron circuit name.

        Equality is a two-layer circuit (geq AND leq); any other ``kind`` is
        looked up as ``arithmetic.{kind}8bit`` and evaluated directly.
        """
        inp = int_to_bits_msb(a, 8) + int_to_bits_msb(b, 8)
        if kind == "eq":
            h_geq = gate("arithmetic.equality8bit.layer1.geq", inp)
            h_leq = gate("arithmetic.equality8bit.layer1.leq", inp)
            return gate("arithmetic.equality8bit.layer2", [h_geq, h_leq])
        return gate(f"arithmetic.{kind}8bit", inp)

    def alu_mul(a, b):
        """Low 8 bits of a * b: threshold partial products summed with ``alu_add``."""
        a_bits = int_to_bits_msb(a, 8)
        b_bits = int_to_bits_msb(b, 8)
        result = 0
        for j in range(8):
            if b_bits[j] == 0:
                continue
            row = 0
            for i in range(8):
                pp = gate(f"alu.alu8bit.mul.pp.a{i}b{j}", [a_bits[i], b_bits[j]])
                # a_bits is MSB first, so index i carries weight 2**(7 - i).
                row |= (pp << (7 - i))
            # Likewise b_bits[j] carries weight 2**(7 - j).
            shift = 7 - j
            result, _ = alu_add(result & 0xFF, (row << shift) & 0xFF)
        return result & 0xFF

    cases_arith = [(5, 3), (37, 100), (200, 99), (255, 1), (127, 128), (15, 17)]
    print("ADD:")
    for a, b in cases_arith:
        r, c = alu_add(a, b)
        e = (a + b) & 0xFF
        print(f" {a:3} + {b:3} = {r:3} (carry={c}) expected {e:3} [{'OK' if r == e else 'FAIL'}]")
    print("SUB:")
    for a, b in cases_arith:
        r, c = alu_sub(a, b)
        e = (a - b) & 0xFF
        print(f" {a:3} - {b:3} = {r:3} (no_borrow={c}) expected {e:3} [{'OK' if r == e else 'FAIL'}]")
    print("CMP:")
    for a, b in [(50, 30), (30, 50), (77, 77), (255, 0), (0, 255), (128, 127)]:
        gt = alu_compare(a, b, "greaterthan")
        lt = alu_compare(a, b, "lessthan")
        eq = alu_compare(a, b, "eq")
        print(f" {a:3} vs {b:3} -> GT={gt} LT={lt} EQ={eq}")
    print("MUL (low 8 bits):")
    for a, b in [(12, 11), (15, 17), (8, 32), (200, 3), (0, 99), (1, 255)]:
        r = alu_mul(a, b)
        e = (a * b) & 0xFF
        print(f" {a:3} * {b:3} = {r:3} expected {e:3} [{'OK' if r == e else 'FAIL'}]")
    print()

    # ---------- Demo 3: mod-5 divisibility ----------
    print("=" * 64)
    print(" Demo 3: mod-5 divisibility (multi-layer, hand-constructed)")
    print("=" * 64)

    # The number of layer-1 geq/leq pairs is discovered from the tensor names.
    # It does not depend on the input, so count it once rather than on each
    # of the 256 mod5() calls below.
    mod5_neurons = 0
    while f"modular.mod5.layer1.geq{mod5_neurons}.weight" in T:
        mod5_neurons += 1

    def mod5(v):
        """Evaluate the three-layer mod-5 circuit on the 8-bit value ``v``."""
        bits = int_to_bits_msb(v, 8)
        eqs = []
        for i in range(mod5_neurons):
            h_geq = gate(f"modular.mod5.layer1.geq{i}", bits)
            h_leq = gate(f"modular.mod5.layer1.leq{i}", bits)
            eqs.append(gate(f"modular.mod5.layer2.eq{i}", [h_geq, h_leq]))
        return gate("modular.mod5.layer3.or", eqs)

    hits = [v for v in range(256) if mod5(v)]
    print(f" v in [0,255] with mod5(v)==1: {len(hits)} hits, first 12: {hits[:12]}")
    print(f" Sanity (each %5): {[h % 5 for h in hits[:12]]}")
    print()

    # ---------- Demo 4: CPU running an assembled program ----------
    # Memory address this demo reads the program's result from; the memory
    # image must be large enough to contain it.
    result_addr = 0x83

    if args.skip_cpu:
        print("Demo 4 skipped (--skip-cpu).")
        return 0
    if MEM_BYTES <= result_addr:
        print(f"Demo 4 skipped (memory={MEM_BYTES}B too small for the demo program).")
        return 0

    print("=" * 64)
    print(f" Demo 4: Threshold CPU running an assembled program ({MEM_BYTES} B memory)")
    print("=" * 64)
    print(" Program: sum 5+4+3+2+1 via loop")
    print(" uses LOAD/STORE/ADD/SUB/CMP/JNZ/HALT, all threshold-gated")
    print(" Running ... (larger memories take longer because every memory access")
    print(" decodes against every address line)")
    # Pass the CPU its own shallow copy of the tensor mapping.
    cpu = GenericThresholdCPU(dict(T))
    mem, expected = builtin_program(ADDR_BITS)
    state = {"pc": 0, "regs": [0] * 4, "flags": [0] * 4, "mem": mem, "halted": False}
    final, cycles = cpu.run(state, max_cycles=200)
    got = final["mem"][result_addr]
    print(f" Halted after {cycles} cycles")
    print(f" R0={final['regs'][0]} R1={final['regs'][1]} "
          f"R2={final['regs'][2]} R3={final['regs'][3]}")
    print(f" M[0x{result_addr:04x}] = {got} (expected {expected}) [{'OK' if got == expected else 'FAIL'}]")
    return 0 if got == expected else 1
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
if __name__ == "__main__":
|
| 244 |
+
sys.exit(main())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|