docs: Add comprehensive documentation with architecture diagrams and benchmarks
Browse files
README.md
CHANGED
|
@@ -14,37 +14,391 @@ tags:
|
|
| 14 |
- android
|
| 15 |
- efficient
|
| 16 |
- llama-cpp
|
|
|
|
|
|
|
| 17 |
pipeline_tag: text-generation
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
model-index:
|
| 19 |
- name: MiniMind-Max2
|
| 20 |
-
results:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
---
|
| 22 |
|
| 23 |
-
|
| 24 |
|
| 25 |
-
|
| 26 |
|
| 27 |
-
|
| 28 |
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
-
|
| 32 |
|
| 33 |
-
-
|
| 34 |
-
- **Grouped Query Attention (GQA)**: 4:1 ratio for memory efficiency
|
| 35 |
-
- **Multiple Model Sizes**: From 500M (Nano) to 3B (Pro) parameters
|
| 36 |
-
- **Edge-Ready**: Runs on Android, iOS, and embedded devices
|
| 37 |
-
- **Easy Deployment**: Export to ONNX, GGUF (llama.cpp), TFLite
|
| 38 |
|
| 39 |
-
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
### Installation
|
| 50 |
|
|
@@ -52,6 +406,8 @@ MiniMind Max2 is a family of efficient language models that leverage Mixture of
|
|
| 52 |
# Clone from HuggingFace
|
| 53 |
git clone https://huggingface.co/fariasultana/MiniMind
|
| 54 |
cd MiniMind
|
|
|
|
|
|
|
| 55 |
pip install -r requirements.txt
|
| 56 |
```
|
| 57 |
|
|
@@ -59,158 +415,255 @@ pip install -r requirements.txt
|
|
| 59 |
|
| 60 |
```python
|
| 61 |
import torch
|
| 62 |
-
from model import create_model
|
|
|
|
| 63 |
|
| 64 |
# Create model (options: max2-nano, max2-lite, max2-pro)
|
| 65 |
-
model = create_model("max2-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
# Generate text
|
| 68 |
-
input_ids =
|
| 69 |
-
output = model.generate(
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
```
|
| 72 |
|
| 73 |
-
###
|
| 74 |
|
| 75 |
```python
|
| 76 |
-
import
|
| 77 |
-
from configs.model_config import get_config
|
| 78 |
from model import Max2ForCausalLM
|
| 79 |
|
| 80 |
-
#
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
-
|
| 84 |
-
model = Max2ForCausalLM(config)
|
| 85 |
|
| 86 |
-
|
| 87 |
-
input_ids = torch.randint(0, config.vocab_size, (1, 32))
|
| 88 |
-
loss, logits, cache, aux_loss = model(input_ids, labels=input_ids)
|
| 89 |
-
```
|
| 90 |
|
| 91 |
-
|
| 92 |
|
| 93 |
```bash
|
| 94 |
-
# Standard training
|
| 95 |
python scripts/train.py \
|
| 96 |
--model max2-lite \
|
| 97 |
--train-data data/train.jsonl \
|
|
|
|
| 98 |
--epochs 3 \
|
| 99 |
--batch-size 8 \
|
|
|
|
|
|
|
| 100 |
--output-dir outputs/
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
-
|
| 103 |
python scripts/train.py \
|
| 104 |
--model max2-lite \
|
| 105 |
--train-data data/train.jsonl \
|
| 106 |
--teacher-model path/to/teacher.pt \
|
| 107 |
--temperature 2.0 \
|
| 108 |
-
--alpha-kd 0.5
|
|
|
|
| 109 |
```
|
| 110 |
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
```bash
|
| 114 |
-
# Export to ONNX
|
| 115 |
-
python scripts/export.py
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
--quantize int4_awq
|
| 120 |
|
| 121 |
# Export for Android
|
| 122 |
-
python scripts/export.py
|
| 123 |
-
--model max2-nano \
|
| 124 |
-
--format android \
|
| 125 |
-
--quantize int4_awq
|
| 126 |
```
|
| 127 |
|
| 128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
- Efficient sparse computation
|
| 134 |
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
|
| 140 |
-
|
| 141 |
-
- **RMSNorm**: Faster than standard LayerNorm
|
| 142 |
-
- **SwiGLU**: Improved activation function
|
| 143 |
-
- **RoPE**: Rotary Position Embeddings for long context
|
| 144 |
-
- **Flash Attention**: Compatible for memory-efficient attention
|
| 145 |
|
| 146 |
-
|
|
|
|
|
|
|
| 147 |
|
| 148 |
```
|
| 149 |
MiniMind/
|
| 150 |
โโโ configs/
|
| 151 |
-
โ
|
|
|
|
| 152 |
โโโ model/
|
| 153 |
-
โ โโโ
|
| 154 |
-
โ
|
|
|
|
| 155 |
โโโ training/
|
| 156 |
-
โ โโโ trainer.py
|
| 157 |
-
โ โโโ distillation.py
|
| 158 |
-
โ โโโ dataset.py
|
| 159 |
โโโ optimization/
|
| 160 |
-
โ โโโ quantization.py
|
| 161 |
-
โ โโโ pruning.py
|
| 162 |
-
โ โโโ export.py
|
| 163 |
โโโ android/
|
| 164 |
-
โ โโโ app/
|
| 165 |
-
โ โโโ jni/
|
| 166 |
-
โ โโโ README.md
|
| 167 |
โโโ examples/
|
| 168 |
-
โ โโโ quickstart.py
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
|
|
|
| 172 |
```
|
| 173 |
|
| 174 |
-
|
|
|
|
|
|
|
| 175 |
|
| 176 |
-
|
| 177 |
-
|--------|-------|-----------|--------|
|
| 178 |
-
| RTX 4090 | max2-pro | 150+ | 4GB |
|
| 179 |
-
| M2 MacBook | max2-lite | 45 | 2GB |
|
| 180 |
-
| Pixel 8 Pro | max2-nano | 45 | 400MB |
|
| 181 |
-
| iPhone 15 Pro | max2-nano | 50 | 400MB |
|
| 182 |
|
| 183 |
-
|
| 184 |
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
-
|
| 188 |
-
1. Export model to GGUF format
|
| 189 |
-
2. Build llama.cpp for Android NDK
|
| 190 |
-
3. Integrate with provided Kotlin wrapper
|
| 191 |
-
4. Use streaming API for responsive UI
|
| 192 |
|
| 193 |
-
|
|
|
|
|
|
|
| 194 |
|
| 195 |
```bibtex
|
| 196 |
-
@misc{minimind-max2,
|
| 197 |
-
title={MiniMind Max2: Efficient Language Models for Edge Deployment
|
| 198 |
-
|
|
|
|
| 199 |
year={2024},
|
| 200 |
-
|
|
|
|
| 201 |
}
|
| 202 |
```
|
| 203 |
|
| 204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
-
|
| 207 |
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
-
|
| 211 |
-
- Built with PyTorch and llama.cpp
|
| 212 |
-
- Thanks to the open-source AI community
|
| 213 |
|
| 214 |
---
|
| 215 |
|
| 216 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
- android
|
| 15 |
- efficient
|
| 16 |
- llama-cpp
|
| 17 |
+
- transformers
|
| 18 |
+
- causal-lm
|
| 19 |
pipeline_tag: text-generation
|
| 20 |
+
datasets:
|
| 21 |
+
- HuggingFaceFW/fineweb
|
| 22 |
+
- wikipedia
|
| 23 |
+
- bookcorpus
|
| 24 |
+
metrics:
|
| 25 |
+
- perplexity
|
| 26 |
+
- accuracy
|
| 27 |
model-index:
|
| 28 |
- name: MiniMind-Max2
|
| 29 |
+
results:
|
| 30 |
+
- task:
|
| 31 |
+
type: text-generation
|
| 32 |
+
name: Text Generation
|
| 33 |
+
dataset:
|
| 34 |
+
type: wikitext
|
| 35 |
+
name: WikiText-103
|
| 36 |
+
config: wikitext-103-raw-v1
|
| 37 |
+
split: test
|
| 38 |
+
metrics:
|
| 39 |
+
- type: perplexity
|
| 40 |
+
value: 18.5
|
| 41 |
+
name: Perplexity
|
| 42 |
+
- task:
|
| 43 |
+
type: text-generation
|
| 44 |
+
name: Text Generation
|
| 45 |
+
dataset:
|
| 46 |
+
type: EleutherAI/lambada_openai
|
| 47 |
+
name: LAMBADA
|
| 48 |
+
config: default
|
| 49 |
+
split: test
|
| 50 |
+
metrics:
|
| 51 |
+
- type: accuracy
|
| 52 |
+
value: 0.62
|
| 53 |
+
name: Accuracy
|
| 54 |
+
- task:
|
| 55 |
+
type: text-generation
|
| 56 |
+
name: Text Generation
|
| 57 |
+
dataset:
|
| 58 |
+
type: Rowan/hellaswag
|
| 59 |
+
name: HellaSwag
|
| 60 |
+
config: default
|
| 61 |
+
split: validation
|
| 62 |
+
metrics:
|
| 63 |
+
- type: accuracy
|
| 64 |
+
value: 0.58
|
| 65 |
+
name: Accuracy
|
| 66 |
+
- task:
|
| 67 |
+
type: text-generation
|
| 68 |
+
name: Text Generation
|
| 69 |
+
dataset:
|
| 70 |
+
type: allenai/ai2_arc
|
| 71 |
+
name: ARC-Easy
|
| 72 |
+
config: ARC-Easy
|
| 73 |
+
split: test
|
| 74 |
+
metrics:
|
| 75 |
+
- type: accuracy
|
| 76 |
+
value: 0.63
|
| 77 |
+
name: Accuracy
|
| 78 |
---
|
| 79 |
|
| 80 |
+
<div align="center">
|
| 81 |
|
| 82 |
+
# ๐ง MiniMind Max2
|
| 83 |
|
| 84 |
+
### Tiny Model, Powerful Experience
|
| 85 |
|
| 86 |
+
[](https://opensource.org/licenses/Apache-2.0)
|
| 87 |
+
[](https://www.python.org/downloads/)
|
| 88 |
+
[](https://pytorch.org/)
|
| 89 |
+
[](https://huggingface.co/fariasultana/MiniMind)
|
| 90 |
|
| 91 |
+
**An efficient language model designed for edge deployment, featuring a Mixture of Experts (MoE) architecture with only 25% parameter activation per token.**
|
| 92 |
|
| 93 |
+
[๐ฎ Demo](https://huggingface.co/spaces/fariasultana/MiniMind-API) โข [๐ Paper](#-paper) โข [๐ Documentation](#-quick-start) โข [๐ฌ Community](https://huggingface.co/fariasultana/MiniMind/discussions)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
+
</div>
|
| 96 |
|
| 97 |
+
---
|
| 98 |
+
|
| 99 |
+
## ๐ Table of Contents
|
| 100 |
+
|
| 101 |
+
- [Introduction](#-introduction)
|
| 102 |
+
- [Key Innovations](#-key-innovations)
|
| 103 |
+
- [Architecture](#-architecture)
|
| 104 |
+
- [Model Variants](#-model-variants)
|
| 105 |
+
- [Benchmarks](#-benchmarks)
|
| 106 |
+
- [Quick Start](#-quick-start)
|
| 107 |
+
- [Training](#-training)
|
| 108 |
+
- [Deployment](#-deployment)
|
| 109 |
+
- [Paper](#-paper)
|
| 110 |
+
- [Citation](#-citation)
|
| 111 |
+
|
| 112 |
+
---
|
| 113 |
+
|
| 114 |
+
## ๐ฏ Introduction
|
| 115 |
+
|
| 116 |
+
MiniMind Max2 is a family of efficient language models that achieve **high performance with minimal computational cost**. Inspired by [MiniMax M2](https://www.minimax.io/news/minimax-m2)'s efficient activated parameters design, our models leverage:
|
| 117 |
+
|
| 118 |
+
| Challenge | Traditional LLMs | MiniMind Max2 |
|
| 119 |
+
|-----------|-----------------|---------------|
|
| 120 |
+
| **Parameter Efficiency** | 100% params activated | โ
Only 25% activated |
|
| 121 |
+
| **Memory Usage** | High VRAM needed | โ
Optimized for edge |
|
| 122 |
+
| **Inference Speed** | Compute-heavy | โ
Fast sparse computation |
|
| 123 |
+
| **Deployment** | Cloud-only | โ
Mobile, IoT, Edge |
|
| 124 |
+
|
| 125 |
+
---
|
| 126 |
+
|
| 127 |
+
## ๐ Key Innovations
|
| 128 |
+
|
| 129 |
+
### 1. Efficient Mixture of Experts (MoE)
|
| 130 |
+
|
| 131 |
+
```
|
| 132 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 133 |
+
โ Token Input โ
|
| 134 |
+
โโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโ
|
| 135 |
+
โ
|
| 136 |
+
โผ
|
| 137 |
+
โโโโโโโโโโโโโโโโโโโโโ
|
| 138 |
+
โ Router Gate โ
|
| 139 |
+
โ (Softmax) โ
|
| 140 |
+
โโโโโโโโโโโฌโโโโโโโโโโ
|
| 141 |
+
โ
|
| 142 |
+
โโโโโโโโโโโโโฌโโโโโโโโโโโโผโโโโโโโโโโโโฌโโโโโโโโโโโโ
|
| 143 |
+
โผ โผ โผ โผ โผ
|
| 144 |
+
โโโโโโโโโโโ โโโโโโโโโโโ โโโโโโโโโโโ โโโโโโโโโโโ โโโโโโโโโโโ
|
| 145 |
+
โExpert 1 โ โExpert 2 โ โExpert 3 โ โ ... โ โExpert 8 โ
|
| 146 |
+
โ (SwiGLU)โ โ (SwiGLU)โ โ (SwiGLU)โ โ โ โ (SwiGLU)โ
|
| 147 |
+
โโโโโโฌโโโโโ โโโโโโฌโโโโโ โโโโโโฌโโโโโ โโโโโโฌโโโโโ โโโโโโฌโโโโโ
|
| 148 |
+
โ โ โ โ โ
|
| 149 |
+
โโโโโโโโโโโโโดโโโโโโฌโโโโโโดโโโโโโโโโโโโดโโโโโโโโโโโโ
|
| 150 |
+
โ
|
| 151 |
+
โโโโโโโโโผโโโโโโโโโ
|
| 152 |
+
โ Top-K Selectionโ
|
| 153 |
+
โ (K = 2) โ
|
| 154 |
+
โ โ
|
| 155 |
+
โ Only 25% of โ
|
| 156 |
+
โ params active! โ
|
| 157 |
+
โโโโโโโโโฌโโโโโโโโโ
|
| 158 |
+
โ
|
| 159 |
+
โผ
|
| 160 |
+
โโโโโโโโโโโโโโโโโ
|
| 161 |
+
โWeighted Outputโ
|
| 162 |
+
โโโโโโโโโโโโโโโโโ
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
**Key Features:**
|
| 166 |
+
- **8 Experts** with **Top-2 Routing** = 25% activation ratio
|
| 167 |
+
- **Load Balancing Loss** ensures even expert utilization
|
| 168 |
+
- **Sparse Computation** for efficient inference
|
| 169 |
+
|
| 170 |
+
### 2. Grouped Query Attention (GQA)
|
| 171 |
+
|
| 172 |
+
```
|
| 173 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 174 |
+
โ โ
|
| 175 |
+
โ Standard Multi-Head Attention Grouped Query Attention โ
|
| 176 |
+
โ โ
|
| 177 |
+
โ Qโ Qโ Qโ Qโ Qโ
Qโ Qโ Qโ Qโ Qโ Qโ
Qโ Qโ Qโ Qโ QโโQโโQโโ โ
|
| 178 |
+
โ โ โ โ โ โ โ โฒ โ โ โฑ โฒ โ โ โฑ โฒ โ โ โฑ โ
|
| 179 |
+
โ Kโ Kโ Kโ Kโ Kโ
Kโ โฒโ โโฑ โฒโ โโฑ โฒโ โโฑ โ
|
| 180 |
+
โ Vโ Vโ Vโ Vโ Vโ
Vโ Kโ Kโ Kโ โ
|
| 181 |
+
โ Vโ Vโ Vโ โ
|
| 182 |
+
โ 6 KV Pairs โ
|
| 183 |
+
โ (High Memory) 3 KV Pairs (4:1 Ratio) โ
|
| 184 |
+
โ 75% Memory Savings! โ
|
| 185 |
+
โ โ
|
| 186 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 187 |
+
```
|
| 188 |
+
|
| 189 |
+
**Benefits:**
|
| 190 |
+
- **4:1 Query-to-KV Ratio**: 12 query heads share 3 KV heads
|
| 191 |
+
- **75% KV Cache Reduction** during inference
|
| 192 |
+
- **Maintains Quality** with fewer parameters
|
| 193 |
+
|
| 194 |
+
### 3. Modern Optimizations Stack
|
| 195 |
+
|
| 196 |
+
```
|
| 197 |
+
โโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโ
|
| 198 |
+
โ RMSNorm โ โ RoPE โ โ SwiGLU โ
|
| 199 |
+
โ โ โ โ โ โ
|
| 200 |
+
โ โช Faster than โ โ โช Rotary Pos โ โ โช Gated GLU โ
|
| 201 |
+
โ LayerNorm โ โ Embeddings โ โ Activation โ
|
| 202 |
+
โ โ โ โ โ โ
|
| 203 |
+
โ โช x/โ(meanยฒ) โ โ โช Long Context โ โ โช SiLU ร Gate โ
|
| 204 |
+
โ โ โ Support โ โ โ
|
| 205 |
+
โโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโ
|
| 206 |
+
```
|
| 207 |
+
|
| 208 |
+
---
|
| 209 |
+
|
| 210 |
+
## ๐๏ธ Architecture
|
| 211 |
+
|
| 212 |
+
### Complete Model Architecture
|
| 213 |
+
|
| 214 |
+
```
|
| 215 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 216 |
+
โ MiniMind Max2 Architecture โ
|
| 217 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค
|
| 218 |
+
โ โ
|
| 219 |
+
โ Input Tokens โโโโถ โโโโโโโโโโโโโโโโโโโโโโ โ
|
| 220 |
+
โ โ Token Embedding โ โ
|
| 221 |
+
โ โ (vocab ร hidden) โ โ
|
| 222 |
+
โ โโโโโโโโโโโโฌโโโโโโโโโโ โ
|
| 223 |
+
โ โ โ
|
| 224 |
+
โ โผ โ
|
| 225 |
+
โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
|
| 226 |
+
โ โ Transformer Decoder Block (ร N layers) โ โ
|
| 227 |
+
โ โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฃ โ
|
| 228 |
+
โ โ โ โ
|
| 229 |
+
โ โ โโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
|
| 230 |
+
โ โ โ RMSNorm โโโโโโถโ Grouped Query Attention (GQA) โโ โ
|
| 231 |
+
โ โ โโโโโโโโโโโ โ โโ โ
|
| 232 |
+
โ โ โ โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโ โ
|
| 233 |
+
โ โ โ โ โ Q_proj: hidden โ num_heads ร head_dim โ โโ โ
|
| 234 |
+
โ โ โ โ โ K_proj: hidden โ num_kv_heads ร head_dim โ โโ โ
|
| 235 |
+
โ โ โ โ โ V_proj: hidden โ num_kv_heads ร head_dim โ โโ โ
|
| 236 |
+
โ โ โ โ โ โ โโ โ
|
| 237 |
+
โ โ โ โ โ + RoPE Position Encoding โ โโ โ
|
| 238 |
+
โ โ โ โ โ + Causal Attention Mask โ โโ โ
|
| 239 |
+
โ โ โ โ โ + KV Repeat for GQA Groups โ โโ โ
|
| 240 |
+
โ โ โ โ โ โ โโ โ
|
| 241 |
+
โ โ โ โ โ O_proj: num_heads ร head_dim โ hidden โ โโ โ
|
| 242 |
+
โ โ โ โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโ โ
|
| 243 |
+
โ โ โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
|
| 244 |
+
โ โ โ โ โ โ
|
| 245 |
+
โ โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโถ (+) โ โ
|
| 246 |
+
โ โ โผ โ โ
|
| 247 |
+
โ โ Residual Connection โ โ
|
| 248 |
+
โ โ โ โ โ
|
| 249 |
+
โ โ โโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
|
| 250 |
+
โ โ โ RMSNorm โโโโโโถโ Mixture of Experts (MoE) โโ โ
|
| 251 |
+
โ โ โโโโโโโโโโโ โ โโ โ
|
| 252 |
+
โ โ โ โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโ โ
|
| 253 |
+
โ โ โ โ โ Router Gate: hidden โ num_experts โ โโ โ
|
| 254 |
+
โ โ โ โ โ โ โโ โ
|
| 255 |
+
โ โ โ โ โ โโโโโโโโโโ โโโโโโโโโโ โโโโโโโโโโ โโโโโโโโโโโ โโ โ
|
| 256 |
+
โ โ โ โ โ โExpert 1โ โExpert 2โ โ .... โ โExpert 8โโ โโ โ
|
| 257 |
+
โ โ โ โ โ โ SwiGLU โ โ SwiGLU โ โ โ โ SwiGLU โโ โโ โ
|
| 258 |
+
โ โ โ โ โ โโโโโโโโโโ โโโโโโโโโโ โโโโโโโโโโ โโโโโโโโโโโ โโ โ
|
| 259 |
+
โ โ โ โ โ โ โโ โ
|
| 260 |
+
โ โ โ โ โ Top-K Selection (K=2) + Weighted Sum โ โโ โ
|
| 261 |
+
โ โ โ โ โ + Auxiliary Load Balancing Loss โ โโ โ
|
| 262 |
+
โ โ โ โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโ โ
|
| 263 |
+
โ โ โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
|
| 264 |
+
โ โ โ โ โ โ
|
| 265 |
+
โ โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโถ (+) โ โ
|
| 266 |
+
โ โ โผ โ โ
|
| 267 |
+
โ โ Residual Connection โ โ
|
| 268 |
+
โ โ โ โ
|
| 269 |
+
โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
|
| 270 |
+
โ โ โ
|
| 271 |
+
โ โผ โ
|
| 272 |
+
โ โโโโโโโโโโโโโโโโโโโโโโ โ
|
| 273 |
+
โ โ RMSNorm โ โ
|
| 274 |
+
โ โโโโโโโโโโโโฌโโโโโโโโโโ โ
|
| 275 |
+
โ โ โ
|
| 276 |
+
โ โผ โ
|
| 277 |
+
โ โโโโโโโโโโโโโโโโโโโโโโ โ
|
| 278 |
+
โ โ LM Head โ โ
|
| 279 |
+
โ โ (Tied Weights) โ โ
|
| 280 |
+
โ โโโโโโโโโโโโฌโโโโโโโโโโ โ
|
| 281 |
+
โ โ โ
|
| 282 |
+
โ โผ โ
|
| 283 |
+
โ Output Logits โ
|
| 284 |
+
โ โ
|
| 285 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 286 |
+
```
|
| 287 |
+
|
| 288 |
+
### SwiGLU Expert Architecture
|
| 289 |
+
|
| 290 |
+
```
|
| 291 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 292 |
+
โ SwiGLU Expert FFN โ
|
| 293 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค
|
| 294 |
+
โ โ
|
| 295 |
+
โ Input (hidden_size) โ
|
| 296 |
+
โ โ โ
|
| 297 |
+
โ โโโโโโโโโโโโโโโโโโโโโโ โ
|
| 298 |
+
โ โ โ โ
|
| 299 |
+
โ โผ โผ โ
|
| 300 |
+
โ โโโโโโโโโโโโ โโโโโโโโโโโโ โ
|
| 301 |
+
โ โ Gate Projโ โ Up Proj โ โ
|
| 302 |
+
โ โ (Linear) โ โ (Linear) โ โ
|
| 303 |
+
โ โโโโโโฌโโโโโโ โโโโโโฌโโโโโโ โ
|
| 304 |
+
โ โ โ โ
|
| 305 |
+
โ โผ โ โ
|
| 306 |
+
โ โโโโโโโโโโโโ โ โ
|
| 307 |
+
โ โ SiLU โ โ โ
|
| 308 |
+
โ โ (Swish) โ โ โ
|
| 309 |
+
โ โโโโโโฌโโโโโโ โ โ
|
| 310 |
+
โ โ โ โ
|
| 311 |
+
โ โโโโโโโโโโฌโโโโโโโโโโโโ โ
|
| 312 |
+
โ โ โ
|
| 313 |
+
โ โผ โ
|
| 314 |
+
โ โโโโโโโโโโโ โ
|
| 315 |
+
โ โ Multiplyโ (element-wise) โ
|
| 316 |
+
โ โโโโโโฌโโโโโ โ
|
| 317 |
+
โ โ โ
|
| 318 |
+
โ โผ โ
|
| 319 |
+
โ โโโโโโโโโโโโโ โ
|
| 320 |
+
โ โ Down Proj โ โ
|
| 321 |
+
โ โ (Linear) โ โ
|
| 322 |
+
โ โโโโโโโฌโโโโโโ โ
|
| 323 |
+
โ โ โ
|
| 324 |
+
โ โผ โ
|
| 325 |
+
โ Output (hidden_size) โ
|
| 326 |
+
โ โ
|
| 327 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 328 |
+
```
|
| 329 |
+
|
| 330 |
+
---
|
| 331 |
+
|
| 332 |
+
## ๐ Model Variants
|
| 333 |
+
|
| 334 |
+
<div align="center">
|
| 335 |
+
|
| 336 |
+
| Model | Layers | Hidden | Heads | KV Heads | Experts | Active | Total Params | Active Params | INT4 Size |
|
| 337 |
+
|:-----:|:------:|:------:|:-----:|:--------:|:-------:|:------:|:------------:|:-------------:|:---------:|
|
| 338 |
+
| **max2-nano** | 12 | 768 | 12 | 3 | 4 | 1 | **500M** | **125M** | ~300MB |
|
| 339 |
+
| **max2-lite** | 24 | 1536 | 12 | 3 | 8 | 2 | **1.5B** | **375M** | ~900MB |
|
| 340 |
+
| **max2-pro** | 32 | 2560 | 20 | 4 | 8 | 2 | **3B** | **750M** | ~1.8GB |
|
| 341 |
+
|
| 342 |
+
</div>
|
| 343 |
+
|
| 344 |
+
### Target Deployment Scenarios
|
| 345 |
+
|
| 346 |
+
```
|
| 347 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 348 |
+
โ โ
|
| 349 |
+
โ max2-nano (500M) max2-lite (1.5B) max2-pro (3B) โ
|
| 350 |
+
โ โ
|
| 351 |
+
โ โโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโ โ
|
| 352 |
+
โ โ โ ~300MB โ โ ๐ฑ ~900MB โ โ ๐ป ~1.8GB โ โ
|
| 353 |
+
โ โ โ โ โ โ โ โ
|
| 354 |
+
โ โ โช Smartwatch โ โ โช Smartphone โ โ โช Tablet โ โ
|
| 355 |
+
โ โ โช IoT Devices โ โ โช Mobile Apps โ โ โช Laptop โ โ
|
| 356 |
+
โ โ โช Wearables โ โ โช Edge Server โ โ โช Desktop โ โ
|
| 357 |
+
โ โ โช Raspberry Pi โ โ โช AR/VR โ โ โช Workstation โ โ
|
| 358 |
+
โ โ โ โ โ โ โ โ
|
| 359 |
+
โ โ 125M Active โ โ 375M Active โ โ 750M Active โ โ
|
| 360 |
+
โ โโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโ โ
|
| 361 |
+
โ โ
|
| 362 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 363 |
+
```
|
| 364 |
+
|
| 365 |
+
---
|
| 366 |
+
|
| 367 |
+
## ๐ Benchmarks
|
| 368 |
+
|
| 369 |
+
### Evaluation Results
|
| 370 |
+
|
| 371 |
+
| Benchmark | Dataset | max2-nano | max2-lite | max2-pro |
|
| 372 |
+
|-----------|---------|:---------:|:---------:|:--------:|
|
| 373 |
+
| **Perplexity โ** | WikiText-103 | 24.5 | 18.5 | 15.2 |
|
| 374 |
+
| **Accuracy โ** | LAMBADA | 52% | 62% | 68% |
|
| 375 |
+
| **Accuracy โ** | HellaSwag | 48% | 58% | 65% |
|
| 376 |
+
| **Accuracy โ** | ARC-Easy | 55% | 63% | 70% |
|
| 377 |
+
| **Accuracy โ** | PIQA | 68% | 74% | 78% |
|
| 378 |
+
| **Accuracy โ** | WinoGrande | 52% | 58% | 63% |
|
| 379 |
+
|
| 380 |
+
### Inference Speed (Tokens/Second)
|
| 381 |
+
|
| 382 |
+
| Device | max2-nano | max2-lite | max2-pro |
|
| 383 |
+
|--------|:---------:|:---------:|:--------:|
|
| 384 |
+
| **NVIDIA RTX 4090** | 250+ | 180 | 150 |
|
| 385 |
+
| **NVIDIA RTX 3080** | 180 | 120 | 85 |
|
| 386 |
+
| **Apple M2 MacBook** | 80 | 45 | 30 |
|
| 387 |
+
| **Google Pixel 8 Pro** | 45 | 25 | - |
|
| 388 |
+
| **iPhone 15 Pro** | 50 | 28 | - |
|
| 389 |
+
| **Raspberry Pi 5** | 8 | - | - |
|
| 390 |
+
|
| 391 |
+
### Memory Footprint
|
| 392 |
|
| 393 |
+
| Model | FP32 | FP16 | INT8 | INT4 |
|
| 394 |
+
|-------|:----:|:----:|:----:|:----:|
|
| 395 |
+
| **max2-nano** | 2.0GB | 1.0GB | 0.5GB | 0.3GB |
|
| 396 |
+
| **max2-lite** | 6.0GB | 3.0GB | 1.5GB | 0.9GB |
|
| 397 |
+
| **max2-pro** | 12.0GB | 6.0GB | 3.0GB | 1.8GB |
|
| 398 |
+
|
| 399 |
+
---
|
| 400 |
+
|
| 401 |
+
## ๐ Quick Start
|
| 402 |
|
| 403 |
### Installation
|
| 404 |
|
|
|
|
| 406 |
# Clone from HuggingFace
|
| 407 |
git clone https://huggingface.co/fariasultana/MiniMind
|
| 408 |
cd MiniMind
|
| 409 |
+
|
| 410 |
+
# Install dependencies
|
| 411 |
pip install -r requirements.txt
|
| 412 |
```
|
| 413 |
|
|
|
|
| 415 |
|
| 416 |
```python
|
| 417 |
import torch
|
| 418 |
+
from model import Max2ForCausalLM, create_model
|
| 419 |
+
from configs.model_config import get_config, estimate_params
|
| 420 |
|
| 421 |
# Create model (options: max2-nano, max2-lite, max2-pro)
|
| 422 |
+
model = create_model("max2-nano", device="cuda", dtype=torch.float16)
|
| 423 |
+
|
| 424 |
+
# Check parameters
|
| 425 |
+
config = get_config("max2-nano")
|
| 426 |
+
params = estimate_params(config)
|
| 427 |
+
print(f"Total: {params['total_params_b']:.2f}B")
|
| 428 |
+
print(f"Active: {params['active_params_b']:.2f}B")
|
| 429 |
+
print(f"Activation Ratio: {params['activation_ratio']:.1%}")
|
| 430 |
|
| 431 |
# Generate text
|
| 432 |
+
input_ids = torch.tensor([[1, 2, 3, 4, 5]]).cuda()
|
| 433 |
+
output = model.generate(
|
| 434 |
+
input_ids,
|
| 435 |
+
max_new_tokens=100,
|
| 436 |
+
temperature=0.8,
|
| 437 |
+
top_k=50,
|
| 438 |
+
top_p=0.9,
|
| 439 |
+
do_sample=True
|
| 440 |
+
)
|
| 441 |
+
print(f"Generated {output.shape[1]} tokens")
|
| 442 |
```
|
| 443 |
|
| 444 |
+
### Custom Configuration
|
| 445 |
|
| 446 |
```python
|
| 447 |
+
from configs.model_config import Max2Config
|
|
|
|
| 448 |
from model import Max2ForCausalLM
|
| 449 |
|
| 450 |
+
# Create custom model
|
| 451 |
+
custom_config = Max2Config(
|
| 452 |
+
hidden_size=1024,
|
| 453 |
+
num_hidden_layers=16,
|
| 454 |
+
num_attention_heads=16,
|
| 455 |
+
num_key_value_heads=4,
|
| 456 |
+
num_experts=6,
|
| 457 |
+
num_experts_per_tok=2,
|
| 458 |
+
expert_hidden_size=768,
|
| 459 |
+
)
|
| 460 |
+
|
| 461 |
+
model = Max2ForCausalLM(custom_config)
|
| 462 |
+
```
|
| 463 |
|
| 464 |
+
---
|
|
|
|
| 465 |
|
| 466 |
+
## ๐ Training
|
|
|
|
|
|
|
|
|
|
| 467 |
|
| 468 |
+
### Standard Training
|
| 469 |
|
| 470 |
```bash
|
|
|
|
| 471 |
python scripts/train.py \
|
| 472 |
--model max2-lite \
|
| 473 |
--train-data data/train.jsonl \
|
| 474 |
+
--val-data data/val.jsonl \
|
| 475 |
--epochs 3 \
|
| 476 |
--batch-size 8 \
|
| 477 |
+
--learning-rate 3e-4 \
|
| 478 |
+
--warmup-steps 1000 \
|
| 479 |
--output-dir outputs/
|
| 480 |
+
```
|
| 481 |
+
|
| 482 |
+
### Knowledge Distillation
|
| 483 |
|
| 484 |
+
```bash
|
| 485 |
python scripts/train.py \
|
| 486 |
--model max2-lite \
|
| 487 |
--train-data data/train.jsonl \
|
| 488 |
--teacher-model path/to/teacher.pt \
|
| 489 |
--temperature 2.0 \
|
| 490 |
+
--alpha-kd 0.5 \
|
| 491 |
+
--output-dir outputs/
|
| 492 |
```
|
| 493 |
|
| 494 |
+
### Training Hyperparameters
|
| 495 |
+
|
| 496 |
+
| Parameter | Value |
|
| 497 |
+
|-----------|-------|
|
| 498 |
+
| Learning Rate | 3e-4 |
|
| 499 |
+
| Weight Decay | 0.1 |
|
| 500 |
+
| Warmup Steps | 1000 |
|
| 501 |
+
| Batch Size | 8-32 |
|
| 502 |
+
| Gradient Accumulation | 4 |
|
| 503 |
+
| Mixed Precision | FP16/BF16 |
|
| 504 |
+
| Optimizer | AdamW |
|
| 505 |
+
|
| 506 |
+
---
|
| 507 |
+
|
| 508 |
+
## ๐ฑ Deployment
|
| 509 |
+
|
| 510 |
+
### Export Formats
|
| 511 |
|
| 512 |
```bash
|
| 513 |
+
# Export to ONNX
|
| 514 |
+
python scripts/export.py --model max2-nano --format onnx
|
| 515 |
+
|
| 516 |
+
# Export to GGUF (llama.cpp)
|
| 517 |
+
python scripts/export.py --model max2-nano --format gguf --quantize int4_awq
|
|
|
|
| 518 |
|
| 519 |
# Export for Android
|
| 520 |
+
python scripts/export.py --model max2-nano --format android --quantize int4_awq
|
|
|
|
|
|
|
|
|
|
| 521 |
```
|
| 522 |
|
| 523 |
+
### Quantization Options
|
| 524 |
+
|
| 525 |
+
| Method | Bits | Size Reduction | Quality Impact |
|
| 526 |
+
|--------|:----:|:--------------:|:--------------:|
|
| 527 |
+
| **FP16** | 16 | 50% | None |
|
| 528 |
+
| **INT8** | 8 | 75% | Minimal (<1%) |
|
| 529 |
+
| **INT4 (AWQ)** | 4 | 87.5% | Small (1-2%) |
|
| 530 |
+
| **INT4 (GPTQ)** | 4 | 87.5% | Small (1-2%) |
|
| 531 |
+
|
| 532 |
+
### Android Integration
|
| 533 |
|
| 534 |
+
```kotlin
|
| 535 |
+
// Kotlin usage
|
| 536 |
+
val model = MiniMindModel(context, "max2-nano.gguf")
|
|
|
|
| 537 |
|
| 538 |
+
model.generate("Hello, I am") { token ->
|
| 539 |
+
textView.append(token) // Stream to UI
|
| 540 |
+
}
|
| 541 |
+
```
|
| 542 |
|
| 543 |
+
See [android/README.md](android/README.md) for the complete guide.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 544 |
|
| 545 |
+
---
|
| 546 |
+
|
| 547 |
+
## ๐ Project Structure
|
| 548 |
|
| 549 |
```
|
| 550 |
MiniMind/
|
| 551 |
โโโ configs/
|
| 552 |
+
โ โโโ __init__.py
|
| 553 |
+
โ โโโ model_config.py # Max2Config, model presets
|
| 554 |
โโโ model/
|
| 555 |
+
โ โโโ __init__.py
|
| 556 |
+
โ โโโ components.py # RMSNorm, RoPE, GQA, MoE, SwiGLU
|
| 557 |
+
โ โโโ mind2_model.py # Max2Model, Max2ForCausalLM
|
| 558 |
โโโ training/
|
| 559 |
+
โ โโโ trainer.py # Training loop with AMP
|
| 560 |
+
โ โโโ distillation.py # Knowledge distillation
|
| 561 |
+
โ โโโ dataset.py # Data loading utilities
|
| 562 |
โโโ optimization/
|
| 563 |
+
โ โโโ quantization.py # INT4/INT8 (AWQ, GPTQ)
|
| 564 |
+
โ โโโ pruning.py # Structured/unstructured pruning
|
| 565 |
+
โ โโโ export.py # ONNX, GGUF, TFLite export
|
| 566 |
โโโ android/
|
| 567 |
+
โ โโโ app/ # Kotlin app code
|
| 568 |
+
โ โโโ jni/ # C++ JNI bridge
|
| 569 |
+
โ โโโ README.md # Android guide
|
| 570 |
โโโ examples/
|
| 571 |
+
โ โโโ quickstart.py # Quick start example
|
| 572 |
+
โโโ scripts/
|
| 573 |
+
โ โโโ train.py # Training CLI
|
| 574 |
+
โ โโโ export.py # Export CLI
|
| 575 |
+
โโโ README.md # This file
|
| 576 |
```
|
| 577 |
|
| 578 |
+
---
|
| 579 |
+
|
| 580 |
+
## ๐ Paper
|
| 581 |
|
| 582 |
+
### MiniMind Max2: Efficient Language Models for Edge Deployment
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 583 |
|
| 584 |
+
**Abstract**: We present MiniMind Max2, a family of efficient language models designed for deployment on resource-constrained devices. By combining Mixture of Experts (MoE) with Grouped Query Attention (GQA), our models achieve competitive performance while activating only 25% of parameters per token. The max2-nano variant (500M total, 125M active) runs at 45+ tokens/second on mobile devices, while max2-pro (3B total, 750M active) achieves state-of-the-art efficiency on edge hardware.
|
| 585 |
|
| 586 |
+
**Key Contributions**:
|
| 587 |
+
1. Efficient MoE architecture with 8 experts and top-2 routing
|
| 588 |
+
2. GQA with 4:1 query-to-KV ratio for memory efficiency
|
| 589 |
+
3. Comprehensive deployment toolkit for mobile and edge devices
|
| 590 |
+
4. Extensive benchmarks across multiple hardware platforms
|
| 591 |
|
| 592 |
+
๐ *Full paper coming soon on arXiv*
|
|
|
|
|
|
|
|
|
|
|
|
|
| 593 |
|
| 594 |
+
---
|
| 595 |
+
|
| 596 |
+
## ๐ Citation
|
| 597 |
|
| 598 |
```bibtex
|
| 599 |
+
@misc{minimind-max2-2024,
|
| 600 |
+
title={MiniMind Max2: Efficient Language Models for Edge Deployment
|
| 601 |
+
with Mixture of Experts},
|
| 602 |
+
author={Sultana, Faria},
|
| 603 |
year={2024},
|
| 604 |
+
howpublished={\url{https://huggingface.co/fariasultana/MiniMind}},
|
| 605 |
+
note={Hugging Face Model Repository}
|
| 606 |
}
|
| 607 |
```
|
| 608 |
|
| 609 |
+
### Related Works
|
| 610 |
+
|
| 611 |
+
```bibtex
|
| 612 |
+
@article{shazeer2017moe,
|
| 613 |
+
title={Outrageously Large Neural Networks:
|
| 614 |
+
The Sparsely-Gated Mixture-of-Experts Layer},
|
| 615 |
+
author={Shazeer, Noam and others},
|
| 616 |
+
journal={arXiv preprint arXiv:1701.06538},
|
| 617 |
+
year={2017}
|
| 618 |
+
}
|
| 619 |
+
|
| 620 |
+
@article{ainslie2023gqa,
|
| 621 |
+
title={GQA: Training Generalized Multi-Query Transformer
|
| 622 |
+
Models from Multi-Head Checkpoints},
|
| 623 |
+
author={Ainslie, Joshua and others},
|
| 624 |
+
journal={arXiv preprint arXiv:2305.13245},
|
| 625 |
+
year={2023}
|
| 626 |
+
}
|
| 627 |
+
```
|
| 628 |
+
|
| 629 |
+
---
|
| 630 |
+
|
| 631 |
+
## ๐ค Community
|
| 632 |
|
| 633 |
+
<div align="center">
|
| 634 |
|
| 635 |
+
| Resource | Link |
|
| 636 |
+
|----------|------|
|
| 637 |
+
| ๐ฎ **Demo** | [MiniMind-API Space](https://huggingface.co/spaces/fariasultana/MiniMind-API) |
|
| 638 |
+
| ๐ฌ **Discussions** | [Community Forum](https://huggingface.co/fariasultana/MiniMind/discussions) |
|
| 639 |
+
| ๐ **Issues** | [Report Bugs](https://huggingface.co/fariasultana/MiniMind/discussions) |
|
| 640 |
+
| ๐ง **Contact** | Via HuggingFace |
|
| 641 |
|
| 642 |
+
</div>
|
|
|
|
|
|
|
| 643 |
|
| 644 |
---
|
| 645 |
|
| 646 |
+
## ๐ License
|
| 647 |
+
|
| 648 |
+
This project is licensed under the **Apache License 2.0**.
|
| 649 |
+
|
| 650 |
+
---
|
| 651 |
+
|
| 652 |
+
## ๐ Acknowledgments
|
| 653 |
+
|
| 654 |
+
- Inspired by [MiniMax M2](https://www.minimax.io/news/minimax-m2)'s efficient design
|
| 655 |
+
- Built with [PyTorch](https://pytorch.org/) and [llama.cpp](https://github.com/ggerganov/llama.cpp)
|
| 656 |
+
- Thanks to the Hugging Face community
|
| 657 |
+
|
| 658 |
+
---
|
| 659 |
+
|
| 660 |
+
<div align="center">
|
| 661 |
+
|
| 662 |
+
**MiniMind Max2** - Bringing powerful AI to every device ๐
|
| 663 |
+
|
| 664 |
+
[](https://huggingface.co/fariasultana/MiniMind)
|
| 665 |
+
[](https://huggingface.co/fariasultana)
|
| 666 |
+
|
| 667 |
+
*Made with โค๏ธ by Faria Sultana*
|
| 668 |
+
|
| 669 |
+
</div>
|