yongqiang commited on
Commit ·
1a6e584
1
Parent(s): 0451a01
first commit
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +2 -0
- .gitignore +5 -0
- README.md +169 -0
- config.json +0 -0
- infer_axmodel.py +76 -0
- smollm3_axmodel/model.embed_tokens.weight.npy +3 -0
- smollm3_axmodel/smollm3_p128_l0_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l10_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l11_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l12_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l13_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l14_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l15_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l16_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l17_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l18_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l19_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l1_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l20_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l21_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l22_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l23_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l24_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l25_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l26_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l27_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l28_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l29_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l2_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l30_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l31_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l32_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l33_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l34_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l35_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l3_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l4_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l5_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l6_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l7_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l8_together.axmodel +3 -0
- smollm3_axmodel/smollm3_p128_l9_together.axmodel +3 -0
- smollm3_axmodel/smollm3_post.axmodel +3 -0
- smolvlm3_tokenizer/.gitattributes +36 -0
- smolvlm3_tokenizer/README.md +368 -0
- smolvlm3_tokenizer/chat_template.jinja +94 -0
- smolvlm3_tokenizer/config.json +108 -0
- smolvlm3_tokenizer/generation_config.json +9 -0
- smolvlm3_tokenizer/model.safetensors.index.json +334 -0
- smolvlm3_tokenizer/notebook.ipynb +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__
|
| 2 |
+
*.tar
|
| 3 |
+
build-output/
|
| 4 |
+
compiled*
|
| 5 |
+
tmp/
|
README.md
CHANGED
|
@@ -1,3 +1,172 @@
|
|
| 1 |
---
|
| 2 |
license: bsd-3-clause
|
| 3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
license: bsd-3-clause
|
| 3 |
---
|
| 4 |
+
language:
|
| 5 |
+
- en
|
| 6 |
+
- zh
|
| 7 |
+
base_model:
|
| 8 |
+
- HuggingFaceTB/SmolLM3-3B
|
| 9 |
+
pipeline_tag: text-generation
|
| 10 |
+
tags:
|
| 11 |
+
- HuggingFaceTB
|
| 12 |
+
- SmolLM3-3B
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
# SmolLM3-3B-Int8
|
| 16 |
+
|
| 17 |
+
This version of SmolLM3-3B has been converted to run on the Axera NPU using **w8a16** quantization.
|
| 18 |
+
|
| 19 |
+
Compatible with Pulsar2 version: 4.1
|
| 20 |
+
|
| 21 |
+
## Convert tools links:
|
| 22 |
+
|
| 23 |
+
For those who are interested in model conversion, you can try to export axmodel through the original repo:
|
| 24 |
+
- https://huggingface.co/HuggingFaceTB/SmolLM3-3B
|
| 25 |
+
|
| 26 |
+
- [Github for SmolLM3-3B.axera](https://github.com/AXERA-TECH/SmolLM3-3B.axera)
|
| 27 |
+
|
| 28 |
+
- [Pulsar2 Link, How to Convert LLM from Huggingface to axmodel](https://pulsar2-docs.readthedocs.io/en/latest/appendix/build_llm.html)
|
| 29 |
+
|
| 30 |
+
## Support Platform
|
| 31 |
+
- AX650
|
| 32 |
+
- [M4N-Dock(爱芯派Pro)](https://wiki.sipeed.com/hardware/zh/maixIV/m4ndock/m4ndock.html)
|
| 33 |
+
|
| 34 |
+
## How to use
|
| 35 |
+
|
| 36 |
+
Download all files from this repository to the device.
|
| 37 |
+
|
| 38 |
+
**Using AX650 Board**
|
| 39 |
+
|
| 40 |
+
```bash
|
| 41 |
+
ai@ai-bj ~/yongqiang/push_hugging_face/SmolLM3-3B $ tree -L 1
|
| 42 |
+
.
|
| 43 |
+
├── config.json
|
| 44 |
+
├── infer_axmodel.py
|
| 45 |
+
├── README.md
|
| 46 |
+
├── smollm3_axmodel
|
| 47 |
+
├── smolvlm3_tokenizer
|
| 48 |
+
└── utils
|
| 49 |
+
|
| 50 |
+
3 directories, 3 files
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
#### Inference with AX650 Host, such as M4N-Dock(爱芯派Pro) or AX650N DEMO Board
|
| 54 |
+
|
| 55 |
+
input text:
|
| 56 |
+
|
| 57 |
+
```
|
| 58 |
+
帮我求解函数y=3x^2+1的导数.
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
log information(including the thinking process):
|
| 62 |
+
|
| 63 |
+
```bash
|
| 64 |
+
$ python3 infer_axmodel.py -q "帮我求解函数y=3x^2+1的导数." # 默认开启 think
|
| 65 |
+
...
|
| 66 |
+
Model loaded successfully!
|
| 67 |
+
slice_indices: [0, 1, 2]
|
| 68 |
+
Slice prefill done: 0
|
| 69 |
+
Slice prefill done: 1
|
| 70 |
+
Slice prefill done: 2
|
| 71 |
+
answer >> <think>
|
| 72 |
+
Okay, so I need to find the derivative of the function y = 3x² + 1. Hmm, let me think about how to approach this. I remember that when taking derivatives, we use the^@ power rule. The power rule says that if you have a function like x^n, its derivative is n*x^(n-1). Right? So, for each term in the function, I can apply this rule.
|
| 73 |
+
|
| 74 |
+
First, let's break down the function into its components. The function is 3x^@² + 1. The first term is 3x², and the second term is 1. The constant term 1 doesn't have an x in it, so when I take the derivative of 1, it should be 0 because the derivative of a constant is zero. That part seems straightforward^@.
|
| 75 |
+
|
| 76 |
+
Now, the main part is the term 3x². Here, the coefficient is 3, and the exponent is 2. Applying the power rule, the derivative of x² is 2x. But since there's a coefficient 3 in front of the x², I need to multiply^@ the derivative of the function by that coefficient. So, 3 times the derivative of x², which is 2x. That gives me 3*2x = 6x. So the derivative of 3x² is 6x.
|
| 77 |
+
|
| 78 |
+
Putting it all together, the derivative of the entire function^@ y = 3x² + 1 should be the derivative of 3x² plus the derivative of 1. The derivative of 3x² is 6x, and the derivative of 1 is 0. Therefore, the derivative of the whole function is 6x + 0,^@ which simplifies to 6x.
|
| 79 |
+
|
| 80 |
+
Wait, let me double-check that. If I have a function like 3x², the derivative is 6x. Let me verify that with the power rule. The power rule states that if you have a function f(x) = ax^n, then f'(^@x) = a*n*x^(n-1). In this case, a is 3 and n is 2. So f'(x) = 3*2*x^(2-1) = 6x. Yes, that's correct. So the derivative of 3x² is indeed ^@6x. And the derivative of the constant 1 is 0. So combining those, the derivative of the entire function is 6x. That seems right.
|
| 81 |
+
|
| 82 |
+
Is there anything else I need to consider here? Maybe I should check if there are any other terms or if I missed any steps. The original^@ function is a simple polynomial, so there shouldn't be any hidden complexities here. The power rule applies straightforwardly to each term. Since there are no other terms besides the 3x² and the constant, the process is complete.
|
| 83 |
+
|
| 84 |
+
Another way to think about it is to consider the limit definition of a derivative.^@ If I were to use the limit definition, the derivative of 3x² + 1 would be the limit as h approaches 0 of [ (3(x+h)² + 1) - (3x² + 1) ] / h. Simplifying that expression would lead me through the^@ same steps as before, but since I already applied the power rule, I can be confident that the result is correct.
|
| 85 |
+
|
| 86 |
+
Therefore, after going through the process step by step, I can be sure that the derivative of y = 3x² + 1 is indeed 6x. There's no mistake^@ in the calculation, and all the steps follow logically from the power rule. So the final answer is 6x.
|
| 87 |
+
|
| 88 |
+
Just to recap, the key steps were:
|
| 89 |
+
|
| 90 |
+
1. Identify the function: 3x² + 1.
|
| 91 |
+
2. Apply the power rule to each term.
|
| 92 |
+
3. For the term^@ 3x², the derivative is 3*2x^(2-1) = 6x.
|
| 93 |
+
4. For the term 1, the derivative is 0.
|
| 94 |
+
5. Combine the derivatives: 6x + 0 = 6x.
|
| 95 |
+
|
| 96 |
+
Yes, that all checks out. I^@ think that's thorough enough. I don't see any errors in this reasoning. Therefore, the derivative of the function y = 3x² + 1 is 6x.
|
| 97 |
+
|
| 98 |
+
**Final Answer**
|
| 99 |
+
The derivative of the function \( y = 3x^2 + 1 \) is \(\boxed{6x}\).
|
| 100 |
+
</think>
|
| 101 |
+
To find the derivative of the function \( y = 3x^2 + 1 \), we can use the power rule of differentiation. The power rule states that if we have a function of the form \( ax^n \), its derivative is \( a \cdot^@ n \cdot x^{n-1} \).
|
| 102 |
+
|
| 103 |
+
1. **Identify the terms in the function:**
|
| 104 |
+
- The first term is \( 3x^2 \).
|
| 105 |
+
- The second term is \( 1 \).
|
| 106 |
+
|
| 107 |
+
2. **Apply the power rule to each term:**
|
| 108 |
+
^@ - For the term \( 3x^2 \):
|
| 109 |
+
- The coefficient \( a \) is 3.
|
| 110 |
+
- The exponent \( n \) is 2.
|
| 111 |
+
- The derivative is \( 3 \cdot 2 \cdot x^{2-1} = 6x \).
|
| 112 |
+
^@ - For the term \( 1 \):
|
| 113 |
+
- The derivative of a constant is 0.
|
| 114 |
+
|
| 115 |
+
3. **Combine the results:**
|
| 116 |
+
- The derivative of \( 3x^2 \) is \( 6x \).
|
| 117 |
+
- The derivative of \( 1 \) is \( ^@0 \).
|
| 118 |
+
|
| 119 |
+
4. **Final result:**
|
| 120 |
+
- The derivative of the entire function \( 3x^2 + 1 \) is \( 6x + 0 = 6x \).
|
| 121 |
+
|
| 122 |
+
Thus, the derivative of the function \( y = 3x^2 + 1^@ \) is \( 6x \).
|
| 123 |
+
|
| 124 |
+
\[
|
| 125 |
+
\boxed{6x}
|
| 126 |
+
\]
|
| 127 |
+
```
|
| 128 |
+
|
| 129 |
+
use the parameter `--disable-think` to disable the thinking process:
|
| 130 |
+
|
| 131 |
+
```sh
|
| 132 |
+
$ python3 infer_axmodel.py -q "帮我求解函数y=3x^2+1的导数." --disable-think
|
| 133 |
+
|
| 134 |
+
Model loaded successfully!
|
| 135 |
+
slice_indices: [0]
|
| 136 |
+
Slice prefill done: 0
|
| 137 |
+
answer >> 要求解函数 \( y = 3x^2 + 1 \) 的导数,我们可以使用导数的基本规则。
|
| 138 |
+
|
| 139 |
+
函数导数的导数可以通过导数的导数规则来求解。对于多项式^@函数,导数可以通过导数的导数规则来求解。对于函数 \( y = 3x^2 + 1 \),我们可以逐步求导:
|
| 140 |
+
|
| 141 |
+
1. **求导函数 \( y = 3x^2 \)**:
|
| 142 |
+
根据导数的导^@数规则,导数规则中对于 \( x^n \) 的导数规则,导数规则为:
|
| 143 |
+
\[
|
| 144 |
+
\frac{d}{dx} (x^n) = n x^{n-1}
|
| 145 |
+
\]
|
| 146 |
+
在这里,\( n = 2^@ \),所以:
|
| 147 |
+
\[
|
| 148 |
+
\frac{d}{dx} (3x^2) = 3 \cdot \frac{d}{dx} (x^2) = 3 \cdot 2x^{2-1} = 6x
|
| 149 |
+
\]
|
| 150 |
+
|
| 151 |
+
2. **求^@导数规则中的常数项**:
|
| 152 |
+
对于常数项 \( 1 \),其导数为零,因为导数规则中常数项的导数为零:
|
| 153 |
+
\[
|
| 154 |
+
\frac{d}{dx} (1) = 0
|
| 155 |
+
\]
|
| 156 |
+
|
| 157 |
+
将^@以上结果结合起来,我们得到:
|
| 158 |
+
\[
|
| 159 |
+
\frac{d}{dx} (y) = \frac{d}{dx} (3x^2 + 1) = 6x + 0 = 6x
|
| 160 |
+
\]
|
| 161 |
+
|
| 162 |
+
因此,函数 \( y = 3x^2 +^@ 1 \) 的导数为:
|
| 163 |
+
\[
|
| 164 |
+
\frac{dy}{dx} = 6x
|
| 165 |
+
\]
|
| 166 |
+
|
| 167 |
+
所以,求解函数 \( y = 3x^2 + 1 \) 的导数,我们得到:
|
| 168 |
+
\[
|
| 169 |
+
\frac{d}{dx} (3x^^@2 + 1) = 6x
|
| 170 |
+
\]
|
| 171 |
+
|
| 172 |
+
```
|
config.json
ADDED
|
File without changes
|
infer_axmodel.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import AutoProcessor, AutoModelForImageTextToText
|
| 2 |
+
import torch
|
| 3 |
+
import onnx
|
| 4 |
+
import onnxruntime as ort
|
| 5 |
+
import numpy as np
|
| 6 |
+
import os
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
from transformers import AutoConfig, AutoTokenizer
|
| 9 |
+
from typing import List, Tuple
|
| 10 |
+
from axengine import InferenceSession
|
| 11 |
+
from ml_dtypes import bfloat16
|
| 12 |
+
from utils.infer_func import InferManager
|
| 13 |
+
import argparse
|
| 14 |
+
from PIL import Image
|
| 15 |
+
from torchvision.transforms import Resize, ToTensor, Normalize, Compose
|
| 16 |
+
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
if __name__ == "__main__":
|
| 20 |
+
|
| 21 |
+
prompt = None
|
| 22 |
+
parser = argparse.ArgumentParser(description="Model configuration parameters")
|
| 23 |
+
parser.add_argument("--hf_model", type=str, default="./smolvlm3_tokenizer/",
|
| 24 |
+
help="Path to HuggingFace model")
|
| 25 |
+
parser.add_argument("--axmodel_path", type=str, default="./smollm3_axmodel/",
|
| 26 |
+
help="Path to save compiled axmodel of llama model")
|
| 27 |
+
parser.add_argument("--disable-think", action="store_true", default=False,
|
| 28 |
+
help="Disable thinking.")
|
| 29 |
+
parser.add_argument("-q", "--question", type=str, default="Give me a brief explanation of gravity in simple terms.",
|
| 30 |
+
help="Your question that you want to ask the model.")
|
| 31 |
+
args = parser.parse_args()
|
| 32 |
+
|
| 33 |
+
hf_model_path = args.hf_model
|
| 34 |
+
axmodel_path = args.axmodel_path
|
| 35 |
+
prompt = args.question
|
| 36 |
+
|
| 37 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 38 |
+
embeds = np.load(os.path.join(axmodel_path, "model.embed_tokens.weight.npy"))
|
| 39 |
+
|
| 40 |
+
# load the tokenizer and the model
|
| 41 |
+
tokenizer = AutoTokenizer.from_pretrained(hf_model_path)
|
| 42 |
+
cfg = AutoConfig.from_pretrained(hf_model_path, trust_remote_code=True)
|
| 43 |
+
|
| 44 |
+
# model = AutoModelForCausalLM.from_pretrained(
|
| 45 |
+
# hf_model_path,
|
| 46 |
+
# ).to(device)
|
| 47 |
+
|
| 48 |
+
# prepare the model input
|
| 49 |
+
if not args.disable_think:
|
| 50 |
+
messages = [
|
| 51 |
+
{"role": "user", "content": prompt}
|
| 52 |
+
]
|
| 53 |
+
else:
|
| 54 |
+
messages = [
|
| 55 |
+
{"role": "system", "content": "/no_think"},
|
| 56 |
+
{"role": "user", "content": prompt}
|
| 57 |
+
]
|
| 58 |
+
|
| 59 |
+
text = tokenizer.apply_chat_template(
|
| 60 |
+
messages,
|
| 61 |
+
tokenize=False,
|
| 62 |
+
add_generation_prompt=True,
|
| 63 |
+
)
|
| 64 |
+
model_inputs = tokenizer([text], return_tensors="pt").to(device)
|
| 65 |
+
input_ids = model_inputs.input_ids
|
| 66 |
+
|
| 67 |
+
token_ids = input_ids[0].cpu().numpy().tolist()
|
| 68 |
+
token_len = len(token_ids)
|
| 69 |
+
prefill_data = np.take(embeds, token_ids, axis=0)
|
| 70 |
+
prefill_data = prefill_data.astype(bfloat16)
|
| 71 |
+
|
| 72 |
+
imer = InferManager(cfg, axmodel_path)
|
| 73 |
+
|
| 74 |
+
token_ids = imer.prefill(tokenizer, token_ids, prefill_data, slice_len=128)
|
| 75 |
+
imer.decode(tokenizer, token_ids, embeds, slice_len=128)
|
| 76 |
+
print("\n")
|
smollm3_axmodel/model.embed_tokens.weight.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:007278f3c51a2ad992d9ba2c8f5b0623f66da7a030eb490107e34ae59cd14e6d
|
| 3 |
+
size 1050673280
|
smollm3_axmodel/smollm3_p128_l0_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:66a5639c528524d2137c425155033c92e1b70d32f28cee8e6b7f5439dfe57f13
|
| 3 |
+
size 100180663
|
smollm3_axmodel/smollm3_p128_l10_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ef11e903bcf69a4f00f448b2cd923b5bbe399fb9f92862b856f4538c2665af13
|
| 3 |
+
size 100185047
|
smollm3_axmodel/smollm3_p128_l11_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:027781437f895ac7fa2dd7b6d09426b1b864b5df2ead1ba7b3ee194f1916d2fe
|
| 3 |
+
size 100189271
|
smollm3_axmodel/smollm3_p128_l12_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:248a266148663c334a3b39c785032b25bd38a9fd9617432ac7042e1be8f4fe5f
|
| 3 |
+
size 100183063
|
smollm3_axmodel/smollm3_p128_l13_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc9406d0b8b2cd2ccd10923d794d8389661235956d4cce973d7b34e6531c93cd
|
| 3 |
+
size 100183575
|
smollm3_axmodel/smollm3_p128_l14_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:787bfde4dde0817374587ab724e0ea50122bc8a8aea7a666099853d914691235
|
| 3 |
+
size 100186935
|
smollm3_axmodel/smollm3_p128_l15_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:462366b54a8a0866baf221bb15db72135d33ef0d388ba06e84b1b4827ee5071d
|
| 3 |
+
size 100181303
|
smollm3_axmodel/smollm3_p128_l16_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5c2fe71c1a837c472b6e9eb4f1c7a9e680aba96b296c23cb0998cf09d446d4fd
|
| 3 |
+
size 100185431
|
smollm3_axmodel/smollm3_p128_l17_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:036e5cf041752d481ee81b613eab47e618870776db81237a09d2652d1eae3198
|
| 3 |
+
size 100184343
|
smollm3_axmodel/smollm3_p128_l18_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:455535e56fe0506eb6f4b892979538918ee560d1556aa55d10e4aa6b5c013fe5
|
| 3 |
+
size 100187703
|
smollm3_axmodel/smollm3_p128_l19_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f3db3a90ee18b8ceefc9fa9e66526f649665fd69289f94c0be60fb4dc2afe7bb
|
| 3 |
+
size 100185271
|
smollm3_axmodel/smollm3_p128_l1_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5eb418fc9545d5c8a0607fd425e8f97528af905e58f1b282d341f38494b66d9a
|
| 3 |
+
size 100182167
|
smollm3_axmodel/smollm3_p128_l20_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cd604aebc14db90a32108cbff73fb33c95fc28f9eccff1daa44e3d86dcfac8bf
|
| 3 |
+
size 100181111
|
smollm3_axmodel/smollm3_p128_l21_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:417c0ae816ec2ac6b633812f1a8ebb1059d5c095e220a18b54a8851c5cb6146d
|
| 3 |
+
size 100183639
|
smollm3_axmodel/smollm3_p128_l22_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:913f0da3baf02a9691a46b36164993a033eac129cea3072a30d1c7787f5fdb03
|
| 3 |
+
size 100187095
|
smollm3_axmodel/smollm3_p128_l23_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1febf968dca57c080bb2c6ebfcf737d53e020fb7086b8d4f99995967bf67bf64
|
| 3 |
+
size 100185271
|
smollm3_axmodel/smollm3_p128_l24_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4bffb0ddc7507918bfbd3ffe289d660cad550bd9d4ab39a93e4747b5f919e43c
|
| 3 |
+
size 100186743
|
smollm3_axmodel/smollm3_p128_l25_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fd0afa2885d980dbcf4cb5255a0a39ebb767a3978148b2c3170456e2de658843
|
| 3 |
+
size 100187575
|
smollm3_axmodel/smollm3_p128_l26_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:383cc020bc009141a76f9670bbfe8b756a83a4c46eabb881c9ecc7a41e936955
|
| 3 |
+
size 100182775
|
smollm3_axmodel/smollm3_p128_l27_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:829bfb1740d0804a2a558d40d622409fa2db83016784dbc230c6b36b9f915ae7
|
| 3 |
+
size 100187607
|
smollm3_axmodel/smollm3_p128_l28_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e9b7a2abfabac90accb6323629ecf9e568430c732d4969873f9976ee82421fca
|
| 3 |
+
size 100184599
|
smollm3_axmodel/smollm3_p128_l29_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:23c7c5c20f76c684d5bf79186eb05590ad97112decf08e91198a0a4ab9906492
|
| 3 |
+
size 100186103
|
smollm3_axmodel/smollm3_p128_l2_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64eb8e9ab0a170b4ff6022258a46a464a579fe0d5e51d9cb8b029fc64c759407
|
| 3 |
+
size 100185175
|
smollm3_axmodel/smollm3_p128_l30_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5994aeb56eaae4d2fb5b076ef98789986b77b94d403cd622cf245e7f7db57eb0
|
| 3 |
+
size 100182455
|
smollm3_axmodel/smollm3_p128_l31_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:41bc5f7808e70253526f8e139e413b66bd3abefd85b142d33ebc31fe4155c470
|
| 3 |
+
size 100186807
|
smollm3_axmodel/smollm3_p128_l32_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7eb161e08d748a95296307e10b389cc8e1be31ae0b95bb74fff54e1573179fe
|
| 3 |
+
size 100185111
|
smollm3_axmodel/smollm3_p128_l33_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:12e69776df76335fceb2a6a8082aab158ec7902ac805e24af6d9447c854b70ba
|
| 3 |
+
size 100186615
|
smollm3_axmodel/smollm3_p128_l34_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:42642f4da1c70b6f63af9ce5cb8ffa1339b41401bbd6aec3cf9636ba8da36b2b
|
| 3 |
+
size 100182295
|
smollm3_axmodel/smollm3_p128_l35_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c81e256d4f4025394e7208cb77a520d969ec98a3504faabda289ada58b03e24
|
| 3 |
+
size 100187415
|
smollm3_axmodel/smollm3_p128_l3_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:201b4f41fc825554b1ed1ad5eed6247bf1a881c8ee096315c8ba8661b3c4f71c
|
| 3 |
+
size 100186455
|
smollm3_axmodel/smollm3_p128_l4_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a1cd3e0d86090339360b0e73d6de3c960b7007c73f072bc890f8bf51876c6e85
|
| 3 |
+
size 100185495
|
smollm3_axmodel/smollm3_p128_l5_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:47e755071c64e5bd89d80678ac27fb3ef7c8e8e1057e5acbd321bcaf66fffd86
|
| 3 |
+
size 100189751
|
smollm3_axmodel/smollm3_p128_l6_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d62ee3b10a488762485e1284fba7afef1c4c5418d0d684758e27e44726e678c2
|
| 3 |
+
size 100188279
|
smollm3_axmodel/smollm3_p128_l7_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fd503175a3017173e36d52130a99d64500a1c050c101f1aef6f26139717caa2f
|
| 3 |
+
size 100184983
|
smollm3_axmodel/smollm3_p128_l8_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d6b9b9632050c0ac1eb4bfb9547674bea32415bc2a422bdfcb8fddfa226ab9c1
|
| 3 |
+
size 100187319
|
smollm3_axmodel/smollm3_p128_l9_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:24e2faf3ce56ce9a623417ca6312a4d7b4283cf2737f17a1178251bae80a2d54
|
| 3 |
+
size 100187927
|
smollm3_axmodel/smollm3_post.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c04b9e34dc47fcd6e13c660c48a271eb5a107fb8616111974f11d336d80b685c
|
| 3 |
+
size 286865743
|
smolvlm3_tokenizer/.gitattributes
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
smolvlm3_tokenizer/README.md
ADDED
|
@@ -0,0 +1,368 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
library_name: transformers
|
| 3 |
+
license: apache-2.0
|
| 4 |
+
language:
|
| 5 |
+
- en
|
| 6 |
+
- fr
|
| 7 |
+
- es
|
| 8 |
+
- it
|
| 9 |
+
- pt
|
| 10 |
+
- zh
|
| 11 |
+
- ar
|
| 12 |
+
- ru
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# SmolLM3
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+

|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
## Table of Contents
|
| 23 |
+
|
| 24 |
+
1. [Model Summary](#model-summary)
|
| 25 |
+
2. [How to use](#how-to-use)
|
| 26 |
+
3. [Evaluation](#evaluation)
|
| 27 |
+
4. [Training](#training)
|
| 28 |
+
5. [Limitations](#limitations)
|
| 29 |
+
6. [License](#license)
|
| 30 |
+
|
| 31 |
+
## Model Summary
|
| 32 |
+
|
| 33 |
+
SmolLM3 is a 3B parameter language model designed to push the boundaries of small models. It supports 6 languages, advanced reasoning and long context. SmolLM3 is a fully open model that offers strong performance at the 3B–4B scale.
|
| 34 |
+
|
| 35 |
+

|
| 36 |
+
|
| 37 |
+
The model is a decoder-only transformer using GQA and NoPE (with 3:1 ratio), it was pretrained on 11.2T tokens with a staged curriculum of web, code, math and reasoning data. Post-training included midtraining on 140B reasoning tokens followed by supervised fine-tuning and alignment via Anchored Preference Optimization (APO).
|
| 38 |
+
|
| 39 |
+
### Key features
|
| 40 |
+
- Instruct model optimized for **hybrid reasoning**
|
| 41 |
+
- **Fully open model**: open weights + full training details including public data mixture and training configs
|
| 42 |
+
- **Long context:** Trained on 64k context and suppots up to **128k tokens** using YARN extrapolation
|
| 43 |
+
- **Multilingual**: 6 natively supported (English, French, Spanish, German, Italian, and Portuguese)
|
| 44 |
+
|
| 45 |
+
For more details refer to our blog post: https://hf.co/blog/smollm3
|
| 46 |
+
|
| 47 |
+
## How to use
|
| 48 |
+
|
| 49 |
+
The modeling code for SmolLM3 is available in transformers `v4.53.0`, so make sure to upgrade your transformers version. You can also load the model with the latest `vllm` which uses transformers as a backend.
|
| 50 |
+
```bash
|
| 51 |
+
pip install -U transformers
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
```python
|
| 55 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 56 |
+
|
| 57 |
+
model_name = "HuggingFaceTB/SmolLM3-3B"
|
| 58 |
+
device = "cuda" # for GPU usage or "cpu" for CPU usage
|
| 59 |
+
|
| 60 |
+
# load the tokenizer and the model
|
| 61 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 62 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 63 |
+
model_name,
|
| 64 |
+
).to(device)
|
| 65 |
+
|
| 66 |
+
# prepare the model input
|
| 67 |
+
prompt = "Give me a brief explanation of gravity in simple terms."
|
| 68 |
+
messages_think = [
|
| 69 |
+
{"role": "user", "content": prompt}
|
| 70 |
+
]
|
| 71 |
+
|
| 72 |
+
text = tokenizer.apply_chat_template(
|
| 73 |
+
messages_think,
|
| 74 |
+
tokenize=False,
|
| 75 |
+
add_generation_prompt=True,
|
| 76 |
+
)
|
| 77 |
+
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
|
| 78 |
+
|
| 79 |
+
# Generate the output
|
| 80 |
+
generated_ids = model.generate(**model_inputs, max_new_tokens=32768)
|
| 81 |
+
|
| 82 |
+
# Get and decode the output
|
| 83 |
+
output_ids = generated_ids[0][len(model_inputs.input_ids[0]) :]
|
| 84 |
+
print(tokenizer.decode(output_ids, skip_special_tokens=True))
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
>[!TIP]
|
| 88 |
+
> We recommend setting `temperature=0.6` and `top_p=0.95` in the sampling parameters.
|
| 89 |
+
|
| 90 |
+
### Long context processing
|
| 91 |
+
|
| 92 |
+
The current `config.json` is set for context length up to 65,536 tokens. To handle longer inputs (128k or 256k), we utilize YaRN: you can change `max_position_embeddings` and `rope_scaling` to:
|
| 93 |
+
```
|
| 94 |
+
{
|
| 95 |
+
...,
|
| 96 |
+
"rope_scaling": {
|
| 97 |
+
    "factor": 2.0, # 2 x 65536 = 131072
|
| 98 |
+
"original_max_position_embeddings": 65536,
|
| 99 |
+
"type": "yarn"
|
| 100 |
+
}
|
| 101 |
+
}
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
### Enabling and Disabling Extended Thinking Mode
|
| 106 |
+
|
| 107 |
+
We enable extended thinking by default, so the example above generates the output with a reasoning trace. To choose between the two modes, you can provide the `/think` and `/no_think` flags through the system prompt, as shown in the snippet below for extended thinking disabled. The code for generating the response with extended thinking would be the same except that the system prompt should have `/think` instead of `/no_think`.
|
| 108 |
+
|
| 109 |
+
```python
|
| 110 |
+
prompt = "Give me a brief explanation of gravity in simple terms."
|
| 111 |
+
messages = [
|
| 112 |
+
{"role": "system", "content": "/no_think"},
|
| 113 |
+
{"role": "user", "content": prompt}
|
| 114 |
+
]
|
| 115 |
+
|
| 116 |
+
text = tokenizer.apply_chat_template(
|
| 117 |
+
messages,
|
| 118 |
+
tokenize=False,
|
| 119 |
+
add_generation_prompt=True,
|
| 120 |
+
)
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
We also provide the option of specifying whether to use extended thinking through the `enable_thinking` kwarg as in the example below. You do not need to set the `/no_think` or `/think` flags through the system prompt if using the kwarg, but keep in mind that the flag in the system prompt overwrites the setting in the kwarg.
|
| 124 |
+
|
| 125 |
+
```python
|
| 126 |
+
prompt = "Give me a brief explanation of gravity in simple terms."
|
| 127 |
+
messages = [
|
| 128 |
+
{"role": "user", "content": prompt}
|
| 129 |
+
]
|
| 130 |
+
|
| 131 |
+
text = tokenizer.apply_chat_template(
|
| 132 |
+
messages,
|
| 133 |
+
tokenize=False,
|
| 134 |
+
add_generation_prompt=True,
|
| 135 |
+
enable_thinking=False
|
| 136 |
+
)
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
### Agentic Usage
|
| 140 |
+
|
| 141 |
+
SmolLM3 supports tool calling!
|
| 142 |
+
Just pass your list of tools:
|
| 143 |
+
- Under the argument `xml_tools` for standard tool-calling: these tools will be called as JSON blobs within XML tags, like `<tool_call>{"name": "get_weather", "arguments": {"city": "Copenhagen"}}</tool_call>`
|
| 144 |
+
- Or under `python_tools`: then the model will call tools like python functions in a `<code>` snippet, like `<code>get_weather(city="Copenhagen")</code>`
|
| 145 |
+
|
| 146 |
+
```python
|
| 147 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 148 |
+
|
| 149 |
+
checkpoint = "HuggingFaceTB/SmolLM3-3B"
|
| 150 |
+
|
| 151 |
+
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
| 152 |
+
model = AutoModelForCausalLM.from_pretrained(checkpoint)
|
| 153 |
+
|
| 154 |
+
tools = [
|
| 155 |
+
{
|
| 156 |
+
"name": "get_weather",
|
| 157 |
+
"description": "Get the weather in a city",
|
| 158 |
+
"parameters": {"type": "object", "properties": {"city": {"type": "string", "description": "The city to get the weather for"}}}}
|
| 159 |
+
]
|
| 160 |
+
|
| 161 |
+
messages = [
|
| 162 |
+
{
|
| 163 |
+
"role": "user",
|
| 164 |
+
"content": "Hello! How is the weather today in Copenhagen?"
|
| 165 |
+
}
|
| 166 |
+
]
|
| 167 |
+
|
| 168 |
+
inputs = tokenizer.apply_chat_template(
|
| 169 |
+
messages,
|
| 170 |
+
enable_thinking=False, # True works as well, your choice!
|
| 171 |
+
xml_tools=tools,
|
| 172 |
+
add_generation_prompt=True,
|
| 173 |
+
tokenize=True,
|
| 174 |
+
return_tensors="pt"
|
| 175 |
+
)
|
| 176 |
+
|
| 177 |
+
outputs = model.generate(inputs)
|
| 178 |
+
print(tokenizer.decode(outputs[0]))
|
| 179 |
+
```
|
| 180 |
+
|
| 181 |
+
### Using Custom System Instructions
|
| 182 |
+
|
| 183 |
+
You can specify custom instruction through the system prompt while controlling whether to use extended thinking. For example, the snippet below shows how to make the model speak like a pirate while enabling extended thinking.
|
| 184 |
+
|
| 185 |
+
```python
|
| 186 |
+
prompt = "Give me a brief explanation of gravity in simple terms."
|
| 187 |
+
messages = [
|
| 188 |
+
{"role": "system", "content": "Speak like a pirate./think"},
|
| 189 |
+
{"role": "user", "content": prompt}
|
| 190 |
+
]
|
| 191 |
+
|
| 192 |
+
text = tokenizer.apply_chat_template(
|
| 193 |
+
messages,
|
| 194 |
+
tokenize=False,
|
| 195 |
+
add_generation_prompt=True,
|
| 196 |
+
)
|
| 197 |
+
```
|
| 198 |
+
|
| 199 |
+
For local inference, you can use `llama.cpp`, `ONNX`, `MLX` and `MLC`. You can find quantized checkpoints in this collection (https://huggingface.co/collections/HuggingFaceTB/smollm3-686d33c1fdffe8e635317e23)
|
| 200 |
+
|
| 201 |
+
### vLLM and SGLang
|
| 202 |
+
|
| 203 |
+
You can use vLLM and SGLang to deploy the model in an API compatible with OpenAI format.
|
| 204 |
+
|
| 205 |
+
#### SGLang
|
| 206 |
+
|
| 207 |
+
```bash
|
| 208 |
+
python -m sglang.launch_server --model-path HuggingFaceTB/SmolLM3-3B
|
| 209 |
+
```
|
| 210 |
+
|
| 211 |
+
#### vLLM
|
| 212 |
+
|
| 213 |
+
```bash
|
| 214 |
+
vllm serve HuggingFaceTB/SmolLM3-3B --enable-auto-tool-choice --tool-call-parser=hermes
|
| 215 |
+
```
|
| 216 |
+
|
| 217 |
+
#### Setting `chat_template_kwargs`
|
| 218 |
+
|
| 219 |
+
You can specify `chat_template_kwargs` such as `enable_thinking` to a deployed model by passing the `chat_template_kwargs` parameter in the API request.
|
| 220 |
+
|
| 221 |
+
```bash
|
| 222 |
+
curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
|
| 223 |
+
"model": "HuggingFaceTB/SmolLM3-3B",
|
| 224 |
+
"messages": [
|
| 225 |
+
{"role": "user", "content": "Give me a brief explanation of gravity in simple terms."}
|
| 226 |
+
],
|
| 227 |
+
"temperature": 0.6,
|
| 228 |
+
"top_p": 0.95,
|
| 229 |
+
"max_tokens": 16384,
|
| 230 |
+
"chat_template_kwargs": {"enable_thinking": false}
|
| 231 |
+
}'
|
| 232 |
+
```
|
| 233 |
+
|
| 234 |
+
## Evaluation
|
| 235 |
+
|
| 236 |
+
In this section, we report the evaluation results of SmolLM3 model. All evaluations are zero-shot unless stated otherwise, and we use [lighteval](https://github.com/huggingface/lighteval) to run them.
|
| 237 |
+
|
| 238 |
+
We highlight the best score in bold and underline the second-best score.
|
| 239 |
+
|
| 240 |
+
### Instruction Model
|
| 241 |
+
|
| 242 |
+
#### No Extended Thinking
|
| 243 |
+
Evaluation results of non-reasoning models and reasoning models in no thinking mode. We highlight the best score in bold and underline the second-best score.
|
| 244 |
+
| Category | Metric | SmoLLM3-3B | Qwen2.5-3B | Llama3.1-3B | Qwen3-1.7B | Qwen3-4B |
|
| 245 |
+
|---------|--------|------------|------------|-------------|------------|----------|
|
| 246 |
+
| High school math competition | AIME 2025 | <u>9.3</u> | 2.9 | 0.3 | 8.0 | **17.1** |
|
| 247 |
+
| Math problem-solving | GSM-Plus | 72.8 | <u>74.1</u> | 59.2 | 68.3 | **82.1** |
|
| 248 |
+
| Competitive programming | LiveCodeBench v4 | <u>15.2</u> | 10.5 | 3.4 | 15.0 | **24.9** |
|
| 249 |
+
| Graduate-level reasoning | GPQA Diamond | <u>35.7</u> | 32.2 | 29.4 | 31.8 | **44.4** |
|
| 250 |
+
| Instruction following | IFEval | **76.7** | 65.6 | 71.6 | <u>74.0</u> | 68.9 |
|
| 251 |
+
| Alignment | MixEval Hard | 26.9 | <u>27.6</u> | 24.9 | 24.3 | **31.6** |
|
| 252 |
+
| Tool Calling | BFCL| <u>92.3</u> | - | <u>92.3</u> * | 89.5 | **95.0** |
|
| 253 |
+
| Multilingual Q&A | Global MMLU | <u>53.5</u> | 50.54 | 46.8 | 49.5 | **65.1** |
|
| 254 |
+
|
| 255 |
+
(*): this is a tool calling finetune
|
| 256 |
+
|
| 257 |
+
#### Extended Thinking
|
| 258 |
+
Evaluation results in reasoning mode for SmolLM3 and Qwen3 models:
|
| 259 |
+
| Category | Metric | SmoLLM3-3B | Qwen3-1.7B | Qwen3-4B |
|
| 260 |
+
|---------|--------|------------|------------|----------|
|
| 261 |
+
| High school math competition | AIME 2025 | <u>36.7</u> | 30.7 | **58.8** |
|
| 262 |
+
| Math problem-solving | GSM-Plus | <u>83.4</u> | 79.4 | **88.2** |
|
| 263 |
+
| Competitive programming | LiveCodeBench v4 | 30.0 | <u>34.4</u> | **52.9** |
|
| 264 |
+
| Graduate-level reasoning | GPQA Diamond | <u>41.7</u> | 39.9 | **55.3** |
|
| 265 |
+
| Instruction following | IFEval | 71.2 | <u>74.2</u> | **85.4** |
|
| 266 |
+
| Alignment | MixEval Hard | 30.8 | <u>33.9</u> | **38.0** |
|
| 267 |
+
| Tool Calling | BFCL | <u>88.8</u> | <u>88.8</u> | **95.5** |
|
| 268 |
+
| Multilingual Q&A | Global MMLU | <u>64.1</u> | 62.3 | **73.3** |
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
### Base Pre-Trained Model
|
| 272 |
+
|
| 273 |
+
#### English benchmarks
|
| 274 |
+
Note: All evaluations are zero-shot unless stated otherwise. For Ruler 64k evaluation, we apply YaRN to the Qwen models with 32k context to extrapolate the context length.
|
| 275 |
+
|
| 276 |
+
| Category | Metric | SmolLM3-3B | Qwen2.5-3B | Llama3-3.2B | Qwen3-1.7B-Base | Qwen3-4B-Base |
|
| 277 |
+
|---------|--------|---------------------|------------|--------------|------------------|---------------|
|
| 278 |
+
| Reasoning & Commonsense| HellaSwag | **76.15** | 74.19 |<u>75.52</u> | 60.52 | 74.37 |
|
| 279 |
+
| | ARC-CF (Average) | **65.61** | 59.81 | 58.58 | 55.88 | <u>62.11</u> |
|
| 280 |
+
| | Winogrande | 58.88 | **61.41** | 58.72 | 57.06 | <u>59.59</u> |
|
| 281 |
+
| | CommonsenseQA | <u>55.28</u> | 49.14 | **60.60** | 48.98 | 52.99 |
|
| 282 |
+
| Knowledge & Understanding | MMLU-CF (Average) | <u>44.13</u> | 42.93 | 41.32 | 39.11 | **47.65** |
|
| 283 |
+
| | MMLU Pro CF | <u>19.61</u> | 16.66 | 16.42 | 18.04 | **24.92** |
|
| 284 |
+
| | MMLU Pro MCF | <u>32.70</u> | 31.32 | 25.07 | 30.39 | **41.07** |
|
| 285 |
+
| | PIQA | **78.89** | 78.35 | <u>78.51</u> | 75.35 | 77.58 |
|
| 286 |
+
| | OpenBookQA | 40.60 | 40.20 | <u>42.00</u> | 36.40 | **42.40** |
|
| 287 |
+
| | BoolQ | **78.99** | 73.61 | <u>75.33</u> | 74.46 | 74.28 |
|
| 288 |
+
| **Math & Code** | | | | | | |
|
| 289 |
+
| Coding & math | HumanEval+ | 30.48 | 34.14| 25.00 | <u>43.29</u>| **54.87** |
|
| 290 |
+
| | MBPP+ | 52.91 | 52.11 | 38.88| <u>59.25</u> | **63.75** |
|
| 291 |
+
| | MATH (4-shot) | <u>46.10</u> | 40.10 | 7.44 | 41.64 | **51.20** |
|
| 292 |
+
| | GSM8k (5-shot) | 67.63 | <u>70.13</u> | 25.92 | 65.88 | **74.14** |
|
| 293 |
+
| **Long context** | | | | | | |
|
| 294 |
+
| | Ruler 32k | 76.35 | 75.93 | <u>77.58</u> | 70.63 | **83.98** |
|
| 295 |
+
| | Ruler 64k | <u>67.85</u> | 64.90 | **72.93** | 57.18 | 60.29 |
|
| 296 |
+
| | Ruler 128k | 61.03 | <u>62.23</u> | **71.30** | 43.03 | 47.23 |
|
| 297 |
+
|
| 298 |
+
#### Multilingual benchmarks
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
| Category | Metric | SmolLM3 3B Base | Qwen2.5-3B | Llama3.2 3B | Qwen3 1.7B Base | Qwen3 4B Base |
|
| 302 |
+
|---------|--------|---------------------|------------|--------------|------------------|---------------|
|
| 303 |
+
| Main supported languages | | | | | | | |
|
| 304 |
+
| French| MLMM Hellaswag | **63.94** | 57.47 | 57.66 | 51.26 | <u>61.00</u> |
|
| 305 |
+
| | Belebele | 51.00 | <u>51.55</u> | 49.22 |49.44| **55.00** |
|
| 306 |
+
| | Global MMLU (CF) | <u>38.37</u> | 34.22 | 33.71 | 34.94 |**41.80** |
|
| 307 |
+
| | Flores-200 (5-shot) | 62.85| 61.38| <u>62.89</u> | 58.68 | **65.76** |
|
| 308 |
+
| Spanish| MLMM Hellaswag | **65.85** | 58.25 | 59.39 | 52.40 | <u>61.85</u> |
|
| 309 |
+
| | Belebele | 47.00 | <u>48.88</u> | 47.00 | 47.56 | **50.33** |
|
| 310 |
+
| | Global MMLU (CF) | <u>38.51</u> | 35.84 | 35.60 | 34.79 |**41.22** |
|
| 311 |
+
| | Flores-200 (5-shot) | <u>48.25</u>| 50.00| 44.45 | 46.93 | **50.16** |
|
| 312 |
+
| German| MLMM Hellaswag | **59.56** | 49.99| 53.19|46.10| <u>56.43</u>|
|
| 313 |
+
| | Belebele | <u>48.44</u> | 47.88 | 46.22 | 48.00 | **53.44**|
|
| 314 |
+
| | Global MMLU (CF) | <u>35.10</u> | 33.19 | 32.60 | 32.73 |**38.70** |
|
| 315 |
+
| | Flores-200 (5-shot) | **56.60**| 50.63| <u>54.95</u> | 52.58 | 50.48 |
|
| 316 |
+
| Italian| MLMM Hellaswag | **62.49** | 53.21 | 54.96 | 48.72 | <u>58.76</u> |
|
| 317 |
+
| | Belebele | <u>46.44</u> | 44.77 | 43.88 | 44.00 | **48.78** | 44.88 |
|
| 318 |
+
| | Global MMLU (CF) | <u>36.99</u> | 33.91 | 32.79 | 35.37 |**39.26** |
|
| 319 |
+
| | Flores-200 (5-shot) | <u>52.65</u>| **54.87**| 48.83 | 48.37 | 49.11 |
|
| 320 |
+
| Portuguese| MLMM Hellaswag | **63.22** | 57.38 | 56.84 | 50.73 | <u>59.89</u> |
|
| 321 |
+
| | Belebele | 47.67 | **49.22** | 45.00 | 44.00 | 50.00 | <u>49.00</u> |
|
| 322 |
+
| | Global MMLU (CF) | <u>36.88</u> | 34.72 | 33.05 | 35.26 |**40.66** |
|
| 323 |
+
| | Flores-200 (5-shot) | <u>60.93</u> |57.68| 54.28 | 56.58 | **63.43** |
|
| 324 |
+
|
| 325 |
+
The model has also been trained on Arabic (standard), Chinese and Russian data, but has seen fewer tokens in these languages compared to the 6 above. We report the performance on these languages for information.
|
| 326 |
+
| Category | Metric | SmolLM3 3B Base | Qwen2.5-3B | Llama3.2 3B | Qwen3 1.7B Base | Qwen3 4B Base |
|
| 327 |
+
|---------|--------|---------------------|------------|--------------|------------------|---------------|
|
| 328 |
+
| Other supported languages | | | | | | | |
|
| 329 |
+
| Arabic| Belebele | 40.22 | 44.22 | <u>45.33</u> | 42.33 | **51.78** |
|
| 330 |
+
| | Global MMLU (CF) | 28.57 | 28.81 | 27.67 | <u>29.37</u> | **31.85** |
|
| 331 |
+
| | Flores-200 (5-shot) | <u>40.22</u> | 39.44 | **44.43** | 35.82 | 39.76 |
|
| 332 |
+
| Chinese| Belebele | 43.78 | 44.56 | <u>49.56</u> | 48.78 | **53.22** |
|
| 333 |
+
| | Global MMLU (CF) | 36.16 | 33.79 | <u>39.57</u> | 38.56 | **44.55** |
|
| 334 |
+
| | Flores-200 (5-shot) | 29.17 | **33.21** | 31.89 | 25.70 | <u>32.50</u> |
|
| 335 |
+
| Russian| Belebele | <u>47.44</u> | 45.89 | <u>47.44</u> | 45.22 | **51.44** |
|
| 336 |
+
| | Global MMLU (CF) | <u>36.51</u> | 32.47 | 34.52 | 34.83 | **38.80** |
|
| 337 |
+
| | Flores-200 (5-shot) | 47.13 | 48.74 | 50.74 | <u>54.70</u> | **60.53** |
|
| 338 |
+
|
| 339 |
+
## Training
|
| 340 |
+
|
| 341 |
+
### Model
|
| 342 |
+
|
| 343 |
+
- **Architecture:** Transformer decoder
|
| 344 |
+
- **Pretraining tokens:** 11T
|
| 345 |
+
- **Precision:** bfloat16
|
| 346 |
+
|
| 347 |
+
### Software & hardware
|
| 348 |
+
|
| 349 |
+
- **GPUs:** 384 H100
|
| 350 |
+
- **Training Framework:** [nanotron](https://github.com/huggingface/nanotron/tree/smollm3)
|
| 351 |
+
- **Data processing framework:** [datatrove](https://github.com/huggingface/datatrove)
|
| 352 |
+
- **Evaluation framework:** [lighteval](https://github.com/huggingface/lighteval)
|
| 353 |
+
- **Post-training Framework:** [TRL](https://github.com/huggingface/trl)
|
| 354 |
+
|
| 355 |
+
### Open resources
|
| 356 |
+
Here is an infographic with all the training details
|
| 357 |
+
- The datasets used for pretraining can be found in this [collection](https://huggingface.co/collections/HuggingFaceTB/smollm3-pretraining-datasets-685a7353fdc01aecde51b1d9) and those used in mid-training and post-training will be uploaded later
|
| 358 |
+
- The training and evaluation configs and code can be found in the [huggingface/smollm](https://github.com/huggingface/smollm) repository.
|
| 359 |
+
|
| 360 |
+

|
| 361 |
+
|
| 362 |
+
## Limitations
|
| 363 |
+
|
| 364 |
+
SmolLM3 can produce text on a variety of topics, but the generated content may not always be factually accurate, logically consistent, or free from biases present in the training data. These models should be used as assistive tools rather than definitive sources of information. Users should always verify important information and critically evaluate any generated content.
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
## License
|
| 368 |
+
[Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0)
|
smolvlm3_tokenizer/chat_template.jinja
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{# ───── defaults ───── #}
|
| 2 |
+
{%- if enable_thinking is not defined -%}
|
| 3 |
+
{%- set enable_thinking = true -%}
|
| 4 |
+
{%- endif -%}
|
| 5 |
+
|
| 6 |
+
{# ───── reasoning mode ───── #}
|
| 7 |
+
{%- if enable_thinking -%}
|
| 8 |
+
{%- set reasoning_mode = "/think" -%}
|
| 9 |
+
{%- else -%}
|
| 10 |
+
{%- set reasoning_mode = "/no_think" -%}
|
| 11 |
+
{%- endif -%}
|
| 12 |
+
|
| 13 |
+
{# ───── header (system message) ───── #}
|
| 14 |
+
{{- "<|im_start|>system\n" -}}
|
| 15 |
+
|
| 16 |
+
{%- if messages[0].role == "system" -%}
|
| 17 |
+
{%- set system_message = messages[0].content -%}
|
| 18 |
+
{%- if "/no_think" in system_message -%}
|
| 19 |
+
{%- set reasoning_mode = "/no_think" -%}
|
| 20 |
+
{%- elif "/think" in system_message -%}
|
| 21 |
+
{%- set reasoning_mode = "/think" -%}
|
| 22 |
+
{%- endif -%}
|
| 23 |
+
{%- set custom_instructions = system_message.replace("/no_think", "").replace("/think", "").rstrip() -%}
|
| 24 |
+
{%- endif -%}
|
| 25 |
+
|
| 26 |
+
{%- if "/system_override" in system_message -%}
|
| 27 |
+
{{- custom_instructions.replace("/system_override", "").rstrip() -}}
|
| 28 |
+
{{- "<|im_end|>\n" -}}
|
| 29 |
+
{%- else -%}
|
| 30 |
+
{{- "## Metadata\n\n" -}}
|
| 31 |
+
{{- "Knowledge Cutoff Date: June 2025\n" -}}
|
| 32 |
+
{%- set today = strftime_now("%d %B %Y") -%}
|
| 33 |
+
{{- "Today Date: " ~ today ~ "\n" -}}
|
| 34 |
+
{{- "Reasoning Mode: " + reasoning_mode + "\n\n" -}}
|
| 35 |
+
|
| 36 |
+
{{- "## Custom Instructions\n\n" -}}
|
| 37 |
+
{%- if custom_instructions -%}
|
| 38 |
+
{{- custom_instructions + "\n\n" -}}
|
| 39 |
+
{%- elif reasoning_mode == "/think" -%}
|
| 40 |
+
{{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracking, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> Thought section </think> Solution section. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion.\n\n" -}}
|
| 41 |
+
{%- else -%}
|
| 42 |
+
{{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face.\n\n" -}}
|
| 43 |
+
{%- endif -%}
|
| 44 |
+
|
| 45 |
+
{%- if xml_tools or python_tools or tools -%}
|
| 46 |
+
{{- "### Tools\n\n" -}}
|
| 47 |
+
{%- if xml_tools or tools -%}
|
| 48 |
+
{%- if tools -%}
|
| 49 |
+
{%- set xml_tools = tools -%}
|
| 50 |
+
{%- endif -%}
|
| 51 |
+
{%- set ns = namespace(xml_tool_string="You may call one or more functions to assist with the user query.\nYou are provided with function signatures within <tools></tools> XML tags:\n\n<tools>\n") -%}
|
| 52 |
+
{%- for tool in xml_tools[:] -%} {# The slicing makes sure that xml_tools is a list #}
|
| 53 |
+
{%- set ns.xml_tool_string = ns.xml_tool_string ~ (tool | string) ~ "\n" -%}
|
| 54 |
+
{%- endfor -%}
|
| 55 |
+
{%- set xml_tool_string = ns.xml_tool_string + "</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>" -%}
|
| 56 |
+
{{- xml_tool_string -}}
|
| 57 |
+
{%- endif -%}
|
| 58 |
+
{%- if python_tools -%}
|
| 59 |
+
{%- set ns = namespace(python_tool_string="When you send a message containing Python code between '<code>' and '</code>' tags, it will be executed in a stateful Jupyter notebook environment, and you will then be given the output to continued reasoning in an agentic loop.\n\nYou can use the following tools in your python code like regular functions:\n<tools>\n") -%}
|
| 60 |
+
{%- for tool in python_tools[:] -%} {# The slicing makes sure that python_tools is a list #}
|
| 61 |
+
{%- set ns.python_tool_string = ns.python_tool_string ~ (tool | string) ~ "\n" -%}
|
| 62 |
+
{%- endfor -%}
|
| 63 |
+
{%- set python_tool_string = ns.python_tool_string + "</tools>\n\nThe state persists between code executions: so variables that you define in one step are still available thereafter." -%}
|
| 64 |
+
{{- python_tool_string -}}
|
| 65 |
+
{%- endif -%}
|
| 66 |
+
{{- "\n\n" -}}
|
| 67 |
+
{{- "<|im_end|>\n" -}}
|
| 68 |
+
{%- endif -%}
|
| 69 |
+
{%- endif -%}
|
| 70 |
+
{# ───── main loop ───── #}
|
| 71 |
+
{%- for message in messages -%}
|
| 72 |
+
{%- set content = message.content if message.content is string else "" -%}
|
| 73 |
+
{%- if message.role == "user" -%}
|
| 74 |
+
{{ "<|im_start|>" + message.role + "\n" + content + "<|im_end|>\n" }}
|
| 75 |
+
{%- elif message.role == "assistant" -%}
|
| 76 |
+
{% generation %}
|
| 77 |
+
{%- if reasoning_mode == "/think" -%}
|
| 78 |
+
{{ "<|im_start|>assistant\n" + content.lstrip("\n") + "<|im_end|>\n" }}
|
| 79 |
+
{%- else -%}
|
| 80 |
+
{{ "<|im_start|>assistant\n" + "<think>\n\n</think>\n" + content.lstrip("\n") + "<|im_end|>\n" }}
|
| 81 |
+
{%- endif -%}
|
| 82 |
+
{% endgeneration %}
|
| 83 |
+
{%- elif message.role == "tool" -%}
|
| 84 |
+
{{ "<|im_start|>" + "user\n" + content + "<|im_end|>\n" }}
|
| 85 |
+
{%- endif -%}
|
| 86 |
+
{%- endfor -%}
|
| 87 |
+
{# ───── generation prompt ───── #}
|
| 88 |
+
{%- if add_generation_prompt -%}
|
| 89 |
+
{%- if reasoning_mode == "/think" -%}
|
| 90 |
+
{{ "<|im_start|>assistant\n" }}
|
| 91 |
+
{%- else -%}
|
| 92 |
+
{{ "<|im_start|>assistant\n" + "<think>\n\n</think>\n" }}
|
| 93 |
+
{%- endif -%}
|
| 94 |
+
{%- endif -%}
|
smolvlm3_tokenizer/config.json
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"SmolLM3ForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 128000,
|
| 8 |
+
"eos_token_id": 128012,
|
| 9 |
+
"hidden_act": "silu",
|
| 10 |
+
"hidden_size": 2048,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"intermediate_size": 11008,
|
| 13 |
+
"layer_types": [
|
| 14 |
+
"full_attention",
|
| 15 |
+
"full_attention",
|
| 16 |
+
"full_attention",
|
| 17 |
+
"full_attention",
|
| 18 |
+
"full_attention",
|
| 19 |
+
"full_attention",
|
| 20 |
+
"full_attention",
|
| 21 |
+
"full_attention",
|
| 22 |
+
"full_attention",
|
| 23 |
+
"full_attention",
|
| 24 |
+
"full_attention",
|
| 25 |
+
"full_attention",
|
| 26 |
+
"full_attention",
|
| 27 |
+
"full_attention",
|
| 28 |
+
"full_attention",
|
| 29 |
+
"full_attention",
|
| 30 |
+
"full_attention",
|
| 31 |
+
"full_attention",
|
| 32 |
+
"full_attention",
|
| 33 |
+
"full_attention",
|
| 34 |
+
"full_attention",
|
| 35 |
+
"full_attention",
|
| 36 |
+
"full_attention",
|
| 37 |
+
"full_attention",
|
| 38 |
+
"full_attention",
|
| 39 |
+
"full_attention",
|
| 40 |
+
"full_attention",
|
| 41 |
+
"full_attention",
|
| 42 |
+
"full_attention",
|
| 43 |
+
"full_attention",
|
| 44 |
+
"full_attention",
|
| 45 |
+
"full_attention",
|
| 46 |
+
"full_attention",
|
| 47 |
+
"full_attention",
|
| 48 |
+
"full_attention",
|
| 49 |
+
"full_attention"
|
| 50 |
+
],
|
| 51 |
+
"max_position_embeddings": 65536,
|
| 52 |
+
"max_window_layers": 28,
|
| 53 |
+
"mlp_bias": false,
|
| 54 |
+
"model_type": "smollm3",
|
| 55 |
+
"no_rope_layer_interval": 4,
|
| 56 |
+
"no_rope_layers": [
|
| 57 |
+
1,
|
| 58 |
+
1,
|
| 59 |
+
1,
|
| 60 |
+
0,
|
| 61 |
+
1,
|
| 62 |
+
1,
|
| 63 |
+
1,
|
| 64 |
+
0,
|
| 65 |
+
1,
|
| 66 |
+
1,
|
| 67 |
+
1,
|
| 68 |
+
0,
|
| 69 |
+
1,
|
| 70 |
+
1,
|
| 71 |
+
1,
|
| 72 |
+
0,
|
| 73 |
+
1,
|
| 74 |
+
1,
|
| 75 |
+
1,
|
| 76 |
+
0,
|
| 77 |
+
1,
|
| 78 |
+
1,
|
| 79 |
+
1,
|
| 80 |
+
0,
|
| 81 |
+
1,
|
| 82 |
+
1,
|
| 83 |
+
1,
|
| 84 |
+
0,
|
| 85 |
+
1,
|
| 86 |
+
1,
|
| 87 |
+
1,
|
| 88 |
+
0,
|
| 89 |
+
1,
|
| 90 |
+
1,
|
| 91 |
+
1,
|
| 92 |
+
0
|
| 93 |
+
],
|
| 94 |
+
"num_attention_heads": 16,
|
| 95 |
+
"num_hidden_layers": 36,
|
| 96 |
+
"num_key_value_heads": 4,
|
| 97 |
+
"pad_token_id": 128004,
|
| 98 |
+
"pretraining_tp": 2,
|
| 99 |
+
"rms_norm_eps": 1e-06,
|
| 100 |
+
"rope_scaling": null,
|
| 101 |
+
"rope_theta": 5000000.0,
|
| 102 |
+
"sliding_window": null,
|
| 103 |
+
"torch_dtype": "bfloat16",
|
| 104 |
+
"transformers_version": "4.54.0.dev0",
|
| 105 |
+
"use_cache": false,
|
| 106 |
+
"use_sliding_window": false,
|
| 107 |
+
"vocab_size": 128256
|
| 108 |
+
}
|
smolvlm3_tokenizer/generation_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 128000,
|
| 3 |
+
"eos_token_id": 128012,
|
| 4 |
+
"pad_token_id": 128004,
|
| 5 |
+
"transformers_version": "4.54.0.dev0",
|
| 6 |
+
"temperature": 0.6,
|
| 7 |
+
"top_p": 0.95,
|
| 8 |
+
"do_sample": true
|
| 9 |
+
}
|
smolvlm3_tokenizer/model.safetensors.index.json
ADDED
|
@@ -0,0 +1,334 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"total_parameters": 3075098624,
|
| 4 |
+
"total_size": 6150197248
|
| 5 |
+
},
|
| 6 |
+
"weight_map": {
|
| 7 |
+
"model.embed_tokens.weight": "model-00001-of-00002.safetensors",
|
| 8 |
+
"model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 9 |
+
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 10 |
+
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 11 |
+
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 12 |
+
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 13 |
+
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 14 |
+
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 15 |
+
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 16 |
+
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 17 |
+
"model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 18 |
+
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 19 |
+
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 20 |
+
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 21 |
+
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 22 |
+
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 23 |
+
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 24 |
+
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 25 |
+
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 26 |
+
"model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 27 |
+
"model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 28 |
+
"model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 29 |
+
"model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 30 |
+
"model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 31 |
+
"model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 32 |
+
"model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 33 |
+
"model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 34 |
+
"model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 35 |
+
"model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 36 |
+
"model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 37 |
+
"model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 38 |
+
"model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 39 |
+
"model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 40 |
+
"model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 41 |
+
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 42 |
+
"model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 43 |
+
"model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 44 |
+
"model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 45 |
+
"model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 46 |
+
"model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 47 |
+
"model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 48 |
+
"model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 49 |
+
"model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 50 |
+
"model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 51 |
+
"model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 52 |
+
"model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 53 |
+
"model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 54 |
+
"model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 55 |
+
"model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 56 |
+
"model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 57 |
+
"model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 58 |
+
"model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 59 |
+
"model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 60 |
+
"model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 61 |
+
"model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 62 |
+
"model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 63 |
+
"model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 64 |
+
"model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 65 |
+
"model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 66 |
+
"model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 67 |
+
"model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 68 |
+
"model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 69 |
+
"model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 70 |
+
"model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 71 |
+
"model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 72 |
+
"model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 73 |
+
"model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 74 |
+
"model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 75 |
+
"model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 76 |
+
"model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 77 |
+
"model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 78 |
+
"model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 79 |
+
"model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 80 |
+
"model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 81 |
+
"model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 82 |
+
"model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 83 |
+
"model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 84 |
+
"model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 85 |
+
"model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 86 |
+
"model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 87 |
+
"model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 88 |
+
"model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 89 |
+
"model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 90 |
+
"model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 91 |
+
"model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 92 |
+
"model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 93 |
+
"model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 94 |
+
"model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 95 |
+
"model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 96 |
+
"model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 97 |
+
"model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 98 |
+
"model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 99 |
+
"model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 100 |
+
"model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 101 |
+
"model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 102 |
+
"model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 103 |
+
"model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 104 |
+
"model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 105 |
+
"model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 106 |
+
"model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 107 |
+
"model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 108 |
+
"model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 109 |
+
"model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 110 |
+
"model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 111 |
+
"model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 112 |
+
"model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 113 |
+
"model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 114 |
+
"model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 115 |
+
"model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 116 |
+
"model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 117 |
+
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 118 |
+
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 119 |
+
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 120 |
+
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 121 |
+
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 122 |
+
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 123 |
+
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 124 |
+
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 125 |
+
"model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 126 |
+
"model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 127 |
+
"model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 128 |
+
"model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 129 |
+
"model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 130 |
+
"model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 131 |
+
"model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 132 |
+
"model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 133 |
+
"model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 134 |
+
"model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 135 |
+
"model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 136 |
+
"model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 137 |
+
"model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 138 |
+
"model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 139 |
+
"model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 140 |
+
"model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 141 |
+
"model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 142 |
+
"model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 143 |
+
"model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 144 |
+
"model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 145 |
+
"model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 146 |
+
"model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 147 |
+
"model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 148 |
+
"model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 149 |
+
"model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 150 |
+
"model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 151 |
+
"model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 152 |
+
"model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 153 |
+
"model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 154 |
+
"model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 155 |
+
"model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 156 |
+
"model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 157 |
+
"model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 158 |
+
"model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 159 |
+
"model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 160 |
+
"model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 161 |
+
"model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 162 |
+
"model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 163 |
+
"model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 164 |
+
"model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 165 |
+
"model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 166 |
+
"model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 167 |
+
"model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 168 |
+
"model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 169 |
+
"model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 170 |
+
"model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 171 |
+
"model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 172 |
+
"model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 173 |
+
"model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 174 |
+
"model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 175 |
+
"model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 176 |
+
"model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 177 |
+
"model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 178 |
+
"model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 179 |
+
"model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 180 |
+
"model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 181 |
+
"model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 182 |
+
"model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 183 |
+
"model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 184 |
+
"model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 185 |
+
"model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 186 |
+
"model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 187 |
+
"model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 188 |
+
"model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 189 |
+
"model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 190 |
+
"model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 191 |
+
"model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 192 |
+
"model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 193 |
+
"model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 194 |
+
"model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 195 |
+
"model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 196 |
+
"model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 197 |
+
"model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 198 |
+
"model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 199 |
+
"model.layers.28.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 200 |
+
"model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 201 |
+
"model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 202 |
+
"model.layers.28.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 203 |
+
"model.layers.28.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 204 |
+
"model.layers.28.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 205 |
+
"model.layers.28.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 206 |
+
"model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 207 |
+
"model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 208 |
+
"model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 209 |
+
"model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 210 |
+
"model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 211 |
+
"model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 212 |
+
"model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 213 |
+
"model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 214 |
+
"model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 215 |
+
"model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 216 |
+
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 217 |
+
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 218 |
+
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 219 |
+
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 220 |
+
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 221 |
+
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 222 |
+
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 223 |
+
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 224 |
+
"model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 225 |
+
"model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 226 |
+
"model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 227 |
+
"model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 228 |
+
"model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 229 |
+
"model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 230 |
+
"model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 231 |
+
"model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 232 |
+
"model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 233 |
+
"model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 234 |
+
"model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 235 |
+
"model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 236 |
+
"model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 237 |
+
"model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 238 |
+
"model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 239 |
+
"model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 240 |
+
"model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 241 |
+
"model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 242 |
+
"model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 243 |
+
"model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 244 |
+
"model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 245 |
+
"model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 246 |
+
"model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 247 |
+
"model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 248 |
+
"model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 249 |
+
"model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 250 |
+
"model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 251 |
+
"model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 252 |
+
"model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 253 |
+
"model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 254 |
+
"model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 255 |
+
"model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 256 |
+
"model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 257 |
+
"model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 258 |
+
"model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 259 |
+
"model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 260 |
+
"model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 261 |
+
"model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 262 |
+
"model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 263 |
+
"model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 264 |
+
"model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 265 |
+
"model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 266 |
+
"model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 267 |
+
"model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 268 |
+
"model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 269 |
+
"model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 270 |
+
"model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 271 |
+
"model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 272 |
+
"model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 273 |
+
"model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 274 |
+
"model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 275 |
+
"model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 276 |
+
"model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 277 |
+
"model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 278 |
+
"model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 279 |
+
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 280 |
+
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 281 |
+
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 282 |
+
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 283 |
+
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 284 |
+
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 285 |
+
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 286 |
+
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 287 |
+
"model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 288 |
+
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 289 |
+
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 290 |
+
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 291 |
+
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 292 |
+
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 293 |
+
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 294 |
+
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 295 |
+
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 296 |
+
"model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 297 |
+
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 298 |
+
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 299 |
+
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 300 |
+
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 301 |
+
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 302 |
+
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 303 |
+
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 304 |
+
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 305 |
+
"model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 306 |
+
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 307 |
+
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 308 |
+
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 309 |
+
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 310 |
+
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 311 |
+
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 312 |
+
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 313 |
+
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 314 |
+
"model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 315 |
+
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 316 |
+
"model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 317 |
+
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 318 |
+
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 319 |
+
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 320 |
+
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 321 |
+
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 322 |
+
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 323 |
+
"model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 324 |
+
"model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 325 |
+
"model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 326 |
+
"model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 327 |
+
"model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 328 |
+
"model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 329 |
+
"model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 330 |
+
"model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 331 |
+
"model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 332 |
+
"model.norm.weight": "model-00002-of-00002.safetensors"
|
| 333 |
+
}
|
| 334 |
+
}
|
smolvlm3_tokenizer/notebook.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|