Commit ·
766ec4a
0
Parent(s):
Duplicate from g-ntovas/Qwen3.5-0.8B-LiteRT
Browse filesCo-authored-by: John <g-ntovas@users.noreply.huggingface.co>
- .gitattributes +39 -0
- README.md +161 -0
- inference_tflite.py +216 -0
- qwen35_embedder_q8.tflite +3 -0
- qwen35_mm_q8_ekv2048.litertlm +3 -0
- qwen35_mm_q8_ekv2048.tflite +3 -0
- qwen35_vision_adapter_q8.tflite +3 -0
- qwen35_vision_encoder_q8.tflite +3 -0
- tokenizer.json +3 -0
- tokenizer_config.json +305 -0
.gitattributes
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
qwen35_q8_ekv2048.litertlm filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
qwen35_mm_q8_ekv2048.litertlm filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
qwen35_mm_q4_block32_ekv4096.litertlm filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
base_model:
|
| 4 |
+
- Qwen/Qwen3.5-0.8B
|
| 5 |
+
pipeline_tag: image-text-to-text
|
| 6 |
+
library_name: litert-lm
|
| 7 |
+
tags:
|
| 8 |
+
- Qwen3.5
|
| 9 |
+
- litert
|
| 10 |
+
- litert-lm
|
| 11 |
+
- tflite
|
| 12 |
+
- on-device
|
| 13 |
+
- hybrid-attention
|
| 14 |
+
- GatedDeltaNet
|
| 15 |
+
- multimodal
|
| 16 |
+
- vision
|
| 17 |
+
---
|
| 18 |
+
|
| 19 |
+
# Qwen3.5-0.8B LiteRT (Multimodal)
|
| 20 |
+
|
| 21 |
+
This repository contains a [LiteRT](https://ai.google.dev/edge/litert) (formerly TFLite) conversion of [Qwen/Qwen3.5-0.8B](https://huggingface.co/Qwen/Qwen3.5-0.8B) for on-device inference, packaged in the [LiteRT-LM](https://github.com/google-ai-edge/LiteRT-LM) `.litertlm` format. Includes the **full multimodal pipeline**: language model, vision encoder, and vision adapter for image understanding.
|
| 22 |
+
|
| 23 |
+
## Model Details
|
| 24 |
+
|
| 25 |
+
| Property | Value |
|
| 26 |
+
|----------|-------|
|
| 27 |
+
| **Base Model** | [Qwen/Qwen3.5-0.8B](https://huggingface.co/Qwen/Qwen3.5-0.8B) |
|
| 28 |
+
| **Architecture** | Hybrid attention (GatedDeltaNet + Full Attention) + ViT vision encoder |
|
| 29 |
+
| **Parameters** | 752M (language) + 675M (vision encoder) + 10M (vision adapter) |
|
| 30 |
+
| **Quantization** | Dynamic INT8 |
|
| 31 |
+
| **KV Cache Length** | 2048 |
|
| 32 |
+
| **Prefill Signatures** | 64, 128, 256, 512 |
|
| 33 |
+
| **Vision Signatures** | 256, 576, 1024, 2304 patches |
|
| 34 |
+
| **Format** | `.litertlm` (LiteRT-LM container) |
|
| 35 |
+
|
| 36 |
+
## Architecture
|
| 37 |
+
|
| 38 |
+
### Language Model
|
| 39 |
+
|
| 40 |
+
Qwen3.5-0.8B uses a **hybrid attention** architecture that combines:
|
| 41 |
+
|
| 42 |
+
- **18 GatedDeltaNet layers** (linear attention with recurrent delta rule) at positions 0-2, 4-6, 8-10, 12-14, 16-18, 20-22
|
| 43 |
+
- **6 Full Attention layers** (standard multi-head attention with output gating and partial RoPE) at positions 3, 7, 11, 15, 19, 23
|
| 44 |
+
|
| 45 |
+
### Vision Encoder
|
| 46 |
+
|
| 47 |
+
The vision encoder is a 27-layer Vision Transformer (ViT):
|
| 48 |
+
|
| 49 |
+
- **Patch embedding**: Conv3d (3→1152, kernel=[2,16,16]) with learned position embeddings (bilinear interpolation from 48×48 grid)
|
| 50 |
+
- **27 VisionBlocks**: LayerNorm → Self-Attention (16 heads, head_dim=72, 2D rotary pos emb) → MLP (1152→4304→1152, GELU)
|
| 51 |
+
- **Patch merger** (vision adapter): Groups 4 adjacent patches (spatial_merge_size=2) and projects to language model dimension (4608→1024)
|
| 52 |
+
|
| 53 |
+
The model was **re-authored from scratch** using the LiteRT Generative API. The vision encoder and adapter are exported as separate TFLite models bundled alongside the language model.
|
| 54 |
+
|
| 55 |
+
## Files
|
| 56 |
+
|
| 57 |
+
| File | Size | Description |
|
| 58 |
+
|------|------|-------------|
|
| 59 |
+
| `qwen35_mm_q8_ekv2048.litertlm` | ~1.2 GB | LiteRT-LM bundle (LM + vision encoder + vision adapter + tokenizer) |
|
| 60 |
+
| `qwen35_mm_q8_ekv2048.tflite` | ~757 MB | Language model TFLite |
|
| 61 |
+
| `qwen35_vision_encoder_q8.tflite` | ~88 MB | Vision encoder TFLite |
|
| 62 |
+
| `qwen35_vision_adapter_q8.tflite` | ~12 MB | Vision adapter TFLite |
|
| 63 |
+
| `qwen35_embedder_q8.tflite` | ~245 MB | Text embedder TFLite |
|
| 64 |
+
| `tokenizer.json` | ~11 MB | HuggingFace tokenizer |
|
| 65 |
+
| `tokenizer_config.json` | ~2 KB | Tokenizer configuration |
|
| 66 |
+
|
| 67 |
+
## Signatures
|
| 68 |
+
|
| 69 |
+
### Language Model
|
| 70 |
+
|
| 71 |
+
| Signature | Input Length | Outputs |
|
| 72 |
+
|-----------|-------------|---------|
|
| 73 |
+
| `prefill_64` | 64 tokens | Updated KV cache |
|
| 74 |
+
| `prefill_128` | 128 tokens | Updated KV cache |
|
| 75 |
+
| `prefill_256` | 256 tokens | Updated KV cache |
|
| 76 |
+
| `prefill_512` | 512 tokens | Updated KV cache |
|
| 77 |
+
| `decode` | 1 token | Logits + Updated KV cache |
|
| 78 |
+
|
| 79 |
+
### Vision Encoder
|
| 80 |
+
|
| 81 |
+
| Signature | Patches | Approx. Image Size |
|
| 82 |
+
|-----------|---------|---------------------|
|
| 83 |
+
| `encode_256` | 256 | 256×256 |
|
| 84 |
+
| `encode_576` | 576 | 384×384 |
|
| 85 |
+
| `encode_1024` | 1024 | 512×512 |
|
| 86 |
+
| `encode_2304` | 2304 | 768×768 |
|
| 87 |
+
|
| 88 |
+
### Vision Adapter
|
| 89 |
+
|
| 90 |
+
| Signature | Merged Tokens | From Patches |
|
| 91 |
+
|-----------|---------------|--------------|
|
| 92 |
+
| `adapt_64` | 64 | 256 |
|
| 93 |
+
| `adapt_144` | 144 | 576 |
|
| 94 |
+
| `adapt_256` | 256 | 1024 |
|
| 95 |
+
| `adapt_576` | 576 | 2304 |
|
| 96 |
+
|
| 97 |
+
## Usage
|
| 98 |
+
|
| 99 |
+
### Python (ai-edge-litert)
|
| 100 |
+
|
| 101 |
+
```python
|
| 102 |
+
import numpy as np
|
| 103 |
+
from ai_edge_litert import interpreter as tfl_interpreter
|
| 104 |
+
|
| 105 |
+
# Load model
|
| 106 |
+
interp = tfl_interpreter.Interpreter(model_path="qwen35_mm_q8_ekv2048.tflite")
|
| 107 |
+
interp.allocate_tensors()
|
| 108 |
+
|
| 109 |
+
# Initialize KV cache (24 layers, mixed shapes)
|
| 110 |
+
kv_cache = {} # See inference_tflite.py for full initialization
|
| 111 |
+
|
| 112 |
+
# Prefill
|
| 113 |
+
prefill_runner = interp.get_signature_runner("prefill_64")
|
| 114 |
+
tokens = np.array([[...]], dtype=np.int32) # Padded to 64
|
| 115 |
+
input_pos = np.arange(64, dtype=np.int32)
|
| 116 |
+
output = prefill_runner(tokens=tokens, input_pos=input_pos, **kv_cache)
|
| 117 |
+
|
| 118 |
+
# Decode loop
|
| 119 |
+
decode_runner = interp.get_signature_runner("decode")
|
| 120 |
+
for step in range(max_tokens):
|
| 121 |
+
output = decode_runner(tokens=next_token, input_pos=pos, **kv_cache)
|
| 122 |
+
next_token = np.argmax(output["logits"][0, -1])
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
### Tokenizer
|
| 126 |
+
|
| 127 |
+
```python
|
| 128 |
+
from transformers import AutoTokenizer
|
| 129 |
+
tokenizer = AutoTokenizer.from_pretrained("g-ntovas/Qwen3.5-0.8B-LiteRT")
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
## Conversion Details
|
| 133 |
+
|
| 134 |
+
- **Source**: [Qwen/Qwen3.5-0.8B](https://huggingface.co/Qwen/Qwen3.5-0.8B) (multimodal model)
|
| 135 |
+
- **Method**: Custom re-authoring using LiteRT Generative API
|
| 136 |
+
- **Quantization**: Dynamic INT8 (`dynamic_int8`)
|
| 137 |
+
- **Export**: Per-signature tracing with fixed prefill lengths and patch counts
|
| 138 |
+
- **Vision**: Encoder and adapter exported as separate TFLite models, bundled into `.litertlm`
|
| 139 |
+
|
| 140 |
+
## Limitations
|
| 141 |
+
|
| 142 |
+
- Video input is not yet supported (encoder architecture supports it, but the data processor returns UNIMPLEMENTED for video)
|
| 143 |
+
- Prompts are padded to the nearest prefill signature length, which may introduce minor quality differences for the linear attention layers
|
| 144 |
+
- The recurrent GatedDeltaNet implementation may produce slightly different outputs compared to the chunk-based HuggingFace implementation due to floating-point operation ordering
|
| 145 |
+
|
| 146 |
+
## License
|
| 147 |
+
|
| 148 |
+
This model inherits the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0) from the original [Qwen/Qwen3.5-0.8B](https://huggingface.co/Qwen/Qwen3.5-0.8B) model.
|
| 149 |
+
|
| 150 |
+
## Citation
|
| 151 |
+
|
| 152 |
+
If you use this model, please cite the original Qwen3.5 paper:
|
| 153 |
+
|
| 154 |
+
```bibtex
|
| 155 |
+
@misc{qwen3.5,
|
| 156 |
+
title={Qwen3.5 Technical Report},
|
| 157 |
+
author={Qwen Team},
|
| 158 |
+
year={2026},
|
| 159 |
+
url={https://huggingface.co/Qwen/Qwen3.5-0.8B}
|
| 160 |
+
}
|
| 161 |
+
```
|
inference_tflite.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Run text generation inference on the exported Qwen3.5-0.8B TFLite model.
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
python inference_tflite.py --model_path output/qwen35_0.8b/qwen35_q8_ekv2048.tflite
|
| 6 |
+
python inference_tflite.py --prompt "Explain gravity" --max_new_tokens 100
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import argparse
import glob
import logging
import os
import time

import numpy as np
import transformers

from ai_edge_litert import interpreter as tfl_interpreter
|
| 17 |
+
|
| 18 |
+
logging.basicConfig(
|
| 19 |
+
level=logging.INFO,
|
| 20 |
+
format="%(asctime)s [%(levelname)s] %(message)s",
|
| 21 |
+
)
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
# Architecture constants (must match qwen35_model.py)
NUM_LAYERS = 24
# Repeating pattern: three GatedDeltaNet ("linear") layers, then one
# full-attention layer, for 24 layers total.
LAYER_TYPES = ["linear", "linear", "linear", "full"] * 6
LINEAR_QKV_DIM = 6144
LINEAR_CONV_KERNEL = 4
LINEAR_NUM_HEADS = 16
LINEAR_K_HEAD_DIM = 128
LINEAR_V_HEAD_DIM = 128
FULL_ATTN_NUM_KV_HEADS = 2
FULL_ATTN_HEAD_DIM = 256

MODEL_ID = "Qwen/Qwen3.5-0.8B"


def create_initial_kv_cache(kv_cache_max_len, batch_size=1):
    """Create zero-initialized KV cache arrays matching the model's per-layer shapes.

    Args:
        kv_cache_max_len: Sequence capacity of the full-attention KV caches.
        batch_size: Leading batch dimension for every cache tensor.

    Returns:
        Dict mapping "kv_cache_k_{i}" / "kv_cache_v_{i}" to float32 zero arrays,
        one pair per layer. Linear (GatedDeltaNet) layers get fixed-size state
        tensors; full-attention layers get sequence-length caches.
    """
    cache = {}
    for layer_idx, layer_type in enumerate(LAYER_TYPES):
        if layer_type == "linear":
            # Linear-attention state: conv window (kernel - 1 past steps) for
            # "k", and a per-head recurrent state matrix for "v".
            k_shape = (batch_size, LINEAR_QKV_DIM, LINEAR_CONV_KERNEL - 1)
            v_shape = (batch_size, LINEAR_NUM_HEADS, LINEAR_K_HEAD_DIM, LINEAR_V_HEAD_DIM)
        else:
            # Standard attention: [batch, seq, kv_heads, head_dim] for both.
            k_shape = (batch_size, kv_cache_max_len, FULL_ATTN_NUM_KV_HEADS, FULL_ATTN_HEAD_DIM)
            v_shape = k_shape
        cache[f"kv_cache_k_{layer_idx}"] = np.zeros(k_shape, dtype=np.float32)
        cache[f"kv_cache_v_{layer_idx}"] = np.zeros(v_shape, dtype=np.float32)
    return cache
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def find_prefill_signature(signatures, seq_len):
    """Pick the smallest prefill signature whose length fits ``seq_len``.

    Args:
        signatures: Iterable of signature names (e.g. "prefill_64", "decode").
        seq_len: Number of prompt tokens that must fit.

    Returns:
        Tuple of (signature_name, signature_length). Falls back to the largest
        available prefill signature when the prompt exceeds all of them.

    Raises:
        ValueError: If no "prefill_*" signature exists.
    """
    candidates = []
    for name in signatures:
        if name.startswith("prefill_"):
            candidates.append((int(name.split("_")[1]), name))
    if not candidates:
        raise ValueError("No prefill signatures found in model")
    candidates.sort()

    for length, name in candidates:
        if length >= seq_len:
            return name, length

    # Prompt is longer than every signature; use the largest one.
    length, name = candidates[-1]
    return name, length
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def generate(model_path, prompt, max_new_tokens, kv_cache_max_len):
    """Run text generation with the TFLite model.

    Args:
        model_path: Path to the exported .tflite language model.
        prompt: Text prompt to complete.
        max_new_tokens: Upper bound on the number of tokens to decode.
        kv_cache_max_len: KV cache length the model was exported with.

    Raises:
        ValueError: If the tokenized prompt is longer than the largest
            available prefill signature.
    """
    # Load tokenizer
    logger.info("Loading tokenizer from: %s", MODEL_ID)
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        MODEL_ID, trust_remote_code=True
    )

    # Tokenize prompt
    input_ids = tokenizer.encode(prompt)
    logger.info("Prompt: %s", prompt)
    logger.info("Token count: %d", len(input_ids))

    # Load TFLite model
    logger.info("Loading TFLite model from: %s", model_path)
    t0 = time.time()
    interp = tfl_interpreter.Interpreter(model_path=model_path)
    interp.allocate_tensors()
    logger.info("Model loaded in %.1fs", time.time() - t0)

    signatures = interp.get_signature_list()
    logger.info("Available signatures: %s", list(signatures.keys()))

    # Initialize KV cache
    kv_cache = create_initial_kv_cache(kv_cache_max_len)

    # --- Prefill phase ---
    sig_name, sig_len = find_prefill_signature(signatures, len(input_ids))
    if len(input_ids) > sig_len:
        # find_prefill_signature falls back to the largest signature when the
        # prompt exceeds all of them. Feeding more tokens than the signature
        # accepts would previously produce a negative pad count and fail with
        # an opaque shape error inside the interpreter — fail early instead.
        raise ValueError(
            f"Prompt has {len(input_ids)} tokens but the largest prefill "
            f"signature only accepts {sig_len}"
        )
    logger.info("Using prefill signature: %s (padding %d -> %d)", sig_name, len(input_ids), sig_len)

    # Pad input with zeros to match the fixed signature length.
    padded_ids = input_ids + [0] * (sig_len - len(input_ids))
    tokens = np.array([padded_ids], dtype=np.int32)
    input_pos = np.arange(sig_len, dtype=np.int32)

    prefill_runner = interp.get_signature_runner(sig_name)
    t0 = time.time()
    prefill_out = prefill_runner(tokens=tokens, input_pos=input_pos, **kv_cache)
    prefill_time = time.time() - t0
    logger.info("Prefill done in %.2fs", prefill_time)

    # Update KV cache from prefill output
    for key in kv_cache:
        if key in prefill_out:
            kv_cache[key] = prefill_out[key]

    # --- Decode phase ---
    # Prefill processed sig_len tokens (including padding). Next decode
    # position is sig_len. We feed the last real token to get the first
    # generated token.
    decode_runner = interp.get_signature_runner("decode")
    generated_ids = list(input_ids)
    current_pos = sig_len  # continue after prefill

    logger.info("Starting decode (max %d tokens)...", max_new_tokens)
    print(f"\n--- Generated text ---\n{prompt}", end="", flush=True)

    t0 = time.time()
    for step in range(max_new_tokens):
        # Feed last token, get next
        tok = np.array([[generated_ids[-1]]], dtype=np.int32)
        pos = np.array([current_pos], dtype=np.int32)
        decode_out = decode_runner(tokens=tok, input_pos=pos, **kv_cache)

        # Update KV cache
        for key in kv_cache:
            if key in decode_out:
                kv_cache[key] = decode_out[key]

        # Greedy decoding: argmax over the logits of the last position.
        next_token = int(np.argmax(decode_out["logits"][0, -1]))
        generated_ids.append(next_token)
        current_pos += 1

        # Print token
        word = tokenizer.decode([next_token])
        print(word, end="", flush=True)

        # Stop on EOS
        if next_token == tokenizer.eos_token_id:
            break

    decode_time = time.time() - t0
    num_decoded = len(generated_ids) - len(input_ids)
    print(f"\n\n--- Stats ---")
    print(f"Prefill: {prefill_time:.2f}s ({len(input_ids)} tokens)")
    print(f"Decode: {decode_time:.2f}s ({num_decoded} tokens, {num_decoded/decode_time:.1f} tok/s)")
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def main():
    """Parse CLI arguments and run generation, auto-discovering a model if needed."""
    parser = argparse.ArgumentParser(description="TFLite inference for Qwen3.5-0.8B")
    parser.add_argument(
        "--model_path",
        default=None,
        help="Path to .tflite model file",
    )
    parser.add_argument(
        "--prompt",
        default="What is the meaning of life?",
        help="Input prompt",
    )
    parser.add_argument(
        "--max_new_tokens",
        type=int,
        default=50,
        help="Maximum tokens to generate",
    )
    parser.add_argument(
        "--kv_cache_max_len",
        type=int,
        default=2048,
        help="KV cache max length (must match exported model)",
    )
    args = parser.parse_args()

    # Auto-find model if not specified: pick the most recently modified
    # .tflite under output/. (Was `__import__("os")` inline; use the
    # module-level `import os` instead.)
    if args.model_path is None:
        files = glob.glob("output/**/*.tflite", recursive=True)
        if files:
            args.model_path = max(files, key=os.path.getmtime)
            logger.info("Auto-found model: %s", args.model_path)
        else:
            raise FileNotFoundError("No .tflite files found in output/")

    generate(args.model_path, args.prompt, args.max_new_tokens, args.kv_cache_max_len)


if __name__ == "__main__":
    main()
|
qwen35_embedder_q8.tflite
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0a3cc2102f1c345110215d23bc6963a1369c358d8ef91fe4f295b8606dc1df27
|
| 3 |
+
size 257260872
|
qwen35_mm_q8_ekv2048.litertlm
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:92999fe4a9242c983e99892d6e57f368e8cd7a4534bc9a383a9551155b7f70a5
|
| 3 |
+
size 1159757824
|
qwen35_mm_q8_ekv2048.tflite
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2a59a2b85cf06e1245a5ea4a0b0e1e0b0348de8c803e8c806dac42951a3035ed
|
| 3 |
+
size 793905384
|
qwen35_vision_adapter_q8.tflite
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b1032a0082a74a38c8d1a56e024c2f596de48973c18bf54aeed1acff2e11d1a4
|
| 3 |
+
size 12662960
|
qwen35_vision_encoder_q8.tflite
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2e58590d0a610d399438223c854192ac3ccbfc98b0bd57f0aedb84ddae17540a
|
| 3 |
+
size 92250944
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5f9e4d4901a92b997e463c1f46055088b6cca5ca61a6522d1b9f64c4bb81cb42
|
| 3 |
+
size 12807982
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,305 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"added_tokens_decoder": {
|
| 4 |
+
"248044": {
|
| 5 |
+
"content": "<|endoftext|>",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": false,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"special": true
|
| 11 |
+
},
|
| 12 |
+
"248045": {
|
| 13 |
+
"content": "<|im_start|>",
|
| 14 |
+
"lstrip": false,
|
| 15 |
+
"normalized": false,
|
| 16 |
+
"rstrip": false,
|
| 17 |
+
"single_word": false,
|
| 18 |
+
"special": true
|
| 19 |
+
},
|
| 20 |
+
"248046": {
|
| 21 |
+
"content": "<|im_end|>",
|
| 22 |
+
"lstrip": false,
|
| 23 |
+
"normalized": false,
|
| 24 |
+
"rstrip": false,
|
| 25 |
+
"single_word": false,
|
| 26 |
+
"special": true
|
| 27 |
+
},
|
| 28 |
+
"248047": {
|
| 29 |
+
"content": "<|object_ref_start|>",
|
| 30 |
+
"lstrip": false,
|
| 31 |
+
"normalized": false,
|
| 32 |
+
"rstrip": false,
|
| 33 |
+
"single_word": false,
|
| 34 |
+
"special": true
|
| 35 |
+
},
|
| 36 |
+
"248048": {
|
| 37 |
+
"content": "<|object_ref_end|>",
|
| 38 |
+
"lstrip": false,
|
| 39 |
+
"normalized": false,
|
| 40 |
+
"rstrip": false,
|
| 41 |
+
"single_word": false,
|
| 42 |
+
"special": true
|
| 43 |
+
},
|
| 44 |
+
"248049": {
|
| 45 |
+
"content": "<|box_start|>",
|
| 46 |
+
"lstrip": false,
|
| 47 |
+
"normalized": false,
|
| 48 |
+
"rstrip": false,
|
| 49 |
+
"single_word": false,
|
| 50 |
+
"special": true
|
| 51 |
+
},
|
| 52 |
+
"248050": {
|
| 53 |
+
"content": "<|box_end|>",
|
| 54 |
+
"lstrip": false,
|
| 55 |
+
"normalized": false,
|
| 56 |
+
"rstrip": false,
|
| 57 |
+
"single_word": false,
|
| 58 |
+
"special": true
|
| 59 |
+
},
|
| 60 |
+
"248051": {
|
| 61 |
+
"content": "<|quad_start|>",
|
| 62 |
+
"lstrip": false,
|
| 63 |
+
"normalized": false,
|
| 64 |
+
"rstrip": false,
|
| 65 |
+
"single_word": false,
|
| 66 |
+
"special": true
|
| 67 |
+
},
|
| 68 |
+
"248052": {
|
| 69 |
+
"content": "<|quad_end|>",
|
| 70 |
+
"lstrip": false,
|
| 71 |
+
"normalized": false,
|
| 72 |
+
"rstrip": false,
|
| 73 |
+
"single_word": false,
|
| 74 |
+
"special": true
|
| 75 |
+
},
|
| 76 |
+
"248053": {
|
| 77 |
+
"content": "<|vision_start|>",
|
| 78 |
+
"lstrip": false,
|
| 79 |
+
"normalized": false,
|
| 80 |
+
"rstrip": false,
|
| 81 |
+
"single_word": false,
|
| 82 |
+
"special": true
|
| 83 |
+
},
|
| 84 |
+
"248054": {
|
| 85 |
+
"content": "<|vision_end|>",
|
| 86 |
+
"lstrip": false,
|
| 87 |
+
"normalized": false,
|
| 88 |
+
"rstrip": false,
|
| 89 |
+
"single_word": false,
|
| 90 |
+
"special": true
|
| 91 |
+
},
|
| 92 |
+
"248055": {
|
| 93 |
+
"content": "<|vision_pad|>",
|
| 94 |
+
"lstrip": false,
|
| 95 |
+
"normalized": false,
|
| 96 |
+
"rstrip": false,
|
| 97 |
+
"single_word": false,
|
| 98 |
+
"special": true
|
| 99 |
+
},
|
| 100 |
+
"248056": {
|
| 101 |
+
"content": "<|image_pad|>",
|
| 102 |
+
"lstrip": false,
|
| 103 |
+
"normalized": false,
|
| 104 |
+
"rstrip": false,
|
| 105 |
+
"single_word": false,
|
| 106 |
+
"special": true
|
| 107 |
+
},
|
| 108 |
+
"248057": {
|
| 109 |
+
"content": "<|video_pad|>",
|
| 110 |
+
"lstrip": false,
|
| 111 |
+
"normalized": false,
|
| 112 |
+
"rstrip": false,
|
| 113 |
+
"single_word": false,
|
| 114 |
+
"special": true
|
| 115 |
+
},
|
| 116 |
+
"248058": {
|
| 117 |
+
"content": "<tool_call>",
|
| 118 |
+
"lstrip": false,
|
| 119 |
+
"normalized": false,
|
| 120 |
+
"rstrip": false,
|
| 121 |
+
"single_word": false,
|
| 122 |
+
"special": false
|
| 123 |
+
},
|
| 124 |
+
"248059": {
|
| 125 |
+
"content": "</tool_call>",
|
| 126 |
+
"lstrip": false,
|
| 127 |
+
"normalized": false,
|
| 128 |
+
"rstrip": false,
|
| 129 |
+
"single_word": false,
|
| 130 |
+
"special": false
|
| 131 |
+
},
|
| 132 |
+
"248060": {
|
| 133 |
+
"content": "<|fim_prefix|>",
|
| 134 |
+
"lstrip": false,
|
| 135 |
+
"normalized": false,
|
| 136 |
+
"rstrip": false,
|
| 137 |
+
"single_word": false,
|
| 138 |
+
"special": false
|
| 139 |
+
},
|
| 140 |
+
"248061": {
|
| 141 |
+
"content": "<|fim_middle|>",
|
| 142 |
+
"lstrip": false,
|
| 143 |
+
"normalized": false,
|
| 144 |
+
"rstrip": false,
|
| 145 |
+
"single_word": false,
|
| 146 |
+
"special": false
|
| 147 |
+
},
|
| 148 |
+
"248062": {
|
| 149 |
+
"content": "<|fim_suffix|>",
|
| 150 |
+
"lstrip": false,
|
| 151 |
+
"normalized": false,
|
| 152 |
+
"rstrip": false,
|
| 153 |
+
"single_word": false,
|
| 154 |
+
"special": false
|
| 155 |
+
},
|
| 156 |
+
"248063": {
|
| 157 |
+
"content": "<|fim_pad|>",
|
| 158 |
+
"lstrip": false,
|
| 159 |
+
"normalized": false,
|
| 160 |
+
"rstrip": false,
|
| 161 |
+
"single_word": false,
|
| 162 |
+
"special": false
|
| 163 |
+
},
|
| 164 |
+
"248064": {
|
| 165 |
+
"content": "<|repo_name|>",
|
| 166 |
+
"lstrip": false,
|
| 167 |
+
"normalized": false,
|
| 168 |
+
"rstrip": false,
|
| 169 |
+
"single_word": false,
|
| 170 |
+
"special": false
|
| 171 |
+
},
|
| 172 |
+
"248065": {
|
| 173 |
+
"content": "<|file_sep|>",
|
| 174 |
+
"lstrip": false,
|
| 175 |
+
"normalized": false,
|
| 176 |
+
"rstrip": false,
|
| 177 |
+
"single_word": false,
|
| 178 |
+
"special": false
|
| 179 |
+
},
|
| 180 |
+
"248066": {
|
| 181 |
+
"content": "<tool_response>",
|
| 182 |
+
"lstrip": false,
|
| 183 |
+
"normalized": false,
|
| 184 |
+
"rstrip": false,
|
| 185 |
+
"single_word": false,
|
| 186 |
+
"special": false
|
| 187 |
+
},
|
| 188 |
+
"248067": {
|
| 189 |
+
"content": "</tool_response>",
|
| 190 |
+
"lstrip": false,
|
| 191 |
+
"normalized": false,
|
| 192 |
+
"rstrip": false,
|
| 193 |
+
"single_word": false,
|
| 194 |
+
"special": false
|
| 195 |
+
},
|
| 196 |
+
"248068": {
|
| 197 |
+
"content": "<think>",
|
| 198 |
+
"lstrip": false,
|
| 199 |
+
"normalized": false,
|
| 200 |
+
"rstrip": false,
|
| 201 |
+
"single_word": false,
|
| 202 |
+
"special": false
|
| 203 |
+
},
|
| 204 |
+
"248069": {
|
| 205 |
+
"content": "</think>",
|
| 206 |
+
"lstrip": false,
|
| 207 |
+
"normalized": false,
|
| 208 |
+
"rstrip": false,
|
| 209 |
+
"single_word": false,
|
| 210 |
+
"special": false
|
| 211 |
+
},
|
| 212 |
+
"248070": {
|
| 213 |
+
"content": "<|audio_start|>",
|
| 214 |
+
"lstrip": false,
|
| 215 |
+
"normalized": false,
|
| 216 |
+
"rstrip": false,
|
| 217 |
+
"single_word": false,
|
| 218 |
+
"special": true
|
| 219 |
+
},
|
| 220 |
+
"248071": {
|
| 221 |
+
"content": "<|audio_end|>",
|
| 222 |
+
"lstrip": false,
|
| 223 |
+
"normalized": false,
|
| 224 |
+
"rstrip": false,
|
| 225 |
+
"single_word": false,
|
| 226 |
+
"special": true
|
| 227 |
+
},
|
| 228 |
+
"248072": {
|
| 229 |
+
"content": "<tts_pad>",
|
| 230 |
+
"lstrip": false,
|
| 231 |
+
"normalized": false,
|
| 232 |
+
"rstrip": false,
|
| 233 |
+
"single_word": false,
|
| 234 |
+
"special": true
|
| 235 |
+
},
|
| 236 |
+
"248073": {
|
| 237 |
+
"content": "<tts_text_bos>",
|
| 238 |
+
"lstrip": false,
|
| 239 |
+
"normalized": false,
|
| 240 |
+
"rstrip": false,
|
| 241 |
+
"single_word": false,
|
| 242 |
+
"special": true
|
| 243 |
+
},
|
| 244 |
+
"248074": {
|
| 245 |
+
"content": "<tts_text_eod>",
|
| 246 |
+
"lstrip": false,
|
| 247 |
+
"normalized": false,
|
| 248 |
+
"rstrip": false,
|
| 249 |
+
"single_word": false,
|
| 250 |
+
"special": true
|
| 251 |
+
},
|
| 252 |
+
"248075": {
|
| 253 |
+
"content": "<tts_text_bos_single>",
|
| 254 |
+
"lstrip": false,
|
| 255 |
+
"normalized": false,
|
| 256 |
+
"rstrip": false,
|
| 257 |
+
"single_word": false,
|
| 258 |
+
"special": true
|
| 259 |
+
},
|
| 260 |
+
"248076": {
|
| 261 |
+
"content": "<|audio_pad|>",
|
| 262 |
+
"lstrip": false,
|
| 263 |
+
"normalized": false,
|
| 264 |
+
"rstrip": false,
|
| 265 |
+
"single_word": false,
|
| 266 |
+
"special": true
|
| 267 |
+
}
|
| 268 |
+
},
|
| 269 |
+
"additional_special_tokens": [
|
| 270 |
+
"<|im_start|>",
|
| 271 |
+
"<|im_end|>",
|
| 272 |
+
"<|object_ref_start|>",
|
| 273 |
+
"<|object_ref_end|>",
|
| 274 |
+
"<|box_start|>",
|
| 275 |
+
"<|box_end|>",
|
| 276 |
+
"<|quad_start|>",
|
| 277 |
+
"<|quad_end|>",
|
| 278 |
+
"<|vision_start|>",
|
| 279 |
+
"<|vision_end|>",
|
| 280 |
+
"<|vision_pad|>",
|
| 281 |
+
"<|image_pad|>",
|
| 282 |
+
"<|video_pad|>"
|
| 283 |
+
],
|
| 284 |
+
"bos_token": null,
|
| 285 |
+
"chat_template": "{%- set image_count = namespace(value=0) %}\n{%- set video_count = namespace(value=0) %}\n{%- macro render_content(content, do_vision_count, is_system_content=false) %}\n {%- if content is string %}\n {{- content }}\n {%- elif content is iterable and content is not mapping %}\n {%- for item in content %}\n {%- if 'image' in item or 'image_url' in item or item.type == 'image' %}\n {%- if is_system_content %}\n {{- raise_exception('System message cannot contain images.') }}\n {%- endif %}\n {%- if do_vision_count %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- endif %}\n {%- if add_vision_id %}\n {{- 'Picture ' ~ image_count.value ~ ': ' }}\n {%- endif %}\n {{- '<|vision_start|><|image_pad|><|vision_end|>' }}\n {%- elif 'video' in item or item.type == 'video' %}\n {%- if is_system_content %}\n {{- raise_exception('System message cannot contain videos.') }}\n {%- endif %}\n {%- if do_vision_count %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- endif %}\n {%- if add_vision_id %}\n {{- 'Video ' ~ video_count.value ~ ': ' }}\n {%- endif %}\n {{- '<|vision_start|><|video_pad|><|vision_end|>' }}\n {%- elif 'text' in item %}\n {{- item.text }}\n {%- else %}\n {{- raise_exception('Unexpected item type in content.') }}\n {%- endif %}\n {%- endfor %}\n {%- elif content is none or content is undefined %}\n {{- '' }}\n {%- else %}\n {{- raise_exception('Unexpected content type.') }}\n {%- endif %}\n{%- endmacro %}\n{%- if not messages %}\n {{- raise_exception('No messages provided.') }}\n{%- endif %}\n{%- if tools and tools is iterable and tools is not mapping %}\n {{- '<|im_start|>system\\n' }}\n {{- \"# Tools\\n\\nYou have access to the following functions:\\n\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\" }}\n {{- '\\n\\nIf you choose to call a function ONLY reply in the following format with NO 
suffix:\\n\\n<tool_call>\\n<function=example_function_name>\\n<parameter=example_parameter_1>\\nvalue_1\\n</parameter>\\n<parameter=example_parameter_2>\\nThis is the value for the second parameter\\nthat can span\\nmultiple lines\\n</parameter>\\n</function>\\n</tool_call>\\n\\n<IMPORTANT>\\nReminder:\\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\\n- Required parameters MUST be specified\\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\\n</IMPORTANT>' }}\n {%- if messages[0].role == 'system' %}\n {%- set content = render_content(messages[0].content, false, true)|trim %}\n {%- if content %}\n {{- '\\n\\n' + content }}\n {%- endif %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {%- set content = render_content(messages[0].content, false, true)|trim %}\n {{- '<|im_start|>system\\n' + content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" %}\n {%- set content = render_content(message.content, false)|trim %}\n {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if ns.multi_step_tool %}\n {{- raise_exception('No user query found in messages.') }}\n{%- endif %}\n{%- for message in messages %}\n {%- set content = render_content(message.content, true)|trim %}\n {%- if message.role == \"system\" 
%}\n {%- if not loop.first %}\n {{- raise_exception('System message must be at the beginning.') }}\n {%- endif %}\n {%- elif message.role == \"user\" %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- set reasoning_content = reasoning_content|trim %}\n {%- if loop.index0 > ns.last_query_index %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content + '\\n</think>\\n\\n' + content }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {%- if loop.first %}\n {%- if content|trim %}\n {{- '\\n\\n<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n {%- else %}\n {{- '<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n {%- endif %}\n {%- else %}\n {{- '\\n<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n {%- endif %}\n {%- if tool_call.arguments is defined %}\n {%- for args_name, args_value in tool_call.arguments|items %}\n {{- '<parameter=' + args_name + '>\\n' }}\n {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}\n {{- args_value }}\n {{- '\\n</parameter>\\n' }}\n {%- endfor %}\n {%- endif %}\n {{- '</function>\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- 
'<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.previtem and loop.previtem.role != \"tool\" %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if not loop.last and loop.nextitem.role != \"tool\" %}\n {{- '<|im_end|>\\n' }}\n {%- elif loop.last %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- else %}\n {{- raise_exception('Unexpected message role.') }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is true %}\n {{- '<think>\\n' }}\n {%- else %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
|
| 286 |
+
"clean_up_tokenization_spaces": false,
|
| 287 |
+
"eos_token": "<|im_end|>",
|
| 288 |
+
"errors": "replace",
|
| 289 |
+
"model_max_length": 262144,
|
| 290 |
+
"pad_token": "<|endoftext|>",
|
| 291 |
+
"split_special_tokens": false,
|
| 292 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 293 |
+
"unk_token": null,
|
| 294 |
+
"add_bos_token": false,
|
| 295 |
+
"pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
| 296 |
+
"extra_special_tokens": {
|
| 297 |
+
"audio_bos_token": "<|audio_start|>",
|
| 298 |
+
"audio_eos_token": "<|audio_end|>",
|
| 299 |
+
"audio_token": "<|audio_pad|>",
|
| 300 |
+
"image_token": "<|image_pad|>",
|
| 301 |
+
"video_token": "<|video_pad|>",
|
| 302 |
+
"vision_bos_token": "<|vision_start|>",
|
| 303 |
+
"vision_eos_token": "<|vision_end|>"
|
| 304 |
+
}
|
| 305 |
+
}
|