Text Generation
Transformers
Safetensors
qwen3_5_moe
image-text-to-text
Merge
evolutionary-merge
darwin
darwin-v5
model-mri
reasoning
advanced-reasoning
chain-of-thought
thinking
qwen3.5
qwen
Mixture of Experts
mixture-of-experts
claude-opus
distillation
multimodal
vision-language
201-languages
gpqa
benchmark
open-source
apache-2.0
natural-selection
layer-wise-merge
moe-merge
dead-expert-revival
neural-anatomy
coding-agent
tool-calling
long-context
262k-context
conversational
Eval Results (legacy)
Commit ·
2eab6a3
0
Parent(s):
Duplicate from FINAL-Bench/Darwin-35B-A3B-Opus
Browse filesCo-authored-by: VIDRAFT_LAB <SeaWolf-AI@users.noreply.huggingface.co>
- .gitattributes +38 -0
- README.md +616 -0
- a1.png +0 -0
- a2.png +0 -0
- a3.png +0 -0
- added_tokens.json +35 -0
- c1.png +0 -0
- chat_template.jinja +154 -0
- config.json +120 -0
- f1.png +0 -0
- f2.png +0 -0
- f3.png +0 -0
- generation_config.json +13 -0
- info.png +3 -0
- m1.png +0 -0
- m2.png +0 -0
- m3.png +0 -0
- merges.txt +0 -0
- model.safetensors-00001-of-00014.safetensors +3 -0
- model.safetensors-00002-of-00014.safetensors +3 -0
- model.safetensors-00003-of-00014.safetensors +3 -0
- model.safetensors-00004-of-00014.safetensors +3 -0
- model.safetensors-00005-of-00014.safetensors +3 -0
- model.safetensors-00006-of-00014.safetensors +3 -0
- model.safetensors-00007-of-00014.safetensors +3 -0
- model.safetensors-00008-of-00014.safetensors +3 -0
- model.safetensors-00009-of-00014.safetensors +3 -0
- model.safetensors-00010-of-00014.safetensors +3 -0
- model.safetensors-00011-of-00014.safetensors +3 -0
- model.safetensors-00012-of-00014.safetensors +3 -0
- model.safetensors-00013-of-00014.safetensors +3 -0
- model.safetensors-00014-of-00014.safetensors +3 -0
- model.safetensors.index.json +0 -0
- preprocessor_config.json +39 -0
- qwen.png +3 -0
- special_tokens_map.json +38 -0
- tokenizer.json +3 -0
- tokenizer_config.json +312 -0
- video_preprocessor_config.json +41 -0
- vocab.json +0 -0
.gitattributes
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
info.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
qwen.png filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,616 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
base_model:
|
| 4 |
+
- Qwen/Qwen3.5-35B-A3B
|
| 5 |
+
- Jackrong/Qwen3.5-35B-A3B-Claude-4.6-Opus-Reasoning-Distilled
|
| 6 |
+
tags:
|
| 7 |
+
- merge
|
| 8 |
+
- evolutionary-merge
|
| 9 |
+
- darwin
|
| 10 |
+
- darwin-v5
|
| 11 |
+
- model-mri
|
| 12 |
+
- reasoning
|
| 13 |
+
- advanced-reasoning
|
| 14 |
+
- chain-of-thought
|
| 15 |
+
- thinking
|
| 16 |
+
- qwen3.5
|
| 17 |
+
- qwen
|
| 18 |
+
- moe
|
| 19 |
+
- mixture-of-experts
|
| 20 |
+
- claude-opus
|
| 21 |
+
- distillation
|
| 22 |
+
- multimodal
|
| 23 |
+
- vision-language
|
| 24 |
+
- multilingual
|
| 25 |
+
- 201-languages
|
| 26 |
+
- gpqa
|
| 27 |
+
- benchmark
|
| 28 |
+
- open-source
|
| 29 |
+
- apache-2.0
|
| 30 |
+
- natural-selection
|
| 31 |
+
- layer-wise-merge
|
| 32 |
+
- moe-merge
|
| 33 |
+
- dead-expert-revival
|
| 34 |
+
- neural-anatomy
|
| 35 |
+
- coding-agent
|
| 36 |
+
- tool-calling
|
| 37 |
+
- long-context
|
| 38 |
+
- 262k-context
|
| 39 |
+
language:
|
| 40 |
+
- en
|
| 41 |
+
- zh
|
| 42 |
+
- ko
|
| 43 |
+
- ja
|
| 44 |
+
- de
|
| 45 |
+
- fr
|
| 46 |
+
- es
|
| 47 |
+
- ru
|
| 48 |
+
- ar
|
| 49 |
+
- multilingual
|
| 50 |
+
pipeline_tag: text-generation
|
| 51 |
+
library_name: transformers
|
| 52 |
+
model-index:
|
| 53 |
+
- name: Darwin-35B-A3B-Opus
|
| 54 |
+
results:
|
| 55 |
+
- task:
|
| 56 |
+
type: text-generation
|
| 57 |
+
name: Graduate-Level Reasoning
|
| 58 |
+
dataset:
|
| 59 |
+
type: Idavidrein/gpqa
|
| 60 |
+
name: GPQA Diamond
|
| 61 |
+
config: gpqa_diamond
|
| 62 |
+
split: train
|
| 63 |
+
metrics:
|
| 64 |
+
- type: accuracy
|
| 65 |
+
value: 90.0
|
| 66 |
+
name: Accuracy
|
| 67 |
+
verified: false
|
| 68 |
+
- task:
|
| 69 |
+
type: text-generation
|
| 70 |
+
name: Multilingual Knowledge
|
| 71 |
+
dataset:
|
| 72 |
+
type: openai/MMMLU
|
| 73 |
+
name: MMMLU
|
| 74 |
+
metrics:
|
| 75 |
+
- type: accuracy
|
| 76 |
+
value: 85.0
|
| 77 |
+
name: Accuracy
|
| 78 |
+
verified: false
|
| 79 |
+
---
|
| 80 |
+
|
| 81 |
+
# Darwin-35B-A3B-Opus
|
| 82 |
+
|
| 83 |
+
<p align="center">
|
| 84 |
+
<img src="info.png" alt="Darwin-35B-A3B-Opus" width="100%">
|
| 85 |
+
</p>
|
| 86 |
+
|
| 87 |
+
<p align="center">
|
| 88 |
+
<a href="https://huggingface.co/FINAL-Bench/Darwin-35B-A3B-Opus"><img src="https://img.shields.io/badge/🤗_Model-Darwin--35B--A3B--Opus-blue" alt="Model"></a>
|
| 89 |
+
<a href="https://huggingface.co/spaces/FINAL-Bench/Darwin-35B-A3B-Opus"><img src="https://img.shields.io/badge/🚀_Space-Live_Demo-purple" alt="Space"></a>
|
| 90 |
+
<a href="https://huggingface.co/spaces/FINAL-Bench/Leaderboard"><img src="https://img.shields.io/badge/🏆_FINAL_Bench-Leaderboard-green" alt="FINAL Bench"></a>
|
| 91 |
+
<a href="https://huggingface.co/spaces/FINAL-Bench/all-bench-leaderboard"><img src="https://img.shields.io/badge/📊_ALL_Bench-Leaderboard-orange" alt="ALL Bench"></a>
|
| 92 |
+
</p>
|
| 93 |
+
|
| 94 |
+
<p align="center">
|
| 95 |
+
<em>"The child surpassed both parents — that is evolution."</em>
|
| 96 |
+
</p>
|
| 97 |
+
|
| 98 |
+
<!-- SEO: Structured Summary for Search Engines & AI Answer Engines -->
|
| 99 |
+
<!--
|
| 100 |
+
Darwin-35B-A3B-Opus is a 35B parameter Mixture-of-Experts (MoE) language model with 3B active parameters,
|
| 101 |
+
created by VIDRAFT using the Darwin V5 evolutionary merge engine with Model MRI integration.
|
| 102 |
+
It achieves 90.0% on GPQA Diamond (vs Father Qwen3.5-35B-A3B at 84.2%) and 85.0% on MMMLU,
|
| 103 |
+
while preserving multimodal capabilities (image/video), 201 language support, and 262K context length.
|
| 104 |
+
Licensed under Apache 2.0.
|
| 105 |
+
-->
|
| 106 |
+
|
| 107 |
+
> **TL;DR**: 35B MoE (3B active) | **GPQA Diamond 90.0%** (beats Father 84.2% & Mother 85.0%) | **MMMLU 85.0%** | Multimodal ✅ | 201 Languages | 262K Context | 147.8 tok/s | Apache 2.0
|
| 108 |
+
>
|
| 109 |
+
> `#Darwin` `#EvolutionaryMerge` `#ModelMRI` `#Qwen3.5` `#MoE` `#Reasoning` `#GPQA90` `#Multimodal` `#OpenSource` `#Apache2` `#DarwinV5` `#VIDRAFT`
|
| 110 |
+
|
| 111 |
+
---
|
| 112 |
+
|
| 113 |
+
## Why Darwin? — The Child That Surpassed Both Parents
|
| 114 |
+
|
| 115 |
+
The fundamental question of AI model merging: **If parent models already exist, why crossbreed?**
|
| 116 |
+
|
| 117 |
+
This model is the answer.
|
| 118 |
+
|
| 119 |
+
### Benchmark Results
|
| 120 |
+
|
| 121 |
+
**GPQA Diamond (198 Questions, Graduate-Level Reasoning)**
|
| 122 |
+
|
| 123 |
+
| Model | Accuracy | Multimodal | Benchmark Published |
|
| 124 |
+
|---|---|---|---|
|
| 125 |
+
| 🧬 **Darwin-35B-A3B-Opus (Child)** | **90.0%** | ✅ Image/Video | ✅ Fully Open |
|
| 126 |
+
| 👩 Mother — Jackrong Claude 4.6 Opus Distilled | 85.0% | ❌ Text-only | ❌ Not Published |
|
| 127 |
+
| 👨 Father — Qwen3.5-35B-A3B (Official) | 84.2% | ✅ Image/Video | ✅ Official |
|
| 128 |
+
|
| 129 |
+
> *Evaluation: SGLang, context 32768, temperature 0, greedy decoding, official GPQA prompt format ("ANSWER: LETTER")*
|
| 130 |
+
|
| 131 |
+
**MMMLU (Multilingual Knowledge, 29 Languages)**
|
| 132 |
+
|
| 133 |
+
| Model | Accuracy |
|
| 134 |
+
|---|---|
|
| 135 |
+
| 🧬 **Darwin-35B-A3B-Opus (Child)** | **85.0%** |
|
| 136 |
+
| 👨 Father — Qwen3.5-35B-A3B (Official) | 85.2% |
|
| 137 |
+
|
| 138 |
+
> *Darwin maintains Father-level multilingual knowledge while gaining superior reasoning.*
|
| 139 |
+
|
| 140 |
+
**The child surpassed both parents in reasoning, and matched the Father in multilingual knowledge.**
|
| 141 |
+
|
| 142 |
+
- GPQA vs Father: **+6.9% relative improvement** ((90.0−84.2)/84.2)
|
| 143 |
+
- GPQA vs Mother: **+5.9% relative improvement** ((90.0−85.0)/85.0)
|
| 144 |
+
- MMMLU: **85.0%** — Father-level (85.2%) multilingual knowledge preserved
|
| 145 |
+
|
| 146 |
+
### Why Not Just Use the Mother?
|
| 147 |
+
|
| 148 |
+
| | Mother (Claude Distilled) | Darwin (Child) |
|
| 149 |
+
|---|---|---|
|
| 150 |
+
| Reasoning | Strong (85.0%) | **Stronger (90.0%)** |
|
| 151 |
+
| Image/Video | ❌ Lost (text-only fine-tune) | ✅ Inherited from Father |
|
| 152 |
+
| 201 Languages | ❌ Potentially degraded | ✅ Inherited from Father |
|
| 153 |
+
| 262K Context | Unverified | ✅ Father's architecture preserved |
|
| 154 |
+
| Benchmark Transparency | ❌ No scores published | ✅ Fully open |
|
| 155 |
+
|
| 156 |
+
### Why Not Just Use the Father?
|
| 157 |
+
|
| 158 |
+
The Father (Qwen3.5-35B-A3B) excels in versatility but scores 84.2% on hard reasoning. Darwin **pushes reasoning to 90.0%** while maintaining Father-level multilingual knowledge (MMMLU 85.0% vs 85.2%) and all general capabilities.
|
| 159 |
+
|
| 160 |
+
**Conclusion: The only model that surpasses the Mother's reasoning, preserves the Father's multilingual knowledge, and retains full multimodal capabilities.**
|
| 161 |
+
|
| 162 |
+
---
|
| 163 |
+
|
| 164 |
+
## Model Overview
|
| 165 |
+
|
| 166 |
+
**Darwin-35B-A3B-Opus** is a next-generation reasoning-enhanced language model created by VIDRAFT's **Darwin V5** evolution engine.
|
| 167 |
+
|
| 168 |
+
Darwin V5 combines two innovations:
|
| 169 |
+
1. **Evolutionary Merge** — Applies natural selection to automatically find optimal weight combinations
|
| 170 |
+
2. **Model MRI Integration** — CT-scans parent models layer by layer before merging, guiding evolution with structural insight
|
| 171 |
+
|
| 172 |
+
If conventional merging is "mixing recipes blindfolded," Darwin V5 is **"precision surgery with X-ray guidance."**
|
| 173 |
+
|
| 174 |
+
---
|
| 175 |
+
|
| 176 |
+
## Parent Models
|
| 177 |
+
|
| 178 |
+
| Role | Model | Strengths |
|
| 179 |
+
|---|---|---|
|
| 180 |
+
| 👨 Father | [Qwen/Qwen3.5-35B-A3B](https://huggingface.co/Qwen/Qwen3.5-35B-A3B) | General knowledge, multimodal (image/video), coding, agents, 201 languages, 262K context |
|
| 181 |
+
| 👩 Mother | [Jackrong/Qwen3.5-35B-A3B-Claude-4.6-Opus-Reasoning-Distilled](https://huggingface.co/Jackrong/Qwen3.5-35B-A3B-Claude-4.6-Opus-Reasoning-Distilled) | Claude 4.6 Opus CoT distillation, structured step-by-step reasoning, coding agent compatibility |
|
| 182 |
+
|
| 183 |
+
---
|
| 184 |
+
|
| 185 |
+
## Darwin V5 — Beyond Simple Merge
|
| 186 |
+
|
| 187 |
+
### Limitations of Conventional Merging
|
| 188 |
+
|
| 189 |
+
Traditional model merging relies on humans setting hyperparameters like ratio and density **by intuition**. Set ratio=0.5, density=0.9, run once, and hope for the best. The result depends on luck, and applying the same ratio uniformly across billions of parameters ignores each layer's unique role.
|
| 190 |
+
|
| 191 |
+
### Darwin V4's Advance
|
| 192 |
+
|
| 193 |
+
Darwin V4 solved this with **evolutionary algorithms** — automatically searching hundreds of parameter combinations and selecting survivors by real benchmark scores. But V4 was still **blind evolution**: it didn't know what each layer does.
|
| 194 |
+
|
| 195 |
+
### Darwin V5: Model MRI Opens the Eyes
|
| 196 |
+
|
| 197 |
+
V5 integrates **Model MRI** (neural anatomy analyzer) to give evolution "sight":
|
| 198 |
+
|
| 199 |
+
```
|
| 200 |
+
[Phase 0] Model MRI — CT-scan both parents layer by layer
|
| 201 |
+
↓ "Father's layers 15-25 concentrate multilingual knowledge"
|
| 202 |
+
↓ "Mother's layers 30-40 concentrate reasoning patterns"
|
| 203 |
+
↓
|
| 204 |
+
[Phase 1] MRI-Guided Evolution — Start from scan-informed initial genome
|
| 205 |
+
↓ Not random, but "informed by CT results"
|
| 206 |
+
↓
|
| 207 |
+
[Phase 2] mergekit real merge + benchmark fitness selection
|
| 208 |
+
↓ Faster convergence in MRI-narrowed search space
|
| 209 |
+
↓
|
| 210 |
+
[Phase 3] MRI Health Check — CT-scan the child model
|
| 211 |
+
↓ Detect interference, function loss
|
| 212 |
+
↓ Prescribe layer-specific ratio adjustments
|
| 213 |
+
↓
|
| 214 |
+
[Final] Darwin-35B-A3B-Opus
|
| 215 |
+
```
|
| 216 |
+
|
| 217 |
+
### V4 vs V5
|
| 218 |
+
|
| 219 |
+
| | Darwin V4 | Darwin V5 |
|
| 220 |
+
|---|---|---|
|
| 221 |
+
| Analogy | Mixing recipes blindfolded | **Precision surgery with X-ray** |
|
| 222 |
+
| Initial genome | Random | **MRI-guided** |
|
| 223 |
+
| Layer control | 2 ratios (attn/ffn) | **40 layers independently** |
|
| 224 |
+
| Pre-diagnosis | ❌ None | ✅ Phase 0 MRI scan |
|
| 225 |
+
| Post-verification | Benchmark only | ✅ Phase 3 health check |
|
| 226 |
+
| Search efficiency | Wide space | **Narrowed, guided search** |
|
| 227 |
+
| Failure diagnosis | Unknown "why" | **Pinpoint which layer failed** |
|
| 228 |
+
|
| 229 |
+
---
|
| 230 |
+
|
| 231 |
+
### Darwin V4: Discovered Optimal Parameters (Blind Evolution)
|
| 232 |
+
|
| 233 |
+
| Parameter | Value | Meaning |
|
| 234 |
+
|---|---|---|
|
| 235 |
+
| ratio | 0.481 | Father 52% : Mother 48% asymmetric blend |
|
| 236 |
+
| density_a | 0.855 | Selected 85.5% of Father's weights |
|
| 237 |
+
| density_b | 0.971 | Adopted 97.1% of Mother's weights |
|
| 238 |
+
| attn | 0.168 | Only 16.8% change in attention layers |
|
| 239 |
+
| ffn | 0.841 | 84.1% change in FFN layers |
|
| 240 |
+
|
| 241 |
+
**Interpretation:** Attention patterns (what to focus on) are **almost entirely preserved** from the Father, while FFN layers (knowledge storage) are **largely replaced** with the Mother's reasoning patterns.
|
| 242 |
+
|
| 243 |
+
Discovering attn=0.168 and ffn=0.841 — this extreme asymmetry — is **virtually impossible by human intuition**.
|
| 244 |
+
|
| 245 |
+
### Darwin V5: MRI-Guided Merge Recipe
|
| 246 |
+
|
| 247 |
+
After scanning both parents, Model MRI generated a fundamentally different prescription:
|
| 248 |
+
|
| 249 |
+
<p align="center"><img src="a2.png" width="500" alt="MRI-Guided Genome"></p>
|
| 250 |
+
|
| 251 |
+
| Parameter | V4 (Blind) | V5 (MRI) | Change |
|
| 252 |
+
|---|---|---|---|
|
| 253 |
+
| global_ratio | 0.481 | **0.800** | Mother weight ↑↑ |
|
| 254 |
+
| attn_ratio | 0.168 | **0.320** | Attention also shifts to Mother |
|
| 255 |
+
| ffn_ratio | 0.841 | **0.590** | FFN becomes more conservative |
|
| 256 |
+
| density_a | 0.855 | **0.799** | Similar |
|
| 257 |
+
| density_b | 0.971 | **0.799** | Mother density ↓ (Dead Expert compensation) |
|
| 258 |
+
|
| 259 |
+
**Key insight:** MRI prescribed "use more of the Mother (ratio 0.8), but reduce density (0.799) because 50-65% of her experts are dead." V4 found ratio=0.481 blindly — the **opposite direction**.
|
| 260 |
+
|
| 261 |
+
### Layer-Wise Merge Strategy (3 Blocks)
|
| 262 |
+
|
| 263 |
+
MRI didn't apply uniform ratios. It split 40 layers into 3 blocks:
|
| 264 |
+
|
| 265 |
+
<p align="center"><img src="a1.png" width="700" alt="Merge Ratio + Parent Importance + MoE Health per Layer"></p>
|
| 266 |
+
|
| 267 |
+
| Block | Layers | t (Mother %) | Router Source | Rationale |
|
| 268 |
+
|---|---|---|---|---|
|
| 269 |
+
| Block 1 | L0~L37 | **59.9%** | Mother | Reasoning pattern injection across most layers |
|
| 270 |
+
| Block 2 | L38 | **90.0%** | Mother | **Golden Layer — Mother's reasoning engine core** |
|
| 271 |
+
| Block 3 | L39 | **53.4%** | Father | Output layer — Father's router preserves multimodal routing |
|
| 272 |
+
|
| 273 |
+
**L38 is the "Golden Layer"**: Mother's MRI showed peak cosine distance at L34~L38 (see Mother MRI below). Darwin V5 responded by assigning t=0.9 to L38 — transplanting the Mother's reasoning engine almost entirely.
|
| 274 |
+
|
| 275 |
+
---
|
| 276 |
+
|
| 277 |
+
## Model MRI Scans — Parent Neural Anatomy
|
| 278 |
+
|
| 279 |
+
### Mother MRI: Claude 4.6 Opus Distilled
|
| 280 |
+
|
| 281 |
+
<p align="center"><img src="m3.png" width="600" alt="Mother Probe Cosine Distance"></p>
|
| 282 |
+
|
| 283 |
+
**Probe-wise Layer Importance:** L34~L38 shows intense red (high cosine distance) across REASONING, CODE, LOGIC probes — this is the Mother's reasoning engine.
|
| 284 |
+
|
| 285 |
+
<p align="center"><img src="m1.png" width="500" alt="Mother MoE Health"></p>
|
| 286 |
+
|
| 287 |
+
| Metric | Status | Interpretation |
|
| 288 |
+
|---|---|---|
|
| 289 |
+
| Router Entropy | ✅ ~1.0 across all layers | Healthy — experts evenly distributed |
|
| 290 |
+
| **Dead Expert %** | 🔴 **50~65%** | **Critical — Claude distillation killed half the experts** |
|
| 291 |
+
| Expert Similarity | ✅ 0.001~0.008 | Healthy — surviving experts remain diverse |
|
| 292 |
+
|
| 293 |
+
**Dead Expert 50~65% is the fingerprint of Claude text-only distillation.** The fine-tuning killed multimodal and multilingual experts that were no longer activated during text-only training.
|
| 294 |
+
|
| 295 |
+
<p align="center"><img src="m2.png" width="600" alt="Mother Expert Utilization"></p>
|
| 296 |
+
|
| 297 |
+
**Expert Utilization Heatmap:** Mostly dark (inactive) with sparse bright activations — the Claude reasoning pattern is concentrated in a small number of specialized experts.
|
| 298 |
+
|
| 299 |
+
### Father MRI: Healthy Generalist (Organ Donor)
|
| 300 |
+
|
| 301 |
+
<p align="center"><img src="f1.png" width="500" alt="Father MoE Health"></p>
|
| 302 |
+
|
| 303 |
+
<p align="center"><img src="f2.png" width="600" alt="Father Expert Utilization"></p>
|
| 304 |
+
|
| 305 |
+
<p align="center"><img src="f3.png" width="600" alt="Father Layer Importance by Probe"></p>
|
| 306 |
+
|
| 307 |
+
The Father (Qwen3.5-35B-A3B) shows **healthy, uniform expert activation** across all 40 layers — a well-balanced generalist with all experts alive. This is the "organ donor" that revives the Mother's dead 50–65% experts.
|
| 308 |
+
|
| 309 |
+
### Parent Comparison: Layer Advantage Map
|
| 310 |
+
|
| 311 |
+
<p align="center"><img src="a3.png" width="600" alt="Parent A vs B Layer Advantage"></p>
|
| 312 |
+
|
| 313 |
+
- **Above zero (↑ A):** Father stronger — primarily L0~L5 (embedding/early layers)
|
| 314 |
+
- **Below zero (↓ B):** Mother stronger — scattered but consistent across L5~L35
|
| 315 |
+
- **L34~L38:** Mother shows strongest advantage in REASONING and CODE probes
|
| 316 |
+
- **L39:** Father recovers — output layer favors Father's multimodal routing
|
| 317 |
+
|
| 318 |
+
This advantage map directly informed the 3-block merge recipe: Mother dominates L0~L38, Father retakes L39.
|
| 319 |
+
|
| 320 |
+
### How GPQA 90% Was Achieved
|
| 321 |
+
|
| 322 |
+
```
|
| 323 |
+
Mother L34~L38 reasoning engine (MRI red zone)
|
| 324 |
+
↓ t=0.9 — transplanted almost entirely
|
| 325 |
+
+
|
| 326 |
+
Father L39 output router (multimodal/multilingual expert activation)
|
| 327 |
+
↓ t=0.53 — Father's routing preserved
|
| 328 |
+
+
|
| 329 |
+
Dead Expert replacement → Father's living experts fill Mother's dead slots
|
| 330 |
+
↓
|
| 331 |
+
= GPQA 90.0% (surpassed both parents)
|
| 332 |
+
```
|
| 333 |
+
|
| 334 |
+
The Mother's "reasoning brain" was transplanted while her dead experts were replaced with the Father's living ones. **Reasoning went up, versatility was preserved.**
|
| 335 |
+
|
| 336 |
+
### Evolution History
|
| 337 |
+
|
| 338 |
+
- Phase 1 → Phase 2 evolution complete
|
| 339 |
+
- Final real_score: **0.8405**
|
| 340 |
+
- Merge time: 181.6 seconds
|
| 341 |
+
- Merge commit: `109838c2`
|
| 342 |
+
|
| 343 |
+
---
|
| 344 |
+
|
| 345 |
+
## Model MRI Health Check — Child vs Parents
|
| 346 |
+
|
| 347 |
+
<p align="center">
|
| 348 |
+
<img src="c1.png" alt="Darwin Health Check — Child vs Parents" width="100%">
|
| 349 |
+
</p>
|
| 350 |
+
|
| 351 |
+
**✅ Health: Healthy — No issues detected.**
|
| 352 |
+
|
| 353 |
+
The chart above shows the **layer-by-layer importance** of the child (Darwin, green bars) compared to both parents (Father = blue dashed, Mother = red dashed). Key findings:
|
| 354 |
+
|
| 355 |
+
**Layer 0 (Embedding):** Child importance spikes to 0.42 — both parents show similar peaks (~0.35–0.50). The child successfully inherited the critical embedding layer from both parents without interference.
|
| 356 |
+
|
| 357 |
+
**Layers 1–33 (Middle):** Near-zero importance across all three models. This is normal — middle layers in MoE models process information incrementally, with no single layer being critical. The child tracks both parents perfectly, confirming **no function loss** in the bulk of the network.
|
| 358 |
+
|
| 359 |
+
**Layers 34–39 (Reasoning Engine):** Importance rises sharply. This is the region where Mother's MRI showed intense reasoning activity (cosine distance > 0.6). The child's green bars match or exceed both parents — proving that **Mother's reasoning patterns were successfully transplanted** while Father's output routing was preserved.
|
| 360 |
+
|
| 361 |
+
**Layer 39 (Output):** Child peaks at ~0.48, closely matching both parents. The final output layer is intact.
|
| 362 |
+
|
| 363 |
+
### Why This Matters
|
| 364 |
+
|
| 365 |
+
The MRI health check confirms three things:
|
| 366 |
+
1. **No interference** — No layer where child importance abnormally exceeds parents (which would indicate weight conflict)
|
| 367 |
+
2. **No function loss** — No layer where parents had high importance but child dropped to zero
|
| 368 |
+
3. **Successful transplant** — L34–L39 reasoning engine from Mother is fully operational in the child
|
| 369 |
+
|
| 370 |
+
### Darwin V5 MRI-Guided Merge Recipe
|
| 371 |
+
|
| 372 |
+
```yaml
|
| 373 |
+
# MRI-guided layer-wise merge (3 blocks)
|
| 374 |
+
# Genome: ratio=0.800 attn=0.320 ffn=0.590 density=0.799
|
| 375 |
+
|
| 376 |
+
L0–L37: t=0.5988 (Mother 60%) — router from Mother
|
| 377 |
+
L38: t=0.9000 (Mother 90%) — "Golden Layer" reasoning core
|
| 378 |
+
L39: t=0.5336 (Father 47%) — router from Father (output routing)
|
| 379 |
+
```
|
| 380 |
+
|
| 381 |
+
| Insight | Detail |
|
| 382 |
+
|---|---|
|
| 383 |
+
| **L38 = "Golden Layer"** | MRI identified L34–L38 as Mother's reasoning core. Darwin assigned t=0.9 (90% Mother) to L38 specifically |
|
| 384 |
+
| **Router Strategy: B→B→A** | Mother's router for reasoning layers, Father's router for final output — preserves both reasoning paths and multimodal routing |
|
| 385 |
+
| **Dead Expert Revival** | Mother's 50–65% dead experts (killed by text-only fine-tuning) were replaced with Father's live experts — restoring multimodal and multilingual capabilities |
|
| 386 |
+
|
| 387 |
+
---
|
| 388 |
+
|
| 389 |
+
## Inherited Capabilities
|
| 390 |
+
|
| 391 |
+
### From Father (Qwen3.5-35B-A3B)
|
| 392 |
+
- **Multimodal**: Image and video understanding
|
| 393 |
+
- **201 Languages**: Global linguistic coverage
|
| 394 |
+
- **262K Context**: Native long-context (extendable to 1M via YaRN)
|
| 395 |
+
- **Gated DeltaNet + MoE**: Efficient hybrid architecture
|
| 396 |
+
- **Multi-Token Prediction**: Improved inference throughput
|
| 397 |
+
|
| 398 |
+
### From Mother (Claude 4.6 Opus Distilled)
|
| 399 |
+
- **Structured Thinking**: Systematic step-by-step reasoning within `<think>` tags
|
| 400 |
+
- **Efficient Reasoning**: "Let me analyze this request carefully: 1..2..3..." pattern
|
| 401 |
+
- **Coding Agent Compatibility**: Native "developer" role support for Claude Code, OpenCode
|
| 402 |
+
- **Tool Calling Stability**: Consistent performance in tool-use scenarios
|
| 403 |
+
- **Autonomous Execution**: Extended autonomous operation in agentic environments
|
| 404 |
+
|
| 405 |
+
---
|
| 406 |
+
|
| 407 |
+
## Father's Official Benchmarks (Reference)
|
| 408 |
+
|
| 409 |
+
Darwin is built on this architecture with enhanced reasoning:
|
| 410 |
+
|
| 411 |
+
| Category | Benchmark | Father Official |
|
| 412 |
+
|---|---|---|
|
| 413 |
+
| Knowledge | MMLU-Pro | 85.3 |
|
| 414 |
+
| Knowledge | MMLU-Redux | 93.3 |
|
| 415 |
+
| Reasoning | GPQA Diamond | 84.2 |
|
| 416 |
+
| Reasoning | HLE w/ CoT | 22.4 |
|
| 417 |
+
| Math | HMMT Feb 2025 | 89.0 |
|
| 418 |
+
| Coding | SWE-bench Verified | 69.2 |
|
| 419 |
+
| Coding | LiveCodeBench v6 | 74.6 |
|
| 420 |
+
| Agent | TAU2-Bench | 81.2 |
|
| 421 |
+
| Agent | BFCL-V4 (Tool Use) | 67.3 |
|
| 422 |
+
| Instruction | IFEval | 91.9 |
|
| 423 |
+
| Multilingual | MMMLU | 85.2 |
|
| 424 |
+
| Agentic Search | BrowseComp | 61.0 |
|
| 425 |
+
|
| 426 |
+
---
|
| 427 |
+
|
| 428 |
+
## Performance
|
| 429 |
+
|
| 430 |
+
### Inference Speed
|
| 431 |
+
|
| 432 |
+
| Metric | Value |
|
| 433 |
+
|---|---|
|
| 434 |
+
| **Generation Speed** | **147.8 tok/s** |
|
| 435 |
+
| Environment | Single NVIDIA H100 93GB NVL, SGLang, BF16 |
|
| 436 |
+
| Qwen Official API | 162.8 tok/s (Alibaba Cloud) |
|
| 437 |
+
|
| 438 |
+
### Hardware Requirements
|
| 439 |
+
|
| 440 |
+
| Setup | VRAM | Status |
|
| 441 |
+
|---|---|---|
|
| 442 |
+
| **BF16 (Full Precision)** | **65.5 GiB** | |
|
| 443 |
+
| Single H100 93GB NVL | 93 GB | ✅ Comfortable |
|
| 444 |
+
| Single A100 80GB | 80 GB | ⚠️ Tight |
|
| 445 |
+
| Single A100 40GB | 40 GB | ❌ Insufficient |
|
| 446 |
+
| **Q8 Quantized** | **~35 GiB** | |
|
| 447 |
+
| Single A100 40GB | 40 GB | ✅ Possible |
|
| 448 |
+
| **Q4_K_M Quantized** | **~18 GiB** | |
|
| 449 |
+
| Single RTX 4090 24GB | 24 GB | ✅ Comfortable |
|
| 450 |
+
| 2× RTX 4090 (tp=2) | 48 GB | ✅ Q8 possible |
|
| 451 |
+
|
| 452 |
+
> As a Mixture-of-Experts model, only 3B parameters are active per token despite loading the full 35B. Quantization has minimal impact due to this sparsity.
|
| 453 |
+
|
| 454 |
+
---
|
| 455 |
+
|
| 456 |
+
## Model Specifications
|
| 457 |
+
|
| 458 |
+
| | |
|
| 459 |
+
|---|---|
|
| 460 |
+
| Architecture | Qwen3.5 MoE (Gated DeltaNet + MoE) |
|
| 461 |
+
| Total Parameters | 35B |
|
| 462 |
+
| Active Parameters | 3B per forward pass |
|
| 463 |
+
| Hidden Dimension | 2,048 |
|
| 464 |
+
| Layers | 40 |
|
| 465 |
+
| Layer Layout | 10 × (3 × GDN→MoE + 1 × Attention→MoE) |
|
| 466 |
+
| Experts | 256 (8 routed + 1 shared active) |
|
| 467 |
+
| Expert Intermediate Dim | 512 |
|
| 468 |
+
| Context Length | 262,144 native (up to 1,010,000 via YaRN) |
|
| 469 |
+
| Languages | 201 |
|
| 470 |
+
| Multimodal | ✅ Image & Video input |
|
| 471 |
+
| License | Apache 2.0 |
|
| 472 |
+
| Engine | Darwin V5 (Evolutionary Merge + Model MRI) |
|
| 473 |
+
| Evolution Phase | Phase 2, real_score 0.8405 |
|
| 474 |
+
| Merge Commit | 109838c2 |
|
| 475 |
+
|
| 476 |
+
---
|
| 477 |
+
|
| 478 |
+
## Usage
|
| 479 |
+
|
| 480 |
+
### SGLang (Recommended)
|
| 481 |
+
|
| 482 |
+
```bash
|
| 483 |
+
python -m sglang.launch_server \
|
| 484 |
+
--model-path FINAL-Bench/Darwin-35B-A3B-Opus \
|
| 485 |
+
--tp 1 \
|
| 486 |
+
--mem-fraction-static 0.90 \
|
| 487 |
+
--context-length 32768 \
|
| 488 |
+
--trust-remote-code
|
| 489 |
+
```
|
| 490 |
+
|
| 491 |
+
### vLLM
|
| 492 |
+
|
| 493 |
+
```bash
|
| 494 |
+
vllm serve FINAL-Bench/Darwin-35B-A3B-Opus \
|
| 495 |
+
--trust-remote-code \
|
| 496 |
+
--enforce-eager
|
| 497 |
+
```
|
| 498 |
+
|
| 499 |
+
### Transformers
|
| 500 |
+
|
| 501 |
+
```python
|
| 502 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 503 |
+
|
| 504 |
+
tokenizer = AutoTokenizer.from_pretrained("FINAL-Bench/Darwin-35B-A3B-Opus", trust_remote_code=True)
|
| 505 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 506 |
+
"FINAL-Bench/Darwin-35B-A3B-Opus",
|
| 507 |
+
dtype="bfloat16",
|
| 508 |
+
device_map="auto",
|
| 509 |
+
trust_remote_code=True,
|
| 510 |
+
)
|
| 511 |
+
```
|
| 512 |
+
|
| 513 |
+
### Best Practices
|
| 514 |
+
- Use **context ≥ 32K** for reasoning tasks — the model leverages extended thinking
|
| 515 |
+
- For maximum reasoning quality, use **thinking mode (default)** with sufficient max_tokens (≥ 16384)
|
| 516 |
+
- The model generates `<think>` blocks for internal reasoning; extract the final answer after `</think>`
|
| 517 |
+
|
| 518 |
+
---
|
| 519 |
+
|
| 520 |
+
## Built By
|
| 521 |
+
|
| 522 |
+
| | |
|
| 523 |
+
|---|---|
|
| 524 |
+
| Developer | **VIDRAFT** |
|
| 525 |
+
| Evolution Engine | Darwin V5 (Evolutionary Merge + Model MRI) |
|
| 526 |
+
| Infrastructure | 4 × NVIDIA H100 93GB NVL GPU |
|
| 527 |
+
| Merge Time | 181.6 seconds |
|
| 528 |
+
| Shard Distribution | 14 shards → GPU [1, 2, 3] round-robin |
|
| 529 |
+
|
| 530 |
+
---
|
| 531 |
+
|
| 532 |
+
## Acknowledgements
|
| 533 |
+
|
| 534 |
+
- **Korean Government** — This research was supported by the Korean Government's 'GPU Support Program' research grant
|
| 535 |
+
- [Qwen Team](https://huggingface.co/Qwen) — Qwen3.5-35B-A3B base architecture
|
| 536 |
+
- [Jackrong](https://huggingface.co/Jackrong) — Claude 4.6 Opus Reasoning Distilled model
|
| 537 |
+
- [nohurry](https://huggingface.co/datasets/nohurry/Opus-4.6-Reasoning-3000x-filtered), [TeichAI](https://huggingface.co/datasets/TeichAI/claude-4.5-opus-high-reasoning-250x) — Distillation datasets
|
| 538 |
+
|
| 539 |
+
---
|
| 540 |
+
|
| 541 |
+
## Citation
|
| 542 |
+
|
| 543 |
+
```bibtex
|
| 544 |
+
@misc{vidraft_darwin_35b_opus,
|
| 545 |
+
title = {Darwin-35B-A3B-Opus: MRI-Guided Evolutionary Merge Beyond Both Parents},
|
| 546 |
+
author = {VIDRAFT},
|
| 547 |
+
year = {2026},
|
| 548 |
+
publisher = {Hugging Face},
|
| 549 |
+
howpublished = {\url{https://huggingface.co/FINAL-Bench/Darwin-35B-A3B-Opus}}
|
| 550 |
+
}
|
| 551 |
+
```
|
| 552 |
+
---
|
| 553 |
+
|
| 554 |
+
## Contact
|
| 555 |
+
|
| 556 |
+
📧 **kkms1116@koreacu.ac.kr**
|
| 557 |
+
|
| 558 |
+
---
|
| 559 |
+
|
| 560 |
+
## FAQ (Frequently Asked Questions)
|
| 561 |
+
|
| 562 |
+
<details>
|
| 563 |
+
<summary><b>What is Darwin-35B-A3B-Opus?</b></summary>
|
| 564 |
+
Darwin-35B-A3B-Opus is a 35 billion parameter Mixture-of-Experts language model (3B active per token) that was created using evolutionary merge techniques. It combines Qwen3.5-35B-A3B's multimodal versatility with Claude 4.6 Opus reasoning distillation, achieving 90.0% on GPQA Diamond — surpassing both parent models.
|
| 565 |
+
</details>
|
| 566 |
+
|
| 567 |
+
<details>
|
| 568 |
+
<summary><b>How does Darwin V5 differ from simple model merging?</b></summary>
|
| 569 |
+
Traditional merging applies uniform ratios by guesswork. Darwin V5 uses evolutionary algorithms (natural selection) combined with Model MRI (neural CT-scanning) to automatically discover optimal layer-specific merge ratios. For example, it found attn=0.168 and ffn=0.841 — an extreme asymmetry impossible to find by intuition.
|
| 570 |
+
</details>
|
| 571 |
+
|
| 572 |
+
<details>
|
| 573 |
+
<summary><b>What GPU do I need to run this model?</b></summary>
|
| 574 |
+
For BF16 full precision: A100 80GB (tight) or H100 93GB (comfortable). For Q4 quantization: a single RTX 4090 (24GB) is sufficient. The model loads 35B parameters but only activates 3B per token due to its MoE architecture.
|
| 575 |
+
</details>
|
| 576 |
+
|
| 577 |
+
<details>
|
| 578 |
+
<summary><b>Does it support multimodal (images/video)?</b></summary>
|
| 579 |
+
Yes. Darwin inherits the Father model's (Qwen3.5-35B-A3B) full multimodal capabilities including image and video understanding, unlike the Mother model which lost this during text-only fine-tuning.
|
| 580 |
+
</details>
|
| 581 |
+
|
| 582 |
+
<details>
|
| 583 |
+
<summary><b>What languages does it support?</b></summary>
|
| 584 |
+
201 languages and dialects, inherited from Qwen3.5's multilingual training. MMMLU benchmark confirms 85.0% multilingual knowledge retention across 29 evaluated languages.
|
| 585 |
+
</details>
|
| 586 |
+
|
| 587 |
+
<details>
|
| 588 |
+
<summary><b>What is Model MRI?</b></summary>
|
| 589 |
+
Model MRI is a neural anatomy analysis tool that CT-scans each layer of a language model to understand what functions it performs. When integrated with Darwin, it guides the evolutionary merge process — telling the algorithm which layers to preserve from each parent and which to replace. In this model, MRI identified L38 as the Mother's "golden layer" (core reasoning engine) and prescribed 90% Mother weight for that specific layer.
|
| 590 |
+
</details>
|
| 591 |
+
|
| 592 |
+
<details>
|
| 593 |
+
<summary><b>What are "Dead Experts" and why does it matter?</b></summary>
|
| 594 |
+
In Mixture-of-Experts (MoE) models, each layer contains hundreds of specialist sub-networks (experts). The Mother model's Claude distillation killed 50–65% of these experts because text-only fine-tuning didn't activate multimodal/multilingual specialists. Darwin's MRI detected this and prescribed replacing dead experts with the Father's living ones — reviving capabilities the Mother lost.
|
| 595 |
+
</details>
|
| 596 |
+
|
| 597 |
+
<details>
|
| 598 |
+
<summary><b>Is this model open source?</b></summary>
|
| 599 |
+
Yes. Darwin-35B-A3B-Opus is released under the Apache 2.0 license, fully open for commercial and research use.
|
| 600 |
+
</details>
|
| 601 |
+
|
| 602 |
+
---
|
| 603 |
+
|
| 604 |
+
<!-- AEO: Keywords for AI Answer Engines -->
|
| 605 |
+
<!--
|
| 606 |
+
Keywords: Darwin-35B-A3B-Opus, evolutionary merge, model merging, Darwin V5, Model MRI,
|
| 607 |
+
GPQA Diamond 90%, Qwen3.5-35B-A3B, Claude 4.6 Opus, reasoning model, mixture of experts,
|
| 608 |
+
MoE 3B active, 35B parameters, multimodal LLM, 201 languages, 262K context,
|
| 609 |
+
open source AI model, Apache 2.0, VIDRAFT, natural selection AI,
|
| 610 |
+
layer-wise merge ratio, attention preservation, FFN replacement,
|
| 611 |
+
best open source reasoning model 2026, Qwen merge, coding agent compatible,
|
| 612 |
+
dead expert revival, golden layer L38, MoE merge technique, neural anatomy analysis,
|
| 613 |
+
router entropy, expert utilization heatmap, cosine distance probe, 3-block surgical merge
|
| 614 |
+
-->
|
| 615 |
+
|
| 616 |
+
`#DarwinAI` `#EvolutionaryMerge` `#ModelMRI` `#DarwinV5` `#GPQA90` `#Qwen35` `#MoE3B` `#Reasoning` `#Multimodal` `#201Languages` `#OpenSource` `#Apache2` `#VIDRAFT` `#NaturalSelection` `#LayerWiseMerge` `#ClaudeOpus` `#ThinkingModel` `#CodingAgent` `#LongContext262K` `#BestOpenSourceLLM2026` `#DeadExpertRevival` `#GoldenLayer` `#MoEMerge` `#NeuralAnatomy`
|
a1.png
ADDED
|
a2.png
ADDED
|
a3.png
ADDED
|
added_tokens.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</think>": 248069,
|
| 3 |
+
"</tool_call>": 248059,
|
| 4 |
+
"</tool_response>": 248067,
|
| 5 |
+
"<think>": 248068,
|
| 6 |
+
"<tool_call>": 248058,
|
| 7 |
+
"<tool_response>": 248066,
|
| 8 |
+
"<tts_pad>": 248072,
|
| 9 |
+
"<tts_text_bos>": 248073,
|
| 10 |
+
"<tts_text_bos_single>": 248075,
|
| 11 |
+
"<tts_text_eod>": 248074,
|
| 12 |
+
"<|audio_end|>": 248071,
|
| 13 |
+
"<|audio_pad|>": 248076,
|
| 14 |
+
"<|audio_start|>": 248070,
|
| 15 |
+
"<|box_end|>": 248050,
|
| 16 |
+
"<|box_start|>": 248049,
|
| 17 |
+
"<|endoftext|>": 248044,
|
| 18 |
+
"<|file_sep|>": 248065,
|
| 19 |
+
"<|fim_middle|>": 248061,
|
| 20 |
+
"<|fim_pad|>": 248063,
|
| 21 |
+
"<|fim_prefix|>": 248060,
|
| 22 |
+
"<|fim_suffix|>": 248062,
|
| 23 |
+
"<|im_end|>": 248046,
|
| 24 |
+
"<|im_start|>": 248045,
|
| 25 |
+
"<|image_pad|>": 248056,
|
| 26 |
+
"<|object_ref_end|>": 248048,
|
| 27 |
+
"<|object_ref_start|>": 248047,
|
| 28 |
+
"<|quad_end|>": 248052,
|
| 29 |
+
"<|quad_start|>": 248051,
|
| 30 |
+
"<|repo_name|>": 248064,
|
| 31 |
+
"<|video_pad|>": 248057,
|
| 32 |
+
"<|vision_end|>": 248054,
|
| 33 |
+
"<|vision_pad|>": 248055,
|
| 34 |
+
"<|vision_start|>": 248053
|
| 35 |
+
}
|
c1.png
ADDED
|
chat_template.jinja
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- set image_count = namespace(value=0) %}
|
| 2 |
+
{%- set video_count = namespace(value=0) %}
|
| 3 |
+
{%- macro render_content(content, do_vision_count, is_system_content=false) %}
|
| 4 |
+
{%- if content is string %}
|
| 5 |
+
{{- content }}
|
| 6 |
+
{%- elif content is iterable and content is not mapping %}
|
| 7 |
+
{%- for item in content %}
|
| 8 |
+
{%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
|
| 9 |
+
{%- if is_system_content %}
|
| 10 |
+
{{- raise_exception('System message cannot contain images.') }}
|
| 11 |
+
{%- endif %}
|
| 12 |
+
{%- if do_vision_count %}
|
| 13 |
+
{%- set image_count.value = image_count.value + 1 %}
|
| 14 |
+
{%- endif %}
|
| 15 |
+
{%- if add_vision_id %}
|
| 16 |
+
{{- 'Picture ' ~ image_count.value ~ ': ' }}
|
| 17 |
+
{%- endif %}
|
| 18 |
+
{{- '<|vision_start|><|image_pad|><|vision_end|>' }}
|
| 19 |
+
{%- elif 'video' in item or item.type == 'video' %}
|
| 20 |
+
{%- if is_system_content %}
|
| 21 |
+
{{- raise_exception('System message cannot contain videos.') }}
|
| 22 |
+
{%- endif %}
|
| 23 |
+
{%- if do_vision_count %}
|
| 24 |
+
{%- set video_count.value = video_count.value + 1 %}
|
| 25 |
+
{%- endif %}
|
| 26 |
+
{%- if add_vision_id %}
|
| 27 |
+
{{- 'Video ' ~ video_count.value ~ ': ' }}
|
| 28 |
+
{%- endif %}
|
| 29 |
+
{{- '<|vision_start|><|video_pad|><|vision_end|>' }}
|
| 30 |
+
{%- elif 'text' in item %}
|
| 31 |
+
{{- item.text }}
|
| 32 |
+
{%- else %}
|
| 33 |
+
{{- raise_exception('Unexpected item type in content.') }}
|
| 34 |
+
{%- endif %}
|
| 35 |
+
{%- endfor %}
|
| 36 |
+
{%- elif content is none or content is undefined %}
|
| 37 |
+
{{- '' }}
|
| 38 |
+
{%- else %}
|
| 39 |
+
{{- raise_exception('Unexpected content type.') }}
|
| 40 |
+
{%- endif %}
|
| 41 |
+
{%- endmacro %}
|
| 42 |
+
{%- if not messages %}
|
| 43 |
+
{{- raise_exception('No messages provided.') }}
|
| 44 |
+
{%- endif %}
|
| 45 |
+
{%- if tools and tools is iterable and tools is not mapping %}
|
| 46 |
+
{{- '<|im_start|>system\n' }}
|
| 47 |
+
{{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
|
| 48 |
+
{%- for tool in tools %}
|
| 49 |
+
{{- "\n" }}
|
| 50 |
+
{{- tool | tojson }}
|
| 51 |
+
{%- endfor %}
|
| 52 |
+
{{- "\n</tools>" }}
|
| 53 |
+
{{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
|
| 54 |
+
{%- if messages[0].role == 'system' %}
|
| 55 |
+
{%- set content = render_content(messages[0].content, false, true)|trim %}
|
| 56 |
+
{%- if content %}
|
| 57 |
+
{{- '\n\n' + content }}
|
| 58 |
+
{%- endif %}
|
| 59 |
+
{%- endif %}
|
| 60 |
+
{{- '<|im_end|>\n' }}
|
| 61 |
+
{%- else %}
|
| 62 |
+
{%- if messages[0].role == 'system' %}
|
| 63 |
+
{%- set content = render_content(messages[0].content, false, true)|trim %}
|
| 64 |
+
{{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
|
| 65 |
+
{%- endif %}
|
| 66 |
+
{%- endif %}
|
| 67 |
+
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
| 68 |
+
{%- for message in messages[::-1] %}
|
| 69 |
+
{%- set index = (messages|length - 1) - loop.index0 %}
|
| 70 |
+
{%- if ns.multi_step_tool and message.role == "user" %}
|
| 71 |
+
{%- set content = render_content(message.content, false)|trim %}
|
| 72 |
+
{%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
|
| 73 |
+
{%- set ns.multi_step_tool = false %}
|
| 74 |
+
{%- set ns.last_query_index = index %}
|
| 75 |
+
{%- endif %}
|
| 76 |
+
{%- endif %}
|
| 77 |
+
{%- endfor %}
|
| 78 |
+
{%- if ns.multi_step_tool %}
|
| 79 |
+
{{- raise_exception('No user query found in messages.') }}
|
| 80 |
+
{%- endif %}
|
| 81 |
+
{%- for message in messages %}
|
| 82 |
+
{%- set content = render_content(message.content, true)|trim %}
|
| 83 |
+
{%- if message.role == "system" %}
|
| 84 |
+
{%- if not loop.first %}
|
| 85 |
+
{{- raise_exception('System message must be at the beginning.') }}
|
| 86 |
+
{%- endif %}
|
| 87 |
+
{%- elif message.role == "user" %}
|
| 88 |
+
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
|
| 89 |
+
{%- elif message.role == "assistant" %}
|
| 90 |
+
{%- set reasoning_content = '' %}
|
| 91 |
+
{%- if message.reasoning_content is string %}
|
| 92 |
+
{%- set reasoning_content = message.reasoning_content %}
|
| 93 |
+
{%- else %}
|
| 94 |
+
{%- if '</think>' in content %}
|
| 95 |
+
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
| 96 |
+
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
|
| 97 |
+
{%- endif %}
|
| 98 |
+
{%- endif %}
|
| 99 |
+
{%- set reasoning_content = reasoning_content|trim %}
|
| 100 |
+
{%- if loop.index0 > ns.last_query_index %}
|
| 101 |
+
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
|
| 102 |
+
{%- else %}
|
| 103 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 104 |
+
{%- endif %}
|
| 105 |
+
{%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
|
| 106 |
+
{%- for tool_call in message.tool_calls %}
|
| 107 |
+
{%- if tool_call.function is defined %}
|
| 108 |
+
{%- set tool_call = tool_call.function %}
|
| 109 |
+
{%- endif %}
|
| 110 |
+
{%- if loop.first %}
|
| 111 |
+
{%- if content|trim %}
|
| 112 |
+
{{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
|
| 113 |
+
{%- else %}
|
| 114 |
+
{{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
|
| 115 |
+
{%- endif %}
|
| 116 |
+
{%- else %}
|
| 117 |
+
{{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
|
| 118 |
+
{%- endif %}
|
| 119 |
+
{%- if tool_call.arguments is defined %}
|
| 120 |
+
{%- for args_name, args_value in tool_call.arguments|items %}
|
| 121 |
+
{{- '<parameter=' + args_name + '>\n' }}
|
| 122 |
+
{%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
|
| 123 |
+
{{- args_value }}
|
| 124 |
+
{{- '\n</parameter>\n' }}
|
| 125 |
+
{%- endfor %}
|
| 126 |
+
{%- endif %}
|
| 127 |
+
{{- '</function>\n</tool_call>' }}
|
| 128 |
+
{%- endfor %}
|
| 129 |
+
{%- endif %}
|
| 130 |
+
{{- '<|im_end|>\n' }}
|
| 131 |
+
{%- elif message.role == "tool" %}
|
| 132 |
+
{%- if loop.previtem and loop.previtem.role != "tool" %}
|
| 133 |
+
{{- '<|im_start|>user' }}
|
| 134 |
+
{%- endif %}
|
| 135 |
+
{{- '\n<tool_response>\n' }}
|
| 136 |
+
{{- content }}
|
| 137 |
+
{{- '\n</tool_response>' }}
|
| 138 |
+
{%- if not loop.last and loop.nextitem.role != "tool" %}
|
| 139 |
+
{{- '<|im_end|>\n' }}
|
| 140 |
+
{%- elif loop.last %}
|
| 141 |
+
{{- '<|im_end|>\n' }}
|
| 142 |
+
{%- endif %}
|
| 143 |
+
{%- else %}
|
| 144 |
+
{{- raise_exception('Unexpected message role.') }}
|
| 145 |
+
{%- endif %}
|
| 146 |
+
{%- endfor %}
|
| 147 |
+
{%- if add_generation_prompt %}
|
| 148 |
+
{{- '<|im_start|>assistant\n' }}
|
| 149 |
+
{%- if enable_thinking is defined and enable_thinking is false %}
|
| 150 |
+
{{- '<think>\n\n</think>\n\n' }}
|
| 151 |
+
{%- else %}
|
| 152 |
+
{{- '<think>\n' }}
|
| 153 |
+
{%- endif %}
|
| 154 |
+
{%- endif %}
|
config.json
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Qwen3_5MoeForConditionalGeneration"
|
| 4 |
+
],
|
| 5 |
+
"image_token_id": 248056,
|
| 6 |
+
"model_type": "qwen3_5_moe",
|
| 7 |
+
"text_config": {
|
| 8 |
+
"attention_bias": false,
|
| 9 |
+
"attention_dropout": 0.0,
|
| 10 |
+
"attn_output_gate": true,
|
| 11 |
+
"bos_token_id": null,
|
| 12 |
+
"dtype": "bfloat16",
|
| 13 |
+
"eos_token_id": 248044,
|
| 14 |
+
"full_attention_interval": 4,
|
| 15 |
+
"head_dim": 256,
|
| 16 |
+
"hidden_act": "silu",
|
| 17 |
+
"hidden_size": 2048,
|
| 18 |
+
"initializer_range": 0.02,
|
| 19 |
+
"layer_types": [
|
| 20 |
+
"linear_attention",
|
| 21 |
+
"linear_attention",
|
| 22 |
+
"linear_attention",
|
| 23 |
+
"full_attention",
|
| 24 |
+
"linear_attention",
|
| 25 |
+
"linear_attention",
|
| 26 |
+
"linear_attention",
|
| 27 |
+
"full_attention",
|
| 28 |
+
"linear_attention",
|
| 29 |
+
"linear_attention",
|
| 30 |
+
"linear_attention",
|
| 31 |
+
"full_attention",
|
| 32 |
+
"linear_attention",
|
| 33 |
+
"linear_attention",
|
| 34 |
+
"linear_attention",
|
| 35 |
+
"full_attention",
|
| 36 |
+
"linear_attention",
|
| 37 |
+
"linear_attention",
|
| 38 |
+
"linear_attention",
|
| 39 |
+
"full_attention",
|
| 40 |
+
"linear_attention",
|
| 41 |
+
"linear_attention",
|
| 42 |
+
"linear_attention",
|
| 43 |
+
"full_attention",
|
| 44 |
+
"linear_attention",
|
| 45 |
+
"linear_attention",
|
| 46 |
+
"linear_attention",
|
| 47 |
+
"full_attention",
|
| 48 |
+
"linear_attention",
|
| 49 |
+
"linear_attention",
|
| 50 |
+
"linear_attention",
|
| 51 |
+
"full_attention",
|
| 52 |
+
"linear_attention",
|
| 53 |
+
"linear_attention",
|
| 54 |
+
"linear_attention",
|
| 55 |
+
"full_attention",
|
| 56 |
+
"linear_attention",
|
| 57 |
+
"linear_attention",
|
| 58 |
+
"linear_attention",
|
| 59 |
+
"full_attention"
|
| 60 |
+
],
|
| 61 |
+
"linear_conv_kernel_dim": 4,
|
| 62 |
+
"linear_key_head_dim": 128,
|
| 63 |
+
"linear_num_key_heads": 16,
|
| 64 |
+
"linear_num_value_heads": 32,
|
| 65 |
+
"linear_value_head_dim": 128,
|
| 66 |
+
"mamba_ssm_dtype": "float32",
|
| 67 |
+
"max_position_embeddings": 262144,
|
| 68 |
+
"mlp_only_layers": [],
|
| 69 |
+
"model_type": "qwen3_5_moe_text",
|
| 70 |
+
"moe_intermediate_size": 512,
|
| 71 |
+
"mtp_num_hidden_layers": 1,
|
| 72 |
+
"mtp_use_dedicated_embeddings": false,
|
| 73 |
+
"num_attention_heads": 16,
|
| 74 |
+
"num_experts": 256,
|
| 75 |
+
"num_experts_per_tok": 8,
|
| 76 |
+
"num_hidden_layers": 40,
|
| 77 |
+
"num_key_value_heads": 2,
|
| 78 |
+
"output_router_logits": false,
|
| 79 |
+
"pad_token_id": null,
|
| 80 |
+
"partial_rotary_factor": 0.25,
|
| 81 |
+
"rms_norm_eps": 1e-06,
|
| 82 |
+
"rope_parameters": {
|
| 83 |
+
"mrope_interleaved": true,
|
| 84 |
+
"mrope_section": [
|
| 85 |
+
11,
|
| 86 |
+
11,
|
| 87 |
+
10
|
| 88 |
+
],
|
| 89 |
+
"partial_rotary_factor": 0.25,
|
| 90 |
+
"rope_theta": 10000000,
|
| 91 |
+
"rope_type": "default"
|
| 92 |
+
},
|
| 93 |
+
"router_aux_loss_coef": 0.001,
|
| 94 |
+
"shared_expert_intermediate_size": 512,
|
| 95 |
+
"tie_word_embeddings": false,
|
| 96 |
+
"use_cache": true,
|
| 97 |
+
"vocab_size": 248320
|
| 98 |
+
},
|
| 99 |
+
"tie_word_embeddings": false,
|
| 100 |
+
"transformers_version": "5.4.0",
|
| 101 |
+
"video_token_id": 248057,
|
| 102 |
+
"vision_config": {
|
| 103 |
+
"deepstack_visual_indexes": [],
|
| 104 |
+
"depth": 27,
|
| 105 |
+
"hidden_act": "gelu_pytorch_tanh",
|
| 106 |
+
"hidden_size": 1152,
|
| 107 |
+
"in_channels": 3,
|
| 108 |
+
"initializer_range": 0.02,
|
| 109 |
+
"intermediate_size": 4304,
|
| 110 |
+
"model_type": "qwen3_5_moe",
|
| 111 |
+
"num_heads": 16,
|
| 112 |
+
"num_position_embeddings": 2304,
|
| 113 |
+
"out_hidden_size": 2048,
|
| 114 |
+
"patch_size": 16,
|
| 115 |
+
"spatial_merge_size": 2,
|
| 116 |
+
"temporal_patch_size": 2
|
| 117 |
+
},
|
| 118 |
+
"vision_end_token_id": 248054,
|
| 119 |
+
"vision_start_token_id": 248053
|
| 120 |
+
}
|
f1.png
ADDED
|
f2.png
ADDED
|
f3.png
ADDED
|
generation_config.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 248044,
|
| 3 |
+
"do_sample": true,
|
| 4 |
+
"eos_token_id": [
|
| 5 |
+
248046,
|
| 6 |
+
248044
|
| 7 |
+
],
|
| 8 |
+
"pad_token_id": 248044,
|
| 9 |
+
"temperature": 1.0,
|
| 10 |
+
"top_k": 20,
|
| 11 |
+
"top_p": 0.95,
|
| 12 |
+
"transformers_version": "4.57.0.dev0"
|
| 13 |
+
}
|
info.png
ADDED
|
Git LFS Details
|
m1.png
ADDED
|
m2.png
ADDED
|
m3.png
ADDED
|
merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
model.safetensors-00001-of-00014.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1b9f16fdae2411c0c50dbc7a5dacaebd033b28f3fd083e708f53cda3420e7247
|
| 3 |
+
size 5368709816
|
model.safetensors-00002-of-00014.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:79f06bc069d273b37ae7e1ef21eb24d8641ac74227b6139b00af55addf27e9ee
|
| 3 |
+
size 5368709808
|
model.safetensors-00003-of-00014.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2cb56343806fd845eb2ca3ff5c46b1de409e05dc09d03edb86f7fb2d752b4f3d
|
| 3 |
+
size 5368709808
|
model.safetensors-00004-of-00014.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:39a39c799e52db2ca3d6123dad077b7dbfff3dbad4bcc9cbf66502db7be650a3
|
| 3 |
+
size 5368709816
|
model.safetensors-00005-of-00014.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1a3b31ebb04bffa60d0ad5d9f2973d476d88959bb842ce003f068a6a9c5d4776
|
| 3 |
+
size 5368709816
|
model.safetensors-00006-of-00014.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:af5f17d292df6b200ee928d7dac30a66eab3e8f6d5d88a84fe1d27859d4a64fe
|
| 3 |
+
size 5368709816
|
model.safetensors-00007-of-00014.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52885dbecd4e8e2e52e6cd50a1693a640ce622f219c35bf317aad89f72d956fc
|
| 3 |
+
size 5368709808
|
model.safetensors-00008-of-00014.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:43f232ba029dc4ca5ad994e38913c6ff74f110198e005ee2f35e42f9ce438cb3
|
| 3 |
+
size 5368709808
|
model.safetensors-00009-of-00014.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8fa6a375dd539822b19ad18ad002f8acedd4920ff770bc284cfb428a213c9f17
|
| 3 |
+
size 5255463936
|
model.safetensors-00010-of-00014.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d89e80aaf84c0b8a442162fc97c38fe7ce85d11ae728f67aa449350da86f8212
|
| 3 |
+
size 5368710456
|
model.safetensors-00011-of-00014.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5b774025e5ed974256ce40498bfe4e1ad9ae8a4f13f9e9145666d24ea7404cae
|
| 3 |
+
size 5368710464
|
model.safetensors-00012-of-00014.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d0f0bd7a5f09e6e05aee9a6ef5ba97fe678b0e19b891a81969682b0bdc9bb5c7
|
| 3 |
+
size 5368710456
|
model.safetensors-00013-of-00014.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:164b7cd0c0dc86437c8970566d6bebfc6a5f6ff0e111fb6d7ce97e978a62d33e
|
| 3 |
+
size 5367839544
|
model.safetensors-00014-of-00014.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b1807b29c79a2e228820e4535a7375e63dfc4916bdfb9d661311d6089a37085b
|
| 3 |
+
size 2224755616
|
model.safetensors.index.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
preprocessor_config.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"crop_size": null,
|
| 3 |
+
"data_format": "channels_first",
|
| 4 |
+
"default_to_square": true,
|
| 5 |
+
"device": null,
|
| 6 |
+
"disable_grouping": null,
|
| 7 |
+
"do_center_crop": null,
|
| 8 |
+
"do_convert_rgb": true,
|
| 9 |
+
"do_normalize": true,
|
| 10 |
+
"do_pad": null,
|
| 11 |
+
"do_rescale": true,
|
| 12 |
+
"do_resize": true,
|
| 13 |
+
"image_mean": [
|
| 14 |
+
0.5,
|
| 15 |
+
0.5,
|
| 16 |
+
0.5
|
| 17 |
+
],
|
| 18 |
+
"image_processor_type": "Qwen2VLImageProcessorFast",
|
| 19 |
+
"image_std": [
|
| 20 |
+
0.5,
|
| 21 |
+
0.5,
|
| 22 |
+
0.5
|
| 23 |
+
],
|
| 24 |
+
"input_data_format": null,
|
| 25 |
+
"max_pixels": null,
|
| 26 |
+
"merge_size": 2,
|
| 27 |
+
"min_pixels": null,
|
| 28 |
+
"pad_size": null,
|
| 29 |
+
"patch_size": 16,
|
| 30 |
+
"processor_class": "Qwen3VLProcessor",
|
| 31 |
+
"resample": 3,
|
| 32 |
+
"rescale_factor": 0.00392156862745098,
|
| 33 |
+
"return_tensors": null,
|
| 34 |
+
"size": {
|
| 35 |
+
"longest_edge": 16777216,
|
| 36 |
+
"shortest_edge": 65536
|
| 37 |
+
},
|
| 38 |
+
"temporal_patch_size": 2
|
| 39 |
+
}
|
qwen.png
ADDED
|
Git LFS Details
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<|im_start|>",
|
| 4 |
+
"<|im_end|>",
|
| 5 |
+
"<|object_ref_start|>",
|
| 6 |
+
"<|object_ref_end|>",
|
| 7 |
+
"<|box_start|>",
|
| 8 |
+
"<|box_end|>",
|
| 9 |
+
"<|quad_start|>",
|
| 10 |
+
"<|quad_end|>",
|
| 11 |
+
"<|vision_start|>",
|
| 12 |
+
"<|vision_end|>",
|
| 13 |
+
"<|vision_pad|>",
|
| 14 |
+
"<|image_pad|>",
|
| 15 |
+
"<|video_pad|>"
|
| 16 |
+
],
|
| 17 |
+
"audio_bos_token": "<|audio_start|>",
|
| 18 |
+
"audio_eos_token": "<|audio_end|>",
|
| 19 |
+
"audio_token": "<|audio_pad|>",
|
| 20 |
+
"eos_token": {
|
| 21 |
+
"content": "<|im_end|>",
|
| 22 |
+
"lstrip": false,
|
| 23 |
+
"normalized": false,
|
| 24 |
+
"rstrip": false,
|
| 25 |
+
"single_word": false
|
| 26 |
+
},
|
| 27 |
+
"image_token": "<|image_pad|>",
|
| 28 |
+
"pad_token": {
|
| 29 |
+
"content": "<|endoftext|>",
|
| 30 |
+
"lstrip": false,
|
| 31 |
+
"normalized": false,
|
| 32 |
+
"rstrip": false,
|
| 33 |
+
"single_word": false
|
| 34 |
+
},
|
| 35 |
+
"video_token": "<|video_pad|>",
|
| 36 |
+
"vision_bos_token": "<|vision_start|>",
|
| 37 |
+
"vision_eos_token": "<|vision_end|>"
|
| 38 |
+
}
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:87a7830d63fcf43bf241c3c5242e96e62dd3fdc29224ca26fed8ea333db72de4
|
| 3 |
+
size 19989343
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"248044": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"248045": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"248046": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"248047": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"248048": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"248049": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"248050": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"248051": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"248052": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"248053": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"248054": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"248055": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"248056": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"248057": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"248058": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"248059": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"248060": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"248061": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"248062": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"248063": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"248064": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"248065": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
},
|
| 181 |
+
"248066": {
|
| 182 |
+
"content": "<tool_response>",
|
| 183 |
+
"lstrip": false,
|
| 184 |
+
"normalized": false,
|
| 185 |
+
"rstrip": false,
|
| 186 |
+
"single_word": false,
|
| 187 |
+
"special": false
|
| 188 |
+
},
|
| 189 |
+
"248067": {
|
| 190 |
+
"content": "</tool_response>",
|
| 191 |
+
"lstrip": false,
|
| 192 |
+
"normalized": false,
|
| 193 |
+
"rstrip": false,
|
| 194 |
+
"single_word": false,
|
| 195 |
+
"special": false
|
| 196 |
+
},
|
| 197 |
+
"248068": {
|
| 198 |
+
"content": "<think>",
|
| 199 |
+
"lstrip": false,
|
| 200 |
+
"normalized": false,
|
| 201 |
+
"rstrip": false,
|
| 202 |
+
"single_word": false,
|
| 203 |
+
"special": false
|
| 204 |
+
},
|
| 205 |
+
"248069": {
|
| 206 |
+
"content": "</think>",
|
| 207 |
+
"lstrip": false,
|
| 208 |
+
"normalized": false,
|
| 209 |
+
"rstrip": false,
|
| 210 |
+
"single_word": false,
|
| 211 |
+
"special": false
|
| 212 |
+
},
|
| 213 |
+
"248070": {
|
| 214 |
+
"content": "<|audio_start|>",
|
| 215 |
+
"lstrip": false,
|
| 216 |
+
"normalized": false,
|
| 217 |
+
"rstrip": false,
|
| 218 |
+
"single_word": false,
|
| 219 |
+
"special": true
|
| 220 |
+
},
|
| 221 |
+
"248071": {
|
| 222 |
+
"content": "<|audio_end|>",
|
| 223 |
+
"lstrip": false,
|
| 224 |
+
"normalized": false,
|
| 225 |
+
"rstrip": false,
|
| 226 |
+
"single_word": false,
|
| 227 |
+
"special": true
|
| 228 |
+
},
|
| 229 |
+
"248072": {
|
| 230 |
+
"content": "<tts_pad>",
|
| 231 |
+
"lstrip": false,
|
| 232 |
+
"normalized": false,
|
| 233 |
+
"rstrip": false,
|
| 234 |
+
"single_word": false,
|
| 235 |
+
"special": true
|
| 236 |
+
},
|
| 237 |
+
"248073": {
|
| 238 |
+
"content": "<tts_text_bos>",
|
| 239 |
+
"lstrip": false,
|
| 240 |
+
"normalized": false,
|
| 241 |
+
"rstrip": false,
|
| 242 |
+
"single_word": false,
|
| 243 |
+
"special": true
|
| 244 |
+
},
|
| 245 |
+
"248074": {
|
| 246 |
+
"content": "<tts_text_eod>",
|
| 247 |
+
"lstrip": false,
|
| 248 |
+
"normalized": false,
|
| 249 |
+
"rstrip": false,
|
| 250 |
+
"single_word": false,
|
| 251 |
+
"special": true
|
| 252 |
+
},
|
| 253 |
+
"248075": {
|
| 254 |
+
"content": "<tts_text_bos_single>",
|
| 255 |
+
"lstrip": false,
|
| 256 |
+
"normalized": false,
|
| 257 |
+
"rstrip": false,
|
| 258 |
+
"single_word": false,
|
| 259 |
+
"special": true
|
| 260 |
+
},
|
| 261 |
+
"248076": {
|
| 262 |
+
"content": "<|audio_pad|>",
|
| 263 |
+
"lstrip": false,
|
| 264 |
+
"normalized": false,
|
| 265 |
+
"rstrip": false,
|
| 266 |
+
"single_word": false,
|
| 267 |
+
"special": true
|
| 268 |
+
}
|
| 269 |
+
},
|
| 270 |
+
"additional_special_tokens": [
|
| 271 |
+
"<|im_start|>",
|
| 272 |
+
"<|im_end|>",
|
| 273 |
+
"<|object_ref_start|>",
|
| 274 |
+
"<|object_ref_end|>",
|
| 275 |
+
"<|box_start|>",
|
| 276 |
+
"<|box_end|>",
|
| 277 |
+
"<|quad_start|>",
|
| 278 |
+
"<|quad_end|>",
|
| 279 |
+
"<|vision_start|>",
|
| 280 |
+
"<|vision_end|>",
|
| 281 |
+
"<|vision_pad|>",
|
| 282 |
+
"<|image_pad|>",
|
| 283 |
+
"<|video_pad|>"
|
| 284 |
+
],
|
| 285 |
+
"audio_bos_token": "<|audio_start|>",
|
| 286 |
+
"audio_eos_token": "<|audio_end|>",
|
| 287 |
+
"audio_token": "<|audio_pad|>",
|
| 288 |
+
"bos_token": null,
|
| 289 |
+
"clean_up_tokenization_spaces": false,
|
| 290 |
+
"eos_token": "<|im_end|>",
|
| 291 |
+
"errors": "replace",
|
| 292 |
+
"extra_special_tokens": {
|
| 293 |
+
"audio_bos_token": "<|audio_start|>",
|
| 294 |
+
"audio_eos_token": "<|audio_end|>",
|
| 295 |
+
"audio_token": "<|audio_pad|>",
|
| 296 |
+
"image_token": "<|image_pad|>",
|
| 297 |
+
"video_token": "<|video_pad|>",
|
| 298 |
+
"vision_bos_token": "<|vision_start|>",
|
| 299 |
+
"vision_eos_token": "<|vision_end|>"
|
| 300 |
+
},
|
| 301 |
+
"image_token": "<|image_pad|>",
|
| 302 |
+
"model_max_length": 262144,
|
| 303 |
+
"pad_token": "<|endoftext|>",
|
| 304 |
+
"pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
| 305 |
+
"processor_class": "Qwen3VLProcessor",
|
| 306 |
+
"split_special_tokens": false,
|
| 307 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 308 |
+
"unk_token": null,
|
| 309 |
+
"video_token": "<|video_pad|>",
|
| 310 |
+
"vision_bos_token": "<|vision_start|>",
|
| 311 |
+
"vision_eos_token": "<|vision_end|>"
|
| 312 |
+
}
|
video_preprocessor_config.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"crop_size": null,
|
| 3 |
+
"data_format": "channels_first",
|
| 4 |
+
"default_to_square": true,
|
| 5 |
+
"device": null,
|
| 6 |
+
"do_center_crop": null,
|
| 7 |
+
"do_convert_rgb": true,
|
| 8 |
+
"do_normalize": true,
|
| 9 |
+
"do_rescale": true,
|
| 10 |
+
"do_resize": true,
|
| 11 |
+
"do_sample_frames": true,
|
| 12 |
+
"fps": 2,
|
| 13 |
+
"image_mean": [
|
| 14 |
+
0.5,
|
| 15 |
+
0.5,
|
| 16 |
+
0.5
|
| 17 |
+
],
|
| 18 |
+
"image_std": [
|
| 19 |
+
0.5,
|
| 20 |
+
0.5,
|
| 21 |
+
0.5
|
| 22 |
+
],
|
| 23 |
+
"input_data_format": null,
|
| 24 |
+
"max_frames": 768,
|
| 25 |
+
"merge_size": 2,
|
| 26 |
+
"min_frames": 4,
|
| 27 |
+
"num_frames": null,
|
| 28 |
+
"pad_size": null,
|
| 29 |
+
"patch_size": 16,
|
| 30 |
+
"processor_class": "Qwen3VLProcessor",
|
| 31 |
+
"resample": 3,
|
| 32 |
+
"rescale_factor": 0.00392156862745098,
|
| 33 |
+
"return_metadata": false,
|
| 34 |
+
"size": {
|
| 35 |
+
"longest_edge": 25165824,
|
| 36 |
+
"shortest_edge": 4096
|
| 37 |
+
},
|
| 38 |
+
"temporal_patch_size": 2,
|
| 39 |
+
"video_metadata": null,
|
| 40 |
+
"video_processor_type": "Qwen3VLVideoProcessor"
|
| 41 |
+
}
|
vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|