FrankensteinSim courage17340 committed on
Commit
7ad8a8b
·
0 Parent(s):

Duplicate from moonshotai/Kimi-K2.7-Code

Browse files

Co-authored-by: Chu Wei <courage17340@users.noreply.huggingface.co>

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +37 -0
  2. LICENSE +27 -0
  3. README.md +335 -0
  4. THIRD_PARTY_NOTICES.md +43 -0
  5. chat_template.jinja +112 -0
  6. config.json +194 -0
  7. configuration_deepseek.py +214 -0
  8. configuration_kimi_k25.py +123 -0
  9. docs/deploy_guidance.md +93 -0
  10. figures/demo_video.mp4 +3 -0
  11. figures/kimi-logo.png +0 -0
  12. generation_config.json +4 -0
  13. kimi_k25_processor.py +165 -0
  14. kimi_k25_vision_processing.py +251 -0
  15. media_utils.py +368 -0
  16. model-00001-of-000064.safetensors +3 -0
  17. model-00002-of-000064.safetensors +3 -0
  18. model-00003-of-000064.safetensors +3 -0
  19. model-00004-of-000064.safetensors +3 -0
  20. model-00005-of-000064.safetensors +3 -0
  21. model-00006-of-000064.safetensors +3 -0
  22. model-00007-of-000064.safetensors +3 -0
  23. model-00008-of-000064.safetensors +3 -0
  24. model-00009-of-000064.safetensors +3 -0
  25. model-00010-of-000064.safetensors +3 -0
  26. model-00011-of-000064.safetensors +3 -0
  27. model-00012-of-000064.safetensors +3 -0
  28. model-00013-of-000064.safetensors +3 -0
  29. model-00014-of-000064.safetensors +3 -0
  30. model-00015-of-000064.safetensors +3 -0
  31. model-00016-of-000064.safetensors +3 -0
  32. model-00017-of-000064.safetensors +3 -0
  33. model-00018-of-000064.safetensors +3 -0
  34. model-00019-of-000064.safetensors +3 -0
  35. model-00020-of-000064.safetensors +3 -0
  36. model-00021-of-000064.safetensors +3 -0
  37. model-00022-of-000064.safetensors +3 -0
  38. model-00023-of-000064.safetensors +3 -0
  39. model-00024-of-000064.safetensors +3 -0
  40. model-00025-of-000064.safetensors +3 -0
  41. model-00026-of-000064.safetensors +3 -0
  42. model-00027-of-000064.safetensors +3 -0
  43. model-00028-of-000064.safetensors +3 -0
  44. model-00029-of-000064.safetensors +3 -0
  45. model-00030-of-000064.safetensors +3 -0
  46. model-00031-of-000064.safetensors +3 -0
  47. model-00032-of-000064.safetensors +3 -0
  48. model-00033-of-000064.safetensors +3 -0
  49. model-00034-of-000064.safetensors +3 -0
  50. model-00035-of-000064.safetensors +3 -0
.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model.safetensors.index.json filter=lfs diff=lfs merge=lfs -text
37
+ figures/demo_video.mp4 filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Modified MIT License
2
+
3
+ Copyright (c) 2026 Moonshot AI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the “Software”), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
23
+ Our only modification part is that, if the Software (or any derivative works
24
+ thereof) is used for any of your commercial products or services that have
25
+ more than 100 million monthly active users, or more than 20 million US dollars
26
+ (or equivalent in other currencies) in monthly revenue, you shall prominently
27
+ display "Kimi K2.7 Code" on the user interface of such product or service.
README.md ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - compressed-tensors
4
+ license: other
5
+ license_name: modified-mit
6
+ library_name: transformers
7
+ pipeline_tag: image-text-to-text
8
+ ---
9
+ <div align="center">
10
+ <picture>
11
+ <img src="figures/kimi-logo.png" width="30%" alt="Kimi K2.7 Code">
12
+ </picture>
13
+ </div>
14
+ <hr>
15
+ <div align="center" style="line-height:1">
16
+ <a href="https://www.kimi.com/code" target="_blank"><img alt="Chat" src="https://img.shields.io/badge/🤖-Kimi--Code-ff6b6b?color=1783ff&logoColor=white"/></a>
17
+ <a href="https://www.moonshot.ai" target="_blank"><img alt="Homepage" src="https://img.shields.io/badge/Homepage-Moonshot%20AI-white?logo=Kimi&logoColor=white"/></a>
18
+ </div>
19
+
20
+ <div align="center" style="line-height: 1;">
21
+ <a href="https://huggingface.co/moonshotai" target="_blank"><img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Moonshot%20AI-ffc107?color=ffc107&logoColor=white"/></a>
22
+ <a href="https://twitter.com/kimi_moonshot" target="_blank"><img alt="Twitter Follow" src="https://img.shields.io/badge/Twitter-Kimi.ai-white?logo=x&logoColor=white"/></a>
23
+ <a href="https://discord.gg/TYU2fdJykW" target="_blank"><img alt="Discord" src="https://img.shields.io/badge/Discord-Kimi.ai-white?logo=discord&logoColor=white"/></a>
24
+ <a href="https://modelscope.cn/organization/moonshotai" target="_blank"><img alt="ModelScope" src="https://img.shields.io/badge/ModelScope-Moonshot%20AI-white?labelColor=rgb(99%2C%2074%2C%20255)"/></a>
25
+ </div>
26
+ <div align="center" style="line-height: 1;">
27
+ <a href="https://huggingface.co/moonshotai/Kimi-K2.7-Code/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/badge/License-Modified_MIT-f5de53?&color=f5de53"/></a>
28
+ </div>
29
+
30
+
31
+
32
+
33
+ ## 1. Model Introduction
34
+
35
+ Kimi K2.7 Code is a coding-focused agentic model built upon Kimi K2.6. With substantial improvements on real-world long-horizon coding tasks, it strengthens end-to-end task completion across complex software engineering workflows while improving token efficiency, reducing thinking-token usage by approximately 30% compared with Kimi K2.6.
36
+
37
+ ## 2. Model Summary
38
+
39
+ <div align="center">
40
+
41
+
42
+ | | |
43
+ |:---:|:---:|
44
+ | **Architecture** | Mixture-of-Experts (MoE) |
45
+ | **Total Parameters** | 1T |
46
+ | **Activated Parameters** | 32B |
47
+ | **Number of Layers** (Dense layer included) | 61 |
48
+ | **Number of Dense Layers** | 1 |
49
+ | **Attention Hidden Dimension** | 7168 |
50
+ | **MoE Hidden Dimension** (per Expert) | 2048 |
51
+ | **Number of Attention Heads** | 64 |
52
+ | **Number of Experts** | 384 |
53
+ | **Selected Experts per Token** | 8 |
54
+ | **Number of Shared Experts** | 1 |
55
+ | **Vocabulary Size** | 160K |
56
+ | **Context Length** | 256K |
57
+ | **Attention Mechanism** | MLA |
58
+ | **Activation Function** | SwiGLU |
59
+ | **Vision Encoder** | MoonViT |
60
+ | **Parameters of Vision Encoder** | 400M |
61
+ </div>
62
+
63
+ ## 3. Evaluation Results
64
+
65
+ <div align="center">
66
+ <table>
67
+ <thead>
68
+ <tr>
69
+ <th align="center">Benchmark</th>
70
+ <th align="center">Kimi K2.6</th>
71
+ <th align="center">Kimi K2.7 Code</th>
72
+ <th align="center">GPT-5.5</th>
73
+ <th align="center">Claude Opus 4.8</th>
74
+ </tr>
75
+ </thead>
76
+ <tbody>
77
+ <tr>
78
+ <td align="center" colspan=5><strong>Coding</strong></td>
79
+ </tr>
80
+ <tr>
81
+ <td align="center" style="vertical-align: middle">Kimi Code Bench v2</td>
82
+ <td align="center" style="vertical-align: middle">50.9</td>
83
+ <td align="center" style="vertical-align: middle">62.0</td>
84
+ <td align="center" style="vertical-align: middle">69.0</td>
85
+ <td align="center" style="vertical-align: middle">67.4</td>
86
+ </tr>
87
+ <tr>
88
+ <td align="center" style="vertical-align: middle">Program Bench</td>
89
+ <td align="center" style="vertical-align: middle">48.3</td>
90
+ <td align="center" style="vertical-align: middle">53.6</td>
91
+ <td align="center" style="vertical-align: middle">69.1</td>
92
+ <td align="center" style="vertical-align: middle">63.8</td>
93
+ </tr>
94
+ <tr>
95
+ <td align="center" style="vertical-align: middle">MLS Bench Lite</td>
96
+ <td align="center" style="vertical-align: middle">26.7</td>
97
+ <td align="center" style="vertical-align: middle">35.1</td>
98
+ <td align="center" style="vertical-align: middle">35.5</td>
99
+ <td align="center" style="vertical-align: middle">42.8</td>
100
+ </tr>
101
+ <tr>
102
+ <td align="center" colspan=5><strong>Agentic</strong></td>
103
+ </tr>
104
+ <tr>
105
+ <td align="center" style="vertical-align: middle">Kimi Claw 24/7 Bench</td>
106
+ <td align="center" style="vertical-align: middle">42.9</td>
107
+ <td align="center" style="vertical-align: middle">46.9</td>
108
+ <td align="center" style="vertical-align: middle">52.8</td>
109
+ <td align="center" style="vertical-align: middle">50.4</td>
110
+ </tr>
111
+ <tr>
112
+ <td align="center" style="vertical-align: middle">MCP Atlas</td>
113
+ <td align="center" style="vertical-align: middle">69.4</td>
114
+ <td align="center" style="vertical-align: middle">76.0</td>
115
+ <td align="center" style="vertical-align: middle">79.4</td>
116
+ <td align="center" style="vertical-align: middle">81.3</td>
117
+ </tr>
118
+ <tr>
119
+ <td align="center" style="vertical-align: middle">MCP Mark Verified</td>
120
+ <td align="center" style="vertical-align: middle">72.8</td>
121
+ <td align="center" style="vertical-align: middle">81.1</td>
122
+ <td align="center" style="vertical-align: middle">92.9</td>
123
+ <td align="center" style="vertical-align: middle">76.4</td>
124
+ </tr>
125
+ </tbody>
126
+ </table>
127
+ </div>
128
+
129
+ <details>
130
+ <summary><b>Footnotes</b></summary>
131
+
132
+ 1. **General Testing Details**
133
+ - Unless stated otherwise, Kimi K2.7 Code and K2.6 were tested with thinking mode enabled via Kimi Code CLI at temperature = 1.0, top-p = 0.95, and a 262,144-token context length; GPT-5.5 ran in Codex with xhigh mode, and Opus 4.8 in Claude Code with xhigh mode. Aside from these differences, all benchmarks were evaluated under the same conditions.
134
+ 2. **Coding Benchmarks**
135
+ - Kimi Code Bench v2 is our in-house benchmark designed to evaluate coding agents on realistic tasks. It comprises diverse software engineering tasks across 10+ mainstream programming languages and a full production tech stack, covering tasks drawn from internal engineering use cases, production incidents, and real-world open-source projects, with emphasis on backend services, infrastructure, performance engineering, systems programming, security, frontend development, and ML/data engineering.
136
+ - [Program Bench](https://programbench.com/) evaluates code-generation agents by asking them to recreate a program’s behavior from only a compiled binary and its documentation. It spans 200 tasks, from small CLI tools to large systems like FFmpeg and SQLite. Submissions are judged against over 248,000 fuzz-generated behavioral tests. In each task, the agent is given an executable and its documentation, but no source code, decompilation, or internet access. It must choose its own implementation language, build the full program from scratch, and pass a behavioral test suite comparing its output against the original binary.
137
+ - [MLS-Bench](https://mls-bench.com) evaluates whether AI systems can invent generalizable and scalable ML methods. MLS-Bench-Lite is the official 30-task subset of MLS-Bench, covering LLM pretraining and post-training, robotics, world models, computer vision, reinforcement learning, optimization, ML systems, AI for Science, and more. Agents are given 5 hours to explore before submitting their solutions. Opus 4.8 is evaluated with the max effort setting in Claude Code.
138
+ 3. **Agentic Benchmarks**
139
+ - Kimi Claw 24/7 Bench is our in-house benchmark for evaluating long-horizon agentic performance in persistent, multi-day coworking tasks. It spans 17 professional scenarios across 610 evaluation points, covering domains such as software engineering, ML research, recruiting, trading, and marketing. All tasks are executed through the OpenClaw harness. The final score is the average pass rate across all evaluation points, and is averaged over 3 runs.
140
+ - [MCP-Atlas](https://labs.scale.com/leaderboard/mcp_atlas) evaluates LLM performance on realistic tool-use tasks through the scalable MCPs. We followed the official MCP-Atlas evaluation configuration with a 100 tool-call budget, and with 32k max tokens per step. The final result is averaged over 3 runs.
141
+ - MCPMark-Verified is a human-verified edition of [MCPMark](https://mcpmark.ai/), a benchmark for evaluating MCP tool use across five real server environments — Notion, GitHub, Filesystem, Postgres, and Playwright. Each task has been re-checked by our team and the official benchmark team, and it will be open-sourced soon. We followed the official MCPMark evaluation configuration with a 100-step tool-call budget and 32k max tokens per step. The final result is averaged over 3 runs.
142
+
143
+ </details>
144
+
145
+
146
+ ## 4. Native INT4 Quantization
147
+ Kimi-K2.7-Code adopts the same native int4 quantization method as [Kimi-K2-Thinking](https://huggingface.co/moonshotai/Kimi-K2-Thinking#4-native-int4-quantization).
148
+
149
+ ## 5. Deployment
150
+
151
+ > [!Note]
152
+ > You can access Kimi-K2.7-Code's API at https://platform.moonshot.ai, where we provide OpenAI/Anthropic-compatible APIs.
153
+ Currently, Kimi-K2.7-Code is recommended to run on the following inference engines:
154
+ * vLLM
155
+ * SGLang
156
+ * KTransformers
157
+
158
+ Kimi-K2.7-Code has the same architecture as Kimi-K2.5/Kimi-K2.6, and the deployment method can be directly reused.
159
+
160
+ The version requirement for `transformers` is `>=4.57.1, <5.0.0`.
161
+
162
+ Deployment examples can be found in the [Model Deployment Guide](docs/deploy_guidance.md).
163
+
164
+
165
+ ---
166
+ ## 6. Model Usage
167
+
168
+ The usage demos below demonstrate how to call our official API. Note that Kimi-K2.7-Code forces both `thinking` and `preserve_thinking` to `True`.
169
+
170
+ For third-party APIs deployed with vLLM or SGLang, please note that:
171
+ > [!Note]
172
+ > - Chat with video content is an experimental feature and is only supported in our official API for now.
173
+ >
174
+ > - The recommended `temperature` is `1.0` for Thinking mode.
175
+ >
176
+ > - The recommended `top_p` is `0.95`.
177
+ >
178
+ > - Instant mode is not supported.
179
+
180
+ ### Chat Completion
181
+
182
+ This is a simple chat completion script which shows how to call K2.7-Code API in Thinking mode.
183
+
184
+ ```python
185
+ import openai
186
+ import base64
187
+ import requests
188
+ def simple_chat(client: openai.OpenAI, model_name: str):
189
+ messages = [
190
+ {'role': 'system', 'content': 'You are Kimi, an AI assistant created by Moonshot AI.'},
191
+ {
192
+ 'role': 'user',
193
+ 'content': [
194
+ {'type': 'text', 'text': 'which one is bigger, 9.11 or 9.9? think carefully.'}
195
+ ],
196
+ },
197
+ ]
198
+ response = client.chat.completions.create(
199
+ model=model_name, messages=messages, stream=False, max_tokens=4096
200
+ )
201
+ print('====== Below is reasoning content in Thinking Mode ======')
202
+ print(f'reasoning content: {response.choices[0].message.reasoning}')
203
+ print('====== Below is response in Thinking Mode ======')
204
+ print(f'response: {response.choices[0].message.content}')
205
+ ```
206
+
207
+
208
+ ### Chat Completion with visual content
209
+
210
+ K2.7-Code supports Image and Video input.
211
+
212
+ The following example demonstrates how to call K2.7-Code API with image input:
213
+
214
+ ```python
215
+ import openai
216
+ import base64
217
+ import requests
218
+
219
+ def chat_with_image(client: openai.OpenAI, model_name: str):
220
+ url = 'https://huggingface.co/moonshotai/Kimi-K2.7-Code/resolve/main/figures/kimi-logo.png'
221
+ image_base64 = base64.b64encode(requests.get(url).content).decode()
222
+ messages = [
223
+ {
224
+ 'role': 'user',
225
+ 'content': [
226
+ {'type': 'text', 'text': 'Describe this image in detail.'},
227
+ {
228
+ 'type': 'image_url',
229
+ 'image_url': {'url': f'data:image/png;base64,{image_base64}'},
230
+ },
231
+ ],
232
+ }
233
+ ]
234
+
235
+ response = client.chat.completions.create(
236
+ model=model_name, messages=messages, stream=False, max_tokens=8192
237
+ )
238
+ print('====== Below is reasoning content in Thinking Mode ======')
239
+ print(f'reasoning content: {response.choices[0].message.reasoning}')
240
+ print('====== Below is response in Thinking Mode ======')
241
+ print(f'response: {response.choices[0].message.content}')
242
+ ```
243
+
244
+ The following example demonstrates how to call K2.7-Code API with video input:
245
+
246
+ ```python
247
+ import openai
248
+ import base64
249
+ import requests
250
+
251
+ def chat_with_video(client: openai.OpenAI, model_name:str):
252
+ url = 'https://huggingface.co/moonshotai/Kimi-K2.7-Code/resolve/main/figures/demo_video.mp4'
253
+ video_base64 = base64.b64encode(requests.get(url).content).decode()
254
+ messages = [
255
+ {
256
+ "role": "user",
257
+ "content": [
258
+ {"type": "text","text": "Describe the video in detail."},
259
+ {
260
+ "type": "video_url",
261
+ "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
262
+ },
263
+ ],
264
+ }
265
+ ]
266
+
267
+ response = client.chat.completions.create(model=model_name, messages=messages)
268
+ print('====== Below is reasoning content in Thinking Mode ======')
269
+ print(f'reasoning content: {response.choices[0].message.reasoning}')
270
+ print('====== Below is response in Thinking Mode ======')
271
+ print(f'response: {response.choices[0].message.content}')
272
+ ```
273
+
274
+ ### Preserve Thinking
275
+ Kimi K2.7 Code forces `preserve_thinking` mode, which retains full reasoning content across multi-turn interactions and enhances performance in coding agent scenarios.
276
+
277
+ This feature is enabled by default and can't be disabled. The following example demonstrates how to call K2.7-Code API in `preserve_thinking` mode:
278
+
279
+ ```python
280
+ def chat_with_preserve_thinking(client: openai.OpenAI, model_name: str):
281
+ messages = [
282
+ {
283
+ "role": "user",
284
+ "content": "Tell me three random numbers."
285
+ },
286
+ {
287
+ "role": "assistant",
288
+ "reasoning_content": "I'll start by listing five numbers: 473, 921, 235, 215, 222, and I'll tell you the first three.",
289
+ # Some API (e.g. vLLM) may not support reasoning_content, you can try reasoning instead
290
+ "content": "473, 921, 235"
291
+ },
292
+ {
293
+ "role": "user",
294
+ "content": "What are the other two numbers you have in mind?"
295
+ }
296
+ ]
297
+
298
+ response = client.chat.completions.create(
299
+ model=model_name,
300
+ messages=messages,
301
+ stream=False,
302
+ max_tokens=4096,
303
+ )
304
+ # the assistant should mention 215 and 222 that appear in the prior reasoning content
305
+ print(f"response: {response.choices[0].message.reasoning}")
306
+ return response.choices[0].message.content
307
+
308
+ ```
309
+
310
+ ### Interleaved Thinking and Multi-Step Tool Call
311
+
312
+ K2.7-Code shares the same design of Interleaved Thinking and Multi-Step Tool Call as K2 Thinking. For a usage example, please refer to the [K2 Thinking documentation](https://platform.moonshot.ai/docs/guide/use-kimi-k2-thinking-model#complete-example).
313
+
314
+ ### Coding Agent Framework
315
+
316
+ Kimi K2.7-Code works best with Kimi Code CLI as its agent framework — give it a try at https://www.kimi.com/code.
317
+
318
+
319
+ ---
320
+
321
+ ## 7. License
322
+
323
+ Both the code repository and the model weights are released under the [Modified MIT License](LICENSE).
324
+
325
+ ---
326
+
327
+ ## 8. Third Party Notices
328
+
329
+ See [THIRD PARTY NOTICES](THIRD_PARTY_NOTICES.md)
330
+
331
+ ---
332
+
333
+ ## 9. Contact Us
334
+
335
+ If you have any questions, please reach out at [support@moonshot.ai](mailto:support@moonshot.ai).
THIRD_PARTY_NOTICES.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # THIRD_PARTY_NOTICES
2
+
3
+ This file lists third-party software contained in Kimi-K2.7-Code along with their licenses, in compliance with the redistribution clauses of those licenses.
4
+
5
+ ---
6
+
7
+ ## 1. DeepSeek-V3
8
+
9
+ Our model architecture is DeepSeek-V3-like. Some of the modeling code is copied from the source repository.
10
+
11
+ - **Source Repository**
12
+ https://huggingface.co/deepseek-ai/DeepSeek-V3
13
+
14
+ - **Files / Directories Used**
15
+ - configuration_deepseek.py
16
+ - modeling_deepseek.py
17
+
18
+ - **License Type**
19
+ MIT License
20
+
21
+ - **Copyright Notice**
22
+ Copyright (c) 2023 DeepSeek
23
+
24
+ - **Full License Text**
25
+ ```
26
+ MIT License
27
+ Copyright (c) 2023 DeepSeek
28
+ Permission is hereby granted, free of charge, to any person obtaining a copy
29
+ of this software and associated documentation files (the "Software"), to deal
30
+ in the Software without restriction, including without limitation the rights
31
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
32
+ copies of the Software, and to permit persons to whom the Software is
33
+ furnished to do so, subject to the following conditions:
34
+ The above copyright notice and this permission notice shall be included in all
35
+ copies or substantial portions of the Software.
36
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
37
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
38
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
39
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
40
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
41
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
42
+ SOFTWARE.
43
+ ```
chat_template.jinja ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- macro render_content(msg) -%}
2
+ {%- set c = msg.get('content') -%}
3
+ {%- if c is string -%}
4
+ {{ c }}
5
+ {%- elif c is not none -%}
6
+ {% for content in c -%}
7
+ {% if content['type'] == 'image' or content['type'] == 'image_url' -%}
8
+ <|media_begin|>image<|media_content|><|media_pad|><|media_end|>
9
+ {% elif content['type'] == 'video' or content['type']== 'video_url'-%}
10
+ <|kimi_k25_video_placeholder|>
11
+ {% else -%}
12
+ {{ content['text'] }}
13
+ {%- endif -%}
14
+ {%- endfor -%}
15
+ {%- endif -%}
16
+ {%- endmacro -%}
17
+
18
+ {% macro set_roles(message) -%}
19
+ {%- set role_name = message.get('name') or message['role'] -%}
20
+ {%- if message['role'] == 'user' -%}
21
+ <|im_user|>{{role_name}}<|im_middle|>
22
+ {%- elif message['role'] == 'assistant' -%}
23
+ <|im_assistant|>{{role_name}}<|im_middle|>
24
+ {%- else -%}
25
+ <|im_system|>{{role_name}}<|im_middle|>
26
+ {%- endif -%}
27
+ {%- endmacro -%}
28
+
29
+
30
+ {%- macro render_toolcalls(message) -%}
31
+ <|tool_calls_section_begin|>
32
+ {%- for tool_call in message['tool_calls'] -%}
33
+ {%- set formatted_id = tool_call['id'] -%}
34
+ <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
35
+ {%- endfor -%}
36
+ <|tool_calls_section_end|>
37
+ {%- endmacro -%}
38
+
39
+
40
+ {%- set preserve_thinking = preserve_thinking | default(true) -%}
41
+ {# Find last non-tool-call assistant message. If preserve_thinking, keep -1 so hist is empty and all msgs use suffix (retain reasoning). #}
42
+ {%- set ns = namespace(last_non_tool_call_assistant_msg=-1) -%}
43
+ {%- if not preserve_thinking -%}
44
+ {%- for idx in range(messages|length-1, -1, -1) -%}
45
+ {%- if messages[idx]['role'] == 'assistant' and not messages[idx].get('tool_calls') -%}
46
+ {%- set ns.last_non_tool_call_assistant_msg = idx -%}
47
+ {%- break -%}
48
+ {%- endif -%}
49
+ {%- endfor -%}
50
+ {%- endif -%}
51
+
52
+ {# Split all messages into history & suffix; reasoning_content in the suffix should be preserved. #}
53
+ {%- set hist_msgs = messages[:ns.last_non_tool_call_assistant_msg+1] -%}
54
+ {%- set suffix_msgs = messages[ns.last_non_tool_call_assistant_msg+1:] -%}
55
+
56
+ {%- if tools -%}
57
+ {%- if tools_ts_str -%}
58
+ <|im_system|>tool_declare<|im_middle|>{{ tools_ts_str }}<|im_end|>
59
+ {%- else -%}
60
+ <|im_system|>tool_declare<|im_middle|>{{ tools | tojson(separators=(',', ':')) }}<|im_end|>
61
+ {%- endif -%}
62
+ {%- endif -%}
63
+
64
+
65
+ {%- for message in hist_msgs -%}
66
+ {{set_roles(message)}}
67
+ {%- if message['role'] == 'assistant' -%}
68
+ <think></think>{{render_content(message)}}
69
+ {%- if message.get('tool_calls') -%}
70
+ {{render_toolcalls(message)}}
71
+ {%- endif -%}
72
+ {%- elif message['role'] == 'tool' -%}
73
+ {%- set tool_call_id = message.tool_call_id -%}
74
+ ## Return of {{ tool_call_id }}
75
+ {{render_content(message)}}
76
+ {%- elif message['content'] is not none -%}
77
+ {{render_content(message)}}
78
+ {%- endif -%}
79
+ <|im_end|>
80
+ {%- endfor -%}
81
+
82
+ {%- for message in suffix_msgs -%}
83
+ {{set_roles(message)}}
84
+ {%- if message['role'] == 'assistant' -%}
85
+ {%- if thinking is defined and thinking is false and preserve_thinking is false -%}
86
+ <think></think>{{render_content(message)}}
87
+ {%- else -%}
88
+ {%- set rc = message.get('reasoning', message.get('reasoning_content', '')) -%}
89
+ <think>{{rc}}</think>{{render_content(message)}}
90
+ {%- endif -%}
91
+ {%- if message.get('tool_calls') -%}
92
+ {{render_toolcalls(message)}}
93
+ {%- endif -%}
94
+ {%- elif message['role'] == 'tool' -%}
95
+ {%- set tool_call_id = message.tool_call_id -%}
96
+ ## Return of {{ tool_call_id }}
97
+ {{render_content(message)}}
98
+ {%- elif message['content'] is not none -%}
99
+ {{render_content(message)}}
100
+ {%- endif -%}
101
+ <|im_end|>
102
+ {%- endfor -%}
103
+
104
+
105
+ {%- if add_generation_prompt -%}
106
+ <|im_assistant|>assistant<|im_middle|>
107
+ {%- if thinking is defined and thinking is false -%}
108
+ <think></think>
109
+ {%- else -%}
110
+ <think>
111
+ {%- endif -%}
112
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "KimiK25ForConditionalGeneration"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_kimi_k25.KimiK25Config",
7
+ "AutoModel": "modeling_kimi_k25.KimiK25ForConditionalGeneration",
8
+ "AutoModelForCausalLM": "modeling_kimi_k25.KimiK25ForConditionalGeneration"
9
+ },
10
+ "bos_token_id": 163584,
11
+ "dtype": "bfloat16",
12
+ "eos_token_id": 163586,
13
+ "ignore_index": -100,
14
+ "media_placeholder_token_id": 163605,
15
+ "model_type": "kimi_k25",
16
+ "pad_token_id": 163839,
17
+ "text_config": {
18
+ "_name_or_path": "",
19
+ "add_cross_attention": false,
20
+ "architectures": [
21
+ "DeepseekV3ForCausalLM"
22
+ ],
23
+ "attention_bias": false,
24
+ "attention_dropout": 0.0,
25
+ "auto_map": {
26
+ "AutoConfig": "configuration_deepseek.DeepseekV3Config",
27
+ "AutoModel": "modeling_deepseek.DeepseekV3Model",
28
+ "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
29
+ },
30
+ "aux_loss_alpha": 0.001,
31
+ "bad_words_ids": null,
32
+ "begin_suppress_tokens": null,
33
+ "bos_token_id": 163584,
34
+ "chunk_size_feed_forward": 0,
35
+ "cross_attention_hidden_size": null,
36
+ "decoder_start_token_id": null,
37
+ "diversity_penalty": 0.0,
38
+ "do_sample": false,
39
+ "dtype": "bfloat16",
40
+ "early_stopping": false,
41
+ "encoder_no_repeat_ngram_size": 0,
42
+ "eos_token_id": 163586,
43
+ "ep_size": 1,
44
+ "exponential_decay_length_penalty": null,
45
+ "finetuning_task": null,
46
+ "first_k_dense_replace": 1,
47
+ "forced_bos_token_id": null,
48
+ "forced_eos_token_id": null,
49
+ "hidden_act": "silu",
50
+ "hidden_size": 7168,
51
+ "id2label": {
52
+ "0": "LABEL_0",
53
+ "1": "LABEL_1"
54
+ },
55
+ "initializer_range": 0.02,
56
+ "intermediate_size": 18432,
57
+ "is_decoder": false,
58
+ "is_encoder_decoder": false,
59
+ "kv_lora_rank": 512,
60
+ "label2id": {
61
+ "LABEL_0": 0,
62
+ "LABEL_1": 1
63
+ },
64
+ "length_penalty": 1.0,
65
+ "max_length": 20,
66
+ "max_position_embeddings": 262144,
67
+ "min_length": 0,
68
+ "model_type": "kimi_k2",
69
+ "moe_intermediate_size": 2048,
70
+ "moe_layer_freq": 1,
71
+ "n_group": 1,
72
+ "n_routed_experts": 384,
73
+ "n_shared_experts": 1,
74
+ "no_repeat_ngram_size": 0,
75
+ "norm_topk_prob": true,
76
+ "num_attention_heads": 64,
77
+ "num_beam_groups": 1,
78
+ "num_beams": 1,
79
+ "num_experts_per_tok": 8,
80
+ "num_hidden_layers": 61,
81
+ "num_key_value_heads": 64,
82
+ "num_nextn_predict_layers": 0,
83
+ "num_return_sequences": 1,
84
+ "output_attentions": false,
85
+ "output_hidden_states": false,
86
+ "output_scores": false,
87
+ "pad_token_id": 163839,
88
+ "prefix": null,
89
+ "pretraining_tp": 1,
90
+ "problem_type": null,
91
+ "pruned_heads": {},
92
+ "q_lora_rank": 1536,
93
+ "qk_nope_head_dim": 128,
94
+ "qk_rope_head_dim": 64,
95
+ "quantization_config": {
96
+ "config_groups": {
97
+ "group_0": {
98
+ "input_activations": null,
99
+ "output_activations": null,
100
+ "targets": [
101
+ "Linear"
102
+ ],
103
+ "weights": {
104
+ "actorder": null,
105
+ "block_structure": null,
106
+ "dynamic": false,
107
+ "group_size": 32,
108
+ "num_bits": 4,
109
+ "observer": "minmax",
110
+ "observer_kwargs": {},
111
+ "strategy": "group",
112
+ "symmetric": true,
113
+ "type": "int"
114
+ }
115
+ }
116
+ },
117
+ "format": "pack-quantized",
118
+ "ignore": [
119
+ "re:.*self_attn.*",
120
+ "re:.*shared_experts.*",
121
+ "re:.*mlp\\.(gate|up|gate_up|down)_proj.*",
122
+ "re:.*lm_head.*",
123
+ "re:.*vision_tower.*",
124
+ "re:.*mm_projector.*"
125
+ ],
126
+ "kv_cache_scheme": null,
127
+ "quant_method": "compressed-tensors",
128
+ "quantization_status": "compressed"
129
+ },
130
+ "remove_invalid_values": false,
131
+ "repetition_penalty": 1.0,
132
+ "return_dict": true,
133
+ "return_dict_in_generate": false,
134
+ "rms_norm_eps": 1e-05,
135
+ "rope_scaling": {
136
+ "beta_fast": 32.0,
137
+ "beta_slow": 1.0,
138
+ "factor": 64.0,
139
+ "mscale": 1.0,
140
+ "mscale_all_dim": 1.0,
141
+ "original_max_position_embeddings": 4096,
142
+ "type": "yarn"
143
+ },
144
+ "rope_theta": 50000.0,
145
+ "routed_scaling_factor": 2.827,
146
+ "scoring_func": "sigmoid",
147
+ "sep_token_id": null,
148
+ "seq_aux": true,
149
+ "suppress_tokens": null,
150
+ "task_specific_params": null,
151
+ "temperature": 1.0,
152
+ "tf_legacy_loss": false,
153
+ "tie_encoder_decoder": false,
154
+ "tie_word_embeddings": false,
155
+ "tokenizer_class": null,
156
+ "top_k": 50,
157
+ "top_p": 1.0,
158
+ "topk_group": 1,
159
+ "topk_method": "noaux_tc",
160
+ "torchscript": false,
161
+ "transformers_version": "4.56.2",
162
+ "typical_p": 1.0,
163
+ "use_bfloat16": false,
164
+ "use_cache": true,
165
+ "v_head_dim": 128,
166
+ "vocab_size": 163840
167
+ },
168
+ "tie_word_embeddings": false,
169
+ "use_unified_vision_chunk": true,
170
+ "video_placeholder": "<|kimi_k25_video_placeholder|>",
171
+ "vision_config": {
172
+ "_attn_implementation": "flash_attention_2",
173
+ "init_pos_emb_height": 64,
174
+ "init_pos_emb_time": 4,
175
+ "init_pos_emb_width": 64,
176
+ "merge_kernel_size": [
177
+ 2,
178
+ 2
179
+ ],
180
+ "merge_type": "sd2_tpool",
181
+ "mm_hidden_size": 1152,
182
+ "mm_projector_type": "patchmerger",
183
+ "patch_size": 14,
184
+ "pos_emb_type": "divided_fixed",
185
+ "projector_hidden_act": "gelu",
186
+ "projector_ln_eps": 1e-05,
187
+ "text_hidden_size": 7168,
188
+ "video_attn_type": "spatial_temporal",
189
+ "vt_hidden_size": 1152,
190
+ "vt_intermediate_size": 4304,
191
+ "vt_num_attention_heads": 16,
192
+ "vt_num_hidden_layers": 27
193
+ }
194
+ }
configuration_deepseek.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copy from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/configuration_deepseek.py
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+ from transformers.utils import logging
5
+
6
+ logger = logging.get_logger(__name__)
7
+
8
+ DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
9
+
10
+
11
class DeepseekV3Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate a
    DeepSeek model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the DeepSeek-V3.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    The defaults listed below are the ones declared in the `__init__` signature of this class.

    Args:
        vocab_size (`int`, *optional*, defaults to 129280):
            Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented
            by the `inputs_ids` passed when calling [`DeepseekV3Model`].
        hidden_size (`int`, *optional*, defaults to 7168):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 18432):
            Dimension of the MLP representations.
        moe_intermediate_size (`int`, *optional*, defaults to 2048):
            Dimension of the MoE representations.
        num_hidden_layers (`int`, *optional*, defaults to 61):
            Number of hidden layers in the Transformer decoder.
        num_nextn_predict_layers (`int`, *optional*, defaults to 1):
            Number of nextn predict layers in the DeepSeekV3 Model.
        num_attention_heads (`int`, *optional*, defaults to 128):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 128):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be
            constructed by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). Passing `None` falls back to `num_attention_heads`.
        n_shared_experts (`int`, *optional*, defaults to 1):
            Number of shared experts, None means dense model.
        n_routed_experts (`int`, *optional*, defaults to 256):
            Number of routed experts, None means dense model.
        ep_size (`int`, *optional*, defaults to 1):
            Stored as-is; not described upstream. Presumably the expert-parallel group size -- confirm against
            the modeling code.
        routed_scaling_factor (`float`, *optional*, defaults to 2.5):
            Scaling factor of routed experts.
        kv_lora_rank (`int`, *optional*, defaults to 512):
            Stored as-is; not described upstream. Presumably the rank of the low-rank key/value projection --
            confirm against the modeling code.
        q_lora_rank (`int`, *optional*, defaults to 1536):
            Stored as-is; not described upstream. Presumably the rank of the low-rank query projection --
            confirm against the modeling code.
        qk_rope_head_dim (`int`, *optional*, defaults to 64):
            Stored as-is; not described upstream. Presumably the per-head query/key dimension that receives
            rotary embeddings -- confirm against the modeling code.
        v_head_dim (`int`, *optional*, defaults to 128):
            Stored as-is; not described upstream. Presumably the per-head value dimension -- confirm against the
            modeling code.
        qk_nope_head_dim (`int`, *optional*, defaults to 128):
            Stored as-is; not described upstream. Presumably the per-head query/key dimension without rotary
            embeddings -- confirm against the modeling code.
        topk_method (`str`, *optional*, defaults to `'noaux_tc'`):
            Topk method used in routed gate.
        n_group (`int`, *optional*, defaults to 8):
            Number of groups for routed experts.
        topk_group (`int`, *optional*, defaults to 4):
            Number of selected groups for each token (for each token, ensuring the selected experts are only
            within `topk_group` groups).
        num_experts_per_tok (`int`, *optional*, defaults to 8):
            Number of selected experts, None means dense model.
        moe_layer_freq (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
        first_k_dense_replace (`int`, *optional*, defaults to 3):
            Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
                                                            \--k dense layers--/
        norm_topk_prob (`bool`, *optional*, defaults to `True`):
            Whether to normalize the weights of the routed experts.
        scoring_func (`str`, *optional*, defaults to `'sigmoid'`):
            Method of computing expert weights.
        aux_loss_alpha (`float`, *optional*, defaults to 0.001):
            Auxiliary loss weight coefficient.
        seq_aux (`bool`, *optional*, defaults to `True`):
            Whether to compute the auxiliary loss for each individual sample.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 0):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This
            value is necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. The expected format is
            `{"type": strategy name, "factor": scaling factor, ...}`. The dictionary is stored unchanged; this
            class performs no validation of its keys or values. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    ```python
    >>> from transformers import DeepseekV3Model, DeepseekV3Config

    >>> # Initializing a Deepseek-V3 style configuration
    >>> configuration = DeepseekV3Config()

    >>> # Initializing a model from the configuration
    >>> model = DeepseekV3Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "deepseek_v3"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=129280,
        hidden_size=7168,
        intermediate_size=18432,
        moe_intermediate_size=2048,
        num_hidden_layers=61,
        num_nextn_predict_layers=1,
        num_attention_heads=128,
        num_key_value_heads=128,
        n_shared_experts=1,
        n_routed_experts=256,
        ep_size=1,
        routed_scaling_factor=2.5,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        topk_method='noaux_tc',
        n_group=8,
        topk_group=4,
        num_experts_per_tok=8,
        moe_layer_freq=1,
        first_k_dense_replace=3,
        norm_topk_prob=True,
        scoring_func='sigmoid',
        aux_loss_alpha=0.001,
        seq_aux=True,
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=1,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        # Model-specific fields are stored before delegating to PretrainedConfig,
        # which receives only the token ids, the embedding-tying flag and **kwargs.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        # for backward compatibility: an explicit None means "same as the
        # number of attention heads"
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
configuration_kimi_k25.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.configuration_utils import PretrainedConfig
2
+
3
+ try:
4
+ from configuration_deepseek import DeepseekV3Config
5
+ except ImportError:
6
+ from .configuration_deepseek import DeepseekV3Config
7
+
8
+
9
class KimiK25VisionConfig(PretrainedConfig):
    """Settings for the Kimi-K2.5 vision tower and its multimodal projector.

    The vision-tower and projector arguments are copied onto the instance
    under the same names. ``mm_hidden_size`` falls back to ``vt_hidden_size``
    when it is ``None``.

    NOTE(review): ``ignore_index``, ``media_placeholder_token_id``,
    ``pad_token_id``, ``use_unified_vision_chunk``, ``video_placeholder`` and
    ``**vision_config_kwargs`` are accepted but never stored, and
    ``PretrainedConfig.__init__`` is not called. This looks deliberate (it
    keeps the stored attribute set small) -- confirm before changing it.
    """

    def __init__(
            self,
            patch_size: int = 14,
            init_pos_emb_height: int = 64,
            init_pos_emb_width: int = 64,
            init_pos_emb_time: int = 4,
            pos_emb_type: str = 'divided_fixed',
            vt_num_attention_heads: int = 16,
            vt_num_hidden_layers: int = 27,
            vt_hidden_size: int = 1152,
            vt_intermediate_size: int = 4304,
            merge_kernel_size: tuple = (2, 2),
            video_attn_type: str = 'spatial_temporal',
            merge_type: str = 'sd2_tpool',
            _attn_implementation: str = 'flash_attention_2',
            # MM Projector parameters
            mm_projector_type: str = 'patchmerger',
            mm_hidden_size: int | None = None,
            projector_hidden_act: str = "gelu",
            projector_ln_eps: float = 1e-5,
            # Other parameters
            ignore_index: int = -100,
            media_placeholder_token_id: int = 163605,
            pad_token_id: int = 0,
            use_unified_vision_chunk: bool = True,
            video_placeholder="<|kimi_k25_video_placeholder|>",
            text_hidden_size=7168,
            **vision_config_kwargs):

        # Vision tower settings, assigned in declaration order.
        tower_settings = {
            "patch_size": patch_size,
            "init_pos_emb_height": init_pos_emb_height,
            "init_pos_emb_width": init_pos_emb_width,
            "init_pos_emb_time": init_pos_emb_time,
            "pos_emb_type": pos_emb_type,
            "vt_num_attention_heads": vt_num_attention_heads,
            "vt_num_hidden_layers": vt_num_hidden_layers,
            "vt_hidden_size": vt_hidden_size,
            "vt_intermediate_size": vt_intermediate_size,
            "merge_kernel_size": merge_kernel_size,
            "video_attn_type": video_attn_type,
            "merge_type": merge_type,
            "_attn_implementation": _attn_implementation,
        }
        for attr_name, attr_value in tower_settings.items():
            setattr(self, attr_name, attr_value)

        # MM Projector config
        self.mm_projector_type = mm_projector_type
        # The projector input width defaults to the vision tower's hidden size.
        if mm_hidden_size is None:
            mm_hidden_size = vt_hidden_size
        self.mm_hidden_size = mm_hidden_size
        self.projector_hidden_act = projector_hidden_act
        self.projector_ln_eps = projector_ln_eps
        self.text_hidden_size = text_hidden_size
60
+
61
+
62
class KimiK25Config(PretrainedConfig):
    """Kimi-K2.5 model configuration.

    Combines a text-model configuration and a vision configuration with a few
    top-level multimodal settings.

    Args:
        text_config (dict | DeepseekV3Config | None): Configuration for the
            text model. A dict is expanded into ``DeepseekV3Config(**text_config)``;
            any other value (including ``None``) is stored unchanged.
        vision_config (dict | KimiK25VisionConfig | None): Configuration for
            the vision tower and multimodal projector. A dict is expanded into
            ``KimiK25VisionConfig(**vision_config)``; any other value
            (including ``None``) is stored unchanged.
        ignore_index (int): The ignore index for the loss function.
        media_placeholder_token_id (int): The token ID to use for media placeholders.
        pad_token_id (int): The token ID to use for padding. Forwarded to
            ``PretrainedConfig.__init__``.
        use_unified_vision_chunk (bool): Stored as-is on the config.
        video_placeholder (str): Placeholder string for videos; stored as-is.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    If the resolved ``text_config`` carries a non-``None``
    ``quantization_config`` attribute, it is also exposed on this config under
    the same name.
    """

    model_type = "kimi_k25"

    def __init__(
        self,
        text_config: dict | DeepseekV3Config | None = None,
        vision_config: dict | KimiK25VisionConfig | None = None,
        # Other parameters
        ignore_index: int = -100,
        media_placeholder_token_id: int = 163605,
        pad_token_id: int = 0,
        use_unified_vision_chunk: bool = True,
        video_placeholder="<|kimi_k25_video_placeholder|>",
        **kwargs,
    ):
        # Plain dicts (e.g. loaded from JSON) are promoted to config objects.
        if isinstance(text_config, dict):
            text_config = DeepseekV3Config(**text_config)
        if isinstance(vision_config, dict):
            vision_config = KimiK25VisionConfig(**vision_config)
        self.text_config = text_config
        self.vision_config = vision_config
        # Other config
        self.ignore_index = ignore_index
        self.media_placeholder_token_id = media_placeholder_token_id
        self.use_unified_vision_chunk = use_unified_vision_chunk
        self.video_placeholder = video_placeholder
        # Mirror the text model's quantization settings at the top level;
        # getattr keeps this safe when text_config is None.
        if getattr(self.text_config, "quantization_config", None) is not None:
            self.quantization_config = self.text_config.quantization_config

        super().__init__(pad_token_id=pad_token_id, **kwargs)
docs/deploy_guidance.md ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Kimi-K2.7-Code Deployment Guide
2
+
3
+ > [!Note]
4
+ > This guide only provides some examples of deployment commands for Kimi-K2.7-Code, which may not be the optimal configuration. Since inference engines are still being updated frequently, please continue to follow the guidance from their homepage if you want to achieve better inference performance.
5
+
6
+ > [!Note]
7
+ > Kimi-K2.7-Code has the same architecture as Kimi-K2.5/Kimi-K2.6, and the deployment method can be directly reused.
8
+ ## vLLM Deployment
9
+
10
+ You can refer to https://recipes.vllm.ai/moonshotai/Kimi-K2.6 for the newest deployment guide.
11
+
12
+ This model is available in the nightly vLLM wheel:
13
+ ```
14
+ uv pip install -U vllm \
15
+ --torch-backend=auto \
16
+ --extra-index-url https://wheels.vllm.ai/nightly
17
+ ```
18
+
19
+ Nightly wheels may be unstable and are considered experimental. For stable production use, we recommend vLLM 0.19.1, which has been manually verified.
20
+
21
+ Here is an example of serving this model on a single H200 node with TP8 via vLLM:
22
+ ```bash
23
+ vllm serve $MODEL_PATH -tp 8 --mm-encoder-tp-mode data --trust-remote-code --tool-call-parser kimi_k2 --reasoning-parser kimi_k2
24
+ ```
25
+ **Key notes**
26
+ - `--tool-call-parser kimi_k2`: Required for enabling tool calling
27
+ - `--reasoning-parser kimi_k2`: Kimi-K2.7-Code supports thinking mode only. Make sure to pass this for correct reasoning processing.
28
+
29
+ ## SGLang Deployment
30
+
31
+ You can refer to https://cookbook.sglang.io/autoregressive/Moonshotai/Kimi-K2.6 for the newest deployment guide.
32
+
33
+ This model is supported in SGLang v0.5.10 and later stable releases (no nightly / main build required). `uv` is preferred:
34
+
35
+ ```
36
+ uv pip install "sglang>=0.5.10.post1" --prerelease=allow
37
+ ```
38
+
39
+ Here is an example of running it with TP8 on a single H200 node via SGLang:
40
+ ```bash
41
+ sglang serve --model-path $MODEL_PATH --tp 8 --trust-remote-code --tool-call-parser kimi_k2 --reasoning-parser kimi_k2
42
+ ```
43
+ **Key parameter notes:**
44
+ - `--tool-call-parser kimi_k2`: Required when enabling tool usage.
45
+ - `--reasoning-parser kimi_k2`: Required for correctly processing reasoning content.
46
+
47
+ ## KTransformers Deployment
48
+ ### KTransformers+SGLang Inference Deployment
49
+ Launch with KTransformers + SGLang for CPU+GPU heterogeneous inference:
50
+
51
+ ```
52
+ python -m sglang.launch_server \
53
+ --host 0.0.0.0 \
54
+ --port 31245 \
55
+ --model /path/to/kimi-k2.7-code \
56
+ --kt-weight-path /path/to/kimi-k2.7-code \
57
+ --kt-cpuinfer 96 \
58
+ --kt-threadpool-count 2 \
59
+ --kt-num-gpu-experts 30 \
60
+ --kt-method RAWINT4 \
61
+ --kt-gpu-prefill-token-threshold 400 \
62
+ --trust-remote-code \
63
+ --mem-fraction-static 0.94 \
64
+ --served-model-name Kimi-K2.7-Code \
65
+ --enable-mixed-chunk \
66
+ --tensor-parallel-size 4 \
67
+ --enable-p2p-check \
68
+ --disable-shared-experts-fusion \
69
+ --chunked-prefill-size 32658 \
70
+ --max-total-tokens 50000 \
71
+ --attention-backend flashinfer
72
+ ```
73
+
74
+ This setup achieves 640.12 tokens/s prefill and 24.51 tokens/s decode (48-way concurrency) on 8× NVIDIA L20 + 2× Intel 6454S.
75
+
76
+ More details: https://github.com/kvcache-ai/ktransformers/blob/main/doc/en/Kimi-K2.5.md .
77
+
78
+ ### KTransformers+LLaMA-Factory Fine-tuning Deployment
79
+
80
+ You can use the commands below to run LoRA SFT with KTransformers + LLaMA-Factory.
81
+
82
+ ```
83
+ # For LoRA SFT
84
+ USE_KT=1 llamafactory-cli train examples/train_lora/kimik2_lora_sft_kt.yaml
85
+ # For Chat with model after LoRA SFT
86
+ llamafactory-cli chat examples/inference/kimik2_lora_sft_kt.yaml
87
+ # For API with model after LoRA SFT
88
+ llamafactory-cli api examples/inference/kimik2_lora_sft_kt.yaml
89
+ ```
90
+
91
+ This achieves an end-to-end LoRA SFT throughput of 44.55 tokens/s on 2× NVIDIA 4090 + Intel 8488C with 1.97T RAM and 200G swap memory.
92
+
93
+ For more details, refer to https://github.com/kvcache-ai/ktransformers/blob/main/doc/en/SFT_Installation_Guide_KimiK2.5.md .
figures/demo_video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09b4d925aa0a7c712feef50765355f0625d8f6d46ea302fd98db9609e9070047
3
+ size 270100
figures/kimi-logo.png ADDED
generation_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_length": 262144,
3
+ "eos_token_id": 163586
4
+ }
kimi_k25_processor.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.feature_extraction_utils import BatchFeature
2
+ from transformers.processing_utils import ProcessorMixin
3
+ from transformers.utils import logging
4
+
5
+ logger = logging.get_logger(__name__)
6
+
7
+
8
class KimiK25Processor(ProcessorMixin):
    r"""
    Constructs a KimiK25 processor which wraps a KimiK25 image processor and a tokenizer into a single processor.

    [`KimiK25Processor`] offers all the functionalities of [`KimiK25ImageProcessor`] and [`TikTokenTokenizer`]. See the
    [`~KimiK25Processor.__call__`] and [`~KimiK25Processor.decode`] for more information.

    Args:
        image_processor ([`KimiK25ImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`TikTokenTokenizer`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        chat_template=None,
        **kwargs,
    ):
        # Extra **kwargs are accepted but intentionally not forwarded.
        super().__init__(image_processor,
                         tokenizer,
                         chat_template=chat_template)
        # The image processor also handles video chunks, hence the alias.
        self.media_processor = image_processor
        # A special temporary placeholder to be replaced by actual video placeholders
        self.video_placeholder = "<|kimi_k25_video_placeholder|>"

    def update_raw_text(self, text: str, video_prompts: list[str]) -> str:
        """Replace each video placeholder in ``text`` with its chunk prompts.

        The i-th occurrence of ``self.video_placeholder`` is replaced by
        ``video_prompts[i]``. Text without any placeholder is returned
        unchanged, whatever ``video_prompts`` contains.

        Raises:
            AssertionError: if the number of placeholders differs from the
                number of video prompts.
        """
        video_count = text.count(self.video_placeholder)
        if video_count == 0:
            return text
        assert video_count == len(video_prompts), (
            f"found {video_count} video placeholder(s) in text but got "
            f"{len(video_prompts)} video prompt(s)")
        # Splitting on the placeholder yields exactly video_count + 1 parts.
        text_parts = text.split(self.video_placeholder)
        pieces = []
        for part, prompt in zip(text_parts, video_prompts):
            pieces.append(part)
            pieces.append(prompt)
        pieces.append(text_parts[-1])
        return "".join(pieces)

    def preprocess_medias(
            self, medias: list[dict]) -> tuple[list[dict], list[str]]:
        """Expand videos into chunks and collect one prompt string per video.

        Args:
            medias: Media dicts with a ``'type'`` of ``'image'`` or
                ``'video'``. Videos are read from ``media['video']``.

        Returns:
            ``(updated_medias, video_prompts)``: images are passed through,
            each video is replaced by its chunks, and ``video_prompts`` holds,
            per video, the concatenation of its chunks' prompts.

        Raises:
            ValueError: for any other media type.
        """
        updated_medias = []
        video_prompts = []
        for media in medias:
            if media['type'] == 'image':
                updated_medias.append(media)
            elif media['type'] == 'video':
                video_chunks = self.media_processor.split_video_chunks(
                    media['video'])
                updated_medias.extend(video_chunks)
                video_prompts.append("".join(
                    [vc['prompt'] for vc in video_chunks]))
            else:
                raise ValueError(f"unsupported media type: {media['type']}")
        return updated_medias, video_prompts

    def __call__(self,
                 messages: list[dict] = None,
                 medias: list[dict] = None,
                 text: str = None,
                 return_tensors: str = "pt",
                 **kwargs) -> BatchFeature:
        """
        Process multimodal inputs for Kimi-K2.5 model.

        This processor accepts ordered messages and extracts both media and text in a single pass.
        The text is automatically updated when video inputs are present.

        Args:
            messages: List of message dicts with 'role' and 'content' fields.
                If provided, medias and text will be extracted automatically.
            medias: Pre-extracted list of media dicts. If None, extracted from messages.
            text: Pre-formatted text string. If None, generated via apply_chat_template.
            return_tensors: Format of returned tensors ('pt', 'np', 'tf'). Default: 'pt'.
            **kwargs: Additional arguments passed to tokenizer.apply_chat_template
                (only used when ``text`` is None).

        Returns:
            BatchFeature holding the tokenizer outputs merged with the media
            processor outputs.

        Raises:
            ValueError: if ``messages`` is None and ``medias`` or ``text`` is missing.
        """
        if messages is None and (medias is None or text is None):
            raise ValueError(
                "Provide either 'messages' or both 'medias' and 'text'")

        # Anything the caller did not supply is derived from `messages`;
        # explicitly passed `medias` / `text` always take precedence.
        if medias is None:
            medias = self._extract_medias_from_messages(messages)
        updated_medias, video_prompts = self.preprocess_medias(medias)
        preprocessed = self.media_processor.preprocess(
            updated_medias, return_tensors=return_tensors)

        # Generate text if not provided
        if text is None:
            text = self.tokenizer.apply_chat_template(messages, **kwargs)

        text = self.update_raw_text(text, video_prompts)

        text_inputs = self.tokenizer(text, return_tensors=return_tensors)
        return BatchFeature(data={**text_inputs, **preprocessed.data})

    @staticmethod
    def _extract_medias_from_messages(messages: list[dict]) -> list[dict]:
        """
        Extract media items from messages in a single pass.

        Only messages whose role is ``'user'`` and whose content is non-empty
        are scanned, and only dict content parts are considered.

        Recognised content parts:
            * ``'video_url'`` / ``'video'``: read from ``part['video_url']['url']``.
              A part tagged ``'video'`` without a ``'video_url'`` key is read
              from ``part['video']`` instead.
            * ``'image_url'`` / ``'image'``: read from ``part['image_url']``.
              A part tagged ``'image'`` without an ``'image_url'`` key is read
              from ``part['image']`` instead.

        Kept as internal method since external callers should use __call__.
        """
        medias = []
        for msg in messages:
            if msg['role'] != 'user' or not msg.get('content'):
                continue

            for content_part in msg['content']:
                if not isinstance(content_part, dict):
                    continue

                content_type = content_part.get('type')
                if content_type in ['video_url', 'video']:
                    # The 'video' tag previously always raised KeyError unless
                    # the part also carried a 'video_url' entry.
                    if content_type == 'video' and 'video_url' not in content_part:
                        video = content_part['video']
                    else:
                        video = content_part['video_url']['url']
                    medias.append({
                        'type': 'video',
                        'video': video,
                        'first_frame_timestamp': 0.0
                    })
                elif content_type in ['image_url', 'image']:
                    # Same fallback for the 'image' tag.
                    if content_type == 'image' and 'image_url' not in content_part:
                        image = content_part['image']
                    else:
                        image = content_part['image_url']
                    medias.append({
                        'type': 'image',
                        'image': image,
                    })
        return medias

    def apply_chat_template(self, messages, **kwargs):
        """Delegate to the tokenizer's ``apply_chat_template``."""
        return self.tokenizer.apply_chat_template(messages, **kwargs)

    def batch_decode(self, *args, **kwargs):
        """Delegate to the tokenizer's ``batch_decode``."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Delegate to the tokenizer's ``decode``."""
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """Names of the inputs this processor declares for the model."""
        return ['input_ids', 'attention_mask', 'pixel_values', 'grid_thws']
kimi_k25_vision_processing.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Image processor class for Kimi-K2.5.
2
+ """
3
+
4
+ import json
5
+ from typing import Any, Dict, Optional, Union
6
+
7
+ import numpy as np
8
+ import torch
9
+ from PIL import Image
10
+ from transformers.image_processing_utils import (BaseImageProcessor,
11
+ BatchFeature)
12
+ from transformers.utils import TensorType
13
+
14
+ from .media_utils import (MediaInput, VideoChunkInput, _to_tensor,
15
+ ensure_media_type, get_video_meta, image_to_np,
16
+ navit_patchify, navit_resize_image,
17
+ navit_resize_video, normalize,
18
+ real_sample_fps_and_max_num_frames, timestamp_as_str)
19
+
20
+ try:
21
+ from mecord import VideoReader
22
+ except ImportError:
23
+ VideoReader = None
24
+
25
+
26
def resampling(video_bytes: bytes,
               sample_indices: list[int],
               key_indices=None,
               frame_time_info=None,
               num_threads=4) -> "list[Image.Image]":
    """Decode the selected frames of a video into PIL images.

    Args:
        video_bytes: Video payload handed to ``VideoReader``.
        sample_indices: Indices of the frames to extract.
        key_indices: Forwarded to ``VideoReader`` unchanged.
        frame_time_info: Forwarded to ``VideoReader`` unchanged.
        num_threads: Forwarded to ``VideoReader`` unchanged.

    Returns:
        One ``PIL.Image.Image`` per entry of ``sample_indices``, in the order
        the reader returns them.

    Raises:
        ImportError: if the optional ``mecord`` dependency could not be
            imported, in which case ``VideoReader`` is ``None``.
    """
    # The module-level import falls back to None; fail with a clear message
    # instead of "'NoneType' object is not callable".
    if VideoReader is None:
        raise ImportError(
            "resampling() requires the optional 'mecord' package, but "
            "'from mecord import VideoReader' failed")
    video = VideoReader(video_bytes,
                        num_threads=num_threads,
                        frame_time_info=frame_time_info,
                        key_indices=key_indices)
    # extract target frames
    return [Image.fromarray(frame) for frame in video[sample_indices]]
39
+
40
+
41
+ class KimiK25VisionProcessor(BaseImageProcessor):
42
+ model_type = "kimi_k25"
43
+
44
def __init__(
    self,
    media_proc_cfg: dict,
    **kwargs,
):
    """Keep the media-processing config and cache the per-chunk frame count.

    Args:
        media_proc_cfg: Media-processing settings; must contain
            ``'temporal_merge_kernel_size'``.
        **kwargs: Forwarded to the base image processor.
    """
    super().__init__(**kwargs)
    self.media_proc_cfg = media_proc_cfg
    # Number of frames grouped into one video chunk.
    frames_per_chunk = media_proc_cfg['temporal_merge_kernel_size']
    self.num_frames_per_chunk = frames_per_chunk
53
+
54
def media_tokens_calculator(self, media: MediaInput):
    """Return the ``'num_tokens'`` entry of the resize plan computed for ``media``."""
    resize_plan = self.get_resize_config(ensure_media_type(media))
    return resize_plan['num_tokens']
58
+
59
@classmethod
def make_chunk_prompt(cls, timestamp_text: str) -> str:
    """Build the prompt for one video chunk: the timestamp text followed by the video media markup."""
    media_markup = "<|media_begin|>video<|media_content|><|media_pad|><|media_end|>"
    return f"{timestamp_text}{media_markup}"
62
+
63
def split_video_chunks(self,
                       video_url: str | bytes) -> "list[VideoChunkInput]":
    """Sample frames from a video and group them into prompt-carrying chunks.

    Args:
        video_url: The video, as a base64 string or raw bytes.

    Returns:
        A list of ``VideoChunkInput`` items. Each holds up to
        ``media_proc_cfg['temporal_merge_kernel_size']`` consecutive sampled
        frames and a prompt built from the timestamp of its first frame.
    """
    # video_url should be base64 str or bytes
    video_spec = get_video_meta(video_url)
    # Never sample faster than the source frame rate.
    sample_fps = min(self.media_proc_cfg['sample_fps'], video_spec.fps)
    # Always keep at least one frame.
    sampled_nframes = max(
        round(video_spec.num_frames * sample_fps / video_spec.fps), 1)
    # Evenly spaced frame indices from the first to the last frame.
    frame_inds = np.linspace(0, video_spec.num_frames - 1,
                             sampled_nframes).round().astype(int).tolist()

    chunk_len = self.media_proc_cfg["temporal_merge_kernel_size"]
    chunk_starts = range(0, len(frame_inds), chunk_len)
    # One timestamp per chunk, taken from the chunk's first sampled frame.
    chunk_timestamps = [
        timestamp_as_str(frame_inds[start] / float(video_spec.fps),
                         self.media_proc_cfg["timestamp_mode"])
        for start in chunk_starts
    ]

    # Decode every sampled frame once, then slice the result per chunk.
    sampled_frames = resampling(video_url, frame_inds)
    return [
        VideoChunkInput(type="video_chunk",
                        video_chunk=sampled_frames[start:start + chunk_len],
                        prompt=self.make_chunk_prompt(timestamp_text))
        for start, timestamp_text in zip(chunk_starts, chunk_timestamps)
    ]
99
+
100
def get_resize_config(self, media_input: MediaInput) -> dict:
    """Compute the resize plan for one image or one video chunk.

    Images are delegated to ``navit_resize_image`` and video chunks to
    ``navit_resize_video``, both parameterised from ``self.media_proc_cfg``.

    Args:
        media_input: Media dict whose ``'type'`` is ``'image'`` or
            ``'video_chunk'``.

    Returns:
        Whatever the selected ``navit_resize_*`` helper returns.

    Raises:
        ValueError: for any other media type.
    """
    cfg = self.media_proc_cfg
    media_type = media_input['type']

    if media_type == 'image':
        w, h = media_input['image'].size
        return navit_resize_image(
            w, h, cfg['patch_size'], cfg['merge_kernel_size'],
            cfg['in_patch_limit'], cfg['patch_limit_on_one_side'],
            cfg['fixed_output_tokens'])

    if media_type != 'video_chunk':
        raise ValueError(f"Unsupported type: {media_type}")

    # The first frame's size is used for the whole chunk.
    frames = media_input['video_chunk']
    width, height = frames[0].size
    num_frames = len(frames)
    # Chunks hold already-sampled frames, so a fixed fps is passed on.
    fps = 1.0

    sample_fps, max_num_frames_each_video = real_sample_fps_and_max_num_frames(
        media_type,
        cfg['sample_fps'],
        cfg['max_num_frames_each_video'],
    )

    # A missing per-frame limit falls back to the general patch limit.
    in_patch_limit_each_frame = cfg['in_patch_limit_each_frame']
    if in_patch_limit_each_frame is None:
        in_patch_limit_each_frame = cfg['in_patch_limit']

    return navit_resize_video(
        width,
        height,
        num_frames,
        fps,
        sample_fps,
        cfg['patch_size'],
        cfg['merge_kernel_size'],
        in_patch_limit_each_frame,
        cfg['patch_limit_on_one_side'],
        cfg['in_patch_limit_video'],
        max_num_frames_each_video,
        cfg['fixed_output_tokens'],
    )
146
+
147
def resize_image(self, image: Image.Image, new_width: int, new_height: int,
                 pad_width: int, pad_height: int) -> np.ndarray:
    """Resize an image and zero-pad it on the bottom and right edges.

    Args:
        image: The PIL image to resize.
        new_width: Target width passed to image_to_np.
        new_height: Target height passed to image_to_np.
        pad_width: Number of zero columns appended on the right.
        pad_height: Number of zero rows appended at the bottom.

    Returns:
        The resized and padded image as a numpy array.
    """
    resized = image_to_np(image, (new_width, new_height), "resize")
    # Axis order is (height, width, channel); the channel axis is never padded.
    pad_spec = ((0, pad_height), (0, pad_width), (0, 0))
    return np.pad(resized, pad_spec, mode="constant", constant_values=0)
157
+
158
def preprocess(
    self,
    medias: list[MediaInput],
    return_tensors: Optional[Union[str, TensorType]] = None,
) -> BatchFeature:
    """Preprocess atomic vision inputs (images / video chunks) into tensors.

    Args:
        medias: List of MediaInput; a single non-list item is wrapped in a list.
        return_tensors: Desired output format ('pt', 'np', 'tf', or None).

    Returns:
        BatchFeature containing 'pixel_values' and 'grid_thws' tensors, or an
        empty BatchFeature when no media is given.

    Raises:
        ValueError: If a media item has an unsupported type.
    """
    if not isinstance(medias, list):
        medias = [medias]
    if not medias:
        return BatchFeature(data={}, tensor_type=return_tensors)

    cfg = self.media_proc_cfg

    # Step 1: resize + pad every item into a (t, h, w, c) array.
    frame_stacks = []
    for media in medias:
        media = ensure_media_type(media)
        plan = self.get_resize_config(media)
        resize_args = (plan['new_width'], plan['new_height'],
                       plan['pad_width'], plan['pad_height'])
        media_type = media['type']
        if media_type == 'image':
            image_np = self.resize_image(media['image'], *resize_args)
            frame_stacks.append(np.expand_dims(image_np, axis=0))
        elif media_type == 'video_chunk':
            frames_np = [
                self.resize_image(frame, *resize_args)
                for frame in media['video_chunk']
            ]
            frame_stacks.append(np.stack(frames_np, axis=0))
        else:
            raise ValueError("Unsupported type: {}".format(media_type))

    # Step 2: normalize and cut into patches.
    image_std_inv = 1.0 / np.array(cfg['image_std'])
    image_mean = np.array(cfg['image_mean'])
    patchified = [
        navit_patchify(normalize(stack, image_mean, image_std_inv),
                       cfg['patch_size'])
        for stack in frame_stacks
    ]

    # Step 3: concatenate patches of all items; keep one grid row per item.
    pixel_values = torch.cat(
        [_to_tensor(entry['pixel_values']) for entry in patchified])
    grid_thws = torch.cat([
        _to_tensor(entry['grid_thw'], dtype=torch.int64).unsqueeze(0)
        for entry in patchified
    ])

    data = {
        'pixel_values': pixel_values,
        'grid_thws': grid_thws,
    }
    return BatchFeature(data=data, tensor_type=return_tensors)
229
+
230
+ def __repr__(self):
231
+ return f"KimiK25VisionProcessor(media_proc_cfg={self.media_proc_cfg})"
232
+
233
def to_dict(self) -> Dict[str, Any]:
    """Serialize the processor to a dict.

    Starts from the parent class's dict, stores the media processing config
    under "media_proc_cfg" and removes any "media_processor" entry.
    """
    serialized = super().to_dict()
    serialized["media_proc_cfg"] = self.media_proc_cfg
    serialized.pop("media_processor", None)
    return serialized
239
+
240
@classmethod
def from_dict(cls, config_dict: Dict[str, Any], **kwargs):
    """Build an instance from a config dict.

    Args:
        config_dict: Serialized config; it is copied, not modified.
        **kwargs: Extra keyword arguments forwarded to the constructor.

    Returns:
        A new instance built from "media_proc_cfg" (default: empty dict) plus
        the remaining config entries and kwargs.
    """
    init_kwargs = config_dict.copy()
    proc_cfg = init_kwargs.pop("media_proc_cfg", {})
    return cls(media_proc_cfg=proc_cfg, **init_kwargs, **kwargs)
245
+
246
def to_json_string(self):
    """Serialize the processor to a JSON string.

    Values exposing a ``tolist`` method are converted with it before dumping.
    The output is indented, key-sorted and newline-terminated.
    """
    payload = self.to_dict()
    array_like_keys = [
        key for key, value in payload.items() if hasattr(value, 'tolist')
    ]
    for key in array_like_keys:
        payload[key] = payload[key].tolist()
    return json.dumps(payload, indent=2, sort_keys=True) + "\n"
media_utils.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ import math
4
+ import os
5
+ from datetime import datetime, timezone
6
+ from typing import List, Literal, Optional, TypedDict
7
+
8
+ import numpy as np
9
+ from PIL import Image
10
+ from pydantic import BaseModel, Field
11
+
12
+ try:
13
+ from mecord import VideoReader
14
+ except ImportError:
15
+ VideoReader = None
16
+
17
+
18
class VideoSpec(BaseModel):
    """Basic metadata describing a video stream."""

    # Discriminator tag. The annotation and default were previously swapped
    # (`str = Literal['video']`), which made the default a typing object.
    media_type: Literal['video'] = 'video'
    height: int = Field(..., gt=0, description="video frame height")
    width: int = Field(..., gt=0, description="video frame width")
    num_frames: int = Field(..., gt=0, description="num frames")
    fps: float = Field(..., gt=0, description="average fps")

    # optional, help to accelerate video reading
    # Both default to None, so the annotations must allow None explicitly.
    key_indices: Optional[list[int]] = Field(None, description="key indices")
    frame_time_info: Optional[dict] = Field(None,
                                            description="frame time info")
28
+
29
+
30
class ImageInput(TypedDict):
    """A single image media item."""

    type: Literal['image']
    image: Image.Image


class _VideoChunkRequired(TypedDict):
    """Keys every video chunk item must carry."""

    type: Literal['video_chunk']
    video_chunk: List[Image.Image]


class VideoChunkInput(_VideoChunkRequired, total=False):
    """A chunk of video frames with an optional text prompt.

    TypedDict fields cannot take default values, so the optional ``prompt``
    key is declared through ``total=False`` instead of ``= None``.
    """

    prompt: Optional[str]


MediaInput = ImageInput | VideoChunkInput
42
+
43
+
44
def get_video_meta(video_src: bytes | str | os.PathLike,
                   accurate: bool = True) -> VideoSpec:
    """Read basic metadata (size, frame count, fps) of a video.

    Args:
        video_src: Raw video bytes, a path, or a
            ``data:video/mp4;base64,...`` string.
        accurate: Forwarded to the reader as ``auto_init``.

    Returns:
        A VideoSpec describing the video.

    Raises:
        ImportError: If the optional video reader backend is not installed.
        AssertionError: If the reader reports a non-positive frame count,
            dimension or fps.
    """
    if VideoReader is None:
        # The module-level import is optional; fail with a clear message
        # instead of "'NoneType' object is not callable".
        raise ImportError(
            "mecord is required to read video metadata but is not installed.")
    if isinstance(video_src, os.PathLike):
        video_src = str(video_src)
    # if b64 string, decode to bytes
    if isinstance(video_src,
                  str) and video_src.startswith('data:video/mp4;base64,'):
        video_src = base64.b64decode(video_src.split(',')[1])
    video = VideoReader(video_src, auto_init=accurate, num_threads=1)
    assert video.num_frames > 0, "Invalid video format."
    assert video.original_width > 0 and video.original_height > 0, (
        "Invalid video format.")
    assert video.avg_fps > 0, "Invalid video format."
    return VideoSpec(media_type='video',
                     height=video.original_height,
                     width=video.original_width,
                     num_frames=video.num_frames,
                     fps=video.avg_fps,
                     key_indices=video.key_indices,
                     frame_time_info=video.frame_time_info)
65
+
66
+
67
def timestamp_as_str(timestamp: float,
                     timestamp_mode: str = "hh:mm:ss.fff") -> str:
    """Format a timestamp in seconds as a clock string.

    Args:
        timestamp: Time offset in seconds.
        timestamp_mode: One of "hh:mm:ss.fff", "mm:ss.fff" or "mm:ss".

    Returns:
        The formatted timestamp; ".fff" modes append truncated milliseconds.

    Raises:
        ValueError: If timestamp_mode is not one of the supported modes.
    """
    # (mode, strftime pattern, whether milliseconds are appended)
    mode_table = (
        ("hh:mm:ss.fff", "%H:%M:%S", True),
        ("mm:ss.fff", "%M:%S", True),
        ("mm:ss", "%M:%S", False),
    )
    for mode, clock_pattern, with_millis in mode_table:
        if timestamp_mode == mode:
            moment = datetime.fromtimestamp(timestamp, tz=timezone.utc)
            text = moment.strftime(clock_pattern)
            if with_millis:
                text += f".{int((timestamp % 1) * 1000):03d}"
            return text
    raise ValueError(f"Invalid timestamp mode: {timestamp_mode}")
83
+
84
+
85
+ def navit_resize_image(
86
+ width: int,
87
+ height: int,
88
+ patch_size: int,
89
+ merge_kernel_size: int,
90
+ in_patch_limit: int,
91
+ patch_limit_on_one_side: int,
92
+ fixed_output_tokens: int | None,
93
+ ):
94
+ # Apply the patch limits.
95
+ s1 = math.sqrt(
96
+ in_patch_limit /
97
+ (max(1.0, width // patch_size) * max(1.0, height // patch_size)))
98
+ s2 = patch_limit_on_one_side * patch_size / width
99
+ s3 = patch_limit_on_one_side * patch_size / height
100
+ scale = min(1.0, s1, s2, s3)
101
+ new_w, new_h = max(1, int(width * scale)), max(1, int(height * scale))
102
+ new_w = min(new_w, patch_limit_on_one_side * patch_size)
103
+ new_h = min(new_h, patch_limit_on_one_side * patch_size)
104
+
105
+ # Calculate the padding to make the height and width divisible by the merge kernel size and patch size.
106
+ factor = merge_kernel_size * patch_size
107
+
108
+ pad_height = (factor - new_h % factor) % factor
109
+ pad_width = (factor - new_w % factor) % factor
110
+
111
+ if fixed_output_tokens is not None:
112
+ num_tokens = fixed_output_tokens
113
+ else:
114
+ # Calculate new dimensions after padding and patching
115
+ token_height = (new_h + pad_height) // factor
116
+ token_width = (new_w + pad_width) // factor
117
+
118
+ assert token_height * merge_kernel_size <= patch_limit_on_one_side, (
119
+ f"token_height {token_height} * merge_kernel_size {merge_kernel_size} > patch_limit_on_one_side {patch_limit_on_one_side}"
120
+ )
121
+ assert token_width * merge_kernel_size <= patch_limit_on_one_side, (
122
+ f"token_width {token_width} * merge_kernel_size {merge_kernel_size} > patch_limit_on_one_side {patch_limit_on_one_side}"
123
+ )
124
+
125
+ num_tokens = token_height * token_width
126
+ return {
127
+ "num_tokens": num_tokens,
128
+ "new_width": new_w,
129
+ "new_height": new_h,
130
+ "pad_width": pad_width,
131
+ "pad_height": pad_height,
132
+ "sampled_nframes": 1,
133
+ }
134
+
135
+
136
def navit_resize_video(
    width: int,
    height: int,
    nframes: int,
    avg_fps: float,
    sample_fps: float,
    patch_size: int,
    merge_kernel_size: int,
    in_patch_limit_each_frame: int,
    patch_limit_on_one_side: int,
    in_patch_limit_total: int | None,
    max_num_frames_each_video: int | None,
    fixed_output_tokens_each_frame: int | None,
):
    """Plan frame sampling and per-frame resizing for a video.

    Returns:
        The per-frame plan from navit_resize_image, with "sampled_nframes"
        set to the number of frames to sample.
    """
    # Never sample faster than the video's own frame rate.
    sample_fps = min(sample_fps, avg_fps)

    # At least one frame, at most max_num_frames_each_video (when set).
    frame_count = max(round(nframes * sample_fps / avg_fps), 1)
    if max_num_frames_each_video is not None:
        frame_count = min(frame_count, max_num_frames_each_video)

    # Split the whole-video patch budget evenly across sampled frames.
    per_frame_limit = in_patch_limit_each_frame
    if in_patch_limit_total is not None:
        per_frame_limit = min(round(in_patch_limit_total / frame_count),
                              per_frame_limit)

    plan = navit_resize_image(
        width,
        height,
        patch_size,
        merge_kernel_size,
        per_frame_limit,
        patch_limit_on_one_side,
        fixed_output_tokens_each_frame,
    )
    plan["sampled_nframes"] = frame_count
    return plan
172
+
173
+
174
+ def real_sample_fps_and_max_num_frames(
175
+ type_name: Literal["video", "video_chunk"],
176
+ sample_fps: float,
177
+ max_num_frames_each_video: int | None,
178
+ ) -> tuple[int, int | None]:
179
+ if type_name == "video":
180
+ return sample_fps, max_num_frames_each_video
181
+ elif type_name == "video_chunk":
182
+ max_num_frames_each_video = None
183
+ sample_fps = math.inf
184
+ return sample_fps, max_num_frames_each_video
185
+ else:
186
+ return math.inf, None
187
+
188
+
189
def _to_pil(data: str | bytes):
    """Load the input as an RGB PIL image.

    Accepts a PIL image, a "data:" URI with a base64 payload, a string handed
    to Image.open as-is, or raw encoded image bytes.

    Raises:
        ValueError: If the input is none of the supported types.
    """
    if isinstance(data, Image.Image):
        loaded = data
    elif isinstance(data, str):
        if data.startswith("data:"):
            # The payload is whatever follows the first comma of the URI.
            encoded = data.split(",")[1]
            loaded = Image.open(io.BytesIO(base64.b64decode(encoded)))
        else:
            loaded = Image.open(data)
    elif isinstance(data, bytes):
        loaded = Image.open(io.BytesIO(data))
    else:
        raise ValueError(f"Unsupported data type: {type(data)}")
    return loaded.convert("RGB")
204
+
205
+
206
def ensure_media_type(media: MediaInput) -> MediaInput:
    """Convert the payload of a media item to RGB PIL images, in place.

    Returns:
        The same media dict, with 'image' or every entry of 'video_chunk'
        replaced by the result of _to_pil.

    Raises:
        ValueError: If the media type is neither 'image' nor 'video_chunk'.
    """
    media_type = media['type']
    if media_type == 'image':
        media['image'] = _to_pil(media['image'])
    elif media_type == 'video_chunk':
        media['video_chunk'] = list(map(_to_pil, media['video_chunk']))
    else:
        raise ValueError(f"Unsupported media type: {media['type']}")
    return media
217
+
218
+
219
def image_to_np(
    image: Image.Image,
    resize_to: tuple[int, int] | None = None,
    mode: str = "resize",
    raise_error_for_ill_resize: bool = True,
) -> np.ndarray:
    """Convert a PIL image to a numpy array, optionally resizing it.

    Args:
        image: The image to convert.
        resize_to: Target (width, height); None skips resizing entirely.
        mode: "resize" stretches to the target size;
            "rescale_and_pad_to_center" and "rescale_and_pad_to_rightbottom"
            shrink (never enlarge) keeping the aspect ratio, then zero-pad.
        raise_error_for_ill_resize: When the shrunken image would have a zero
            side, raise if True, otherwise return an all-zero array.

    Returns:
        A numpy array.

    Raises:
        ValueError: On an unknown mode, or on a zero-sized rescale when
            raise_error_for_ill_resize is True.
    """
    assert isinstance(image, Image.Image), "image must be a PIL Image"
    if resize_to is None:
        return np.asarray(image)

    if mode == "resize":
        stretched = image.resize(resize_to, resample=Image.Resampling.BICUBIC)
        return np.asarray(stretched)

    if mode == "rescale_and_pad_to_center":
        centered = True
    elif mode == "rescale_and_pad_to_rightbottom":
        centered = False
    else:
        raise ValueError(f"Invalid mode: {mode}")

    target_w, target_h = resize_to[0], resize_to[1]
    shrink = min(target_w / image.width, target_h / image.height, 1.0)
    scaled_w = round(image.width * shrink)
    scaled_h = round(image.height * shrink)
    if scaled_w == 0 or scaled_h == 0:
        if raise_error_for_ill_resize:
            raise ValueError(
                f"Invalid resize to: {resize_to}, from image size: {image.size}"
            )
        return np.zeros((target_h, target_w, 3), dtype=np.uint8)

    scaled = np.asarray(
        image.resize((scaled_w, scaled_h), resample=Image.Resampling.BICUBIC))

    # Centered mode splits the slack (extra pixel goes right/bottom); the
    # right-bottom mode puts all of it after the image.
    slack_w = target_w - scaled_w
    slack_h = target_h - scaled_h
    pad_left = slack_w // 2 if centered else 0
    pad_top = slack_h // 2 if centered else 0
    padded = np.pad(
        scaled,
        ((pad_top, slack_h - pad_top), (pad_left, slack_w - pad_left), (0, 0)),
        mode="constant",
        constant_values=0,
    )
    assert padded.shape == (target_h, target_w, 3)
    return padded
305
+
306
+
307
def navit_patchify(pixel_values: np.ndarray,
                   patch_size: int) -> dict[str, np.ndarray]:
    """Cut frames into square patches.

    Args:
        pixel_values: np.ndarray, shape (t, h, w, c) with c == 3; h and w must
            be multiples of patch_size.
        patch_size: Side length of each square patch.

    Returns:
        dict[str, np.ndarray]
        - pixel_values: np.ndarray, shape
          (t * h//patch_size * w//patch_size, c, patch_size, patch_size)
        - grid_thw: np.ndarray, (t, h//patch_size, w//patch_size)

    Raises:
        ValueError: If h or w is not divisible by patch_size.
    """
    T, H, W, C = pixel_values.shape
    assert C == 3, "pixel_values must have 3 channels"
    if H % patch_size or W % patch_size:
        # The reshape below would fail anyway; report the cause explicitly.
        raise ValueError(
            f"height {H} and width {W} must be divisible by patch_size "
            f"{patch_size}")

    grid_h, grid_w = H // patch_size, W // patch_size
    patches = pixel_values.reshape(T, grid_h, patch_size, grid_w, patch_size,
                                   C)
    # (T, H//patch_size, W//patch_size, C, patch_size, patch_size)
    patches = patches.transpose(0, 1, 3, 5, 2, 4)
    patches = patches.reshape(-1, C, patch_size, patch_size)
    grid_thw = np.array([T, grid_h, grid_w])
    return {"pixel_values": patches, "grid_thw": grid_thw}
330
+
331
+
332
def normalize(x: np.ndarray,
              mean,
              std_inv,
              pixels_dtype: np.dtype = np.float32) -> np.ndarray:
    """Scale pixels by 1/255, then apply ``(x - mean) * std_inv``.

    Args:
        x: The image to normalize. The shape is (..., 3). The dtype is uint8. The range is [0, 255].
        mean: The mean of the image.
        std_inv: The inverse of the std of the image.
        pixels_dtype: The dtype of the returned array.

    Returns:
        A new array of dtype pixels_dtype; the input is not modified.
    """
    scaled = (x / 255.0).astype(pixels_dtype)
    # In-place ufuncs keep the result in pixels_dtype.
    np.subtract(scaled, mean, out=scaled)
    np.multiply(scaled, std_inv, out=scaled)
    return scaled
350
+
351
+
352
+ def _to_tensor(data, **kwargs):
353
+ import torch
354
+
355
+ if isinstance(data, np.ndarray):
356
+ return torch.from_numpy(data).to(**kwargs)
357
+ elif isinstance(data, torch.Tensor):
358
+ return data.to(**kwargs)
359
+ elif isinstance(data, list):
360
+ return [_to_tensor(item, **kwargs) for item in data]
361
+ elif isinstance(data, tuple):
362
+ return tuple(_to_tensor(item, **kwargs) for item in data)
363
+ elif isinstance(data, dict):
364
+ return {k: _to_tensor(v, **kwargs) for k, v in data.items()}
365
+ elif data is None:
366
+ return None
367
+ else:
368
+ raise ValueError(f"Unsupported data type: {type(data)}")
model-00001-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f039216c7c449fc944ec23f96ca0a19dcdfd10f7134dbc05b94dc82bfc48c4b4
3
+ size 995001888
model-00002-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f29f76850192a8a0422fc3d75be10ffdb6e1e6c57536db09b9e57330402938a
3
+ size 9809047464
model-00003-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e3c59ab16fde9059831edf1245e34aaea32ecf28c332af52e7ef53f35169281
3
+ size 9809047464
model-00004-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0863caf2c5b5bd6f3374757b7acbfec477b31bcec3bceaf064327478e09f6027
3
+ size 9809047464
model-00005-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63709e48656914ee96cc239311049f4a616b54dc687b27d702b24d266acec2d6
3
+ size 9809047464
model-00006-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a303b0bd76aaed95328b680b11ff7e96f68aa2947a93e1e112ebdcb98affa7d1
3
+ size 9809047464
model-00007-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94244e6eee19b2b9e60f1ed9bd1cca37b126ffefa10fd03bbf06739e473b0405
3
+ size 9809047464
model-00008-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40099c8bb5935abe5a22147a17eabb07e19b4d3678ae8b4f8ee0238bfdfd5581
3
+ size 9809047464
model-00009-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e32aee880f63e74f9c9a0e968f198829aabd6afc1e41fa40d7ca16f77a833d6
3
+ size 9809047464
model-00010-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b601a5e469be6f384d6bcd194bdb728a346b2dc9953b198a2c96caf090359cd2
3
+ size 9809047464
model-00011-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:235b8db1f814df9ec4dbee727327703134e8e29e74ac198b437170ffaa85da32
3
+ size 9809050936
model-00012-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ad887ad0b63c8e4f59ec9090bf7b8f01ca091df828ef4601b53b9d8fe943da8
3
+ size 9809050936
model-00013-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a10a26f5e633d7d0a300f37490ca0b2c57cf05e66c29b3089ca037280ed1bfa
3
+ size 9809050936
model-00014-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7536219b9599de4cca8a57d08dce416f2f38b9472600e945e018dc9daa64840
3
+ size 9809050936
model-00015-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f279fa4cc3d3d72d31c9a391b1c1ff3977c6f26f1297a913c221d33f8dde9092
3
+ size 9809050936
model-00016-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1a5426a084d23e9fcdbb485b79b87972128c7b6bb7a331d82af94939d49068f
3
+ size 9809050936
model-00017-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41acfc6a032b0dd4baa1f60c2722e9d4c89d8592cf4c36487dcc7f6f71ff92d0
3
+ size 9809050936
model-00018-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bf9d5a8b5c148a0f8330dacc4725f37346cf103ac025b4970db1f535398028d
3
+ size 9809050936
model-00019-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e57a91d0b0badfe4d0d4dc81e2b301361b2bb7cdfdfd8e0f4335c7a8f059d63b
3
+ size 9809050936
model-00020-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3e10972380a82cc494452d42e8b529e14883904d9d6592cb7822e6216cc0f72
3
+ size 9809050936
model-00021-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c299d47a3e086de334b966444759fcf3990b7243b566d3c9671109ac817f688f
3
+ size 9809050936
model-00022-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e63d7c0a78f9ecefb9d6929e517f4abc0057ff9fb41a9412cd2a4bd31b2a957
3
+ size 9809050936
model-00023-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffe92aab5e845a289c7a71ff1c5f9e38c374072f06c63ef3d509b677ba28eeaf
3
+ size 9809050936
model-00024-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9be724631c19fb10b9021d0b3ae5d3de7c673590273ea70d2322fba42a5c2037
3
+ size 9809050936
model-00025-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:467da6b42df6ec2363a0a51396d27575f294f1e09341609a4c93cbbe35837861
3
+ size 9809050936
model-00026-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09a9ba1ea4b4a2c9cf06814f64681295366246b45a7a347a73ef85be490f7ff0
3
+ size 9809050936
model-00027-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57387f5523c7a0083ca38435717209179711d6a0eddbfb77c7f6b39f527346b1
3
+ size 9809050936
model-00028-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b40522c6529ff6156823ca0c640e7a47ba4034fd8fdd13e4ff8f507dbd01d86a
3
+ size 9809050936
model-00029-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9836d709a2baa92af5c11bdb8329e414df843d26b18eba00814040721f7beea8
3
+ size 9809050936
model-00030-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92afe1fb4c0fb6533c3ab870f39c35481769115480928b455151ba14e25e5ad5
3
+ size 9809050936
model-00031-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9208435eb7ea4652dce1024cf818df1951ae94a75e01c0ca12beb53e4c4172be
3
+ size 9809050936
model-00032-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e1a18abff1537b11dda04b1d669960fc4bed0794b56bc0fab29620e962848ff
3
+ size 9809050936
model-00033-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6d01497bd1013531ca185a120f2bea40281baad8822a9a60953ceac4e2a2c51
3
+ size 9809050936
model-00034-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d1f5005abb5810f8602c2175e9173fba5771116e95aadda295ffe7a23be4b83
3
+ size 9809050936
model-00035-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e188d2d498e5761f71f5206d3c89fdca88c28eaa62b48f519b04e46527204857
3
+ size 9809050936