Mayo committed on
Commit
fefd132
·
unverified ·
1 Parent(s): 45f8398

feat: Qwen3.6

Browse files
.cargo/config.toml CHANGED
@@ -1,7 +1,8 @@
1
  [env]
2
  # refer: https://stackoverflow.com/questions/43577885/is-there-a-cargo-environment-variable-for-the-workspace-directory
3
  CARGO_WORKSPACE_DIR = { value = "", relative = true }
4
- LLAMA_CPP_TAG = "b8665"
 
5
  # CUDA 13.0 requires C++17
6
  NVCC_PREPEND_FLAGS = "-std=c++17"
7
  # override nvidia-smi compute capability
 
1
  [env]
2
  # refer: https://stackoverflow.com/questions/43577885/is-there-a-cargo-environment-variable-for-the-workspace-directory
3
  CARGO_WORKSPACE_DIR = { value = "", relative = true }
4
+ # llama.cpp release tag
5
+ LLAMA_CPP_TAG = "b8935"
6
  # CUDA 13.0 requires C++17
7
  NVCC_PREPEND_FLAGS = "-std=c++17"
8
  # override nvidia-smi compute capability
README.md CHANGED
@@ -229,6 +229,7 @@ These are broad instruct models that work well when you want one local model for
229
 
230
  - Gemma 4 instruct: [gemma4-e2b-it](https://huggingface.co/unsloth/gemma-4-E2B-it-GGUF), [gemma4-e4b-it](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF), [gemma4-26b-a4b-it](https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF), [gemma4-31b-it](https://huggingface.co/unsloth/gemma-4-31B-it-GGUF)
231
  - Qwen 3.5: [qwen3.5-0.8b](https://huggingface.co/unsloth/Qwen3.5-0.8B-GGUF), [qwen3.5-2b](https://huggingface.co/unsloth/Qwen3.5-2B-GGUF), [qwen3.5-4b](https://huggingface.co/unsloth/Qwen3.5-4B-GGUF), [qwen3.5-9b](https://huggingface.co/unsloth/Qwen3.5-9B-GGUF), [qwen3.5-27b](https://huggingface.co/unsloth/Qwen3.5-27B-GGUF), [qwen3.5-35b-a3b](https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF)
 
232
 
233
  #### NSFW-Capable Local Models
234
 
@@ -236,17 +237,18 @@ These variants relax the safety tuning applied to the corresponding base instruc
236
 
237
  - Gemma 4 uncensored: [gemma4-e2b-uncensored](https://huggingface.co/HauhauCS/Gemma-4-E2B-Uncensored-HauhauCS-Aggressive), [gemma4-e4b-uncensored](https://huggingface.co/HauhauCS/Gemma-4-E4B-Uncensored-HauhauCS-Aggressive)
238
  - Qwen 3.5 uncensored: [qwen3.5-2b-uncensored](https://huggingface.co/HauhauCS/Qwen3.5-2B-Uncensored-HauhauCS-Aggressive), [qwen3.5-4b-uncensored](https://huggingface.co/HauhauCS/Qwen3.5-4B-Uncensored-HauhauCS-Aggressive), [qwen3.5-9b-uncensored](https://huggingface.co/HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive), [qwen3.5-27b-uncensored](https://huggingface.co/HauhauCS/Qwen3.5-27B-Uncensored-HauhauCS-Aggressive), [qwen3.5-35b-a3b-uncensored](https://huggingface.co/HauhauCS/Qwen3.5-35B-A3B-Uncensored-HauhauCS-Aggressive)
 
239
 
240
  #### Fine-Tuned Translation Models
241
 
242
  These models are more specialized for translation quality, language coverage, or lower-resource setups.
243
 
244
- - [vntl-llama3-8b-v2](https://huggingface.co/lmg-anon/vntl-llama3-8b-v2-gguf): around 8.5 GB in Q8_0, best when translation quality matters more than speed or memory use
245
  - [lfm2.5-1.2b-instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF): a smaller multilingual instruct model that is easier to run on CPUs or low-memory GPUs
246
  - [sugoi-14b-ultra](https://huggingface.co/sugoitoolkit/Sugoi-14B-Ultra-GGUF) and [sugoi-32b-ultra](https://huggingface.co/sugoitoolkit/Sugoi-32B-Ultra-GGUF): larger translation-oriented options when you have more VRAM or RAM available
247
- - [sakura-galtransl-7b-v3.7](https://huggingface.co/SakuraLLM/Sakura-GalTransl-7B-v3.7): around 6.3 GB, a good balance of quality and speed on 8 GB GPUs
248
  - [sakura-1.5b-qwen2.5-v1.0](https://huggingface.co/shing3232/Sakura-1.5B-Qwen2.5-v1.0-GGUF-IMX): lighter and faster, useful on mid-range GPUs or CPU-only setups
249
- - [hunyuan-mt-7b](https://huggingface.co/Mungert/Hunyuan-MT-7B-GGUF): around 6.3 GB, with broad multilingual translation coverage
250
 
251
  LLMs are downloaded on demand when you activate a model. For constrained memory environments, start with a smaller model. When VRAM or RAM permits, 7B and 8B class models generally provide better translation quality.
252
 
 
229
 
230
  - Gemma 4 instruct: [gemma4-e2b-it](https://huggingface.co/unsloth/gemma-4-E2B-it-GGUF), [gemma4-e4b-it](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF), [gemma4-26b-a4b-it](https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF), [gemma4-31b-it](https://huggingface.co/unsloth/gemma-4-31B-it-GGUF)
231
  - Qwen 3.5: [qwen3.5-0.8b](https://huggingface.co/unsloth/Qwen3.5-0.8B-GGUF), [qwen3.5-2b](https://huggingface.co/unsloth/Qwen3.5-2B-GGUF), [qwen3.5-4b](https://huggingface.co/unsloth/Qwen3.5-4B-GGUF), [qwen3.5-9b](https://huggingface.co/unsloth/Qwen3.5-9B-GGUF), [qwen3.5-27b](https://huggingface.co/unsloth/Qwen3.5-27B-GGUF), [qwen3.5-35b-a3b](https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF)
232
+ - Qwen 3.6: [qwen3.6-27b](https://huggingface.co/unsloth/Qwen3.6-27B-GGUF), [qwen3.6-35b-a3b](https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF)
233
 
234
  #### NSFW-Capable Local Models
235
 
 
237
 
238
  - Gemma 4 uncensored: [gemma4-e2b-uncensored](https://huggingface.co/HauhauCS/Gemma-4-E2B-Uncensored-HauhauCS-Aggressive), [gemma4-e4b-uncensored](https://huggingface.co/HauhauCS/Gemma-4-E4B-Uncensored-HauhauCS-Aggressive)
239
  - Qwen 3.5 uncensored: [qwen3.5-2b-uncensored](https://huggingface.co/HauhauCS/Qwen3.5-2B-Uncensored-HauhauCS-Aggressive), [qwen3.5-4b-uncensored](https://huggingface.co/HauhauCS/Qwen3.5-4B-Uncensored-HauhauCS-Aggressive), [qwen3.5-9b-uncensored](https://huggingface.co/HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive), [qwen3.5-27b-uncensored](https://huggingface.co/HauhauCS/Qwen3.5-27B-Uncensored-HauhauCS-Aggressive), [qwen3.5-35b-a3b-uncensored](https://huggingface.co/HauhauCS/Qwen3.5-35B-A3B-Uncensored-HauhauCS-Aggressive)
240
+ - Qwen 3.6 uncensored: [qwen3.6-27b-uncensored](https://huggingface.co/HauhauCS/Qwen3.6-27B-Uncensored-HauhauCS-Balanced), [qwen3.6-35b-a3b-uncensored](https://huggingface.co/HauhauCS/Qwen3.6-35B-A3B-Uncensored-HauhauCS-Aggressive)
241
 
242
  #### Fine-Tuned Translation Models
243
 
244
  These models are more specialized for translation quality, language coverage, or lower-resource setups.
245
 
246
+ - [vntl-llama3-8b-v2](https://huggingface.co/lmg-anon/vntl-llama3-8b-v2-gguf): a Q5_K_M GGUF, best when translation quality matters more than speed or memory use
247
  - [lfm2.5-1.2b-instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF): a smaller multilingual instruct model that is easier to run on CPUs or low-memory GPUs
248
  - [sugoi-14b-ultra](https://huggingface.co/sugoitoolkit/Sugoi-14B-Ultra-GGUF) and [sugoi-32b-ultra](https://huggingface.co/sugoitoolkit/Sugoi-32B-Ultra-GGUF): larger translation-oriented options when you have more VRAM or RAM available
249
+ - [sakura-galtransl-7b-v3.7](https://huggingface.co/SakuraLLM/Sakura-GalTransl-7B-v3.7): a smaller IQ4_XS GGUF that offers a good balance of quality and speed on 8 GB GPUs
250
  - [sakura-1.5b-qwen2.5-v1.0](https://huggingface.co/shing3232/Sakura-1.5B-Qwen2.5-v1.0-GGUF-IMX): lighter and faster, useful on mid-range GPUs or CPU-only setups
251
+ - [hunyuan-mt-7b](https://huggingface.co/Mungert/Hunyuan-MT-7B-GGUF): a Q4_K_M GGUF with broad multilingual translation coverage
252
 
253
  LLMs are downloaded on demand when you activate a model. For constrained memory environments, start with a smaller model. When VRAM or RAM permits, 7B and 8B class models generally provide better translation quality.
254
 
docs/en-US/explanation/models-and-providers.md CHANGED
@@ -57,7 +57,7 @@ In practice, the local models are usually quantized decoder-only transformers. G
57
 
58
  ### Translation-focused built-in local models for English output
59
 
60
- - [vntl-llama3-8b-v2](https://huggingface.co/lmg-anon/vntl-llama3-8b-v2-gguf): around 8.5 GB in Q8_0 form, best when translation quality matters most
61
  - [lfm2.5-1.2b-instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF): a smaller multilingual instruct option for low-memory systems or faster iteration
62
  - [sugoi-14b-ultra](https://huggingface.co/sugoitoolkit/Sugoi-14B-Ultra-GGUF) and [sugoi-32b-ultra](https://huggingface.co/sugoitoolkit/Sugoi-32B-Ultra-GGUF): larger translation-oriented choices when you want more headroom
63
 
@@ -78,6 +78,8 @@ The local picker also includes general-purpose families that are not translation
78
  - Gemma 4 uncensored: `gemma4-e2b-uncensored`, `gemma4-e4b-uncensored`
79
  - Qwen 3.5: `qwen3.5-0.8b`, `qwen3.5-2b`, `qwen3.5-4b`, `qwen3.5-9b`, `qwen3.5-27b`, `qwen3.5-35b-a3b`
80
  - Qwen 3.5 uncensored: `qwen3.5-2b-uncensored`, `qwen3.5-4b-uncensored`, `qwen3.5-9b-uncensored`, `qwen3.5-27b-uncensored`, `qwen3.5-35b-a3b-uncensored`
 
 
81
 
82
  ## Remote providers
83
 
 
57
 
58
  ### Translation-focused built-in local models for English output
59
 
60
+ - [vntl-llama3-8b-v2](https://huggingface.co/lmg-anon/vntl-llama3-8b-v2-gguf): a Q5_K_M GGUF, best when translation quality matters most
61
  - [lfm2.5-1.2b-instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF): a smaller multilingual instruct option for low-memory systems or faster iteration
62
  - [sugoi-14b-ultra](https://huggingface.co/sugoitoolkit/Sugoi-14B-Ultra-GGUF) and [sugoi-32b-ultra](https://huggingface.co/sugoitoolkit/Sugoi-32B-Ultra-GGUF): larger translation-oriented choices when you want more headroom
63
 
 
78
  - Gemma 4 uncensored: `gemma4-e2b-uncensored`, `gemma4-e4b-uncensored`
79
  - Qwen 3.5: `qwen3.5-0.8b`, `qwen3.5-2b`, `qwen3.5-4b`, `qwen3.5-9b`, `qwen3.5-27b`, `qwen3.5-35b-a3b`
80
  - Qwen 3.5 uncensored: `qwen3.5-2b-uncensored`, `qwen3.5-4b-uncensored`, `qwen3.5-9b-uncensored`, `qwen3.5-27b-uncensored`, `qwen3.5-35b-a3b-uncensored`
81
+ - Qwen 3.6: `qwen3.6-27b`, `qwen3.6-35b-a3b`
82
+ - Qwen 3.6 uncensored: `qwen3.6-27b-uncensored`, `qwen3.6-35b-a3b-uncensored`
83
 
84
  ## Remote providers
85
 
docs/ja-JP/explanation/models-and-providers.md CHANGED
@@ -57,7 +57,7 @@ Koharu は [llama.cpp](https://github.com/ggml-org/llama.cpp) を通じてロー
57
 
58
  ### 英語出力向けの翻訳特化組み込みローカルモデル
59
 
60
- - [vntl-llama3-8b-v2](https://huggingface.co/lmg-anon/vntl-llama3-8b-v2-gguf): Q8_0 で約 8.5 GB。翻訳品質を優先するなら有力
61
  - [lfm2.5-1.2b-instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF): 低メモリ環境や高速な試行に向く小型の多言語 instruction モデル
62
  - [sugoi-14b-ultra](https://huggingface.co/sugoitoolkit/Sugoi-14B-Ultra-GGUF) と [sugoi-32b-ultra](https://huggingface.co/sugoitoolkit/Sugoi-32B-Ultra-GGUF): より多くの VRAM / RAM を使える環境向けの大型翻訳寄りモデル
63
 
@@ -78,6 +78,8 @@ LLM ピッカーには、翻訳専用ではない汎用ファミリも含まれ
78
  - Gemma 4 uncensored: `gemma4-e2b-uncensored`, `gemma4-e4b-uncensored`
79
  - Qwen 3.5: `qwen3.5-0.8b`, `qwen3.5-2b`, `qwen3.5-4b`, `qwen3.5-9b`, `qwen3.5-27b`, `qwen3.5-35b-a3b`
80
  - Qwen 3.5 uncensored: `qwen3.5-2b-uncensored`, `qwen3.5-4b-uncensored`, `qwen3.5-9b-uncensored`, `qwen3.5-27b-uncensored`, `qwen3.5-35b-a3b-uncensored`
 
 
81
 
82
  ## リモートプロバイダ
83
 
 
57
 
58
  ### 英語出力向けの翻訳特化組み込みローカルモデル
59
 
60
+ - [vntl-llama3-8b-v2](https://huggingface.co/lmg-anon/vntl-llama3-8b-v2-gguf): Q5_K_M GGUF。翻訳品質を優先するなら有力
61
  - [lfm2.5-1.2b-instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF): 低メモリ環境や高速な試行に向く小型の多言語 instruction モデル
62
  - [sugoi-14b-ultra](https://huggingface.co/sugoitoolkit/Sugoi-14B-Ultra-GGUF) と [sugoi-32b-ultra](https://huggingface.co/sugoitoolkit/Sugoi-32B-Ultra-GGUF): より多くの VRAM / RAM を使える環境向けの大型翻訳寄りモデル
63
 
 
78
  - Gemma 4 uncensored: `gemma4-e2b-uncensored`, `gemma4-e4b-uncensored`
79
  - Qwen 3.5: `qwen3.5-0.8b`, `qwen3.5-2b`, `qwen3.5-4b`, `qwen3.5-9b`, `qwen3.5-27b`, `qwen3.5-35b-a3b`
80
  - Qwen 3.5 uncensored: `qwen3.5-2b-uncensored`, `qwen3.5-4b-uncensored`, `qwen3.5-9b-uncensored`, `qwen3.5-27b-uncensored`, `qwen3.5-35b-a3b-uncensored`
81
+ - Qwen 3.6: `qwen3.6-27b`, `qwen3.6-35b-a3b`
82
+ - Qwen 3.6 uncensored: `qwen3.6-27b-uncensored`, `qwen3.6-35b-a3b-uncensored`
83
 
84
  ## リモートプロバイダ
85
 
docs/pt-BR/explanation/models-and-providers.md CHANGED
@@ -57,7 +57,7 @@ Na prática, os modelos locais geralmente são transformers decoder-only quantiz
57
 
58
  ### Modelos locais internos focados em tradução para saída em inglês
59
 
60
- - [vntl-llama3-8b-v2](https://huggingface.co/lmg-anon/vntl-llama3-8b-v2-gguf): cerca de 8,5 GB na forma Q8_0, melhor quando a qualidade da tradução importa mais
61
  - [lfm2.5-1.2b-instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF): uma opção menor multilíngue do tipo instruct para sistemas com pouca memória ou iteração mais rápida
62
  - [sugoi-14b-ultra](https://huggingface.co/sugoitoolkit/Sugoi-14B-Ultra-GGUF) e [sugoi-32b-ultra](https://huggingface.co/sugoitoolkit/Sugoi-32B-Ultra-GGUF): escolhas maiores orientadas para tradução quando você quer mais folga
63
 
@@ -78,6 +78,8 @@ O seletor local também inclui famílias de propósito geral que não são espec
78
  - Gemma 4 uncensored: `gemma4-e2b-uncensored`, `gemma4-e4b-uncensored`
79
  - Qwen 3.5: `qwen3.5-0.8b`, `qwen3.5-2b`, `qwen3.5-4b`, `qwen3.5-9b`, `qwen3.5-27b`, `qwen3.5-35b-a3b`
80
  - Qwen 3.5 uncensored: `qwen3.5-2b-uncensored`, `qwen3.5-4b-uncensored`, `qwen3.5-9b-uncensored`, `qwen3.5-27b-uncensored`, `qwen3.5-35b-a3b-uncensored`
 
 
81
 
82
  ## Provedores remotos
83
 
 
57
 
58
  ### Modelos locais internos focados em tradução para saída em inglês
59
 
60
+ - [vntl-llama3-8b-v2](https://huggingface.co/lmg-anon/vntl-llama3-8b-v2-gguf): um GGUF Q5_K_M, melhor quando a qualidade da tradução importa mais
61
  - [lfm2.5-1.2b-instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF): uma opção menor multilíngue do tipo instruct para sistemas com pouca memória ou iteração mais rápida
62
  - [sugoi-14b-ultra](https://huggingface.co/sugoitoolkit/Sugoi-14B-Ultra-GGUF) e [sugoi-32b-ultra](https://huggingface.co/sugoitoolkit/Sugoi-32B-Ultra-GGUF): escolhas maiores orientadas para tradução quando você quer mais folga
63
 
 
78
  - Gemma 4 uncensored: `gemma4-e2b-uncensored`, `gemma4-e4b-uncensored`
79
  - Qwen 3.5: `qwen3.5-0.8b`, `qwen3.5-2b`, `qwen3.5-4b`, `qwen3.5-9b`, `qwen3.5-27b`, `qwen3.5-35b-a3b`
80
  - Qwen 3.5 uncensored: `qwen3.5-2b-uncensored`, `qwen3.5-4b-uncensored`, `qwen3.5-9b-uncensored`, `qwen3.5-27b-uncensored`, `qwen3.5-35b-a3b-uncensored`
81
+ - Qwen 3.6: `qwen3.6-27b`, `qwen3.6-35b-a3b`
82
+ - Qwen 3.6 uncensored: `qwen3.6-27b-uncensored`, `qwen3.6-35b-a3b-uncensored`
83
 
84
  ## Provedores remotos
85
 
docs/zh-CN/explanation/models-and-providers.md CHANGED
@@ -57,7 +57,7 @@ Koharu 通过 [llama.cpp](https://github.com/ggml-org/llama.cpp) 支持本地 GG
57
 
58
  ### 面向英文输出的翻译型内置本地模型
59
 
60
- - [vntl-llama3-8b-v2](https://huggingface.co/lmg-anon/vntl-llama3-8b-v2-gguf):Q8_0 约 8.5 GB,更适合追求翻译质量
61
  - [lfm2.5-1.2b-instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF):更小的多语言 instruction 模型,适合低内存机器或更快的迭代
62
  - [sugoi-14b-ultra](https://huggingface.co/sugoitoolkit/Sugoi-14B-Ultra-GGUF) 和 [sugoi-32b-ultra](https://huggingface.co/sugoitoolkit/Sugoi-32B-Ultra-GGUF):更大的翻译取向模型,适合有更多 VRAM / RAM 的环境
63
 
@@ -78,6 +78,8 @@ Koharu 通过 [llama.cpp](https://github.com/ggml-org/llama.cpp) 支持本地 GG
78
  - Gemma 4 uncensored:`gemma4-e2b-uncensored`、`gemma4-e4b-uncensored`
79
  - Qwen 3.5:`qwen3.5-0.8b`、`qwen3.5-2b`、`qwen3.5-4b`、`qwen3.5-9b`、`qwen3.5-27b`、`qwen3.5-35b-a3b`
80
  - Qwen 3.5 uncensored:`qwen3.5-2b-uncensored`、`qwen3.5-4b-uncensored`、`qwen3.5-9b-uncensored`、`qwen3.5-27b-uncensored`、`qwen3.5-35b-a3b-uncensored`
 
 
81
 
82
  ## 远程提供商
83
 
 
57
 
58
  ### 面向英文输出的翻译型内置本地模型
59
 
60
+ - [vntl-llama3-8b-v2](https://huggingface.co/lmg-anon/vntl-llama3-8b-v2-gguf):Q5_K_M GGUF,更适合追求翻译质量
61
  - [lfm2.5-1.2b-instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF):更小的多语言 instruction 模型,适合低内存机器或更快的迭代
62
  - [sugoi-14b-ultra](https://huggingface.co/sugoitoolkit/Sugoi-14B-Ultra-GGUF) 和 [sugoi-32b-ultra](https://huggingface.co/sugoitoolkit/Sugoi-32B-Ultra-GGUF):更大的翻译取向模型,适合有更多 VRAM / RAM 的环境
63
 
 
78
  - Gemma 4 uncensored:`gemma4-e2b-uncensored`、`gemma4-e4b-uncensored`
79
  - Qwen 3.5:`qwen3.5-0.8b`、`qwen3.5-2b`、`qwen3.5-4b`、`qwen3.5-9b`、`qwen3.5-27b`、`qwen3.5-35b-a3b`
80
  - Qwen 3.5 uncensored:`qwen3.5-2b-uncensored`、`qwen3.5-4b-uncensored`、`qwen3.5-9b-uncensored`、`qwen3.5-27b-uncensored`、`qwen3.5-35b-a3b-uncensored`
81
+ - Qwen 3.6:`qwen3.6-27b`、`qwen3.6-35b-a3b`
82
+ - Qwen 3.6 uncensored:`qwen3.6-27b-uncensored`、`qwen3.6-35b-a3b-uncensored`
83
 
84
  ## 远程提供商
85
 
koharu-llm/src/lib.rs CHANGED
@@ -51,7 +51,7 @@ pub enum ModelId {
51
  serialize = "vntl-llama3-8b-v2",
52
  props(
53
  repo = "lmg-anon/vntl-llama3-8b-v2-gguf",
54
- filename = "vntl-llama3-8b-v2-hf-q8_0.gguf",
55
  languages = "en-US"
56
  )
57
  )]
@@ -60,7 +60,7 @@ pub enum ModelId {
60
  serialize = "lfm2.5-1.2b-instruct",
61
  props(
62
  repo = "LiquidAI/LFM2.5-1.2B-Instruct-GGUF",
63
- filename = "LFM2.5-1.2B-Instruct-Q8_0.gguf",
64
  languages = "en-US,ar-SA,zh-CN,fr-FR,de-DE,ja-JP,ko-KR,pt-PT,es-ES"
65
  )
66
  )]
@@ -69,7 +69,7 @@ pub enum ModelId {
69
  serialize = "sakura-galtransl-7b-v3.7",
70
  props(
71
  repo = "SakuraLLM/Sakura-GalTransl-7B-v3.7",
72
- filename = "Sakura-Galtransl-7B-v3.7.gguf",
73
  languages = "zh-CN"
74
  )
75
  )]
@@ -87,7 +87,7 @@ pub enum ModelId {
87
  serialize = "hunyuan-mt-7b",
88
  props(
89
  repo = "Mungert/Hunyuan-MT-7B-GGUF",
90
- filename = "Hunyuan-MT-7B-q6_k_m.gguf",
91
  languages = "zh-CN,en-US,fr-FR,pt-PT,pt-BR,es-ES,ja-JP,tr-TR,ru-RU,ar-SA,ko-KR,th-TH,it-IT,de-DE,vi-VN,ms-MY,id-ID,fil-PH,hi-IN,zh-TW,pl-PL,cs-CZ,nl-NL,km-KH,my-MM,fa-IR,gu-IN,ur-PK,te-IN,mr-IN,he-IL,bn-BD,ta-IN,uk-UA,bo-CN,kk-KZ,mn-MN,ug-CN,yue-HK"
92
  )
93
  )]
@@ -96,7 +96,7 @@ pub enum ModelId {
96
  serialize = "sugoi-14b-ultra",
97
  props(
98
  repo = "sugoitoolkit/Sugoi-14B-Ultra-GGUF",
99
- filename = "Sugoi-14B-Ultra-Q8_0.gguf",
100
  languages = "en-US"
101
  )
102
  )]
@@ -114,7 +114,7 @@ pub enum ModelId {
114
  serialize = "gemma4-e2b-it",
115
  props(
116
  repo = "unsloth/gemma-4-E2B-it-GGUF",
117
- filename = "gemma-4-e2b-it-Q8_0.gguf",
118
  languages = "*"
119
  )
120
  )]
@@ -123,7 +123,7 @@ pub enum ModelId {
123
  serialize = "gemma4-e4b-it",
124
  props(
125
  repo = "unsloth/gemma-4-E4B-it-GGUF",
126
- filename = "gemma-4-e4b-it-Q8_0.gguf",
127
  languages = "*"
128
  )
129
  )]
@@ -132,7 +132,7 @@ pub enum ModelId {
132
  serialize = "gemma4-26b-a4b-it",
133
  props(
134
  repo = "unsloth/gemma-4-26B-A4B-it-GGUF",
135
- filename = "gemma-4-26B-A4B-it-Q8_0.gguf",
136
  languages = "*"
137
  )
138
  )]
@@ -150,7 +150,7 @@ pub enum ModelId {
150
  serialize = "gemma4-e2b-uncensored",
151
  props(
152
  repo = "HauhauCS/Gemma-4-E2B-Uncensored-HauhauCS-Aggressive",
153
- filename = "Gemma-4-E2B-Uncensored-HauhauCS-Aggressive-Q8_K_P.gguf",
154
  languages = "*"
155
  )
156
  )]
@@ -168,7 +168,7 @@ pub enum ModelId {
168
  serialize = "qwen3.5-0.8b",
169
  props(
170
  repo = "unsloth/Qwen3.5-0.8B-GGUF",
171
- filename = "Qwen3.5-0.8B-Q8_0.gguf",
172
  languages = "*"
173
  )
174
  )]
@@ -177,7 +177,7 @@ pub enum ModelId {
177
  serialize = "qwen3.5-2b",
178
  props(
179
  repo = "unsloth/Qwen3.5-2B-GGUF",
180
- filename = "Qwen3.5-2B-Q8_0.gguf",
181
  languages = "*"
182
  )
183
  )]
@@ -186,7 +186,7 @@ pub enum ModelId {
186
  serialize = "qwen3.5-4b",
187
  props(
188
  repo = "unsloth/Qwen3.5-4B-GGUF",
189
- filename = "Qwen3.5-4B-Q8_0.gguf",
190
  languages = "*"
191
  )
192
  )]
@@ -195,7 +195,7 @@ pub enum ModelId {
195
  serialize = "qwen3.5-9b",
196
  props(
197
  repo = "unsloth/Qwen3.5-9B-GGUF",
198
- filename = "Qwen3.5-9B-Q8_0.gguf",
199
  languages = "*"
200
  )
201
  )]
@@ -213,16 +213,34 @@ pub enum ModelId {
213
  serialize = "qwen3.5-35b-a3b",
214
  props(
215
  repo = "unsloth/Qwen3.5-35B-A3B-GGUF",
216
- filename = "Qwen3.5-35B-A3B-Q8_0.gguf",
217
  languages = "*"
218
  )
219
  )]
220
  Qwen3_5_35bA3b,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  #[strum(
222
  serialize = "qwen3.5-2b-uncensored",
223
  props(
224
  repo = "HauhauCS/Qwen3.5-2B-Uncensored-HauhauCS-Aggressive",
225
- filename = "Qwen3.5-2B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf",
226
  languages = "*"
227
  )
228
  )]
@@ -231,7 +249,7 @@ pub enum ModelId {
231
  serialize = "qwen3.5-4b-uncensored",
232
  props(
233
  repo = "HauhauCS/Qwen3.5-4B-Uncensored-HauhauCS-Aggressive",
234
- filename = "Qwen3.5-4B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf",
235
  languages = "*"
236
  )
237
  )]
@@ -240,7 +258,7 @@ pub enum ModelId {
240
  serialize = "qwen3.5-9b-uncensored",
241
  props(
242
  repo = "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive",
243
- filename = "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf",
244
  languages = "*"
245
  )
246
  )]
@@ -258,11 +276,29 @@ pub enum ModelId {
258
  serialize = "qwen3.5-35b-a3b-uncensored",
259
  props(
260
  repo = "HauhauCS/Qwen3.5-35B-A3B-Uncensored-HauhauCS-Aggressive",
261
- filename = "Qwen3.5-35B-A3B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf",
262
  languages = "*"
263
  )
264
  )]
265
  Qwen3_5_35bA3bUncensored,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  }
267
 
268
  impl ModelId {
@@ -319,6 +355,19 @@ impl ModelId {
319
  repeat_penalty: 1.0,
320
  ..Default::default()
321
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
322
  // Sugoi: temp=0.1, top_k=40, top_p=0.95, min_p=0.05, repeat=1.1
323
  Self::Sugoi14bUltra | Self::Sugoi32bUltra => GenerateOptions {
324
  temperature: 0.1,
 
51
  serialize = "vntl-llama3-8b-v2",
52
  props(
53
  repo = "lmg-anon/vntl-llama3-8b-v2-gguf",
54
+ filename = "vntl-llama3-8b-v2-hf-q5_k_m.gguf",
55
  languages = "en-US"
56
  )
57
  )]
 
60
  serialize = "lfm2.5-1.2b-instruct",
61
  props(
62
  repo = "LiquidAI/LFM2.5-1.2B-Instruct-GGUF",
63
+ filename = "LFM2.5-1.2B-Instruct-Q4_K_M.gguf",
64
  languages = "en-US,ar-SA,zh-CN,fr-FR,de-DE,ja-JP,ko-KR,pt-PT,es-ES"
65
  )
66
  )]
 
69
  serialize = "sakura-galtransl-7b-v3.7",
70
  props(
71
  repo = "SakuraLLM/Sakura-GalTransl-7B-v3.7",
72
+ filename = "Sakura-Galtransl-7B-v3.7-IQ4_XS.gguf",
73
  languages = "zh-CN"
74
  )
75
  )]
 
87
  serialize = "hunyuan-mt-7b",
88
  props(
89
  repo = "Mungert/Hunyuan-MT-7B-GGUF",
90
+ filename = "Hunyuan-MT-7B-q4_k_m.gguf",
91
  languages = "zh-CN,en-US,fr-FR,pt-PT,pt-BR,es-ES,ja-JP,tr-TR,ru-RU,ar-SA,ko-KR,th-TH,it-IT,de-DE,vi-VN,ms-MY,id-ID,fil-PH,hi-IN,zh-TW,pl-PL,cs-CZ,nl-NL,km-KH,my-MM,fa-IR,gu-IN,ur-PK,te-IN,mr-IN,he-IL,bn-BD,ta-IN,uk-UA,bo-CN,kk-KZ,mn-MN,ug-CN,yue-HK"
92
  )
93
  )]
 
96
  serialize = "sugoi-14b-ultra",
97
  props(
98
  repo = "sugoitoolkit/Sugoi-14B-Ultra-GGUF",
99
+ filename = "Sugoi-14B-Ultra-Q4_K_M.gguf",
100
  languages = "en-US"
101
  )
102
  )]
 
114
  serialize = "gemma4-e2b-it",
115
  props(
116
  repo = "unsloth/gemma-4-E2B-it-GGUF",
117
+ filename = "gemma-4-E2B-it-Q4_K_M.gguf",
118
  languages = "*"
119
  )
120
  )]
 
123
  serialize = "gemma4-e4b-it",
124
  props(
125
  repo = "unsloth/gemma-4-E4B-it-GGUF",
126
+ filename = "gemma-4-E4B-it-Q4_K_M.gguf",
127
  languages = "*"
128
  )
129
  )]
 
132
  serialize = "gemma4-26b-a4b-it",
133
  props(
134
  repo = "unsloth/gemma-4-26B-A4B-it-GGUF",
135
+ filename = "gemma-4-26B-A4B-it-UD-Q4_K_M.gguf",
136
  languages = "*"
137
  )
138
  )]
 
150
  serialize = "gemma4-e2b-uncensored",
151
  props(
152
  repo = "HauhauCS/Gemma-4-E2B-Uncensored-HauhauCS-Aggressive",
153
+ filename = "Gemma-4-E2B-Uncensored-HauhauCS-Aggressive-Q4_K_P.gguf",
154
  languages = "*"
155
  )
156
  )]
 
168
  serialize = "qwen3.5-0.8b",
169
  props(
170
  repo = "unsloth/Qwen3.5-0.8B-GGUF",
171
+ filename = "Qwen3.5-0.8B-Q4_K_M.gguf",
172
  languages = "*"
173
  )
174
  )]
 
177
  serialize = "qwen3.5-2b",
178
  props(
179
  repo = "unsloth/Qwen3.5-2B-GGUF",
180
+ filename = "Qwen3.5-2B-Q4_K_M.gguf",
181
  languages = "*"
182
  )
183
  )]
 
186
  serialize = "qwen3.5-4b",
187
  props(
188
  repo = "unsloth/Qwen3.5-4B-GGUF",
189
+ filename = "Qwen3.5-4B-Q4_K_M.gguf",
190
  languages = "*"
191
  )
192
  )]
 
195
  serialize = "qwen3.5-9b",
196
  props(
197
  repo = "unsloth/Qwen3.5-9B-GGUF",
198
+ filename = "Qwen3.5-9B-Q4_K_M.gguf",
199
  languages = "*"
200
  )
201
  )]
 
213
  serialize = "qwen3.5-35b-a3b",
214
  props(
215
  repo = "unsloth/Qwen3.5-35B-A3B-GGUF",
216
+ filename = "Qwen3.5-35B-A3B-Q4_K_M.gguf",
217
  languages = "*"
218
  )
219
  )]
220
  Qwen3_5_35bA3b,
221
+ #[strum(
222
+ serialize = "qwen3.6-27b",
223
+ props(
224
+ repo = "unsloth/Qwen3.6-27B-GGUF",
225
+ filename = "Qwen3.6-27B-IQ4_XS.gguf",
226
+ languages = "*"
227
+ )
228
+ )]
229
+ Qwen3_6_27b,
230
+ #[strum(
231
+ serialize = "qwen3.6-35b-a3b",
232
+ props(
233
+ repo = "unsloth/Qwen3.6-35B-A3B-GGUF",
234
+ filename = "Qwen3.6-35B-A3B-UD-IQ4_XS.gguf",
235
+ languages = "*"
236
+ )
237
+ )]
238
+ Qwen3_6_35bA3b,
239
  #[strum(
240
  serialize = "qwen3.5-2b-uncensored",
241
  props(
242
  repo = "HauhauCS/Qwen3.5-2B-Uncensored-HauhauCS-Aggressive",
243
+ filename = "Qwen3.5-2B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf",
244
  languages = "*"
245
  )
246
  )]
 
249
  serialize = "qwen3.5-4b-uncensored",
250
  props(
251
  repo = "HauhauCS/Qwen3.5-4B-Uncensored-HauhauCS-Aggressive",
252
+ filename = "Qwen3.5-4B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf",
253
  languages = "*"
254
  )
255
  )]
 
258
  serialize = "qwen3.5-9b-uncensored",
259
  props(
260
  repo = "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive",
261
+ filename = "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf",
262
  languages = "*"
263
  )
264
  )]
 
276
  serialize = "qwen3.5-35b-a3b-uncensored",
277
  props(
278
  repo = "HauhauCS/Qwen3.5-35B-A3B-Uncensored-HauhauCS-Aggressive",
279
+ filename = "Qwen3.5-35B-A3B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf",
280
  languages = "*"
281
  )
282
  )]
283
  Qwen3_5_35bA3bUncensored,
284
+ #[strum(
285
+ serialize = "qwen3.6-27b-uncensored",
286
+ props(
287
+ repo = "HauhauCS/Qwen3.6-27B-Uncensored-HauhauCS-Aggressive",
288
+ filename = "Qwen3.6-27B-Uncensored-HauhauCS-Aggressive-IQ4_XS.gguf",
289
+ languages = "*"
290
+ )
291
+ )]
292
+ Qwen3_6_27bUncensored,
293
+ #[strum(
294
+ serialize = "qwen3.6-35b-a3b-uncensored",
295
+ props(
296
+ repo = "HauhauCS/Qwen3.6-35B-A3B-Uncensored-HauhauCS-Aggressive",
297
+ filename = "Qwen3.6-35B-A3B-Uncensored-HauhauCS-Aggressive-IQ4_XS.gguf",
298
+ languages = "*"
299
+ )
300
+ )]
301
+ Qwen3_6_35bA3bUncensored,
302
  }
303
 
304
  impl ModelId {
 
355
  repeat_penalty: 1.0,
356
  ..Default::default()
357
  },
358
+ // Qwen3.6 non-thinking: temp=0.7, top_k=20, top_p=0.8, min_p=0.0, presence=1.5, repeat=1.0
359
+ Self::Qwen3_6_27b
360
+ | Self::Qwen3_6_35bA3b
361
+ | Self::Qwen3_6_27bUncensored
362
+ | Self::Qwen3_6_35bA3bUncensored => GenerateOptions {
363
+ temperature: 0.7,
364
+ top_k: Some(20),
365
+ top_p: Some(0.8),
366
+ min_p: Some(0.0),
367
+ presence_penalty: 1.5,
368
+ repeat_penalty: 1.0,
369
+ ..Default::default()
370
+ },
371
  // Sugoi: temp=0.1, top_k=40, top_p=0.95, min_p=0.05, repeat=1.1
372
  Self::Sugoi14bUltra | Self::Sugoi32bUltra => GenerateOptions {
373
  temperature: 0.1,
koharu-llm/src/safe/context.rs CHANGED
@@ -363,11 +363,6 @@ impl<'model> LlamaContext<'model> {
363
  tracing::debug!("Remove lora adapter");
364
  Ok(())
365
  }
366
-
367
- /// Print a breakdown of per-device memory use to the default logger.
368
- pub fn print_memory_breakdown(&self) {
369
- unsafe { crate::sys::llama_memory_breakdown_print(self.context.as_ptr()) }
370
- }
371
  }
372
 
373
  impl Drop for LlamaContext<'_> {
 
363
  tracing::debug!("Remove lora adapter");
364
  Ok(())
365
  }
 
 
 
 
 
366
  }
367
 
368
  impl Drop for LlamaContext<'_> {
koharu-llm/src/safe/mtmd.rs CHANGED
@@ -190,7 +190,15 @@ impl MtmdContext {
190
  /// Check whether non-causal attention mask is needed before `llama_decode`.
191
  #[must_use]
192
  pub fn decode_use_non_causal(&self) -> bool {
193
- unsafe { crate::sys::mtmd_decode_use_non_causal(self.context.as_ptr()) }
 
 
 
 
 
 
 
 
194
  }
195
 
196
  /// Check whether the current model uses M-RoPE for `llama_decode`.
 
190
  /// Check whether non-causal attention mask is needed before `llama_decode`.
191
  #[must_use]
192
  pub fn decode_use_non_causal(&self) -> bool {
193
+ unsafe { crate::sys::mtmd_decode_use_non_causal(self.context.as_ptr(), std::ptr::null()) }
194
+ }
195
+
196
+ /// Check whether non-causal attention mask is needed before decoding a chunk.
197
+ #[must_use]
198
+ pub fn decode_use_non_causal_for_chunk(&self, chunk: &MtmdInputChunk) -> bool {
199
+ unsafe {
200
+ crate::sys::mtmd_decode_use_non_causal(self.context.as_ptr(), chunk.chunk.as_ptr())
201
+ }
202
  }
203
 
204
  /// Check whether the current model uses M-RoPE for `llama_decode`.