Aspandiyar Nurimanov committed on Nov 21, 2025

Commit

e460f36

verified ·

1 Parent(s): df11108

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +17 -0
BF16/Qolda-BF16.gguf +3 -0
BF16/README.md +199 -0
BF16/config.json +128 -0
BF16/generation_config.json +7 -0
BF16/merges.txt +0 -0
BF16/special_tokens_map.json +27 -0
BF16/tokenizer_config.json +308 -0
BF16/vocab.json +0 -0
IQ2_M/Qolda-IQ2_M.gguf +3 -0
IQ3_XXS/Qolda-IQ3_XXS.gguf +3 -0
IQ4_NL/Qolda-IQ4_NL.gguf +3 -0
IQ4_NL/README.md +199 -0
IQ4_NL/config.json +128 -0
IQ4_NL/generation_config.json +7 -0
IQ4_NL/merges.txt +0 -0
IQ4_NL/special_tokens_map.json +27 -0
IQ4_NL/tokenizer_config.json +308 -0
IQ4_NL/vocab.json +0 -0
IQ4_XS/Qolda-IQ4_XS.gguf +3 -0
IQ4_XS/README.md +199 -0
IQ4_XS/config.json +128 -0
IQ4_XS/generation_config.json +7 -0
IQ4_XS/merges.txt +0 -0
IQ4_XS/special_tokens_map.json +27 -0
IQ4_XS/tokenizer_config.json +308 -0
IQ4_XS/vocab.json +0 -0
Q2_K/Qolda-Q2_K.gguf +3 -0
Q2_K/README.md +199 -0
Q2_K/config.json +128 -0
Q2_K/generation_config.json +7 -0
Q2_K/merges.txt +0 -0
Q2_K/special_tokens_map.json +27 -0
Q2_K/tokenizer_config.json +308 -0
Q2_K/vocab.json +0 -0
Q3_K_M/Qolda-Q3_K_M.gguf +3 -0
Q3_K_M/README.md +199 -0
Q3_K_M/config.json +128 -0
Q3_K_M/generation_config.json +7 -0
Q3_K_M/merges.txt +0 -0
Q3_K_M/special_tokens_map.json +27 -0
Q3_K_M/tokenizer_config.json +308 -0
Q3_K_M/vocab.json +0 -0
Q3_K_S/Qolda-Q3_K_S.gguf +3 -0
Q3_K_S/README.md +199 -0
Q3_K_S/config.json +128 -0
Q3_K_S/generation_config.json +7 -0
Q3_K_S/merges.txt +0 -0
Q3_K_S/special_tokens_map.json +27 -0
Q3_K_S/tokenizer_config.json +308 -0

.gitattributes CHANGED Viewed

@@ -36,3 +36,20 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 Qolda-F16.gguf filter=lfs diff=lfs merge=lfs -text
 Qolda-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
 Qolda-mmproj-F16.gguf filter=lfs diff=lfs merge=lfs -text

 Qolda-F16.gguf filter=lfs diff=lfs merge=lfs -text
 Qolda-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
 Qolda-mmproj-F16.gguf filter=lfs diff=lfs merge=lfs -text
+BF16/Qolda-BF16.gguf filter=lfs diff=lfs merge=lfs -text
+IQ2_M/Qolda-IQ2_M.gguf filter=lfs diff=lfs merge=lfs -text
+IQ3_XXS/Qolda-IQ3_XXS.gguf filter=lfs diff=lfs merge=lfs -text
+IQ4_NL/Qolda-IQ4_NL.gguf filter=lfs diff=lfs merge=lfs -text
+IQ4_XS/Qolda-IQ4_XS.gguf filter=lfs diff=lfs merge=lfs -text
+Q2_K/Qolda-Q2_K.gguf filter=lfs diff=lfs merge=lfs -text
+Q3_K_M/Qolda-Q3_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+Q3_K_S/Qolda-Q3_K_S.gguf filter=lfs diff=lfs merge=lfs -text
+Q4_0/Qolda-Q4_0.gguf filter=lfs diff=lfs merge=lfs -text
+Q4_1/Qolda-Q4_1.gguf filter=lfs diff=lfs merge=lfs -text
+Q4_K_M/Qolda-Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+Q4_K_S/Qolda-Q4_K_S.gguf filter=lfs diff=lfs merge=lfs -text
+Q5_K_M/Qolda-Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+Q5_K_S/Qolda-Q5_K_S.gguf filter=lfs diff=lfs merge=lfs -text
+Q6_K/Qolda-Q6_K.gguf filter=lfs diff=lfs merge=lfs -text
+Q8_0/Qolda-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
+TQ1_0/Qolda-TQ1_0.gguf filter=lfs diff=lfs merge=lfs -text

BF16/Qolda-BF16.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a67799076983cf3b8f28f8b56f99d3027445fc5eeac9944ba4800dfa72e06fab
+size 8051285056

BF16/README.md ADDED Viewed

	@@ -0,0 +1,199 @@

+---
+language:
+- kk
+- ru
+- en
+base_model:
+- OpenGVLab/InternVL3_5-4B
+pipeline_tag: image-text-to-text
+---
+[Қазақша](#кіріспе)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[English](#introduction)
+# Qolda
+[![GitHub](https://img.shields.io/badge/GitHub-Qolda--deployment-blue?logo=github)](https://github.com/IS2AI/Qolda-deployment)
+[![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)](https://www.apache.org/licenses/LICENSE-2.0)
+## Introduction
+Built on top of InternVL3.5 and Qwen3, **Qolda** is a small vision-language model designed to operate in Kazakh, Russian, and English. The model has 4.3B parameters and comprises the InternViT-300M vision encoder and MLP Projector components from [InternVL3.5-4B](https://huggingface.co/OpenGVLab/InternVL3_5-4B), along with the [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B) language model. Model training was performed using the [InternVL framework](https://github.com/OpenGVLab/InternVL) 💙
+The name "Qolda" reflects both its design and purpose in Kazakh: "in hand" (қолда) for its compact accessibility, and "to support" (қолдау) for its assistive nature.
+## Evaluation Results
+Evaluation was conducted separately for text-only and vision-language modalities. Qolda demonstrates significant performance improvements for Kazakh while maintaining comparable performance on Russian and English.
+### Text Benchmarks
+![Model performance comparison on language benchmarks](assets/eval-results-text.png)
+*Performance comparison on language tasks including MMLU, Winogrande, HellaSwag, ARC, GSM8K, and DROP.*
+**Note:** The comparison below presents Qolda's performance against Qwen3-4B on **Kazakh** language benchmarks only. Evaluation results for additional models and performance on Russian and English will be added later.
+| Model | Mode | Avg | MMLU | Winogrande | HellaSwag | ARC | GSM8K | DROP |
+|-------|------|-----|------|------------|-----------|-----|-------|------|
+| Qwen3-4B | Direct | 52.00 | 42.43 | 56.88 | 42.04 | 64.77 | 73.62 | 32.27 |
+| Qwen3-4B | Think | 57.73 | 52.98 | 51.27 | 41.86 | 79.65 | 64.82 | 55.81 |
+| Qolda | Direct | 58.77 | 46.55 | 56.37 | 55.75 | 73.62 | 63.50 | 56.84 |
+| Qolda | Think | **71.64** | **64.56** | **70.54** | **57.70** | **89.99** | **79.47** | **67.59** |
+### Vision Benchmarks
+![Model performance comparison on vision-language benchmarks](assets/eval-results-vision.png)
+*Performance comparison on vision-language tasks including AI2D, MMStar, RealWorldQA, and KazakhOCR.*
+**Note:** The comparison below presents Qolda's performance against InternVL3.5-4B on **Kazakh** vision-language benchmarks only. Evaluation results for additional models and performance on Russian and English will be added later.
+| Model | Mode | Avg | AI2D | MMStar | RealWorldQA | KazakhOCR |
+|-------|------|--------|--------|----------|---------------|-------------|
+| InternVL3.5-4B | Direct | 42.23 | 52.33 | 47.47 | 38.32 | 30.81 |
+| InternVL3.5-4B | Think | 42.58 | 51.42 | 49.33 | 38.74 | 30.81 |
+| Qolda | Direct | 59.39 | 66.06 | 55.47 | 54.97 | **61.06** |
+| Qolda | Think | **60.44** | **67.62** | **56.53** | **57.07** | 60.54 |
+## Model Usage
+To run inference with Transformers, please follow the [guidelines](https://huggingface.co/OpenGVLab/InternVL3_5-4B#inference-with-transformers) from InternVL.
+Alternatively, to run the model via an OpenAI-compatible server, you can use lmdeploy:
+```bash
+pip install "lmdeploy>=0.9.1"
+lmdeploy serve api_server issai/Qolda --server-port 23333 --tp 1 --backend pytorch
+```
+**Note:** Unlike the original InternVL3.5, this model requires the `enable_thinking` parameter to be explicitly set in the `extra_body` of your API calls. However, depending on the task complexity, an empty thinking response might be generated.
+Then, make a standard API call:
+```python
+import base64
+from openai import OpenAI
+client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+image_path = "./assets/eval-results-text.png"
+response = client.chat.completions.create(
+    model=client.models.list().data[0].id,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'text',
+                'text': 'Берілген диаграмманың сипаттамасын бер.'
+            },
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': f'data:image/png;base64,{encode_image(image_path)}',
+                },
+            }
+        ],
+    }],
+    max_tokens=8192,
+    temperature=0.6,
+    top_p=0.95,
+    extra_body={
+        "top_k": 20,
+        "enable_thinking": True
+    },
+)
+print(response.choices[0].message.content)
+```
+## License
+This model is licensed under the Apache License 2.0.
+## Кіріспе
+InternVL3.5 және Qwen3 негізінде жасалған **Qolda** — қазақ, орыс және ағылшын тілдерінде жұмыс істеуге арналған шағын көру-тілдік моделі (vision-language model). Модель 4,3 млрд параметрге ие және [InternVL3.5-4B](https://huggingface.co/OpenGVLab/InternVL3_5-4B) моделінің InternViT-300M көру энкодері мен MLP проектор компоненттерін, сондай-ақ [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B) тілдік моделін қамтиды. Модельді оқыту [InternVL фреймворкі](https://github.com/OpenGVLab/InternVL) көмегімен жүзеге асырылды 💙
+"Qolda" атауы модельдің дизайны мен мақсатын қазақ тіліндегі қолда сөзінің қос мағынасы арқылы көрсетеді. Біріншісі, шағын әрі қолжетімді болуы үшін "қолда" сөзі арқылы және екіншісі, көмекші табиғаты үшін, "қолдау" мағынасы арқылы.
+## Бағалау нәтижелері
+Мәтіндік және көру-тілдік модальділіктер үшін бағалау бөлек жүргізілді. Qolda орыс және ағылшын тілдеріндегі өзінің бастапқы деңгейін сақтай отырып, қазақ тіліндегі өнімділігін айтарлықтай жақсартты.
+### Мәтіндік бенчмарктар
+![Тілдік бенчмарктардағы модель өнімділігін салыстыру](assets/eval-results-text.png)
+*MMLU, Winogrande, HellaSwag, ARC, GSM8K және DROP сияқты тілдік тапсырмалардағы өнімділікті салыстыру.*
+**Ескерту:** Төмендегі кестедегі Qolda және Qwen3-4B модельдерінің салыстырылуы тек **қазақ** тіліндегі бенчмарктар нәтижелерін көрсетеді. Басқа модельдердің өнімділігі, сондай-ақ орыс және ағылшын тілдеріндегі көрсеткіштер кейінірек ұсынылады.
+| Model | Mode | Avg | MMLU | Winogrande | HellaSwag | ARC | GSM8K | DROP |
+|-------|------|-----|------|------------|-----------|-----|-------|------|
+| Qwen3-4B | Direct | 52.00 | 42.43 | 56.88 | 42.04 | 64.77 | 73.62 | 32.27 |
+| Qwen3-4B | Think | 57.73 | 52.98 | 51.27 | 41.86 | 79.65 | 64.82 | 55.81 |
+| Qolda | Direct | 58.77 | 46.55 | 56.37 | 55.75 | 73.62 | 63.50 | 56.84 |
+| Qolda | Think | **71.64** | **64.56** | **70.54** | **57.70** | **89.99** | **79.47** | **67.59** |
+### Көру бенчмарктары
+![Көру-тілдік бенчмарктарындағы модель өнімділігін салыстыру](assets/eval-results-vision.png)
+*AI2D, MMStar, RealWorldQA және KazakhOCR сияқты көру-тілдік тапсырмаларындағы өнімділікті салыстыру.*
+**Ескерту:** Төмендегі кестедегі Qolda және InternVL3.5-4B модельдерінің салыстырылуы тек **қазақ** тіліндегі көру-тілдік бенчмарктар нәтижелерін көрсетеді. Басқа модельдердің өнімділігі, сондай-ақ орыс және ағылшын тілдеріндегі көрсеткіштер кейінірек ұсынылады.
+| Model | Mode | Avg | AI2D | MMStar | RealWorldQA | KazakhOCR |
+|-------|------|--------|--------|----------|---------------|-------------|
+| InternVL3.5-4B | Direct | 42.23 | 52.33 | 47.47 | 38.32 | 30.81 |
+| InternVL3.5-4B | Think | 42.58 | 51.42 | 49.33 | 38.74 | 30.81 |
+| Qolda | Direct | 59.39 | 66.06 | 55.47 | 54.97 | **61.06** |
+| Qolda | Think | **60.44** | **67.62** | **56.53** | **57.07** | 60.54 |
+## Модельді қолдану
+Transformers арқылы инференсті іске қосу үшін InternVL ұсынған [нұсқаулықтарды](https://huggingface.co/OpenGVLab/InternVL3_5-4B#inference-with-transformers) орындаңыз.
+Немесе, модельді OpenAI-үйлесімді сервер арқылы іске қосу үшін lmdeploy құралын пайдалануға болады:
+```bash
+pip install "lmdeploy>=0.9.1"
+lmdeploy serve api_server issai/Qolda --server-port 23333 --tp 1 --backend pytorch
+```
+**Ескерту:** Qolda-ның түпнұсқалық InternVL3.5-тен айырмашылығы, бұл модель API call жасаған кезде `extra_body` бөлігінде `enable_thinking` параметрінің нақты орнатылуын талап етеді. Тапсырманың күрделілігіне байланысты бос thinking жауабы қайтарылуы мүмкін.
+Содан соң, стандартты API call жасаңыз:
+```python
+import base64
+from openai import OpenAI
+client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+image_path = "./assets/eval-results-text.png"
+response = client.chat.completions.create(
+    model=client.models.list().data[0].id,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'text',
+                'text': 'Берілген диаграмманың сипаттамасын бер.'
+            },
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': f'data:image/png;base64,{encode_image(image_path)}',
+                },
+            }
+        ],
+    }],
+    max_tokens=8192,
+    temperature=0.6,
+    top_p=0.95,
+    extra_body={
+        "top_k": 20,
+        "enable_thinking": True
+    },
+)
+print(response.choices[0].message.content)
+```
+## Лицензия
+Бұл модель Apache License 2.0 бойынша лицензияланған.

BF16/config.json ADDED Viewed

	@@ -0,0 +1,128 @@

+{
+  "architectures": [
+    "InternVLChatModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_internvl_chat.InternVLChatConfig",
+    "AutoModel": "modeling_internvl_chat.InternVLChatModel",
+    "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel"
+  },
+  "downsample_ratio": 0.5,
+  "dynamic_image_size": true,
+  "eos_token_id": 151645,
+  "force_image_size": 448,
+  "hidden_size": 2560,
+  "llm_config": {
+    "_attn_implementation_autoset": true,
+    "architectures": [
+      "Qwen3ForCausalLM"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "eos_token_id": 151645,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 2560,
+    "initializer_range": 0.02,
+    "intermediate_size": 9728,
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 40960,
+    "max_window_layers": 36,
+    "model_type": "qwen3",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 8,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 1000000,
+    "sliding_window": null,
+    "tie_word_embeddings": true,
+    "torch_dtype": "bfloat16",
+    "use_cache": false,
+    "use_sliding_window": false,
+    "vocab_size": 151936
+  },
+  "max_dynamic_patch": 12,
+  "min_dynamic_patch": 1,
+  "model_type": "internvl_chat",
+  "output_attentions": false,
+  "pad2square": false,
+  "pad_token_id": 151643,
+  "ps_version": "v2",
+  "select_layer": -1,
+  "template": "internvl2_5",
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": null,
+  "use_backbone_lora": 0,
+  "use_llm_lora": 0,
+  "use_thumbnail": true,
+  "vision_config": {
+    "_attn_implementation_autoset": true,
+    "architectures": [
+      "InternVisionModel"
+    ],
+    "attention_dropout": 0.0,
+    "auto_map": {
+      "AutoConfig": "configuration_intern_vit.InternVisionConfig",
+      "AutoModel": "modeling_intern_vit.InternVisionModel"
+    },
+    "drop_path_rate": 0.0,
+    "dropout": 0.0,
+    "hidden_act": "gelu",
+    "hidden_size": 1024,
+    "image_size": 448,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "layer_norm_eps": 1e-06,
+    "model_type": "intern_vit_6b",
+    "norm_type": "layer_norm",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 24,
+    "patch_size": 14,
+    "qk_normalization": false,
+    "qkv_bias": true,
+    "torch_dtype": "bfloat16",
+    "use_fa3": false,
+    "use_flash_attn": true
+  }
+}

BF16/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "pad_token_id": 151643,
+  "transformers_version": "4.51.0"
+}

BF16/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

BF16/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "additional_special_tokens": [
+    "<img>",
+    "</img>",
+    "<IMG_CONTEXT>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<box>",
+    "</box>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

BF16/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,308 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151669": {
+      "content": "<img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "</img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<IMG_CONTEXT>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "</quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151675": {
+      "content": "</ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151676": {
+      "content": "<box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151677": {
+      "content": "</box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<img>",
+    "</img>",
+    "<IMG_CONTEXT>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<box>",
+    "</box>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

BF16/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

IQ2_M/Qolda-IQ2_M.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:396fa4ddc45daf38581ece3a046fa3897a286e6f264908ff9ed6ba904fc521f9
+size 273374272

IQ3_XXS/Qolda-IQ3_XXS.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:396fa4ddc45daf38581ece3a046fa3897a286e6f264908ff9ed6ba904fc521f9
+size 273374272

IQ4_NL/Qolda-IQ4_NL.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cd451aa7562045deecb63b09e82a6048ce9b6dc7beb5272a901cb6f9b278da03
+size 2393795136

IQ4_NL/README.md ADDED Viewed

	@@ -0,0 +1,199 @@

+---
+language:
+- kk
+- ru
+- en
+base_model:
+- OpenGVLab/InternVL3_5-4B
+pipeline_tag: image-text-to-text
+---
+[Қазақша](#кіріспе)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[English](#introduction)
+# Qolda
+[![GitHub](https://img.shields.io/badge/GitHub-Qolda--deployment-blue?logo=github)](https://github.com/IS2AI/Qolda-deployment)
+[![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)](https://www.apache.org/licenses/LICENSE-2.0)
+## Introduction
+Built on top of InternVL3.5 and Qwen3, **Qolda** is a small vision-language model designed to operate in Kazakh, Russian, and English. The model has 4.3B parameters and comprises the InternViT-300M vision encoder and MLP Projector components from [InternVL3.5-4B](https://huggingface.co/OpenGVLab/InternVL3_5-4B), along with the [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B) language model. Model training was performed using the [InternVL framework](https://github.com/OpenGVLab/InternVL) 💙
+The name "Qolda" reflects both its design and purpose in Kazakh: "in hand" (қолда) for its compact accessibility, and "to support" (қолдау) for its assistive nature.
+## Evaluation Results
+Evaluation was conducted separately for text-only and vision-language modalities. Qolda demonstrates significant performance improvements for Kazakh while maintaining comparable performance on Russian and English.
+### Text Benchmarks
+![Model performance comparison on language benchmarks](assets/eval-results-text.png)
+*Performance comparison on language tasks including MMLU, Winogrande, HellaSwag, ARC, GSM8K, and DROP.*
+**Note:** The comparison below presents Qolda's performance against Qwen3-4B on **Kazakh** language benchmarks only. Evaluation results for additional models and performance on Russian and English will be added later.
+| Model | Mode | Avg | MMLU | Winogrande | HellaSwag | ARC | GSM8K | DROP |
+|-------|------|-----|------|------------|-----------|-----|-------|------|
+| Qwen3-4B | Direct | 52.00 | 42.43 | 56.88 | 42.04 | 64.77 | 73.62 | 32.27 |
+| Qwen3-4B | Think | 57.73 | 52.98 | 51.27 | 41.86 | 79.65 | 64.82 | 55.81 |
+| Qolda | Direct | 58.77 | 46.55 | 56.37 | 55.75 | 73.62 | 63.50 | 56.84 |
+| Qolda | Think | **71.64** | **64.56** | **70.54** | **57.70** | **89.99** | **79.47** | **67.59** |
+### Vision Benchmarks
+![Model performance comparison on vision-language benchmarks](assets/eval-results-vision.png)
+*Performance comparison on vision-language tasks including AI2D, MMStar, RealWorldQA, and KazakhOCR.*
+**Note:** The comparison below presents Qolda's performance against InternVL3.5-4B on **Kazakh** vision-language benchmarks only. Evaluation results for additional models and performance on Russian and English will be added later.
+| Model | Mode | Avg | AI2D | MMStar | RealWorldQA | KazakhOCR |
+|-------|------|--------|--------|----------|---------------|-------------|
+| InternVL3.5-4B | Direct | 42.23 | 52.33 | 47.47 | 38.32 | 30.81 |
+| InternVL3.5-4B | Think | 42.58 | 51.42 | 49.33 | 38.74 | 30.81 |
+| Qolda | Direct | 59.39 | 66.06 | 55.47 | 54.97 | **61.06** |
+| Qolda | Think | **60.44** | **67.62** | **56.53** | **57.07** | 60.54 |
+## Model Usage
+To run inference with Transformers, please follow the [guidelines](https://huggingface.co/OpenGVLab/InternVL3_5-4B#inference-with-transformers) from InternVL.
+Alternatively, to run the model via an OpenAI-compatible server, you can use lmdeploy:
+```bash
+pip install "lmdeploy>=0.9.1"
+lmdeploy serve api_server issai/Qolda --server-port 23333 --tp 1 --backend pytorch
+```
+**Note:** Unlike the original InternVL3.5, this model requires the `enable_thinking` parameter to be explicitly set in the `extra_body` of your API calls. However, depending on the task complexity, an empty thinking response might be generated.
+Then, make a standard API call:
+```python
+import base64
+from openai import OpenAI
+client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+image_path = "./assets/eval-results-text.png"
+response = client.chat.completions.create(
+    model=client.models.list().data[0].id,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'text',
+                'text': 'Берілген диаграмманың сипаттамасын бер.'
+            },
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': f'data:image/png;base64,{encode_image(image_path)}',
+                },
+            }
+        ],
+    }],
+    max_tokens=8192,
+    temperature=0.6,
+    top_p=0.95,
+    extra_body={
+        "top_k": 20,
+        "enable_thinking": True
+    },
+)
+print(response.choices[0].message.content)
+```
+## License
+This model is licensed under the Apache License 2.0.
+## Кіріспе
+InternVL3.5 және Qwen3 негізінде жасалған **Qolda** — қазақ, орыс және ағылшын тілдерінде жұмыс істеуге арналған шағын көру-тілдік моделі (vision-language model). Модель 4,3 млрд параметрге ие және [InternVL3.5-4B](https://huggingface.co/OpenGVLab/InternVL3_5-4B) моделінің InternViT-300M көру энкодері мен MLP проектор компоненттерін, сондай-ақ [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B) тілдік моделін қамтиды. Модельді оқыту [InternVL фреймворкі](https://github.com/OpenGVLab/InternVL) көмегімен жүзеге асырылды 💙
+"Qolda" атауы модельдің дизайны мен мақсатын қазақ тіліндегі қолда сөзінің қос мағынасы арқылы көрсетеді. Біріншісі, шағын әрі қолжетімді болуы үшін "қолда" сөзі арқылы және екіншісі, көмекші табиғаты үшін, "қолдау" мағынасы арқылы.
+## Бағалау нәтижелері
+Мәтіндік және көру-тілдік модальділіктер үшін бағалау бөлек жүргізілді. Qolda орыс және ағылшын тілдеріндегі өзінің бастапқы деңгейін сақтай отырып, қазақ тіліндегі өнімділігін айтарлықтай жақсартты.
+### Мәтіндік бенчмарктар
+![Тілдік бенчмарктардағы модель өнімділігін салыстыру](assets/eval-results-text.png)
+*MMLU, Winogrande, HellaSwag, ARC, GSM8K және DROP сияқты тілдік тапсырмалардағы өнімділікті салыстыру.*
+**Ескерту:** Төмендегі кестедегі Qolda және Qwen3-4B модельдерінің салыстырылуы тек **қазақ** тіліндегі бенчмарктар нәтижелерін көрсетеді. Басқа модельдердің өнімділігі, сондай-ақ орыс және ағылшын тілдеріндегі көрсеткіштер кейінірек ұсынылады.
+| Model | Mode | Avg | MMLU | Winogrande | HellaSwag | ARC | GSM8K | DROP |
+|-------|------|-----|------|------------|-----------|-----|-------|------|
+| Qwen3-4B | Direct | 52.00 | 42.43 | 56.88 | 42.04 | 64.77 | 73.62 | 32.27 |
+| Qwen3-4B | Think | 57.73 | 52.98 | 51.27 | 41.86 | 79.65 | 64.82 | 55.81 |
+| Qolda | Direct | 58.77 | 46.55 | 56.37 | 55.75 | 73.62 | 63.50 | 56.84 |
+| Qolda | Think | **71.64** | **64.56** | **70.54** | **57.70** | **89.99** | **79.47** | **67.59** |
+### Көру бенчмарктары
+![Көру-тілдік бенчмарктарындағы модель өнімділігін салыстыру](assets/eval-results-vision.png)
+*AI2D, MMStar, RealWorldQA және KazakhOCR сияқты көру-тілдік тапсырмаларындағы өнімділікті салыстыру.*
+**Ескерту:** Төмендегі кестедегі Qolda және InternVL3.5-4B модельдерінің салыстырылуы тек **қазақ** тіліндегі көру-тілдік бенчмарктар нәтижелерін көрсетеді. Басқа модельдердің өнімділігі, сондай-ақ орыс және ағылшын тілдеріндегі көрсеткіштер кейінірек ұсынылады.
+| Model | Mode | Avg | AI2D | MMStar | RealWorldQA | KazakhOCR |
+|-------|------|--------|--------|----------|---------------|-------------|
+| InternVL3.5-4B | Direct | 42.23 | 52.33 | 47.47 | 38.32 | 30.81 |
+| InternVL3.5-4B | Think | 42.58 | 51.42 | 49.33 | 38.74 | 30.81 |
+| Qolda | Direct | 59.39 | 66.06 | 55.47 | 54.97 | **61.06** |
+| Qolda | Think | **60.44** | **67.62** | **56.53** | **57.07** | 60.54 |
+## Модельді қолдану
+Transformers арқылы инференсті іске қосу үшін InternVL ұсынған [нұсқаулықтарды](https://huggingface.co/OpenGVLab/InternVL3_5-4B#inference-with-transformers) орындаңыз.
+Немесе, модельді OpenAI-үйлесімді сервер арқылы іске қосу үшін lmdeploy құралын пайдалануға болады:
+```bash
+pip install "lmdeploy>=0.9.1"
+lmdeploy serve api_server issai/Qolda --server-port 23333 --tp 1 --backend pytorch
+```
+**Ескерту:** Qolda-ның түпнұсқалық InternVL3.5-тен айырмашылығы, бұл модель API call жасаған кезде `extra_body` бөлігінде `enable_thinking` параметрінің нақты орнатылуын талап етеді. Тапсырманың күрделілігіне байланысты бос thinking жауабы қайтарылуы мүмкін.
+Содан соң, стандартты API call жасаңыз:
+```python
+import base64
+from openai import OpenAI
+client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+image_path = "./assets/eval-results-text.png"
+response = client.chat.completions.create(
+    model=client.models.list().data[0].id,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'text',
+                'text': 'Берілген диаграмманың сипаттамасын бер.'
+            },
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': f'data:image/png;base64,{encode_image(image_path)}',
+                },
+            }
+        ],
+    }],
+    max_tokens=8192,
+    temperature=0.6,
+    top_p=0.95,
+    extra_body={
+        "top_k": 20,
+        "enable_thinking": True
+    },
+)
+print(response.choices[0].message.content)
+```
+## Лицензия
+Бұл модель Apache License 2.0 бойынша лицензияланған.

IQ4_NL/config.json ADDED Viewed

	@@ -0,0 +1,128 @@

+{
+  "architectures": [
+    "InternVLChatModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_internvl_chat.InternVLChatConfig",
+    "AutoModel": "modeling_internvl_chat.InternVLChatModel",
+    "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel"
+  },
+  "downsample_ratio": 0.5,
+  "dynamic_image_size": true,
+  "eos_token_id": 151645,
+  "force_image_size": 448,
+  "hidden_size": 2560,
+  "llm_config": {
+    "_attn_implementation_autoset": true,
+    "architectures": [
+      "Qwen3ForCausalLM"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "eos_token_id": 151645,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 2560,
+    "initializer_range": 0.02,
+    "intermediate_size": 9728,
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 40960,
+    "max_window_layers": 36,
+    "model_type": "qwen3",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 8,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 1000000,
+    "sliding_window": null,
+    "tie_word_embeddings": true,
+    "torch_dtype": "bfloat16",
+    "use_cache": false,
+    "use_sliding_window": false,
+    "vocab_size": 151936
+  },
+  "max_dynamic_patch": 12,
+  "min_dynamic_patch": 1,
+  "model_type": "internvl_chat",
+  "output_attentions": false,
+  "pad2square": false,
+  "pad_token_id": 151643,
+  "ps_version": "v2",
+  "select_layer": -1,
+  "template": "internvl2_5",
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": null,
+  "use_backbone_lora": 0,
+  "use_llm_lora": 0,
+  "use_thumbnail": true,
+  "vision_config": {
+    "_attn_implementation_autoset": true,
+    "architectures": [
+      "InternVisionModel"
+    ],
+    "attention_dropout": 0.0,
+    "auto_map": {
+      "AutoConfig": "configuration_intern_vit.InternVisionConfig",
+      "AutoModel": "modeling_intern_vit.InternVisionModel"
+    },
+    "drop_path_rate": 0.0,
+    "dropout": 0.0,
+    "hidden_act": "gelu",
+    "hidden_size": 1024,
+    "image_size": 448,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "layer_norm_eps": 1e-06,
+    "model_type": "intern_vit_6b",
+    "norm_type": "layer_norm",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 24,
+    "patch_size": 14,
+    "qk_normalization": false,
+    "qkv_bias": true,
+    "torch_dtype": "bfloat16",
+    "use_fa3": false,
+    "use_flash_attn": true
+  }
+}

IQ4_NL/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "pad_token_id": 151643,
+  "transformers_version": "4.51.0"
+}

IQ4_NL/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

IQ4_NL/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "additional_special_tokens": [
+    "<img>",
+    "</img>",
+    "<IMG_CONTEXT>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<box>",
+    "</box>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

IQ4_NL/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,308 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151669": {
+      "content": "<img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "</img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<IMG_CONTEXT>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "</quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151675": {
+      "content": "</ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151676": {
+      "content": "<box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151677": {
+      "content": "</box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<img>",
+    "</img>",
+    "<IMG_CONTEXT>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<box>",
+    "</box>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

IQ4_NL/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

IQ4_XS/Qolda-IQ4_XS.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cc4f5a37e8e19b336e55a4ecb67b356d3c905d6f17ac674f115bc3e92e7c395a
+size 2286316096

IQ4_XS/README.md ADDED Viewed

	@@ -0,0 +1,199 @@

+---
+language:
+- kk
+- ru
+- en
+base_model:
+- OpenGVLab/InternVL3_5-4B
+pipeline_tag: image-text-to-text
+---
+[Қазақша](#кіріспе)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[English](#introduction)
+# Qolda
+[![GitHub](https://img.shields.io/badge/GitHub-Qolda--deployment-blue?logo=github)](https://github.com/IS2AI/Qolda-deployment)
+[![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)](https://www.apache.org/licenses/LICENSE-2.0)
+## Introduction
+Built on top of InternVL3.5 and Qwen3, **Qolda** is a small vision-language model designed to operate in Kazakh, Russian, and English. The model has 4.3B parameters and comprises the InternViT-300M vision encoder and MLP Projector components from [InternVL3.5-4B](https://huggingface.co/OpenGVLab/InternVL3_5-4B), along with the [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B) language model. Model training was performed using the [InternVL framework](https://github.com/OpenGVLab/InternVL) 💙
+The name "Qolda" reflects both its design and purpose in Kazakh: "in hand" (қолда) for its compact accessibility, and "to support" (қолдау) for its assistive nature.
+## Evaluation Results
+Evaluation was conducted separately for text-only and vision-language modalities. Qolda demonstrates significant performance improvements for Kazakh while maintaining comparable performance on Russian and English.
+### Text Benchmarks
+![Model performance comparison on language benchmarks](assets/eval-results-text.png)
+*Performance comparison on language tasks including MMLU, Winogrande, HellaSwag, ARC, GSM8K, and DROP.*
+**Note:** The comparison below presents Qolda's performance against Qwen3-4B on **Kazakh** language benchmarks only. Evaluation results for additional models and performance on Russian and English will be added later.
+| Model | Mode | Avg | MMLU | Winogrande | HellaSwag | ARC | GSM8K | DROP |
+|-------|------|-----|------|------------|-----------|-----|-------|------|
+| Qwen3-4B | Direct | 52.00 | 42.43 | 56.88 | 42.04 | 64.77 | 73.62 | 32.27 |
+| Qwen3-4B | Think | 57.73 | 52.98 | 51.27 | 41.86 | 79.65 | 64.82 | 55.81 |
+| Qolda | Direct | 58.77 | 46.55 | 56.37 | 55.75 | 73.62 | 63.50 | 56.84 |
+| Qolda | Think | **71.64** | **64.56** | **70.54** | **57.70** | **89.99** | **79.47** | **67.59** |
+### Vision Benchmarks
+![Model performance comparison on vision-language benchmarks](assets/eval-results-vision.png)
+*Performance comparison on vision-language tasks including AI2D, MMStar, RealWorldQA, and KazakhOCR.*
+**Note:** The comparison below presents Qolda's performance against InternVL3.5-4B on **Kazakh** vision-language benchmarks only. Evaluation results for additional models and performance on Russian and English will be added later.
+| Model | Mode | Avg | AI2D | MMStar | RealWorldQA | KazakhOCR |
+|-------|------|--------|--------|----------|---------------|-------------|
+| InternVL3.5-4B | Direct | 42.23 | 52.33 | 47.47 | 38.32 | 30.81 |
+| InternVL3.5-4B | Think | 42.58 | 51.42 | 49.33 | 38.74 | 30.81 |
+| Qolda | Direct | 59.39 | 66.06 | 55.47 | 54.97 | **61.06** |
+| Qolda | Think | **60.44** | **67.62** | **56.53** | **57.07** | 60.54 |
+## Model Usage
+To run inference with Transformers, please follow the [guidelines](https://huggingface.co/OpenGVLab/InternVL3_5-4B#inference-with-transformers) from InternVL.
+Alternatively, to run the model via an OpenAI-compatible server, you can use lmdeploy:
+```bash
+pip install "lmdeploy>=0.9.1"
+lmdeploy serve api_server issai/Qolda --server-port 23333 --tp 1 --backend pytorch
+```
+**Note:** Unlike the original InternVL3.5, this model requires the `enable_thinking` parameter to be explicitly set in the `extra_body` of your API calls. However, depending on the task complexity, an empty thinking response might be generated.
+Then, make a standard API call:
+```python
+import base64
+from openai import OpenAI
+client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+image_path = "./assets/eval-results-text.png"
+response = client.chat.completions.create(
+    model=client.models.list().data[0].id,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'text',
+                'text': 'Берілген диаграмманың сипаттамасын бер.'
+            },
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': f'data:image/png;base64,{encode_image(image_path)}',
+                },
+            }
+        ],
+    }],
+    max_tokens=8192,
+    temperature=0.6,
+    top_p=0.95,
+    extra_body={
+        "top_k": 20,
+        "enable_thinking": True
+    },
+)
+print(response.choices[0].message.content)
+```
+## License
+This model is licensed under the Apache License 2.0.
+## Кіріспе
+InternVL3.5 және Qwen3 негізінде жасалған **Qolda** — қазақ, орыс және ағылшын тілдерінде жұмыс істеуге арналған шағын көру-тілдік моделі (vision-language model). Модель 4,3 млрд параметрге ие және [InternVL3.5-4B](https://huggingface.co/OpenGVLab/InternVL3_5-4B) моделінің InternViT-300M көру энкодері мен MLP проектор компоненттерін, сондай-ақ [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B) тілдік моделін қамтиды. Модельді оқыту [InternVL фреймворкі](https://github.com/OpenGVLab/InternVL) көмегімен жүзеге асырылды 💙
+"Qolda" атауы модельдің дизайны мен мақсатын қазақ тіліндегі қолда сөзінің қос мағынасы арқылы көрсетеді. Біріншісі, шағын әрі қолжетімді болуы үшін "қолда" сөзі арқылы және екіншісі, көмекші табиғаты үшін, "қолдау" мағынасы арқылы.
+## Бағалау нәтижелері
+Мәтіндік және көру-тілдік модальділіктер үшін бағалау бөлек жүргізілді. Qolda орыс және ағылшын тілдеріндегі өзінің бастапқы деңгейін сақтай отырып, қазақ тіліндегі өнімділігін айтарлықтай жақсартты.
+### Мәтіндік бенчмарктар
+![Тілдік бенчмарктардағы модель өнімділігін салыстыру](assets/eval-results-text.png)
+*MMLU, Winogrande, HellaSwag, ARC, GSM8K және DROP сияқты тілдік тапсырмалардағы өнімділікті салыстыру.*
+**Ескерту:** Төмендегі кестедегі Qolda және Qwen3-4B модельдерінің салыстырылуы тек **қазақ** тіліндегі бенчмарктар нәтижелерін көрсетеді. Басқа модельдердің өнімділігі, сондай-ақ орыс және ағылшын тілдеріндегі көрсеткіштер кейінірек ұсынылады.
+| Model | Mode | Avg | MMLU | Winogrande | HellaSwag | ARC | GSM8K | DROP |
+|-------|------|-----|------|------------|-----------|-----|-------|------|
+| Qwen3-4B | Direct | 52.00 | 42.43 | 56.88 | 42.04 | 64.77 | 73.62 | 32.27 |
+| Qwen3-4B | Think | 57.73 | 52.98 | 51.27 | 41.86 | 79.65 | 64.82 | 55.81 |
+| Qolda | Direct | 58.77 | 46.55 | 56.37 | 55.75 | 73.62 | 63.50 | 56.84 |
+| Qolda | Think | **71.64** | **64.56** | **70.54** | **57.70** | **89.99** | **79.47** | **67.59** |
+### Көру бенчмарктары
+![Көру-тілдік бенчмарктарындағы модель өнімділігін салыстыру](assets/eval-results-vision.png)
+*AI2D, MMStar, RealWorldQA және KazakhOCR сияқты көру-тілдік тапсырмаларындағы өнімділікті салыстыру.*
+**Ескерту:** Төмендегі кестедегі Qolda және InternVL3.5-4B модельдерінің салыстырылуы тек **қазақ** тіліндегі көру-тілдік бенчмарктар нәтижелерін көрсетеді. Басқа модельдердің өнімділігі, сондай-ақ орыс және ағылшын тілдеріндегі көрсеткіштер кейінірек ұсынылады.
+| Model | Mode | Avg | AI2D | MMStar | RealWorldQA | KazakhOCR |
+|-------|------|--------|--------|----------|---------------|-------------|
+| InternVL3.5-4B | Direct | 42.23 | 52.33 | 47.47 | 38.32 | 30.81 |
+| InternVL3.5-4B | Think | 42.58 | 51.42 | 49.33 | 38.74 | 30.81 |
+| Qolda | Direct | 59.39 | 66.06 | 55.47 | 54.97 | **61.06** |
+| Qolda | Think | **60.44** | **67.62** | **56.53** | **57.07** | 60.54 |
+## Модельді қолдану
+Transformers арқылы инференсті іске қосу үшін InternVL ұсынған [нұсқаулықтарды](https://huggingface.co/OpenGVLab/InternVL3_5-4B#inference-with-transformers) орындаңыз.
+Немесе, модельді OpenAI-үйлесімді сервер арқылы іске қосу үшін lmdeploy құралын пайдалануға болады:
+```bash
+pip install "lmdeploy>=0.9.1"
+lmdeploy serve api_server issai/Qolda --server-port 23333 --tp 1 --backend pytorch
+```
+**Ескерту:** Qolda-ның түпнұсқалық InternVL3.5-тен айырмашылығы, бұл модель API call жасаған кезде `extra_body` бөлігінде `enable_thinking` параметрінің нақты орнатылуын талап етеді. Тапсырманың күрделілігіне байланысты бос thinking жауабы қайтарылуы мүмкін.
+Содан соң, стандартты API call жасаңыз:
+```python
+import base64
+from openai import OpenAI
+client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+image_path = "./assets/eval-results-text.png"
+response = client.chat.completions.create(
+    model=client.models.list().data[0].id,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'text',
+                'text': 'Берілген диаграмманың сипаттамасын бер.'
+            },
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': f'data:image/png;base64,{encode_image(image_path)}',
+                },
+            }
+        ],
+    }],
+    max_tokens=8192,
+    temperature=0.6,
+    top_p=0.95,
+    extra_body={
+        "top_k": 20,
+        "enable_thinking": True
+    },
+)
+print(response.choices[0].message.content)
+```
+## Лицензия
+Бұл модель Apache License 2.0 бойынша лицензияланған.

IQ4_XS/config.json ADDED Viewed

	@@ -0,0 +1,128 @@

+{
+  "architectures": [
+    "InternVLChatModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_internvl_chat.InternVLChatConfig",
+    "AutoModel": "modeling_internvl_chat.InternVLChatModel",
+    "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel"
+  },
+  "downsample_ratio": 0.5,
+  "dynamic_image_size": true,
+  "eos_token_id": 151645,
+  "force_image_size": 448,
+  "hidden_size": 2560,
+  "llm_config": {
+    "_attn_implementation_autoset": true,
+    "architectures": [
+      "Qwen3ForCausalLM"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "eos_token_id": 151645,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 2560,
+    "initializer_range": 0.02,
+    "intermediate_size": 9728,
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 40960,
+    "max_window_layers": 36,
+    "model_type": "qwen3",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 8,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 1000000,
+    "sliding_window": null,
+    "tie_word_embeddings": true,
+    "torch_dtype": "bfloat16",
+    "use_cache": false,
+    "use_sliding_window": false,
+    "vocab_size": 151936
+  },
+  "max_dynamic_patch": 12,
+  "min_dynamic_patch": 1,
+  "model_type": "internvl_chat",
+  "output_attentions": false,
+  "pad2square": false,
+  "pad_token_id": 151643,
+  "ps_version": "v2",
+  "select_layer": -1,
+  "template": "internvl2_5",
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": null,
+  "use_backbone_lora": 0,
+  "use_llm_lora": 0,
+  "use_thumbnail": true,
+  "vision_config": {
+    "_attn_implementation_autoset": true,
+    "architectures": [
+      "InternVisionModel"
+    ],
+    "attention_dropout": 0.0,
+    "auto_map": {
+      "AutoConfig": "configuration_intern_vit.InternVisionConfig",
+      "AutoModel": "modeling_intern_vit.InternVisionModel"
+    },
+    "drop_path_rate": 0.0,
+    "dropout": 0.0,
+    "hidden_act": "gelu",
+    "hidden_size": 1024,
+    "image_size": 448,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "layer_norm_eps": 1e-06,
+    "model_type": "intern_vit_6b",
+    "norm_type": "layer_norm",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 24,
+    "patch_size": 14,
+    "qk_normalization": false,
+    "qkv_bias": true,
+    "torch_dtype": "bfloat16",
+    "use_fa3": false,
+    "use_flash_attn": true
+  }
+}

IQ4_XS/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "pad_token_id": 151643,
+  "transformers_version": "4.51.0"
+}

IQ4_XS/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

IQ4_XS/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "additional_special_tokens": [
+    "<img>",
+    "</img>",
+    "<IMG_CONTEXT>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<box>",
+    "</box>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

IQ4_XS/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,308 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151669": {
+      "content": "<img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "</img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<IMG_CONTEXT>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "</quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151675": {
+      "content": "</ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151676": {
+      "content": "<box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151677": {
+      "content": "</box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<img>",
+    "</img>",
+    "<IMG_CONTEXT>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<box>",
+    "</box>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

IQ4_XS/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

Q2_K/Qolda-Q2_K.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:136e01918c5161c389595bb15e664a44ad9dfd41034f8aa8a0d3840e94add17f
+size 1669499456

Q2_K/README.md ADDED Viewed

	@@ -0,0 +1,199 @@

+---
+language:
+- kk
+- ru
+- en
+base_model:
+- OpenGVLab/InternVL3_5-4B
+pipeline_tag: image-text-to-text
+---
+[Қазақша](#кіріспе)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[English](#introduction)
+# Qolda
+[![GitHub](https://img.shields.io/badge/GitHub-Qolda--deployment-blue?logo=github)](https://github.com/IS2AI/Qolda-deployment)
+[![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)](https://www.apache.org/licenses/LICENSE-2.0)
+## Introduction
+Built on top of InternVL3.5 and Qwen3, **Qolda** is a small vision-language model designed to operate in Kazakh, Russian, and English. The model has 4.3B parameters and comprises the InternViT-300M vision encoder and MLP Projector components from [InternVL3.5-4B](https://huggingface.co/OpenGVLab/InternVL3_5-4B), along with the [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B) language model. Model training was performed using the [InternVL framework](https://github.com/OpenGVLab/InternVL) 💙
+The name "Qolda" reflects both its design and purpose in Kazakh: "in hand" (қолда) for its compact accessibility, and "to support" (қолдау) for its assistive nature.
+## Evaluation Results
+Evaluation was conducted separately for text-only and vision-language modalities. Qolda demonstrates significant performance improvements for Kazakh while maintaining comparable performance on Russian and English.
+### Text Benchmarks
+![Model performance comparison on language benchmarks](assets/eval-results-text.png)
+*Performance comparison on language tasks including MMLU, Winogrande, HellaSwag, ARC, GSM8K, and DROP.*
+**Note:** The comparison below presents Qolda's performance against Qwen3-4B on **Kazakh** language benchmarks only. Evaluation results for additional models and performance on Russian and English will be added later.
+| Model | Mode | Avg | MMLU | Winogrande | HellaSwag | ARC | GSM8K | DROP |
+|-------|------|-----|------|------------|-----------|-----|-------|------|
+| Qwen3-4B | Direct | 52.00 | 42.43 | 56.88 | 42.04 | 64.77 | 73.62 | 32.27 |
+| Qwen3-4B | Think | 57.73 | 52.98 | 51.27 | 41.86 | 79.65 | 64.82 | 55.81 |
+| Qolda | Direct | 58.77 | 46.55 | 56.37 | 55.75 | 73.62 | 63.50 | 56.84 |
+| Qolda | Think | **71.64** | **64.56** | **70.54** | **57.70** | **89.99** | **79.47** | **67.59** |
+### Vision Benchmarks
+![Model performance comparison on vision-language benchmarks](assets/eval-results-vision.png)
+*Performance comparison on vision-language tasks including AI2D, MMStar, RealWorldQA, and KazakhOCR.*
+**Note:** The comparison below presents Qolda's performance against InternVL3.5-4B on **Kazakh** vision-language benchmarks only. Evaluation results for additional models and performance on Russian and English will be added later.
+| Model | Mode | Avg | AI2D | MMStar | RealWorldQA | KazakhOCR |
+|-------|------|--------|--------|----------|---------------|-------------|
+| InternVL3.5-4B | Direct | 42.23 | 52.33 | 47.47 | 38.32 | 30.81 |
+| InternVL3.5-4B | Think | 42.58 | 51.42 | 49.33 | 38.74 | 30.81 |
+| Qolda | Direct | 59.39 | 66.06 | 55.47 | 54.97 | **61.06** |
+| Qolda | Think | **60.44** | **67.62** | **56.53** | **57.07** | 60.54 |
+## Model Usage
+To run inference with Transformers, please follow the [guidelines](https://huggingface.co/OpenGVLab/InternVL3_5-4B#inference-with-transformers) from InternVL.
+Alternatively, to run the model via an OpenAI-compatible server, you can use lmdeploy:
+```bash
+pip install "lmdeploy>=0.9.1"
+lmdeploy serve api_server issai/Qolda --server-port 23333 --tp 1 --backend pytorch
+```
+**Note:** Unlike the original InternVL3.5, this model requires the `enable_thinking` parameter to be explicitly set in the `extra_body` of your API calls. However, depending on the task complexity, an empty thinking response might be generated.
+Then, make a standard API call:
+```python
+import base64
+from openai import OpenAI
+client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+image_path = "./assets/eval-results-text.png"
+response = client.chat.completions.create(
+    model=client.models.list().data[0].id,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'text',
+                'text': 'Берілген диаграмманың сипаттамасын бер.'
+            },
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': f'data:image/png;base64,{encode_image(image_path)}',
+                },
+            }
+        ],
+    }],
+    max_tokens=8192,
+    temperature=0.6,
+    top_p=0.95,
+    extra_body={
+        "top_k": 20,
+        "enable_thinking": True
+    },
+)
+print(response.choices[0].message.content)
+```
+## License
+This model is licensed under the Apache License 2.0.
+## Кіріспе
+InternVL3.5 және Qwen3 негізінде жасалған **Qolda** — қазақ, орыс және ағылшын тілдерінде жұмыс істеуге арналған шағын көру-тілдік моделі (vision-language model). Модель 4,3 млрд параметрге ие және [InternVL3.5-4B](https://huggingface.co/OpenGVLab/InternVL3_5-4B) моделінің InternViT-300M көру энкодері мен MLP проектор компоненттерін, сондай-ақ [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B) тілдік моделін қамтиды. Модельді оқыту [InternVL фреймворкі](https://github.com/OpenGVLab/InternVL) көмегімен жүзеге асырылды 💙
+"Qolda" атауы модельдің дизайны мен мақсатын қазақ тіліндегі қолда сөзінің қос мағынасы арқылы көрсетеді. Біріншісі, шағын әрі қолжетімді болуы үшін "қолда" сөзі арқылы және екіншісі, көмекші табиғаты үшін, "қолдау" мағынасы арқылы.
+## Бағалау нәтижелері
+Мәтіндік және көру-тілдік модальділіктер үшін бағалау бөлек жүргізілді. Qolda орыс және ағылшын тілдеріндегі өзінің бастапқы деңгейін сақтай отырып, қазақ тіліндегі өнімділігін айтарлықтай жақсартты.
+### Мәтіндік бенчмарктар
+![Тілдік бенчмарктардағы модель өнімділігін салыстыру](assets/eval-results-text.png)
+*MMLU, Winogrande, HellaSwag, ARC, GSM8K және DROP сияқты тілдік тапсырмалардағы өнімділікті салыстыру.*
+**Ескерту:** Төмендегі кестедегі Qolda және Qwen3-4B модельдерінің салыстырылуы тек **қазақ** тіліндегі бенчмарктар нәтижелерін көрсетеді. Басқа модельдердің өнімділігі, сондай-ақ орыс және ағылшын тілдеріндегі көрсеткіштер кейінірек ұсынылады.
+| Model | Mode | Avg | MMLU | Winogrande | HellaSwag | ARC | GSM8K | DROP |
+|-------|------|-----|------|------------|-----------|-----|-------|------|
+| Qwen3-4B | Direct | 52.00 | 42.43 | 56.88 | 42.04 | 64.77 | 73.62 | 32.27 |
+| Qwen3-4B | Think | 57.73 | 52.98 | 51.27 | 41.86 | 79.65 | 64.82 | 55.81 |
+| Qolda | Direct | 58.77 | 46.55 | 56.37 | 55.75 | 73.62 | 63.50 | 56.84 |
+| Qolda | Think | **71.64** | **64.56** | **70.54** | **57.70** | **89.99** | **79.47** | **67.59** |
+### Көру бенчмарктары
+![Көру-тілдік бенчмарктарындағы модель өнімділігін салыстыру](assets/eval-results-vision.png)
+*AI2D, MMStar, RealWorldQA және KazakhOCR сияқты көру-тілдік тапсырмаларындағы өнімділікті салыстыру.*
+**Ескерту:** Төмендегі кестедегі Qolda және InternVL3.5-4B модельдерінің салыстырылуы тек **қазақ** тіліндегі көру-тілдік бенчмарктар нәтижелерін көрсетеді. Басқа модельдердің өнімділігі, сондай-ақ орыс және ағылшын тілдеріндегі көрсеткіштер кейінірек ұсынылады.
+| Model | Mode | Avg | AI2D | MMStar | RealWorldQA | KazakhOCR |
+|-------|------|--------|--------|----------|---------------|-------------|
+| InternVL3.5-4B | Direct | 42.23 | 52.33 | 47.47 | 38.32 | 30.81 |
+| InternVL3.5-4B | Think | 42.58 | 51.42 | 49.33 | 38.74 | 30.81 |
+| Qolda | Direct | 59.39 | 66.06 | 55.47 | 54.97 | **61.06** |
+| Qolda | Think | **60.44** | **67.62** | **56.53** | **57.07** | 60.54 |
+## Модельді қолдану
+Transformers арқылы инференсті іске қосу үшін InternVL ұсынған [нұсқаулықтарды](https://huggingface.co/OpenGVLab/InternVL3_5-4B#inference-with-transformers) орындаңыз.
+Немесе, модельді OpenAI-үйлесімді сервер арқылы іске қосу үшін lmdeploy құралын пайдалануға болады:
+```bash
+pip install "lmdeploy>=0.9.1"
+lmdeploy serve api_server issai/Qolda --server-port 23333 --tp 1 --backend pytorch
+```
+**Ескерту:** Qolda-ның түпнұсқалық InternVL3.5-тен айырмашылығы, бұл модель API call жасаған кезде `extra_body` бөлігінде `enable_thinking` параметрінің нақты орнатылуын талап етеді. Тапсырманың күрделілігіне байланысты бос thinking жауабы қайтарылуы мүмкін.
+Содан соң, стандартты API call жасаңыз:
+```python
+import base64
+from openai import OpenAI
+client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+image_path = "./assets/eval-results-text.png"
+response = client.chat.completions.create(
+    model=client.models.list().data[0].id,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'text',
+                'text': 'Берілген диаграмманың сипаттамасын бер.'
+            },
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': f'data:image/png;base64,{encode_image(image_path)}',
+                },
+            }
+        ],
+    }],
+    max_tokens=8192,
+    temperature=0.6,
+    top_p=0.95,
+    extra_body={
+        "top_k": 20,
+        "enable_thinking": True
+    },
+)
+print(response.choices[0].message.content)
+```
+## Лицензия
+Бұл модель Apache License 2.0 бойынша лицензияланған.

Q2_K/config.json ADDED Viewed

	@@ -0,0 +1,128 @@

+{
+  "architectures": [
+    "InternVLChatModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_internvl_chat.InternVLChatConfig",
+    "AutoModel": "modeling_internvl_chat.InternVLChatModel",
+    "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel"
+  },
+  "downsample_ratio": 0.5,
+  "dynamic_image_size": true,
+  "eos_token_id": 151645,
+  "force_image_size": 448,
+  "hidden_size": 2560,
+  "llm_config": {
+    "_attn_implementation_autoset": true,
+    "architectures": [
+      "Qwen3ForCausalLM"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "eos_token_id": 151645,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 2560,
+    "initializer_range": 0.02,
+    "intermediate_size": 9728,
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 40960,
+    "max_window_layers": 36,
+    "model_type": "qwen3",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 8,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 1000000,
+    "sliding_window": null,
+    "tie_word_embeddings": true,
+    "torch_dtype": "bfloat16",
+    "use_cache": false,
+    "use_sliding_window": false,
+    "vocab_size": 151936
+  },
+  "max_dynamic_patch": 12,
+  "min_dynamic_patch": 1,
+  "model_type": "internvl_chat",
+  "output_attentions": false,
+  "pad2square": false,
+  "pad_token_id": 151643,
+  "ps_version": "v2",
+  "select_layer": -1,
+  "template": "internvl2_5",
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": null,
+  "use_backbone_lora": 0,
+  "use_llm_lora": 0,
+  "use_thumbnail": true,
+  "vision_config": {
+    "_attn_implementation_autoset": true,
+    "architectures": [
+      "InternVisionModel"
+    ],
+    "attention_dropout": 0.0,
+    "auto_map": {
+      "AutoConfig": "configuration_intern_vit.InternVisionConfig",
+      "AutoModel": "modeling_intern_vit.InternVisionModel"
+    },
+    "drop_path_rate": 0.0,
+    "dropout": 0.0,
+    "hidden_act": "gelu",
+    "hidden_size": 1024,
+    "image_size": 448,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "layer_norm_eps": 1e-06,
+    "model_type": "intern_vit_6b",
+    "norm_type": "layer_norm",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 24,
+    "patch_size": 14,
+    "qk_normalization": false,
+    "qkv_bias": true,
+    "torch_dtype": "bfloat16",
+    "use_fa3": false,
+    "use_flash_attn": true
+  }
+}

Q2_K/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "pad_token_id": 151643,
+  "transformers_version": "4.51.0"
+}

Q2_K/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

Q2_K/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "additional_special_tokens": [
+    "<img>",
+    "</img>",
+    "<IMG_CONTEXT>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<box>",
+    "</box>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

Q2_K/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,308 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151669": {
+      "content": "<img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "</img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<IMG_CONTEXT>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "</quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151675": {
+      "content": "</ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151676": {
+      "content": "<box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151677": {
+      "content": "</box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<img>",
+    "</img>",
+    "<IMG_CONTEXT>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<box>",
+    "</box>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

Q2_K/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

Q3_K_M/Qolda-Q3_K_M.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a1e32493e1e7ac979387341ab657f411628a97002ffeb56b022f7abe6f01da7f
+size 2075617856

Q3_K_M/README.md ADDED Viewed

	@@ -0,0 +1,199 @@

+---
+language:
+- kk
+- ru
+- en
+base_model:
+- OpenGVLab/InternVL3_5-4B
+pipeline_tag: image-text-to-text
+---
+[Қазақша](#кіріспе)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[English](#introduction)
+# Qolda
+[![GitHub](https://img.shields.io/badge/GitHub-Qolda--deployment-blue?logo=github)](https://github.com/IS2AI/Qolda-deployment)
+[![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)](https://www.apache.org/licenses/LICENSE-2.0)
+## Introduction
+Built on top of InternVL3.5 and Qwen3, **Qolda** is a small vision-language model designed to operate in Kazakh, Russian, and English. The model has 4.3B parameters and comprises the InternViT-300M vision encoder and MLP Projector components from [InternVL3.5-4B](https://huggingface.co/OpenGVLab/InternVL3_5-4B), along with the [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B) language model. Model training was performed using the [InternVL framework](https://github.com/OpenGVLab/InternVL) 💙
+The name "Qolda" reflects both its design and purpose in Kazakh: "in hand" (қолда) for its compact accessibility, and "to support" (қолдау) for its assistive nature.
+## Evaluation Results
+Evaluation was conducted separately for text-only and vision-language modalities. Qolda demonstrates significant performance improvements for Kazakh while maintaining comparable performance on Russian and English.
+### Text Benchmarks
+![Model performance comparison on language benchmarks](assets/eval-results-text.png)
+*Performance comparison on language tasks including MMLU, Winogrande, HellaSwag, ARC, GSM8K, and DROP.*
+**Note:** The comparison below presents Qolda's performance against Qwen3-4B on **Kazakh** language benchmarks only. Evaluation results for additional models and performance on Russian and English will be added later.
+| Model | Mode | Avg | MMLU | Winogrande | HellaSwag | ARC | GSM8K | DROP |
+|-------|------|-----|------|------------|-----------|-----|-------|------|
+| Qwen3-4B | Direct | 52.00 | 42.43 | 56.88 | 42.04 | 64.77 | 73.62 | 32.27 |
+| Qwen3-4B | Think | 57.73 | 52.98 | 51.27 | 41.86 | 79.65 | 64.82 | 55.81 |
+| Qolda | Direct | 58.77 | 46.55 | 56.37 | 55.75 | 73.62 | 63.50 | 56.84 |
+| Qolda | Think | **71.64** | **64.56** | **70.54** | **57.70** | **89.99** | **79.47** | **67.59** |
+### Vision Benchmarks
+![Model performance comparison on vision-language benchmarks](assets/eval-results-vision.png)
+*Performance comparison on vision-language tasks including AI2D, MMStar, RealWorldQA, and KazakhOCR.*
+**Note:** The comparison below presents Qolda's performance against InternVL3.5-4B on **Kazakh** vision-language benchmarks only. Evaluation results for additional models and performance on Russian and English will be added later.
+| Model | Mode | Avg | AI2D | MMStar | RealWorldQA | KazakhOCR |
+|-------|------|--------|--------|----------|---------------|-------------|
+| InternVL3.5-4B | Direct | 42.23 | 52.33 | 47.47 | 38.32 | 30.81 |
+| InternVL3.5-4B | Think | 42.58 | 51.42 | 49.33 | 38.74 | 30.81 |
+| Qolda | Direct | 59.39 | 66.06 | 55.47 | 54.97 | **61.06** |
+| Qolda | Think | **60.44** | **67.62** | **56.53** | **57.07** | 60.54 |
+## Model Usage
+To run inference with Transformers, please follow the [guidelines](https://huggingface.co/OpenGVLab/InternVL3_5-4B#inference-with-transformers) from InternVL.
+Alternatively, to run the model via an OpenAI-compatible server, you can use lmdeploy:
+```bash
+pip install "lmdeploy>=0.9.1"
+lmdeploy serve api_server issai/Qolda --server-port 23333 --tp 1 --backend pytorch
+```
+**Note:** Unlike the original InternVL3.5, this model requires the `enable_thinking` parameter to be explicitly set in the `extra_body` of your API calls. However, depending on the task complexity, an empty thinking response might be generated.
+Then, make a standard API call:
+```python
+import base64
+from openai import OpenAI
+client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+image_path = "./assets/eval-results-text.png"
+response = client.chat.completions.create(
+    model=client.models.list().data[0].id,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'text',
+                'text': 'Берілген диаграмманың сипаттамасын бер.'
+            },
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': f'data:image/png;base64,{encode_image(image_path)}',
+                },
+            }
+        ],
+    }],
+    max_tokens=8192,
+    temperature=0.6,
+    top_p=0.95,
+    extra_body={
+        "top_k": 20,
+        "enable_thinking": True
+    },
+)
+print(response.choices[0].message.content)
+```
+## License
+This model is licensed under the Apache License 2.0.
+## Кіріспе
+InternVL3.5 және Qwen3 негізінде жасалған **Qolda** — қазақ, орыс және ағылшын тілдерінде жұмыс істеуге арналған шағын көру-тілдік моделі (vision-language model). Модель 4,3 млрд параметрге ие және [InternVL3.5-4B](https://huggingface.co/OpenGVLab/InternVL3_5-4B) моделінің InternViT-300M көру энкодері мен MLP проектор компоненттерін, сондай-ақ [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B) тілдік моделін қамтиды. Модельді оқыту [InternVL фреймворкі](https://github.com/OpenGVLab/InternVL) көмегімен жүзеге асырылды 💙
+"Qolda" атауы модельдің дизайны мен мақсатын қазақ тіліндегі қолда сөзінің қос мағынасы арқылы көрсетеді. Біріншісі, шағын әрі қолжетімді болуы үшін "қолда" сөзі арқылы және екіншісі, көмекші табиғаты үшін, "қолдау" мағынасы арқылы.
+## Бағалау нәтижелері
+Мәтіндік және көру-тілдік модальділіктер үшін бағалау бөлек жүргізілді. Qolda орыс және ағылшын тілдеріндегі өзінің бастапқы деңгейін сақтай отырып, қазақ тіліндегі өнімділігін айтарлықтай жақсартты.
+### Мәтіндік бенчмарктар
+![Тілдік бенчмарктардағы модель өнімділігін салыстыру](assets/eval-results-text.png)
+*MMLU, Winogrande, HellaSwag, ARC, GSM8K және DROP сияқты тілдік тапсырмалардағы өнімділікті салыстыру.*
+**Ескерту:** Төмендегі кестедегі Qolda және Qwen3-4B модельдерінің салыстырылуы тек **қазақ** тіліндегі бенчмарктар нәтижелерін көрсетеді. Басқа модельдердің өнімділігі, сондай-ақ орыс және ағылшын тілдеріндегі көрсеткіштер кейінірек ұсынылады.
+| Model | Mode | Avg | MMLU | Winogrande | HellaSwag | ARC | GSM8K | DROP |
+|-------|------|-----|------|------------|-----------|-----|-------|------|
+| Qwen3-4B | Direct | 52.00 | 42.43 | 56.88 | 42.04 | 64.77 | 73.62 | 32.27 |
+| Qwen3-4B | Think | 57.73 | 52.98 | 51.27 | 41.86 | 79.65 | 64.82 | 55.81 |
+| Qolda | Direct | 58.77 | 46.55 | 56.37 | 55.75 | 73.62 | 63.50 | 56.84 |
+| Qolda | Think | **71.64** | **64.56** | **70.54** | **57.70** | **89.99** | **79.47** | **67.59** |
+### Көру бенчмарктары
+![Көру-тілдік бенчмарктарындағы модель өнімділігін салыстыру](assets/eval-results-vision.png)
+*AI2D, MMStar, RealWorldQA және KazakhOCR сияқты көру-тілдік тапсырмаларындағы өнімділікті салыстыру.*
+**Ескерту:** Төмендегі кестедегі Qolda және InternVL3.5-4B модельдерінің салыстырылуы тек **қазақ** тіліндегі көру-тілдік бенчмарктар нәтижелерін көрсетеді. Басқа модельдердің өнімділігі, сондай-ақ орыс және ағылшын тілдеріндегі көрсеткіштер кейінірек ұсынылады.
+| Model | Mode | Avg | AI2D | MMStar | RealWorldQA | KazakhOCR |
+|-------|------|--------|--------|----------|---------------|-------------|
+| InternVL3.5-4B | Direct | 42.23 | 52.33 | 47.47 | 38.32 | 30.81 |
+| InternVL3.5-4B | Think | 42.58 | 51.42 | 49.33 | 38.74 | 30.81 |
+| Qolda | Direct | 59.39 | 66.06 | 55.47 | 54.97 | **61.06** |
+| Qolda | Think | **60.44** | **67.62** | **56.53** | **57.07** | 60.54 |
+## Модельді қолдану
+Transformers арқылы инференсті іске қосу үшін InternVL ұсынған [нұсқаулықтарды](https://huggingface.co/OpenGVLab/InternVL3_5-4B#inference-with-transformers) орындаңыз.
+Немесе, модельді OpenAI-үйлесімді сервер арқылы іске қосу үшін lmdeploy құралын пайдалануға болады:
+```bash
+pip install "lmdeploy>=0.9.1"
+lmdeploy serve api_server issai/Qolda --server-port 23333 --tp 1 --backend pytorch
+```
+**Ескерту:** Qolda-ның түпнұсқалық InternVL3.5-тен айырмашылығы, бұл модель API call жасаған кезде `extra_body` бөлігінде `enable_thinking` параметрінің нақты орнатылуын талап етеді. Тапсырманың күрделілігіне байланысты бос thinking жауабы қайтарылуы мүмкін.
+Содан соң, стандартты API call жасаңыз:
+```python
+import base64
+from openai import OpenAI
+client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+image_path = "./assets/eval-results-text.png"
+response = client.chat.completions.create(
+    model=client.models.list().data[0].id,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'text',
+                'text': 'Берілген диаграмманың сипаттамасын бер.'
+            },
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': f'data:image/png;base64,{encode_image(image_path)}',
+                },
+            }
+        ],
+    }],
+    max_tokens=8192,
+    temperature=0.6,
+    top_p=0.95,
+    extra_body={
+        "top_k": 20,
+        "enable_thinking": True
+    },
+)
+print(response.choices[0].message.content)
+```
+## Лицензия
+Бұл модель Apache License 2.0 бойынша лицензияланған.

Q3_K_M/config.json ADDED Viewed

	@@ -0,0 +1,128 @@

+{
+  "architectures": [
+    "InternVLChatModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_internvl_chat.InternVLChatConfig",
+    "AutoModel": "modeling_internvl_chat.InternVLChatModel",
+    "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel"
+  },
+  "downsample_ratio": 0.5,
+  "dynamic_image_size": true,
+  "eos_token_id": 151645,
+  "force_image_size": 448,
+  "hidden_size": 2560,
+  "llm_config": {
+    "_attn_implementation_autoset": true,
+    "architectures": [
+      "Qwen3ForCausalLM"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "eos_token_id": 151645,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 2560,
+    "initializer_range": 0.02,
+    "intermediate_size": 9728,
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 40960,
+    "max_window_layers": 36,
+    "model_type": "qwen3",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 8,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 1000000,
+    "sliding_window": null,
+    "tie_word_embeddings": true,
+    "torch_dtype": "bfloat16",
+    "use_cache": false,
+    "use_sliding_window": false,
+    "vocab_size": 151936
+  },
+  "max_dynamic_patch": 12,
+  "min_dynamic_patch": 1,
+  "model_type": "internvl_chat",
+  "output_attentions": false,
+  "pad2square": false,
+  "pad_token_id": 151643,
+  "ps_version": "v2",
+  "select_layer": -1,
+  "template": "internvl2_5",
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": null,
+  "use_backbone_lora": 0,
+  "use_llm_lora": 0,
+  "use_thumbnail": true,
+  "vision_config": {
+    "_attn_implementation_autoset": true,
+    "architectures": [
+      "InternVisionModel"
+    ],
+    "attention_dropout": 0.0,
+    "auto_map": {
+      "AutoConfig": "configuration_intern_vit.InternVisionConfig",
+      "AutoModel": "modeling_intern_vit.InternVisionModel"
+    },
+    "drop_path_rate": 0.0,
+    "dropout": 0.0,
+    "hidden_act": "gelu",
+    "hidden_size": 1024,
+    "image_size": 448,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "layer_norm_eps": 1e-06,
+    "model_type": "intern_vit_6b",
+    "norm_type": "layer_norm",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 24,
+    "patch_size": 14,
+    "qk_normalization": false,
+    "qkv_bias": true,
+    "torch_dtype": "bfloat16",
+    "use_fa3": false,
+    "use_flash_attn": true
+  }
+}

Q3_K_M/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "pad_token_id": 151643,
+  "transformers_version": "4.51.0"
+}

Q3_K_M/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

Q3_K_M/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "additional_special_tokens": [
+    "<img>",
+    "</img>",
+    "<IMG_CONTEXT>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<box>",
+    "</box>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

Q3_K_M/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,308 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151669": {
+      "content": "<img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "</img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<IMG_CONTEXT>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "</quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151675": {
+      "content": "</ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151676": {
+      "content": "<box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151677": {
+      "content": "</box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<img>",
+    "</img>",
+    "<IMG_CONTEXT>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<box>",
+    "</box>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

Q3_K_M/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

Q3_K_S/Qolda-Q3_K_S.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dbeab7f510099010636ba780e50949d559fd0aa254926820db247a33706e1fec
+size 1886997056

Q3_K_S/README.md ADDED Viewed

	@@ -0,0 +1,199 @@

+---
+language:
+- kk
+- ru
+- en
+base_model:
+- OpenGVLab/InternVL3_5-4B
+pipeline_tag: image-text-to-text
+---
+[Қазақша](#кіріспе)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[English](#introduction)
+# Qolda
+[![GitHub](https://img.shields.io/badge/GitHub-Qolda--deployment-blue?logo=github)](https://github.com/IS2AI/Qolda-deployment)
+[![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)](https://www.apache.org/licenses/LICENSE-2.0)
+## Introduction
+Built on top of InternVL3.5 and Qwen3, **Qolda** is a small vision-language model designed to operate in Kazakh, Russian, and English. The model has 4.3B parameters and comprises the InternViT-300M vision encoder and MLP Projector components from [InternVL3.5-4B](https://huggingface.co/OpenGVLab/InternVL3_5-4B), along with the [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B) language model. Model training was performed using the [InternVL framework](https://github.com/OpenGVLab/InternVL) 💙
+The name "Qolda" reflects both its design and purpose in Kazakh: "in hand" (қолда) for its compact accessibility, and "to support" (қолдау) for its assistive nature.
+## Evaluation Results
+Evaluation was conducted separately for text-only and vision-language modalities. Qolda demonstrates significant performance improvements for Kazakh while maintaining comparable performance on Russian and English.
+### Text Benchmarks
+![Model performance comparison on language benchmarks](assets/eval-results-text.png)
+*Performance comparison on language tasks including MMLU, Winogrande, HellaSwag, ARC, GSM8K, and DROP.*
+**Note:** The comparison below presents Qolda's performance against Qwen3-4B on **Kazakh** language benchmarks only. Evaluation results for additional models and performance on Russian and English will be added later.
+| Model | Mode | Avg | MMLU | Winogrande | HellaSwag | ARC | GSM8K | DROP |
+|-------|------|-----|------|------------|-----------|-----|-------|------|
+| Qwen3-4B | Direct | 52.00 | 42.43 | 56.88 | 42.04 | 64.77 | 73.62 | 32.27 |
+| Qwen3-4B | Think | 57.73 | 52.98 | 51.27 | 41.86 | 79.65 | 64.82 | 55.81 |
+| Qolda | Direct | 58.77 | 46.55 | 56.37 | 55.75 | 73.62 | 63.50 | 56.84 |
+| Qolda | Think | **71.64** | **64.56** | **70.54** | **57.70** | **89.99** | **79.47** | **67.59** |
+### Vision Benchmarks
+![Model performance comparison on vision-language benchmarks](assets/eval-results-vision.png)
+*Performance comparison on vision-language tasks including AI2D, MMStar, RealWorldQA, and KazakhOCR.*
+**Note:** The comparison below presents Qolda's performance against InternVL3.5-4B on **Kazakh** vision-language benchmarks only. Evaluation results for additional models and performance on Russian and English will be added later.
+| Model | Mode | Avg | AI2D | MMStar | RealWorldQA | KazakhOCR |
+|-------|------|--------|--------|----------|---------------|-------------|
+| InternVL3.5-4B | Direct | 42.23 | 52.33 | 47.47 | 38.32 | 30.81 |
+| InternVL3.5-4B | Think | 42.58 | 51.42 | 49.33 | 38.74 | 30.81 |
+| Qolda | Direct | 59.39 | 66.06 | 55.47 | 54.97 | **61.06** |
+| Qolda | Think | **60.44** | **67.62** | **56.53** | **57.07** | 60.54 |
+## Model Usage
+To run inference with Transformers, please follow the [guidelines](https://huggingface.co/OpenGVLab/InternVL3_5-4B#inference-with-transformers) from InternVL.
+Alternatively, to run the model via an OpenAI-compatible server, you can use lmdeploy:
+```bash
+pip install "lmdeploy>=0.9.1"
+lmdeploy serve api_server issai/Qolda --server-port 23333 --tp 1 --backend pytorch
+```
+**Note:** Unlike the original InternVL3.5, this model requires the `enable_thinking` parameter to be explicitly set in the `extra_body` of your API calls. However, depending on the task complexity, an empty thinking response might be generated.
+Then, make a standard API call:
+```python
+import base64
+from openai import OpenAI
+client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+image_path = "./assets/eval-results-text.png"
+response = client.chat.completions.create(
+    model=client.models.list().data[0].id,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'text',
+                'text': 'Берілген диаграмманың сипаттамасын бер.'
+            },
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': f'data:image/png;base64,{encode_image(image_path)}',
+                },
+            }
+        ],
+    }],
+    max_tokens=8192,
+    temperature=0.6,
+    top_p=0.95,
+    extra_body={
+        "top_k": 20,
+        "enable_thinking": True
+    },
+)
+print(response.choices[0].message.content)
+```
+## License
+This model is licensed under the Apache License 2.0.
+## Кіріспе
+InternVL3.5 және Qwen3 негізінде жасалған **Qolda** — қазақ, орыс және ағылшын тілдерінде жұмыс істеуге арналған шағын көру-тілдік моделі (vision-language model). Модель 4,3 млрд параметрге ие және [InternVL3.5-4B](https://huggingface.co/OpenGVLab/InternVL3_5-4B) моделінің InternViT-300M көру энкодері мен MLP проектор компоненттерін, сондай-ақ [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B) тілдік моделін қамтиды. Модельді оқыту [InternVL фреймворкі](https://github.com/OpenGVLab/InternVL) көмегімен жүзеге асырылды 💙
+"Qolda" атауы модельдің дизайны мен мақсатын қазақ тіліндегі қолда сөзінің қос мағынасы арқылы көрсетеді. Біріншісі, шағын әрі қолжетімді болуы үшін "қолда" сөзі арқылы және екіншісі, көмекші табиғаты үшін, "қолдау" мағынасы арқылы.
+## Бағалау нәтижелері
+Мәтіндік және көру-тілдік модальділіктер үшін бағалау бөлек жүргізілді. Qolda орыс және ағылшын тілдеріндегі өзінің бастапқы деңгейін сақтай отырып, қазақ тіліндегі өнімділігін айтарлықтай жақсартты.
+### Мәтіндік бенчмарктар
+![Тілдік бенчмарктардағы модель өнімділігін салыстыру](assets/eval-results-text.png)
+*MMLU, Winogrande, HellaSwag, ARC, GSM8K және DROP сияқты тілдік тапсырмалардағы өнімділікті салыстыру.*
+**Ескерту:** Төмендегі кестедегі Qolda және Qwen3-4B модельдерінің салыстырылуы тек **қазақ** тіліндегі бенчмарктар нәтижелерін көрсетеді. Басқа модельдердің өнімділігі, сондай-ақ орыс және ағылшын тілдеріндегі көрсеткіштер кейінірек ұсынылады.
+| Model | Mode | Avg | MMLU | Winogrande | HellaSwag | ARC | GSM8K | DROP |
+|-------|------|-----|------|------------|-----------|-----|-------|------|
+| Qwen3-4B | Direct | 52.00 | 42.43 | 56.88 | 42.04 | 64.77 | 73.62 | 32.27 |
+| Qwen3-4B | Think | 57.73 | 52.98 | 51.27 | 41.86 | 79.65 | 64.82 | 55.81 |
+| Qolda | Direct | 58.77 | 46.55 | 56.37 | 55.75 | 73.62 | 63.50 | 56.84 |
+| Qolda | Think | **71.64** | **64.56** | **70.54** | **57.70** | **89.99** | **79.47** | **67.59** |
+### Көру бенчмарктары
+![Көру-тілдік бенчмарктарындағы модель өнімділігін салыстыру](assets/eval-results-vision.png)
+*AI2D, MMStar, RealWorldQA және KazakhOCR сияқты көру-тілдік тапсырмаларындағы өнімділікті салыстыру.*
+**Ескерту:** Төмендегі кестедегі Qolda және InternVL3.5-4B модельдерінің салыстырылуы тек **қазақ** тіліндегі көру-тілдік бенчмарктар нәтижелерін көрсетеді. Басқа модельдердің өнімділігі, сондай-ақ орыс және ағылшын тілдеріндегі көрсеткіштер кейінірек ұсынылады.
+| Model | Mode | Avg | AI2D | MMStar | RealWorldQA | KazakhOCR |
+|-------|------|--------|--------|----------|---------------|-------------|
+| InternVL3.5-4B | Direct | 42.23 | 52.33 | 47.47 | 38.32 | 30.81 |
+| InternVL3.5-4B | Think | 42.58 | 51.42 | 49.33 | 38.74 | 30.81 |
+| Qolda | Direct | 59.39 | 66.06 | 55.47 | 54.97 | **61.06** |
+| Qolda | Think | **60.44** | **67.62** | **56.53** | **57.07** | 60.54 |
+## Модельді қолдану
+Transformers арқылы инференсті іске қосу үшін InternVL ұсынған [нұсқаулықтарды](https://huggingface.co/OpenGVLab/InternVL3_5-4B#inference-with-transformers) орындаңыз.
+Немесе, модельді OpenAI-үйлесімді сервер арқылы іске қосу үшін lmdeploy құралын пайдалануға болады:
+```bash
+pip install "lmdeploy>=0.9.1"
+lmdeploy serve api_server issai/Qolda --server-port 23333 --tp 1 --backend pytorch
+```
+**Ескерту:** Qolda-ның түпнұсқалық InternVL3.5-тен айырмашылығы, бұл модель API call жасаған кезде `extra_body` бөлігінде `enable_thinking` параметрінің нақты орнатылуын талап етеді. Тапсырманың күрделілігіне байланысты бос thinking жауабы қайтарылуы мүмкін.
+Содан соң, стандартты API call жасаңыз:
+```python
+import base64
+from openai import OpenAI
+client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+image_path = "./assets/eval-results-text.png"
+response = client.chat.completions.create(
+    model=client.models.list().data[0].id,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'text',
+                'text': 'Берілген диаграмманың сипаттамасын бер.'
+            },
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': f'data:image/png;base64,{encode_image(image_path)}',
+                },
+            }
+        ],
+    }],
+    max_tokens=8192,
+    temperature=0.6,
+    top_p=0.95,
+    extra_body={
+        "top_k": 20,
+        "enable_thinking": True
+    },
+)
+print(response.choices[0].message.content)
+```
+## Лицензия
+Бұл модель Apache License 2.0 бойынша лицензияланған.

Q3_K_S/config.json ADDED Viewed

	@@ -0,0 +1,128 @@

+{
+  "architectures": [
+    "InternVLChatModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_internvl_chat.InternVLChatConfig",
+    "AutoModel": "modeling_internvl_chat.InternVLChatModel",
+    "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel"
+  },
+  "downsample_ratio": 0.5,
+  "dynamic_image_size": true,
+  "eos_token_id": 151645,
+  "force_image_size": 448,
+  "hidden_size": 2560,
+  "llm_config": {
+    "_attn_implementation_autoset": true,
+    "architectures": [
+      "Qwen3ForCausalLM"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "eos_token_id": 151645,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 2560,
+    "initializer_range": 0.02,
+    "intermediate_size": 9728,
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 40960,
+    "max_window_layers": 36,
+    "model_type": "qwen3",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 8,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 1000000,
+    "sliding_window": null,
+    "tie_word_embeddings": true,
+    "torch_dtype": "bfloat16",
+    "use_cache": false,
+    "use_sliding_window": false,
+    "vocab_size": 151936
+  },
+  "max_dynamic_patch": 12,
+  "min_dynamic_patch": 1,
+  "model_type": "internvl_chat",
+  "output_attentions": false,
+  "pad2square": false,
+  "pad_token_id": 151643,
+  "ps_version": "v2",
+  "select_layer": -1,
+  "template": "internvl2_5",
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": null,
+  "use_backbone_lora": 0,
+  "use_llm_lora": 0,
+  "use_thumbnail": true,
+  "vision_config": {
+    "_attn_implementation_autoset": true,
+    "architectures": [
+      "InternVisionModel"
+    ],
+    "attention_dropout": 0.0,
+    "auto_map": {
+      "AutoConfig": "configuration_intern_vit.InternVisionConfig",
+      "AutoModel": "modeling_intern_vit.InternVisionModel"
+    },
+    "drop_path_rate": 0.0,
+    "dropout": 0.0,
+    "hidden_act": "gelu",
+    "hidden_size": 1024,
+    "image_size": 448,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "layer_norm_eps": 1e-06,
+    "model_type": "intern_vit_6b",
+    "norm_type": "layer_norm",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 24,
+    "patch_size": 14,
+    "qk_normalization": false,
+    "qkv_bias": true,
+    "torch_dtype": "bfloat16",
+    "use_fa3": false,
+    "use_flash_attn": true
+  }
+}

Q3_K_S/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "pad_token_id": 151643,
+  "transformers_version": "4.51.0"
+}

Q3_K_S/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

Q3_K_S/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "additional_special_tokens": [
+    "<img>",
+    "</img>",
+    "<IMG_CONTEXT>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<box>",
+    "</box>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

Q3_K_S/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,308 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151669": {
+      "content": "<img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "</img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<IMG_CONTEXT>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "</quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151675": {
+      "content": "</ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151676": {
+      "content": "<box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151677": {
+      "content": "</box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<img>",
+    "</img>",
+    "<IMG_CONTEXT>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<box>",
+    "</box>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}