Upload DBTrimKV checkpoint (Qwen3-4B, OpenR1-Math-220k)

Browse files

Files changed (4) hide show

.gitattributes +1 -33
README.md +73 -0
config.json +87 -0
trimkv_weights.pth +3 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,3 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

 *.pth filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,73 @@

+---
+license: apache-2.0
+datasets:
+- open-r1/OpenR1-Math-220k
+base_model:
+- Qwen/Qwen3-4B
+tags:
+- math
+- dbtrimkv
+- trimkv
+- kv-cache
+- compression
+---
+> DBTrimKV is the dynamic-budget variant of TrimKV: a single global KV budget is shared across layers and heads and reallocated on the fly, and the retention gate's final projection is tied across layers.
+This repository hosts the **DBTrimKV** retention-gate weights for `Qwen/Qwen3-4B` (32768-token training context, memory size M = 128). The base-model weights are not included — they are loaded from `Qwen/Qwen3-4B` at runtime, and the retention-gate weights from `trimkv_weights.pth` are overlaid on top.
+<a href="https://arxiv.org/pdf/2512.03324"><img src="https://img.shields.io/badge/arxiv-2512.03324-red?style=for-the-badge"></a>
+For the full list of released checkpoints, training recipes, and benchmark scripts, see the GitHub repository: **https://github.com/ngocbh/trimkv**.
+## Quick start
+```python
+import torch
+from trimkv.models.qwen3 import TrimKVQwen3ForCausalLM
+from trimkv.cache_utils import PagedTrimKVCache
+from transformers import AutoTokenizer
+model = TrimKVQwen3ForCausalLM.from_pretrained(
+    "ngocbh/DBTrimKV-Qwen3-4B-Math",
+    torch_dtype=torch.bfloat16,
+    load_trimkv_weights=True,
+    download_from="huggingface",
+    use_cache=True,
+    device_map="cuda",
+)
+model.config._attn_implementation = "flash_attention_2"
+tokenizer = AutoTokenizer.from_pretrained(
+    model.config.base_model, use_fast=True, padding_side="left"
+)
+past_key_values = PagedTrimKVCache(
+    num_layers=model.config.num_hidden_layers,
+    num_heads=model.config.num_key_value_heads,
+    max_seq_len=32768,
+    memory_size=128,
+    num_blocks_ratio=1.0,
+    buffer_size=32,
+    strategy="fixed_budget",
+    device="cuda",
+)
+# Use as a normal HF model — pass `past_key_values=past_key_values` to .generate
+```
+See [`examples/test_qwen3.py`](https://github.com/ngocbh/trimkv/blob/main/examples/test_qwen3.py) in the GitHub repo for a full runnable example.
+## Training details
+- Base model: `Qwen/Qwen3-4B`
+- Variant: **DBTrimKV** (`retention_gate=rg10`)
+- Training dataset: open-r1/OpenR1-Math-220k
+- Training memory size M: `128`
+- Training context length: `32768`
+- Loss: `fwkl_ntp`
+- Attention impl: `rg_attn_flex`
+## Citation
+For the up-to-date BibTeX entry, see the [GitHub repository](https://github.com/ngocbh/trimkv).

config.json ADDED Viewed

	@@ -0,0 +1,87 @@

+{
+  "alpha_threshold": 0.0,
+  "architectures": [
+    "TrimKVQwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attn_impl": "rg_attn_flex",
+  "base_loss": "fwkl_ntp",
+  "base_model": "Qwen/Qwen3-4B",
+  "bos_token_id": 151643,
+  "buffer_size": 128,
+  "compress_memory": true,
+  "compress_strategy": "alpha",
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "floor_budget_ratio": 0.0,
+  "global_capacity": true,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "logit_block_size": 16384,
+  "max_position_embeddings": 40960,
+  "max_seq_len": 32768,
+  "max_window_layers": 36,
+  "memory_size": 128.0,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "retention_gate": "rg10",
+  "retention_gate_bias_init": 18.0,
+  "retention_gate_intermediate_size": 512,
+  "retention_weight": 1.0,
+  "rg_dropout": 0.0,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_retention_gate_layers": true,
+  "tie_word_embeddings": true,
+  "trainable_params": "self_attn.retention_gate",
+  "transformers_version": "4.57.1",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

trimkv_weights.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b50190691b4b7621917777fe087ea8c9caae917cac07fd873f28b0285c2a70ef
+size 113361053