tacodevs committed on
Commit
a70ca0d
·
verified ·
1 Parent(s): f74ada5

Initial upload: FP8 dynamic quantization of Behemoth-X-R1-123B

Browse files
README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ license_name: mistral-research-license
4
+ license_link: https://mistral.ai/licenses/MRL-0.1.md
5
+ base_model: tacodevs/Behemoth-X-R1-123B
6
+ base_model_relation: quantized
7
+ tags:
8
+ - mistral
9
+ - fp8
10
+ - w8a8
11
+ - compressed-tensors
12
+ - quantized
13
+ - thinking
14
+ - roleplay
15
+ - creative-writing
16
+ language:
17
+ - en
18
+ pipeline_tag: text-generation
19
+ ---
20
+
21
+ <div align="center">
22
+ <img src="https://huggingface.co/tacodevs/Behemoth-X-R1-123B/resolve/main/assets/hero.png" alt="Behemoth-X-R1-123B" style="width:100%; max-width:960px; border-radius:16px; box-shadow:0 0 60px rgba(236,72,153,0.35), 0 0 100px rgba(139,92,246,0.25);"/>
23
+ </div>
24
+
25
+ <div align="center" style="margin-top:24px;">
26
+
27
+ <h1 style="font-size:3em; font-weight:900; background:linear-gradient(90deg,#ec4899 0%,#a855f7 50%,#06b6d4 100%); -webkit-background-clip:text; -webkit-text-fill-color:transparent; background-clip:text; margin:0;">Behemoth-X-R1-123B · FP8</h1>
28
+
29
+ <p style="font-size:1.2em; color:#a855f7; font-style:italic;">Single-GPU beast mode.</p>
30
+
31
+ <p>
32
+ <img src="https://img.shields.io/badge/quant-FP8_Dynamic-8B5CF6?style=for-the-badge" alt="quant"/>
33
+ <img src="https://img.shields.io/badge/VRAM-~130GB-EC4899?style=for-the-badge" alt="vram"/>
34
+ <img src="https://img.shields.io/badge/runs_on-1x_H200-06B6D4?style=for-the-badge" alt="gpu"/>
35
+ </p>
36
+
37
+ </div>
38
+
39
+ ## About
40
+
41
+ FP8 dynamic quantization of [`tacodevs/Behemoth-X-R1-123B`](https://huggingface.co/tacodevs/Behemoth-X-R1-123B). Near-lossless quality, half the weight bytes, fits on a single H200.
42
+
43
+ - **Method:** W8A8 dynamic quantization via [llm-compressor](https://github.com/vllm-project/llm-compressor)
44
+ - **Format:** `compressed-tensors`
45
+ - **Size:** ~115 GB
46
+ - **Calibration:** None needed (dynamic scheme)
47
+
48
+ ## Usage with vLLM
49
+
50
+ ```bash
51
+ python -m vllm.entrypoints.openai.api_server \
52
+ --model tacodevs/Behemoth-X-R1-123B-FP8 \
53
+ --max-model-len 16384 \
54
+ --gpu-memory-utilization 0.95 \
55
+ --trust-remote-code
56
+ ```
57
+
58
+ Fits on **1× H200 141GB** with up to ~30k context window — the example above uses a conservative `--max-model-len 16384`; raise it as VRAM allows.
59
+
60
+ ## See the main model card
61
+
62
+ Full documentation, prompt format, prefill examples, credits, and everything else is on the source repo:
63
+
64
+ ### 👉 [tacodevs/Behemoth-X-R1-123B](https://huggingface.co/tacodevs/Behemoth-X-R1-123B)
65
+
66
+ ## License
67
+
68
+ Inherited from base: **[Mistral Research License](https://mistral.ai/licenses/MRL-0.1.md)** — non-commercial use only.
chat_template.jinja ADDED
@@ -0,0 +1 @@
 
 
1
+ {{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'system' %}{{ '[SYSTEM_PROMPT] ' + message['content'] + '[/SYSTEM_PROMPT]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, system and assistant roles are supported!') }}{% endif %}{% endfor %}
config.json ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MistralForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 1,
7
+ "dtype": "bfloat16",
8
+ "eos_token_id": 2,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 12288,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 28672,
14
+ "max_position_embeddings": 131072,
15
+ "model_type": "mistral",
16
+ "num_attention_heads": 96,
17
+ "num_hidden_layers": 88,
18
+ "num_key_value_heads": 8,
19
+ "quantization_config": {
20
+ "config_groups": {
21
+ "group_0": {
22
+ "format": "float-quantized",
23
+ "input_activations": {
24
+ "actorder": null,
25
+ "block_structure": null,
26
+ "dynamic": true,
27
+ "group_size": null,
28
+ "num_bits": 8,
29
+ "observer": null,
30
+ "observer_kwargs": {},
31
+ "scale_dtype": null,
32
+ "strategy": "token",
33
+ "symmetric": true,
34
+ "type": "float",
35
+ "zp_dtype": null
36
+ },
37
+ "output_activations": null,
38
+ "targets": [
39
+ "Linear"
40
+ ],
41
+ "weights": {
42
+ "actorder": null,
43
+ "block_structure": null,
44
+ "dynamic": false,
45
+ "group_size": null,
46
+ "num_bits": 8,
47
+ "observer": "memoryless_minmax",
48
+ "observer_kwargs": {},
49
+ "scale_dtype": null,
50
+ "strategy": "channel",
51
+ "symmetric": true,
52
+ "type": "float",
53
+ "zp_dtype": null
54
+ }
55
+ }
56
+ },
57
+ "format": "float-quantized",
58
+ "global_compression_ratio": null,
59
+ "ignore": [
60
+ "lm_head"
61
+ ],
62
+ "kv_cache_scheme": null,
63
+ "quant_method": "compressed-tensors",
64
+ "quantization_status": "compressed",
65
+ "sparsity_config": {},
66
+ "transform_config": {},
67
+ "version": "0.14.0.1"
68
+ },
69
+ "rms_norm_eps": 1e-05,
70
+ "rope_theta": 1000000.0,
71
+ "sliding_window": null,
72
+ "tie_word_embeddings": false,
73
+ "transformers_version": "4.57.6",
74
+ "use_cache": true,
75
+ "vocab_size": 32768
76
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.57.6"
6
+ }
model-00001-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efb74e9841acfd6dfa40df465c2eda9dafef9264452f9a3d6f6e76c5c5769d89
3
+ size 4958398128
model-00002-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b13127acfafaa4d7a09b68c1cced6351c9b959de9047b779c1cee5357eaf077
3
+ size 4832680488
model-00003-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1f6e6092fe279168537976050970b450de7829e338418f70d309434c431d541
3
+ size 4857866304
model-00004-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:512871a736c969f0617b17edf00777a6def462817c9fbb885889834dc5171bcc
3
+ size 4832680552
model-00005-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d0ae510f7740646a80a199779ae3edacfe9cb44bcb6460537dc28f3745e7c16
3
+ size 4857866352
model-00006-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35503883727c6ad094f179422deaff88223eb16d8794308243d95f071eba1955
3
+ size 4832680552
model-00007-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0470a73a624b43ce0493b5225a7f6f6dd1392ff3c3b72c3a47fe3dd1bab992d9
3
+ size 4857866352
model-00008-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11efbb99f403f6e4a2609d5ba1eaff7502a0cd07c202cea5d56cce46e1ddcc50
3
+ size 4832680552
model-00009-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2eaeb07360cb340a23602bda6ae7f75d2335d9fc85abe0d8a14ef5ce79f3eda7
3
+ size 4857866352
model-00010-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4981e6912bd0ab826dd37cc3a7c8b33142e1551e8053a354054acc4ffcea56fc
3
+ size 4832680552
model-00011-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39f433c0f7eec9dd432dfada040c3e1343b72c4cc146418a32988971ef62d874
3
+ size 4857866352
model-00012-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdec8bc16948827f3fc714579a432075741aaae269dce4fe91aa5eb465baa080
3
+ size 4832680552
model-00013-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c74581ed1d11b47962ebdeb65a0b8558077681a98a37a9bc5541f91026453503
3
+ size 4857866352
model-00014-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c634e6c7452cc5be9f31770a22fcde3ae1428cfc5cbf1915bd45980ee903a8c
3
+ size 4832680552
model-00015-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7580792ca2eb2f535217f8b5b37dc0263bb55bdc10edf4c036f35899bc1bc9a
3
+ size 4857866352
model-00016-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:447596bada0662a5adcba445015df866b8b37f242ba7446299ed07baed3bbbf7
3
+ size 4832680552
model-00017-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdbda607587a45ba8285c78d21ccde129108a8b5da56f8093d43d32246d909e7
3
+ size 4857866352
model-00018-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c13cda5c464cd5454887ccb52b2a0f7ecb7ef26d43ac0e17b880734e95bad1a7
3
+ size 4832680552
model-00019-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93f78a12bce76ec9e4d6ed879e77bfce4f8ad894c009776ea60397cfa0c41e07
3
+ size 4857866352
model-00020-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27caf7c0ecdc5f7815359ff9bce3289c0e34df72056eec8cd12b0ee5eefa3e11
3
+ size 4832680552
model-00021-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e7e79cb9d5ac66606833dc3c1de82bb05b6f620e1a777bfe2b85d9ad0ee6976
3
+ size 4857866352
model-00022-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22cd92b88545f9dbda06876f1acd65044ba866ee6b9a7a0b21dd1513e0fdcecf
3
+ size 4832680552
model-00023-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bc6815475a3f53c242a80d6c686190845fc5c3238c16fb15a561f9ed2fc57c0
3
+ size 4857866352
model-00024-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04dba7fd22a277041b189c49a297a3f7c0118ece04acea4ef47c73ac28fff953
3
+ size 4832680552
model-00025-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0104b0cf3298c5c2ff2dedfbea5213ae0e37c907385ce0dea8b05d019b7de092
3
+ size 4857866352
model-00026-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:802c6f2de89d64753b4db7835631921745f8a21d154394226a40475c8a097b21
3
+ size 2189695048
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
recipe.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ default_stage:
2
+ default_modifiers:
3
+ QuantizationModifier:
4
+ targets: [Linear]
5
+ ignore: [lm_head]
6
+ scheme: FP8_DYNAMIC
7
+ bypass_divisibility_checks: false
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b968b8dc352f42192367337c78ccc61e1eaddc6d641a579372d4f20694beb7a
3
+ size 587562
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff