https://github.com/huggingface/transformers/issues/42907

https://www.reddit.com/r/LocalLLaMA/comments/1prpe36/comment/nv3ly0f/

The NotImplementedError is a known bug: Transformers currently lacks the reverse logic needed to save fine-grained FP8 weights. You can bypass it by dequantizing the model (either with model.dequantize() or by loading with FineGrainedFP8Config(dequantize=True)) and saving the state_dict directly with safetensors instead of the broken save_pretrained method.

from transformers import Ministral3ForCausalLM, FineGrainedFP8Config
from safetensors.torch import save_file
import os

model_id = "mistralai/Devstral-2-123B-Instruct-2512"
model = Ministral3ForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=FineGrainedFP8Config(dequantize=True)
)

# Dequantize the model to convert the FP8 weights back to higher precision.
# model.dequantize()  # Unsupported on older GPUs like the A100; loading with
# `quantization_config=FineGrainedFP8Config(dequantize=True)` above works instead.

output_dir = "./devstral"
os.makedirs(output_dir, exist_ok=True)

state_dict = {k: v.cpu() for k, v in model.state_dict().items()}

# Shard into ~5GB chunks
max_shard_size = 5 * 1024 ** 3  # 5GB in bytes
current_shard = {}
current_size = 0
shard_idx = 0

for key, tensor in state_dict.items():
    tensor_size = tensor.numel() * tensor.element_size()
    print("tensor_size", tensor_size)
    print("current_size", current_size)
    print("idx", shard_idx)
    
    if current_size + tensor_size > max_shard_size and current_shard:
        save_file(current_shard, os.path.join(output_dir, f"model-{shard_idx:05d}.safetensors"))
        shard_idx += 1
        current_shard = {}
        current_size = 0
    
    current_shard[key] = tensor
    current_size += tensor_size

# Save the last shard
if current_shard:
    save_file(current_shard, os.path.join(output_dir, f"model-{shard_idx:05d}.safetensors"))

model.config.save_pretrained(output_dir)

^ Main logic for bypassing model.save_pretrained(): save the state_dict directly instead!
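If you want to sanity-check the shards before building the index, something like this should work (my own addition, just listing file sizes to confirm each shard stays roughly under 5GB):

import os

# Assumes the same ./devstral output_dir used above.
output_dir = "./devstral"
for name in sorted(os.listdir(output_dir)):
    if name.endswith(".safetensors"):
        size_gb = os.path.getsize(os.path.join(output_dir, name)) / 1024 ** 3
        print(f"{name}: {size_gb:.2f} GB")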

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained(output_dir)

^ Forgot to save the tokenizer files

import os
import json
from safetensors import safe_open

output_dir = "./devstral"

weight_map = {}
metadata = {"total_size": 0}

# Iterate through all shard files
shard_files = sorted([f for f in os.listdir(output_dir) if f.endswith(".safetensors")])

for shard_file in shard_files:
    shard_path = os.path.join(output_dir, shard_file)
    
    with safe_open(shard_path, framework="pt") as f:
        for key in f.keys():
            weight_map[key] = shard_file
            tensor = f.get_tensor(key)
            metadata["total_size"] += tensor.numel() * tensor.element_size()

index = {
    "metadata": metadata,
    "weight_map": weight_map
}

with open(os.path.join(output_dir, "model.safetensors.index.json"), "w") as f:
    json.dump(index, f, indent=2)

print(f"Created index with {len(weight_map)} tensors, total size: {metadata['total_size'] / 1e9:.2f} GB")

^ Generate the model.safetensors.index.json
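Before reloading, you can optionally verify that the index maps every tensor to a shard that actually contains it. Not part of the original flow, just a sketch of a sanity check:

import json
import os
from safetensors import safe_open

output_dir = "./devstral"

with open(os.path.join(output_dir, "model.safetensors.index.json")) as f:
    index = json.load(f)

# Collect the keys stored in each shard once, then check the weight_map against them.
shard_keys = {}
missing = []
for key, shard_file in index["weight_map"].items():
    if shard_file not in shard_keys:
        with safe_open(os.path.join(output_dir, shard_file), framework="pt") as sf:
            shard_keys[shard_file] = set(sf.keys())
    if key not in shard_keys[shard_file]:
        missing.append(key)

print("missing tensors:", missing if missing else "none")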

from transformers import Ministral3ForCausalLM, AutoTokenizer

model_id = "./devstral"
model = Ministral3ForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
)
model.save_pretrained("./devstral-123b", max_shard_size="5GB")

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained("./devstral-123b")

^ Load the new Devstral normally, then re-save it with HF standards. This generates the other expected files too!
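Before quanting further, a quick generation test on the re-saved checkpoint is worth doing. This is my own sketch (the prompt and generation settings are just placeholders), not part of the original steps:

from transformers import Ministral3ForCausalLM, AutoTokenizer
import torch

model_dir = "./devstral-123b"
model = Ministral3ForCausalLM.from_pretrained(model_dir, device_map="auto", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Tiny smoke test: if the conversion went wrong, the output is usually obviously broken.
inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(out[0], skip_special_tokens=True))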

[Screenshot 2025-12-21 at 7.31.20 AM]

^ Quanted and inferenced!

Result: TheDrummer/Devstral-2-123B-Instruct-2512-HF (125B params, BF16 safetensors)