https://github.com/huggingface/transformers/issues/42907
https://www.reddit.com/r/LocalLLaMA/comments/1prpe36/comment/nv3ly0f/
The NotImplementedError is a known bug: Transformers currently lacks the reverse logic needed to save fine-grained FP8 weights back to disk. You can bypass it by dequantizing the model and saving the state_dict directly with safetensors instead of going through the broken save_pretrained path.
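For reference, a minimal sketch of the failing path (the output path here is just illustrative):

from transformers import Ministral3ForCausalLM

model = Ministral3ForCausalLM.from_pretrained(
    "mistralai/Devstral-2-123B-Instruct-2512",
    device_map="auto",
)
model.save_pretrained("./devstral")  # raises NotImplementedError: fine-grained FP8 weights can't be re-saved

The full workaround script: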
from transformers import Ministral3ForCausalLM, FineGrainedFP8Config
from safetensors.torch import save_file
import os
model_id = "mistralai/Devstral-2-123B-Instruct-2512"
model = Ministral3ForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=FineGrainedFP8Config(dequantize=True),
)
# Dequantize at load time so the FP8 weights come back as higher-precision tensors.
# Note: calling model.dequantize() after loading is unsupported on older GPUs like the A100,
# but loading with quantization_config=FineGrainedFP8Config(dequantize=True) works.
output_dir = "./devstral"
os.makedirs(output_dir, exist_ok=True)
state_dict = {k: v.cpu() for k, v in model.state_dict().items()}
# Shard into ~5GB chunks
max_shard_size = 5 * 1024 ** 3 # 5GB in bytes
current_shard = {}
current_size = 0
shard_idx = 0
for key, tensor in state_dict.items():
    tensor_size = tensor.numel() * tensor.element_size()
    print("tensor_size", tensor_size)
    print("current_size", current_size)
    print("idx", shard_idx)
    # Start a new shard once adding this tensor would push the current one over the limit
    if current_size + tensor_size > max_shard_size and current_shard:
        save_file(current_shard, os.path.join(output_dir, f"model-{shard_idx:05d}.safetensors"))
        shard_idx += 1
        current_shard = {}
        current_size = 0
    current_shard[key] = tensor
    current_size += tensor_size
# Save the last shard
if current_shard:
    save_file(current_shard, os.path.join(output_dir, f"model-{shard_idx:05d}.safetensors"))
model.config.save_pretrained(output_dir)
^ Main logic for bypassing model.save_pretrained(): save the state_dict directly instead!
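Optionally, a quick sanity check (not part of the workaround itself) to confirm the shards were actually written and how big they are on disk:

import os

output_dir = "./devstral"
shards = sorted(f for f in os.listdir(output_dir) if f.endswith(".safetensors"))
total_bytes = sum(os.path.getsize(os.path.join(output_dir, f)) for f in shards)
print(f"{len(shards)} shard(s), {total_bytes / 1e9:.2f} GB on disk")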
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained(output_dir)
^ Don't forget to save the tokenizer files too
import os
import json
from safetensors import safe_open
output_dir = "./devstral"
weight_map = {}
metadata = {"total_size": 0}
# Iterate through all shard files
shard_files = sorted([f for f in os.listdir(output_dir) if f.endswith(".safetensors")])
for shard_file in shard_files:
    shard_path = os.path.join(output_dir, shard_file)
    with safe_open(shard_path, framework="pt") as f:
        for key in f.keys():
            weight_map[key] = shard_file
            tensor = f.get_tensor(key)
            metadata["total_size"] += tensor.numel() * tensor.element_size()
index = {
    "metadata": metadata,
    "weight_map": weight_map,
}
with open(os.path.join(output_dir, "model.safetensors.index.json"), "w") as f:
    json.dump(index, f, indent=2)
print(f"Created index with {len(weight_map)} tensors, total size: {metadata['total_size'] / 1e9:.2f} GB")
^ Generate the model.safetensors.index.json
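Before reloading, you can optionally double-check that every entry in the generated index points at a shard file that actually exists:

import json
import os

output_dir = "./devstral"
with open(os.path.join(output_dir, "model.safetensors.index.json")) as f:
    index = json.load(f)
missing = {v for v in index["weight_map"].values()
           if not os.path.exists(os.path.join(output_dir, v))}
print("missing shard files:", missing or "none")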
from transformers import Ministral3ForCausalLM, AutoTokenizer
model_id = "./devstral"
model = Ministral3ForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
)
model.save_pretrained("./devstral-123b", max_shard_size="5GB")
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained("./devstral-123b")
^ Load new devstral normally but save it with HF standards. Generates other files too!
^ Quanted and inferenced!
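If you want a quick smoke test of the re-saved checkpoint before quantizing, something like this should work (the prompt and generation settings are placeholders, not from the original workflow):

from transformers import Ministral3ForCausalLM, AutoTokenizer

model = Ministral3ForCausalLM.from_pretrained("./devstral-123b", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("./devstral-123b")

inputs = tokenizer("Write a hello world in Python.", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))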