RnJ-1-Instruct-FP8 / scripts /quantize_rnj1_fp8.py
Doradus AI
Upload folder using huggingface_hub
6a70e5e verified
#!/usr/bin/env python3
"""
Quantize RnJ-1-Instruct to FP8 using llmcompressor (Neural Magic).

This script performs dynamic FP8 quantization (W8A8) on the RnJ-1-Instruct
model. No calibration data is needed, so the conversion is fast and
deterministic.

Requirements:
    pip install llmcompressor torch transformers accelerate

Usage:
    python quantize_rnj1_fp8.py

Output:
    ./RnJ-1-Instruct-FP8/ - vLLM-compatible FP8 model

Hardware Requirements:
    - 32GB+ RAM for model loading
    - GPU optional (CPU quantization supported)
"""
import os
import sys
from pathlib import Path
def main(
    model_path: str = "DoradusAI/RnJ-1-Instruct",
    output_path: str = "./RnJ-1-Instruct-FP8",
) -> None:
    """Quantize a model to FP8 with llmcompressor and print a vLLM usage hint.

    Args:
        model_path: Model identifier or path passed to ``oneshot`` as
            ``model``. Defaults to the RnJ-1-Instruct checkpoint.
        output_path: Directory passed to ``oneshot`` as ``output_dir``.

    Raises:
        SystemExit: With status 1 when llmcompressor cannot be imported.
    """
    # Imported lazily so that a missing dependency produces an install hint
    # instead of a traceback when the module is loaded.
    try:
        from llmcompressor import oneshot
        from llmcompressor.modifiers.quantization import QuantizationModifier
    except ImportError as exc:
        # Diagnostics go to stderr, and the underlying message is reported
        # because the import can also fail on a broken transitive dependency,
        # not only when llmcompressor itself is absent.
        print(f"Error: could not import llmcompressor ({exc})", file=sys.stderr)
        print(
            "Install with: pip install llmcompressor torch transformers accelerate",
            file=sys.stderr,
        )
        sys.exit(1)

    print(f"Quantizing {model_path} to FP8...")
    print(f"Output directory: {output_path}")

    # FP8 Dynamic quantization recipe
    # - Weights: FP8 E4M3 (channel-wise, symmetric)
    # - Activations: FP8 E4M3 (token-wise, dynamic)
    # - lm_head is excluded from quantization to protect output quality
    recipe = QuantizationModifier(
        targets="Linear",
        scheme="FP8_DYNAMIC",
        ignore=["lm_head"],
    )

    # Run quantization and write the compressed checkpoint to output_path.
    oneshot(
        model=model_path,
        output_dir=output_path,
        recipe=recipe,
        num_calibration_samples=0,  # Dynamic quant needs no calibration
        save_compressed=True,
    )

    print("\nQuantization complete!")
    print(f"Model saved to: {output_path}")
    print("\nTo test with vLLM:")
    print(" python -m vllm.entrypoints.openai.api_server \\")
    print(f" --model {output_path} \\")
    print(" --trust-remote-code")
# Run the quantization only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()