#!/usr/bin/env python3
"""
Quantize RnJ-1-Instruct to FP8 using llmcompressor (Neural Magic).

This script performs dynamic FP8 quantization (W8A8) on the RnJ-1-Instruct
model. No calibration data is needed, so the conversion is fast and
deterministic.

Requirements:
    pip install llmcompressor torch transformers accelerate

Usage:
    python quantize_rnj1_fp8.py

Output:
    ./RnJ-1-Instruct-FP8/ - vLLM-compatible FP8 model

Hardware Requirements:
    - 32GB+ RAM for model loading
    - GPU optional (CPU quantization supported)
"""

import sys


def main():
    try:
        from llmcompressor import oneshot
        from llmcompressor.modifiers.quantization import QuantizationModifier
    except ImportError:
        print("Error: llmcompressor not installed")
        print("Install with: pip install llmcompressor torch transformers accelerate")
        sys.exit(1)

    # Configuration
    MODEL_PATH = "DoradusAI/RnJ-1-Instruct"
    OUTPUT_PATH = "./RnJ-1-Instruct-FP8"

    print(f"Quantizing {MODEL_PATH} to FP8...")
    print(f"Output directory: {OUTPUT_PATH}")

    # FP8 dynamic quantization recipe:
    # - Weights: FP8 E4M3 (channel-wise, symmetric)
    # - Activations: FP8 E4M3 (token-wise, dynamic)
    # - lm_head kept in BF16 for output quality
    recipe = QuantizationModifier(
        targets="Linear",
        scheme="FP8_DYNAMIC",
        ignore=["lm_head"],
    )

    # Run quantization
    oneshot(
        model=MODEL_PATH,
        output_dir=OUTPUT_PATH,
        recipe=recipe,
        num_calibration_samples=0,  # dynamic quantization needs no calibration
        save_compressed=True,
    )

    print("\nQuantization complete!")
    print(f"Model saved to: {OUTPUT_PATH}")
    print("\nTo test with vLLM:")
    print("  python -m vllm.entrypoints.openai.api_server \\")
    print(f"    --model {OUTPUT_PATH} \\")
    print("    --trust-remote-code")


if __name__ == "__main__":
    main()
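
# ---------------------------------------------------------------------------
# Optional sanity check (a sketch, not part of the quantization itself):
# load the FP8 checkpoint with vLLM's offline Python API and run one prompt,
# instead of standing up the OpenAI-compatible server printed above. This
# assumes vLLM is installed and the GPU supports FP8 compute (compute
# capability >= 8.9, e.g. Ada or Hopper); on older GPUs vLLM typically falls
# back to weight-only FP8 kernels.
#
#     from vllm import LLM, SamplingParams
#
#     llm = LLM(model="./RnJ-1-Instruct-FP8", trust_remote_code=True)
#     params = SamplingParams(temperature=0.0, max_tokens=32)
#     outputs = llm.generate(["Say hello."], params)
#     print(outputs[0].outputs[0].text)
# ---------------------------------------------------------------------------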