"""Quantize RnJ-1-Instruct to FP8 using llmcompressor (Neural Magic).

This script performs dynamic FP8 quantization (W8A8) on the RnJ-1-Instruct
model. No calibration data is needed - it's a fast, deterministic conversion.

Requirements:
    pip install llmcompressor torch transformers accelerate

Usage:
    python quantize_rnj1_fp8.py

Output:
    ./RnJ-1-Instruct-FP8/ - vLLM-compatible FP8 model

Hardware Requirements:
    - 32GB+ RAM for model loading
    - GPU optional (CPU quantization supported)
"""
|
|
|
|
|
import os |
|
|
import sys |
|
|
from pathlib import Path |
|
|
|
|
|
def main():
    """Quantize the RnJ-1-Instruct model to FP8 (W8A8 dynamic) and save it.

    Exits with status 1 if llmcompressor is not installed. On success the
    compressed, vLLM-compatible model is written to ./RnJ-1-Instruct-FP8.
    """
    # Import lazily so a missing dependency produces a friendly install hint
    # instead of a raw traceback.
    try:
        from llmcompressor import oneshot
        from llmcompressor.modifiers.quantization import QuantizationModifier
    except ImportError:
        print("Error: llmcompressor not installed")
        print("Install with: pip install llmcompressor torch transformers accelerate")
        sys.exit(1)

    MODEL_PATH = "DoradusAI/RnJ-1-Instruct"
    OUTPUT_PATH = "./RnJ-1-Instruct-FP8"

    print(f"Quantizing {MODEL_PATH} to FP8...")
    print(f"Output directory: {OUTPUT_PATH}")

    # FP8_DYNAMIC: weights are quantized statically, activations dynamically
    # (W8A8), so no calibration dataset is required. lm_head is excluded,
    # keeping the output projection in higher precision.
    recipe = QuantizationModifier(
        targets="Linear",
        scheme="FP8_DYNAMIC",
        ignore=["lm_head"],
    )

    # Data-free one-shot pass. num_calibration_samples=0 makes the
    # no-calibration intent explicit; save_compressed writes the
    # compressed-tensors format that vLLM loads directly.
    oneshot(
        model=MODEL_PATH,
        output_dir=OUTPUT_PATH,
        recipe=recipe,
        num_calibration_samples=0,
        save_compressed=True,
    )

    print("\nQuantization complete!")
    print(f"Model saved to: {OUTPUT_PATH}")
    print("\nTo test with vLLM:")
    print("  python -m vllm.entrypoints.openai.api_server \\")
    print(f"    --model {OUTPUT_PATH} \\")
    print("    --trust-remote-code")
|
|
|
|
|
|
|
|
# Script entry point: run the quantization only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|
|
|