#!/usr/bin/env python3
"""
Quantize RnJ-1-Instruct to FP8 using llmcompressor (Neural Magic)

This script performs dynamic FP8 quantization (W8A8) on the RnJ-1-Instruct model.
No calibration data is needed - it's a fast, deterministic conversion.

Requirements:
    pip install llmcompressor torch transformers accelerate

Usage:
    python quantize_rnj1_fp8.py

Output:
    ./RnJ-1-Instruct-FP8/ - vLLM-compatible FP8 model

Hardware Requirements:
    - 32GB+ RAM for model loading
    - GPU optional (CPU quantization supported)
"""

import os
import sys
from pathlib import Path

def main(
    model_path: str = "DoradusAI/RnJ-1-Instruct",
    output_path: str = "./RnJ-1-Instruct-FP8",
) -> None:
    """Quantize a model to FP8 with llmcompressor and write it to disk.

    Builds a ``QuantizationModifier`` recipe using the ``FP8_DYNAMIC`` scheme
    on ``Linear`` targets (ignoring ``lm_head``), runs ``oneshot`` with it,
    and prints a vLLM command line for trying out the result.

    Args:
        model_path: Value passed to ``oneshot`` as ``model``. Defaults to the
            RnJ-1-Instruct repository id this script was written for.
        output_path: Value passed to ``oneshot`` as ``output_dir``.

    Raises:
        SystemExit: With status 1 when llmcompressor cannot be imported.
    """
    # Imported lazily so a missing dependency yields an actionable message
    # instead of a bare traceback at module import time.
    try:
        from llmcompressor import oneshot
        from llmcompressor.modifiers.quantization import QuantizationModifier
    except ImportError as exc:
        # Surface the underlying cause: the import can also fail because a
        # transitive dependency is broken, not only because the package is
        # absent. Diagnostics go to stderr so stdout stays clean.
        print(f"Error: could not import llmcompressor ({exc})", file=sys.stderr)
        print(
            "Install with: pip install llmcompressor torch transformers accelerate",
            file=sys.stderr,
        )
        sys.exit(1)

    print(f"Quantizing {model_path} to FP8...")
    print(f"Output directory: {output_path}")

    # FP8 dynamic quantization recipe:
    # - applies the FP8_DYNAMIC scheme to every Linear target
    # - lm_head is excluded from quantization to protect output quality
    recipe = QuantizationModifier(
        targets="Linear",
        scheme="FP8_DYNAMIC",
        ignore=["lm_head"],
    )

    # Run quantization and save the compressed checkpoint in one call.
    oneshot(
        model=model_path,
        output_dir=output_path,
        recipe=recipe,
        num_calibration_samples=0,  # Dynamic quant needs no calibration
        save_compressed=True,
    )

    print("\nQuantization complete!")
    print(f"Model saved to: {output_path}")
    print("\nTo test with vLLM:")
    print("  python -m vllm.entrypoints.openai.api_server \\")
    print(f"    --model {output_path} \\")
    print("    --trust-remote-code")


# Run the quantization only when executed as a script, not when imported.
if __name__ == "__main__":
    main()