File size: 5,132 Bytes
e888982
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/env python3
"""Quantize Nemotron encoder to int8 for smaller model size.

Reduces encoder from ~2.2GB to ~550MB with minimal quality loss.
"""
from pathlib import Path
import json
import shutil

import typer
import coremltools as ct
from coremltools.optimize.coreml import (
    OptimizationConfig,
    OpLinearQuantizerConfig,
    linear_quantize_weights,
)

# Single-command CLI application; completion is disabled and local variables
# are hidden from pretty tracebacks to keep error output compact.
app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)


def _dir_size_mb(path: Path) -> float:
    total = 0
    for p in path.rglob("*"):
        if p.is_file():
            try:
                total += p.stat().st_size
            except OSError:
                pass
    return total / (1024 * 1024)


@app.command()
def quantize(
    model_dir: Path = typer.Option(
        Path("nemotron_coreml"),
        "--model-dir",
        help="Directory containing the baseline encoder.mlpackage",
    ),
    output_dir: Path = typer.Option(
        Path("nemotron_coreml_int8"),
        "--output-dir",
        help="Output directory for quantized models",
    ),
    granularity: str = typer.Option(
        "per_channel",
        "--granularity",
        help="Quantization granularity: per_channel or per_tensor",
    ),
) -> None:
    """Quantize the Nemotron encoder to int8.

    Loads ``encoder.mlpackage`` from *model_dir*, applies linear int8
    weight quantization via coremltools, and saves the result to
    *output_dir*. The remaining model packages (preprocessor, decoder,
    joint) and support files (metadata.json, tokenizer.json) are copied
    unmodified, and ``metadata.json`` is updated with before/after size
    statistics. Exits with code 1 if the baseline encoder is missing.
    """

    encoder_path = model_dir / "encoder.mlpackage"
    if not encoder_path.exists():
        typer.echo(f"Error: encoder.mlpackage not found at {encoder_path}")
        raise typer.Exit(code=1)

    output_dir.mkdir(parents=True, exist_ok=True)

    # Get baseline size (measured on disk, before loading the model)
    baseline_size = _dir_size_mb(encoder_path)
    typer.echo(f"Baseline encoder size: {baseline_size:.1f} MB")

    # Load encoder
    typer.echo("Loading encoder model...")
    encoder = ct.models.MLModel(str(encoder_path), compute_units=ct.ComputeUnit.CPU_AND_NE)

    # Set minimum deployment target for proper quantization ops
    # NOTE(review): mutating this attribute on an already-loaded MLModel may
    # be a no-op in some coremltools versions — confirm; the broad except
    # deliberately makes this best-effort.
    try:
        encoder.minimum_deployment_target = ct.target.iOS17
    except Exception:
        pass

    # Configure int8 quantization
    # granularity is passed through unvalidated; coremltools raises on
    # values other than per_channel/per_tensor.
    typer.echo(f"Quantizing to int8 ({granularity})...")
    config = OptimizationConfig(
        global_config=OpLinearQuantizerConfig(
            mode="linear",
            granularity=granularity,
        )
    )

    # Quantize (weights-only linear quantization; activations unchanged)
    encoder_q = linear_quantize_weights(encoder, config)

    # Save quantized encoder
    encoder_q_path = output_dir / "encoder.mlpackage"
    typer.echo(f"Saving quantized encoder to {encoder_q_path}...")
    encoder_q.short_description = "Nemotron Streaming Encoder (int8 quantized)"
    encoder_q.author = "Fluid Inference"
    encoder_q.save(str(encoder_q_path))

    # Get quantized size (must happen after .save() so files exist on disk)
    quantized_size = _dir_size_mb(encoder_q_path)
    # NOTE(review): assumes quantized_size > 0; a failed/empty save would
    # raise ZeroDivisionError here.
    compression_ratio = baseline_size / quantized_size

    typer.echo(f"\nQuantized encoder size: {quantized_size:.1f} MB")
    typer.echo(f"Compression ratio: {compression_ratio:.2f}x")
    typer.echo(f"Size reduction: {baseline_size - quantized_size:.1f} MB")

    # Copy other models (they're already small enough)
    typer.echo("\nCopying other models...")
    for model_name in ["preprocessor.mlpackage", "decoder.mlpackage", "joint.mlpackage"]:
        src = model_dir / model_name
        dst = output_dir / model_name
        if src.exists():
            # copytree requires the destination not to exist; clear stale copies
            if dst.exists():
                shutil.rmtree(dst)
            shutil.copytree(src, dst)
            typer.echo(f"  Copied {model_name}")

    # Copy metadata and tokenizer
    for file_name in ["metadata.json", "tokenizer.json"]:
        src = model_dir / file_name
        dst = output_dir / file_name
        if src.exists():
            shutil.copy2(src, dst)
            typer.echo(f"  Copied {file_name}")

    # Update metadata with quantization info (only if the copy above found one)
    metadata_path = output_dir / "metadata.json"
    if metadata_path.exists():
        with open(metadata_path) as f:
            metadata = json.load(f)
        # Overwrites any pre-existing "quantization" key in the metadata.
        metadata["quantization"] = {
            "encoder": {
                "method": "int8_linear",
                "granularity": granularity,
                "baseline_size_mb": round(baseline_size, 1),
                "quantized_size_mb": round(quantized_size, 1),
                "compression_ratio": round(compression_ratio, 2),
            }
        }
        with open(metadata_path, "w") as f:
            json.dump(metadata, f, indent=2)
        typer.echo("  Updated metadata.json with quantization info")

    typer.echo(f"\nDone! Quantized models saved to {output_dir}")

    # Print summary
    typer.echo("\n" + "=" * 50)
    typer.echo("SUMMARY")
    typer.echo("=" * 50)
    typer.echo(f"Encoder: {baseline_size:.1f} MB → {quantized_size:.1f} MB ({compression_ratio:.2f}x)")

    # Totals: only the encoder shrinks; the other packages are copied as-is,
    # so their size contributes equally to both totals.
    total_baseline = baseline_size
    total_quantized = quantized_size
    for model_name in ["preprocessor.mlpackage", "decoder.mlpackage", "joint.mlpackage"]:
        src = model_dir / model_name
        if src.exists():
            size = _dir_size_mb(src)
            total_baseline += size
            total_quantized += size

    typer.echo(f"Total pipeline: {total_baseline:.1f} MB → {total_quantized:.1f} MB")


if __name__ == "__main__":
    app()