| |
"""
AutoRound W4A16 quantization for GLM-4.7 REAP models.

This script quantizes a REAP-pruned GLM-4.7 model to INT4 weights using
Intel's AutoRound, reducing model size by roughly 4x while maintaining
quality.

Requirements:
    pip install auto-round

Usage:
    python run_autoround.py --model-path ./GLM-4.7-REAP-50 --output-dir ./GLM-4.7-REAP-50-W4A16
"""
|
|
| import argparse |
| import subprocess |
| import sys |
| from pathlib import Path |
|
|
|
|
def safetensors_size_gb(path):
    """Return the combined size, in GiB, of all ``*.safetensors`` files under *path*."""
    return sum(f.stat().st_size for f in Path(path).rglob("*.safetensors")) / (1024 ** 3)


def main():
    """Run AutoRound weight-only quantization from the command line.

    Parses CLI arguments, validates the input model path, derives a default
    output directory, prints a configuration summary, invokes the
    ``auto-round`` CLI via subprocess, and reports the achieved compression
    ratio on success. Exits with status 1 on a missing model path or a
    non-zero ``auto-round`` return code.
    """
    parser = argparse.ArgumentParser(description="AutoRound W4A16 quantization")
    parser.add_argument("--model-path", type=str, required=True,
                        help="Path to REAP-pruned model")
    parser.add_argument("--output-dir", type=str, default=None,
                        help="Output directory (default: {model-path}-W4A16)")
    parser.add_argument("--bits", type=int, default=4,
                        help="Weight bit width (default: 4)")
    parser.add_argument("--group-size", type=int, default=128,
                        help="Quantization group size (default: 128)")
    parser.add_argument("--format", type=str, default="auto_gptq",
                        choices=["auto_gptq", "auto_awq", "auto_round"],
                        help="Output format (default: auto_gptq)")
    parser.add_argument("--iters", type=int, default=200,
                        help="Optimization iterations (default: 200)")

    args = parser.parse_args()

    if not Path(args.model_path).exists():
        print(f"ERROR: Model path not found: {args.model_path}")
        sys.exit(1)

    if args.output_dir is None:
        args.output_dir = f"{args.model_path}-W{args.bits}A16"

    Path(args.output_dir).mkdir(parents=True, exist_ok=True)

    model_size_gb = safetensors_size_gb(args.model_path)
    # BUGFIX: previously hard-coded model_size_gb / 4, which is only correct
    # for --bits 4. Scale by bits/16 instead; assumes 16-bit source weights
    # (the "A16" in the scheme name) -- TODO confirm for other checkpoints.
    expected_output_gb = model_size_gb * args.bits / 16

    print("=" * 60)
    print(f"AutoRound W{args.bits}A16 Quantization")
    print("=" * 60)
    print(f"Input Model: {args.model_path}")
    print(f"Input Size: {model_size_gb:.1f} GB")
    print(f"Output: {args.output_dir}")
    print(f"Expected Output Size: ~{expected_output_gb:.1f} GB")
    print(f"Config: {args.bits}-bit, group_size={args.group_size}, format={args.format}")
    print("=" * 60)
    print("\nThis will take ~2-3 hours for a 92-layer MoE model...")
    print()

    # Shell out to the auto-round CLI; list form avoids shell quoting issues.
    cmd = [
        "auto-round",
        "--model", args.model_path,
        "--bits", str(args.bits),
        "--group_size", str(args.group_size),
        "--format", args.format,
        "--output_dir", args.output_dir,
        "--iters", str(args.iters),
    ]

    result = subprocess.run(cmd)

    if result.returncode == 0:
        output_size_gb = safetensors_size_gb(args.output_dir)
        # Guard against a zero-size output dir to avoid ZeroDivisionError.
        compression = model_size_gb / output_size_gb if output_size_gb > 0 else 0

        print("\n" + "=" * 60)
        print("AutoRound quantization complete!")
        print(f"Output: {args.output_dir}")
        print(f"Output Size: {output_size_gb:.1f} GB ({compression:.1f}x compression)")
        print("=" * 60)
    else:
        print(f"\nERROR: AutoRound failed with code {result.returncode}")
        sys.exit(1)
|
|
|
|
# Script entry point: parse CLI arguments and run AutoRound quantization.
if __name__ == "__main__":
    main()
|
|