| | --- |
| | library_name: transformers |
| | tags: [] |
| | --- |
| | |
| | # Model Card for Model ID |
| | ## Command |
| |
|
| | `python ao/prep_model.py --quant_type fp8 --push_to_hub True ` |
| | ### Script to generate |
| |
|
| |
|
| | ```py |
| | #!/usr/bin/env python3 |
| | # SPDX-License-Identifier: Apache-2.0 |
| | |
| | """ |
| | Script for quantizing LLM models with TorchAO. |
| | Supports various quantization configurations and model types. |
| | """ |
| | |
| | import random |
| | import numpy as np |
| | import torch |
| | import time |
| | from pathlib import Path |
| | from typing import Optional, Literal |
| | |
| | from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer |
| | from transformer_nuggets.utils.benchmark import benchmark_cuda_function_in_microseconds |
| | from torchao.quantization.quant_api import ( |
| | Float8DynamicActivationFloat8WeightConfig, |
| | Int4WeightOnlyConfig, |
| | Int8WeightOnlyConfig, |
| | Int8DynamicActivationInt8WeightConfig, |
| | PerRow, |
| | PerTensor, |
| | GemliteUIntXWeightOnlyConfig, |
| | Int4DynamicActivationInt4WeightConfig, |
| | Int8DynamicActivationInt4WeightConfig, |
| | CutlassInt4PackedLayout, |
| | ) |
| | from torchao.prototype.mx_formats.mx_subclass import MXFPInferenceConfig |
| | from torchao.prototype.mx_formats import MXGemmKernelChoice |
| | from jsonargparse import CLI, Namespace |
| | from rich import print |
| | |
| | |
def set_seed(seed):
    """Seed every RNG used here (python, numpy, torch CPU and CUDA) for reproducibility."""
    for seeder in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seeder(seed)
| | |
| | |
def get_quantization_config(args):
    """Translate parsed CLI args into a TorchAoConfig.

    Args:
        args: Namespace with at least ``quant_type``, ``granularity`` and
            ``min_sqnr`` attributes.

    Raises:
        KeyError: if ``args.granularity`` is not "per_row"/"per_tensor".
        ValueError: if ``args.quant_type`` is not a supported quantization type.
    """
    # Granularity is resolved up front, even for quant types that ignore it,
    # so an invalid granularity always fails fast.
    gran = {"per_row": PerRow(), "per_tensor": PerTensor()}[args.granularity]

    qt = args.quant_type
    if qt == "autoquant":
        return TorchAoConfig("autoquant", min_sqnr=args.min_sqnr)
    if qt == "fp8":
        return TorchAoConfig(
            Float8DynamicActivationFloat8WeightConfig(granularity=gran)
        )
    if qt == "int4_weight_only":
        return TorchAoConfig(Int4WeightOnlyConfig(group_size=128))
    if qt == "int8_weight_only":
        return TorchAoConfig(Int8WeightOnlyConfig())
    if qt == "int8_dynamic_act_int8_weight":
        return TorchAoConfig(Int8DynamicActivationInt8WeightConfig())
    if qt == "gemlite":
        return TorchAoConfig(GemliteUIntXWeightOnlyConfig())
    if qt == "A4W4":
        return TorchAoConfig(Int4DynamicActivationInt4WeightConfig())
    if qt == "A8W4":
        return TorchAoConfig(
            Int8DynamicActivationInt4WeightConfig(layout=CutlassInt4PackedLayout())
        )
    if qt == "mxfp8":
        return TorchAoConfig(MXFPInferenceConfig())
    if qt == "mxfp4":
        return TorchAoConfig(
            MXFPInferenceConfig(
                activation_dtype=torch.float4_e2m1fn_x2,
                weight_dtype=torch.float4_e2m1fn_x2,
                block_size=32,
                gemm_kernel_choice=MXGemmKernelChoice.CUTLASS,
            )
        )
    raise ValueError(f"Unsupported quantization type: {args.quant_type}")
| | |
| | |
def benchmark_model(model, input_ids, max_new_tokens, name=""):
    """Benchmark generation speed of *model* and print a throughput report.

    Args:
        model: A causal-LM exposing ``.generate(**inputs, ...)``.
        input_ids: Tokenizer output (mapping of tensors) expanded into generate.
        max_new_tokens: Number of new tokens to generate.
        name: Label used in the printed report.

    Returns:
        Elapsed generation time in milliseconds.
    """

    def _report(elapsed_ms):
        # Single place for the elapsed-time / tokens-per-second report,
        # shared by the fast path and the fallback.
        tokens_per_second = max_new_tokens / (elapsed_ms / 1000)
        print(
            f"{name} model: {elapsed_ms:.2f}ms for {max_new_tokens} tokens ({tokens_per_second:.2f} tokens/sec)"
        )
        return elapsed_ms

    try:
        # NOTE(review): the helper's name says *microseconds* but the value is
        # treated as milliseconds below — confirm which unit it actually returns.
        time_ms = benchmark_cuda_function_in_microseconds(
            model.generate,
            **input_ids,
            max_new_tokens=max_new_tokens,
            cache_implementation="static",
        )
        return _report(time_ms)
    except ImportError:
        # Fallback to simple wall-clock timing if the benchmark helper is unavailable.
        print("benchmark_cuda_function_in_microseconds not available, using simple timing")
        start = time.time()
        model.generate(
            **input_ids, max_new_tokens=max_new_tokens, cache_implementation="static"
        )
        return _report((time.time() - start) * 1000)  # ms
| | |
| | |
def main(
    model_name: str = "facebook/opt-125m",
    output_dir: Optional[str] = None,
    push_to_hub: bool = False,
    quant_type: Literal[
        "fp8",
        "int4_weight_only",
        "int8_weight_only",
        "int8_dynamic_act_int8_weight",
        "autoquant",
        "gemlite",
        "A4W4",
        "A8W4",
        # Fixed: the list previously repeated "fp8" here and omitted "mxfp8",
        # so a quant type supported by get_quantization_config was CLI-rejected.
        "mxfp8",
        "mxfp4",
    ] = "fp8",
    granularity: Literal["per_row", "per_tensor"] = "per_row",
    min_sqnr: Optional[float] = None,
    max_new_tokens: int = 64,
    benchmark: bool = False,
    bench_tokens: int = 100,
    device_map: str = "cuda",
):
    """
    Quantize a model with TorchAO and test its performance.

    Args:
        model_name: Model to quantize (e.g., meta-llama/Meta-Llama-3-8B, facebook/opt-125m)
        output_dir: Directory to save the quantized model
            (defaults to ``data/{quant_type}-{model base name}``)
        push_to_hub: Whether to push the quantized model and tokenizer to the HuggingFace Hub
        quant_type: Quantization type to use
        granularity: Quantization granularity ("per_row" or "per_tensor")
        min_sqnr: Minimum SQNR for autoquant
        max_new_tokens: Max tokens to generate for testing
        benchmark: Run benchmarking comparison against the BF16 original
        bench_tokens: Number of tokens to generate for benchmarking
        device_map: Device mapping strategy
    """
    # Set seed before creating the model so quantization/generation is reproducible.
    set_seed(42)

    # Default output directory is derived from the quant type and model base name.
    if output_dir is None:
        model_base_name = model_name.split("/")[-1]
        output_dir = f"data/{quant_type}-{model_base_name}"

    # Bundle parameters into a Namespace so the rest of the code reads args.*.
    args = Namespace(
        model_name=model_name,
        output_dir=output_dir,
        push_to_hub=push_to_hub,
        quant_type=quant_type,
        granularity=granularity,
        min_sqnr=min_sqnr,
        max_new_tokens=max_new_tokens,
        benchmark=benchmark,
        bench_tokens=bench_tokens,
        device_map=device_map,
    )
    print(f"Using Model name: {args.model_name}")
    print(f"Quantization type: {args.quant_type}")

    # Create output directory
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Get quantization config
    quantization_config = get_quantization_config(args)

    # Load and quantize model (quantization happens during from_pretrained).
    print("Loading and quantizing model...")
    quantized_model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        torch_dtype="bfloat16",
        device_map=args.device_map,
        quantization_config=quantization_config,
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    # Test prompts
    prompts = [
        "Why is Pytorch 2.0 the best machine learning compiler?",
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    # Smoke-test generation with the freshly quantized model.
    print("\nTesting quantized model generation...")
    input_ids = tokenizer(prompts, return_tensors="pt", padding=True).to(
        quantized_model.device
    )
    outputs = quantized_model.generate(**input_ids, max_new_tokens=args.max_new_tokens)

    for prompt, output in zip(prompts, outputs):
        generated_text = tokenizer.decode(output, skip_special_tokens=True)
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    # Save quantized model (safe_serialization=False: TorchAO tensor subclasses
    # are saved via torch.save rather than safetensors).
    print(f"\n📁Saving quantized model to: {output_dir}")
    quantized_model.save_pretrained(output_dir, safe_serialization=False)
    tokenizer.save_pretrained(output_dir)

    # Push to HuggingFace hub if requested
    if args.push_to_hub:
        # Get model name from output_dir
        model_name = output_dir.name
        hub_path = f"drisspg/ao_models/{model_name}"
        print(f"Pushing model to HuggingFace Hub: {hub_path}")
        # NOTE(review): the printed hub_path is not what is pushed — push_to_hub
        # receives only the bare model_name; confirm the intended repo id.
        quantized_model.push_to_hub(model_name, safe_serialization=False)
        tokenizer.push_to_hub(model_name)

    # Round-trip check: reload the saved artifact to verify it deserializes.
    print("\nLoading saved quantized model to verify...")
    loaded_model = AutoModelForCausalLM.from_pretrained(
        output_dir, device_map=args.device_map, torch_dtype="auto"
    )

    # Test loaded model with first prompt
    test_prompt = prompts[0]
    input_ids = tokenizer(test_prompt, return_tensors="pt").to(loaded_model.device)
    output = loaded_model.generate(**input_ids, max_new_tokens=args.max_new_tokens)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"Verification - Prompt: {test_prompt!r}, Generated text: {generated_text!r}")

    # Benchmark if requested
    if args.benchmark:
        print("\nBenchmarking models...")
        # Benchmark quantized model
        print("Benchmarking quantized model:")
        quant_time = benchmark_model(
            loaded_model, input_ids, args.bench_tokens, f"Quantized ({args.quant_type})"
        )

        # Load and benchmark original model in BF16
        print("\nLoading original model in BF16 for comparison...")
        bf16_model = AutoModelForCausalLM.from_pretrained(
            args.model_name, device_map=args.device_map, torch_dtype=torch.bfloat16
        )

        # Benchmark original model
        print("Benchmarking original BF16 model:")
        bf16_time = benchmark_model(bf16_model, input_ids, args.bench_tokens, "BF16")

        # Guard against division by zero when timing is degenerate.
        speedup = bf16_time / quant_time if quant_time > 0 else 0
        print(f"\nSpeedup: {speedup:.2f}x")

    print("\nQuantization process completed successfully.")
| | |
| | |
if __name__ == "__main__":
    # jsonargparse turns main()'s signature into command-line flags.
    CLI(main)
| | |
| | ``` |