import os
import gc
import sys
import time
import logging
import traceback
import torch
import warnings
from transformers import AutoModelForCausalLM, AutoTokenizer
from onnxruntime.quantization import quantize_dynamic, QuantType


logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)

# Silence noisy-but-harmless warnings emitted during ONNX tracing.
warnings.filterwarnings("ignore", category=UserWarning, message=".*The shape of the input dimension.*")
warnings.filterwarnings("ignore", category=UserWarning, message=".*Converting a tensor to a Python.*")

# Small causal LMs with a track record of clean ONNX exports.
RELIABLE_MODELS = [
    {
        "id": "facebook/opt-350m",
        "description": "Well-balanced model (350M) for RAG and chatbots"
    },
    {
        "id": "gpt2",
        "description": "Very reliable model (124M) with excellent ONNX compatibility"
    },
    {
        "id": "distilgpt2",
        "description": "Lightweight model (82M) with good performance"
    }
]


class ModelWrapper(torch.nn.Module):
    """
    Wrapper to handle ONNX export compatibility issues.
    This wrapper specifically:
    1. Disables the key/value cache, whose variable-shaped outputs break tracing
    2. Simplifies the forward pass to return a single static logits tensor
    """
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids):
        # use_cache=False and return_dict=False keep the traced graph free of
        # dynamic cache tensors; [0] selects the logits.
        with torch.no_grad():
            return self.model(input_ids=input_ids, use_cache=False, return_dict=False)[0]
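
# Illustrative sketch (not executed here): the wrapper turns the HF model into
# a plain tensor-in/tensor-out module, which torch.onnx.export traces reliably:
#
#     wrapped = ModelWrapper(AutoModelForCausalLM.from_pretrained("distilgpt2")).eval()
#     logits = wrapped(torch.ones(1, 8, dtype=torch.long))  # shape (1, 8, vocab_size)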


def convert_model(model_id, output_dir, quantize=True):
    """Convert a model to ONNX format with maximum compatibility."""
    start_time = time.time()

    logger.info(f"\n{'=' * 60}")
    logger.info(f"Converting {model_id} to ONNX")
    logger.info(f"{'=' * 60}")

    # Each model gets its own subdirectory, named after the checkpoint.
    model_name = model_id.split("/")[-1]
    model_dir = os.path.join(output_dir, model_name)
    os.makedirs(model_dir, exist_ok=True)
    try:
        logger.info("Step 1/5: Loading tokenizer...")

        tokenizer = AutoTokenizer.from_pretrained(model_id)

        # Decoder-only checkpoints often ship without a pad token; reuse EOS.
        if tokenizer.pad_token is None and tokenizer.eos_token is not None:
            logger.info("Adding pad_token = eos_token")
            tokenizer.pad_token = tokenizer.eos_token

        tokenizer.save_pretrained(model_dir)
        logger.info(f"✓ Tokenizer saved to {model_dir}")

        logger.info("Step 2/5: Loading model with memory optimizations...")

        # Free as much memory as possible before the weights are loaded.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        # Load in full precision: half-precision kernels are unreliable for
        # CPU-side ONNX tracing, and the int8 quantization step below
        # recovers the size savings anyway.
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True
        )

        model.config.save_pretrained(model_dir)
        logger.info(f"✓ Model config saved to {model_dir}")

        logger.info("Step 3/5: Preparing for export...")

        # Wrap the model so the traced forward pass is cache-free.
        wrapped_model = ModelWrapper(model)
        wrapped_model.eval()

        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        logger.info("Step 4/5: Exporting to ONNX format...")
        onnx_path = os.path.join(model_dir, "model.onnx")

        # A tiny dummy batch is enough to trace the graph; the dynamic axes
        # below keep the exported shapes flexible.
        batch_size = 1
        seq_length = 8
        dummy_input = torch.ones(batch_size, seq_length, dtype=torch.long)

        torch.onnx.export(
            wrapped_model,
            dummy_input,
            onnx_path,
            export_params=True,
            opset_version=14,
            do_constant_folding=True,
            input_names=['input_ids'],
            output_names=['logits'],
            dynamic_axes={
                'input_ids': {0: 'batch_size', 1: 'sequence'},
                'logits': {0: 'batch_size', 1: 'sequence'}
            }
        )
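        # With the dynamic axes above, the traced (1, 8) dummy shape is not
        # baked into the graph: any (batch, sequence) input is accepted.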

        # Release the PyTorch model before the optional quantization pass.
        del model
        del wrapped_model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
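
        # Optional sanity check (a minimal sketch): run the exported graph once
        # with ONNX Runtime; the package is already installed if the quantization
        # import at the top succeeded. Failure here is logged but non-fatal.
        try:
            import onnxruntime as ort
            session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
            (onnx_logits,) = session.run(None, {"input_ids": dummy_input.numpy()})
            logger.info(f"✓ Sanity check passed, logits shape: {onnx_logits.shape}")
        except Exception as sanity_err:
            logger.warning(f"⚠ Sanity check failed (non-fatal): {sanity_err}")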

        # Verify the export actually produced a file before continuing.
        if os.path.exists(onnx_path):
            size_mb = os.path.getsize(onnx_path) / (1024 * 1024)
            logger.info(f"✓ ONNX model saved to {onnx_path}")
            logger.info(f"✓ Original size: {size_mb:.2f} MB")

            if quantize:
                logger.info("Step 5/5: Applying int8 quantization...")
                quant_path = onnx_path.replace(".onnx", "_quantized.onnx")

                try:
                    # Dynamic quantization: int8 weights on disk, float
                    # activations computed at runtime.
                    quantize_dynamic(
                        model_input=onnx_path,
                        model_output=quant_path,
                        per_channel=False,
                        reduce_range=False,
                        weight_type=QuantType.QInt8
                    )

                    if os.path.exists(quant_path):
                        quant_size = os.path.getsize(quant_path) / (1024 * 1024)
                        logger.info(f"✓ Quantized size: {quant_size:.2f} MB")
                        logger.info(f"✓ Size reduction: {(1 - quant_size/size_mb) * 100:.1f}%")

                        # Keep a single model.onnx so downstream loaders need
                        # no special-casing.
                        os.replace(quant_path, onnx_path)
                        logger.info("✓ Replaced original with quantized version")
                    else:
                        logger.warning("⚠ Quantized file not created, using original")
                except Exception as e:
                    logger.error(f"⚠ Quantization error: {str(e)}")
                    logger.info("⚠ Using original model without quantization")
            else:
                logger.info("Step 5/5: Skipping quantization (not requested)")

            end_time = time.time()
            duration = end_time - start_time
            logger.info(f"✓ Conversion completed in {duration:.2f} seconds")

            return {
                "success": True,
                "model_id": model_id,
                "size_mb": os.path.getsize(onnx_path) / (1024 * 1024),
                "duration_seconds": duration,
                "output_dir": model_dir
            }
        else:
            logger.error(f"× ONNX file not created at {onnx_path}")
            return {
                "success": False,
                "model_id": model_id,
                "error": "ONNX file not created"
            }

    except Exception as e:
        logger.error(f"× Error converting model: {str(e)}")
        logger.error(traceback.format_exc())

        return {
            "success": False,
            "model_id": model_id,
            "error": str(e)
        }


def main():
    """Convert all reliable models."""

    logger.info("\nGUARANTEED ONNX CONVERTER")
    logger.info("=========================")
    logger.info("Using reliable models with proven ONNX compatibility")

    output_dir = "./onnx_models"
    os.makedirs(output_dir, exist_ok=True)

    # A model id on the command line overrides the built-in list.
    if len(sys.argv) > 1:
        model_id = sys.argv[1]
        logger.info(f"Converting single model: {model_id}")
        convert_model(model_id, output_dir)
        return

    results = []
    for model_info in RELIABLE_MODELS:
        model_id = model_info["id"]
        logger.info(f"Processing model: {model_id}")
        logger.info(f"Description: {model_info['description']}")

        result = convert_model(model_id, output_dir)
        results.append(result)

    logger.info("\n" + "=" * 60)
    logger.info("CONVERSION SUMMARY")
    logger.info("=" * 60)

    success_count = 0
    for result in results:
        if result.get("success", False):
            success_count += 1
            size_info = f" - Size: {result.get('size_mb', 0):.2f} MB"
            time_info = f" - Time: {result.get('duration_seconds', 0):.2f}s"
            logger.info(f"✓ SUCCESS: {result['model_id']}{size_info}{time_info}")
        else:
            logger.info(f"× FAILED: {result['model_id']} - Error: {result.get('error', 'Unknown error')}")

    logger.info(f"\nSuccessfully converted {success_count}/{len(RELIABLE_MODELS)} models")
    logger.info(f"Models saved to: {os.path.abspath(output_dir)}")

    if success_count > 0:
        logger.info("\nThe models are ready for RAG and chatbot applications!")


if __name__ == "__main__":
    main()