# FastVLM_SANA / ml-stable-diffusion / convert_vae_to_mlx.py
# Uploaded by Fahad-S via huggingface_hub (commit 5723b4b, verified)
#!/usr/bin/env python3
"""
Convert Sana DC-AE VAE from PyTorch/Diffusers to MLX format
This avoids Core ML conversion issues by using MLX, Apple's optimized
framework for Apple Silicon.
Usage:
python convert_vae_to_mlx.py \
--model-version Efficient-Large-Model/Sana_600M_512px_diffusers \
--output sana_vae_mlx.npz
"""
import argparse
import json
import numpy as np
from pathlib import Path
import torch
def get_arguments():
    """Build the CLI parser and return the parsed command-line options."""
    parser = argparse.ArgumentParser()
    # Table of (flag, add_argument kwargs) so the option set reads at a glance.
    option_table = [
        (
            "--model-version",
            {
                "default": "Efficient-Large-Model/Sana_600M_512px_diffusers",
                "help": "Sana model from Hugging Face",
            },
        ),
        (
            "--output",
            {
                "required": True,
                "help": "Output .npz file for MLX weights",
            },
        ),
        (
            "--component",
            {
                "choices": ["decoder", "encoder", "both"],
                "default": "decoder",
                "help": "Which component to convert",
            },
        ),
    ]
    for flag, kwargs in option_table:
        parser.add_argument(flag, **kwargs)
    return parser.parse_args()
def convert_pytorch_to_mlx(pytorch_weights, prefix="decoder."):
    """
    Convert PyTorch weights to MLX format.

    Keeps only entries whose key starts with *prefix* (the prefix is stripped
    from the output keys). Conv kernels are moved from PyTorch's
    (out_c, in_c, h, w) layout to MLX's channels-last (out_c, h, w, in_c);
    linear weights already share the (out, in) layout, so they pass through
    unchanged. All values are cast to float32 numpy arrays.
    """
    converted = {}
    skip = len(prefix)
    for full_key, tensor in pytorch_weights.items():
        if not full_key.startswith(prefix):
            continue
        short_key = full_key[skip:]
        # Torch tensors are detached to numpy; anything else is assumed
        # to already be array-like.
        array = tensor.cpu().numpy() if isinstance(tensor, torch.Tensor) else tensor
        # NOTE(review): conv kernels are detected by a name-substring
        # heuristic; this assumes every 4-D conv weight key contains
        # "conv" -- confirm against the DC-AE checkpoint's key names.
        if array.ndim == 4 and "conv" in short_key:
            array = array.transpose(0, 2, 3, 1)
        converted[short_key] = array.astype(np.float32)
    return converted
def main():
    """Download the Sana VAE, convert its weights to MLX layout, save as .npz.

    Steps: fetch only the ``vae/`` subfolder from the Hugging Face Hub, load
    its config and PyTorch weights (safetensors preferred, .bin fallback),
    convert the requested component(s) with ``convert_pytorch_to_mlx``, then
    bundle the converted tensors plus the JSON config into a single .npz.

    Raises:
        FileNotFoundError: if no VAE weights file exists in the download.
    """
    args = get_arguments()
    print("=" * 80)
    print("Converting Sana VAE to MLX Format")
    print("=" * 80)
    print()

    # Download model (only the VAE subfolder, to keep the transfer small).
    print(f"Downloading model: {args.model_version}")
    from huggingface_hub import snapshot_download
    local_path = snapshot_download(
        repo_id=args.model_version,
        allow_patterns=["vae/*"],
    )
    # FIX: the checkmark glyphs below were double-encoded mojibake ("βœ“");
    # restored to the intended U+2713.
    print(f"✓ Downloaded to: {local_path}")
    print()

    # Load config
    config_path = Path(local_path) / "vae" / "config.json"
    with open(config_path) as f:
        config = json.load(f)
    print("VAE Configuration:")
    print(f" Latent channels: {config.get('latent_channels', 32)}")
    print(f" Scaling factor: {config.get('scaling_factor', 1.0)}")
    print(f" Block channels: {config.get('block_out_channels')}")
    print()

    # Load PyTorch weights: prefer safetensors, fall back to the .bin format.
    weights_path = Path(local_path) / "vae" / "diffusion_pytorch_model.safetensors"
    if not weights_path.exists():
        weights_path = Path(local_path) / "vae" / "diffusion_pytorch_model.bin"
    if not weights_path.exists():
        # Fail loudly here instead of letting torch.load raise an opaque error.
        raise FileNotFoundError(
            f"No VAE weights found under {Path(local_path) / 'vae'}"
        )
    print(f"Loading weights from: {weights_path.name}")
    if weights_path.suffix == ".safetensors":
        from safetensors.torch import load_file
        pytorch_weights = load_file(str(weights_path))
    else:
        pytorch_weights = torch.load(weights_path, map_location="cpu")
    print(f"✓ Loaded {len(pytorch_weights)} weight tensors")
    print()

    # Convert weights for the requested component(s); output keys are
    # re-prefixed so decoder/encoder can coexist in one archive.
    output_weights = {}
    if args.component in ["decoder", "both"]:
        print("Converting decoder weights...")
        decoder_weights = convert_pytorch_to_mlx(pytorch_weights, "decoder.")
        print(f" ✓ Converted {len(decoder_weights)} decoder weights")
        output_weights.update({f"decoder.{k}": v for k, v in decoder_weights.items()})
    if args.component in ["encoder", "both"]:
        print("Converting encoder weights...")
        encoder_weights = convert_pytorch_to_mlx(pytorch_weights, "encoder.")
        print(f" ✓ Converted {len(encoder_weights)} encoder weights")
        output_weights.update({f"encoder.{k}": v for k, v in encoder_weights.items()})
    print()

    # Add config to weights (np.savez stores the JSON string as a 0-d array).
    output_weights["config"] = json.dumps(config)

    # Save MLX weights
    print(f"Saving MLX weights to: {args.output}")
    np.savez(args.output, **output_weights)
    print("✓ Conversion complete!")
    print()
    print("=" * 80)
    print("Usage Example:")
    print("=" * 80)
    print()
    print("import mlx.core as mx")
    print("from sana_vae_mlx import DCAEDecoder")
    print()
    print(f'weights = np.load("{args.output}")')
    print("decoder = DCAEDecoder(...)")
    print("decoder.load_weights([(k, mx.array(v)) for k, v in weights.items()])")
    print()
    print("# Or use the built-in loader:")
    print(f'decoder = DCAEDecoder.from_pretrained("{args.model_version}")')
    print()
    print("# Decode latents")
    print("latents = mx.random.normal((1, 32, 16, 16)) # [B, C, H, W]")
    print("image = decoder.decode(latents) # [B, 512, 512, 3]")
    print()

    # Print size info (arrays only; the embedded config string has no nbytes).
    total_size = sum(v.nbytes for v in output_weights.values() if isinstance(v, np.ndarray))
    print(f"Total size: {total_size / 1024 / 1024:.1f} MB")


if __name__ == "__main__":
    main()