"""
Quantize and save VibeVoice model using bitsandbytes
Creates a pre-quantized model that can be shared and loaded directly
"""
|
|
import os
import json
import shutil
import torch
from pathlib import Path
from transformers import BitsAndBytesConfig
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
from transformers.utils import logging
from safetensors.torch import save_file
|
|
logging.set_verbosity_info()
|
|
def quantize_and_save_model(
    model_path: str,
    output_dir: str,
    bits: int = 4,
    quant_type: str = "nf4"
):
    """Quantize VibeVoice model and save it for distribution"""

    print(f"\n{'='*70}")
    print(f"VIBEVOICE QUANTIZATION - {bits}-bit ({quant_type})")
    print(f"{'='*70}")
    print(f"Source: {model_path}")
    print(f"Output: {output_dir}")
    print(f"{'='*70}\n")

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

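    # Build the bitsandbytes config. NF4 with nested ("double") quantization is
    # the usual QLoRA-style recipe: weights stored in 4-bit, compute in bfloat16.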
    if bits == 4:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type=quant_type
        )
    elif bits == 8:
        # BitsAndBytesConfig has no bnb_8bit_compute_dtype option;
        # 8-bit mode only needs the load flag
        bnb_config = BitsAndBytesConfig(load_in_8bit=True)
    else:
        raise ValueError(f"Unsupported bit width: {bits}")

| print("π§ Loading and quantizing model...") |
| |
| |
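    # bitsandbytes quantizes the weights on the fly as they are loaded onto
    # the GPU, so no separate calibration pass is needed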
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        model_path,
        quantization_config=bnb_config,
        device_map='cuda',
        torch_dtype=torch.bfloat16,
    )

    memory_gb = torch.cuda.memory_allocated() / 1e9
    print(f"💾 Quantized model memory usage: {memory_gb:.1f} GB")

| print("\nπ¦ Saving quantized model...") |
| |
| |
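    # Serializing 4-bit bitsandbytes checkpoints needs a reasonably recent
    # transformers/bitsandbytes; on versions without 4-bit serialization
    # support this raises, and the except branch falls back to a raw
    # state-dict dump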
    try:
        model.save_pretrained(
            output_path,
            safe_serialization=True,
            max_shard_size="5GB"
        )

        quant_config_dict = {
            "quantization_config": bnb_config.to_dict(),
            "quantization_method": "bitsandbytes",
            "bits": bits,
            "quant_type": quant_type
        }

        with open(output_path / "quantization_config.json", 'w') as f:
            json.dump(quant_config_dict, f, indent=2)

        print("✅ Model saved with integrated quantization")

    except Exception as e:
        print(f"⚠️ Standard save failed: {e}")
        print("Trying alternative save method...")

        save_quantized_state_dict(model, output_path, bnb_config)

| print("\nπ Copying processor files...") |
| processor = VibeVoiceProcessor.from_pretrained(model_path) |
| processor.save_pretrained(output_path) |
| |
| |
| for file in ["config.json", "generation_config.json"]: |
| src = Path(model_path) / file |
| if src.exists(): |
| shutil.copy2(src, output_path / file) |
| |
| |
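    # Embed the quantization settings directly in config.json so a plain
    # from_pretrained(model_path) reconstructs the quantized modules without an
    # explicit BitsAndBytesConfig (this is what test_quantized_model relies on)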
    config_path = output_path / "config.json"
    if config_path.exists():
        with open(config_path, 'r') as f:
            config = json.load(f)

        config["quantization_config"] = bnb_config.to_dict()
        config["_quantization_method"] = "bitsandbytes"

        with open(config_path, 'w') as f:
            json.dump(config, f, indent=2)

    print(f"\n✅ Quantized model saved to: {output_path}")

    create_loading_script(output_path, bits, quant_type)

    return output_path
|
|
def save_quantized_state_dict(model, output_path, bnb_config):
    """Alternative method to save quantized weights"""
    print("\n🔧 Saving quantized state dict...")

    state_dict = model.state_dict()

    quantized_state = {}
    metadata = {
        "quantized_modules": [],
        "quantization_config": bnb_config.to_dict()
    }

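    # bitsandbytes keeps 4-bit weights as packed uint8 tensors; entries that
    # carry a quant_state are recorded so a loader can tell them apart from
    # ordinary full-precision tensors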
    for name, param in state_dict.items():
        if hasattr(param, 'quant_state'):
            metadata["quantized_modules"].append(name)
            quantized_state[name] = param.data
        else:
            quantized_state[name] = param

    # safetensors metadata values must be strings, so serialize the dict to JSON
    save_file(
        quantized_state,
        str(output_path / "model.safetensors"),
        metadata={"quantization": json.dumps(metadata)},
    )

    with open(output_path / "quantization_metadata.json", 'w') as f:
        json.dump(metadata, f, indent=2)
|
|
def create_loading_script(output_path, bits, quant_type):
    """Create a script to load the quantized model"""

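    # The loader script is written next to the weights so users can run the
    # quantized model without reading this file; everything is interpolated
    # into the plain f-string template below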
    script_content = f'''#!/usr/bin/env python
"""
Load and use the {bits}-bit quantized VibeVoice model
"""

import torch
from transformers import BitsAndBytesConfig
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor

def load_quantized_model(model_path="{output_path}"):
    """Load the pre-quantized VibeVoice model"""

    print("Loading {bits}-bit quantized VibeVoice model...")

    # The model is already quantized, but we pass the matching config
    # to ensure proper loading of quantized weights
    bnb_config = BitsAndBytesConfig(
        load_in_{bits}bit=True,
        {"bnb_4bit_compute_dtype=torch.bfloat16," if bits == 4 else ""}
        {"bnb_4bit_use_double_quant=True," if bits == 4 else ""}
        {"bnb_4bit_quant_type='" + quant_type + "'," if bits == 4 else ""}
    )

    # Load processor
    processor = VibeVoiceProcessor.from_pretrained(model_path)

    # Load model
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        model_path,
        quantization_config=bnb_config,
        device_map='cuda',
        torch_dtype=torch.bfloat16,
    )

    model.eval()

    print("✅ Model loaded successfully!")
    print(f"💾 Memory usage: {{torch.cuda.memory_allocated() / 1e9:.1f}} GB")

    return model, processor

# Example usage
if __name__ == "__main__":
    model, processor = load_quantized_model()

    # Generate audio (generate is invoked the same way as in the
    # quantization script's self-test: tokenizer passed explicitly,
    # classifier-free guidance scale 1.3)
    text = "Speaker 1: Hello! Speaker 2: Hi there!"
    inputs = processor(
        text=[text],
        voice_samples=[["path/to/voice1.wav", "path/to/voice2.wav"]],
        padding=True,
        return_tensors="pt",
    )

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=1.3,
            tokenizer=processor.tokenizer,
        )

    # Save audio
    processor.save_audio(outputs.speech_outputs[0], "output.wav")
'''

    script_path = output_path / f"load_quantized_{bits}bit.py"
    with open(script_path, 'w') as f:
        f.write(script_content)

    print(f"📝 Created loading script: {script_path}")
|
|
def test_quantized_model(model_path):
    """Test loading and generating with the quantized model"""
    print(f"\n🧪 Testing quantized model from: {model_path}")

    try:
        processor = VibeVoiceProcessor.from_pretrained(model_path)

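        # No explicit BitsAndBytesConfig here: the quantization_config embedded
        # in the saved config.json is picked up automatically by from_pretrained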
        model = VibeVoiceForConditionalGenerationInference.from_pretrained(
            model_path,
            device_map='cuda',
            torch_dtype=torch.bfloat16,
        )

        print("✅ Model loaded successfully!")

        test_text = "Speaker 1: Testing quantized model. Speaker 2: It works!"
        print(f"\n🎤 Testing generation with: '{test_text}'")

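        # Voice prompts from the VibeVoice repo's demo assets; adjust these
        # paths to match your checkout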
        voices_dir = "/home/deveraux/Desktop/vibevoice/VibeVoice-main/demo/voices"
        speaker_voices = [
            os.path.join(voices_dir, "en-Alice_woman.wav"),
            os.path.join(voices_dir, "en-Carter_man.wav")
        ]

        inputs = processor(
            text=[test_text],
            voice_samples=[speaker_voices],
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )

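        # Deterministic decoding (do_sample=False) with classifier-free
        # guidance scale 1.3; the tokenizer is passed explicitly to generate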
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=None,
                cfg_scale=1.3,
                tokenizer=processor.tokenizer,
                generation_config={'do_sample': False},
            )

        print("✅ Generation successful!")

        output_path = Path(model_path) / "test_output.wav"
        processor.save_audio(outputs.speech_outputs[0], output_path=str(output_path))
        print(f"🎵 Test audio saved to: {output_path}")

        return True

    except Exception as e:
        print(f"❌ Test failed: {e}")
        return False
|
|
def main():
    import argparse
    parser = argparse.ArgumentParser(description="Quantize and save VibeVoice model")
    parser.add_argument("--model_path", default="/home/deveraux/Desktop/vibevoice/VibeVoice-Large-pt",
                        help="Path to the original model")
    parser.add_argument("--output_dir", default="/home/deveraux/Desktop/vibevoice/VibeVoice-Large-4bit",
                        help="Output directory for quantized model")
    parser.add_argument("--bits", type=int, default=4, choices=[4, 8],
                        help="Quantization bits (4 or 8)")
    parser.add_argument("--quant_type", default="nf4", choices=["nf4", "fp4"],
                        help="4-bit quantization type")
    parser.add_argument("--test", action="store_true",
                        help="Test the quantized model after saving")

    args = parser.parse_args()

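    # Keep the bit width in the output directory name (e.g. "...-8bit" when
    # --bits 8 is used with the default 4-bit path)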
    if str(args.bits) not in args.output_dir:
        args.output_dir = args.output_dir.replace("4bit", f"{args.bits}bit")

    output_path = quantize_and_save_model(
        args.model_path,
        args.output_dir,
        args.bits,
        args.quant_type
    )

    if args.test:
        test_quantized_model(output_path)

    print(f"\n🎉 Done! Quantized model ready for distribution at: {output_path}")
    print("\n📦 To share this model:")
    print(f"1. Upload the entire '{output_path}' directory")
    print("2. Users can load it with the provided script or directly with transformers")
    print(f"3. The model will load in {args.bits}-bit without additional quantization")
|
|
if __name__ == "__main__":
    main()