Upload 8-bit quantized VibeVoice-Large

Browse files

Files changed (9) hide show

config.json +132 -0
generation_config.json +4 -0
load_quantized_8bit.py +60 -0
model-00001-of-00003.safetensors +3 -0
model-00002-of-00003.safetensors +3 -0
model-00003-of-00003.safetensors +3 -0
model.safetensors.index.json +0 -0
preprocessor_config.json +12 -0
quantization_config.json +20 -0

config.json ADDED Viewed

	@@ -0,0 +1,132 @@

+{
+  "acostic_vae_dim": 64,
+  "acoustic_tokenizer_config": {
+    "causal": true,
+    "channels": 1,
+    "conv_bias": true,
+    "conv_norm": "none",
+    "corpus_normalize": 0.0,
+    "decoder_depths": null,
+    "decoder_n_filters": 32,
+    "decoder_ratios": [
+      8,
+      5,
+      5,
+      4,
+      2,
+      2
+    ],
+    "disable_last_norm": true,
+    "encoder_depths": "3-3-3-3-3-3-8",
+    "encoder_n_filters": 32,
+    "encoder_ratios": [
+      8,
+      5,
+      5,
+      4,
+      2,
+      2
+    ],
+    "fix_std": 0.5,
+    "layer_scale_init_value": 1e-06,
+    "layernorm": "RMSNorm",
+    "layernorm_elementwise_affine": true,
+    "layernorm_eps": 1e-05,
+    "mixer_layer": "depthwise_conv",
+    "model_type": "vibevoice_acoustic_tokenizer",
+    "pad_mode": "constant",
+    "std_dist_type": "gaussian",
+    "vae_dim": 64,
+    "weight_init_value": 0.01
+  },
+  "architectures": [
+    "VibeVoiceForConditionalGeneration"
+  ],
+  "decoder_config": {
+    "attention_dropout": 0.0,
+    "hidden_act": "silu",
+    "hidden_size": 3584,
+    "initializer_range": 0.02,
+    "intermediate_size": 18944,
+    "max_position_embeddings": 32768,
+    "max_window_layers": 28,
+    "model_type": "qwen2",
+    "num_attention_heads": 28,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 4,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 1000000.0,
+    "sliding_window": null,
+    "torch_dtype": "bfloat16",
+    "use_cache": true,
+    "use_mrope": false,
+    "use_sliding_window": false,
+    "vocab_size": 152064
+  },
+  "diffusion_head_config": {
+    "ddpm_batch_mul": 4,
+    "ddpm_beta_schedule": "cosine",
+    "ddpm_num_inference_steps": 20,
+    "ddpm_num_steps": 1000,
+    "diffusion_type": "ddpm",
+    "head_ffn_ratio": 3.0,
+    "head_layers": 4,
+    "hidden_size": 3584,
+    "latent_size": 64,
+    "model_type": "vibevoice_diffusion_head",
+    "prediction_type": "v_prediction",
+    "rms_norm_eps": 1e-05,
+    "speech_vae_dim": 64
+  },
+  "model_type": "vibevoice",
+  "semantic_tokenizer_config": {
+    "causal": true,
+    "channels": 1,
+    "conv_bias": true,
+    "conv_norm": "none",
+    "corpus_normalize": 0.0,
+    "disable_last_norm": true,
+    "encoder_depths": "3-3-3-3-3-3-8",
+    "encoder_n_filters": 32,
+    "encoder_ratios": [
+      8,
+      5,
+      5,
+      4,
+      2,
+      2
+    ],
+    "fix_std": 0,
+    "layer_scale_init_value": 1e-06,
+    "layernorm": "RMSNorm",
+    "layernorm_elementwise_affine": true,
+    "layernorm_eps": 1e-05,
+    "mixer_layer": "depthwise_conv",
+    "model_type": "vibevoice_semantic_tokenizer",
+    "pad_mode": "constant",
+    "std_dist_type": "none",
+    "vae_dim": 128,
+    "weight_init_value": 0.01
+  },
+  "semantic_vae_dim": 128,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "quantization_config": {
+    "quant_method": "bitsandbytes",
+    "_load_in_8bit": true,
+    "_load_in_4bit": false,
+    "llm_int8_threshold": 6.0,
+    "llm_int8_skip_modules": null,
+    "llm_int8_enable_fp32_cpu_offload": false,
+    "llm_int8_has_fp16_weight": false,
+    "bnb_4bit_quant_type": "fp4",
+    "bnb_4bit_use_double_quant": false,
+    "bnb_4bit_compute_dtype": "float32",
+    "bnb_4bit_quant_storage": "uint8",
+    "load_in_4bit": false,
+    "load_in_8bit": true
+  },
+  "_quantization_method": "bitsandbytes"
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "_from_model_config": true,
+  "transformers_version": "4.52.4"
+}

load_quantized_8bit.py ADDED Viewed

	@@ -0,0 +1,60 @@

+#!/usr/bin/env python
+"""
+Load and use the 8-bit quantized VibeVoice model
+"""
+import torch
+from transformers import BitsAndBytesConfig
+from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
+from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
+def load_quantized_model(model_path="quantized_8bit"):
+    """Load the pre-quantized VibeVoice model"""
+    print("Loading 8-bit quantized VibeVoice model...")
+    # The model is already quantized, but we need to specify the config
+    # to ensure proper loading of quantized weights
+    bnb_config = BitsAndBytesConfig(
+        load_in_8bit=True,
+        bnb_8bit_compute_dtype=torch.bfloat16,
+    )
+    # Load processor
+    processor = VibeVoiceProcessor.from_pretrained(model_path)
+    # Load model
+    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
+        model_path,
+        quantization_config=bnb_config,
+        device_map='cuda',
+        torch_dtype=torch.bfloat16,
+    )
+    model.eval()
+    print("✅ Model loaded successfully!")
+    print(f"💾 Memory usage: {torch.cuda.memory_allocated() / 1e9:.1f} GB")
+    return model, processor
+# Example usage
+if __name__ == "__main__":
+    model, processor = load_quantized_model()
+    # Generate audio
+    text = "Speaker 1: Hello! Speaker 2: Hi there!"
+    inputs = processor(
+        text=[text],
+        voice_samples=[["path/to/voice1.wav", "path/to/voice2.wav"]],
+        padding=True,
+        return_tensors="pt",
+    )
+    with torch.no_grad():
+        outputs = model.generate(**inputs)
+    # Save audio
+    processor.save_audio(outputs.speech_outputs[0], "output.wav")

model-00001-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:68f98075dac463766219e6e61ff5fe9ab969f8fea621a65906f1d6793f2eaf72
+size 4987685394

model-00002-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:48940fb59366de226af5df46020f022d4d651f4563f190142c175b5bf733e9c7
+size 4489976774

model-00003-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d83c0514c0c9d2675cb4d51ee56b12515ea45770ce35acc5ab0ec4bc7d1bef73
+size 1089994880

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "processor_class": "VibeVoiceProcessor",
+  "speech_tok_compress_ratio": 3200,
+  "db_normalize": true,
+  "audio_processor": {
+    "feature_extractor_type": "VibeVoiceTokenizerProcessor",
+    "sampling_rate": 24000,
+    "normalize_audio": true,
+    "target_dB_FS": -25,
+    "eps": 1e-06
+  }
+}

quantization_config.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "quantization_config": {
+    "quant_method": "bitsandbytes",
+    "_load_in_8bit": true,
+    "_load_in_4bit": false,
+    "llm_int8_threshold": 6.0,
+    "llm_int8_skip_modules": null,
+    "llm_int8_enable_fp32_cpu_offload": false,
+    "llm_int8_has_fp16_weight": false,
+    "bnb_4bit_quant_type": "fp4",
+    "bnb_4bit_use_double_quant": false,
+    "bnb_4bit_compute_dtype": "float32",
+    "bnb_4bit_quant_storage": "uint8",
+    "load_in_4bit": false,
+    "load_in_8bit": true
+  },
+  "quantization_method": "bitsandbytes",
+  "bits": 8,
+  "quant_type": "int8"
+}