|
|
|
|
|
"""
|
|
|
Simple example of using the pre-quantized VibeVoice model
|
|
|
No need for on-the-fly quantization - loads much faster!
|
|
|
"""
|
|
|
|
|
|
import os
|
|
|
import torch
|
|
|
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
|
|
|
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
|
|
|
|
|
|
def main(
    model_path: str = "/home/deveraux/Desktop/vibevoice/VibeVoice-Large-4bit",
    voices_dir: str = "/home/deveraux/Desktop/vibevoice/VibeVoice-main/demo/voices",
    output_path: str = "quantized_output.wav",
) -> None:
    """Load a pre-quantized VibeVoice 4-bit model and synthesize a two-speaker clip.

    Args:
        model_path: Directory containing the pre-quantized 4-bit model weights.
        voices_dir: Directory holding the reference voice WAV files.
        output_path: Path where the generated audio is written.

    Requires a CUDA device (the model is placed on it via ``device_map='cuda'``).
    """
    print("Loading pre-quantized VibeVoice 4-bit model...")

    # Processor handles text tokenization and voice-sample preparation.
    processor = VibeVoiceProcessor.from_pretrained(model_path)

    # Weights are already quantized on disk, so no on-the-fly quantization
    # happens here; bfloat16 is used for the remaining compute tensors.
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        model_path,
        device_map='cuda',
        torch_dtype=torch.bfloat16,
    )
    model.eval()

    memory_gb = torch.cuda.memory_allocated() / 1e9
    print(f"✅ Model loaded! Memory usage: {memory_gb:.1f} GB")

    # "Speaker N:" prefixes route each line to the matching reference voice.
    text = "Speaker 1: Welcome to our podcast! Speaker 2: Thanks for having me!"

    # Order matters: index i corresponds to "Speaker i+1" in the script above.
    speaker_voices = [
        os.path.join(voices_dir, "en-Alice_woman.wav"),
        os.path.join(voices_dir, "en-Carter_man.wav"),
    ]

    inputs = processor(
        text=[text],
        voice_samples=[speaker_voices],
        padding=True,
        return_tensors="pt",
        return_attention_mask=True,
    )

    print(f"\nGenerating: '{text}'")
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        outputs = model.generate(
            **inputs,
            max_new_tokens=None,  # let the model decide when speech is complete
            cfg_scale=1.3,  # classifier-free guidance strength
            tokenizer=processor.tokenizer,
            generation_config={'do_sample': False},  # deterministic (greedy) decoding
        )

    # The first (and only) batch element holds the synthesized waveform.
    processor.save_audio(outputs.speech_outputs[0], output_path=output_path)
    print(f"✅ Audio saved to: {output_path}")
|
if __name__ == "__main__":
|
|
|
main() |