# simple_inference.py — minimal inference example for VibeVoice-1.5B with a
# fine-tuned single-speaker LoRA adapter (Elise voice).
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), '../src'))
import torch
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
from peft import PeftModel
# Configuration
MODEL_DIR = ".." # Path to VibeVoice-1.5B directory
LORA_DIR = "../finetune_elise_single_speaker/lora" # Path to your fine-tuned LoRA weights
OUTPUT_DIR = "output_audio"
def load_model():
    """Load the base VibeVoice model, apply the fine-tuned weights, and build the processor.

    Returns:
        tuple: ``(model, processor)`` — the model in eval mode with 20 DDPM
        inference steps configured, and the matching ``VibeVoiceProcessor``.
    """
    print("Loading model...")

    # Base model: bfloat16 on the GPU with flash-attention 2.
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        MODEL_DIR,
        torch_dtype=torch.bfloat16,
        device_map="cuda",
        attn_implementation="flash_attention_2",
    )

    # Wrap the language model with the fine-tuned LoRA adapter.
    model.model.language_model = PeftModel.from_pretrained(
        model.model.language_model,
        LORA_DIR,
    )

    # Fully fine-tuned diffusion head (saved as a plain state dict).
    # weights_only=True blocks arbitrary-code execution from a tampered
    # pickle file; a pure tensor state dict loads fine under it.
    diffusion_state = torch.load(
        os.path.join(LORA_DIR, "diffusion_head_full.bin"),
        map_location="cpu",
        weights_only=True,
    )
    model.model.prediction_head.load_state_dict(diffusion_state)

    # Processor config lives inside the model repo checkout.
    processor = VibeVoiceProcessor.from_pretrained(
        os.path.join(MODEL_DIR, "src/vibevoice/processor")
    )

    model.eval()
    model.set_ddpm_inference_steps(num_steps=20)
    return model, processor
def generate_speech(model, processor, text, voice_sample_path=None, cfg_scale=2.0):
    """Generate speech audio for *text* with the fine-tuned model.

    Args:
        model: loaded VibeVoice inference model (see ``load_model``).
        processor: matching ``VibeVoiceProcessor``.
        text: plain text to speak; the required ``"Speaker 0:"`` prefix is
            added here.
        voice_sample_path: optional reference wav. When ``None`` a sample
            from the training set is used as a stand-in — the model was
            trained with ``voice_prompt_drop_rate=1.0``, so the prompt is
            ignored (presumably; verify against training config).
        cfg_scale: classifier-free guidance scale. Defaults to 2.0, the
            value previously hard-coded, so existing callers are unchanged.

    Returns:
        torch.Tensor | None: generated waveform with 200 ms of trailing
        silence appended, or ``None`` when no audio was produced.
    """
    # The processor expects a "Speaker N:" prefix on every utterance.
    prompt = f"Speaker 0: {text}"

    # The processor API requires a voice sample even though this fine-tune
    # ignores it; fall back to one file from the training data.
    if voice_sample_path is None:
        voice_sample_path = "../elise_cleaned/wavs/sample_000009.wav"

    inputs = processor(
        text=[prompt],
        voice_samples=[[voice_sample_path]],
        return_tensors="pt",
    )

    # Move tensor entries to the GPU; non-tensor entries stay as-is.
    for key, value in inputs.items():
        if torch.is_tensor(value):
            inputs[key] = value.to("cuda")

    outputs = model.generate(
        **inputs,
        cfg_scale=cfg_scale,
        tokenizer=processor.tokenizer,
        generation_config={'do_sample': False},
        verbose=False,
    )

    if outputs.speech_outputs and outputs.speech_outputs[0] is not None:
        audio = outputs.speech_outputs[0]
        # Append 200 ms of silence (4800 samples at 24 kHz) so playback
        # does not clip the final phoneme.
        silence = torch.zeros_like(audio[..., :4800])
        return torch.cat([audio, silence], dim=-1)
    return None
def main():
    """Synthesize a few example sentences and write them to OUTPUT_DIR as wavs."""
    # Load the model a single time and reuse it for every sentence.
    model, processor = load_model()

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    texts = [
        "Hello! This is the Elise voice model.",
        "I can generate speech without needing voice samples.",
        "Thank you for using this model!"
    ]

    for index, sentence in enumerate(texts):
        print(f"\nGenerating: {sentence}")
        audio = generate_speech(model, processor, sentence)
        if audio is None:
            print("Failed to generate audio")
            continue
        output_path = f"{OUTPUT_DIR}/output_{index:02d}.wav"
        processor.save_audio(audio, output_path)
        # Subtract the 4800-sample padding appended by generate_speech;
        # 24000 is the sample rate in Hz.
        duration = (audio.shape[-1] - 4800) / 24000
        print(f"Saved: {output_path} ({duration:.2f}s)")


if __name__ == "__main__":
    main()