import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), '../src'))

import torch
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
from peft import PeftModel

# Configuration
MODEL_DIR = ".."  # Path to VibeVoice-1.5B directory
LORA_DIR = "../finetune_elise_single_speaker/lora"  # Path to your fine-tuned LoRA weights
OUTPUT_DIR = "output_audio"


def load_model():
    """Load the fine-tuned model"""
    print("Loading model...")

    # Load base model
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        MODEL_DIR,
        torch_dtype=torch.bfloat16,
        device_map="cuda",
        attn_implementation="flash_attention_2"
    )

    # Load fine-tuned LoRA weights
    model.model.language_model = PeftModel.from_pretrained(
        model.model.language_model,
        LORA_DIR
    )

    # Load diffusion head
    diffusion_state = torch.load(f"{LORA_DIR}/diffusion_head_full.bin", map_location="cpu")
    model.model.prediction_head.load_state_dict(diffusion_state)

    # Load processor
    processor = VibeVoiceProcessor.from_pretrained(f"{MODEL_DIR}/src/vibevoice/processor")

    model.eval()
    model.set_ddpm_inference_steps(num_steps=20)

    return model, processor


def generate_speech(model, processor, text, voice_sample_path=None):
    """Generate speech from text"""
    # Format text with Speaker 0 prefix (required!)
    prompt = f"Speaker 0: {text}"

    # If no voice sample is provided, use a dummy one from the training data.
    # The model ignores it anyway, since it was trained with voice_prompt_drop_rate=1.0.
    if voice_sample_path is None:
        # You'll need at least one audio file from the training set
        voice_sample_path = "../elise_cleaned/wavs/sample_000009.wav"

    # Process inputs
    inputs = processor(
        text=[prompt],
        voice_samples=[[voice_sample_path]],
        return_tensors="pt"
    )

    # Move tensors to GPU
    for k, v in inputs.items():
        if torch.is_tensor(v):
            inputs[k] = v.to("cuda")

    # Generate audio
    outputs = model.generate(
        **inputs,
        cfg_scale=2.0,
        tokenizer=processor.tokenizer,
        generation_config={'do_sample': False},
        verbose=False
    )

    if outputs.speech_outputs and outputs.speech_outputs[0] is not None:
        audio = outputs.speech_outputs[0]
        # Add a small silence padding at the end (4800 samples = 200 ms at 24 kHz)
        silence = torch.zeros_like(audio[..., :4800])
        padded = torch.cat([audio, silence], dim=-1)
        return padded

    return None


def main():
    # Load the model once and reuse it for all generations
    model, processor = load_model()

    # Create output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Example texts
    texts = [
        "Hello! This is the Elise voice model.",
        "I can generate speech without needing voice samples.",
        "Thank you for using this model!"
    ]

    # Generate speech for each text
    for i, text in enumerate(texts):
        print(f"\nGenerating: {text}")
        audio = generate_speech(model, processor, text)

        if audio is not None:
            output_path = f"{OUTPUT_DIR}/output_{i:02d}.wav"
            processor.save_audio(audio, output_path)
            duration = (audio.shape[-1] - 4800) / 24000  # Subtract the padding
            print(f"Saved: {output_path} ({duration:.2f}s)")
        else:
            print("Failed to generate audio")


if __name__ == "__main__":
    main()
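
# Usage (the filename below is illustrative -- place this script wherever
# MODEL_DIR, LORA_DIR, and the dummy voice sample path resolve correctly):
#
#   python generate_elise.py
#
# Generated files are written to OUTPUT_DIR as 24 kHz wav audio, one file
# per entry in the example `texts` list above.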