|
|
import sys |
|
|
import os |
|
|
sys.path.append(os.path.join(os.path.dirname(__file__), '../src')) |
|
|
|
|
|
import torch |
|
|
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference |
|
|
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor |
|
|
from peft import PeftModel |
|
|
|
|
|
|
|
|
# Base model checkpoint directory — presumably the repo root holds the
# pretrained VibeVoice weights; confirm against the training setup.
MODEL_DIR = ".."


# LoRA adapter + fully fine-tuned diffusion head from single-speaker training.
LORA_DIR = "../finetune_elise_single_speaker/lora"


# Generated wav files are written here (created by main()).
OUTPUT_DIR = "output_audio"
|
|
|
|
|
def load_model():
    """Load the fine-tuned model and its processor.

    Loads the base VibeVoice model on CUDA in bfloat16, wraps its language
    model with the LoRA adapter from LORA_DIR, restores the fully fine-tuned
    diffusion head, and builds the matching processor.

    Returns:
        tuple: (model, processor) — model is in eval mode with 20 DDPM
        inference steps configured.
    """
    print("Loading model...")

    # flash_attention_2 requires a CUDA device, hence device_map="cuda".
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        MODEL_DIR,
        torch_dtype=torch.bfloat16,
        device_map="cuda",
        attn_implementation="flash_attention_2",
    )

    # Only the language model was LoRA-tuned; wrap just that submodule.
    model.model.language_model = PeftModel.from_pretrained(
        model.model.language_model,
        LORA_DIR,
    )

    # The diffusion (prediction) head was fully fine-tuned, so load its
    # complete state dict. weights_only=True prevents unpickling arbitrary
    # objects from the checkpoint file — state dicts are plain tensors.
    diffusion_state = torch.load(
        f"{LORA_DIR}/diffusion_head_full.bin",
        map_location="cpu",
        weights_only=True,
    )
    model.model.prediction_head.load_state_dict(diffusion_state)

    processor = VibeVoiceProcessor.from_pretrained(f"{MODEL_DIR}/src/vibevoice/processor")

    model.eval()
    model.set_ddpm_inference_steps(num_steps=20)

    return model, processor
|
|
|
|
|
def generate_speech(model, processor, text, voice_sample_path=None, silence_samples=4800):
    """Generate speech audio for *text* as Speaker 0.

    Args:
        model: Loaded VibeVoice inference model (see load_model).
        processor: Matching VibeVoiceProcessor.
        text: Text to synthesize for a single speaker.
        voice_sample_path: Reference wav used for voice conditioning; falls
            back to a default Elise sample when None.
        silence_samples: Trailing silence appended to the output
            (default 4800 samples; presumably 0.2 s at 24 kHz — confirm
            against the model's sample rate).

    Returns:
        Audio tensor with trailing silence appended, or None when the model
        produced no speech output.
    """
    prompt = f"Speaker 0: {text}"

    if voice_sample_path is None:
        voice_sample_path = "../elise_cleaned/wavs/sample_000009.wav"

    inputs = processor(
        text=[prompt],
        voice_samples=[[voice_sample_path]],
        return_tensors="pt",
    )

    # Move tensors to wherever the model actually lives instead of assuming
    # a hard-coded "cuda" — identical behavior under load_model's placement.
    device = next(model.parameters()).device
    for key, value in inputs.items():
        if torch.is_tensor(value):
            inputs[key] = value.to(device)

    outputs = model.generate(
        **inputs,
        cfg_scale=2.0,
        tokenizer=processor.tokenizer,
        generation_config={'do_sample': False},
        verbose=False,
    )

    if outputs.speech_outputs and outputs.speech_outputs[0] is not None:
        audio = outputs.speech_outputs[0]

        # Pad with silence so playback doesn't clip the final word; the
        # zeros inherit the audio's dtype/device via zeros_like.
        silence = torch.zeros_like(audio[..., :silence_samples])
        return torch.cat([audio, silence], dim=-1)

    return None
|
|
|
|
|
def main():
    """Synthesize a handful of demo sentences and save them to OUTPUT_DIR."""
    model, processor = load_model()

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    demo_texts = (
        "Hello! This is the Elise voice model.",
        "I can generate speech without needing voice samples.",
        "Thank you for using this model!",
    )

    for idx, sentence in enumerate(demo_texts):
        print(f"\nGenerating: {sentence}")

        waveform = generate_speech(model, processor, sentence)

        # Guard clause: skip straight to the next sentence on failure.
        if waveform is None:
            print("Failed to generate audio")
            continue

        out_path = f"{OUTPUT_DIR}/output_{idx:02d}.wav"
        processor.save_audio(waveform, out_path)

        # Subtract the appended 4800-sample silence before reporting length.
        seconds = (waveform.shape[-1] - 4800) / 24000
        print(f"Saved: {out_path} ({seconds:.2f}s)")
|
|
|
|
|
# Entry-point guard: importing this module must not trigger generation.
if __name__ == "__main__":


    main()