Upload 3 files
Browse files
loraV2/finetune_elise_fadeout.sh
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash

# Single-speaker fine-tuning script for VibeVoice-1.5B on Elise dataset
# WITH FADEOUT AND PADDING - ensures all audio has clean endings
# No voice prompts - pure text-to-speech for Elise voice only

echo "Single-speaker fine-tuning with fadeout dataset..."
echo "All audio files have 100ms fadeout + 250ms padding"
echo "NO voice prompts - training pure Elise TTS model"

# Collect every trainer flag in an array so each option is easy to read,
# comment on, or toggle; the invocation below expands them unchanged.
TRAIN_ARGS=(
    --model_name_or_path .
    --train_jsonl jinsaryko_elise_fadeout/elise_train_split.jsonl
    --validation_jsonl jinsaryko_elise_fadeout/elise_val.jsonl
    --text_column_name text
    --audio_column_name audio
    --output_dir finetune_elise_fadeout
    # Effective batch size = 4 * 8 = 32
    --per_device_train_batch_size 4
    --gradient_accumulation_steps 8
    --learning_rate 2.5e-5
    --num_train_epochs 4
    --logging_steps 10
    --save_steps 100
    --eval_steps 100
    --report_to none
    --remove_unused_columns False
    --bf16 True
    --do_train
    --do_eval
    --gradient_clipping
    --gradient_checkpointing False
    --ddpm_batch_mul 2
    --diffusion_loss_weight 1.4
    --train_diffusion_head True
    --ce_loss_weight 0.04
    # 1.0 = never condition on a voice prompt (single-speaker training)
    --voice_prompt_drop_rate 1.0
    --lora_target_modules q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj
    --lr_scheduler_type cosine
    --warmup_ratio 0.03
    --max_grad_norm 0.8
    --max_length 4096
)

python -m src.finetune_vibevoice_lora "${TRAIN_ARGS[@]}"

echo "Single-speaker fine-tuning with fadeout complete!"
echo "Model trained on audio with smooth fadeouts - no abrupt endings!"
|
loraV2/prepare_jinsaryko_elise_fadeout.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
from datasets import load_dataset
|
| 4 |
+
import soundfile as sf
|
| 5 |
+
import numpy as np
|
| 6 |
+
from tqdm import tqdm
|
| 7 |
+
|
| 8 |
+
# Create output directories (parent first, then the wavs subfolder);
# exist_ok lets the script be re-run without failing.
for _out_dir in ("jinsaryko_elise_fadeout", "jinsaryko_elise_fadeout/wavs"):
    os.makedirs(_out_dir, exist_ok=True)
|
| 11 |
+
|
| 12 |
+
def apply_fadeout_and_padding(audio_array, sr, fadeout_duration=0.1, silence_duration=0.25):
    """
    Apply a linear fadeout to the end of a clip and append trailing silence.

    Args:
        audio_array: 1-D numpy array of audio samples
        sr: sample rate in Hz
        fadeout_duration: duration of fadeout in seconds (default 0.1s = 100ms)
        silence_duration: duration of silence to add after fadeout (default 0.25s = 250ms)

    Returns:
        processed audio array (same dtype as the input) of length
        len(audio_array) + int(silence_duration * sr)
    """
    # Calculate sample counts
    fadeout_samples = int(fadeout_duration * sr)
    silence_samples = int(silence_duration * sr)

    # Make sure we don't fade out more than the audio length
    if fadeout_samples > len(audio_array):
        fadeout_samples = len(audio_array) // 2

    audio_with_fade = audio_array.copy()

    # BUG FIX: guard against a zero-length fade (empty or 1-sample input,
    # where len // 2 == 0). The original `audio_with_fade[-0:] *= fade_curve`
    # selects the WHOLE array and multiplies it by an empty curve, raising a
    # broadcasting ValueError.
    if fadeout_samples > 0:
        # Linear fade from full volume down to exactly zero
        fade_curve = np.linspace(1.0, 0.0, fadeout_samples)
        audio_with_fade[-fadeout_samples:] *= fade_curve

    # Append silence padding with the same dtype as the input audio
    silence = np.zeros(silence_samples, dtype=audio_array.dtype)
    audio_padded = np.concatenate([audio_with_fade, silence])

    return audio_padded
|
| 45 |
+
|
| 46 |
+
# Load the Jinsaryko/Elise dataset
print("Loading Jinsaryko/Elise dataset...")
dataset = load_dataset("Jinsaryko/Elise")

# Since it's a single speaker dataset, we'll use voice_prompt_drop_rate=1.0 as recommended
# But we still need to format the text with Speaker 0: prefix

jsonl_data = []

print("Processing audio files with fadeout and padding...")
for idx, sample in enumerate(tqdm(dataset['train'])):
    # Format text with Speaker 0: prefix
    original_text = sample['text']
    formatted_text = f"Speaker 0: {original_text}"

    # Get audio data (HF Audio feature: dict with 'array' and 'sampling_rate')
    audio = sample['audio']
    audio_array = audio['array'].astype(np.float32)
    sampling_rate = audio['sampling_rate']

    # Apply fadeout and padding
    processed_audio = apply_fadeout_and_padding(
        audio_array,
        sampling_rate,
        fadeout_duration=0.1,   # 100ms fadeout
        silence_duration=0.25   # 250ms silence
    )

    # Save processed audio file
    audio_filename = f"jinsaryko_elise_fadeout/wavs/sample_{idx:06d}.wav"
    sf.write(audio_filename, processed_audio, sampling_rate)

    # Add to JSONL (absolute path so training can run from any working dir)
    jsonl_entry = {
        "text": formatted_text,
        "audio": os.path.abspath(audio_filename)
    }
    jsonl_data.append(jsonl_entry)

# Save JSONL file with the complete (unsplit) dataset
print("Saving JSONL files...")
with open("jinsaryko_elise_fadeout/elise_train.jsonl", "w") as f:
    for entry in jsonl_data:
        f.write(json.dumps(entry) + "\n")

# Create a small validation set (5% of data).
# BUG FIX: with fewer than 20 samples, int(0.05 * n) == 0 and the original
# slices `jsonl_data[:-0]` / `jsonl_data[-0:]` would put EVERYTHING into the
# validation set and leave the training split EMPTY. Compute an explicit
# split index instead, keeping at least one validation sample whenever
# there are at least two samples total.
val_size = max(1, int(0.05 * len(jsonl_data))) if len(jsonl_data) > 1 else 0
split_idx = len(jsonl_data) - val_size
train_data = jsonl_data[:split_idx]
val_data = jsonl_data[split_idx:]

with open("jinsaryko_elise_fadeout/elise_train_split.jsonl", "w") as f:
    for entry in train_data:
        f.write(json.dumps(entry) + "\n")

with open("jinsaryko_elise_fadeout/elise_val.jsonl", "w") as f:
    for entry in val_data:
        f.write(json.dumps(entry) + "\n")

print(f"\nDataset with fadeout prepared!")
print(f"Total samples: {len(jsonl_data)}")
print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"Files saved in jinsaryko_elise_fadeout/")
print(f"\nAudio processing applied:")
print(f"- 100ms fadeout at the end of each clip")
print(f"- 250ms silence padding after fadeout")
print(f"- All audio files now have smooth endings!")
|
loraV2/test_fadeout_model.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Smoke-test script for the Elise "fadeout" fine-tune: loads the base
# VibeVoice model from the current directory, applies the LoRA adapter and
# fine-tuned diffusion head, then generates speech for a battery of short,
# medium, and long sentences, saving each clip under test_fadeout_output/.
import sys
import os
# Make the bundled `src` package importable regardless of the caller's cwd.
sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))

import torch
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
from peft import PeftModel
import json

print("Loading fadeout model (4 epochs, fadeout + padding dataset)...")
# Base model weights come from the current directory (".") — assumes this
# script runs from the model checkout root. TODO(review): confirm.
model = VibeVoiceForConditionalGenerationInference.from_pretrained(
    ".",
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    attn_implementation="flash_attention_2"
)

# Load the fadeout model weights: wrap the language model with the trained
# LoRA adapter produced by finetune_elise_fadeout.sh.
model.model.language_model = PeftModel.from_pretrained(
    model.model.language_model,
    "finetune_elise_fadeout/lora"
)

# The diffusion head was fully fine-tuned (not LoRA), so its complete
# state dict is loaded separately and swapped in.
diffusion_state = torch.load("finetune_elise_fadeout/lora/diffusion_head_full.bin", map_location="cpu")
model.model.prediction_head.load_state_dict(diffusion_state)

processor = VibeVoiceProcessor.from_pretrained("src/vibevoice/processor")
model.eval()

# Use optimal settings — 20 DDPM denoising steps per generation.
model.set_ddpm_inference_steps(num_steps=20)

# Get a dummy voice sample (model ignores this due to voice_prompt_drop_rate=1.0);
# the processor API still requires a voice path, so reuse the first training clip.
with open("jinsaryko_elise_fadeout/elise_train_split.jsonl", 'r') as f:
    voice_data = json.loads(f.readline())
    dummy_voice_path = voice_data['audio']

print(f"\nUsing dummy voice (ignored): {os.path.basename(dummy_voice_path)}")
print("Testing fadeout model with various length statements...\n")

# Test sentences - short, medium, and long (ending with ellipses to prevent cutoffs)
test_sentences = [
    # Short statements
    "Hello!",
    "Good morning everyone...",
    "Welcome to my channel...",
    "Thanks for watching!",

    # Medium statements
    "Today we're going to learn something amazing together...",
    "I'm really excited to share this with all of you...",
    "Let me show you how this incredible feature works...",
    "Have you ever wondered about the mysteries of the universe?",

    # Long statements
    "Welcome back to the channel! Today I have something really special to share with you, and I think you're going to absolutely love what we're about to explore together...",
    "Throughout my journey of learning and discovery, I've come across many fascinating concepts, but this one in particular has completely transformed the way I think about technology and innovation...",
    "The beauty of machine learning lies not just in its complexity, but in how it can bring seemingly impossible ideas to life, creating experiences that were once confined to the realm of science fiction...",
    "As we dive deeper into this topic, I want you to imagine the endless possibilities that await us, and consider how these advancements might shape our future in ways we can barely comprehend today..."
]

os.makedirs("test_fadeout_output", exist_ok=True)

for i, text in enumerate(test_sentences):
    # Bucket by character count purely for labelling the output filename.
    length = "short" if len(text) < 30 else "medium" if len(text) < 100 else "long"
    print(f"\n[{i+1}/{len(test_sentences)}] [{length.upper()}] {text}")

    # Training data was formatted with this prefix, so inference must match.
    prompt = f"Speaker 0: {text}"

    inputs = processor(
        text=[prompt],
        voice_samples=[[dummy_voice_path]],  # Dummy voice (ignored by model)
        return_tensors="pt"
    )

    # Move every tensor in the processor output onto the GPU in place.
    for k, v in inputs.items():
        if torch.is_tensor(v):
            inputs[k] = v.to("cuda")

    outputs = model.generate(
        **inputs,
        cfg_scale=2.0,
        tokenizer=processor.tokenizer,
        generation_config={'do_sample': False},  # greedy decoding for reproducibility
        verbose=False
    )

    # speech_outputs may be empty/None if generation produced no audio.
    if outputs.speech_outputs and outputs.speech_outputs[0] is not None:
        audio = outputs.speech_outputs[0]

        # Save with light padding (model already has fadeout from training).
        # 4800 samples = 200ms, assuming 24 kHz output — TODO(review): confirm rate.
        silence = torch.zeros_like(audio[..., :4800])  # 200ms padding
        padded = torch.cat([audio, silence], dim=-1)

        output_path = f"test_fadeout_output/test_{i:02d}_{length}.wav"
        processor.save_audio(padded, output_path)

        # Duration in seconds at the presumed 24 kHz sample rate.
        duration = audio.shape[-1] / 24000
        print(f" ✓ Generated {duration:.2f}s → {output_path}")

print("\n" + "="*60)
print("Fadeout model test complete!")
print("Files saved in test_fadeout_output/")
print("\nModel stats:")
print("- 4 epochs on fadeout dataset (100ms fade + 250ms padding)")
print("- Final CE loss: ~5.25")
print("- Final Diffusion loss: ~0.559")
print("- voice_prompt_drop_rate: 1.0 (no voice prompts)")
print("- All training audio had smooth fadeouts!")
print("="*60)
|