DevParker committed on
Commit
81508d4
·
verified ·
1 Parent(s): 8b22ea9

Upload 3 files

Browse files
loraV2/finetune_elise_fadeout.sh ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Single-speaker fine-tuning script for VibeVoice-1.5B on Elise dataset
4
+ # WITH FADEOUT AND PADDING - ensures all audio has clean endings
5
+ # No voice prompts - pure text-to-speech for Elise voice only
6
+
7
+ echo "Single-speaker fine-tuning with fadeout dataset..."
8
+ echo "All audio files have 100ms fadeout + 250ms padding"
9
+ echo "NO voice prompts - training pure Elise TTS model"
10
+
11
+ python -m src.finetune_vibevoice_lora \
12
+ --model_name_or_path . \
13
+ --train_jsonl jinsaryko_elise_fadeout/elise_train_split.jsonl \
14
+ --validation_jsonl jinsaryko_elise_fadeout/elise_val.jsonl \
15
+ --text_column_name text \
16
+ --audio_column_name audio \
17
+ --output_dir finetune_elise_fadeout \
18
+ --per_device_train_batch_size 4 \
19
+ --gradient_accumulation_steps 8 \
20
+ --learning_rate 2.5e-5 \
21
+ --num_train_epochs 4 \
22
+ --logging_steps 10 \
23
+ --save_steps 100 \
24
+ --eval_steps 100 \
25
+ --report_to none \
26
+ --remove_unused_columns False \
27
+ --bf16 True \
28
+ --do_train \
29
+ --do_eval \
30
+ --gradient_clipping \
31
+ --gradient_checkpointing False \
32
+ --ddpm_batch_mul 2 \
33
+ --diffusion_loss_weight 1.4 \
34
+ --train_diffusion_head True \
35
+ --ce_loss_weight 0.04 \
36
+ --voice_prompt_drop_rate 1.0 \
37
+ --lora_target_modules q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj \
38
+ --lr_scheduler_type cosine \
39
+ --warmup_ratio 0.03 \
40
+ --max_grad_norm 0.8 \
41
+ --max_length 4096
42
+
43
+ echo "Single-speaker fine-tuning with fadeout complete!"
44
+ echo "Model trained on audio with smooth fadeouts - no abrupt endings!"
loraV2/prepare_jinsaryko_elise_fadeout.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from datasets import load_dataset
4
+ import soundfile as sf
5
+ import numpy as np
6
+ from tqdm import tqdm
7
+
8
+ # Create output directories
9
+ os.makedirs("jinsaryko_elise_fadeout", exist_ok=True)
10
+ os.makedirs("jinsaryko_elise_fadeout/wavs", exist_ok=True)
11
+
12
+ def apply_fadeout_and_padding(audio_array, sr, fadeout_duration=0.1, silence_duration=0.25):
13
+ """
14
+ Apply fadeout and add silence padding to audio
15
+
16
+ Args:
17
+ audio_array: numpy array of audio samples
18
+ sr: sample rate
19
+ fadeout_duration: duration of fadeout in seconds (default 0.1s = 100ms)
20
+ silence_duration: duration of silence to add after fadeout (default 0.25s = 250ms)
21
+
22
+ Returns:
23
+ processed audio array
24
+ """
25
+ # Calculate sample counts
26
+ fadeout_samples = int(fadeout_duration * sr)
27
+ silence_samples = int(silence_duration * sr)
28
+
29
+ # Make sure we don't fade out more than the audio length
30
+ if fadeout_samples > len(audio_array):
31
+ fadeout_samples = len(audio_array) // 2
32
+
33
+ # Create fadeout curve (linear fade)
34
+ fade_curve = np.linspace(1.0, 0.0, fadeout_samples)
35
+
36
+ # Apply fadeout to the last part of the audio
37
+ audio_with_fade = audio_array.copy()
38
+ audio_with_fade[-fadeout_samples:] *= fade_curve
39
+
40
+ # Add silence padding
41
+ silence = np.zeros(silence_samples, dtype=audio_array.dtype)
42
+ audio_padded = np.concatenate([audio_with_fade, silence])
43
+
44
+ return audio_padded
45
+
46
+ # Load the Jinsaryko/Elise dataset
47
+ print("Loading Jinsaryko/Elise dataset...")
48
+ dataset = load_dataset("Jinsaryko/Elise")
49
+
50
+ # Since it's a single speaker dataset, we'll use voice_prompt_drop_rate=1.0 as recommended
51
+ # But we still need to format the text with Speaker 0: prefix
52
+
53
+ jsonl_data = []
54
+
55
+ print("Processing audio files with fadeout and padding...")
56
+ for idx, sample in enumerate(tqdm(dataset['train'])):
57
+ # Format text with Speaker 0: prefix
58
+ original_text = sample['text']
59
+ formatted_text = f"Speaker 0: {original_text}"
60
+
61
+ # Get audio data
62
+ audio = sample['audio']
63
+ audio_array = audio['array'].astype(np.float32)
64
+ sampling_rate = audio['sampling_rate']
65
+
66
+ # Apply fadeout and padding
67
+ processed_audio = apply_fadeout_and_padding(
68
+ audio_array,
69
+ sampling_rate,
70
+ fadeout_duration=0.1, # 100ms fadeout
71
+ silence_duration=0.25 # 250ms silence
72
+ )
73
+
74
+ # Save processed audio file
75
+ audio_filename = f"jinsaryko_elise_fadeout/wavs/sample_{idx:06d}.wav"
76
+ sf.write(audio_filename, processed_audio, sampling_rate)
77
+
78
+ # Add to JSONL
79
+ jsonl_entry = {
80
+ "text": formatted_text,
81
+ "audio": os.path.abspath(audio_filename)
82
+ }
83
+ jsonl_data.append(jsonl_entry)
84
+
85
+ # Save JSONL file
86
+ print("Saving JSONL files...")
87
+ with open("jinsaryko_elise_fadeout/elise_train.jsonl", "w") as f:
88
+ for entry in jsonl_data:
89
+ f.write(json.dumps(entry) + "\n")
90
+
91
+ # Create a small validation set (5% of data)
92
+ val_size = int(0.05 * len(jsonl_data))
93
+ train_data = jsonl_data[:-val_size]
94
+ val_data = jsonl_data[-val_size:]
95
+
96
+ with open("jinsaryko_elise_fadeout/elise_train_split.jsonl", "w") as f:
97
+ for entry in train_data:
98
+ f.write(json.dumps(entry) + "\n")
99
+
100
+ with open("jinsaryko_elise_fadeout/elise_val.jsonl", "w") as f:
101
+ for entry in val_data:
102
+ f.write(json.dumps(entry) + "\n")
103
+
104
+ print(f"\nDataset with fadeout prepared!")
105
+ print(f"Total samples: {len(jsonl_data)}")
106
+ print(f"Training samples: {len(train_data)}")
107
+ print(f"Validation samples: {len(val_data)}")
108
+ print(f"Files saved in jinsaryko_elise_fadeout/")
109
+ print(f"\nAudio processing applied:")
110
+ print(f"- 100ms fadeout at the end of each clip")
111
+ print(f"- 250ms silence padding after fadeout")
112
+ print(f"- All audio files now have smooth endings!")
loraV2/test_fadeout_model.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
4
+
5
+ import torch
6
+ from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
7
+ from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
8
+ from peft import PeftModel
9
+ import json
10
+
11
+ print("Loading fadeout model (4 epochs, fadeout + padding dataset)...")
12
+ model = VibeVoiceForConditionalGenerationInference.from_pretrained(
13
+ ".",
14
+ torch_dtype=torch.bfloat16,
15
+ device_map="cuda",
16
+ attn_implementation="flash_attention_2"
17
+ )
18
+
19
+ # Load the fadeout model weights
20
+ model.model.language_model = PeftModel.from_pretrained(
21
+ model.model.language_model,
22
+ "finetune_elise_fadeout/lora"
23
+ )
24
+
25
+ diffusion_state = torch.load("finetune_elise_fadeout/lora/diffusion_head_full.bin", map_location="cpu")
26
+ model.model.prediction_head.load_state_dict(diffusion_state)
27
+
28
+ processor = VibeVoiceProcessor.from_pretrained("src/vibevoice/processor")
29
+ model.eval()
30
+
31
+ # Use optimal settings
32
+ model.set_ddpm_inference_steps(num_steps=20)
33
+
34
+ # Get a dummy voice sample (model ignores this due to voice_prompt_drop_rate=1.0)
35
+ with open("jinsaryko_elise_fadeout/elise_train_split.jsonl", 'r') as f:
36
+ voice_data = json.loads(f.readline())
37
+ dummy_voice_path = voice_data['audio']
38
+
39
+ print(f"\nUsing dummy voice (ignored): {os.path.basename(dummy_voice_path)}")
40
+ print("Testing fadeout model with various length statements...\n")
41
+
42
+ # Test sentences - short, medium, and long (ending with ellipses to prevent cutoffs)
43
+ test_sentences = [
44
+ # Short statements
45
+ "Hello!",
46
+ "Good morning everyone...",
47
+ "Welcome to my channel...",
48
+ "Thanks for watching!",
49
+
50
+ # Medium statements
51
+ "Today we're going to learn something amazing together...",
52
+ "I'm really excited to share this with all of you...",
53
+ "Let me show you how this incredible feature works...",
54
+ "Have you ever wondered about the mysteries of the universe?",
55
+
56
+ # Long statements
57
+ "Welcome back to the channel! Today I have something really special to share with you, and I think you're going to absolutely love what we're about to explore together...",
58
+ "Throughout my journey of learning and discovery, I've come across many fascinating concepts, but this one in particular has completely transformed the way I think about technology and innovation...",
59
+ "The beauty of machine learning lies not just in its complexity, but in how it can bring seemingly impossible ideas to life, creating experiences that were once confined to the realm of science fiction...",
60
+ "As we dive deeper into this topic, I want you to imagine the endless possibilities that await us, and consider how these advancements might shape our future in ways we can barely comprehend today..."
61
+ ]
62
+
63
+ os.makedirs("test_fadeout_output", exist_ok=True)
64
+
65
+ for i, text in enumerate(test_sentences):
66
+ length = "short" if len(text) < 30 else "medium" if len(text) < 100 else "long"
67
+ print(f"\n[{i+1}/{len(test_sentences)}] [{length.upper()}] {text}")
68
+
69
+ prompt = f"Speaker 0: {text}"
70
+
71
+ inputs = processor(
72
+ text=[prompt],
73
+ voice_samples=[[dummy_voice_path]], # Dummy voice (ignored by model)
74
+ return_tensors="pt"
75
+ )
76
+
77
+ for k, v in inputs.items():
78
+ if torch.is_tensor(v):
79
+ inputs[k] = v.to("cuda")
80
+
81
+ outputs = model.generate(
82
+ **inputs,
83
+ cfg_scale=2.0,
84
+ tokenizer=processor.tokenizer,
85
+ generation_config={'do_sample': False},
86
+ verbose=False
87
+ )
88
+
89
+ if outputs.speech_outputs and outputs.speech_outputs[0] is not None:
90
+ audio = outputs.speech_outputs[0]
91
+
92
+ # Save with light padding (model already has fadeout from training)
93
+ silence = torch.zeros_like(audio[..., :4800]) # 200ms padding
94
+ padded = torch.cat([audio, silence], dim=-1)
95
+
96
+ output_path = f"test_fadeout_output/test_{i:02d}_{length}.wav"
97
+ processor.save_audio(padded, output_path)
98
+
99
+ duration = audio.shape[-1] / 24000
100
+ print(f" ✓ Generated {duration:.2f}s → {output_path}")
101
+
102
+ print("\n" + "="*60)
103
+ print("Fadeout model test complete!")
104
+ print("Files saved in test_fadeout_output/")
105
+ print("\nModel stats:")
106
+ print("- 4 epochs on fadeout dataset (100ms fade + 250ms padding)")
107
+ print("- Final CE loss: ~5.25")
108
+ print("- Final Diffusion loss: ~0.559")
109
+ print("- voice_prompt_drop_rate: 1.0 (no voice prompts)")
110
+ print("- All training audio had smooth fadeouts!")
111
+ print("="*60)