Upload 8 files
- README.md +69 -0
- create_cleaned_dataset.py +80 -0
- data_vibevoice.py +370 -0
- detect_audio_cutoffs.py +77 -0
- finetune_elise_single_speaker.sh +44 -0
- prepare_jinsaryko_elise_dataset.py +66 -0
- simple_inference.py +119 -0
- test_fixed_eos_dummy_voice.py +103 -0
README.md
ADDED
# VibeVoice 1.5B Single-Speaker Fine-tuning Guide

This folder contains all the files needed to fine-tune VibeVoice 1.5B for a single speaker (the Elise voice).

## Key Improvements

1. **Fixed EOS Token Issue**: The modified `data_vibevoice.py` appends a proper `<|endoftext|>` token after the generated speech segment, so the model learns to terminate cleanly instead of repeating or looping.
2. **Single-Speaker Training**: Uses `voice_prompt_drop_rate=1.0` to train without voice prompts.
3. **Audio Quality Filter**: Removes training samples whose audio ends in an abrupt cutoff.
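For reference, the collator in `data_vibevoice.py` (included in full below) builds each training sequence as text tokens, then one acoustic placeholder per latent frame, then the two terminator tokens. A minimal sketch of that layout:

```python
# Layout of one training sequence built by VibeVoiceCollator:
#   [text tokens][speech_diffusion_id x T][speech_end_id][eos_token_id]
ids = text_ids + [speech_diff_id] * target_latent_len + [speech_end_id, eos_token_id]
```

Without the trailing `eos_token_id`, the model never sees a termination signal after speech and tends to keep generating, which is the repetition/looping failure this fix addresses.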
## Files Included

- `data_vibevoice.py` - CRITICAL: Modified data collator that adds the EOS token (replaces `src/data_vibevoice.py`)
- `prepare_jinsaryko_elise_dataset.py` - Downloads and prepares the Elise dataset
- `detect_audio_cutoffs.py` - Detects audio files with abrupt endings
- `create_cleaned_dataset.py` - Rebuilds the train/validation JSONL files from the cutoff analysis
- `finetune_elise_single_speaker.sh` - Training script for the single-speaker model
- `simple_inference.py` - Minimal inference script using the fine-tuned weights
- `test_fixed_eos_dummy_voice.py` - Test script for inference

## Quick Start

1. **Prepare the dataset**:
```bash
python prepare_jinsaryko_elise_dataset.py
```

2. **Detect and remove bad audio** (optional but recommended):
```bash
python detect_audio_cutoffs.py
# This creates the elise_cleaned/ folder with good samples only
```

3. **IMPORTANT: Replace the data collator**:
```bash
cp data_vibevoice.py ../src/data_vibevoice.py
```

4. **Train the model**:
```bash
./finetune_elise_single_speaker.sh
```

5. **Test the model**:
```bash
python test_fixed_eos_dummy_voice.py
```

## Training Configuration

Key settings in `finetune_elise_single_speaker.sh`:
- `voice_prompt_drop_rate 1.0` - Always drops voice prompts (single-speaker mode)
- `learning_rate 2.5e-5` - Conservative learning rate
- `ddpm_batch_mul 2` - Diffusion batch multiplier
- `diffusion_loss_weight 1.4` - Diffusion loss weight
- `ce_loss_weight 0.04` - Cross-entropy loss weight
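With `per_device_train_batch_size 4` and `gradient_accumulation_steps 8`, the effective batch size is 4 × 8 = 32 samples per optimizer step on each device.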
## How It Works

1. The model learns to associate "Speaker 0:" with Elise's voice.
2. No voice samples are needed during inference.
3. The proper EOS token ensures clean endings without repetition.

## Dataset Format

The training data should be JSONL with this format:
```json
{"text": "Speaker 0: Hello, this is a test.", "audio": "/path/to/audio.wav"}
```

Note: The "Speaker 0:" prefix is REQUIRED for all text entries.
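A quick way to sanity-check a prepared JSONL file against this format (a minimal sketch; the path is an example):

```python
import json
import os

with open("elise_cleaned/train_split.jsonl") as f:
    for n, line in enumerate(f, 1):
        entry = json.loads(line)
        assert entry["text"].startswith("Speaker 0:"), f"line {n}: missing 'Speaker 0:' prefix"
        assert os.path.isfile(entry["audio"]), f"line {n}: audio file not found"
print("All entries OK")
```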
create_cleaned_dataset.py
ADDED
import json
import os
import shutil
from tqdm import tqdm

# This script creates a cleaned dataset by removing samples with abrupt cutoffs.
# It uses the results from detect_audio_cutoffs.py.

print("Creating cleaned dataset from cutoff analysis...")

# Read the cutoff analysis
with open('audio_cutoff_analysis.json', 'r') as f:
    analysis = json.load(f)

# Get good samples; build a set of paths once for O(1) membership checks
good_samples = analysis['good_samples']
good_paths = {s['audio_path'] for s in good_samples}
print(f"Found {len(good_samples)} good samples out of {analysis['total_samples']} total")

# Create output directories
os.makedirs("elise_cleaned", exist_ok=True)
os.makedirs("elise_cleaned/wavs", exist_ok=True)

train_good = []
val_good = []

# Process the train split
with open("jinsaryko_elise_formatted/elise_train_split.jsonl", 'r') as f:
    for line in tqdm(f, desc="Processing train split"):
        entry = json.loads(line)
        audio_path = entry['audio']

        # Keep only samples flagged as good
        if audio_path in good_paths:
            # Copy the audio file into the cleaned folder
            basename = os.path.basename(audio_path)
            new_audio_path = os.path.abspath(f"elise_cleaned/wavs/{basename}")
            shutil.copy2(audio_path, new_audio_path)

            # Update the entry with the new path
            train_good.append({
                "text": entry['text'],
                "audio": new_audio_path
            })

# Process the validation split
with open("jinsaryko_elise_formatted/elise_val.jsonl", 'r') as f:
    for line in tqdm(f, desc="Processing validation split"):
        entry = json.loads(line)
        audio_path = entry['audio']

        if audio_path in good_paths:
            basename = os.path.basename(audio_path)
            new_audio_path = os.path.abspath(f"elise_cleaned/wavs/{basename}")
            if not os.path.exists(new_audio_path):
                shutil.copy2(audio_path, new_audio_path)

            val_good.append({
                "text": entry['text'],
                "audio": new_audio_path
            })

# Save the cleaned datasets
with open("elise_cleaned/train_split.jsonl", 'w') as f:
    for entry in train_good:
        f.write(json.dumps(entry) + '\n')

with open("elise_cleaned/val.jsonl", 'w') as f:
    for entry in val_good:
        f.write(json.dumps(entry) + '\n')

print("\nCleaned dataset created!")
print(f"Training samples: {len(train_good)}")
print(f"Validation samples: {len(val_good)}")
print("Files saved in elise_cleaned/")
data_vibevoice.py
ADDED
import math
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import numpy as np
import torch
import warnings
import random

try:
    import librosa  # type: ignore
except Exception:  # pragma: no cover
    librosa = None  # Fallback: user must install librosa when using local audio paths

try:
    import resampy  # type: ignore
except Exception:  # pragma: no cover
    resampy = None


def _resample_if_needed(wav: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    if orig_sr == target_sr:
        return wav.astype(np.float32, copy=False)
    if resampy is not None:
        return resampy.resample(wav.astype(np.float32), orig_sr, target_sr)
    if librosa is not None:
        return librosa.resample(y=wav.astype(np.float32), orig_sr=orig_sr, target_sr=target_sr)
    warnings.warn(
        "No resampler available; treating audio as target_sr without resampling. Install resampy or librosa.",
        RuntimeWarning,
    )
    return wav.astype(np.float32, copy=False)


# Lightweight HF-style dataset wrapper (optional). Trainer can also pass raw HF datasets directly.
class VibeVoiceDataset:
    def __init__(
        self,
        dataset: Any,
        text_column: str = "text",
        audio_column: str = "audio",
        voice_prompts_column: Optional[str] = "voice_prompts",
    ) -> None:
        self.dataset = dataset
        self.text_column = text_column
        self.audio_column = audio_column
        self.voice_prompts_column = voice_prompts_column

    def __len__(self) -> int:
        return len(self.dataset)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        item = self.dataset[idx]
        data: Dict[str, Any] = {}
        data["text"] = item[self.text_column]
        data["audio"] = item[self.audio_column]

        user_provided_prompt = None
        if self.voice_prompts_column and self.voice_prompts_column in item:
            user_provided_prompt = item[self.voice_prompts_column]

        if user_provided_prompt:
            # A prompt was provided in the dataset, so we use it.
            if not isinstance(user_provided_prompt, list):
                data["voice_prompts"] = [user_provided_prompt]
            else:
                data["voice_prompts"] = user_provided_prompt
        else:
            # FALLBACK: No prompt provided, so we auto-generate one from the target audio.
            try:
                target_sr = 24000
                wav_array = _load_audio_to_24k(item[self.audio_column], target_sr=target_sr)
                audio_len_seconds = len(wav_array) / target_sr

                min_len_sec = min(5.0, audio_len_seconds / 4.0)
                max_len_sec = min(15.0, audio_len_seconds / 2.0)

                if min_len_sec > max_len_sec:
                    min_len_sec = max_len_sec
                    max_len_sec = min(max_len_sec, audio_len_seconds)

                if max_len_sec > 0.1:
                    prompt_len_sec = random.uniform(min_len_sec, max_len_sec)
                    prompt_len_samples = int(prompt_len_sec * target_sr)

                    max_start_sample = len(wav_array) - prompt_len_samples
                    start_sample = random.randint(0, max_start_sample)

                    prompt_crop = wav_array[start_sample : start_sample + prompt_len_samples]

                    data["voice_prompts"] = [prompt_crop]
                else:
                    data["voice_prompts"] = None

            except Exception as e:
                warnings.warn(f"Could not create voice prompt for item {idx}: {e}")
                data["voice_prompts"] = None
        return data


def _load_audio_to_24k(audio: Union[str, np.ndarray, torch.Tensor, Dict[str, Any]], *, target_sr: int = 24000) -> np.ndarray:
    if isinstance(audio, np.ndarray):
        return audio.astype(np.float32)
    if isinstance(audio, torch.Tensor):
        return audio.detach().cpu().float().numpy()
    if isinstance(audio, str):
        if librosa is None:
            raise RuntimeError("librosa is required to load audio file paths. Please pip install librosa.")
        wav, sr = librosa.load(audio, sr=None, mono=True)
        wav = _resample_if_needed(wav, int(sr), target_sr)
        return wav
    if isinstance(audio, dict) and "array" in audio and "sampling_rate" in audio:
        arr = np.asarray(audio["array"], dtype=np.float32)
        sr = int(audio["sampling_rate"])
        arr = _resample_if_needed(arr, sr, target_sr)
        return arr
    raise ValueError(f"Unsupported audio type: {type(audio)}")


@dataclass
class VibeVoiceCollator:
    processor: Any  # VibeVoiceProcessor
    max_length: Optional[int] = None
    speech_compress_ratio: int = 3200
    semantic_vae_dim: int = 128
    compute_semantics: bool = False
    debug_checks: bool = False

    text_field: str = "text"
    audio_field: str = "audio"
    voice_prompts_field: str = "voice_prompts"
    voice_prompt_drop_rate: float = 0.0

    def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, Any]:
        batch_size = len(features)

        sample_input_ids: List[List[int]] = []
        sample_attention_masks: List[List[int]] = []
        sample_acoustic_input_masks: List[List[bool]] = []
        sample_acoustic_loss_masks: List[List[bool]] = []

        all_speech_waveforms: List[np.ndarray] = []
        all_speech_latent_lengths: List[int] = []
        per_segment_is_target: List[bool] = []

        for ex in features:
            text: str = ex.get(self.text_field, "")
            voice_prompts: Optional[List[Union[str, np.ndarray, torch.Tensor]]] = ex.get(self.voice_prompts_field)
            target_audio: Union[str, np.ndarray, torch.Tensor, Dict[str, Any]] = ex.get(self.audio_field)

            # Clamp drop rate for safety
            _drop_rate = self.voice_prompt_drop_rate
            if _drop_rate < 0.0:
                _drop_rate = 0.0
            elif _drop_rate > 1.0:
                _drop_rate = 1.0

            proc = self.processor(
                text=[text],
                voice_samples=[voice_prompts] if voice_prompts is not None and random.random() >= _drop_rate else None,
                padding=False,
                truncation=False,
                max_length=self.max_length,
                return_tensors="pt",
            )

            ids = proc["input_ids"][0].tolist()
            attn = proc.get("attention_mask", torch.ones_like(proc["input_ids"]))[0].tolist()
            speech_input_mask = proc.get("speech_input_mask")
            if speech_input_mask is None:
                speech_input_mask = torch.zeros_like(proc["input_ids"], dtype=torch.bool)
            speech_input_mask_list = speech_input_mask[0].tolist()

            wav_target = _load_audio_to_24k(target_audio, target_sr=24000)
            # Prefer exact frame count from acoustic tokenizer if available; fallback to compress ratio
            target_latent_len = None
            try:
                acoustic_tok = getattr(self.processor, "acoustic_tokenizer", None)
                if acoustic_tok is not None and hasattr(acoustic_tok, "encode"):
                    enc_out = acoustic_tok.encode(wav_target)
                    # Normalize various possible return formats to get the time dimension
                    T = None
                    try:
                        # Direct array-like with shape (T, D) or (T,)
                        if hasattr(enc_out, "shape") and len(getattr(enc_out, "shape", [])) >= 1:
                            T = int(enc_out.shape[0])
                        else:
                            # Nested lists/tuples or ModelOutput-like
                            cand = enc_out
                            # Drill down a couple of levels safely
                            for _ in range(2):
                                if isinstance(cand, (list, tuple)) and len(cand) > 0:
                                    cand = cand[0]
                            if hasattr(cand, "shape") and len(getattr(cand, "shape", [])) >= 1:
                                T = int(cand.shape[0])
                    except Exception:
                        T = None
                    if T is not None and T > 0:
                        target_latent_len = T
            except Exception:
                target_latent_len = None
            if target_latent_len is None:
                target_latent_len = max(1, int(math.ceil(len(wav_target) / float(self.speech_compress_ratio))))

            speech_diff_id = self.processor.tokenizer.speech_diffusion_id
            target_placeholders = [speech_diff_id] * target_latent_len

            ids_extended = ids + target_placeholders
            attn_extended = attn + [1] * target_latent_len

            acoustic_input_mask = speech_input_mask_list + [True] * target_latent_len
            acoustic_loss_mask = ([False] * len(speech_input_mask_list)) + [True] * target_latent_len

            # Add the speech_end_id token
            speech_end_id = self.processor.tokenizer.speech_end_id
            ids_extended.append(speech_end_id)
            attn_extended.append(1)
            acoustic_input_mask.append(False)
            acoustic_loss_mask.append(False)

            # FIXED: Add the actual EOS token after speech_end_id to properly terminate generation
            eos_token_id = self.processor.tokenizer.eos_token_id
            ids_extended.append(eos_token_id)
            attn_extended.append(1)
            acoustic_input_mask.append(False)
            acoustic_loss_mask.append(False)

            if self.max_length is not None and len(ids_extended) > self.max_length:
                cut = len(ids_extended) - int(self.max_length)
                leading_non_acoustic = 0
                for v in acoustic_input_mask:
                    if v:
                        break
                    leading_non_acoustic += 1
                if cut > leading_non_acoustic:
                    raise ValueError(
                        f"--max_length={self.max_length} would truncate into acoustic tokens. "
                        f"Needed cut={cut}, but only {leading_non_acoustic} leading non-acoustic tokens available. "
                        "Increase max_length or shorten text/voice-prompt preamble."
                    )
                ids_extended = ids_extended[cut:]
                attn_extended = attn_extended[cut:]
                acoustic_input_mask = acoustic_input_mask[cut:]
                acoustic_loss_mask = acoustic_loss_mask[cut:]

            sample_input_ids.append(ids_extended)
            sample_attention_masks.append(attn_extended)
            sample_acoustic_input_masks.append(acoustic_input_mask)
            sample_acoustic_loss_masks.append(acoustic_loss_mask)

            voice_speeches = []
            voice_latent_lengths = []
            if proc.get("speech_tensors") is not None:
                voice_np = proc["speech_tensors"].cpu().numpy()
                voice_masks = proc["speech_masks"].cpu().numpy().astype(bool)
                for seg_idx in range(voice_np.shape[0]):
                    voice_speeches.append(voice_np[seg_idx])
                    voice_latent_lengths.append(int(voice_masks[seg_idx].sum()))

            all_speech_waveforms.extend(voice_speeches)
            all_speech_latent_lengths.extend(voice_latent_lengths)
            per_segment_is_target.extend([False] * len(voice_speeches))

            all_speech_waveforms.append(wav_target)
            all_speech_latent_lengths.append(target_latent_len)
            per_segment_is_target.append(True)

        max_seq_len = max(len(x) for x in sample_input_ids)
        padded_input_ids = []
        padded_attention_masks = []
        padded_acoustic_input_masks = []
        padded_acoustic_loss_masks = []
        tok = self.processor.tokenizer
        pad_token_id = getattr(tok, "pad_token_id", None)
        if pad_token_id is None or pad_token_id < 0:
            pad_token_id = getattr(tok, "eos_token_id", None)
        if pad_token_id is None or pad_token_id < 0:
            raise ValueError(
                "Tokenizer has no pad_token_id or eos_token_id; please set one or pass a valid pad id."
            )
        for ids, attn, ain_mask, aloss_mask in zip(
            sample_input_ids, sample_attention_masks, sample_acoustic_input_masks, sample_acoustic_loss_masks
        ):
            pad_len = max_seq_len - len(ids)
            padded_input_ids.append(ids + [pad_token_id] * pad_len)
            padded_attention_masks.append(attn + [0] * pad_len)
            padded_acoustic_input_masks.append(ain_mask + [False] * pad_len)
            padded_acoustic_loss_masks.append(aloss_mask + [False] * pad_len)

        input_ids_tensor = torch.tensor(padded_input_ids, dtype=torch.long)
        attention_mask_tensor = torch.tensor(padded_attention_masks, dtype=torch.long)
        acoustic_input_mask_tensor = torch.tensor(padded_acoustic_input_masks, dtype=torch.bool)
        acoustic_loss_mask_tensor = torch.tensor(padded_acoustic_loss_masks, dtype=torch.bool)

        if all_speech_waveforms:
            max_wave_len = max(w.shape[0] for w in all_speech_waveforms)
            padded_speeches = np.zeros((len(all_speech_waveforms), max_wave_len), dtype=np.float32)
            for i, w in enumerate(all_speech_waveforms):
                L = w.shape[0]
                padded_speeches[i, :L] = w

            max_latent_len = max(all_speech_latent_lengths) if all_speech_latent_lengths else 1
            speech_masks_np = np.zeros((len(all_speech_waveforms), max_latent_len), dtype=np.bool_)
            for i, L_lat in enumerate(all_speech_latent_lengths):
                speech_masks_np[i, :L_lat] = True

            speech_tensors_tensor = torch.tensor(padded_speeches, dtype=torch.float32)
            speech_masks_tensor = torch.tensor(speech_masks_np, dtype=torch.bool)

            speeches_loss_input_np = np.zeros_like(speech_masks_np, dtype=np.bool_)
            for i, is_target in enumerate(per_segment_is_target):
                if is_target:
                    speeches_loss_input_np[i] = speech_masks_np[i]
            speeches_loss_input_tensor = torch.tensor(speeches_loss_input_np, dtype=torch.bool)

            # Semantic features
            if self.compute_semantics and hasattr(self.processor, "semantic_tokenizer") and self.processor.semantic_tokenizer is not None:
                sem_feats: List[np.ndarray] = []
                for w in all_speech_waveforms:
                    try:
                        # Expect [T, D] where T ≈ ceil(len(w)/compress_ratio)
                        sem = self.processor.semantic_tokenizer.encode(w)
                        sem = np.asarray(sem, dtype=np.float32)
                    except Exception:
                        sem = np.zeros((0, self.semantic_vae_dim), dtype=np.float32)
                    if sem.ndim != 2:
                        raise RuntimeError(f"Semantic tokenizer returned unexpected shape {sem.shape}. Expect [T, D].")
                    L = sem.shape[0]
                    D = sem.shape[1]
                    if D != self.semantic_vae_dim:
                        if D < self.semantic_vae_dim:
                            pad_d = np.zeros((L, self.semantic_vae_dim - D), dtype=np.float32)
                            sem = np.concatenate([sem, pad_d], axis=1)
                        else:
                            sem = sem[:, : self.semantic_vae_dim]
                    if L < max_latent_len:
                        pad = np.zeros((max_latent_len - L, self.semantic_vae_dim), dtype=np.float32)
                        sem = np.concatenate([sem, pad], axis=0)
                    elif L > max_latent_len:
                        sem = sem[:max_latent_len]
                    sem_feats.append(sem.astype(np.float32))
                speech_semantic_tensors = torch.tensor(np.stack(sem_feats, axis=0), dtype=torch.float32)
            else:
                # Semantic tokenizer unavailable while semantics are required for training.
                # Raise to avoid silently degrading alignment with zeroed features.
                raise RuntimeError(
                    "Semantic features are required but could not be computed. "
                    "Ensure processor.semantic_tokenizer is available or precompute and provide features."
                )
        else:
            speech_tensors_tensor = None
            speech_masks_tensor = None
            speeches_loss_input_tensor = None
            speech_semantic_tensors = None  # No segments in batch

        if self.debug_checks:
            assert (input_ids_tensor >= 0).all(), "input_ids contains negative indices"
            if speech_tensors_tensor is not None:
                assert speech_tensors_tensor.dim() == 2, "Expected speech_tensors 2D [segments, samples]"

        return {
            "input_ids": input_ids_tensor,
            "attention_mask": attention_mask_tensor,
            "speech_tensors": speech_tensors_tensor,
            "speech_masks": speech_masks_tensor,
            "speech_semantic_tensors": speech_semantic_tensors,
            "acoustic_input_mask": acoustic_input_mask_tensor,
            "acoustic_loss_mask": acoustic_loss_mask_tensor,
            "speeches_loss_input": speeches_loss_input_tensor,
        }
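Not part of the upload, but for orientation: a minimal sketch of how this collator might be wired into a training loop, assuming a loaded `VibeVoiceProcessor` and a dataset of `{"text", "audio"}` records (`processor` and `train_dataset` are illustrative names created elsewhere):

```python
from torch.utils.data import DataLoader

# Hypothetical wiring; values mirror finetune_elise_single_speaker.sh.
collator = VibeVoiceCollator(
    processor=processor,           # a loaded VibeVoiceProcessor (assumption)
    max_length=4096,               # matches --max_length in the training script
    voice_prompt_drop_rate=1.0,    # single-speaker mode: never condition on voice prompts
    compute_semantics=True,        # the collator raises if semantics cannot be computed
)
loader = DataLoader(train_dataset, batch_size=4, collate_fn=collator)
batch = next(iter(loader))         # dict with input_ids, speech_tensors, masks, ...
```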
detect_audio_cutoffs.py
ADDED
import os
import json
import numpy as np
import soundfile as sf
from tqdm import tqdm

def check_abrupt_ending(audio_path, threshold_db=-35):
    """Check if audio ends abruptly (significant energy in the final 100 ms)."""
    try:
        audio, sr = sf.read(audio_path)
        if len(audio.shape) > 1:
            audio = audio.mean(axis=1)

        # Check the last 100 ms of audio
        last_100ms = int(0.1 * sr)
        final_segment = audio[-last_100ms:]

        # Calculate RMS energy in dB
        rms = np.sqrt(np.mean(final_segment**2))
        db = 20 * np.log10(rms + 1e-10)

        # If the final segment still has significant energy, the clip is likely cut off
        return db > threshold_db, db
    except Exception:
        return False, -100

print("Analyzing audio endings...")

# Load dataset
with open("jinsaryko_elise_formatted/elise_train.jsonl", 'r') as f:
    data = [json.loads(line) for line in f]

clean_samples = []
abrupt_samples = []

for item in tqdm(data):
    is_abrupt, energy = check_abrupt_ending(item['audio'])

    if is_abrupt:
        abrupt_samples.append((item, energy))
    else:
        clean_samples.append(item)

print("\nResults:")
print(f"Clean endings: {len(clean_samples)} ({100*len(clean_samples)/len(data):.1f}%)")
print(f"Abrupt cutoffs: {len(abrupt_samples)} ({100*len(abrupt_samples)/len(data):.1f}%)")

# Show the worst examples
print("\nWorst cutoffs (highest final energy):")
abrupt_samples.sort(key=lambda x: x[1], reverse=True)
for item, energy in abrupt_samples[:5]:
    print(f"  {os.path.basename(item['audio'])}: {energy:.1f} dB")
    print(f"    Text: {item['text'][:60]}...")

# Save the cleaned dataset
os.makedirs('elise_cleaned', exist_ok=True)

with open('elise_cleaned/train.jsonl', 'w') as f:
    for item in clean_samples:
        f.write(json.dumps(item) + '\n')

print(f"\nSaved {len(clean_samples)} clean samples to elise_cleaned/train.jsonl")

# Create a validation split (at least one sample, so the slices below stay valid)
val_size = max(1, int(0.05 * len(clean_samples)))
train_data = clean_samples[:-val_size]
val_data = clean_samples[-val_size:]

with open('elise_cleaned/train_split.jsonl', 'w') as f:
    for item in train_data:
        f.write(json.dumps(item) + '\n')

with open('elise_cleaned/val.jsonl', 'w') as f:
    for item in val_data:
        f.write(json.dumps(item) + '\n')

print(f"Split into {len(train_data)} train and {len(val_data)} validation samples")
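For intuition on the −35 dB threshold: a clip whose final 100 ms has RMS amplitude 0.05 scores 20·log10(0.05) ≈ −26 dB, which is above the threshold and so gets flagged as an abrupt ending, while a near-silent tail at RMS 0.005 scores ≈ −46 dB and passes.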
finetune_elise_single_speaker.sh
ADDED
#!/bin/bash

# Single-speaker fine-tuning script for VibeVoice-1.5B on the CLEANED Elise dataset
# No voice prompts - pure text-to-speech for the Elise voice only
# WITH PROPER EOS TOKEN to fix the repetition/looping issue

echo "Single-speaker fine-tuning on cleaned dataset..."
echo "Using 544 clean samples (no cutoffs)"
echo "NO voice prompts - training pure Elise TTS model"

python -m src.finetune_vibevoice_lora \
    --model_name_or_path . \
    --train_jsonl elise_cleaned/train_split.jsonl \
    --validation_jsonl elise_cleaned/val.jsonl \
    --text_column_name text \
    --audio_column_name audio \
    --output_dir finetune_elise_single_speaker \
    --per_device_train_batch_size 4 \
    --gradient_accumulation_steps 8 \
    --learning_rate 2.5e-5 \
    --num_train_epochs 4 \
    --logging_steps 10 \
    --save_steps 100 \
    --eval_steps 100 \
    --report_to none \
    --remove_unused_columns False \
    --bf16 True \
    --do_train \
    --do_eval \
    --gradient_clipping \
    --gradient_checkpointing False \
    --ddpm_batch_mul 2 \
    --diffusion_loss_weight 1.4 \
    --train_diffusion_head True \
    --ce_loss_weight 0.04 \
    --voice_prompt_drop_rate 1.0 \
    --lora_target_modules q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj \
    --lr_scheduler_type cosine \
    --warmup_ratio 0.03 \
    --max_grad_norm 0.8 \
    --max_length 4096

echo "Single-speaker fine-tuning complete!"
echo "Model will now generate the Elise voice from text only - no voice prompts needed!"
prepare_jinsaryko_elise_dataset.py
ADDED
import json
import os
from datasets import load_dataset
import soundfile as sf
import numpy as np
from tqdm import tqdm

# Create output directories
os.makedirs("jinsaryko_elise_formatted", exist_ok=True)
os.makedirs("jinsaryko_elise_formatted/wavs", exist_ok=True)

# Load the Jinsaryko/Elise dataset
print("Loading Jinsaryko/Elise dataset...")
dataset = load_dataset("Jinsaryko/Elise")

# Since it's a single-speaker dataset, we'll train with voice_prompt_drop_rate=1.0 as recommended,
# but we still need to format the text with the "Speaker 0:" prefix.

jsonl_data = []

print("Processing audio files...")
for idx, sample in enumerate(tqdm(dataset['train'])):
    # Format text with the "Speaker 0:" prefix
    original_text = sample['text']
    formatted_text = f"Speaker 0: {original_text}"

    # Extract the audio array and sampling rate
    audio = sample['audio']
    audio_array = audio['array']
    sampling_rate = audio['sampling_rate']

    # Save as a WAV file
    audio_filename = f"jinsaryko_elise_formatted/wavs/sample_{idx:06d}.wav"
    sf.write(audio_filename, audio_array, sampling_rate)

    # Add to JSONL
    jsonl_data.append({
        "text": formatted_text,
        "audio": os.path.abspath(audio_filename)
    })

# Save the full JSONL file
print("Saving JSONL file...")
with open("jinsaryko_elise_formatted/elise_train.jsonl", "w") as f:
    for entry in jsonl_data:
        f.write(json.dumps(entry) + "\n")

# Create a small validation set (5% of the data, at least one sample)
val_size = max(1, int(0.05 * len(jsonl_data)))
train_data = jsonl_data[:-val_size]
val_data = jsonl_data[-val_size:]

with open("jinsaryko_elise_formatted/elise_train_split.jsonl", "w") as f:
    for entry in train_data:
        f.write(json.dumps(entry) + "\n")

with open("jinsaryko_elise_formatted/elise_val.jsonl", "w") as f:
    for entry in val_data:
        f.write(json.dumps(entry) + "\n")

print("Dataset prepared!")
print(f"Total samples: {len(jsonl_data)}")
print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print("Files saved in jinsaryko_elise_formatted/")
simple_inference.py
ADDED
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), '../src'))

import torch
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
from peft import PeftModel

# Configuration
MODEL_DIR = ".."  # Path to VibeVoice-1.5B directory
LORA_DIR = "../finetune_elise_single_speaker/lora"  # Path to your fine-tuned LoRA weights
OUTPUT_DIR = "output_audio"

def load_model():
    """Load the fine-tuned model"""
    print("Loading model...")

    # Load the base model
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        MODEL_DIR,
        torch_dtype=torch.bfloat16,
        device_map="cuda",
        attn_implementation="flash_attention_2"
    )

    # Load the fine-tuned LoRA weights
    model.model.language_model = PeftModel.from_pretrained(
        model.model.language_model,
        LORA_DIR
    )

    # Load the diffusion head
    diffusion_state = torch.load(f"{LORA_DIR}/diffusion_head_full.bin", map_location="cpu")
    model.model.prediction_head.load_state_dict(diffusion_state)

    # Load the processor
    processor = VibeVoiceProcessor.from_pretrained(f"{MODEL_DIR}/src/vibevoice/processor")

    model.eval()
    model.set_ddpm_inference_steps(num_steps=20)

    return model, processor

def generate_speech(model, processor, text, voice_sample_path=None):
    """Generate speech from text"""

    # Format text with the "Speaker 0:" prefix (required!)
    prompt = f"Speaker 0: {text}"

    # If no voice sample is provided, use a dummy one from the training data.
    # The model ignores it since it was trained with voice_prompt_drop_rate=1.0.
    if voice_sample_path is None:
        # You'll need at least one audio file from the training set
        voice_sample_path = "../elise_cleaned/wavs/sample_000009.wav"

    # Process inputs
    inputs = processor(
        text=[prompt],
        voice_samples=[[voice_sample_path]],
        return_tensors="pt"
    )

    # Move tensors to the GPU
    for k, v in inputs.items():
        if torch.is_tensor(v):
            inputs[k] = v.to("cuda")

    # Generate audio
    outputs = model.generate(
        **inputs,
        cfg_scale=2.0,
        tokenizer=processor.tokenizer,
        generation_config={'do_sample': False},
        verbose=False
    )

    if outputs.speech_outputs and outputs.speech_outputs[0] is not None:
        audio = outputs.speech_outputs[0]

        # Add a small silence padding at the end
        silence = torch.zeros_like(audio[..., :4800])  # 200 ms at 24 kHz
        padded = torch.cat([audio, silence], dim=-1)

        return padded

    return None

def main():
    # Load the model once
    model, processor = load_model()

    # Create the output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Example texts
    texts = [
        "Hello! This is the Elise voice model.",
        "I can generate speech without needing voice samples.",
        "Thank you for using this model!"
    ]

    # Generate speech for each text
    for i, text in enumerate(texts):
        print(f"\nGenerating: {text}")

        audio = generate_speech(model, processor, text)

        if audio is not None:
            output_path = f"{OUTPUT_DIR}/output_{i:02d}.wav"
            processor.save_audio(audio, output_path)

            duration = (audio.shape[-1] - 4800) / 24000  # Subtract padding
            print(f"Saved: {output_path} ({duration:.2f}s)")
        else:
            print("Failed to generate audio")

if __name__ == "__main__":
    main()
test_fixed_eos_dummy_voice.py
ADDED
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))

import torch
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
from peft import PeftModel
import json

print("Loading fixed EOS model...")
model = VibeVoiceForConditionalGenerationInference.from_pretrained(
    ".",
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    attn_implementation="flash_attention_2"
)

# Load the fixed EOS model weights
model.model.language_model = PeftModel.from_pretrained(
    model.model.language_model,
    "finetune_elise_cleaned_fixed_eos/lora"
)

diffusion_state = torch.load("finetune_elise_cleaned_fixed_eos/lora/diffusion_head_full.bin", map_location="cpu")
model.model.prediction_head.load_state_dict(diffusion_state)

processor = VibeVoiceProcessor.from_pretrained("src/vibevoice/processor")
model.eval()

# Use optimal settings
model.set_ddpm_inference_steps(num_steps=20)

# Get a dummy voice sample to satisfy the model architecture.
# Since we trained with voice_prompt_drop_rate=1.0, it shouldn't affect the output.
with open("elise_cleaned/train_split.jsonl", 'r') as f:
    voice_data = json.loads(f.readline())
    dummy_voice_path = voice_data['audio']

print(f"\nUsing dummy voice (ignored due to training): {os.path.basename(dummy_voice_path)}")
print("Testing fixed EOS model (should stop properly)...\n")

# Test sentences
test_sentences = [
    "Hello! This model should stop properly without repeating.",
    "The EOS token fix should prevent any looping issues.",
    "I can speak clearly without repetition.",
    "This is so much better than the repetitive version!",
    "Let's test a longer sentence to make sure it completes the entire thought without cutting off or repeating at the end.",
    "Wow, I'm really excited to see if this works!",
    "No more saying things twice, twice, twice!",
    "The weather today is absolutely beautiful, isn't it?"
]

os.makedirs("test_fixed_eos_results", exist_ok=True)

for i, text in enumerate(test_sentences):
    print(f"\n[{i+1}/{len(test_sentences)}] {text}")

    prompt = f"Speaker 0: {text}"

    # Use a dummy voice to satisfy the model architecture; it shouldn't affect
    # the output since the model was trained with voice_prompt_drop_rate=1.0.
    inputs = processor(
        text=[prompt],
        voice_samples=[[dummy_voice_path]],
        return_tensors="pt"
    )

    for k, v in inputs.items():
        if torch.is_tensor(v):
            inputs[k] = v.to("cuda")

    outputs = model.generate(
        **inputs,
        cfg_scale=2.0,
        tokenizer=processor.tokenizer,
        generation_config={'do_sample': False},
        verbose=False
    )

    if outputs.speech_outputs and outputs.speech_outputs[0] is not None:
        audio = outputs.speech_outputs[0]

        # Save with light padding
        silence = torch.zeros_like(audio[..., :4800])  # 200 ms padding
        padded = torch.cat([audio, silence], dim=-1)

        output_path = f"test_fixed_eos_results/test_{i:02d}.wav"
        processor.save_audio(padded, output_path)

        duration = audio.shape[-1] / 24000
        print(f"  ✓ Generated {duration:.2f}s → {output_path}")

print("\n" + "="*60)
print("Fixed EOS model test complete!")
print("Files saved in test_fixed_eos_results/")
print("\nKey points:")
print("- Used dummy voice sample (ignored by model)")
print("- Model trained with voice_prompt_drop_rate=1.0")
print("- Should stop properly at EOS token")
print("- No repetition or looping")
print("="*60)