Upload 8 files
- README.md +69 -0
- create_cleaned_dataset.py +80 -0
- data_vibevoice.py +370 -0
- detect_audio_cutoffs.py +77 -0
- finetune_elise_single_speaker.sh +44 -0
- prepare_jinsaryko_elise_dataset.py +66 -0
- simple_inference.py +119 -0
- test_fixed_eos_dummy_voice.py +103 -0
README.md
ADDED
# VibeVoice 1.5B Single-Speaker Fine-tuning Guide

This folder contains all the files needed to fine-tune VibeVoice 1.5B for a single speaker (the Elise voice).

## Key Improvements

1. **Fixed EOS Token Issue**: The modified `data_vibevoice.py` appends a proper `<|endoftext|>` token after the generated speech segment, so the model learns to terminate cleanly instead of repeating or looping.
2. **Single-Speaker Training**: Uses `voice_prompt_drop_rate=1.0` to train without voice prompts.
3. **Audio Quality Filter**: Removes training samples whose audio ends in an abrupt cutoff.
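For reference, the collator in `data_vibevoice.py` (included in full below) builds each training sequence as text tokens, then one acoustic placeholder per latent frame, then the two terminator tokens. A minimal sketch of that layout:

```python
# Layout of one training sequence built by VibeVoiceCollator:
#   [text tokens][speech_diffusion_id x T][speech_end_id][eos_token_id]
ids = text_ids + [speech_diff_id] * target_latent_len + [speech_end_id, eos_token_id]
```

Without the trailing `eos_token_id`, the model never sees a termination signal after speech and tends to keep generating, which is the repetition/looping failure this fix addresses.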
## Files Included

- `data_vibevoice.py` - CRITICAL: Modified data collator that adds the EOS token (replaces `src/data_vibevoice.py`)
- `prepare_jinsaryko_elise_dataset.py` - Downloads and prepares the Elise dataset
- `detect_audio_cutoffs.py` - Detects audio files with abrupt endings
- `create_cleaned_dataset.py` - Rebuilds the train/validation JSONL files from the cutoff analysis
- `finetune_elise_single_speaker.sh` - Training script for the single-speaker model
- `simple_inference.py` - Minimal inference script using the fine-tuned weights
- `test_fixed_eos_dummy_voice.py` - Test script for inference

## Quick Start

1. **Prepare the dataset**:
```bash
python prepare_jinsaryko_elise_dataset.py
```

2. **Detect and remove bad audio** (optional but recommended):
```bash
python detect_audio_cutoffs.py
# This creates the elise_cleaned/ folder with good samples only
```

3. **IMPORTANT: Replace the data collator**:
```bash
cp data_vibevoice.py ../src/data_vibevoice.py
```

4. **Train the model**:
```bash
./finetune_elise_single_speaker.sh
```

5. **Test the model**:
```bash
python test_fixed_eos_dummy_voice.py
```

## Training Configuration

Key settings in `finetune_elise_single_speaker.sh`:
- `voice_prompt_drop_rate 1.0` - Always drops voice prompts (single-speaker mode)
- `learning_rate 2.5e-5` - Conservative learning rate
- `ddpm_batch_mul 2` - Diffusion batch multiplier
- `diffusion_loss_weight 1.4` - Diffusion loss weight
- `ce_loss_weight 0.04` - Cross-entropy loss weight
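With `per_device_train_batch_size 4` and `gradient_accumulation_steps 8`, the effective batch size is 4 × 8 = 32 samples per optimizer step on each device.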
## How It Works

1. The model learns to associate "Speaker 0:" with Elise's voice.
2. No voice samples are needed during inference.
3. The proper EOS token ensures clean endings without repetition.

## Dataset Format

The training data should be JSONL with this format:
```json
{"text": "Speaker 0: Hello, this is a test.", "audio": "/path/to/audio.wav"}
```

Note: The "Speaker 0:" prefix is REQUIRED for all text entries.
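A quick way to sanity-check a prepared JSONL file against this format (a minimal sketch; the path is an example):

```python
import json
import os

with open("elise_cleaned/train_split.jsonl") as f:
    for n, line in enumerate(f, 1):
        entry = json.loads(line)
        assert entry["text"].startswith("Speaker 0:"), f"line {n}: missing 'Speaker 0:' prefix"
        assert os.path.isfile(entry["audio"]), f"line {n}: audio file not found"
print("All entries OK")
```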
create_cleaned_dataset.py
ADDED
import json
import os
import shutil
from tqdm import tqdm

# This script creates a cleaned dataset by removing samples with abrupt cutoffs.
# It uses the results from detect_audio_cutoffs.py.

print("Creating cleaned dataset from cutoff analysis...")

# Read the cutoff analysis
with open('audio_cutoff_analysis.json', 'r') as f:
    analysis = json.load(f)

# Get good samples; build a set of paths once for O(1) membership checks
good_samples = analysis['good_samples']
good_paths = {s['audio_path'] for s in good_samples}
print(f"Found {len(good_samples)} good samples out of {analysis['total_samples']} total")

# Create output directories
os.makedirs("elise_cleaned", exist_ok=True)
os.makedirs("elise_cleaned/wavs", exist_ok=True)

train_good = []
val_good = []

# Process the train split
with open("jinsaryko_elise_formatted/elise_train_split.jsonl", 'r') as f:
    for line in tqdm(f, desc="Processing train split"):
        entry = json.loads(line)
        audio_path = entry['audio']

        # Keep only samples flagged as good
        if audio_path in good_paths:
            # Copy the audio file into the cleaned folder
            basename = os.path.basename(audio_path)
            new_audio_path = os.path.abspath(f"elise_cleaned/wavs/{basename}")
            shutil.copy2(audio_path, new_audio_path)

            # Update the entry with the new path
            train_good.append({
                "text": entry['text'],
                "audio": new_audio_path
            })

# Process the validation split
with open("jinsaryko_elise_formatted/elise_val.jsonl", 'r') as f:
    for line in tqdm(f, desc="Processing validation split"):
        entry = json.loads(line)
        audio_path = entry['audio']

        if audio_path in good_paths:
            basename = os.path.basename(audio_path)
            new_audio_path = os.path.abspath(f"elise_cleaned/wavs/{basename}")
            if not os.path.exists(new_audio_path):
                shutil.copy2(audio_path, new_audio_path)

            val_good.append({
                "text": entry['text'],
                "audio": new_audio_path
            })

# Save the cleaned datasets
with open("elise_cleaned/train_split.jsonl", 'w') as f:
    for entry in train_good:
        f.write(json.dumps(entry) + '\n')

with open("elise_cleaned/val.jsonl", 'w') as f:
    for entry in val_good:
        f.write(json.dumps(entry) + '\n')

print("\nCleaned dataset created!")
print(f"Training samples: {len(train_good)}")
print(f"Validation samples: {len(val_good)}")
print("Files saved in elise_cleaned/")
data_vibevoice.py
ADDED
import math
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import numpy as np
import torch
import warnings
import random

try:
    import librosa  # type: ignore
except Exception:  # pragma: no cover
    librosa = None  # Fallback: user must install librosa when using local audio paths

try:
    import resampy  # type: ignore
except Exception:  # pragma: no cover
    resampy = None


def _resample_if_needed(wav: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    if orig_sr == target_sr:
        return wav.astype(np.float32, copy=False)
    if resampy is not None:
        return resampy.resample(wav.astype(np.float32), orig_sr, target_sr)
    if librosa is not None:
        return librosa.resample(y=wav.astype(np.float32), orig_sr=orig_sr, target_sr=target_sr)
    warnings.warn(
        "No resampler available; treating audio as target_sr without resampling. Install resampy or librosa.",
        RuntimeWarning,
    )
    return wav.astype(np.float32, copy=False)


# Lightweight HF-style dataset wrapper (optional). Trainer can also pass raw HF datasets directly.
class VibeVoiceDataset:
    def __init__(
        self,
        dataset: Any,
        text_column: str = "text",
        audio_column: str = "audio",
        voice_prompts_column: Optional[str] = "voice_prompts",
    ) -> None:
        self.dataset = dataset
        self.text_column = text_column
        self.audio_column = audio_column
        self.voice_prompts_column = voice_prompts_column

    def __len__(self) -> int:
        return len(self.dataset)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        item = self.dataset[idx]
        data: Dict[str, Any] = {}
        data["text"] = item[self.text_column]
        data["audio"] = item[self.audio_column]

        user_provided_prompt = None
        if self.voice_prompts_column and self.voice_prompts_column in item:
            user_provided_prompt = item[self.voice_prompts_column]

        if user_provided_prompt:
            # A prompt was provided in the dataset, so we use it.
            if not isinstance(user_provided_prompt, list):
                data["voice_prompts"] = [user_provided_prompt]
            else:
                data["voice_prompts"] = user_provided_prompt
        else:
            # FALLBACK: No prompt provided, so we auto-generate one from the target audio.
            try:
                target_sr = 24000
                wav_array = _load_audio_to_24k(item[self.audio_column], target_sr=target_sr)
                audio_len_seconds = len(wav_array) / target_sr

                min_len_sec = min(5.0, audio_len_seconds / 4.0)
                max_len_sec = min(15.0, audio_len_seconds / 2.0)

                if min_len_sec > max_len_sec:
                    min_len_sec = max_len_sec
                    max_len_sec = min(max_len_sec, audio_len_seconds)

                if max_len_sec > 0.1:
                    prompt_len_sec = random.uniform(min_len_sec, max_len_sec)
                    prompt_len_samples = int(prompt_len_sec * target_sr)

                    max_start_sample = len(wav_array) - prompt_len_samples
                    start_sample = random.randint(0, max_start_sample)

                    prompt_crop = wav_array[start_sample : start_sample + prompt_len_samples]

                    data["voice_prompts"] = [prompt_crop]
                else:
                    data["voice_prompts"] = None

            except Exception as e:
                warnings.warn(f"Could not create voice prompt for item {idx}: {e}")
                data["voice_prompts"] = None
        return data


def _load_audio_to_24k(audio: Union[str, np.ndarray, torch.Tensor, Dict[str, Any]], *, target_sr: int = 24000) -> np.ndarray:
    if isinstance(audio, np.ndarray):
        return audio.astype(np.float32)
    if isinstance(audio, torch.Tensor):
        return audio.detach().cpu().float().numpy()
    if isinstance(audio, str):
        if librosa is None:
            raise RuntimeError("librosa is required to load audio file paths. Please pip install librosa.")
        wav, sr = librosa.load(audio, sr=None, mono=True)
        wav = _resample_if_needed(wav, int(sr), target_sr)
        return wav
    if isinstance(audio, dict) and "array" in audio and "sampling_rate" in audio:
        arr = np.asarray(audio["array"], dtype=np.float32)
        sr = int(audio["sampling_rate"])
        arr = _resample_if_needed(arr, sr, target_sr)
        return arr
    raise ValueError(f"Unsupported audio type: {type(audio)}")


@dataclass
class VibeVoiceCollator:
    processor: Any  # VibeVoiceProcessor
    max_length: Optional[int] = None
    speech_compress_ratio: int = 3200
    semantic_vae_dim: int = 128
    compute_semantics: bool = False
    debug_checks: bool = False

    text_field: str = "text"
    audio_field: str = "audio"
    voice_prompts_field: str = "voice_prompts"
    voice_prompt_drop_rate: float = 0.0

    def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, Any]:
        batch_size = len(features)

        sample_input_ids: List[List[int]] = []
        sample_attention_masks: List[List[int]] = []
        sample_acoustic_input_masks: List[List[bool]] = []
        sample_acoustic_loss_masks: List[List[bool]] = []

        all_speech_waveforms: List[np.ndarray] = []
        all_speech_latent_lengths: List[int] = []
        per_segment_is_target: List[bool] = []

        for ex in features:
            text: str = ex.get(self.text_field, "")
            voice_prompts: Optional[List[Union[str, np.ndarray, torch.Tensor]]] = ex.get(self.voice_prompts_field)
            target_audio: Union[str, np.ndarray, torch.Tensor, Dict[str, Any]] = ex.get(self.audio_field)

            # Clamp drop rate for safety
            _drop_rate = self.voice_prompt_drop_rate
            if _drop_rate < 0.0:
                _drop_rate = 0.0
            elif _drop_rate > 1.0:
                _drop_rate = 1.0

            proc = self.processor(
                text=[text],
                voice_samples=[voice_prompts] if voice_prompts is not None and random.random() >= _drop_rate else None,
                padding=False,
                truncation=False,
                max_length=self.max_length,
                return_tensors="pt",
            )

            ids = proc["input_ids"][0].tolist()
            attn = proc.get("attention_mask", torch.ones_like(proc["input_ids"]))[0].tolist()
            speech_input_mask = proc.get("speech_input_mask")
            if speech_input_mask is None:
                speech_input_mask = torch.zeros_like(proc["input_ids"], dtype=torch.bool)
            speech_input_mask_list = speech_input_mask[0].tolist()

            wav_target = _load_audio_to_24k(target_audio, target_sr=24000)
            # Prefer exact frame count from acoustic tokenizer if available; fallback to compress ratio
            target_latent_len = None
            try:
                acoustic_tok = getattr(self.processor, "acoustic_tokenizer", None)
                if acoustic_tok is not None and hasattr(acoustic_tok, "encode"):
                    enc_out = acoustic_tok.encode(wav_target)
                    # Normalize various possible return formats to get the time dimension
                    T = None
                    try:
                        # Direct array-like with shape (T, D) or (T,)
                        if hasattr(enc_out, "shape") and len(getattr(enc_out, "shape", [])) >= 1:
                            T = int(enc_out.shape[0])
                        else:
                            # Nested lists/tuples or ModelOutput-like
                            cand = enc_out
                            # Drill down a couple of levels safely
                            for _ in range(2):
                                if isinstance(cand, (list, tuple)) and len(cand) > 0:
                                    cand = cand[0]
                            if hasattr(cand, "shape") and len(getattr(cand, "shape", [])) >= 1:
                                T = int(cand.shape[0])
                    except Exception:
                        T = None
                    if T is not None and T > 0:
                        target_latent_len = T
            except Exception:
                target_latent_len = None
            if target_latent_len is None:
                target_latent_len = max(1, int(math.ceil(len(wav_target) / float(self.speech_compress_ratio))))

            speech_diff_id = self.processor.tokenizer.speech_diffusion_id
            target_placeholders = [speech_diff_id] * target_latent_len

            ids_extended = ids + target_placeholders
            attn_extended = attn + [1] * target_latent_len

            acoustic_input_mask = speech_input_mask_list + [True] * target_latent_len
            acoustic_loss_mask = ([False] * len(speech_input_mask_list)) + [True] * target_latent_len

            # Add the speech_end_id token
            speech_end_id = self.processor.tokenizer.speech_end_id
            ids_extended.append(speech_end_id)
            attn_extended.append(1)
            acoustic_input_mask.append(False)
            acoustic_loss_mask.append(False)

            # FIXED: Add the actual EOS token after speech_end_id to properly terminate generation
            eos_token_id = self.processor.tokenizer.eos_token_id
            ids_extended.append(eos_token_id)
            attn_extended.append(1)
            acoustic_input_mask.append(False)
            acoustic_loss_mask.append(False)

            if self.max_length is not None and len(ids_extended) > self.max_length:
                cut = len(ids_extended) - int(self.max_length)
                leading_non_acoustic = 0
                for v in acoustic_input_mask:
                    if v:
                        break
                    leading_non_acoustic += 1
                if cut > leading_non_acoustic:
                    raise ValueError(
                        f"--max_length={self.max_length} would truncate into acoustic tokens. "
                        f"Needed cut={cut}, but only {leading_non_acoustic} leading non-acoustic tokens available. "
                        "Increase max_length or shorten text/voice-prompt preamble."
                    )
                ids_extended = ids_extended[cut:]
                attn_extended = attn_extended[cut:]
                acoustic_input_mask = acoustic_input_mask[cut:]
                acoustic_loss_mask = acoustic_loss_mask[cut:]

            sample_input_ids.append(ids_extended)
            sample_attention_masks.append(attn_extended)
            sample_acoustic_input_masks.append(acoustic_input_mask)
            sample_acoustic_loss_masks.append(acoustic_loss_mask)

            voice_speeches = []
            voice_latent_lengths = []
            if proc.get("speech_tensors") is not None:
                voice_np = proc["speech_tensors"].cpu().numpy()
                voice_masks = proc["speech_masks"].cpu().numpy().astype(bool)
                for seg_idx in range(voice_np.shape[0]):
                    voice_speeches.append(voice_np[seg_idx])
                    voice_latent_lengths.append(int(voice_masks[seg_idx].sum()))

            all_speech_waveforms.extend(voice_speeches)
            all_speech_latent_lengths.extend(voice_latent_lengths)
            per_segment_is_target.extend([False] * len(voice_speeches))

            all_speech_waveforms.append(wav_target)
            all_speech_latent_lengths.append(target_latent_len)
            per_segment_is_target.append(True)

        max_seq_len = max(len(x) for x in sample_input_ids)
        padded_input_ids = []
        padded_attention_masks = []
        padded_acoustic_input_masks = []
        padded_acoustic_loss_masks = []
        tok = self.processor.tokenizer
        pad_token_id = getattr(tok, "pad_token_id", None)
        if pad_token_id is None or pad_token_id < 0:
            pad_token_id = getattr(tok, "eos_token_id", None)
        if pad_token_id is None or pad_token_id < 0:
            raise ValueError(
                "Tokenizer has no pad_token_id or eos_token_id; please set one or pass a valid pad id."
            )
        for ids, attn, ain_mask, aloss_mask in zip(
            sample_input_ids, sample_attention_masks, sample_acoustic_input_masks, sample_acoustic_loss_masks
        ):
            pad_len = max_seq_len - len(ids)
            padded_input_ids.append(ids + [pad_token_id] * pad_len)
            padded_attention_masks.append(attn + [0] * pad_len)
            padded_acoustic_input_masks.append(ain_mask + [False] * pad_len)
            padded_acoustic_loss_masks.append(aloss_mask + [False] * pad_len)

        input_ids_tensor = torch.tensor(padded_input_ids, dtype=torch.long)
        attention_mask_tensor = torch.tensor(padded_attention_masks, dtype=torch.long)
        acoustic_input_mask_tensor = torch.tensor(padded_acoustic_input_masks, dtype=torch.bool)
        acoustic_loss_mask_tensor = torch.tensor(padded_acoustic_loss_masks, dtype=torch.bool)

        if all_speech_waveforms:
            max_wave_len = max(w.shape[0] for w in all_speech_waveforms)
            padded_speeches = np.zeros((len(all_speech_waveforms), max_wave_len), dtype=np.float32)
            for i, w in enumerate(all_speech_waveforms):
                L = w.shape[0]
                padded_speeches[i, :L] = w

            max_latent_len = max(all_speech_latent_lengths) if all_speech_latent_lengths else 1
            speech_masks_np = np.zeros((len(all_speech_waveforms), max_latent_len), dtype=np.bool_)
            for i, L_lat in enumerate(all_speech_latent_lengths):
                speech_masks_np[i, :L_lat] = True

            speech_tensors_tensor = torch.tensor(padded_speeches, dtype=torch.float32)
            speech_masks_tensor = torch.tensor(speech_masks_np, dtype=torch.bool)

            speeches_loss_input_np = np.zeros_like(speech_masks_np, dtype=np.bool_)
            for i, is_target in enumerate(per_segment_is_target):
                if is_target:
                    speeches_loss_input_np[i] = speech_masks_np[i]
            speeches_loss_input_tensor = torch.tensor(speeches_loss_input_np, dtype=torch.bool)

            # Semantic features
            if self.compute_semantics and hasattr(self.processor, "semantic_tokenizer") and self.processor.semantic_tokenizer is not None:
                sem_feats: List[np.ndarray] = []
                for w in all_speech_waveforms:
                    try:
                        # Expect [T, D] where T ≈ ceil(len(w)/compress_ratio)
                        sem = self.processor.semantic_tokenizer.encode(w)
                        sem = np.asarray(sem, dtype=np.float32)
                    except Exception:
                        sem = np.zeros((0, self.semantic_vae_dim), dtype=np.float32)
                    if sem.ndim != 2:
                        raise RuntimeError(f"Semantic tokenizer returned unexpected shape {sem.shape}. Expect [T, D].")
                    L = sem.shape[0]
                    D = sem.shape[1]
                    if D != self.semantic_vae_dim:
                        if D < self.semantic_vae_dim:
                            pad_d = np.zeros((L, self.semantic_vae_dim - D), dtype=np.float32)
                            sem = np.concatenate([sem, pad_d], axis=1)
                        else:
                            sem = sem[:, : self.semantic_vae_dim]
                    if L < max_latent_len:
                        pad = np.zeros((max_latent_len - L, self.semantic_vae_dim), dtype=np.float32)
                        sem = np.concatenate([sem, pad], axis=0)
                    elif L > max_latent_len:
                        sem = sem[:max_latent_len]
                    sem_feats.append(sem.astype(np.float32))
                speech_semantic_tensors = torch.tensor(np.stack(sem_feats, axis=0), dtype=torch.float32)
            else:
                # Semantic tokenizer unavailable while semantics are required for training.
                # Raise to avoid silently degrading alignment with zeroed features.
                raise RuntimeError(
                    "Semantic features are required but could not be computed. "
                    "Ensure processor.semantic_tokenizer is available or precompute and provide features."
                )
        else:
            speech_tensors_tensor = None
            speech_masks_tensor = None
            speeches_loss_input_tensor = None
            speech_semantic_tensors = None  # No segments in batch

        if self.debug_checks:
            assert (input_ids_tensor >= 0).all(), "input_ids contains negative indices"
            if speech_tensors_tensor is not None:
                assert speech_tensors_tensor.dim() == 2, "Expected speech_tensors 2D [segments, samples]"

        return {
            "input_ids": input_ids_tensor,
            "attention_mask": attention_mask_tensor,
            "speech_tensors": speech_tensors_tensor,
            "speech_masks": speech_masks_tensor,
            "speech_semantic_tensors": speech_semantic_tensors,
            "acoustic_input_mask": acoustic_input_mask_tensor,
            "acoustic_loss_mask": acoustic_loss_mask_tensor,
            "speeches_loss_input": speeches_loss_input_tensor,
        }
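Not part of the upload, but for orientation: a minimal sketch of how this collator might be wired into a training loop, assuming a loaded `VibeVoiceProcessor` and a dataset of `{"text", "audio"}` records (`processor` and `train_dataset` are illustrative names created elsewhere):

```python
from torch.utils.data import DataLoader

# Hypothetical wiring; values mirror finetune_elise_single_speaker.sh.
collator = VibeVoiceCollator(
    processor=processor,           # a loaded VibeVoiceProcessor (assumption)
    max_length=4096,               # matches --max_length in the training script
    voice_prompt_drop_rate=1.0,    # single-speaker mode: never condition on voice prompts
    compute_semantics=True,        # the collator raises if semantics cannot be computed
)
loader = DataLoader(train_dataset, batch_size=4, collate_fn=collator)
batch = next(iter(loader))         # dict with input_ids, speech_tensors, masks, ...
```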
detect_audio_cutoffs.py
ADDED
import os
import json
import numpy as np
import soundfile as sf
from tqdm import tqdm

def check_abrupt_ending(audio_path, threshold_db=-35):
    """Check if audio ends abruptly (significant energy in the final 100 ms)."""
    try:
        audio, sr = sf.read(audio_path)
        if len(audio.shape) > 1:
            audio = audio.mean(axis=1)

        # Check the last 100 ms of audio
        last_100ms = int(0.1 * sr)
        final_segment = audio[-last_100ms:]

        # Calculate RMS energy in dB
        rms = np.sqrt(np.mean(final_segment**2))
        db = 20 * np.log10(rms + 1e-10)

        # If the final segment still has significant energy, the clip is likely cut off
        return db > threshold_db, db
    except Exception:
        return False, -100

print("Analyzing audio endings...")

# Load dataset
with open("jinsaryko_elise_formatted/elise_train.jsonl", 'r') as f:
    data = [json.loads(line) for line in f]

clean_samples = []
abrupt_samples = []

for item in tqdm(data):
    is_abrupt, energy = check_abrupt_ending(item['audio'])

    if is_abrupt:
        abrupt_samples.append((item, energy))
    else:
        clean_samples.append(item)

print("\nResults:")
print(f"Clean endings: {len(clean_samples)} ({100*len(clean_samples)/len(data):.1f}%)")
print(f"Abrupt cutoffs: {len(abrupt_samples)} ({100*len(abrupt_samples)/len(data):.1f}%)")

# Show the worst examples
print("\nWorst cutoffs (highest final energy):")
abrupt_samples.sort(key=lambda x: x[1], reverse=True)
for item, energy in abrupt_samples[:5]:
    print(f"  {os.path.basename(item['audio'])}: {energy:.1f} dB")
    print(f"    Text: {item['text'][:60]}...")

# Save the cleaned dataset
os.makedirs('elise_cleaned', exist_ok=True)

with open('elise_cleaned/train.jsonl', 'w') as f:
    for item in clean_samples:
        f.write(json.dumps(item) + '\n')

print(f"\nSaved {len(clean_samples)} clean samples to elise_cleaned/train.jsonl")

# Create a validation split (at least one sample, so the slices below stay valid)
val_size = max(1, int(0.05 * len(clean_samples)))
train_data = clean_samples[:-val_size]
val_data = clean_samples[-val_size:]

with open('elise_cleaned/train_split.jsonl', 'w') as f:
    for item in train_data:
        f.write(json.dumps(item) + '\n')

with open('elise_cleaned/val.jsonl', 'w') as f:
    for item in val_data:
        f.write(json.dumps(item) + '\n')

print(f"Split into {len(train_data)} train and {len(val_data)} validation samples")
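For intuition on the −35 dB threshold: a clip whose final 100 ms has RMS amplitude 0.05 scores 20·log10(0.05) ≈ −26 dB, which is above the threshold and so gets flagged as an abrupt ending, while a near-silent tail at RMS 0.005 scores ≈ −46 dB and passes.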
finetune_elise_single_speaker.sh
ADDED
#!/bin/bash

# Single-speaker fine-tuning script for VibeVoice-1.5B on the CLEANED Elise dataset
# No voice prompts - pure text-to-speech for the Elise voice only
# WITH PROPER EOS TOKEN to fix the repetition/looping issue

echo "Single-speaker fine-tuning on cleaned dataset..."
echo "Using 544 clean samples (no cutoffs)"
echo "NO voice prompts - training pure Elise TTS model"

python -m src.finetune_vibevoice_lora \
    --model_name_or_path . \
    --train_jsonl elise_cleaned/train_split.jsonl \
    --validation_jsonl elise_cleaned/val.jsonl \
    --text_column_name text \
    --audio_column_name audio \
    --output_dir finetune_elise_single_speaker \
    --per_device_train_batch_size 4 \
    --gradient_accumulation_steps 8 \
    --learning_rate 2.5e-5 \
    --num_train_epochs 4 \
    --logging_steps 10 \
    --save_steps 100 \
    --eval_steps 100 \
    --report_to none \
    --remove_unused_columns False \
    --bf16 True \
    --do_train \
    --do_eval \
    --gradient_clipping \
    --gradient_checkpointing False \
    --ddpm_batch_mul 2 \
    --diffusion_loss_weight 1.4 \
    --train_diffusion_head True \
    --ce_loss_weight 0.04 \
    --voice_prompt_drop_rate 1.0 \
    --lora_target_modules q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj \
    --lr_scheduler_type cosine \
    --warmup_ratio 0.03 \
    --max_grad_norm 0.8 \
    --max_length 4096

echo "Single-speaker fine-tuning complete!"
echo "Model will now generate the Elise voice from text only - no voice prompts needed!"
prepare_jinsaryko_elise_dataset.py
ADDED
import json
import os
from datasets import load_dataset
import soundfile as sf
import numpy as np
from tqdm import tqdm

# Create output directories
os.makedirs("jinsaryko_elise_formatted", exist_ok=True)
os.makedirs("jinsaryko_elise_formatted/wavs", exist_ok=True)

# Load the Jinsaryko/Elise dataset
print("Loading Jinsaryko/Elise dataset...")
dataset = load_dataset("Jinsaryko/Elise")

# Since it's a single-speaker dataset, we'll train with voice_prompt_drop_rate=1.0 as recommended,
# but we still need to format the text with the "Speaker 0:" prefix.

jsonl_data = []

print("Processing audio files...")
for idx, sample in enumerate(tqdm(dataset['train'])):
    # Format text with the "Speaker 0:" prefix
    original_text = sample['text']
    formatted_text = f"Speaker 0: {original_text}"

    # Extract the audio array and sampling rate
    audio = sample['audio']
    audio_array = audio['array']
    sampling_rate = audio['sampling_rate']

    # Save as a WAV file
    audio_filename = f"jinsaryko_elise_formatted/wavs/sample_{idx:06d}.wav"
    sf.write(audio_filename, audio_array, sampling_rate)

    # Add to JSONL
    jsonl_data.append({
        "text": formatted_text,
        "audio": os.path.abspath(audio_filename)
    })

# Save the full JSONL file
print("Saving JSONL file...")
with open("jinsaryko_elise_formatted/elise_train.jsonl", "w") as f:
    for entry in jsonl_data:
        f.write(json.dumps(entry) + "\n")

# Create a small validation set (5% of the data, at least one sample)
val_size = max(1, int(0.05 * len(jsonl_data)))
train_data = jsonl_data[:-val_size]
val_data = jsonl_data[-val_size:]

with open("jinsaryko_elise_formatted/elise_train_split.jsonl", "w") as f:
    for entry in train_data:
        f.write(json.dumps(entry) + "\n")

with open("jinsaryko_elise_formatted/elise_val.jsonl", "w") as f:
    for entry in val_data:
        f.write(json.dumps(entry) + "\n")

print("Dataset prepared!")
print(f"Total samples: {len(jsonl_data)}")
print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print("Files saved in jinsaryko_elise_formatted/")
simple_inference.py
ADDED
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), '../src'))

import torch
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
from peft import PeftModel

# Configuration
MODEL_DIR = ".."  # Path to VibeVoice-1.5B directory
LORA_DIR = "../finetune_elise_single_speaker/lora"  # Path to your fine-tuned LoRA weights
OUTPUT_DIR = "output_audio"

def load_model():
    """Load the fine-tuned model"""
    print("Loading model...")

    # Load the base model
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        MODEL_DIR,
        torch_dtype=torch.bfloat16,
        device_map="cuda",
        attn_implementation="flash_attention_2"
    )

    # Load the fine-tuned LoRA weights
    model.model.language_model = PeftModel.from_pretrained(
        model.model.language_model,
        LORA_DIR
    )

    # Load the diffusion head
    diffusion_state = torch.load(f"{LORA_DIR}/diffusion_head_full.bin", map_location="cpu")
    model.model.prediction_head.load_state_dict(diffusion_state)

    # Load the processor
    processor = VibeVoiceProcessor.from_pretrained(f"{MODEL_DIR}/src/vibevoice/processor")

    model.eval()
    model.set_ddpm_inference_steps(num_steps=20)

    return model, processor

def generate_speech(model, processor, text, voice_sample_path=None):
    """Generate speech from text"""

    # Format text with the "Speaker 0:" prefix (required!)
    prompt = f"Speaker 0: {text}"

    # If no voice sample is provided, use a dummy one from the training data.
    # The model ignores it since it was trained with voice_prompt_drop_rate=1.0.
    if voice_sample_path is None:
        # You'll need at least one audio file from the training set
        voice_sample_path = "../elise_cleaned/wavs/sample_000009.wav"

    # Process inputs
    inputs = processor(
        text=[prompt],
        voice_samples=[[voice_sample_path]],
        return_tensors="pt"
    )

    # Move tensors to the GPU
    for k, v in inputs.items():
        if torch.is_tensor(v):
            inputs[k] = v.to("cuda")

    # Generate audio
    outputs = model.generate(
        **inputs,
        cfg_scale=2.0,
        tokenizer=processor.tokenizer,
        generation_config={'do_sample': False},
        verbose=False
    )

    if outputs.speech_outputs and outputs.speech_outputs[0] is not None:
        audio = outputs.speech_outputs[0]

        # Add a small silence padding at the end
        silence = torch.zeros_like(audio[..., :4800])  # 200 ms at 24 kHz
        padded = torch.cat([audio, silence], dim=-1)

        return padded

    return None

def main():
    # Load the model once
    model, processor = load_model()

    # Create the output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Example texts
    texts = [
        "Hello! This is the Elise voice model.",
        "I can generate speech without needing voice samples.",
        "Thank you for using this model!"
    ]

    # Generate speech for each text
    for i, text in enumerate(texts):
        print(f"\nGenerating: {text}")

        audio = generate_speech(model, processor, text)

        if audio is not None:
            output_path = f"{OUTPUT_DIR}/output_{i:02d}.wav"
            processor.save_audio(audio, output_path)

            duration = (audio.shape[-1] - 4800) / 24000  # Subtract padding
            print(f"Saved: {output_path} ({duration:.2f}s)")
        else:
            print("Failed to generate audio")

if __name__ == "__main__":
    main()
test_fixed_eos_dummy_voice.py
ADDED
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))

import torch
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
from peft import PeftModel
import json

print("Loading fixed EOS model...")
model = VibeVoiceForConditionalGenerationInference.from_pretrained(
    ".",
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    attn_implementation="flash_attention_2"
)

# Load the fixed EOS model weights
model.model.language_model = PeftModel.from_pretrained(
    model.model.language_model,
    "finetune_elise_cleaned_fixed_eos/lora"
)

diffusion_state = torch.load("finetune_elise_cleaned_fixed_eos/lora/diffusion_head_full.bin", map_location="cpu")
model.model.prediction_head.load_state_dict(diffusion_state)

processor = VibeVoiceProcessor.from_pretrained("src/vibevoice/processor")
model.eval()

# Use optimal settings
model.set_ddpm_inference_steps(num_steps=20)

# Get a dummy voice sample to satisfy the model architecture.
# Since we trained with voice_prompt_drop_rate=1.0, it shouldn't affect the output.
with open("elise_cleaned/train_split.jsonl", 'r') as f:
    voice_data = json.loads(f.readline())
    dummy_voice_path = voice_data['audio']

print(f"\nUsing dummy voice (ignored due to training): {os.path.basename(dummy_voice_path)}")
print("Testing fixed EOS model (should stop properly)...\n")

# Test sentences
test_sentences = [
    "Hello! This model should stop properly without repeating.",
    "The EOS token fix should prevent any looping issues.",
    "I can speak clearly without repetition.",
    "This is so much better than the repetitive version!",
    "Let's test a longer sentence to make sure it completes the entire thought without cutting off or repeating at the end.",
    "Wow, I'm really excited to see if this works!",
    "No more saying things twice, twice, twice!",
    "The weather today is absolutely beautiful, isn't it?"
]

os.makedirs("test_fixed_eos_results", exist_ok=True)

for i, text in enumerate(test_sentences):
    print(f"\n[{i+1}/{len(test_sentences)}] {text}")

    prompt = f"Speaker 0: {text}"

    # Use a dummy voice to satisfy the model architecture; it shouldn't affect
    # the output since the model was trained with voice_prompt_drop_rate=1.0.
    inputs = processor(
        text=[prompt],
        voice_samples=[[dummy_voice_path]],
        return_tensors="pt"
    )

    for k, v in inputs.items():
        if torch.is_tensor(v):
            inputs[k] = v.to("cuda")

    outputs = model.generate(
        **inputs,
        cfg_scale=2.0,
        tokenizer=processor.tokenizer,
        generation_config={'do_sample': False},
        verbose=False
    )

    if outputs.speech_outputs and outputs.speech_outputs[0] is not None:
        audio = outputs.speech_outputs[0]

        # Save with light padding
        silence = torch.zeros_like(audio[..., :4800])  # 200 ms padding
        padded = torch.cat([audio, silence], dim=-1)

        output_path = f"test_fixed_eos_results/test_{i:02d}.wav"
        processor.save_audio(padded, output_path)

        duration = audio.shape[-1] / 24000
        print(f"  ✓ Generated {duration:.2f}s → {output_path}")

print("\n" + "="*60)
print("Fixed EOS model test complete!")
print("Files saved in test_fixed_eos_results/")
print("\nKey points:")
print("- Used dummy voice sample (ignored by model)")
print("- Model trained with voice_prompt_drop_rate=1.0")
print("- Should stop properly at EOS token")
print("- No repetition or looping")
print("="*60)