File size: 2,831 Bytes
308155b
 
 
 
 
 
67ea4ca
308155b
 
67ea4ca
308155b
67ea4ca
308155b
 
67ea4ca
308155b
 
67ea4ca
308155b
67ea4ca
308155b
 
67ea4ca
 
308155b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67ea4ca
 
 
 
308155b
 
 
 
 
 
 
67ea4ca
308155b
 
 
 
 
 
 
 
 
67ea4ca
308155b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class TrainConfig:
    """Configuration for finetuning the Chatterbox TTS model.

    Every option is a dataclass field, so any of them can be overridden
    per-run through the constructor, e.g. ``TrainConfig(is_turbo=True)``.
    ``new_vocab_size`` is resolved from ``is_turbo`` in ``__post_init__``
    unless an explicit value is supplied.
    """

    # --- Paths ---
    # Directory where setup.py downloaded the files.
    # Using the original pretrained_models directory which now contains
    # the English-only base weights.
    model_dir: str = "./pretrained_models"

    # Path to your metadata CSV (Format: ID|RawText|NormText)
    csv_path: str = "./chatterbox_midtune_cc_data_16k/metadata.csv"

    # Directory containing WAV files
    wav_dir: str = "./chatterbox_midtune_cc_data_16k"

    # Attribution file for speaker-aware splitting
    attribution_path: str = "./chatterbox_midtune_cc_data_16k/attribution.csv"

    # Cache directory for preprocessed data
    preprocessed_dir: str = "./chatterbox_midtune_cc_data_16k/preprocess"

    # Output directory for the finetuned model
    # (named to differentiate from the English-only run)
    output_dir: str = "./chatterbox_output_multilingual"

    # Set True if the dataset format is ljspeech, and False if it's file-based.
    ljspeech: bool = True
    # If you've already done preprocessing once, set it to False.
    preprocess: bool = True

    # Set True if you're training Turbo, False if you're training Standard
    # (multilingual, stronger).
    is_turbo: bool = False

    # --- OOD Evaluation ---
    # These speakers are strictly excluded from training and validation.
    # default_factory keeps the list per-instance instead of shared class state.
    ood_speakers: list = field(
        default_factory=lambda: ["cv-15_11", "cv-15_16", "cv-15_2"]
    )

    # --- Vocabulary ---
    # The size of the NEW vocabulary (from tokenizer.json).
    # Ensure this matches the JSON file generated by your tokenizer script.
    # For Turbo mode: Use the exact number provided by setup.py (e.g., 52260).
    # Left as None and resolved in __post_init__, because a class-body default
    # expression cannot see the per-instance value of is_turbo (the original
    # `52260 if is_turbo else 2454` always evaluated to 2454).
    new_vocab_size: Optional[int] = None

    # --- Hyperparameters ---
    batch_size: int = 16         # Adjust based on VRAM
    grad_accum: int = 2          # Effective batch size = batch_size * grad_accum = 32
    learning_rate: float = 2e-5  # Research-optimized LR with warmup
    num_epochs: int = 5          # Run exactly 5 epochs
    weight_decay: float = 0.05   # Defensive weight decay

    # Training Strategy:
    # Stage 1 (Current): Multi-speaker Finnish -> 3-5 epochs, lower LR
    # Stage 2 (Later):   Single speaker voice clone -> 50-150 epochs, higher LR

    # --- Validation ---
    validation_split: float = 0.05  # 5% of data for validation
    validation_seed: int = 42       # For reproducible train/val split

    # --- Constraints ---
    min_training_duration: float = 4.0  # Filter samples shorter than this
    min_training_snr: float = 35.0      # Filter samples with SNR lower than this
    max_training_snr: float = 100.0     # Filter samples with SNR higher than this (digital artifacts)
    start_text_token: int = 255
    stop_text_token: int = 0
    max_text_len: int = 256
    max_speech_len: int = 1024   # Truncates very long audio
    prompt_duration: float = 3.0 # Duration for the reference prompt (seconds)

    def __post_init__(self):
        # Derive the vocabulary size from the training mode unless the
        # caller supplied an explicit value.
        if self.new_vocab_size is None:
            self.new_vocab_size = 52260 if self.is_turbo else 2454