"""
V6 Config — Encoder-Decoder TTS with MioCodec + Speaker Embedding
==================================================================
Vocab layout:
    [0..8]       = 9 special tokens
    [9..154]     = ~146 text chars (BG + EN + digits + punct)
    [155..12954] = 12,800 audio tokens (MioCodec, 1 codebook)
    Total        = 12,955

Architecture:
    Encoder: 4L bidirectional, d=384, 6 heads → text understanding
    Decoder: 8L causal + cross-attention, d=384, 6 heads → audio generation
    Speaker: 128-dim global_embedding → Linear(128, 384) → added to decoder

Key differences from V5:
    - MioCodec (25fps, 1CB, 12800) instead of NanoCodec (12.5fps, 4CB, 16128)
    - d=384 for both encoder and decoder (V5: enc=512, dec=768)
    - 8 decoder layers (V5: 18)
    - Speaker embedding injection (V5: discrete speaker tokens)
    - max_text=256, max_audio=512 (V5: 512/2048)
    - ~40M params (V5: 250M)
    - Expected RTF ~0.15-0.25 (V5: 1.1)
"""
# --- Codec (MioCodec) ---
CODEC_MODEL_NAME = "Aratako/MioCodec-25Hz-24kHz"
CODEC_SAMPLE_RATE = 24_000      # Hz
CODEC_NUM_CODEBOOKS = 1
CODEC_CODEBOOK_SIZE = 12_800    # number of distinct codes in the single codebook
CODEC_FRAME_RATE = 25.0         # codec frames per second
CODEC_TOKENS_PER_SEC = 25       # one token per frame at 25 frames/s
TOKENS_PER_FRAME = 1
SPEAKER_EMB_DIM = 128           # width of the global speaker embedding
# --- Text character inventory ---
# Bulgarian alphabet (30 letters), lower and upper case.
BG_LOWER = "абвгдежзийклмнопрстуфхцчшщъьюя"
BG_UPPER = "АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ"
EN_LOWER = "abcdefghijklmnopqrstuvwxyz"
EN_UPPER = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
DIGITS = "0123456789"
# NOTE(review): this file was recovered from a mis-decoded (mojibake) copy.
# The ellipsis (U+2026) and the guillemets are unambiguous; the en dash
# (U+2013), em dash (U+2014) and low double quote (U+201E) are reconstructed
# and should be confirmed against the vocabulary of any existing checkpoint.
# The repeated ASCII '"' is harmless: duplicates are dropped below.
PUNCT = '.,!?;:-–—…"\'()[]{}«»„"" '
EXTRA = "\n\t"

# Ordered, de-duplicated list of every supported character. The first
# occurrence wins, so a character's position (and hence its token ID) is
# determined by the order of the source strings above.
_ALL_CHARS: list[str] = []
_seen: set[str] = set()
for _src in [BG_LOWER, BG_UPPER, EN_LOWER, EN_UPPER, DIGITS, PUNCT, EXTRA]:
    for _ch in _src:
        if _ch not in _seen:
            _ALL_CHARS.append(_ch)
            _seen.add(_ch)
# --- Special tokens (occupy the first IDs of the vocabulary) ---
SPECIAL_TOKENS = {
    "<pad>": 0,
    "<start_of_text>": 1,
    "<end_of_text>": 2,
    "<start_of_speech>": 3,
    "<end_of_speech>": 4,
    "<spk_0>": 5,
    "<spk_1>": 6,
    "<spk_2>": 7,
    "<spk_3>": 8,
}
NUM_SPECIAL_TOKENS = len(SPECIAL_TOKENS)
# --- Vocabulary layout: [ special | text | audio ] ---
TEXT_CHARS = _ALL_CHARS
TEXT_VOCAB_SIZE = len(TEXT_CHARS)
TEXT_OFFSET = NUM_SPECIAL_TOKENS                # text IDs start right after the special tokens
AUDIO_OFFSET = TEXT_OFFSET + TEXT_VOCAB_SIZE    # audio IDs start right after the text IDs
NUM_AUDIO_TOKENS = CODEC_CODEBOOK_SIZE
TOTAL_VOCAB_SIZE = AUDIO_OFFSET + NUM_AUDIO_TOKENS

# Per-module vocabulary sizes: the encoder size stops at the first audio ID
# (special + text only); the decoder size spans the whole vocabulary.
ENCODER_VOCAB_SIZE = AUDIO_OFFSET
DECODER_VOCAB_SIZE = TOTAL_VOCAB_SIZE
# --- Convenience aliases for the special-token IDs ---
PAD_TOKEN_ID = SPECIAL_TOKENS["<pad>"]
START_OF_TEXT_TOKEN_ID = SPECIAL_TOKENS["<start_of_text>"]
END_OF_TEXT_TOKEN_ID = SPECIAL_TOKENS["<end_of_text>"]
START_OF_SPEECH_TOKEN_ID = SPECIAL_TOKENS["<start_of_speech>"]
END_OF_SPEECH_TOKEN_ID = SPECIAL_TOKENS["<end_of_speech>"]
SPK_0_TOKEN_ID = SPECIAL_TOKENS["<spk_0>"]
SPK_1_TOKEN_ID = SPECIAL_TOKENS["<spk_1>"]
# Aliases for the remaining speaker tokens, so every entry of SPECIAL_TOKENS
# has a matching constant.
SPK_2_TOKEN_ID = SPECIAL_TOKENS["<spk_2>"]
SPK_3_TOKEN_ID = SPECIAL_TOKENS["<spk_3>"]
def audio_token_id(code: int) -> int:
    """MioCodec code → global token ID.

    Shifts ``code`` by ``AUDIO_OFFSET``. No range check is performed, so the
    result is only a valid audio token ID if ``code`` is a valid codec code.
    """
    return AUDIO_OFFSET + code
|
|
def decode_audio_token(token_id: int) -> int:
    """Global token ID → MioCodec code.

    Inverse of ``audio_token_id``: subtracts ``AUDIO_OFFSET``. No range check
    is performed; use ``is_audio_token`` first if the ID may not be audio.
    """
    return token_id - AUDIO_OFFSET
|
|
def is_audio_token(token_id: int) -> bool:
    """Return True if ``token_id`` lies in the audio range of the vocabulary.

    The audio range is the half-open interval
    ``[AUDIO_OFFSET, AUDIO_OFFSET + NUM_AUDIO_TOKENS)``.
    """
    return AUDIO_OFFSET <= token_id < AUDIO_OFFSET + NUM_AUDIO_TOKENS
|
|
def is_special_token(token_id: int) -> bool:
    """Return True if ``token_id`` lies in ``[0, NUM_SPECIAL_TOKENS)``."""
    return 0 <= token_id < NUM_SPECIAL_TOKENS
|
|
def is_text_token(token_id: int) -> bool:
    """Return True if ``token_id`` lies in ``[TEXT_OFFSET, AUDIO_OFFSET)``."""
    return TEXT_OFFSET <= token_id < AUDIO_OFFSET
# --- Encoder hyperparameters ---
ENC_D_MODEL = 384
ENC_N_HEADS = 6         # 384 / 6 = 64 dimensions per head
ENC_N_LAYERS = 4
ENC_D_FF = 1536         # 4 x ENC_D_MODEL

# --- Decoder hyperparameters ---
DEC_D_MODEL = 384
DEC_N_HEADS = 6         # 384 / 6 = 64 dimensions per head
DEC_N_LAYERS = 8
DEC_D_FF = 1536         # 4 x DEC_D_MODEL

# --- Sequence limits and regularisation ---
MAX_TEXT_LEN = 256
MAX_AUDIO_LEN = 512
DROPOUT = 0.0
# --- Training hyperparameters ---
BATCH_SIZE = 16
GRAD_ACCUM = 4          # presumably gradient-accumulation steps; confirm against the training loop
LR = 3e-4
WEIGHT_DECAY = 0.1
WARMUP_STEPS = 1000
NUM_EPOCHS = 5
if __name__ == "__main__":
    # When run as a script, print a summary of the vocabulary layout and the
    # model configuration defined in this module.
    print("V6 Vocab Layout:")
    print(f" Special: [0, {NUM_SPECIAL_TOKENS-1}] ({NUM_SPECIAL_TOKENS} tokens)")
    print(f" Text: [{TEXT_OFFSET}, {AUDIO_OFFSET-1}] ({TEXT_VOCAB_SIZE} chars)")
    print(f" Audio: [{AUDIO_OFFSET}, {TOTAL_VOCAB_SIZE-1}] ({NUM_AUDIO_TOKENS} tokens)")
    print(f" TOTAL: {TOTAL_VOCAB_SIZE}")
    print()
    print(f"V6 Encoder: d={ENC_D_MODEL}, heads={ENC_N_HEADS}, L={ENC_N_LAYERS}, ff={ENC_D_FF}")
    print(f"V6 Decoder: d={DEC_D_MODEL}, heads={DEC_N_HEADS}, L={DEC_N_LAYERS}, ff={DEC_D_FF}")
    print(f"V6 Codec: MioCodec {CODEC_FRAME_RATE}fps, {CODEC_NUM_CODEBOOKS}CB × {CODEC_CODEBOOK_SIZE}")
    print(f"V6 Speaker: {SPEAKER_EMB_DIM}-dim global_embedding")
    print(f"V6 Limits: max_text={MAX_TEXT_LEN}, max_audio={MAX_AUDIO_LEN}")
|
|