"""
V6 Config — Encoder-Decoder TTS with MioCodec + Speaker Embedding
==================================================================

Vocab layout:
  [0..8]        = 9 special tokens
  [9..154]      = ~146 text chars (BG + EN + digits + punct)
  [155..12954]  = 12,800 audio tokens (MioCodec, 1 codebook)
  Total         = 12,955

Architecture:
  Encoder: 4L bidirectional, d=384, 6 heads → text understanding
  Decoder: 8L causal + cross-attention, d=384, 6 heads → audio generation
  Speaker: 128-dim global_embedding → Linear(128, 384) → added to decoder

Key differences from V5:
  - MioCodec (25fps, 1CB, 12800) instead of NanoCodec (12.5fps, 4CB, 16128)
  - d=384 for both encoder and decoder (V5: enc=512, dec=768)
  - 8 decoder layers (V5: 18)
  - Speaker embedding injection (V5: discrete speaker tokens)
  - max_text=256, max_audio=512 (V5: 512/2048)
  - ~40M params (V5: 250M)
  - Expected RTF ~0.15-0.25 (V5: 1.1)
"""

# --- Codec (MioCodec) -------------------------------------------------------
CODEC_MODEL_NAME = "Aratako/MioCodec-25Hz-24kHz"
CODEC_SAMPLE_RATE = 24_000  # audio sample rate, Hz
CODEC_NUM_CODEBOOKS = 1
CODEC_CODEBOOK_SIZE = 12_800  # number of distinct codes in the codebook
CODEC_FRAME_RATE = 25.0  # codec frames per second
CODEC_TOKENS_PER_SEC = 25  # = frame rate x tokens per frame
TOKENS_PER_FRAME = 1
SPEAKER_EMB_DIM = 128  # size of the global speaker embedding vector


# --- Text character inventory -----------------------------------------------
# Bulgarian alphabet: 30 letters, lower case then upper case.
BG_LOWER = "абвгдежзийклмнопрстуфхцчшщъьюя"
BG_UPPER = "АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ"
EN_LOWER = "abcdefghijklmnopqrstuvwxyz"
EN_UPPER = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
DIGITS = "0123456789"
# NOTE(review): the non-ASCII punctuation was reconstructed from a
# mis-encoded copy of this file. The two characters after "-" are assumed to
# be an en dash followed by an em dash; confirm identity and order against
# the vocabulary used for training, because character order determines
# token IDs. The repeated '"' near the end is harmless (duplicates are
# dropped below).
PUNCT = '.,!?;:-–—…"\'()[]{}«»„"" '
EXTRA = "\n\t"

# Ordered, de-duplicated list of every text character. dict.fromkeys keeps
# the first occurrence of each character and preserves insertion order, so a
# character's index here is stable as long as the strings above are unchanged.
_ALL_CHARS: list[str] = list(
    dict.fromkeys(
        "".join((BG_LOWER, BG_UPPER, EN_LOWER, EN_UPPER, DIGITS, PUNCT, EXTRA))
    )
)


# --- Special tokens ---------------------------------------------------------
# IDs are contiguous from 0; NUM_SPECIAL_TOKENS is therefore also the first
# free ID after the special-token range.
SPECIAL_TOKENS = {
    "<pad>": 0,
    "<start_of_text>": 1,
    "<end_of_text>": 2,
    "<start_of_speech>": 3,
    "<end_of_speech>": 4,
    "<spk_0>": 5,
    "<spk_1>": 6,
    "<spk_2>": 7,
    "<spk_3>": 8,
}
NUM_SPECIAL_TOKENS = len(SPECIAL_TOKENS)


# --- Vocabulary layout ------------------------------------------------------
# Token IDs are laid out as: [special tokens][text characters][audio codes].
TEXT_CHARS = _ALL_CHARS
TEXT_VOCAB_SIZE = len(TEXT_CHARS)
TEXT_OFFSET = NUM_SPECIAL_TOKENS  # first text-character token ID
AUDIO_OFFSET = TEXT_OFFSET + TEXT_VOCAB_SIZE  # first audio token ID
NUM_AUDIO_TOKENS = CODEC_CODEBOOK_SIZE
TOTAL_VOCAB_SIZE = AUDIO_OFFSET + NUM_AUDIO_TOKENS

# The encoder vocabulary covers special + text IDs only; the decoder
# vocabulary covers the full ID range including audio codes.
ENCODER_VOCAB_SIZE = AUDIO_OFFSET
DECODER_VOCAB_SIZE = TOTAL_VOCAB_SIZE


# --- Convenience aliases for special-token IDs ------------------------------
# Only <spk_0> and <spk_1> have aliases here; <spk_2> and <spk_3> are
# reachable through SPECIAL_TOKENS.
PAD_TOKEN_ID = SPECIAL_TOKENS["<pad>"]
START_OF_TEXT_TOKEN_ID = SPECIAL_TOKENS["<start_of_text>"]
END_OF_TEXT_TOKEN_ID = SPECIAL_TOKENS["<end_of_text>"]
START_OF_SPEECH_TOKEN_ID = SPECIAL_TOKENS["<start_of_speech>"]
END_OF_SPEECH_TOKEN_ID = SPECIAL_TOKENS["<end_of_speech>"]
SPK_0_TOKEN_ID = SPECIAL_TOKENS["<spk_0>"]
SPK_1_TOKEN_ID = SPECIAL_TOKENS["<spk_1>"]


def audio_token_id(code: int) -> int:
    """MioCodec code → global token ID.

    Adds ``AUDIO_OFFSET`` to ``code``. No range check is performed on
    ``code``.
    """
    return AUDIO_OFFSET + code


def decode_audio_token(token_id: int) -> int:
    """Global token ID → MioCodec code.

    Subtracts ``AUDIO_OFFSET`` from ``token_id``. No range check is
    performed, so a non-audio ID yields a negative or out-of-range code.
    """
    return token_id - AUDIO_OFFSET


def is_audio_token(token_id: int) -> bool:
    """Return True if ``token_id`` lies in the audio-code ID range.

    The range is ``[AUDIO_OFFSET, AUDIO_OFFSET + NUM_AUDIO_TOKENS)``.
    """
    return AUDIO_OFFSET <= token_id < AUDIO_OFFSET + NUM_AUDIO_TOKENS


def is_special_token(token_id: int) -> bool:
    """Return True if ``token_id`` lies in ``[0, NUM_SPECIAL_TOKENS)``."""
    return 0 <= token_id < NUM_SPECIAL_TOKENS


def is_text_token(token_id: int) -> bool:
    """Return True if ``token_id`` lies in ``[TEXT_OFFSET, AUDIO_OFFSET)``."""
    return TEXT_OFFSET <= token_id < AUDIO_OFFSET


# --- Encoder ----------------------------------------------------------------
ENC_D_MODEL = 384
ENC_N_HEADS = 6  # 384 / 6 = 64 dims per head
ENC_N_LAYERS = 4
ENC_D_FF = 1536  # 4 x ENC_D_MODEL

# --- Decoder ----------------------------------------------------------------
DEC_D_MODEL = 384
DEC_N_HEADS = 6  # 384 / 6 = 64 dims per head
DEC_N_LAYERS = 8
DEC_D_FF = 1536  # 4 x DEC_D_MODEL

# --- Sequence limits and regularisation -------------------------------------
# NOTE(review): limits are presumably counted in tokens; confirm with the
# data pipeline.
MAX_TEXT_LEN = 256
MAX_AUDIO_LEN = 512
DROPOUT = 0.0


# --- Training ---------------------------------------------------------------
BATCH_SIZE = 16
# NOTE(review): presumably the number of gradient-accumulation steps per
# optimizer step; confirm against the training loop.
GRAD_ACCUM = 4
LR = 3e-4
WEIGHT_DECAY = 0.1
WARMUP_STEPS = 1000
NUM_EPOCHS = 5


if __name__ == "__main__":
    # When run as a script, print a summary of the vocabulary layout and the
    # model configuration defined in this module.
    print("V6 Vocab Layout:")
    print(f" Special: [0, {NUM_SPECIAL_TOKENS-1}] ({NUM_SPECIAL_TOKENS} tokens)")
    print(f" Text: [{TEXT_OFFSET}, {AUDIO_OFFSET-1}] ({TEXT_VOCAB_SIZE} chars)")
    print(f" Audio: [{AUDIO_OFFSET}, {TOTAL_VOCAB_SIZE-1}] ({NUM_AUDIO_TOKENS} tokens)")
    print(f" TOTAL: {TOTAL_VOCAB_SIZE}")
    print()
    print(f"V6 Encoder: d={ENC_D_MODEL}, heads={ENC_N_HEADS}, L={ENC_N_LAYERS}, ff={ENC_D_FF}")
    print(f"V6 Decoder: d={DEC_D_MODEL}, heads={DEC_N_HEADS}, L={DEC_N_LAYERS}, ff={DEC_D_FF}")
    print(f"V6 Codec: MioCodec {CODEC_FRAME_RATE}fps, {CODEC_NUM_CODEBOOKS}CB × {CODEC_CODEBOOK_SIZE}")
    print(f"V6 Speaker: {SPEAKER_EMB_DIM}-dim global_embedding")
    print(f"V6 Limits: max_text={MAX_TEXT_LEN}, max_audio={MAX_AUDIO_LEN}")