Hexa09 committed on
Commit
e729286
·
verified ·
1 Parent(s): b38ebdf

Upload folder using huggingface_hub

Browse files
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (123 Bytes). View file
 
src/__pycache__/config.cpython-312.pyc ADDED
Binary file (1.85 kB). View file
 
src/__pycache__/dataset.cpython-312.pyc ADDED
Binary file (4.95 kB). View file
 
src/__pycache__/hf_model.cpython-312.pyc ADDED
Binary file (3.05 kB). View file
 
src/__pycache__/inference.cpython-312.pyc ADDED
Binary file (2.84 kB). View file
 
src/__pycache__/model.cpython-312.pyc ADDED
Binary file (9.34 kB). View file
 
src/__pycache__/test_tiny.cpython-312.pyc ADDED
Binary file (2.74 kB). View file
 
src/__pycache__/text_encoder.cpython-312.pyc ADDED
Binary file (2.35 kB). View file
 
src/__pycache__/train.cpython-312.pyc ADDED
Binary file (3.61 kB). View file
 
src/__pycache__/train_hf.cpython-312.pyc ADDED
Binary file (2.82 kB). View file
 
src/config.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dataclasses import dataclass, field  # NOTE(review): `field` appears unused in this module
from typing import List  # NOTE(review): `List` appears unused in this module


@dataclass
class HexaConfig:
    """
    Configuration for Hexa TTS 5B Model.

    Defaults are tuned so the transformer core lands at roughly 5 billion
    parameters; pass smaller overrides (see src/test_tiny.py) for debugging.
    Instantiation prints a rough size estimate via __post_init__.
    """
    # Model Architecture
    dim: int = 3200         # Hidden size, tuned for ~5B params (4.92B)
    depth: int = 40         # Number of transformer layers
    heads: int = 32         # Number of attention heads
    dim_head: int = 100     # Dimension of each head
    mlp_ratio: float = 4.0  # Feedforward expansion factor
    dropout: float = 0.1

    # Input / Output
    num_languages: int = 15
    vocab_size: int = 256       # Size of phoneme/text vocabulary
    num_speakers: int = 10000   # Embedding slots for speakers
    num_emotions: int = 32      # Distinct emotion categories

    # Audio Settings
    sample_rate: int = 24000
    n_mel_channels: int = 100
    n_fft: int = 1024
    hop_length: int = 256
    win_length: int = 1024

    # Context
    max_text_len: int = 1024
    max_audio_len: int = 4096  # In mel frames

    # Checkpoints
    checkpoint_path: str = "checkpoints/hexa_5b_latest.pt"

    def __post_init__(self):
        # Rough estimate: a standard transformer has ~12 * depth * dim^2
        # weights (attention + MLP), ignoring embeddings and the output head.
        total_params = 12 * self.depth * (self.dim ** 2)
        # Fixed: was an f-string with no placeholders.
        print("Hexa Config initialized.")
        print(f"Approximate Model Size: {total_params / 1e9:.2f} Billion parameters")
src/dataset.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torchaudio
3
+ import os
4
+ from torch.utils.data import Dataset
5
+ from .text_encoder import TextEncoder
6
+ from .config import HexaConfig
7
+
8
class HexaDataset(Dataset):
    """
    Real Dataset Loader for Hexa TTS.

    Expects a directory layout of:
        <root_dir>/wavs/         -- audio clips named "<stem>.wav"
        <root_dir>/metadata.csv  -- lines of "filename|text"
    """
    def __init__(self, root_dir, config: HexaConfig, train=True):
        self.root_dir = root_dir
        self.config = config
        self.encoder = TextEncoder()

        self.wav_dir = os.path.join(root_dir, "wavs")
        self.metadata_path = os.path.join(root_dir, "metadata.csv")

        # Parse "filename|text" rows; extra '|'-separated columns beyond the
        # first two are ignored, matching LJSpeech-style metadata.
        self.files = []
        if os.path.exists(self.metadata_path):
            with open(self.metadata_path, 'r', encoding='utf-8') as meta_file:
                for row in meta_file:
                    columns = row.strip().split('|')
                    if len(columns) >= 2:
                        self.files.append((columns[0], columns[1]))
        else:
            print(f"Warning: Metadata not found at {self.metadata_path}")

        # Waveform -> mel-spectrogram transform, configured from HexaConfig.
        self.mel_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=config.sample_rate,
            n_fft=config.n_fft,
            win_length=config.win_length,
            hop_length=config.hop_length,
            n_mels=config.n_mel_channels,
        )

    def __len__(self):
        """Number of (audio, text) pairs listed in the metadata."""
        return len(self.files)

    def __getitem__(self, idx):
        """Return (text_ids, speaker, lang, emotion, mel) for one sample."""
        stem, transcript = self.files[idx]

        # 1. Load audio; resample to the configured rate when it differs.
        waveform, source_rate = torchaudio.load(os.path.join(self.wav_dir, stem + ".wav"))
        if source_rate != self.config.sample_rate:
            waveform = torchaudio.transforms.Resample(source_rate, self.config.sample_rate)(waveform)

        # 2. Mel spectrogram: [channels, frames] -> [frames, channels]
        mel = self.mel_transform(waveform).squeeze(0).transpose(0, 1)

        # 3. Tokenize text (English assumed for the starter LJSpeech-style set).
        text_ids = self.encoder.preprocess(transcript, lang_code='en').squeeze(0)

        # 4. Single-speaker dataset: fixed speaker / language / emotion ids.
        speaker = torch.tensor(0)
        lang = torch.tensor(0)
        emotion = torch.tensor(0)

        return text_ids, speaker, lang, emotion, mel
72
+
73
def collate_fn(batch):
    """
    Collate (text_ids, speaker, lang, emotion, mel) samples into a padded batch.

    Sorts the batch in place, longest text first (useful for packing), then
    zero-pads text and mel sequences to the longest item.
    """
    # In-place sort, longest text first.
    batch.sort(key=lambda sample: sample[0].shape[0], reverse=True)

    text_ids, speakers, langs, emotions, mels = zip(*batch)

    pad = torch.nn.utils.rnn.pad_sequence
    text_padded = pad(list(text_ids), batch_first=True, padding_value=0)
    mel_padded = pad(list(mels), batch_first=True, padding_value=0.0)

    return (
        text_padded,
        torch.stack(speakers),
        torch.stack(langs),
        torch.stack(emotions),
        mel_padded,
    )
src/hf_model.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers import PreTrainedModel, PretrainedConfig
4
+ from .config import HexaConfig
5
+ # Re-importing the core layers from the existing definition or redefining for cleanliness.
6
+ # To integrate with HF Trainer, we wrap the existing module.
7
+
8
class HexaHFConfig(PretrainedConfig):
    """
    Thin HuggingFace wrapper around the project's HexaConfig.

    Keeps a full HexaConfig instance on the config object (so the core
    transformer can be built from it) while forwarding all kwargs to
    PretrainedConfig for Trainer compatibility.
    """
    # Registry key for the transformers config/model machinery.
    model_type = "hexa_tts"
    def __init__(self, **kwargs):
        # Flatten HexaConfig into kwargs for HF compatibility
        self.hexa_config = HexaConfig()
        # Update with manual kwargs if provided; only keys that already exist
        # on HexaConfig are applied, everything else is left for
        # PretrainedConfig to consume below.
        for k, v in kwargs.items():
            if hasattr(self.hexa_config, k):
                setattr(self.hexa_config, k, v)
        # NOTE(review): hexa_config is a dataclass instance stored as an
        # attribute — confirm it round-trips through
        # PretrainedConfig.to_dict()/save_pretrained as intended.
        super().__init__(**kwargs)
18
+
19
+ from .model import HexaTransformer as CoreTransformer
20
+
21
class HexaModel(PreTrainedModel):
    """
    HuggingFace-compatible wrapper around the core HexaTransformer.

    Takes padded token ids plus per-sample conditioning ids, returns predicted
    mel frames as "logits" and, when `labels` (target mels) are given, an MSE
    "loss" truncated to the shared sequence length.
    """
    config_class = HexaHFConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        # Initialize the core model using the internal HexaConfig
        self.core = CoreTransformer(config.hexa_config)

        # Enable Gradient Checkpointing for memory savings
        self.gradient_checkpointing = False

    def forward(self, text_ids, speaker_ids=None, language_ids=None, emotion_ids=None, labels=None):
        """
        Args:
            text_ids: [batch, seq] token ids.
            speaker_ids / language_ids / emotion_ids: [batch] conditioning ids;
                default to zeros when omitted.
            labels: optional [batch, frames, n_mels] target mel spectrogram.

        Returns:
            dict with "logits" (predicted mels) and "loss" when labels given.
        """
        device = text_ids.device
        batch_size = text_ids.shape[0]
        # BUGFIX: the defaults used torch.zeros_like(text_ids), which has shape
        # [batch, seq] — but the core model expects ONE id per sample ([batch])
        # and broadcasts it over the sequence itself (unsqueeze(1).expand).
        if speaker_ids is None:
            speaker_ids = torch.zeros(batch_size, dtype=torch.long, device=device)
        if language_ids is None:
            language_ids = torch.zeros(batch_size, dtype=torch.long, device=device)
        if emotion_ids is None:
            emotion_ids = torch.zeros(batch_size, dtype=torch.long, device=device)

        # Forward pass
        mels = self.core(text_ids, speaker_ids, language_ids, emotion_ids)

        loss = None
        if labels is not None:
            # labels = target mels; truncate both to the shared length for MSE.
            min_len = min(mels.shape[1], labels.shape[1])
            mels_sliced = mels[:, :min_len, :]
            labels_sliced = labels[:, :min_len, :]
            loss = torch.nn.functional.mse_loss(mels_sliced, labels_sliced)

        return {"loss": loss, "logits": mels} if loss is not None else {"logits": mels}

    def _set_gradient_checkpointing(self, module, value=False):
        # Hook used by transformers to toggle checkpointing on the core module.
        if isinstance(module, CoreTransformer):
            module.gradient_checkpointing = value
57
+
src/inference.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import soundfile as sf
3
+ import os
4
+ from .model import build_model
5
+ from .text_encoder import TextEncoder
6
+ from .config import HexaConfig
7
+
8
def generate_audio(text, output_path, lang='en', speaker_id=0, emotion_id=0):
    """
    Generates audio from text using the Hexa 5B model.

    Runs a single forward pass through an untrained model (architecture demo)
    and writes placeholder audio to `output_path`.
    """
    print(f"Initializing Hexa 5B TTS System...")

    # 1. Configuration (5B defaults).
    config = HexaConfig()

    # 2. Model with random weights — demo only.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    model = build_model().to(device)
    model.eval()

    # 3. Text -> token ids.
    encoder = TextEncoder()
    print(f"Processing text: '{text}' ({lang})")
    text_ids = encoder.preprocess(text, lang_code=lang).to(device)

    # 4. Clamp conditioning ids into their valid embedding ranges.
    speaker_tensor = torch.tensor([speaker_id]).to(device).clamp(0, config.num_speakers - 1)
    language_tensor = torch.tensor([0]).to(device)  # Placeholder mapping
    emotion_tensor = torch.tensor([emotion_id]).to(device).clamp(0, config.num_emotions - 1)

    # 5. One forward pass verifies the architecture; a real autoregressive
    # decoder would loop here.
    with torch.no_grad():
        mel_output = model(text_ids, speaker_tensor, language_tensor, emotion_tensor)

    print(f"Model forward pass successful. Output shape: {mel_output.shape}")
    print("Note: Since this is an untrained model, the output is random noise.")

    # 6. Simulated vocoder — production would run HifiGAN on the mels.
    sr = config.sample_rate
    dummy_audio = torch.randn(mel_output.shape[1] * 256)  # Approx length

    sf.write(output_path, dummy_audio.cpu().numpy(), sr)
    print(f"Saved generated (random) audio to: {output_path}")

if __name__ == "__main__":
    # Smoke test: short English phrase, emotion id 5 (e.g. 'Happy').
    generate_audio(
        "Hello, this is Hexa TTS.",
        "test_output.wav",
        lang='en',
        emotion_id=5,
    )
src/model.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from einops import rearrange
5
+ from .config import HexaConfig
6
+
7
class RotaryEmbedding(nn.Module):
    """Precomputes rotary position angles for head dimension `dim`."""

    def __init__(self, dim):
        super().__init__()
        # Classic RoPE inverse-frequency schedule: 1 / 10000^(2i/dim).
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)

    def forward(self, x):
        """Return angles shaped [1, 1, seq_len, dim] for sequence input x."""
        seq_len = x.shape[1]
        positions = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
        # Outer product position x frequency, then duplicate along the last
        # axis so the table covers the full head dimension.
        angles = positions[:, None] * self.inv_freq[None, :]
        doubled = torch.cat((angles, angles), dim=-1)
        return doubled[None, None, :, :]
19
+
20
class FeedForward(nn.Module):
    """Position-wise MLP: Linear -> GELU -> Dropout -> Linear -> Dropout."""

    def __init__(self, dim, hidden_dim, dropout=0.0):
        super().__init__()
        # Keep the Sequential layout (state_dict keys net.0 ... net.4).
        layers = [
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP; preserves input shape [..., dim]."""
        return self.net(x)
32
+
33
class Attention(nn.Module):
    """
    Multi-head self-attention with optional boolean mask.

    `rope_emb` is accepted for interface compatibility but is currently NOT
    applied, matching the original simplified implementation.
    """

    def __init__(self, dim, heads=8, dim_head=64, dropout=0.0):
        super().__init__()
        inner_dim = dim_head * heads
        self.heads = heads
        self.scale = dim_head ** -0.5

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout),
        )

    def forward(self, x, mask=None, rope_emb=None):
        batch, seq, _ = x.shape
        heads = self.heads

        def split_heads(t):
            # [b, n, h*d] -> [b, h, n, d]
            return t.reshape(batch, seq, heads, -1).permute(0, 2, 1, 3)

        q, k, v = (split_heads(part) for part in self.to_qkv(x).chunk(3, dim=-1))

        # RoPE is intentionally a no-op here, as in the original code path.

        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale

        if mask is not None:
            # Positions where mask is False are pushed to -inf before softmax.
            scores.masked_fill_(~mask, -torch.finfo(scores.dtype).max)

        weights = scores.softmax(dim=-1)

        merged = torch.matmul(weights, v).permute(0, 2, 1, 3).reshape(batch, seq, -1)
        return self.to_out(merged)
68
+
69
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: x + Attn(LN(x)), then x + FF(LN(x))."""

    def __init__(self, dim, heads, dim_head, mlp_dim, dropout=0.0):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.attn = Attention(dim, heads=heads, dim_head=dim_head, dropout=dropout)
        self.norm2 = nn.LayerNorm(dim)
        self.ff = FeedForward(dim, mlp_dim, dropout=dropout)

    def forward(self, x, mask=None, rope_emb=None):
        # Residual attention sub-layer.
        attended = self.attn(self.norm1(x), mask=mask, rope_emb=rope_emb)
        x = x + attended
        # Residual feed-forward sub-layer.
        return x + self.ff(self.norm2(x))
81
+
82
class HexaTransformer(nn.Module):
    """
    Hexa TTS 5B Model Core.

    A large decoder-only transformer mapping phoneme/text tokens plus
    per-utterance speaker / language / emotion conditioning to mel frames.
    """
    def __init__(self, config: HexaConfig):
        super().__init__()
        self.config = config

        # Input embeddings: token table plus three conditioning tables.
        self.token_emb = nn.Embedding(config.vocab_size, config.dim)
        self.speaker_emb = nn.Embedding(config.num_speakers, config.dim)    # Multi-Character
        self.language_emb = nn.Embedding(config.num_languages, config.dim)  # Language Support
        self.emotion_emb = nn.Embedding(config.num_emotions, config.dim)    # Emotion Support

        # Rotary angles are sized per head dimension.
        self.pos_emb = RotaryEmbedding(config.dim_head)

        # Stack of identical pre-norm transformer blocks.
        self.layers = nn.ModuleList(
            TransformerBlock(
                dim=config.dim,
                heads=config.heads,
                dim_head=config.dim_head,
                mlp_dim=int(config.dim * config.mlp_ratio),
                dropout=config.dropout,
            )
            for _ in range(config.depth)
        )

        self.norm_final = nn.LayerNorm(config.dim)

        # Project hidden states to mel channels (or a discrete codebook later).
        self.to_mel = nn.Linear(config.dim, config.n_mel_channels)

    def forward(self, text_ids, speaker_ids, language_ids, emotion_ids, mask=None):
        """
        Args:
            text_ids: [batch, seq] token ids.
            speaker_ids / language_ids / emotion_ids: [batch] conditioning ids.
            mask: optional attention mask forwarded to every block.

        Returns:
            [batch, seq, n_mel_channels] predicted mel frames.
        """
        hidden = self.token_emb(text_ids)
        seq_len = hidden.shape[1]

        # Fuse conditioning by simple addition, broadcasting each
        # per-utterance vector over the sequence. (Richer fusion such as
        # AdaLIN or cross-attention could replace this.)
        # Addition order matches the original: speaker, language, emotion.
        hidden = hidden + self.speaker_emb(speaker_ids).unsqueeze(1).expand(-1, seq_len, -1)
        hidden = hidden + self.language_emb(language_ids).unsqueeze(1).expand(-1, seq_len, -1)
        hidden = hidden + self.emotion_emb(emotion_ids).unsqueeze(1).expand(-1, seq_len, -1)

        # Rotary angles for this sequence length.
        rope_emb = self.pos_emb(hidden)

        # Transformer stack.
        for block in self.layers:
            hidden = block(hidden, mask=mask, rope_emb=rope_emb)

        # Final norm + mel projection.
        return self.to_mel(self.norm_final(hidden))
146
+
147
def build_model():
    """Construct a HexaTransformer from the default (5B-scale) HexaConfig."""
    return HexaTransformer(HexaConfig())
src/test_tiny.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import soundfile as sf
3
+ import os
4
+ from .model import HexaTransformer
5
+ from .text_encoder import TextEncoder
6
+ from .config import HexaConfig
7
+
8
def run_tiny_test():
    """
    Sanity-check the architecture end to end with a small config.

    Builds a shrunken model on CPU, runs one forward pass on an encoded
    sentence, and writes placeholder audio to tiny_output.wav.
    """
    print("Initializing Tiny Hexa Model for Code Verification...")

    # Shrunken architecture so the test fits comfortably in memory.
    config = HexaConfig(
        dim=512,
        depth=6,
        heads=8,
        dim_head=64,
        num_languages=15,
    )

    device = "cpu"
    model = HexaTransformer(config).to(device)
    model.eval()

    params = sum(p.numel() for p in model.parameters())
    print(f"Tiny Model Size: {params / 1e6:.2f} Million parameters")

    # Encode a short test sentence.
    text = "Hello world, testing tiny hexa."
    encoder = TextEncoder()
    text_ids = encoder.preprocess(text, lang_code='en').to(device)
    print(f"Encoded text shape: {text_ids.shape}")

    # Zero-valued conditioning ids (single speaker / language / emotion).
    speaker = torch.tensor([0]).to(device)
    language = torch.tensor([0]).to(device)
    emotion = torch.tensor([0]).to(device)

    with torch.no_grad():
        output = model(text_ids, speaker, language, emotion)

    print(f"Forward pass successful. Output shape: {output.shape}")

    # Output is (B, Frames, Mel_Channels); fake a waveform of matching length.
    dummy_wav = torch.randn(output.shape[1] * 256).numpy()
    sf.write("tiny_output.wav", dummy_wav, config.sample_rate)
    print("Saved tiny_output.wav")

if __name__ == "__main__":
    run_tiny_test()
src/text_encoder.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from phonemizer import phonemize
3
+ from phonemizer.separator import Separator
4
+
5
class TextEncoder:
    """
    Handles text-to-phoneme conversion for 14 languages.

    Wraps the phonemizer espeak backend; falls back to raw character
    tokenization when espeak is unavailable at runtime.
    """
    def __init__(self, vocab_map=None):
        # phone=' ' separates phones with spaces; word='|' marks word
        # boundaries so they survive into the token stream.
        self.separator = Separator(phone=' ', word='|', syllable='')
        # Maps 14 languages to phonemizer language codes
        # NOTE(review): the table actually holds 15 entries ('bn' included),
        # which matches HexaConfig.num_languages = 15 — update the docstring
        # or the table so the two agree.
        self.lang_map = {
            'en': 'en-us', 'zh': 'cmn', 'es': 'es', 'fr': 'fr-fr',
            'de': 'de', 'ja': 'ja', 'ko': 'ko', 'ru': 'ru',
            'pt': 'pt', 'it': 'it', 'hi': 'hi', 'ar': 'ar',
            'tr': 'tr', 'nl': 'nl', 'bn': 'bn'
        }
        # Simple character-to-id mapping (placeholder). Only lowercase
        # letters, space and '|' are present — any other character (uppercase,
        # digits, punctuation) maps to id 0 in preprocess().
        self.vocab = vocab_map if vocab_map else {c: i for i, c in enumerate(" abcdefghijklmnopqrstuvwxyz|")}

    def preprocess(self, text, lang_code='en'):
        """
        Converts text to phoneme IDs.

        Returns a LongTensor of shape [1, num_tokens] (batch dim prepended).
        Unknown symbols map to id 0.
        """
        # Unknown languages fall back to the English espeak backend.
        if lang_code not in self.lang_map:
            print(f"Warning: Language {lang_code} not fully supported, defaulting to English backend.")
            backend_lang = 'en-us'
        else:
            backend_lang = self.lang_map[lang_code]

        try:
            # Phonemize
            phonemes = phonemize(
                text,
                language=backend_lang,
                backend='espeak',
                separator=self.separator,
                strip=True,
                preserve_punctuation=True,
                njobs=1
            )
        except RuntimeError:
            # phonemizer raises RuntimeError when the espeak backend is
            # missing; degrade gracefully to per-character tokens.
            print("Warning: eSpeak not found. Falling back to character-level tokenization.")
            phonemes = list(text)  # Simple list of characters as fallback

        # Tokenize (Simple lookup for now).
        # NOTE(review): for string input phonemize() returns a single string,
        # so iterating `phonemes` yields CHARACTERS (including the ' ' phone
        # separators), not whole phone symbols — confirm this is intended.
        token_ids = [self.vocab.get(p, 0) for p in phonemes]
        return torch.tensor(token_ids).unsqueeze(0)  # Batch dim
src/train.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.utils.data import DataLoader
3
+ from torch.optim import AdamW
4
+ from accelerate import Accelerator
5
+ from tqdm import tqdm
6
+ import os
7
+ from .model import build_model
8
+ from .config import HexaConfig
9
+ from .dataset import HexaDataset, collate_fn
10
+
11
def train():
    """
    Massive Scale Training Loop.

    Orchestrates accelerate-based training of the full 5B model: builds the
    model and dataset, wraps them with Accelerator (16-step gradient
    accumulation), then runs a fixed 5-epoch MSE regression on mel targets,
    saving accelerator state after every epoch.
    """
    # 1. Setup
    config = HexaConfig()

    # Gradient Accumulation is CRITICAL for large models on small GPUs
    accelerator = Accelerator(gradient_accumulation_steps=16)

    print(f"Initializing 5B Parameter Model... (This takes memory!)")
    try:
        model = build_model()
    except RuntimeError as e:
        # NOTE(review): build_model() constructs on CPU, so a GPU OOM would
        # normally surface later (in prepare()/forward), not here — confirm
        # this catch fires in practice.
        print(f"Error initializing full model: {e}")
        print("Fallback: Your GPU memory is too small for 5B. Please try reducing config.dim in config.py")
        return

    # 2. Data — hard-coded Windows path; expects wavs/ + metadata.csv below it.
    data_root = "d:\\hexatts\\data"
    if not os.path.exists(data_root):
        print("Data not found. Run 'python get_data.py' first.")
        return

    dataset = HexaDataset(data_root, config)
    dataloader = DataLoader(dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)

    # 3. Optimize
    optimizer = AdamW(model.parameters(), lr=1e-4) # Standard LR

    # prepare() must wrap model/optimizer/dataloader together so accelerate
    # can handle device placement and mixed precision consistently.
    model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

    print("Starting Training...")
    model.train()

    # 4. Loop
    global_step = 0
    epochs = 5 # arbitrary for demo

    for epoch in range(epochs):
        # NOTE(review): progress_bar is never closed; consider
        # progress_bar.close() at end of epoch.
        progress_bar = tqdm(total=len(dataloader), desc=f"Epoch {epoch+1}")

        for batch in dataloader:
            # accumulate() defers the actual optimizer step until 16
            # micro-batches have contributed gradients.
            with accelerator.accumulate(model):
                text, speakers, langs, emotions, target_mels = batch

                # Check shapes
                # Output: [Batch, Time, Channels]
                # Target: [Batch, Time, Channels]

                output_mels = model(text, speakers, langs, emotions)

                # Align lengths (Simple truncation to min length for loss)
                min_len = min(output_mels.shape[1], target_mels.shape[1])
                output_sliced = output_mels[:, :min_len, :]
                target_sliced = target_mels[:, :min_len, :]

                loss = torch.nn.functional.mse_loss(output_sliced, target_sliced)

                # backward through accelerator so accumulation/scaling applies.
                accelerator.backward(loss)
                optimizer.step()
                optimizer.zero_grad()

            progress_bar.set_postfix(loss=loss.item())
            progress_bar.update(1)
            global_step += 1

        # Save Checkpoint — full accelerator state (model, optimizer, RNG).
        save_path = os.path.join("checkpoints", f"checkpoint_epoch_{epoch}")
        os.makedirs(save_path, exist_ok=True)
        accelerator.save_state(save_path)
        print(f"Saved checkpoint to {save_path}")

if __name__ == "__main__":
    train()
src/train_hf.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import Trainer, TrainingArguments
3
+ from .hf_model import HexaModel, HexaHFConfig
4
+ from .dataset import HexaDataset
5
+ from .config import HexaConfig
6
+
7
+ # Data Collator for HF Trainer
8
def data_collator(features):
    """
    Collate HexaDataset samples into the keyword batch expected by HexaModel.

    Each feature is a (text_ids, speaker, lang, emotion, mel) tuple; text and
    mel sequences are zero-padded to the batch maximum, scalar ids stacked.
    """
    texts, speakers, langs, emotions, mels = zip(*features)

    pad = torch.nn.utils.rnn.pad_sequence
    return {
        "text_ids": pad(list(texts), batch_first=True, padding_value=0),
        "speaker_ids": torch.stack(list(speakers)),
        "language_ids": torch.stack(list(langs)),
        "emotion_ids": torch.stack(list(emotions)),
        "labels": pad(list(mels), batch_first=True, padding_value=0.0),
    }
33
+
34
def train():
    """
    Train the 5B Hexa configuration via the HuggingFace Trainer.

    Uses batch size 1 with 16-step gradient accumulation and gradient
    checkpointing so the 5B model can fit on a single GPU.
    """
    print("Initializing Hexa TTS (5B Config) with HuggingFace Trainer...")

    # 1. Config & model: dim=3200 / depth=40 yields roughly 5B parameters.
    hexa_conf = HexaConfig(dim=3200, depth=40, heads=32, dim_head=100)
    hf_config = HexaHFConfig()
    hf_config.hexa_config = hexa_conf
    model = HexaModel(hf_config)

    # 2. Dataset rooted at the local data directory.
    dataset = HexaDataset("d:\\hexatts\\data", hexa_conf)

    # 3. Memory-oriented training arguments.
    args = TrainingArguments(
        output_dir="./hexa_checkpoints",
        per_device_train_batch_size=1,   # 5B model: one sample per GPU
        gradient_accumulation_steps=16,  # effective batch size of 16
        learning_rate=1e-4,
        num_train_epochs=3,
        logging_steps=1,
        save_steps=100,
        fp16=False,                      # flip on for Tensor Core GPUs
        gradient_checkpointing=True,     # essential for 5B memory footprint
        dataloader_num_workers=0,
        report_to="tensorboard",
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=dataset,
        data_collator=data_collator,
    )

    print("Starting Training...")
    trainer.train()

if __name__ == "__main__":
    train()
+ train()