File size: 2,165 Bytes
e729286
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import torch
import soundfile as sf
import os
from .model import build_model
from .text_encoder import TextEncoder
from .config import HexaConfig

def generate_audio(text, output_path, lang='en', speaker_id=0, emotion_id=0):
    """Run the Hexa 5B TTS pipeline end-to-end and write a WAV file.

    The model is built with random weights (architecture demo only), so the
    mel output — and the simulated-vocoder waveform derived from it — is
    noise, not intelligible speech.

    Args:
        text: Input sentence to synthesize.
        output_path: Destination path for the generated WAV file.
        lang: Language code forwarded to the text encoder (default 'en').
        speaker_id: Speaker index; clamped into the configured range.
        emotion_id: Emotion index; clamped into the configured range.
    """
    print(f"Initializing Hexa 5B TTS System...")

    # Configuration supplies the id ranges and the output sample rate.
    cfg = HexaConfig()

    # Prefer GPU when one is visible; everything below lives on `run_device`.
    run_device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {run_device}")

    # Architecture only — weights are random for this demo.
    tts_model = build_model()
    tts_model.to(run_device)
    tts_model.eval()

    # Tokenize the input text.
    text_encoder = TextEncoder()
    print(f"Processing text: '{text}' ({lang})")
    token_ids = text_encoder.preprocess(text, lang_code=lang).to(run_device)

    # Conditioning ids, clamped so out-of-range values cannot break the
    # embedding lookups. Language id is a fixed placeholder mapping for now.
    spk = torch.tensor([speaker_id]).to(run_device).clamp(0, cfg.num_speakers - 1)
    lng = torch.tensor([0]).to(run_device)
    emo = torch.tensor([emotion_id]).to(run_device).clamp(0, cfg.num_emotions - 1)

    # One forward pass is enough to validate the architecture; a real
    # autoregressive decoder would loop here.
    with torch.no_grad():
        mel_output = tts_model(token_ids, spk, lng, emo)

    print(f"Model forward pass successful. Output shape: {mel_output.shape}")
    print("Note: Since this is an untrained model, the output is random noise.")

    # Simulated vocoder: a random waveform of roughly the right length
    # (~256 samples per mel frame). Swap in HifiGAN for real audio.
    waveform = torch.randn(mel_output.shape[1] * 256)

    sf.write(output_path, waveform.cpu().numpy(), cfg.sample_rate)
    print(f"Saved generated (random) audio to: {output_path}")

if __name__ == "__main__":
    # Test Run
    generate_audio(
        "Hello, this is Hexa TTS.", 
        "test_output.wav", 
        lang='en', 
        emotion_id=5 # e.g. 'Happy'
    )