Spaces:
Runtime error
Runtime error
| import torch | |
| import soundfile as sf | |
| import os | |
| from .model import build_model | |
| from .text_encoder import TextEncoder | |
| from .config import HexaConfig | |
| def generate_audio(text, output_path, lang='en', speaker_id=0, emotion_id=0): | |
| """ | |
| Generates audio from text using the Hexa 5B model. | |
| """ | |
| print(f"Initializing Hexa 5B TTS System...") | |
| # 1. Load Configuration | |
| config = HexaConfig() | |
| # 2. Load Model (Architecture only, random weights for demo) | |
| device = "cuda" if torch.cuda.is_available() else "cpu" | |
| print(f"Using device: {device}") | |
| model = build_model() | |
| model.to(device) | |
| model.eval() | |
| # 3. Process Text | |
| encoder = TextEncoder() | |
| print(f"Processing text: '{text}' ({lang})") | |
| text_ids = encoder.preprocess(text, lang_code=lang).to(device) | |
| # 4. Prepare inputs | |
| # Ensure IDs are within range | |
| speaker_tensor = torch.tensor([speaker_id]).to(device).clamp(0, config.num_speakers-1) | |
| language_tensor = torch.tensor([0]).to(device) # Placeholder mapping | |
| emotion_tensor = torch.tensor([emotion_id]).to(device).clamp(0, config.num_emotions-1) | |
| # 5. Generate (Forward Pass) | |
| with torch.no_grad(): | |
| # In a real autoregressive model, this would be a loop. | |
| # Here we just run one forward pass to verify architecture. | |
| mel_output = model(text_ids, speaker_tensor, language_tensor, emotion_tensor) | |
| print(f"Model forward pass successful. Output shape: {mel_output.shape}") | |
| print("Note: Since this is an untrained model, the output is random noise.") | |
| # 6. Dummy Vocoder (Simulated) | |
| # In production, use HifiGAN here to convert Mel -> Audio | |
| sr = config.sample_rate | |
| dummy_audio = torch.randn(mel_output.shape[1] * 256) # Approx length | |
| # Save | |
| sf.write(output_path, dummy_audio.cpu().numpy(), sr) | |
| print(f"Saved generated (random) audio to: {output_path}") | |
| if __name__ == "__main__": | |
| # Test Run | |
| generate_audio( | |
| "Hello, this is Hexa TTS.", | |
| "test_output.wav", | |
| lang='en', | |
| emotion_id=5 # e.g. 'Happy' | |
| ) | |