|
|
--- |
|
|
library_name: transformers |
|
|
tags: |
|
|
- unsloth |
|
|
- trl |
|
|
- sft |
|
|
datasets: |
|
|
- taresh18/AnimeVox |
|
|
--- |
|
|
This repository provides a powerful and modular Text-to-Speech (TTS) model, built on the Spark model, that supports controllable audio generation using semantic and global token conditioning. It is designed for immersive narration, guided visualization, and expressive AI agents. |
|
|
|
|
|
## 🔊 Model Highlights |
|
|
- 🎯 Task-specific generation using the `<|task_tts|>` prompt format |
|
|
|
|
|
- 🧠 Semantic tokens capture content-related prosody and intonation |
|
|
|
|
|
- 🌍 Global tokens control speaker identity, style, and other features |
|
|
|
|
|
- ⚡ Optimized for fast inference with native acceleration |
|
|
|
|
|
- 🧪 Example input: guided fitness visualization prompt |
|
|
|
|
|
## 📦 Installation |
|
|
Make sure to install the required packages: |
|
|
|
|
|
```bash |
|
|
pip install torch torchaudio soundfile |
|
|
``` |
|
|
## 🚀 Usage |
|
|
```python |
|
|
import torch |
|
|
import re |
|
|
import numpy as np |
|
|
import torchaudio.transforms as T |
|
|
from typing import Dict, Any |
|
|
|
|
|
# NOTE(review): `FastModel` and `model` are not defined or imported in this snippet; presumably `FastModel` comes from unsloth and `model` is loaded beforehand - confirm. |
|
|
FastModel.for_inference(model) # Enable 2x faster inference |
|
|
|
|
|
# Text passed to generate_speech_from_text below; the "Frieren: " prefix presumably names the speaker voice - confirm. |
|
|
input_text = "Frieren: Now, let's explore the imagery of your fitness journey..." |
|
|
|
|
|
@torch.inference_mode()
def generate_speech_from_text(
    text: str,
    temperature: float = 0.8,
    top_k: int = 50,
    top_p: float = 1.0,
    max_new_audio_tokens: int = 2048,
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
) -> np.ndarray:
    """Generate speech for ``text`` and return the detokenized waveform.

    The function uses the module-level ``model``, ``tokenizer`` and
    ``audio_tokenizer`` objects; they must be defined before it is called.

    Args:
        text: Content placed between the ``<|start_content|>`` and
            ``<|end_content|>`` markers of the prompt.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_k: Top-k sampling limit forwarded to ``model.generate``.
        top_p: Nucleus sampling threshold forwarded to ``model.generate``.
        max_new_audio_tokens: Passed to ``model.generate`` as
            ``max_new_tokens``.
        device: Device the tokenized prompt, the audio tokenizer's model and
            the extracted token ids are moved to.

    Returns:
        The value returned by ``audio_tokenizer.detokenize``, or an empty
        ``float32`` array when the generated text contains no semantic tokens.
    """
    # The prompt ends on the opening global-token marker; generation continues from there.
    prompt = (
        "<|task_tts|>"
        + "<|start_content|>"
        + text
        + "<|end_content|>"
        + "<|start_global_token|>"
    )
    encoded = tokenizer([prompt], return_tensors="pt").to(device)

    print("Generating token sequence...")
    sampling_kwargs = {
        "max_new_tokens": max_new_audio_tokens,
        "do_sample": True,
        "temperature": temperature,
        "top_k": top_k,
        "top_p": top_p,
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.pad_token_id,
    }
    output_ids = model.generate(**encoded, **sampling_kwargs)
    print("Token sequence generated.")

    # Drop the echoed prompt so only newly generated tokens are decoded.
    prompt_length = encoded.input_ids.shape[1]
    continuation_ids = output_ids[:, prompt_length:]
    predicts_text = tokenizer.batch_decode(continuation_ids, skip_special_tokens=False)[0]

    semantic_ids = [
        int(match)
        for match in re.findall(r"<\|bicodec_semantic_(\d+)\|>", predicts_text)
    ]
    if not semantic_ids:
        print("Warning: No semantic tokens found.")
        return np.array([], dtype=np.float32)
    pred_semantic_ids = torch.tensor(semantic_ids).long().unsqueeze(0)

    global_ids = [
        int(match)
        for match in re.findall(r"<\|bicodec_global_(\d+)\|>", predicts_text)
    ]
    if global_ids:
        pred_global_ids = torch.tensor(global_ids).long().unsqueeze(0)
    else:
        print("Warning: No global tokens found. Using defaults.")
        pred_global_ids = torch.zeros((1, 1), dtype=torch.long)

    # Extra leading axis is added here and removed again just before detokenizing.
    pred_global_ids = pred_global_ids.unsqueeze(0)

    print(f"Found {pred_semantic_ids.shape[1]} semantic tokens.")
    print(f"Found {pred_global_ids.shape[2]} global tokens.")

    print("Detokenizing audio tokens...")
    # Keep the audio tokenizer and its model on the same device as the token ids.
    audio_tokenizer.device = device
    audio_tokenizer.model.to(device)
    global_on_device = pred_global_ids.to(device).squeeze(0)
    semantic_on_device = pred_semantic_ids.to(device)
    wav_np = audio_tokenizer.detokenize(global_on_device, semantic_on_device)
    print("Detokenization complete.")
    return wav_np
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Demo entry point: synthesize `input_text`, save it as a WAV file and,
    # when IPython is available, play it inline.
    print(f"Generating speech for: '{input_text}'")

    # Prefix the prompt with a voice name only when `chosen_voice` has been defined.
    if "chosen_voice" in globals():
        text = f"{chosen_voice}: " + input_text
    else:
        text = input_text
    generated_waveform = generate_speech_from_text(text)

    if generated_waveform.size > 0:
        import soundfile as sf

        output_filename = "generated_speech_controllable.wav"
        # Falls back to 16000 when the audio tokenizer config has no "sample_rate" entry.
        sample_rate = audio_tokenizer.config.get("sample_rate", 16000)
        sf.write(output_filename, generated_waveform, sample_rate)
        print(f"Audio saved to {output_filename}")

        # Inline playback is optional: the WAV file is already written, so a
        # missing IPython installation must not abort the script.
        try:
            from IPython.display import Audio, display
        except ImportError:
            print("IPython not available; skipping inline playback.")
        else:
            display(Audio(generated_waveform, rate=sample_rate))
    else:
        print("Audio generation failed (no tokens found?).")
|
|
``` |
|
|
## 🔧 Parameters |
|
|
|
|
|
| Parameter | Type | Default | Description | |
|
|
|------------------------|---------------|---------|-------------------------------------------------------------| |
|
|
| `text` | `str` | — | The input text to be converted into speech. | |
|
|
| `temperature` | `float` | `0.8` | Sampling temperature for diversity in generation. | |
|
|
| `top_k` | `int` | `50` | Limits sampling to top-k most likely tokens. | |
|
|
| `top_p` | `float` | `1.0` | Nucleus sampling (select from top-p cumulative probability).| |
|
|
| `max_new_audio_tokens` | `int` | `2048` | Maximum number of audio tokens to generate. | |
|
|
| `device` | `torch.device`| Auto | Uses CUDA if available, otherwise CPU. | |
|
|
|
|
|
|
|
|
## 📁 Output Format |
|
|
- Output file: `generated_speech_controllable.wav` |
|
|
|
|
|
- Sample rate: defaults to 16 kHz (read from the `sample_rate` entry of `audio_tokenizer.config`) |
|
|
|
|
|
## ⚠️ Notes |
|
|
- Make sure `model`, `tokenizer`, and `audio_tokenizer` are properly initialized before calling `generate_speech_from_text`. |
|
|
|
|
|
- Designed for research and development use. |