|
|
--- |
|
|
license: mit |
|
|
--- |
|
|
|
|
|
# Somya_zero_shot_TTS |
|
|
|
|
|
A multilingual text-to-speech model supporting 10 languages, with high-quality speech synthesis and zero-shot voice cloning.
|
|
|
|
|
## Model Description |
|
|
|
|
|
Orpheus is a multilingual TTS model based on the Llama architecture, fine-tuned for text-to-speech generation across multiple Indic languages. The model generates high-quality speech audio at 24 kHz using the SNAC audio codec.
|
|
|
|
|
## Supported Languages |
|
|
|
|
|
- **HI** - Hindi |
|
|
- **KN** - Kannada |
|
|
- **MR** - Marathi |
|
|
- **TE** - Telugu |
|
|
- **BN** - Bengali |
|
|
- **GU** - Gujarati |
|
|
- **MA** - Maithili |
|
|
- **MG** - Magahi |
|
|
- **BH** - Bhojpuri |
|
|
- **CH** - Chhattisgarhi |
|
|
|
|
|
|
|
|
## Installation |
|
|
|
|
|
```bash |
|
|
pip install torch transformers soundfile librosa snac orpheus-speech |
|
|
``` |
|
|
|
|
|
## Usage |
|
|
|
|
|
### Basic TTS Inference |
|
|
|
|
|
```python |
|
|
import io |
|
|
import re |
|
|
import base64 |
|
|
import wave |
|
|
import hashlib |
|
|
import librosa |
|
|
import torch |
|
|
from typing import List |
|
|
|
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM |
|
|
from snac import SNAC |
|
|
|
|
|
# ------------------ TOKEN CONSTANTS ------------------
# Special token ids in the model's (Llama-based) vocabulary that delimit
# the segments of a voice-cloning prompt (see build_clone_prompt).
START_OF_SPEECH = 128257
END_OF_SPEECH = 128258
START_OF_HUMAN = 128259
END_OF_HUMAN = 128260
START_OF_AI = 128261
# First id of the audio-token region: SNAC codebook entries are shifted
# past this offset, each codebook level into its own 4096-wide slice
# (see encode_audio_base64).
AUDIO_TOKENS_START = 128266

# Textual form of generated audio tokens, e.g. "<custom_token_123>";
# used to extract audio tokens from the decoded model output.
CUSTOM_TOKEN_RE = re.compile(r"<custom_token_\d+>")
|
|
|
|
|
# ------------------ LOAD MODELS ------------------
MODEL_ID = "somyalab/Somya_zero_shot_TTS"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Id of the "[clone]" marker token that separates the reference audio
# tokens from the text to be spoken (see build_clone_prompt).
clone_token_id = tokenizer.convert_tokens_to_ids("[clone]")

# Use HuggingFace Transformers pipeline instead of vllm
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# SNAC neural audio codec (24 kHz) used to encode the reference clip into
# discrete tokens; requires a CUDA device (.cuda()).
snac = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").cuda().eval()

# NOTE(review): import placed after model loading; conventionally this
# belongs at the top of the file with the other imports.
from orpheus_tts.decoder import tokens_decoder_sync
|
|
|
|
|
# ------------------ AUDIO ENCODING ------------------ |
|
|
def encode_audio_base64(ref_audio_base64: str, max_duration=5.0) -> List[int]:
    """Turn a base64-encoded audio clip into interleaved SNAC token ids.

    The clip is resampled to 24 kHz mono, optionally truncated to
    ``max_duration`` seconds, encoded with the SNAC codec, and each
    frame's codes are flattened into seven vocabulary ids offset from
    ``AUDIO_TOKENS_START``.

    Args:
        ref_audio_base64: base64-encoded audio file contents.
        max_duration: seconds of audio to keep; falsy keeps everything.

    Returns:
        Flat list of audio token ids, seven per SNAC frame.
    """
    raw = base64.b64decode(ref_audio_base64)
    samples, _sr = librosa.load(io.BytesIO(raw), sr=24000, mono=True)

    # Keep the reference clip short so the prompt stays small.
    if max_duration:
        samples = samples[: int(max_duration * 24000)]

    # SNAC expects a (batch, channels, time) tensor on the GPU.
    waveform = torch.from_numpy(samples).unsqueeze(0).unsqueeze(0).cuda()
    with torch.inference_mode():
        codes = snac.encode(waveform)

    out: List[int] = []
    for frame in range(codes[0].shape[-1]):
        # One coarse code, two mid codes and four fine codes per frame;
        # each codebook level is shifted into its own 4096-wide slice.
        layout = (
            (codes[0][0, frame], 0),
            (codes[1][0, 2 * frame], 4096),
            (codes[2][0, 4 * frame], 8192),
            (codes[2][0, 4 * frame + 1], 12288),
            (codes[1][0, 2 * frame + 1], 16384),
            (codes[2][0, 4 * frame + 2], 20480),
            (codes[2][0, 4 * frame + 3], 24576),
        )
        out.extend(int(code) + AUDIO_TOKENS_START + offset for code, offset in layout)
    return out
|
|
|
|
|
# ------------------ PROMPT FORMAT ------------------ |
|
|
def build_clone_prompt(ref_audio_b64: str, text: str) -> str:
    """Assemble the zero-shot cloning prompt as a decoded token string.

    Layout: <start_of_speech> <ref audio tokens> <end_of_speech> [clone]
    <start_of_human> <text tokens> <end_of_human> <start_of_ai>
    <start_of_speech> — the model then continues with audio tokens.

    Args:
        ref_audio_b64: base64-encoded reference audio clip.
        text: text to be spoken in the reference speaker's voice.

    Returns:
        The prompt decoded back to a string via the tokenizer.
    """
    audio_ids = encode_audio_base64(ref_audio_b64)
    text_ids = tokenizer.encode(text, add_special_tokens=False)

    sequence: List[int] = [START_OF_SPEECH]
    sequence.extend(audio_ids)
    sequence += [END_OF_SPEECH, clone_token_id, START_OF_HUMAN]
    sequence.extend(text_ids)
    sequence += [END_OF_HUMAN, START_OF_AI, START_OF_SPEECH]

    return tokenizer.decode(sequence)
|
|
|
|
|
# ------------------ INFERENCE ------------------ |
|
|
def clone_voice(
    ref_audio_b64: str,
    text: str,
    temperature=0.68,
    top_p=0.93,
    top_k=50,
    max_tokens=2048
) -> bytes:
    """Generate speech for *text* in the voice of the reference audio.

    Args:
        ref_audio_b64: base64-encoded reference clip (the voice to clone).
        text: text to synthesize.
        temperature, top_p, top_k: sampling parameters for generation.
        max_tokens: cap on newly generated tokens.

    Returns:
        Raw 16-bit PCM bytes at 24 kHz (decode via save_wav).
    """
    prompt = build_clone_prompt(ref_audio_b64, text)
    # Tokenize prompt for transformers
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    # The original code built an unused `stopping_criteria` variable behind
    # an always-true hasattr check; END_OF_SPEECH is a module constant, so
    # we simply pass it as the eos id to stop generation at end-of-speech.
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            max_new_tokens=max_tokens,
            eos_token_id=END_OF_SPEECH,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The prompt itself is at the start, so decode only the new tokens.
    # NOTE(review): skip_special_tokens=True assumes the <custom_token_N>
    # audio tokens are NOT registered as special tokens — if they were,
    # they would be stripped here; confirm against the tokenizer config.
    generated_text = tokenizer.decode(
        output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True
    )

    def _audio_token_stream():
        # Feed only the audio tokens, in order, to the SNAC decoder.
        for match in CUSTOM_TOKEN_RE.finditer(generated_text):
            yield match.group(0)

    pcm_bytes = b"".join(tokens_decoder_sync(_audio_token_stream()))
    return pcm_bytes
|
|
|
|
|
# ------------------ SAVE WAV ------------------ |
|
|
def save_wav(pcm_bytes: bytes, path="output.wav", sr=24000):
    """Write raw 16-bit mono PCM bytes to *path* as a WAV file.

    Args:
        pcm_bytes: little-endian signed 16-bit PCM samples.
        path: output file path.
        sr: sample rate in Hz (24000 matches the SNAC codec output).
    """
    wav_file = wave.open(path, "wb")
    try:
        # (nchannels, sampwidth, framerate, nframes, comptype, compname);
        # nframes is fixed up automatically by writeframes().
        wav_file.setparams((1, 2, sr, 0, "NONE", "not compressed"))
        wav_file.writeframes(pcm_bytes)
    finally:
        wav_file.close()
|
|
|
|
|
|
|
|
# Example: clone the voice from a local reference clip.  The original
# snippet opened "refrenced_audio.wav" — a typo, and inconsistent with the
# "reference.wav" filename used in the second example below.
with open("reference.wav", "rb") as f:
    ref_audio_b64 = base64.b64encode(f.read()).decode()

# Synthesize Hindi speech in the reference speaker's voice.
pcm = clone_voice(
    ref_audio_b64,
    "नमस्ते, यह एक ज़ीरो-शॉट वॉयस क्लोनिंग उदाहरण है।"
)

print("Saving audio to output.wav")
save_wav(pcm, "output.wav")
|
|
|
|
|
``` |
|
|
|
|
|
### Example with Different Languages |
|
|
|
|
|
```python |
|
|
import base64

# Encode the reference clip (the voice to clone) as base64 for clone_voice.
with open("reference.wav", "rb") as f:
    ref_audio_b64 = base64.b64encode(f.read()).decode()

# Synthesize speech in the reference speaker's voice; swap the text for
# any of the supported languages listed above.  Requires the model,
# tokenizer and helper functions defined in the previous example.
pcm = clone_voice(
    ref_audio_b64,
    "नमस्ते, यह एक ज़ीरो-शॉट वॉयस क्लोनिंग उदाहरण है।"
)

save_wav(pcm, "output.wav")
|
|
|
|
|
``` |
|
|
|
|
|
|
|
|
## Citation |
|
|
|
|
|
If you use this model, please cite: |
|
|
|
|
|
```bibtex |
|
|
@misc{somya_indictts_2025,
|
|
title={Somya-IndicTTS Multilingual TTS Model}, |
|
|
author={Vedu023}, |
|
|
year={2025}, |
|
|
publisher={Hugging Face} |
|
|
} |
|
|
``` |