---
license: mit
---

# Somya_zero_shot_TTS

A multilingual text-to-speech model supporting 10 languages with high-quality speech synthesis capabilities and zero-shot cloning.

## Model Description

Orpheus is a multilingual TTS model based on the Llama architecture, fine-tuned for text-to-speech generation across multiple Indic languages. The model generates high-quality speech audio at 24 kHz using the SNAC audio codec.

## Supported Languages

- **HI** - Hindi
- **KN** - Kannada
- **MR** - Marathi
- **TE** - Telugu
- **BN** - Bengali
- **GU** - Gujarati
- **MA** - Maithili
- **MG** - Magahi
- **BH** - Bhojpuri
- **CH** - Chhattisgarhi

## Installation

```bash
pip install torch transformers soundfile librosa snac orpheus-speech
```

## Usage

### Basic TTS Inference

```python
import io
import re
import base64
import wave
from typing import List

import librosa
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from snac import SNAC
from orpheus_tts.decoder import tokens_decoder_sync

# ------------------ TOKEN CONSTANTS ------------------
# Special token ids used by the Orpheus prompt format.
START_OF_SPEECH = 128257
END_OF_SPEECH = 128258
START_OF_HUMAN = 128259
END_OF_HUMAN = 128260
START_OF_AI = 128261
AUDIO_TOKENS_START = 128266

# Matches the serialized audio tokens the model emits, e.g.
# "<custom_token_12345>" (Orpheus/SNAC convention). NOTE(review): the
# original pattern was empty -- presumably the literal was stripped as
# an HTML tag; confirm against the model's tokenizer vocabulary.
CUSTOM_TOKEN_RE = re.compile(r"<custom_token_\d+>")

# ------------------ LOAD MODELS ------------------
MODEL_ID = "somyalab/Somya_zero_shot_TTS"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
clone_token_id = tokenizer.convert_tokens_to_ids("[clone]")

# Use the Hugging Face Transformers generate() API instead of vLLM.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

snac = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").cuda().eval()


# ------------------ AUDIO ENCODING ------------------
def encode_audio_base64(ref_audio_base64: str, max_duration: float = 5.0) -> List[int]:
    """Encode base64-encoded reference audio into SNAC audio-token ids.

    The audio is loaded as 24 kHz mono, truncated to ``max_duration``
    seconds (pass a falsy value to keep the full clip), and encoded with
    SNAC. Per frame, the three codebook levels are interleaved 1:2:4 and
    each position is offset by a multiple of 4096 into the model's
    audio-token id range starting at ``AUDIO_TOKENS_START``.
    """
    audio_bytes = base64.b64decode(ref_audio_base64)
    audio_np, sr = librosa.load(io.BytesIO(audio_bytes), sr=24000, mono=True)
    if max_duration:
        audio_np = audio_np[: int(max_duration * 24000)]

    waveform = torch.from_numpy(audio_np).unsqueeze(0).unsqueeze(0).cuda()
    with torch.inference_mode():
        codes = snac.encode(waveform)

    tokens = []
    for i in range(codes[0].shape[-1]):
        tokens.extend([
            int(codes[0][0, i]) + AUDIO_TOKENS_START,
            int(codes[1][0, 2 * i]) + AUDIO_TOKENS_START + 4096,
            int(codes[2][0, 4 * i]) + AUDIO_TOKENS_START + 8192,
            int(codes[2][0, 4 * i + 1]) + AUDIO_TOKENS_START + 12288,
            int(codes[1][0, 2 * i + 1]) + AUDIO_TOKENS_START + 16384,
            int(codes[2][0, 4 * i + 2]) + AUDIO_TOKENS_START + 20480,
            int(codes[2][0, 4 * i + 3]) + AUDIO_TOKENS_START + 24576,
        ])
    return tokens


# ------------------ PROMPT FORMAT ------------------
def build_clone_prompt(ref_audio_b64: str, text: str) -> str:
    """Build the zero-shot cloning prompt: reference audio, [clone], text."""
    ref_tokens = encode_audio_base64(ref_audio_b64)
    text_tokens = tokenizer.encode(text, add_special_tokens=False)
    tokens = (
        [START_OF_SPEECH]
        + ref_tokens
        + [END_OF_SPEECH, clone_token_id, START_OF_HUMAN]
        + text_tokens
        + [END_OF_HUMAN, START_OF_AI, START_OF_SPEECH]
    )
    return tokenizer.decode(tokens)


# ------------------ INFERENCE ------------------
def clone_voice(
    ref_audio_b64: str,
    text: str,
    temperature: float = 0.68,
    top_p: float = 0.93,
    top_k: int = 50,
    max_tokens: int = 2048,
) -> bytes:
    """Generate speech for ``text`` in the voice of the reference audio.

    Returns raw 16-bit PCM bytes at 24 kHz, suitable for ``save_wav``.
    """
    prompt = build_clone_prompt(ref_audio_b64, text)
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            max_new_tokens=max_tokens,
            # Stop as soon as the model emits the end-of-speech token.
            eos_token_id=END_OF_SPEECH,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The prompt is echoed at the start of the output, so decode only
    # the newly generated tokens.
    # NOTE(review): skip_special_tokens=True must not strip the
    # <custom_token_*> audio tokens -- verify they are registered as
    # added (not special) tokens in this tokenizer.
    generated_text = tokenizer.decode(
        output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True
    )

    def token_stream():
        # Feed the serialized audio tokens to the streaming decoder.
        for m in CUSTOM_TOKEN_RE.finditer(generated_text):
            yield m.group(0)

    return b"".join(tokens_decoder_sync(token_stream()))


# ------------------ SAVE WAV ------------------
def save_wav(pcm_bytes: bytes, path: str = "output.wav", sr: int = 24000) -> None:
    """Write 16-bit mono PCM bytes to a WAV file at sample rate ``sr``."""
    with wave.open(path, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 16-bit samples
        wf.setframerate(sr)
        wf.writeframes(pcm_bytes)


if __name__ == "__main__":
    with open("reference_audio.wav", "rb") as f:
        ref_audio_b64 = base64.b64encode(f.read()).decode()

    pcm = clone_voice(
        ref_audio_b64,
        "नमस्ते, यह एक ज़ीरो-शॉट वॉयस क्लोनिंग उदाहरण है।",
    )
    print("Saving audio to output.wav")
    save_wav(pcm, "output.wav")
```

### Example with Different Languages

```python
import base64

with open("reference.wav", "rb") as f:
    ref_audio_b64 = base64.b64encode(f.read()).decode()

pcm = clone_voice(
    ref_audio_b64,
    "नमस्ते, यह एक ज़ीरो-शॉट वॉयस क्लोनिंग उदाहरण है।",
)
save_wav(pcm, "output.wav")
```

## Citation

If you use this model, please cite:

```bibtex
@misc{somyalab2025indictts,
  title={Somya-IndicTTS Multilingual TTS Model},
  author={Vedu023},
  year={2025},
  publisher={Hugging Face}
}
```