| import os |
| import torch |
| import gradio as gr |
| import soundfile as sf |
| import numpy as np |
| from pathlib import Path |
| import json |
| import traceback |
|
|
| |
# Hugging Face access token, read from the environment. The visible code never
# passes it explicitly; it is only checked here so that a missing secret is
# reported at startup.
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN is None or HF_TOKEN == "":
    print("WARNING: HF_TOKEN missing. Add it in Space → Settings → Variables & Secrets.")

# Hub identifier of the checkpoint used for both the model and its tokenizer.
MODEL_ID = "ai4bharat/indic-parler-tts"
|
|
# Import the third-party TTS stack up front so that a missing or broken
# dependency fails at startup with an actionable message.
try:
    from parler_tts import ParlerTTSForConditionalGeneration
    from transformers import AutoTokenizer
except Exception as e:
    # Chain explicitly so the original import failure is reported as the
    # direct cause of this error rather than as an incidental context.
    raise RuntimeError(
        "Missing required libraries. Install dependencies from requirements.txt. Error: " + str(e)
    ) from e
|
|
# Use the GPU when PyTorch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print("Loading model…")
# Load the checkpoint and move it to the selected device in one expression.
model = (
    ParlerTTSForConditionalGeneration
    .from_pretrained(MODEL_ID)
    .to(device)
)
# Tokenizer for the text to be spoken (passed to generate() as the prompt).
text_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
|
# Tokenizer for the voice description: looked up from the name recorded on the
# model's text-encoder config. If that lookup or load fails, the prompt
# tokenizer is reused instead.
try:
    desc_encoder_name = model.config.text_encoder._name_or_path
    desc_tokenizer = AutoTokenizer.from_pretrained(desc_encoder_name)
except Exception as exc:
    # Best-effort fallback, but report it: a bare `except:` would also swallow
    # KeyboardInterrupt/SystemExit and hide why the fallback was taken.
    print(
        f"WARNING: could not load description tokenizer ({exc!r}); "
        "falling back to the prompt tokenizer."
    )
    desc_tokenizer = text_tokenizer

# Sample rate used when writing audio; 22050 is used when the model config
# does not define `sampling_rate`.
sampling_rate = getattr(model.config, "sampling_rate", 22050)
|
|
# Optional speaker list stored next to this file. When the file is absent, or
# holds an empty value, a single placeholder entry is used so that the
# `SPEAKERS[0]` default in the UI always exists.
sp_file = Path(__file__).parent / "speakers.json"
if sp_file.exists():
    # read_text() opens and closes the file itself, so no handle is leaked.
    SPEAKERS = json.loads(sp_file.read_text(encoding="utf-8")) or ["Default"]
else:
    SPEAKERS = ["Default"]
|
|
def synthesize(text, speaker, emotion="Neutral"):
    """Generate speech for *text* and return the path of the written WAV file.

    Args:
        text: Text to be spoken. ``None``, empty, or whitespace-only input
            produces no audio.
        speaker: Speaker name inserted into the voice description.
        emotion: Tone label inserted into the voice description.

    Returns:
        Path of the WAV file written under ``/tmp``, or ``None`` when there
        is nothing to synthesize.
    """
    # Guard against None as well as blank strings (None has no .strip()).
    if not text or not text.strip():
        return None

    desc = f"{speaker}'s voice. Tone: {emotion}. Natural, clear speech, close mic."
    try:
        desc_ids = desc_tokenizer(desc, return_tensors="pt").to(device)
        text_ids = text_tokenizer(text, return_tensors="pt").to(device)
    except Exception:
        # Best-effort retry without the device move, but log the failure
        # instead of hiding it.
        traceback.print_exc()
        desc_ids = desc_tokenizer(desc, return_tensors="pt")
        text_ids = text_tokenizer(text, return_tensors="pt")

    # No gradients are needed for generation.
    with torch.no_grad():
        try:
            audio = model.generate(
                input_ids=desc_ids.input_ids,
                attention_mask=desc_ids.attention_mask,
                prompt_input_ids=text_ids.input_ids,
                prompt_attention_mask=text_ids.attention_mask,
                max_length=20000,
            )
        except Exception:
            # Keep the original fallback call, but print the primary error
            # first so it is not masked by whatever the fallback raises.
            traceback.print_exc()
            audio = model.generate(description=desc, text=text)

    arr = audio.cpu().numpy().squeeze()
    if np.issubdtype(arr.dtype, np.integer):
        # Integer samples are rescaled to float32 by the integer type's
        # maximum value (arr.dtype is still the integer dtype here).
        arr = arr.astype("float32") / np.iinfo(arr.dtype).max

    # Key the file name on every input, not just the text, so the same text
    # with a different speaker or emotion does not overwrite another result.
    key = abs(hash((text, str(speaker), str(emotion))))
    out_path = f"/tmp/out_{key}.wav"
    sf.write(out_path, arr, sampling_rate)
    return out_path
|
|
# Single-page UI: text, speaker and emotion inputs feed synthesize(), and the
# value it returns is routed to the audio component.
with gr.Blocks() as demo:
    gr.Markdown(value="# Indic Parler-TTS (69 Speakers)")

    txt = gr.Textbox(label="Text", value="नमस्ते, यह एक परीक्षण वाक्य है।")
    sp = gr.Dropdown(choices=SPEAKERS, label="Speaker", value=SPEAKERS[0])
    emo = gr.Dropdown(
        choices=["Neutral", "Happy", "Sad", "Angry", "Narration"],
        label="Emotion",
        value="Neutral",
    )
    btn = gr.Button(value="Generate")
    out = gr.Audio()

    # Clicking the button runs synthesize(txt, sp, emo) and shows the result.
    btn.click(synthesize, [txt, sp, emo], out)
|
|
# Start the web server only when this file is executed directly.
if __name__ == "__main__":
    demo.launch(
        server_port=7860,
        server_name="0.0.0.0",
    )
|
|