| import gradio as gr
|
| import torch
|
| import re
|
| from dataclasses import dataclass
|
| from transformers import AutoTokenizer, AutoModelForCausalLM
|
| from neucodec import NeuCodec
|
|
|
@dataclass
class Config:
    """Model identifier and sampling settings for the Urdu TTS demo.

    Every attribute is annotated so that ``@dataclass`` registers it as a
    real field: instances can override individual values
    (``Config(temperature=0.5)``) while the defaults stay readable directly
    on the class (``Config.model_name``).
    """

    # Identifier handed to the ``from_pretrained`` loaders.
    model_name: str = "StepSharp/urdu-tts"
    # Value forwarded as ``device_map`` when loading the language model.
    device_map: str = "auto"
    # Upper bound on the number of tokens generated per request.
    max_new_tokens: int = 2048
    # Sampling temperature.
    temperature: float = 0.8
    # Nucleus-sampling probability mass.
    top_p: float = 0.95
    # Penalty applied to already-generated tokens.
    repetition_penalty: float = 1.1
|
|
|
|
|
class UrduTTS:
    """Urdu text-to-speech pipeline.

    A causal language model generates speech tokens of the form
    ``<|s_N|>``; the integer codes are then decoded to a waveform by
    NeuCodec.
    """

    # Speaker tag placed at the start of every prompt.
    SPEAKER = "OutteTTS-urdu-dataset_audio_uat_speaker"
    # Sample rate returned alongside the waveform (presumably NeuCodec's
    # output rate -- confirm against the codec documentation).
    SAMPLE_RATE = 24000
    # Speech codes appear in the decoded model output as ``<|s_123|>``.
    _SPEECH_TOKEN_RE = re.compile(r"<\|s_(\d+)\|>")

    def __init__(self):
        """Load the tokenizer, the language model and the audio codec."""
        use_cuda = torch.cuda.is_available()
        self.device = "cuda" if use_cuda else "cpu"

        self.tokenizer = AutoTokenizer.from_pretrained(Config.model_name)

        self.model = AutoModelForCausalLM.from_pretrained(
            Config.model_name,
            torch_dtype=torch.bfloat16 if use_cuda else torch.float32,
            device_map=Config.device_map,
        )

        codec = NeuCodec.from_pretrained("neuphonic/neucodec")
        self.codec = codec.eval().to(self.device)

        # Generation stops as soon as the model emits this token.
        self.speech_end = self.tokenizer.get_vocab()["<|im_end|>"]

    @classmethod
    def _parse_speech_codes(cls, decoded):
        """Return the integer codes of all ``<|s_N|>`` tokens in *decoded*."""
        return [int(code) for code in cls._SPEECH_TOKEN_RE.findall(decoded)]

    def synthesize(self, text, description):
        """Generate speech for *text* in the voice given by *description*.

        Args:
            text: Text to be spoken.
            description: Free-form description of the desired voice.

        Returns:
            A ``(sample_rate, audio)`` tuple where ``audio`` is a NumPy
            array holding the decoded waveform.

        Raises:
            ValueError: If the model output contains no speech tokens.
        """
        prompt = (
            f"<|im_start|>{self.SPEAKER}: {text}"
            f"<|description|>{description}"
            "<|speech_start|>"
        )

        # Follow the model's own placement: ``device_map`` decides where the
        # weights live, which is not necessarily ``self.device``.
        encoded = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        input_ids = encoded["input_ids"]

        output = self.model.generate(
            input_ids=input_ids,
            attention_mask=encoded.get("attention_mask"),
            max_new_tokens=Config.max_new_tokens,
            do_sample=True,
            temperature=Config.temperature,
            top_p=Config.top_p,
            repetition_penalty=Config.repetition_penalty,
            eos_token_id=self.speech_end,
        )

        # The returned sequence starts with the prompt; decode only the
        # continuation so that user text can never be mistaken for codes.
        continuation = output[0, input_ids.shape[-1]:]
        decoded = self.tokenizer.decode(continuation, skip_special_tokens=False)

        audio_tokens = self._parse_speech_codes(decoded)
        if not audio_tokens:
            # An empty list would become an empty float tensor and fail
            # inside the codec with an unhelpful error.
            raise ValueError("The model did not produce any speech tokens.")

        # Add two leading singleton dimensions before decoding.
        codes = torch.tensor(
            audio_tokens, dtype=torch.long, device=self.device
        ).view(1, 1, -1)

        with torch.inference_mode():
            waveform = self.codec.decode_code(codes)

        audio = waveform[0, 0].cpu().numpy()

        return self.SAMPLE_RATE, audio
|
|
|
|
|
# Built once at import time so every request reuses the same loaded models.
tts = UrduTTS()


def generate_audio(text, description):
    """Gradio callback: synthesize speech for the submitted form values.

    Args:
        text: Contents of the text box to be spoken.
        description: Contents of the voice-description box.

    Returns:
        Whatever ``tts.synthesize`` returns, passed straight to the audio
        output component.

    Raises:
        gr.Error: If *text* is empty or whitespace-only, or if synthesis
            raises ``ValueError``.
    """
    # Reject blank input up front instead of running the model on an
    # empty prompt.
    if not text or not text.strip():
        raise gr.Error("Please enter some Urdu text to synthesize.")

    try:
        return tts.synthesize(text, description or "")
    except ValueError as exc:
        # Show the reason in the UI rather than a generic failure.
        raise gr.Error(str(exc)) from exc
|
|
|
|
|
|
|
# Sample sentences offered below the form; each one fills the text box.
_EXAMPLE_SENTENCES = (
    "میری عمر اس وقت 26 سال ہے اور اگلا سال 2025 ہوگا۔",
    "براہ کرم چند لمحے انتظار کریں۔",
    "محکمۂ موسمیات کے مطابق درجۂ حرارت ٤٦٫٨ ڈگری سینٹی گریڈ تک پہنچ سکتا ہے، لہٰذا شہری غیرضروری سفر سے گریز کریں۔",
    "بین الاقوامی خلائی تحقیقاتی ادارے نے اعلان کیا کہ سیٹلائٹ “PakSat-X2”",
    "اگر temperature 42.7 ڈگری سینٹی گریڈ سے تجاوز کر جائے تو server automatically shutdown ہو جائے گا۔",
)

# Page layout: components are rendered in the order they are created.
with gr.Blocks(title="Urdu TTS") as demo:
    gr.Markdown(
        """
        # Urdu Text-to-Speech
        Enter Urdu text and generate speech.
        """
    )

    # Form inputs.
    text = gr.Textbox(label="Urdu Text", lines=4, placeholder="اردو متن درج کریں")
    description = gr.Textbox(
        label="Voice Description",
        value="A male Urdu speaker with a calm and clear tone.",
    )

    # Trigger and result.
    btn = gr.Button("Generate Speech")
    output = gr.Audio(label="Generated Audio")

    # Clicking the button runs synthesis on both text boxes.
    btn.click(fn=generate_audio, inputs=[text, description], outputs=output)

    # One row per example, one column for the single input component.
    gr.Examples(
        examples=[[sentence] for sentence in _EXAMPLE_SENTENCES],
        inputs=text,
    )


# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()