import time

import requests
import torch
import gradio as gr
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

# Use the first GPU when available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"


def _from_pretrained_with_retry(loader, pretrained_id, retries, timeout, what):
    """Call ``loader.from_pretrained`` with retry-on-read-timeout logic.

    Args:
        loader: Class exposing a ``from_pretrained`` classmethod
            (e.g. ``AutoTokenizer``).
        pretrained_id (str): Hugging Face Hub identifier or local path.
        retries (int): Maximum number of attempts.
        timeout (int): Per-attempt timeout in seconds, forwarded to
            ``from_pretrained``.  NOTE(review): whether ``from_pretrained``
            honors a ``timeout`` kwarg depends on the installed
            transformers/huggingface_hub version — confirm against the
            release in use.
        what (str): Human-readable label ("model"/"tokenizer") used in
            the log and error messages.

    Returns:
        The loaded object.

    Raises:
        RuntimeError: If every attempt fails with a read timeout.
    """
    for attempt in range(1, retries + 1):
        try:
            return loader.from_pretrained(pretrained_id, timeout=timeout)
        except requests.exceptions.ReadTimeout:
            print(
                f"Timeout when loading {what}. Attempt {attempt} of {retries}. "
                "Retrying in 5 seconds..."
            )
            time.sleep(5)
    raise RuntimeError(f"Failed to load {what} after multiple retries.")


def load_model_with_retry(model_id, retries=3, timeout=60):
    """Load the Parler-TTS model with retry logic and move it to *device*.

    Args:
        model_id (str): Hub identifier of the TTS model.
        retries (int): Maximum download attempts (default 3).
        timeout (int): Per-attempt timeout in seconds (default 60).

    Returns:
        ParlerTTSForConditionalGeneration: The model, on *device*.

    Raises:
        RuntimeError: If loading fails after all retries.
    """
    model = _from_pretrained_with_retry(
        ParlerTTSForConditionalGeneration, model_id, retries, timeout, "model"
    )
    return model.to(device)


def load_tokenizer_with_retry(tokenizer_id, retries=3, timeout=60):
    """Load a tokenizer with retry logic.

    Args:
        tokenizer_id (str): Hub identifier of the tokenizer.
        retries (int): Maximum download attempts (default 3).
        timeout (int): Per-attempt timeout in seconds (default 60).

    Returns:
        The loaded tokenizer.

    Raises:
        RuntimeError: If loading fails after all retries.
    """
    return _from_pretrained_with_retry(
        AutoTokenizer, tokenizer_id, retries, timeout, "tokenizer"
    )


# Load the TTS model and both tokenizers once at startup.  The description
# tokenizer belongs to the model's text encoder, which may differ from the
# prompt tokenizer.
model = load_model_with_retry("ai4bharat/indic-parler-tts")
tokenizer = load_tokenizer_with_retry("ai4bharat/indic-parler-tts")
description_tokenizer = load_tokenizer_with_retry(model.config.text_encoder._name_or_path)


def generate_audio(text: str):
    """Generate synthesized speech audio based on the input text.

    Args:
        text (str): The text prompt to be spoken.

    Returns:
        tuple: ``(audio_arr, sampling_rate)`` where ``audio_arr`` is a
        numpy array of audio samples and ``sampling_rate`` comes from the
        model config — the tuple shape Gradio's Audio component expects.
    """
    # Fixed voice description; the model conditions its voice on this text.
    default_description = (
        "Divya's voice is monotone yet slightly fast in delivery, with a very close recording "
        "that almost has no background noise."
    )

    # Tokenize the voice description and the spoken prompt separately —
    # the model takes them through different input channels.
    description_tokens = description_tokenizer(default_description, return_tensors="pt").to(device)
    prompt_tokens = tokenizer(text, return_tensors="pt").to(device)

    # Pure inference: disable autograd bookkeeping to save memory and time.
    with torch.inference_mode():
        generation = model.generate(
            input_ids=description_tokens.input_ids,
            attention_mask=description_tokens.attention_mask,
            prompt_input_ids=prompt_tokens.input_ids,
            prompt_attention_mask=prompt_tokens.attention_mask
        )

    # Move to host memory and drop the batch/channel singleton dimensions.
    audio_arr = generation.cpu().numpy().squeeze()
    sampling_rate = model.config.sampling_rate
    return (audio_arr, sampling_rate)


# Gradio UI: a single text box in, synthesized audio out.
iface = gr.Interface(
    fn=generate_audio,
    inputs=gr.Textbox(label="Enter Text", value="अरे, तुम आज कैसे हो?"),
    outputs=gr.Audio(label="Generated Audio"),
    title="Indic Parler TTS",
    description="Generate synthesized speech using the Indic Parler TTS model from ai4bharat."
)

if __name__ == "__main__":
    iface.launch()