Hugging Face Space — status: Runtime error
| import torch | |
| from parler_tts import ParlerTTSForConditionalGeneration | |
| from transformers import AutoTokenizer | |
| import gradio as gr | |
| import numpy as np | |
| import time | |
| import requests | |
# Prefer the first CUDA GPU when one is available; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
def load_model_with_retry(model_id, retries=3, timeout=60):
    """Load the Parler-TTS model, retrying on transient network failures.

    Args:
        model_id (str): Hugging Face Hub model identifier.
        retries (int): Maximum number of download attempts.
        timeout (int): Per-request timeout in seconds, forwarded to
            ``from_pretrained``.

    Returns:
        The ``ParlerTTSForConditionalGeneration`` model moved to ``device``.

    Raises:
        RuntimeError: If every attempt fails; chained to the last
            underlying network error so the traceback is preserved.
    """
    last_error = None
    for attempt in range(1, retries + 1):
        try:
            model = ParlerTTSForConditionalGeneration.from_pretrained(
                model_id, timeout=timeout
            ).to(device)
            return model
        # ConnectionError covers resets/DNS hiccups, which are just as
        # transient as a read timeout and equally worth retrying.
        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError) as err:
            last_error = err
            print(f"Timeout when loading model. Attempt {attempt} of {retries}. Retrying in 5 seconds...")
            time.sleep(5)
    # RuntimeError is still caught by callers handling Exception,
    # and "from" keeps the root-cause traceback attached.
    raise RuntimeError("Failed to load model after multiple retries.") from last_error
def load_tokenizer_with_retry(tokenizer_id, retries=3, timeout=60):
    """Load a tokenizer, retrying on transient network failures.

    Args:
        tokenizer_id (str): Hugging Face Hub tokenizer identifier.
        retries (int): Maximum number of download attempts.
        timeout (int): Per-request timeout in seconds, forwarded to
            ``from_pretrained``.

    Returns:
        The loaded ``AutoTokenizer`` instance.

    Raises:
        RuntimeError: If every attempt fails; chained to the last
            underlying network error so the traceback is preserved.
    """
    last_error = None
    for attempt in range(1, retries + 1):
        try:
            tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, timeout=timeout)
            return tokenizer
        # Retry connection errors too — they are as transient as timeouts.
        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError) as err:
            last_error = err
            print(f"Timeout when loading tokenizer. Attempt {attempt} of {retries}. Retrying in 5 seconds...")
            time.sleep(5)
    # RuntimeError keeps backward compatibility with `except Exception`.
    raise RuntimeError("Failed to load tokenizer after multiple retries.") from last_error
# Load the TTS model and its two tokenizers (one for the spoken prompt,
# one for the voice-description text encoder) using the retrying loaders.
_MODEL_ID = "ai4bharat/indic-parler-tts"
model = load_model_with_retry(_MODEL_ID)
tokenizer = load_tokenizer_with_retry(_MODEL_ID)
description_tokenizer = load_tokenizer_with_retry(model.config.text_encoder._name_or_path)
def generate_audio(text: str):
    """
    Synthesize speech for the given text with a fixed voice description.

    Args:
        text (str): The text prompt to be spoken.

    Returns:
        tuple: ``(audio_arr, sampling_rate)`` — a numpy waveform with
        singleton dimensions squeezed out, and the model's sampling rate,
        in the (data, rate) order Gradio's Audio component accepts.
    """
    # Fixed voice description steering speaker identity and recording style.
    default_description = (
        "Divya's voice is monotone yet slightly fast in delivery, with a very close recording "
        "that almost has no background noise."
    )
    # Tokenize the description (consumed by the text encoder) and the prompt.
    description_tokens = description_tokenizer(default_description, return_tensors="pt").to(device)
    prompt_tokens = tokenizer(text, return_tensors="pt").to(device)
    # Generation is inference-only: disabling autograd avoids building a
    # graph and saves memory without changing the produced audio.
    with torch.inference_mode():
        generation = model.generate(
            input_ids=description_tokens.input_ids,
            attention_mask=description_tokens.attention_mask,
            prompt_input_ids=prompt_tokens.input_ids,
            prompt_attention_mask=prompt_tokens.attention_mask,
        )
    # Move to CPU and drop singleton dims to get a plain 1-D waveform.
    audio_arr = generation.cpu().numpy().squeeze()
    sampling_rate = model.config.sampling_rate
    return (audio_arr, sampling_rate)
# Build the Gradio interface with a single text input.
# NOTE: the original default value was mojibake ("เค เคฐเฅ, ...") — UTF-8
# Devanagari bytes mis-decoded as Thai (cp874); restored to the intended
# Hindi example "अरे, तुम कैसे हो?" ("Hey, how are you?").
iface = gr.Interface(
    fn=generate_audio,
    inputs=gr.Textbox(label="Enter Text", value="अरे, तुम कैसे हो?"),
    outputs=gr.Audio(label="Generated Audio"),
    title="Indic Parler TTS",
    description="Generate synthesized speech using the Indic Parler TTS model from ai4bharat.",
)

# Launch the web app only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()