Spaces:
Sleeping
Sleeping
File size: 4,636 Bytes
0a6371e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running on local URL: http://127.0.0.1:7868\n",
"\n",
"To create a public link, set `share=True` in `launch()`.\n"
]
},
{
"data": {
"text/html": [
"<div><iframe src=\"http://127.0.0.1:7868/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": []
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\066226758\\Blog\\Virtual-Webcam-Chatbot\\.venv\\lib\\site-packages\\gradio\\processing_utils.py:583: UserWarning: Trying to convert audio automatically from float32 to 16-bit int format.\n",
" warnings.warn(warning.format(data.dtype))\n"
]
}
],
"source": [
"import gradio as gr\n",
"import numpy as np\n",
"import torch\n",
"from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan\n",
"from io import BytesIO\n",
"import soundfile as sf\n",
"\n",
"# Models are loaded once, up front, so individual requests do not reload them\n",
"def load_models():\n",
"    \"\"\"Load the SpeechT5 text-to-speech model, its processor, and the HiFi-GAN vocoder.\n",
"\n",
"    Returns:\n",
"        tuple: ``(model, processor, vocoder)``, in that order, each created\n",
"        with ``from_pretrained``.\n",
"    \"\"\"\n",
"    tts_checkpoint = \"microsoft/speecht5_tts\"\n",
"    tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint)\n",
"    text_processor = SpeechT5Processor.from_pretrained(tts_checkpoint)\n",
"    hifigan = SpeechT5HifiGan.from_pretrained(\"microsoft/speecht5_hifigan\")\n",
"    return tts_model, text_processor, hifigan\n",
"\n",
"# Module-level instances shared by every call to text_to_speech\n",
"model, processor, vocoder = load_models()\n",
"\n",
"# Load speaker embeddings\n",
"def get_speaker_embeddings(path=\"cmu_us_clb_arctic-wav-arctic_a0144.npy\"):\n",
"    \"\"\"Read a speaker embedding from a ``.npy`` file and return it as a tensor.\n",
"\n",
"    Args:\n",
"        path: Location of the ``.npy`` file. Defaults to the embedding file\n",
"            this notebook has always used; a relative path is resolved\n",
"            against the current working directory by ``np.load``.\n",
"\n",
"    Returns:\n",
"        torch.Tensor: The loaded array with a leading dimension of size 1\n",
"        added by ``unsqueeze(0)``.\n",
"    \"\"\"\n",
"    # Local name differs from the module-level `speaker_embeddings` to avoid shadowing it\n",
"    embedding_array = np.load(path)\n",
"    return torch.tensor(embedding_array).unsqueeze(0)\n",
"\n",
"speaker_embeddings = get_speaker_embeddings()\n",
"\n",
"# Function to convert text to speech\n",
"def _split_text(text, max_length):\n",
"    \"\"\"Split text into chunks of at most max_length characters, breaking on whitespace.\"\"\"\n",
"    if max_length < 1:\n",
"        raise ValueError(\"max_length must be a positive integer\")\n",
"    chunks = []\n",
"    current = \"\"\n",
"    for word in text.split():\n",
"        # A single word longer than the limit is hard-split so no chunk exceeds it\n",
"        while len(word) > max_length:\n",
"            if current:\n",
"                chunks.append(current)\n",
"                current = \"\"\n",
"            chunks.append(word[:max_length])\n",
"            word = word[max_length:]\n",
"        candidate = f\"{current} {word}\" if current else word\n",
"        if len(candidate) <= max_length:\n",
"            current = candidate\n",
"        else:\n",
"            chunks.append(current)\n",
"            current = word\n",
"    if current:\n",
"        chunks.append(current)\n",
"    return chunks\n",
"\n",
"def text_to_speech(text, max_length=100):\n",
"    \"\"\"Synthesize speech for ``text`` using the module-level model, processor and vocoder.\n",
"\n",
"    The text is cut into segments of at most ``max_length`` characters on\n",
"    whitespace boundaries (so words are not cut in half), each segment is\n",
"    synthesized separately, and the resulting waveforms are joined end to end.\n",
"\n",
"    Args:\n",
"        text: The text to speak.\n",
"        max_length: Maximum number of characters per segment. The default of\n",
"            100 is the value this notebook has always used; presumably chosen\n",
"            to suit the model's input limit (not verified here).\n",
"\n",
"    Returns:\n",
"        tuple: ``(16000, waveform)`` with ``waveform`` a NumPy array on\n",
"        success, or ``(None, message)`` with ``message`` a string when the\n",
"        input is empty or any step raises.\n",
"    \"\"\"\n",
"    try:\n",
"        segments = _split_text(text or \"\", max_length)\n",
"        if not segments:\n",
"            # Nothing to synthesize: report it instead of returning zero-length audio\n",
"            return None, \"Error in text-to-speech conversion: no text provided\"\n",
"\n",
"        waveforms = []\n",
"        for segment in segments:\n",
"            inputs = processor(text=segment, return_tensors=\"pt\")\n",
"            spectrogram = model.generate_speech(inputs[\"input_ids\"], speaker_embeddings)\n",
"            with torch.no_grad():\n",
"                speech = vocoder(spectrogram)\n",
"            # Keep whole arrays; extending a list sample by sample is needlessly slow\n",
"            waveforms.append(speech.numpy())\n",
"\n",
"        # Join the per-segment waveforms along the first axis into one array\n",
"        combined_speech = np.concatenate(waveforms)\n",
"\n",
"        return 16000, combined_speech  # Return sample rate and combined audio data\n",
"    except Exception as e:\n",
"        # Errors are reported through the return value rather than raised\n",
"        return None, f\"Error in text-to-speech conversion: {e}\"\n",
"\n",
"# Gradio Interface\n",
"def gradio_interface(text):\n",
"    \"\"\"Gradio callback: turn the submitted text into audio for the output player.\n",
"\n",
"    Args:\n",
"        text: The string entered in the input text box.\n",
"\n",
"    Returns:\n",
"        tuple | None: ``(sample_rate, audio_data)`` when synthesis produced a\n",
"        non-empty NumPy array, otherwise ``None``.\n",
"\n",
"    Raises:\n",
"        gr.Error: When ``text_to_speech`` reports a failure message, so the\n",
"            message is shown to the user instead of being dropped.\n",
"    \"\"\"\n",
"    sample_rate, audio_data = text_to_speech(text)\n",
"    # Zero-length audio is treated like \"no result\" rather than handed to the player\n",
"    if sample_rate and isinstance(audio_data, np.ndarray) and audio_data.size:\n",
"        return sample_rate, audio_data\n",
"    if isinstance(audio_data, str):\n",
"        # text_to_speech signals failure as (None, message); surface that message\n",
"        raise gr.Error(audio_data)\n",
"    return None  # No audio and no error message to report\n",
"\n",
"# Build the UI: a multi-line text box feeding gradio_interface, with an audio player for the result\n",
"interface = gr.Interface(\n",
"    fn=gradio_interface,\n",
"    inputs=gr.Textbox(lines=10, label=\"Enter text to convert to speech\"),\n",
"    outputs=gr.Audio(label=\"Generated audio\"),\n",
"    # Heading and HTML blurb passed to the interface\n",
"    title=\"Text to Voice T5\",\n",
"    description=\"Developed by Ruslan Magana, visit <a href='https://ruslanmv.com/' target='_blank'>ruslanmv.com</a> for more information.\",\n",
")\n",
"\n",
"# Start the app with default launch settings\n",
"interface.launch()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (watson)",
"language": "python",
"name": "watson"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|