File size: 4,636 Bytes
0a6371e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running on local URL:  http://127.0.0.1:7868\n",
      "\n",
      "To create a public link, set `share=True` in `launch()`.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div><iframe src=\"http://127.0.0.1:7868/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": []
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\066226758\\Blog\\Virtual-Webcam-Chatbot\\.venv\\lib\\site-packages\\gradio\\processing_utils.py:583: UserWarning: Trying to convert audio automatically from float32 to 16-bit int format.\n",
      "  warnings.warn(warning.format(data.dtype))\n"
     ]
    }
   ],
   "source": [
    "import gradio as gr\n",
    "import numpy as np\n",
    "import torch\n",
    "from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan\n",
    "from io import BytesIO\n",
    "import soundfile as sf\n",
    "\n",
    "# Load the models once at startup so every request reuses them instead of reloading\n",
    "def load_models():\n",
    "    model = SpeechT5ForTextToSpeech.from_pretrained(\"microsoft/speecht5_tts\")\n",
    "    processor = SpeechT5Processor.from_pretrained(\"microsoft/speecht5_tts\")\n",
    "    vocoder = SpeechT5HifiGan.from_pretrained(\"microsoft/speecht5_hifigan\")\n",
    "    return model, processor, vocoder\n",
    "\n",
    "model, processor, vocoder = load_models()\n",
    "\n",
    "# Load speaker embeddings\n",
    "def get_speaker_embeddings():\n",
    "    speaker_embeddings = np.load(\"cmu_us_clb_arctic-wav-arctic_a0144.npy\")\n",
    "    return torch.tensor(speaker_embeddings).unsqueeze(0)\n",
    "\n",
    "speaker_embeddings = get_speaker_embeddings()\n",
    "\n",
    "# Convert text to speech; returns (sample_rate, audio array), or (None, error message) on failure\n",
    "def text_to_speech(text):\n",
    "    try:\n",
    "        # Split the text into fixed-size character chunks (a chunk boundary may fall mid-word)\n",
    "        max_length = 100  # Max characters per chunk; presumably chosen to stay within the model's input limit\n",
    "        segments = [text[i:i+max_length] for i in range(0, len(text), max_length)]\n",
    "        combined_speech = []\n",
    "\n",
    "        for segment in segments:\n",
    "            inputs = processor(text=segment, return_tensors=\"pt\")\n",
    "            spectrogram = model.generate_speech(inputs[\"input_ids\"], speaker_embeddings)\n",
    "            with torch.no_grad():\n",
    "                speech = vocoder(spectrogram)\n",
    "                combined_speech.extend(speech.numpy())\n",
    "\n",
    "        # Combine audio data into a single numpy array\n",
    "        combined_speech = np.array(combined_speech)\n",
    "\n",
    "        return 16000, combined_speech  # Return sample rate and combined audio data\n",
    "    except Exception as e:\n",
    "        return None, f\"Error in text-to-speech conversion: {e}\"\n",
    "\n",
    "# Gradio callback: passes through text_to_speech's (sample_rate, audio) result, or None on failure\n",
    "def gradio_interface(text):\n",
    "    sample_rate, audio_data = text_to_speech(text)\n",
    "    if sample_rate and isinstance(audio_data, np.ndarray):\n",
    "        return sample_rate, audio_data\n",
    "    else:\n",
    "        return None  # Return None if there's an error\n",
    "\n",
    "interface = gr.Interface(\n",
    "    fn=gradio_interface,\n",
    "    title=\"Text to Voice T5\",  # Add a title to the interface\n",
    "    description=\"Developed by Ruslan Magana, visit <a href='https://ruslanmv.com/' target='_blank'>ruslanmv.com</a> for more information.\",\n",
    "    inputs=gr.Textbox(lines=10, label=\"Enter text to convert to speech\"),\n",
    "    outputs=gr.Audio(label=\"Generated audio\")\n",
    ")\n",
    "\n",
    "interface.launch()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (watson)",
   "language": "python",
   "name": "watson"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}