Spaces:

ruslanmv
/

Text-to-Voice-Transformers

Sleeping

File size: 4,636 Bytes

0a6371e

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running on local URL:  http://127.0.0.1:7868\n",
      "\n",
      "To create a public link, set `share=True` in `launch()`.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div><iframe src=\"http://127.0.0.1:7868/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": []
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\066226758\\Blog\\Virtual-Webcam-Chatbot\\.venv\\lib\\site-packages\\gradio\\processing_utils.py:583: UserWarning: Trying to convert audio automatically from float32 to 16-bit int format.\n",
      "  warnings.warn(warning.format(data.dtype))\n"
     ]
    }
   ],
   "source": [
    "import gradio as gr\n",
    "import numpy as np\n",
    "import torch\n",
    "from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan\n",
    "from io import BytesIO\n",
    "import soundfile as sf\n",
    "\n",
    "# Load models outside of function calls for efficiency\n",
    "def load_models():\n",
    "    model = SpeechT5ForTextToSpeech.from_pretrained(\"microsoft/speecht5_tts\")\n",
    "    processor = SpeechT5Processor.from_pretrained(\"microsoft/speecht5_tts\")\n",
    "    vocoder = SpeechT5HifiGan.from_pretrained(\"microsoft/speecht5_hifigan\")\n",
    "    return model, processor, vocoder\n",
    "\n",
    "model, processor, vocoder = load_models()\n",
    "\n",
    "# Load speaker embeddings\n",
    "def get_speaker_embeddings():\n",
    "    speaker_embeddings = np.load(\"cmu_us_clb_arctic-wav-arctic_a0144.npy\")\n",
    "    return torch.tensor(speaker_embeddings).unsqueeze(0)\n",
    "\n",
    "speaker_embeddings = get_speaker_embeddings()\n",
    "\n",
    "# Function to convert text to speech\n",
    "def text_to_speech(text):\n",
    "    try:\n",
    "        # Segment the text if it's too long\n",
    "        max_length = 100  # Set a max length as per model's capability\n",
    "        segments = [text[i:i+max_length] for i in range(0, len(text), max_length)]\n",
    "        combined_speech = []\n",
    "\n",
    "        for segment in segments:\n",
    "            inputs = processor(text=segment, return_tensors=\"pt\")\n",
    "            spectrogram = model.generate_speech(inputs[\"input_ids\"], speaker_embeddings)\n",
    "            with torch.no_grad():\n",
    "                speech = vocoder(spectrogram)\n",
    "                combined_speech.extend(speech.numpy())\n",
    "\n",
    "        # Combine audio data into a single numpy array\n",
    "        combined_speech = np.array(combined_speech)\n",
    "\n",
    "        return 16000, combined_speech  # Return sample rate and combined audio data\n",
    "    except Exception as e:\n",
    "        return None, f\"Error in text-to-speech conversion: {e}\"\n",
    "\n",
    "# Gradio Interface\n",
    "def gradio_interface(text):\n",
    "    sample_rate, audio_data = text_to_speech(text)\n",
    "    if sample_rate and isinstance(audio_data, np.ndarray):\n",
    "        return sample_rate, audio_data\n",
    "    else:\n",
    "        return None  # Return None if there's an error\n",
    "\n",
    "interface = gr.Interface(\n",
    "    fn=gradio_interface,\n",
    "    title=\"Text to Voice T5\",  # Add a title to the interface\n",
    "    description=\"Developed by Ruslan Magana, visit <a href='https://ruslanmv.com/' target='_blank'>ruslanmv.com</a> for more information.\",\n",
    "    inputs=gr.Textbox(lines=10, label=\"Enter text to convert to speech\"),\n",
    "    outputs=gr.Audio(label=\"Generated audio\")\n",
    ")\n",
    "\n",
    "interface.launch()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (watson)",
   "language": "python",
   "name": "watson"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}