File size: 5,769 Bytes

d9edac2

{
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# HebTTS Quickstart\n",
        "We created a jupyter notebook in order to easily generate your desired samples!\n",
        "\n",
        "\n",
        "  <a href='https://arxiv.org/abs/2407.12206'><img src='https://img.shields.io/badge/ArXiv-PDF-red'></a> &nbsp;\n",
        "   <a href='https://pages.cs.huji.ac.il/adiyoss-lab/HebTTS/'><img src='https://img.shields.io/badge/Project-Page-Green'></a> &nbsp;\n",
        "  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1f3-6Dqbna9_hI5C9V4qTIG05dixW-r72?usp=sharing) &nbsp;\n",
        "  [![Open In Colab](https://badges.aleen42.com/src/github.svg)](https://github.com/slp-rl/HebTTS) &nbsp;\n",
        "\n",
        "\n",
        "\n",
        "\n",
        "---\n",
        "\n",
        "\n"
      ],
      "metadata": {
        "id": "3wmzNy2fDSsr"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "56kkbeJahIe1"
      },
      "outputs": [],
      "source": [
        "#@title Install ‍💻\n",
        "\n",
        "%%capture\n",
        "! git clone https://github.com/slp-rl/HebTTS.git\n",
        "\n",
        "! pip install torch torchaudio\n",
        "! pip install torchmetrics\n",
        "! pip install omegaconf\n",
        "! pip install git+https://github.com/lhotse-speech/lhotse\n",
        "! pip install librosa\n",
        "! pip install encodec\n",
        "! pip install phonemizer\n",
        "! pip install audiocraft  # optional\n",
        "! gdown https://drive.google.com/uc?id=11NoOJzMLRX9q1C_Q4sX0w2b9miiDjGrv\n",
        "\n",
        "from pathlib import Path\n",
        "import glob\n",
        "import os\n",
        "from IPython.display import Audio, display\n",
        "from pathlib import Path\n",
        "\n",
        "\n",
        "def display_audio(prompt_file):\n",
        "  l=100\n",
        "  speaker = os.path.basename(os.path.dirname((os.path.dirname(prompt_file))))\n",
        "  audio_files = list(Path(os.path.join(os.path.dirname(prompt_file))).rglob(\"*.wav\"))\n",
        "  print(f\"Prompt: {open(prompt_file).read()}\")\n",
        "  print(f\"Speaker: {speaker}\")\n",
        "  print(\"-\"*100)\n",
        "\n",
        "  for audio_file in audio_files:\n",
        "    display(Audio(audio_file, autoplay=False))\n",
        "    print(\"-\"*100)\n",
        "    print()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "uIBF-IFwxLjL",
        "cellView": "form"
      },
      "outputs": [],
      "source": [
        "# @title Enter Hebrew text for generation\n",
        "# @markdown You can concatenate prompts with '|' to generate few samples at once\n",
        "hebrew_text_to_generate = \"היי מה קורה | ובשביל להבין למה מחיר הדלק כל כך עלה צריך לחזור שנתיים אחרונית\" # @param {type:\"string\"}"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "RFrYpBJsxQW3",
        "cellView": "form"
      },
      "outputs": [],
      "source": [
        "\n",
        "# @title Choose speaker\n",
        "speaker = \"shaul\" # @param [\"\\\"osim\\\"\", \"\\\"geek\\\"\", \"\\\"shaul\\\"\"] {type:\"raw\", allow-input: true}"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "**Note:** inference may be slower based on the allocated resources by google colab. It is recomended to choose a gpu in `Edit` -> `Notebook settings` -> `Hardware accelerator`."
      ],
      "metadata": {
        "id": "NMvd3VytEJEZ"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "U8Hszx6axgns"
      },
      "outputs": [],
      "source": [
        "#@title Generate!\n",
        "from pathlib import Path\n",
        "\n",
        "speaker_dir = Path(f\"./out/{speaker}\")\n",
        "speaker_dir.mkdir(exist_ok=True, parents=True)\n",
        "i=0\n",
        "if any(speaker_dir.iterdir()):\n",
        "    max(map(lambda x:int(x.name),speaker_dir.glob(\"*\")))+1\n",
        "output_dir = speaker_dir / f\"{i}\"\n",
        "output_dir.mkdir(exist_ok=True, parents=True)\n",
        "prompt_file = output_dir / \"prompt.txt\"\n",
        "with open(prompt_file, \"w\") as f:\n",
        "    f.write(hebrew_text_to_generate)\n",
        "\n",
        "! python HebTTS/infer.py \\\n",
        "    --checkpoint checkpoint.pt \\\n",
        "    --output-dir $output_dir                                             \\\n",
        "    --text \"$hebrew_text_to_generate\" \\\n",
        "    --speaker \"$speaker\" \\\n",
        "    --speaker-yaml HebTTS/speakers/speakers.yaml \\\n",
        "    --tokens-file HebTTS/tokenizer/unique_words_tokens_all.k2symbols \\\n",
        "    --vocab-file HebTTS/tokenizer/vocab.txt \\\n",
        "    --mbd True\n",
        "\n",
        "display_audio(prompt_file)"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "#@title Listen\n",
        "for prompt_file in glob.glob(\"./out/*/*/prompt.txt\"):\n",
        "  display_audio(prompt_file)"
      ],
      "metadata": {
        "cellView": "form",
        "id": "CXiMWmNxJzNZ"
      },
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "nbformat": 4,
  "nbformat_minor": 0
}