{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "5a3ddcc8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"d:\\Project\\TTS\\StyleTTS2-lite\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt_tab to\n",
"[nltk_data] C:\\Users\\catto\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt_tab is already up-to-date!\n"
]
}
],
"source": [
"%cd ../..\n",
"from inference import StyleTTS2\n",
"\n",
"import librosa\n",
"import IPython.display as ipd\n",
"import torch.cuda\n",
"\n",
"# Prefer GPU when available; uncomment the override below to force CPU.\n",
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
"#device = 'cpu'"
]
},
{
"cell_type": "markdown",
"id": "092cfb69",
"metadata": {},
"source": [
"### Load G2P"
]
},
{
"cell_type": "markdown",
"id": "a152ec13",
"metadata": {},
"source": [
"If you did not use eSpeak for your language, please add your own G2P."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ca224f37",
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import phonemizer\n",
"\n",
"# On Windows, point phonemizer at the espeak-ng library bundled by espeakng_loader.\n",
"if sys.platform.startswith(\"win\"):\n",
"    try:\n",
"        from phonemizer.backend.espeak.wrapper import EspeakWrapper\n",
"        import espeakng_loader\n",
"        EspeakWrapper.set_library(espeakng_loader.get_library_path())\n",
"    except Exception as e:\n",
"        print(e)\n",
"\n",
"# Cache one backend per language: constructing EspeakBackend is expensive,\n",
"# and the previous version rebuilt it on every get_phoneme() call.\n",
"_espeak_backends = {}\n",
"\n",
"def get_phoneme(text, lang):\n",
"    \"\"\"Phonemize `text` with eSpeak for `lang`; prints and returns None on failure.\"\"\"\n",
"    try:\n",
"        if lang not in _espeak_backends:\n",
"            _espeak_backends[lang] = phonemizer.backend.EspeakBackend(\n",
"                language=lang, preserve_punctuation=True,\n",
"                with_stress=True, language_switch='remove-flags')\n",
"        return _espeak_backends[lang].phonemize([text])[0]\n",
"    except Exception as e:\n",
"        print(e)"
]
},
{
"cell_type": "markdown",
"id": "7b9cecbe",
"metadata": {},
"source": [
"### Load models"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e7b9c01d",
"metadata": {},
"outputs": [],
"source": [
"# Paths to the model configuration and the finetuned checkpoint.\n",
"config_path = \"Configs/config.yaml\"\n",
"models_path = \"Models/Finetune/current_model_100k.pth\""
]
},
{
"cell_type": "markdown",
"id": "b803110e",
"metadata": {},
"source": [
"### Synthesize speech\n",
"\n",
"Little Note: Reference audio has a huge impact on the result. It is best to select audio around 10s long and consistent in both tone and speed."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "78396f70",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"./Demo/Audio/1_heart.wav\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"speaker = {\n",
"    \"path\": \"./Demo/Audio/1_heart.wav\", #Ref audio path\n",
"    \"speed\": 1.0, #Speaking speed\n",
"}\n",
"\n",
"max_samples = 24000*20 #max 20 seconds ref audio\n",
"print(speaker['path'])\n",
"# librosa.load(sr=24000) already resamples to 24 kHz, so `sr` is always 24000\n",
"# here. The old `if sr != 24000: librosa.resample(audio, sr, 24000)` fallback\n",
"# was dead code and used the positional signature removed in librosa 0.10.\n",
"wave, sr = librosa.load(speaker['path'], sr=24000)\n",
"audio, index = librosa.effects.trim(wave, top_db=30)  # strip leading/trailing silence\n",
"if len(audio) > max_samples: audio = audio[:max_samples]\n",
"display(ipd.Audio(audio, rate=24000, normalize=True))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "395959f1",
"metadata": {},
"outputs": [],
"source": [
"# Multi-sentence input text used for long-form synthesis below.\n",
"text = '''\n",
"Nearly 300 scholars currently working in the United States have applied for positions at Aix-Marseille University in France, which has announced a program to provide a haven for academics affected by the Trump administration's policies.\n",
"Aix-Marseille launched the \"Safe Place for Science\" initiative earlier this year, offering three-year funded placements for approximately 20 researchers. The program aims to support scholars facing budget cuts and policy changes that have disrupted U.S. academic institutions.\n",
"'''"
]
},
{
"cell_type": "markdown",
"id": "1e846446",
"metadata": {},
"source": [
"| Parameter | Type | Description | Performance Impact |\n",
"|----------------|--------|-----------------------------------------------------------------------------|----------------------------------------------------|\n",
"| `avg_style` | BOOL | Split the reference audio and calculate the average speaking style. | Higher computation during style extraction |\n",
"| `denoise` | FLOAT | Adjusts denoiser strength; range [0, 1]. | Additional computation for style processing |\n",
"| `stabilize` | BOOL | Stabilizes speaking speed for long-form synthesis. | Slight additional computation |\n",
"| `n_merge` | INT | Avoids short sentences by merging if words < `n_merge`. | Higher VRAM usage as value increases |\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16194211",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\catto\\anaconda3\\Lib\\site-packages\\torch\\nn\\utils\\weight_norm.py:28: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n",
" warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n",
"c:\\Users\\catto\\anaconda3\\Lib\\site-packages\\torch\\nn\\modules\\rnn.py:83: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.2 and num_layers=1\n",
" warnings.warn(\"dropout option adds dropout after all but last \"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"decoder : 54289492\n",
"predictor : 16194612\n",
"text_encoder : 5606400\n",
"style_encoder : 13845440\n",
"\n",
"Total : 89935944\n"
]
}
],
"source": [
"model = StyleTTS2(config_path, models_path).eval().to(device)\n",
"\n",
"# Generation settings (see the parameter table above).\n",
"avg_style = True    # average the speaking style across reference-audio splits\n",
"denoise = 0.3       # denoiser strength, range [0, 1]\n",
"stabilize = False   # stabilize speaking speed for long-form synthesis\n",
"n_merge = 16        # merge sentences shorter than this many words"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "980c6fbb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Computing the style for: ./Demo/Audio/1_heart.wav\n",
"Generating Audio...\n",
"Synthesized:\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Inference only: no gradients needed.\n",
"with torch.no_grad():\n",
"    phonemes = get_phoneme(text=text, lang=\"en-us\")\n",
"    styles = model.get_styles(speaker, denoise, avg_style)  # extract style from the reference audio\n",
"    r = model.generate(phonemes, styles, stabilize, n_merge)\n",
"\n",
"print('Synthesized:')\n",
"display(ipd.Audio(r, rate=24000, normalize=True))"
]
},
{
"cell_type": "markdown",
"id": "b664ec62",
"metadata": {},
"source": [
"Optional: The styles tensor can be saved and re-used later for faster generation."
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "77bbf7e9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Saved styles!\n",
"Loaded styles!\n"
]
}
],
"source": [
"# Persist the extracted styles so later runs can skip style extraction.\n",
"model.save_styles(save_dir=\"./style1.pt\")\n",
"model.load_styles(save_dir=\"./style1.pt\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "15411a86",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Generating Audio...\n",
"Synthesized:\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"with torch.no_grad():\n",
"    phonemes = get_phoneme(text=text, lang=\"en-us\")\n",
"    # load_styles=True reuses the styles loaded above instead of recomputing them\n",
"    styles = model.get_styles(speaker, load_styles=True)\n",
"    r = model.generate(phonemes, styles, stabilize, n_merge)\n",
"\n",
"print('Synthesized:')\n",
"display(ipd.Audio(r, rate=24000, normalize=True))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}