sts

Sleeping

App Files Files Community

gratias98 committed on Jan 2, 2025

Commit

ecd73f4

verified ·

1 Parent(s): 3c74bb7

Update tts.py

Browse files

Files changed (1) hide show

tts.py +169 -106

tts.py CHANGED Viewed

@@ -1,117 +1,180 @@
-import os, re, tempfile, torch, sys
 import numpy as np
-import psutil
 from huggingface_hub import hf_hub_download
 if "vits" not in sys.path:
-   sys.path.append("vits")
-from vits import commons, utils
 from vits.models import SynthesizerTrn
-# Load languages
 TTS_LANGUAGES = {}
# Build the ISO-code -> language-name table from the language list file.
# Each non-blank line is "<iso> <name>", split on the first space only so
# names may themselves contain spaces.
with open("data/tts/all_langs.tsv", encoding="utf-8") as f:
    TTS_LANGUAGES = {
        iso.strip(): name.strip()
        # Split each line once, and skip blank lines (e.g. a trailing newline)
        # instead of crashing on them.
        for iso, name in (line.split(" ", 1) for line in f if line.strip())
    }
class TextMapper:
    """Convert text into the symbol-ID sequences fed to the synthesizer.

    The vocabulary file holds one symbol per line; a symbol's ID is its
    zero-based line index.
    """

    def __init__(self, vocab_file):
        """Load the vocabulary.

        Args:
            vocab_file: path to a UTF-8 text file with one symbol per line.

        Raises:
            ValueError: if the vocabulary contains no single-space symbol.
        """
        # Remove only the line terminator. A full ``strip()`` would turn the
        # space symbol's line into "" and make the ``index(" ")`` below fail.
        with open(vocab_file, encoding="utf-8") as f:
            self.symbols = [line.rstrip("\n") for line in f]
        self.SPACE_ID = self.symbols.index(" ")
        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
        self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}

    def text_to_sequence(self, text, cleaner_names):
        """Return the list of symbol IDs for ``text`` (surrounding whitespace stripped).

        ``cleaner_names`` is accepted for interface compatibility and is not used.

        Raises:
            KeyError: if ``text`` contains a symbol missing from the vocabulary.
        """
        return [self._symbol_to_id[s] for s in text.strip()]

    def uromanize(self, text, uroman_pl):
        """Romanize ``text`` with the uroman Perl script at ``uroman_pl``.

        Returns:
            The script's output with every whitespace run collapsed to one space.

        Raises:
            OSError: if ``perl`` cannot be started.
            RuntimeError: if the script exits with a non-zero status.
        """
        import subprocess  # function-scope import keeps this edit self-contained

        # Argument list + pipes instead of a shell command line: no quoting
        # problems with the script path, no temp files, and failures surface.
        proc = subprocess.run(
            ["perl", uroman_pl, "-l", "xxx"],
            input=text + "\n",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            encoding="utf-8",
        )
        if proc.returncode != 0:
            raise RuntimeError(
                f"uroman failed with exit code {proc.returncode}: {proc.stderr.strip()}"
            )
        return re.sub(r"\s+", " ", proc.stdout).strip()

    def get_text(self, text, hps):
        """Encode ``text`` as a ``torch.LongTensor`` of symbol IDs.

        When ``hps.data.add_blank`` is set, ``commons.intersperse`` inserts
        ID 0 into the sequence.
        """
        text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
        if hps.data.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        return torch.LongTensor(text_norm)

    def filter_oov(self, text, lang=None):
        """Drop every character of ``text`` that is not in the vocabulary.

        Args:
            text: text to filter.
            lang: language code or label, e.g. ``"ron"`` or ``"ron (Romanian)"``.
                Only the first whitespace-separated token is inspected.
        """
        # Callers pass labels such as "ron (Romanian)", so compare the leading
        # code rather than the whole string.
        iso = lang.split()[0] if lang and lang.strip() else None
        if iso == "ron":
            text = text.replace("ț", "ţ")
        return "".join(c for c in text if c in self._symbol_to_id)
def _infer_waveform(net_g, stn_tst, speed, device):
    """Run one inference pass of ``net_g`` on ``device`` and return the audio as a NumPy array."""
    net_g.to(device)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        return net_g.infer(
            x_tst,
            x_tst_lengths,
            noise_scale=0.667,
            noise_scale_w=0.8,
            length_scale=1.0 / speed,
        )[0][0, 0].cpu().float().numpy()


def synthesize(text=None, lang=None, speed=1.0):
    """Synthesize speech for ``text`` with the MMS-TTS model of ``lang``.

    Args:
        text: the text to speak.
        lang: language label whose first whitespace-separated token is the
            model's language code, e.g. ``"eng (English)"``.
        speed: speaking-rate factor; the model's ``length_scale`` is
            ``1.0 / speed``. ``None`` means 1.0.

    Returns:
        ``((sampling_rate, waveform), text)`` where ``waveform`` is a NumPy
        array and ``text`` is the lower-cased, vocabulary-filtered text that
        was actually synthesized.

    Raises:
        ValueError: if ``text`` is None, ``lang`` has no language code, or
            ``speed`` is not positive.
        RuntimeError: if system memory usage is above 85%, or inference fails.
    """
    # Validate before any download or model load so bad input fails fast.
    if speed is None:
        speed = 1.0
    if text is None:
        raise ValueError("text must be a string, got None")
    if not lang or not lang.split():
        raise ValueError("lang must start with a language code, e.g. 'eng (English)'")
    if speed <= 0:
        raise ValueError(f"speed must be positive, got {speed!r}")

    # Memory check
    if psutil.virtual_memory().percent > 85:
        raise RuntimeError("System memory usage too high")

    lang_code = lang.split()[0]

    # Download model files
    model_dir = f"models/{lang_code}"
    files = {
        key: hf_hub_download("facebook/mms-tts", filename, subfolder=model_dir)
        for key, filename in (
            ("vocab", "vocab.txt"),
            ("config", "config.json"),
            ("model", "G_100000.pth"),
        )
    }

    # Setup device: CUDA first, then MPS, then CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif (
        hasattr(torch.backends, "mps")
        and torch.backends.mps.is_available()
        and torch.backends.mps.is_built()
    ):
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    # Initialize model
    hps = utils.get_hparams_from_file(files["config"])
    text_mapper = TextMapper(files["vocab"])
    net_g = SynthesizerTrn(
        len(text_mapper.symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        **hps.model
    ).to(device).eval()
    utils.load_checkpoint(files["model"], net_g, None)

    # Process text
    if hps.data.training_files.endswith(".uroman"):
        text = text_mapper.uromanize(text, os.path.join("uroman", "bin", "uroman.pl"))
    # Pass the bare language code: the per-language character handling
    # compares against codes such as "ron", not labels like "ron (Romanian)".
    text = text_mapper.filter_oov(text.lower(), lang=lang_code)
    stn_tst = text_mapper.get_text(text, hps)

    # Generate audio. On an accelerator out-of-memory error, retry exactly
    # once on the CPU. Re-entering synthesize() would select the same
    # accelerator again and could recurse without bound.
    try:
        hyp = _infer_waveform(net_g, stn_tst, speed, device)
    except RuntimeError as err:
        if device.type == "cpu" or "out of memory" not in str(err):
            raise
        if device.type == "cuda":
            torch.cuda.empty_cache()
        hyp = _infer_waveform(net_g, stn_tst, speed, torch.device("cpu"))

    # Cleanup
    if device.type == "cuda":
        torch.cuda.empty_cache()
    return (hps.data.sampling_rate, hyp), text
 # Sample rows of [text, language label, speed]; same order as synthesize()'s parameters.
 TTS_EXAMPLES = [
-   ["I am going to the store.", "eng (English)", 1.0],
-   ["안녕하세요.", "kor (Korean)", 1.0],
-   ["क्या मुझे पीने का पानी मिल सकता है?", "hin (Hindi)", 1.0],
-   ["Tanış olmağıma çox şadam", "azj-script_latin (Azerbaijani, North)", 1.0],
-   ["Mu zo murna a cikin ƙasar.", "hau (Hausa)", 1.0]
 ]

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import os
+import re
+import tempfile
+import torch
+import sys
+import gradio as gr
 import numpy as np
 from huggingface_hub import hf_hub_download
+# Setup TTS env
 if "vits" not in sys.path:
+    sys.path.append("vits")
+from vits import commons, utils
 from vits.models import SynthesizerTrn
 TTS_LANGUAGES = {}
# Fill TTS_LANGUAGES (ISO code -> language name) from the language list file.
# Each non-blank line is "<iso> <name>", split on the first space only so
# names may themselves contain spaces.
with open("data/tts/all_langs.tsv", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline) instead of crashing on
        # the tuple unpack below.
        if not line.strip():
            continue
        iso, name = line.split(" ", 1)
        TTS_LANGUAGES[iso.strip()] = name.strip()
class TextMapper(object):
    """Convert text into the symbol-ID sequences fed to the synthesizer.

    The vocabulary file holds one symbol per line; a symbol's ID is its
    zero-based line index.
    """

    def __init__(self, vocab_file):
        """Load the vocabulary.

        Args:
            vocab_file: path to a UTF-8 text file with one symbol per line.

        Raises:
            ValueError: if the vocabulary contains no single-space symbol.
        """
        # ``with`` closes the file handle; only the newline is removed so the
        # space symbol's line survives intact.
        with open(vocab_file, encoding="utf-8") as f:
            self.symbols = [line.replace("\n", "") for line in f]
        self.SPACE_ID = self.symbols.index(" ")
        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
        self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}

    def text_to_sequence(self, text, cleaner_names):
        """Convert a string of text to the IDs of its symbols.

        Args:
            text: string to convert; surrounding whitespace is stripped first.
            cleaner_names: accepted for interface compatibility; not used.

        Returns:
            List of integers, one per symbol of the stripped text.

        Raises:
            KeyError: if the text contains a symbol missing from the vocabulary.
        """
        return [self._symbol_to_id[symbol] for symbol in text.strip()]

    def uromanize(self, text, uroman_pl):
        """Romanize ``text`` with the uroman Perl script at ``uroman_pl``.

        Returns:
            The first line of the script's output with whitespace runs
            collapsed to single spaces ("" when the script prints nothing).

        Raises:
            OSError: if ``perl`` cannot be started.
            RuntimeError: if the script exits with a non-zero status.
        """
        import subprocess  # function-scope import keeps this edit self-contained

        iso = "xxx"
        # Argument list + pipes instead of a shell command line: no quoting
        # problems with the script path, no temp files, and failures surface
        # here instead of as an IndexError on empty output.
        proc = subprocess.run(
            ["perl", uroman_pl, "-l", iso],
            input=text,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            encoding="utf-8",
        )
        if proc.returncode != 0:
            raise RuntimeError(
                f"uroman failed with exit code {proc.returncode}: {proc.stderr.strip()}"
            )
        first_line = proc.stdout.split("\n", 1)[0]
        return re.sub(r"\s+", " ", first_line).strip()

    def get_text(self, text, hps):
        """Encode ``text`` as a ``torch.LongTensor`` of symbol IDs.

        When ``hps.data.add_blank`` is set, ``commons.intersperse`` inserts
        ID 0 into the sequence.
        """
        text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
        if hps.data.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        return torch.LongTensor(text_norm)

    def filter_oov(self, text, lang=None):
        """Apply per-language character fixes, then drop out-of-vocabulary characters."""
        text = self.preprocess_char(text, lang=lang)
        return "".join(ch for ch in text if ch in self._symbol_to_id)

    def preprocess_char(self, text, lang=None):
        """Apply special treatment of characters in certain languages.

        Args:
            text: text to normalize.
            lang: language code or label, e.g. ``"ron"`` or ``"ron (Romanian)"``.
                Only the first whitespace-separated token is inspected.
        """
        # Callers pass labels such as "ron (Romanian)", so compare the leading
        # code rather than the whole string.
        iso = lang.split()[0] if lang and lang.strip() else None
        if iso == "ron":
            text = text.replace("ț", "ţ")
            print(f"{lang} (ț -> ţ): {text}")
        return text
def _select_device():
    """Return the torch device to run on: CUDA if available, else MPS, else CPU."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    mps = getattr(torch.backends, "mps", None)
    if mps is not None and mps.is_available() and mps.is_built():
        return torch.device("mps")
    return torch.device("cpu")


def _download_model_files(lang_code):
    """Fetch (vocab, config, generator checkpoint) paths for ``lang_code`` from facebook/mms-tts."""
    return tuple(
        hf_hub_download(
            repo_id="facebook/mms-tts",
            filename=filename,
            subfolder=f"models/{lang_code}",
        )
        for filename in ("vocab.txt", "config.json", "G_100000.pth")
    )


def synthesize(text=None, lang=None, speed=None):
    """Synthesize speech for ``text`` with the MMS-TTS model of ``lang``.

    Args:
        text: the text to speak.
        lang: language label whose first whitespace-separated token is the
            model's language code, e.g. ``"eng (English)"``.
        speed: speaking-rate factor; the model's ``length_scale`` is
            ``1.0 / speed``. ``None`` means 1.0.

    Returns:
        ``((sampling_rate, waveform), text)`` where ``waveform`` is a NumPy
        array and ``text`` is the lower-cased, vocabulary-filtered text that
        was actually synthesized.

    Raises:
        ValueError: if ``text`` is None, ``lang`` has no language code, or
            ``speed`` is not positive.
        FileNotFoundError: if the downloaded config file or the ``uroman``
            directory is missing.
    """
    # Validate before any download or model load so bad input fails fast.
    if speed is None:
        speed = 1.0
    if text is None:
        raise ValueError("text must be a string, got None")
    if not lang or not lang.split():
        raise ValueError("lang must start with a language code, e.g. 'eng (English)'")
    if speed <= 0:
        raise ValueError(f"speed must be positive, got {speed!r}")

    lang_code = lang.split()[0]
    vocab_file, config_file, g_pth = _download_model_files(lang_code)

    device = _select_device()
    print(f"Run inference with {device}")

    # Explicit raise instead of ``assert``: asserts are stripped under ``-O``.
    if not os.path.isfile(config_file):
        raise FileNotFoundError(f"{config_file} doesn't exist")
    hps = utils.get_hparams_from_file(config_file)
    text_mapper = TextMapper(vocab_file)
    net_g = SynthesizerTrn(
        len(text_mapper.symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        **hps.model,
    )
    net_g.to(device)
    net_g.eval()
    utils.load_checkpoint(g_pth, net_g, None)

    # When the training-file name ends in ".uroman", romanize the input first.
    if hps.data.training_files.split(".")[-1] == "uroman":
        uroman_dir = "uroman"
        if not os.path.exists(uroman_dir):
            raise FileNotFoundError(f"{uroman_dir} directory doesn't exist")
        uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
        text = text_mapper.uromanize(text, uroman_pl)

    text = text.lower()
    # Pass the bare language code: the per-language character handling
    # compares against codes such as "ron", not labels like "ron (Romanian)".
    text = text_mapper.filter_oov(text, lang=lang_code)
    stn_tst = text_mapper.get_text(text, hps)

    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        hyp = (
            net_g.infer(
                x_tst,
                x_tst_lengths,
                noise_scale=0.667,
                noise_scale_w=0.8,
                length_scale=1.0 / speed,
            )[0][0, 0]
            .cpu()
            .float()
            .numpy()
        )
    return (hps.data.sampling_rate, hyp), text
 # Sample rows of [text, language label, speed]; same order as synthesize()'s parameters.
 TTS_EXAMPLES = [
+    ["I am going to the store.", "eng (English)", 1.0],
+    ["안녕하세요.", "kor (Korean)", 1.0],
+    ["क्या मुझे पीने का पानी मिल सकता है?", "hin (Hindi)", 1.0],
+    ["Tanış olmağıma çox şadam", "azj-script_latin (Azerbaijani, North)", 1.0],
+    ["Mu zo murna a cikin ƙasar.", "hau (Hausa)", 1.0],
 ]