|
|
import torch
|
|
|
from phonemizer import phonemize
|
|
|
from phonemizer.separator import Separator
|
|
|
|
|
|
class TextEncoder:
    """Handles text-to-phoneme conversion for 15 languages.

    Uses the ``phonemizer`` eSpeak backend to turn raw text into a
    phoneme string, then maps each symbol of that string to an integer
    ID via ``self.vocab``. Degrades to character-level tokenization
    when eSpeak is unavailable.
    """

    def __init__(self, vocab_map=None):
        """
        Args:
            vocab_map: Optional dict mapping symbols to integer IDs.
                When None, a character-level vocabulary covering space,
                lowercase ASCII letters, and the word separator '|' is
                built (space maps to ID 0).
        """
        # Phonemes separated by spaces, words by '|', no syllable marks.
        self.separator = Separator(phone=' ', word='|', syllable='')

        # ISO 639-1 code -> eSpeak backend language identifier.
        self.lang_map = {
            'en': 'en-us', 'zh': 'cmn', 'es': 'es', 'fr': 'fr-fr',
            'de': 'de', 'ja': 'ja', 'ko': 'ko', 'ru': 'ru',
            'pt': 'pt', 'it': 'it', 'hi': 'hi', 'ar': 'ar',
            'tr': 'tr', 'nl': 'nl', 'bn': 'bn'
        }

        # `is not None` (not truthiness) so a deliberately supplied empty
        # mapping is honored instead of being silently replaced.
        if vocab_map is not None:
            self.vocab = vocab_map
        else:
            self.vocab = {c: i for i, c in enumerate(" abcdefghijklmnopqrstuvwxyz|")}

    def preprocess(self, text, lang_code='en'):
        """Convert ``text`` to a batch of token IDs.

        Args:
            text: Input string to encode.
            lang_code: ISO 639-1 language code; codes absent from
                ``self.lang_map`` fall back to the English eSpeak
                backend with a printed warning.

        Returns:
            torch.LongTensor of shape (1, num_tokens). Symbols missing
            from the vocabulary map to ID 0.
        """
        if lang_code not in self.lang_map:
            print(f"Warning: Language {lang_code} not fully supported, defaulting to English backend.")
            backend_lang = 'en-us'
        else:
            backend_lang = self.lang_map[lang_code]

        try:
            phonemes = phonemize(
                text,
                language=backend_lang,
                backend='espeak',
                separator=self.separator,
                strip=True,
                preserve_punctuation=True,
                njobs=1
            )
        except RuntimeError:
            # phonemizer raises RuntimeError when the eSpeak library
            # cannot be located; degrade to raw characters.
            print("Warning: eSpeak not found. Falling back to character-level tokenization.")
            phonemes = list(text)

        # NOTE(review): on the success path `phonemes` is a string, so this
        # iterates individual characters (IPA symbols absent from the default
        # vocab collapse to ID 0, colliding with space) — confirm per-character
        # granularity is intended rather than splitting on the separator.
        token_ids = [self.vocab.get(p, 0) for p in phonemes]
        # dtype=torch.long so downstream index/embedding ops get integer
        # indices even when token_ids is empty (default dtype would be float).
        return torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)
|
|
|
|