| | |
| | |
| | |
| | |
| |
|
| | import os |
| | from tqdm import tqdm |
| | from text.g2p_module import G2PModule, LexiconModule |
| | from text.symbol_table import SymbolTable |
| |
|
| | ''' |
| | phoneExtractor: extract phone from text |
| | ''' |
class phoneExtractor:
    '''
    Convert utterance text into phone sequences and collect the phone symbols seen.

    The backend is chosen by ``cfg.preprocess.phone_extractor``: either one of the
    G2PModule backends listed in ``_G2P_BACKENDS`` or ``"lexicon"``, which builds a
    LexiconModule from ``cfg.preprocess.lexicon_path``.
    '''

    # Extractor names that are passed to G2PModule as its ``backend``.
    _G2P_BACKENDS = ("espeak", "pypinyin", "pypinyin_initials_finals")

    def __init__(self, cfg, dataset_name=None, phone_symbol_file=None):
        '''
        Args:
            cfg: config; reads cfg.preprocess.phone_extractor and, depending on the
                path taken, cfg.preprocess.lexicon_path, cfg.preprocess.processed_dir
                and cfg.preprocess.symbols_dict
            dataset_name: name of dataset; used to derive the symbol table path when
                phone_symbol_file is not given
            phone_symbol_file: explicit path of the phone symbol table; takes
                precedence over dataset_name

        Raises:
            NotImplementedError: if cfg.preprocess.phone_extractor is not supported
            AssertionError: if "lexicon" is selected with an empty lexicon_path
        '''
        self.cfg = cfg

        # Phone symbols accumulated by extract_phone() for the G2P backends.
        self.phone_symbols = set()

        # Always define both attributes so later methods never hit AttributeError.
        self.dataset_name = dataset_name
        self.phone_symbols_file = None
        if phone_symbol_file is not None:
            self.phone_symbols_file = phone_symbol_file
        elif dataset_name is not None:
            self.phone_symbols_file = os.path.join(
                cfg.preprocess.processed_dir,
                dataset_name,
                cfg.preprocess.symbols_dict,
            )

        # Initialize the g2p module for the configured extractor.
        extractor = cfg.preprocess.phone_extractor
        if extractor in self._G2P_BACKENDS:
            self.g2p_module = G2PModule(backend=extractor)
        elif extractor == 'lexicon':
            assert cfg.preprocess.lexicon_path != "", \
                "cfg.preprocess.lexicon_path must be set for the lexicon extractor"
            self.g2p_module = LexiconModule(cfg.preprocess.lexicon_path)
        else:
            # NotImplementedError subclasses RuntimeError, which is what the former
            # bare ``raise`` produced, so existing ``except RuntimeError`` still works.
            raise NotImplementedError(
                "Unsupported phone extractor: {!r}".format(extractor)
            )

    def extract_phone(self, text):
        '''
        Extract phone from text
        Args:
            text: text of utterance

        Returns:
            phone_seq: list of phones of the utterance

        Raises:
            NotImplementedError: if cfg.preprocess.phone_extractor is not supported

        For the G2P backends the phones are also added to self.phone_symbols;
        the lexicon branch does not touch self.phone_symbols.
        '''
        extractor = self.cfg.preprocess.phone_extractor

        if extractor in self._G2P_BACKENDS:
            # Replace curly double quotes with plain ASCII ones before conversion.
            text = text.replace("”", '"').replace("“", '"')
            phone = self.g2p_module.g2p_conversion(text=text)
            self.phone_symbols.update(phone)
            return list(phone)

        if extractor == 'lexicon':
            phone_seq = self.g2p_module.g2p_conversion(text)
            # A non-list result is split on whitespace into individual phones.
            if not isinstance(phone_seq, list):
                phone_seq = phone_seq.split()
            return phone_seq

        raise NotImplementedError(
            "Unsupported phone extractor: {!r}".format(extractor)
        )

    def save_dataset_phone_symbols_to_table(self):
        '''
        Merge the collected phone symbols with any table already stored at
        self.phone_symbols_file, then write the sorted union back to that file.

        Raises:
            ValueError: if no symbol table path was configured
        '''
        if self.phone_symbols_file is None:
            raise ValueError(
                "No phone symbol file configured: pass dataset_name or "
                "phone_symbol_file when constructing phoneExtractor"
            )

        # Load and merge previously saved phone symbols; the symbols are read from
        # the private ``_sym2id`` mapping of the loaded table.
        if os.path.exists(self.phone_symbols_file):
            phone_symbol_dict_saved = SymbolTable.from_file(self.phone_symbols_file)._sym2id.keys()
            self.phone_symbols.update(set(phone_symbol_dict_saved))

        # Save phone symbols in sorted order.
        phone_symbol_dict = SymbolTable()
        for s in sorted(self.phone_symbols):
            phone_symbol_dict.add(s)
        phone_symbol_dict.to_file(self.phone_symbols_file)
| |
|
| | |
def extract_utt_phone_sequence(cfg, metadata):
    '''
    Write one ``<Uid>.phone`` file per utterance holding its space-joined phones.

    Args:
        cfg: config; reads cfg.dataset[0] and cfg.preprocess.processed_dir,
            phone_dir and phone_extractor
        metadata: list of dict, each dict contains "Uid", "Text"

    Files go to <processed_dir>/<dataset>/<phone_dir>. Unless the extractor is
    'lexicon', the dataset's phone symbol table is saved after all utterances
    have been processed.
    '''
    # Only the first configured dataset is processed.
    corpus = cfg.dataset[0]

    # Output directory for the per-utterance phone files.
    phone_root = os.path.join(
        cfg.preprocess.processed_dir, corpus, cfg.preprocess.phone_dir
    )
    os.makedirs(phone_root, exist_ok=True)

    extractor = phoneExtractor(cfg, corpus)

    for entry in tqdm(metadata):
        utt_id = entry["Uid"]
        phones = extractor.extract_phone(entry["Text"])

        target = os.path.join(phone_root, utt_id + ".phone")
        with open(target, "w") as handle:
            handle.write(" ".join(phones))

    # The lexicon extractor does not maintain a phone symbol table.
    if cfg.preprocess.phone_extractor == "lexicon":
        return
    extractor.save_dataset_phone_symbols_to_table()
| | |
| | |
| | |
def save_all_dataset_phone_symbols_to_table(self, cfg, dataset):
    '''
    Merge the saved phone symbol tables of several datasets and write the union
    back to every dataset's table file.

    Args:
        self: unused by the body; kept so existing call sites keep working
        cfg: config; reads cfg.preprocess.processed_dir and
            cfg.preprocess.symbols_dict
        dataset: dataset names; iterated twice (once to load, once to save)

    Raises:
        AssertionError: if a dataset's symbol table file does not exist
    '''
    merged_symbols = set()

    # Pass 1: load every dataset's saved table and collect its symbols.
    for name in dataset:
        table_path = os.path.join(
            cfg.preprocess.processed_dir, name, cfg.preprocess.symbols_dict
        )
        assert os.path.exists(table_path)
        saved_table = SymbolTable.from_file(table_path)
        merged_symbols |= set(saved_table._sym2id.keys())

    # Build one shared table with the symbols added in sorted order.
    shared_table = SymbolTable()
    for symbol in sorted(merged_symbols):
        shared_table.add(symbol)

    # Pass 2: overwrite each dataset's table with the merged one.
    for name in dataset:
        shared_table.to_file(
            os.path.join(
                cfg.preprocess.processed_dir, name, cfg.preprocess.symbols_dict
            )
        )
| | |
| | |