| | import logging |
| |
|
| | import regex as re |
| |
|
| | from tools.classify_language import classify_language, split_alpha_nonalpha |
| |
|
| |
|
def check_is_none(item) -> bool:
    """Return True when *item* counts as empty.

    Empty means: ``None``, a whitespace-only string, or any value whose
    ``str()`` form is the empty string. Everything else is considered
    non-empty (note that ``0`` and ``[]`` stringify to "0" / "[]" and are
    therefore NOT empty here).
    """
    if item is None:
        return True
    if isinstance(item, str) and item.isspace():
        return True
    return str(item) == ""
| |
|
| |
|
def markup_language(text: str, target_languages: list = None) -> str:
    """Annotate *text* in place with [LANG] tags at language boundaries.

    Each punctuation-delimited segment is classified with
    classify_language(); an opening tag such as "[ZH]" is inserted before
    the first segment, a closing+opening pair (e.g. "[ZH][EN]") is inserted
    wherever the detected language changes, and a final closing tag is
    appended at the end.

    :param text: text to annotate; original punctuation is preserved.
    :param target_languages: optional candidate languages forwarded to
        classify_language(). For the en/zh, en/ja and en/ja/zh mixes,
        segments are first split further into alpha / non-alpha runs.
    :return: the annotated text, e.g. "[ZH]你好[ZH][EN]hello[EN]".
    """
    # Punctuation classes (ASCII + fullwidth/CJK) used as segment breaks.
    pattern = (
        r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
        r"\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」"
        r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
    )
    sentences = re.split(pattern, text)

    pre_lang = ""  # language of the previous segment ("" = none seen yet)
    p = 0  # index into `text`; everything before p is already tagged

    if target_languages is not None:
        sorted_target_languages = sorted(target_languages)
        if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]:
            # Latin/non-Latin mixes: split each segment into alpha and
            # non-alpha runs so mid-segment switches are caught too.
            new_sentences = []
            for sentence in sentences:
                new_sentences.extend(split_alpha_nonalpha(sentence))
            sentences = new_sentences

    for sentence in sentences:
        # Skip empty / whitespace-only fragments produced by the split.
        if check_is_none(sentence):
            continue

        lang = classify_language(sentence, target_languages)

        if pre_lang == "":
            # First segment: insert the opening tag before it. Replacement
            # is restricted to text[p:] so identical substrings earlier in
            # the (already tagged) prefix are not touched.
            text = text[:p] + text[p:].replace(
                sentence, f"[{lang.upper()}]{sentence}", 1
            )
            p += len(f"[{lang.upper()}]")
        elif pre_lang != lang:
            # Language switch: close the previous run, open the new one.
            text = text[:p] + text[p:].replace(
                sentence, f"[{pre_lang.upper()}][{lang.upper()}]{sentence}", 1
            )
            p += len(f"[{pre_lang.upper()}][{lang.upper()}]")
        pre_lang = lang
        # Advance p past this segment within the partially rewritten text.
        p += text[p:].index(sentence) + len(sentence)
    # Close the final language run.
    text += f"[{pre_lang.upper()}]"

    return text
| |
|
| |
|
def split_by_language(text: str, target_languages: list = None) -> list:
    """Partition *text* into ``(segment, language)`` tuples.

    Segments are delimited by runs of punctuation; consecutive segments in
    the same detected language are merged into one tuple, and the original
    punctuation stays attached to the segment that precedes it.

    :param text: text to partition.
    :param target_languages: optional candidate languages forwarded to
        classify_language(). For the en/zh, en/ja and en/ja/zh mixes,
        chunks are first split further into alpha / non-alpha runs.
    :return: list of (substring-of-text, language-code) tuples covering
        the whole input in order.
    """
    # Punctuation classes (ASCII + fullwidth/CJK) acting as soft breaks.
    delimiters = (
        r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
        r"\!?\。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」"
        r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
    )
    chunks = re.split(delimiters, text)

    if target_languages is not None:
        # Latin/non-Latin mixes: refine chunks into alpha/non-alpha runs.
        if sorted(target_languages) in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]:
            refined = []
            for chunk in chunks:
                refined.extend(split_alpha_nonalpha(chunk))
            chunks = refined

    result = []
    prev_lang = ""  # language of the run currently being accumulated
    seg_start = 0  # start of the current run inside `text`
    cursor = 0  # scan position inside `text`

    for chunk in chunks:
        if check_is_none(chunk):
            continue

        lang = classify_language(chunk, target_languages)

        # Locate this chunk in the original text so the punctuation that
        # was split away stays inside the emitted segments.
        cursor += text[cursor:].index(chunk)
        if prev_lang and prev_lang != lang:
            # Language changed: flush everything up to this chunk.
            result.append((text[seg_start:cursor], prev_lang))
            seg_start = cursor
        cursor += len(chunk)
        prev_lang = lang

    # Flush the final run (including any trailing punctuation).
    result.append((text[seg_start:], prev_lang))

    return result
| |
|
| |
|
def sentence_split(text: str, max: int) -> list:
    """Greedily split *text* into chunks of at least *max* characters.

    Splits only after runs of punctuation: sentence fragments are
    accumulated (punctuation included) until the running length reaches
    *max*, at which point a chunk is cut and stripped of surrounding
    whitespace. Any remainder after the last punctuation run is appended
    unstripped.

    :param text: text to split.
    :param max: length threshold for cutting a chunk (parameter name kept
        for backward compatibility although it shadows the builtin).
    :return: list of chunks; empty list for empty input.
    """
    pattern = r"[!(),—+\-.:;??。,、;:]+"
    sentences = re.split(pattern, text)
    separators = re.findall(pattern, text)

    sentences_list, count, p = [], 0, 0

    # Pair each fragment with the punctuation run that follows it.
    # NOTE: the original code reused the name `discarded_chars` for both the
    # list and the loop variable, shadowing the list mid-iteration; the
    # separator run is now bound to its own name.
    for i, sep in enumerate(separators):
        count += len(sentences[i]) + len(sep)
        if count >= max:
            sentences_list.append(text[p : p + count].strip())
            p += count
            count = 0

    # Trailing text after the last cut (or the whole text when no cut fired).
    if p < len(text):
        sentences_list.append(text[p:])

    return sentences_list
| |
|
| |
|
def sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None):
    """Split *text* into chunks and wrap each chunk in [LANG] markup.

    :param text: input text.
    :param max: length threshold forwarded to sentence_split(); a value
        <= 0 keeps the whole text as a single chunk.
    :param lang: "auto" (tag per-chunk via markup_language), "mix" (return
        the text untagged), or an explicit language code used verbatim.
    :param speaker_lang: languages the speaker supports; when exactly one
        is given, a conflicting explicit *lang* is overridden by it.
    :return: list of marked-up chunks (each is logged at debug level).
    """
    # A single-language speaker forces that language unless AUTO/MIX.
    if speaker_lang is not None and len(speaker_lang) == 1:
        only_lang = speaker_lang[0]
        if lang.upper() not in ["AUTO", "MIX"] and lang.lower() != only_lang:
            logging.debug(
                f'lang "{lang}" is not in speaker_lang {speaker_lang},automatically set lang={speaker_lang[0]}'
            )
            lang = only_lang

    sentences_list = []
    if lang.upper() == "MIX":
        # MIX: caller supplies its own markup; pass the text through.
        sentences_list.append(text)
    elif max <= 0:
        # No length limit: one chunk covering the whole text.
        sentences_list.append(
            markup_language(text, speaker_lang)
            if lang.upper() == "AUTO"
            else f"[{lang.upper()}]{text}[{lang.upper()}]"
        )
    else:
        for chunk in sentence_split(text, max):
            if check_is_none(chunk):
                continue
            sentences_list.append(
                markup_language(chunk, speaker_lang)
                if lang.upper() == "AUTO"
                else f"[{lang.upper()}]{chunk}[{lang.upper()}]"
            )

    for item in sentences_list:
        logging.debug(item)

    return sentences_list
| |
|
| |
|
if __name__ == "__main__":
    # Demo 1: markup, length-splitting and combined markup on Chinese prose.
    sample = "这几天心里颇不宁静。今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。我悄悄地披了大衫,带上门出去。"
    print(markup_language(sample, target_languages=None))
    print(sentence_split(sample, max=50))
    print(sentence_split_and_markup(sample, max=50, lang="auto", speaker_lang=None))

    # Demo 2: zh/ja/en mixed text split by language.
    sample = "你好,这是一段用来测试自动标注的文本。こんにちは,これは自動ラベリングのテスト用テキストです.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了GAN Duration predictor和transformer flow,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
    print(split_by_language(sample, ["zh", "ja", "en"]))

    # Demo 3: the same text under different target-language sets.
    sample = "vits和Bert-VITS2是tts模型。花费3days.花费3天。Take 3 days"
    print(split_by_language(sample, ["zh", "ja", "en"]))
    print(split_by_language(sample, ["zh", "en"]))

    # Demo 4: spaces between latin and CJK runs.
    sample = "vits 和 Bert-VITS2 是 tts 模型。花费 3 days. 花费 3天。Take 3 days"
    print(split_by_language(sample, ["zh", "en"]))
| |
|