| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| """Text processing utilities for TTS inference. |
| |
| Provides: |
| - ``chunk_text_punctuation()``: Splits long text into model-friendly chunks at |
| sentence boundaries, with abbreviation-aware punctuation splitting. |
| - ``add_punctuation()``: Appends missing end punctuation (Chinese or English). |
| """ |
|
|
| from typing import List, Optional |
|
|
|
|
# Characters after which a sentence may be cut by ``chunk_text_punctuation()``.
# The CJK / fullwidth code points are written as ``\u`` escapes: they look
# almost identical to their ASCII twins, and width-normalising tools fold them
# to ASCII, which silently turns this set into ASCII-only punctuation.
SPLIT_PUNCTUATION = set(
    ".,;:!?"  # ASCII
    "\u3002"  # ideographic full stop
    "\uff0c\uff1b\uff1a\uff01\uff1f"  # fullwidth comma, semicolon, colon, !, ?
)

# Closing quotes and brackets. When one of these directly follows a sentence
# cut it is attached to the sentence before the cut instead of starting the
# next one.
CLOSING_MARKS = set(
    "\"')]"  # ASCII quotes and brackets
    "\u201d\u2019"  # right double / right single curly quotation marks
    "》>」】"  # angle and corner brackets
)
|
|
# Terminal punctuation: ``add_punctuation()`` leaves a text untouched when its
# last character is a member of this set.
#
# Curly quotes and fullwidth forms are written as ``\u`` escapes. Once such a
# character is folded to ASCII the entry either duplicates its ASCII twin or,
# for a double quote between double quotes, becomes ``"""`` and opens a
# triple-quoted string that swallows every entry up to the next ``"""``.
END_PUNCTUATION = {
    # --- Latin ---
    ";",
    ":",
    ",",
    ".",
    "!",
    "?",
    "…",
    ")",
    "]",
    "}",
    '"',
    "'",
    "\u201d",  # right double quotation mark
    "\u2019",  # right single quotation mark
    # --- CJK / fullwidth ---
    "\uff1b",  # fullwidth semicolon
    "\uff1a",  # fullwidth colon
    "\uff0c",  # fullwidth comma
    "。",
    "\uff01",  # fullwidth exclamation mark
    "\uff1f",  # fullwidth question mark
    "、",
    "……",  # two-character ellipsis; a single-character lookup never matches it
    "\uff09",  # fullwidth right parenthesis
    "】",
}
|
|
|
|
# Words ending in "." that do not end a sentence. ``chunk_text_punctuation()``
# compares the last whitespace-delimited word before a period against this set
# (case-sensitively) and does not cut the text when it matches.
ABBREVIATIONS = {
    # Personal and honorific titles
    "Mr.",
    "Mrs.",
    "Ms.",
    "Dr.",
    "Prof.",
    "Sr.",
    "Jr.",
    "Rev.",
    "Fr.",
    "Hon.",
    # Offices and ranks
    "Pres.",
    "Gov.",
    "Capt.",
    "Gen.",
    "Sen.",
    "Rep.",
    "Col.",
    "Maj.",
    "Lt.",
    "Cmdr.",
    "Sgt.",
    "Cpl.",
    # Organisations
    "Co.",
    "Corp.",
    "Inc.",
    "Ltd.",
    "Est.",
    "Dept.",
    # Addresses, places and numbering
    "St.",
    "Ave.",
    "Blvd.",
    "Rd.",
    "Mt.",
    "Ft.",
    "No.",
    # Months ("May" has no abbreviated form)
    "Jan.",
    "Feb.",
    "Mar.",
    "Apr.",
    "Jun.",
    "Jul.",
    "Aug.",
    "Sep.",
    "Sept.",
    "Oct.",
    "Nov.",
    "Dec.",
    # Latin and general-purpose abbreviations
    "i.e.",
    "e.g.",
    "vs.",
    "Vs.",
    "Etc.",
    "approx.",
    "fig.",
    "def.",
}
|
|
|
|
def chunk_text_punctuation(
    text: str,
    chunk_len: int,
    min_chunk_len: Optional[int] = None,
) -> List[str]:
    """Split ``text`` into chunks that break only at punctuation.

    The text is handled character by character in three passes:

    1. It is cut into sentences after every character in
       ``SPLIT_PUNCTUATION``, except a ``"."`` that ends, or sits inside, an
       entry of ``ABBREVIATIONS`` (``"Mr."``, ``"e.g."``). Punctuation and
       closing marks that directly follow a cut stay with the sentence before
       it, so runs such as ``...`` or ``?!"`` are never torn apart.
    2. Consecutive sentences are packed greedily into chunks of at most
       ``chunk_len`` characters (whitespace included). A single sentence
       longer than ``chunk_len`` is kept whole as an oversized chunk.
    3. If ``min_chunk_len`` is given, every chunk shorter than it is appended
       to the chunk before it, and a short first chunk absorbs the second
       one. Chunks merged this way may exceed ``chunk_len``.

    Args:
        text: The text to split.
        chunk_len: Maximum number of characters packed into one chunk.
        min_chunk_len: Optional minimum chunk length; ``None`` skips pass 3.

    Returns:
        The chunks in order, each stripped of surrounding whitespace. Chunks
        that are empty after stripping are dropped, so empty or
        whitespace-only input yields ``[]``.
    """
    sentences = _split_into_sentences(text)
    chunks = _pack_sentences(sentences, chunk_len)
    if min_chunk_len is not None:
        chunks = _absorb_short_chunks(chunks, min_chunk_len)

    stripped_chunks = (chunk.strip() for chunk in chunks)
    return [chunk for chunk in stripped_chunks if chunk]


def _split_into_sentences(text: str) -> List[str]:
    """Cut ``text`` after split punctuation, skipping abbreviation periods."""
    # Abbreviations with a period before their final one ("i.e.", "e.g.") need
    # look-ahead: at their first period the word seen so far is only "i.".
    dotted_abbreviations = tuple(
        abbr for abbr in ABBREVIATIONS if "." in abbr[:-1]
    )

    sentences: List[str] = []
    current: List[str] = []
    for index, char in enumerate(text):
        follows_cut = not current and bool(sentences)
        if follows_cut and (char in SPLIT_PUNCTUATION or char in CLOSING_MARKS):
            # Trailing punctuation belongs to the sentence that was just
            # closed. Skip the cut logic below: cutting again here would add
            # an empty sentence that captures the next trailing mark.
            sentences[-1] += char
            continue

        current.append(char)
        if char not in SPLIT_PUNCTUATION:
            continue
        if char == "." and _is_abbreviation_period(
            "".join(current), text, index + 1, dotted_abbreviations
        ):
            continue

        sentences.append("".join(current))
        current = []

    if current:
        sentences.append("".join(current))
    return sentences


def _is_abbreviation_period(
    sentence: str,
    text: str,
    next_index: int,
    dotted_abbreviations,
) -> bool:
    """Return True if the "." ending ``sentence`` is part of an abbreviation."""
    words = sentence.split()
    if not words:
        return False
    last_word = words[-1]
    if last_word in ABBREVIATIONS:
        return True
    # Inner period of a multi-period abbreviation: the word so far is a proper
    # prefix and the text right after the period completes the abbreviation.
    return any(
        len(abbr) > len(last_word)
        and abbr.startswith(last_word)
        and text.startswith(abbr[len(last_word):], next_index)
        for abbr in dotted_abbreviations
    )


def _pack_sentences(sentences: List[str], chunk_len: int) -> List[str]:
    """Greedily join consecutive sentences into chunks of up to ``chunk_len``."""
    chunks: List[str] = []
    current = ""
    for sentence in sentences:
        if len(current) + len(sentence) <= chunk_len:
            current += sentence
            continue
        if current:
            chunks.append(current)
        # A sentence longer than chunk_len still starts a chunk of its own.
        current = sentence
    if current:
        chunks.append(current)
    return chunks


def _absorb_short_chunks(chunks: List[str], min_chunk_len: int) -> List[str]:
    """Append chunks shorter than ``min_chunk_len`` to their predecessor."""
    first_is_short = bool(chunks) and len(chunks[0]) < min_chunk_len
    result: List[str] = []
    for index, chunk in enumerate(chunks):
        # A short first chunk has no predecessor, so it takes the second chunk
        # in regardless of that chunk's own length.
        forced = index == 1 and first_is_short
        if result and (forced or len(chunk) < min_chunk_len):
            result[-1] += chunk
        else:
            result.append(chunk)
    return result
|
|
|
|
def add_punctuation(text: str):
    """Return ``text`` stripped and guaranteed to end with punctuation.

    Surrounding whitespace is removed first; input that is empty afterwards
    is returned as the empty string. If the last character is not a member of
    ``END_PUNCTUATION`` a full stop is appended: "。" when the text contains
    at least one character in the range U+4E00..U+9FFF, otherwise ".".
    """
    stripped = text.strip()

    # Nothing to terminate, or already terminated.
    if not stripped or stripped[-1] in END_PUNCTUATION:
        return stripped

    has_cjk_ideograph = False
    for char in stripped:
        if "\u4e00" <= char <= "\u9fff":
            has_cjk_ideograph = True
            break

    if has_cjk_ideograph:
        return stripped + "。"
    return stripped + "."
|
|