# Punctuation marks that make_text_chunk treats as the end of a chunk.
# Membership is tested one character at a time by make_text_chunk.
SYMBOL_SPLITS = {
    # CJK / full-width terminators. The full-width question and exclamation
    # marks are written as escapes so they cannot be silently folded into
    # their ASCII look-alikes (which previously left them duplicated below).
    "。",
    "\uff1f",  # full-width question mark
    "\uff01",  # full-width exclamation mark
    # Two-character ellipsis: a single-character membership test can never
    # equal it (its first "…" matches on its own); kept so the set remains a
    # superset of the original for any caller that tests whole tokens.
    "……",
    # ASCII / half-width terminators.
    ".",
    "?",
    "!",
    "~",
    "…",
}
def make_text_chunk(original_text, strat_index, max_len=5, max_try=5000):
    """Cut one chunk out of ``original_text`` ending at the next split symbol.

    Scans forward from ``strat_index`` one character at a time until a
    character that is a member of ``SYMBOL_SPLITS`` is found. The chunk runs
    from ``strat_index`` up to and including that character.

    Args:
        original_text: The text to scan.
        strat_index: Index at which the chunk starts. Expected to be
            non-negative; negative values are passed straight to Python
            indexing and are not handled specially.
        max_len: Currently unused; kept so existing callers keep working.
        max_try: Maximum number of characters that may be scanned past
            ``strat_index`` without finding a split symbol.

    Returns:
        A tuple ``(end_index, chunk)`` where ``end_index`` is the index just
        past the split symbol and ``chunk`` is
        ``original_text[strat_index:end_index]``. Returns ``(0, "")`` when
        there is nothing to scan or the text ends before any split symbol
        is found.

    Raises:
        ValueError: If more than ``max_try`` characters were scanned without
            reaching a split symbol or the end of the text.
    """
    text_len = len(original_text)

    # Nothing to scan (empty text or start at/after the end): report
    # "not found" instead of letting the first index lookup raise IndexError.
    if strat_index >= text_len:
        return 0, ""

    end_index = strat_index
    while True:
        if original_text[end_index] in SYMBOL_SPLITS:
            # Include the split symbol itself in the chunk.
            end_index += 1
            return end_index, original_text[strat_index:end_index]

        end_index += 1
        if end_index >= text_len:
            # Text ended before any split symbol was found.
            return 0, ""
        if end_index - strat_index > max_try:
            # Bound the scan relative to the start position, so chunks that
            # begin deep inside a long text are not rejected immediately.
            raise ValueError("Reach max try")