| import re |
|
|
|
|
def extract_language_and_text_updated(speaker, dialogue):
    """Split one speaker's dialogue into language-tagged segments.

    Args:
        speaker: Speaker tag including its enclosing delimiters, e.g.
            ``"[name]"``; the first and last characters are dropped.
        dialogue: Text made of ``<lang>text`` segments.

    Returns:
        A list holding one ``(LANG, text)`` tuple per segment, in order of
        appearance, with the language code upper-cased and the text stripped
        of surrounding whitespace. The bare speaker name is always appended
        as the final element, so the list is never empty.
    """
    segments = []
    for tag_match in re.finditer(r"<(\S+?)>([^<]+)", dialogue, re.DOTALL):
        language, content = tag_match.groups()
        segments.append((language.upper(), content.strip()))
    # The speaker name (without its enclosing delimiters) always comes last.
    segments.append(speaker[1:-1])
    return segments
|
|
|
|
def validate_text(input_text):
    """Check that *input_text* contains well-formed speaker dialogue.

    A speaker block is a ``[speaker]`` tag followed by one or more
    ``<lang>text`` segments.

    Args:
        input_text: Raw text to validate.

    Returns:
        A ``(is_valid, message)`` tuple. ``is_valid`` is ``False`` when no
        speaker block is found, or when a matched block yields no
        language-tagged segment; otherwise it is ``True``.
    """
    pattern_speaker = r"(\[\S+?\])((?:\s*<\S+?>[^<\[\]]+?)+)"
    matches = re.findall(pattern_speaker, input_text, re.DOTALL)

    # Nothing in the input looks like "[speaker]<lang>text".
    if not matches:
        return (
            False,
            "Error: No valid speaker format detected. Please check your input.",
        )

    for speaker, dialogue in matches:
        # The helper always appends the speaker name as its last element, so
        # its result is never empty. Drop that element and test only the
        # (LANG, text) segments, otherwise this check could never fail.
        segments = extract_language_and_text_updated(speaker, dialogue)[:-1]
        if not segments:
            return (
                False,
                "Error: Invalid format detected in dialogue content. Please check your input.",
            )

    return True, "Input is valid."
|
|
|
|
def text_matching(text: str) -> list:
    """Parse every speaker block in *text* into language-tagged segments.

    A block starts at a ``[speaker]`` tag and runs (lazily, at least one
    character) up to the next ``[speaker]`` tag or the end of the text.

    Args:
        text: Raw multi-speaker text.

    Returns:
        One list per speaker block, as produced by
        ``extract_language_and_text_updated``. The full result is also
        printed to standard output before being returned.
    """
    speaker_blocks = re.finditer(r"(\[\S+?\])(.+?)(?=\[\S+?\]|$)", text, re.DOTALL)
    parsed = [
        extract_language_and_text_updated(*block.groups())
        for block in speaker_blocks
    ]
    print(parsed)
    return parsed
|
|
|
|
def cut_para(text):
    """Split *text* on newlines into trimmed, non-empty paragraphs.

    Args:
        text: Input text, possibly spanning several lines.

    Returns:
        The lines of *text* with surrounding whitespace removed; lines that
        are empty after trimming are left out.
    """
    paragraphs = []
    for raw_line in re.split("[\n]", text):
        trimmed = raw_line.strip()
        if trimmed:
            paragraphs.append(trimmed)
    return paragraphs
|
|
|
|
def cut_sent(para):
    """Split a paragraph into sentences at sentence-ending punctuation.

    A line break is inserted after a terminator (``。``, ``!``, ``;``, ``?``
    or their full-width forms), after a six-dot ellipsis ``......`` and after
    a doubled ellipsis ``……``, provided the next character is not a closing
    quote (``”`` or ``’``). When a terminator is directly followed by a
    closing quote, the break goes after the quote instead, unless the next
    character is a comma or another terminator.

    Args:
        para: Paragraph text.

    Returns:
        The list of sentences. Trailing whitespace of the paragraph is
        removed first; a paragraph without any break point is returned as a
        single-element list.
    """
    # Raw strings throughout: the original "\?", "\." and "\…" are invalid
    # string escapes and raise SyntaxWarning on recent Python versions.
    # Terminator followed by anything except a closing quote.
    para = re.sub(r"([。!！;；?？])([^”’])", r"\1\n\2", para)
    # Six ASCII dots used as an ellipsis.
    para = re.sub(r"(\.{6})([^”’])", r"\1\n\2", para)
    # Two ellipsis characters in a row.
    para = re.sub(r"(…{2})([^”’])", r"\1\n\2", para)
    # Terminator plus closing quote: break after the quote, unless the
    # sentence visibly continues with a comma or further punctuation.
    para = re.sub(r"([。!！?？][”’])([^,，。!！?？])", r"\1\n\2", para)
    # A break inserted before trailing whitespace would yield a blank tail.
    para = para.rstrip()
    return para.split("\n")
|
|
|
|
if __name__ == "__main__":
    # Demo 1: several speakers, the first of which has no dialogue.
    multi_speaker_sample = """
[说话人1]
[说话人2]<zh>你好吗?<jp>元気ですか?<jp>こんにちは,世界。<zh>你好吗?
[说话人3]<zh>谢谢。<jp>どういたしまして。
"""
    text_matching(multi_speaker_sample)

    # Demo 2: parse a second sample, then validate it and show the verdict.
    mixed_language_sample = """
[说话人1]<zh>你好,こんにちは!<jp>こんにちは,世界。
[说话人2]<zh>你好吗?
"""
    text_matching(mixed_language_sample)
    print(validate_text(mixed_language_sample))
|
|