# Lookup tables for rich-transcription post-processing.
#
# The emoji literals below are non-ASCII; this file must be stored and read as
# UTF-8. A previous mis-decoding had collapsed several distinct emoji into the
# same garbled string, which made emo_set and event_set overlap and broke the
# one-character checks performed in rich_transcription_postprocess.

# Emotion tag -> emoji that format_str_v2 appends to the end of a segment.
# "<|NEUTRAL|>" maps to "" so a neutral segment gets no suffix.
emo_dict = {
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
}

# Audio-event tag -> emoji that format_str_v2 prepends to a segment.
# Tags mapped to "" are recognised but produce no visible marker.
# NOTE(review): "<|Cough|>" shares 🤧 with "<|Sneeze|>" here, while emoji_dict
# maps it to 😷 (which is also the value listed in event_set) - confirm which
# one is intended before changing either.
event_dict = {
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|Cry|>": "😭",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "🤧",
}

# Language tags. rich_transcription_postprocess rewrites every key to the
# common "<|lang|>" separator and then splits the text on it.
lang_dict = {
    "<|zh|>": "<|lang|>",
    "<|en|>": "<|lang|>",
    "<|yue|>": "<|lang|>",
    "<|ja|>": "<|lang|>",
    "<|ko|>": "<|lang|>",
    "<|nospeech|>": "<|lang|>",
}

# Every special token that format_str_v2 counts and strips from the text, in
# iteration order. The combined "<|nospeech|><|Event_UNK|>" entry is listed
# before its two component tokens so the pair is matched as a unit.
emoji_dict = {
    "<|nospeech|><|Event_UNK|>": "❓",
    "<|zh|>": "",
    "<|en|>": "",
    "<|yue|>": "",
    "<|ja|>": "",
    "<|ko|>": "",
    "<|nospeech|>": "",
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
    "<|Cry|>": "😭",
    "<|EMO_UNKNOWN|>": "",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "😷",
    "<|Sing|>": "",
    "<|Speech_Noise|>": "",
    "<|withitn|>": "",
    "<|woitn|>": "",
    "<|GBG|>": "",
    "<|Event_UNK|>": "",
}

# Emoji recognised as a trailing emotion marker / leading event marker by the
# helpers inside rich_transcription_postprocess. Those helpers inspect exactly
# one character, so every entry must be a single code point and the two sets
# must not overlap.
emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
event_set = {
    "🎼",
    "👏",
    "😀",
    "😭",
    "🤧",
    "😷",
}
|
|
|
|
def format_str_v2(s):
    """Render the special tokens of one transcript segment as emoji.

    Every token listed in ``emoji_dict`` is counted and removed from the
    text. The result is the remaining text with one emoji per detected event
    (from ``event_dict``) in front and the emoji of the most frequent emotion
    (from ``emo_dict``) at the end.

    Args:
        s: Segment text that may contain ``<|...|>`` special tokens.

    Returns:
        The decorated text, stripped of surrounding whitespace.
    """
    # Count and delete in one ordered pass: a token is removed before the
    # next one is counted, so longer combined tokens listed earlier in
    # ``emoji_dict`` are consumed before their components.
    occurrences = {}
    text = s
    for token in emoji_dict:
        occurrences[token] = text.count(token)
        text = text.replace(token, "")

    # Neutral is the default; another emotion wins only with a strictly
    # higher count, and among equals the first one in ``emo_dict`` is kept.
    dominant = "<|NEUTRAL|>"
    best = occurrences[dominant]
    for tag in emo_dict:
        if occurrences[tag] > best:
            dominant, best = tag, occurrences[tag]

    # Each detected event is pushed onto the front in ``event_dict`` order,
    # so the finished prefix reads in reverse table order.
    prefix = "".join(
        event_dict[tag]
        for tag in reversed(list(event_dict))
        if occurrences[tag] > 0
    )
    text = prefix + text + emo_dict[dominant]

    # Drop one space directly before and after every emoji.
    for mark in emo_set | event_set:
        text = text.replace(" " + mark, mark).replace(mark + " ", mark)
    return text.strip()
|
|
def rich_transcription_postprocess(s):
    """Convert a raw rich-transcription string into display text with emoji.

    The text is split at every language tag listed in ``lang_dict``; each
    piece is rendered by ``format_str_v2`` and the pieces are concatenated.
    While joining, a piece's leading event emoji is dropped when it repeats
    the event that leads the previous piece, and the emotion emoji at the end
    of the accumulated text is dropped when the next piece ends with the same
    one.

    Args:
        s: Raw text containing ``<|...|>`` special tokens.

    Returns:
        The merged, emoji-decorated text with surrounding whitespace removed.
    """

    def get_emo(text):
        """Return the last character of *text* if it is an emotion emoji."""
        # The emptiness guard matters: a piece can become "" once its
        # repeated event emoji has been stripped.
        return text[-1] if text and text[-1] in emo_set else None

    def get_event(text):
        """Return the first character of *text* if it is an event emoji."""
        return text[0] if text and text[0] in event_set else None

    # Handle the combined "no speech + unknown event" marker first, because
    # "<|nospeech|>" on its own is treated as a language tag below. The
    # replacement is read from emoji_dict so it is defined in one place only.
    unknown_marker = "<|nospeech|><|Event_UNK|>"
    s = s.replace(unknown_marker, emoji_dict[unknown_marker])

    for lang in lang_dict:
        s = s.replace(lang, "<|lang|>")
    s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]

    # The leading space keeps new_s non-empty for the get_emo() check; it is
    # stripped again on return. The current event therefore has to be read
    # from the first piece itself, not from the space-prefixed accumulator
    # (which would always yield None).
    new_s = " " + s_list[0]
    cur_ent_event = get_event(s_list[0])

    for segment in s_list[1:]:
        if not segment:
            continue

        event = get_event(segment)
        if event is not None and event == cur_ent_event:
            # Same event as the previous piece: do not repeat its emoji.
            segment = segment[1:]
        # Re-read after the optional strip, exactly as before; this yields
        # None when the piece is now empty or starts with plain text.
        cur_ent_event = get_event(segment)

        emo = get_emo(segment)
        if emo is not None and emo == get_emo(new_s):
            # Same emotion as the text so far: keep only the later emoji.
            new_s = new_s[:-1]
        new_s += segment.strip()

    # NOTE(review): presumably removes a spurious "The." emitted by the
    # recognizer - confirm the origin of this token with the model owners.
    new_s = new_s.replace("The.", " ")
    return new_s.strip()
|
|
def rich_print_asr_res(asr_res):
    """Post-process every entry of *asr_res* and print them as one string.

    Args:
        asr_res: Iterable of raw transcription strings; each one is passed
            through ``rich_transcription_postprocess``.
    """
    rendered = [rich_transcription_postprocess(item) for item in asr_res]
    joined = "".join(rendered)
    print(joined)