diff --git "a/eval_results/trained_cs_dialogue.json" "b/eval_results/trained_cs_dialogue.json" --- "a/eval_results/trained_cs_dialogue.json" +++ "b/eval_results/trained_cs_dialogue.json" @@ -1,4676 +1,3246 @@ { - "model": "MERaLiON-2-3B-DPO-CodeSwitch", + "model": "myaccountfor/MERaLiON-2-3B-DPO-CodeSwitch", + "dataset": "/home/users/astar/ares/suns1/scratch/trung/meralion2_dpo_output/transcription_onpolicy_mix_20250818_190637/data_splits/test_data.json", "summary": { "total_samples": 359, - "avg_mer": 0.743637119846023 + "valid_generations": 359, + "generated": { + "cer": 0.2199462818143899, + "wer": 0.6201355705781273, + "mer": 0.25412140330321953 + } }, "results": [ { "segment_id": "SEGMENT_ZH-CN_U1021_ENCN_20250816_202117_306489", - "audio_path": "SEGMENT_ZH-CN_U1021_ENCN_20250816_202117_306489.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1021_ENCN_20250816_202117_306489.wav", "ground_truth": "对,我只是看到它一个测试版的一个视频,就是它发出来的,就是它功能功能上的。 Yes.", - "original_transcription": ": 对我只是看了他一个测试版的一个视频就是他发出来的就是他功能功能上的yes\n", "generated_transcription": "\n: 对我只是看了他一个测试版的一个视频就是他发出来的就是他功能功能上的\n", - "wer_original": 0.0, - "wer_generated": 0.23684210526315788, - "wer_improvement": -0.23684210526315788, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.19444444444444445, + "wer": 1.0, + "mer": 0.14705882352941177 }, { "segment_id": "SEGMENT_ZH-CN_U0017_ENCN_20250816_202117_264368", - "audio_path": "SEGMENT_ZH-CN_U0017_ENCN_20250816_202117_264368.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0017_ENCN_20250816_202117_264368.wav", "ground_truth": "OK, for yoga I think it's a ** yeah yoga is an in traditional Indian sports and it actually,yeah, yoga can more than just help us to exercise our body but also to exercise our mind, our heart. When we do ** yoga, we less only those structures, instructions that the voice is really soft. 嗯我,对于我来说,我有找过一些兼职,然后一些实习,我觉得我的工作经验还不是很丰富。然后想向您向您多学习学习,那你最近在做什么,,哪个领域相关的工作呢?", - "original_transcription": ": 瑜伽我我觉得瑜伽对我来说有找过一些兼职然后一些实习我觉得我的工作经验还不算很丰富想向您向您多学习学习那你最近在做什么哪个领域相关的工作呢\n", "generated_transcription": ": 瑜伽我 think 是印度 yoga 是印度传统印度 sports 其实 yoga 可以帮助我们锻炼身体但同时锻炼我们的心和头脑当我们做 yoga 我们会学习那些结构指令 like the voice 是 really soft 我对我来说我有找过一些兼职然后一些实习我觉得我的工作经验还不是很丰富想向您说向您多学习学习那你最近在做什么哪个领域相关的工作\n", - "wer_original": 0.5769230769230769, - "wer_generated": 0.5692307692307692, - "wer_improvement": 0.007692307692307665, - "wer_method": "smart_mixed", - "mer_original": 0.5461538461538461 + "cer": 0.6433566433566433, + "wer": 0.8333333333333334, + "mer": 0.5210084033613446 }, { "segment_id": "SEGMENT_ZH-CN_U1015_ENCN_20250816_202117_275837", - "audio_path": "SEGMENT_ZH-CN_U1015_ENCN_20250816_202117_275837.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1015_ENCN_20250816_202117_275837.wav", "ground_truth": "所以。 ** Actually I've never tried it before. 嗯。", - "original_transcription": ": 所以 actually I've never tried it before\n", "generated_transcription": "\n: 所以 actually i've never tried it before\n", - "wer_original": 0.16666666666666666, - "wer_generated": 0.5833333333333334, - "wer_improvement": -0.41666666666666674, - "wer_method": "smart_mixed", - "mer_original": 0.5 + "cer": 0.03125, + "wer": 0.125, + "mer": 0.1111111111111111 }, { "segment_id": "SEGMENT_ZH-CN_U0088_MIX_20250816_202117_280428", - "audio_path": "SEGMENT_ZH-CN_U0088_MIX_20250816_202117_280428.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0088_MIX_20250816_202117_280428.wav", "ground_truth": "然后 你在美国那边儿有比较好玩的那种旅行吗 trip?", - "original_transcription": ": 然后你在美国那边有比较好玩的那种旅行的, trip.\n", "generated_transcription": "\n: 然后你在美国那边有比较好玩的那种旅行的, trip\n", - "wer_original": 0.14285714285714285, - "wer_generated": 0.19047619047619047, - "wer_improvement": -0.047619047619047616, - "wer_method": "smart_mixed", - "mer_original": 0.19047619047619047 + "cer": 0.08333333333333333, + "wer": 0.6666666666666666, + "mer": 0.09523809523809523 }, { "segment_id": "SEGMENT_ZH-CN_U0041_MIX_20250816_202117_269795", - "audio_path": "SEGMENT_ZH-CN_U0041_MIX_20250816_202117_269795.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0041_MIX_20250816_202117_269795.wav", "ground_truth": "是呢 maybe 在future 会有更更多 strange and techno technological things 我们可以看到。", - "original_transcription": ": (hm), 是 (hm), maybe (err) 在 future 会有更多, (err) and the technological things, 我们可以看到\n", "generated_transcription": "\n: (hm) 是 (hm) maybe 在 future 会有更多 (uh) strange (uh) and the technological things 我们可以看到\n", - "wer_original": 0.7619047619047619, - "wer_generated": 0.42857142857142855, - "wer_improvement": 0.3333333333333333, - "wer_method": "smart_mixed", - "mer_original": 0.5238095238095238 + "cer": 0.21666666666666667, + "wer": 0.9, + "mer": 0.2857142857142857 }, { "segment_id": "SEGMENT_ZH-CN_U1037_ENCN_20250816_202117_310259", - "audio_path": "SEGMENT_ZH-CN_U1037_ENCN_20250816_202117_310259.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1037_ENCN_20250816_202117_310259.wav", "ground_truth": "That sounds like a plan. See you at harmony feast. 好的,你想吃哪个套餐呢?", - "original_transcription": ": that sounds like a plan see you at harmony feast 嗯好的你想吃哪个套餐呢\n", "generated_transcription": "\n: that sounds like a plan see you at harmony feast 嗯好的你想吃哪个套餐呢\n", - "wer_original": 1.0, - "wer_generated": 0.3181818181818182, - "wer_improvement": 0.6818181818181819, - "wer_method": "smart_mixed", - "mer_original": 0.3181818181818182 + "cer": 0.02040816326530612, + "wer": 0.09090909090909091, + "mer": 0.05 }, { "segment_id": "SEGMENT_ZH-CN_U0005_ENCN_20250816_202117_348307", - "audio_path": "SEGMENT_ZH-CN_U0005_ENCN_20250816_202117_348307.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0005_ENCN_20250816_202117_348307.wav", "ground_truth": "削球那种,就是我看 一些。 I'm it mean I'm Libra.", - "original_transcription": ": 销求那种就是我看一些I mean I'm liberal\n", "generated_transcription": "\n: 销求那种就是我看一些 I mean I'm liberal\n", - "wer_original": 0.35294117647058826, - "wer_generated": 0.4117647058823529, - "wer_improvement": -0.05882352941176466, - "wer_method": "smart_mixed", - "mer_original": 0.8823529411764706 + "cer": 0.28, + "wer": 0.7142857142857143, + "mer": 0.3333333333333333 }, { "segment_id": "SEGMENT_ZH-CN_U0033_ENCN_20250816_202117_341230", - "audio_path": "SEGMENT_ZH-CN_U0033_ENCN_20250816_202117_341230.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0033_ENCN_20250816_202117_341230.wav", "ground_truth": "Yes , I like to play with somebody who is friendly and warm . 好。", - "original_transcription": ": (hm) yes, I like to play with somebody who is friendly and warm.\n", "generated_transcription": "\n: (hm) yes, I like to play with somebody who is friendly and warm\n", - "wer_original": 0.375, - "wer_generated": 0.3125, - "wer_improvement": 0.0625, - "wer_method": "smart_mixed", - "mer_original": 0.375 + "cer": 0.06382978723404255, + "wer": 0.15384615384615385, + "mer": 0.15384615384615385 }, { "segment_id": "SEGMENT_ZH-CN_U1006_MIX_20250816_202117_343362", - "audio_path": "SEGMENT_ZH-CN_U1006_MIX_20250816_202117_343362.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1006_MIX_20250816_202117_343362.wav", "ground_truth": "我觉得 Tik Tok 上面流行的这种还好。 既然他都有那种 conscious 了,那他肯定也会有教育这方面的意识。", - "original_transcription": ": 我觉得ticktock上面流行的也就还好既然他都有那种conscious了那他肯定也会有教育这方面的意识\n", "generated_transcription": "\n: 我觉得ticktock上面流行的就还好既然他都有那种conscious了那他肯定也会有教育这方面的意识\n", - "wer_original": 0.07317073170731707, - "wer_generated": 1.0, - "wer_improvement": -0.926829268292683, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.08, + "wer": 1.0, + "mer": 0.10526315789473684 }, { "segment_id": "SEGMENT_ZH-CN_U0042_ENCN_20250816_202117_275046", - "audio_path": "SEGMENT_ZH-CN_U0042_ENCN_20250816_202117_275046.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0042_ENCN_20250816_202117_275046.wav", "ground_truth": "So you are the only child in your family. 克服对水的恐惧,对吧?那你会呛水吗?", - "original_transcription": ": So you are the only child in your family.\n", "generated_transcription": "\n: So you are the only child in your family\n", - "wer_original": 0.6666666666666666, - "wer_generated": 0.7037037037037037, - "wer_improvement": -0.03703703703703709, - "wer_method": "smart_mixed", - "mer_original": 0.6666666666666666 + "cer": 0.3191489361702128, + "wer": 0.1, + "mer": 0.625 }, { "segment_id": "SEGMENT_ZH-CN_U0078_MIX_20250816_202117_309330", - "audio_path": "SEGMENT_ZH-CN_U0078_MIX_20250816_202117_309330.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0078_MIX_20250816_202117_309330.wav", "ground_truth": "Yeah, I have a lot of singers I like 比如说 my favorite 歌手, 是那个 比如说我 my favorite 歌手是周杰伦, 他的歌你应该也听过很多吧。", - "original_transcription": ": Yeah, I have a lot of singers I like. 比如说,我的最爱歌手是周杰伦,他的歌你应该也听过很多吧。\n", "generated_transcription": "\n: yeah I have a lot of singers I like 比如说 my favorite 歌手是那个 嗯比如说 my favorite 歌手是周杰伦 嗯他的歌你应该也听过很多吧\n", - "wer_original": 0.4782608695652174, - "wer_generated": 0.13043478260869565, - "wer_improvement": 0.34782608695652173, - "wer_method": "smart_mixed", - "mer_original": 0.30434782608695654 + "cer": 0.03896103896103896, + "wer": 0.21052631578947367, + "mer": 0.06976744186046512 }, { "segment_id": "SEGMENT_ZH-CN_U1053_MIX_20250816_202117_324570", - "audio_path": "SEGMENT_ZH-CN_U1053_MIX_20250816_202117_324570.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1053_MIX_20250816_202117_324570.wav", "ground_truth": "但是有的老师他们就非常的 严肃。所以说 有的有的 class 我们就比较的 relax,但是有的课我们就 就非常的紧张。因为他会 抽同学回答一些 非常 difficult 的问题。", - "original_transcription": "但是有的老师他们就非常的嗯严肃所以说嗯有的有的class嗯嗯我们就比较的relax但是有的课我们就嗯就非常的紧张因为他会嗯抽同学回答一些嗯非常difficult的问题\n", "generated_transcription": "\n但是有的老师他们就非常的嗯严肃所以说嗯有的有的class嗯嗯我们就比较的relax但是有的课我们就嗯就非常的紧张因为他会嗯抽同学回答一些嗯非常difficult的问题\n", - "wer_original": 0.0, - "wer_generated": 1.0, - "wer_improvement": -1.0, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.09210526315789473, + "wer": 1.0, + "mer": 0.11666666666666667 }, { "segment_id": "SEGMENT_ZH-CN_U0066_ENCN_20250816_202117_350140", - "audio_path": "SEGMENT_ZH-CN_U0066_ENCN_20250816_202117_350140.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0066_ENCN_20250816_202117_350140.wav", "ground_truth": "就是那时候就是模仿张继科,还有马龙,就是他们进球之后,不是会有一个就是呼喊的声音嘛,就是很很激动的那种声音。然后撕撕衣服庆祝什么的,就那时候觉得这些很帅,就是一直一直学习。然后羽毛球也是我经常接触的一个项目,虽然我羽毛球打得不怎么样 ,但。 Yes, I'd like to listen.", - "original_transcription": ": 那个时候就是模仿张继科还有马龙 他们进球之后不是会有一个就是呼喊的声音就是很很激动的那种声音然后撕衣服庆祝什么的就那时候觉得这些很帅就是一直一直学习然后羽毛球也是我经常接触的一个项目所以羽毛球打得不怎么样但是 : yes, I don't like to listen.\n", "generated_transcription": ": 那个时候就是模仿张继科还有马龙 他们进球之后不是会有一个就是呼喊的声音就是很很激动的那种声音然后撕衣服庆祝什么的就那时候觉得这些很帅就是一直一直学习然后羽毛球也是我经常接触的一个项目所以我羽毛球打得不怎么样但是\n", - "wer_original": 0.056451612903225805, - "wer_generated": 0.20161290322580644, - "wer_improvement": -0.14516129032258063, - "wer_method": "smart_mixed", - "mer_original": 0.1935483870967742 + "cer": 0.208, + "wer": 1.0, + "mer": 0.12389380530973451 }, { "segment_id": "SEGMENT_ZH-CN_U1045_ENCN_20250816_202117_330152", - "audio_path": "SEGMENT_ZH-CN_U1045_ENCN_20250816_202117_330152.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1045_ENCN_20250816_202117_330152.wav", "ground_truth": "那你可以跟我讲一下,就是你的朋友们他们的性格是怎么样的吗? Yes, and we just need to play it with the computer, right?", - "original_transcription": ": 嗯那你可以跟我讲一下就是你的朋友们他们的性格是怎么样的吗 : yeah and we just need to play it with the computer right\n", "generated_transcription": "\n: 嗯那你可以跟我讲一下就是你的朋友们他们的性格是怎么样的吗 : yeah 是 and we just need need to play it with the computer right\n", - "wer_original": 0.5121951219512195, - "wer_generated": 0.17073170731707318, - "wer_improvement": 0.3414634146341463, - "wer_method": "smart_mixed", - "mer_original": 0.14634146341463414 + "cer": 0.11267605633802817, + "wer": 0.3076923076923077, + "mer": 0.10256410256410256 }, { "segment_id": "SEGMENT_ZH-CN_U0041_MIX_20250816_202117_269635", - "audio_path": "SEGMENT_ZH-CN_U0041_MIX_20250816_202117_269635.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0041_MIX_20250816_202117_269635.wav", "ground_truth": "是的, 可能 但是别人的 personality 我们也不能去 judge, 所以就这样吧,看看一个热闹。 It's alright.", - "original_transcription": ": 嗯是的嗯可能嗯但是别人的personality我们也不能去judge所以就这样吧看一个热闹嗯it's alright嗯\n", "generated_transcription": "\n: 嗯是的嗯可能嗯但是别人的personality 我们也不能去judge所以就这样吧看一个热闹嗯it's alright\n", - "wer_original": 0.9411764705882353, - "wer_generated": 1.0, - "wer_improvement": -0.05882352941176472, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.09433962264150944, + "wer": 0.8888888888888888, + "mer": 0.16129032258064516 }, { "segment_id": "SEGMENT_ZH-CN_U1101_MIX_20250816_202117_306893", - "audio_path": "SEGMENT_ZH-CN_U1101_MIX_20250816_202117_306893.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1101_MIX_20250816_202117_306893.wav", "ground_truth": "So you broke your niece. 所以你只是膝盖受伤,别的地方没有受伤是吗? 啊,示范英语 almost almost almost 就大部分。", - "original_transcription": ": so you just your knees so you just your knees only your knees only your knees 还是没有其他地方没有手伤是吗 : 哦示范英语almost almost 就大部分\n", "generated_transcription": "\n: so you just your knees so you just your knees only 手指手上别的地方没有手伤是吗 : 哦示范英语 almost almost 就大部分\n", - "wer_original": 0.275, - "wer_generated": 0.475, - "wer_improvement": -0.19999999999999996, - "wer_method": "smart_mixed", - "mer_original": 0.775 + "cer": 0.6, + "wer": 0.9090909090909091, + "mer": 0.4166666666666667 }, { "segment_id": "SEGMENT_ZH-CN_U0019_MIX_20250816_202117_330680", - "audio_path": "SEGMENT_ZH-CN_U0019_MIX_20250816_202117_330680.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0019_MIX_20250816_202117_330680.wav", "ground_truth": "嗯, Of course 我有一个 ** Sister ,他年龄比我大一点,然后,工作的时长也比我早一点嘛。嗯,他的工作是 Nurse ,所以嗯,他的工作也有个 Three to four years 。 所以啊,在我刚开始工作的时候,他会还会偶尔过来看我,然后也会给我过来送一些吃的之类的,然后会过来找我玩吧。后来倒是工作,工作有一段时间,几个月之后,然后他还会偶尔来 Ask ,我有没有一些 Difficulty 啊之类的东西。嗯,反正对我来说还是挺照顾的。", - "original_transcription": ": 嗯of course啊我有一个sister她年龄比我大一点然后啊工作的工作时长也比我早一点嘛啊她的工作是nurse所以嗯她的工作也有个三到四年的所以啊在我刚开始工作的时候她还会偶尔过来看我然后也会给我过来送一些吃的之类的然后\n", "generated_transcription": ": 嗯of course我有一个sister她年龄比我大一点然后工作的时间也比我早一点嗯她的工作是nurse所以嗯她的工作也有个three to four years所以在我刚开始工作的时候她还会偶尔过来看我然后也会给我过来送一些吃的之类的然后\n", - "wer_original": 0.011494252873563218, - "wer_generated": 0.9885057471264368, - "wer_improvement": -0.9770114942528736, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.41578947368421054, + "wer": 0.8823529411764706, + "mer": 0.4473684210526316 }, { "segment_id": "SEGMENT_ZH-CN_U0042_MIX_20250816_202117_274718", - "audio_path": "SEGMENT_ZH-CN_U0042_MIX_20250816_202117_274718.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0042_MIX_20250816_202117_274718.wav", "ground_truth": "那那个同学也挺他的经历也挺 funny 的。 那你 是怎么how to applicate 这个学校的通过 Internet 吗?", - "original_transcription": ": 那那个同学也挺他的经历也挺 funny 的, : 那你 (um) 是怎么, how to apply to the 这个学校的通过, internet 吗.\n", "generated_transcription": "\n: 那那个同学也挺他的经历也挺 funny 的, : 那你 (um) 是怎么, how to apply cate to 这个学校的通过 internet 吗\n", - "wer_original": 0.41935483870967744, - "wer_generated": 0.3870967741935484, - "wer_improvement": 0.03225806451612906, - "wer_method": "smart_mixed", - "mer_original": 0.41935483870967744 + "cer": 0.09259259259259259, + "wer": 0.6, + "mer": 0.125 }, { "segment_id": "SEGMENT_ZH-CN_U1086_ENCN_20250816_202117_302439", - "audio_path": "SEGMENT_ZH-CN_U1086_ENCN_20250816_202117_302439.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1086_ENCN_20250816_202117_302439.wav", "ground_truth": "Okay, I know. 然后 去逛一下那个泰国旅,去感受一下那些潮流文化那些时尚,然后 还可以去宽窄巷子去感受那种成都最出名的就是那种慢节奏的那种生活嘛,然后比较悠闲嘛,然后还可以去人民公园,就是在下午喝茶,然后采耳,一般外地人来都要采耳的,这是一个就是非物质文化遗产的那种东西。", - "original_transcription": ": ok i know : 然后去逛一下那个泰国去感受一下那些潮流文化那些时尚然后还可以去宽窄巷子去感受那种成都最出名的就是那种慢节奏的那种生活然后比较悠闲然后还可以去人园公园就是在下午喝茶然后踩踏一般外国人来都要踩踏的这是一个就是非物质文化遗产的那种东西\n", "generated_transcription": "\n然后去逛一下那个泰国去感受一下那些潮流文化那些时尚然后还可以去宽窄巷子去感受那种成都最出名的就是那种慢节奏的那种生活然后比较悠闲然后还可以去人园公园就是在下午喝茶然后踩尔一般外国人来都要踩尔的这是一个就是非物质文化遗产的那种东西\n", - "wer_original": 0.9767441860465116, - "wer_generated": 0.16279069767441862, - "wer_improvement": 0.8139534883720929, - "wer_method": "smart_mixed", - "mer_original": 0.16279069767441862 + "cer": 0.14285714285714285, + "wer": 1.0, + "mer": 0.1 }, { "segment_id": "SEGMENT_ZH-CN_U0048_ENCN_20250816_202117_345983", - "audio_path": "SEGMENT_ZH-CN_U0048_ENCN_20250816_202117_345983.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0048_ENCN_20250816_202117_345983.wav", "ground_truth": "Oh fate series. Yes, I I've I've played that game. 那真的感觉好幸福。因为 不过我们宿舍其实也没有住八个人,我们宿舍是住了六个人。因为我有两个室友,他们在大一的时候,就是由于种种原因或者是出国留学,她们就搬出去了。然后有一个同学她已经退学了,她现在在英国读书。", - "original_transcription": ": (oh) fate series yes i i i played that game (wow) 真的感觉好幸福因为 (um) 不过我们宿舍其实也没有住八个人我们宿舍是住了六个人因为我有两个室友他们在大一的时候就是由于种种原因或者是出国留学他们就搬出去了然后有一个同学他已经退学了他现在在英国读书\n", "generated_transcription": "\n: (oh) fate series yes i i i play that game 哇真的感觉好幸福因为 (um) 不过我们宿舍其实也没有住八个人我们宿舍是住了六个人因为我有两个室友他们在大一的时候就是由于种种原因或者是出国留学他们就搬出去了然后有一个同学他已经退学了他现在在英国读书\n", - "wer_original": 0.911504424778761, - "wer_generated": 0.19469026548672566, - "wer_improvement": 0.7168141592920354, - "wer_method": "smart_mixed", - "mer_original": 0.18584070796460178 + "cer": 0.09230769230769231, + "wer": 0.5, + "mer": 0.07692307692307693 }, { "segment_id": "SEGMENT_ZH-CN_U0077_MIX_20250816_202117_298680", - "audio_path": "SEGMENT_ZH-CN_U0077_MIX_20250816_202117_298680.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0077_MIX_20250816_202117_298680.wav", "ground_truth": "我去过健身房 ,sometimes I go to the gym。在我的公司里。", - "original_transcription": ": 我去过健身房啊 sometimes I go to the gym 在我的公司里\n", "generated_transcription": "\n: 我去过健身房啊 sometimes I go to the gym 在我的公司里\n", - "wer_original": 1.4166666666666667, - "wer_generated": 0.75, - "wer_improvement": 0.6666666666666667, - "wer_method": "smart_mixed", - "mer_original": 0.75 + "cer": 0.03125, + "wer": 0.42857142857142855, + "mer": 0.05555555555555555 }, { "segment_id": "SEGMENT_ZH-CN_U1043_ENCN_20250816_202117_293805", - "audio_path": "SEGMENT_ZH-CN_U1043_ENCN_20250816_202117_293805.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1043_ENCN_20250816_202117_293805.wav", "ground_truth": "对啊,黑色黑色星期五嘛。 From Bilibili to TikTok.", - "original_transcription": ": 对啊黑色黑色星期五吗 from Bidi Bidi to TikTok\n", "generated_transcription": "\n: 对啊黑色黑色星期五吗 from bilibili to tiktok\n", - "wer_original": 1.0, - "wer_generated": 0.375, - "wer_improvement": 0.625, - "wer_method": "smart_mixed", - "mer_original": 0.375 + "cer": 0.03333333333333333, + "wer": 0.2, + "mer": 0.07142857142857142 }, { "segment_id": "SEGMENT_ZH-CN_U2005_ENCN_20250816_202117_352723", - "audio_path": "SEGMENT_ZH-CN_U2005_ENCN_20250816_202117_352723.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U2005_ENCN_20250816_202117_352723.wav", "ground_truth": "Oh, actually I know exactly what you mean, but in such an arm war listen it. So the way , they catch hooks with them smooth vocals. It's generous I I think. 其实我觉得有一些老师也是很懒的,因为像那个高级英语嘛,我上学期上高级英语的时候,我就发现这个老师的课件好像是配套的,我很好奇,我说这老师的课件在哪里找的呀?很奇怪,我当时我上网一去搜,然后我才知道原来他有一个网站是专门给这个高校老师备课的资源,它全部都在里面了。", - "original_transcription": ": 其实我觉得有一些老师也是很懒的因为像那个高级英语嘛我上学期上高级英语的时候我就发现这个老师的课件好像是有配套的我很好奇我说这个老师的课件在哪里找的呀很奇怪我当时我上网一���搜然后我才知道原来他有一个网站是专门给这个高校老师备课的资源他全部都在里面了\n", "generated_transcription": ": 其实我觉得有一些老师也是很懒的因为像那个高级英语嘛我上学期上高级英语的时候我就发现这个老师的课件好像是被套的我很好奇我说这个老师的课件在哪里找的呀很奇怪我当时我上网一去搜然后我才知道原来他有一个网站是专门给这个高校老师备课的资源他全部都在里面了\n", - "wer_original": 0.803680981595092, - "wer_generated": 0.27607361963190186, - "wer_improvement": 0.5276073619631902, - "wer_method": "smart_mixed", - "mer_original": 0.27607361963190186 + "cer": 0.5062761506276151, + "wer": 1.0, + "mer": 0.2236842105263158 }, { "segment_id": "SEGMENT_ZH-CN_U1046_MIX_20250816_202117_276218", - "audio_path": "SEGMENT_ZH-CN_U1046_MIX_20250816_202117_276218.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1046_MIX_20250816_202117_276218.wav", "ground_truth": "我明白了,那你觉得西安它的最大最让你感到和别的城市不一样的地方,你 它能够跟别的城市有 different。", - "original_transcription": ": 我明白了那你觉得西安它的最大最让你感到和别的城市不一样的地方它能够跟别的城市有different\n", "generated_transcription": "\n: 我明白了那你觉得西安它的最大最让你感到和别的城市不一样的地方它能够跟别的城市有\n", - "wer_original": 1.0, - "wer_generated": 0.09302325581395349, - "wer_improvement": 0.9069767441860466, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.20408163265306123, + "wer": 1.0, + "mer": 0.04878048780487805 }, { "segment_id": "SEGMENT_ZH-CN_U0075_ENCN_20250816_202117_315387", - "audio_path": "SEGMENT_ZH-CN_U0075_ENCN_20250816_202117_315387.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0075_ENCN_20250816_202117_315387.wav", "ground_truth": "你感觉夏天对你来说怎么样? That's sounds great. Because I am a boy.", - "original_transcription": ": 你感觉夏天对你来说怎么样, : (oh), that sounds great because I'm a boy.\n", "generated_transcription": "\n: 你感觉夏天对你来说怎么样 : (oh) that sounds great because i'm a boy\n", - "wer_original": 0.6666666666666666, - "wer_generated": 0.3333333333333333, - "wer_improvement": 0.3333333333333333, - "wer_method": "smart_mixed", - "mer_original": 0.3333333333333333 + "cer": 0.09523809523809523, + "wer": 0.4444444444444444, + "mer": 0.2 }, { "segment_id": "SEGMENT_ZH-CN_U1057_ENCN_20250816_202117_325576", - "audio_path": "SEGMENT_ZH-CN_U1057_ENCN_20250816_202117_325576.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1057_ENCN_20250816_202117_325576.wav", "ground_truth": "Although Jessica is more older than Joanna, but Jessica's class is so outstanding because it's not like other teacher. Jessica don't teach us by the book and instead. She turned to teach us with her own knowledge. 确实可能也有这种原因,那就是除开你的家人,你在学校的生活怎么样呢?你们是住校还是走读呢?", - "original_transcription": ": 虽然杰西卡比乔安娜大但是杰西卡的课班是那么出色杰西卡不教我们按书本教而是用她自己的知识教我们确实可能也有这种原因那就是除开你的家人你在学校的生活怎么样呢你们是住校还是走读呢\n", "generated_transcription": ": 虽然杰西卡是老 than 乔安娜但杰西卡的课班是so outstanding因为不是像其他老师杰西卡不教我们by the book而是她turn to teach us with her own knowledge\n", - "wer_original": 0.5432098765432098, - "wer_generated": 0.9135802469135802, - "wer_improvement": -0.37037037037037035, - "wer_method": "smart_mixed", - "mer_original": 0.6172839506172839 + "cer": 0.7238095238095238, + "wer": 0.7631578947368421, + "mer": 0.974025974025974 }, { "segment_id": "SEGMENT_ZH-CN_U0042_MIX_20250816_202117_274024", - "audio_path": "SEGMENT_ZH-CN_U0042_MIX_20250816_202117_274024.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0042_MIX_20250816_202117_274024.wav", "ground_truth": "那你真是一个安静乖巧的 personality 呢。 你 你去过,最南的城市是哪一个 city?", - "original_transcription": ": 那你真是一个安静乖巧的personality呢你你去过最难的城市是哪个city\n", "generated_transcription": "\n: 那你真是一个安静乖巧的personality呢你你去过最难的城市是哪个city\n", - "wer_original": 0.1724137931034483, - "wer_generated": 1.0, - "wer_improvement": -0.8275862068965517, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.05, + "wer": 1.0, + "mer": 0.07407407407407407 }, { "segment_id": "SEGMENT_ZH-CN_U2001_ENCN_20250816_202117_295751", - "audio_path": "SEGMENT_ZH-CN_U2001_ENCN_20250816_202117_295751.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U2001_ENCN_20250816_202117_295751.wav", "ground_truth": "粉丝给她取了一个爱称,叫做梅梅。 泰勒呢,她是一个她是一名美国歌手,她写的很多歌都很很流行,我很喜欢听她的歌。 Yes.", - "original_transcription": ": 呃粉丝给她取了一个爱称叫做梅梅嗯泰勒呢她是一个她是一名美国歌手她写的很多歌都很很流行我很喜欢听她的歌yes\n", "generated_transcription": "\n: 呃粉丝给她取了一个爱称叫做梅梅嗯泰勒呢她是一个她是一名美国歌手她写的很多歌都很很流行我很喜欢听她的歌 yes\n", - "wer_original": 0.9818181818181818, - "wer_generated": 0.14545454545454545, - "wer_improvement": 0.8363636363636364, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.0392156862745098, + "wer": 0.6666666666666666, + "mer": 0.04081632653061224 }, { "segment_id": "SEGMENT_ZH-CN_U1043_MIX_20250816_202117_293624", - "audio_path": "SEGMENT_ZH-CN_U1043_MIX_20250816_202117_293624.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1043_MIX_20250816_202117_293624.wav", "ground_truth": "我现在只记得,我现在只记得只记得一句那什么 glasses 是吧? 就是 attention please。", - "original_transcription": ": 我现在只记得我只记得只记得一句那什么gracias是吧啊就是attention please\n", "generated_transcription": "\n: 我现在只记得我现在只记得只记得一句那什么gracias是吧啊就是attention please\n", - "wer_original": 0.9655172413793104, - "wer_generated": 1.0, - "wer_improvement": -0.03448275862068961, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.10869565217391304, + "wer": 0.8333333333333334, + "mer": 0.07407407407407407 }, { "segment_id": "SEGMENT_ZH-CN_U0099_MIX_20250816_202117_342452", - "audio_path": "SEGMENT_ZH-CN_U0099_MIX_20250816_202117_342452.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0099_MIX_20250816_202117_342452.wav", "ground_truth": "I may focus my attention on 英语专业八级 。", - "original_transcription": ": I may focus my attention on (um) English major, B.A. in\n", "generated_transcription": "\n: I may focus my attention on (um) English专业八级\n", - "wer_original": 0.8461538461538461, - "wer_generated": 0.5384615384615384, - "wer_improvement": 0.3076923076923077, - "wer_method": "smart_mixed", - "mer_original": 0.5384615384615384 + "cer": 0.32142857142857145, + "wer": 0.2857142857142857, + "mer": 0.16666666666666666 }, { "segment_id": "SEGMENT_ZH-CN_U0080_ENCN_20250816_202117_292994", - "audio_path": "SEGMENT_ZH-CN_U0080_ENCN_20250816_202117_292994.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0080_ENCN_20250816_202117_292994.wav", "ground_truth": "I think I will go to Inner Mongolia or just in Emei mountain, some places like that。 听起来很棒,但自我时间对于我来说,可能就是泡一杯茶,或者是点一杯奶茶,外卖,然后坐在床上看看电脑,甚至躺上睡觉。", - "original_transcription": ": (um), I think I will go to Inner Mongolia or just in Ermei Mountain, some places like that. : 听起来很棒但自个时间对于我来说可能就是泡一杯茶或者是点一���奶茶外卖然后坐在床上看看电脑甚至躺着睡觉\n", "generated_transcription": "\n: (um) I think I will go to Inner Mongolia or just in Ermei Mountain some places like that. : 听起来很棒但自个时间对于我来说可能就是泡一杯茶或者是点一杯奶茶外卖然后坐在床上看看电脑甚至躺着睡觉\n", - "wer_original": 0.7808219178082192, - "wer_generated": 0.1780821917808219, - "wer_improvement": 0.6027397260273972, - "wer_method": "smart_mixed", - "mer_original": 0.1780821917808219 + "cer": 0.043478260869565216, + "wer": 0.16666666666666666, + "mer": 0.06060606060606061 }, { "segment_id": "SEGMENT_ZH-CN_U1079_MIX_20250816_202117_294045", - "audio_path": "SEGMENT_ZH-CN_U1079_MIX_20250816_202117_294045.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1079_MIX_20250816_202117_294045.wav", "ground_truth": "然后我觉得哆啦 A 梦里面的每个角色都很 friendly。 然后要不然我在我的 iphone 上买两张 tickets 吧,然后方便我们先过去看。", - "original_transcription": ": 哦我觉得The Lion King里面的每个角色都很friendly然后要不然我在我的iPhone上买两套tickets吧然后方便我们先过去看\n", "generated_transcription": "\n: 哦我觉得The Lion King里面的每个角色都很friendly然后要不然我在我的iPhone上买两套tickets吧然后方便我们先过去看\n", - "wer_original": 0.19148936170212766, - "wer_generated": 1.0, - "wer_improvement": -0.8085106382978724, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.2222222222222222, + "wer": 1.0, + "mer": 0.15555555555555556 }, { "segment_id": "SEGMENT_ZH-CN_U0078_MIX_20250816_202117_309382", - "audio_path": "SEGMENT_ZH-CN_U0078_MIX_20250816_202117_309382.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0078_MIX_20250816_202117_309382.wav", "ground_truth": "Yeah, 我喜欢和我的朋友去 play basketball, because some some some friends play together 是一件非常有趣的事,而且足球需要的人数 is less than football,所以更好的去凑齐人数 进行一些 这些这些运动吧。", - "original_transcription": "Yeah, 我喜欢和我的朋友去打篮球,因为一些朋友一起打篮球是一件非常有趣的事,而且足球需要的人数是小于足球,所以更好地去凑齐人数,进行一些这些运动吧。\n", - "generated_transcription": "\nYeah, 我喜欢和我的朋友去 play basketball, 因为 (uh) some (uh) some friends play together, 是一件非常有趣的事, 而且足球需要的人数 (uh) is less than football, 所以更好地去凑齐人数, 进行一些 (uh) 这些运动吧\n", - "wer_original": 0.8333333333333334, - "wer_generated": 0.4444444444444444, - "wer_improvement": 0.38888888888888895, - "wer_method": "smart_mixed", - "mer_original": 0.6481481481481481 + "generated_transcription": "\nYeah, 我喜欢和我的朋友去 play basketball, 因为 some some friends play together, 是一件非常有趣的事, 而且足球需要的人数 is less than football, 所以更好地去凑齐人数, 进行一些这些运动\n", + "cer": 0.12295081967213115, + "wer": 0.3888888888888889, + "mer": 0.0967741935483871 }, { "segment_id": "SEGMENT_ZH-CN_U0025_ENCN_20250816_202117_317757", - "audio_path": "SEGMENT_ZH-CN_U0025_ENCN_20250816_202117_317757.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0025_ENCN_20250816_202117_317757.wav", "ground_truth": "哎,我也是,我。 Yes。 啊,浏览居多的话,就是像微博小红书这这一类的吧,就是看一些打发时间的新闻。 Yes, I have joined a painting clubs during my college life and besides. We can draw a draw pictures. ** What make me more interested in is I I can knows some lots of new friends because we share the common hobbies, and we have lots of common topics to discuss.", - "original_transcription": ": I also yes, (uh) browse most of the time, like Weibo, 小红书, 这这类的吧, 就是看一些打发时间新闻. : Yes, (uh), I have joined (uh) painting clubs during my college life, and (uh) besides, (uh) we can draw draw pictures. : What make me more interested in is I can know lots of new friends, because we share the common hobbies and we have lots of common topics to discuss.\n", - "generated_transcription": ": I also yes (um) browse most of the time 就是看一些打发时间新闻\n", - "wer_original": 0.4742268041237113, - "wer_generated": 0.8865979381443299, - "wer_improvement": -0.41237113402061853, - "wer_method": "smart_mixed", - "mer_original": 0.4329896907216495 + "generated_transcription": "\n: (err) 我也是 yes (uh) 浏览最多的话就是像微博 小红书 这这类的吧 就是看一些 打发时间 的新闻\n", + "cer": 0.8647540983606558, + "wer": 0.9811320754716981, + "mer": 0.6179775280898876 }, { "segment_id": "SEGMENT_ZH-CN_U0037_ENCN_20250816_202117_308828", - "audio_path": "SEGMENT_ZH-CN_U0037_ENCN_20250816_202117_308828.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0037_ENCN_20250816_202117_308828.wav", "ground_truth": "我我其实觉得, 怎么说,像比如说请假这件事情吧, 我在上学的时候基本上没有遇到过,就基本上没有请过假。 因为 因为怎么说,就是如果有事儿不去上课,也不也不会通知老师。基本上上学这几年都没有请过假。但是 就我跟我国内的同学聊天,就发现了一件事情,就是在国内请假好像是一件非常严重的事情。 Was it fourteen hundred? Yeah.", - "original_transcription": ": 我其实觉得怎么说像比如说请假这件事情吧我在上学的时候基本上没有遇到过就基本上没有请过假因为因为怎么说就是如果有事不去上课也不会通知老师基本上上学这几年都没有请过假但是呃就我跟我国内的同学聊天就发现了一件事情就是在国内请假好像是一件非常严重的事情\n", - "generated_transcription": ": 我其实觉得怎么说像比如说请假这件事情吧我在上学的时候基本上没有遇到过就基本上没有请过假因为因为怎么说就是如果有事不去上课也不会通知老师基本上上学这几年都没有请过假但是呃就我跟我国内的同学聊天就发现了一件事情就是在国内请假好像是一件非常严重的事情\n", - "wer_original": 0.04225352112676056, - "wer_generated": 0.15492957746478872, - "wer_improvement": -0.11267605633802816, - "wer_method": "smart_mixed", - "mer_original": 0.15492957746478872 + "generated_transcription": "\n: 我其实觉得怎么说像比如说请假这件事情吧我在上学的时候基本上没有遇到过就基本上没有请过假因为因为怎么说就是如果有事不去上课也不会通知老师基本上上学这几年都没有请过假但是就我国内的同学聊天发现了一件事情就是在国内请假好像是一件非常严重的事情\n", + "cer": 0.2080536912751678, + "wer": 1.0, + "mer": 0.09230769230769231 }, { "segment_id": "SEGMENT_ZH-CN_U0093_MIX_20250816_202117_330486", - "audio_path": "SEGMENT_ZH-CN_U0093_MIX_20250816_202117_330486.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0093_MIX_20250816_202117_330486.wav", "ground_truth": "对,所以 在 weekend 时候我一般会做要是做 运动的话 I would like try something not so competitive, 比如说像 。", - "original_transcription": ": 对所以, (err) 在, weekend, 的时候我一般会做, 要是做, (err) 运动的话, (err) I would like try something (err) not so competitive, (err) (err) 比如说像, (err).\n", "generated_transcription": "\n: 对所以, (err) 在, weekend, 的时候我一般会做, 要是做, (err) 运动的话, (err) I would like try something (err) not so competitive, (err) 比如说像,\n", - "wer_original": 0.7575757575757576, - "wer_generated": 0.45454545454545453, - "wer_improvement": 0.30303030303030304, - "wer_method": "smart_mixed", - "mer_original": 0.5151515151515151 + "cer": 0.23880597014925373, + "wer": 0.5, + "mer": 0.1935483870967742 }, { "segment_id": "SEGMENT_ZH-CN_U1044_ENCN_20250816_202117_312602", - "audio_path": "SEGMENT_ZH-CN_U1044_ENCN_20250816_202117_312602.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1044_ENCN_20250816_202117_312602.wav", "ground_truth": "这边的机会说实话,好特别特别大,对。 How about him?", - "original_transcription": ": 这边的机会说实话特别特别大对, (hm), how about him?\n", "generated_transcription": "\n: 这边的机会说实话特别特别大对 : (hm) how about him\n", - "wer_original": 0.8571428571428571, - "wer_generated": 0.2857142857142857, - "wer_improvement": 0.5714285714285714, - "wer_method": "smart_mixed", - "mer_original": 0.2857142857142857 + "cer": 0.11538461538461539, + "wer": 0.5, + "mer": 0.1111111111111111 }, { "segment_id": "SEGMENT_ZH-CN_U0075_ENCN_20250816_202117_315391", - "audio_path": "SEGMENT_ZH-CN_U0075_ENCN_20250816_202117_315391.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0075_ENCN_20250816_202117_315391.wav", "ground_truth": "In real group. In my opinion. It should be the coordination of the department and understanding of a personal in the event so that the right person can do the right thing. So which has a huge impact on the work ** if affections of the team. 嗯,对的对的,还是非常不错的。", - "original_transcription": ": 还是非常不错的\n", - "generated_transcription": ": 还是非常不错的\n", - "wer_original": 0.0, - "wer_generated": 0.8870967741935484, - "wer_improvement": -0.8870967741935484, - "wer_method": "smart_mixed", - "mer_original": 0.8870967741935484 + "generated_transcription": "\n: 还是非常不错的\n", + "cer": 0.965, + "wer": 1.0, + "mer": 0.8793103448275862 }, { "segment_id": "SEGMENT_ZH-CN_U0002_ENCN_20250816_202117_291309", - "audio_path": "SEGMENT_ZH-CN_U0002_ENCN_20250816_202117_291309.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0002_ENCN_20250816_202117_291309.wav", "ground_truth": "I will, I think you'll help a lot. I I've seen a lot of Chinese student in my university that also got scholarship. 是呀,一些有很多的 九九六的工作,大家根本就没有时间来 生活,更不要提说想养一个孩子了。", - "original_transcription": ": I, I think it will help a lot. I, I've seen a lot of Chinese student in my university that also got (um) scholarship. : 是呀现在有很多的, (um) 九九六的工作大家根本就没有时间来, (err) 生活更不要提说想养一个孩子了\n", "generated_transcription": "\n: I will, I think it will help a lot. I, I've seen a lot of Chinese student in my university that also got (um) scholarship. : 是呀现在有很多的, (um) 九九六的工作, 大家根本就没有时间来, (err) 生活, 更不要提说想养一个孩子了\n", - "wer_original": 0.6461538461538462, - "wer_generated": 0.2153846153846154, - "wer_improvement": 0.4307692307692308, - "wer_method": "smart_mixed", - "mer_original": 0.24615384615384617 + "cer": 0.10317460317460317, + "wer": 0.38461538461538464, + "mer": 0.11475409836065574 }, { "segment_id": "SEGMENT_ZH-CN_U0056_ENCN_20250816_202117_316905", - "audio_path": "SEGMENT_ZH-CN_U0056_ENCN_20250816_202117_316905.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0056_ENCN_20250816_202117_316905.wav", "ground_truth": "Oh melody. 太扎心了,我也学了很长时间,你都不知道我考多少分,你绝对猜不到专业课都还可以,我只考了五十几分,太难了,好难呀,我觉得政治 。", - "original_transcription": ": 哦, melody 太扎心了我也觉得很长时间你都不知道我考多少分你绝对猜不到专业科都还可以我只考了五十几分太难了好难啊我觉得政治嗯\n", "generated_transcription": "\n: 哦melody太扎心了我也觉得很长时间你都不知道我考多少分你绝对猜不到专业科都还可以我只考了五十几分太难了好难啊我觉得政治\n", - "wer_original": 0.0625, - "wer_generated": 1.0, - "wer_improvement": -0.9375, - "wer_method": "smart_mixed", - "mer_original": 0.234375 + "cer": 0.0967741935483871, + "wer": 1.0, + "mer": 0.08928571428571429 }, { "segment_id": "SEGMENT_ZH-CN_U2001_ENCN_20250816_202117_295673", - "audio_path": "SEGMENT_ZH-CN_U2001_ENCN_20250816_202117_295673.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U2001_ENCN_20250816_202117_295673.wav", "ground_truth": "He, he wants to to pursue the lights, right? 你有哥哥或者姐姐吗?", - "original_transcription": ": he he wants to to pursue the lights right 那你有哥哥或者姐姐吗\n", "generated_transcription": "\n: he he wants to to pursue the lights right 那你有哥哥或者姐姐吗\n", - "wer_original": 0.631578947368421, - "wer_generated": 0.2631578947368421, - "wer_improvement": 0.3684210526315789, - "wer_method": "smart_mixed", - "mer_original": 0.2631578947368421 + "cer": 0.023809523809523808, + "wer": 0.1, + "mer": 0.05555555555555555 }, { "segment_id": "SEGMENT_ZH-CN_U1092_MIX_20250816_202117_296130", - "audio_path": "SEGMENT_ZH-CN_U1092_MIX_20250816_202117_296130.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1092_MIX_20250816_202117_296130.wav", "ground_truth": "对,然后 你刚刚有我,你刚刚说到就是比较喜欢弹吉他嘛,然后对虽然是 gogo。然后我不知道,我不知道你因因为我之前听吉他曲会比较多一点,就是我念大学的时候,然后 你之前这啊了解。", - "original_transcription": ": 对然后嗯你刚刚有我你刚刚说到就是比较喜欢弹吉他嘛然后对三六九九然后我不知道我不知道你因为我之前听吉他曲会比较多一点就是我念大学的时候然后嗯你之前这了解\n", "generated_transcription": "\n: 对然后嗯你刚刚有我你刚刚说到就是比较喜欢弹吉他嘛然后对saraba six six six six 我不知道我不知道你因因为我之前听吉他曲会比较多一点就是我念大学的时候然后嗯你之前这啊了解\n", - "wer_original": 1.2439024390243902, - "wer_generated": 0.926829268292683, - "wer_improvement": 0.31707317073170727, - "wer_method": "smart_mixed", - "mer_original": 1.1219512195121952 + "cer": 0.2564102564102564, + "wer": 1.5, + "mer": 0.10666666666666667 }, { "segment_id": "SEGMENT_ZH-CN_U1103_ENCN_20250816_202117_272171", - "audio_path": "SEGMENT_ZH-CN_U1103_ENCN_20250816_202117_272171.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1103_ENCN_20250816_202117_272171.wav", "ground_truth": "不是,我是做这个叫婚后的剧情生活号,是那种叫 老白和他的故事,是做这个号。 My weight, I'm fat guy.", - "original_transcription": ": 不是我是做这个叫婚后的剧情生活号是那种叫老白和他的故事我是做这个号\n", "generated_transcription": "\n: 不是我是做这个叫婚后的剧情生活号是那种叫老白和他的故事我是做这个号\n: My way I'm fat guy\n\n", - "wer_original": 0.9024390243902439, - "wer_generated": 0.14634146341463414, - "wer_improvement": 0.7560975609756098, - "wer_method": "smart_mixed", - "mer_original": 0.21951219512195122 + "cer": 0.125, + "wer": 0.42857142857142855, + "mer": 0.05405405405405406 }, { "segment_id": "SEGMENT_ZH-CN_U1052_MIX_20250816_202117_334668", - "audio_path": "SEGMENT_ZH-CN_U1052_MIX_20250816_202117_334668.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1052_MIX_20250816_202117_334668.wav", "ground_truth": "更高的一些那个收入嘛,salary,yes。 像在成都之前。I have part-time job to have foreign teacher, teach the local student.", - "original_transcription": ": 更高的一些那个收入嘛salary yes啊像在成都之前I have part-time job to have foreign teacher teach the local student\n", "generated_transcription": "\n: 更高的一些那个收入吗salary yes 嗯像在成都之前 I have part-time job to have foreign teacher teach the local student\n", - "wer_original": 1.0, - "wer_generated": 0.9230769230769231, - "wer_improvement": 0.07692307692307687, - "wer_method": "smart_mixed", - "mer_original": 0.3076923076923077 + "cer": 0.024691358024691357, + "wer": 0.3076923076923077, + "mer": 0.13793103448275862 }, { "segment_id": "SEGMENT_ZH-CN_U0018_ENCN_20250816_202117_316252", - "audio_path": "SEGMENT_ZH-CN_U0018_ENCN_20250816_202117_316252.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0018_ENCN_20250816_202117_316252.wav", "ground_truth": "Is this very amazing? You know, book bookstore is very amazing. So you want, you don't want to like a starlet open the bookshell right now, because first, you don't have money and second, you don't have experience. And third, you don't have like social important thing, yeah, yeah, yeah. 可以可以,那我们就是你平时我再想问一下,你会做哪些动作去比如说那我我的话我走路会比较驼背嘛,那这个时候我旁边我会提醒我旁边的人,就是如果看到我驼背了,可以在我移这个背部来上一拳或者是拍一下,然后这个就是我的一个小计划吧,就是我一驼背的时候,我就会提醒我旁边的人,就是提醒我一下。那你有有没有这方面的一些经验呢?就是体态方面。", - "original_transcription": ": 这是非常令人惊讶的你知道书店非常令人惊讶所以你想要你不想开始它现在打开书架吗因为首先你没有钱其次你没有经验最后你没有社交经验所以你想要你不想就开始它现在打开书架吧\n", "generated_transcription": ": 这很神奇你知道书店很神奇所以你不想开始 (uh) 打开书架 (ah) 现在因为 first 你没有钱 second 你没有经验 third 你没有 like social 重要的东西是 yeah yeah yeah 可以可以那我们就是你平时 (uh) 我在想问一下你会做哪些动作去比如说那我我的话我走路会比较驼背吗那这个时候我旁边我会提醒我旁边的人就是如果看到我驼背了可以在我右呃背部来上一拳或者是拍一下然后这个就是我的一个小计划吧就是我驼背的时候我就会提醒我旁边的人就是提醒我一下那你有没有这方面的一些经验呢就是体态方面\n", - "wer_original": 0.41037735849056606, - "wer_generated": 0.3443396226415094, - "wer_improvement": 0.06603773584905664, - "wer_method": "smart_mixed", - "mer_original": 0.9716981132075472 + "cer": 0.5094339622641509, + "wer": 0.8823529411764706, + "mer": 0.28 }, { "segment_id": "SEGMENT_ZH-CN_U0043_MIX_20250816_202117_285388", - "audio_path": "SEGMENT_ZH-CN_U0043_MIX_20250816_202117_285388.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0043_MIX_20250816_202117_285388.wav", "ground_truth": "Catch me if you can,对那个那个也是我非常喜欢的电影之一。 那你有没有因为看到一部电影,然后你就想去一个地方。You you watch a movie and you like wow, this place I well I I like it so much. I gotta be there because of that specific movie.", - "original_transcription": ": (oh) catch me if you can 对那个也是我非常喜欢电影之一 呃那你有没有因为看到一部电影然后你就想去一个地方 你看一部电影然后你就想去那个地方\n", "generated_transcription": "\n: (oh) catch me if you can 对那个那个也是我非常喜欢电影之一 那你有没有因为看到一部电影然后你就想去一个地方 你 watch a movie and you're like !(wow) this place I I like it so much I got to be there because of that specific movie\n", - "wer_original": 0.0, - "wer_generated": 1.5454545454545454, - "wer_improvement": -1.5454545454545454, - "wer_method": "smart_mixed", - "mer_original": 1.7272727272727273 + "cer": 0.1118421052631579, + "wer": 0.30303030303030304, + "mer": 0.1232876712328767 }, { "segment_id": "SEGMENT_ZH-CN_U1072_MIX_20250816_202117_318898", - "audio_path": "SEGMENT_ZH-CN_U1072_MIX_20250816_202117_318898.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1072_MIX_20250816_202117_318898.wav", "ground_truth": "很多。那我们的学长们都是也是,他们每天不仅有这个这个 running 嘛,还有这个力量训练,你这个也也不能落下。", - "original_transcription": ": 很多内我们的学长们都是也是他们每天不仅有这个running吗还有这个力量训练这个也不能落下\n", "generated_transcription": "\n: 很多内我们的学长们都是也是他们每天不仅有这个running吗还有这个力量训练这个也不能落下\n", - "wer_original": 0.041666666666666664, - "wer_generated": 1.0, - "wer_improvement": -0.9583333333333334, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.12244897959183673, + "wer": 1.0, + "mer": 0.13953488372093023 }, { "segment_id": "SEGMENT_ZH-CN_U0042_ENCN_20250816_202117_275232", - "audio_path": "SEGMENT_ZH-CN_U0042_ENCN_20250816_202117_275232.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0042_ENCN_20250816_202117_275232.wav", "ground_truth": "是的是的,就是无论是你从哪个方向走,哪个地方都没有你想要去的共享单车,然后你到了目的���之后发现全是共享单车。 Yeah, you do quite familiar with this function. So if you have a chance to change it,", - "original_transcription": ": 是的 是的就是无论是你从哪个方向走哪个地方都没有你想要去的共享单车然后你到了目的地之后发现全是共享单车, : Ya, you do quite familiar with this function, so if you have a chance to change it.\n", "generated_transcription": "\n: 是的是的就是无论是你从哪个方向走哪个地方都没有你想要去的共享单车然后你到了目的地之后发现全是共享单车 : Ya, 你都 quite familiar with this function, So if you have a chance to change it\n", - "wer_original": 0.7605633802816901, - "wer_generated": 0.1267605633802817, - "wer_improvement": 0.6338028169014085, - "wer_method": "smart_mixed", - "mer_original": 0.11267605633802817 + "cer": 0.0603448275862069, + "wer": 0.16666666666666666, + "mer": 0.04477611940298507 }, { "segment_id": "SEGMENT_ZH-CN_U0002_ENCN_20250816_202117_291611", - "audio_path": "SEGMENT_ZH-CN_U0002_ENCN_20250816_202117_291611.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0002_ENCN_20250816_202117_291611.wav", "ground_truth": "Yeah, you're right , I hope I can find a good internship like this year. 嗯,我好像有听说,好像很多旅游现在现在都要去抢预约,好像要提前一个星期,特别难抢。", - "original_transcription": ": Yeah, you're right. (um). I hope I can find a good internship. (um), like (err) this year, 我好像有听说好像很多旅游现在都要去抢预约好像要提前一个星期特别难抢\n", "generated_transcription": "\n: Yeah, you are right. (um). I hope I can find a good internship. (um) like (err) this year, 我好像有听说好像很多旅游现在都要去抢预约好像要提前一个星期特别难抢\n", - "wer_original": 0.5, - "wer_generated": 0.2857142857142857, - "wer_improvement": 0.2142857142857143, - "wer_method": "smart_mixed", - "mer_original": 0.25 + "cer": 0.12222222222222222, + "wer": 0.4, + "mer": 0.16 }, { "segment_id": "SEGMENT_ZH-CN_U0026_ENCN_20250816_202117_307899", - "audio_path": "SEGMENT_ZH-CN_U0026_ENCN_20250816_202117_307899.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0026_ENCN_20250816_202117_307899.wav", "ground_truth": "Yeah, I understand. Well, I I I also apply for the public relations major in the same university that I'm going to go. And to be honest, I didn't choose that major because there, that's major both too many writings. And I don't think I can be that ** competed with, you know, the native the locals, because they, there are the, the, the modern language of them is English. I don't know how to, you know the, the description of that , curricula sad. That's this major is to teach you how to write like a true and authentic Americans. So I think's pretty hard for me. That's why I choose the management major. So what's the reason you choose your major? 哦,确实我觉得运动之后。", - "original_transcription": ": Yeah, I understand. Well, I, I also applied for the public relations major in the same university that I'm going to go, and to be honest, I didn't choose that major because that major involves too many writings, and I don't think I can be that good competed with, you know, the natives, the locals, because they're they're the mother language of them is English. I don't know how to you know the description of that (uh) curriculum said that this major is to teach you how to write like a true and authentic American, so I think it's pretty hard for me. That's why I choose the management major. So what's the reason you choose your major? : (oh) 确实我觉得运动之后\n", "generated_transcription": ": yeah, I understand. Well, I, I also applied for the public relations major in the same university that I'm going to go, and to be honest, I didn't choose that major because their that major involves too many writings and I don't think I can be that good competed with, you know, the natives, the locals, because they're they're the the mother language of them is English. I don't know how to you know the description of that (uh) curriculum said that (uh) this major is to teach you how to write like a true and authentic American, so I think it's pretty hard for me. That's why I choose the management major. So what's the reason you choose your major? : (oh) 确实我觉得运动之后\n", - "wer_original": 0.08888888888888889, - "wer_generated": 0.24444444444444444, - "wer_improvement": -0.15555555555555556, - "wer_method": "smart_mixed", - "mer_original": 0.22962962962962963 + "cer": 0.08695652173913043, + "wer": 0.1885245901639344, + "mer": 0.16793893129770993 }, { "segment_id": "SEGMENT_ZH-CN_U0078_ENCN_20250816_202117_309557", - "audio_path": "SEGMENT_ZH-CN_U0078_ENCN_20250816_202117_309557.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0078_ENCN_20250816_202117_309557.wav", "ground_truth": "Yeah, okay, and. 我最喜欢的足球队是切尔西, 然后我最喜欢的运动员呢是克里斯蒂亚诺罗罗纳尔多,这名球员是在世界上也是非常知名的足球运动员, 他的精神和他的毅力,包括他这种不服输的这种精神都值得我们去学习。", - "original_transcription": "Yeah, okay, and, (um), my favorite football team is Chelsea. And, my favorite athlete is Cristiano Ronaldo. This player is in the world is a very famous football player. His spirit and his willpower, including his spirit of not giving up, all of these are worth for us to learn.\n", "generated_transcription": "\nyeah ok and 我最喜欢的足球队是切尔西然后我最喜欢的运动员呢是克里斯蒂亚诺罗纳尔多这名球员在世界上也是非常知名的足球运动员他的精神和他的毅力包括他这种不服输的这种精神都值得我们去学习\n", - "wer_original": 0.4787234042553192, - "wer_generated": 0.10638297872340426, - "wer_improvement": 0.37234042553191493, - "wer_method": "smart_mixed", - "mer_original": 0.9787234042553191 + "cer": 0.041237113402061855, + "wer": 0.6666666666666666, + "mer": 0.033707865168539325 }, { "segment_id": "SEGMENT_ZH-CN_U0055_MIX_20250816_202117_336398", - "audio_path": "SEGMENT_ZH-CN_U0055_MIX_20250816_202117_336398.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0055_MIX_20250816_202117_336398.wav", "ground_truth": "因为我觉得当时确实有点 crazy,但我觉得都已经考完了,就相当于他都已经都已经定了嘛。就是还想了解一下自己大概是一个什么样的 situation。", - "original_transcription": ": 因为我觉得当时确实有点, crazy, 但我觉得都已经考完了就相当于他已经都已经定了嘛就是还想了解一下自己大概是一个什么样的, situation.\n", "generated_transcription": "\n: 因为我觉得当时确实有点 crazy 但我觉得都已经考完了就相当于他已经都已经定了嘛就是还想了解一下自己大概是一个什么样的 situation\n", - "wer_original": 3.8461538461538463, - "wer_generated": 3.3846153846153846, - "wer_improvement": 0.4615384615384617, - "wer_method": "smart_mixed", - "mer_original": 3.5384615384615383 + "cer": 0.014705882352941176, + "wer": 0.6666666666666666, + "mer": 0.017857142857142856 }, { "segment_id": "SEGMENT_ZH-CN_U0077_ENCN_20250816_202117_298764", - "audio_path": "SEGMENT_ZH-CN_U0077_ENCN_20250816_202117_298764.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0077_ENCN_20250816_202117_298764.wav", "ground_truth": "Yes, I like watching performance on stage. 是是一周吗?", - "original_transcription": ": (ah) yes I like watching performance on stage 是是一周吗\n", "generated_transcription": "\n: (ah) yes I like watching performance on stage 是是一周吗\n", - "wer_original": 1.0769230769230769, - "wer_generated": 0.3076923076923077, - "wer_improvement": 0.7692307692307692, - "wer_method": "smart_mixed", - "mer_original": 0.3076923076923077 + "cer": 0.05128205128205128, + "wer": 0.125, + "mer": 0.08333333333333333 }, { "segment_id": "SEGMENT_ZH-CN_U0083_MIX_20250816_202117_283001", - "audio_path": "SEGMENT_ZH-CN_U0083_MIX_20250816_202117_283001.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0083_MIX_20250816_202117_283001.wav", "ground_truth": "For example, 比如说这个频道在播放西游记。 那 so this will be the only choice for you. 那么你只能看这一个,但是还有就是 。", - "original_transcription": ": for example,比如说这个频道在播放西游记,那么,this will be the only choice for you,那么你只能看这一个,那还有就是,���\n", "generated_transcription": "\n: for example,比如说这个频道在播放西游记,那么this will be the only choice for you,那么你只能看这一个,那还有就是\n", - "wer_original": 0.7674418604651163, - "wer_generated": 0.8604651162790697, - "wer_improvement": -0.09302325581395343, - "wer_method": "smart_mixed", - "mer_original": 0.8604651162790697 + "cer": 0.05714285714285714, + "wer": 0.5, + "mer": 0.075 }, { "segment_id": "SEGMENT_ZH-CN_U1031_ENCN_20250816_202117_337579", - "audio_path": "SEGMENT_ZH-CN_U1031_ENCN_20250816_202117_337579.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1031_ENCN_20250816_202117_337579.wav", "ground_truth": "So Zhiwei do you do you enjoy traveling? 主要确实是我都去的,是一些很有 历史故事的地方 。 我就。", - "original_transcription": ": So do you do you enjoy traveling? : 哦,我确实都是去那些很有历史故事的地方。\n", "generated_transcription": "\n: So do you do you enjoy traveling\n", - "wer_original": 0.7647058823529411, - "wer_generated": 0.8235294117647058, - "wer_improvement": -0.05882352941176472, - "wer_method": "smart_mixed", - "mer_original": 0.4117647058823529 + "cer": 0.5272727272727272, + "wer": 0.36363636363636365, + "mer": 0.7741935483870968 }, { "segment_id": "SEGMENT_ZH-CN_U1056_ENCN_20250816_202117_347285", - "audio_path": "SEGMENT_ZH-CN_U1056_ENCN_20250816_202117_347285.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1056_ENCN_20250816_202117_347285.wav", "ground_truth": "嗯。 Thanks for your. 也可能就是他们都说我我们零零后就是被家长都被家长保护的太好了,然后就没有什么 独自出入社会的能力。对,但是我自己觉得可能确实缺乏这样的能力,但是我又 没有信心去。 I still remember that the air was filled with water vapor, and there were few street light on the narrow mountain road, which made the hurried journey more difficult.", - "original_transcription": ": Thanks for your.\n", "generated_transcription": ": Thanks for your\n", - "wer_original": 0.7256637168141593, - "wer_generated": 0.9823008849557522, - "wer_improvement": -0.2566371681415929, - "wer_method": "smart_mixed", - "mer_original": 0.9734513274336283 + "cer": 0.9417040358744395, + "wer": 0.9166666666666666, + "mer": 0.9719626168224299 }, { "segment_id": "SEGMENT_ZH-CN_U1023_ENCN_20250816_202117_352506", - "audio_path": "SEGMENT_ZH-CN_U1023_ENCN_20250816_202117_352506.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1023_ENCN_20250816_202117_352506.wav", "ground_truth": "Yeah better. It's called. what was it called again, Breaking Bad, right? Have you have you watched Better Call Saul. 段美就是他的中中国的外号嘛, 他他们都不知道不知道为什么就是国人, 你说。", - "original_transcription": ": Ya better. It's called (um). What's it called again? Breaking Bad right? Have you have you watched Better Call Saul? 完美就是他的中中国的外行吗, (oh), 他他们, 不知道不知道为什么就是国人, 你说,\n", "generated_transcription": "\n: Ya better. It's called (um). What's it called again? Breaking Bad right. Have you have you watched Better Call Saul? 完美就是他的中中国的外行吗, (oh), 他他们, 不知道不知道为什么就是国人, 你说\n", - "wer_original": 0.05454545454545454, - "wer_generated": 0.2909090909090909, - "wer_improvement": -0.23636363636363636, - "wer_method": "smart_mixed", - "mer_original": 0.2727272727272727 + "cer": 0.09836065573770492, + "wer": 0.30434782608695654, + "mer": 0.15384615384615385 }, { "segment_id": "SEGMENT_ZH-CN_U1043_ENCN_20250816_202117_293784", - "audio_path": "SEGMENT_ZH-CN_U1043_ENCN_20250816_202117_293784.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1043_ENCN_20250816_202117_293784.wav", "ground_truth": "You know what, you know what he said to me. 但是原本我觉得你可怜,现在我觉得真的是。", - "original_transcription": ": 你知道他说了什么但是原本我觉得你可怜现在我觉���真的是\n", "generated_transcription": "\n: 你know what 你know what he said to me 但是原本我觉得你可怜现在我觉得真的是\n", - "wer_original": 0.4666666666666667, - "wer_generated": 0.26666666666666666, - "wer_improvement": 0.2, - "wer_method": "smart_mixed", - "mer_original": 0.4 + "cer": 0.12, + "wer": 0.36363636363636365, + "mer": 0.07142857142857142 }, { "segment_id": "SEGMENT_ZH-CN_U0015_MIX_20250816_202117_269064", - "audio_path": "SEGMENT_ZH-CN_U0015_MIX_20250816_202117_269064.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0015_MIX_20250816_202117_269064.wav", "ground_truth": "对啊,那些 kids 呢?那些 kids 也不 hungry 吗? 人都顺利了,flag 之后,又当这个 flag 没存在过。", - "original_transcription": ": 对呀那些, kids, 那些, kids, 也不 hungry, (ah), 人都胜利了, flag, 了之后又当这个, flag, 没存在过,\n", "generated_transcription": "\n: 对呀那些 kids 呢那些 kids 也不 hungry 吗人都顺利了 flag 了之后又当这个 flag 没存在过\n", - "wer_original": 0.9333333333333333, - "wer_generated": 0.4, - "wer_improvement": 0.5333333333333333, - "wer_method": "smart_mixed", - "mer_original": 0.7 + "cer": 0.0425531914893617, + "wer": 0.36363636363636365, + "mer": 0.06666666666666667 }, { "segment_id": "SEGMENT_ZH-CN_U0044_MIX_20250816_202117_279457", - "audio_path": "SEGMENT_ZH-CN_U0044_MIX_20250816_202117_279457.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0044_MIX_20250816_202117_279457.wav", "ground_truth": "嗯,会的,我和朋友就是比如说我们在 plan 或者在 train, 然后我们在通勤的过程中,就会一起选择一部我们都喜欢的 movie, 然后一起看一看。比如说宫崎骏的一些 cartoon film, 我们觉得都是比较经典的,然后也很适合在旅途中作为放松休闲的,对,电影来看,是的。", - "original_transcription": ": 嗯会的我和朋友就是比如说我们在plan或者在churn然后我们在通行的过程中就会一起选择一部我们都喜欢的movie然后一起看一看比如说宫崎骏的一些cartoon film我们觉得都是比较经典的然后也很适合在旅途中作为放松休闲的对电影来看是的\n", "generated_transcription": "\n: 嗯会的我和朋友就是比如说我们在plan或者在chilling然后我们在通行的过程中就会一起选择一部我们都喜欢的movie然后一起看一看比如说宫崎骏的一些cartoon film我们觉得都是比较经典的然后也很适合在旅途中作为放松休闲的对电影来看是的\n", - "wer_original": 0.018518518518518517, - "wer_generated": 1.0, - "wer_improvement": -0.9814814814814815, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.058823529411764705, + "wer": 1.0, + "mer": 0.020202020202020204 }, { "segment_id": "SEGMENT_ZH-CN_U0003_ENCN_20250816_202117_273534", - "audio_path": "SEGMENT_ZH-CN_U0003_ENCN_20250816_202117_273534.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0003_ENCN_20250816_202117_273534.wav", "ground_truth": "哦,我我还其实对文书挺感兴趣的,就是我看实习中也有那种类似于写文案的,然后就是招英专生,然后其实大四的时候还要去实习一下,觉得蛮蛮有意思的。 No, what what is this? .", - "original_transcription": ": 哦我我还其实对文书挺感兴趣的就是我看实习中也有那种类似于写文案的然后照印专升然后其实大四的时候还要去实习一下觉得蛮蛮有意思的嗯 no what is this 哦\n", "generated_transcription": "\n: 哦我我还其实对文书挺感兴趣的就是我看实习中也有那种类似于写文案的然后照印专生然后其实大四的时候还要去实习一下觉得蛮蛮有意思的嗯 no what is this\n", - "wer_original": 0.9210526315789473, - "wer_generated": 0.18421052631578946, - "wer_improvement": 0.7368421052631579, - "wer_method": "smart_mixed", - "mer_original": 0.19736842105263158 + "cer": 0.1125, + "wer": 0.3333333333333333, + "mer": 0.08695652173913043 }, { "segment_id": "SEGMENT_ZH-CN_U0028_ENCN_20250816_202117_292219", - "audio_path": "SEGMENT_ZH-CN_U0028_ENCN_20250816_202117_292219.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0028_ENCN_20250816_202117_292219.wav", "ground_truth": "Yeah . So I think the most proper activity sports activity for me is is to play badminton. It doesn't require a a basis. 嗯。", - "original_transcription": ": yeah, so, (um), I think the most proper activity, sports activity for me is is to play badminton. It doesn't require a a a basis, (um).\n", "generated_transcription": "\n: yeah so (um) I think the most proper activity sports activity for me is is to play badminton it doesn't require a a basis\n", - "wer_original": 1.3076923076923077, - "wer_generated": 0.3076923076923077, - "wer_improvement": 1.0, - "wer_method": "smart_mixed", - "mer_original": 0.2692307692307692 + "cer": 0.031914893617021274, + "wer": 0.08333333333333333, + "mer": 0.08333333333333333 }, { "segment_id": "SEGMENT_ZH-CN_U0073_ENCN_20250816_202117_338424", - "audio_path": "SEGMENT_ZH-CN_U0073_ENCN_20250816_202117_338424.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0073_ENCN_20250816_202117_338424.wav", "ground_truth": "And I guess it's just like what I said before, I normally do exercise at the lunch time. I mean, we ** we take a break at eleven past thirty. 说起来我之前的实习工作确实,跟我现在做的工作不相关。我之前做的实习工作是娱娱乐营销。", - "original_transcription": ": Yeah, I guess it's just like what I said before. (uh), I normally do exercise at the lunchtime. I mean we we take a break at eleven past thirty. : 说起来我之前的实习工作确实跟我现在做的工作不相关我之前做的实习工作是娱乐营销\n", "generated_transcription": "\n: Yeah, I guess it's just like what I said before. (uh), I normally do exercise at the lunch time. I mean we we take a break at eleven past thirty. : 说起来我之前的实习工作确实跟我现在做的工作不相关我之前做的实习工作是娱乐营销\n", - "wer_original": 0.6805555555555556, - "wer_generated": 0.125, - "wer_improvement": 0.5555555555555556, - "wer_method": "smart_mixed", - "mer_original": 0.1527777777777778 + "cer": 0.04861111111111111, + "wer": 0.1, + "mer": 0.04411764705882353 }, { "segment_id": "SEGMENT_ZH-CN_U0048_MIX_20250816_202117_345733", - "audio_path": "SEGMENT_ZH-CN_U0048_MIX_20250816_202117_345733.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0048_MIX_20250816_202117_345733.wav", "ground_truth": "对,我感觉可能是因为他的那个 main idea 有时候会比较贴近一些 Europe Europe 的 background,然后他们的这种 dressing 啊,还有包括人类的那个长相 appearance,都会很 European。", - "original_transcription": ": 对我感觉可能是因为他的那个, main idea, 有时候会比较贴近一些, Europe, Europe, 的, background, 然后他们的这种, (um), dressing, (ah), 还有包括人类的那个, 长相, appearance, 都会很, European.\n", "generated_transcription": "\n: 对我感觉可能是因为他的那个, main idea, 有时候会比较贴近一些, Europe, Europe 的, background, 然后他们的这种, (um) dressing, (ah), 还有包括人类的那个, 长相, appearance, 都会很, European\n", - "wer_original": 0.6086956521739131, - "wer_generated": 0.5652173913043478, - "wer_improvement": 0.0434782608695653, - "wer_method": "smart_mixed", - "mer_original": 0.5869565217391305 + "cer": 0.0392156862745098, + "wer": 0.6666666666666666, + "mer": 0.037037037037037035 }, { "segment_id": "SEGMENT_ZH-CN_U0102_ENCN_20250816_202117_307627", - "audio_path": "SEGMENT_ZH-CN_U0102_ENCN_20250816_202117_307627.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0102_ENCN_20250816_202117_307627.wav", "ground_truth": "是是是。 Yeah, you know Hunan people's philosophy, they are very stubborn. They are, you know they are very stubborn. And they, if they want to do one thing, they will do it, persistently , they will do it regardless of whatever it takes.", - "original_transcription": ": 嗯嗯嗯嗯嗯嗯 : ya, (um), you know, (um), Hunan people's philosophy, they are very stubborn. They are, you know, they are very stubborn, and they, if they want to do one thing, they will do it persistently. They will do it regardless of whatever it takes.\n", "generated_transcription": "\n: 是是是 : ya, (um), you know, (um), Hunan people's philosophy, they are very stubborn. They are, you know, they are very stubborn and they, if they want to do one thing, they will do it persistently. They will do it regardless of whatever it takes\n", - "wer_original": 0.08695652173913043, - "wer_generated": 0.2608695652173913, - "wer_improvement": -0.17391304347826086, - "wer_method": "smart_mixed", - "mer_original": 0.3695652173913043 + "cer": 0.027777777777777776, + "wer": 0.07142857142857142, + "mer": 0.06818181818181818 }, { "segment_id": "SEGMENT_ZH-CN_U0031_ENCN_20250816_202117_319476", - "audio_path": "SEGMENT_ZH-CN_U0031_ENCN_20250816_202117_319476.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0031_ENCN_20250816_202117_319476.wav", "ground_truth": "yes. 哦。 对对对,这个电影就是它火热,是因为大生呃周深的这个大鱼这首主题曲呃火起来。但是其实在主题曲火起来之前,我已经看了好几遍,就是两三遍这样子。然后因为他的动画画的非常的感人,就是非常精美之余,感觉每一帧截出来都像是可以当成壁纸。然后他的情节虽然有些人会吐槽,觉得他不是很很符合逻辑,但是我每一次看还是会哭一遍,所以也成了我一种安慰吧。就是当我之后不是就是特别特别不开心的时候,可以看一下,然后就感觉被安慰到了的感觉。 person in the school, like in the streets they they just don't respect those or special people. Or maybe we just we need to treat them as normal person by the same time to take care of them as we are the same stage in the society.", - "original_transcription": ": yes, : 对对对这个电影就是它火热是因为大生周深的这个大鱼这首主题曲呃火起来但是其实在主题曲火起来之前我已经看了好几遍就是两三遍这样子然后因为它的动画画得非常的感人就是非常的精美之余感觉每一帧截出来都像是可以当成壁纸然后他的情节虽然有些人会吐槽觉得它不是很符合逻辑但是我每次看还是会哭一遍所以也成为唯一一种安慰吧就是当我之后不是就特别特别不开心的时候可以看一下或者感觉被安慰到了的感觉 : person in the school like in the streets they they just don't respect those (uh) special people or maybe we just treat we need to treat them as normal person by the same time to (um) take care of them as (uh) we are the same stage in the society.\n", "generated_transcription": ": yeah, : 对对对这个电影就是它火热是因为大生周深的这个大鱼这首主题曲呃火起来但是其实在主题曲火起来之前我已经看了好几遍就是两三遍这样子然后因为它的动画画得非常的感人就是非常的精美之余感觉每一帧截出来都像是可以当成壁纸然后他的情节虽然有些人会吐槽觉得它不是很符合逻辑但是我每次看还是会哭一遍所以也成为唯一一种安慰吧就是当我之后不是就特别特别不开心的时候可以看一下或者感觉被安慰到了的感觉 : person in the school like in the streets they they just don't respect those (uh) special people or maybe we just treat we need to treat them as normal person by the same time to (um) take care of them as (uh) we are the same stage in the society\n", - "wer_original": 0.265625, - "wer_generated": 0.16015625, - "wer_improvement": 0.10546875, - "wer_method": "smart_mixed", - "mer_original": 0.15625 + "cer": 0.07219251336898395, + "wer": 0.14, + "mer": 0.08368200836820083 }, { "segment_id": "SEGMENT_ZH-CN_U0088_ENCN_20250816_202117_280571", - "audio_path": "SEGMENT_ZH-CN_U0088_ENCN_20250816_202117_280571.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0088_ENCN_20250816_202117_280571.wav", "ground_truth": "I remember many people say LeBron James go to play American football, and he can't against the other enemy. 哦。", - "original_transcription": ": I remember many people say (um) Lebron James go to (um) (uh) play American football and he can't against the other enemy (oh).\n", "generated_transcription": "\n: I remember many people say (um) Lebron James go to (um) play American football and he can't against the other enemy\n", - "wer_original": 0.0, - "wer_generated": 0.3333333333333333, - "wer_improvement": -0.3333333333333333, - "wer_method": "smart_mixed", - "mer_original": 0.38095238095238093 + "cer": 0.05747126436781609, + "wer": 0.15, + "mer": 0.15 }, { "segment_id": "SEGMENT_ZH-CN_U0034_ENCN_20250816_202117_336185", - "audio_path": "SEGMENT_ZH-CN_U0034_ENCN_20250816_202117_336185.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0034_ENCN_20250816_202117_336185.wav", "ground_truth": "原来如此。 I think so. 嗯, 我记得好像翻译方面也有嗯计算机辅助翻译这种课程。 Yes, I have been there, and I enjoy it too.", - "original_transcription": ": 原来如此, : (hm), I think so, 我记得好像, 翻译方面也有, (hm), 计算机辅助翻译这种课程, : Yes, I have been there and I enjoy it too.\n", "generated_transcription": "\n: 原来如此 : (hm), I think so 我记得好像翻译方面也有 (hm) 计算机辅助翻译这种课程 : Yes, I have been there and I enjoy it too.\n", - "wer_original": 0.4772727272727273, - "wer_generated": 0.1590909090909091, - "wer_improvement": 0.31818181818181823, - "wer_method": "smart_mixed", - "mer_original": 0.22727272727272727 + "cer": 0.07462686567164178, + "wer": 0.25, + "mer": 0.07317073170731707 }, { "segment_id": "SEGMENT_ZH-CN_U0017_MIX_20250816_202117_264174", - "audio_path": "SEGMENT_ZH-CN_U0017_MIX_20250816_202117_264174.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0017_MIX_20250816_202117_264174.wav", "ground_truth": "但是呢,是的,在我准备一些嗯课后作业的时候,some times i did't 安排了时间,liable so 我在这个时候呢,everyday 我会错过一些 begin,这个时候嗯我觉得对于 for myself,的这件事情也不是经常发生 faculty,pens in peson 非常有,对一些任务来说,我是比较嗯,dances 就是一旦有一些比较重要,一般上学的任务,我就会觉得非常的嗯 really。", - "original_transcription": ": actually 是的在我准备一些客户作业的时候 sometimes (um) I didn't 安排我的时间 very well so 我在这个时候呢就会 (um) 常常我会错过一些deadline 这个时候 (um) 我觉得对于 (um) for myself 的这些事情也不是经常发生 frequently cause I'm a person (um) 非常有对一些任务来说我是比较 (um) anxious 就是一旦有一些比较重要的然后 (um) emergency 的任务我就会觉得非常的 (um) worry\n", "generated_transcription": "\n: actually 是的 在我准备一些客户作业的时候 sometimes 我 didn't 安排我的时间 very well so 我在这个时候呢就会 (um) 常常我会错过一些 deadline 这个时候 (um) 我觉得对于 (um) for myself 的这些事情也不是经常 (um) 常常发生 cause 我是 (um) 一个非常有对一些任务来说我是比较 (um) anxious 就是一旦有一些比较重要的然后 (um) emergency 的任务 我就会觉得非常的 (um) worry\n", - "wer_original": 0.52, - "wer_generated": 1.94, - "wer_improvement": -1.42, - "wer_method": "smart_mixed", - "mer_original": 1.82 + "cer": 0.4857142857142857, + "wer": 2.0, + "mer": 0.35344827586206895 }, { "segment_id": "SEGMENT_ZH-CN_U0094_MIX_20250816_202117_272414", - "audio_path": "SEGMENT_ZH-CN_U0094_MIX_20250816_202117_272414.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0094_MIX_20250816_202117_272414.wav", "ground_truth": "Speaking of gym well I got a lot to say let's talk ** equipment。Let me tell you 我比较喜欢就是 focus 在力量上面力量, 还有 有些人会做 ** ,但是你力量,然后速度这两个其实已经够。就是如果你是要比平常好看的话,练力量练速度,这两个都必须的。", - "original_transcription": ": speaking of gym !(wah)! I got a lot to say. (err) let's not start with equipment. Let me say (err) 我比较喜欢就是, focus, 在力量上面, 力量, (err) 还有, 有时候你会做, HIIT, 但是你力量然后, 速度这两个其实已经够就是如果你知道, 平常好看的话, 练, 力量, 练, 速度, 这个两个都必须的,\n", "generated_transcription": "\n: speaking of gym !(wah)! I got a lot to say. (err) let's not start with equipment. Let me say (err) 我比较喜欢就是, focus 在力量上面, 力量, (err) 还有, 有时候你会做, HIIT 但是你力量然后, 速度这两个其实已经够就是如果你知道, 平常好看的话, 练力量练力量练速度这两个都必须的\n", - "wer_original": 0.4175824175824176, - "wer_generated": 0.3626373626373626, - "wer_improvement": 0.05494505494505497, - "wer_method": "smart_mixed", - "mer_original": 0.38461538461538464 + "cer": 0.3023255813953488, + "wer": 0.8181818181818182, + "mer": 0.24390243902439024 }, { "segment_id": "SEGMENT_ZH-CN_U2014_MIX_20250816_202117_278776", - "audio_path": "SEGMENT_ZH-CN_U2014_MIX_20250816_202117_278776.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U2014_MIX_20250816_202117_278776.wav", "ground_truth": "那当然,我 classmates 关系都很不错,大家一起一起去学习,一起去 library。", - "original_transcription": ": 那当然了,我classmates关系都很不错,大家一起学习也一起去library\n", "generated_transcription": "\n: 那当然了,我classmates关系都很不错大家一起一起去学习也去library\n", - "wer_original": 0.2962962962962963, - "wer_generated": 1.0, - "wer_improvement": -0.7037037037037037, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.07692307692307693, + "wer": 1.0, + "mer": 0.125 }, { "segment_id": "SEGMENT_ZH-CN_U0040_ENCN_20250816_202117_283783", - "audio_path": "SEGMENT_ZH-CN_U0040_ENCN_20250816_202117_283783.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0040_ENCN_20250816_202117_283783.wav", "ground_truth": "嗯。 Yes. 那就跟我, 就跟我家孩子一样,他有时候比如说会把东西碰到地上,我说你偶尔碰一次是不小心的。但是如果你不想让它掉地上,你就把它放在里面,你不可能把他碰地上的。所以就我以后如果你再有这样的行为,我就理解为是你并不爱惜它。没有没有巧合,没有不小心。这种就就是就很能理解。他的玩具很多,我会让他们把它照顾好。如果你不喜欢或者是老摔坏它,总是这样的话,我只能说你不爱惜它,没有别的原因,总有那么多不小心。 I heard about that. I heard about s similar one? I I don't.", - "original_transcription": ": 嗯嗯 yes 就跟我啊就跟我家孩子一样他有时候比如说会把东西碰到地上我说你偶尔碰一次是不小心但是如果你不想让他掉地上你就把它放在里面你不可能把它碰地上的所以就我以后如果你再有这样的行为我就理解为是你并不爱惜他没有没有巧合没有不小心这种就就就就很能理解他玩具很多我会让他们把它照顾好如果你不喜欢或者是老摔坏的总是这样的话我只能说你不爱惜他没有别的原因没有那么多不小心 : I heard about that I heard about similar one I I don't\n", "generated_transcription": "\n: 嗯嗯 yes 就跟我啊就跟我家孩子一样他有时候比如说会把东西碰到地上我说你偶尔碰一次是不小心的那但是如果你不想让他掉地上你就把他放在里面你不可能把他碰地上的所以就我以后如果你再有这样的行为我就理解为是你并不爱惜他没有没有巧合没有不小心这种就就就就很能理解他玩具很多我会让他们把他照顾好如果你不喜欢或者是老摔坏的总是这样的话我只能说你不爱惜他没有别的原因没有那么多不小心\n", - "wer_original": 0.9292452830188679, - "wer_generated": 0.20754716981132076, - "wer_improvement": 0.7216981132075471, - "wer_method": "smart_mixed", - "mer_original": 0.1650943396226415 + "cer": 0.25, + "wer": 0.9411764705882353, + "mer": 0.13541666666666666 }, { "segment_id": "SEGMENT_ZH-CN_U1028_ENCN_20250816_202117_326856", - "audio_path": "SEGMENT_ZH-CN_U1028_ENCN_20250816_202117_326856.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1028_ENCN_20250816_202117_326856.wav", "ground_truth": "对, 还还挺有趣的,挺有意思的。 Yeah doesn't matter.", - "original_transcription": ": 对还还挺有趣的挺有意思的\n", "generated_transcription": "\n: 对还挺有趣的挺有意思的\n", - "wer_original": 0.8333333333333334, - "wer_generated": 0.3888888888888889, - "wer_improvement": 0.4444444444444445, - "wer_method": "smart_mixed", - "mer_original": 0.3333333333333333 + "cer": 0.6071428571428571, + "wer": 1.0, + "mer": 0.26666666666666666 }, { "segment_id": "SEGMENT_ZH-CN_U1101_ENCN_20250816_202117_307125", - "audio_path": "SEGMENT_ZH-CN_U1101_ENCN_20250816_202117_307125.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1101_ENCN_20250816_202117_307125.wav", "ground_truth": "What is your major? 当时是在那个什么县,那个字儿应该叫滦县嘛,三点水那个。", - "original_transcription": ": 当时是在那个什么县那个字儿应该叫蓝县吗三点水那个\n", "generated_transcription": "\n: 当时是在那个什么县那个字儿应该叫蓝县吗三点水那个\n", - "wer_original": 0.8709677419354839, - "wer_generated": 0.2903225806451613, - "wer_improvement": 0.5806451612903225, - "wer_method": "smart_mixed", - "mer_original": 0.2903225806451613 + "cer": 0.4358974358974359, + "wer": 1.0, + "mer": 0.21428571428571427 }, { "segment_id": "SEGMENT_ZH-CN_U2003_MIX_20250816_202117_262765", - "audio_path": "SEGMENT_ZH-CN_U2003_MIX_20250816_202117_262765.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U2003_MIX_20250816_202117_262765.wav", "ground_truth": "是的,我妈妈也很少给我零花钱,她觉得我不需要这么多,可是我非常的 need。", - "original_transcription": ": 是的我妈妈也很少给我零花钱她觉得我不需要这么多可是我非常的腻的\n", "generated_transcription": "\n: 是的我妈妈也很少给我零花钱她觉得我不需要这么多可是我非常的腻的\n", - "wer_original": 1.0, - "wer_generated": 0.15151515151515152, - "wer_improvement": 0.8484848484848485, - "wer_method": "smart_mixed", - "mer_original": 0.15151515151515152 + "cer": 0.12121212121212122, + "wer": 1.0, + "mer": 0.06666666666666667 }, { "segment_id": "SEGMENT_ZH-CN_U0045_ENCN_20250816_202117_315654", - "audio_path": "SEGMENT_ZH-CN_U0045_ENCN_20250816_202117_315654.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0045_ENCN_20250816_202117_315654.wav", "ground_truth": "And also I agree with you. Yes, this topic is actually pretty similar to the last one, but , actually to spend my free time . I like nature a lot. So sometimes , except for learning some instruments playing some instruments. I also go to the like the park to just do nothing. I just stay there for the whole afternoon and sitting on a bench and to play with some little cats and dogs, and , just to see the trees and the skies, which are very relaxing to me. And this is one of the things that usually do to spend my free time. 然后关于这个冒险,我也是通过哈利波特才有了这样的想法。有时候就是当时看电影的那段日子特别神奇,我每天晚上做梦都会梦到一些相关的情节,然后都特别有意思的梦境。我觉得当时如果我要是把他们记录下来就好了,但是总之我当时也是没有这么做嘛,有一点点后悔 。但是如果讨论到这个冒险的话题的话,我觉得如果有一个电影可以激发我的冒险的灵感的话,那可能就是哈利波特吧。你呢?", - "original_transcription": ": 非常喜欢哈利波特。\n", "generated_transcription": ": 呃,另外,我也同意你,是的,这个话题其实跟上一个话题 pretty 类似,但其实我,actually,为了花我free time,嗯,我,like,nature a lot,所以,sometimes,except for learning some instruments,playing some instruments,我,also,go to,like,the park,to,just,do nothing,我,just,stay there,for,the,whole afternoon,and,sitting on,a,bench,and,to,play,with,some,little,cats,and,dogs,and,just,to,see,the,trees,and,the,skies,which,are,very,relaxing,to,me,and,this,is,one,of,the,(um),things,that,I,usually,do,to,spend,my,free,time。\n", - "wer_original": 0.5774647887323944, - "wer_generated": 0.954225352112676, - "wer_improvement": -0.37676056338028163, - "wer_method": "smart_mixed", - "mer_original": 0.9823943661971831 + "cer": 0.4370629370629371, + "wer": 0.9428571428571428, + "mer": 0.9589552238805971 }, { "segment_id": "SEGMENT_ZH-CN_U1038_MIX_20250816_202117_329545", - "audio_path": "SEGMENT_ZH-CN_U1038_MIX_20250816_202117_329545.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1038_MIX_20250816_202117_329545.wav", "ground_truth": "我好像在 TikTok 上刷到过,我感觉就是非常非常 sweet。 我 a little 我只会一点之前溺过水,然后就学了一点点。", - "original_transcription": ": 我好像在ticktock上刷到过我感觉就是非常非常sweet嗯我只会一点之前弄过水然后就学了一点点\n", "generated_transcription": "\n: 我好像在ticktock上刷到过我感觉就是非常非常sweet嗯我only会一点之前烫过水然后就学了一点点\n", - "wer_original": 0.046511627906976744, - "wer_generated": 1.0, - "wer_improvement": -0.9534883720930233, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.2222222222222222, + "wer": 1.0, + "mer": 0.15 }, { "segment_id": "SEGMENT_ZH-CN_U0026_ENCN_20250816_202117_308077", - "audio_path": "SEGMENT_ZH-CN_U0026_ENCN_20250816_202117_308077.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0026_ENCN_20250816_202117_308077.wav", "ground_truth": "嗯,说起来高中,那你高中在哪个地方上的呀? I'll be majoring in management studies.", - "original_transcription": ": 嗯说起来高中那呃高中在哪个地方上啊呃我将要读管理学专业\n", "generated_transcription": "\n: 说起来高中那呃高中在哪个地方上啊 我会majoring in management studies\n", - "wer_original": 0.7777777777777778, - "wer_generated": 0.4074074074074074, - "wer_improvement": 0.3703703703703704, - "wer_method": "smart_mixed", - "mer_original": 0.5185185185185185 + "cer": 0.18, + "wer": 0.5714285714285714, + "mer": 0.25 }, { "segment_id": "SEGMENT_ZH-CN_U1041_MIX_20250816_202117_343921", - "audio_path": "SEGMENT_ZH-CN_U1041_MIX_20250816_202117_343921.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1041_MIX_20250816_202117_343921.wav", "ground_truth": "就是,体现在就,体现在可能会怎么说呢?可能会因为一个方案就是,彻夜加班很多很多天这种情况会出现。就虽然说可能后面也会就是轮休放假吧,但是对于那个身体的消耗太大,就是,就是国外的咨询公司可能在就是劳动法这方面会做得比较好。就是哪怕就是这个 case 很紧急,但是还是就是说,就是按时的上下班这个样子。", - "original_transcription": ": 就是起现在就起现在可能会怎么说呢可能会因为一个方案就是彻夜加班很多很多天这种情况会出现就虽然说可能后面也会就是轮休放假吧但是对于那个身体的消耗太大就是就是国外咨询公司可能在就是劳动法这方面会做得比较好就是哪怕就是这个case很紧急但是还是就是说就是按时的上下班这个样子\n", "generated_transcription": "\n就是起现在就起现在可能会怎么说呢可能会因为一个方案就是彻夜加班很多很多天这种情况会出现就虽然说可能后面也会就是轮休放假吧但是对于那个身体的消耗太大就是就是国外咨询公司可能在就是劳动法这方面会做得比较好就是哪怕就是这个case很紧急但是还是就是说就是按时的上下班这个样子\n", - "wer_original": 0.9930555555555556, - "wer_generated": 1.0, - "wer_improvement": -0.00694444444444442, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.022222222222222223, + "wer": 1.0, + "mer": 0.022727272727272728 }, { "segment_id": "SEGMENT_ZH-CN_U0062_MIX_20250816_202117_317974", - "audio_path": "SEGMENT_ZH-CN_U0062_MIX_20250816_202117_317974.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0062_MIX_20250816_202117_317974.wav", "ground_truth": "Charlie Puth 嗯,Charlie Puth 他他是在之前感觉挺火的。", - "original_transcription": ": Charlie Puth, (uh), Charlie Puth, 他他是在之前感觉挺火的,\n", "generated_transcription": "\n: Charlie Puth, (uh), Charlie Puth 他他是在之前感觉挺火的\n", - "wer_original": 0.0, - "wer_generated": 0.25, - "wer_improvement": -0.25, - "wer_method": "smart_mixed", - "mer_original": 0.3125 + "cer": 0.058823529411764705, + "wer": 0.4, + "mer": 0.0625 }, { "segment_id": "SEGMENT_ZH-CN_U0043_ENCN_20250816_202117_285987", - "audio_path": "SEGMENT_ZH-CN_U0043_ENCN_20250816_202117_285987.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0043_ENCN_20250816_202117_285987.wav", "ground_truth": "Yeah. 那。 对, 然后小时候觉得, 小时候感觉学语文的时候会经常让我们背很多东西。小时候其实不理解,但是长大了以后觉得挺有帮助的。其实小时候背的一些东西,你在一生都可以记住。然后你以后再跟人交流的时候,可以引经据典,然后然后你显得很博学多才。 The best part of my school days, others too many. But .", - "original_transcription": ": ya, that. : 对, 然后, 小时候觉得, (err), 小时候感觉学语文的时候会经常让我们背很多东西, 小时候其实不理解, 但是长大以后觉得挺有帮助的, 其实, 小时候背的一些东西你一辈子都可以记住, 然后你以后再跟人交流的时候可以, 引经据点, 然后, 让你显得很博学多才, : (oh), the best part of my school days, (oh), there's too many, but (um).\n", "generated_transcription": ": 也那 : 对 : 然后小时候觉得 : 嗯 : 小时候感觉学语文的时候会经常让我们背很多东西 : 嗯 : 小时候其实不理解但是长大以后觉得挺有帮助的其实小时候背的一些东西你一辈子都可以记住然后你以后再跟人交流的时候可以 引经据点然后让你显得很博学多才\n", - "wer_original": 0.9140625, - "wer_generated": 0.2421875, - "wer_improvement": 0.671875, - "wer_method": "smart_mixed", - "mer_original": 0.2421875 + "cer": 0.3624161073825503, + "wer": 0.875, + "mer": 0.1810344827586207 }, { "segment_id": "SEGMENT_ZH-CN_U1062_ENCN_20250816_202117_294629", - "audio_path": "SEGMENT_ZH-CN_U1062_ENCN_20250816_202117_294629.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1062_ENCN_20250816_202117_294629.wav", "ground_truth": "Yeah, but. 直接指,对,直接可以把同学都卷死,你以为我在睡觉,不,我在学习。", - "original_transcription": ": yeah but 直接直接可以把同学都卷死 你以为我在睡觉不我在学习\n", "generated_transcription": "\n: yeah but 直接对直接可以把同学都卷死 你以为我在睡觉不我在学习\n", - "wer_original": 0.0, - "wer_generated": 0.2647058823529412, - "wer_improvement": -0.2647058823529412, - "wer_method": "smart_mixed", - "mer_original": 0.29411764705882354 + "cer": 0.030303030303030304, + "wer": 0.6666666666666666, + "mer": 0.03571428571428571 }, { "segment_id": "SEGMENT_ZH-CN_U1042_MIX_20250816_202117_300758", - "audio_path": "SEGMENT_ZH-CN_U1042_MIX_20250816_202117_300758.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1042_MIX_20250816_202117_300758.wav", "ground_truth": "看来你这边的话安排的话,很多都是跟演唱会相关的话,看很多 concert 日本的话这边有喜欢的乐队吗?", - "original_transcription": ": 嗯看来你这边的话安排的话很多都是跟演唱会相关的话很多concert日本的话这边也有喜欢的乐队嘛\n", "generated_transcription": "\n: 看来你这边的话安排的话很多都是跟演唱会相关的话很多concert日本的话这边有喜欢的乐队嘛\n", - "wer_original": 0.16279069767441862, - "wer_generated": 1.0, - "wer_improvement": -0.8372093023255813, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.043478260869565216, + "wer": 1.0, + "mer": 0.05 }, { "segment_id": "SEGMENT_ZH-CN_U0018_MIX_20250816_202117_316071", - "audio_path": "SEGMENT_ZH-CN_U0018_MIX_20250816_202117_316071.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0018_MIX_20250816_202117_316071.wav", "ground_truth": "但是,嗯后来我我看那边有两个人在玩儿那个 badminton,然后我就想加入他们,那我加入他们的 team 有两个啊 goal,就是第一个就是想 make my boyfriend angry and second,我想我确实想玩儿那个 badminton 嘛,然后呢我就 join 他们的 team,然后我们就三个人一起 play 那个 badminton,然后玩儿了about ten minutes I found myself 很开心,就是以前那个 angry 的情绪也没了,我我也不内耗了,我就感觉 sport 真的可以让人感觉很轻松,而且很 positive energy,对。", - "original_transcription": ": 但是嗯后来我我看那边有两个人在玩那个badminton然后我就想加入他们那我加入他们的team有两个呃goal就是第一个就是想make my boyfriend angry and the second呃我想我确实想玩那个badminton嘛然后呢我就join他们的team然后我们就三个人一起play那个badminton然后玩了about ten minutes I found myself很开心就是以前那个angry的情绪也没了我我也不内耗了我就感觉呃sport真的可以让人感觉很轻松而且很positive energy对\n", "generated_transcription": ": 但是嗯后来我我看那边有两个人在玩那个badminton然后我就想加入他们那我加入他们的team有两个呃goal就是第一个就是想make my boyfriend angry and the second呃我想我确实想玩那个badminton嘛然后呢我就join他们的team然后我们就三个人一起play那个badminton然后玩了about ten minutes 我发现自己很开心就是以前那个angry的情绪也没了我我也不内耗了我就感觉呃sport真的可以让人感觉很轻松而且很positive energy对\n", - "wer_original": 0.038834951456310676, - "wer_generated": 0.941747572815534, - "wer_improvement": -0.9029126213592233, - "wer_method": "smart_mixed", - "mer_original": 0.9223300970873787 + "cer": 0.08366533864541832, + "wer": 0.7741935483870968, + "mer": 0.08108108108108109 }, { "segment_id": "SEGMENT_ZH-CN_U1071_MIX_20250816_202117_271361", - "audio_path": "SEGMENT_ZH-CN_U1071_MIX_20250816_202117_271361.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1071_MIX_20250816_202117_271361.wav", "ground_truth": "我的话 我的话我的话也也也比较也比较那个 比较外向。 但 遇到那种就是 陌生人,或者说甚至说 foreigner you know 外国人也也也也是可以搭上话的。 对。", - "original_transcription": ": 嗯我的话嗯我的话我的话也也也比较也比较那个嗯比较外向嗯但呃遇到那种就是嗯陌生人或者说甚至说foreigner你知道外国人也是可以搭讪的对\n", "generated_transcription": "\n: 嗯我的话嗯我的话我的话也也也比较也比较那个嗯比较外向嗯但呃遇到那种就是嗯陌生人或者说甚至说foreigner 有的外国人也也也也是可以搭讪话的对\n", - "wer_original": 0.0, - "wer_generated": 0.7540983606557377, - "wer_improvement": -0.7540983606557377, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.2, + "wer": 1.0, + "mer": 0.15789473684210525 }, { "segment_id": "SEGMENT_ZH-CN_U0036_ENCN_20250816_202117_343065", - "audio_path": "SEGMENT_ZH-CN_U0036_ENCN_20250816_202117_343065.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0036_ENCN_20250816_202117_343065.wav", "ground_truth": "哇,英国! Oh, I know I know. But you know that the culture in China and abroad are very different. I I I kind of feel scared about it about my future, the campus life in abroad. And I'm afraid that I can't adapt to the foreign life in a totally different culture. And could you give me some comfort? 但是和和更多综合性的大学比起来,我们学校的校园整体面积却非常的小。这就导致了我们的娱乐娱乐场所并不是特别的多。这也是我一直以来觉得很遗憾的一件事情。", - "original_transcription": ": 哇英国哦我知道我知道但是你知道中国和英国的文化非常地不同啊我啊我啊我啊我觉得很害怕关于我的未来啊呃呃呃呃呃呃嗯嗯嗯嗯嗯嗯呃呃呃呃呃啊呃呃呃呃嗯呃呃呃呃嗯啊呃呃呃呃啊嗯呃呃呃呃啊啊呃呃呃呃啊啊啊呃呃呃呃 uh uh uh uh uh uh\n", - "generated_transcription": ": 哎哟英国啊我知道我知道但是你知道中国和英国的文化啊非常地不同啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊��啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我", - "wer_original": 0.0364963503649635, - "wer_generated": 3.759124087591241, - "wer_improvement": -3.7226277372262775, - "wer_method": "smart_mixed", - "mer_original": 0.9635036496350365 + "generated_transcription": ": 哎哟英国啊我知道我知道但是你知道中国和英国的文化啊非常地不同啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我啊我", + "cer": 0.9864864864864865, + "wer": 1.0, + "mer": 1.9770992366412214 }, { "segment_id": "SEGMENT_ZH-CN_U1103_ENCN_20250816_202117_272070", - "audio_path": "SEGMENT_ZH-CN_U1103_ENCN_20250816_202117_272070.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1103_ENCN_20250816_202117_272070.wav", "ground_truth": "很抱歉,很抱歉。 Tonight the football game tonight a Spanish with Italy tonight. Europe Europe, Europe Europe cup Spanish V S Italy.", - "original_transcription": ": (wah) 很抱歉很抱歉 tonight the football game tonight a Spanish with Italy tonight Europe Europe Europe Euro Cup Spanish vs Italy\n", "generated_transcription": "\n: (wah) 很抱歉很抱歉 tonight the football game tonight a Spanish with Italy tonight Europe Europe Europe Euro Cup Spanish vs Italy\n", - "wer_original": 0.3333333333333333, - "wer_generated": 0.4074074074074074, - "wer_improvement": -0.07407407407407407, - "wer_method": "smart_mixed", - "mer_original": 0.4074074074074074 + "cer": 0.05, + "wer": 0.2, + "mer": 0.16 }, { "segment_id": "SEGMENT_ZH-CN_U0006_ENCN_20250816_202117_339903", - "audio_path": "SEGMENT_ZH-CN_U0006_ENCN_20250816_202117_339903.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0006_ENCN_20250816_202117_339903.wav", "ground_truth": "And another from a different translation group. So to make a comparative analysis according to a theory, and then make some conclusion. Yes, that, that's it. 就算上从我家到地铁站,和从地铁站到学校的这段时间,大概要两个小时。", - "original_transcription": ": and another from a different translation group so to make a comparative analysis according to a theory and then make some conclusion. Yes, that that's it. : 就算上从我家到地铁站和从地铁站到学校的这段时间大概要两个小时.\n", "generated_transcription": "\n: and another from a different translation group so to make a comparative analysis according to a theory and then make some conclusion Yes that that's it\n", - "wer_original": 0.559322033898305, - "wer_generated": 0.6949152542372882, - "wer_improvement": -0.13559322033898313, - "wer_method": "smart_mixed", - "mer_original": 0.13559322033898305 + "cer": 0.1935483870967742, + "wer": 0.037037037037037035, + "mer": 0.5357142857142857 }, { "segment_id": "SEGMENT_ZH-CN_U0065_ENCN_20250816_202117_299379", - "audio_path": "SEGMENT_ZH-CN_U0065_ENCN_20250816_202117_299379.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0065_ENCN_20250816_202117_299379.wav", "ground_truth": "Yes, yes when I were. 然后其实我们现在东城的宿舍是要比昌平宽敞一些的,但是整个楼和一些设施会老旧一些,毕��是一九四九年就建了,有已经非常老旧。", - "original_transcription": ": 然后其实我们现在东城的宿舍是要比产品宽敞一些的但是整个楼和一些设施会老旧一些毕竟是一九四九年就建了已经非常老旧\n", "generated_transcription": "\n: 其实我们现在东城的宿舍是要比产品宽敞一些的但是整个楼和一些设施会老旧一些毕竟是一九四九年就建了已经非常老旧\n", - "wer_original": 0.9230769230769231, - "wer_generated": 0.2153846153846154, - "wer_improvement": 0.7076923076923077, - "wer_method": "smart_mixed", - "mer_original": 0.18461538461538463 + "cer": 0.28169014084507044, + "wer": 1.0, + "mer": 0.16393442622950818 }, { "segment_id": "SEGMENT_ZH-CN_U2005_MIX_20250816_202117_352546", - "audio_path": "SEGMENT_ZH-CN_U2005_MIX_20250816_202117_352546.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U2005_MIX_20250816_202117_352546.wav", "ground_truth": "其实我对 B T S 的了解不算特别多。但是我知道他有一首歌叫做 fake love。 我为什么会知道这一首歌呢?是因为我在上一年看了一部韩剧,叫做黑暗荣耀。然后那些各大的 up 主啊,就很喜欢用 fake love 来剪这个视频。", - "original_transcription": ": 其实我对BTS的了解不算特别多但是我知道他有一首歌叫做fake love我为什么会知道这一首歌呢是因为我在上一年看了一部韩剧叫做黑暗荣耀然后那些各大up主啊就很喜欢用fake love来剪这个视频\n", "generated_transcription": "\n其实我对BTS的了解不算特别多但是我知道他有一首歌叫做fake love我为什么会知道这一首歌呢是因为我在上一年看了一部韩剧叫做黑暗荣耀然后那些各大up主啊就很喜欢用fake love来剪这个视频\n", - "wer_original": 0.0, - "wer_generated": 1.0, - "wer_improvement": -1.0, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.010309278350515464, + "wer": 1.0, + "mer": 0.047619047619047616 }, { "segment_id": "SEGMENT_ZH-CN_U1081_ENCN_20250816_202117_319823", - "audio_path": "SEGMENT_ZH-CN_U1081_ENCN_20250816_202117_319823.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1081_ENCN_20250816_202117_319823.wav", "ground_truth": "Boyfriend. It's. 然后坏的方面就是它有点让人容易沉沉迷,容易成瘾。很多人,就是我朋友,他们一刷那个抖音,他们可以在床上躺一天。", - "original_transcription": ": boyfriend it's 然后坏的方面就是它有点让人容易沉沉迷容易沉迷很多人就是我朋友他们一刷那个抖音他们可以在床上躺一天\n", "generated_transcription": "\n: boyfriend it's 然后坏的方面就是他有点让人容易沉沉迷容易沉迷很多人就是我朋友他们一刷那个抖音他们可以在床上躺一天\n", - "wer_original": 0.9642857142857143, - "wer_generated": 0.19642857142857142, - "wer_improvement": 0.7678571428571429, - "wer_method": "smart_mixed", - "mer_original": 0.17857142857142858 + "cer": 0.05, + "wer": 0.3333333333333333, + "mer": 0.06 }, { "segment_id": "SEGMENT_ZH-CN_U1089_ENCN_20250816_202117_324360", - "audio_path": "SEGMENT_ZH-CN_U1089_ENCN_20250816_202117_324360.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1089_ENCN_20250816_202117_324360.wav", "ground_truth": "Wow. 对对对。 ** By the low of case ** security in the phenomena real but in the intelligent ram are where is pray, this is a view of freedom that trans gate experience. 他不仅仅是我们能做什么,而是在某种意义上我们为什么要去这么做?", - "original_transcription": ": (wow), 对对对, governed by the law of c~ c~ c~ c, c, c, c. in a phenomenon realm, but in the ontological realm, R, where is free? This is a view of freedom that trans, that experience, 他不仅仅是, 我们能做什么, 而是在某种意义上, 我们为什么要去这么做.\n", - "generated_transcription": ": 对对对, governed by the law of c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c", - "wer_original": 0.4626865671641791, - "wer_generated": 3.8059701492537314, - "wer_improvement": -3.343283582089552, - "wer_method": "smart_mixed", - "mer_original": 0.40298507462686567 + "generated_transcription": ": 对对对, governed by the law of c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c~ c", + "cer": 0.948051948051948, + "wer": 3.90625, + "mer": 2.0161290322580645 }, { "segment_id": "SEGMENT_ZH-CN_U1076_ENCN_20250816_202117_350210", - "audio_path": "SEGMENT_ZH-CN_U1076_ENCN_20250816_202117_350210.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1076_ENCN_20250816_202117_350210.wav", "ground_truth": "Okay. 说了特别中二的话,没想到有些人吧 一见就是一辈子, 一见那可能就是那几年就是一辈子就是用完了这一生所有的运气吧, 就见完了。 所以嗨现在回想起来吧,就是嗨也年轻过,也青涩过,也中二过。 所以说这个才叫青春吧,才叫回忆嘛, 是不是?", - "original_transcription": ": OK说了特别中二的话没想到有些人吧呃一见就是一辈子呃一见那可能就是那几年就是一辈子就是用完了这一生所有的运气吧呃就见完了唉所以嗨现在回想起来吧就是嗨也年轻过也青涩过也中二过唉所以说这个才叫轻症嘛吧嘛叫回忆嘛是不是\n", "generated_transcription": "\n: OK说了特别中二的话没想到有些人吧呃一见就是一辈子呃一见那可能就是那几年就是一辈子就是用完了这一生所有的运气吧呃就见完了唉所以嗨现在回想起来吧就是嗨也年轻过也青涩过也中二过唉所以说这个才叫轻症嘛吧嘛叫回忆嘛是不是\n", - "wer_original": 0.02727272727272727, - "wer_generated": 1.0, - "wer_improvement": -0.9727272727272728, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.10784313725490197, + "wer": 1.0, + "mer": 0.10101010101010101 }, { "segment_id": "SEGMENT_ZH-CN_U0035_ENCN_20250816_202117_288011", - "audio_path": "SEGMENT_ZH-CN_U0035_ENCN_20250816_202117_288011.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0035_ENCN_20250816_202117_288011.wav", "ground_truth": "I I think I enjoy. 嗯嗯。 And I have a question to you, Kunkun. 嗯。", - "original_transcription": ": I, I think I enjoy. : (hm), (hm), and I have a question to you, kun kun : (hm).\n", "generated_transcription": "\n: I, I think I enjoy (hm), and I have a question to you, kun kun\n", - "wer_original": 1.0, - "wer_generated": 0.5, - "wer_improvement": 0.5, - "wer_method": "smart_mixed", - "mer_original": 0.4444444444444444 + "cer": 0.06818181818181818, + "wer": 0.2, + "mer": 0.25 }, { "segment_id": "SEGMENT_ZH-CN_U2012_ENCN_20250816_202117_305799", - "audio_path": "SEGMENT_ZH-CN_U2012_ENCN_20250816_202117_305799.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U2012_ENCN_20250816_202117_305799.wav", "ground_truth": "I I have watch this squeed game too. And I think that this Korean drama is so fantastic, especially reversal and suspense. I just can't help watching them all the time. 哦哦,好好好,诶,等一下,我好像看到我朋友有有告诉我有一个新的有一个新的歌手,要来这里开一个 现场演出,要不要去?", - "original_transcription": ": 我好像看到我朋友有有告诉我有一个新的有一个新的歌手要来这里开一个现场演出要不要去\n", "generated_transcription": ": 我好像看到我朋友有有告诉我有一个新的有一个新的歌手要来这里开一个呃现场演出要不要去\n", - "wer_original": 0.6436781609195402, - "wer_generated": 0.5517241379310345, - "wer_improvement": 0.09195402298850575, - "wer_method": "smart_mixed", - "mer_original": 0.5402298850574713 + "cer": 0.7857142857142857, + "wer": 1.0, + "mer": 0.5125 }, { "segment_id": "SEGMENT_ZH-CN_U0001_ENCN_20250816_202117_301826", - "audio_path": "SEGMENT_ZH-CN_U0001_ENCN_20250816_202117_301826.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0001_ENCN_20250816_202117_301826.wav", "ground_truth": "Yeah, I get so, and I think the it seems like the renting price is getting higher and higher. Is it? 那你觉得学化学难吗?", - "original_transcription": ": Yeah, I guess so and I think the it seems like the renting price is getting higher and higher, is it 那你觉得学画学难吗\n", "generated_transcription": "\n: Yeah, I guess so and I think the it seems like the renting price is getting higher and higher, is it 那你觉得学画学难吗\n", - "wer_original": 0.3225806451612903, - "wer_generated": 0.22580645161290322, - "wer_improvement": 0.0967741935483871, - "wer_method": "smart_mixed", - "mer_original": 0.22580645161290322 + "cer": 0.047058823529411764, + "wer": 0.09090909090909091, + "mer": 0.06666666666666667 }, { "segment_id": "SEGMENT_ZH-CN_U0060_MIX_20250816_202117_302469", - "audio_path": "SEGMENT_ZH-CN_U0060_MIX_20250816_202117_302469.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0060_MIX_20250816_202117_302469.wav", "ground_truth": "我有, 我有 我有看过一场 about some background musical of some elimination movie, like。", - "original_transcription": ": 我有我有我有看过一场 about some (err) background musical of some elimination movie like.\n", "generated_transcription": "\n: 我有我有我有看过一场 about some (err) background musical of some elimination movie like\n", - "wer_original": 0.7, - "wer_generated": 0.2, - "wer_improvement": 0.49999999999999994, - "wer_method": "smart_mixed", - "mer_original": 0.2 + "cer": 0.04838709677419355, + "wer": 0.3333333333333333, + "mer": 0.05263157894736842 }, { "segment_id": "SEGMENT_ZH-CN_U0012_ENCN_20250816_202117_329157", - "audio_path": "SEGMENT_ZH-CN_U0012_ENCN_20250816_202117_329157.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0012_ENCN_20250816_202117_329157.wav", "ground_truth": "还行吧,它好像对学生有折扣,差不多如果是三个月的话,七百九 七百多块七百多块钱。 Yeah.", - "original_transcription": ": 还行吧,他好像对学生有折扣差不多如果是三个月的话七百九呃七百多块七百多块钱\n", "generated_transcription": "\n: 还行吧他好像对学生有折扣差不多如果是三个月的话七百九呃七百多块七百多块钱\n", - "wer_original": 0.975, - "wer_generated": 0.175, - "wer_improvement": 0.8, - "wer_method": "smart_mixed", - "mer_original": 0.15 + "cer": 0.15384615384615385, + "wer": 1.0, + "mer": 0.08333333333333333 }, { "segment_id": "SEGMENT_ZH-CN_U0035_ENCN_20250816_202117_288080", - "audio_path": "SEGMENT_ZH-CN_U0035_ENCN_20250816_202117_288080.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0035_ENCN_20250816_202117_288080.wav", "ground_truth": "So, do you always travel with your mom? 嗯。", - "original_transcription": ": (uh), so do you always travel with your mom?\n", - "generated_transcription": ": (uh), So do you always travel with your mom?\n", - "wer_original": 1.6, - "wer_generated": 0.4, - "wer_improvement": 1.2000000000000002, - "wer_method": "smart_mixed", - "mer_original": 0.4 + "generated_transcription": "\n: (uh), So do you always travel with your mom?\n", + "cer": 0.0967741935483871, + "wer": 0.2222222222222222, + "mer": 0.2222222222222222 }, { "segment_id": "SEGMENT_ZH-CN_U1056_ENCN_20250816_202117_347248", - "audio_path": "SEGMENT_ZH-CN_U1056_ENCN_20250816_202117_347248.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1056_ENCN_20250816_202117_347248.wav", "ground_truth": "是的是的, 是,对,但是,好, 他们好多人其实不是现在职场很多都说是什么零零后整顿职场嘛。然后就是因为 觉得零零后就这样的心态 什么都不怕,就是我就算去 去工作去实习,然后因为我不害怕,然后我不在意这些。 Yes.", - "original_transcription": ": 是的,是的,嗯,对,但是好,嗯,他们好的人,其实不是现在职场很多都说是什么零零后整顿职场吗,然后就是因为,呃,觉得零零后就这样的心态,啊什么都不怕,就是我就算去啊去工作去实习,然后因为我不害怕,然后我不在意这些,yes.\n", - "generated_transcription": ": 是的 是的嗯对但是好嗯他们好多人其实不是现在职场很多都说是什么零零后整顿职场嘛然后就是因为啊觉得零零后这样的心态啊什么都不怕就是我就算去啊去工作去实习然后因为我不害怕然后我不在意这些 yes\n", - "wer_original": 1.0, - "wer_generated": 0.16326530612244897, - "wer_improvement": 0.8367346938775511, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "generated_transcription": "\n: 是的 是的嗯对但是好嗯他们好多人其实不是现在职场很多都说是什么零零后整顿职场嘛然后就是因为啊觉得零零后这样的心态啊什么都不怕就是我就算去啊去工作去实习然后因为我不害怕然后我不在意这些 yes\n", + "cer": 0.06666666666666667, + "wer": 0.8571428571428571, + "mer": 0.06818181818181818 }, { "segment_id": "SEGMENT_ZH-CN_U1063_MIX_20250816_202117_353447", - "audio_path": "SEGMENT_ZH-CN_U1063_MIX_20250816_202117_353447.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1063_MIX_20250816_202117_353447.wav", "ground_truth": "So from my perspective, I think, you know, I'm from the 长沙 and my university is in my university of technology and a business. So when I introduce my score, maybe my experience of studying English. So I think I have to seeing experience as you because we I don't have ** such a teacher to help me to study English.", - "original_transcription": ": So from my perspective, I think you know I'm from the Shanghai and my university is the Hunan University of Technology and Business. So when I introduced my score, maybe my experience of studying English, so I think I have the same experience as you because we, I don't have such a teacher to help me to study English so.\n", - "generated_transcription": ": So from my perspective I think you know I'm from the Changsha and my university is Hunan University of Technology and Business So when I introduce my score maybe my experience of studying English So I think I have the same experience as you because we I don't have such such a teacher to help me to study English so\n", - "wer_original": 0.0, - "wer_generated": 0.2903225806451613, - "wer_improvement": -0.2903225806451613, - "wer_method": "smart_mixed", - "mer_original": 0.3064516129032258 + "generated_transcription": "\n: So from my perspective I think you know I'm from the Changsha and my university is Hunan University of Technology and Business So when I introduce my score maybe my experience of studying English So I think I have the same experience as you because we I don't have such such a teacher to help me to study English so\n", + "cer": 0.102880658436214, + "wer": 0.13333333333333333, + "mer": 0.14754098360655737 }, { "segment_id": "SEGMENT_ZH-CN_U1077_MIX_20250816_202117_266579", - "audio_path": "SEGMENT_ZH-CN_U1077_MIX_20250816_202117_266579.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1077_MIX_20250816_202117_266579.wav", "ground_truth": "因为在成都这边的那些什么 work work is 也是很少。然后基本上 I have no money more, 然后基本上一个月的 money 基本上就花的差不多了,就没有那么 More money can can do。", - "original_transcription": ": 嗯因为在成都这边的那些什么work work is 也是很少然后基本上我have no money more然后基本上一个月的money基本上就花得差不多了就没有no money more money can can do\n", - "generated_transcription": ": 嗯因为在成都这边的那些什么 work work is 也是很少然后基本上 i have no money more 然后基本上一个月的 money 基本上就花得差不多了就没有 no money more money can\n", - "wer_original": 0.8524590163934426, - "wer_generated": 0.18032786885245902, - "wer_improvement": 0.6721311475409836, - "wer_method": "smart_mixed", - "mer_original": 0.8688524590163934 + "generated_transcription": "\n: 嗯因为在成都这边的那些什么work work is 也是很少然后基本上我 have no money more 然后基本上一个月的money 基本上就花得差不多了就没有 no money more money can\n", + "cer": 0.16129032258064516, + "wer": 0.5555555555555556, + "mer": 0.11864406779661017 }, { "segment_id": "SEGMENT_ZH-CN_U0099_ENCN_20250816_202117_342647", - "audio_path": "SEGMENT_ZH-CN_U0099_ENCN_20250816_202117_342647.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0099_ENCN_20250816_202117_342647.wav", "ground_truth": "Oh I see. 我的家乡啊,我的家乡我就是北京的人,所以说是它 比最中国地最著名的一些地方,就有很多座在北京一些景点,比如说是像是故宫啊,天安门啊,然后圆明园啊之类的,都很大。", - "original_transcription": ": 我的家乡我的家乡我就是北京的人所以说是它比中国最著名的一些地方有很多住在北京的一些景点比如说是像是故宫田园门圆明园之类都很棒\n", - "generated_transcription": ": 我的家乡我的家乡我就是北京的人所以说是它比中国最著名的一些地方有很多住在北京一些景点比如说是像是故宫田园门圆明园之类都很棒\n", - "wer_original": 0.07317073170731707, - "wer_generated": 0.3048780487804878, - "wer_improvement": -0.23170731707317072, - "wer_method": "smart_mixed", - "mer_original": 0.3170731707317073 + "generated_transcription": "\n: 我的家乡我的家乡我就是北京的人所以说是它比中国最著名的一些地方有很多住在北京一些景点比如说是像是故宫田园门圆明园之类都很棒\n", + "cer": 0.2597402597402597, + "wer": 1.0, + "mer": 0.22972972972972974 }, { "segment_id": "SEGMENT_ZH-CN_U1081_ENCN_20250816_202117_319665", - "audio_path": "SEGMENT_ZH-CN_U1081_ENCN_20250816_202117_319665.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1081_ENCN_20250816_202117_319665.wav", "ground_truth": "What? 那你最晚是凌晨两点,那你还是蛮自律的,我觉得我最晚经常都是整夜都没有睡觉。", - "original_transcription": ": 那你最晚是凌晨两点那你还是蛮自律的我觉得我最晚经常都是整夜都没有睡觉\n", - "generated_transcription": ": 那你最晚是凌晨两点那你还是蛮自律的我觉得我最晚经常都是整夜都没有睡觉\n", - "wer_original": 0.07894736842105263, - "wer_generated": 0.10526315789473684, - "wer_improvement": -0.02631578947368421, - "wer_method": "smart_mixed", - "mer_original": 0.10526315789473684 + "generated_transcription": "\n: 那你最晚是凌晨两点那你还是蛮自律的我觉得我最晚经常都是整夜都没有睡觉\n", + "cer": 0.10526315789473684, + "wer": 0.5, + "mer": 0.02857142857142857 }, { "segment_id": "SEGMENT_ZH-CN_U2014_ENCN_20250816_202117_278911", - "audio_path": "SEGMENT_ZH-CN_U2014_ENCN_20250816_202117_278911.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U2014_ENCN_20250816_202117_278911.wav", "ground_truth": "For example? 那你弟弟现在怎么样了呢? Yes. 哇,那你有 ** 吗?", - "original_transcription": ": for example, (oh) 那你弟弟现在怎么样了呢 yes, (wah) 那你有sibling吗\n", - "generated_transcription": ": for example 哦那你弟弟现在怎么样了呢 yes 哇那你有sibling吗\n", - "wer_original": 0.34782608695652173, - "wer_generated": 0.5652173913043478, - "wer_improvement": -0.21739130434782605, - "wer_method": "smart_mixed", - "mer_original": 0.5652173913043478 + "generated_transcription": "\n: for example, 哦那你弟弟现在怎么样了呢 yes 哇那你有sibling吗\n", + "cer": 0.27586206896551724, + "wer": 0.5, + "mer": 0.10526315789473684 }, { "segment_id": "SEGMENT_ZH-CN_U0053_ENCN_20250816_202117_326542", - "audio_path": "SEGMENT_ZH-CN_U0053_ENCN_20250816_202117_326542.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0053_ENCN_20250816_202117_326542.wav", "ground_truth": "那这么说的话,公司可以算就是一个小型的或者说 。 For starters, they are not that expensive, but.", - "original_transcription": ": 那这么说的话公司可以算就是一个小型或者说, (err) for starters, they are not that expensive but\n", - "generated_transcription": ": 那这么说的话公司可以算就是一个小型或者说 for starters 他们现在不那么 expensive 但是\n", - "wer_original": 0.12903225806451613, - "wer_generated": 0.45161290322580644, - "wer_improvement": -0.3225806451612903, - "wer_method": "smart_mixed", - "mer_original": 0.22580645161290322 + "generated_transcription": "\n: 那这么说的话公司可以算就是一个小型或者说 for starters 他们现在不那么 expensive 但是\n", + "cer": 0.3103448275862069, + "wer": 0.6666666666666666, + "mer": 0.3448275862068966 }, { "segment_id": "SEGMENT_ZH-CN_U1072_ENCN_20250816_202117_318996", - "audio_path": "SEGMENT_ZH-CN_U1072_ENCN_20250816_202117_318996.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1072_ENCN_20250816_202117_318996.wav", "ground_truth": "也会做一些运动哈。 Yes, I know. Oh my god. 因为 对对对六人间,它相对来说比较拥挤嘛。", - "original_transcription": ": 也会说一些运动话 : (ah) yes I know (oh) my god 因为对对对六人间他相对来说比较拥挤嘛\n", "generated_transcription": "\n: 也会说一些运动话 : (ah) yes I know (oh) my god 因为对对对年轻人他相对来说比较拥挤\n", - "wer_original": 0.2857142857142857, - "wer_generated": 0.4, - "wer_improvement": -0.11428571428571432, - "wer_method": "smart_mixed", - "mer_original": 0.2857142857142857 + "cer": 0.21951219512195122, + "wer": 0.4444444444444444, + "mer": 0.25 }, { "segment_id": "SEGMENT_ZH-CN_U0088_ENCN_20250816_202117_280691", - "audio_path": "SEGMENT_ZH-CN_U0088_ENCN_20250816_202117_280691.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0088_ENCN_20250816_202117_280691.wav", "ground_truth": "Yeah yeah, yeah,you work harder than me . 而且。", - "original_transcription": ": ya, ya ya. You work harder than me. : Macam.\n", "generated_transcription": "\n: ya ya ya you work harder than me.\n", - "wer_original": 0.09090909090909091, - "wer_generated": 0.8181818181818182, - "wer_improvement": -0.7272727272727273, - "wer_method": "smart_mixed", - "mer_original": 0.8181818181818182 + "cer": 0.24242424242424243, + "wer": 0.625, + "mer": 0.6666666666666666 }, { "segment_id": "SEGMENT_ZH-CN_U0080_ENCN_20250816_202117_292977", - "audio_path": "SEGMENT_ZH-CN_U0080_ENCN_20250816_202117_292977.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0080_ENCN_20250816_202117_292977.wav", "ground_truth": "We focus on environment projects, like reducing plastic use in our school. and we discuss some more fashion or interesting projects and ideas, and sometimes we will. 这听起来不错,但是喝凉茶可能会在吃饭的时候让我胀气或者打嗝,下一次我可以试一试少喝一点,或许会对特别辣有所帮助。", - "original_transcription": ": 我们会关注环境项目,比如减少塑料使用,我们也会讨论一些更时尚或有趣的项目和想法,有时候我们会听起来不错,但是喝凉茶可能会在吃饭的时候让我胀气或者打嗝,下次我可以试一试少喝一点,或许会对特别辣有所帮助。\n", "generated_transcription": ": 我们会关注环境项目,比如减少塑料使用,我们也会讨论一些更时尚或有趣的项目和想法,有时候我们会听起来不错,但是喝凉茶可能会在吃饭的时候让我胀气或者打嗝\n", - "wer_original": 0.1566265060240964, - "wer_generated": 0.8795180722891566, - "wer_improvement": -0.7228915662650601, - "wer_method": "smart_mixed", - "mer_original": 0.5662650602409639 + "cer": 0.8556149732620321, + "wer": 1.0, + "mer": 0.8481012658227848 }, { "segment_id": "SEGMENT_ZH-CN_U0038_ENCN_20250816_202117_323847", - "audio_path": "SEGMENT_ZH-CN_U0038_ENCN_20250816_202117_323847.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0038_ENCN_20250816_202117_323847.wav", "ground_truth": "零,零心得 。 Yeah, true. Grammar to be bit like the pronunciations is generally harder than English or any other languages, I think.", - "original_transcription": ": (hm), ling ling xin de (um), ya, true, (uh), grammatical speaking, but like the pronunciation of the Asians is generally harder than English or any other languages, I think.\n", "generated_transcription": "\n: (hm), ling ling xin de (um), ya true, (uh), grammatical speaking, but like the pronunciation of Asians is generally harder than English or any other languages, I think.\n", - "wer_original": 0.23076923076923078, - "wer_generated": 0.5769230769230769, - "wer_improvement": -0.3461538461538461, - "wer_method": "smart_mixed", - "mer_original": 0.6153846153846154 + "cer": 0.4, + "wer": 0.6666666666666666, + "mer": 0.5833333333333334 }, { "segment_id": "SEGMENT_ZH-CN_U0015_MIX_20250816_202117_268594", - "audio_path": "SEGMENT_ZH-CN_U0015_MIX_20250816_202117_268594.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0015_MIX_20250816_202117_268594.wav", "ground_truth": "Hello,我也是,嗯,那咱们先自我介绍一下呗,我叫阿瑞亚,然后今年二十二,现在在北京念书。然后你呢?", - "original_transcription": ": 你好我也是嗯那咱们先自我介绍一下呗我叫阿利亚然后今年二十二现在在北京念书然后你呢\n", "generated_transcription": "\n: 你好我也是嗯那咱们先自我介绍一下呗我叫阿利亚然后今年二十二现在在北京念书然后你呢\n", - "wer_original": 25.0, - "wer_generated": 40.0, - "wer_improvement": -15.0, - "wer_method": "smart_mixed", - "mer_original": 40.0 + "cer": 0.13953488372093023, + "wer": 1.0, + "mer": 0.07692307692307693 }, { "segment_id": "SEGMENT_ZH-CN_U0043_ENCN_20250816_202117_285972", - "audio_path": "SEGMENT_ZH-CN_U0043_ENCN_20250816_202117_285972.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0043_ENCN_20250816_202117_285972.wav", "ground_truth": "Yeah, you know thing. But another thing is like sometimes change the place you live is easier than, well, to change another job, because sometimes good job is really hard to find. 对。", - "original_transcription": ": ya. : Can you understand? But another thing is like sometimes change the place you live is easier than well to change another job because sometimes a good job is really hard to find. : then.\n", "generated_transcription": "\n: ya : Then you said, but another thing is like sometimes change the place you live is easier than well to change another job because sometimes a good job is really hard to find.\n", - "wer_original": 0.058823529411764705, - "wer_generated": 0.3235294117647059, - "wer_improvement": -0.2647058823529412, - "wer_method": "smart_mixed", - "mer_original": 0.29411764705882354 + "cer": 0.0979020979020979, + "wer": 0.18181818181818182, + "mer": 0.18181818181818182 }, { "segment_id": "SEGMENT_ZH-CN_U1042_ENCN_20250816_202117_300833", - "audio_path": "SEGMENT_ZH-CN_U1042_ENCN_20250816_202117_300833.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1042_ENCN_20250816_202117_300833.wav", "ground_truth": "You know, we are in our department. We have some concert guest from Germany. They often invite us to watch some soccer game for the Bayern Munich. And for the last week, the Bayern Munich versus Real Madrid, semifinal of the Europe championship. Okay, that's very intensive. 这个其实这个其实 因为对于那个日韩这边的话,可能 印象的话也不是那么挺深刻的。其实欧美的话这边的话还是比较喜欢一些,比如说像这些读大学,喜欢一些朋克啊之类的。 但后对后面的话就是说开始上班之后的话,其实后面的话压力又比较大。平时的话可能大多数时间的话。", - "original_transcription": ": 你知道我们在我们的公寓里我们有几个来自德国的客人他们经常邀请我们去观看一些足球比赛为拜仁慕尼黑和前周拜仁慕尼黑与Real Madrid半决赛欧洲冠军联赛Okay那次比赛非常激烈Okay这个其实这个其实因为对于那个日韩之类的话可能印象的话不是那么深刻的���实欧美这边的话还是比较喜欢一些比如说像这些读大学喜欢一些朋克啊之类的啊但是后对后面的话就是说开始上班之后的话其实后面的话压力又比较大平时的话可能大部分的时间的话\n", "generated_transcription": ": 你知道我们在apartment我们有consulate guest from germany他们经常邀请我们看一些soccer game for the bundesliga and for the last week the bundesliga versus real madrid a semi final of the Europe championship okay that's very intensive okay 这个其实这个其实因为对于那个日韩这边的话可能印象的话不是那么深刻的其实欧美这边的话还是比较喜欢一些比如说像这些读大学喜欢一些朋克啊之类的啊然后对后面的话就是说开始上班之后的话其实后面的话压力又比较大平时的话可能大部分时间的话\n", - "wer_original": 0.5235294117647059, - "wer_generated": 0.29411764705882354, - "wer_improvement": 0.22941176470588237, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.2822822822822823, + "wer": 0.6078431372549019, + "mer": 0.2345679012345679 }, { "segment_id": "SEGMENT_ZH-CN_U0055_ENCN_20250816_202117_336722", - "audio_path": "SEGMENT_ZH-CN_U0055_ENCN_20250816_202117_336722.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0055_ENCN_20250816_202117_336722.wav", "ground_truth": "我还没有哎,我其实一直很想去,但是就一直缺乏一个合适的机会,就要不然就是比较忙,要不然的话就是金钱比较有限。就是目前还没有就是没有去过这种,但是我真的每次在朋友圈看他们就是分享的时候,我都好羡慕他们。 Yeah, yeah, it's a good. It's a very good suggestions. I think I maybe I'll do it when I go to my dormitory, and I may be stay. I maybe start the next day.", - "original_transcription": ": 我还没有,我其实一直很想去,但是就一直缺乏一个合适的机会,要不然就是比较忙,要不然就是金钱比较有限,就是目前还没有,就是没有去过这种,但是我真的每次在朋友圈看他们就是分享的时候,我好羡慕他们,: Yeah, yeah, it's a good, it's a very good suggestion. I think I, maybe I'll do it when I go to my dormitory and I maybe stay, I maybe start the next day.\n", "generated_transcription": ": 我还没有, 我其实一直很想去, 但是就一直缺乏一个合适的机会, 要不然就是比较忙, 要不然的话就是金钱比较有限, 就是目前还没有, 就是没有去过这种, 但是我真的每次在朋友圈看他们就是分享的时候, 我都好羡慕他们, : Yeah, yeah, it's a good, it's a very good suggestion, I think I, maybe I'll do it when I go to my dormitory and I maybe stay, I maybe start the next day.\n", - "wer_original": 0.7835820895522388, - "wer_generated": 0.1417910447761194, - "wer_improvement": 0.6417910447761195, - "wer_method": "smart_mixed", - "mer_original": 0.8134328358208955 + "cer": 0.014705882352941176, + "wer": 0.34285714285714286, + "mer": 0.03968253968253968 }, { "segment_id": "SEGMENT_ZH-CN_U1002_ENCN_20250816_202117_286944", - "audio_path": "SEGMENT_ZH-CN_U1002_ENCN_20250816_202117_286944.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1002_ENCN_20250816_202117_286944.wav", "ground_truth": "But but Pilate you learn Pilate, you have to have like money, because learning Pilate is quite expensive. 对。", - "original_transcription": ": But slap by platty you learn platty you have to have like money because learning platty is quite expensive.\n", "generated_transcription": "\n: But slap by platty you learn platty you have to have like money because learning platty is quite expensive.\n", - "wer_original": 1.7, - "wer_generated": 0.4, - "wer_improvement": 1.2999999999999998, - "wer_method": "smart_mixed", - "mer_original": 0.4 + "cer": 0.1744186046511628, + "wer": 0.3157894736842105, + "mer": 0.3157894736842105 }, { "segment_id": "SEGMENT_ZH-CN_U0055_MIX_20250816_202117_336322", - "audio_path": "SEGMENT_ZH-CN_U0055_MIX_20250816_202117_336322.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0055_MIX_20250816_202117_336322.wav", "ground_truth": "我最近听的比较多的是 Melody。 确实,但但我觉得就是是 a a few years ago,就是在我觉得还挺早之前,就杨洋还不知道他自己就是非常 handsome 的时候,他其实那个时候还没那么油腻,就当时看他跳舞的时候,觉得都还挺好的。", - "original_transcription": ": 我最近听的比较多的是melody, 确实, 但当我觉得就是就是, (ah) few years ago, 就是在我觉得还挺早之前, 就是杨杨还不知道她自己就是非常handsome 的时候, 她其实那个时候还没那么油腻, 就当时看她跳舞的时候觉得都还挺好的.\n", "generated_transcription": "\n: 我最近听的比较多的是melody 确实但当我觉得就是就是 (ah) few years ago 就是在我觉得还挺早之前杨杨还不知道她自己就是非常handsome 的时候她其实那个时候还没那么油腻就当时看她跳舞的时候觉得都还挺好的\n", - "wer_original": 0.9692307692307692, - "wer_generated": 0.36923076923076925, - "wer_improvement": 0.6, - "wer_method": "smart_mixed", - "mer_original": 0.5384615384615384 + "cer": 0.0761904761904762, + "wer": 0.8, + "mer": 0.10588235294117647 }, { "segment_id": "SEGMENT_ZH-CN_U0022_MIX_20250816_202117_331950", - "audio_path": "SEGMENT_ZH-CN_U0022_MIX_20250816_202117_331950.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0022_MIX_20250816_202117_331950.wav", "ground_truth": "因为就是可能有一些,可能更 prefer work at night, 或者其实整个寝室里边儿一个作息就是这样子。所以早上我是不喜欢,,就是起的太早的。就是对于工作这个其实也一样的。就我的话可能,嗯,你像九点的话,确实还是有一点太早,可能我 prefer 九点半,或者直接十点上班,然后到下午可能五六点下班,对我来说可能是比较 suitable schedule。 当然我需要在嗯中午有一些午休的时间。", - "original_transcription": ": 然后下午可能五六点下班对我来说可能是比较合适的日程当然我需要在中午有一些午休的时间\n", "generated_transcription": ": 然后下午可能五六点下班对我来说可能是比较 suitable schedule\n", - "wer_original": 0.17834394904458598, - "wer_generated": 0.8662420382165605, - "wer_improvement": -0.6878980891719746, - "wer_method": "smart_mixed", - "mer_original": 0.7898089171974523 + "cer": 0.7942857142857143, + "wer": 0.8181818181818182, + "mer": 0.8461538461538461 }, { "segment_id": "SEGMENT_ZH-CN_U0072_ENCN_20250816_202117_275959", - "audio_path": "SEGMENT_ZH-CN_U0072_ENCN_20250816_202117_275959.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0072_ENCN_20250816_202117_275959.wav", "ground_truth": "I I sometimes would go to play with my teammates or roommates. But I hurt my bones the week before, and I have a little P T S D for the basketball. So I won't to touch it until the next month or sometimes. 对。", - "original_transcription": "I sometimes would go to play with my teammates or roommates, but I hurt my bones the week before, and I have a little PTSD for the basketball, so I wanted to touch it until the next month or sometimes, 对.\n", "generated_transcription": "\nI sometimes would go to play with my teammates or roommates, but (uh), I hurt my bones the week before (uh), and (uh), I have a little PTSD for the basketball, so I wanted to to touch it (uh) until (hm) the next month or sometimes (uh) 对\n", - "wer_original": 0.0, - "wer_generated": 0.43478260869565216, - "wer_improvement": -0.43478260869565216, - "wer_method": "smart_mixed", - "mer_original": 0.2608695652173913 + "cer": 0.11392405063291139, + "wer": 0.28888888888888886, + "mer": 0.28888888888888886 }, { "segment_id": "SEGMENT_ZH-CN_U1074_ENCN_20250816_202117_351979", - "audio_path": "SEGMENT_ZH-CN_U1074_ENCN_20250816_202117_351979.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1074_ENCN_20250816_202117_351979.wav", "ground_truth": "鱼 , 不知道你说的是那种马步鱼啊,那种一一一小点的那种鱼,还是那种大鱼。大鱼的话可能不太好烤。 Yeah yeah yeah.", - "original_transcription": ": 鱼哦嗯不知道你说的是那种马步鱼啊那种一一小点的那种鱼还是那种大鱼大鱼的话可能不太好好好yeah yeah yeah\n", "generated_transcription": "\n: 鱼哦嗯不知道你说的是那种马步鱼啊那种一一小点的那种鱼还是那种大鱼大鱼的话可能不太好\n", - "wer_original": 0.9387755102040817, - "wer_generated": 0.22448979591836735, - "wer_improvement": 0.7142857142857143, - "wer_method": "smart_mixed", - "mer_original": 0.9795918367346939 + "cer": 0.3018867924528302, + "wer": 1.0, + "mer": 0.1590909090909091 }, { "segment_id": "SEGMENT_ZH-CN_U1007_MIX_20250816_202117_293163", - "audio_path": "SEGMENT_ZH-CN_U1007_MIX_20250816_202117_293163.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1007_MIX_20250816_202117_293163.wav", "ground_truth": "然后就会买,但一个可能也就不知道大概多少钱,可能就几十块钱吧,可能也 忘记了,就是不是记得特别清楚,就有点 confused。 然后那个时候就听着那个 M P three 听着他的歌,然后就是带着那个 戴着耳机,然后在路上还是在学校里面,都会可能就会偷偷的听他的歌。", - "original_transcription": ": 然后就会买带一个可能也就不知道大概多少钱可能就几十块钱吧可能也忘记了就是不是记得特别清楚就是有点confuse然后那个时候就听着那个mp three听着他的歌然后就是戴着那个嗯戴着耳机然后在路上还是在学校里面都会可能就会偷偷地听他的歌\n", "generated_transcription": "\n然后就会买带一个可能也就不知道大概多少钱可能就几十块钱吧可能也忘记了就是不是记得特别清楚就是有点 confused 然后那个时候就听着那个 Mp three 听着他的歌然后就是戴那个嗯戴着耳机然后在路上还是在学校里面都会可能就会偷偷地听他的歌\n", - "wer_original": 0.02654867256637168, - "wer_generated": 0.1592920353982301, - "wer_improvement": -0.1327433628318584, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.05217391304347826, + "wer": 0.6666666666666666, + "mer": 0.07692307692307693 }, { "segment_id": "SEGMENT_ZH-CN_U1101_ENCN_20250816_202117_307053", - "audio_path": "SEGMENT_ZH-CN_U1101_ENCN_20250816_202117_307053.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1101_ENCN_20250816_202117_307053.wav", "ground_truth": "Yeah, nice to meet you. 就是那个对。 Yes. 所以你的老家是在也是在 所以你上学老家,一直就是一直在河北,没有出,没有出过河北省。", - "original_transcription": ": yeah nice to meet you : 就是那个对 yes : 嗯哦所以你的老家是在也是在呃是你上学老家一直就是一直在河北没有出没有出过河北省\n", "generated_transcription": "\n: yeah nice to meet you : 就是那个对 yes 嗯哦所以你的老家是在也是在呃是你上学老家一直就是一直在河北没有出没有出过河北省\n", - "wer_original": 0.39622641509433965, - "wer_generated": 0.20754716981132076, - "wer_improvement": 0.18867924528301888, - "wer_method": "smart_mixed", - "mer_original": 0.20754716981132076 + "cer": 0.06451612903225806, + "wer": 0.2222222222222222, + "mer": 0.08333333333333333 }, { "segment_id": "SEGMENT_ZH-CN_U1002_MIX_20250816_202117_286737", - "audio_path": "SEGMENT_ZH-CN_U1002_MIX_20250816_202117_286737.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1002_MIX_20250816_202117_286737.wav", "ground_truth": "哦,我觉我用的是 iPhone。 但是我觉得醒图的话,它有一个 bug, 就是就比如说美图秀秀有些 V I P 才能用的嘛。你可以就是把那个屏幕退出来,然后截图。但是醒图的话,它就不能。", - "original_transcription": ": 哦我觉得我用的是iphone但是我觉得醒图的话它有一个bug就是就是比如说美图秀秀有些VIP才能用的嘛你可以就是把那个屏幕退出来然后截图但是醒图的话它就不能\n", "generated_transcription": "\n: 哦我觉得我用的是iphone但是我觉得醒图的话它有一个bug就是就比如说美图秀秀有些VIP才能用的嘛你可以就是把那个屏幕退出来然后截图但是醒图的话它就不能\n", - "wer_original": 0.9868421052631579, - "wer_generated": 1.0, - "wer_improvement": -0.013157894736842146, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.013157894736842105, + "wer": 1.0, + "mer": 0.057971014492753624 }, { "segment_id": "SEGMENT_ZH-CN_U1038_MIX_20250816_202117_329550", - "audio_path": "SEGMENT_ZH-CN_U1038_MIX_20250816_202117_329550.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1038_MIX_20250816_202117_329550.wav", "ground_truth": "Yeah, I I think coffee is great。 但是 就是睡眠质量非常的不好。", - "original_transcription": ": yeah and I think coffee is great 但是嗯就是睡眠质量非常地不好\n", - "generated_transcription": ": yeah and I think coffee is great 但是嗯就是睡眠质量非常地不好\n", - "wer_original": 0.7142857142857143, - "wer_generated": 0.2857142857142857, - "wer_improvement": 0.4285714285714286, - "wer_method": "smart_mixed", - "mer_original": 0.2857142857142857 + "generated_transcription": "\n: yeah and I think coffee is great 但是嗯就是睡眠质量非常地不好\n", + "cer": 0.13513513513513514, + "wer": 0.3333333333333333, + "mer": 0.15 }, { "segment_id": "SEGMENT_ZH-CN_U0005_ENCN_20250816_202117_348302", - "audio_path": "SEGMENT_ZH-CN_U0005_ENCN_20250816_202117_348302.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0005_ENCN_20250816_202117_348302.wav", "ground_truth": "这个。 Yes. 是在大大麦网那种吗? Drama,yeah, opera like like the T V play .", - "original_transcription": ": 这个是的在大卖网那种吗 drama yeah opera like like the TV play\n", - "generated_transcription": ": 这个 yes 是在大大卖网那种吗 drama yeah opera like like the TV play\n", - "wer_original": 0.8695652173913043, - "wer_generated": 0.34782608695652173, - "wer_improvement": 0.5217391304347826, - "wer_method": "smart_mixed", - "mer_original": 0.43478260869565216 + "generated_transcription": "\n: 这个 yes 是在大大卖网那种吗 drama yeah opera like like the TV play\n", + "cer": 0.022222222222222223, + "wer": 0.45454545454545453, + "mer": 0.25 }, { "segment_id": "SEGMENT_ZH-CN_U1049_ENCN_20250816_202117_304218", - "audio_path": "SEGMENT_ZH-CN_U1049_ENCN_20250816_202117_304218.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1049_ENCN_20250816_202117_304218.wav", "ground_truth": "Okay. Can you introduce more ? 啊。 嗯,你去看了吗?", - "original_transcription": ": Okay, can you introduce more? : (oh), 你去看了吗\n", - "generated_transcription": ": Okay, can you introduce more? : (oh), 你去看了吗\n", - "wer_original": 0.625, - "wer_generated": 0.5625, - "wer_improvement": 0.0625, - "wer_method": "smart_mixed", - "mer_original": 0.5625 + "generated_transcription": "\n: Okay, can you introduce more? : (oh), 你去看了吗\n", + "cer": 0.06666666666666667, + "wer": 0.2857142857142857, + "mer": 0.16666666666666666 }, { "segment_id": "SEGMENT_ZH-CN_U0074_ENCN_20250816_202117_277972", - "audio_path": "SEGMENT_ZH-CN_U0074_ENCN_20250816_202117_277972.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0074_ENCN_20250816_202117_277972.wav", "ground_truth": "Well, I just like go to the gym by myself and do some jogging. 并且这个宣传是有一定的积分和稿酬奖励的,所以也是公司为了鼓励大家多做宣传的原因。", - "original_transcription": ": 嗯嗯嗯我就是喜欢自己去健身房做一些运动并且这个宣传是有有一定的积分和稿酬奖励的所以也是公司为了鼓励大家多做宣传的原因\n", - "generated_transcription": ": 嗯 well 我就是喜欢自己去健身房做一些瑜伽 并且这个宣传是有有一定的积分和搞酬劳的所以也是公司为了鼓励大家多做宣传的原因\n", - "wer_original": 0.3888888888888889, - "wer_generated": 0.4444444444444444, - "wer_improvement": -0.055555555555555525, - "wer_method": "smart_mixed", - "mer_original": 0.4074074074074074 + "generated_transcription": "\n: 嗯 well 我就是喜欢自己去健身房做一些瑜伽 并且这个宣传是有有一定的积分和搞酬劳的所以也是公司为了鼓励大家多做宣传的原因\n", + "cer": 0.5647058823529412, + "wer": 1.0, + "mer": 0.40384615384615385 }, { "segment_id": "SEGMENT_ZH-CN_U0057_MIX_20250816_202117_267212", - "audio_path": "SEGMENT_ZH-CN_U0057_MIX_20250816_202117_267212.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0057_MIX_20250816_202117_267212.wav", "ground_truth": "It's a natural process I think, so we need to 接受 the artificial intelligence. 就这样吧。然后如果是奖励自己的话,我还是觉得就是最后那个最好的结果就是的 best gift.", - "original_transcription": ": it's a natural process I think so we need to 接受 the artificial intelligence : 就这样吧然后如果是奖励自己的话我还是觉得就是最后那个最好的结果就是 the best gift\n", - "generated_transcription": ": it's a natural process I think so we need to 接受 the artificial intelligence : 就这样吧然后如果是奖励自己的话我还是觉得就是最后那个最好的结果就是 the best gift\n", - "wer_original": 0.05660377358490566, - "wer_generated": 0.1320754716981132, - "wer_improvement": -0.07547169811320754, - "wer_method": "smart_mixed", - "mer_original": 0.1320754716981132 + "generated_transcription": "\n: it's a natural process I think so we need to 接受 the artificial intelligence : 就这样吧然后如果是奖励自己的话我还是觉得就是最后那个最好的结果就是 the best gift\n", + "cer": 0.02912621359223301, + "wer": 0.11764705882352941, + "mer": 0.0196078431372549 }, { "segment_id": "SEGMENT_ZH-CN_U1060_ENCN_20250816_202117_294887", - "audio_path": "SEGMENT_ZH-CN_U1060_ENCN_20250816_202117_294887.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1060_ENCN_20250816_202117_294887.wav", "ground_truth": "Oh, yes, I like music. And my favorite singer is Li Ronghao, and I often spend. 对,他可以说是前无古人,也非常哪有后来来者吧,后有来者吧 。", - "original_transcription": ": (oh) yes I like music and my favorite singer is Li Ronghao and I often spend 对他可以说是前无古人也很难有后来者吧后来者吧\n", - "generated_transcription": ": 哦 yes 我喜欢 music 和 my favorite singer 是 李荣浩 and 我 often spend 对 他可以说是前无古人也非常难得后来者吧后来者吧\n", - "wer_original": 0.6444444444444445, - "wer_generated": 0.4666666666666667, - "wer_improvement": 0.1777777777777778, - "wer_method": "smart_mixed", - "mer_original": 0.3333333333333333 + "generated_transcription": "\n: 哦 yes 我喜欢 music 和 my favorite singer 是 李荣浩 and 我 often spend 对 他可以说是前无古人也非常难得后来者吧后来者吧\n", + "cer": 0.30952380952380953, + "wer": 0.5882352941176471, + "mer": 0.34146341463414637 }, { "segment_id": "SEGMENT_ZH-CN_U0043_MIX_20250816_202117_285602", - "audio_path": "SEGMENT_ZH-CN_U0043_MIX_20250816_202117_285602.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0043_MIX_20250816_202117_285602.wav", "ground_truth": "对,但是我我非常我我非常能理解你说的这种感觉。因为 sometimes well, if you force to do something or you like, you made to do something. And then like well, you kind of think well I don't really want to do this. But well, if if you have a choice to do that, you know you more inclined to do something。", - "original_transcription": ": 对但是我我非常我我非常能理解你说这种感觉因为, sometimes, well, if you forced to do something or you like you made to do something, well, and then like well, you kind of saying, well, I don't really want to do this, but well, if if you have a choice to do that, you know you more inclined to do something.\n", - "generated_transcription": ": 对但是我我非常我我非常能理解你说这种感觉因为, sometimes, well, if you forced to do something or you like you might do something, well, and then like well, you kind of saying, well, I don't really want to do this, but well, if if you have a choice to do that, you know you more inclined to do something.\n", - "wer_original": 1.131578947368421, - "wer_generated": 0.2236842105263158, - "wer_improvement": 0.9078947368421052, - "wer_method": "smart_mixed", - "mer_original": 0.19736842105263158 + "generated_transcription": "\n: 对但是我我非常我我非常能理解你说这种感觉因为, sometimes, well, if you forced to do something or you like you might do something, well, and then like well, you kind of saying, well, I don't really wanna do this, but well, if if you have a choice to do that, you know you more inclined to do something\n", + "cer": 0.08018867924528301, + "wer": 0.15384615384615385, + "mer": 0.10810810810810811 }, { "segment_id": "SEGMENT_ZH-CN_U1045_ENCN_20250816_202117_330015", - "audio_path": "SEGMENT_ZH-CN_U1045_ENCN_20250816_202117_330015.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1045_ENCN_20250816_202117_330015.wav", "ground_truth": "我们当时其实不是从成都出发的,然后是从我老公的爸爸的家里面出发。他们在重庆,对,我们从重庆出发的话,我记得是开了有三天的样子,对,我们中途还去其他的地方玩了。但是我现在已经记不得了。对,因为是前年的事情, 我感觉路上好像都比那个地方要 体验,好像要更好一点。 Well, so you you guys have the same hobby, right?", - "original_transcription": ": 我们当时其实不是从成都出发然后是从我老公的爸爸的家里出发他们在重庆对我们从重庆出发的话我记得是开了有三天的样子对我们中途还去其他的地方玩了但是我现在已经记不到了对因为是前年的事情嗯我感觉路上好像都比那个地方要体验好像要更好一点哇所以你们你们有同的爱好对\n", - "generated_transcription": ": 我们当时其实不是从成都出发然后是从我老公的爸爸的家里出发他们在重庆对我们从重庆出发的话我记得是开了有三天的样子对我们中途还去其他的地方玩了但是我现在已经记不到了对因为是前年的事情嗯我感觉路上好像都比那个地方要体验好像要更好一点 : 哇 so you you guys have the same hobby right\n", - "wer_original": 0.9343065693430657, - "wer_generated": 0.1386861313868613, - "wer_improvement": 0.7956204379562044, - "wer_method": "smart_mixed", - "mer_original": 0.20437956204379562 + "generated_transcription": "\n: 我们当时其实不是从成都出发然后是从我老公的爸爸的家里出发他们在重庆对我们从重庆出发的话我记得是开了有三天的样子对我们中途还去其他的地方玩了但是我现在已经记不到了对因为是前年的事情嗯我感觉路上好像都比那个地方要体验好像要更好一点 : 哇 so you you guys have the same hobby right\n", + "cer": 0.052980132450331126, + "wer": 0.3076923076923077, + "mer": 0.04032258064516129 }, { "segment_id": "SEGMENT_ZH-CN_U0027_ENCN_20250816_202117_350757", - "audio_path": "SEGMENT_ZH-CN_U0027_ENCN_20250816_202117_350757.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0027_ENCN_20250816_202117_350757.wav", "ground_truth": "No. 对,因为我也是一直在考虑这个问题,虽然我的工作经验可能没有那么多,但是也是积累了一些在不同企业,不同工作环境的一些经验。所以给我的感觉就是说可能一直工作下去,对于我来说就是我一直是一个乙方是一件非常被动的事情。但是可能现在在大环境不好的情况下,可能大家普遍的选择还是希望能够有一份稳定的工作。但是我也特别同意您刚才说的,就是可能在拥有一份稳定工作的同时,还是要去想自己如果遇到了一些大的事情的时候,怎么去抵御风险,提高自己在这方面的对抗能力,可能会比我们只扎根于给别人打工挣钱来的更稳定,或更安全一些。", - "original_transcription": ": 对因为我也是一直在考虑这个问题虽然我的工作经验可能没有那么多但是也是积累了一些在不同企业不同工作环境的一些经验所以给我的感觉就是说可能一直工作下去对于我来说就是我一直是一个乙方是一件非常被动的事情但是可能现在在大环境不好的情况下可能大家普遍的选择还是希望能够有一份稳定的工作但是我也特别同意您刚才说的就是可能在拥有一份稳定的工作的同时还是要去想自己如果遇到了一些大的事情的时候怎么去抵御风险提高自己在这方面的对抗能力可能会比我们只扎根于给别人打工挣钱来得更稳定或更安全一些\n", "generated_transcription": ": 对因为我也是一直在考虑这个问题虽然我的工作经验可能没有那么多但是也是积累了一些在不同企业不同工作环境的一些经验所以给我的感觉就是说可能一直工作下去对于我来说就是我一直是一个乙方是一件非常被动的事情但是可能现在在大环境不好的情况下可能大家普遍的选择还是希望能够有一份稳定的工作但是我也特别同意您刚才说的就是可能在拥有一份稳定的工作的同时还是要去想自己如果遇到了一些大的事情的时候怎么去抵御风险提高自己在这方面的对抗能力可能会比我们只扎根于给别人打工挣钱来得更稳定或更安全一些\n", - "wer_original": 1.0, - "wer_generated": 0.07539682539682539, - "wer_improvement": 0.9246031746031746, - "wer_method": "smart_mixed", - "mer_original": 0.07539682539682539 + "cer": 0.016877637130801686, + "wer": 1.0, + "mer": 0.012711864406779662 }, { "segment_id": "SEGMENT_ZH-CN_U1014_ENCN_20250816_202117_284857", - "audio_path": "SEGMENT_ZH-CN_U1014_ENCN_20250816_202117_284857.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1014_ENCN_20250816_202117_284857.wav", "ground_truth": "我买的时候已经降价很多了,像是我买的这款游戏机的话,它是宝可梦珠的限定款。它在刚发售的时候,它都卖到快三千了。但是现在的话我是减了,等于说红利,我买的时候才一千七。 The songs , yes. I'm a bad singer.", - "original_transcription": ": 嗯我买的时候已经降价很多像是我买的这款有些急冻它是宝可梦周限定款它在刚发售的时候它都卖到快三千了但是现在的话我是捡了等于说红利我买的时候才一千七 : the songs yes i'm a bad singer\n", "generated_transcription": "\n: 嗯我买的时候已经降价很多像是我买了这款有些急冻它是宝可梦周限定款他在刚发售的时候它都卖到快三千了但是现在的话我是捡了等于说红利我买的时候才一千七 : the songs yes 我是bad singer\n", - "wer_original": 0.17777777777777778, - "wer_generated": 0.3, - "wer_improvement": -0.1222222222222222, - "wer_method": "smart_mixed", - "mer_original": 0.25555555555555554 + "cer": 0.15463917525773196, + "wer": 0.5, + "mer": 0.1728395061728395 }, { "segment_id": "SEGMENT_ZH-CN_U1002_ENCN_20250816_202117_287497", - "audio_path": "SEGMENT_ZH-CN_U1002_ENCN_20250816_202117_287497.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1002_ENCN_20250816_202117_287497.wav", "ground_truth": "对,她直接把菜刀拿过来了,好搞笑啊。 That's awesome. I'm trying to get into a weekly exercise one time, too. What kind of running do you do?", - "original_transcription": ": 对他直接把菜都拿过来了好搞笑啊 : 哦那太棒了我正试图进入一个wiggler exercise routine呢你跑什么类型的跑步\n", "generated_transcription": "\n: 对他直接把菜都拿过来了好搞笑啊 : 哦that's awesome I'm trying to get into a jogger exercise routine too what kind of a running do you do\n\n", - "wer_original": 0.5526315789473685, - "wer_generated": 0.3684210526315789, - "wer_improvement": 0.18421052631578955, - "wer_method": "smart_mixed", - "mer_original": 0.631578947368421 + "cer": 0.15053763440860216, + "wer": 0.2857142857142857, + "mer": 0.2 }, { "segment_id": "SEGMENT_ZH-CN_U0029_MIX_20250816_202117_297745", - "audio_path": "SEGMENT_ZH-CN_U0029_MIX_20250816_202117_297745.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0029_MIX_20250816_202117_297745.wav", "ground_truth": "嗯,啊好像好像不是,哎,teams 是那个属于微软 microsoft。 WeChat 的话就是他 V 就是比较 convenient, 然后也比较 straight, 比较直接。因为 because my work I 我宁经常需要跟 candidate 去 communication. So when I 去加一些候选的话 ,I first 我可能会比较考虑的话,就是 WeChat 嘛比较方便,因为 because 是 everyone 都会有的东西。", - "original_transcription": ": 因为我的工作我经常需要跟candidate去communication, 所以, (err), when I 去加一些后选人的话, (err), I first, 我可能会比较考虑的话就是微信吧, 比较方便, 因为, 因为, everyone都会有的东西.\n", "generated_transcription": ": 因为我工作我经常需要跟 candidate 去 communication 所以 when 我去加一些后选的话 我可能会比较考虑的话\n", - "wer_original": 0.9032258064516129, - "wer_generated": 0.7096774193548387, - "wer_improvement": 0.19354838709677413, - "wer_method": "smart_mixed", - "mer_original": 0.7634408602150538 + "cer": 0.7219251336898396, + "wer": 0.875, + "mer": 0.7142857142857143 }, { "segment_id": "SEGMENT_ZH-CN_U2012_ENCN_20250816_202117_305795", - "audio_path": "SEGMENT_ZH-CN_U2012_ENCN_20250816_202117_305795.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U2012_ENCN_20250816_202117_305795.wav", "ground_truth": "尊重老师和 老师建立良好的关系。 应该可以吧,挺好的。 Yes. I mean, there are a lot of course in college life. And the course is, it is very flexible to some extent, except some compulsory courses, you can choose many voluntary courses, But remember when choosing those courses you need to balance the the difficulties and the easiness and your own interest.", - "original_transcription": ": 尊重老师和老师建立良好的关系,应该可以吧,挺好的。\n", "generated_transcription": ": 尊重老师和老师建立良好的关系,应该可以吧,挺好的\n: Yes, I mean there are a lot of courses in college life and the courses is is very flexible to some extent, except some compulsory courses you can choose many voluntary courses, but remember\n: when choosing those courses you need to balance the difficulties and the easiness and your own interests", - "wer_original": 0.0, - "wer_generated": 0.1794871794871795, - "wer_improvement": -0.1794871794871795, - "wer_method": "smart_mixed", - "mer_original": 0.6923076923076923 + "cer": 0.026415094339622643, + "wer": 0.14285714285714285, + "mer": 0.06666666666666667 }, { "segment_id": "SEGMENT_ZH-CN_U1091_ENCN_20250816_202117_306771", - "audio_path": "SEGMENT_ZH-CN_U1091_ENCN_20250816_202117_306771.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1091_ENCN_20250816_202117_306771.wav", "ground_truth": "do you have someone you really like? 嗯。 Yeah, that's really bad. And but I I want you. Do you have some your? Do you have some teacher you really like or you really hate about? 嗯。", - "original_transcription": ": Do you have someone you really like?\n", "generated_transcription": "\n: Do you have someone you really like\n", - "wer_original": 0.0, - "wer_generated": 0.8461538461538461, - "wer_improvement": -0.8461538461538461, - "wer_method": "smart_mixed", - "mer_original": 0.8461538461538461 + "cer": 0.7835820895522388, + "wer": 0.8108108108108109, + "mer": 0.8108108108108109 }, { "segment_id": "SEGMENT_ZH-CN_U1028_ENCN_20250816_202117_326903", - "audio_path": "SEGMENT_ZH-CN_U1028_ENCN_20250816_202117_326903.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1028_ENCN_20250816_202117_326903.wav", "ground_truth": "I I mean, we need to work. 哦。 To have the internship. 我们办公室的老师大多数都是我们大学的毕业生。因为我我所在那个中学嘛,他是 我们大学的一个附属的中学,所以说很多老师都是我们的毕业生,他们都是我们的学姐学长,就很好沟通。", - "original_transcription": ": 嗯 I I mean we need to work : 嗯 to have the internship : 我们办公室的老师大多数都是我们大学的毕业生因为我我走在那个中学嘛它是呃我们大学的一个附属的中学所以说很多老师都是我们的毕业生他们都是我们的学界学长就很好沟通\n", "generated_transcription": "\n: 嗯 I mean we need to work : 嗯 to have the internship : 我们办公室的老师大多数都是我们大学的毕业生因为我我走在那个中学吗它是呃我们大学的一个附属的中学所以说很多老师都是我们的毕业生他们都是我们的学界学长就很好沟通\n", - "wer_original": 0.23958333333333334, - "wer_generated": 0.1875, - "wer_improvement": 0.05208333333333334, - "wer_method": "smart_mixed", - "mer_original": 0.17708333333333334 + "cer": 0.06086956521739131, + "wer": 0.2857142857142857, + "mer": 0.07865168539325842 }, { "segment_id": "SEGMENT_ZH-CN_U0039_ENCN_20250816_202117_351280", - "audio_path": "SEGMENT_ZH-CN_U0039_ENCN_20250816_202117_351280.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0039_ENCN_20250816_202117_351280.wav", "ground_truth": "Oh, oh that's that's quite amazing. So you're both a ** a daughter ** for as as a daughter, and you're both among that's pretty cool. 嗯。", - "original_transcription": ": (oh), (oh), that's that's quite amazing. So you are (uh), both a a ch~ a daughter for as as a daughter, and you are both a mom. That's pretty cool. (hm).\n", "generated_transcription": "\n: (oh), (oh), that's that's quite amazing. So you are (uh), both a a ch~ a daughter for as as a daughter and you are both a mom. That's pretty cool.\n", - "wer_original": 0.07142857142857142, - "wer_generated": 0.5714285714285714, - "wer_improvement": -0.5, - "wer_method": "smart_mixed", - "mer_original": 0.5357142857142857 + "cer": 0.10416666666666667, + "wer": 0.4, + "mer": 0.4 }, { "segment_id": "SEGMENT_ZH-CN_U0016_ENCN_20250816_202117_303679", - "audio_path": "SEGMENT_ZH-CN_U0016_ENCN_20250816_202117_303679.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0016_ENCN_20250816_202117_303679.wav", "ground_truth": "我觉得可能是我用了跑步姿势不对吧,然后跑久了就会感觉 脚板痛,还是说我有扁平足吗? No. Eight, eight? But but I think that, that will be much larger than ours you know.", - "original_transcription": ": 我觉得可能是我用的跑步姿势不对吧然后跑久了就会感觉脚踝痛还说我有变平足吗, No, eight eight, but but I think that that will be much larger than ours, you know.\n", "generated_transcription": "\n: 我觉得可能是我用的跑步姿势不对吧然后跑久了就会感觉脚踝痛还说我有变平足吗 No, eight eight, but but I think that that will be much larger than ours, you know.\n", - "wer_original": 0.43859649122807015, - "wer_generated": 0.22807017543859648, - "wer_improvement": 0.21052631578947367, - "wer_method": "smart_mixed", - "mer_original": 0.22807017543859648 + "cer": 0.04, + "wer": 0.10526315789473684, + "mer": 0.07407407407407407 }, { "segment_id": "SEGMENT_ZH-CN_U0082_MIX_20250816_202117_305943", - "audio_path": "SEGMENT_ZH-CN_U0082_MIX_20250816_202117_305943.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0082_MIX_20250816_202117_305943.wav", "ground_truth": "That's very interesting . 因为 虽然我是北京人啊,但是 like all those things that Beijing people recommend you . 我都不太喜欢,像什么爆肚啊 ,什么卤煮啊,都 so growth I I don't like them growth, 我我我不太喜欢吃内脏,我我 因为腥 腥了吧唧的。", - "original_transcription": ": that's very interesting. (um) 因为 (um) 虽然我是北京人 (ah) 但是 like all those things that Beijing people recommend you (um) 我都不太喜欢像什么爆肚啊 (um) 什么卤煮啊 those so gross I I don't like them gross 我我不太喜欢吃内脏我我我不太喜欢吃内臟因为 (um) 新了巴吉了\n", "generated_transcription": "\n: that's very interesting. (um) 因为 (um) 虽然我是北京人 (ah) 但是 like all those things that Beijing people recommend you (um) 我都不太喜欢像什么爆肚 (ah) (uh) 什么卤煮 (ah) those so gross I I don't like them gross 我我不太喜欢吃内脏 (um) 因为 (um) 心了巴吉的\n", - "wer_original": 0.8076923076923077, - "wer_generated": 0.3076923076923077, - "wer_improvement": 0.5, - "wer_method": "smart_mixed", - "mer_original": 0.358974358974359 + "cer": 0.22758620689655173, + "wer": 0.7037037037037037, + "mer": 0.24285714285714285 }, { "segment_id": "SEGMENT_ZH-CN_U0082_MIX_20250816_202117_305979", - "audio_path": "SEGMENT_ZH-CN_U0082_MIX_20250816_202117_305979.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0082_MIX_20250816_202117_305979.wav", "ground_truth": "牛街 is a good place to eat those 。 因为我们是对我们 we design as three D but 生产 needs two D 所以我们设计完了我们 need to transfer it to two D 所以生产才能生产 才能做这个零件。", - "original_transcription": ": (ah) 牛街, is a good place to eat those (err) 因为, 我们是对我们, we design as three D, but 生产 needs two D, 所以我们设计完了我们, (err) need to transfer it to two D, 所以生产才能生产, (err) 才能做这个零件\n", "generated_transcription": "\n: (ah) 牛街, 是一个好地方, eat those, (err) 因为, 我们是对, 我们, we design as three D, but 生产 needs two D, 所以我们设计完了我们, (err) need to transfer it to two D, 所以生产才能生产, (err) 才能做这个零件\n", - "wer_original": 0.6451612903225806, - "wer_generated": 0.3548387096774194, - "wer_improvement": 0.29032258064516125, - "wer_method": "smart_mixed", - "mer_original": 0.20967741935483872 + "cer": 0.22018348623853212, + "wer": 0.3793103448275862, + "mer": 0.16666666666666666 }, { "segment_id": "SEGMENT_ZH-CN_U0089_ENCN_20250816_202117_324022", - "audio_path": "SEGMENT_ZH-CN_U0089_ENCN_20250816_202117_324022.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0089_ENCN_20250816_202117_324022.wav", "ground_truth": "Just for now, I just had you saying that. Yes. I also like the tomato fried egg. What you said, just bring me a lot of memories. You know, like when I was a child, I also have my mom used to made me tomatoes fried egg. So she will put in the tomatoes first. And she said, that will be very tasty. But I do also remember that when I went to other cities, they do have other ways to cook the tomato fried egg. 因为很多时候就是,它最早是依赖于我们的现实生活,比如说我发的照片也好,或者是我的个人行动也好,像你说的步数也好,就是它是依赖于我现实的生活的。但是随着时间的变化,其实我又很,我的情绪价值往往又源自于我这社交媒体,比如说我发了一个朋友圈,有多少人喜欢,有多少人评论。", - "original_transcription": ": 他就依赖于我们的现实生活比如说我发的照片也好或者是我的个人行动也好像你说的部署也好就是他依赖于我现实生活的但是随着时间的变化其实我又很我的情绪价值往往又源自于我的社交媒体比如说我发了一个朋友圈有多少人喜欢有多少人评论\n", "generated_transcription": ": 他就依赖于我们的现实生活比如说我发的照片也好或者是我的个人行动也好像你说的部署也好就是他依赖于我现实生活的但是随着时间的变化其实我又很我的情绪价值往往又源自于我这社交媒体比如说我发了一个朋友圈有多少人喜欢有多少人评论\n", - "wer_original": 0.5412844036697247, - "wer_generated": 0.5229357798165137, - "wer_improvement": 0.01834862385321101, - "wer_method": "smart_mixed", - "mer_original": 0.5275229357798165 + "cer": 0.7570093457943925, + "wer": 1.0, + "mer": 0.49514563106796117 }, { "segment_id": "SEGMENT_ZH-CN_U0095_ENCN_20250816_202117_351616", - "audio_path": "SEGMENT_ZH-CN_U0095_ENCN_20250816_202117_351616.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0095_ENCN_20250816_202117_351616.wav", "ground_truth": "Yeah, and sometimes I will buy the albums. The C D online. 对。", - "original_transcription": ": yeah, (um), and sometimes I would buy the albums the Cd online. 对.\n", "generated_transcription": "\n: yeah, (um), and sometimes I will buy the albums the Cd online, 对\n", - "wer_original": 1.0, - "wer_generated": 0.5714285714285714, - "wer_improvement": 0.4285714285714286, - "wer_method": "smart_mixed", - "mer_original": 0.5714285714285714 + "cer": 0.044444444444444446, + "wer": 0.23076923076923078, + "mer": 0.23076923076923078 }, { "segment_id": "SEGMENT_ZH-CN_U0012_ENCN_20250816_202117_329357", - "audio_path": "SEGMENT_ZH-CN_U0012_ENCN_20250816_202117_329357.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0012_ENCN_20250816_202117_329357.wav", "ground_truth": "Yes, yes, that. 有,就是像我们那个楼的话,十一点十五,晚上那个在门口那个宿管阿姨,她就会把门给关掉。但是如果说你要那个时候进的话也没问题,就是刷个卡或者登记一下就可以了。", - "original_transcription": ": yes yes that : 有就是像我们那个楼的话十一点十五晚上那个在门口那个宿管阿姨她就会把门给关掉但是如果说你要那个时候进的话也没问题就是刷个卡或者登记一下就可以了\n", "generated_transcription": "\n: yes yes 那 : 有就是像我们那个楼的话十一点十五晚上那个在门口那个宿管阿姨她就会把门给关掉但是如果说你要那个时候进的话也没问题就是刷个卡或者登记一下就可以了\n", - "wer_original": 0.9625, - "wer_generated": 0.125, - "wer_improvement": 0.8375, - "wer_method": "smart_mixed", - "mer_original": 0.125 + "cer": 0.05, + "wer": 0.25, + "mer": 0.0136986301369863 }, { "segment_id": "SEGMENT_ZH-CN_U1053_ENCN_20250816_202117_324658", - "audio_path": "SEGMENT_ZH-CN_U1053_ENCN_20250816_202117_324658.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1053_ENCN_20250816_202117_324658.wav", "ground_truth": "这个有,因为我现在 有喜欢的明星,然后我们会经常组织在一起唱歌,然后一起 一起玩游戏之类的。因因为他们跟我 也比较说的上话嘛,所以我觉得能够认识他们也很好。 We, we really travel with my family.", - "original_transcription": ": 嗯这个有因为我现在嗯嗯有喜欢的明星然后我们会经常组织在一起唱歌然后一起嗯一起玩游戏之类的因因为他们跟我嗯也比较说得得上话嘛所以我觉得能够认识他们也很好\n", "generated_transcription": ": 嗯 这个有因为我现在嗯有喜欢的明星然后我们会经常组织在一起唱歌然后一起嗯一起玩游戏之类的因因为他们跟我嗯也比较说得得上话嘛所以我觉得能够认识他们也很好\n", - "wer_original": 0.024390243902439025, - "wer_generated": 0.23170731707317074, - "wer_improvement": -0.20731707317073172, - "wer_method": "smart_mixed", - "mer_original": 0.24390243902439024 + "cer": 0.35051546391752575, + "wer": 1.0, + "mer": 0.17105263157894737 }, { "segment_id": "SEGMENT_ZH-CN_U0064_MIX_20250816_202117_334983", - "audio_path": "SEGMENT_ZH-CN_U0064_MIX_20250816_202117_334983.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0064_MIX_20250816_202117_334983.wav", "ground_truth": "嗯,对,我看您也是就是北京的那您有时候就是 recommend 这一些 place 去玩吗?我感觉现在天天每天就在家,因为我是一个很典型的 homegirl,我感觉就是找不着什么很好玩的地方。", - "original_transcription": ": 嗯对我看您也是就是北京的您有什么就recommend的这些place去玩嘛我感觉现在天天每天就在家因为我是一个很典型的homegirl我感觉就是找不到什么很好玩的地方\n", "generated_transcription": "\n: 嗯对我看您也是就是北京的那您有什么就recommend的这些place去玩嘛我感觉现在天天每天就在家因为我是一个很典型的homegirl我感觉就是找不到什么很好玩的地方\n", - "wer_original": 0.9814814814814815, - "wer_generated": 1.0, - "wer_improvement": -0.01851851851851849, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.08235294117647059, + "wer": 1.0, + "mer": 0.10606060606060606 }, { "segment_id": "SEGMENT_ZH-CN_U0064_ENCN_20250816_202117_335456", - "audio_path": "SEGMENT_ZH-CN_U0064_ENCN_20250816_202117_335456.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0064_ENCN_20250816_202117_335456.wav", "ground_truth": "对。 Well, because I'm not the person who can do these kind of sports like basketball, football, because I think is, is it's a little bit dangerous. You have to kick a ball or.", - "original_transcription": ": 对, Well, because I'm not the person who can do this kind of sports like basketball football, because I think is is (um), it's a little bit dangerous. You have to kick a ball or.\n", "generated_transcription": "\n: 对 Well, because I'm not the person who can do this kind of sports like basketball football, because I think is is (um), it's a little bit dangerous. You have to kick a ball or\n", - "wer_original": 1.457142857142857, - "wer_generated": 0.17142857142857143, - "wer_improvement": 1.2857142857142856, - "wer_method": "smart_mixed", - "mer_original": 0.14285714285714285 + "cer": 0.030303030303030304, + "wer": 0.058823529411764705, + "mer": 0.058823529411764705 }, { "segment_id": "SEGMENT_ZH-CN_U1055_ENCN_20250816_202117_345009", - "audio_path": "SEGMENT_ZH-CN_U1055_ENCN_20250816_202117_345009.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1055_ENCN_20250816_202117_345009.wav", "ground_truth": "As I just said, my passion is playing basketball or basketball, but the professional athletes, the N B A players often serve as role models for us, showcasing discipline , dedication and perseverance in the game they play. 我明白的,不仅仅是这样吧,像你跑步也很好呀,晚上跑一点步,然后听听音乐,我觉得还是挺不错的一项运动,至少不会花很多时间,然后要提前计划什么的,我觉得这样反而还好一点。因为运动嘛,我觉得每天运动才是最重要的。你偶尔去一次爬山的话。", - "original_transcription": ": 我喜欢打篮球,但我认为专业运动员,NBA球员,经常是我们的榜样,他们展现出纪律,奉献和在比赛中坚持不懈的精神。嗯,我明白的,不仅仅是这样吧,像你跑步也很好呀,晚上跑一点步,然后听听音乐,我觉得还是挺不错的一种运动,至少不会花很多时间,然后要提前计划什么的,我觉得这样反而还好一点,因为运动嘛,我觉得每天运动才是最重要的,你偶尔去一次爬山的话\n", "generated_transcription": ": 我说我刚才说我的激情是玩篮球或篮球但是专业运动员的NBA球员经常是我们的榜样展示着纪律啊奉献和在他们玩的游戏中坚持不懈的 : 嗯我明白的不仅仅是这样吧像你跑步也很好呀晚上跑一点步然后听听音乐我觉得还是挺不错的一些运动至少不会花很多时间然后要提前计划什么的我觉得这样反而还好一点因为运动嘛我觉得每天运动才是最重要的你偶尔去一次爬山的话\n", - "wer_original": 0.75, - "wer_generated": 0.3355263157894737, - "wer_improvement": 0.4144736842105263, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.6298932384341637, + "wer": 1.0, + "mer": 0.4316546762589928 }, { "segment_id": "SEGMENT_ZH-CN_U1094_ENCN_20250816_202117_321305", - "audio_path": "SEGMENT_ZH-CN_U1094_ENCN_20250816_202117_321305.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1094_ENCN_20250816_202117_321305.wav", "ground_truth": "Oh, all right, so Bobby was talking to Michelle, and I was talking to Emily, and it was going just fine. And she was a really nice girl and also Lisa. I think she is a very super nice girl and still friends with her now. But I mean, in the span of maybe five seconds from when we started talking to them to when this happen. 这不在丽江,他在底几个地方都取了景的。", - "original_transcription": ": But I mean in the span of maybe five seconds from when we started talking to them to when this happened.\n", "generated_transcription": ": But she didn't go to jail\n", - "wer_original": 0.22093023255813954, - "wer_generated": 0.9767441860465116, - "wer_improvement": -0.7558139534883721, - "wer_method": "smart_mixed", - "mer_original": 0.7790697674418605 + "cer": 0.9360902255639098, + "wer": 0.9705882352941176, + "mer": 0.9761904761904762 }, { "segment_id": "SEGMENT_ZH-CN_U0021_ENCN_20250816_202117_261700", - "audio_path": "SEGMENT_ZH-CN_U0021_ENCN_20250816_202117_261700.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0021_ENCN_20250816_202117_261700.wav", "ground_truth": "是的是的。 In my imagination, I prefer work .", - "original_transcription": ": 是的 是的 In my imagination I prefer work\n", "generated_transcription": "\n: 是的 是的 In my imagination I prefer work\n", - "wer_original": 1.1666666666666667, - "wer_generated": 0.25, - "wer_improvement": 0.9166666666666667, - "wer_method": "smart_mixed", - "mer_original": 0.25 + "cer": 0.0, + "wer": 0.2857142857142857, + "mer": 0.0 }, { "segment_id": "SEGMENT_ZH-CN_U1064_MIX_20250816_202117_339993", - "audio_path": "SEGMENT_ZH-CN_U1064_MIX_20250816_202117_339993.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1064_MIX_20250816_202117_339993.wav", "ground_truth": "是的,然后我发现就是如果我们只是单纯的 focus on myself 的话,其实也不也是觉得不够的。因为你知道人他是一种社会性的动物,就像马克思说的,对吧?如果我们只专注于自己的话,那我们会发现 这个 word 反而好像并没有那么的 colorful。 所以我觉得有时候我们应该 focus on myself, 有时候我们应该 focus on others, focus on the environment 呀。", - "original_transcription": ": (hm) 是的然后我发现就是如果我们只单纯的, focus on myself 的话其实我也是觉得���够的因为你知道人它是一种社会性的动物就像马克思说的对吧如果我们只专注于自己的话那么我会发现, (err) 这个 world 似乎并没有那么的 colorful 所以我觉得有时候我们应该, focus on myself, 有时候我们应该, focus on others, (err) focus on the environment, (err)\n", "generated_transcription": "\n嗯是的然后我发现就是如果我们只单纯的focus on myself的话其实我也是觉得不够的因为你知道人它是一种社会性的动物就像马克思说的对吧如果我们只专注于自己的话那么我会发现诶这个world好像并没有那么的colorful所以我觉得有时候我们应该focus on myself有时候我们应该focus on othersfocus on the environment啊\n", - "wer_original": 0.0, - "wer_generated": 0.9612403100775194, - "wer_improvement": -0.9612403100775194, - "wer_method": "smart_mixed", - "mer_original": 0.24031007751937986 + "cer": 0.06741573033707865, + "wer": 0.7727272727272727, + "mer": 0.11570247933884298 }, { "segment_id": "SEGMENT_ZH-CN_U1050_MIX_20250816_202117_299634", - "audio_path": "SEGMENT_ZH-CN_U1050_MIX_20250816_202117_299634.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1050_MIX_20250816_202117_299634.wav", "ground_truth": "我也有看过那个电影,那个 movie 他是。 具有具有观赏性的 competition,对吧?", - "original_transcription": ": 我也有看过那个电影那个movie它是具有观赏性的competition对吧\n", "generated_transcription": "\n: 我也有看过那个电影那个movie它是具有观赏性的competition对吧\n", - "wer_original": 0.32, - "wer_generated": 1.0, - "wer_improvement": -0.6799999999999999, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.07692307692307693, + "wer": 1.0, + "mer": 0.12 }, { "segment_id": "SEGMENT_ZH-CN_U0060_ENCN_20250816_202117_302901", - "audio_path": "SEGMENT_ZH-CN_U0060_ENCN_20250816_202117_302901.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0060_ENCN_20250816_202117_302901.wav", "ground_truth": "What about you? 是的,这就是观鸟给我的,这个是提到观鸟我想到的事情,还有。", - "original_transcription": ": what about you, 是的, 这就是关鸟给我的, 这就是提到关鸟我想到的事情, 还有\n", "generated_transcription": "\n: what about you, 是的, 这就是关鸟给我的, 这就是提到关鸟我想到的事情, 还有\n", - "wer_original": 0.09375, - "wer_generated": 0.28125, - "wer_improvement": -0.1875, - "wer_method": "smart_mixed", - "mer_original": 0.28125 + "cer": 0.08108108108108109, + "wer": 1.0, + "mer": 0.10714285714285714 }, { "segment_id": "SEGMENT_ZH-CN_U1043_ENCN_20250816_202117_293789", - "audio_path": "SEGMENT_ZH-CN_U1043_ENCN_20250816_202117_293789.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1043_ENCN_20250816_202117_293789.wav", "ground_truth": "Oh, my god. Okay, I don't. I don't want to talk about my work, talk about yours. 因为中国市场。", - "original_transcription": ": (oh) my god okay I don't I don't want to talk about my work talk about yours because China market\n", "generated_transcription": "\n: (oh) my god okay I don't I don't wanna talk about my work talk about yours because China market\n", - "wer_original": 0.2916666666666667, - "wer_generated": 0.625, - "wer_improvement": -0.3333333333333333, - "wer_method": "smart_mixed", - "mer_original": 0.5416666666666666 + "cer": 0.3387096774193548, + "wer": 0.2777777777777778, + "mer": 0.34782608695652173 }, { "segment_id": "SEGMENT_ZH-CN_U1068_MIX_20250816_202117_288642", - "audio_path": "SEGMENT_ZH-CN_U1068_MIX_20250816_202117_288642.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1068_MIX_20250816_202117_288642.wav", "ground_truth": "对,就就比如说像像 music festival 就是这样,因为其实一开始我挺挺挺怕的,你知道吧?有有点 afraid。就是感觉就是就是那种music festival,就是那些潮人,就是 very fashionable people。然后就感觉自己只是一个 normal people。就不应该去这种地方,就 那个丢人现眼了���", - "original_transcription": ": 对就比如说像像music festival就这样因为其实一开始我挺挺挺怕的你知道吧有点afraid就是感觉就是就是那种music festival就是那些潮人就是very fashionable people然后就感觉自己只是一个normal people就不应该去这种地方就呃那个丢人现眼了\n", "generated_transcription": "\n: 对就比如说像像music festival就这样因为其实一开始我挺挺挺怕的你知道吧有点afraid就是感觉就是就是那种music festival就是那些潮人very fashionable people然后就感觉自己只是一个normal people就不应该去这种地方就呃那个丢人现眼了\n", - "wer_original": 1.3018867924528301, - "wer_generated": 0.9811320754716981, - "wer_improvement": 0.320754716981132, - "wer_method": "smart_mixed", - "mer_original": 0.9811320754716981 + "cer": 0.04195804195804196, + "wer": 0.9166666666666666, + "mer": 0.06818181818181818 }, { "segment_id": "SEGMENT_ZH-CN_U1067_ENCN_20250816_202117_301263", - "audio_path": "SEGMENT_ZH-CN_U1067_ENCN_20250816_202117_301263.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1067_ENCN_20250816_202117_301263.wav", "ground_truth": "Yeah, I do. I like read books, listen musics and do some sports. And I mark. 对对对。", - "original_transcription": ": yeah i do i like reading books listening music and do some sports and i am a\n", "generated_transcription": "\n: yeah i do i like reading books listen music and do some sports and i am\n", - "wer_original": 1.45, - "wer_generated": 0.75, - "wer_improvement": 0.7, - "wer_method": "smart_mixed", - "mer_original": 0.8 + "cer": 0.1694915254237288, + "wer": 0.23529411764705882, + "mer": 0.3157894736842105 }, { "segment_id": "SEGMENT_ZH-CN_U0043_ENCN_20250816_202117_285981", - "audio_path": "SEGMENT_ZH-CN_U0043_ENCN_20250816_202117_285981.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0043_ENCN_20250816_202117_285981.wav", "ground_truth": "Yeah, like like in exactly like enjoy the the movie, and like like like realistic way, almost like well you're there, you know, well everything is happening. Well I actually had that experience a few years back when they have the what they call like Four D or Five D movie. And they put you in this like seat. And then the the seat goes like like roller coaster goes up and down it shakes and like it's brace water on you as well. And yeah, that that's pretty cool. I don't know if you ever try those like Three D or like Four D Five D movies before? 对,那你平时工作和学习中,你觉得就是对电子产品的使用多吗?", - "original_transcription": ": yeah like like exactly like enjoy the movie in like like like realistic way almost like while you are there you know while everything is happening well actually had that experience a few years back when they have the what they call like four D or five D movie and they put you in this like seat and then the seat goes like like roller coaster goes up and down it shakes and like sprays water on you as well : yeah that's pretty cool I don't know if you ever try those like three D I like four D or fiveD movies before : 对那你平时工作和学习中你觉得就是对电子产品的使用多吗\n", "generated_transcription": ": yeah like like exactly like enjoy the movie in like like like realistic way almost like while you are there you know while everything is happening well actually had that experience a few years back when they have the what they call like four D or five D movie and they put you in this like seat and then the seat goes like like roller coaster goes up and down it shakes and like sprays water on you as well : yeah that that's pretty cool I don't know if you ever try those like three D like four D or five D movies before : 对那你平时工作和学习中你觉得就是对电子产品的使用多吗\n", - "wer_original": 0.21739130434782608, - "wer_generated": 0.26811594202898553, - "wer_improvement": -0.05072463768115945, - "wer_method": "smart_mixed", - "mer_original": 0.2753623188405797 + "cer": 0.06637168141592921, + "wer": 0.12727272727272726, + "mer": 0.1037037037037037 }, { "segment_id": "SEGMENT_ZH-CN_U0004_MIX_20250816_202117_320692", - "audio_path": "SEGMENT_ZH-CN_U0004_MIX_20250816_202117_320692.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0004_MIX_20250816_202117_320692.wav", "ground_truth": "And effected by my colleagues。I become even more emotional,so I can create more generate more 那个 creative writings 。", - "original_transcription": ": And affected by my colleagues, I become more even more emotional so I can create more, (uh), generate more, 那个, creative writings.\n", "generated_transcription": "\n: And affected by my colleagues, I become more even more emotional so I can create more generate more creative writings\n", - "wer_original": 0.2, - "wer_generated": 0.45, - "wer_improvement": -0.25, - "wer_method": "smart_mixed", - "mer_original": 0.6 + "cer": 0.07368421052631578, + "wer": 0.3888888888888889, + "mer": 0.42105263157894735 }, { "segment_id": "SEGMENT_ZH-CN_U0040_ENCN_20250816_202117_283792", - "audio_path": "SEGMENT_ZH-CN_U0040_ENCN_20250816_202117_283792.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0040_ENCN_20250816_202117_283792.wav", "ground_truth": "Would you have any job preference? No. 对对对。", - "original_transcription": ": Do you have any job preference? : (um), no, 对对对.\n", "generated_transcription": "\n: Do you have any job preference? : (um), no, 对对对\n", - "wer_original": 0.36363636363636365, - "wer_generated": 0.36363636363636365, - "wer_improvement": 0.0, - "wer_method": "smart_mixed", - "mer_original": 0.36363636363636365 + "cer": 0.18181818181818182, + "wer": 0.25, + "mer": 0.2 }, { "segment_id": "SEGMENT_ZH-CN_U0037_MIX_20250816_202117_308257", - "audio_path": "SEGMENT_ZH-CN_U0037_MIX_20250816_202117_308257.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0037_MIX_20250816_202117_308257.wav", "ground_truth": "嗯,其实我看过这部电影啊,哦,他是不是是不是叫英文名是不是叫 knives out 是吧? 哦,是不是那个那个那个零零七演的? 哦。", - "original_transcription": ": 嗯嗯其实我看过这部电影啊它是不是是不是叫英文名是不是叫knife out是吧哦是不是那个那个那个零零七演的哦\n", - "generated_transcription": ": 嗯嗯其实我看过这部电影啊它是不是是不是叫英文名是不是叫knife out 是吧哦是不是那个那个那个零零七演的\n", - "wer_original": 0.12962962962962962, - "wer_generated": 0.6666666666666666, - "wer_improvement": -0.537037037037037, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "generated_transcription": "\n: 嗯嗯其实我看过这部电影啊它是不是是不是叫英文名是不是叫knife out是吧哦是不是那个那个那个零零七演的\n", + "cer": 0.1111111111111111, + "wer": 1.0, + "mer": 0.10638297872340426 }, { "segment_id": "SEGMENT_ZH-CN_U2007_MIX_20250816_202117_309614", - "audio_path": "SEGMENT_ZH-CN_U2007_MIX_20250816_202117_309614.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U2007_MIX_20250816_202117_309614.wav", "ground_truth": "你也喜欢 那个韩国流行的 music 啊,他他的简称好像是叫做 K pop 是吧?", - "original_transcription": ": (oh) 你也喜欢那个韩国流行的没有谁啊他他的 концер好像是叫做, Kpop, 是吧,\n", - "generated_transcription": ": (oh) 你也喜欢那个韩国流行的没有谁啊他他的 концер很像是叫做, Kpop, 是吧\n", - "wer_original": 0.10344827586206896, - "wer_generated": 0.5172413793103449, - "wer_improvement": -0.4137931034482759, - "wer_method": "smart_mixed", - "mer_original": 0.4827586206896552 + "generated_transcription": "\n: (oh) 你也喜欢那个韩国流行的没有谁啊他他的 концер很像是叫做, Kpop 是吧\n", + "cer": 0.30303030303030304, + "wer": 0.8571428571428571, + "mer": 0.3333333333333333 }, { "segment_id": "SEGMENT_ZH-CN_U1065_ENCN_20250816_202117_315093", - "audio_path": "SEGMENT_ZH-CN_U1065_ENCN_20250816_202117_315093.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1065_ENCN_20250816_202117_315093.wav", "ground_truth": "那你就是打算就是什么时候开始运动啊?刚才��们聊了那么多。 I know I know but before besides this, the can you run with me?", - "original_transcription": ": 嗯那你就是打算就是什么时候开始运动啊刚才咱们聊那么多啊嗯嗯我知道我知道嗯但是嗯除了这个嗯除了这个嗯能不能跟我玩\n", - "generated_transcription": ": 那你就是打算就是什么时候开始运动啊刚才咱们聊那么多啊\n", - "wer_original": 0.5952380952380952, - "wer_generated": 0.40476190476190477, - "wer_improvement": 0.19047619047619047, - "wer_method": "smart_mixed", - "mer_original": 0.7619047619047619 + "generated_transcription": "\n: 那你就是打算就是什么时候开始运动啊刚才咱们聊那么多啊\n", + "cer": 0.6621621621621622, + "wer": 1.0, + "mer": 0.375 }, { "segment_id": "SEGMENT_ZH-CN_U1014_ENCN_20250816_202117_284848", - "audio_path": "SEGMENT_ZH-CN_U1014_ENCN_20250816_202117_284848.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1014_ENCN_20250816_202117_284848.wav", "ground_truth": "Yeah. 嗯嗯。 我用了大概我想一想哦,我今年是二十岁,大概用了八年左右。 Yes yes yes yes yes yeah.", - "original_transcription": ": yeah 嗯嗯 我用了大概我想一想哦我今年是二十岁大概用了八年左右 yes yes yes yes yes yeah\n", - "generated_transcription": ": yeah 嗯嗯 我用了大概我想一想哦我今年是二十岁大概用了八年左右 yes yes yes yes yes yeah\n", - "wer_original": 0.8157894736842105, - "wer_generated": 0.18421052631578946, - "wer_improvement": 0.631578947368421, - "wer_method": "smart_mixed", - "mer_original": 0.18421052631578946 + "generated_transcription": "\n: yeah 嗯嗯 我用了大概我想一想哦我今年是二十岁大概用了八年左右 yes yes yes yes yes yeah\n", + "cer": 0.0, + "wer": 0.0, + "mer": 0.0 }, { "segment_id": "SEGMENT_ZH-CN_U0073_MIX_20250816_202117_338282", - "audio_path": "SEGMENT_ZH-CN_U0073_MIX_20250816_202117_338282.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0073_MIX_20250816_202117_338282.wav", "ground_truth": "但是那段时间我并没有 control what I eat,所以我可能并没有真正地去 lose weight。", - "original_transcription": ": 但是那段时间我并没有, (err) control what I eat, 所以我可能并没有真正的去, lose weight.\n", - "generated_transcription": "But 那段时间我并没有 control what I eat, 所以我可能并没有真正的去 lose weight\n", - "wer_original": 1.4375, - "wer_generated": 1.0, - "wer_improvement": 0.4375, - "wer_method": "smart_mixed", - "mer_original": 1.0625 + "generated_transcription": "\n: 但是那段时间我并没有 control what I eat, 所以我可能并没有真正的去 lose weight\n", + "cer": 0.02127659574468085, + "wer": 0.2857142857142857, + "mer": 0.03571428571428571 }, { "segment_id": "SEGMENT_ZH-CN_U1056_MIX_20250816_202117_347216", - "audio_path": "SEGMENT_ZH-CN_U1056_MIX_20250816_202117_347216.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1056_MIX_20250816_202117_347216.wav", "ground_truth": "哦, 哦 ,哎,这个这个之前确实没有过这这种想法,因为 平常像就是平常去 climbing mountain, I'll always waste my with my friend. Yes, I only have one friend. So.", - "original_transcription": ": (oh) (oh) (ah), 这个这个之前确实没有过这这种想法因为, (um), 平常像就是平常去, climbing mountain, (um), I always with my (um) with my friend. Yes, I only have one friend so.\n", - "generated_transcription": ": (oh) (oh) (ah), 这个这个之前确实没有过这这种想法因为, (err), 平常像就是平常去, climbing mountain, (um), I always waste my (hm) with my friend. Yes, I only have one friend so.\n", - "wer_original": 0.20408163265306123, - "wer_generated": 0.32653061224489793, - "wer_improvement": -0.1224489795918367, - "wer_method": "smart_mixed", - "mer_original": 0.3469387755102041 + "generated_transcription": "\n: (oh) (oh) (ah) 这个这个之前确实没有过这这种想法因为, (err) 平常像就是平常去 climbing mountain, (um), I always waste my (hm) with my friend. Yes, I only have one friend so\n", + "cer": 0.14583333333333334, + "wer": 0.4, + "mer": 0.15555555555555556 }, { "segment_id": "SEGMENT_ZH-CN_U1050_ENCN_20250816_202117_299706", - "audio_path": "SEGMENT_ZH-CN_U1050_ENCN_20250816_202117_299706.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1050_ENCN_20250816_202117_299706.wav", "ground_truth": "哦。 Oh, where are you from? His writer. 对,而且也可以提高我们的专注度。", - "original_transcription": ": 哦,你来自哪里?: 对,而且也可以提高我们的专注度\n", - "generated_transcription": ": 哦 where are you from writer : 对而且也可以提高我们的专注度\n", - "wer_original": 0.72, - "wer_generated": 0.28, - "wer_improvement": 0.43999999999999995, - "wer_method": "smart_mixed", - "mer_original": 0.36 + "generated_transcription": "\n: 哦 where are you from writer : 对而且也可以提高我们的专注度\n", + "cer": 0.12195121951219512, + "wer": 0.2222222222222222, + "mer": 0.09090909090909091 }, { "segment_id": "SEGMENT_ZH-CN_U0054_MIX_20250816_202117_276890", - "audio_path": "SEGMENT_ZH-CN_U0054_MIX_20250816_202117_276890.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0054_MIX_20250816_202117_276890.wav", "ground_truth": "不是很不是很多,Like。 Actually I like most 球类活动我都可以玩儿。但是 none of them is 非常的程度非常的好的那种。", - "original_transcription": ": 不是很不是很多, like actually I like most, 球类活动我都可以玩但是, none of them is 非常的程度非常的好的那种\n", - "generated_transcription": ": 不是很不是很多 like actually I like most 球类活动我都可以玩但是 none of them is 非常的程度非常的好的那种\n", - "wer_original": 0.0, - "wer_generated": 0.34285714285714286, - "wer_improvement": -0.34285714285714286, - "wer_method": "smart_mixed", - "mer_original": 0.42857142857142855 + "generated_transcription": "\n: 不是很不是很多 like actually I like most 球类活动我都可以玩但是 none of them is 非常的程度非常的好的那种\n", + "cer": 0.015625, + "wer": 0.2727272727272727, + "mer": 0.025 }, { "segment_id": "SEGMENT_ZH-CN_U0038_ENCN_20250816_202117_323820", - "audio_path": "SEGMENT_ZH-CN_U0038_ENCN_20250816_202117_323820.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0038_ENCN_20250816_202117_323820.wav", "ground_truth": "Yeah. 嗯,确实 确实。 嗯。 I think it's just like the traditional way that a parent would think of their children. Like to want them to be better than others instead of like being good enough for themselves.", - "original_transcription": ": yeah, (hm), 确切, 确切, I think it's just like the traditional way that a parent would think of their children like they want them to be better than others instead of like being good enough for themselves.\n", "generated_transcription": "\n: ya, (hm), 确切, 确切, I think it's just like the traditional way that a parent would think of their children like they want them to be better than others instead of like being good enough for themselves.\n", - "wer_original": 0.9302325581395349, - "wer_generated": 0.27906976744186046, - "wer_improvement": 0.6511627906976745, - "wer_method": "smart_mixed", - "mer_original": 0.27906976744186046 + "cer": 0.05161290322580645, + "wer": 0.13513513513513514, + "mer": 0.15 }, { "segment_id": "SEGMENT_ZH-CN_U0016_ENCN_20250816_202117_303630", - "audio_path": "SEGMENT_ZH-CN_U0016_ENCN_20250816_202117_303630.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0016_ENCN_20250816_202117_303630.wav", "ground_truth": "Yeah, I think maybe for some people. Just singing is a little difficult, especially when you sing singing in front of the people, but are the music is always very good for our relaxation. 然后我当时就笑她,我说你做得到吗?她说但是今天下雨,所以我就不去了。", - "original_transcription": ": yeah, I think (um), (um), maybe for some people (um) to sing is (um) a little difficult, (um), especially when you sing sing it in front of the people, but (um), the music is always very good for our relaxation. : 然后我��时就笑他我说你做得到吗他说但是今天下雨所以我就不去了\n", "generated_transcription": "\n: yeah, I think (um), (um), maybe for some people (um) to sing is (um) a little difficult, (um), especially when you sing sing it in front of the people, but (um) the music is always very good for our relaxation. : 然后我当时就笑他他说你做得到吗他说但是今天下雨所以我就不去了\n", - "wer_original": 0.5, - "wer_generated": 0.27941176470588236, - "wer_improvement": 0.22058823529411764, - "wer_method": "smart_mixed", - "mer_original": 0.2647058823529412 + "cer": 0.11731843575418995, + "wer": 0.3142857142857143, + "mer": 0.203125 }, { "segment_id": "SEGMENT_ZH-CN_U1010_ENCN_20250816_202117_282733", - "audio_path": "SEGMENT_ZH-CN_U1010_ENCN_20250816_202117_282733.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1010_ENCN_20250816_202117_282733.wav", "ground_truth": "Have you ever used ** ? 对。 Yes, of course, well, actually I do like both of these too. And what about you? 嗯。", - "original_transcription": ": Have you ever used chat you\n: Yes of course well actually I do like both of these two and what about you\n", "generated_transcription": "\n: Have you ever used chat you\n: 对\n: Yes of course well actually I do like both of these two And what about you\n", - "wer_original": 1.2307692307692308, - "wer_generated": 0.38461538461538464, - "wer_improvement": 0.8461538461538463, - "wer_method": "smart_mixed", - "mer_original": 0.46153846153846156 + "cer": 0.11842105263157894, + "wer": 0.18181818181818182, + "mer": 0.18181818181818182 }, { "segment_id": "SEGMENT_ZH-CN_U1036_ENCN_20250816_202117_341662", - "audio_path": "SEGMENT_ZH-CN_U1036_ENCN_20250816_202117_341662.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1036_ENCN_20250816_202117_341662.wav", "ground_truth": "感人,他总让人感觉。 When do you have time? Yes, and. 的那种。", - "original_transcription": ": 感人他总让人感觉, when do you have time, yes, and 的那种\n", "generated_transcription": "\n: 感人他总让人感觉, when do you have time, yes and 的那种\n", - "wer_original": 0.6666666666666666, - "wer_generated": 0.3333333333333333, - "wer_improvement": 0.3333333333333333, - "wer_method": "smart_mixed", - "mer_original": 0.3333333333333333 + "cer": 0.0, + "wer": 0.0, + "mer": 0.0 }, { "segment_id": "SEGMENT_ZH-CN_U1060_ENCN_20250816_202117_294901", - "audio_path": "SEGMENT_ZH-CN_U1060_ENCN_20250816_202117_294901.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1060_ENCN_20250816_202117_294901.wav", "ground_truth": "famous. 代表性。 对,现在 到处都是看到小朋友在踢足球,参加什么俱乐部之类的。对,他们也对周潇也非常有热情。 So besides, I have we I have participate to ** in summer clubs. For example.", - "original_transcription": ": 那么,我们来谈谈中国足球的现状。\n", "generated_transcription": ": 那么 famous 代表性 对现在到处都是看到小朋友在踢足球参加什么俱乐部之类的 对他们也对足球也非常有热情 呃 so 呃 besides 我 have participated in summer clubs\n", - "wer_original": 0.421875, - "wer_generated": 0.359375, - "wer_improvement": 0.0625, - "wer_method": "smart_mixed", - "mer_original": 0.921875 + "cer": 0.24761904761904763, + "wer": 0.7222222222222222, + "mer": 0.24561403508771928 }, { "segment_id": "SEGMENT_ZH-CN_U0061_MIX_20250816_202117_265137", - "audio_path": "SEGMENT_ZH-CN_U0061_MIX_20250816_202117_265137.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0061_MIX_20250816_202117_265137.wav", "ground_truth": "所以我就根本没有任何 experiments 在这个 journey 上面。 秦皇岛没有,我去滨海新区,当时是去看海的,但是那天太 cold 的了,那天我们早上就是五点多准备去海上看一个 sunrise 。", - "original_transcription": ": 所以我就根本没有任何experience在这个那个journey上面情况都没有我去滨海市区到时候去看好了���是那个太cold了那天我们早上就是五点多准备去海上看一个sunrise\n", "generated_transcription": "\n: 所以我就根本没有任何experience 在这个那个journey上面情况都没有我去滨海市去到时候去看好了但是那个太cold了那天我们早上就是五点多准备去海上看一个sunrise\n", - "wer_original": 0.014492753623188406, - "wer_generated": 1.0, - "wer_improvement": -0.9855072463768116, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.18181818181818182, + "wer": 1.0, + "mer": 0.2222222222222222 }, { "segment_id": "SEGMENT_ZH-CN_U0006_ENCN_20250816_202117_339861", - "audio_path": "SEGMENT_ZH-CN_U0006_ENCN_20250816_202117_339861.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0006_ENCN_20250816_202117_339861.wav", "ground_truth": "嗯嗯。 No no no. Yeah, at this cat warm run, crazily just to catch you. 是这样 我我室友她也是喜欢韩国的这种流行乐,她然后就在她的影响下,我也知道了一些 男团女团之类的,嗯。", - "original_transcription": ": (hm), (hm), : 那那那, : Yeah, at least cats won't run crazily just to catch you. : 是这样, (oh), 我我室友她也是喜欢韩国的这种流行乐, 她然后就在她的影响下我也知道了一些, (err), 男团女团之类的, (oh).\n", "generated_transcription": "\n: (hm) (hm) : 那那那 : Ya at least cat won't run crazily just to catch you : 是这样哦我室友她也是喜欢韩国的这种流行乐她然后就在她的影响下我也知道了一些男团女团之类的哦\n", - "wer_original": 0.7878787878787878, - "wer_generated": 0.25757575757575757, - "wer_improvement": 0.5303030303030303, - "wer_method": "smart_mixed", - "mer_original": 0.2878787878787879 + "cer": 0.20212765957446807, + "wer": 0.5555555555555556, + "mer": 0.16393442622950818 }, { "segment_id": "SEGMENT_ZH-CN_U0042_ENCN_20250816_202117_275156", - "audio_path": "SEGMENT_ZH-CN_U0042_ENCN_20250816_202117_275156.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0042_ENCN_20250816_202117_275156.wav", "ground_truth": "原来是这样呀。 Yeah, English, Germany and French.", - "original_transcription": ": 然后是这样, ya, English, Germany and France.\n", "generated_transcription": "\n: 然后是这样呀, ya, English, Germany and the France\n", - "wer_original": 1.0, - "wer_generated": 0.5, - "wer_improvement": 0.5, - "wer_method": "smart_mixed", - "mer_original": 0.5 + "cer": 0.2727272727272727, + "wer": 0.6666666666666666, + "mer": 0.45454545454545453 }, { "segment_id": "SEGMENT_ZH-CN_U1061_ENCN_20250816_202117_333816", - "audio_path": "SEGMENT_ZH-CN_U1061_ENCN_20250816_202117_333816.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1061_ENCN_20250816_202117_333816.wav", "ground_truth": "The fighting the fighting against to the enemy and I think it's just a joke, because I ** hurt the painful. Firstly, I must be the ** the spider bite me. Okay. 哦,那如果我有一件这个斗篷,那就太棒了,我就可以从万米高空跳下去了。", - "original_transcription": ": (um) fighting the fighting against the enemy and I think it's just a joke because I I am the freaking heart the painful firstly I must be the (uh) master of the spider by me okay : (oh) 那如果我又遇见这个头鹏那太棒了我就可以从万米高空跳下去了\n", "generated_transcription": "\n: (um) fighting the fighting against the enemy and I think it's just a joke because I I am the freaking heart the painful firstly I must be the (uh) master of the spider by me okay : 哦那如果我又遇见这个头鹏那太棒了我就可以从万米高空跳下去了\n", - "wer_original": 0.5454545454545454, - "wer_generated": 0.3939393939393939, - "wer_improvement": 0.1515151515151515, - "wer_method": "smart_mixed", - "mer_original": 0.4090909090909091 + "cer": 0.2702702702702703, + "wer": 0.3870967741935484, + "mer": 0.2833333333333333 }, { "segment_id": "SEGMENT_ZH-CN_U0024_ENCN_20250816_202117_331622", - "audio_path": "SEGMENT_ZH-CN_U0024_ENCN_20250816_202117_331622.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0024_ENCN_20250816_202117_331622.wav", "ground_truth": "对。 Yeah, yeah but a lot a lot of people do that.", - "original_transcription": ": 对 : yeah a lot of people do that\n", "generated_transcription": "\n: 对\n", - "wer_original": 0.0, - "wer_generated": 0.9230769230769231, - "wer_improvement": -0.9230769230769231, - "wer_method": "smart_mixed", - "mer_original": 0.46153846153846156 + "cer": 0.9705882352941176, + "wer": 0.9166666666666666, + "mer": 0.9166666666666666 }, { "segment_id": "SEGMENT_ZH-CN_U0037_ENCN_20250816_202117_308700", - "audio_path": "SEGMENT_ZH-CN_U0037_ENCN_20250816_202117_308700.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0037_ENCN_20250816_202117_308700.wav", "ground_truth": "我觉得中考的体能测试,它包含的内容其实我觉得没有办法反映一个人的,嗯,怎么说运动方面的能力,甚至我觉得完全无法反应。 Like I started up my own business after I graduate. What my work didn't evolve in debating as well. so yeah.", - "original_transcription": ": 我觉得中考的体能测试它包含的内容其实我觉得没有办法反映一个人的运动方面的能力甚至我觉得完全无法反映\n", "generated_transcription": ": 我觉得中考的体能测试它包含的内容其实我觉得没有办法反映一个人的运动方面的能力甚至我觉得完全无法反映\n", - "wer_original": 0.7468354430379747, - "wer_generated": 0.3924050632911392, - "wer_improvement": 0.35443037974683544, - "wer_method": "smart_mixed", - "mer_original": 0.3924050632911392 + "cer": 0.6496350364963503, + "wer": 1.0, + "mer": 0.35135135135135137 }, { "segment_id": "SEGMENT_ZH-CN_U0063_MIX_20250816_202117_282028", - "audio_path": "SEGMENT_ZH-CN_U0063_MIX_20250816_202117_282028.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0063_MIX_20250816_202117_282028.wav", "ground_truth": "I I I know like 有一种那个叫就是水下的手表那种 watch,然后就能够 record the distance,就记录那个游泳的距离,然后还可以 listen the music。所以,我还没有我看到有人在用,不知,然后会记录那个 heart rate 心率啊。", - "original_transcription": "I I (err) I know like 有一种那个叫就是水下的手表, 那种, watch, 然后就能够, record, the distance, 就记录那个游泳的距离然后还可以, listen, 那个, music, 所以, 我还没有我看到有人在用, 不然后会, 记录那个, (um), heart rate, 心率, (ah).\n", "generated_transcription": "\nI I (err) I know like 有一种那个叫就是水下的手表, 那种, watch, 然后就能够 record, the distance, 就记录那个游泳的距离然后还可以, 类似的, music, 所以我还没有我看到有人在用, 不然后会, 记录那个, (um), heart rate, 心率啊\n", - "wer_original": 1.4242424242424243, - "wer_generated": 1.7878787878787878, - "wer_improvement": -0.36363636363636354, - "wer_method": "smart_mixed", - "mer_original": 1.878787878787879 + "cer": 0.12931034482758622, + "wer": 0.8125, + "mer": 0.08108108108108109 }, { "segment_id": "SEGMENT_ZH-CN_U1016_MIX_20250816_202117_326968", - "audio_path": "SEGMENT_ZH-CN_U1016_MIX_20250816_202117_326968.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1016_MIX_20250816_202117_326968.wav", "ground_truth": "而且我们有时候一些汽车吧,它还能够一些 car, 他还会就是骗你的钱,所以一定要保管好自己的 money 。", - "original_transcription": ": 嗯嗯而且我们有时候一些汽车吧它还能够一些卡儿它还会就是骗你的钱所以一定要保管好自己的money\n", "generated_transcription": "\n: 嗯嗯而且我们有时候一些汽车吧他还能够一些卡儿他还会就是骗你的钱所以一定要保管好自己的money\n", - "wer_original": 0.9534883720930233, - "wer_generated": 1.0, - "wer_improvement": -0.046511627906976716, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.13043478260869565, + "wer": 1.0, + "mer": 0.125 }, { "segment_id": "SEGMENT_ZH-CN_U0026_ENCN_20250816_202117_308065", - "audio_path": "SEGMENT_ZH-CN_U0026_ENCN_20250816_202117_308065.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0026_ENCN_20250816_202117_308065.wav", "ground_truth": "酷。 Cool actually I I also listeened to a lot of HongKong music when I was a kid, because my dad was a huge fan of Beyond the band and in his car I I listen to a lot of songs by Beyond or .", - "original_transcription": ": cool, cool. Actually, I, I also listen to a lot of Hong Kong music when I was a kid because my dad was a huge fan of Beyond the band, and (uh), in his car, I, I listen to a lot of (uh) songs by Beyond or (uh).\n", "generated_transcription": "\n: cool cool. Actually, I, I also listen to a lot of Hong Kong music when I was a kid because my dad was a huge fan of beyond the band and (uh) in his car I I listen to a lot of (uh) songs by beyond or (uh).\n", - "wer_original": 0.10869565217391304, - "wer_generated": 0.30434782608695654, - "wer_improvement": -0.19565217391304351, - "wer_method": "smart_mixed", - "mer_original": 0.32608695652173914 + "cer": 0.09219858156028368, + "wer": 0.1590909090909091, + "mer": 0.1590909090909091 }, { "segment_id": "SEGMENT_ZH-CN_U0069_ENCN_20250816_202117_290647", - "audio_path": "SEGMENT_ZH-CN_U0069_ENCN_20250816_202117_290647.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0069_ENCN_20250816_202117_290647.wav", "ground_truth": "到了中午的时候能把人直接烤熟的那种状态,所以不要选择夏天去,我感觉夏天真的太熬人了 , 对,其他季节还可以。 I should, I should say in Beijing, because my family is here and later, maybe you know, my daughter have have have has her own goal.", - "original_transcription": ": 到了中午的时候能把人直接烤熟的那种状态所以不要选择夏天去我感觉夏天真的太熬人, 其他季节还可以, I should, I should stay in Beijing because my family is here and later maybe you know my daughter have have have has her own girl.\n", "generated_transcription": "\n: 到了中午的时候能把人直接烤熟的那种状态所以不要选择夏天去我感觉夏天真的太熬人, 其他季节还可以, I should, I should stay in Beijing, because my family is here and later maybe you know my daughter have have have has her own girl\n", - "wer_original": 0.5769230769230769, - "wer_generated": 0.14102564102564102, - "wer_improvement": 0.43589743589743585, - "wer_method": "smart_mixed", - "mer_original": 0.15384615384615385 + "cer": 0.03355704697986577, + "wer": 0.14285714285714285, + "mer": 0.0547945205479452 }, { "segment_id": "SEGMENT_ZH-CN_U0047_ENCN_20250816_202117_345579", - "audio_path": "SEGMENT_ZH-CN_U0047_ENCN_20250816_202117_345579.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0047_ENCN_20250816_202117_345579.wav", "ground_truth": "Yeah. So what type of this TV programs or movies you like best? 嗯。", - "original_transcription": ": Yeah, So what type of this Tv programmes or movies you like best? : (hm).\n", "generated_transcription": "\n: Yeah, So what type of this Tv programmes or movies you like best?\n", - "wer_original": 0.13333333333333333, - "wer_generated": 0.3333333333333333, - "wer_improvement": -0.19999999999999998, - "wer_method": "smart_mixed", - "mer_original": 0.3333333333333333 + "cer": 0.06, + "wer": 0.14285714285714285, + "mer": 0.14285714285714285 }, { "segment_id": "SEGMENT_ZH-CN_U0033_ENCN_20250816_202117_340668", - "audio_path": "SEGMENT_ZH-CN_U0033_ENCN_20250816_202117_340668.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0033_ENCN_20250816_202117_340668.wav", "ground_truth": "Yeah on the one hand, I still worried about it will re replace my job . 用户运营,其实我当时那个用户运营,它其实就是 怎么说呢? 感觉我的那个角色有一点点像那个群管理员,或者说是有一点像客服的感觉,就是我得跟那个, 我们公司的就是一些用户去打交道, 这样子。", - "original_transcription": ": 嗯我还有点担心它会取代我的工作\n", "generated_transcription": ": 我还担心它会取代我的工作\n", - "wer_original": 0.02857142857142857, - "wer_generated": 0.9619047619047619, - "wer_improvement": -0.9333333333333333, - "wer_method": "smart_mixed", - "mer_original": 0.9523809523809523 + "cer": 0.9703703703703703, + "wer": 1.0, + "mer": 0.9583333333333334 }, { "segment_id": "SEGMENT_ZH-CN_U0001_ENCN_20250816_202117_301858", - "audio_path": "SEGMENT_ZH-CN_U0001_ENCN_20250816_202117_301858.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0001_ENCN_20250816_202117_301858.wav", "ground_truth": "Exactly. 是的。 Yeah, speak of makeup video, and it's actually a good thing. I I think it's it helps girls who just went to college to learn how to makeup. 对。", - "original_transcription": ": exactly, : 是的, ya, speaking of makeup video and it's actually a good thing. I, I think it's it helps girls who just went to college to learn how to makeup. 对.\n", "generated_transcription": "\n: exactly, : 是的, ya, speaking of makeup video and it's actually a good thing. I, I think it's it helps girls who just went to college to learn how to makeup, 对\n", - "wer_original": 0.0, - "wer_generated": 0.23529411764705882, - "wer_improvement": -0.23529411764705882, - "wer_method": "smart_mixed", - "mer_original": 0.20588235294117646 + "cer": 0.04310344827586207, + "wer": 0.06451612903225806, + "mer": 0.0625 }, { "segment_id": "SEGMENT_ZH-CN_U1051_ENCN_20250816_202117_281774", - "audio_path": "SEGMENT_ZH-CN_U1051_ENCN_20250816_202117_281774.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1051_ENCN_20250816_202117_281774.wav", "ground_truth": "English your major is English, right ? 是的是的,沿海这边它基本上很多东西都是都是甜的,或者说像这些油啊,辣椒放的就比较少一点。对。", - "original_transcription": ": 英语有梅西的英语right 哇嗯是的是的沿海这边它基本上很多东西都是都是甜的或者说像这些油啊辣椒放的就比较少一点对\n", "generated_transcription": "\n: English 有maybe是English right 哇嗯是的是的沿海这边它基本上很多东西都是都是甜的或者说像这些油啊辣椒放的就比较少一点\n", - "wer_original": 0.8301886792452831, - "wer_generated": 0.22641509433962265, - "wer_improvement": 0.6037735849056605, - "wer_method": "smart_mixed", - "mer_original": 0.22641509433962265 + "cer": 0.16901408450704225, + "wer": 0.7142857142857143, + "mer": 0.1276595744680851 }, { "segment_id": "SEGMENT_ZH-CN_U1045_ENCN_20250816_202117_330170", - "audio_path": "SEGMENT_ZH-CN_U1045_ENCN_20250816_202117_330170.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1045_ENCN_20250816_202117_330170.wav", "ground_truth": "嗯。 Yeah yeah yeah. Yeah. Yeah. Yeah. Yeah. Yeah. Yeah. Yeah. 就是云南那边,嗯。", - "original_transcription": ": 嗯嗯嗯嗯嗯嗯就是云南那边嗯\n", - "generated_transcription": "\n: 嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯", - "wer_original": 0.5238095238095238, - "wer_generated": 24.0, - "wer_improvement": -23.476190476190474, - "wer_method": "smart_mixed", - "mer_original": 0.6190476190476191 + "generated_transcription": "\n: 嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯", + "cer": 5.166666666666667, + "wer": 1.0, + "mer": 13.777777777777779 }, { "segment_id": "SEGMENT_ZH-CN_U1101_MIX_20250816_202117_306899", - "audio_path": "SEGMENT_ZH-CN_U1101_MIX_20250816_202117_306899.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1101_MIX_20250816_202117_306899.wav", "ground_truth": "Has, lots of he's or her lives because of no one assists him 就之前有一个有一个新闻说一个二十二十多岁的一个小伙子吧,然后在健身房自己就举举举重一样的那种,就练差不多吧。举哑铃, 对,卧推应该是就就是睡睡在那往上吧,然后没有上去,然后就直接压死了,就。", - "original_transcription": ": he has lost all his lives because of no one assists him 就之前有一个有一个新闻说一个二十二十多岁的一个小伙子吧然后在健身房自己就举举重一样的那种就练差不多吧举哑铃对推举应该是就就睡睡在那往上吧然后没有上去然后就直接压死了就\n", "generated_transcription": "\nhe has lost his all his lives because of no one assists him 就之前有一个有一个新闻说一个二十二十多岁的一个小伙子吧然后在健身房自己就举举重一样的那种就练差不多吧举哑铃对推举应该是就就睡睡在那往上吧然后没有上去然后就直接压死了\n", - "wer_original": 0.8785046728971962, - "wer_generated": 0.18691588785046728, - "wer_improvement": 0.6915887850467289, - "wer_method": "smart_mixed", - "mer_original": 0.17757009345794392 + "cer": 0.12213740458015267, + "wer": 0.5333333333333333, + "mer": 0.11224489795918367 }, { "segment_id": "SEGMENT_ZH-CN_U0022_ENCN_20250816_202117_332202", - "audio_path": "SEGMENT_ZH-CN_U0022_ENCN_20250816_202117_332202.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0022_ENCN_20250816_202117_332202.wav", "ground_truth": "You know, there there are eleven seasons in total. And now I'm already on the tenth season. More precisely the last few episodes a bit, you know, when one one few stressed, I usually try to watch some movies and serious and is, is it can ,I release my stress, I think it's helpful. 让我想想,清清河附近吗?还是去昌平那块儿?", - "original_transcription": ": You know there are eleven seasons in total and now I'm already on the tenth season. (um), more precisely, the last few episodes of it. You know when I feel stressed, I usually try to watch some movies and series and it it can (uh) relieve my stress, I think it's helpful.\n", "generated_transcription": ": 你知道 there there are eleven seasons in total and now I'm already on the tenth season and more precisely the last few episodes of it You know when when when I feel stressed I usually try to watch some movies and series and it it it can relieve my stress I think it is helpful 让我想想清清河附近吗还是去昌平那块\n", - "wer_original": 1.16, - "wer_generated": 0.4, - "wer_improvement": 0.7599999999999999, - "wer_method": "smart_mixed", - "mer_original": 0.5333333333333333 + "cer": 0.13247863247863248, + "wer": 0.3090909090909091, + "mer": 0.25 }, { "segment_id": "SEGMENT_ZH-CN_U1015_MIX_20250816_202117_275305", - "audio_path": "SEGMENT_ZH-CN_U1015_MIX_20250816_202117_275305.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1015_MIX_20250816_202117_275305.wav", "ground_truth": "这个 diss disadvantage,就是说 那边的那���物价其实非常的 expensive,you know?", - "original_transcription": ": 哦这个disadvantage就是说嗯那边那个物价其实非常的expensive you know\n", "generated_transcription": "\n: 这个disadvantage就是说那边的那个物价其实非常的expensive you know\n", - "wer_original": 0.9444444444444444, - "wer_generated": 1.0, - "wer_improvement": -0.05555555555555558, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.08163265306122448, + "wer": 0.8333333333333334, + "mer": 0.14285714285714285 }, { "segment_id": "SEGMENT_ZH-CN_U0040_MIX_20250816_202117_283243", - "audio_path": "SEGMENT_ZH-CN_U0040_MIX_20250816_202117_283243.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0040_MIX_20250816_202117_283243.wav", "ground_truth": "虽然我经常会羡慕你们还在学校的学生啊什么, enjoy school life 啊,谈个谈个恋爱呀,出去玩儿啊, 非常怀念。", - "original_transcription": ": 虽然我经常会羡慕你们还在学校的学生啊什么enjoy school life啊谈个谈个恋爱啊出去玩啊非常怀念\n", "generated_transcription": "\n: 虽然我经常会羡慕你们还在学校的学生啊什么enjoy school life啊谈个谈个恋爱啊出去玩啊非常怀念\n", - "wer_original": 0.17777777777777778, - "wer_generated": 0.9777777777777777, - "wer_improvement": -0.7999999999999999, - "wer_method": "smart_mixed", - "mer_original": 0.9777777777777777 + "cer": 0.038461538461538464, + "wer": 0.8333333333333334, + "mer": 0.05 }, { "segment_id": "SEGMENT_ZH-CN_U0060_MIX_20250816_202117_302703", - "audio_path": "SEGMENT_ZH-CN_U0060_MIX_20250816_202117_302703.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0060_MIX_20250816_202117_302703.wav", "ground_truth": "他们有他,整个 competition 会持续大概 ten ten days or more or little I I don't know exactly,And。", - "original_transcription": ": 他们有它整个, competition, 会持续大概, ten, ten days or more or less, I don't know exactly and the.\n", "generated_transcription": "\n: 他们有它整个, competition, 会持续大概, ten, ten days or more or less, I, I don't know exactly and the\n", - "wer_original": 1.04, - "wer_generated": 0.44, - "wer_improvement": 0.6000000000000001, - "wer_method": "smart_mixed", - "mer_original": 0.44 + "cer": 0.13636363636363635, + "wer": 0.3333333333333333, + "mer": 0.20833333333333334 }, { "segment_id": "SEGMENT_ZH-CN_U0043_ENCN_20250816_202117_286073", - "audio_path": "SEGMENT_ZH-CN_U0043_ENCN_20250816_202117_286073.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0043_ENCN_20250816_202117_286073.wav", "ground_truth": "Yeah. I think another important thing about teamwork is making compromises. Yeah, sometimes like, well, you can't exactly get everything you want, and sometimes you have to do the things that you don't really wanna do. And well, that's important. But in the meanwhile, you know, like you can't let other people, you know, take advantage of U S wall. So so you need to know that you know when you should compromise you, you shouldn't. 那非常非常好。我也是很喜欢锻炼,那你有会去权衡,比如说 有氧运动和无氧运动,就是你会去健身房做一些力量的训练。", - "original_transcription": ": 对啊我也是很喜欢锻炼那你也会去权衡比如说有氧运动和无氧运动就是你会去健身房做一些力量的训练\n", "generated_transcription": ": 对啊我也是很喜欢锻炼那你也会去权衡比如说有氧运动和无氧运动就是你会去健身房做一些力量的训练\n", - "wer_original": 0.4153846153846154, - "wer_generated": 0.676923076923077, - "wer_improvement": -0.26153846153846155, - "wer_method": "smart_mixed", - "mer_original": 0.676923076923077 + "cer": 0.8911917098445595, + "wer": 1.0, + "mer": 0.664 }, { "segment_id": "SEGMENT_ZH-CN_U0084_MIX_20250816_202117_304590", - "audio_path": "SEGMENT_ZH-CN_U0084_MIX_20250816_202117_304590.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0084_MIX_20250816_202117_304590.wav", "ground_truth": "六天或者五天一周的话,it can really help me to 让我保持身材。 去雕刻线条,因为 that's what our girls like right。 I want to ask you that 一般就是你跑完步之后会做什么 cool down 呢。", - "original_transcription": ": 六天或者五天一周的话, it can really help me to 让我保持身材, (um) 去雕刻线条因为, that's what our girls like right and I want to ask you that, (um) 一般就是你跑完步之后会做什么, (um) cool down 了\n", "generated_transcription": "\n: 六天或者五天一周的话, it can really help me to 让我保持身材, (um) 去雕刻线条因为, that's what our girls like right and I want to ask you that, (um) 一般就是你跑完步之后会做什么, (um) cool down 了\n", - "wer_original": 0.35294117647058826, - "wer_generated": 0.47058823529411764, - "wer_improvement": -0.11764705882352938, - "wer_method": "smart_mixed", - "mer_original": 0.47058823529411764 + "cer": 0.09259259259259259, + "wer": 0.2916666666666667, + "mer": 0.08620689655172414 }, { "segment_id": "SEGMENT_ZH-CN_U1013_ENCN_20250816_202117_297486", - "audio_path": "SEGMENT_ZH-CN_U1013_ENCN_20250816_202117_297486.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1013_ENCN_20250816_202117_297486.wav", "ground_truth": "嗯嗯嗯,是的是的。 It is an American company, and sometimes I need to do financial daily work, like reconciliation and write some reports in my job. So this is my job, and this is my work content.", - "original_transcription": ": (hm), (hm), (hm) 是的是的, (err) is American company and sometimes I needed to do financial daily work, (err) like reconciliation and the write some reports in my job, so (err) this is my job and this is my work content.\n", "generated_transcription": "\n: (hm), (hm), (hm), 是的是的, s~ (err) is American company and sometimes I needed to do financial daily work, (err) like reconciliation and the write some reports in my job, so (err) this is my job and this is my work content.\n", - "wer_original": 0.20930232558139536, - "wer_generated": 0.37209302325581395, - "wer_improvement": -0.1627906976744186, - "wer_method": "smart_mixed", - "mer_original": 0.3488372093023256 + "cer": 0.15753424657534246, + "wer": 0.3142857142857143, + "mer": 0.24390243902439024 }, { "segment_id": "SEGMENT_ZH-CN_U0069_ENCN_20250816_202117_290574", - "audio_path": "SEGMENT_ZH-CN_U0069_ENCN_20250816_202117_290574.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0069_ENCN_20250816_202117_290574.wav", "ground_truth": "对,它的上坡下坎儿特别多,然后他们的那些出租车司机技术都相当可以,就是他们那个刹车启动都不是一般人能接受的所以,反正而且。 Okay.", - "original_transcription": ": 对他们的上坡下坎特别多然后他们的那些出租车司机技术都相当可以就是他们那个刹车启动都不是一般人能接受的所以反正而且Okay\n", "generated_transcription": "\n: 对他们的上坡下坎特别多然后他们的那些出租车司机技术都相当可以就是他们那个刹车启动都不是一般人能接受的所以反正而且\n", - "wer_original": 0.9838709677419355, - "wer_generated": 0.12903225806451613, - "wer_improvement": 0.8548387096774194, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.11666666666666667, + "wer": 1.0, + "mer": 0.07017543859649122 }, { "segment_id": "SEGMENT_ZH-CN_U0011_ENCN_20250816_202117_322410", - "audio_path": "SEGMENT_ZH-CN_U0011_ENCN_20250816_202117_322410.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0011_ENCN_20250816_202117_322410.wav", "ground_truth": "Well, cause I wanna try a lot a lot of things in my life. So. 啊,其实没事儿的,其实可以可以应该去试一试。因为北因为北京这种这种院校还挺多的,对。", - "original_transcription": ": Well, I want to try a lot of things in my life, so : (ah), 现在没事可以可以可以去试一下因为在北京这种这种医院还挺多的,\n", "generated_transcription": "\n: Well, cause I want to try a lot of f~ a lot of things in my life, so : (ah), 现在没事可以可以应该去试一下因为, 比, 在北京这种这种医院还挺多,\n", - "wer_original": 0.7368421052631579, - "wer_generated": 0.47368421052631576, - "wer_improvement": 0.2631578947368421, - "wer_method": "smart_mixed", - "mer_original": 0.49122807017543857 + "cer": 0.2716049382716049, + "wer": 0.5, + "mer": 0.36538461538461536 }, { "segment_id": "SEGMENT_ZH-CN_U1022_ENCN_20250816_202117_289270", - "audio_path": "SEGMENT_ZH-CN_U1022_ENCN_20250816_202117_289270.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1022_ENCN_20250816_202117_289270.wav", "ground_truth": "Sometimes I don't like to travel to the cities, big the modern cities. I like to to go to the , you know, the places like Xinjiang or or the or Sichuan province something something . 你像刚才你说旅行,你去西藏那边也是,西藏,像我之前去新疆那边也是,感觉感觉就是会更好的接触到当地的一些风土人情,就是走到就是真的是贴近到大自然的那个里面了。", - "original_transcription": ": 或者去一些地方看看当地的一些风土人情就是走到就是真的是贴近到大自然的那个里面了\n", "generated_transcription": ": sometimes I don't like to travel to the cities the big the modern cities I like to to go to the you know the places like 新疆 or or the or the 西川 province something something (um) 你像刚才你说旅行你去西藏那边也是西藏像我之前去新疆那边也是感觉感觉就是会 better 的接触到当地的一些风土人情就是走到就是真的是贴近到大自然的那个里面\n", - "wer_original": 0.6842105263157895, - "wer_generated": 0.18421052631578946, - "wer_improvement": 0.5, - "wer_method": "smart_mixed", - "mer_original": 0.7280701754385965 + "cer": 0.12206572769953052, + "wer": 0.22857142857142856, + "mer": 0.09433962264150944 }, { "segment_id": "SEGMENT_ZH-CN_U0040_ENCN_20250816_202117_283704", - "audio_path": "SEGMENT_ZH-CN_U0040_ENCN_20250816_202117_283704.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0040_ENCN_20250816_202117_283704.wav", "ground_truth": "反正我觉得你你的性格还是蛮好的 ,应该跟人就就是打交道,不是问题,要比较平和,蛮好的。 I know.", - "original_transcription": ": 反正我觉得你你的性格还是蛮好的应该跟人就是打交道不是问题要比较平和蛮好的, I know.\n", "generated_transcription": "\n: 反正我觉得你你的性格还是蛮好的应该跟人就是打交道不是问题要比较平和蛮好, I know\n", - "wer_original": 0.0, - "wer_generated": 0.18181818181818182, - "wer_improvement": -0.18181818181818182, - "wer_method": "smart_mixed", - "mer_original": 0.13636363636363635 + "cer": 0.047619047619047616, + "wer": 0.5, + "mer": 0.05128205128205128 }, { "segment_id": "SEGMENT_ZH-CN_U0062_MIX_20250816_202117_318200", - "audio_path": "SEGMENT_ZH-CN_U0062_MIX_20250816_202117_318200.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0062_MIX_20250816_202117_318200.wav", "ground_truth": "我之前,对,我最近最近大概两个月前在五道口那边有个 internship,也是非常无聊的工作。", - "original_transcription": ": 嗯我之前对我最近最近大概两个月前在五道口那边有个internship也是非常无聊的工作\n", "generated_transcription": "\n: 嗯我之前对我最近最近大概两个月前在五道口那边有个internship也是非常无聊的工作\n", - "wer_original": 1.0, - "wer_generated": 1.0, - "wer_improvement": 0.0, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.023809523809523808, + "wer": 1.0, + "mer": 0.030303030303030304 }, { "segment_id": "SEGMENT_ZH-CN_U0006_ENCN_20250816_202117_339639", - "audio_path": "SEGMENT_ZH-CN_U0006_ENCN_20250816_202117_339639.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0006_ENCN_20250816_202117_339639.wav", "ground_truth": "对对,就因为, 它的这个实习的性质是属于顶岗实习,就相当于你要到这个学校去。 因为我找到的是一份教师的工作,所以就相当于在。 I'll be not so relaxed.", - "original_transcription": ": 对就因为(um)他的这个实习的性质是属于顶岗实习就相当于你要到这个学校去(err)因为我找到的是一份教师的工作所以就相当于在 i'll be not so relaxed\n", "generated_transcription": "\n: 对就因为呃他的这个实习的性质是属于顶岗实习就相当于你要到这个学校去呃因为我找到的是一份教师的工作所以就相当于在 i'll be not so relaxed\n", - "wer_original": 0.12307692307692308, - "wer_generated": 0.15384615384615385, - "wer_improvement": -0.03076923076923077, - "wer_method": "smart_mixed", - "mer_original": 0.9538461538461539 + "cer": 0.056338028169014086, + "wer": 0.375, + "mer": 0.06779661016949153 }, { "segment_id": "SEGMENT_ZH-CN_U0005_ENCN_20250816_202117_348501", - "audio_path": "SEGMENT_ZH-CN_U0005_ENCN_20250816_202117_348501.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0005_ENCN_20250816_202117_348501.wav", "ground_truth": "Yes. 签售我没有去过,因为我觉得就是我追的团里面我喜欢的那个人,就我我我的本命,他有点儿贵, 可能一场要几十万。", - "original_transcription": ": yes : 哦千寿我没有去过因为我觉得就是我追的团里面我喜欢的人那个人就是我我的本命它有点贵可能一场要几十万\n", "generated_transcription": "\n: yes : 哦千寿我没有去过因为我觉得就是我追的团里面我喜欢的人那个人就是我我的本命它有点贵可能一场要几十万\n", - "wer_original": 0.9811320754716981, - "wer_generated": 0.24528301886792453, - "wer_improvement": 0.7358490566037736, - "wer_method": "smart_mixed", - "mer_original": 0.24528301886792453 + "cer": 0.14, + "wer": 0.6666666666666666, + "mer": 0.14583333333333334 }, { "segment_id": "SEGMENT_ZH-CN_U1013_MIX_20250816_202117_296981", - "audio_path": "SEGMENT_ZH-CN_U1013_MIX_20250816_202117_296981.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1013_MIX_20250816_202117_296981.wav", "ground_truth": "烩面 ,noodle 烩 noodles, 烩 noodles, 烩 noodles. So 但是对于我来说,我觉得烩面不是很好消化,就是 让身体有一些 burden 就负担,你知道吗?", - "original_transcription": ": 会面啊 noodles, 会 noodles, 会 noodles, so, 但是对于我来说我觉得会面不是很好消化就是让身体有一些, burden, 就负担你知道吗\n", "generated_transcription": "\n: 会面啊 noodles, 会 noodles, 会 noodles, 会 noodles, 所以但是对于我来说我觉得会面不是很好消化就是让身体有一些 burden 就负担你知道吗\n", - "wer_original": 0.0, - "wer_generated": 0.2916666666666667, - "wer_improvement": -0.2916666666666667, - "wer_method": "smart_mixed", - "mer_original": 0.2916666666666667 + "cer": 0.1232876712328767, + "wer": 0.6153846153846154, + "mer": 0.20454545454545456 }, { "segment_id": "SEGMENT_ZH-CN_U0033_MIX_20250816_202117_340227", - "audio_path": "SEGMENT_ZH-CN_U0033_MIX_20250816_202117_340227.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0033_MIX_20250816_202117_340227.wav", "ground_truth": "嗯,确实我觉得北京真是一个美食荒漠呀,desert,you know ?所以我觉得北京最好吃的其实是 McDonald 麦当劳。", - "original_transcription": ": 嗯确实我觉得北京真是一个美食荒漠 dessert, you know, 所以我觉得北京最好吃的其实是, Mcdonald's, Mcdonald's.\n", "generated_transcription": "\n: 嗯确实我觉得北京真是一个美食荒漠 dessert, 你懂所以我觉得北京最好吃的其实是 Mcdonald's 麦当劳\n", - "wer_original": 1.1363636363636365, - "wer_generated": 0.9545454545454546, - "wer_improvement": 0.18181818181818188, - "wer_method": "smart_mixed", - "mer_original": 1.0909090909090908 + "cer": 0.18181818181818182, + "wer": 0.8, + "mer": 0.10810810810810811 }, { "segment_id": "SEGMENT_ZH-CN_U0022_ENCN_20250816_202117_332058", - "audio_path": "SEGMENT_ZH-CN_U0022_ENCN_20250816_202117_332058.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0022_ENCN_20250816_202117_332058.wav", "ground_truth": "对,因为大家其实都更愿意在寝室里多休息一会儿,或者做一些其他的事情��那么累。因为平时上课确实已经挺累了。 Ok,hello, nice to meet you today.", - "original_transcription": ": 对因为大家其实都更愿意在寝室里多休息一会儿或者做一些其他的事情不能累因为平时上课确实已经挺累了\n", - "generated_transcription": ": 对因为大家其实都更愿意在寝室里多休息一会儿或者做一些其他的事情不能累因为平时上课确实已经挺累了\n", - "wer_original": 0.25862068965517243, - "wer_generated": 0.20689655172413793, - "wer_improvement": 0.0517241379310345, - "wer_method": "smart_mixed", - "mer_original": 0.20689655172413793 + "generated_transcription": "\n: 对因为大家其实都更愿意在寝室里多休息一会儿或者做一些其他的事情不能累因为平时上课确实已经挺累了\n", + "cer": 0.3698630136986301, + "wer": 1.0, + "mer": 0.14814814814814814 }, { "segment_id": "SEGMENT_ZH-CN_U1034_ENCN_20250816_202117_300617", - "audio_path": "SEGMENT_ZH-CN_U1034_ENCN_20250816_202117_300617.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1034_ENCN_20250816_202117_300617.wav", "ground_truth": "I often. 我现在呢是在做一家财税的销售。", - "original_transcription": ": 哎奥芬嗯我现在呢是在做一家彩税的销售\n", - "generated_transcription": ": 哎奥芬嗯我现在呢是在做一家彩税的销售\n", - "wer_original": 0.23529411764705882, - "wer_generated": 0.35294117647058826, - "wer_improvement": -0.11764705882352944, - "wer_method": "smart_mixed", - "mer_original": 0.35294117647058826 + "generated_transcription": "\n: 哎奥芬嗯我现在呢是在做一家采税的销售\n", + "cer": 0.35, + "wer": 1.0, + "mer": 0.3125 }, { "segment_id": "SEGMENT_ZH-CN_U0092_ENCN_20250816_202117_314340", - "audio_path": "SEGMENT_ZH-CN_U0092_ENCN_20250816_202117_314340.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0092_ENCN_20250816_202117_314340.wav", "ground_truth": "Ok, yeah. 那就证明其实 我是有能力来完成这件事情的,这个时候对于我来说它就是坚持,对,另外一方面是考虑得失吧,就是可能在别人来说,或者说在大局上来说 这件事情是要衡量利弊的,那可能在那个时候我会觉得我当时是固执的。", - "original_transcription": ": 好的,那就证明其实我是有能力来完成这件事情的,这个时候对于我来说,它就是坚持,另外一方面是考虑得失吧,就是可能在别人来说或者说在大体上来说,这件事情是要衡量利弊的,那可能在那个时候我会觉得我当时是固执的。\n", - "generated_transcription": ": 好的yeah 那就证明其实我是有能力来完成这件事情的这个时候对于我来说它就是坚持另外一方面是考虑得失吧就是可能在别人来说或者说在大体上来说这个事情是要衡量利弊的可能在那个时候我会觉得我当时是固执的\n", - "wer_original": 0.9803921568627451, - "wer_generated": 0.12745098039215685, - "wer_improvement": 0.8529411764705882, - "wer_method": "smart_mixed", - "mer_original": 0.08823529411764706 + "generated_transcription": "\n: 好的yeah 那就证明其实我是有能力来完成这件事情的这个时候对于我来说它就是坚持另外一方面是考虑得失吧就是可能在别人来说或者说在大体上来说这个事情是要衡量利弊的可能在那个时候我会觉得我当时是固执的\n", + "cer": 0.06060606060606061, + "wer": 1.0, + "mer": 0.06315789473684211 }, { "segment_id": "SEGMENT_ZH-CN_U0102_ENCN_20250816_202117_307560", - "audio_path": "SEGMENT_ZH-CN_U0102_ENCN_20250816_202117_307560.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0102_ENCN_20250816_202117_307560.wav", "ground_truth": "是是是。 Okay okay. You know that the corporation between between China and America is is down right? It's is is do is doping is jumping. They are seen a lot of conflicts between China and America. 那。", - "original_transcription": ": 是是是 : Okay, okay, (err), you know that the cooperation between between China and America is is down right, is is is du~ is du~ is dupping is dropping. (err), they are seeing a lot of conflicts between China and America.\n", - "generated_transcription": ": 是是是 : Okay, okay, (err), you know that the cooperation between between China and America is is down right, is is is du~ is du~ is du~ is dropping. They are seeing a lot of conflicts between China and America.\n", - "wer_original": 1.1951219512195121, - "wer_generated": 0.36585365853658536, - "wer_improvement": 0.8292682926829268, - "wer_method": "smart_mixed", - "mer_original": 0.4146341463414634 + "generated_transcription": "\n: 是是是 : Okay, okay, (err), you know that the cooperation between between China and America is is down right, is is is du~ is du~ is du~ is dropping. They are seeing a lot of conflicts between China and America.\n", + "cer": 0.12418300653594772, + "wer": 0.2702702702702703, + "mer": 0.2564102564102564 }, { "segment_id": "SEGMENT_ZH-CN_U1091_ENCN_20250816_202117_306762", - "audio_path": "SEGMENT_ZH-CN_U1091_ENCN_20250816_202117_306762.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1091_ENCN_20250816_202117_306762.wav", "ground_truth": "So let's talk about your school life in your college. 嗯。", - "original_transcription": ": So let's talk about your school life in your college\n", - "generated_transcription": ": So let's talk about your school life in your college\n", - "wer_original": 0.16666666666666666, - "wer_generated": 0.25, - "wer_improvement": -0.08333333333333334, - "wer_method": "smart_mixed", - "mer_original": 0.25 + "generated_transcription": "\n: So let's talk about your school life in your college\n", + "cer": 0.023255813953488372, + "wer": 0.09090909090909091, + "mer": 0.09090909090909091 }, { "segment_id": "SEGMENT_ZH-CN_U0011_ENCN_20250816_202117_322574", - "audio_path": "SEGMENT_ZH-CN_U0011_ENCN_20250816_202117_322574.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0011_ENCN_20250816_202117_322574.wav", "ground_truth": "So what could you possibly use for your new laptop. 哦,我有我我以为他们都是他们都是很很很很年会像像我这个年纪会会听的,但是没想到现在大学生也也会听他们。", - "original_transcription": ": So what could you possibly use for your new laptop? : (oh), 我我我我以为他们都是他们都是很很很很也会像像我这个年纪就会会听的但是没想到现在大学校也也会听他们,\n", - "generated_transcription": ": So what could you possibly use for your new laptop? : (oh) 我我我我以为他们都是他们都是很很很很也会像像我这个年纪就会会听的但是没想到现在大学校, 也也会听他们\n", - "wer_original": 0.8360655737704918, - "wer_generated": 0.16393442622950818, - "wer_improvement": 0.6721311475409837, - "wer_method": "smart_mixed", - "mer_original": 0.14754098360655737 + "generated_transcription": "\n: So what could you possibly use for your new laptop? : (oh) 我我我我以为他们都是他们都是很很很很也会像像我这个年纪就会会听的但是没想到现在大学校, 也也会听他们\n", + "cer": 0.06741573033707865, + "wer": 0.2727272727272727, + "mer": 0.08620689655172414 }, { "segment_id": "SEGMENT_ZH-CN_U0069_MIX_20250816_202117_290488", - "audio_path": "SEGMENT_ZH-CN_U0069_MIX_20250816_202117_290488.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0069_MIX_20250816_202117_290488.wav", "ground_truth": "遇到一些 difficulties 啊,然后同时呢,你会 face all kinds of you know 就是就是这个。", - "original_transcription": ": 遇到一些 difficulties, 然后同时呢你会 face all kinds of you know, 就是就是这个\n", - "generated_transcription": ": 遇到一些 difficulties 然后同时呢你会 face all kinds of you you know 就是就是这个\n", - "wer_original": 0.42857142857142855, - "wer_generated": 0.17857142857142858, - "wer_improvement": 0.24999999999999997, - "wer_method": "smart_mixed", - "mer_original": 0.21428571428571427 + "generated_transcription": "\n: 遇到一些 difficulties 然后同时呢你会 face all kinds of you you know 就是就是这个\n", + "cer": 0.0784313725490196, + "wer": 0.2, + "mer": 0.08 }, { "segment_id": "SEGMENT_ZH-CN_U0030_ENCN_20250816_202117_266130", - "audio_path": "SEGMENT_ZH-CN_U0030_ENCN_20250816_202117_266130.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0030_ENCN_20250816_202117_266130.wav", "ground_truth": "哦。 Yeah. 还有什么?还有什么? Kindly die, why are you so funny? You are so interesting.", - "original_transcription": ": (oh), (oh), yeah. : 还有什么还有什么, (oh), 看不见, (err), where is it? So fast? You are so interesting.\n", - "generated_transcription": ": (oh), (oh) ya, : 还有什么还有什么, (oh), 看不懂, 我, where is it been so fast? You are so interesting.\n", - "wer_original": 1.0, - "wer_generated": 0.7083333333333334, - "wer_improvement": 0.29166666666666663, - "wer_method": "smart_mixed", - "mer_original": 0.6666666666666666 + "generated_transcription": "\n: (oh), (oh) ya, : 还有什么还有什么, (oh), 看不懂, 我, where is it been so fast? You are so interesting.\n", + "cer": 0.49122807017543857, + "wer": 0.7857142857142857, + "mer": 0.6190476190476191 }, { "segment_id": "SEGMENT_ZH-CN_U0017_MIX_20250816_202117_264140", - "audio_path": "SEGMENT_ZH-CN_U0017_MIX_20250816_202117_264140.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0017_MIX_20250816_202117_264140.wav", "ground_truth": "Become rich again,然后我觉得在这个小说中嗯的我的 the female actress 赫思嘉,是非常的有嗯值得值得学习的地方,然后。", - "original_transcription": ": become rich again 然后我觉得在这个小说中我的 female actress 和私家是非常有值得值得学习的地方然后\n", - "generated_transcription": ": become rich again 然后我觉得在这个小说中我的 female actress 和私家是非常的有值得值得学习的地方\n", - "wer_original": 0.9310344827586207, - "wer_generated": 0.8275862068965517, - "wer_improvement": 0.10344827586206895, - "wer_method": "smart_mixed", - "mer_original": 0.7931034482758621 + "generated_transcription": "\n: become rich again 然后我觉得在这个小说中 (um) 我的 female actress 和私家是非常的有 (um) 值得值得学习的地方\n", + "cer": 0.18181818181818182, + "wer": 1.0, + "mer": 0.21951219512195122 }, { "segment_id": "SEGMENT_ZH-CN_U0033_ENCN_20250816_202117_341304", - "audio_path": "SEGMENT_ZH-CN_U0033_ENCN_20250816_202117_341304.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0033_ENCN_20250816_202117_341304.wav", "ground_truth": "嗯。 For me? 哦,你们学校没有健身房吗? So have you have you went to Beijing Zoo?", - "original_transcription": ": (hm), (uh), for me, (oh), 你们学校有没有健身房吗, : So have you (uh), have you went to Beijing, too?\n", - "generated_transcription": ": (hm) : (uh) for me, (oh) 你们学校没有健身房吗 : So have you (uh) have you went to Beijing, too?\n", - "wer_original": 1.0, - "wer_generated": 0.38461538461538464, - "wer_improvement": 0.6153846153846154, - "wer_method": "smart_mixed", - "mer_original": 0.4230769230769231 + "generated_transcription": "\n: (hm) : (uh) for me, (oh) 你们学校没有健身房吗 : So have you (uh) have you went to Beijing, too?\n", + "cer": 0.1836734693877551, + "wer": 0.46153846153846156, + "mer": 0.21739130434782608 }, { "segment_id": "SEGMENT_ZH-CN_U0061_ENCN_20250816_202117_265528", - "audio_path": "SEGMENT_ZH-CN_U0061_ENCN_20250816_202117_265528.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0061_ENCN_20250816_202117_265528.wav", "ground_truth": "就是 真的很疯狂,他直接开始内卷开来了,是吗 ? Yeah.", - "original_transcription": ": 就真的很疯狂, 他直接开始内讲开了是吗, : Yup.\n", - "generated_transcription": ": 就真的很疯狂, 他直接开始内讲开了是吗, : Yup\n", - "wer_original": 0.9565217391304348, - "wer_generated": 0.30434782608695654, - "wer_improvement": 0.6521739130434783, - "wer_method": "smart_mixed", - "mer_original": 0.30434782608695654 + "generated_transcription": "\n: 就真的很疯狂, 他直接开始内讲开了是吗\n", + "cer": 0.30434782608695654, + "wer": 1.0, + "mer": 0.2 }, { "segment_id": "SEGMENT_ZH-CN_U1103_ENCN_20250816_202117_272233", - "audio_path": "SEGMENT_ZH-CN_U1103_ENCN_20250816_202117_272233.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1103_ENCN_20250816_202117_272233.wav", "ground_truth": "One boy, one girl. 嗯。 嗯嗯嗯。 You don't like.", - "original_transcription": ": One boy one girl (hm) (hm) (hm)(hm) you don't like\n", - "generated_transcription": ": one boy one girl\n", - "wer_original": 0.46153846153846156, - "wer_generated": 0.9230769230769231, - "wer_improvement": -0.46153846153846156, - "wer_method": "smart_mixed", - "mer_original": 0.7692307692307693 + "generated_transcription": "\n: one boy one girl\n", + "cer": 0.5357142857142857, + "wer": 0.5555555555555556, + "mer": 0.6363636363636364 }, { "segment_id": "SEGMENT_ZH-CN_U0041_MIX_20250816_202117_269594", - "audio_path": "SEGMENT_ZH-CN_U0041_MIX_20250816_202117_269594.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0041_MIX_20250816_202117_269594.wav", "ground_truth": "你们通常都是在weekend go to the park, right 这是这样吗?", - "original_transcription": ": 你们通常都是在, weekend, (err), go to the park, 这样吗\n", - "generated_transcription": ": 你们通常都是在 weekend, (err) go to the park, 是这样吗\n", - "wer_original": 1.25, - "wer_generated": 1.0, - "wer_improvement": 0.25, - "wer_method": "smart_mixed", - "mer_original": 1.1666666666666667 + "generated_transcription": "\n: 你们通常都是在 weekend, (err) go to the park, 是这样吗\n", + "cer": 0.2571428571428571, + "wer": 0.7142857142857143, + "mer": 0.16666666666666666 }, { "segment_id": "SEGMENT_ZH-CN_U0070_ENCN_20250816_202117_290133", - "audio_path": "SEGMENT_ZH-CN_U0070_ENCN_20250816_202117_290133.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0070_ENCN_20250816_202117_290133.wav", "ground_truth": "嗯。 Yeah. Yeah. 嗯。", - "original_transcription": ": (um) yeah yeah (um)\n", - "generated_transcription": ": (um) yeah yeah (um)\n", - "wer_original": 0.6666666666666666, - "wer_generated": 1.0, - "wer_improvement": -0.33333333333333337, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "generated_transcription": "\n: (um) yeah yeah\n", + "cer": 0.3, + "wer": 0.5, + "mer": 0.5 }, { "segment_id": "SEGMENT_ZH-CN_U2013_MIX_20250816_202117_333260", - "audio_path": "SEGMENT_ZH-CN_U2013_MIX_20250816_202117_333260.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U2013_MIX_20250816_202117_333260.wav", "ground_truth": "当然啦那里的那里的 park 有一个非常非常大的湖,我们每次去那里散步的时候,都能感觉到非常的恰意,非常的自由。", - "original_transcription": ": 当然啦那里的那里的泡有一个非常非常大的湖我们每次去那里散步的时候都能感觉非常的惬意非常的自由\n", - "generated_transcription": ": 当然啦那里的那里的泡有一个非常非常大的湖我们每次去那里散步的时候都能感觉非常的惬意非常的自由\n", - "wer_original": 0.0392156862745098, - "wer_generated": 0.13725490196078433, - "wer_improvement": -0.09803921568627452, - "wer_method": "smart_mixed", - "mer_original": 0.13725490196078433 + "generated_transcription": "\n: 当然啦那里的那里的泡有一个非常非常大的湖我们每次去那里散步的时候都能感觉非常的惬意非常的自由\n", + "cer": 0.12, + "wer": 1.0, + "mer": 0.06382978723404255 }, { "segment_id": "SEGMENT_ZH-CN_U1050_ENCN_20250816_202117_299734", - "audio_path": "SEGMENT_ZH-CN_U1050_ENCN_20250816_202117_299734.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1050_ENCN_20250816_202117_299734.wav", "ground_truth": "哦,他最近他 他最近。 I know Taylor Swift, right?", - "original_transcription": ": 哦他最近他嗯他最近I know Taylor Swift right\n", - "generated_transcription": ": 他最近他嗯他最近 I know Taylor Swift right\n", - "wer_original": 0.6666666666666666, - "wer_generated": 0.4, - "wer_improvement": 0.2666666666666666, - "wer_method": "smart_mixed", - "mer_original": 0.8666666666666667 + "generated_transcription": "\n: 哦 他最近他嗯 他最近 I know Taylor Swift right\n", + "cer": 0.034482758620689655, + "wer": 0.2857142857142857, + "mer": 0.07692307692307693 }, { "segment_id": "SEGMENT_ZH-CN_U0061_MIX_20250816_202117_264738", - "audio_path": "SEGMENT_ZH-CN_U0061_MIX_20250816_202117_264738.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0061_MIX_20250816_202117_264738.wav", "ground_truth": "我最喜欢唱的歌儿。其实我觉得就梅梅的歌最适合我唱的是叫 Nothing New, 就她那个 Red T V 里的佳曲。", - "original_transcription": ": 我最喜欢唱的歌其实我觉得这美名的歌最适合我唱的是叫nothing new就她那个ray Tidy的家曲\n", "generated_transcription": "\n: 我最喜欢唱的歌其实我觉得这美美的歌最适合我唱的是叫nothing new就她那个rap title 的家曲\n", - "wer_original": 0.6585365853658537, - "wer_generated": 0.9512195121951219, - "wer_improvement": -0.2926829268292682, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.22448979591836735, + "wer": 1.0, + "mer": 0.23076923076923078 }, { "segment_id": "SEGMENT_ZH-CN_U1005_ENCN_20250816_202117_346824", - "audio_path": "SEGMENT_ZH-CN_U1005_ENCN_20250816_202117_346824.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1005_ENCN_20250816_202117_346824.wav", "ground_truth": "但是归根到底就是哪怕你躺平闲闲逛的那个时间过了。但是只要你觉得你自己是快乐自由的,那你那别人,对吧,在在别人眼中你是一个什么废物或者是说 躺平的啃老的,在我看来都无所谓啊,只要你觉得你这一生值得或者说够了也就行了。反正本来就是人生不如意事十之八九是吧,你常常想的都是那一二分的甜和好玩的事儿。 is Sichuan Normal University. You know, the land mountain .", - "original_transcription": ": 但是归根到底就是哪怕你躺平闲逛的那个时间过了但是只要你觉得你自己是快乐自由的那你那别人对吧在别人眼中你是一个什么废物或者是说躺平的啃老的在我看来都无所谓啊只要你觉得你这一生值得或者说够了也就行了反正本就是人生不如意是十之八九是吧你常常想的都是那一二分的甜和好玩的事儿啊, (um), shi-chuan-n-n-n-university, you know the land mountain.\n", "generated_transcription": ": 但是归根到底就是哪怕你躺平闲逛的那个时间过了但是只要你觉得你自己是快乐自由的那你那别人对吧在别人眼中你是一个什么废物或者是说躺平的啃老的在我看来都无所谓啊只要你觉得你这一生值得或者说够了也就行了反正本就是人生不如意是十之八九是吧你常常想的都是那一二分的甜和好玩的事儿 : (hm) : 嗯 : 嗯\n", - "wer_original": 0.9483870967741935, - "wer_generated": 0.14838709677419354, - "wer_improvement": 0.8, - "wer_method": "smart_mixed", - "mer_original": 0.13548387096774195 + "cer": 0.2677595628415301, + "wer": 1.0, + "mer": 0.0896551724137931 }, { "segment_id": "SEGMENT_ZH-CN_U1001_MIX_20250816_202117_313237", - "audio_path": "SEGMENT_ZH-CN_U1001_MIX_20250816_202117_313237.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1001_MIX_20250816_202117_313237.wav", "ground_truth": "对,我第一次用小红书的时候,它会让我 choose 几个就是我比较感兴趣的 hobbies, 当时我就选择了那个 music 音乐,然后还有一些 food 美食,以及一些 taking photos 摄影那些。然后选择了几个 topics 之后,我点进去,然后他就给我推了许多我感兴趣的 videos, 然后每天就刷那些,然后还还会不停的收藏,就点一下那个 enjoy,然后收藏之后就会他就会按照你自己平时刷的那些 videos 来给你 recomend 的一些你爱刷的,就是他会感应到你喜欢刷的,就给你推相应的视频。", - "original_transcription": ": 我第一次用小红书的时候它会让我choose几个就是我比较感兴趣的hobby然后当时我就选择了那个music音乐然后还有一些food美食以及一些taking photos摄影那些然后选择了几个topics之后我点进去然后它就给我推了许多我感兴趣的videos然后每天就刷那些然后还还会不停地收藏然后点一下那个enjoy然后收藏之后它就会按照你自己平时刷的那些videos来给你recommend一些你爱刷的就是它会感应到你喜欢刷的就给你推相应视频\n", "generated_transcription": ": 我第一次用小红书的时候他会让我choose几个就是我比较感兴趣的hobby然后当时我就选择了那个music音乐然后还有一些food美食以及一些taking photos摄影那些然后选择了几个topics之后我点进去然后他就给我推了许多我感兴趣的videos然后每天就刷那些然后还还会不停地收藏就点一下那个enjoy然后收藏之后就会它就会按照你自己平时刷的那些videos来给你recommend一些你爱刷的就是它会感应到你喜欢刷的就给你推相应视频\n", - "wer_original": 0.9875, - "wer_generated": 1.0, - "wer_improvement": -0.012499999999999956, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.049107142857142856, + "wer": 1.0, + "mer": 0.06470588235294118 }, { "segment_id": "SEGMENT_ZH-CN_U0043_ENCN_20250816_202117_285871", - "audio_path": "SEGMENT_ZH-CN_U0043_ENCN_20250816_202117_285871.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0043_ENCN_20250816_202117_285871.wav", "ground_truth": "其实刚开始去小小城市的话,我是不太适应的。因为喜欢那种热闹啊,喜欢大城市的那种氛围,啊,但是感觉生活了几年以后,现在也开始适应。 Yeah.", - "original_transcription": ": 呃其实刚开始去小小城市的话我是不太适应的因为喜欢那种热闹啊喜欢大城市的那种氛围啊但是感觉生活了几年以后现在也开始适应了\n", "generated_transcription": "\n: 呃其实刚开始去小小城市的话我是不太适应的因为喜欢那种热闹啊喜欢大城市的那种氛围啊但是感觉生活了几年以后现在也开始适应\n", - "wer_original": 0.9846153846153847, - "wer_generated": 0.13846153846153847, - "wer_improvement": 0.8461538461538463, - "wer_method": "smart_mixed", - "mer_original": 0.13846153846153847 + "cer": 0.08196721311475409, + "wer": 1.0, + "mer": 0.034482758620689655 }, { "segment_id": "SEGMENT_ZH-CN_U0045_MIX_20250816_202117_315534", - "audio_path": "SEGMENT_ZH-CN_U0045_MIX_20250816_202117_315534.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0045_MIX_20250816_202117_315534.wav", "ground_truth": "Actually I don't care about 公司的那些文化什么的。 I think they're not very important。 对于我来说, 我我我还是比较关注,就是这这我干的这个事情,whether is 真的符合 my interest or not。 所以对于我来说,可能还是就是 跟跟跟这个 company 的它的 culture 是没有什么太大的关系的,和 environment 不是特别重要吧。对于我来说,but I think most important thing for me is to 做我喜欢做的事情,并且我需要有一个很 creative 的工作环境。我我我真的很希望那种就是我 I don't like fixed work, so 我比较喜欢那些可以让自己找到发挥余地的。And some creative points 的工作。", - "original_transcription": ": (err) actually I don't care about 公司的那些文化什么的 (err) I think they're not very important 对我来说 (um) 我我我还是比较关注就是我干的这个事情 whether it's 真的符合 my interest or not 所以对我来说可能还是就是 (um) 跟跟跟这个 company 的它的 culture 是没有什么太大的关系的和 environment 不是特别重要对我说 but I think (ah) the most important thing for me is to 做我喜欢做的事情并且我需要有一个很 creative 的工作环境我我真的很希望那种就是我 I don't like fixed work so 我比较喜欢那些可以让自己找到发挥余力的 and some creative points 的工作\n", "generated_transcription": ": 呃 actually 我不关心公司的那些文化什么的呃 I think 他们不是 very important 对于我来说嗯我我我还是比较关注就是我干的这个事情 whether is 真的符合 my interest or not 所以对于我来说可能还是就是嗯跟跟跟这个 company 的它的 culture 是没有什么太大的关系的和 environment 不是特别重要对我不来说 but I think (ah) most important thing for me is to 做我喜欢做���事情并且我需要有一个很 creative 的工作环境我我真的很希望那种就是我 I don't like fixed work 所以我比较喜欢那些可以让自己找到发挥余地的 and some creative points 的工作\n", - "wer_original": 0.37681159420289856, - "wer_generated": 0.5507246376811594, - "wer_improvement": -0.17391304347826086, - "wer_method": "smart_mixed", - "mer_original": 0.4927536231884058 + "cer": 0.11384615384615385, + "wer": 0.35294117647058826, + "mer": 0.11538461538461539 }, { "segment_id": "SEGMENT_ZH-CN_U0073_MIX_20250816_202117_338233", - "audio_path": "SEGMENT_ZH-CN_U0073_MIX_20250816_202117_338233.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0073_MIX_20250816_202117_338233.wav", "ground_truth": "那比如说你的工作有没有一些 other area you are very 满意的呢?", - "original_transcription": ": 嗯那比如说你的工作有没有一些其他area 你很满意的呢\n", "generated_transcription": ": 嗯那比如说你的工作有没有一些其他 area 你很满意的呢\n", - "wer_original": 0.8695652173913043, - "wer_generated": 0.30434782608695654, - "wer_improvement": 0.5652173913043478, - "wer_method": "smart_mixed", - "mer_original": 0.782608695652174 + "cer": 0.4444444444444444, + "wer": 0.8571428571428571, + "mer": 0.2727272727272727 }, { "segment_id": "SEGMENT_ZH-CN_U0043_MIX_20250816_202117_285270", - "audio_path": "SEGMENT_ZH-CN_U0043_MIX_20250816_202117_285270.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0043_MIX_20250816_202117_285270.wav", "ground_truth": "对,因为在这种 春季到夏季之间的时候,然后树它会有很多的这种 pollen 这种花粉。然后呢它会引起人们的这种 asthma 这种反应是一种过敏的反应。那很多严重的人呢需要靠那种药物啊,像 因为我 像那种就 salbutamol 啊,很多别的药物然后来抗过敏,这个确实感觉 对生活影响挺大的对。", - "original_transcription": ": 对因为在这种啊春季到夏季之间的时候然后树它会有很多的这种花粉这种花粉然后呢它会引起人们的这种过敏这种反应很多严重的人呢需要靠那种药物啊像啊因为我啊像那种就是sublimate啊很多别的药物然后来看过敏这个确实感觉对生活影响挺大的\n", "generated_transcription": ": 对因为在这种啊春季到夏季之间的时候然后树它会有很多的这种 pollen这种花粉然后呢它会引起人们的这种 asthma这种过敏的反应这种过敏的反应那很多严重的人呢是要靠那种药物啊像啊因为我啊像那种就是 sublimate啊很多别的药物然后来看过敏这个确实感觉对生活影响挺大的\n", - "wer_original": 0.9915254237288136, - "wer_generated": 0.7796610169491526, - "wer_improvement": 0.211864406779661, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.15503875968992248, + "wer": 1.0, + "mer": 0.11818181818181818 }, { "segment_id": "SEGMENT_ZH-CN_U0072_MIX_20250816_202117_275920", - "audio_path": "SEGMENT_ZH-CN_U0072_MIX_20250816_202117_275920.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0072_MIX_20250816_202117_275920.wav", "ground_truth": "The thrones 就是权力的游戏这个 this kind of series and when talk about the minority。I think kind of the books did attach the great importance to this minority or and you can say the marginal people 边缘人群 the American culture 。", - "original_transcription": ": the songs, 就是权利的游戏这个, (um), this kind of series, (um), and when talk about the minority, I think (um), kind of the books did it attach the great importance to this minority, or you can say the marginal people, 边缘人群, (um), the American (um) culture, (um).\n", "generated_transcription": ": the songs 就是权利的游戏这个 this kind of series (um) and when talk about the minority I think (um) kind of the books did attach the great importance to this minority or you can say the marginal people 边缘人群 the American (um) culture (um).\n", - "wer_original": 0.98, - "wer_generated": 0.2, - "wer_improvement": 0.78, - "wer_method": "smart_mixed", - "mer_original": 0.38 + "cer": 0.09090909090909091, + "wer": 0.23684210526315788, + "mer": 0.1836734693877551 }, { "segment_id": "SEGMENT_ZH-CN_U1078_MIX_20250816_202117_295153", - "audio_path": "SEGMENT_ZH-CN_U1078_MIX_20250816_202117_295153.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1078_MIX_20250816_202117_295153.wav", "ground_truth": "非常 different。因为我专业的话其实可能更偏向于 一些 words。所以我们的考核形式更多是 writes 很多的论文,包括一些平常的 small homework,我们说是小作业,也是以论文的这种 formal 来进行的。", - "original_transcription": ": 非常different因为我的专业的话其实可能更偏向于呃一些word所以我们的考核形式更多是写很多的论文包括一些平常的呃小作业也是以论文的这种formal来进行的\n", "generated_transcription": ": 非常different因为我的专业的话其实可能更偏向于呃一些word所以我们的考核形式更多是write呃很多的论文包括一些平常的呃small homework我们说是小作业也是以论文的这种formal来进行的\n", - "wer_original": 0.0, - "wer_generated": 1.0, - "wer_improvement": -1.0, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.04950495049504951, + "wer": 1.0, + "mer": 0.08823529411764706 }, { "segment_id": "SEGMENT_ZH-CN_U0033_ENCN_20250816_202117_340637", - "audio_path": "SEGMENT_ZH-CN_U0033_ENCN_20250816_202117_340637.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0033_ENCN_20250816_202117_340637.wav", "ground_truth": "Yes. 嗯,是呢,感觉学日语适合那些喜欢啊日漫文化啊的人呢。", - "original_transcription": ": 嗯嗯是呢感觉学日语适合那些喜欢啊日漫文化的人呢\n", "generated_transcription": ": 嗯 yes 嗯 是呢感觉学日语适合那些喜欢啊日漫文化的人呢\n", - "wer_original": 0.9629629629629629, - "wer_generated": 0.2222222222222222, - "wer_improvement": 0.7407407407407407, - "wer_method": "smart_mixed", - "mer_original": 0.18518518518518517 + "cer": 0.07692307692307693, + "wer": 1.5, + "mer": 0.08333333333333333 }, { "segment_id": "SEGMENT_ZH-CN_U0004_MIX_20250816_202117_320436", - "audio_path": "SEGMENT_ZH-CN_U0004_MIX_20250816_202117_320436.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0004_MIX_20250816_202117_320436.wav", "ground_truth": "啊?What? 是的,就有的人第一次看 see see me 的时候呢,觉得我可能二十出头儿,我还今天呢故意的穿那么 mature 一点儿, 但是。", - "original_transcription": ": 嗯是的就有的第一次看seesee me的时候觉得我可能二十出头我还今天呢故意的穿那么穿儿一点儿但是\n", "generated_transcription": ": 嗯是的就有的人第一次看see me的时候觉得我可能二十出头我还今天呢故意的穿那么穿儿一点儿\n", - "wer_original": 0.94, - "wer_generated": 1.0, - "wer_improvement": -0.06000000000000005, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.3050847457627119, + "wer": 1.0, + "mer": 0.1956521739130435 }, { "segment_id": "SEGMENT_ZH-CN_U0073_MIX_20250816_202117_338307", - "audio_path": "SEGMENT_ZH-CN_U0073_MIX_20250816_202117_338307.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0073_MIX_20250816_202117_338307.wav", "ground_truth": "It's not very it's not that kind of drum. It's just something it's like a ball. 就像我们平时吃饭的一个碗,but in different kind of material,it's a metal 碗。", - "original_transcription": ": it's not very. It's not that kind of drum. It's just something. It's like a ball. 就像我们平时吃饭的碗, but in different kind of material. It's a metal bowl.\n", "generated_transcription": ": it's not well. It's not that kind of drum. It's just something. It's like a ball. 就像我们平时吃饭的碗, but in different kind of material. It's a metal bowl.\n", - "wer_original": 0.6538461538461539, - "wer_generated": 0.8076923076923077, - "wer_improvement": -0.15384615384615385, - "wer_method": "smart_mixed", - "mer_original": 0.8076923076923077 + "cer": 0.08333333333333333, + "wer": 0.24, + "mer": 0.16216216216216217 }, { "segment_id": "SEGMENT_ZH-CN_U0074_MIX_20250816_202117_277883", - "audio_path": "SEGMENT_ZH-CN_U0074_MIX_20250816_202117_277883.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0074_MIX_20250816_202117_277883.wav", "ground_truth": "对,也是时不时总结一下 部门近期的一些 work 一些工作。", - "original_transcription": ": 对也是时不时总结一下部门近期的一些work一些工作\n", "generated_transcription": ": 对也是时不时总结一下部门近期的一些work一些工作\n", - "wer_original": 0.3333333333333333, - "wer_generated": 1.0, - "wer_improvement": -0.6666666666666667, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.0, + "wer": 1.0, + "mer": 0.0 }, { "segment_id": "SEGMENT_ZH-CN_U1039_ENCN_20250816_202117_292710", - "audio_path": "SEGMENT_ZH-CN_U1039_ENCN_20250816_202117_292710.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1039_ENCN_20250816_202117_292710.wav", "ground_truth": "就是四食堂,四十四川的有点远,就没去过。 Yes, that's that's a pretty good.", - "original_transcription": ": 就是四十parsedMessage 嗯嗯四四十parsedMessage 嗯嗯有点远就没去过\n", "generated_transcription": ": 就是四十娘四四娘有点远就没去过\n", - "wer_original": 0.5, - "wer_generated": 0.5384615384615384, - "wer_improvement": -0.038461538461538436, - "wer_method": "smart_mixed", - "mer_original": 0.7307692307692307 + "cer": 0.7073170731707317, + "wer": 1.0, + "mer": 0.4782608695652174 }, { "segment_id": "SEGMENT_ZH-CN_U0006_MIX_20250816_202117_339524", - "audio_path": "SEGMENT_ZH-CN_U0006_MIX_20250816_202117_339524.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0006_MIX_20250816_202117_339524.wav", "ground_truth": "Emotional value 的话,除了emotional value 的话,我觉得可能就是。 就是能够根据你给他的需求,比如说你你变得 bossy, 你变得就霸道总裁一点,然后他的回答你的问题的方式就会变得 bossy, 我觉得这样还挺好玩儿的。", - "original_transcription": ": 嗯, 隐形暴力的话, 除了隐形暴力的话, 我觉得可能就是, 就是能够根据你给他的需求比如说你变得bossy, 你变得就霸道总裁一点然后他的回答你的问题的方式就会变得特别bossy, 我觉得这样还挺好玩儿的\n", "generated_transcription": ": 嗯, 隐身模式, 有, 那么, 除了隐身模式, 有, 我觉得可能就是, 就是能够, 根据你给他的需求比如说你, 你变得bossy, 你变得就霸道总裁一点, 然后他的回答你的问题的方式就会变得特别bossy, 我觉得这样还挺好玩儿的\n", - "wer_original": 0.0, - "wer_generated": 0.6233766233766234, - "wer_improvement": -0.6233766233766234, - "wer_method": "smart_mixed", - "mer_original": 0.8181818181818182 + "cer": 0.3148148148148148, + "wer": 1.2, + "mer": 0.19736842105263158 }, { "segment_id": "SEGMENT_ZH-CN_U0004_ENCN_20250816_202117_321064", - "audio_path": "SEGMENT_ZH-CN_U0004_ENCN_20250816_202117_321064.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0004_ENCN_20250816_202117_321064.wav", "ground_truth": "对对,你平时上微博吗? Go. You you you won't fail to to to go to to pursue your master's degree.", - "original_transcription": ": 对对, 你平时上微博吗, (uh), go, you, you, you won't (uh), fail to to to go to to to pursue your master's degree.\n", "generated_transcription": ": 对对, 你平时上微博吗, (err) go you you you won't (err) fail to to to go to to pursue your master's degree\n", - "wer_original": 0.14814814814814814, - "wer_generated": 0.25925925925925924, - "wer_improvement": -0.1111111111111111, - "wer_method": "smart_mixed", - "mer_original": 0.2962962962962963 + "cer": 0.09523809523809523, + "wer": 0.23529411764705882, + "mer": 0.08 }, { "segment_id": "SEGMENT_ZH-CN_U1040_ENCN_20250816_202117_312258", - "audio_path": "SEGMENT_ZH-CN_U1040_ENCN_20250816_202117_312258.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1040_ENCN_20250816_202117_312258.wav", "ground_truth": "I I don't think so, because I I like reading too, I like read novels before sleeping. 你逃过课吗?", - "original_transcription": ": (oh) I I don't think so because I I like reading too I like read novels before sleeping : 你逃过课吗\n", "generated_transcription": ": (oh) I I don't think so because I I like reading too I like read novels before sleeping\n: 你逃过课吗\n", - "wer_original": 0.34782608695652173, - "wer_generated": 0.21739130434782608, - "wer_improvement": 0.13043478260869565, - "wer_method": "smart_mixed", - "mer_original": 0.21739130434782608 + "cer": 0.02857142857142857, + "wer": 0.05555555555555555, + "mer": 0.045454545454545456 }, { "segment_id": "SEGMENT_ZH-CN_U1103_MIX_20250816_202117_271939", - "audio_path": "SEGMENT_ZH-CN_U1103_MIX_20250816_202117_271939.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1103_MIX_20250816_202117_271939.wav", "ground_truth": "我以前的那家公司,他们内是在 Han Guo Korea ** 。韩国公州。", - "original_transcription": ": 嗯我以前的那家公司company是在韩国Korea公州韩国公州\n", "generated_transcription": ": 嗯我以前的那家公司company是在韩国Korea公州韩国公州\n", - "wer_original": 0.875, - "wer_generated": 1.0, - "wer_improvement": -0.125, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.5, + "wer": 1.0, + "mer": 0.4 }, { "segment_id": "SEGMENT_ZH-CN_U1071_MIX_20250816_202117_271416", - "audio_path": "SEGMENT_ZH-CN_U1071_MIX_20250816_202117_271416.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1071_MIX_20250816_202117_271416.wav", "ground_truth": "对, your say say is good, I think。 I know I know for me for me 我来说的话 在 四川四川这边 。", - "original_transcription": ": (ah) 对 (ah) yes yes yes good (ah) I think (ah) I know I know for me for me for me 我来说的话 (ah) 在四川四川这边 (ah)\n", "generated_transcription": ": (嗯) 对 (嗯) 这是 (嗯) 很好 (嗯) 我知道 (嗯) 我知道 (嗯) 对我来说的话 (嗯) 在四川四川这边\n", - "wer_original": 0.8, - "wer_generated": 1.1, - "wer_improvement": -0.30000000000000004, - "wer_method": "smart_mixed", - "mer_original": 0.43333333333333335 + "cer": 0.8, + "wer": 1.0, + "mer": 0.6428571428571429 }, { "segment_id": "SEGMENT_ZH-CN_U1098_ENCN_20250816_202117_260290", - "audio_path": "SEGMENT_ZH-CN_U1098_ENCN_20250816_202117_260290.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1098_ENCN_20250816_202117_260290.wav", "ground_truth": "嗯。 So nice to meet you. I hope that it can be a good journey to have a nice conversation with you.", - "original_transcription": ": So nice to meet you too I hope that it can be a good journey to have a nice conversation with you\n", "generated_transcription": ": So nice to meet too I hope that it can be a good journey to have a nice conversation with you\n", - "wer_original": 0.08695652173913043, - "wer_generated": 0.17391304347826086, - "wer_improvement": -0.08695652173913043, - "wer_method": "smart_mixed", - "mer_original": 0.21739130434782608 + "cer": 0.04054054054054054, + "wer": 0.09090909090909091, + "mer": 0.09090909090909091 }, { "segment_id": "SEGMENT_ZH-CN_U1001_MIX_20250816_202117_313251", - "audio_path": "SEGMENT_ZH-CN_U1001_MIX_20250816_202117_313251.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1001_MIX_20250816_202117_313251.wav", "ground_truth": "对,我也刚刚想问,其实有些手机它是会限制你下载 A P P 的, 我用的那个是 phone 就是国产手机华为。", - "original_transcription": ": 对我也刚刚想问其实有些手机它是会限制你下载APP的嗯我用的那个fone是华为是国产手机华为\n", "generated_transcription": ": 对我也刚刚想问其实有些手机它是会限制你下载APP的嗯我用的那个fone是华为是国产手机华为\n", - "wer_original": 1.0, - "wer_generated": 1.0, - "wer_improvement": 0.0, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.1590909090909091, + "wer": 1.0, + "mer": 0.175 }, { "segment_id": "SEGMENT_ZH-CN_U0012_MIX_20250816_202117_328870", - "audio_path": "SEGMENT_ZH-CN_U0012_MIX_20250816_202117_328870.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0012_MIX_20250816_202117_328870.wav", "ground_truth": "嗯,我之前丢耳机都是一个一整个,就是连那个充电仓一整个的丢,我还没就只丢过一个,但我之前只丢过一个的话,其实你可以根据它那个 A P P 里边有一个找耳机的功能,就是会把可以把它找回来。", - "original_transcription": ": 嗯我之前丢耳机都是一个一整个就是连那个充电仓一整个丢我还没就只丢过一个但我之前只丢过一个的话其实你可以根据它那个app (err) 里边有一个找耳机的功能就是可以把它找回来\n", "generated_transcription": "\n: 嗯我之前丢耳机都是一个一整个就是连那个充电的仓一整个丢我还没就只丢过一个但我之前只丢过一个的话其实你可以根据它那个app里面有一个找耳机的功能就是可以把把它找回来\n", - "wer_original": 1.0, - "wer_generated": 1.0, - "wer_improvement": 0.0, - "wer_method": "smart_mixed", - "mer_original": 0.7752808988764045 + "cer": 0.07317073170731707, + "wer": 1.0, + "mer": 0.10975609756097561 }, { "segment_id": "SEGMENT_ZH-CN_U0055_ENCN_20250816_202117_336587", - "audio_path": "SEGMENT_ZH-CN_U0055_ENCN_20250816_202117_336587.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0055_ENCN_20250816_202117_336587.wav", "ground_truth": "Yeah. 明白。 没有很享受。就是可能刚开始还比较低的时候,它还在低低地方摇摆的时候,我觉得还行。然后后来它摆得太高了,就是它从很高的地方,然后落下来的时候,我感觉我腿都在抖。然后后面就我刚开始我可能还尖叫一些,到后面我实在太害怕了,我叫不出声儿了,然后我就一直拔着那个那个杆儿,然后我也不说。然后我室友他们还在问我说问我怎么不说话 。 So though the final result is that I choose don't to exercise and just stay there.", - "original_transcription": ": yeah, : (hm), : 没有很享受就是可能刚开始还比较低的时候, 他还在低低的地方摇摆的时候我觉得还行然后后来他摆得太高了就是他从很高的地方然后落下来的时候我感觉我腿都在抖然后后面, 就我刚开始我可能还尖叫一下到后面我实在太害怕了我就叫不出声音了然后我就一直拔着那个那个杆儿然后我也不说话然后我室友他们还在问我说问我怎么不说话, So the the final result is that I choose not to exercise and just stay there.\n", "generated_transcription": ": yeah, : (hm), 明白 : 没有很享受就是可能刚开始还比较低的时候, 他还在低低的地方摇摆的时候我觉得还行然后后来他摆得太高了就是他从很高的地方然后落下来的时候我感觉我腿都在抖然后后面, 就我刚开始我可能还尖叫一下到后面我实在太害怕了我叫不出声了然后我就一直拔着那个那个杆儿然后我也不说话, 然后我室友他们还在问我说问我怎么不说话, So the the final result is that I choose don't to exercise and just stay there.\n", - "wer_original": 0.9044943820224719, - "wer_generated": 0.14606741573033707, - "wer_improvement": 0.7584269662921348, - "wer_method": "smart_mixed", - "mer_original": 0.15730337078651685 + "cer": 0.06046511627906977, + "wer": 0.3157894736842105, + "mer": 0.05521472392638037 }, { "segment_id": "SEGMENT_ZH-CN_U1054_MIX_20250816_202117_311017", - "audio_path": "SEGMENT_ZH-CN_U1054_MIX_20250816_202117_311017.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1054_MIX_20250816_202117_311017.wav", "ground_truth": "I like 李白 too,那还有我比较喜欢田园派的,do you know 田园派的,就像陶渊明这些,他写的归园田居啊归去来兮,我比较喜欢这些,那还有 比较壮志磅礴的,假如说。", - "original_transcription": ": 嗯, I like, 呃, 李白, two, 那还有, 我比较喜欢田园派的, you know, 田园派的, 就, 像, 陶渊明, 这些, 他写���归元田居, 呀, 归来西, 我比较喜欢这些, 那还有, 嗯, 比较壮志宏博的, 假如说\n", "generated_transcription": "\n: 嗯 I like 呃李白 two 那还有我比较喜欢田园派的 you know 田园派的就像陶渊明这些他写的归元田居啊归来西我比较喜欢这些那还有嗯比较壮志宏博的假如说\n", - "wer_original": 0.44642857142857145, - "wer_generated": 0.48214285714285715, - "wer_improvement": -0.0357142857142857, - "wer_method": "smart_mixed", - "mer_original": 0.75 + "cer": 0.14864864864864866, + "wer": 0.75, + "mer": 0.15873015873015872 }, { "segment_id": "SEGMENT_ZH-CN_U1022_ENCN_20250816_202117_289248", - "audio_path": "SEGMENT_ZH-CN_U1022_ENCN_20250816_202117_289248.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1022_ENCN_20250816_202117_289248.wav", "ground_truth": "就是。 Yes, I hear it a lot and do you have any hobbies?", - "original_transcription": ": I actually yes I hear it a lot and do you have any hobbies\n", "generated_transcription": "\n: Actually yes I hear it a lot and do you have any hobbies\n", - "wer_original": 1.1333333333333333, - "wer_generated": 0.3333333333333333, - "wer_improvement": 0.8, - "wer_method": "smart_mixed", - "mer_original": 0.3333333333333333 + "cer": 0.21052631578947367, + "wer": 0.07692307692307693, + "mer": 0.14285714285714285 }, { "segment_id": "SEGMENT_ZH-CN_U1074_MIX_20250816_202117_351778", - "audio_path": "SEGMENT_ZH-CN_U1074_MIX_20250816_202117_351778.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1074_MIX_20250816_202117_351778.wav", "ground_truth": "好的,那不如我们说一说我们吃 K F C 的时候,最喜欢点的一些东西呢。比如说我之前提到的,我之前提到的 French fries 呀,那些 hamburgers,还有那些你你在吃 French fries 的时候,你喜欢点那些 sauce 吗?比如说。", - "original_transcription": ": 好的那不如我们说说我们吃KFC的时候最喜欢点的一些东西呢比如说我之前提到的我之前提到的french fries呀那些hamburger还有那些你你在吃french fries的时候你喜欢点那些sauce吗比如说\n", "generated_transcription": "\n: 好的那不如我们说说我们吃KFC的时候最喜欢点的一些东西呢比如说我之前提到的我之前提到的french fries呀那些hamburger还有那些你你在吃french fries的时候你喜欢点那些sauce吗比如说\n", - "wer_original": 0.9864864864864865, - "wer_generated": 1.0, - "wer_improvement": -0.013513513513513487, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.01904761904761905, + "wer": 1.0, + "mer": 0.06756756756756757 }, { "segment_id": "SEGMENT_ZH-CN_U1093_ENCN_20250816_202117_279133", - "audio_path": "SEGMENT_ZH-CN_U1093_ENCN_20250816_202117_279133.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1093_ENCN_20250816_202117_279133.wav", "ground_truth": "Or your instincts are challenge them. You know, that is probably not going to work. It's like just relax or whatever. But I remember at the time, just desperately wanting to be ** by adults and like to bitter that I was being stupid. I don't know I'd probably just be like, yeah go for it. 就是很有情调的 。", - "original_transcription": ": All your instincts are telling them, you know, that is probably not going to work. (um), it's like just relax or whatever, but I remember at the time just desperately wanting to be validated by adults and not to be told that I was being stupid. I don't know. I'd probably just be like, Yeah, go for it.\n", "generated_transcription": ": All your instincts are telling them you know that is probably not going to work.\n", - "wer_original": 0.125, - "wer_generated": 0.84375, - "wer_improvement": -0.71875, - "wer_method": "smart_mixed", - "mer_original": 0.359375 + "cer": 0.7456140350877193, + "wer": 0.7678571428571429, + "mer": 0.7903225806451613 }, { "segment_id": "SEGMENT_ZH-CN_U0020_ENCN_20250816_202117_270864", - "audio_path": "SEGMENT_ZH-CN_U0020_ENCN_20250816_202117_270864.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0020_ENCN_20250816_202117_270864.wav", "ground_truth": "嗯,那你觉得是不是日常来讲,这样的运动还是比较需要的。,就是觉得一个人他可以不运动嘛,你会有这样的想法吗? I will have a lot of i ideas about my my dinner or my lunch or when I'm gonna do next steps, and I can't contract my ideas more carefylly to to get this insight.", - "original_transcription": ": 嗯,那你觉得是不是日常来讲这样的运动还是比较需要的,就是觉得一个人他可以不运动吗,你会有这样的想法吗?: 我会有很多关于我的晚餐或午餐的想法,或者我接下来要做什么,我会更仔细地思考这些想法,来获得更多灵感。\n", "generated_transcription": ": 嗯,那你觉得是不是日常来讲这样的运动还是比较需要的,就是觉得一个人他可以不运动吗,你会有这样的想法吗?\n", - "wer_original": 0.6022727272727273, - "wer_generated": 0.4318181818181818, - "wer_improvement": 0.17045454545454547, - "wer_method": "smart_mixed", - "mer_original": 0.6022727272727273 + "cer": 0.7294117647058823, + "wer": 1.0, + "mer": 0.43902439024390244 }, { "segment_id": "SEGMENT_ZH-CN_U1006_ENCN_20250816_202117_343691", - "audio_path": "SEGMENT_ZH-CN_U1006_ENCN_20250816_202117_343691.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1006_ENCN_20250816_202117_343691.wav", "ground_truth": "And if they run the world, they rule world, they will make the world more more fair. 那他们为什么还要住那儿呢?回巴勒斯坦不行吗?", - "original_transcription": ": And if they run the world they rule the world they will make the world more more fair 那他们为什么还要住那儿呢回巴勒斯坦不行吗\n", "generated_transcription": "\n: And if they run the world they rule the world they will make the world more more fair 那他们为什么还要住那儿呢回巴勒斯坦不行吗\n", - "wer_original": 0.5641025641025641, - "wer_generated": 0.15384615384615385, - "wer_improvement": 0.41025641025641024, - "wer_method": "smart_mixed", - "mer_original": 0.15384615384615385 + "cer": 0.03529411764705882, + "wer": 0.05555555555555555, + "mer": 0.02702702702702703 }, { "segment_id": "SEGMENT_ZH-CN_U1085_MIX_20250816_202117_277425", - "audio_path": "SEGMENT_ZH-CN_U1085_MIX_20250816_202117_277425.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1085_MIX_20250816_202117_277425.wav", "ground_truth": "我的 lifestyle 说实话 比较单一,但就是偶尔其实也有我感觉不太健康的,就是有时候我会 you know stayed up to 熬夜嘛,two two one or two o'clock 这样的,我会就是 晚上尤其是晚上的作息吧现在是有一点点的不规律,就是也没有还没有去调整它吧。", - "original_transcription": ": 我的lifestyle说实话比较单一但就是偶尔其实也有感觉不太健康的就是有时候我会, you know, stay up till, 熬夜, two to one or two o'clock, 这样的我会就是晚上尤其是晚上的作息吧现在是有一点点的不规律就是还没有还没有去调整它,\n", "generated_transcription": "\n我的lifestyle说实话比较单一但就是偶尔其实也有感觉不太健康的就是有时候我会you know stay up till 熬夜, two two one or two o'clock这样的我会就是晚上尤其是晚上的作息吧现在是有一点点的不规律就是还没有还没有去调整它\n", - "wer_original": 0.0, - "wer_generated": 0.9565217391304348, - "wer_improvement": -0.9565217391304348, - "wer_method": "smart_mixed", - "mer_original": 0.5217391304347826 + "cer": 0.07258064516129033, + "wer": 0.7058823529411765, + "mer": 0.06666666666666667 }, { "segment_id": "SEGMENT_ZH-CN_U0092_MIX_20250816_202117_314282", - "audio_path": "SEGMENT_ZH-CN_U0092_MIX_20250816_202117_314282.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0092_MIX_20250816_202117_314282.wav", "ground_truth": "但其实如果说这些英国企业,他们不了解中国的 standard,还有中国的特殊的,比如说 G M P, 就是 中国本土制定的这些很特殊的这些条例的话 但他们就等于是很多地方 。", - "original_transcription": ": 嗯但其实如果说这些英国企业他们不了解中国的standard还有中国的特殊的比如说GNP然后就是中国本土制定的这些很特殊的条例的话但他们就等于是很多地方\n", "generated_transcription": "\n: 嗯但其实如果说这些英国企业他们不了解中国的standard还有中国的特殊的比如说GNP然后就是中国本土制定的这些很特殊的这些条例的话但他们就等于是很多地方\n", - "wer_original": 0.9655172413793104, - "wer_generated": 1.0, - "wer_improvement": -0.03448275862068961, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.05405405405405406, + "wer": 1.0, + "mer": 0.05970149253731343 }, { "segment_id": "SEGMENT_ZH-CN_U0052_ENCN_20250816_202117_342206", - "audio_path": "SEGMENT_ZH-CN_U0052_ENCN_20250816_202117_342206.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0052_ENCN_20250816_202117_342206.wav", "ground_truth": "New types. 嗯。 那。 Yeah.", - "original_transcription": ": (hm), new types, (hm), ya.\n", "generated_transcription": "\n: (hm) new types ya.\n", - "wer_original": 0.5714285714285714, - "wer_generated": 1.0, - "wer_improvement": -0.4285714285714286, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.42857142857142855, + "wer": 0.8, + "mer": 0.8 }, { "segment_id": "SEGMENT_ZH-CN_U0057_ENCN_20250816_202117_268348", - "audio_path": "SEGMENT_ZH-CN_U0057_ENCN_20250816_202117_268348.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0057_ENCN_20250816_202117_268348.wav", "ground_truth": "The evilness. 反正我觉得像这个东西我也不是很知道 A I 它已经发生到发展到怎么样的程度了,然后我们自己发展的又没有它那么快,就会很觉得就很焦虑,就一直在被超越的那种。 哎呀,但是可能 。", - "original_transcription": ": the evilness 反正我觉得像这个东西我也不是很知道AI它已经发展到发展到怎么样的程度了然后我们自己发展得又没有它那么快就会觉得就很焦虑就一直在被超越的那种啊但是可能嗯\n", "generated_transcription": "\n: the evilness 反正我觉得像这个东西我也不是很知道 Ai 它已经发展到发展到怎么样的程度了然后我们自己发展得又没有它那么快就会很觉得就很焦虑就一直在被超越的那种啊但是可能\n", - "wer_original": 0.9642857142857143, - "wer_generated": 0.16666666666666666, - "wer_improvement": 0.7976190476190477, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.04597701149425287, + "wer": 0.5714285714285714, + "mer": 0.07692307692307693 }, { "segment_id": "SEGMENT_ZH-CN_U1006_ENCN_20250816_202117_343531", - "audio_path": "SEGMENT_ZH-CN_U1006_ENCN_20250816_202117_343531.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1006_ENCN_20250816_202117_343531.wav", "ground_truth": "我会首选美国,因为我语言上我只懂英语,然后其次就是它的薪资比较高,然后物价又比较低,就挺适合生活的。而且我也比较喜欢美国的文化,但是只有一个缺点就是可能治安比较差,尤其是像我如果去的话只能住那种贫民窟,那种地方可能也就比较危险。 Okay, Manson Zhang, how old are you?", - "original_transcription": ": 嗯,美国,因为我语言上我只懂英语,其次就是他的薪资比较高,然后物价又比较低,就挺适合生活的,而且我也比较喜欢美国的文化,但是只有一个缺点就是可能治安比较差,尤其是像我如果去的话只能住那种平民窟,那种地方可能也就比较危险。\n", "generated_transcription": ": 嗯,美国,因为我语言上我只懂英语,其次就是他的薪资比较高,然后物价又比较低,就挺适合生活的,而且我也比较喜欢美国的文化,但是只有一个缺点就是可能治安比较差,尤其是像我如果去的话只能住那种平民窟,那种地方可能也就比较危险\n", - "wer_original": 0.9421487603305785, - "wer_generated": 0.14049586776859505, - "wer_improvement": 0.8016528925619835, - "wer_method": "smart_mixed", - "mer_original": 0.1322314049586777 + "cer": 0.26515151515151514, + "wer": 1.0, + "mer": 0.13392857142857142 }, { "segment_id": "SEGMENT_ZH-CN_U1104_ENCN_20250816_202117_311620", - "audio_path": "SEGMENT_ZH-CN_U1104_ENCN_20250816_202117_311620.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1104_ENCN_20250816_202117_311620.wav", "ground_truth": "Yeah. 看吧,因为她身体不太好,现在可能先是把身体调理好再说, 那我想问一下哈,你那个两个小朋友,那你跟你的呃应该是老婆了,因为有孩子不可能是没结婚,她是怎怎么样认识,或者说因为刚刚一直在说小朋友的事嘛,没有说到妈妈的事,我想听听你跟妈妈的故事。", - "original_transcription": ": yeah (um) 看吧因为他身体不太好现在可能先是把身体调理好再说诶那我想问一下哈你那个两个小朋友那你跟你的呃应该是老婆了因为有孩子不可能没结婚了他是怎怎么样认识或者说因为刚刚一直在说小朋友的事嘛没有说到妈妈的事我想听听你跟妈妈的故事\n", "generated_transcription": "\n: yeah : (um) 看吧因为她身体不太好现在可能先是把身体调理好再说诶那我想问一下哈你那个两个小朋友那你跟你的呃应该是老婆了因为有孩子不可能没结婚他是怎怎么样认识或者说因为刚刚一直在说小朋友的事嘛没有说到妈妈的事我想听听你跟妈妈的故事\n", - "wer_original": 0.01680672268907563, - "wer_generated": 0.12605042016806722, - "wer_improvement": -0.1092436974789916, - "wer_method": "smart_mixed", - "mer_original": 0.13445378151260504 + "cer": 0.04504504504504504, + "wer": 0.6666666666666666, + "mer": 0.037037037037037035 }, { "segment_id": "SEGMENT_ZH-CN_U1097_ENCN_20250816_202117_328208", - "audio_path": "SEGMENT_ZH-CN_U1097_ENCN_20250816_202117_328208.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1097_ENCN_20250816_202117_328208.wav", "ground_truth": "Yeah, you are teacher now right? 嗯,好。", - "original_transcription": ": Yeah, you are a teacher now. (uh), right. : (hm), 好.\n", "generated_transcription": "\n: Yeah, you are a teacher now. (uh), right. : (hm), 好\n", - "wer_original": 0.4, - "wer_generated": 0.6, - "wer_improvement": -0.19999999999999996, - "wer_method": "smart_mixed", - "mer_original": 0.6 + "cer": 0.18518518518518517, + "wer": 0.5714285714285714, + "mer": 0.375 }, { "segment_id": "SEGMENT_ZH-CN_U0062_MIX_20250816_202117_318213", - "audio_path": "SEGMENT_ZH-CN_U0062_MIX_20250816_202117_318213.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0062_MIX_20250816_202117_318213.wav", "ground_truth": "像那个 the national museum and 非常非常的难难讲。", - "original_transcription": ": 上那个, (err), the national museum and (err) 非常非常地难难讲,\n", "generated_transcription": "\n: 上那个, (err), the national museum and (err) 非常非常地难难讲\n", - "wer_original": 0.4375, - "wer_generated": 0.375, - "wer_improvement": 0.0625, - "wer_method": "smart_mixed", - "mer_original": 0.375 + "cer": 0.25806451612903225, + "wer": 0.6666666666666666, + "mer": 0.26666666666666666 }, { "segment_id": "SEGMENT_ZH-CN_U0033_MIX_20250816_202117_340352", - "audio_path": "SEGMENT_ZH-CN_U0033_MIX_20250816_202117_340352.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0033_MIX_20250816_202117_340352.wav", "ground_truth": "restaurant welcome to us, for example 海底捞 ,他原来接受我们的这个研究生学生证,所以我还是很喜欢去海底捞 。", - "original_transcription": ": (err) restaurant, welcome to us, for example 海底捞, 他原来接受我们的这个研究生学生证, 所以我还很喜欢去海底捞\n", - "generated_transcription": ": (err) restaurant welcome to us, for example 海底捞 他原来接受我们的这个研究生学生证, 所以我还很喜欢去海底捞\n", - "wer_original": 0.875, - "wer_generated": 0.125, - "wer_improvement": 0.75, - "wer_method": "smart_mixed", - "mer_original": 0.15 + "generated_transcription": "\n: (err) restaurant welcome to us, for example 海底捞 他原来接受我们的这个研究生学生证, 所以我还很喜欢去海底捞\n", + "cer": 0.06451612903225806, + "wer": 0.375, + "mer": 0.05405405405405406 }, { "segment_id": "SEGMENT_ZH-CN_U0081_MIX_20250816_202117_328247", - "audio_path": "SEGMENT_ZH-CN_U0081_MIX_20250816_202117_328247.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0081_MIX_20250816_202117_328247.wav", "ground_truth": "Hello 你好。 But but but I think some people believe 内脏 is gross, but I think it's delicious. I like , yeah yeah yeah, I like it very much. I think it's wonderful.", - "original_transcription": ": hello ni hao (uh) but but but I think (uh) some people believe (uh) naan is gross but I think it's delicious I like ya ya ya I like it very much I think it's (uh) wonderful\n", - "generated_transcription": ": hello 你好 but but but I think (um) (uh) some people believe (uh) naan is gross but I think it's delicious I like ya ya ya I like it very much I think it's (uh) wonderful\n", - "wer_original": 1.5, - "wer_generated": 0.4444444444444444, - "wer_improvement": 1.0555555555555556, - "wer_method": "smart_mixed", - "mer_original": 0.4722222222222222 + "generated_transcription": "\n: hello 你好 but but but I think (um) (uh) some people believe (uh) naan is gross but I think it's delicious I like ya ya ya I like it very much I think it's (uh) wonderful\n", + "cer": 0.15126050420168066, + "wer": 0.25, + "mer": 0.23529411764705882 }, { "segment_id": "SEGMENT_ZH-CN_U0039_ENCN_20250816_202117_351423", - "audio_path": "SEGMENT_ZH-CN_U0039_ENCN_20250816_202117_351423.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0039_ENCN_20250816_202117_351423.wav", "ground_truth": "哦。 Yeah, I. I'm like the Beijing Number Four, High school. 因为我感觉韩国他们可能一般去也就是购物,感觉它是一个非常适合就是比如说有一些女性去购物。", - "original_transcription": ": (oh) yeah, I'm like in the Beijing number four high school, 因为感觉韩国他们一般去就是购物感觉它是一个非常适合就是比如说有一些女性去购物\n", - "generated_transcription": ": (oh) yeah I I'm like in the Beijing number four high school 因为感觉韩国他们可能一般去就是购物感觉它是一个非常适合就是比如说有一些女性去购物\n", - "wer_original": 0.2857142857142857, - "wer_generated": 0.23214285714285715, - "wer_improvement": 0.05357142857142855, - "wer_method": "smart_mixed", - "mer_original": 0.26785714285714285 + "generated_transcription": "\n: (oh) yeah, I, I'm like in the Beijing number four high school, 因为感觉韩国他们可能一般去就是购物感觉它是一个非常适合就是比如说有一些女性去购物\n", + "cer": 0.07142857142857142, + "wer": 0.25, + "mer": 0.07547169811320754 }, { "segment_id": "SEGMENT_ZH-CN_U0034_MIX_20250816_202117_335780", - "audio_path": "SEGMENT_ZH-CN_U0034_MIX_20250816_202117_335780.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0034_MIX_20250816_202117_335780.wav", "ground_truth": "那你比较喜欢,我还有一种选项,就是 一边是比较 warm and politely friendly,但是他 可能还会想跟你发展一些工作之外的交流。然后另外一种就是他可能平时 看起来比较 grim,比较严肃,比较 serious,但是他可能工作之外,他不会跟你有过多的交集 connection,你觉得你喜欢哪种?", - "original_transcription": "(um) 那你比较喜欢我还有一种选择就是一边是比较, warm and polite and friendly 但是他, (err) 可能还会想跟你发展一些工作之外的交流然后另外一种就是他可能平时, (err) 看起来比较, grim, 比较严肃比较, serious, 但是他可能工作之外他不会跟你有过多的, Jiao, connection, 你觉得你喜欢哪一种\n", - "generated_transcription": "(um) 那你比较喜欢我还有一种选择就是 (err) 一边是比较 warm and polite friendly 但是他 (err) 可能还会想跟你发展一些工作之外的交流然后另外一种就是他可能平时 (err) 看起来比较 grim 比较 serious 比较 serious 但是他可能工作之外他不会跟你有过多的 交际 connection 你觉得你喜欢哪一种\n", - "wer_original": 0.5757575757575758, - "wer_generated": 0.7575757575757576, - "wer_improvement": -0.18181818181818177, - "wer_method": "smart_mixed", - "mer_original": 0.8484848484848485 + "generated_transcription": "\n嗯那你比较喜欢我还有一种选择就是一边是比较, warm and polite and friendly 但是他, (err) 可能还会想跟你发展一些工作之外的交流然后另外一种就是他可能平时, (err) 看起来比较, grim, 比较严肃比较, serious 但是他可能工作之外他不会跟你有过多的, Jiao jiu, connection, 你觉得你喜欢哪一种\n", + "cer": 0.1386861313868613, + "wer": 1.4545454545454546, + "mer": 0.09 }, { "segment_id": "SEGMENT_ZH-CN_U0094_ENCN_20250816_202117_272451", - "audio_path": "SEGMENT_ZH-CN_U0094_ENCN_20250816_202117_272451.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0094_ENCN_20250816_202117_272451.wav", "ground_truth": "I I finish one round. 对。 英式的话是不用穿衣服,就不用穿那种盔甲,但是美式的话就要穿盔甲,所以就是这样。 Yeah.", - "original_transcription": ": Okay, I finish one round. : 阴师的话是不用穿衣服就不用穿那种盔甲但是每次的话就又穿盔甲所以就是这样, : Ya.\n", - "generated_transcription": ": Okay, I finish one round. : 阴师的话是不用穿衣服就, (err), 不用穿那种盔甲但是每次的话就穿盔甲所以, 这这样, : Ya.\n", - "wer_original": 1.0, - "wer_generated": 0.3617021276595745, - "wer_improvement": 0.6382978723404256, - "wer_method": "smart_mixed", - "mer_original": 0.2765957446808511 + "generated_transcription": "\n: Okay, I finish one round. : 阴师的话是不用穿衣服就, (err), 不用穿那种盔甲但是每次的话就穿盔甲所以, 这这样, : Ya.\n", + "cer": 0.30357142857142855, + "wer": 0.75, + "mer": 0.2619047619047619 }, { "segment_id": "SEGMENT_ZH-CN_U1090_ENCN_20250816_202117_291895", - "audio_path": "SEGMENT_ZH-CN_U1090_ENCN_20250816_202117_291895.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1090_ENCN_20250816_202117_291895.wav", "ground_truth": "嗯,好的. Yeah. In the love story, Romeo and Julia. They they both love each other. 我觉得我觉得像像他这样的这样就是可以说是伟大的一个人。", - "original_transcription": ": 嗯好的 yeah in the love story (um) (um) Romeo and Juliet (um) (um), they they both love each other. : 我觉得我觉得像像他这样的这样就是可以说是伟大的一个人\n", - "generated_transcription": ": 嗯好的 yeah in the love story (um) (um) Romeo and Juliet (um) (um) they they both love each other 我觉得 我觉得 像像他这样的这样就是可以说是伟大的一个人\n", - "wer_original": 0.13043478260869565, - "wer_generated": 0.2826086956521739, - "wer_improvement": -0.15217391304347824, - "wer_method": "smart_mixed", - "mer_original": 0.2608695652173913 + "generated_transcription": "\n: 嗯好的 yeah in the love story (um) (um) Romeo and Juliet (um) (um) they they both love each other 我觉得 我觉得 像像他这样的这样就是可以说是伟大的一个人\n", + "cer": 0.11764705882352941, + "wer": 0.5, + "mer": 0.11627906976744186 }, { "segment_id": "SEGMENT_ZH-CN_U0053_MIX_20250816_202117_325934", - "audio_path": "SEGMENT_ZH-CN_U0053_MIX_20250816_202117_325934.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0053_MIX_20250816_202117_325934.wav", "ground_truth": "对,就就就说的是那个,wow。 就是我真正实际在打的那个我刚才说了,足球可能是喜欢,但是可能实际玩的场景不多。然后但是乒乓的话 better, and。", - "original_transcription": ": 对就就就说的是那个哇就是我真正实际在打的那个我刚刚说了足球可能是喜欢但是可能实际玩儿的场景不多然后但是乒乓的话嗯better and the\n", - "generated_transcription": ": 对就就就说的是那个哇就是我真正实际在打的那个我刚刚说了足球可能是喜欢但是可能实际玩儿的场景不多然后但是乒乓的话 better and\n", - "wer_original": 1.0, - "wer_generated": 0.34, - "wer_improvement": 0.6599999999999999, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "generated_transcription": "\n: 对就就就说的是那个哇就是我真正实际在打的那个我刚刚说了足球可能是喜欢但是可能实际玩儿的场景不多然后但是乒乓的话 better and\n", + "cer": 0.07692307692307693, + "wer": 0.5, + "mer": 0.05357142857142857 }, { "segment_id": "SEGMENT_ZH-CN_U0023_ENCN_20250816_202117_263729", - "audio_path": "SEGMENT_ZH-CN_U0023_ENCN_20250816_202117_263729.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0023_ENCN_20250816_202117_263729.wav", "ground_truth": "大概是哪些地方能讲讲,尤其是像君主制之类的。 because you clearly don't know and everybody knows that. So it's going to be super awkward if you try to like act call and feel like, you know, all the answer to the questions.", - "original_transcription": ": 大概是哪些地方能讲讲尤其是像君主制之类的因为你显然不知道而且每个人都知道所以如果你是想像很酷一样然后觉得你知道所有问题的答案的话会很尴尬\n", - "generated_transcription": ": 大概是哪些地方能讲讲尤其是像君主制之类的 because you clearly don't know and everybody knows that so it's gonna be super awkward if you try to like act cool and feel like you know all the answer to the questions\n", - "wer_original": 0.0, - "wer_generated": 0.17857142857142858, - "wer_improvement": -0.17857142857142858, - "wer_method": "smart_mixed", - "mer_original": 0.875 + "generated_transcription": "\n: 大概是哪些地方能讲讲尤其是像君主制之类的 because you clearly don't know and everybody knows that so it's gonna be super awkward if you try to like act cool and feel like you know all the answer to the questions\n", + "cer": 0.03821656050955414, + "wer": 0.08571428571428572, + "mer": 0.05555555555555555 }, { "segment_id": "SEGMENT_ZH-CN_U0012_MIX_20250816_202117_328864", - "audio_path": "SEGMENT_ZH-CN_U0012_MIX_20250816_202117_328864.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0012_MIX_20250816_202117_328864.wav", "ground_truth": "我其实不太了解,因为我平时也不怎么研究这些,我只是说知道,比如说像知道华为,苹果,小米, vivo 这些牌子,但是不太了解。", - "original_transcription": ": 我其实不太了解因为我平时也不怎么研究这些我只是知道比如说像知道华为苹果小米维维这些牌子但是不太了解\n", "generated_transcription": "\n: 我其实不太了解因为我平时也不怎么研究这些我只是说知道比如说像知道华为苹果小米V这些牌子但是不太了解\n", - "wer_original": 0.0, - "wer_generated": 1.0, - "wer_improvement": -1.0, - "wer_method": "smart_mixed", - "mer_original": 0.17543859649122806 + "cer": 0.057692307692307696, + "wer": 1.0, + "mer": 0.02040816326530612 }, { "segment_id": "SEGMENT_ZH-CN_U1032_MIX_20250816_202117_350350", - "audio_path": "SEGMENT_ZH-CN_U1032_MIX_20250816_202117_350350.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1032_MIX_20250816_202117_350350.wav", "ground_truth": "您说 your mother 有有一些个就是,has some some illnesses or or some some weaknesses。", - "original_transcription": ": 你说your mother has some some illnesses or some some weaknesses\n", "generated_transcription": "\n: 你说your mother有有一些就是has some some illnesses or or some some weaknesses\n", - "wer_original": 1.6153846153846154, - "wer_generated": 0.46153846153846156, - "wer_improvement": 1.1538461538461537, - "wer_method": "smart_mixed", - "mer_original": 0.46153846153846156 + "cer": 0.03278688524590164, + "wer": 0.3333333333333333, + "mer": 0.1 }, { "segment_id": "SEGMENT_ZH-CN_U0048_ENCN_20250816_202117_346236", - "audio_path": "SEGMENT_ZH-CN_U0048_ENCN_20250816_202117_346236.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0048_ENCN_20250816_202117_346236.wav", "ground_truth": "但是非常不幸的是,就是当我上大学的那年,因为当时不是有疫情的关系嘛,然后大家都没有办法 自由的进出学校。然后当时泼水节就等于是就是从那年开始就取消了,所以我其实是没有参加过的。 Wow, that's good ones. I actually I only watched Sherlock because sometimes the T V program make me feel I'm so stupid.", - "original_transcription": ": but it's very unfortunate that when I went to university, because it was the year of the swine flu, so everyone was afraid of getting sick, so everyone was afraid to go to school. So I couldn't go to school. So that year, I didn't go to school at all.\n", "generated_transcription": ": but very unfortunate is that when I went to university that year, because then there's no typhoon, so everyone couldn't go to school, so I couldn't go to school.\n", - "wer_original": 0.44036697247706424, - "wer_generated": 0.9908256880733946, - "wer_improvement": -0.5504587155963303, - "wer_method": "smart_mixed", - "mer_original": 0.9908256880733946 + "cer": 0.8850574712643678, + "wer": 1.0833333333333333, + "mer": 0.9902912621359223 }, { "segment_id": "SEGMENT_ZH-CN_U2005_ENCN_20250816_202117_352806", - "audio_path": "SEGMENT_ZH-CN_U2005_ENCN_20250816_202117_352806.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U2005_ENCN_20250816_202117_352806.wav", "ground_truth": "Yes, of course, yes. 对。 有啊。 So that is ** . So with the world becoming more data driven, you will be in high demand. You know, have you ever consider doing an internship in that field?", - "original_transcription": ": Yes, of course. Yes. 对, 你啊, So that is fantastic. So with the world becoming more data driven, you will be in high demand. You know, have you ever considered doing an internship in that field?\n", "generated_transcription": "\n: Yes, of course. Yes. 对. You are. So that is fantastic. So with the world becoming more data driven, you will be in high demand. You know. Have you ever considered doing an internship in that field\n", - "wer_original": 1.5, - "wer_generated": 0.3, - "wer_improvement": 1.2, - "wer_method": "smart_mixed", - "mer_original": 0.2 + "cer": 0.125, + "wer": 0.11428571428571428, + "mer": 0.1111111111111111 }, { "segment_id": "SEGMENT_ZH-CN_U1037_ENCN_20250816_202117_310302", - "audio_path": "SEGMENT_ZH-CN_U1037_ENCN_20250816_202117_310302.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1037_ENCN_20250816_202117_310302.wav", "ground_truth": "Okay, that's really good. 可以呀,咱们可以一起跑步,边跑步边听歌。我觉得还挺好的。", - "original_transcription": ": OK 那是really good 可以啊咱们可以一起跑步边跑步边听歌我觉得还挺好的\n", "generated_transcription": "\n: OK 那是really good 可以啊咱们可以一起跑步边跑步边听歌我觉得还挺好的\n", - "wer_original": 0.28125, - "wer_generated": 0.28125, - "wer_improvement": 0.0, - "wer_method": "smart_mixed", - "mer_original": 0.28125 + "cer": 0.18604651162790697, + "wer": 0.8, + "mer": 0.14285714285714285 }, { "segment_id": "SEGMENT_ZH-CN_U0040_ENCN_20250816_202117_283820", - "audio_path": "SEGMENT_ZH-CN_U0040_ENCN_20250816_202117_283820.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0040_ENCN_20250816_202117_283820.wav", "ground_truth": "Why? 你你。 Yes. One of the . 啊。", - "original_transcription": ": Why you you? Yes, one of the (um), (ah), (ah).\n", - "generated_transcription": ": Why 你你 Yes one of the (um) (ah) (ah).\n", - "wer_original": 0.45454545454545453, - "wer_generated": 0.6363636363636364, - "wer_improvement": -0.18181818181818182, - "wer_method": "smart_mixed", - "mer_original": 0.8181818181818182 + "generated_transcription": "\n: Why 你你 Yes one of the (um) (ah) (ah).\n", + "cer": 0.35294117647058826, + "wer": 0.42857142857142855, + "mer": 0.375 }, { "segment_id": "SEGMENT_ZH-CN_U0064_ENCN_20250816_202117_335446", - "audio_path": "SEGMENT_ZH-CN_U0064_ENCN_20250816_202117_335446.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0064_ENCN_20250816_202117_335446.wav", "ground_truth": "因为我其实是特别喜欢游泳,我之前特别喜欢看的动漫里边是是游泳的,所以我自己从小也是游泳比较多。 Well, sometimes I just hang out with my friends, and we just stay together because we most of us like stay at home. So yeah, but in school.", - "original_transcription": ": 因为我其实是特别喜欢游泳我之前特别喜欢看的动漫里边是是游泳的所以我自己从小也是游泳比较多, Well, sometimes I just hang out with my friends and we just stay together because we most of us like stay at home, so yeah, but in school.\n", - "generated_transcription": ": 因为我其实是特别喜欢游泳我之前特别喜欢看的动漫里边是是游泳的所以我自己从小也是游泳比较多 Well, sometimes I just hang out with my friends and we just stay together because we most of us like stay at home, so yeah, but in school\n", - "wer_original": 0.72, - "wer_generated": 0.09333333333333334, - "wer_improvement": 0.6266666666666666, - "wer_method": "smart_mixed", - "mer_original": 0.08 + "generated_transcription": "\n因为我其实是特别喜欢游泳我之前特别喜欢看的动漫里边是是游泳的所以我自己从小也是游泳比较多 Well sometimes I just hang out with my friends and we just stay together because we most of us like stay at home So yeah but in school\n", + "cer": 0.0, + "wer": 0.0, + "mer": 0.0 }, { "segment_id": "SEGMENT_ZH-CN_U0023_ENCN_20250816_202117_263680", - "audio_path": "SEGMENT_ZH-CN_U0023_ENCN_20250816_202117_263680.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0023_ENCN_20250816_202117_263680.wav", "ground_truth": "对,对。 Yeah. That's not for me. Yeah yeah yeah. 还有怎么安排自己的工作?", - "original_transcription": ": 对对 yeah that's enough for me yeah yeah yeah yeah 好怎么安排自己的工作\n", - "generated_transcription": ": 对对 yeah that's enough for me yeah yeah yeah 好怎么安排自己的工作\n", - "wer_original": 0.875, - "wer_generated": 0.4583333333333333, - "wer_improvement": 0.4166666666666667, - "wer_method": "smart_mixed", - "mer_original": 0.4583333333333333 + "generated_transcription": "\n: 对对 yeah that's enough for me yeah yeah yeah 好怎么安排自己的工作\n", + "cer": 0.14285714285714285, + "wer": 0.2, + "mer": 0.14285714285714285 }, { "segment_id": "SEGMENT_ZH-CN_U1046_ENCN_20250816_202117_276457", - "audio_path": "SEGMENT_ZH-CN_U1046_ENCN_20250816_202117_276457.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1046_ENCN_20250816_202117_276457.wav", "ground_truth": "So. 嗯,是的。 I just like to talk to them if we can make each other happy, so we can be friends. 是,那你们当时是要过专业英语等级是吧 。", - "original_transcription": ": So I just like to talk to them if we can make each other happy so we can be friends. : 那你们当时是要过专业英语等级是吧 : 嗯\n", - "generated_transcription": ": So I just like to talk to them if we can make each other happy so we can be friends : 那你们当时是要过专业英语等级是吧\n", - "wer_original": 0.06818181818181818, - "wer_generated": 0.25, - "wer_improvement": -0.18181818181818182, - "wer_method": "smart_mixed", - "mer_original": 0.22727272727272727 + "generated_transcription": "\n: So 嗯是的 I just like to talk to them if we can make each other happy so we can be friends 那你们当时是要过专业英语等级是吧\n", + "cer": 0.011904761904761904, + "wer": 0.045454545454545456, + "mer": 0.025 }, { "segment_id": "SEGMENT_ZH-CN_U1057_MIX_20250816_202117_325409", - "audio_path": "SEGMENT_ZH-CN_U1057_MIX_20250816_202117_325409.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1057_MIX_20250816_202117_325409.wav", "ground_truth": "Sorry, 我不知道,我没有听过笑面虎这个形容词。 然后其他课也要结业嘛,也要做 presentation 嘛,也要写 paper 嘛。然后就是感觉我这学期还没有学到什么东西,然后就要期末考试了,感觉脑袋空空就要考试了,啊,又要奋战了。", - "original_transcription": ": sorry 我不知道 我没有听过像面虎这个形容词 然后其他课也要结业嘛 也要做presentation嘛 也要写paper嘛 然后就是感觉我这些期还没有学到什么东西 然后就要临近考试了感觉脑袋空空就要考试了啊又要奋战了\n", - "generated_transcription": ": sorry 我不知道 我没有听过像面虎这个形容词 然后其他课也要结业嘛 也要做presentation嘛 也要写paper嘛 然后就是感觉我这些期还没有学到什么东西 然后就要临期考试了感觉脑袋空空就要考试了啊又要奋战了\n", - "wer_original": 0.03225806451612903, - "wer_generated": 0.26881720430107525, - "wer_improvement": -0.23655913978494622, - "wer_method": "smart_mixed", - "mer_original": 0.26881720430107525 + "generated_transcription": "\n: sorry 我不知道 我没有听过像面虎这个形容词 然后其他课也要结业嘛 也要做presentation嘛 也要写paper嘛 然后就是感觉我这些期还没有学到什么东西 然后就要临期考试了感觉脑袋空空就要考试了啊又要奋战了\n", + "cer": 0.0392156862745098, + "wer": 1.0, + "mer": 0.04819277108433735 }, { "segment_id": "SEGMENT_ZH-CN_U2002_ENCN_20250816_202117_273742", - "audio_path": "SEGMENT_ZH-CN_U2002_ENCN_20250816_202117_273742.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U2002_ENCN_20250816_202117_273742.wav", "ground_truth": "我家一共有五口人,我爸爸,我妈妈,我,我弟弟还有我妹妹。 Actually it doesn't have a real name real name. We always.", - "original_transcription": ": 我家一共有五口人我爸爸我妈妈我我弟弟还有我妹妹, : (um), actually, it doesn't have a real name, real name we, (uh), offer.\n", - "generated_transcription": ": 我家一个有五口人我爸爸我妈妈我我弟弟还有我妹妹 : (um), actually it doesn't have a real name real name we, oppos\n", - "wer_original": 0.6153846153846154, - "wer_generated": 0.2564102564102564, - "wer_improvement": 0.35897435897435903, - "wer_method": "smart_mixed", - "mer_original": 0.3076923076923077 + "generated_transcription": "\n: 我家一个有五口人我爸爸我妈妈我我弟弟还有我妹妹, : (um), actually it doesn't have a real name, real name we, : (hm),\n", + "cer": 0.1323529411764706, + "wer": 0.25, + "mer": 0.08823529411764706 }, { "segment_id": "SEGMENT_ZH-CN_U0011_ENCN_20250816_202117_322625", - "audio_path": "SEGMENT_ZH-CN_U0011_ENCN_20250816_202117_322625.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0011_ENCN_20250816_202117_322625.wav", "ground_truth": "但如果说经常晚归的话,会惩罚吗? It's also hard for me to understand what they say So I strongly you can recommend them to write me an email instead of making the conference call.", - "original_transcription": ": 那如果是经常, (err), 枉归的话会有惩罚吗对不对, : It's also hard for me to understand what they're saying, so I strongly recommend them to write me an email instead of making the conference call.\n", - "generated_transcription": ": 那如果是经常, 枉归的话会有惩罚吗对不对, : It's also hard for me to understand what they're saying, so I strongly rec~ recommend them to write me an email instead of making the conference call.\n", - "wer_original": 0.45454545454545453, - "wer_generated": 0.3409090909090909, - "wer_improvement": 0.11363636363636365, - "wer_method": "smart_mixed", - "mer_original": 0.36363636363636365 + "generated_transcription": "\n: 那如果是经常, 枉归的话会有惩罚吗对不对, : It's also hard for me to understand what they're saying, so I strongly rec~ recommend them to write me an email instead of making the conference call.\n", + "cer": 0.1297709923664122, + "wer": 0.20689655172413793, + "mer": 0.2619047619047619 }, { "segment_id": "SEGMENT_ZH-CN_U1035_MIX_20250816_202117_347401", - "audio_path": "SEGMENT_ZH-CN_U1035_MIX_20250816_202117_347401.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1035_MIX_20250816_202117_347401.wav", "ground_truth": "一些歌曲,你有什么特别喜欢的 K-pop 团队吗? My favorite is 西红柿炒鸡蛋。 It's very delicious. What about you?", - "original_transcription": ": 一些歌曲你有什么特别喜欢的K-pop团队吗\n: My favorites is 柠檬汁炒鸡蛋嗯它very delicious what about you\n", - "generated_transcription": ": 一些歌曲你有什么特别喜欢的K-pop团队吗\n: My favorites is 西红柿炒鸡蛋嗯 it's very delicious what about you\n", - "wer_original": 0.42857142857142855, - "wer_generated": 0.7142857142857143, - "wer_improvement": -0.28571428571428575, - "wer_method": "smart_mixed", - "mer_original": 0.9142857142857143 + "generated_transcription": "\n: 一些歌曲你有什么特别喜欢的K-pop团队吗\n: My favorites is 西红柿炒鸡蛋嗯 it's very delicious what about you\n", + "cer": 0.030303030303030304, + "wer": 0.38461538461538464, + "mer": 0.0625 }, { "segment_id": "SEGMENT_ZH-CN_U0084_ENCN_20250816_202117_304657", - "audio_path": "SEGMENT_ZH-CN_U0084_ENCN_20250816_202117_304657.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0084_ENCN_20250816_202117_304657.wav", "ground_truth": "It provide digital received and transaction histories, making it very easier for me to track my spending and manage my finances. 因为。", - "original_transcription": ": it provide digital receipt and transaction histories making it very easy for me to track my spending and manage my finances. : (uh), you.\n", "generated_transcription": "\n: it provide digital receipt and transaction histories making it very easy for me to track my spending and managing my finances.\n", - "wer_original": 0.08333333333333333, - "wer_generated": 0.3333333333333333, - "wer_improvement": -0.25, - "wer_method": "smart_mixed", - "mer_original": 0.2916666666666667 + "cer": 0.10185185185185185, + "wer": 0.18181818181818182, + "mer": 0.21739130434782608 }, { "segment_id": "SEGMENT_ZH-CN_U2003_ENCN_20250816_202117_262878", - "audio_path": "SEGMENT_ZH-CN_U2003_ENCN_20250816_202117_262878.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U2003_ENCN_20250816_202117_262878.wav", "ground_truth": "你不爱喝咖啡吗? Okay, it maybe it sounds not like a good act good. It sounds not like a good movie, but but I like it very much.", - "original_transcription": ": 你不爱喝咖啡吗, Okay, it may be it sounds not like a good act, good luck. It sounds not like a good movie, but (hm), but I like it very much.\n", "generated_transcription": "\n: 你不爱喝咖啡吗 Okay, it maybe it sounds not like a good act, good luck. It sounds not like a good movie, but (hm), but I like it very much\n", - "wer_original": 0.24242424242424243, - "wer_generated": 0.18181818181818182, - "wer_improvement": 0.06060606060606061, - "wer_method": "smart_mixed", - "mer_original": 0.21212121212121213 + "cer": 0.06593406593406594, + "wer": 0.07692307692307693, + "mer": 0.0625 }, { "segment_id": "SEGMENT_ZH-CN_U0005_MIX_20250816_202117_348053", - "audio_path": "SEGMENT_ZH-CN_U0005_MIX_20250816_202117_348053.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0005_MIX_20250816_202117_348053.wav", "ground_truth": "那你平时会去 哦就是滑雪啊,snow snowboarding 这些在冬天的时候。", - "original_transcription": ": 那你平时会去滑雪啊snowboarding这些在冬天的时候\n", "generated_transcription": "\n: 那你平时会去滑雪啊 snowboarding这些在冬天的时候\n", - "wer_original": 0.11764705882352941, - "wer_generated": 0.6470588235294118, - "wer_improvement": -0.5294117647058824, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.19444444444444445, + "wer": 1.0, + "mer": 0.18181818181818182 }, { "segment_id": "SEGMENT_ZH-CN_U1075_ENCN_20250816_202117_332477", - "audio_path": "SEGMENT_ZH-CN_U1075_ENCN_20250816_202117_332477.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1075_ENCN_20250816_202117_332477.wav", "ground_truth": "嗯。 Do you like to be a teacher? Do you like to be a journalist? 你日常外国人交流的,你看你交和外国人交流啊,这些 其实都是用的外国人用的非常简单的日常口语,简单的东西。", - "original_transcription": ": 嗯 do you like to be a teacher do you like to be journalist 你日常外国人交流的你看你交和外国人交流啊这些其实都是用的外国人用的非常简单的日常口语简单的东西\n", "generated_transcription": "\n: 嗯 do you like to be a teacher do you like to be a journalist 你日常外国人交流的你看你交和外国人交流啊这些其实都是用的外国人用的非常简单的日常口语简单的东西\n", - "wer_original": 0.2537313432835821, - "wer_generated": 0.13432835820895522, - "wer_improvement": 0.11940298507462688, - "wer_method": "smart_mixed", - "mer_original": 0.14925373134328357 + "cer": 0.0, + "wer": 0.11764705882352941, + "mer": 0.0 }, { "segment_id": "SEGMENT_ZH-CN_U1039_ENCN_20250816_202117_292558", - "audio_path": "SEGMENT_ZH-CN_U1039_ENCN_20250816_202117_292558.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1039_ENCN_20250816_202117_292558.wav", "ground_truth": "这好像又又有点容易混,我俩。 Beautiful, what's yours?", - "original_transcription": ": 最好的人又有点容易混我俩beautiful what's yours\n", "generated_transcription": "\n: 最好的人又有点容易混我俩beautiful what's yours\n", - "wer_original": 0.5294117647058824, - "wer_generated": 0.9411764705882353, - "wer_improvement": -0.4117647058823529, - "wer_method": "smart_mixed", - "mer_original": 0.9411764705882353 + "cer": 0.0967741935483871, + "wer": 0.5, + "mer": 0.2 }, { "segment_id": "SEGMENT_ZH-CN_U0022_ENCN_20250816_202117_332198", - "audio_path": "SEGMENT_ZH-CN_U0022_ENCN_20250816_202117_332198.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0022_ENCN_20250816_202117_332198.wav", "ground_truth": "我现在已经眼睛已经已经四五百度了,我并不太想花很多时间在这种电子屏幕上,因为对我的损伤还是挺大的。并且你像在读纸质书的时候,他并不会像在手机上或者在平板上,它会弹出一些微信消息或者一些其他信息之类,然后啊一旦我看见这些信息,可能我就会忍不住点进去。然后可能一个小时下来,我就读了几页纸这样子,我觉得对我真真正去读书还是比较没有太大帮助吧。 Okay, are your fan over suspense or mystery novels? I've heard you mention this.", - "original_transcription": ": 现在已经眼睛已经四五百度了我并不太想花很多时间在这种电子屏幕上因为对我的损伤还是挺大的并且你像在读纸质书的时候它并不会像在手机上或者在平板上它会弹出一些微信消息或者一些其他信息之类然后一旦我看见这些信息可能我就会忍不住点进去然后可能一个小时下来我就读了几页纸这样子我觉得真正去读书还是比较没有太大帮助\n", "generated_transcription": ": 现在已经眼睛已经四五百度了我并不太想花很多时间在这种电子屏幕上因为对我的损伤还是挺大的并且你像在读直指说的时候它并不会像在手机上或者在平板上它会弹出一些微信消息或者一些其他信息之类然后一旦我看见这些信息可能我就会忍不住点进去然后可能一个小时下来我就读了几页纸这样子\n", - "wer_original": 0.9289617486338798, - "wer_generated": 0.3005464480874317, - "wer_improvement": 0.6284153005464481, - "wer_method": "smart_mixed", - "mer_original": 0.18579234972677597 + "cer": 0.42081447963800905, + "wer": 1.0, + "mer": 0.2558139534883721 }, { "segment_id": "SEGMENT_ZH-CN_U0037_ENCN_20250816_202117_308676", - "audio_path": "SEGMENT_ZH-CN_U0037_ENCN_20250816_202117_308676.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0037_ENCN_20250816_202117_308676.wav", "ground_truth": "Well, the thing is, a lot of people are like a lot of people questioned my Chinese ability, because back in the middle school and the high school, I I had like really terry I had like terrible grades in Chinese. 那饮食方面呢?", - "original_transcription": ": Well, the thing is a lot of people are like a lot of people question my Chinese ability, because back in (uh) middle school and (uh) high school I I had like really ter~ I had like terrible grades in Chinese. : (oh), (oh), (oh) (oh), (oh). : 那饮食方面呢\n", "generated_transcription": "\n: Well, the thing is a lot of people are like a lot of people question my Chinese ability, because back in (uh) middle school and (uh) high school I I had like really ter~ I had like terrible grades in Chinese. : 那饮食方面,\n", - "wer_original": 0.14583333333333334, - "wer_generated": 0.16666666666666666, - "wer_improvement": -0.020833333333333315, - "wer_method": "smart_mixed", - "mer_original": 0.25 + "cer": 0.05232558139534884, + "wer": 0.11904761904761904, + "mer": 0.10638297872340426 }, { "segment_id": "SEGMENT_ZH-CN_U1097_ENCN_20250816_202117_328172", - "audio_path": "SEGMENT_ZH-CN_U1097_ENCN_20250816_202117_328172.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1097_ENCN_20250816_202117_328172.wav", "ground_truth": "那你那你又那你去讨,那你去买那个相机去摄影了吗? Yeah, maybe.", - "original_transcription": ": 那你那你又那你去讨那你去买那个相机去摄影了吗\n", "generated_transcription": "\n: 那你那你又那你去讨那你去买那个相机去摄影了吗\n", - "wer_original": 0.9230769230769231, - "wer_generated": 0.15384615384615385, - "wer_improvement": 0.7692307692307693, - "wer_method": "smart_mixed", - "mer_original": 0.15384615384615385 + "cer": 0.2903225806451613, + "wer": 0.6666666666666666, + "mer": 0.08333333333333333 }, { "segment_id": "SEGMENT_ZH-CN_U2005_MIX_20250816_202117_352589", - "audio_path": "SEGMENT_ZH-CN_U2005_MIX_20250816_202117_352589.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U2005_MIX_20250816_202117_352589.wav", "ground_truth": "哦,我知道那个他那首歌叫做 puppy, 然后那首歌真的很 famous。在这个粉圈当中非常的 就是让人听着 hearts fell in love 的这种感觉。除了 Baek Hyun 我看一下这段时间还有哪一些 idol group 在开 concerts 呢?好像 。", - "original_transcription": ": 哦我知道那个他那首歌叫做poppy然后那首歌真的很famous在这个粉圈当中非常的就是让人听着hearts fall in love的这种感觉除了pick up我看一下这段时间还有哪一些idol group在开concert呢好像嗯\n", "generated_transcription": "\n: 哦我知道那个他那首歌叫做poppy然后那首歌真的很famous在这个粉圈当中非常的就是让人听着heart fall in love的这种感觉除了pick up我看一下这段时间还有哪一些idol group在开concert呢好像嗯\n", - "wer_original": 0.9242424242424242, - "wer_generated": 0.9848484848484849, - "wer_improvement": -0.06060606060606066, - "wer_method": "smart_mixed", - "mer_original": 0.9848484848484849 + "cer": 0.09734513274336283, + "wer": 0.9444444444444444, + "mer": 0.09722222222222222 }, { "segment_id": "SEGMENT_ZH-CN_U1104_ENCN_20250816_202117_311445", - "audio_path": "SEGMENT_ZH-CN_U1104_ENCN_20250816_202117_311445.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1104_ENCN_20250816_202117_311445.wav", "ground_truth": "因为石家庄离这也不远嘛。 Yeah. 辛苦辛苦。 Maybe after we may maybe after we finished talking , you can see the football.", - "original_transcription": ": 因为史家庄这里也不远嘛, yeah, 辛苦辛苦, : maybe after we maybe after we finish the talking, you can see the football.\n", "generated_transcription": "\n: 因为史家庄这里也不远吗 yeah 辛苦辛苦 : maybe after we maybe after we finish the talking you can see the football\n", - "wer_original": 0.5454545454545454, - "wer_generated": 0.3939393939393939, - "wer_improvement": 0.1515151515151515, - "wer_method": "smart_mixed", - "mer_original": 0.3333333333333333 + "cer": 0.12345679012345678, + "wer": 0.23529411764705882, + "mer": 0.23333333333333334 }, { "segment_id": "SEGMENT_ZH-CN_U1008_ENCN_20250816_202117_262657", - "audio_path": "SEGMENT_ZH-CN_U1008_ENCN_20250816_202117_262657.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1008_ENCN_20250816_202117_262657.wav", "ground_truth": "嗯,在学校里面的话,除了我自己经常去社团里面加入跆拳道这些的训练之外,也经常和室友一起去打羽毛球。我不知道是为什么 ,我们学校大部分人都很喜欢打羽毛球,那个场子都快不够了,而且我们全部是整全部整个寝室她们都会打,我不会的也跟着学会了。所以你会打这些羽毛球或者是更多的项目吗? Yes.", - "original_transcription": ": 除了我自己经常去社团里面加入跆拳道这些的训练之外,也经常和室友一起打羽毛球。我不知道是为什么,哈我们学校大部分人都很喜欢打羽毛球,那个场子都快不够了,而且我们全部是整个寝室,他们都会打,我不会的也跟着学会了,所以你会打这些羽毛球或者更多的项目吗?\n", "generated_transcription": ": 除了我自己经常去社团里面加入跆拳道这些的训练之外,也经常和室友一起打羽毛球。我不知道是为什么,哈我们学校大部分人都很喜欢打羽毛球。那个场子都快不够了,而且我们全部是整个寝室他们都会打。我不会的也跟着学会了,所以你会打这些羽毛球或者更多的项目吗\n", - "wer_original": 0.021897810218978103, - "wer_generated": 0.16058394160583941, - "wer_improvement": -0.1386861313868613, - "wer_method": "smart_mixed", - "mer_original": 0.145985401459854 + "cer": 0.13953488372093023, + "wer": 1.0, + "mer": 0.12598425196850394 }, { "segment_id": "SEGMENT_ZH-CN_U0006_ENCN_20250816_202117_339838", - "audio_path": "SEGMENT_ZH-CN_U0006_ENCN_20250816_202117_339838.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0006_ENCN_20250816_202117_339838.wav", "ground_truth": "对,我家就是那种小土狗儿,是我妈从路边儿捡回来的。 And. 对,就在留学留留的没有意义了就。 Train translation translation, Chinese and English words, translation. And after finishing that you have to make a conclusion about your translation with a theory, because there are a lot of translation theory like the multimodel discourse analysis, or something for example.", - "original_transcription": ": 对我们家就是那种小图画是我妈从路边捡回来的, : 对就这, 留学, 流, 流的没有意义了就, : trans, trans, translation, Chinese and English words translation, and after finishing that you have to make a conclusion about your translation with a theory because there are a lot of translation theory like (uh), the multimodal discourse analysis or something, for example.\n", "generated_transcription": ": 对我们家就是那种小图画是我妈从路边捡回来的 : 对就这留学留留都没有意义了就 : trans trans translation Chinese and English words translation and after finishing that you have to make a conclusion about your translation with a theory because there are a lot of translation theory like (uh), the multimodal discourse analysis or something for example\n", - "wer_original": 0.8313253012048193, - "wer_generated": 0.27710843373493976, - "wer_improvement": 0.5542168674698795, - "wer_method": "smart_mixed", - "mer_original": 0.3253012048192771 + "cer": 0.07835820895522388, + "wer": 0.1590909090909091, + "mer": 0.15384615384615385 }, { "segment_id": "SEGMENT_ZH-CN_U0001_MIX_20250816_202117_301660", - "audio_path": "SEGMENT_ZH-CN_U0001_MIX_20250816_202117_301660.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0001_MIX_20250816_202117_301660.wav", "ground_truth": "That's so good。Have you tried that? 我我是没我有看到过,但是我没有试过。", - "original_transcription": ": (oh) that's so good. Have you tried that? 我我是没有看到过但是我没有试过\n", "generated_transcription": "\n: (oh) that's so good. Have you tried that? 我我是没有看到过但是我没有试过\n", - "wer_original": 0.0, - "wer_generated": 0.3333333333333333, - "wer_improvement": -0.3333333333333333, - "wer_method": "smart_mixed", - "mer_original": 0.3333333333333333 + "cer": 0.06976744186046512, + "wer": 0.5714285714285714, + "mer": 0.18181818181818182 }, { "segment_id": "SEGMENT_ZH-CN_U1045_MIX_20250816_202117_329885", - "audio_path": "SEGMENT_ZH-CN_U1045_MIX_20250816_202117_329885.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1045_MIX_20250816_202117_329885.wav", "ground_truth": "I think it maybe this is the 下一个 topic。我们 in this topic 我们就直接聊关于个人的 one more thing I'm more curious about that is 你有没有女朋友。", - "original_transcription": ": (um) I think maybe this is the next topic 我们 in this topic 我们就直接聊关于个人的嗯嗯嗯One more thing I'm more curious about that is 你有没有女朋友\n", "generated_transcription": "\n: 嗯 I think maybe this is the 下一个 topic 我们 in this topic 我们就直接聊关于个人的 one more thing I'm more curious about that is 你有没有女朋友\n", - "wer_original": 0.5476190476190477, - "wer_generated": 0.14285714285714285, - "wer_improvement": 0.4047619047619048, - "wer_method": "smart_mixed", - "mer_original": 0.4523809523809524 + "cer": 0.030927835051546393, + "wer": 0.17391304347826086, + "mer": 0.046511627906976744 }, { "segment_id": "SEGMENT_ZH-CN_U0051_ENCN_20250816_202117_349366", - "audio_path": "SEGMENT_ZH-CN_U0051_ENCN_20250816_202117_349366.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0051_ENCN_20250816_202117_349366.wav", "ground_truth": "啊,心理咨询师,啊。 Yeah, she thinks dog is dirty. So she does she, she even think I am dirty so she.", - "original_transcription": ": (ah), 心理咨询师, (ah), ya, she thinks dogs is dirty, so she just (hm), she even think I'm dirty, so she.\n", "generated_transcription": "\n: (ah), 心理咨询师, ya, she thinks dog is dirty, so she just (hm), she even think I am dirty, so she\n", - "wer_original": 0.35714285714285715, - "wer_generated": 0.42857142857142855, - "wer_improvement": -0.0714285714285714, - "wer_method": "smart_mixed", - "mer_original": 0.5 + "cer": 0.14925373134328357, + "wer": 0.2631578947368421, + "mer": 0.2 }, { "segment_id": "SEGMENT_ZH-CN_U0004_MIX_20250816_202117_320731", - "audio_path": "SEGMENT_ZH-CN_U0004_MIX_20250816_202117_320731.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0004_MIX_20250816_202117_320731.wav", "ground_truth": "嗯去一些那个 Hong Kong university 的冷门儿专业,我觉得是 may be accessible.", - "original_transcription": ": 去一些香港大学的冷门专业我觉得是maybe accessible\n", "generated_transcription": "\n: 去一些香港大学的冷门专业我觉得是maybe accessible\n", - "wer_original": 0.0, - "wer_generated": 1.0, - "wer_improvement": -1.0, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.4489795918367347, + "wer": 0.875, + "mer": 0.4090909090909091 }, { "segment_id": "SEGMENT_ZH-CN_U0101_MIX_20250816_202117_332633", - "audio_path": "SEGMENT_ZH-CN_U0101_MIX_20250816_202117_332633.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0101_MIX_20250816_202117_332633.wav", "ground_truth": "针对这种,我觉得每个人都是有自己的。Maybe some secret 所以我们要在就是因为学校也谈及到 禁止,有时候会 forbidden forbidden some pictures, photos and some videos of the student。有时候也会就是禁止去拍摄一些他们的可能啊面部的一些信息。家长也会可能 worried about the information 可能会担心这样孩子个人的一些隐私 secret。", - "original_transcription": ": 嗯针对这种我觉得每个人都是有自己的maybe some secret啊所以我们要在就是因为学校也谈到呃禁止有时候会forbidden forbidden some pictures photos and some videos of the students有时候也会就是禁止去拍摄一些他们的可能啊面部的一些信息家长也会可能worried about the information啊可能会担心这样孩子个人的一些隐私secret\n", - "generated_transcription": ": 嗯针对这种我觉得每个人都是有自己的maybe some secret啊所以我们要在就是因为学校也谈到呃禁止有时候会forbidden forbidden some pictures photos and some videos of the students有时候也会就是禁止去拍摄一些他们的可能啊面部的一些信息家长也会可能 worried about the information啊可能会担心这样孩子个人的一些隐私secret\n", - "wer_original": 1.3620689655172413, - "wer_generated": 0.7931034482758621, - "wer_improvement": 0.5689655172413792, - "wer_method": "smart_mixed", - "mer_original": 0.8103448275862069 + "generated_transcription": "\n: 嗯针对这种我觉得每个人都是有自己的maybe some secret啊所以我们要在就是因为学校也谈到呃禁止有时候会forbidden forbidden some pictures photos and some videos of the students有时候也会就是禁止去拍摄一些他们的可能啊面部的一些信息家长也会可能 worried about the information啊可能会担心这样孩子个人的一些隐私secret\n", + "cer": 0.03076923076923077, + "wer": 0.4090909090909091, + "mer": 0.05660377358490566 }, { "segment_id": "SEGMENT_ZH-CN_U0038_ENCN_20250816_202117_323782", - "audio_path": "SEGMENT_ZH-CN_U0038_ENCN_20250816_202117_323782.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0038_ENCN_20250816_202117_323782.wav", "ground_truth": "这个如果如果这个健康有有等级的话,那我肯定是最后一级 。 Yeah.", - "original_transcription": ": 这个如果如果这个健康有等级的话那我肯定是最后一集, yeah.\n", - "generated_transcription": ": 这个如果如果这个健康有等级的话那我肯定是最后一集\n", - "wer_original": 0.9642857142857143, - "wer_generated": 0.17857142857142858, - "wer_improvement": 0.7857142857142857, - "wer_method": "smart_mixed", - "mer_original": 0.17857142857142858 + "generated_transcription": "\n: 这个如果如果这个健康有等级的话那我肯定是最后一集\n", + "cer": 0.20689655172413793, + "wer": 1.0, + "mer": 0.11538461538461539 }, { "segment_id": "SEGMENT_ZH-CN_U1014_ENCN_20250816_202117_284899", - "audio_path": "SEGMENT_ZH-CN_U1014_ENCN_20250816_202117_284899.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1014_ENCN_20250816_202117_284899.wav", "ground_truth": "Well, that's sure the complex. 那你很可爱,你真的很可爱。嗯, 不客气,那你就是我觉得哎你真的是就是一个很负责的对象。因为你会为了另一半去融入他的爱好,我觉得你真的很好。", - "original_transcription": ": well that's really complex : 你很可爱你真的很可爱嗯嗯不客气那你就是我觉得你真的是就是一个很负责的对象因为你会为了另一半去融入他的爱好我觉得你真的很好\n", - "generated_transcription": ": well that's really complex 你很可爱你真的很可爱嗯嗯不客气那你就是我觉得你真的是就是一个很负责的对象因为你会为了另一半去融入他的爱好我觉得你真的很好\n", - "wer_original": 0.9315068493150684, - "wer_generated": 0.1780821917808219, - "wer_improvement": 0.7534246575342465, - "wer_method": "smart_mixed", - "mer_original": 0.1780821917808219 + "generated_transcription": "\n: well that's really complex 你很可爱你真的很可爱嗯嗯不客气那你就是我觉得你真的是就是一个很负责的对象因为你会为了另一半去融入他的爱好我觉得你真的很好\n", + "cer": 0.10714285714285714, + "wer": 0.5714285714285714, + "mer": 0.07575757575757576 }, { "segment_id": "SEGMENT_ZH-CN_U1008_ENCN_20250816_202117_262467", - "audio_path": "SEGMENT_ZH-CN_U1008_ENCN_20250816_202117_262467.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1008_ENCN_20250816_202117_262467.wav", "ground_truth": "所以后面产生了什么矛盾吗? when they to the top of mountain, we can see a very beautiful mountain mountains like Gonga mountain, because that was the air was there was very good. And also.", - "original_transcription": ": So, so, so, so, the mountain, the mountain, the mountains, the mountains, the mountain, the air, the air, the mountain, the view, the view, the mountain, the beautiful mountain, the beautiful mountain\n", - "generated_transcription": ": 所以后面产生了什么矛盾吗\n: 我们到山顶的时候可以看到非常美丽的mountain mountains 就像公加山因为那空气很好\n", - "wer_original": 0.3953488372093023, - "wer_generated": 0.6976744186046512, - "wer_improvement": -0.3023255813953489, - "wer_method": "smart_mixed", - "mer_original": 0.9069767441860465 + "generated_transcription": "\n: 所以后面产生了什么矛盾吗\n", + "cer": 0.9142857142857143, + "wer": 0.967741935483871, + "mer": 0.7142857142857143 }, { "segment_id": "SEGMENT_ZH-CN_U0005_ENCN_20250816_202117_348327", - "audio_path": "SEGMENT_ZH-CN_U0005_ENCN_20250816_202117_348327.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0005_ENCN_20250816_202117_348327.wav", "ground_truth": "No. 嗯。 平时的话在周末喜欢去户外爬爬山,然后 ,就是呼吸一下新鲜空气,包括跟朋友 可能约一下去周边玩儿,然后或者去公园散散步,划划船之类的 。 Yes I think so.", - "original_transcription": ": 嗯平时的话在周末喜欢去户外爬爬山然后呃就是呼吸一下新鲜空气包括跟朋友呃可能约一下去周边玩儿然后或者去公园散散步划划船之类的 yeah think so\n", - "generated_transcription": ": 嗯平时的话在周末喜欢去户外爬爬山然后呃就是呼吸一下新鲜空气包括跟朋友呃可能约一下去周边玩儿然后或者去公园散散步划划船之类的 yeah think so\n", - "wer_original": 0.15492957746478872, - "wer_generated": 0.16901408450704225, - "wer_improvement": -0.01408450704225353, - "wer_method": "smart_mixed", - "mer_original": 0.16901408450704225 + "generated_transcription": "\n: 嗯平时的话在周末喜欢去户外爬爬山然后呃就是呼吸一下新鲜空气包括跟朋友呃可能约一下去周边玩儿然后或者去公园散散步划划船之类的 yeah think so\n", + "cer": 0.08333333333333333, + "wer": 0.7777777777777778, + "mer": 0.078125 }, { "segment_id": "SEGMENT_ZH-CN_U2012_MIX_20250816_202117_305642", - "audio_path": "SEGMENT_ZH-CN_U2012_MIX_20250816_202117_305642.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U2012_MIX_20250816_202117_305642.wav", "ground_truth": "大学的老师,这些 professors 他们都很 professional。他们非常 focus 去引导学生一个 thinking and exploration ,那当然呢也会有一些professor 会比较 strict 我觉得但也是为了让你更好的学习。", - "original_transcription": ": 嗯大学的老师这些 professors 他们都很 professional 他们非常 focus 去引导学生一个 thinking (ah) and exploration (ah) 那当然呢也会有一些 professor 会比较 strict, 我觉得但也是为了让你更好的学习\n", - "generated_transcription": ": 嗯大学的老师这些 professors 他们都很 professional 他们非常 focus 去引导学生一个 thinking (ah) and exploration (ah) 那当然呢也会有一些 professor 会比较 strict 我觉得但也是为了让你更好的学习\n", - "wer_original": 0.6739130434782609, - "wer_generated": 0.43478260869565216, - "wer_improvement": 0.2391304347826087, - "wer_method": "smart_mixed", - "mer_original": 0.45652173913043476 + "generated_transcription": "\n: 嗯大学的老师这些 professors 他们都很 professional 他们非常 focus 去引导学生一个 thinking (ah) and exploration (ah) 那当然啦也会有一些 professor 会比较 strict 我觉得但也是为了让你更好的学习\n", + "cer": 0.05309734513274336, + "wer": 0.5384615384615384, + "mer": 0.07017543859649122 }, { "segment_id": "SEGMENT_ZH-CN_U1041_ENCN_20250816_202117_344059", - "audio_path": "SEGMENT_ZH-CN_U1041_ENCN_20250816_202117_344059.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1041_ENCN_20250816_202117_344059.wav", "ground_truth": "After work. 对,就能解压。 Oh, yeah. If I finish my my lesson, my class early, in long day, I will go to the gym and do some practice, like. 对,做一下销售预测是可以的。", - "original_transcription": ": after work : 对就是能解压 : (oh) yeah (err) if I finish my my lesson my class early (err) in one day I will go to the gym and do some practice like 对做一下销售预测是可以的\n", - "generated_transcription": ": after work : 对就是能解压 : (oh) yeah (err) if I finish my my lesson my class early (err) in one day I will go to the gym and do some practice like 对做一下销售预测是可以的\n", - "wer_original": 0.4375, - "wer_generated": 0.3541666666666667, - "wer_improvement": 0.08333333333333331, - "wer_method": "smart_mixed", - "mer_original": 0.3541666666666667 + "generated_transcription": "\n: after work : 对就是能解压 : (oh) yeah (err) if I finish my my lesson my class early (err) in one day I will go to the gym and do some practice like 对做一下销售预测是可以的\n", + "cer": 0.08333333333333333, + "wer": 0.13793103448275862, + "mer": 0.09090909090909091 }, { "segment_id": "SEGMENT_ZH-CN_U0037_ENCN_20250816_202117_309011", - "audio_path": "SEGMENT_ZH-CN_U0037_ENCN_20250816_202117_309011.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0037_ENCN_20250816_202117_309011.wav", "ground_truth": "是。 So yeah, and back in elementary school, I did swimming as and up utill to high school, I did kayaking and I successfully got a level A, a certificate kayaking as well.", - "original_transcription": ": sure. : So yeah, and back in elementary school I did swimming. I s~ and up until high school I did kayaking and I s~ successfully got a a level A certificate in kayaking as well.\n", - "generated_transcription": ": sure. : So yeah, and back in elementary school I did swimming. I s~ and up until high school I did kayaking and I s~ successfully got a a level A certificate in kayaking as well.\n", - "wer_original": 0.08823529411764706, - "wer_generated": 0.4117647058823529, - "wer_improvement": -0.32352941176470584, - "wer_method": "smart_mixed", - "mer_original": 0.4117647058823529 + "generated_transcription": "\n: sure. : So yeah, and back in elementary school I did swimming. I s~ and up until high school I did kayaking and I s~ successfully got a a level A certificate in kayaking as well\n", + "cer": 0.10606060606060606, + "wer": 0.2727272727272727, + "mer": 0.2727272727272727 }, { "segment_id": "SEGMENT_ZH-CN_U0015_ENCN_20250816_202117_269388", - "audio_path": "SEGMENT_ZH-CN_U0015_ENCN_20250816_202117_269388.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0015_ENCN_20250816_202117_269388.wav", "ground_truth": "Man's volleyball? I haven't heard, the focus are few obey the foreigners, because foreigners, because I haven't heard the Chinese man's volleyballs famous famous sportsman? 一起玩吗?哦,确实,这个词用的很高情商啊,其他的交流 。", - "original_transcription": ": men's volleyball, : (oh), I haven't heard. (um), (uh), the focus of you will be the foreigners because foreigners because I (uh), haven't heard that the Chinese is men's volleyball's (uh), famous famous sportsman, 你会玩吗, : (oh), 确实, 这个词用了很高情商, 还有其他的交流\n", "generated_transcription": "\n: men's volleyball, : (oh), I haven't heard. (um), (uh), the focus I feel will be the foreigners because foreigners because I (uh), haven't heard that the Chinese is men's volleyball's (uh), famous famous sportsman, 你都玩吗, : (oh), 确实, 这个词用了很高情商, 还有其他的交流\n", - "wer_original": 0.9038461538461539, - "wer_generated": 0.5769230769230769, - "wer_improvement": 0.326923076923077, - "wer_method": "smart_mixed", - "mer_original": 0.5769230769230769 + "cer": 0.20496894409937888, + "wer": 0.6923076923076923, + "mer": 0.40425531914893614 }, { "segment_id": "SEGMENT_ZH-CN_U1092_ENCN_20250816_202117_296280", - "audio_path": "SEGMENT_ZH-CN_U1092_ENCN_20250816_202117_296280.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1092_ENCN_20250816_202117_296280.wav", "ground_truth": "嗯,对。 Well, I majored in applied English, and I minored in, you know, a business trade. And well, you you know, I so what's your major?", - "original_transcription": ": (um) 对 well I majored in applied English and I minored in you know a business trade and (uh) well you you know I so what's your major\n", "generated_transcription": "\n: (hm) 对\n: Well I majored in applied English and I minored in you know a business trade and (uh) well you you know I so what's your major\n", - "wer_original": 0.13793103448275862, - "wer_generated": 0.4482758620689655, - "wer_improvement": -0.3103448275862069, - "wer_method": "smart_mixed", - "mer_original": 0.4482758620689655 + "cer": 0.04081632653061224, + "wer": 0.11538461538461539, + "mer": 0.07407407407407407 }, { "segment_id": "SEGMENT_ZH-CN_U1092_MIX_20250816_202117_296118", - "audio_path": "SEGMENT_ZH-CN_U1092_MIX_20250816_202117_296118.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1092_MIX_20250816_202117_296118.wav", "ground_truth": "Super Bowl 是那个类似于美国的一个春晚,是他们 Super Bowl 是一个橄榄球运动。然后他们会,对,然后他们会在这个他们会在这个 free time 会在中间的这个中场休息的时候,请很多的明星大咖来演出。对,然后 Michael Jackson 非常难得的一点,就是别人都是以上这个超级碗为荣,而超级碗是以能邀请到 Michael 为荣。", - "original_transcription": ": 呃,superbowl 是那个类似于美国的一个春晚,他们,superbowl 是一个橄榄球运动,然后他们会,对,然后他们会在这个,他们会在这个free time,会在中间的这个中场休息的时候请很多的明星大咖来演出,对,然后,呃,michael jackson非常难得的一点就是别人都是以上这个超级碗为荣,而超级碗是以能邀请到michael为荣。\n", "generated_transcription": "\n呃 super bowl 是那个类似于美国的一个春晚是他们 super bowl 是一个橄榄球运动然后他们会 对然后他们会在这个他们会在这个 free time 会在中间的这个中场休息的时候请很多的明星大咖来演出 对然后呃 michael jackson 非常难得的一点就是别人都是以 上这个超级碗 为荣 而超级碗是以能邀请到 michael 为荣\n", - "wer_original": 0.08, - "wer_generated": 0.152, - "wer_improvement": -0.072, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.013071895424836602, + "wer": 0.6428571428571429, + "mer": 0.017391304347826087 }, { "segment_id": "SEGMENT_ZH-CN_U0019_MIX_20250816_202117_330773", - "audio_path": "SEGMENT_ZH-CN_U0019_MIX_20250816_202117_330773.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0019_MIX_20250816_202117_330773.wav", "ground_truth": "那那我目前来说也算是啊也是工作人人群嘛,我可以给你讲讲工作吧。嗯,工作也分很多种嘛,其中有技术类型的工作,比如说像是 I T 那种,然后他们的话应该是你技术越高,然后呢,然后你的话语权越越大,就这样的话,基本上老板也不会拿你怎么样这种。然后就是你算是那个技术的核心核心啊部分。然后是你的工作任务,不是要去和老板怎么样怎么样,而是把你手上的最核心的技术要学好。然后他们的话应该是啊像人际交交流这方面比会比较少一点。然后如果是偏向业务部门的话,啊,他们会走一些流程啊,然后然后会对接对接外来的销售啊,以及内部的什么 H R,然后和一些其他部合作商之类的。", - "original_transcription": ": 那那我目前来说也算是也是工作人群嘛我可以给你讲讲工作啊工作也分很多种嘛其中有技术类型的工作比如说像是IT那种然后他们的话应该是你技术越高然后呢然后你的话语权越越大这样的话基本上老板也不会拿你怎么样这种然后就是你算是那个技术的核心核心部分然后是你的工作任务不是要去和老板怎么样怎么样而是把你手上的最核心的技术要学好然后他们的话应该是啊像人际交往方面会比较少一点然后如果是偏向业务部门的话啊他们会走一些流程啊然后然后会对接对接外来的销售啊以及内部的什么HR然后会一些其他合作伙伴之类的\n", "generated_transcription": ": 那那我目前来说也算是也是工作人群嘛我可以给你讲讲工作啊工作也分很多种嘛其中有技术类型的工作比如说像是IT那种然后他们的话应该是你技术越高然后呢然后你的话语权越越大这样的话基本上老板也不会拿你怎么样这种然后就是你算是那个技术的核心核心部分然后是你的工作任务不是要去和老板怎么样怎么样而是把你手上的最核心的技术要学好然后他们的话应该是啊像人际交流方面会比较少一点然后如果是偏向业务部门的话啊他们会走一些流程啊然后然后会对接对接外来的销售啊以及内部的什么HR然后会一些其他不合作商之类的\n", - "wer_original": 0.0, - "wer_generated": 1.0, - "wer_improvement": -1.0, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.04435483870967742, + "wer": 1.0, + "mer": 0.06048387096774194 }, { "segment_id": "SEGMENT_ZH-CN_U0040_ENCN_20250816_202117_283862", - "audio_path": "SEGMENT_ZH-CN_U0040_ENCN_20250816_202117_283862.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0040_ENCN_20250816_202117_283862.wav", "ground_truth": "泰国是佛教。 I i know. Everyone in my school are focused on the Gaokao. We don't have even one class about sport one once week. I can't believe. 明白。", - "original_transcription": ": (ah), 泰国是佛教的, I know everyone in my school focus on they 高考, We don't have even one class about sports one week once a week. I can't believe, 明白.\n", "generated_transcription": "\n: (ah), 泰国是佛教的, I, I know everyone in my school focus on they 高考, We don't have even one class about sports one week once a week. I can't believe 明白\n", - "wer_original": 1.2285714285714286, - "wer_generated": 0.5142857142857142, - "wer_improvement": 0.7142857142857144, - "wer_method": "smart_mixed", - "mer_original": 0.4857142857142857 + "cer": 0.18691588785046728, + "wer": 0.32142857142857145, + "mer": 0.30303030303030304 }, { "segment_id": "SEGMENT_ZH-CN_U2011_MIX_20250816_202117_348552", - "audio_path": "SEGMENT_ZH-CN_U2011_MIX_20250816_202117_348552.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U2011_MIX_20250816_202117_348552.wav", "ground_truth": "Wow, so interesting. 我还听说大学的课程非常的 free, 就是我可以 有很多 choice 去选择我喜欢的一些 课程。是这样子的吗?Is that is that true?", - "original_transcription": ": !(wow)! so interesting, 我还听说大学的课程非常的, free, 就是我可以, (err), 有很多 choice 去选择我喜欢的, 一些, (err) 课程是这样子的吗, Is that is that true?\n", "generated_transcription": "\n: !(wow)! so interesting 我还听说大学的课程非常的 free 就是我可以 (err) 有很多 choice 去选择我喜欢的一些 (err) 课程是这样子的吗 Is that is that true\n", - "wer_original": 0.38461538461538464, - "wer_generated": 0.38461538461538464, - "wer_improvement": 0.0, - "wer_method": "smart_mixed", - "mer_original": 0.48717948717948717 + "cer": 0.0759493670886076, + "wer": 0.2857142857142857, + "mer": 0.0425531914893617 }, { "segment_id": "SEGMENT_ZH-CN_U1016_ENCN_20250816_202117_327340", - "audio_path": "SEGMENT_ZH-CN_U1016_ENCN_20250816_202117_327340.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1016_ENCN_20250816_202117_327340.wav", "ground_truth": "甩路亚是什么呀?你可以和我介绍一下吗? Yeah, I also have many hobbies. I enjoying play basketball and listening to music and play many sports, such as badminton and ping-pong ball and soccer ball.", - "original_transcription": ": 甩路亚是什么呀你可以和我介绍一下吗 : 嗯 yeah I also have many hobbies I enjoy playing basketball and listening to music and play many sports such as badminton and ping pong ball and soccer ball\n", "generated_transcription": "\n: 甩路亚是什么呀你可以和我介绍一下吗\n: 嗯 yeah I also have many hobbies I enjoy playing basketball and listening to music and play many sports such as badminton and ping pong ball and soccer ball\n", - "wer_original": 0.9347826086956522, - "wer_generated": 0.21739130434782608, - "wer_improvement": 0.7173913043478262, - "wer_method": "smart_mixed", - "mer_original": 0.21739130434782608 + "cer": 0.04895104895104895, + "wer": 0.17857142857142858, + "mer": 0.11363636363636363 }, { "segment_id": "SEGMENT_ZH-CN_U2013_MIX_20250816_202117_333322", - "audio_path": "SEGMENT_ZH-CN_U2013_MIX_20250816_202117_333322.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U2013_MIX_20250816_202117_333322.wav", "ground_truth": "那说到考试,in sense 真的是让人压力山大。 是啊,社团 club 活动倒是很吸引人。", - "original_transcription": ": 嗯说到考试insance真的是让人压力山大是啊社团俱乐部活动都是很吸引人\n", "generated_transcription": "\n: 嗯说到考试insence真的是让人压力山大是啊社团klubb活动都是很吸引人\n", - "wer_original": 0.5185185185185185, - "wer_generated": 1.0, - "wer_improvement": -0.4814814814814815, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.13513513513513514, + "wer": 1.0, + "mer": 0.1724137931034483 }, { "segment_id": "SEGMENT_ZH-CN_U0029_MIX_20250816_202117_297942", - "audio_path": "SEGMENT_ZH-CN_U0029_MIX_20250816_202117_297942.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0029_MIX_20250816_202117_297942.wav", "ground_truth": "真的没错,但没办法,因为嗯他们平日太多 overtime 的那个工作了,很很累吧,是以嗯哼。", - "original_transcription": ": 真的没错但没办法因为他们平日太多over time的工作很很累吧是嗯嗯\n", "generated_transcription": "\n: 真的没错但没办法因为他们平日太多over time的工作很很累\n", - "wer_original": 0.05405405405405406, - "wer_generated": 1.0, - "wer_improvement": -0.9459459459459459, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.23076923076923078, + "wer": 1.0, + "mer": 0.34375 }, { "segment_id": "SEGMENT_ZH-CN_U0044_ENCN_20250816_202117_280133", - "audio_path": "SEGMENT_ZH-CN_U0044_ENCN_20250816_202117_280133.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0044_ENCN_20250816_202117_280133.wav", "ground_truth": "es. 嗯,我觉得也是在一次一次的尝试中来强化自己的能力,像第一次可能申请会失败,然后我就会去问,那原因是什么呢?然后老师可能会说是因为你这个数据没有信服力,你需要写的再清楚一些。那每一次经过一次一次这样的调整,这样你的简历就会越来越好。所以我觉得找工作也是一个不断磨砺,然后不断成长的过程。那你找工作也是有有类似的经历吗?", - "original_transcription": ": 有类似的经历吗\n", "generated_transcription": ": 有类似的经历吗\n", - "wer_original": 0.025157232704402517, - "wer_generated": 0.9559748427672956, - "wer_improvement": -0.9308176100628931, - "wer_method": "smart_mixed", - "mer_original": 0.9559748427672956 + "cer": 0.9527027027027027, + "wer": 1.0, + "mer": 0.9523809523809523 }, { "segment_id": "SEGMENT_ZH-CN_U0021_ENCN_20250816_202117_261710", - "audio_path": "SEGMENT_ZH-CN_U0021_ENCN_20250816_202117_261710.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0021_ENCN_20250816_202117_261710.wav", "ground_truth": "对,对,电子设备的干扰确实会比较多,嗯,但是现在我有的时候也有时候会犯懒,就是懒得去找纸质书,或者是有时候嗯课堂上发的一些资料,还是就是电子书,可能有时候就是在无意识间会对电子书形成一种依赖吧,但是我现在也在尽量就是嗯改变它,就是尽量还是多去读纸质的书。 Yeah.", - "original_transcription": ": 电子设备的干扰确实会比较多但是现在我有的时候也有时候会犯懒就是懒得去找纸质书或者是有时候课堂上发的一些资料还是就是电子书可能有时候就是在无意间会对电子书形成一种依赖吧但是我现在也在尽量就是改变它就是尽量还是多读纸质书止止止止止止\n", "generated_transcription": ": 电子设备的干扰确实会比较多\n", - "wer_original": 0.9921875, - "wer_generated": 0.8984375, - "wer_improvement": 0.09375, - "wer_method": "smart_mixed", - "mer_original": 0.1875 + "cer": 0.8916666666666667, + "wer": 1.0, + "mer": 0.8888888888888888 }, { "segment_id": "SEGMENT_ZH-CN_U1030_MIX_20250816_202117_338640", - "audio_path": "SEGMENT_ZH-CN_U1030_MIX_20250816_202117_338640.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1030_MIX_20250816_202117_338640.wav", "ground_truth": "我写日记已经差不多,I think 十多年了吧。", - "original_transcription": ": 我写日记已经差不多十多年了吧\n", "generated_transcription": "\n: 我写日记已经差不多十多年了吧\n", - "wer_original": 1.625, - "wer_generated": 1.25, - "wer_improvement": 0.375, - "wer_method": "smart_mixed", - "mer_original": 1.25 + "cer": 0.3, + "wer": 1.0, + "mer": 0.125 }, { "segment_id": "SEGMENT_ZH-CN_U0098_ENCN_20250816_202117_296623", - "audio_path": "SEGMENT_ZH-CN_U0098_ENCN_20250816_202117_296623.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0098_ENCN_20250816_202117_296623.wav", "ground_truth": "And we in the scar and the love ** of timing and Pumbaa. 他很浪费时间和金钱玩网络游戏,我觉得其实非常耗时间的。尤其是在你的级别高了以后,不要升级,必须要花很长时间去练习,否则很难升级。但是 。", - "original_transcription": ": and William Scar and the log book of timing and the puma 他很浪费时间和金钱玩玩游戏我觉得是非常好时间的尤其是在你的级别越高以后要升级必须要花很长时间去练习否则很难升级但是\n", "generated_transcription": "\n: and William scar and the log book of timing and the puma 他很浪费时间和金钱玩玩游戏我觉得是非常好时间的尤其是在你的级别越高以后要升级必须要花很长时间去练习否则很难升级\n", - "wer_original": 0.8375, - "wer_generated": 0.3125, - "wer_improvement": 0.525, - "wer_method": "smart_mixed", - "mer_original": 0.3 + "cer": 0.25742574257425743, + "wer": 0.6153846153846154, + "mer": 0.2361111111111111 }, { "segment_id": "SEGMENT_ZH-CN_U0015_ENCN_20250816_202117_269202", - "audio_path": "SEGMENT_ZH-CN_U0015_ENCN_20250816_202117_269202.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0015_ENCN_20250816_202117_269202.wav", "ground_truth": "Or can you or maybe I can change the mood to can you balance them? 哦。", - "original_transcription": ": Or can you? Or maybe I can choose the moon to? Can you balance them?\n", "generated_transcription": "\n: Or can you or maybe I can choose the moon to? Can you balance them\n", - "wer_original": 0.17647058823529413, - "wer_generated": 0.4117647058823529, - "wer_improvement": -0.2352941176470588, - "wer_method": "smart_mixed", - "mer_original": 0.5294117647058824 + "cer": 0.09615384615384616, + "wer": 0.1875, + "mer": 0.1875 }, { "segment_id": "SEGMENT_ZH-CN_U1061_MIX_20250816_202117_333704", - "audio_path": "SEGMENT_ZH-CN_U1061_MIX_20250816_202117_333704.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1061_MIX_20250816_202117_333704.wav", "ground_truth": "你们学校有没有什么文校园文化节或者各种 show time 之类的? 你你最近在写论文吗?Paper。", - "original_transcription": ": 你们学校有没有什么文校园文化节或者各种oh 诗坛之类的你你最近在写论文吗paper\n", "generated_transcription": "\n: 你们学校有没有什么文校园文化节或者各种oh 诗坛之类的 你你最近在写论文吗 paper\n", - "wer_original": 1.0, - "wer_generated": 1.0, - "wer_improvement": 0.0, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.1590909090909091, + "wer": 1.0, + "mer": 0.08823529411764706 }, { "segment_id": "SEGMENT_ZH-CN_U1056_ENCN_20250816_202117_347323", - "audio_path": "SEGMENT_ZH-CN_U1056_ENCN_20250816_202117_347323.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1056_ENCN_20250816_202117_347323.wav", "ground_truth": "No, I don't have. 嗯。 其实,哎,其实我我是一个比较 虽然说我很就是我的社交不是很强,但是我又是一个比较好动的人。所以其实我以前很就是喜欢的运动有很多。就比如说像你提到的 踢足球啊,打篮球,然后橄榄球这些,然后我都很喜欢,只是最后发现。 I,yes.", - "original_transcription": ": no, I don't have. (hm). (err), 其实, 哎, 其实我, 我是一个比较, (err), 虽然说我很, 就是我的社交, 不是很强, 但是我又是一个比较, 好动的人, 所以其实我以前, 很就是喜欢的运动有很多, 就比如说像你提到的, (err), 踢足球, 啊, 篮球, 然后, 橄榄球, 这些, 然后我都很喜欢, 只是最后发现,\n", "generated_transcription": ": no, I don't have. (hm). (err), 其实, 哎, 其实我, 我是一个比较, (err), 虽然说我很, 就是我的社交, 不是很强, 但是我又是一个比较, 好动的人, 所以其实我以前, 很就是喜欢的运动有很多, 就比如说, 像你提到的, (err), 踢足球, 啊, 打篮球, 然后, 橄榄球, 这些, 然后, 我都很喜欢, 只是最后发现\n", - "wer_original": 0.0990990990990991, - "wer_generated": 0.25225225225225223, - "wer_improvement": -0.15315315315315314, - "wer_method": "smart_mixed", - "mer_original": 0.24324324324324326 + "cer": 0.13636363636363635, + "wer": 2.888888888888889, + "mer": 0.05 }, { "segment_id": "SEGMENT_ZH-CN_U1098_ENCN_20250816_202117_260181", - "audio_path": "SEGMENT_ZH-CN_U1098_ENCN_20250816_202117_260181.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1098_ENCN_20250816_202117_260181.wav", "ground_truth": "然后。 A level? What? What does what is it exactly?", - "original_transcription": ": 然后A level what what does it what is it exactly\n", "generated_transcription": "\n: 然后A level what what does what is it exactly\n", - "wer_original": 0.4166666666666667, - "wer_generated": 0.6666666666666666, - "wer_improvement": -0.24999999999999994, - "wer_method": "smart_mixed", - "mer_original": 0.75 + "cer": 0.0, + "wer": 0.2, + "mer": 0.0 }, { "segment_id": "SEGMENT_ZH-CN_U0042_ENCN_20250816_202117_274877", - "audio_path": "SEGMENT_ZH-CN_U0042_ENCN_20250816_202117_274877.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0042_ENCN_20250816_202117_274877.wav", "ground_truth": "是的。 Yeah to prepare next step. the skate. 去年六月份的时候。", - "original_transcription": ": 是的, : Ya to prepare the next step, : the skate, 去年六月份的时候,\n", "generated_transcription": "\n: 是的 : Ya to prepare the next step : the skate, 去年六月份的时候\n", - "wer_original": 0.8421052631578947, - "wer_generated": 0.3157894736842105, - "wer_improvement": 0.5263157894736842, - "wer_method": "smart_mixed", - "mer_original": 0.3157894736842105 + "cer": 0.1282051282051282, + "wer": 0.2222222222222222, + "mer": 0.11764705882352941 }, { "segment_id": "SEGMENT_ZH-CN_U1051_ENCN_20250816_202117_281665", - "audio_path": "SEGMENT_ZH-CN_U1051_ENCN_20250816_202117_281665.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1051_ENCN_20250816_202117_281665.wav", "ground_truth": "Hi, Emily, my job I'm I am doing international. 也是反正我在上海工作的感触就是基本上 只有工作啊,没有生活,然后呢只有同事没有朋友,成都可能感觉不一样哈,工作的这种感觉。", - "original_transcription": ": 嗨emily我的工作嗯嗯嗯doing international : 嗯也是反正我在上海工作的感触就是基本上呃只有工作没有生活然后呢只有同事没有朋友成都可能感觉不一样哈工作的这种感觉\n", "generated_transcription": "\n: 嗨emily my job i am doing international 嗯也是反正我在上海工作的感触就是基本上呃只有工作没有生活然后呢只有同事没有朋友成都可能感觉不一样哈工作的这种感觉\n", - "wer_original": 0.2753623188405797, - "wer_generated": 0.18840579710144928, - "wer_improvement": 0.08695652173913043, - "wer_method": "smart_mixed", - "mer_original": 0.2318840579710145 + "cer": 0.07777777777777778, + "wer": 0.45454545454545453, + "mer": 0.078125 }, { "segment_id": "SEGMENT_ZH-CN_U0023_MIX_20250816_202117_263416", - "audio_path": "SEGMENT_ZH-CN_U0023_MIX_20250816_202117_263416.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0023_MIX_20250816_202117_263416.wav", "ground_truth": "个人 value 还是非常值得被重视起来。很多人非常的 depressive nowadays 可能也是因为 nobody see them, nobody see their values,so it can be easily lead to depressive depressive feelings,我觉得这也是比较正常的。", - "original_transcription": "我觉得这也是比较正常的, 很多人非常的, depressive nowadays, 可能也是因为, nobody see them, nobody see their values, so it can be easily lead to depressive, (err) depressive feelings, 我觉得这也是比较正常的, 因为, 很多人非常的, depressed, nowadays, 可能也是因为 nobody see them, nobody sees their values, so it it can be easily lead, to depressive, (err), depressive feelings, 我觉得这是也是比较正常的, 因为很多人非常的, depressed, now days, 可能也是因为 nobody, see them, nobody see, their values, so it, it can be easily lead.\n", "generated_transcription": "\n个人 value, (err) 还是非常, 值得被重视起来, 很多人非常的, depressive, nowadays, (err) 可能也是因为, nobody see them, nobody see their values, so it can be easily lead to depressive, (err) depressive feelings, 我觉得这也是比较正常的\n", - "wer_original": 0.0, - "wer_generated": 0.5555555555555556, - "wer_improvement": -0.5555555555555556, - "wer_method": "smart_mixed", - "mer_original": 2.511111111111111 + "cer": 0.06382978723404255, + "wer": 0.45454545454545453, + "mer": 0.09090909090909091 }, { "segment_id": "SEGMENT_ZH-CN_U0023_MIX_20250816_202117_263312", - "audio_path": "SEGMENT_ZH-CN_U0023_MIX_20250816_202117_263312.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0023_MIX_20250816_202117_263312.wav", "ground_truth": "Because 那个时候还是 Soviet time,所以就是。 They don't really 区别就是区分开来,就是哪国人,哪国人他们只会觉得自己都是苏联人这样的,并不会有这么大的隔阂,像现在。", - "original_transcription": ": because 那个时候还是Soviet time 所以就是 they don't really 区别就是区分开来就是哪国人哪国人他们只会觉得自己都是呃苏联人这样的并不会有这么大的隔阂像现在\n", "generated_transcription": "\n: because 那个时候还是Soviet time 所以就是 they don't really 区别就是区分开来就是哪国人哪国人他们只会觉得自己都是呃苏联人这样的并不会有这么大的隔阂像现在\n", - "wer_original": 0.2903225806451613, - "wer_generated": 0.24193548387096775, - "wer_improvement": 0.04838709677419356, - "wer_method": "smart_mixed", - "mer_original": 0.24193548387096775 + "cer": 0.011627906976744186, + "wer": 0.5, + "mer": 0.01639344262295082 }, { "segment_id": "SEGMENT_ZH-CN_U0062_MIX_20250816_202117_318032", - "audio_path": "SEGMENT_ZH-CN_U0062_MIX_20250816_202117_318032.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0062_MIX_20250816_202117_318032.wav", "ground_truth": "对, Paris in the Rain ,对。 它为什么还需要去去用它自己的东西去 test to your english level 呢?", - "original_transcription": ": 对, parisians, 的人对, 他为什么还需要去, 用他自己的东西去, test, 你, your English level, (ah).\n", - "generated_transcription": ": 对, parisian 的人对, 他为什么还需要去, 用他自己的东西去, test, 你, your English level, 吗\n", - "wer_original": 0.5454545454545454, - "wer_generated": 0.5454545454545454, - "wer_improvement": 0.0, - "wer_method": "smart_mixed", - "mer_original": 0.5454545454545454 + "generated_transcription": "\n: 对, parisian 的人对, 他为什么还需要去, 用他自己的东西去, test, 你, your English level, 吗\n", + "cer": 0.25, + "wer": 0.6153846153846154, + "mer": 0.3103448275862069 }, { "segment_id": "SEGMENT_ZH-CN_U1039_ENCN_20250816_202117_292652", - "audio_path": "SEGMENT_ZH-CN_U1039_ENCN_20250816_202117_292652.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1039_ENCN_20250816_202117_292652.wav", "ground_truth": "We have we have five or four, I'm not sure. 就是我们我们只只要求只要求了解,不需要 不需要很深。", - "original_transcription": ": we have we have five or four i'm not sure 对我们我们只只要求只要求了解不需要不需要很深\n", - "generated_transcription": ": we have we have five or four i'm not sure 对我们我们只只要求只要求了解不需要不需要很深\n", - "wer_original": 0.7142857142857143, - "wer_generated": 0.22857142857142856, - "wer_improvement": 0.48571428571428577, - "wer_method": "smart_mixed", - "mer_original": 0.22857142857142856 + "generated_transcription": "\n: we have we have five or four i'm not sure 对我们我们只只要求只要求了解不需要不需要很深\n", + "cer": 0.037037037037037035, + "wer": 0.16666666666666666, + "mer": 0.06060606060606061 }, { "segment_id": "SEGMENT_ZH-CN_U1024_MIX_20250816_202117_289326", - "audio_path": "SEGMENT_ZH-CN_U1024_MIX_20250816_202117_289326.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1024_MIX_20250816_202117_289326.wav", "ground_truth": "Pop star ,last year, my favorite pop star is 王嘉尔。Do you had that do you had had?", - "original_transcription": ": Pop star (um) last year my favorite pop star is Wang Jiaer do you have that\n", - "generated_transcription": ": Pop star (um) last year my favorite pop star is wang jiale do you have that\n", - "wer_original": 0.29411764705882354, - "wer_generated": 0.5294117647058824, - "wer_improvement": -0.23529411764705882, - "wer_method": "smart_mixed", - "mer_original": 0.5294117647058824 + "generated_transcription": "\n: Pop star (um) last year my favorite pop star is wang jiale do you have that\n", + "cer": 0.3333333333333333, + "wer": 0.4117647058823529, + "mer": 0.45 }, { "segment_id": "SEGMENT_ZH-CN_U0002_MIX_20250816_202117_291226", - "audio_path": "SEGMENT_ZH-CN_U0002_MIX_20250816_202117_291226.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0002_MIX_20250816_202117_291226.wav", "ground_truth": "所以其实寻找 purpose 的这个过程也是很有意思的。 你觉得你觉得我们应该怎么样去寻找人生的 purpose 呢?", - "original_transcription": ": 所以其实寻找purpose的这个过程也是很有意思的你觉得你觉得我们应该怎么样去寻找人生的purpose呢\n", - "generated_transcription": ": 所以其实寻找 purpose 的这个过程也是很有意思你觉得你觉得我们应该怎么样去寻找人生的 purpose 吗\n", - "wer_original": 0.9523809523809523, - "wer_generated": 0.09523809523809523, - "wer_improvement": 0.8571428571428571, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "generated_transcription": "\n: 所以其实寻找 purpose 的这个过程也是很有意思你觉得你觉得我们应该怎么样去寻找人生的 purpose 吗\n", + "cer": 0.038461538461538464, + "wer": 0.5, + "mer": 0.05 }, { "segment_id": "SEGMENT_ZH-CN_U0066_MIX_20250816_202117_349766", - "audio_path": "SEGMENT_ZH-CN_U0066_MIX_20250816_202117_349766.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0066_MIX_20250816_202117_349766.wav", "ground_truth": "Yes, and another I am be singer. I like a 陶喆。 Do you know 陶喆。", - "original_transcription": ": Yes, and another R and B singer I like, (uh), and then **, Tao, do you know Tao, **?\n", - "generated_transcription": ": Yes and another R and B singer I like (uh) name ** Tao Jiang, Do you know Tao Jiang\n", - "wer_original": 0.3157894736842105, - "wer_generated": 0.6842105263157895, - "wer_improvement": -0.368421052631579, - "wer_method": "smart_mixed", - "mer_original": 0.7368421052631579 + "generated_transcription": "\n: Yes and another R and B singer I like (uh) name ** Tao Jiang, Do you know Tao Jiang\n", + "cer": 0.5813953488372093, + "wer": 0.6, + "mer": 0.5294117647058824 }, { "segment_id": "SEGMENT_ZH-CN_U1082_MIX_20250816_202117_315795", - "audio_path": "SEGMENT_ZH-CN_U1082_MIX_20250816_202117_315795.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1082_MIX_20250816_202117_315795.wav", "ground_truth": "是什么原因让你们 broke up 了呀?是一些 personal 的原因吗?还是说一些现实原因导致的?", - "original_transcription": ": 哦呃是什么原因让你们, broke up 了呀是一些, personal 的原因吗还是说一些现实原因导致的\n", - "generated_transcription": ": 哦呃是什么原因让你们, broke up 了呀是一些, personal 的原因吗还是说一些现实原因导致的\n", - "wer_original": 0.9428571428571428, - "wer_generated": 0.2, - "wer_improvement": 0.7428571428571429, - "wer_method": "smart_mixed", - "mer_original": 0.2 + "generated_transcription": "\n: 哦呃是什么原因让你们, broke up 了呀是一些, personal 的原因吗还是说一些现实原因导致的\n", + "cer": 0.045454545454545456, + "wer": 0.16666666666666666, + "mer": 0.0625 }, { "segment_id": "SEGMENT_ZH-CN_U0057_MIX_20250816_202117_267184", - "audio_path": "SEGMENT_ZH-CN_U0057_MIX_20250816_202117_267184.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0057_MIX_20250816_202117_267184.wav", "ground_truth": "他们,对,处理一些文字工具啊,表格啊,他们就用 chat G P T 会更方便一些,更快。", - "original_transcription": ": 对处理一些文字工具啊表格啊他们就用CHPT会更方便一些更快\n", - "generated_transcription": ": 对处理一些文字工具啊表格啊他们就用差错率会更方便一些更快\n", - "wer_original": 0.8918918918918919, - "wer_generated": 0.32432432432432434, - "wer_improvement": 0.5675675675675675, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "generated_transcription": "\n: 对处理一些文字工具啊表格啊他们就用差错率会更方便一些更快\n", + "cer": 0.2647058823529412, + "wer": 1.0, + "mer": 0.1935483870967742 }, { "segment_id": "SEGMENT_ZH-CN_U0041_ENCN_20250816_202117_270293", - "audio_path": "SEGMENT_ZH-CN_U0041_ENCN_20250816_202117_270293.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0041_ENCN_20250816_202117_270293.wav", "ground_truth": "对,法国人特别慢。法国人,嗯,你我有一个认识的朋友,他就是法国人,他吃饭,上次跟他一起吃饭,吃了一个多小时,我都累了,他还在吃。 Yes.", - "original_transcription": ": 对法国人特别慢法国人嗯我有一个认识的朋友他就是法国人他吃饭上次跟他一起吃饭吃了一个多小时我都累了他还还在吃\n", - "generated_transcription": ": 对法国人特别慢法国人嗯我有一个认识的朋友他就是法国人他吃饭上次跟他一起吃饭吃了一个多小时我都累了他还还在吃\n", - "wer_original": 0.9846153846153847, - "wer_generated": 0.2153846153846154, - "wer_improvement": 0.7692307692307693, - "wer_method": "smart_mixed", - "mer_original": 0.2153846153846154 + "generated_transcription": "\n: 对法国人特别慢法国人嗯我有一个认识的朋友他就是法国人他吃饭上次跟他一起吃饭吃了一个多小时我都累了他还还在吃\n", + "cer": 0.08928571428571429, + "wer": 1.0, + "mer": 0.05555555555555555 }, { "segment_id": "SEGMENT_ZH-CN_U0085_ENCN_20250816_202117_298501", - "audio_path": "SEGMENT_ZH-CN_U0085_ENCN_20250816_202117_298501.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0085_ENCN_20250816_202117_298501.wav", "ground_truth": "确实,诶,我看咱们找一天儿一起去那个北京的王府井,看一看苹果新出的一些产品怎么样,又想又想去买了,嗯,好开心。 Yeah, that's right.", - "original_transcription": ": 确实哎我看咱们找一天一起去那个北京的王福锦看一看苹果新出的那些产品怎么样又想又想去买了开心 ya that's right\n", "generated_transcription": "\n: 确实哎我看咱们找一天一起去那个北京的王福锦看一看苹果新出的一些产品怎么样又想又想去买了开心 yeah that's right\n", - "wer_original": 0.9482758620689655, - "wer_generated": 0.25862068965517243, - "wer_improvement": 0.6896551724137931, - "wer_method": "smart_mixed", - "mer_original": 0.27586206896551724 + "cer": 0.0967741935483871, + "wer": 0.25, + "mer": 0.11764705882352941 }, { "segment_id": "SEGMENT_ZH-CN_U0039_MIX_20250816_202117_350986", - "audio_path": "SEGMENT_ZH-CN_U0039_MIX_20250816_202117_350986.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0039_MIX_20250816_202117_350986.wav", "ground_truth": "我觉得但是还是会觉得就是我我有个这个这个这个 child, 还是非常的让就让姐姐还是说我肯定不会后悔,然后就是她那个还会很愿意去,就是很很庆幸会有这么这么一个孩子。", - "original_transcription": ": 我就是但是还是会觉得就是我我有这个这个这个孩子还是非常的让就让姐姐还是说我肯定不会后悔然后就是很很会很愿意去就是很很欣喜会有这样这么一个孩子\n", "generated_transcription": "\n: 我就是但是还是会觉得就是我我有这个这个这个孩子还是非常的让就让姐姐还是说我肯定不会后悔然后就是很很会很愿意去就是很很欣喜会有这样这么一个孩子\n", - "wer_original": 0.05333333333333334, - "wer_generated": 0.2, - "wer_improvement": -0.14666666666666667, - "wer_method": "smart_mixed", - "mer_original": 0.2 + "cer": 0.19736842105263158, + "wer": 1.0, + "mer": 0.16666666666666666 }, { "segment_id": "SEGMENT_ZH-CN_U0060_ENCN_20250816_202117_302924", - "audio_path": "SEGMENT_ZH-CN_U0060_ENCN_20250816_202117_302924.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0060_ENCN_20250816_202117_302924.wav", "ground_truth": "可能可能是休息学习一小时,休息二十分钟这样。 Yes, I can understand.", - "original_transcription": ": 可能可能是学习一小时学习二十分钟这样, : Yes, I get it.\n", "generated_transcription": "\n: 可能可能是, 休息, 学习一小时休息二十分钟这样, : Yes, I get it.\n", - "wer_original": 0.3076923076923077, - "wer_generated": 0.23076923076923078, - "wer_improvement": 0.07692307692307693, - "wer_method": "smart_mixed", - "mer_original": 0.3076923076923077 + "cer": 0.2972972972972973, + "wer": 1.0, + "mer": 0.08333333333333333 }, { "segment_id": "SEGMENT_ZH-CN_U1097_ENCN_20250816_202117_328149", - "audio_path": "SEGMENT_ZH-CN_U1097_ENCN_20250816_202117_328149.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1097_ENCN_20250816_202117_328149.wav", "ground_truth": "I love to, I love . 我觉得是可。 Yeah. 是的,就是可能会让你回忆起你的童年是吧?就是。", - "original_transcription": ": I love to (uh) I love (err) 我觉得是可是的就是可能会让你回忆起你的童年是吧就是\n", "generated_transcription": "\n: I love to (uh) I love (err) 我觉得是可 yeah 是的就是可能会让你回忆起你的童年是吧就是\n", - "wer_original": 0.2222222222222222, - "wer_generated": 0.2222222222222222, - "wer_improvement": 0.0, - "wer_method": "smart_mixed", - "mer_original": 0.2222222222222222 + "cer": 0.12195121951219512, + "wer": 0.25, + "mer": 0.06451612903225806 }, { "segment_id": "SEGMENT_ZH-CN_U0090_ENCN_20250816_202117_271006", - "audio_path": "SEGMENT_ZH-CN_U0090_ENCN_20250816_202117_271006.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0090_ENCN_20250816_202117_271006.wav", "ground_truth": "那跟你相比的话,我觉得我正好相反。我是一个比较外向型的人,就是在这个工作还是生活当中啊,我比较喜欢跟不同的人去交流。 比如说在工作中我会比较喜欢跟我们一些 产品线的一些人员和我们销售人员进行进一步的这个沟通,然后去推动我们用户的一个需求。那在生活中呢我也是一个比较健谈的一个人 ,无论是在这个平时的娱乐中啊。 Thank you so much too. I like your answers. Thank you so much too. Have a nice day.", - "original_transcription": ": 那跟你相比的话我觉得我正好相反我是一个比较外向型的人就是在这个工作还是生活当中啊我比较喜欢跟不同的人去交流啊比如说在工作中我会比较喜欢跟我们的一些啊产品线的一些人员和我们销售人员进行进一步的这个沟通然后去推动我们用户的一个需求那在生活中啊我也是一个比较健谈的一个人啊无论是在这个平时的娱乐中啊\n", "generated_transcription": ": 那跟你相比的话我觉得我正好相反我是一个比较外向型的人就是在这个工作还是生活当中啊我比较喜欢跟不同的人去交流啊比如说在工作中我会比较喜欢跟我们的一些啊产品线的一些人员和我们销售人员进行进一步的这个沟通然后去推动我们用户的一个需求那在生活中啊我也是一个比较健谈的一个人啊无论是在这个平时的娱乐中啊\n", - "wer_original": 0.9644970414201184, - "wer_generated": 0.17751479289940827, - "wer_improvement": 0.7869822485207101, - "wer_method": "smart_mixed", - "mer_original": 0.17751479289940827 + "cer": 0.3284313725490196, + "wer": 1.0, + "mer": 0.14375 }, { "segment_id": "SEGMENT_ZH-CN_U1079_ENCN_20250816_202117_294238", - "audio_path": "SEGMENT_ZH-CN_U1079_ENCN_20250816_202117_294238.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1079_ENCN_20250816_202117_294238.wav", "ground_truth": "是富士相机,不是富士康。 Another hobby of mine is painting. I find it ** , and it allows me to impress my emotion, the colors and bus ** .", - "original_transcription": ": 是复式相机不是复式看 another hobby of mine is painting I find it therapeutic and it allows me to impress my emotions the colors and the brush strokes\n", "generated_transcription": "\n: 是复式相机不是复式看\n", - "wer_original": 1.0810810810810811, - "wer_generated": 0.8648648648648649, - "wer_improvement": 0.21621621621621623, - "wer_method": "smart_mixed", - "mer_original": 0.40540540540540543 + "cer": 0.945054945054945, + "wer": 1.0, + "mer": 0.8387096774193549 }, { "segment_id": "SEGMENT_ZH-CN_U1094_ENCN_20250816_202117_321336", - "audio_path": "SEGMENT_ZH-CN_U1094_ENCN_20250816_202117_321336.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1094_ENCN_20250816_202117_321336.wav", "ground_truth": "把他那个脸都涨红了,嗯。 And this song re reached the number eight in the Australia public charts. So his song her song lyrics explore feeling of feeling in love as well as the paining of breaking up. Actually I'm his I'm her ** fans. So I listening her music everyday. Do you ?", - "original_transcription": ": 把他的脸都涨红了嗯And this song reached the number eight in the Australia pop charts so his song her song lyrics explore feeling of feeling in love as well as the pain of breaking up actually I'm her fans so I listening her music everyday do you\n", "generated_transcription": "\n: 把他那个脸都涨红了嗯 and this song reach reach the number eight in the Australia pop charts so his song her song lyrics explore feeling of feeling in love as well as a pain of breaking up actually I'm her her fans so I listening her music everyday do you\n", - "wer_original": 0.5081967213114754, - "wer_generated": 0.32786885245901637, - "wer_improvement": 0.18032786885245905, - "wer_method": "smart_mixed", - "mer_original": 0.45901639344262296 + "cer": 0.0970873786407767, + "wer": 0.14583333333333334, + "mer": 0.12280701754385964 }, { "segment_id": "SEGMENT_ZH-CN_U1033_ENCN_20250816_202117_353027", - "audio_path": "SEGMENT_ZH-CN_U1033_ENCN_20250816_202117_353027.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1033_ENCN_20250816_202117_353027.wav", "ground_truth": "是的,就更强烈,对感官上 来说。 Hello, my name is Summer. I'm twenty years old. I come from Sichuan, which is a beautiful city. I am a student studying at C D U T. What about you?", - "original_transcription": ": 是的更强烈对感官来说\n", "generated_transcription": "\n: 是的更强烈对感官上来说\n", - "wer_original": 0.32608695652173914, - "wer_generated": 0.7608695652173914, - "wer_improvement": -0.4347826086956522, - "wer_method": "smart_mixed", - "mer_original": 0.782608695652174 + "cer": 0.9090909090909091, + "wer": 1.0, + "mer": 0.7441860465116279 }, { "segment_id": "SEGMENT_ZH-CN_U0034_ENCN_20250816_202117_336006", - "audio_path": "SEGMENT_ZH-CN_U0034_ENCN_20250816_202117_336006.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0034_ENCN_20250816_202117_336006.wav", "ground_truth": "嗯。 I think so. I but I think that's that's a bit different. I also listen to music for, I can follow the rhythm of the music, but I think listen to something you need to focus on it to understand the content, and then it will be hard for you to totally focus on your muscles, your movement. 嗯 ,原来如此,我们是嗯互联网教育公司这种感觉吧,然后因为做的是面那个软件是面向小学生的,所以教育方面难度也不是很高,就是一些常见的词汇和句式这样的感觉,呃可能重点还是要怎么吸引小朋友,我感觉可能跟你本科的内容会相关一些。", - "original_transcription": ": (hm) (hm) I think so (oh) (uh) but I think that's (hm) that's a bit different. I also listen to music for I can follow the reason of the music but I think listen to something you need to focus on it to understand the content and then it will be hard for you to totally focus on your muscles your movement.\n", - "generated_transcription": ": (hm) (hm) 我觉得所以 (oh) (uh) but 我觉得那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (", - "wer_original": 0.4764705882352941, - "wer_generated": 1.511764705882353, - "wer_improvement": -1.0352941176470587, - "wer_method": "smart_mixed", - "mer_original": 0.711764705882353 + "generated_transcription": ": (hm) (hm) 我觉得所以 (oh) (uh) but 我觉得那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (hm) 那 (", + "cer": 0.9320987654320988, + "wer": 2.0, + "mer": 0.9875776397515528 }, { "segment_id": "SEGMENT_ZH-CN_U0065_ENCN_20250816_202117_299438", - "audio_path": "SEGMENT_ZH-CN_U0065_ENCN_20250816_202117_299438.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0065_ENCN_20250816_202117_299438.wav", "ground_truth": "嗯嗯。 Yes, and talk about my art exam experience actually is very hard for me to make this decision, because my mom first know about the art exam, and she just thought it's is it's just a major that.", - "original_transcription": ": (hm), (hm), yes, and (uh), talk about my art exam experience. Actually, it's very hard for me to make this decision because (uh), my mom first know about the art exam and she just thought it's is a, it's just a major that.\n", "generated_transcription": "\n: (hm), (hm), yes, and (uh), talk about my art exam experience. Actually, it's very hard for me to make this decision because (uh), my mom first know about the art exam and she just thought it is, is, (uh), is just a major that.\n", - "wer_original": 0.07317073170731707, - "wer_generated": 0.3902439024390244, - "wer_improvement": -0.3170731707317073, - "wer_method": "smart_mixed", - "mer_original": 0.2926829268292683 + "cer": 0.08552631578947369, + "wer": 0.23076923076923078, + "mer": 0.225 }, { "segment_id": "SEGMENT_ZH-CN_U1040_ENCN_20250816_202117_312275", - "audio_path": "SEGMENT_ZH-CN_U1040_ENCN_20250816_202117_312275.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1040_ENCN_20250816_202117_312275.wav", "ground_truth": "Yes, I I like her too. I usually listen to her music, and I like her face. I think she is beautiful. 我们辅导员还挺年轻,看着就跟我们差不多的那种。", - "original_transcription": ": yes i i like her too i usually listen to her music and i like her face i think she is beautiful : 我们的辅导员还挺年轻看着就跟我们差不多的那种\n", "generated_transcription": "\n: yes i i like her too i usually listen to her music and i like her face i think she is beautiful\n: 我们辅导员还挺年轻看着就跟我们差不多的那种\n", - "wer_original": 0.7777777777777778, - "wer_generated": 0.26666666666666666, - "wer_improvement": 0.5111111111111111, - "wer_method": "smart_mixed", - "mer_original": 0.28888888888888886 + "cer": 0.0, + "wer": 0.0, + "mer": 0.0 }, { "segment_id": "SEGMENT_ZH-CN_U0005_MIX_20250816_202117_347672", - "audio_path": "SEGMENT_ZH-CN_U0005_MIX_20250816_202117_347672.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0005_MIX_20250816_202117_347672.wav", "ground_truth": "那你是一直都学的, 就关于 教师这种 major 吗?", - "original_transcription": ": 那你是一直都学的呃就关于呃教师这种major吗\n", "generated_transcription": "\n: 那你是一直都学的呃就关于呃教师这种major吗\n", - "wer_original": 0.10526315789473684, - "wer_generated": 1.0, - "wer_improvement": -0.8947368421052632, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.09523809523809523, + "wer": 1.0, + "mer": 0.11764705882352941 }, { "segment_id": "SEGMENT_ZH-CN_U1002_MIX_20250816_202117_286692", - "audio_path": "SEGMENT_ZH-CN_U1002_MIX_20250816_202117_286692.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1002_MIX_20250816_202117_286692.wav", "ground_truth": "Yeah。 Sometimes some like the music in 网易 Music and QQ Music。 You should had pay for it, then you can listen。 But in TikTok, we don't have to pay。 It's free。", - "original_transcription": ": Yeah, sometimes sound like the music in wangyin music and qq music. You should have to pay for it then you can listen. But in tiktok we don't have to pay it's free.\n", "generated_transcription": "\n: Yeah sometimes sound like the music in wangyin music and qq music you should have to pay for it then you can listen but in tiktok we don't have to pay it's free\n", - "wer_original": 1.0, - "wer_generated": 0.5454545454545454, - "wer_improvement": 0.4545454545454546, - "wer_method": "smart_mixed", - "mer_original": 0.48484848484848486 + "cer": 0.11965811965811966, + "wer": 0.125, + "mer": 0.15151515151515152 }, { "segment_id": "SEGMENT_ZH-CN_U0100_ENCN_20250816_202117_353253", - "audio_path": "SEGMENT_ZH-CN_U0100_ENCN_20250816_202117_353253.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0100_ENCN_20250816_202117_353253.wav", "ground_truth": "Wow, that's great. Like . Does you work else make really big progress? 是的,然后 我有很多一些科目是。", - "original_transcription": ": (wow) that's great like (err) does your workout make really big progress\n: 是的然后我有很多一些科目是\n", "generated_transcription": "\n: (wow) that's great like (err) does your workout make really big progress\n", - "wer_original": 0.5357142857142857, - "wer_generated": 0.8571428571428571, - "wer_improvement": -0.3214285714285714, - "wer_method": "smart_mixed", - "mer_original": 0.39285714285714285 + "cer": 0.3181818181818182, + "wer": 0.42857142857142855, + "mer": 0.68 }, { "segment_id": "SEGMENT_ZH-CN_U1005_ENCN_20250816_202117_346721", - "audio_path": "SEGMENT_ZH-CN_U1005_ENCN_20250816_202117_346721.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1005_ENCN_20250816_202117_346721.wav", "ground_truth": "是有什么问题吗? Your major sounds very cool. And what do you learning at school?", - "original_transcription": ": 是有什么问题吗 your major sounds very cool and what do you learning at school\n", "generated_transcription": "\n: 是有什么问题吗 your major sounds very cool and what do you learning at school\n", - "wer_original": 0.4, - "wer_generated": 0.25, - "wer_improvement": 0.15000000000000002, - "wer_method": "smart_mixed", - "mer_original": 0.25 + "cer": 0.0, + "wer": 0.0, + "mer": 0.0 }, { "segment_id": "SEGMENT_ZH-CN_U0100_ENCN_20250816_202117_353296", - "audio_path": "SEGMENT_ZH-CN_U0100_ENCN_20250816_202117_353296.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0100_ENCN_20250816_202117_353296.wav", "ground_truth": "So firstly, I will ask you what's your daily hobbies? Could you tell me something about your daily hobbies? 哦 它可能会有一些自然的山和水之类的风景。比较出名的话,自然风景比较它的山不包括不仅包括自然风景,还有包括历史有文化的一些山。比如说比较出名的话,像庐山不,你应该有听过庐山的诗词吧。对吧什么说好的飞流直下三千尺,疑是银河落九天,对。", - "original_transcription": ": 呃首先我会问你呃你的日常爱好你能告诉我一些关于你的日常爱好吗 : 呃他可能会有一些自然的山水之类的美景比较出名的话自然风景比较他的山不包括不仅包括自然风景还包括历史有文化的一些山比如说比较出名的话像庐山你应该有听说过庐山的诗词对吧\n", "generated_transcription": ": 说好的飞流直下三千尺疑是银河落九天\n", - "wer_original": 0.40298507462686567, - "wer_generated": 0.8731343283582089, - "wer_improvement": -0.47014925373134325, - "wer_method": "smart_mixed", - "mer_original": 0.5149253731343284 + "cer": 0.9105263157894737, + "wer": 1.0, + "mer": 0.8629032258064516 }, { "segment_id": "SEGMENT_ZH-CN_U0053_MIX_20250816_202117_325630", - "audio_path": "SEGMENT_ZH-CN_U0053_MIX_20250816_202117_325630.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0053_MIX_20250816_202117_325630.wav", "ground_truth": "你,okay,你刚才问我是那个喜欢的歌手是吗?", - "original_transcription": ": 呃你你刚才问我是那个喜欢的歌手是吗\n", "generated_transcription": "\n: 呃你你刚才问我是那个喜欢的歌手是吗\n", - "wer_original": 13.0, - "wer_generated": 17.0, - "wer_improvement": -4.0, - "wer_method": "smart_mixed", - "mer_original": 17.0 + "cer": 0.25, + "wer": 1.0, + "mer": 0.11764705882352941 }, { "segment_id": "SEGMENT_ZH-CN_U0003_ENCN_20250816_202117_273460", - "audio_path": "SEGMENT_ZH-CN_U0003_ENCN_20250816_202117_273460.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0003_ENCN_20250816_202117_273460.wav", "ground_truth": "哦,我是原来注重情绪价值,现在更觉得还是特立独行好。 And I don't know whether this will .", - "original_transcription": ": (oh) 我是原来注重情绪价值现在更觉得还是特地选好, and I don't know whether this will (err).\n", "generated_transcription": "\n: 哦我是原来注重情绪价值现在更觉得还是特地选好, and I don't know whether this will (err).\n", - "wer_original": 0.7647058823529411, - "wer_generated": 0.23529411764705882, - "wer_improvement": 0.5294117647058822, - "wer_method": "smart_mixed", - "mer_original": 0.2647058823529412 + "cer": 0.12, + "wer": 0.25, + "mer": 0.13333333333333333 }, { "segment_id": "SEGMENT_ZH-CN_U0047_ENCN_20250816_202117_345571", - "audio_path": "SEGMENT_ZH-CN_U0047_ENCN_20250816_202117_345571.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0047_ENCN_20250816_202117_345571.wav", "ground_truth": "Yeah, I know. 嗯,运动的话我其实比较少,我并不是一个特别热爱锻炼的人,虽然说锻炼有益身体健康,但是确实这方面我不太经常进行,我可能更多的话,比如说就是会比进行一些比较宅的活动吧,就比如说在家里可能看看看看电影,看看电视,然后追追动漫之类的,然后偶尔也会跟朋友一起出去转一转吧,就是说如果比如说像现在春天的时候,我跟我同学之前还去国家植物园去稍微赏了赏花,然后去香山去就是因为现在香山它不是那个红叶的热门季节嘛,所以说现在香山人就比较少,所以说去错峰出行,去爬了一下香山。", - "original_transcription": ": 运动的话我其实比较少我并不是一个特别热爱锻炼的人虽然说锻炼有利身体健康但是确实这方面我不太经常进行我可能更多的话比如说就是会进行一些比较宅的活动吧就比如说在家里可能看看看看电影看看电视然后追追动漫之类的然后偶尔也会跟朋友一起出去转一转吧就是说如果比如说像现在在春天的时候我跟我同学之前还去国家植物园去稍微赏赏花然后去香山去就是因为现在香山它不是那个红叶的热门季节嘛所以说现在香山人就比较少所以说去错峰出行去爬了一下香山\n", "generated_transcription": ": 运动的话我其实比较少我并不是一个特别热爱锻炼的人虽然说锻炼有利身体健康但是确实这方面我不太经常进行我可能更多的话比如说就是会进行一些比较宅的活动吧就比如说在家里可能看看看看电影看看电视然后追追动漫之类的然后偶尔也会跟朋友一起出去转一转吧就是说如果比如说像现在在春天的时候我跟我同学之前还去国家植物园去稍微赏了赏花然后去香山去就是因为现在香山它不是那个红叶的热门季节嘛所以说现在香山人就比较少所以说去错峰出行去爬了一下香山\n", - "wer_original": 0.04329004329004329, - "wer_generated": 0.1038961038961039, - "wer_improvement": -0.060606060606060615, - "wer_method": "smart_mixed", - "mer_original": 0.10822510822510822 + "cer": 0.05909090909090909, + "wer": 1.0, + "mer": 0.03271028037383177 }, { "segment_id": "SEGMENT_ZH-CN_U0030_MIX_20250816_202117_265818", - "audio_path": "SEGMENT_ZH-CN_U0030_MIX_20250816_202117_265818.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0030_MIX_20250816_202117_265818.wav", "ground_truth": "对,而且它现在是二十四小时都可以 keeping touch 的。所以你可能在家也会接到一些领导的工作要求,然后嘚嘚嘚的就得去继续干。", - "original_transcription": ": 对而且他现在是二十四小时你都可以keep in touch的所以可能在家也会接到一些领导的工作要求然后得得得就得去继续干\n", "generated_transcription": "\n: 对而且他现在是二十四小时你都可以keep in touch的所以可能在家也会接到一些领导的工作要求然后得得得就得去继续干\n", - "wer_original": 0.9814814814814815, - "wer_generated": 1.0, - "wer_improvement": -0.01851851851851849, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.13333333333333333, + "wer": 1.0, + "mer": 0.18 }, { "segment_id": "SEGMENT_ZH-CN_U0062_MIX_20250816_202117_318169", - "audio_path": "SEGMENT_ZH-CN_U0062_MIX_20250816_202117_318169.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0062_MIX_20250816_202117_318169.wav", "ground_truth": "听起来特别 terry song。 这其实也是一个大环境,整个中国就业 environment 的一个缩影吧。就是要求高的要死,就是特别复杂的一些门槛。但是实际上要做的是非常 simple ** 然后就是随便找个人都能干的东西,他需要需要你的那个什么 。", - "original_transcription": ": 听起来特别泰迪熊这其实也是一个大环境整个中国就业environment的一个缩影吧就是要求高的要求就是特别复杂的一些门槛但是实际上要做的事非常简单然后就是随便找的人都能干的东西它需要需要你的那个什么呃\n", "generated_transcription": "\n: 听起来特别terrible song 这其实也是一个大环境整个中国就业environment的一个缩影吧就是要求高的就是特别复杂的一些门槛但是实际上要做的事非常simple然后就是随便找个人都能干的东西他需要需要你的那个什么\n", - "wer_original": 0.9787234042553191, - "wer_generated": 1.0, - "wer_improvement": -0.021276595744680882, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.06422018348623854, + "wer": 0.875, + "mer": 0.04597701149425287 }, { "segment_id": "SEGMENT_ZH-CN_U0055_ENCN_20250816_202117_336739", - "audio_path": "SEGMENT_ZH-CN_U0055_ENCN_20250816_202117_336739.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0055_ENCN_20250816_202117_336739.wav", "ground_truth": "这样子。 Like that, you you remain you. You keep me remind that I once like some actors, but when when there are some new actors, they they play in a film, and maybe I will switch quickly to like another actor. So I don't think I I really like one actor for a long time, because I just switch to different actors.", - "original_transcription": ": Orh, this way, like that. (oh), you, you, remind you, you, keep me remind that (uh), I once like some actors, but when when there are some new actors, they they play in a film and maybe I will switch quickly to like another actor, so I don't think I, I really like one actor for a long time because I just switch to different actors.\n", "generated_transcription": ": Orh, this way, like that, Orh, you, you, remind, you, you, keep me remind that, (uh), I once like some actors, but when when there are some new actors, they they play in a film, and maybe I will switch quickly to like another actor, so I don't think I, I really like one actor for a long time because I just switch to different actors\n", - "wer_original": 1.4923076923076923, - "wer_generated": 0.27692307692307694, - "wer_improvement": 1.2153846153846155, - "wer_method": "smart_mixed", - "mer_original": 0.26153846153846155 + "cer": 0.07142857142857142, + "wer": 0.0967741935483871, + "mer": 0.09375 }, { "segment_id": "SEGMENT_ZH-CN_U1040_ENCN_20250816_202117_312171", - "audio_path": "SEGMENT_ZH-CN_U1040_ENCN_20250816_202117_312171.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U1040_ENCN_20250816_202117_312171.wav", "ground_truth": "哦, 那就应该还挺那个她学的专业应该挺厉害的吧? Because I think the sky is very blue and beautiful in Xinjiang.", - "original_transcription": ": 哦那就应该还挺那个他学的drawing应该挺厉害的吧因为我觉得天空在新疆很蓝很美丽\n", "generated_transcription": "\n: 嗯那就应该还挺那个他学的drawing应该��厉害的吧 because i think the sky is very blue and beautiful in Xinjiang\n", - "wer_original": 0.5142857142857142, - "wer_generated": 0.7428571428571429, - "wer_improvement": -0.22857142857142865, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "cer": 0.125, + "wer": 0.14285714285714285, + "mer": 0.12121212121212122 }, { "segment_id": "SEGMENT_ZH-CN_U0037_MIX_20250816_202117_308366", - "audio_path": "SEGMENT_ZH-CN_U0037_MIX_20250816_202117_308366.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0037_MIX_20250816_202117_308366.wav", "ground_truth": "嗯,我就 think 。 这,嗯,所以我觉得是什么?就是我们信息传输的需求和网络传输的速度,他们是这个,他们这样 simultaneously improved 的。", - "original_transcription": ": (um), just think : (um), 所以我觉得是什么就是我们信息传输的需求和网络传输的速度他们是这个他们叫, simultaneously improved 的\n", "generated_transcription": "\n: (hm), just think : 所以我觉得是什么就是我们信息传输的需求和网络传输的速度他们是这个他们叫, simultaneously improved 的\n", - "wer_original": 0.9622641509433962, - "wer_generated": 0.2830188679245283, - "wer_improvement": 0.679245283018868, - "wer_method": "smart_mixed", - "mer_original": 0.2830188679245283 + "cer": 0.14492753623188406, + "wer": 0.5, + "mer": 0.15555555555555556 }, { "segment_id": "SEGMENT_ZH-CN_U0098_ENCN_20250816_202117_296720", - "audio_path": "SEGMENT_ZH-CN_U0098_ENCN_20250816_202117_296720.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0098_ENCN_20250816_202117_296720.wav", "ground_truth": "see you. 网球真的很解压,你也可以尝试一下。", - "original_transcription": ": (oh) see you, 打篮球真的很累, (ah), 你也可以尝试一下,\n", - "generated_transcription": ": (oh) see you, 打篮球真的很累 ya 你也可以尝试一下\n", - "wer_original": 0.8947368421052632, - "wer_generated": 0.42105263157894735, - "wer_improvement": 0.4736842105263158, - "wer_method": "smart_mixed", - "mer_original": 0.42105263157894735 + "generated_transcription": "\n: (oh) soccer, 打篮球真的很累, (ah), 你也可以尝试一下\n", + "cer": 0.5714285714285714, + "wer": 1.6666666666666667, + "mer": 0.35294117647058826 }, { "segment_id": "SEGMENT_ZH-CN_U0036_MIX_20250816_202117_342900", - "audio_path": "SEGMENT_ZH-CN_U0036_MIX_20250816_202117_342900.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0036_MIX_20250816_202117_342900.wav", "ground_truth": "我明白你的意思。你就是说 我们 we we try to understand others,但是我们不能 change 他们的 situation,不能 change 他们是 what what the ** circumstances they are in,是这个这样子吗?", - "original_transcription": ": 我明白你的意思就是说, or 我们, we will try to understand others, 但是我们不能, change 他们的, situation, 不能, 改变他们的, circumstances, 他们, 现在, 是这个这样子吗\n", - "generated_transcription": ": 我明白你的意思就是说, or 我们, we will try to understand others, 但是我们不能 change 他们的 situation, 不能 change 他们的 circumstances, 他们, 在, 是这个这样子吗\n", - "wer_original": 0.0, - "wer_generated": 0.8108108108108109, - "wer_improvement": -0.8108108108108109, - "wer_method": "smart_mixed", - "mer_original": 1.0 + "generated_transcription": "\n: 我明白你的意思就是说, or 我们, we will try to understand others, 但是我们不能 change 他们的 situation, 不能 change 他们的 circumstances, 他们, 在, 是这个这样子吗\n", + "cer": 0.23893805309734514, + "wer": 0.7, + "mer": 0.2 }, { "segment_id": "SEGMENT_ZH-CN_U0037_ENCN_20250816_202117_308988", - "audio_path": "SEGMENT_ZH-CN_U0037_ENCN_20250816_202117_308988.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0037_ENCN_20250816_202117_308988.wav", "ground_truth": "Yeah, I I'd finished my school in Virginia. United States. 嗯。", - "original_transcription": ": Yeah, I. I finished my school in Virginia, United States.\n", - "generated_transcription": ": ya. I. I finished my school in Virginia United States.\n", - "wer_original": 0.0, - "wer_generated": 0.5, - "wer_improvement": -0.5, - "wer_method": "smart_mixed", - "mer_original": 0.4166666666666667 + "generated_transcription": "\n: ya. I. I finished my school in Virginia United States.\n", + "cer": 0.08695652173913043, + "wer": 0.2727272727272727, + "mer": 0.2727272727272727 }, { "segment_id": "SEGMENT_ZH-CN_U0059_ENCN_20250816_202117_344613", - "audio_path": "SEGMENT_ZH-CN_U0059_ENCN_20250816_202117_344613.wav", - "prompt": "Please transcribe this speech.", + "audio_path": "/home/users/astar/ares/suns1/scratch/trung/CS-newdialogue_transcription_dpo_20250816_202114/combined_audio_20250816_202117/SEGMENT_ZH-CN_U0059_ENCN_20250816_202117_344613.wav", "ground_truth": "哦。 viewing some posts on little read book or or the like, especially when the Mozart is still on play . So maybe maybe we can move on to another question say, do you have a special daily routine or study routine?", - "original_transcription": ": (oh), (uh), viewing some (uh) posts on little red book or or the like, especially when the most artists still on play, so maybe we maybe we can move on to another question. Say, do you have a special daily routine or study routine?\n", - "generated_transcription": ": (oh), (uh), viewing some (uh) post on little red book or or the like, especially when the most artist still on play, so maybe we maybe we can move on to another question. Say, do you have a special daily routine or study routine?\n", - "wer_original": 0.046511627906976744, - "wer_generated": 0.3023255813953488, - "wer_improvement": -0.2558139534883721, - "wer_method": "smart_mixed", - "mer_original": 0.27906976744186046 + "generated_transcription": "\n: (oh), : (uh), viewing some (uh) post on little red book or or the like, especially when the most artist still on play, so maybe we maybe we can move on to another question. Say, do you have a special daily routine or study routine?\n", + "cer": 0.0783132530120482, + "wer": 0.1951219512195122, + "mer": 0.1951219512195122 } ] } \ No newline at end of file