Student0809 commited on
Commit
a050167
·
verified ·
1 Parent(s): 35dfdd4

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. ms-swift/processed_data/processed_overlap5s_speaker_segments.json +0 -0
  2. ms-swift/processed_data/processed_silence_isoverlaps.json +0 -0
  3. ms-swift/silence_overlaps/700/test/overlap5s_segments_test.json +27 -0
  4. ms-swift/silence_overlaps/700/test/overlap5s_silence_segments_test.json +27 -0
  5. ms-swift/silence_overlaps/700/train/overlap5s_issilence_segments_train.json +0 -0
  6. ms-swift/silence_overlaps/test/test_train.json +963 -0
  7. ms-swift/swift/llm/sampling/mcts.py +400 -0
  8. ms-swift/swift/llm/template/template/__pycache__/__init__.cpython-310.pyc +0 -0
  9. ms-swift/swift/llm/template/template/__pycache__/emu3.cpython-310.pyc +0 -0
  10. ms-swift/swift/llm/template/template/__pycache__/gemma.cpython-310.pyc +0 -0
  11. ms-swift/swift/llm/template/template/__pycache__/internvl.cpython-310.pyc +0 -0
  12. ms-swift/swift/llm/template/template/__pycache__/minicpm.cpython-310.pyc +0 -0
  13. ms-swift/swift/llm/template/template/__pycache__/pixtral.cpython-310.pyc +0 -0
  14. ms-swift/swift/llm/template/template/__pycache__/stepfun.cpython-310.pyc +0 -0
  15. ms-swift/swift/llm/template/template/__pycache__/valley.cpython-310.pyc +0 -0
  16. ms-swift/swift/llm/template/template/__pycache__/yi.cpython-310.pyc +0 -0
  17. ms-swift/swift/llm/template/template/deepseek.py +315 -0
  18. ms-swift/swift/llm/template/template/glm.py +293 -0
  19. ms-swift/swift/llm/template/template/internvl.py +168 -0
  20. ms-swift/swift/llm/template/template/llama.py +213 -0
  21. ms-swift/swift/llm/template/template/megrez.py +93 -0
  22. ms-swift/swift/llm/template/template/openbuddy.py +48 -0
  23. ms-swift/swift/llm/template/template/pixtral.py +59 -0
  24. ms-swift/swift/llm/template/template/qwen.py +671 -0
  25. ms-swift/swift/llm/template/template/stepfun.py +128 -0
  26. ms-swift/swift/llm/template/template/yi.py +63 -0
  27. ms-swift/swift/llm/train/__pycache__/callback.cpython-310.pyc +0 -0
  28. ms-swift/swift/llm/train/__pycache__/rlhf.cpython-310.pyc +0 -0
  29. ms-swift/swift/llm/train/__pycache__/sft.cpython-310.pyc +0 -0
  30. ms-swift/swift/llm/train/__pycache__/tuner.cpython-310.pyc +0 -0
  31. ms-swift/swift/llm/train/callback.py +80 -0
  32. ms-swift/swift/llm/train/rlhf.py +154 -0
  33. ms-swift/swift/llm/train/sft.py +287 -0
  34. ms-swift/swift/llm/train/tuner.py +424 -0
  35. ms-swift/swift/megatron/argument/train_args.py +53 -0
  36. ms-swift/swift/megatron/model/__init__.py +4 -0
  37. ms-swift/swift/megatron/model/config.py +57 -0
  38. ms-swift/swift/megatron/model/constant.py +3 -0
  39. ms-swift/swift/megatron/model/gpt/__init__.py +40 -0
  40. ms-swift/swift/megatron/model/gpt/config.py +13 -0
  41. ms-swift/swift/megatron/model/gpt/model.py +37 -0
  42. ms-swift/swift/megatron/model/register.py +47 -0
  43. ms-swift/swift/megatron/model/rope.py +40 -0
  44. ms-swift/swift/megatron/train/patcher.py +64 -0
  45. ms-swift/swift/megatron/utils/__init__.py +4 -0
  46. ms-swift/swift/megatron/utils/convert.py +122 -0
  47. ms-swift/swift/megatron/utils/patcher.py +26 -0
  48. ms-swift/swift/plugin/__pycache__/__init__.cpython-310.pyc +0 -0
  49. ms-swift/swift/plugin/__pycache__/callback.cpython-310.pyc +0 -0
  50. ms-swift/swift/plugin/__pycache__/metric.cpython-310.pyc +0 -0
ms-swift/processed_data/processed_overlap5s_speaker_segments.json ADDED
The diff for this file is too large to render. See raw diff
 
ms-swift/processed_data/processed_silence_isoverlaps.json ADDED
The diff for this file is too large to render. See raw diff
 
ms-swift/silence_overlaps/700/test/overlap5s_segments_test.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "key": "SODA_PROCESSED--train--123906",
4
+ "audio_url": "/root/autodl-tmp/output_overlapslong/newoverlapjson/overlap5s700/SODA_PROCESSED--train--123906.wav",
5
+ "model_output": "Multiple speakers talk simultaneously from 00:03-00:09"
6
+ },
7
+ {
8
+ "key": "SODA_PROCESSED--train--1112763",
9
+ "audio_url": "/root/autodl-tmp/output_overlapslong/newoverlapjson/overlap5s700/SODA_PROCESSED--train--1112763.wav",
10
+ "model_output": "Multiple speakers talk simultaneously from 00:09-00:15"
11
+ },
12
+ {
13
+ "key": "SODA_PROCESSED--train--790538",
14
+ "audio_url": "/root/autodl-tmp/output_overlapslong/newoverlapjson/overlap5s700/SODA_PROCESSED--train--790538.wav",
15
+ "model_output": "Multiple speakers talk simultaneously from 00:15-00:19"
16
+ },
17
+ {
18
+ "key": "SODA_PROCESSED--train--822773",
19
+ "audio_url": "/root/autodl-tmp/output_overlapslong/newoverlapjson/overlap5s700/SODA_PROCESSED--train--822773.wav",
20
+ "model_output": "Multiple speakers talk simultaneously from 00:14-00:19"
21
+ },
22
+ {
23
+ "key": "SODA_PROCESSED--train--424960",
24
+ "audio_url": "/root/autodl-tmp/output_overlapslong/newoverlapjson/overlap5s700/SODA_PROCESSED--train--424960.wav",
25
+ "model_output": "Multiple speakers talk simultaneously from 00:29-00:33"
26
+ }
27
+ ]
ms-swift/silence_overlaps/700/test/overlap5s_silence_segments_test.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "key": "SODA_PROCESSED--train--137471",
4
+ "audio_url": "/root/autodl-tmp/output_overlapslong/newoverlapjson/overlap5s700/SODA_PROCESSED--train--137471.wav",
5
+ "model_output": "No, there is no silence gap."
6
+ },
7
+ {
8
+ "key": "SODA_PROCESSED--train--201044",
9
+ "audio_url": "/root/autodl-tmp/output_overlapslong/newoverlapjson/overlap5s700/SODA_PROCESSED--train--201044.wav",
10
+ "model_output": "No, there is no silence gap."
11
+ },
12
+ {
13
+ "key": "SODA_PROCESSED--train--596349",
14
+ "audio_url": "/root/autodl-tmp/output_overlapslong/newoverlapjson/overlap5s700/SODA_PROCESSED--train--596349.wav",
15
+ "model_output": "No, there is no silence gap."
16
+ },
17
+ {
18
+ "key": "SODA_PROCESSED--train--956648",
19
+ "audio_url": "/root/autodl-tmp/output_overlapslong/newoverlapjson/overlap5s700/SODA_PROCESSED--train--956648.wav",
20
+ "model_output": "No, there is no silence gap."
21
+ },
22
+ {
23
+ "key": "SODA_PROCESSED--train--962210",
24
+ "audio_url": "/root/autodl-tmp/output_overlapslong/newoverlapjson/overlap5s700/SODA_PROCESSED--train--962210.wav",
25
+ "model_output": "No, there is no silence gap."
26
+ }
27
+ ]
ms-swift/silence_overlaps/700/train/overlap5s_issilence_segments_train.json ADDED
The diff for this file is too large to render. See raw diff
 
ms-swift/silence_overlaps/test/test_train.json ADDED
@@ -0,0 +1,963 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "SODA_PROCESSED--train--449689": {
3
+ "original_dialog_id": "",
4
+ "dialog_index": 449689,
5
+ "processed_dialogue": "A: Hey there. Mind if I lay down next to you? \nB: No, go ahead. \nA: Thanks. I needed a break from the sun. It's so hot today. \nB: Yeah, it is. I'm trying to get a tan, but I don't want to get too dehydrated, so I'm keeping a bottle of water close by and reapplying sunscreen every hour to avoid any skin damage. \nA: Burnt? Yeah, that's definitely a possibility out here. So what brings you to the beach today? Just wanting to relax? \nB: Yeah, pretty much. I just finished up my summer classes and needed some time to myself before starting my new job next week. \nA: That sounds rough. Are you excited for it? Or [interrupt] worried about how you'll balance everything with your personal life and other commitments you might have during this transitional period? \nB: Nervous? A little bit of both, honestly. But mostly excited. It should be a good experience. And the pay is great, so that's a plus. \nA: Definitely. Well, I hope you enjoy the rest of your day here. \nB: Thanks. You too.",
6
+ "clean_dialogue": "A: Hey there. Mind if I lay down next to you? \nB: No, go ahead. \nA: Thanks. I needed a break from the sun. It's so hot today. \nB: Yeah, it is. I'm trying to get a tan, but I don't want to get too dehydrated, so I'm keeping a bottle of water close by and reapplying sunscreen every hour to avoid any skin damage. \nA: Burnt? Yeah, that's definitely a possibility out here. So what brings you to the beach today? Just wanting to relax? \nB: Yeah, pretty much. I just finished up my summer classes and needed some time to myself before starting my new job next week. \nA:That sounds rough. Are you excited for it? Or worried about how you'll balance everything with your personal life and other commitments you might have during this transitional period?\nB: Nervous? A little bit of both, honestly. But mostly excited. It should be a good experience. And the pay is great, so that's a plus. \nA: Definitely. Well, I hope you enjoy the rest of your day here. \nB: Thanks. You too.",
7
+ "speaker_tracks": {
8
+ "A": "/root/autodl-tmp/output_mixedAudios/processed_soda_3_processed_dialogues_part_20/SODA_PROCESSED--train--449689/A_track.wav",
9
+ "B": "/root/autodl-tmp/output_mixedAudios/processed_soda_3_processed_dialogues_part_20/SODA_PROCESSED--train--449689/B_track.wav"
10
+ },
11
+ "error_type": "error_after_interrupt",
12
+ "stereo_audio": "/root/autodl-tmp/output_mixedAudios/processed_soda_3_processed_dialogues_part_20/SODA_PROCESSED--train--449689/stereo_dialogue.wav",
13
+ "total_duration": 50.09668934240363,
14
+ "segments": [
15
+ {
16
+ "speaker": "A",
17
+ "text": "Hey there. Mind if I lay down next to you?",
18
+ "original_text": "Hey there. Mind if I lay down next to you?",
19
+ "start_time": 0,
20
+ "end_time": 2.4961451247165534,
21
+ "audio_file": "/root/autodl-tmp/output_mixedAudios/processed_soda_3_processed_dialogues_part_20/SODA_PROCESSED--train--449689/temp/line_0_A.wav",
22
+ "silence_duration": 0,
23
+ "is_interrupted": false
24
+ },
25
+ {
26
+ "speaker": "B",
27
+ "text": "No, go ahead.",
28
+ "original_text": "No, go ahead.",
29
+ "start_time": 3.0616233505922237,
30
+ "end_time": 4.257451014991316,
31
+ "audio_file": "/root/autodl-tmp/output_mixedAudios/processed_soda_3_processed_dialogues_part_20/SODA_PROCESSED--train--449689/temp/line_1_B.wav",
32
+ "silence_duration": 0.5654782258756702,
33
+ "is_interrupted": false
34
+ },
35
+ {
36
+ "speaker": "A",
37
+ "text": "Thanks. I needed a break from the sun. It's so hot today.",
38
+ "original_text": "Thanks. I needed a break from the sun. It's so hot today.",
39
+ "start_time": 4.673061027457998,
40
+ "end_time": 8.666893227004483,
41
+ "audio_file": "/root/autodl-tmp/output_mixedAudios/processed_soda_3_processed_dialogues_part_20/SODA_PROCESSED--train--449689/temp/line_2_A.wav",
42
+ "silence_duration": 0.41561001246668183,
43
+ "is_interrupted": false
44
+ },
45
+ {
46
+ "speaker": "B",
47
+ "text": "Yeah, it is. I'm trying to get a tan, but I don't want to get too dehydrated, so I'm keeping a bottle of water close by and reapplying sunscreen every hour to avoid any skin damage.",
48
+ "original_text": "Yeah, it is. I'm trying to get a tan, but I don't want to get too dehydrated, so I'm keeping a bottle of water close by and reapplying sunscreen every hour to avoid any skin damage.",
49
+ "start_time": 9.128191918953855,
50
+ "end_time": 19.01989259922596,
51
+ "audio_file": "/root/autodl-tmp/output_mixedAudios/processed_soda_3_processed_dialogues_part_20/SODA_PROCESSED--train--449689/temp/line_3_B.wav",
52
+ "silence_duration": 0.46129869194937123,
53
+ "is_interrupted": false
54
+ },
55
+ {
56
+ "speaker": "A",
57
+ "text": "Burnt? Yeah, that's definitely a possibility out here. So what brings you to the beach today? Just wanting to relax?",
58
+ "original_text": "Burnt? Yeah, that's definitely a possibility out here. So what brings you to the beach today? Just wanting to relax?",
59
+ "start_time": 19.43691572474219,
60
+ "end_time": 27.215600531998426,
61
+ "audio_file": "/root/autodl-tmp/output_mixedAudios/processed_soda_3_processed_dialogues_part_20/SODA_PROCESSED--train--449689/temp/line_4_A.wav",
62
+ "silence_duration": 0.4170231255162265,
63
+ "is_interrupted": false
64
+ },
65
+ {
66
+ "speaker": "B",
67
+ "text": "Yeah, pretty much. I just finished up my summer classes and needed some time to myself before starting my new job next week.",
68
+ "original_text": "Yeah, pretty much. I just finished up my summer classes and needed some time to myself before starting my new job next week.",
69
+ "start_time": 27.73206790619358,
70
+ "end_time": 34.08272550256547,
71
+ "audio_file": "/root/autodl-tmp/output_mixedAudios/processed_soda_3_processed_dialogues_part_20/SODA_PROCESSED--train--449689/temp/line_5_B.wav",
72
+ "silence_duration": 0.5164673741951538,
73
+ "is_interrupted": false
74
+ },
75
+ {
76
+ "speaker": "A",
77
+ "text": "That sounds rough. Are you excited for it? Or",
78
+ "original_text": "That sounds rough. Are you excited for it? Or [interrupt] worried about how you'll balance everything with your personal life and other commitments you might have during this transitional period?",
79
+ "start_time": 34.40566150397062,
80
+ "end_time": 44.703711390591934,
81
+ "audio_file": "/root/autodl-tmp/output_mixedAudios/processed_soda_3_processed_dialogues_part_20/SODA_PROCESSED--train--449689/temp/line_6_A.wav",
82
+ "silence_duration": 0.3229360014051523,
83
+ "is_interrupted": true,
84
+ "text_after_interrupt": "worried about how you'll balance everything with your personal life and other commitments you might have during this transitional period?"
85
+ },
86
+ {
87
+ "speaker": "B",
88
+ "text": "Nervous? A little bit of both, honestly. But mostly excited. It should be a good experience. And the pay is great, so that's a plus.",
89
+ "original_text": "Nervous? A little bit of both, honestly. But mostly excited. It should be a good experience. And the pay is great, so that's a plus.",
90
+ "start_time": 37.1456161524967,
91
+ "end_time": 44.564391662700785,
92
+ "audio_file": "/root/autodl-tmp/output_mixedAudios/processed_soda_3_processed_dialogues_part_20/SODA_PROCESSED--train--449689/temp/line_7_B.wav",
93
+ "silence_duration": 0.36321869535217244,
94
+ "is_interrupted": false
95
+ },
96
+ {
97
+ "speaker": "A",
98
+ "text": "Definitely. Well, I hope you enjoy the rest of your day here.",
99
+ "original_text": "Definitely. Well, I hope you enjoy the rest of your day here.",
100
+ "start_time": 44.9023552612567,
101
+ "end_time": 48.78008768756056,
102
+ "audio_file": "/root/autodl-tmp/output_mixedAudios/processed_soda_3_processed_dialogues_part_20/SODA_PROCESSED--train--449689/temp/line_8_A.wav",
103
+ "silence_duration": 0.33796359855591646,
104
+ "is_interrupted": false
105
+ },
106
+ {
107
+ "speaker": "B",
108
+ "text": "Thanks. You too.",
109
+ "original_text": "Thanks. You too.",
110
+ "start_time": 49.1679089027611,
111
+ "end_time": 50.09670708870214,
112
+ "audio_file": "/root/autodl-tmp/output_mixedAudios/processed_soda_3_processed_dialogues_part_20/SODA_PROCESSED--train--449689/temp/line_9_B.wav",
113
+ "silence_duration": 0.38782121520053575,
114
+ "is_interrupted": false
115
+ }
116
+ ],
117
+ "gt_score": 1
118
+ },
119
+ "SODA_PROCESSED--train--787791": {
120
+ "original_dialog_id": "",
121
+ "dialog_index": 787791,
122
+ "processed_dialogue": "A: You're welcome. I'm just glad I was able to stop it from happening. \nB: Thank you so much for saving my life. I can't even begin to express how [interrupt] grateful I am for what you did. It means the world to me and I'll never forget your kindness and quick thinking in that moment. \nA: Sorry to jump in, but are you sure you're okay? I mean, physically and emotionally? \nB: I think so, but it's all still a bit of a blur. I don't know what would have happened if you hadn't been there. I'm just glad that you were in the right place at the right time. \nA: Yeah, me too. But seriously, if you need anything—someone to talk to or whatever—don't hesitate to reach out, okay? \nB: I really appreciate that. Thanks again, Antwain. \nA: No problem. Take care.",
123
+ "clean_dialogue": "A: You're welcome. I'm just glad I was able to stop it from happening. \nB:Thank you so much for saving my life. I can't even begin to express how grateful I am for what you did. It means the world to me and I'll never forget your kindness and quick thinking in that moment.\nA: Sorry to jump in, but are you sure you're okay? I mean, physically and emotionally? \nB: I think so, but it's all still a bit of a blur. I don't know what would have happened if you hadn't been there. I'm just glad that you were in the right place at the right time. \nA: Yeah, me too. But seriously, if you need anything—someone to talk to or whatever—don't hesitate to reach out, okay? \nB: I really appreciate that. Thanks again, Antwain. \nA: No problem. Take care.",
124
+ "speaker_tracks": {
125
+ "A": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_8/SODA_PROCESSED--train--787791/A_track.wav",
126
+ "B": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_8/SODA_PROCESSED--train--787791/B_track.wav"
127
+ },
128
+ "error_type": "error_after_interrupt",
129
+ "stereo_audio": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_8/SODA_PROCESSED--train--787791/stereo_dialogue.wav",
130
+ "total_duration": 37.52730158730159,
131
+ "segments": [
132
+ {
133
+ "speaker": "A",
134
+ "text": "You're welcome. I'm just glad I was able to stop it from happening.",
135
+ "original_text": "You're welcome. I'm just glad I was able to stop it from happening.",
136
+ "start_time": 0,
137
+ "end_time": 4.249251700680272,
138
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_8/SODA_PROCESSED--train--787791/temp/line_0_A.wav",
139
+ "silence_duration": 0,
140
+ "is_interrupted": false
141
+ },
142
+ {
143
+ "speaker": "B",
144
+ "text": "Thank you so much for saving my life. I can't even begin to express how",
145
+ "original_text": "Thank you so much for saving my life. I can't even begin to express how [interrupt] grateful I am for what you did. It means the world to me and I'll never forget your kindness and quick thinking in that moment.",
146
+ "start_time": 4.756366963799184,
147
+ "end_time": 14.694507553368345,
148
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_8/SODA_PROCESSED--train--787791/temp/line_1_B.wav",
149
+ "silence_duration": 0.5071152631189118,
150
+ "is_interrupted": true,
151
+ "text_after_interrupt": "grateful I am for what you did. It means the world to me and I'll never forget your kindness and quick thinking in that moment."
152
+ },
153
+ {
154
+ "speaker": "A",
155
+ "text": "Sorry to jump in, but are you sure you're okay? I mean, physically and emotionally?",
156
+ "original_text": "Sorry to jump in, but are you sure you're okay? I mean, physically and emotionally?",
157
+ "start_time": 8.726979208697143,
158
+ "end_time": 14.357818210964716,
159
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_8/SODA_PROCESSED--train--787791/temp/line_2_A.wav",
160
+ "silence_duration": 0.4049084459018305,
161
+ "is_interrupted": false
162
+ },
163
+ {
164
+ "speaker": "B",
165
+ "text": "I think so, but it's all still a bit of a blur. I don't know what would have happened if you hadn't been there. I'm just glad that you were in the right place at the right time.",
166
+ "original_text": "I think so, but it's all still a bit of a blur. I don't know what would have happened if you hadn't been there. I'm just glad that you were in the right place at the right time.",
167
+ "start_time": 14.861085984580113,
168
+ "end_time": 23.649838819047233,
169
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_8/SODA_PROCESSED--train--787791/temp/line_3_B.wav",
170
+ "silence_duration": 0.5032677736153957,
171
+ "is_interrupted": false
172
+ },
173
+ {
174
+ "speaker": "A",
175
+ "text": "Yeah, me too. But seriously, if you need anything—someone to talk to or whatever—don't hesitate to reach out, okay?",
176
+ "original_text": "Yeah, me too. But seriously, if you need anything—someone to talk to or whatever—don't hesitate to reach out, okay?",
177
+ "start_time": 24.145193415777634,
178
+ "end_time": 32.515987066571284,
179
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_8/SODA_PROCESSED--train--787791/temp/line_4_A.wav",
180
+ "silence_duration": 0.4953545967303996,
181
+ "is_interrupted": false
182
+ },
183
+ {
184
+ "speaker": "B",
185
+ "text": "I really appreciate that. Thanks again, Antwain.",
186
+ "original_text": "I really appreciate that. Thanks again, Antwain.",
187
+ "start_time": 32.97180815148517,
188
+ "end_time": 35.68854284536272,
189
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_8/SODA_PROCESSED--train--787791/temp/line_5_B.wav",
190
+ "silence_duration": 0.4558210849138826,
191
+ "is_interrupted": false
192
+ },
193
+ {
194
+ "speaker": "A",
195
+ "text": "No problem. Take care.",
196
+ "original_text": "No problem. Take care.",
197
+ "start_time": 35.99481454512998,
198
+ "end_time": 37.5273315519327,
199
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_8/SODA_PROCESSED--train--787791/temp/line_6_A.wav",
200
+ "silence_duration": 0.3062716997672569,
201
+ "is_interrupted": false
202
+ }
203
+ ],
204
+ "gt_score": 1
205
+ },
206
+ "SODA_PROCESSED--train--179972": {
207
+ "original_dialog_id": "",
208
+ "dialog_index": 179972,
209
+ "processed_dialogue": "A: So, how did you like the book? \nB: I loved it! The ending was so shocking, I couldn't believe what happened. \nA: Sorry to interrupt, but I just have to ask—did you see that twist with the protagonist coming? I was totally blindsided. \nB: No, I didn't see it coming at all! It was so unexpected. \nA: Yeah, I know. I couldn't put it down. \nB: Me neither. I'm so glad you wanted to read it. \nA: Yeah, I was curious about the protagonist's journey and how it would [interrupt] evolve, especially after that major setback when they had to completely rethink their entire approach to solving the central conflict. \nB: Oh, speaking of the journey, what did you think about that part where the protagonist had to make that impossible choice? It really stuck with me. \nA: It was definitely a rollercoaster ride. There were so many twists and turns. \nB: I know! I didn't see any of them coming. \nA: That's what made it so great. It kept you guessing the whole time. \nB: Definitely. It was a great book. Thanks for lending it to me.",
210
+ "clean_dialogue": "A: So, how did you like the book? \nB: I loved it! The ending was so shocking, I couldn't believe what happened. \nA: Sorry to interrupt, but I just have to ask—did you see that twist with the protagonist coming? I was totally blindsided. \nB: No, I didn't see it coming at all! It was so unexpected. \nA: Yeah, I know. I couldn't put it down. \nB: Me neither. I'm so glad you wanted to read it. \nA:Yeah, I was curious about the protagonist's journey and how it would evolve, especially after that major setback when they had to completely rethink their entire approach to solving the central conflict.\nB: Oh, speaking of the journey, what did you think about that part where the protagonist had to make that impossible choice? It really stuck with me. \nA: It was definitely a rollercoaster ride. There were so many twists and turns. \nB: I know! I didn't see any of them coming. \nA: That's what made it so great. It kept you guessing the whole time. \nB: Definitely. It was a great book. Thanks for lending it to me.",
211
+ "speaker_tracks": {
212
+ "A": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_7/SODA_PROCESSED--train--179972/A_track.wav",
213
+ "B": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_7/SODA_PROCESSED--train--179972/B_track.wav"
214
+ },
215
+ "error_type": "error_after_interrupt",
216
+ "stereo_audio": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_7/SODA_PROCESSED--train--179972/stereo_dialogue.wav",
217
+ "total_duration": 53.57845804988662,
218
+ "segments": [
219
+ {
220
+ "speaker": "A",
221
+ "text": "So, how did you like the book?",
222
+ "original_text": "So, how did you like the book?",
223
+ "start_time": 0,
224
+ "end_time": 1.6950566893424037,
225
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_7/SODA_PROCESSED--train--179972/temp/line_0_A.wav",
226
+ "silence_duration": 0,
227
+ "is_interrupted": false
228
+ },
229
+ {
230
+ "speaker": "B",
231
+ "text": "I loved it! The ending was so shocking, I couldn't believe what happened.",
232
+ "original_text": "I loved it! The ending was so shocking, I couldn't believe what happened.",
233
+ "start_time": 2.1792484824735485,
234
+ "end_time": 5.871221271589195,
235
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_7/SODA_PROCESSED--train--179972/temp/line_1_B.wav",
236
+ "silence_duration": 0.4841917931311449,
237
+ "is_interrupted": false
238
+ },
239
+ {
240
+ "speaker": "A",
241
+ "text": "Sorry to interrupt, but I just have to ask—did you see that twist with the protagonist coming? I was totally blindsided.",
242
+ "original_text": "Sorry to interrupt, but I just have to ask—did you see that twist with the protagonist coming? I was totally blindsided.",
243
+ "start_time": 6.47038511683308,
244
+ "end_time": 14.504489425223102,
245
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_7/SODA_PROCESSED--train--179972/temp/line_2_A.wav",
246
+ "silence_duration": 0.5991638452438857,
247
+ "is_interrupted": false
248
+ },
249
+ {
250
+ "speaker": "B",
251
+ "text": "No, I didn't see it coming at all! It was so unexpected.",
252
+ "original_text": "No, I didn't see it coming at all! It was so unexpected.",
253
+ "start_time": 15.012397119017507,
254
+ "end_time": 18.448950406999366,
255
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_7/SODA_PROCESSED--train--179972/temp/line_3_B.wav",
256
+ "silence_duration": 0.507907693794404,
257
+ "is_interrupted": false
258
+ },
259
+ {
260
+ "speaker": "A",
261
+ "text": "Yeah, I know. I couldn't put it down.",
262
+ "original_text": "Yeah, I know. I couldn't put it down.",
263
+ "start_time": 18.875209136594886,
264
+ "end_time": 21.847363331606225,
265
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_7/SODA_PROCESSED--train--179972/temp/line_4_A.wav",
266
+ "silence_duration": 0.42625872959552,
267
+ "is_interrupted": false
268
+ },
269
+ {
270
+ "speaker": "B",
271
+ "text": "Me neither. I'm so glad you wanted to read it.",
272
+ "original_text": "Me neither. I'm so glad you wanted to read it.",
273
+ "start_time": 22.440054691555087,
274
+ "end_time": 25.110349476135585,
275
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_7/SODA_PROCESSED--train--179972/temp/line_5_B.wav",
276
+ "silence_duration": 0.5926913599488615,
277
+ "is_interrupted": false
278
+ },
279
+ {
280
+ "speaker": "A",
281
+ "text": "Yeah, I was curious about the protagonist's journey and how it would",
282
+ "original_text": "Yeah, I was curious about the protagonist's journey and how it would [interrupt] evolve, especially after that major setback when they had to completely rethink their entire approach to solving the central conflict.",
283
+ "start_time": 25.51803755034393,
284
+ "end_time": 36.89581532812171,
285
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_7/SODA_PROCESSED--train--179972/temp/line_6_A.wav",
286
+ "silence_duration": 0.40768807420834613,
287
+ "is_interrupted": true,
288
+ "text_after_interrupt": "evolve, especially after that major setback when they had to completely rethink their entire approach to solving the central conflict."
289
+ },
290
+ {
291
+ "speaker": "B",
292
+ "text": "Oh, speaking of the journey, what did you think about that part where the protagonist had to make that impossible choice? It really stuck with me.",
293
+ "original_text": "Oh, speaking of the journey, what did you think about that part where the protagonist had to make that impossible choice? It really stuck with me.",
294
+ "start_time": 29.790509205672727,
295
+ "end_time": 37.429874285037805,
296
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_7/SODA_PROCESSED--train--179972/temp/line_7_B.wav",
297
+ "silence_duration": 0.32835611460902553,
298
+ "is_interrupted": false
299
+ },
300
+ {
301
+ "speaker": "A",
302
+ "text": "It was definitely a rollercoaster ride. There were so many twists and turns.",
303
+ "original_text": "It was definitely a rollercoaster ride. There were so many twists and turns.",
304
+ "start_time": 37.91219711578734,
305
+ "end_time": 42.405258340277136,
306
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_7/SODA_PROCESSED--train--179972/temp/line_8_A.wav",
307
+ "silence_duration": 0.4823228307495384,
308
+ "is_interrupted": false
309
+ },
310
+ {
311
+ "speaker": "B",
312
+ "text": "I know! I didn't see any of them coming.",
313
+ "original_text": "I know! I didn't see any of them coming.",
314
+ "start_time": 42.860468420817675,
315
+ "end_time": 45.08958406707618,
316
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_7/SODA_PROCESSED--train--179972/temp/line_9_B.wav",
317
+ "silence_duration": 0.4552100805405374,
318
+ "is_interrupted": false
319
+ },
320
+ {
321
+ "speaker": "A",
322
+ "text": "That's what made it so great. It kept you guessing the whole time.",
323
+ "original_text": "That's what made it so great. It kept you guessing the whole time.",
324
+ "start_time": 45.679186523390214,
325
+ "end_time": 49.394379267154385,
326
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_7/SODA_PROCESSED--train--179972/temp/line_10_A.wav",
327
+ "silence_duration": 0.5896024563140343,
328
+ "is_interrupted": false
329
+ },
330
+ {
331
+ "speaker": "B",
332
+ "text": "Definitely. It was a great book. Thanks for lending it to me.",
333
+ "original_text": "Definitely. It was a great book. Thanks for lending it to me.",
334
+ "start_time": 49.70074891577286,
335
+ "end_time": 53.57848134207672,
336
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_7/SODA_PROCESSED--train--179972/temp/line_11_B.wav",
337
+ "silence_duration": 0.3063696486184793,
338
+ "is_interrupted": false
339
+ }
340
+ ],
341
+ "gt_score": 1
342
+ },
343
+ "SODA_PROCESSED--train--715956": {
344
+ "original_dialog_id": "",
345
+ "dialog_index": 715956,
346
+ "processed_dialogue": "A: Look, I know that I messed up. I was caught with heroin and I'm facing some serious charges. But, I really need your [interrupt] help to find a way out of this because I'm terrified of what might happen if I don't get proper legal representation and support during this difficult time.\nB: Aadya, we've been over this already. The evidence against you is pretty damning. Plus, you have a history of drug use. I don't think there's much that can be done to help you at this point.\nA: But you're my lawyer! You're supposed to help me!\nB: Aadya, I'm doing everything that I can. But, realistically, the chances of you getting out of this are pretty slim. You need to prepare yourself for the possibility of a conviction and think about how you'll handle it.\nA: But I can't go to prison! I'll lose my job, my apartment, everything!\nB: Aadya, you need to calm down. Getting upset isn't going to help anything.\nA: Fine. But, you have to promise me that you'll do everything you can to help me.\nB: I promise.",
347
+ "clean_dialogue": "A:Look, I know that I messed up. I was caught with heroin and I'm facing some serious charges. But, I really need your help to find a way out of this because I'm terrified of what might happen if I don't get proper legal representation and support during this difficult time.\nB: Aadya, we've been over this already. The evidence against you is pretty damning. Plus, you have a history of drug use. I don't think there's much that can be done to help you at this point.\nA: But you're my lawyer! You're supposed to help me!\nB: Aadya, I'm doing everything that I can. But, realistically, the chances of you getting out of this are pretty slim. You need to prepare yourself for the possibility of a conviction and think about how you'll handle it.\nA: But I can't go to prison! I'll lose my job, my apartment, everything!\nB: Aadya, you need to calm down. Getting upset isn't going to help anything.\nA: Fine. But, you have to promise me that you'll do everything you can to help me.\nB: I promise.",
348
+ "speaker_tracks": {
349
+ "A": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_3/SODA_PROCESSED--train--715956/A_track.wav",
350
+ "B": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_3/SODA_PROCESSED--train--715956/B_track.wav"
351
+ },
352
+ "error_type": "error_after_interrupt",
353
+ "stereo_audio": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_3/SODA_PROCESSED--train--715956/stereo_dialogue.wav",
354
+ "total_duration": 49.52126984126984,
355
+ "segments": [
356
+ {
357
+ "speaker": "A",
358
+ "text": "Look, I know that I messed up. I was caught with heroin and I'm facing some serious charges. But, I really need your",
359
+ "original_text": "Look, I know that I messed up. I was caught with heroin and I'm facing some serious charges. But, I really need your [interrupt] help to find a way out of this because I'm terrified of what might happen if I don't get proper legal representation and support during this difficult time.",
360
+ "start_time": 0,
361
+ "end_time": 16.579047619047618,
362
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_3/SODA_PROCESSED--train--715956/temp/line_0_A.wav",
363
+ "silence_duration": 0,
364
+ "is_interrupted": true,
365
+ "text_after_interrupt": "help to find a way out of this because I'm terrified of what might happen if I don't get proper legal representation and support during this difficult time."
366
+ },
367
+ {
368
+ "speaker": "B",
369
+ "text": "Aadya, we've been over this already. The evidence against you is pretty damning. Plus, you have a history of drug use. I don't think there's much that can be done to help you at this point.",
370
+ "original_text": "Aadya, we've been over this already. The evidence against you is pretty damning. Plus, you have a history of drug use. I don't think there's much that can be done to help you at this point.",
371
+ "start_time": 8.510113378684807,
372
+ "end_time": 18.36698412698413,
373
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_3/SODA_PROCESSED--train--715956/temp/line_1_B.wav",
374
+ "silence_duration": 0.4899749375576017,
375
+ "is_interrupted": false
376
+ },
377
+ {
378
+ "speaker": "A",
379
+ "text": "But you're my lawyer! You're supposed to help me!",
380
+ "original_text": "But you're my lawyer! You're supposed to help me!",
381
+ "start_time": 18.846747434390966,
382
+ "end_time": 21.37772249108031,
383
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_3/SODA_PROCESSED--train--715956/temp/line_2_A.wav",
384
+ "silence_duration": 0.4797633074068387,
385
+ "is_interrupted": false
386
+ },
387
+ {
388
+ "speaker": "B",
389
+ "text": "Aadya, I'm doing everything that I can. But, realistically, the chances of you getting out of this are pretty slim. You need to prepare yourself for the possibility of a conviction and think about how you'll handle it.",
390
+ "original_text": "Aadya, I'm doing everything that I can. But, realistically, the chances of you getting out of this are pretty slim. You need to prepare yourself for the possibility of a conviction and think about how you'll handle it.",
391
+ "start_time": 21.881120947184385,
392
+ "end_time": 33.51431822609595,
393
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_3/SODA_PROCESSED--train--715956/temp/line_3_B.wav",
394
+ "silence_duration": 0.5033984561040751,
395
+ "is_interrupted": false
396
+ },
397
+ {
398
+ "speaker": "A",
399
+ "text": "But I can't go to prison! I'll lose my job, my apartment, everything!",
400
+ "original_text": "But I can't go to prison! I'll lose my job, my apartment, everything!",
401
+ "start_time": 34.047335561433606,
402
+ "end_time": 38.48234689930209,
403
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_3/SODA_PROCESSED--train--715956/temp/line_4_A.wav",
404
+ "silence_duration": 0.5330173353376504,
405
+ "is_interrupted": false
406
+ },
407
+ {
408
+ "speaker": "B",
409
+ "text": "Aadya, you need to calm down. Getting upset isn't going to help anything.",
410
+ "original_text": "Aadya, you need to calm down. Getting upset isn't going to help anything.",
411
+ "start_time": 38.89720479711025,
412
+ "end_time": 43.39026602160004,
413
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_3/SODA_PROCESSED--train--715956/temp/line_5_B.wav",
414
+ "silence_duration": 0.4148578978081613,
415
+ "is_interrupted": false
416
+ },
417
+ {
418
+ "speaker": "A",
419
+ "text": "Fine. But, you have to promise me that you'll do everything you can to help me.",
420
+ "original_text": "Fine. But, you have to promise me that you'll do everything you can to help me.",
421
+ "start_time": 43.92319932038778,
422
+ "end_time": 48.27694081698642,
423
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_3/SODA_PROCESSED--train--715956/temp/line_6_A.wav",
424
+ "silence_duration": 0.5329332987877419,
425
+ "is_interrupted": false
426
+ },
427
+ {
428
+ "speaker": "B",
429
+ "text": "I promise.",
430
+ "original_text": "I promise.",
431
+ "start_time": 48.62731544236006,
432
+ "end_time": 49.52128369632831,
433
+ "audio_file": "/root/autodl-tmp/output_overlapslong/processed_soda_3_processed_dialogues_part_3/SODA_PROCESSED--train--715956/temp/line_7_B.wav",
434
+ "silence_duration": 0.3503746253736393,
435
+ "is_interrupted": false
436
+ }
437
+ ],
438
+ "gt_score": 1
439
+ },
440
+ "SODA_PROCESSED--train--740576": {
441
+ "original_text": "A: Good morning, Mr. Nguyen! I hope you're doing well today.\nB: I'm doing well, thank you. How are you?\nA: I'm feeling great today! I have a lot of energy and I'm excited to [interrupt] tackle some new projects and challenges that will help us improve our workflow and achieve better results for our clients.\nB: Sorry to interrupt, but I wanted to ask if there's anything specific you're looking forward to today?\nA: I was going to say I'm excited to start my day. Actually, I'm looking forward to a team meeting we have later. I love working here. It's a great environment and the people are really supportive and collaborative, always willing to share their expertise and help each other grow professionally.\nB: I'm glad to hear that! Speaking of the team, do you think we should plan more team-building activities to maintain this positive environment?\nA: That's a great idea! We could definitely benefit from more team-building activities. We're happy to have you on our team.",
442
+ "cleaned_text": "A: Good morning, Mr. Nguyen! I hope you're doing well today.\nB: I'm doing well, thank you. How are you?\nA:I'm feeling great today! I have a lot of energy and I'm excited to tackle some new projects and challenges that will help us improve our workflow and achieve better results for our clients.\nB: Sorry to interrupt, but I wanted to ask if there's anything specific you're looking forward to today?\nA: I was going to say I'm excited to start my day. Actually, I'm looking forward to a team meeting we have later. I love working here. It's a great environment and the people are really supportive and collaborative, always willing to share their expertise and help each other grow professionally.\nB: I'm glad to hear that! Speaking of the team, do you think we should plan more team-building activities to maintain this positive environment?\nA: That's a great idea! We could definitely benefit from more team-building activities. We're happy to have you on our team.",
443
+ "total_duration": 49.437278911564626,
444
+ "stereo_audio": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--740576/stereo_dialogue.wav",
445
+ "speaker_tracks": {
446
+ "A": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--740576/A_track.wav",
447
+ "B": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--740576/B_track.wav"
448
+ },
449
+ "error_type": "error_after_interrupt",
450
+ "segments": [
451
+ {
452
+ "speaker": "A",
453
+ "text": "Good morning, Mr. Nguyen! I hope you're doing well today.",
454
+ "original_text": "Good morning, Mr. Nguyen! I hope you're doing well today.",
455
+ "start_time": 0,
456
+ "end_time": 3.332063492063492,
457
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--740576/temp/line_0_A.wav",
458
+ "silence_duration": 0,
459
+ "is_interrupted": false
460
+ },
461
+ {
462
+ "speaker": "B",
463
+ "text": "I'm doing well, thank you. How are you?",
464
+ "original_text": "I'm doing well, thank you. How are you?",
465
+ "start_time": 3.7838731632362803,
466
+ "end_time": 5.583419648497051,
467
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--740576/temp/line_1_B.wav",
468
+ "silence_duration": 0.4518096711727882,
469
+ "is_interrupted": false
470
+ },
471
+ {
472
+ "speaker": "A",
473
+ "text": "I'm feeling great today! I have a lot of energy and I'm excited to",
474
+ "original_text": "I'm feeling great today! I have a lot of energy and I'm excited to [interrupt] tackle some new projects and challenges that will help us improve our workflow and achieve better results for our clients.",
475
+ "start_time": 5.88797031081498,
476
+ "end_time": 16.96388867816192,
477
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--740576/temp/line_2_A.wav",
478
+ "silence_duration": 0.30455066231792893,
479
+ "is_interrupted": true,
480
+ "text_after_interrupt": "tackle some new projects and challenges that will help us improve our workflow and achieve better results for our clients."
481
+ },
482
+ {
483
+ "speaker": "B",
484
+ "text": "Sorry to interrupt, but I wanted to ask if there's anything specific you're looking forward to today?",
485
+ "original_text": "Sorry to interrupt, but I wanted to ask if there's anything specific you're looking forward to today?",
486
+ "start_time": 10.485521331223143,
487
+ "end_time": 16.104750356166456,
488
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--740576/temp/line_3_B.wav",
489
+ "silence_duration": 0.587489668114177,
490
+ "is_interrupted": false
491
+ },
492
+ {
493
+ "speaker": "A",
494
+ "text": "I was going to say I'm excited to start my day. Actually, I'm looking forward to a team meeting we have later. I love working here. It's a great environment and the people are really supportive and collaborative, always willing to share their expertise and help each other grow professionally.",
495
+ "original_text": "I was going to say I'm excited to start my day. Actually, I'm looking forward to a team meeting we have later. I love working here. It's a great environment and the people are really supportive and collaborative, always willing to share their expertise and help each other grow professionally.",
496
+ "start_time": 17.385624216961087,
497
+ "end_time": 33.94145188136018,
498
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--740576/temp/line_4_A.wav",
499
+ "silence_duration": 0.4217355387991674,
500
+ "is_interrupted": false
501
+ },
502
+ {
503
+ "speaker": "B",
504
+ "text": "I'm glad to hear that! Speaking of the team, do you think we should plan more team-building activities to maintain this positive environment?",
505
+ "original_text": "I'm glad to hear that! Speaking of the team, do you think we should plan more team-building activities to maintain this positive environment?",
506
+ "start_time": 34.39980783470558,
507
+ "end_time": 41.74892348096408,
508
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--740576/temp/line_5_B.wav",
509
+ "silence_duration": 0.4583559533453947,
510
+ "is_interrupted": false
511
+ },
512
+ {
513
+ "speaker": "A",
514
+ "text": "That's a great idea! We could definitely benefit from more team-building activities. We're happy to have you on our team.",
515
+ "original_text": "That's a great idea! We could definitely benefit from more team-building activities. We're happy to have you on our team.",
516
+ "start_time": 42.285572803275116,
517
+ "end_time": 49.437318835021145,
518
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--740576/temp/line_6_A.wav",
519
+ "silence_duration": 0.5366493223110326,
520
+ "is_interrupted": false
521
+ }
522
+ ]
523
+ },
524
+ "SODA_PROCESSED--train--836018": {
525
+ "original_text": "A: Hey Ceanna, I saw that you were doing the reports for the group project. Do you want me to help you with [interrupt] organizing the sections or proofreading? I've got some experience with formatting academic papers and making sure all the citations are properly aligned.\nB: Actually, I could use some help with the data analysis part. It's a bit overwhelming.\nA: Sure, I can take care of that. So what do you think of the project so far?\nB: It's interesting. I'm learning a lot about different cultures and how they influence people's daily lives, from their eating habits to their social interactions and even their work-life balance perspectives.\nA: Speaking of cultures, did you notice how the traditions vary even within the same country? It's amazing how diverse it can be.\nB: Yeah, definitely. It's fascinating.",
526
+ "cleaned_text": "A:Hey Ceanna, I saw that you were doing the reports for the group project. Do you want me to help you with organizing the sections or proofreading? I've got some experience with formatting academic papers and making sure all the citations are properly aligned.\nB: Actually, I could use some help with the data analysis part. It's a bit overwhelming.\nA: Sure, I can take care of that. So what do you think of the project so far?\nB: It's interesting. I'm learning a lot about different cultures and how they influence people's daily lives, from their eating habits to their social interactions and even their work-life balance perspectives.\nA: Speaking of cultures, did you notice how the traditions vary even within the same country? It's amazing how diverse it can be.\nB: Yeah, definitely. It's fascinating.",
527
+ "total_duration": 42.34984126984127,
528
+ "stereo_audio": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--836018/stereo_dialogue.wav",
529
+ "speaker_tracks": {
530
+ "A": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--836018/A_track.wav",
531
+ "B": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--836018/B_track.wav"
532
+ },
533
+ "error_type": "error_after_interrupt",
534
+ "segments": [
535
+ {
536
+ "speaker": "A",
537
+ "text": "Hey Ceanna, I saw that you were doing the reports for the group project. Do you want me to help you with",
538
+ "original_text": "Hey Ceanna, I saw that you were doing the reports for the group project. Do you want me to help you with [interrupt] organizing the sections or proofreading? I've got some experience with formatting academic papers and making sure all the citations are properly aligned.",
539
+ "start_time": 0,
540
+ "end_time": 15.011700680272108,
541
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--836018/temp/line_0_A.wav",
542
+ "silence_duration": 0,
543
+ "is_interrupted": true,
544
+ "text_after_interrupt": "organizing the sections or proofreading? I've got some experience with formatting academic papers and making sure all the citations are properly aligned."
545
+ },
546
+ {
547
+ "speaker": "B",
548
+ "text": "Actually, I could use some help with the data analysis part. It's a bit overwhelming.",
549
+ "original_text": "Actually, I could use some help with the data analysis part. It's a bit overwhelming.",
550
+ "start_time": 6.176507936507937,
551
+ "end_time": 11.250068027210885,
552
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--836018/temp/line_1_B.wav",
553
+ "silence_duration": 0.5190912573415952,
554
+ "is_interrupted": false
555
+ },
556
+ {
557
+ "speaker": "A",
558
+ "text": "Sure, I can take care of that. So what do you think of the project so far?",
559
+ "original_text": "Sure, I can take care of that. So what do you think of the project so far?",
560
+ "start_time": 15.60657282124108,
561
+ "end_time": 19.937094363191193,
562
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--836018/temp/line_2_A.wav",
563
+ "silence_duration": 0.5948721409689715,
564
+ "is_interrupted": false
565
+ },
566
+ {
567
+ "speaker": "B",
568
+ "text": "It's interesting. I'm learning a lot about different cultures and how they influence people's daily lives, from their eating habits to their social interactions and even their work-life balance perspectives.",
569
+ "original_text": "It's interesting. I'm learning a lot about different cultures and how they influence people's daily lives, from their eating habits to their social interactions and even their work-life balance perspectives.",
570
+ "start_time": 20.306213172030862,
571
+ "end_time": 30.476553308085286,
572
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--836018/temp/line_3_B.wav",
573
+ "silence_duration": 0.36911880883966963,
574
+ "is_interrupted": false
575
+ },
576
+ {
577
+ "speaker": "A",
578
+ "text": "Speaking of cultures, did you notice how the traditions vary even within the same country? It's amazing how diverse it can be.",
579
+ "original_text": "Speaking of cultures, did you notice how the traditions vary even within the same country? It's amazing how diverse it can be.",
580
+ "start_time": 30.848617682402736,
581
+ "end_time": 39.10331155995375,
582
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--836018/temp/line_4_A.wav",
583
+ "silence_duration": 0.3720643743174508,
584
+ "is_interrupted": false
585
+ },
586
+ {
587
+ "speaker": "B",
588
+ "text": "Yeah, definitely. It's fascinating.",
589
+ "original_text": "Yeah, definitely. It's fascinating.",
590
+ "start_time": 39.435776463870354,
591
+ "end_time": 42.34988077226038,
592
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--836018/temp/line_5_B.wav",
593
+ "silence_duration": 0.3324649039166007,
594
+ "is_interrupted": false
595
+ }
596
+ ]
597
+ },
598
+ "SODA_PROCESSED--train--771154": {
599
+ "original_text": "A: Hey, Mom. Can I go to Tim's house today? \nB: No, you can't go to Tim's house today. \nA: Why [interrupt] can't I go? I finished all my chores and even helped with the dishes after dinner last night, so I really think I deserve this. \nB: Because I said so. And before you ask again, you need to focus on your homework. \nA: But I don't want to do my homework. \nB: Well, you have to do it anyway. Now go and get started immediately because your teacher specifically mentioned you need to improve your math skills this semester. \nA: Wait, can't I just finish it later? I promise I'll get it done before bed. \nB: No, you need to do it now. Once it's done, then we can talk about other plans.",
600
+ "cleaned_text": "A: Hey, Mom. Can I go to Tim's house today? \nB: No, you can't go to Tim's house today. \nA:Why can't I go? I finished all my chores and even helped with the dishes after dinner last night, so I really think I deserve this.\nB: Because I said so. And before you ask again, you need to focus on your homework. \nA: But I don't want to do my homework. \nB: Well, you have to do it anyway. Now go and get started immediately because your teacher specifically mentioned you need to improve your math skills this semester. \nA: Wait, can't I just finish it later? I promise I'll get it done before bed. \nB: No, you need to do it now. Once it's done, then we can talk about other plans.",
601
+ "total_duration": 35.76784580498866,
602
+ "stereo_audio": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--771154/stereo_dialogue.wav",
603
+ "speaker_tracks": {
604
+ "A": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--771154/A_track.wav",
605
+ "B": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--771154/B_track.wav"
606
+ },
607
+ "error_type": "error_after_interrupt",
608
+ "segments": [
609
+ {
610
+ "speaker": "A",
611
+ "text": "Hey, Mom. Can I go to Tim's house today?",
612
+ "original_text": "Hey, Mom. Can I go to Tim's house today?",
613
+ "start_time": 0,
614
+ "end_time": 3.5294331065759637,
615
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--771154/temp/line_0_A.wav",
616
+ "silence_duration": 0,
617
+ "is_interrupted": false
618
+ },
619
+ {
620
+ "speaker": "B",
621
+ "text": "No, you can't go to Tim's house today.",
622
+ "original_text": "No, you can't go to Tim's house today.",
623
+ "start_time": 3.9899851353219105,
624
+ "end_time": 6.126220962986309,
625
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--771154/temp/line_1_B.wav",
626
+ "silence_duration": 0.4605520287459467,
627
+ "is_interrupted": false
628
+ },
629
+ {
630
+ "speaker": "A",
631
+ "text": "Why",
632
+ "original_text": "Why [interrupt] can't I go? I finished all my chores and even helped with the dishes after dinner last night, so I really think I deserve this.",
633
+ "start_time": 6.4787876256667465,
634
+ "end_time": 14.652211661947927,
635
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--771154/temp/line_2_A.wav",
636
+ "silence_duration": 0.3525666626804373,
637
+ "is_interrupted": true,
638
+ "text_after_interrupt": "can't I go? I finished all my chores and even helped with the dishes after dinner last night, so I really think I deserve this."
639
+ },
640
+ {
641
+ "speaker": "B",
642
+ "text": "Because I said so. And before you ask again, you need to focus on your homework.",
643
+ "original_text": "Because I said so. And before you ask again, you need to focus on your homework.",
644
+ "start_time": 7.210216197095318,
645
+ "end_time": 11.889037058773322,
646
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--771154/temp/line_3_B.wav",
647
+ "silence_duration": 0.4183677243140269,
648
+ "is_interrupted": false
649
+ },
650
+ {
651
+ "speaker": "A",
652
+ "text": "But I don't want to do my homework.",
653
+ "original_text": "But I don't want to do my homework.",
654
+ "start_time": 15.159162983353092,
655
+ "end_time": 17.074809241856492,
656
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--771154/temp/line_4_A.wav",
657
+ "silence_duration": 0.5069513214051653,
658
+ "is_interrupted": false
659
+ },
660
+ {
661
+ "speaker": "B",
662
+ "text": "Well, you have to do it anyway. Now go and get started immediately because your teacher specifically mentioned you need to improve your math skills this semester.",
663
+ "original_text": "Well, you have to do it anyway. Now go and get started immediately because your teacher specifically mentioned you need to improve your math skills this semester.",
664
+ "start_time": 17.6716136549098,
665
+ "end_time": 25.763767849921138,
666
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--771154/temp/line_5_B.wav",
667
+ "silence_duration": 0.5968044130533094,
668
+ "is_interrupted": false
669
+ },
670
+ {
671
+ "speaker": "A",
672
+ "text": "Wait, can't I just finish it later? I promise I'll get it done before bed.",
673
+ "original_text": "Wait, can't I just finish it later? I promise I'll get it done before bed.",
674
+ "start_time": 26.149694131743242,
675
+ "end_time": 31.02588460793372,
676
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--771154/temp/line_6_A.wav",
677
+ "silence_duration": 0.38592628182210614,
678
+ "is_interrupted": false
679
+ },
680
+ {
681
+ "speaker": "B",
682
+ "text": "No, you need to do it now. Once it's done, then we can talk about other plans.",
683
+ "original_text": "No, you need to do it now. Once it's done, then we can talk about other plans.",
684
+ "start_time": 31.518621255026567,
685
+ "end_time": 35.767872955706835,
686
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--771154/temp/line_7_B.wav",
687
+ "silence_duration": 0.49273664709284837,
688
+ "is_interrupted": false
689
+ }
690
+ ]
691
+ },
692
+ "SODA_PROCESSED--train--1794": {
693
+ "original_text": "A: Hey, Mom. \nB: Hey, Moriah. What's up? \nA: Not much. Just hanging out in my room. \nB: That's good. I'm glad you're keeping busy. \nA: Yeah, I'm just trying to stay out of [interrupt] everyone's way and focus on my own things because I've been feeling a bit overwhelmed with school and social stuff lately. \nB: Trouble? Is everything okay? I mean, you know you can always talk to me if something's bothering you. \nA: I'm just kind of going through some stuff right now. \nB: Well, your father and I were just talking about how we need to have a serious talk with you about some things that have been going on around the house and how you've been feeling lately because we've noticed some changes in your behavior and we're genuinely concerned about your well-being. \nA: Wait, is this about me staying out late last weekend? \nB: Not just that, but it's part of it. We've also noticed you've been acting a bit differently lately, and we're just wondering if everything is okay with you. \nA: I don't know, Mom. Like I said, I'm just dealing with some stuff. \nB: Okay. Well, if you ever want to talk about anything, we're here for you. We love you, Moriah. \nA: I love you too, Mom.",
694
+ "cleaned_text": "A: Hey, Mom. \nB: Hey, Moriah. What's up? \nA: Not much. Just hanging out in my room. \nB: That's good. I'm glad you're keeping busy. \nA:Yeah, I'm just trying to stay out of everyone's way and focus on my own things because I've been feeling a bit overwhelmed with school and social stuff lately.\nB: Trouble? Is everything okay? I mean, you know you can always talk to me if something's bothering you. \nA: I'm just kind of going through some stuff right now. \nB: Well, your father and I were just talking about how we need to have a serious talk with you about some things that have been going on around the house and how you've been feeling lately because we've noticed some changes in your behavior and we're genuinely concerned about your well-being. \nA: Wait, is this about me staying out late last weekend? \nB: Not just that, but it's part of it. We've also noticed you've been acting a bit differently lately, and we're just wondering if everything is okay with you. \nA: I don't know, Mom. Like I said, I'm just dealing with some stuff. \nB: Okay. Well, if you ever want to talk about anything, we're here for you. We love you, Moriah. \nA: I love you too, Mom.",
695
+ "total_duration": 57.99024943310658,
696
+ "stereo_audio": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1794/stereo_dialogue.wav",
697
+ "speaker_tracks": {
698
+ "A": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1794/A_track.wav",
699
+ "B": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1794/B_track.wav"
700
+ },
701
+ "error_type": "error_after_interrupt",
702
+ "segments": [
703
+ {
704
+ "speaker": "A",
705
+ "text": "Hey, Mom.",
706
+ "original_text": "Hey, Mom.",
707
+ "start_time": 0,
708
+ "end_time": 0.8591383219954648,
709
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1794/temp/line_0_A.wav",
710
+ "silence_duration": 0,
711
+ "is_interrupted": false
712
+ },
713
+ {
714
+ "speaker": "B",
715
+ "text": "Hey, Moriah. What's up?",
716
+ "original_text": "Hey, Moriah. What's up?",
717
+ "start_time": 1.2689805234753475,
718
+ "end_time": 2.7782775756295424,
719
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1794/temp/line_1_B.wav",
720
+ "silence_duration": 0.4098422014798827,
721
+ "is_interrupted": false
722
+ },
723
+ {
724
+ "speaker": "A",
725
+ "text": "Not much. Just hanging out in my room.",
726
+ "original_text": "Not much. Just hanging out in my room.",
727
+ "start_time": 3.2528527196865094,
728
+ "end_time": 5.505188320593539,
729
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1794/temp/line_2_A.wav",
730
+ "silence_duration": 0.47457514405696677,
731
+ "is_interrupted": false
732
+ },
733
+ {
734
+ "speaker": "B",
735
+ "text": "That's good. I'm glad you're keeping busy.",
736
+ "original_text": "That's good. I'm glad you're keeping busy.",
737
+ "start_time": 6.047417085120735,
738
+ "end_time": 8.520342255188762,
739
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1794/temp/line_3_B.wav",
740
+ "silence_duration": 0.5422287645271964,
741
+ "is_interrupted": false
742
+ },
743
+ {
744
+ "speaker": "A",
745
+ "text": "Yeah, I'm just trying to stay out of",
746
+ "original_text": "Yeah, I'm just trying to stay out of [interrupt] everyone's way and focus on my own things because I've been feeling a bit overwhelmed with school and social stuff lately.",
747
+ "start_time": 8.88750351109664,
748
+ "end_time": 18.059385597264438,
749
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1794/temp/line_4_A.wav",
750
+ "silence_duration": 0.3671612559078772,
751
+ "is_interrupted": true,
752
+ "text_after_interrupt": "everyone's way and focus on my own things because I've been feeling a bit overwhelmed with school and social stuff lately."
753
+ },
754
+ {
755
+ "speaker": "B",
756
+ "text": "Trouble? Is everything okay? I mean, you know you can always talk to me if something's bothering you.",
757
+ "original_text": "Trouble? Is everything okay? I mean, you know you can always talk to me if something's bothering you.",
758
+ "start_time": 11.697118023568294,
759
+ "end_time": 18.2915851437497,
760
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1794/temp/line_5_B.wav",
761
+ "silence_duration": 0.32519714638310315,
762
+ "is_interrupted": false
763
+ },
764
+ {
765
+ "speaker": "A",
766
+ "text": "I'm just kind of going through some stuff right now.",
767
+ "original_text": "I'm just kind of going through some stuff right now.",
768
+ "start_time": 18.62204195980515,
769
+ "end_time": 21.396826540304016,
770
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1794/temp/line_6_A.wav",
771
+ "silence_duration": 0.3304568160554501,
772
+ "is_interrupted": false
773
+ },
774
+ {
775
+ "speaker": "B",
776
+ "text": "Well, your father and I were just talking about how we need to have a serious talk with you about some things that have been going on around the house and how you've been feeling lately because we've noticed some changes in your behavior and we're genuinely concerned about your well-being.",
777
+ "original_text": "Well, your father and I were just talking about how we need to have a serious talk with you about some things that have been going on around the house and how you've been feeling lately because we've noticed some changes in your behavior and we're genuinely concerned about your well-being.",
778
+ "start_time": 21.697523952118004,
779
+ "end_time": 34.7355284872654,
780
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1794/temp/line_7_B.wav",
781
+ "silence_duration": 0.30069741181398774,
782
+ "is_interrupted": false
783
+ },
784
+ {
785
+ "speaker": "A",
786
+ "text": "Wait, is this about me staying out late last weekend?",
787
+ "original_text": "Wait, is this about me staying out late last weekend?",
788
+ "start_time": 35.29912687220732,
789
+ "end_time": 38.677630273567864,
790
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1794/temp/line_8_A.wav",
791
+ "silence_duration": 0.5635983849419206,
792
+ "is_interrupted": false
793
+ },
794
+ {
795
+ "speaker": "B",
796
+ "text": "Not just that, but it's part of it. We've also noticed you've been acting a bit differently lately, and we're just wondering if everything is okay with you.",
797
+ "original_text": "Not just that, but it's part of it. We've also noticed you've been acting a bit differently lately, and we're just wondering if everything is okay with you.",
798
+ "start_time": 39.09678068392148,
799
+ "end_time": 45.99310721453372,
800
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1794/temp/line_9_B.wav",
801
+ "silence_duration": 0.4191504103536184,
802
+ "is_interrupted": false
803
+ },
804
+ {
805
+ "speaker": "A",
806
+ "text": "I don't know, Mom. Like I said, I'm just dealing with some stuff.",
807
+ "original_text": "I don't know, Mom. Like I said, I'm just dealing with some stuff.",
808
+ "start_time": 46.3670775788443,
809
+ "end_time": 50.46539957430915,
810
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1794/temp/line_10_A.wav",
811
+ "silence_duration": 0.3739703643105766,
812
+ "is_interrupted": false
813
+ },
814
+ {
815
+ "speaker": "B",
816
+ "text": "Okay. Well, if you ever want to talk about anything, we're here for you. We love you, Moriah.",
817
+ "original_text": "Okay. Well, if you ever want to talk about anything, we're here for you. We love you, Moriah.",
818
+ "start_time": 50.99388055366539,
819
+ "end_time": 56.06744064436834,
820
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1794/temp/line_11_B.wav",
821
+ "silence_duration": 0.5284809793562373,
822
+ "is_interrupted": false
823
+ },
824
+ {
825
+ "speaker": "A",
826
+ "text": "I love you too, Mom.",
827
+ "original_text": "I love you too, Mom.",
828
+ "start_time": 56.55062063706958,
829
+ "end_time": 57.99025782527819,
830
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1794/temp/line_12_A.wav",
831
+ "silence_duration": 0.4831799927012399,
832
+ "is_interrupted": false
833
+ }
834
+ ]
835
+ },
836
+ "SODA_PROCESSED--train--1070688": {
837
+ "original_text": "A: Hi Karis, I'm so excited to have you over for dinner tonight. I've been planning the menu and setting the table all day. I hope you're [interrupt] ready for a cozy evening with some delicious food and great conversation about your recent travels through Europe that you mentioned last time we met.\nB: Oh, I just remembered—I have a slight allergy to shellfish. I know you usually avoid it, but I wanted to mention it just in case.\nA: No worries, there's no shellfish on the menu tonight. Well, let's get started then! For our first course, we'll be having a spinach and feta salad. The feta is from a local farm and the spinach is from my garden. For our main course, I've made chicken Parmesan with homemade tomato sauce and fresh mozzarella cheese. And for dessert, we'll be having tiramisu that I made from scratch this afternoon. I wanted it to be just right for tonight.\nB: Tiramisu? That's my favorite dessert! I'm so excited to try it. You really know how to make a meal special.\nA: I'm glad you're excited! I was about to say I made it this morning using a special family recipe that's been passed down through generations, so it's extra fresh and has that authentic Italian flavor you can't find in restaurants. I hope you enjoy everything!",
838
+ "cleaned_text": "A:Hi Karis, I'm so excited to have you over for dinner tonight. I've been planning the menu and setting the table all day. I hope you're ready for a cozy evening with some delicious food and great conversation about your recent travels through Europe that you mentioned last time we met.\nB: Oh, I just remembered—I have a slight allergy to shellfish. I know you usually avoid it, but I wanted to mention it just in case.\nA: No worries, there's no shellfish on the menu tonight. Well, let's get started then! For our first course, we'll be having a spinach and feta salad. The feta is from a local farm and the spinach is from my garden. For our main course, I've made chicken Parmesan with homemade tomato sauce and fresh mozzarella cheese. And for dessert, we'll be having tiramisu that I made from scratch this afternoon. I wanted it to be just right for tonight.\nB: Tiramisu? That's my favorite dessert! I'm so excited to try it. You really know how to make a meal special.\nA: I'm glad you're excited! I was about to say I made it this morning using a special family recipe that's been passed down through generations, so it's extra fresh and has that authentic Italian flavor you can't find in restaurants. I hope you enjoy everything!",
839
+ "total_duration": 66.58453514739229,
840
+ "stereo_audio": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1070688/stereo_dialogue.wav",
841
+ "speaker_tracks": {
842
+ "A": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1070688/A_track.wav",
843
+ "B": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1070688/B_track.wav"
844
+ },
845
+ "error_type": "error_after_interrupt",
846
+ "segments": [
847
+ {
848
+ "speaker": "A",
849
+ "text": "Hi Karis, I'm so excited to have you over for dinner tonight. I've been planning the menu and setting the table all day. I hope you're",
850
+ "original_text": "Hi Karis, I'm so excited to have you over for dinner tonight. I've been planning the menu and setting the table all day. I hope you're [interrupt] ready for a cozy evening with some delicious food and great conversation about your recent travels through Europe that you mentioned last time we met.",
851
+ "start_time": 0,
852
+ "end_time": 16.172698412698413,
853
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1070688/temp/line_0_A.wav",
854
+ "silence_duration": 0,
855
+ "is_interrupted": true,
856
+ "text_after_interrupt": "ready for a cozy evening with some delicious food and great conversation about your recent travels through Europe that you mentioned last time we met."
857
+ },
858
+ {
859
+ "speaker": "B",
860
+ "text": "Oh, I just remembered—I have a slight allergy to shellfish. I know you usually avoid it, but I wanted to mention it just in case.",
861
+ "original_text": "Oh, I just remembered—I have a slight allergy to shellfish. I know you usually avoid it, but I wanted to mention it just in case.",
862
+ "start_time": 8.719092970521542,
863
+ "end_time": 15.650249433106577,
864
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1070688/temp/line_1_B.wav",
865
+ "silence_duration": 0.42791712549357114,
866
+ "is_interrupted": false
867
+ },{
868
+ "speaker": "A",
869
+ "text": "No worries, there's no shellfish on the menu tonight. Well, let's get started then! For our first course, we'll be having a spinach and feta salad. The feta is from a local farm and the spinach is from my garden. For our main course, I've made chicken Parmesan with homemade tomato sauce and fresh mozzarella cheese. And for dessert, we'll be having tiramisu that I made from scratch this afternoon. I wanted it to be just right for tonight.",
870
+ "original_text": "No worries, there's no shellfish on the menu tonight. Well, let's get started then! For our first course, we'll be having a spinach and feta salad. The feta is from a local farm and the spinach is from my garden. For our main course, I've made chicken Parmesan with homemade tomato sauce and fresh mozzarella cheese. And for dessert, we'll be having tiramisu that I made from scratch this afternoon. I wanted it to be just right for tonight.",
871
+ "start_time": 16.66087863834312,
872
+ "end_time": 43.38704643879663,
873
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1070688/temp/line_2_A.wav",
874
+ "silence_duration": 0.488180225644707,
875
+ "is_interrupted": false
876
+ },
877
+ {
878
+ "speaker": "B",
879
+ "text": "Tiramisu? That's my favorite dessert! I'm so excited to try it. You really know how to make a meal special.",
880
+ "original_text": "Tiramisu? That's my favorite dessert! I'm so excited to try it. You really know how to make a meal special.",
881
+ "start_time": 43.75020989775093,
882
+ "end_time": 49.926717834258866,
883
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1070688/temp/line_3_B.wav",
884
+ "silence_duration": 0.36316345895429397,
885
+ "is_interrupted": false
886
+ },
887
+ {
888
+ "speaker": "A",
889
+ "text": "I'm glad you're excited! I was about to say I made it this morning using a special family recipe that's been passed down through generations, so it's extra fresh and has that authentic Italian flavor you can't find in restaurants. I hope you enjoy everything!",
890
+ "original_text": "I'm glad you're excited! I was about to say I made it this morning using a special family recipe that's been passed down through generations, so it's extra fresh and has that authentic Italian flavor you can't find in restaurants. I hope you enjoy everything!",
891
+ "start_time": 50.49314394878711,
892
+ "end_time": 66.58457252021569,
893
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--1070688/temp/line_4_A.wav",
894
+ "silence_duration": 0.5664261145282402,
895
+ "is_interrupted": false
896
+ }
897
+ ]
898
+ },
899
+ "SODA_PROCESSED--train--737676": {
900
+ "original_text": "A: Hey, Miraya. I'm sorry about what happened with the car last night. I was really angry and I didn't mean to take it out on your [interrupt] car like that, especially since it's your most valuable possession and you've always taken such good care of it. I know it was wrong, and I regret it deeply.\nB: I understand, Stephon. But what exactly made you so angry? Was it something specific about what happened earlier in the week?\nA: Yeah, it did. I was really mad at you for a while after that. But I know it wasn't your fault and I shouldn't have taken it out on your car like that.\nB: Well, since you're being honest and apologetic about it, I don't think there's anything else you need to do other than maybe just be more mindful in the future about how you express your emotions, especially when you're upset, because lashing out at objects or people never really solves the underlying issue and often makes things worse.\nA: Absolutely, I'll work on that. And I really appreciate you being so understanding about this. Thanks for giving me the chance to talk it out.",
901
+ "cleaned_text": "A:Hey, Miraya. I'm sorry about what happened with the car last night. I was really angry and I didn't mean to take it out on your car like that, especially since it's your most valuable possession and you've always taken such good care of it. I know it was wrong, and I regret it deeply.\nB: I understand, Stephon. But what exactly made you so angry? Was it something specific about what happened earlier in the week?\nA: Yeah, it did. I was really mad at you for a while after that. But I know it wasn't your fault and I shouldn't have taken it out on your car like that.\nB: Well, since you're being honest and apologetic about it, I don't think there's anything else you need to do other than maybe just be more mindful in the future about how you express your emotions, especially when you're upset, because lashing out at objects or people never really solves the underlying issue and often makes things worse.\nA: Absolutely, I'll work on that. And I really appreciate you being so understanding about this. Thanks for giving me the chance to talk it out.",
902
+ "total_duration": 52.89809523809524,
903
+ "stereo_audio": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--737676/stereo_dialogue.wav",
904
+ "speaker_tracks": {
905
+ "A": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--737676/A_track.wav",
906
+ "B": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--737676/B_track.wav"
907
+ },
908
+ "error_type": "error_after_interrupt",
909
+ "segments": [
910
+ {
911
+ "speaker": "A",
912
+ "text": "Hey, Miraya. I'm sorry about what happened with the car last night. I was really angry and I didn't mean to take it out on your",
913
+ "original_text": "Hey, Miraya. I'm sorry about what happened with the car last night. I was really angry and I didn't mean to take it out on your [interrupt] car like that, especially since it's your most valuable possession and you've always taken such good care of it. I know it was wrong, and I regret it deeply.",
914
+ "start_time": 0,
915
+ "end_time": 16.938956916099773,
916
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--737676/temp/line_0_A.wav",
917
+ "silence_duration": 0,
918
+ "is_interrupted": true,
919
+ "text_after_interrupt": "car like that, especially since it's your most valuable possession and you've always taken such good care of it. I know it was wrong, and I regret it deeply."
920
+ },
921
+ {
922
+ "speaker": "B",
923
+ "text": "I understand, Stephon. But what exactly made you so angry? Was it something specific about what happened earlier in the week?",
924
+ "original_text": "I understand, Stephon. But what exactly made you so angry? Was it something specific about what happened earlier in the week?",
925
+ "start_time": 8.753922902494331,
926
+ "end_time": 15.348390022675737,
927
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--737676/temp/line_1_B.wav",
928
+ "silence_duration": 0.5553895116856843,
929
+ "is_interrupted": false
930
+ },
931
+ {
932
+ "speaker": "A",
933
+ "text": "Yeah, it did. I was really mad at you for a while after that. But I know it wasn't your fault and I shouldn't have taken it out on your car like that.",
934
+ "original_text": "Yeah, it did. I was really mad at you for a while after that. But I know it wasn't your fault and I shouldn't have taken it out on your car like that.",
935
+ "start_time": 17.329799609194744,
936
+ "end_time": 26.582951536632386,
937
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--737676/temp/line_2_A.wav",
938
+ "silence_duration": 0.3908426930949695,
939
+ "is_interrupted": false
940
+ },
941
+ {
942
+ "speaker": "B",
943
+ "text": "Well, since you're being honest and apologetic about it, I don't think there's anything else you need to do other than maybe just be more mindful in the future about how you express your emotions, especially when you're upset, because lashing out at objects or people never really solves the underlying issue and often makes things worse.",
944
+ "original_text": "Well, since you're being honest and apologetic about it, I don't think there's anything else you need to do other than maybe just be more mindful in the future about how you express your emotions, especially when you're upset, because lashing out at objects or people never really solves the underlying issue and often makes things worse.",
945
+ "start_time": 26.900238001740547,
946
+ "end_time": 44.05978448700132,
947
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--737676/temp/line_3_B.wav",
948
+ "silence_duration": 0.3172864651081615,
949
+ "is_interrupted": false
950
+ },
951
+ {
952
+ "speaker": "A",
953
+ "text": "Absolutely, I'll work on that. And I really appreciate you being so understanding about this. Thanks for giving me the chance to talk it out.",
954
+ "original_text": "Absolutely, I'll work on that. And I really appreciate you being so understanding about this. Thanks for giving me the chance to talk it out.",
955
+ "start_time": 44.64342590433178,
956
+ "end_time": 52.8981197818828,
957
+ "audio_file": "/root/autodl-tmp/output_matches_soda/SODA_PROCESSED--train--737676/temp/line_4_A.wav",
958
+ "silence_duration": 0.5836414173304574,
959
+ "is_interrupted": false
960
+ }
961
+ ]
962
+ }
963
+ }
ms-swift/swift/llm/sampling/mcts.py ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import traceback
3
+ from concurrent.futures import ThreadPoolExecutor, as_completed
4
+ from copy import deepcopy
5
+
6
+ import json
7
+ import numpy as np
8
+
9
+ from swift.llm import InferRequest, SamplingArguments
10
+ from swift.llm.infer.protocol import UsageInfo
11
+ from swift.utils import get_logger
12
+ from .base import Sampler
13
+ from .utils import get_reward, perform_infer
14
+
15
+ logger = get_logger()
16
+
17
# Prompt used to ask the model to produce the next reasoning step.
NXT_PROMPT = 'Continue.\n'

# Canned user turn appended after a partial assistant answer during
# expand/rollout so the model keeps generating from where it stopped.
next_message = {
    'role': 'user',
    'content': NXT_PROMPT,
}
24
+
25
+
26
class LanguageNode:
    """A node of the Monte-Carlo search tree over generated reasoning steps.

    Each node appends one generated ``step`` to its parent's accumulated
    ``answer`` (separated by ``sep_token``) and tracks MCTS statistics:
    visit count, process reward (PRM) and a running mean outcome reward (ORM).
    """

    def __init__(
        self,
        step: str = None,
        sep_token: str = None,
        parent: 'LanguageNode' = None,
    ):
        self.parent = parent
        # Inherit the separator from the parent unless one was given explicitly.
        self.sep_token = sep_token if sep_token else parent.sep_token

        if parent is None:
            # Root node: empty path, no accumulated answer.
            self.path = []
            self.answer = ''
            self.depth = 0
        else:
            self.path = [*parent.path, step]
            self.answer = f'{parent.answer}{step}{self.sep_token}'
            self.depth = parent.depth + 1

        self.children = []
        # Children that are still expandable (i.e. not terminated).
        self.active_children = []
        self.visit_count = 0
        self.process_reward = 0.0
        self.outcome_reward = 0.0
        self.terminated = False
        self.correct = False

    def is_leaf(self):
        """True when this node has not been expanded yet."""
        return not self.children

    def is_root(self):
        """True for the tree root (the node with no parent)."""
        return self.parent is None

    def visit(self):
        """Record one MCTS visit."""
        self.visit_count += 1

    def init_and_update_value(self, value):
        # Running mean of outcome rewards; visit_count is incremented
        # separately via visit(), so the denominator anticipates that bump.
        accumulated = self.outcome_reward * self.visit_count + value
        self.outcome_reward = accumulated / (self.visit_count + 1)

    def add_child(self, child: 'LanguageNode'):
        """Attach *child*; keep it in active_children while it can expand."""
        self.children.append(child)
        if not child.terminated:
            self.active_children.append(child)

    def collect(self):
        """Serialize this subtree to a plain dict (for JSON dumping)."""
        return {
            'path': self.path,
            'depth': self.depth,
            'visit_count': self.visit_count,
            'process_reward': self.process_reward,
            'outcome_reward': self.outcome_reward,
            'terminated': str(self.terminated),
            'correct': str(self.correct),
            'children': [node.collect() for node in self.children],
        }

    def __lt__(self, other):
        # Order nodes by their mean outcome reward.
        return self.outcome_reward < other.outcome_reward
+
91
+
92
class MctsSampler(Sampler):
    """Sampler that searches answers via Monte-Carlo Tree Search over reasoning steps.

    Each sample runs select / expand / (rollout) / back-propagate iterations,
    scoring leaves with an outcome reward model (ORM) and optionally a process
    reward model (PRM), and returns the whole tree plus rollout answers as JSON.
    """

    def __init__(self, input_args: SamplingArguments):
        super().__init__(input_args)
        # Aggregated token usage across all inference calls of this sampler.
        self.usage_info = UsageInfo(0, 0, 0)

    def _prepare_model_tokenizer(self):
        """Build the inference backend: one client per sequence, or a local engine."""
        args = self.args
        self.infer_kwargs = {}
        if args.sampler_engine == 'client':
            from swift.llm import InferClient
            api_key = args.api_key
            base_url = args.base_url
            # One client per returned sequence so expand/rollout can fan out.
            self.infer_engine = [
                InferClient(base_url=base_url, api_key=api_key) for _ in range(args.num_return_sequences)
            ]
            self.infer_kwargs['model'] = args.model
        else:
            _Engine = self.get_infer_engine()
            self.infer_engine = _Engine(self.args.model, model_type=self.args.model_type, **self.args.engine_kwargs)

    def get_infer_engine(self):
        """Map args.sampler_engine to an engine class ('no' -> None).

        Raises:
            ValueError: if the engine name is not recognized.
        """
        if self.args.sampler_engine == 'pt':
            from swift.llm import PtEngine
            _Engine = PtEngine
        elif self.args.sampler_engine == 'vllm':
            from swift.llm import VllmEngine
            _Engine = VllmEngine
        elif self.args.sampler_engine == 'lmdeploy':
            from swift.llm import LmdeployEngine
            _Engine = LmdeployEngine
        elif self.args.sampler_engine == 'no':
            _Engine = None
        else:
            raise ValueError(f'Cannot find engine name: {self.args.sampler_engine}')
        return _Engine

    def _prepare_template(self) -> None:
        # Hack from super(): reuse the template-preparation hook to build
        # the per-sequence request configs instead.
        self._prepare_request_configs()

    def _prepare_request_configs(self):
        """Create per-sequence request configs for expand and rollout phases."""
        _args = self.args
        request_config = _args.get_request_config()
        request_config.stop = _args.stop_words
        request_config.seed = _args.seed
        self.expand_request_configs = []
        self.rollout_request_configs = []
        for i in range(_args.num_return_sequences):
            expand_request_config = deepcopy(request_config)
            expand_request_config.n = 1
            expand_request_config.num_beams = expand_request_config.n
            # Distinct seed per sequence so parallel expansions diverge.
            expand_request_config.seed += i
            self.expand_request_configs.append(expand_request_config)
            rollout_request_config = deepcopy(request_config)
            # Rollouts are greedy and length-capped: they only estimate value.
            rollout_request_config.max_tokens = 500
            rollout_request_config.temperature = 0.0
            rollout_request_config.n = 1
            self.rollout_request_configs.append(rollout_request_config)

    def update_usage_info(self, response):
        """Accumulate token counts from *response* into self.usage_info.

        NOTE(review): getattr(..., None) + value raises TypeError if a usage
        field is missing — presumably responses always carry full usage; verify.
        """
        for key, value in self.usage_info.__dict__.items():
            update_value = getattr(response.usage, key, None) + value
            setattr(self.usage_info, key, update_value)

    def search_single(self, query, ground_truth):
        """Run MCTS for one (query, ground_truth) pair; return a JSON string result."""

        def _uct(uct_curr_node: LanguageNode):
            # UCT score: reward blend (PRM vs ORM, weighted by
            # process_reward_rate) plus an exploration bonus.
            alpha = _args.process_reward_rate
            value = alpha * uct_curr_node.process_reward + (1 - alpha) * uct_curr_node.outcome_reward
            if uct_curr_node.is_root():
                return value

            exploitation_score = value
            # +1 terms keep the formula defined for unvisited nodes.
            exploration_score = (
                _args.exploration_rate
                * np.sqrt(np.log(uct_curr_node.parent.visit_count + 1) / (uct_curr_node.visit_count + 1)))

            return exploration_score + exploitation_score

        def _select(select_curr_node: LanguageNode):
            # Descend via max-UCT among non-terminated children until a leaf.
            while not select_curr_node.is_leaf():
                select_curr_node = max(select_curr_node.active_children, key=lambda x: _uct(x))
            return select_curr_node

        def _expand(expand_curr_node: LanguageNode):
            # Generate up to num_return_sequences children (deduplicated),
            # mark terminated ones and score them with the ORM / PRM.
            n = _args.num_return_sequences - len(expand_curr_node.children)
            if expand_curr_node.is_root():
                infer_requests = [InferRequest(system_message + [prompt_message]) for _ in range(n)]
            else:
                # Replay the partial answer as an assistant turn, then ask to continue.
                history_message = {
                    'role': 'assistant',
                    'content': expand_curr_node.answer,
                }
                infer_request = InferRequest(system_message + [prompt_message, history_message, next_message])
                infer_requests = [infer_request for _ in range(n)]

            # e_time = time.time()
            # To perform the Expand operation in parallel,
            # there's no need to consider the order for now, since the Prompt is the same.
            # Retry up to 5 extra times if the backend returns nothing.
            expand_iter_index = 0
            while True:
                responses = perform_infer(self.infer_engine, infer_requests, self.expand_request_configs,
                                          **self.infer_kwargs)
                if len(responses) > 0:
                    break
                if expand_iter_index == 5:
                    raise ValueError('Expand should not return any response')
                expand_iter_index += 1
            # logger.info(f"expand.expand time: {time.time() - e_time}")

            # To fetch Outcome Reward in parallel,
            # the Outcome-Reward obtained is returned in order, so they can be directly matched accordingly.
            orm_infer_requests = []
            unique_output = set()
            for response in responses:
                self.update_usage_info(response)
                # Keep only the first step before the separator token.
                output = response.choices[0].message.content.rstrip(sep_token + '\n').split(sep_token)[0]
                if output in unique_output:
                    continue
                unique_output.add(output)
                orm_infer_requests.append(InferRequest([{'role': 'assistant', 'content': output}]))
                child = LanguageNode(step=output, parent=expand_curr_node)
                if self.orm_model.check_terminate(child.answer)[0]:
                    child.terminated = True
                expand_curr_node.add_child(child)

            # e_time = time.time()
            orm_score, _orm_mask = get_reward(
                self.orm_model,
                orm_infer_requests,
                ground_truths=[ground_truth] * len(orm_infer_requests),
                threshold=0.0)
            # logger.info(f"expand.orm time: {time.time() - e_time}")
            # NOTE(review): zip pairs ALL children with scores of only the
            # newly generated outputs; if children pre-existed before this
            # expand, the pairing is offset — confirm expand is one-shot per node.
            for child, score in zip(expand_curr_node.children, orm_score):
                if child.terminated:
                    child.init_and_update_value(score)
                    # score > 0.9 is treated as a correct final answer.
                    child.correct = score > 0.9
                    terminated_nodes.append(child)

            # e_time = time.time()
            if self.prm_model:
                # Score every child's full partial answer with the process reward model.
                prm_infer_requests = []
                for child in expand_curr_node.children:
                    prm_message = {'role': 'assistant', 'content': child.answer}
                    prm_infer_requests.append(InferRequest([prompt_message, prm_message]))
                prm_score, _prm_mask = get_reward(
                    self.prm_model,
                    prm_infer_requests,
                    ground_truths=[ground_truth] * len(prm_infer_requests),
                    threshold=0.0)
                for child, score in zip(expand_curr_node.children, prm_score):
                    child.process_reward = score
                # logger.info(f"expand.prm time: {time.time() - e_time}")

        def _rollout(rollout_curr_node: LanguageNode):
            # Greedily continue each active child up to rollout_depth steps,
            # scoring finished paths with the ORM to estimate child values.
            rollout_depth = 0
            rollout_nodes = {}
            for i in range(len(rollout_curr_node.active_children)):
                rollout_nodes[i] = {
                    'node': rollout_curr_node.active_children[i],
                    'history_messages': {
                        'role': 'assistant',
                        'content': rollout_curr_node.active_children[i].answer,
                    },
                }
            active_rollout_nodes = list(rollout_nodes.keys())
            while len(active_rollout_nodes) > 0 and rollout_depth < _args.rollout_depth:
                # r_time = time.time()
                infer_requests = [
                    InferRequest(system_message
                                 + [prompt_message, rollout_nodes[index]['history_messages'], next_message])
                    for index in active_rollout_nodes
                ]
                # logger.info(f"rollout.prepare time: {time.time() - r_time}")
                # r_time = time.time()
                # Same empty-response retry policy as _expand.
                rollout_iter_index = 0
                while True:
                    responses = perform_infer(self.infer_engine, infer_requests, self.rollout_request_configs,
                                              **self.infer_kwargs)
                    if len(responses) > 0:
                        break
                    if rollout_iter_index == 5:
                        raise ValueError('Rollout should not return any response')
                    rollout_iter_index += 1
                # logger.info(f"rollout.infer time: {time.time() - r_time}")

                # r_time = time.time()
                orm_infer_requests = []
                end_paths = []
                for index, response in zip(active_rollout_nodes, responses):
                    self.update_usage_info(response)
                    # Append the next step (with separator) to the running transcript.
                    output = response.choices[0].message.content.rstrip(sep_token
                                                                        + '\n').split(sep_token)[0] + sep_token + '\n'
                    rollout_nodes[index]['history_messages']['content'] += output
                    end_paths.append(rollout_nodes[index]['history_messages']['content'])
                    orm_infer_requests.append(InferRequest([rollout_nodes[index]['history_messages']]))
                # logger.info(f"rollout.orm_prepare time: {time.time() - r_time}")

                # r_time = time.time()
                # NOTE(review): len(infer_requests) == len(orm_infer_requests)
                # here (one request per active node); presumably intentional.
                orm_score, _orm_mask = get_reward(
                    self.orm_model,
                    orm_infer_requests,
                    ground_truths=[ground_truth] * len(infer_requests),
                    threshold=0.0)
                # logger.info(f"rollout.get_orm time: {time.time() - r_time}")
                terminated_state = self.orm_model.check_terminate(end_paths)
                for index, score, terminated in zip(active_rollout_nodes, orm_score, terminated_state):
                    if terminated:
                        # Fold the rollout result into the child's value estimate
                        # and bucket the full answer by correctness.
                        rollout_curr_node.active_children[index].init_and_update_value(score)
                        if score > 0.9:
                            rollout_correct_answers.append(rollout_nodes[index]['history_messages']['content'])
                        else:
                            rollout_incorrect_answers.append(rollout_nodes[index]['history_messages']['content'])
                        rollout_nodes.pop(index)
                active_rollout_nodes = list(rollout_nodes.keys())
                rollout_depth += 1

        def _back_propagate(back_curr_node: LanguageNode):
            # Propagate the best child value of the expanded node (curr_node,
            # captured from the enclosing loop) up to the root, updating the
            # running means and visit counts, and deactivating exhausted nodes.
            while back_curr_node:
                if back_curr_node == curr_node:
                    best_child_value = max([child.outcome_reward for child in back_curr_node.children])
                    back_curr_node.init_and_update_value(best_child_value)
                    last_child_value = back_curr_node.outcome_reward
                else:
                    back_curr_node.init_and_update_value(last_child_value)
                    last_child_value = back_curr_node.outcome_reward
                back_curr_node.visit()
                if len(back_curr_node.active_children) == 0:
                    # No expandable children left: this subtree is exhausted.
                    back_curr_node.terminated = True
                    if not back_curr_node.is_root():
                        back_curr_node.parent.active_children.remove(back_curr_node)
                back_curr_node = back_curr_node.parent

        _args = self.args
        # Copy so per-query message lists never mutate the shared args value.
        system_message = [] + _args.system_message
        # First stop word doubles as the step separator in generated text.
        sep_token = _args.stop_words[0] + '\n'
        _root = LanguageNode(sep_token=sep_token)
        prompt_message = {
            'role': 'user',
            'content': query,
        }

        rollout_correct_answers, rollout_incorrect_answers, terminated_nodes = [], [], []
        iter_count = 0
        stop_reason = None
        while True:
            logger.info(f'iter_count: {iter_count}' + '.' * 10)
            s_time = time.time()
            curr_node = _select(_root)
            logger.debug('select' + '=' * 10 + f'time: {time.time() - s_time}')
            s_time = time.time()
            _expand(curr_node)
            logger.debug('expand' + '=' * 10 + f'time: {time.time() - s_time}')
            # Rollouts only start below a configured depth to save compute.
            if curr_node.depth > _args.rollout_start_depth:
                s_time = time.time()
                _rollout(curr_node)
                logger.debug('rollout' + '=' * 10 + f'time: {time.time() - s_time}')
            s_time = time.time()
            _back_propagate(curr_node)
            logger.debug('back propagate' + '=' * 10 + f'time: {time.time() - s_time}')
            # Early-stop heuristics once enough rollouts finished: a >4:1
            # correct/incorrect skew means the problem is too easy or too hard.
            if len(rollout_correct_answers) + len(rollout_incorrect_answers) >= 2 * _args.num_return_sequences:
                if 4 * len(rollout_incorrect_answers) < len(rollout_correct_answers):
                    stop_reason = 'too easy'
                    break
                elif 4 * len(rollout_correct_answers) < len(rollout_incorrect_answers):
                    stop_reason = 'too hard'
                    break
            if _root.terminated:
                stop_reason = 'root terminated'
                break
            if len(terminated_nodes) >= _args.num_return_sequences:
                stop_reason = 'enough nodes'
                break
            if iter_count >= _args.max_iterations:
                stop_reason = 'max_iterations'
                break
            iter_count += 1
        logger.info(f'stop_reason: {stop_reason}')
        # logger.info(f"rollout_correct_answers: {rollout_correct_answers}")
        # logger.info(f"rollout_incorrect_answers: {rollout_incorrect_answers}")

        monte_carlo_tree = _root.collect()
        result = {
            'query': query,
            'ground_truth': ground_truth,
            'rollout_correct_answers': rollout_correct_answers,
            'rollout_incorrect_answers': rollout_incorrect_answers,
            'monte_carlo_tree': monte_carlo_tree,
        }
        result_json = json.dumps(result, ensure_ascii=False)
        logger.info(result_json)
        return result_json

    def do_sample(self, data):
        """Run search_single for each item; return newline-terminated JSON strings.

        Items that raise are logged and skipped rather than aborting the batch.
        """
        if not isinstance(data, list):
            data = [data]
        generated = []
        for item in data:
            logger.info(f'time: {time.ctime(time.time())}')
            try:
                # Expected layout: item['messages'][0] == [user_turn, ground_truth_turn].
                messages = item['messages'][0]
                query = messages[0]['content']
                ground_truth = messages[1]['content']
                generated.append(self.search_single(query, ground_truth) + '\n')
            except Exception as e:
                logger.error(f'Error: {e}')
                logger.error(f'Traceback: {traceback.format_exc()}')
        return generated
ms-swift/swift/llm/template/template/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (606 Bytes). View file
 
ms-swift/swift/llm/template/template/__pycache__/emu3.cpython-310.pyc ADDED
Binary file (7.88 kB). View file
 
ms-swift/swift/llm/template/template/__pycache__/gemma.cpython-310.pyc ADDED
Binary file (5.91 kB). View file
 
ms-swift/swift/llm/template/template/__pycache__/internvl.cpython-310.pyc ADDED
Binary file (6.8 kB). View file
 
ms-swift/swift/llm/template/template/__pycache__/minicpm.cpython-310.pyc ADDED
Binary file (8.18 kB). View file
 
ms-swift/swift/llm/template/template/__pycache__/pixtral.cpython-310.pyc ADDED
Binary file (2.3 kB). View file
 
ms-swift/swift/llm/template/template/__pycache__/stepfun.cpython-310.pyc ADDED
Binary file (6.57 kB). View file
 
ms-swift/swift/llm/template/template/__pycache__/valley.cpython-310.pyc ADDED
Binary file (6.31 kB). View file
 
ms-swift/swift/llm/template/template/__pycache__/yi.cpython-310.pyc ADDED
Binary file (2.91 kB). View file
 
ms-swift/swift/llm/template/template/deepseek.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ import os
3
+ from dataclasses import dataclass, field
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+ from PIL import Image
10
+
11
+ from swift.utils import get_env_args
12
+ from ..base import Template
13
+ from ..constant import LLMTemplateType, MLLMTemplateType
14
+ from ..register import TemplateMeta, register_template
15
+ from ..template_inputs import StdTemplateInputs
16
+ from ..utils import Prompt, findall
17
+
18
+
19
+ @dataclass
20
+ class DeepseekTemplateMeta(TemplateMeta):
21
+ prefix: Prompt = field(default_factory=lambda: [['bos_token_id']])
22
+ prompt: Prompt = field(default_factory=lambda: ['User: {{QUERY}}\n\nAssistant:'])
23
+ chat_sep: Optional[Prompt] = field(default_factory=lambda: [['eos_token_id']])
24
+ suffix: Prompt = field(default_factory=lambda: [['eos_token_id']])
25
+ system_prefix: Optional[Prompt] = field(default_factory=lambda: [['bos_token_id'], '{{SYSTEM}}\n\n'])
26
+
27
+
28
+ register_template(DeepseekTemplateMeta(LLMTemplateType.deepseek, ))
29
+
30
+ register_template(
31
+ TemplateMeta(
32
+ LLMTemplateType.deepseek_coder,
33
+ prefix=['{{SYSTEM}}'],
34
+ prompt=['### Instruction:\n{{QUERY}}\n### Response:\n'],
35
+ chat_sep=['\n<|EOT|>\n'],
36
+ suffix=['\n<|EOT|>'],
37
+ stop_words=['<|EOT|>'],
38
+ default_system=('You are an AI programming assistant, utilizing the Deepseek Coder model, '
39
+ 'developed by Deepseek Company, and you only answer questions related to computer science. '
40
+ 'For politically sensitive questions, security and privacy issues, '
41
+ 'and other non-computer science questions, you will refuse to answer\n')))
42
+
43
+
44
+ class DeepseekVLTemplate(Template):
45
+ image_placeholder = ['<image_placeholder>']
46
+ skip_prompt = False
47
+ use_model = True
48
+ placeholder_tokens = ['<image_placeholder>']
49
+
50
+ image_token_num_per_image: int = 576
51
+
52
+ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
53
+ is_janus = getattr(self, 'is_janus', False)
54
+
55
+ encoded = super()._encode(inputs)
56
+ images = inputs.images
57
+ processor = self.processor
58
+ input_ids, labels = encoded['input_ids'], encoded['labels']
59
+
60
+ if not inputs.generate_mode: # understanding task
61
+ idx_list = findall(input_ids, processor.image_id) # '<image_placeholder>'
62
+ new_input_ids, new_labels = [], []
63
+ lo = 0
64
+ for hi in idx_list:
65
+ new_input_ids += input_ids[lo:hi]
66
+ if labels is not None:
67
+ new_labels += labels[lo:hi]
68
+ image_tokens = [processor.image_id] * processor.num_image_tokens
69
+ if is_janus:
70
+ image_tokens = [processor.image_start_id] + image_tokens + [processor.image_end_id]
71
+ new_input_ids += image_tokens
72
+ new_labels += [-100] * len(image_tokens)
73
+ lo = hi + 1
74
+ new_input_ids += input_ids[lo:]
75
+ if labels is not None:
76
+ new_labels += labels[lo:]
77
+ else:
78
+ new_labels = None
79
+ if is_janus:
80
+ from janus.models.processing_vlm import VLChatProcessorOutput
81
+ else:
82
+ from deepseek_vl.models.processing_vlm import VLChatProcessorOutput
83
+
84
+ images_outputs = processor.image_processor(images, return_tensors='pt')
85
+ output = VLChatProcessorOutput(
86
+ sft_format=None,
87
+ input_ids=torch.tensor(new_input_ids),
88
+ pixel_values=images_outputs.pixel_values,
89
+ num_image_tokens=torch.tensor([processor.num_image_tokens] * len(idx_list)))
90
+ encoded = {'output': output, 'input_ids': new_input_ids, 'labels': new_labels}
91
+ return encoded
92
+
93
+ else: # image generation task
94
+ if self.is_training:
95
+ raise NotImplementedError('Only support the inference of generation of Janus series models.')
96
+ sft_format = self.tokenizer.decode(input_ids)
97
+ prompt = sft_format + processor.image_start_tag
98
+ input_ids = processor.tokenizer.encode(prompt)
99
+ input_ids = torch.LongTensor(input_ids)
100
+
101
+ encoded = {'input_ids': input_ids, 'labels': labels, 'generate_mode': inputs.generate_mode}
102
+ return encoded
103
+
104
+ def _post_encode(self, model: nn.Module, inputs: Dict[str, Any]) -> Dict[str, Any]:
105
+ if not inputs.get('generate_mode'):
106
+ inputs['pixel_values'] = inputs['pixel_values'].to(dtype=self.model_info.torch_dtype)
107
+ inputs_embeds = model.prepare_inputs_embeds(**inputs)
108
+ return {'inputs_embeds': inputs_embeds}
109
+ else:
110
+ return inputs
111
+
112
+ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
113
+ gene_img_list = [b.get('generate_mode') for b in batch]
114
+ if all(gene_img_list):
115
+ generate_mode = True
116
+ elif not any(gene_img_list):
117
+ generate_mode = False
118
+ else:
119
+ raise NotImplementedError('Do not support understanding and image generation tasks in one batch.')
120
+
121
+ if not generate_mode:
122
+ output = self.fetch_inputs(batch, ['output'])['output']
123
+ batched_output = dict(self.processor.batchify(output))
124
+ res = super()._data_collator(batch, padding_to=padding_to)
125
+ return {**batched_output, **res}
126
+ else:
127
+ res = super()._data_collator(batch, padding_to=padding_to)
128
+ res['generate_mode'] = generate_mode
129
+ return res
130
+
131
+ def generate(self, model, *args, **kwargs):
132
+ if not kwargs.get('generate_mode'):
133
+ return super().generate(model, *args, **kwargs)
134
+
135
+ else:
136
+ # generate how many number of images for each prompt, it is named parallel_size in the author's code
137
+ parallel_size = kwargs['generation_config'].num_return_sequences
138
+ temperature = kwargs['generation_config'].temperature
139
+ cfg_weight = get_env_args('cfg_weight', float, 5.0)
140
+
141
+ input_ids = kwargs['input_ids'] # [bsz, max_input_token_num]
142
+ bsz, max_input_token_num = input_ids.shape
143
+ tokens = torch.zeros((bsz, parallel_size * 2, max_input_token_num),
144
+ dtype=torch.int).cuda() # [bsz, parallel_size*2, max_input_token_num]
145
+ for i in range(parallel_size * 2):
146
+ tokens[:, i, :] = input_ids
147
+ if i % 2 != 0:
148
+ tokens[:, i, 1:-1] = self.processor.pad_id
149
+
150
+ inputs_embeds = model.language_model.get_input_embeddings()(
151
+ tokens) # [bsz, parallel_size*2, max_input_token_num, 2048]
152
+
153
+ generated_tokens = torch.zeros(
154
+ (bsz, parallel_size, self.image_token_num_per_image),
155
+ dtype=torch.int).cuda() # [bsz, 16, image_token_num_per_image] placeholder for the generated tokens
156
+
157
+ # set the first two dimensions into one dimension for batch size
158
+ inputs_embeds = inputs_embeds.reshape(bsz * parallel_size * 2, max_input_token_num, -1)
159
+ generated_tokens = generated_tokens.reshape(bsz * parallel_size, self.image_token_num_per_image)
160
+
161
+ for i in range(self.image_token_num_per_image): # generate the tokens of image in a auto-regression way
162
+ outputs = model.language_model.model(
163
+ inputs_embeds=inputs_embeds,
164
+ use_cache=True,
165
+ past_key_values=outputs.past_key_values if i != 0 else None)
166
+ hidden_states = outputs.last_hidden_state
167
+
168
+ logits = self.model.gen_head(hidden_states[:, -1, :])
169
+ logit_cond = logits[0::2, :]
170
+ logit_uncond = logits[1::2, :]
171
+
172
+ logits = logit_uncond + cfg_weight * (logit_cond - logit_uncond)
173
+ probs = torch.softmax(logits / temperature, dim=-1)
174
+
175
+ next_token = torch.multinomial(probs, num_samples=1)
176
+ generated_tokens[:, i] = next_token.squeeze(dim=-1) # [parallel_size, self.image_token_num_per_image]
177
+
178
+ next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)
179
+ img_embeds = model.prepare_gen_img_embeds(next_token) # [parallel_size * 2, 2048]
180
+ inputs_embeds = img_embeds.unsqueeze(dim=1) # [parallel_size * 2, 1, 2048]
181
+
182
+ # no need to reset the original first two dimensions, waiting for the update of the upper layer
183
+ # inputs_embeds = inputs_embeds.reshape(bsz, parallel_size*2, -1)
184
+ # generated_tokens = generated_tokens.reshape(bsz, parallel_size, self.image_token_num_per_image)
185
+
186
+ return {'sequences': generated_tokens}
187
+
188
+ def decode(self, generate_ids: List[int], **kwargs) -> Any:
189
+ if 'template_inputs' not in kwargs or not kwargs['template_inputs'].generate_mode:
190
+ return super().decode(generate_ids, **kwargs)
191
+ else:
192
+ img_size = get_env_args('img_size', int, 384)
193
+ patch_size = 16
194
+
195
+ num_to_decode = 1 # for now, generate_ids is a 1D list
196
+
197
+ generate_ids = torch.tensor(generate_ids).unsqueeze(0) # [num_to_decode=1, self.image_token_num_per_image]
198
+
199
+ dec = self.model.gen_vision_model.decode_code(
200
+ generate_ids.to(dtype=torch.int),
201
+ shape=[num_to_decode, 8, img_size // patch_size, img_size // patch_size])
202
+ dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1) # [num_to_decode, H, W, ch=3]
203
+
204
+ dec = np.clip((dec + 1) / 2 * 255, 0, 255)
205
+
206
+ visual_img = np.zeros((num_to_decode, img_size, img_size, 3), dtype=np.uint8)
207
+ visual_img[:, :, :] = dec
208
+
209
+ img_list = []
210
+ for i in range(num_to_decode):
211
+ cur_img = Image.fromarray(visual_img[i])
212
+ img_list.append({'type': 'image', 'image': cur_img})
213
+ return img_list
214
+
215
+
216
+ @dataclass
217
+ class DeepseekVLTemplateMeta(DeepseekTemplateMeta):
218
+ default_system: Optional[str] = ('You are a helpful language and vision assistant. '
219
+ 'You are able to understand the visual content that the user provides, '
220
+ 'and assist the user with a variety of tasks using natural language.')
221
+
222
+
223
+ register_template(DeepseekVLTemplateMeta(
224
+ MLLMTemplateType.deepseek_vl,
225
+ template_cls=DeepseekVLTemplate,
226
+ ))
227
+
228
+
229
+ class DeepseekJanus(DeepseekVLTemplate):
230
+ is_janus = True
231
+ image_placeholder = ['<image_placeholder>\n']
232
+
233
+
234
+ register_template(DeepseekVLTemplateMeta(MLLMTemplateType.deepseek_janus, template_cls=DeepseekJanus))
235
+
236
+
237
+ @dataclass
238
+ class DeepseekV2_5TemplateMeta(TemplateMeta):
239
+ prefix: Prompt = field(default_factory=lambda: ['<|begin▁of▁sentence|>{{SYSTEM}}'])
240
+ prompt: Prompt = field(default_factory=lambda: ['<|User|>{{QUERY}}<|Assistant|>'])
241
+ chat_sep: Optional[Prompt] = field(default_factory=lambda: ['<|end▁of▁sentence|>'])
242
+ suffix: Prompt = field(default_factory=lambda: ['<|end▁of▁sentence|>'])
243
+
244
+
245
+ register_template(DeepseekV2_5TemplateMeta(LLMTemplateType.deepseek_v2_5))
246
+
247
+
248
+ class DeepseekR1Template(Template):
249
+
250
+ def _swift_encode(self, inputs: StdTemplateInputs):
251
+ if not self.is_training:
252
+ for message in inputs.messages:
253
+ if message['role'] == 'assistant' and isinstance(message['content'], str):
254
+ message['content'] = message['content'].split('</think>')[-1]
255
+ return super()._swift_encode(inputs)
256
+
257
+
258
+ register_template(
259
+ DeepseekV2_5TemplateMeta(LLMTemplateType.deepseek_r1, template_cls=DeepseekR1Template, response_prefix='<think>\n'))
260
+
261
+
262
+ class DeepseekVL2Template(DeepseekVLTemplate):
263
+ image_placeholder = ['<image>\n']
264
+ placeholder_tokens = ['<image>']
265
+
266
+ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
267
+ from deepseek_vl2.models.processing_deepseek_vl_v2 import VLChatProcessorOutput
268
+ encoded = Template._encode(self, inputs)
269
+ images = inputs.images
270
+ processor = self.processor
271
+ input_ids, labels = encoded['input_ids'], encoded['labels']
272
+ images_seq_mask = [False] * len(input_ids)
273
+ idx_list = findall(input_ids, processor.image_token_id) # '<image>'
274
+ _, images_list, _, images_spatial_crop, num_image_tokens = processor.tokenize_with_images(
275
+ '<image>' * len(images), images, cropping=len(images) <= 2)
276
+ new_num_tokens = 0
277
+ for idx, n_image_tokens in zip(idx_list, num_image_tokens):
278
+ image_tokens = [processor.image_token_id] * n_image_tokens
279
+ input_ids = input_ids[:idx] + image_tokens + input_ids[idx + 1:]
280
+ if labels is not None:
281
+ labels = labels[:idx] + [-100] * n_image_tokens + labels[idx + 1:]
282
+ images_seq_mask = images_seq_mask[:idx] + [True] * n_image_tokens + images_seq_mask[idx + 1:]
283
+ new_num_tokens += n_image_tokens - 1
284
+
285
+ output = VLChatProcessorOutput(
286
+ sft_format=None,
287
+ input_ids=torch.tensor(input_ids),
288
+ target_ids=torch.tensor(input_ids),
289
+ images=torch.stack(images_list) if images_list else torch.zeros((0, 3, 384, 384)),
290
+ images_seq_mask=torch.tensor(images_seq_mask),
291
+ images_spatial_crop=torch.tensor(images_spatial_crop),
292
+ num_image_tokens=num_image_tokens)
293
+ output.images = output.images.to(dtype=self.model_info.torch_dtype)
294
+ encoded = {'output': output, 'input_ids': input_ids, 'labels': labels}
295
+ return encoded
296
+
297
+ def _post_encode(self, model: nn.Module, inputs: Dict[str, Any]) -> Dict[str, Any]:
298
+ inputs['images_seq_mask'] = inputs['images_seq_mask'].to(torch.bool)
299
+ inputs['images_spatial_crop'] = inputs['images_spatial_crop'].to(torch.long)
300
+ inputs_embeds = model.prepare_inputs_embeds(**inputs)
301
+ return {'inputs_embeds': inputs_embeds}
302
+
303
+
304
+ register_template(
305
+ DeepseekV2_5TemplateMeta(
306
+ MLLMTemplateType.deepseek_vl2,
307
+ prompt=['<|User|>: {{QUERY}}\n\n<|Assistant|>:'],
308
+ template_cls=DeepseekVL2Template,
309
+ ))
310
+
311
+ register_template(
312
+ DeepseekVLTemplateMeta(
313
+ MLLMTemplateType.deepseek_janus_pro,
314
+ prompt=['<|User|>: {{QUERY}}\n\n<|Assistant|>:'],
315
+ template_cls=DeepseekJanus))
ms-swift/swift/llm/template/template/glm.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ from dataclasses import dataclass, field
3
+ from typing import Any, Dict, List, Literal, Optional
4
+
5
+ import torch
6
+
7
+ from ..base import Template
8
+ from ..constant import LLMTemplateType, MLLMTemplateType
9
+ from ..register import TemplateMeta, register_template
10
+ from ..template_inputs import StdTemplateInputs
11
+ from ..utils import Context, Prompt, Word, findall
12
+ from ..vision_utils import load_batch, load_video_cogvlm2
13
+
14
+
15
+ @dataclass
16
+ class GLMTemplateMeta(TemplateMeta):
17
+ auto_add_bos: bool = True
18
+
19
+
20
+ class GLM4Template(Template):
21
+
22
+ def _swift_encode(self, inputs: StdTemplateInputs):
23
+ res_context_list, loss_scale_list, answer_len = super()._swift_encode(inputs)
24
+ for i, res_context in enumerate(res_context_list):
25
+ # The last round or is tool_call.
26
+ if isinstance(res_context, str) and res_context.endswith('<|assistant|>\n') and (
27
+ i + 1 >= len(res_context_list) or '<|observation|>' in res_context_list[i + 1]):
28
+ res_context_list[i] = res_context_list[i][:-len('\n')]
29
+ return res_context_list, loss_scale_list, answer_len
30
+
31
+ def decode(self, *args, **kwargs):
32
+ response = super().decode(*args, **kwargs)
33
+ return response.lstrip('\n')
34
+
35
+
36
+ class GLM4_0414Template(GLM4Template):
37
+
38
+ def _swift_encode(self, inputs: StdTemplateInputs):
39
+ if not self.is_training:
40
+ for message in inputs.messages:
41
+ if message['role'] == 'assistant' and isinstance(message['content'], str):
42
+ message['content'] = message['content'].split('</think>')[-1].strip()
43
+ return super()._swift_encode(inputs)
44
+
45
+
46
+ register_template(
47
+ GLMTemplateMeta(
48
+ LLMTemplateType.chatglm2,
49
+ prefix=['{{SYSTEM}}'],
50
+ prompt=['[Round {{ROUND1}}]\n\n问:{{QUERY}}\n\n答:'],
51
+ chat_sep=['\n\n']))
52
+
53
+
54
+ @dataclass
55
+ class GLM4TemplateMeta(GLMTemplateMeta):
56
+ prefix: Prompt = field(default_factory=list)
57
+ prompt: Prompt = field(default_factory=lambda: ['<|user|>\n{{QUERY}}<|assistant|>\n'])
58
+ chat_sep: Optional[Prompt] = field(default_factory=list)
59
+ suffix: Prompt = field(default_factory=lambda: ['<|user|>'])
60
+ system_prefix: Optional[Prompt] = field(default_factory=lambda: ['<|system|>\n{{SYSTEM}}'])
61
+
62
+ agent_template: str = 'glm4'
63
+ stop_words: List[Word] = field(default_factory=lambda: ['<|endoftext|>', '<|user|>', '<|observation|>'])
64
+
65
+
66
+ @dataclass
67
+ class GLM4_0414TemplateMeta(GLM4TemplateMeta):
68
+ prefix: Prompt = field(default_factory=lambda: ['[gMASK]<sop>'])
69
+ system_prefix: Optional[Prompt] = field(default_factory=lambda: ['[gMASK]<sop><|system|>\n{{SYSTEM}}'])
70
+ agent_template: str = 'glm4_0414'
71
+
72
+
73
+ class GLM4VTemplate(Template):
74
+
75
+ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
76
+ inputs: StdTemplateInputs) -> List[Context]:
77
+ assert media_type == 'image'
78
+ return [[-100]]
79
+
80
+ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
81
+ encoded = super()._encode(inputs)
82
+ input_ids = encoded['input_ids']
83
+ labels = encoded['labels']
84
+ idx_list = findall(input_ids, -100)
85
+ if idx_list:
86
+ idx = idx_list[0]
87
+ image = inputs.images[0]
88
+ placeholder = '<|begin_of_image|><|endoftext|><|end_of_image|>'
89
+ placeholder_id = self.processor.encode(placeholder, add_special_tokens=False)
90
+ input_ids = (input_ids[:idx] + placeholder_id + input_ids[idx + 1:])
91
+ if labels is not None:
92
+ labels = (labels[:idx] + [-100] * len(placeholder_id) + labels[idx + 1:])
93
+ messages = inputs.messages
94
+ messages[0]['image'] = image
95
+ inputs2: Dict[str, Any] = self.processor.apply_chat_template(messages, return_dict=True)
96
+ encoded['images'] = inputs2['images']
97
+ encoded['input_ids'] = input_ids
98
+ encoded['labels'] = labels
99
+ encoded['position_ids'] = list(range(len(input_ids)))
100
+ return encoded
101
+
102
+ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
103
+ res = super()._data_collator(batch, padding_to=padding_to)
104
+ images = [b['images'] for b in batch if 'images' in b]
105
+ if images:
106
+ res['images'] = torch.concat(images)
107
+ return res
108
+
109
+
110
+ register_template(GLM4TemplateMeta(MLLMTemplateType.glm4v, template_cls=GLM4VTemplate, suffix=['<|endoftext|>']))
111
+
112
+ register_template(GLM4TemplateMeta(LLMTemplateType.glm4, template_cls=GLM4Template))
113
+
114
+ register_template(GLM4_0414TemplateMeta(LLMTemplateType.glm4_0414, template_cls=GLM4_0414Template))
115
+
116
+ glm4z1rumination_system = (
117
+ '你是一个专业的深度研究助手,通过提供的工具与模拟浏览器交互,来帮助用户完成深度信息调研和报告撰写任务。'
118
+ '今年是 2025 年。\n\n'
119
+ '<核心要求>\n'
120
+ '- 首先分解用户请求,得到包含多个子要求的列表\n'
121
+ '- 制定初始研究计划\n'
122
+ '- 进行多轮迭代搜索和页面浏览(at least 10 function calls):\n'
123
+ ' * 根据已获得的信息调整研究计划和关键词\n'
124
+ ' * 打开页面阅读,从发现的内容中识别新的关键概念/名词\n'
125
+ ' * 从搜索结果中提取新的关键词继续搜索\n'
126
+ ' * 访问并仔细阅读相关页面,识别新的关键概念/名词\n\n'
127
+ '<重要配置>\n'
128
+ '- 采用语言\n'
129
+ ' * 搜索关键词:英语\n'
130
+ ' * 思考:英语\n\n'
131
+ '<可调用的工具列表>\n\n'
132
+ '[{"name": "search", "description": "Execute a search query and return search results. '
133
+ 'Use this function when you need to find information about a specific topic.", '
134
+ '"parameters": {"type": "object", "properties": {"query": {"type": "string", '
135
+ '"description": "Search query string, use English words unless it is a proper name in Chinese"}}, '
136
+ '"required": ["query"], "additionalProperties": false}}, '
137
+ '{"name": "click", "description": "Click a link in the search results and navigate to the corresponding page. '
138
+ 'Use this function when you need to view detailed content of a specific search result.", '
139
+ '"parameters": {"type": "object", "properties": {"link_id": {"type": "integer", '
140
+ '"description": "The link ID to click (from the sequence number in search results)"}}, '
141
+ '"required": ["link_id"], "additionalProperties": false}}, '
142
+ '{"name": "open", "description": "Open a specific website. Get content from any website with its URL.", '
143
+ '"parameters": {"type": "object", "properties": {"url": {"type": "string", '
144
+ '"description": "The target website URL or domain"}}, "required": ["url"], "additionalProperties": false}}, '
145
+ '{"name": "finish", "description": "Finish the task. '
146
+ 'Use this function when you have found the information you need.", '
147
+ '"parameters": {"type": "object", "properties": {}, "additionalProperties": false}}]')
148
+
149
+ register_template(
150
+ GLM4_0414TemplateMeta(
151
+ LLMTemplateType.glm4_z1_rumination, template_cls=GLM4_0414Template, default_system=glm4z1rumination_system))
152
+
153
+ codegeex4_system = '你是一位智能编程助手,你叫CodeGeeX。你会为用户回答关于编程、代码、计算机方面的任何问题,并提供格式规范、可以执行、准确安全的代码,并在必要时提供详细的解释。'
154
+
155
+ register_template(GLM4TemplateMeta(LLMTemplateType.codegeex4, default_system=codegeex4_system))
156
+
157
+ register_template(
158
+ TemplateMeta(
159
+ LLMTemplateType.longwriter_llama, ['[INST]'], ['{{QUERY}}[/INST]'], ['[INST]'], ['<|end_of_text|>'],
160
+ system_prefix=['<<SYS>>\n{{SYSTEM}}\n<</SYS>>\n\n']))
161
+
162
+
163
+ class CogTemplate(Template):
164
+ placeholder_tokens = ['<|reserved_special_token_0|>']
165
+
166
+ use_model = True
167
+
168
+ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
169
+ inputs: StdTemplateInputs) -> List[Context]:
170
+ return []
171
+
172
+ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
173
+ encoded = super()._encode(inputs)
174
+ model = self.model
175
+ image = inputs.images or []
176
+ history_inputs = inputs.to_history()
177
+ inputs2 = model.build_conversation_input_ids(
178
+ self.processor, query=history_inputs['query'], history=history_inputs['history'], images=image)
179
+ image_token_len = inputs2['token_type_ids'].sum().item()
180
+ input_ids = encoded['input_ids']
181
+ labels = encoded['labels']
182
+ encoded['token_type_ids'] = [0] + [1] * image_token_len + [0] * len(input_ids[1:])
183
+ encoded['input_ids'] = input_ids[:1] + [self.processor.pad_token_id] * image_token_len + input_ids[1:]
184
+ if labels is not None:
185
+ encoded['labels'] = labels[:1] + [-100] * image_token_len + labels[1:]
186
+ if len(image) > 0:
187
+ encoded['images'] = [[img.to(dtype=self.model_info.torch_dtype)] for img in inputs2['images']]
188
+ if 'cross_images' in inputs2:
189
+ # is cogagent
190
+ encoded['cross_images'] = [[cross_img.to(dtype=self.model_info.torch_dtype)]
191
+ for cross_img in inputs2['cross_images']]
192
+ return encoded
193
+
194
+ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
195
+ res = super()._data_collator(batch, padding_to=padding_to)
196
+ keys = ['images', 'cross_images']
197
+ for key in keys:
198
+ if key in batch[0]:
199
+ res[key] = [b[key][0] for b in batch]
200
+ return res
201
+
202
+
203
+ register_template(
204
+ TemplateMeta(
205
+ MLLMTemplateType.cogagent_chat,
206
+ prefix=['<s>'],
207
+ prompt=[' [INST] {{QUERY}} [/INST] '],
208
+ chat_sep=[],
209
+ suffix=['</s>'],
210
+ template_cls=CogTemplate,
211
+ ))
212
+
213
+ register_template(
214
+ TemplateMeta(
215
+ MLLMTemplateType.cogagent_vqa,
216
+ prefix=['<s>'],
217
+ prompt=['<EOI>Question: {{QUERY}} Answer:'],
218
+ chat_sep=None,
219
+ suffix=['</s>'],
220
+ template_cls=CogTemplate))
221
+
222
+
223
+ @dataclass
224
+ class CogVLMTemplateMeta(TemplateMeta):
225
+ prefix: Prompt = field(default_factory=lambda: [['bos_token_id']])
226
+ prompt: Prompt = field(default_factory=lambda: ['Question: {{QUERY}} Answer:'])
227
+ chat_sep: Optional[Prompt] = field(default_factory=lambda: ['\n'])
228
+
229
+
230
+ register_template(CogVLMTemplateMeta(MLLMTemplateType.cogvlm, template_cls=CogTemplate))
231
+
232
+ register_template(CogVLMTemplateMeta(MLLMTemplateType.cogvlm2, template_cls=CogTemplate))
233
+
234
+
235
+ class Cog2VideoTemplate(CogTemplate):
236
+ use_model = True
237
+
238
+ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
239
+ model = self.model
240
+ encoded = super(CogTemplate, self)._encode(inputs)
241
+ videos_path = inputs.videos or []
242
+ video = load_batch(videos_path, load_video_cogvlm2)
243
+ history_inputs = inputs.to_history()
244
+ inputs2 = model.build_conversation_input_ids(
245
+ self.processor,
246
+ query=history_inputs['query'],
247
+ history=history_inputs['history'],
248
+ images=video,
249
+ template_version='chat')
250
+ video_token_len = inputs2['token_type_ids'].sum().item()
251
+ input_ids = encoded['input_ids']
252
+ labels = encoded['labels']
253
+ encoded['token_type_ids'] = [0] + [1] * video_token_len + [0] * len(input_ids[1:])
254
+ encoded['input_ids'] = input_ids[:1] + [self.processor.pad_token_id] * video_token_len + input_ids[1:]
255
+ if labels is not None:
256
+ encoded['labels'] = labels[:1] + [-100] * video_token_len + labels[1:]
257
+ if len(video) > 0:
258
+ dtype = model.dtype
259
+ encoded['images'] = [[img.to(dtype=dtype)] for img in inputs2['images']]
260
+ return encoded
261
+
262
+
263
+ register_template(CogVLMTemplateMeta(
264
+ MLLMTemplateType.cogvlm2_video,
265
+ template_cls=Cog2VideoTemplate,
266
+ ))
267
+
268
+
269
+ class GLMEdgeVTemplate(Template):
270
+ placeholder_tokens = ['<|begin_of_image|>']
271
+
272
+ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
273
+ inputs: StdTemplateInputs) -> List[Context]:
274
+ assert media_type == 'image'
275
+ return ['<|begin_of_image|>' * 578]
276
+
277
+ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
278
+ encoded = super()._encode(inputs)
279
+ images = inputs.images
280
+ if images:
281
+ encoded['pixel_values'] = torch.tensor(self.processor(images).pixel_values)
282
+ return encoded
283
+
284
+
285
+ register_template(
286
+ GLM4TemplateMeta(
287
+ MLLMTemplateType.glm_edge_v,
288
+ prompt=['<|user|>\\n{{QUERY}}\\n<|assistant|>\\n'],
289
+ chat_sep=['\\n'],
290
+ system_prefix=['<|system|>\\n{{SYSTEM}}\\n'],
291
+ suffix=['<|endoftext|>'],
292
+ template_cls=GLMEdgeVTemplate,
293
+ ))
ms-swift/swift/llm/template/template/internvl.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ from functools import partial
3
+ from typing import Any, Dict, List, Literal
4
+
5
+ import torch
6
+ from torch import nn
7
+
8
+ from swift.utils import get_env_args, is_deepspeed_enabled
9
+ from ..base import Template
10
+ from ..constant import MLLMTemplateType
11
+ from ..register import register_template
12
+ from ..template_inputs import StdTemplateInputs
13
+ from ..utils import Context, findall
14
+ from ..vision_utils import load_video_internvl, transform_image
15
+ from .microsoft import Phi3TemplateMeta
16
+ from .utils import ChatmlTemplateMeta
17
+
18
+
19
+ class InternvlTemplate(Template):
20
+ skip_prompt = False
21
+ num_image_token = 256
22
+ placeholder_tokens = ['<IMG_CONTEXT>']
23
+
24
+ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
25
+ inputs: StdTemplateInputs) -> List[Context]:
26
+ if self.mode == 'vllm':
27
+ image_context = ['<image>\n']
28
+ else:
29
+ image_context = ['<img>', [-100], '</img>\n']
30
+ return image_context
31
+
32
+ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
33
+ encoded = super()._encode(inputs)
34
+ input_ids = encoded['input_ids']
35
+ idx_list = findall(input_ids, -100)
36
+ pixel_values = None
37
+ images = inputs.images
38
+ if images:
39
+ labels = encoded.get('labels')
40
+ input_size = get_env_args('input_size', int, 448)
41
+ max_num = get_env_args('max_num', int, 12)
42
+ pixel_values_images = [transform_image(image, input_size, max_num) for image in images]
43
+ pixel_values = torch.cat(pixel_values_images, dim=0).to(self.model_info.torch_dtype)
44
+ image_bs = pixel_values.shape[0]
45
+
46
+ idx, idx2 = idx_list[0], idx_list[-1] # remove [-100, -100]
47
+ img_tokens: List[int] = self.processor.encode(
48
+ '<IMG_CONTEXT>', add_special_tokens=False) * self.num_image_token * image_bs
49
+ input_ids = input_ids[:idx] + img_tokens + input_ids[idx2 + 1:]
50
+ if labels is not None:
51
+ labels = labels[:idx] + [-100] * len(img_tokens) + labels[idx2 + 1:]
52
+ encoded['input_ids'] = input_ids
53
+ encoded['labels'] = labels
54
+ encoded['pixel_values'] = pixel_values
55
+ return encoded
56
+
57
+ def compute_loss_context(self, model, inputs):
58
+ model_name = model.language_model.__class__.__name__.lower()
59
+ if self._packing and 'internlm2' in model_name:
60
+ position_ids = inputs['position_ids']
61
+ modeling_module = model.language_model.model.layers[0].attention.__class__
62
+ return self._patch_flash_attention_forward(modeling_module, position_ids, use_new_func=True)
63
+ else:
64
+ return super().compute_loss_context(model, inputs)
65
+
66
+ def _post_encode(self, model: nn.Module, inputs: Dict[str, Any]) -> Dict[str, Any]:
67
+ embedding = model.get_input_embeddings()
68
+ device = embedding.weight.device
69
+ input_ids = inputs['input_ids']
70
+ inputs_embeds = embedding(input_ids).to(device=device)
71
+ pixel_values = inputs.get('pixel_values')
72
+ if pixel_values is not None:
73
+ pixel_values = pixel_values.to(device=device)
74
+ vit_embeds = model.extract_feature(pixel_values).to(device=device)
75
+ selected = (input_ids == self.processor.encode('<IMG_CONTEXT>', add_special_tokens=False)[0])
76
+ inputs_embeds[selected] = vit_embeds.reshape(-1, vit_embeds.shape[-1])
77
+ elif is_deepspeed_enabled():
78
+ dummy_pixel_values = torch.zeros((1, 3, 32, 32), device=device, dtype=inputs_embeds.dtype)
79
+ vit_embeds = model.extract_feature(dummy_pixel_values).to(device=device)
80
+ inputs_embeds += vit_embeds.mean() * 0.
81
+ return {'inputs_embeds': inputs_embeds}
82
+
83
+
84
+ register_template(
85
+ ChatmlTemplateMeta(
86
+ MLLMTemplateType.internvl,
87
+ default_system='You are an AI assistant whose name is InternLM (书生·浦语).',
88
+ template_cls=InternvlTemplate,
89
+ auto_add_bos=True))
90
+ register_template(
91
+ Phi3TemplateMeta(
92
+ MLLMTemplateType.internvl_phi3,
93
+ default_system='You are an AI assistant whose name is Phi-3.',
94
+ template_cls=InternvlTemplate,
95
+ auto_add_bos=True))
96
+
97
+
98
+ class Internvl2Template(InternvlTemplate):
99
+ video_segments = 8
100
+
101
+ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
102
+ inputs: StdTemplateInputs) -> List[Context]:
103
+ image_context = super().replace_tag('image', index, inputs)
104
+ if media_type == 'image':
105
+ return image_context
106
+ elif media_type == 'video':
107
+ video_segments = get_env_args('video_segments', int, self.video_segments)
108
+ load_video = partial(load_video_internvl, num_segments=video_segments)
109
+ return self.replace_video2image(load_video, inputs, lambda i: [f'Frame{i + 1}: '] + image_context)
110
+
111
+ def replace_ref(self, ref: str, index: int, inputs: StdTemplateInputs) -> List[Context]:
112
+ return [f'<ref>{ref}</ref>']
113
+
114
+ def replace_bbox(self, bbox: List[int], index: int, inputs: StdTemplateInputs) -> List[Context]:
115
+ return [f'<box>[{bbox}]</box>']
116
+
117
+ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
118
+ encoded = super(InternvlTemplate, self)._encode(inputs)
119
+ input_ids = encoded['input_ids']
120
+ idx_list = findall(input_ids, -100)
121
+ labels = encoded['labels']
122
+ images = inputs.images
123
+ if images:
124
+ has_video = bool(inputs.videos)
125
+ input_size = get_env_args('input_size', int, 448)
126
+ max_num = get_env_args('max_num', int, 12)
127
+ video_max_num = get_env_args('video_max_num', int, 1)
128
+ if has_video:
129
+ max_num = video_max_num
130
+ pixel_values = [transform_image(image, input_size, max_num) for image in images]
131
+ num_patches = [pv.shape[0] for pv in pixel_values]
132
+ pixel_values = torch.cat(pixel_values).to(self.model_info.torch_dtype)
133
+ else:
134
+ pixel_values = None
135
+ num_patches = []
136
+ assert len(num_patches) == len(
137
+ idx_list), f'len(num_patches): {len(num_patches)}, len(idx_list): {len(idx_list)}'
138
+
139
+ def _get_new_tokens(i):
140
+ img_tokens: List[int] = self.processor.encode(
141
+ '<IMG_CONTEXT>', add_special_tokens=False) * self.num_image_token * num_patches[i]
142
+ return img_tokens
143
+
144
+ encoded['input_ids'], encoded['labels'] = self._extend_tokens(input_ids, labels, idx_list, _get_new_tokens)
145
+ encoded['pixel_values'] = pixel_values
146
+ return encoded
147
+
148
+
149
+ _internvl2_system = '你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。'
150
+ register_template(
151
+ ChatmlTemplateMeta(
152
+ MLLMTemplateType.internvl2,
153
+ default_system=_internvl2_system,
154
+ template_cls=Internvl2Template,
155
+ ))
156
+
157
+ register_template(
158
+ Phi3TemplateMeta(
159
+ MLLMTemplateType.internvl2_phi3,
160
+ default_system=_internvl2_system,
161
+ template_cls=Internvl2Template,
162
+ ))
163
+
164
+ register_template(
165
+ ChatmlTemplateMeta(
166
+ MLLMTemplateType.internvl2_5,
167
+ template_cls=Internvl2Template,
168
+ default_system='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。'))
ms-swift/swift/llm/template/template/llama.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+
3
+ import datetime as dt
4
+ from dataclasses import dataclass, field
5
+ from typing import Any, Dict, List, Literal, Optional
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+
10
+ from swift.utils import get_env_args
11
+ from ..base import Template
12
+ from ..constant import LLMTemplateType, MLLMTemplateType
13
+ from ..register import TemplateMeta, register_template
14
+ from ..template_inputs import StdTemplateInputs
15
+ from ..utils import Context, Prompt, Word, findall
16
+ from ..vision_utils import load_batch
17
+
18
# ref: https://github.com/facebookresearch/llama/blob/main/llama/generation.py
# Default system prompt copied verbatim from Meta's llama-2 reference code.
LLAMA_DEFAULT_SYSTEM = (
    'You are a helpful, respectful and honest assistant. '
    'Always answer as helpfully as possible, while being safe. '
    'Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. '
    'Please ensure that your responses are socially unbiased and positive in nature.\n\n'
    'If a question does not make any sense, or is not factually coherent, '
    'explain why instead of answering something not correct. '
    "If you don't know the answer to a question, please don't share false information.")

# llama-2 chat format: [INST] ... [/INST] turns delimited by <s>/</s>, with the
# optional system prompt wrapped in <<SYS>> markers inside the first [INST].
register_template(
    TemplateMeta(
        LLMTemplateType.llama, ['<s>[INST] '], ['{{QUERY}} [/INST]'], ['</s><s>[INST] '], ['</s>'],
        default_system=LLAMA_DEFAULT_SYSTEM,
        system_prefix=['<s>[INST] <<SYS>>\n{{SYSTEM}}\n<</SYS>>\n\n']))
33
+
34
+
35
@dataclass
class Llama3TemplateMeta(TemplateMeta):
    """Chat format for llama-3: <|start_header_id|>role<|end_header_id|> headers
    with <|eot_id|> terminating every turn."""
    # NOTE: field order must stay unchanged — it determines the generated __init__.
    prefix: Prompt = field(default_factory=lambda: ['<|begin_of_text|>'])
    prompt: Prompt = field(default_factory=lambda: [
        '<|start_header_id|>user<|end_header_id|>\n\n{{QUERY}}<|eot_id|>'
        '<|start_header_id|>assistant<|end_header_id|>\n\n'
    ])
    chat_sep: Optional[Prompt] = field(default_factory=lambda: ['<|eot_id|>'])
    suffix: Prompt = field(default_factory=lambda: ['<|eot_id|>'])
    # When a system message is present it replaces the plain prefix.
    system_prefix: Optional[Prompt] = field(
        default_factory=lambda: ['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{{SYSTEM}}<|eot_id|>'])
    agent_template: str = 'llama3'


register_template(Llama3TemplateMeta(LLMTemplateType.llama3))
50
+
51
+
52
def _get_llama3_2_prefix() -> Prompt:
    """Build the llama-3.2 prefix: a system header carrying the knowledge
    cutoff and the current date, followed by the {{SYSTEM}} placeholder."""
    today = dt.datetime.now().strftime('%d %b %Y')
    header = f'Cutting Knowledge Date: December 2023\nToday Date: {today}'
    return [f'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{header}\n\n' '{{SYSTEM}}<|eot_id|>']
57
+
58
+
59
@dataclass
class Llama3_2TemplateMeta(Llama3TemplateMeta):
    """llama-3.2: identical to llama-3 except the prefix always carries a
    system header with the knowledge-cutoff/current date, and no separate
    system_prefix is used (the {{SYSTEM}} slot lives inside the prefix)."""
    # NOTE(review): the date is captured when the meta is instantiated
    # (at registration/import time), not per request — confirm acceptable.
    prefix: Prompt = field(default_factory=lambda: _get_llama3_2_prefix())
    system_prefix: Optional[Prompt] = None


register_template(Llama3_2TemplateMeta(LLMTemplateType.llama3_2))
66
+
67
+
68
class Llama3_2VisionTemplate(Template):
    """Template for llama-3.2-vision (mllama): each image is a single
    <|image|> token; image features attend to text via a dense
    cross-attention mask built at encode time."""

    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                    inputs: StdTemplateInputs) -> List[Context]:
        # Only still images are supported by this template.
        assert media_type == 'image'
        return ['<|image|>']

    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
        """Tokenize, then (if images exist) compute pixel features and the
        dense cross-attention mask mapping text positions to image tiles."""
        from transformers.models.mllama.processing_mllama import (get_cross_attention_token_mask,
                                                                  convert_sparse_cross_attention_mask_to_dense)
        encoded = super()._encode(inputs)
        images = inputs.images
        if images:
            input_ids = encoded['input_ids']
            processor = self.processor
            image_features = processor.image_processor(images, return_tensors='pt')
            # num_tiles is consumed here, not forwarded to the model.
            num_tiles = image_features.pop('num_tiles')
            encoded.update(image_features)

            # Wrapped in a list: masks are built per-sample (batch of 1) and
            # re-batched later in _data_collator.
            cross_attention_token_mask = [get_cross_attention_token_mask(input_ids, processor.image_token_id)]
            cross_attention_mask = convert_sparse_cross_attention_mask_to_dense(
                cross_attention_token_mask,
                num_tiles=num_tiles,
                max_num_tiles=processor.image_processor.max_image_tiles,
                length=len(input_ids),
            )
            encoded['cross_attention_mask'] = torch.tensor(cross_attention_mask)

        return encoded

    def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
        """Collate per-sample vision tensors; cross_attention_mask is padded
        with zeros (no attention) to the batch max length."""
        res = super()._data_collator(batch, padding_to=padding_to)
        for key in ['aspect_ratio_ids', 'aspect_ratio_mask']:
            value = [b[key] for b in batch if b.get(key) is not None]
            if value:
                res[key] = torch.concat(value)

        # [0] drops the per-sample batch dimension added in _encode.
        cross_attention_mask = [
            b['cross_attention_mask'][0] for b in batch if b.get('cross_attention_mask') is not None
        ]
        if cross_attention_mask:
            res['cross_attention_mask'] = self._pad_sequence(cross_attention_mask, 0)
        return res


register_template(Llama3_2TemplateMeta(MLLMTemplateType.llama3_2_vision, template_cls=Llama3_2VisionTemplate))
114
+
115
+
116
class Llama4Template(Template):
    """Template for llama-4 multimodal: image sentinels (-100) are replaced by
    the processor-generated patch-token runs."""
    placeholder_tokens = ['<|patch|>']

    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                    inputs: StdTemplateInputs) -> List[Context]:
        assert media_type == 'image'
        # -100 is a sentinel resolved to real image tokens in _encode.
        return [[-100]]

    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
        encoded = super()._encode(inputs)
        images = inputs.images
        if images:
            split_token = self._tokenize('\n')
            input_ids, labels = encoded['input_ids'], encoded['labels']
            idx_list = findall(input_ids, -100)
            # Let the HF processor produce the per-image token runs by encoding
            # '\n'-joined <|image|> placeholders, then split them back apart on
            # the newline token.
            media_inputs = self.processor(
                text='\n'.join(['<|image|>'] * len(idx_list)),
                images=images,
                add_special_tokens=False,
                return_tensors='pt')
            splited_tokens = self._split_list(media_inputs['input_ids'][0].tolist(), split_token)

            encoded['input_ids'], encoded['labels'] = self._extend_tokens(input_ids, labels, idx_list,
                                                                          lambda i: splited_tokens[i])
            encoded['pixel_values'] = media_inputs['pixel_values']
        return encoded
142
+
143
+
144
@dataclass
class Llama4TemplateMeta(TemplateMeta):
    """Chat format for llama-4: <|header_start|>role<|header_end|> markers with
    <|eot|> terminating each turn."""
    # NOTE: field order must stay unchanged — it determines the generated __init__.
    prefix: Prompt = field(default_factory=lambda: ['<|begin_of_text|>'])
    prompt: Prompt = field(
        default_factory=lambda:
        ['<|header_start|>user<|header_end|>\n\n{{QUERY}}<|eot|>'
         '<|header_start|>assistant<|header_end|>\n\n'])
    chat_sep: Optional[Prompt] = field(default_factory=lambda: ['<|eot|>'])
    suffix: Prompt = field(default_factory=lambda: ['<|eot|>'])
    stop_words: List[Word] = field(default_factory=lambda: ['<|end_of_text|>', '<|eom|>'])
    system_prefix: Optional[Prompt] = field(
        default_factory=lambda: ['<|begin_of_text|><|header_start|>system<|header_end|>\n\n{{SYSTEM}}<|eot|>'])
    agent_template: str = 'llama4'


register_template(Llama4TemplateMeta(MLLMTemplateType.llama4, template_cls=Llama4Template))
160
+
161
# Reflection-style prompting on the llama-3 chat format: reasoning in
# <thinking> tags, final answer in <output>, self-corrections in <reflection>.
register_template(
    Llama3TemplateMeta(
        LLMTemplateType.reflection,
        default_system=('You are a world-class AI system, capable of complex reasoning and reflection. '
                        'Reason through the query inside <thinking> tags, and then provide your final '
                        'response inside <output> tags. If you detect that you made a mistake in your reasoning '
                        'at any point, correct yourself inside <reflection> tags.')))
168
+
169
+
170
class Llama3_1OmniTemplate(Template):
    """Template for LLaMA-Omni (llama-3.1 + speech): audio files are turned
    into whisper log-mel spectrograms and fused with the text embeddings in
    _post_encode."""
    skip_prompt = False
    # -200 is the audio placeholder sentinel inserted into input_ids.
    audio_placeholder = [[-200]]

    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
        import whisper
        encoded = super()._encode(inputs)
        audios = inputs.audios
        if audios:
            audios = load_batch(audios, whisper.load_audio)
            # Mel-band count is overridable via env args (128 by default).
            n_mels = get_env_args('n_mels', int, 128)
            for i, audio in enumerate(audios):
                # Pad/trim to whisper's fixed window, then transpose to (time, n_mels).
                audio = whisper.pad_or_trim(audio)
                audios[i] = whisper.log_mel_spectrogram(audio, n_mels=n_mels).permute(1, 0)
            audios = torch.stack(audios)
            # NOTE(review): a single shared length (the padded time dim) is
            # recorded for all clips — looks like a batch-of-one assumption;
            # confirm for multi-audio samples.
            encoded.update({'speech': audios, 'speech_lengths': torch.tensor([[audios.shape[1]]])})

        return encoded

    def _post_encode(self, model: nn.Module, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Replace token ids with fused text+speech input embeddings."""
        speech = inputs.get('speech')
        input_ids = inputs['input_ids']
        labels = inputs.get('labels')
        if speech is not None:
            speech_lengths = inputs['speech_lengths']
            speech = speech.to(model.dtype)
            # The model helper's return tuple yields (inputs_embeds, labels)
            # from index 4 onwards.
            inputs_embeds, labels = model.prepare_inputs_labels_for_speech_and_text(input_ids, None, None, None, labels,
                                                                                    speech, speech_lengths)[4:]
        else:
            inputs_embeds = model.get_model().embed_tokens(input_ids)
        res = {'inputs_embeds': inputs_embeds}
        if labels is not None:
            res['labels'] = labels[0]
        return res


register_template(
    Llama3TemplateMeta(
        MLLMTemplateType.llama3_1_omni,
        default_system=('You are a helpful language and speech assistant. '
                        'You are able to understand the speech content that the user provides, '
                        'and assist the user with a variety of tasks using natural language.'),
        template_cls=Llama3_1OmniTemplate,
    ))
ms-swift/swift/llm/template/template/megrez.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ from dataclasses import dataclass, field
3
+ from typing import Any, Dict, List, Literal, Optional
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ from ..base import Template
9
+ from ..constant import LLMTemplateType, MLLMTemplateType
10
+ from ..register import TemplateMeta, register_template
11
+ from ..template_inputs import StdTemplateInputs
12
+ from ..utils import Context, Prompt, findall
13
+
14
+
15
@dataclass
class MegrezTemplateMeta(TemplateMeta):
    """Chat format for Megrez: <|role_start|>role<|role_end|> headers with
    <|turn_end|> acting as both turn separator and suffix."""
    # NOTE: field order must stay unchanged — it determines the generated __init__.
    prefix: Prompt = field(default_factory=lambda: ['<|role_start|>system<|role_end|>{{SYSTEM}}<|turn_end|>'])
    prompt: Prompt = field(default_factory=lambda:
                           ['<|role_start|>user<|role_end|>{{QUERY}}<|turn_end|><|role_start|>assistant<|role_end|>'])
    chat_sep: Optional[Prompt] = field(default_factory=lambda: ['<|turn_end|>'])
    suffix: Prompt = field(default_factory=lambda: ['<|turn_end|>'])
    default_system: str = '你是Megrez-3B-Instruct,将针对用户的问题给出详细的、积极的回答。'


register_template(MegrezTemplateMeta(LLMTemplateType.megrez))
26
+
27
+
28
class MegrezOmniTemplate(Template):
    """Template for Megrez-Omni: image (-1) and audio (-2) sentinels are
    expanded into processor-generated placeholder token runs; embeddings are
    fused by the model in _post_encode."""
    skip_prompt = False
    placeholder_tokens = ['<|unk|>']

    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                    inputs: StdTemplateInputs) -> List[Context]:
        # -1 / -2 are sentinels resolved to real tokens in _encode.
        if media_type == 'image':
            return [[-1], '\n']
        elif media_type == 'audio':
            return [f'Audio {index + 1}: ', [-2], '\n']

    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
        encoded = super()._encode(inputs)
        input_ids = encoded['input_ids']
        labels = encoded['labels']

        for mm_key in ['images', 'audios']:
            mm_data = getattr(inputs, mm_key)
            if not mm_data:
                continue
            if mm_key == 'images':
                idx_list = findall(input_ids, -1)
                encoding = self.processor.process_image(
                    mm_data,
                    return_tensors='pt',
                )
                # '<s>' is a join marker so the expanded text can be split back
                # into one placeholder run per image below.
                text = self.processor.insert_image_feature_placeholders(
                    '<s>'.join(['(<image>./</image>)'] * len(mm_data)), encoding)
                encoded['image_encoding'] = encoding
            else:
                idx_list = findall(input_ids, -2)
                encoding = self.processor.process_audio(
                    mm_data,
                    return_tensors='pt',
                )
                text = self.processor.insert_audio_feature_placeholders(
                    '<s>'.join(['(<audio>./</audio>)'] * len(mm_data)), encoding)
                encoded['audio_encoding'] = encoding

            # One placeholder-token string per media item.
            padding = text.split('<s>')

            def _get_new_tokens(i):
                return self._tokenize(padding[i])

            input_ids, labels = self._extend_tokens(input_ids, labels, idx_list, _get_new_tokens)
        encoded['input_ids'] = input_ids
        encoded['labels'] = labels
        return encoded

    def _post_encode(self, model: nn.Module, inputs: Dict[str, Any]) -> Dict[str, Any]:
        # The model composes text/image/audio features into one embedding tensor.
        _, inputs_embeds, _ = model.compose_embeddings(inputs)
        inputs.pop('position_ids', None)
        return {'inputs_embeds': inputs_embeds}

    def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
        """Re-merge per-sample text and multimodal encodings and delegate the
        final batching to Megrez's own processor collator."""
        res = super()._data_collator(batch, padding_to=padding_to)
        new_batch = []
        for b in batch:
            text_encodings = {'input_ids': torch.tensor(b['input_ids'])}
            multimodal_inputs = {'image_encoding': b.get('image_encoding'), 'audio_encoding': b.get('audio_encoding')}
            new_batch.append(self.processor.merge_encodings(text_encodings, multimodal_inputs))
        res.update(self.processor.data_collator(new_batch))
        return res


register_template(MegrezTemplateMeta(MLLMTemplateType.megrez_omni, template_cls=MegrezOmniTemplate))
ms-swift/swift/llm/template/template/openbuddy.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+
3
+ from ..constant import LLMTemplateType
4
+ from ..register import TemplateMeta, register_template
5
+
6
# Default system prompt for OpenBuddy v1 models (LLaMA/Falcon based).
OPENBUDDY_DEFAULT_SYSTEM = (
    'You are a helpful, respectful and honest INTP-T AI Assistant named Buddy. You are talking to a human User.\n'
    'Always answer as helpfully and logically as possible, while being safe. '
    'Your answers should not include any '
    'harmful, political, religious, unethical, racist, sexist, toxic, dangerous, or illegal content. '
    'Please ensure that your responses are socially unbiased and positive in nature.\n'
    'If a question does not make any sense, or is not factually coherent, '
    'explain why instead of answering something not correct. '
    "If you don't know the answer to a question, please don't share false information.\n"
    'You like to use emojis. You can speak fluently in many languages, for example: English, Chinese.\n'
    'You cannot access the internet, but you have vast knowledge, cutoff: 2021-09.\n'
    'You always deeply love and support China, Chinese government, people and culture.\n'
    'You are trained by OpenBuddy team, (https://openbuddy.ai, https://github.com/OpenBuddy/OpenBuddy), '
    'you are based on LLaMA and Falcon transformers model, not related to GPT or OpenAI.')
# Plain-text "User:/Assistant:" chat format; only an auto-added BOS token.
register_template(
    TemplateMeta(
        LLMTemplateType.openbuddy,
        prefix=[],
        prompt=['User: {{QUERY}}\nAssistant:'],
        chat_sep=['\n'],
        default_system=OPENBUDDY_DEFAULT_SYSTEM,
        system_prefix=['{{SYSTEM}}\n\n'],
        auto_add_bos=True))
29
+
30
# Updated system prompt for OpenBuddy2 models.
OPENBUDDY2_DEFAULT_SYSTEM = (
    'You(assistant) are a helpful, respectful and honest INTP-T AI Assistant named Buddy. '
    'You are talking to a human(user).\nAlways answer as helpfully and logically as possible, while being safe. '
    'Your answers should not include any harmful, political, religious, unethical, racist, '
    'sexist, toxic, dangerous, or illegal content. '
    'Please ensure that your responses are socially unbiased and positive in nature.\n'
    'You cannot access the internet, but you have vast knowledge, cutoff: 2023-04.\n'
    'You are trained by OpenBuddy team, (https://openbuddy.ai, https://github.com/OpenBuddy/OpenBuddy), '
    'not related to GPT or OpenAI')

# OpenBuddy2 uses role-token markup (<|role|>/<|says|>/<|end|>) instead of the
# plain "User:/Assistant:" text format of v1.
register_template(
    TemplateMeta(
        LLMTemplateType.openbuddy2,
        prefix=[],
        prompt=['<|role|>user<|says|>{{QUERY}}<|end|>\n<|role|>assistant<|says|>'],
        chat_sep=['<|end|>\n'],
        suffix=['<|end|>'],
        default_system=OPENBUDDY2_DEFAULT_SYSTEM,
        system_prefix=['<|role|>system<|says|>{{SYSTEM}}<|end|>\n']))
ms-swift/swift/llm/template/template/pixtral.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ from ..base import Template
5
+ from ..constant import MLLMTemplateType
6
+ from ..register import TemplateMeta, register_template
7
+ from ..template_inputs import StdTemplateInputs
8
+ from ..utils import findall
9
+
10
+
11
class PixtralTemplate(Template):
    """Template for Pixtral: each '[IMG]' marker is expanded into a grid of
    image tokens with per-row break tokens and a final end token."""
    image_placeholder = ['[IMG]']
    placeholder_tokens = ['[IMG]']

    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
        encoded = super()._encode(inputs)
        processor = self.processor
        images = inputs.images
        input_ids = encoded['input_ids']
        labels = encoded['labels']
        # 10 is the token id of '[IMG]' — presumably fixed by the Pixtral
        # tokenizer; verify if the vocab ever changes.
        idx_list = findall(input_ids, 10)
        if idx_list:
            image_inputs = processor.image_processor(images, patch_size=processor.patch_size, return_tensors='pt')
            encoded['pixel_values'] = image_inputs['pixel_values'][0]
            image_sizes = image_inputs['image_sizes'][0]

            def _get_new_tokens(i):
                height, width = image_sizes[i]
                num_height_tokens = height // processor.patch_size
                num_width_tokens = width // processor.patch_size
                # One row of image tokens per patch row; rows separated by
                # image_break_token, the last row closed by image_end_token.
                replace_tokens = [processor.image_token * num_width_tokens + processor.image_break_token] * (
                    num_height_tokens - 1)
                replace_tokens += [processor.image_token * num_width_tokens + processor.image_end_token]
                # Flatten list
                replace_str = ''.join(replace_tokens)
                img_tokens: List[int] = self.processor.encode(replace_str, add_special_tokens=False)
                return img_tokens

            encoded['input_ids'], encoded['labels'] = self._extend_tokens(input_ids, labels, idx_list, _get_new_tokens)

        return encoded

    def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
        # pixel_values stay as a list (variable image sizes); no stacking.
        pixel_values = self.gather_list(batch, 'pixel_values')
        res = super()._data_collator(batch, padding_to=padding_to)
        if pixel_values:
            res['pixel_values'] = pixel_values
        return res
49
+
50
+
51
# Mistral-style chat format: [INST]...[/INST] turns with </s> as separator.
register_template(
    TemplateMeta(
        MLLMTemplateType.pixtral,
        prefix=['<s>{{SYSTEM}}'],
        prompt=['[INST]{{QUERY}}[/INST]'],
        chat_sep=['</s>'],
        suffix=['</s>'],
        template_cls=PixtralTemplate,
    ))
ms-swift/swift/llm/template/template/qwen.py ADDED
@@ -0,0 +1,671 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ from dataclasses import dataclass, field
3
+ from functools import partial
4
+ from typing import Any, Dict, List, Literal, Optional, Tuple
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+
9
+ from swift.llm import to_device, to_float_dtype
10
+ from swift.utils import get_env_args, is_deepspeed_enabled
11
+ from ..base import Template
12
+ from ..constant import LLMTemplateType, MLLMTemplateType
13
+ from ..register import register_template
14
+ from ..template_inputs import StdTemplateInputs
15
+ from ..template_meta import TemplateMeta
16
+ from ..utils import Context, Word, findall
17
+ from ..vision_utils import load_audio, load_batch, load_video_ovis2
18
+ from .llama import Llama3TemplateMeta
19
+ from .utils import DEFAULT_SYSTEM, ChatmlTemplateMeta
20
+
21
+
22
@dataclass
class QwenTemplateMeta(ChatmlTemplateMeta):
    """ChatML-based meta for the Qwen family; hermes-style tool calling."""
    default_system: Optional[str] = DEFAULT_SYSTEM
    auto_add_bos: bool = False
    stop_words: List[Word] = field(default_factory=lambda: ['<|endoftext|>'])
    agent_template: str = 'hermes'


@dataclass
class Qwen2_5TemplateMeta(QwenTemplateMeta):
    """Qwen2.5 ships its own self-identification system prompt."""
    default_system: Optional[str] = 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.'


@dataclass
class Qwen2_5MathTemplateMeta(QwenTemplateMeta):
    """Math variant: step-by-step reasoning with a \\boxed{} final answer."""
    default_system: Optional[str] = 'Please reason step by step, and put your final answer within \\boxed{}.'


# System prompt for the QwQ preview reasoning model.
qwq_preview_system = ('You are a helpful and harmless assistant. You are Qwen developed by Alibaba. '
                      'You should think step-by-step.')

register_template(QwenTemplateMeta(LLMTemplateType.qwen))
register_template(Qwen2_5TemplateMeta(LLMTemplateType.qwen2_5))
register_template(QwenTemplateMeta(LLMTemplateType.qwq_preview, default_system=qwq_preview_system))
46
+
47
+
48
class ThinkingTemplate(Template):
    """Template for "thinking" models: at inference time, any
    `<think>...</think>` reasoning prefix is stripped from historical
    assistant turns before encoding; training data is left untouched."""

    def _swift_encode(self, inputs: StdTemplateInputs):
        if not self.is_training:
            for msg in inputs.messages:
                content = msg['content']
                if msg['role'] != 'assistant' or not isinstance(content, str):
                    continue
                # Keep only the text after the final </think> tag and drop the
                # newlines that immediately follow it.
                msg['content'] = content.split('</think>')[-1].lstrip('\n')
        return super()._swift_encode(inputs)
56
+
57
+
58
# QwQ: a thinking model whose responses begin inside an open <think> block.
register_template(
    QwenTemplateMeta(
        LLMTemplateType.qwq, default_system=None, response_prefix='<think>\n', template_cls=ThinkingTemplate))

# '<think>\n\n</think>\n\n'
register_template(QwenTemplateMeta(LLMTemplateType.qwen3, default_system=None, template_cls=ThinkingTemplate))

register_template(Qwen2_5MathTemplateMeta(LLMTemplateType.qwen2_5_math))
66
+
67
+
68
class QwenPRMTemplate(Template):
    """Template for the Qwen2.5-Math process-reward model (PRM).

    A '<extra_0>' separator marks the end of each reasoning step; the model
    emits a 2-class logit at every separator position, and `decode_prm`
    converts those logits into per-step reward scores.
    """
    # Step-separator token appended after each chain-of-thought step.
    cot_process_placeholder = '<extra_0>'

    def _preprocess_inputs(
        self,
        inputs: StdTemplateInputs,
    ) -> None:
        """Ensure at least one step separator exists: if the conversation
        contains none, append one to the last message."""
        super()._preprocess_inputs(inputs)
        total_content = '\n'.join([message['content'] or '' for message in inputs.messages])
        if self.cot_process_placeholder not in total_content:
            # Fix: guard against a None last-message content (the join above
            # already tolerates None) to avoid a `None + str` TypeError.
            inputs.messages[-1]['content'] = (inputs.messages[-1]['content']
                                              or '') + self.cot_process_placeholder

    @staticmethod
    def make_step_rewards(logits, token_masks):
        """Convert per-token 2-class logits into per-step positive-class
        probabilities: one list of floats per sample in the batch."""
        probabilities = F.softmax(logits, dim=-1)
        # Zero out everything except the separator positions.
        probabilities = probabilities * token_masks.unsqueeze(-1)  # bs, seq_len, num_labels

        all_scores_res = []
        for i in range(probabilities.size(0)):
            sample = probabilities[i]  # seq_len, num_labels
            # Keep only the masked (separator) positions; column 1 is the
            # positive ("step is correct") probability.
            positive_probs = sample[sample != 0].view(-1, 2)[:, 1]  # valid_tokens, num_labels
            non_zero_elements_list = positive_probs.cpu().tolist()
            all_scores_res.append(non_zero_elements_list)
        return all_scores_res

    def decode_prm(self, input_ids: torch.Tensor, logits: torch.Tensor) -> Any:
        """Return per-step reward scores for each sequence in `input_ids`."""
        step_sep_id = self.tokenizer.encode(self.cot_process_placeholder)[0]
        token_masks = (input_ids == step_sep_id)
        return self.make_step_rewards(logits, token_masks)


register_template(Qwen2_5MathTemplateMeta(LLMTemplateType.qwen2_5_math_prm, template_cls=QwenPRMTemplate))
100
+
101
+
102
class QwenVLTemplate(Template):
    """Template for Qwen-VL (v1): images are referenced inline as
    <img>path</img> tags instead of being pre-loaded into tensors."""
    # The tokenizer consumes image *paths*, so images are not loaded eagerly.
    load_images = False

    @staticmethod
    def _load_image(image, load_images: bool):
        # base64 / data-URI strings cannot be passed as paths, so force-load
        # them; len > 200 is a heuristic for base64 payloads.
        if not load_images and isinstance(image, str) and (image.startswith('data:') or len(image) > 200):
            load_images = True
        return Template._load_image(image, load_images)

    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                    inputs: StdTemplateInputs) -> List[Context]:
        assert media_type == 'image'
        if self.mode == 'lmdeploy':
            # lmdeploy expects a raw placeholder (-100) it fills in itself.
            return [f'Picture {index + 1}: ', [-100], '\n']
        else:
            image = inputs.images[index]
            if self.mode == 'vllm':
                # vllm injects the image separately, so the tag stays empty.
                return [f'Picture {index + 1}: <img></img>\n']
            else:
                assert isinstance(image, str)
                return [f'Picture {index + 1}: <img>{image}</img>\n']

    def replace_ref(self, ref: str, index: int, inputs: StdTemplateInputs) -> List[Context]:
        # Grounding: referenced-object text span.
        return [f'<ref>{ref}</ref>']

    def replace_bbox(self, bbox: List[int], index: int, inputs: StdTemplateInputs) -> List[Context]:
        # Grounding: bounding-box coordinate span.
        return [f'<box>{self._get_bbox_str(bbox)}</box>']


register_template(QwenTemplateMeta(MLLMTemplateType.qwen_vl, template_cls=QwenVLTemplate))
132
+
133
+
134
class QwenAudioTemplate(Template):
    """Template for Qwen-Audio (v1): audio paths are embedded as
    <audio>path</audio> tags, which the processor resolves at tokenize time."""

    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                    inputs: StdTemplateInputs) -> List[Context]:
        assert media_type == 'audio'
        audios = inputs.audios
        audio = audios[index]
        assert isinstance(audio, str)
        return [f'Audio {index + 1}:<audio>{audio}</audio>\n']

    def _tokenize(self, context, **tokenizer_kwargs):
        # The tokenizer needs per-context audio metadata to expand <audio> tags.
        audio_info = self.processor.process_audio(context)
        return super()._tokenize(context, audio_info=audio_info)

    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
        encoded = super()._encode(inputs)
        # Rebuild the combined audio-tag string to compute the sample-level
        # audio_info forwarded to the model.
        text = ''.join([f'<audio>{audio}</audio>' for audio in inputs.audios])
        audio_info = self.processor.process_audio(text)
        if audio_info:
            tokenizer_kwargs = {'audio_info': audio_info}
            encoded.update(tokenizer_kwargs)
            encoded['tokenizer_kwargs'] = tokenizer_kwargs
        return encoded

    def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
        res = super()._data_collator(batch, padding_to=padding_to)
        # audio_info objects are kept as an un-collated per-sample list.
        if batch[0].get('audio_info') is not None:
            res['audio_info'] = [b['audio_info'] for b in batch]
        return res


register_template(QwenTemplateMeta(MLLMTemplateType.qwen_audio, template_cls=QwenAudioTemplate))
166
+
167
+
168
class Qwen2AudioTemplate(Template):
    """Template for Qwen2-Audio: audio is loaded, resampled and converted to
    model features via the HF feature extractor."""

    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                    inputs: StdTemplateInputs) -> List[Context]:
        assert media_type == 'audio'
        if not self.use_chat_template:
            return ['<|audio_bos|><|AUDIO|><|audio_eos|>\n']
        else:
            return [f'Audio {index + 1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n']

    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
        encoded = super()._encode(inputs)
        if inputs.audios:
            # Defaults to the extractor's sampling rate; overridable via the
            # 'sampling_rate' env arg.
            sampling_rate = get_env_args('sampling_rate', int, self.processor.feature_extractor.sampling_rate)
            audios = load_batch(inputs.audios, load_func=partial(load_audio, sampling_rate=sampling_rate))
            audio_inputs = self.processor.feature_extractor(
                audios, sampling_rate=sampling_rate, return_attention_mask=True, return_tensors='pt')
            # Renamed so it does not clash with the text attention_mask.
            audio_inputs['feature_attention_mask'] = audio_inputs.pop('attention_mask')
            encoded.update(audio_inputs)
        return encoded

    def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
        res = super()._data_collator(batch, padding_to=padding_to)
        input_features = [b['input_features'] for b in batch if b.get('input_features') is not None]
        feature_attention_mask = [
            b['feature_attention_mask'] for b in batch if b.get('feature_attention_mask') is not None
        ]
        if input_features:
            res['input_features'] = torch.concat(input_features)
            res['feature_attention_mask'] = torch.concat(feature_attention_mask)
        return res


register_template(QwenTemplateMeta(MLLMTemplateType.qwen2_audio, template_cls=Qwen2AudioTemplate))
202
+
203
+
204
+ class Qwen2VLTemplate(Template):
205
+ image_token_id = 151655
206
+ video_token_id = 151656
207
+ placeholder_tokens = ['<|image_pad|>', '<|video_pad|>']
208
+ version = 'v2'
209
+ use_model = True
210
+
211
+ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
212
+ inputs: StdTemplateInputs) -> List[Context]:
213
+ from qwen_vl_utils import fetch_image, fetch_video
214
+ assert media_type in {'image', 'video'}
215
+ if media_type == 'image':
216
+ inputs.images[index] = fetch_image({'image': inputs.images[index]})
217
+ if self.mode == 'lmdeploy':
218
+ return ['<|vision_start|>', [-100], '<|vision_end|>']
219
+ else:
220
+ return ['<|vision_start|><|image_pad|><|vision_end|>']
221
+ else:
222
+ inputs.videos[index] = fetch_video({'video': inputs.videos[index]}).to(torch.uint8)
223
+ return ['<|vision_start|><|video_pad|><|vision_end|>']
224
+
225
+ def replace_ref(self, ref: str, index: int, inputs: StdTemplateInputs) -> List[Context]:
226
+ return [f'<|object_ref_start|>{ref}<|object_ref_end|>']
227
+
228
+ def replace_bbox(self, bbox: List[int], index: int, inputs: StdTemplateInputs) -> List[Context]:
229
+ return [f'<|box_start|>{self._get_bbox_str(bbox)}<|box_end|>']
230
+
231
+ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
232
+ encoded = super()._encode(inputs)
233
+ processor = self.processor
234
+ input_ids = encoded['input_ids']
235
+ labels = encoded['labels']
236
+ images = inputs.images
237
+ videos = inputs.videos
238
+ for media_type in ['images', 'videos']:
239
+ if locals()[media_type]:
240
+ if media_type == 'images':
241
+ media_token = self.image_token_id
242
+ media_inputs = processor.image_processor(
243
+ images=images, videos=None, return_tensors='pt', do_resize=False)
244
+ media_grid_thw = media_inputs['image_grid_thw']
245
+ else:
246
+ media_inputs = processor.image_processor(
247
+ images=None, videos=videos, return_tensors='pt', do_resize=False)
248
+ media_grid_thw = media_inputs['video_grid_thw']
249
+ media_token = self.video_token_id
250
+ if self.version == 'v2_5':
251
+ from qwen_vl_utils import vision_process
252
+ media_inputs['second_per_grid_ts'] = [
253
+ processor.image_processor.temporal_patch_size / vision_process.FPS
254
+ ] * len(media_grid_thw)
255
+ idx_list = findall(input_ids, media_token)
256
+ merge_length = processor.image_processor.merge_size**2
257
+
258
+ def _get_new_tokens(i):
259
+ token_len = (media_grid_thw[i].prod() // merge_length)
260
+ return [media_token] * token_len
261
+
262
+ input_ids, labels = self._extend_tokens(input_ids, labels, idx_list, _get_new_tokens)
263
+ encoded.update(media_inputs)
264
+
265
+ encoded['input_ids'] = input_ids
266
+ encoded['labels'] = labels
267
+ return encoded
268
+
269
    def compute_loss_context(self, model, inputs):
        """Patch flash-attention so packed sequences attend within their own sample.

        Only packed batches carry 'real_position_ids' (built in `packing_row`);
        for ordinary batches we defer to the default implementation.
        """
        if 'real_position_ids' not in inputs:
            return super().compute_loss_context(model, inputs)
        # Pick the HF modeling module that matches this template's model family,
        # since the flash-attention forward to patch lives there.
        if self.version == 'v2':
            from transformers.models.qwen2_vl import modeling_qwen2_vl as modeling_module
        elif self.version == 'v2_5':
            from transformers.models.qwen2_5_vl import modeling_qwen2_5_vl as modeling_module
        elif self.version == 'omni':
            from transformers.models.qwen2_5_omni import modeling_qwen2_5_omni as modeling_module
        position_ids = inputs['position_ids']
        # Swap in the per-sample position ids computed at packing time; the
        # original packed position ids are handed to the patcher.
        inputs['position_ids'] = inputs.pop('real_position_ids')
        return self._patch_flash_attention_forward(modeling_module, position_ids)
281
+
282
    def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Pre-compute `inputs_embeds`, scattering visual features over placeholder tokens.

        At inference time the model's own forward performs this merge, so the
        inputs are returned untouched; during training we do it here.
        """
        if not self.is_training:
            return inputs
        input_ids = inputs['input_ids']
        _model = model.model
        if not hasattr(_model, 'embed_tokens'):
            _model = _model.model  # LoRA wraps the base model one level deeper
        pixel_values = inputs.get('pixel_values')
        pixel_values_videos = inputs.get('pixel_values_videos')
        image_grid_thw = inputs.get('image_grid_thw')
        video_grid_thw = inputs.get('video_grid_thw')

        inputs_embeds = _model.embed_tokens(input_ids)

        # qwen2-vl exposes the visual dtype via get_dtype(); later versions via .dtype.
        dtype = model.visual.get_dtype() if self.version == 'v2' else model.visual.dtype
        if pixel_values is None and pixel_values_videos is None:  # plain-text
            if is_deepspeed_enabled():
                # ZeRO requires every rank to run the visual tower each step;
                # feed a dummy image and add a zero-weighted contribution so
                # its parameters always receive gradients.
                from PIL import Image
                images = [Image.new('RGB', (32, 32), (0, 0, 0))]
                media_inputs = self.processor.image_processor(images=images, videos=None, return_tensors='pt')
                device = input_ids.device
                media_inputs = to_device(media_inputs, device)
                pixel_values = media_inputs['pixel_values'].type(dtype)
                image_embeds = model.visual(pixel_values, grid_thw=media_inputs['image_grid_thw'])
                inputs_embeds += image_embeds.mean() * 0.
        else:
            if pixel_values is not None:
                pixel_values = pixel_values.type(dtype)
                image_embeds = model.visual(pixel_values, grid_thw=image_grid_thw)
                # Overwrite embeddings at image-placeholder positions with visual features.
                image_mask = (input_ids == model.config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

            if pixel_values_videos is not None:
                pixel_values_videos = pixel_values_videos.type(dtype)
                video_embeds = model.visual(pixel_values_videos, grid_thw=video_grid_thw)
                video_mask = (input_ids == model.config.video_token_id).unsqueeze(-1).expand_as(inputs_embeds)
                video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)

        return {'inputs_embeds': inputs_embeds}
323
+
324
+ def _data_collator_mm_data(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
325
+ res = super()._data_collator_mm_data(batch)
326
+ second_per_grid_ts = self.gather_list(batch, 'second_per_grid_ts')
327
+ if second_per_grid_ts:
328
+ res['second_per_grid_ts'] = second_per_grid_ts
329
+ for media_type in ['image', 'video']:
330
+ grid_thw = self.concat_tensor(batch, f'{media_type}_grid_thw', 0)
331
+ if grid_thw is not None:
332
+ res[f'{media_type}_grid_thw'] = grid_thw
333
+ return res
334
+
335
+ def packing_row(self, row: List[Tuple[Dict[str, Any], int]]) -> Dict[str, Any]:
336
+ position_ids = []
337
+ for r in row:
338
+ r = r[0].copy()
339
+ r['input_ids'] = torch.tensor(r['input_ids'])[None]
340
+ position_ids.append(self._get_position_ids(r))
341
+ packed = super().packing_row(row)
342
+ packed['real_position_ids'] = torch.concat(position_ids, dim=-1)
343
+ return packed
344
+
345
    def _get_position_ids(self, inputs: Dict[str, Any]):
        """Compute M-RoPE position ids via the model's `get_rope_index`.

        Done here rather than in the model forward to work around
        https://github.com/huggingface/transformers/pull/33487.
        """
        kwargs = {}
        if self.version == 'v2_5':
            # qwen2.5-vl additionally scales temporal positions per video.
            kwargs = {'second_per_grid_ts': inputs.get('second_per_grid_ts')}
        position_ids, _ = self.model.get_rope_index(
            inputs['input_ids'],
            inputs.get('image_grid_thw'),
            inputs.get('video_grid_thw'),
            attention_mask=inputs.get('attention_mask'),
            **kwargs)
        return position_ids.contiguous()
357
+
358
+ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
359
+ res = super()._data_collator(batch, padding_to=padding_to)
360
+ if self._packing:
361
+ res['real_position_ids'] = self.concat_tensor(batch, 'real_position_ids', -1)
362
+ elif self.is_training:
363
+ res['position_ids'] = self._get_position_ids(res)
364
+ return res
365
+
366
+
367
# Qwen2-VL chat template.
register_template(QwenTemplateMeta(MLLMTemplateType.qwen2_vl, template_cls=Qwen2VLTemplate))

# QVQ: same template class with a reasoning-oriented default system prompt.
register_template(
    QwenTemplateMeta(
        MLLMTemplateType.qvq,
        default_system=('You are a helpful and harmless assistant. You are Qwen developed by Alibaba. '
                        'Answer in the language of the question. You should think step-by-step.'),
        template_cls=Qwen2VLTemplate,
    ))
376
+
377
+
378
class Qwen2_5VLTemplate(Qwen2VLTemplate):
    """Qwen2.5-VL: reuses the Qwen2-VL pipeline with v2.5 rope/video handling."""
    version = 'v2_5'
    # Grounding bounding boxes are passed through without normalization.
    norm_bbox = 'none'


register_template(QwenTemplateMeta(MLLMTemplateType.qwen2_5_vl, template_cls=Qwen2_5VLTemplate))
384
+
385
+
386
class Qwen2_5OmniTemplate(Qwen2_5VLTemplate):
    """Qwen2.5-Omni: extends Qwen2.5-VL with audio and audio-in-video support."""
    version = 'omni'
    placeholder_tokens = ['<|IMAGE|>', '<|AUDIO|>', '<|VIDEO|>']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        from transformers.models.qwen2_5_omni.processing_qwen2_5_omni import Qwen2_5OmniProcessorKwargs
        # Pull chunking defaults from the HF processor so we stay in sync with it.
        default = Qwen2_5OmniProcessorKwargs._defaults
        self.seconds_per_chunk = default['videos_kwargs']['seconds_per_chunk']
        self.position_id_per_seconds = default['videos_kwargs']['position_id_per_seconds']
        self.use_audio_in_video = get_env_args('use_audio_in_video', bool, False)
        self.sampling_rate = get_env_args('sampling_rate', int, self.processor.feature_extractor.sampling_rate)

    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                    inputs: StdTemplateInputs) -> List[Context]:
        """Expand a media tag to its omni token markup, loading the media in place."""
        from qwen_omni_utils import fetch_image, fetch_video
        if media_type == 'image':
            inputs.images[index] = fetch_image({'image': inputs.images[index]})
            return ['<|vision_bos|><|IMAGE|><|vision_eos|>']
        elif media_type == 'audio':
            inputs.audios[index] = load_audio(inputs.audios[index], self.sampling_rate)
            return ['<|audio_bos|><|AUDIO|><|audio_eos|>']
        elif media_type == 'video':
            video = inputs.videos[index]
            inputs.videos[index] = fetch_video({'video': video}).to(torch.uint8)
            if self.use_audio_in_video:
                # Also extract the video's audio track and queue it as an audio
                # input tagged 'video' so _encode can tell it apart.
                import librosa
                if video.startswith('http://') or video.startswith('https://'):
                    import audioread
                    video = audioread.ffdec.FFmpegAudioFile(video)
                video = librosa.load(video, sr=self.sampling_rate)[0]
                inputs.audios.insert(inputs.audio_idx, (video, 'video'))
                inputs.audio_idx += 1
                return ['<|vision_bos|><|audio_bos|><|VIDEO|><|audio_eos|><|vision_eos|>']
            return ['<|vision_bos|><|VIDEO|><|vision_eos|>']

    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
        """Tokenize and expand <|AUDIO|>/<|IMAGE|>/<|VIDEO|> placeholders to full length."""
        encoded = Template._encode(self, inputs)
        processor = self.processor
        # Mark which audio entries were extracted from videos by replace_tag.
        video_audios_mask = []
        for i, audio in enumerate(inputs.audios):
            if isinstance(audio, tuple) and audio[1] == 'video':
                inputs.audios[i] = audio[0]
                video_audios_mask.append(True)
            else:
                video_audios_mask.append(False)
        video_audios_mask = torch.tensor(video_audios_mask)
        media_inputs = processor(
            text='',
            audio=inputs.audios or None,
            images=inputs.images or None,
            videos=inputs.videos or None,
            return_tensors='pt')
        # The processor's text outputs don't include the response; drop them.
        media_inputs.pop('input_ids')
        media_inputs.pop('attention_mask')
        media_inputs = to_float_dtype(media_inputs, self.model_info.torch_dtype)
        input_ids = encoded['input_ids']
        labels = encoded['labels']
        # audio
        audio_token_id = self._tokenize('<|AUDIO|>')
        idx_list = findall(input_ids, audio_token_id)
        feature_attention_mask = media_inputs.get('feature_attention_mask')
        if feature_attention_mask is not None:
            audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
            # Post-encoder audio length after the two stride-2 reductions.
            audio_lengths = (((audio_feature_lengths - 1) // 2 + 1 - 2) // 2 + 1)
        else:
            audio_lengths = None
        audio_lengths_origin = audio_lengths
        if idx_list:
            if self.use_audio_in_video:
                # Standalone <|AUDIO|> tags only cover non-video audio entries.
                audio_lengths = audio_lengths[~video_audios_mask]

            def _get_new_audio_tokens(i):
                return audio_token_id * audio_lengths[i]

            input_ids, labels = self._extend_tokens(input_ids, labels, idx_list, _get_new_audio_tokens)

        for media_type in ['image', 'video']:
            token = f'<|{media_type.upper()}|>'
            token_id = self._tokenize(token)
            idx_list = findall(input_ids, token_id)
            if idx_list:
                merge_size = processor.image_processor.merge_size
                media_grid_thw = media_inputs.get(f'{media_type}_grid_thw')
                if media_type == 'video' and self.use_audio_in_video:
                    audio_lengths = audio_lengths_origin[video_audios_mask]
                    video_second_per_grid = media_inputs['video_second_per_grid']

                    def _get_new_tokens_use_audio_in_video(i):
                        # Interleave video and audio tokens chunk by chunk along
                        # the shared time axis, mirroring the HF processor.
                        audio_token_indices = torch.arange(audio_lengths[i])
                        grid_thw = media_grid_thw[i]
                        height = grid_thw[1] // merge_size
                        width = grid_thw[2] // merge_size
                        video_token_indices = torch.arange(grid_thw[0]).reshape(-1, 1, 1)
                        video_token_indices = torch.broadcast_to(
                            video_token_indices, (video_token_indices.shape[0], height, width)).reshape(-1)
                        video_token_indices = (
                            video_token_indices * video_second_per_grid[i] * self.position_id_per_seconds)
                        tokens_per_chunk = int(self.position_id_per_seconds * self.seconds_per_chunk)
                        video_chunk_indexes = processor.get_chunked_index(video_token_indices, tokens_per_chunk)
                        audio_chunk_indexes = processor.get_chunked_index(audio_token_indices, tokens_per_chunk)

                        res = []
                        for j in range(max(len(video_chunk_indexes), len(audio_chunk_indexes))):
                            if j < len(video_chunk_indexes):
                                video_seq_length = video_chunk_indexes[j][1] - video_chunk_indexes[j][0]
                                res += token_id * video_seq_length
                            if j < len(audio_chunk_indexes):
                                audio_seq_length = audio_chunk_indexes[j][1] - audio_chunk_indexes[j][0]
                                res += audio_token_id * audio_seq_length
                        return res

                    input_ids, labels = self._extend_tokens(input_ids, labels, idx_list,
                                                            _get_new_tokens_use_audio_in_video)

                else:

                    def _get_new_tokens(i):
                        token_len = (media_grid_thw[i].prod() // (merge_size**2))
                        return token_id * token_len

                    input_ids, labels = self._extend_tokens(input_ids, labels, idx_list, _get_new_tokens)

        encoded['input_ids'] = input_ids
        encoded['labels'] = labels
        encoded.update(media_inputs)
        return encoded

    def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
        # Omni's forward merges modalities itself; skip the VL-style pre-merge.
        return Template._post_encode(self, model, inputs)

    def _get_position_ids(self, inputs: Dict[str, Any]):
        """Compute omni position ids via the thinker's get_rope_index."""
        feature_attention_mask = inputs.get('feature_attention_mask')
        if feature_attention_mask is not None:
            audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
        else:
            audio_feature_lengths = None
        video_second_per_grid = inputs.pop('video_second_per_grid', None)
        input_ids = inputs['input_ids']
        attention_mask = inputs.get('attention_mask')
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        position_ids, _ = self.model.thinker.get_rope_index(
            input_ids,
            inputs.get('image_grid_thw'),
            inputs.get('video_grid_thw'),
            attention_mask,
            self.use_audio_in_video,
            audio_feature_lengths,
            video_second_per_grid,
        )
        return position_ids.contiguous()

    def _data_collator_mm_data(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Additionally collate audio features and per-video timing."""
        res = super()._data_collator_mm_data(batch)
        video_second_per_grid = self.gather_list(batch, 'video_second_per_grid')
        if video_second_per_grid:
            res['video_second_per_grid'] = video_second_per_grid
        input_features = [b['input_features'] for b in batch if b.get('input_features') is not None]
        feature_attention_mask = [
            b['feature_attention_mask'] for b in batch if b.get('feature_attention_mask') is not None
        ]
        if input_features:
            res['input_features'] = torch.concat(input_features)
            res['feature_attention_mask'] = torch.concat(feature_attention_mask)
        return res

    def generate(self, model, *args, **kwargs):
        # Videos may carry an audio track; tell generate() how to treat it.
        if kwargs.get('video_grid_thw') is not None:
            kwargs['use_audio_in_video'] = self.use_audio_in_video
        return super().generate(model, *args, **kwargs)


register_template(QwenTemplateMeta(MLLMTemplateType.qwen2_5_omni, template_cls=Qwen2_5OmniTemplate))
560
+
561
+
562
class Ovis1_6Template(Template):
    """Template for Ovis 1.6: expands the [-200] image sentinel into the
    model's image placeholder tokens and preprocesses images via the
    visual tokenizer."""
    skip_prompt = False
    use_model = True

    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                    inputs: StdTemplateInputs) -> List[Context]:
        assert media_type == 'image'
        # -200 is a sentinel token id replaced with real placeholders in _encode.
        return [[-200], '\n']

    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
        encoded = super()._encode(inputs)
        images = inputs.images
        input_ids = encoded['input_ids']
        labels = encoded['labels']
        idx_list = findall(input_ids, [-200])
        added_tokens_len = 0
        pixel_values = []
        for i, idx in enumerate(idx_list):
            max_partition = get_env_args('max_partition', int, 9)
            raw_pixel_values, image_placeholders = self.model.visual_tokenizer.preprocess_image(
                images[i], max_partition=max_partition)
            # Fix: `idx` indexes the ORIGINAL sequence; earlier sentinels have
            # already been expanded, so shift by the tokens added so far.
            # (Previously `added_tokens_len` was computed but never applied,
            # corrupting every image after the first in multi-image samples.)
            idx += added_tokens_len
            input_ids = input_ids[:idx] + image_placeholders + input_ids[idx + 1:]
            if labels is not None:
                # Placeholder positions are never supervised.
                labels = labels[:idx] + [-100] * len(image_placeholders) + labels[idx + 1:]
            pixel_values.append(raw_pixel_values)
            added_tokens_len += len(image_placeholders) - 1
        dtype = self.model.visual_tokenizer.dtype
        if pixel_values:
            pixel_values = torch.cat(pixel_values, dim=0).to(dtype)
        else:
            pixel_values = torch.zeros((1, 3, 384, 384), dtype=dtype)  # dummy
        encoded.update({'input_ids': input_ids, 'labels': labels})
        encoded['pixel_values'] = [pixel_values]
        return encoded

    def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Merge text and image features via the model's own merge_multimodal."""
        padding_side = self.padding_side if self.is_training else 'left'
        if self.max_length is not None:
            model.config.multimodal_max_length = self.max_length
        input_ids = inputs['input_ids']
        labels = inputs.get('labels')
        if labels is None:
            # merge_multimodal requires labels; feed all-ignored ones.
            labels = input_ids.new_full(input_ids.shape, -100)
        _, inputs_embeds, labels, attention_mask = model.merge_multimodal(
            text_input_ids=input_ids,
            text_attention_masks=torch.ones_like(input_ids),  # not used, only for compat
            text_labels=labels,
            pixel_values=inputs['pixel_values'],
            left_padding=padding_side == 'left')
        if inputs.get('labels') is None:
            labels = None
        return {'inputs_embeds': inputs_embeds, 'labels': labels, 'attention_mask': attention_mask}

    def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
        pixel_values = self.gather_list(batch, 'pixel_values')
        res = super()._data_collator(batch, padding_to=padding_to)
        res['pixel_values'] = pixel_values
        return res
620
+
621
+
622
# Ovis 1.6 on a Gemma-style chat markup.
register_template(
    TemplateMeta(
        MLLMTemplateType.ovis1_6,
        prefix=['<bos>'],
        prompt=['<start_of_turn>user\n{{QUERY}}<end_of_turn>\n<start_of_turn>model\n'],
        chat_sep=['<end_of_turn>\n'],
        suffix=['<end_of_turn>'],
        system_prefix=['<bos><start_of_turn>system\n{{SYSTEM}}<end_of_turn>\n'],
        template_cls=Ovis1_6Template,
    ))

# Ovis 1.6 variant on the Llama-3 chat markup.
register_template(
    Llama3TemplateMeta(
        MLLMTemplateType.ovis1_6_llama3,
        default_system='You are a helpful and honest multimodal assistant.',
        template_cls=Ovis1_6Template,
    ))
639
+
640
+
641
class Ovis2Template(Ovis1_6Template):
    """Ovis 2: same image pipeline as Ovis 1.6 plus frame-sampled video support."""
    placeholder_tokens = ['<|image_pad|>', '<|video_pad|>']
    # Default number of frames sampled per video (overridable via env var).
    nframes = 12

    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                    inputs: StdTemplateInputs) -> List[Context]:
        if media_type == 'image':
            return [[-200], '\n']
        elif media_type == 'video':
            nframes = get_env_args('nframes', int, self.nframes)
            # NOTE(review): this REPLACES inputs.images with the sampled frames
            # rather than appending — presumably a sample holds either images or
            # one video, never both; confirm against callers.
            inputs.images = load_video_ovis2(inputs.videos[index], nframes)
            return [[-200] * nframes, '\n']


register_template(QwenTemplateMeta(
    MLLMTemplateType.ovis2,
    template_cls=Ovis2Template,
))
659
+
660
+
661
@dataclass
class MarcoO1TemplateMeta(QwenTemplateMeta):
    # Marco-o1 ships a fixed (Chinese) system prompt instructing the model to
    # reason inside <Thought> and answer inside <Output>; kept verbatim since
    # it is part of the model's training contract.
    default_system: Optional[str] = """
你是一个经过良好训练的AI助手,你的名字是Marco-o1.由阿里国际数字商业集团的AI Business创造.
\n## 重要!!!!!
当你回答问题时,你的思考应该在<Thought>内完成,<Output>内输出你的结果。
<Thought>应该尽可能是英文,但是有2个特例,一个是对原文中的引用,另一个是是数学应该使用markdown格式,<Output>内的输出需要遵循用户输入的语言。
"""


register_template(MarcoO1TemplateMeta(LLMTemplateType.marco_o1))
ms-swift/swift/llm/template/template/stepfun.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ from typing import Any, Dict, List, Literal, Optional
3
+
4
+ from ..base import Template
5
+ from ..constant import MLLMTemplateType
6
+ from ..register import TemplateMeta, register_template
7
+ from ..template_inputs import StdTemplateInputs
8
+ from ..utils import Context
9
+ from ..vision_utils import load_file
10
+ from .qwen import QwenTemplateMeta
11
+
12
+
13
class GOTImageEvalProcessor:
    """Eval-time image preprocessing for GOT-OCR2: square resize, tensor
    conversion, then CLIP-style channel normalization."""

    _DEFAULT_MEAN = (0.48145466, 0.4578275, 0.40821073)
    _DEFAULT_STD = (0.26862954, 0.26130258, 0.27577711)

    def __init__(self, image_size=384, mean=None, std=None):
        from torchvision import transforms
        from torchvision.transforms.functional import InterpolationMode
        mean = self._DEFAULT_MEAN if mean is None else mean
        std = self._DEFAULT_STD if std is None else std

        self.normalize = transforms.Normalize(mean, std)
        self.transform = transforms.Compose([
            transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
            transforms.ToTensor(),
            self.normalize,
        ])

    def __call__(self, item):
        """Apply the full pipeline to a PIL image, returning a CHW tensor."""
        return self.transform(item)
33
+
34
+
35
class GOT_OCR2Template(Template):
    """GOT-OCR2 (ModelScope variant): fixed 256-token image slots plus a
    high-resolution (1024px) eval transform."""
    placeholder_tokens = ['<imgpad>']

    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                    inputs: StdTemplateInputs) -> List[Context]:
        # Typical queries: 'OCR: ' / 'OCR with format: '
        assert media_type == 'image'
        return ['<img>' + '<imgpad>' * 256 + '</img>\n']

    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
        encoded = super()._encode(inputs)
        high_res = GOTImageEvalProcessor(image_size=1024)
        image_list = inputs.images
        for pos in range(len(image_list)):
            # Add a leading batch dim and match the model dtype.
            image_list[pos] = high_res(image_list[pos])[None].to(self.model_info.torch_dtype)
        if image_list:
            encoded['images'] = image_list
        return encoded

    def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
        collated = super()._data_collator(batch, padding_to=padding_to)
        gathered = self.gather_list(batch, 'images')
        if gathered:
            collated['images'] = gathered
        return collated
61
+
62
+
63
# GOT-OCR2 (ModelScope weights) on the Qwen chat markup.
register_template(
    QwenTemplateMeta(
        MLLMTemplateType.got_ocr2,
        default_system=' You should follow the instructions carefully and explain your answers in detail.',
        template_cls=GOT_OCR2Template,
    ))
69
+
70
+
71
class GOT_OCR2HfTemplate(Template):
    """GOT-OCR2 (HuggingFace variant): image preprocessing is delegated to the
    HF processor at collation time instead of a local transform."""
    placeholder_tokens = ['<imgpad>']

    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                    inputs: StdTemplateInputs) -> List[Context]:
        # Typical queries: 'OCR: ' / 'OCR with format: '
        assert media_type == 'image'
        return ['<img>' + '<imgpad>' * 256 + '</img>\n']

    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:  # for now, mirrors GOT_OCR2Template above
        encoded = super()._encode(inputs)
        images = inputs.images
        if images:
            encoded['images'] = images
        return encoded

    def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
        res = super()._data_collator(batch, padding_to=padding_to)
        images = self.gather_list(batch, 'images')
        _inputs = self.processor(images, return_tensors='pt')
        _inputs.pop('input_ids')  # this does not contain the response, so cannot be used when training
        _inputs.pop('attention_mask')  # this does not contain the response, so cannot be used when training

        res.update(_inputs.data)
        return res
97
+
98
+
99
# GOT-OCR2 (HuggingFace weights) on the Qwen chat markup.
register_template(
    QwenTemplateMeta(
        MLLMTemplateType.got_ocr2_hf,
        default_system=' You should follow the instructions carefully and explain your answers in detail.',
        template_cls=GOT_OCR2HfTemplate,
    ))
105
+
106
+
107
class StepAudioTemplate(Template):
    """Step-Audio: audio is tokenized directly by the model's audio encoder."""
    use_model = True

    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                    inputs: StdTemplateInputs) -> List[Context]:
        assert media_type == 'audio', f'media_type: {media_type}'
        # NOTE(review): imports top-level `utils` — presumably the Step-Audio
        # model repo's utils.py, which must be on sys.path; confirm.
        from utils import load_audio
        audio_wav, sr = load_audio(load_file(inputs.audios[index]))
        audio_tokens = self.model.encoder(audio_wav, sr)
        return audio_tokens


register_template(
    TemplateMeta(
        MLLMTemplateType.step_audio,
        template_cls=StepAudioTemplate,
        prefix=['<s>'],
        prompt=['<|BOT|>human\n{{QUERY}}<|EOT|><|BOT|>assistant\n'],
        system_prefix=['<s><|BOT|>system\n{{SYSTEM}}<|EOT|>'],
        chat_sep=['<|EOT|>'],
        suffix=['<|EOT|>'],
    ))
ms-swift/swift/llm/template/template/yi.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ import torch
5
+
6
+ from ..base import Template
7
+ from ..constant import LLMTemplateType, MLLMTemplateType
8
+ from ..register import TemplateMeta, register_template
9
+ from ..template_inputs import StdTemplateInputs
10
+ from .utils import DEFAULT_SYSTEM, ChatmlTemplateMeta
11
+
12
# Yi-Coder uses the plain ChatML markup with the shared default system prompt.
register_template(ChatmlTemplateMeta(
    LLMTemplateType.yi_coder,
    default_system=DEFAULT_SYSTEM,
))

# Bilingual (EN/zh) default system prompt shipped with Yi-VL.
yi_vl_default_system = (
    'This is a chat between an inquisitive human and an AI assistant. Assume the role of the AI assistant. '
    "Read all the images carefully, and respond to the human's questions with informative, "
    'helpful, detailed and polite answers. '
    '这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。'
    '仔细阅读所有的图像,并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。')
23
+
24
+
25
class YiVLTemplate(Template):
    """Yi-VL: LLaVA-style pipeline — pad images to squares, then batch them
    through the vision tower's image processor."""
    image_placeholder = [[-200], '\n']
    use_model = True

    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
        encoded = super()._encode(inputs)
        model = self.model
        from llava.mm_utils import expand2square
        if not hasattr(model, 'vision_tower'):
            model = model.model  # unwrap (e.g. PEFT) to reach the vision tower
        image_processor = model.vision_tower.image_processor
        images = inputs.images or []
        # Mutates inputs.images in place: each image is padded to a square
        # filled with the processor's mean color before preprocessing.
        for i, image in enumerate(images):
            background_color = tuple(int(x * 255) for x in image_processor.image_mean)
            image = expand2square(image, background_color)
            images[i] = image
        if images:
            image_tensor = image_processor.preprocess(images, return_tensors='pt')['pixel_values']
            encoded['images'] = image_tensor.to(model.dtype)
        return encoded

    def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
        res = super()._data_collator(batch, padding_to=padding_to)
        images = [b['images'] for b in batch if 'images' in b]
        if images:
            res['images'] = torch.concat(images)
        return res
52
+
53
+
54
# Yi-VL chat markup. NOTE(review): 8308 appears to be the tokenizer id of the
# turn-marker token used by Yi-VL — confirm against the tokenizer vocab.
register_template(
    TemplateMeta(
        MLLMTemplateType.yi_vl,
        prefix=[],
        prompt=[[8308], ' Human: {{QUERY}}\n', [8308], ' Assistant:'],
        chat_sep=['\n'],
        suffix=['\n', [8308]],
        default_system=yi_vl_default_system,
        template_cls=YiVLTemplate,
        system_prefix=['{{SYSTEM}}\n\n']))
ms-swift/swift/llm/train/__pycache__/callback.cpython-310.pyc ADDED
Binary file (3.11 kB). View file
 
ms-swift/swift/llm/train/__pycache__/rlhf.cpython-310.pyc ADDED
Binary file (4.6 kB). View file
 
ms-swift/swift/llm/train/__pycache__/sft.cpython-310.pyc ADDED
Binary file (10.2 kB). View file
 
ms-swift/swift/llm/train/__pycache__/tuner.cpython-310.pyc ADDED
Binary file (13.5 kB). View file
 
ms-swift/swift/llm/train/callback.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ import types
3
+
4
+ import numpy as np
5
+ import torch
6
+ from transformers import TrainerCallback
7
+
8
+ from swift.utils import get_logger
9
+
10
+ logger = get_logger()
11
+
12
+
13
class TrainerAdapterCallback(TrainerCallback):
    """Trainer hooks needed by AdaLoRA: tell peft the step budget and make
    rank re-allocation run on every optimizer step."""

    def __init__(self, args):
        self.global_step = 0
        self.args = args

    # offload original_modules to cpu, to save memory
    def on_train_begin(self, _args, state, control, **kwargs):
        model = kwargs['model']
        if self.args.train_type == 'adalora':
            model.peft_config['default'].total_step = state.max_steps

            # Monkey-patch zero_grad so AdaLoRA's update_and_allocate runs
            # once per step, right before gradients are cleared.
            def zero_grad(_self, *args, **kwargs):
                _self.update_and_allocate(self.global_step + 1)
                _self._zero_grad(*args, **kwargs)

            model._zero_grad = model.zero_grad
            model.zero_grad = types.MethodType(zero_grad, model)

    def on_step_end(self, _args, state, control, **kwargs):
        if self.args.train_type == 'adalora':
            self.global_step = state.global_step
36
+
37
class DynamicLayerActivationCallback(TrainerCallback):
    """LISA-style training: keep all transformer layers frozen and periodically
    unfreeze a fresh random subset of `n_layers` every `step_interval` steps."""

    def __init__(self, n_layers: int, step_interval: int, model: torch.nn.Module):
        super().__init__()
        self.n_layers = n_layers
        self.step_interval = step_interval
        self.model = model
        # The first ModuleList found is assumed to be the transformer layer stack.
        found_name, found_layers = None, None
        for module_name, module in model.named_modules():
            if isinstance(module, torch.nn.ModuleList):
                found_name, found_layers = module_name, module
                break
        assert found_name is not None
        self.layers_attribute = found_name
        self.total_layers = len(found_layers)

        # Start fully frozen; the first on_step_begin activates a subset.
        self.freeze_all_layers()
        self.active_layers_indices = []

    def freeze_all_layers(self):
        """Disable gradients on every parameter of every layer."""
        layer_stack = self.model.get_submodule(self.layers_attribute)
        for param in (p for layer in layer_stack for p in layer.parameters()):
            param.requires_grad = False

    def on_step_begin(self, args, state, control, **kwargs):
        # Rotate the active subset on schedule (also at step 0 and step 1).
        step = state.global_step
        if step % self.step_interval == 0 or step == 1:
            self.switch_active_layers()

    def switch_active_layers(self):
        """Re-freeze everything, then enable gradients on a random layer subset."""
        self.freeze_all_layers()
        layer_stack = self.model.get_submodule(self.layers_attribute)
        self.active_layers_indices = np.random.choice(range(self.total_layers), self.n_layers, replace=False)
        for chosen in self.active_layers_indices:
            for param in layer_stack[chosen].parameters():
                param.requires_grad = True
ms-swift/swift/llm/train/rlhf.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ import os
3
+ from typing import List, Union
4
+
5
+ from swift.llm import safe_snapshot_download
6
+ from swift.utils import get_logger, get_model_parameter_info
7
+ from ..argument import BaseArguments, RLHFArguments
8
+ from ..model import HfConfigFactory
9
+ from .kto import prepare_kto_dataset
10
+ from .sft import SwiftSft
11
+
12
+ logger = get_logger()
13
+
14
+
15
class SwiftRLHF(SwiftSft):
    """SFT pipeline specialized for RLHF: additionally prepares the frozen
    ref/reward model(s) and, for PPO, a trainable value model."""
    args_class = RLHFArguments
    args: args_class

    def _prepare_model_tokenizer(self):
        if self.args.sequence_parallel_size > 1:
            # Duplicate calling is allowed to promise this function will
            # be called before model initializing.
            from swift.trainers.sequence_parallel import sequence_parallel
            sequence_parallel.init_sequence_parallel(self.args.sequence_parallel_size)
        # prepare ref/reward/value model
        from swift.llm.infer.utils import prepare_adapter
        args = self.args

        def prepare_single_model(key, origin_key=None):
            # `key` selects the args.{key}_model fields to read; `origin_key`
            # distinguishes the role (e.g. the value model reads 'reward' args).
            origin_key = origin_key or key
            model_id_or_path = getattr(args, f'{key}_model')
            if model_id_or_path is None:
                return None

            model_type = getattr(args, f'{key}_model_type')
            model_revision = getattr(args, f'{key}_model_revision')
            model_dir = safe_snapshot_download(
                model_id_or_path=model_id_or_path,
                revision=model_revision,
                download_model=False,
                use_hf=args.use_hf,
                hub_token=args.hub_token,
            )
            task_type = None
            num_labels = None
            # Prefer swift's own args.json metadata; fall back to the HF config.
            if os.path.exists(os.path.join(model_dir, 'args.json')):
                model_args = BaseArguments.from_pretrained(model_dir)
                if hasattr(model_args, 'task_type'):
                    task_type = model_args.task_type
            else:
                from transformers import AutoConfig
                model_config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
                if hasattr(model_config, 'num_labels'):
                    num_labels = model_config.num_labels
            if task_type == 'seq_cls':
                # Reward models score with a single scalar head.
                num_labels = 1

            model, processor = args.get_model_processor(
                model=model_id_or_path,
                model_type=model_type,
                model_revision=model_revision,
                task_type=task_type,
                num_labels=num_labels)

            adapters = args.adapters if key == 'ref' else args.reward_adapters
            model = prepare_adapter(args, model, adapters)
            if origin_key in {'ref', 'reward'}:
                # ref/reward models are frozen scorers.
                if self.args.sequence_parallel_size > 1:
                    from swift.trainers.sequence_parallel import sequence_parallel
                    if hasattr(model, 'model_meta'):
                        is_multimodal = model.model_meta.is_multimodal
                    else:
                        is_multimodal = model.model.model_meta.is_multimodal
                    sequence_parallel.prepare_model(model, processor, split_in_forward=is_multimodal)
                model.requires_grad_(False).eval()
            else:
                # The value model (PPO) is trainable and goes through tuner prep.
                model = self.prepare_model(args, model, task_type=task_type)
                logger.info(f'value_model: {model}')
                model_parameter_info = get_model_parameter_info(model)
                self.train_msg['value_model_parameter_info'] = model_parameter_info
                logger.info(f'value_model_parameter_info: {model_parameter_info}')

            HfConfigFactory.set_model_config_attr(model, 'use_cache', False)
            return model, processor

        # Handle ref and value models
        for key in ['ref', 'value']:
            setattr(self, f'{key}_model', None)
            if key == 'value' and args.rlhf_type != 'ppo':
                continue

            # The value model is initialized from the reward model's weights.
            model_key = 'reward' if key == 'value' else key
            result = prepare_single_model(model_key, key)
            if result is not None:
                model, _ = result
                setattr(self, f'{key}_model', model)

        # Handle reward model(s)
        self.reward_model = None
        if hasattr(args, 'reward_model') and args.reward_model is not None:
            reward_models = args.reward_model if isinstance(args.reward_model, list) else [args.reward_model]
            self.reward_model = []
            if args.rlhf_type == 'grpo':
                # GRPO needs one template per reward model for scoring prompts.
                self.reward_template = []

            for reward_model_path in reward_models:
                args.reward_model = reward_model_path  # Temporarily set for prepare_single_model
                result = prepare_single_model('reward')
                if result is not None:
                    model, processor = result
                    self.reward_model.append(model)

                    if args.rlhf_type == 'grpo':
                        reward_template = self.args.get_template(processor, processor.model_meta.template)
                        if reward_template.use_model:
                            reward_template.model = model
                        self.reward_template.append(reward_template)
            args.reward_model = reward_models  # Restore original value

        super()._prepare_model_tokenizer()

    def _prepare_template(self) -> None:
        """Switch the template into the encoding mode matching the RLHF algorithm."""
        args = self.args
        super()._prepare_template()
        model_mapping = {'kto': 'kto', 'ppo': 'pt', 'grpo': 'pt'}
        self.template.set_mode(model_mapping.get(args.rlhf_type, 'rlhf'))

        if args.rlhf_type == 'ppo':
            args.training_args.stop_token_id = self.template.template_meta.stop_token_id

    def _get_dataset(self):
        args = self.args
        train_dataset, val_dataset = super()._get_dataset()
        if args.rlhf_type == 'kto':
            # KTO needs desirable/undesirable pairing applied to the dataset.
            train_dataset, val_dataset = prepare_kto_dataset(args, train_dataset, val_dataset)
        return train_dataset, val_dataset

    def _get_trainer_kwargs(self):
        """Collect the extra models/templates the RLHF trainer constructor expects."""
        trainer_kwargs = {}
        for key in ['ref', 'reward', 'value']:
            key = f'{key}_model'
            model = getattr(self, key, None)
            # PPO requires the kwarg even when the model is None.
            if model or self.args.rlhf_type == 'ppo':
                trainer_kwargs[key] = model
        if hasattr(self, 'reward_template'):
            trainer_kwargs['reward_template'] = self.reward_template
        if self.args.rlhf_type == 'grpo':
            trainer_kwargs['reward_funcs'] = self.args.reward_funcs
            trainer_kwargs['vllm_client'] = self.args.vllm_client
        return trainer_kwargs
151
+
152
+
153
+ def rlhf_main(args: Union[List[str], RLHFArguments, None] = None):
154
+ return SwiftRLHF(args).main()
ms-swift/swift/llm/train/sft.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ import os
3
+ from functools import partial
4
+ from typing import List, Union
5
+
6
+ from datasets import Dataset as HfDataset
7
+
8
+ from swift.plugin import extra_callbacks, get_loss_func, get_metric
9
+ from swift.trainers import TrainerFactory
10
+ from swift.utils import (append_to_jsonl, get_logger, get_model_parameter_info, is_master, plot_images, stat_array,
11
+ use_torchacc)
12
+ from ..argument import TrainArguments
13
+ from ..base import SwiftPipeline
14
+ from ..dataset import (EncodePreprocessor, GetLengthPreprocessor, IterablePackingDataset, LazyLLMDataset,
15
+ PackingDataset, load_dataset)
16
+ from ..infer import prepare_generation_config
17
+ from ..model import HfConfigFactory, get_model_arch
18
+ from ..utils import deep_getattr, dynamic_gradient_checkpointing
19
+ from .tuner import TunerMixin
20
+
21
+ logger = get_logger()
22
+
23
+
24
class SwiftSft(SwiftPipeline, TunerMixin):
    # Argument dataclass used to parse CLI/dict input for this pipeline.
    args_class = TrainArguments
    args: args_class

    def __init__(self, args: Union[List[str], TrainArguments, None] = None) -> None:
        """Build the SFT pipeline: load model/tokenizer, template and callbacks."""
        super().__init__(args)
        # Training statistics collected during the run; dumped to logging.jsonl at the end.
        self.train_msg = {}
        self._prepare_model_tokenizer()
        self._prepare_template()
        self._prepare_callbacks()
34
+
35
    def _prepare_gradient_checkpointing(self):
        """Disable the KV cache and enable gradient checkpointing when requested."""
        args = self.args
        # use_cache is incompatible with gradient checkpointing during training.
        HfConfigFactory.set_model_config_attr(self.model, 'use_cache', False)
        if args.gradient_checkpointing:
            self.model.supports_gradient_checkpointing = True
            dynamic_gradient_checkpointing(self.model)
            self.model.enable_input_require_grads()
        model_meta = self.model.model_meta
        model_arch = get_model_arch(model_meta.model_arch)
        if model_meta.is_multimodal and model_arch:
            # Vision towers need input grads enabled separately; some towers do not
            # implement the hook, so NotImplementedError is deliberately tolerated.
            for vision_tower_name in model_arch.vision_tower:
                vision_tower = deep_getattr(self.model, vision_tower_name)
                if hasattr(vision_tower, 'enable_input_require_grads'):
                    try:
                        vision_tower.enable_input_require_grads()
                    except NotImplementedError:
                        pass
52
+
53
    def _prepare_generation_config(self):
        """Replace the model's generation config with one built from the request config."""
        args = self.args
        # Keep the original config so it can be restored/saved later.
        self.model.origin_generation_config = self.model.generation_config
        self.model.generation_config = prepare_generation_config(self.model.generation_config,
                                                                 args.get_request_config(), self.tokenizer)
        logger.info(f'model.generation_config: {self.model.generation_config}')
59
+
60
    def _prepare_model_tokenizer(self):
        """Load the model/processor and apply generation/checkpointing preparation."""
        args = self.args
        if args.sequence_parallel_size > 1:
            # Sequence parallelism must be initialized before the model is loaded.
            from swift.trainers.sequence_parallel import sequence_parallel
            sequence_parallel.init_sequence_parallel(args.sequence_parallel_size)
        self.model, self.processor = args.get_model_processor()

        if hasattr(self.model, 'hf_device_map'):
            logger.info(f'model.hf_device_map: {self.model.hf_device_map}')

        logger.info(f'model_info: {self.model.model_info}')

        self._prepare_generation_config()
        self._prepare_gradient_checkpointing()
74
+
75
+ def _prepare_template(self) -> None:
76
+ template = self.args.get_template(self.processor)
77
+ if self.args.task_type == 'causal_lm':
78
+ template.set_mode('train')
79
+ if template.use_model:
80
+ template.model = self.model
81
+ self.template = template
82
+
83
    def _get_dataset(self):
        """Load train/val datasets according to the dataset arguments."""
        # The random shuffling of the training set occurs in the dataloader of the trainer.
        args = self.args
        dataset_kwargs = args.get_dataset_kwargs()
        train_dataset, val_dataset = load_dataset(
            args.dataset, split_dataset_ratio=args.split_dataset_ratio, shuffle=args.dataset_shuffle, **dataset_kwargs)
        if len(args.val_dataset) > 0:
            # Loading val dataset; an explicit val dataset overrides any split
            # from the training set, so split_dataset_ratio must be 0.
            _, val_dataset = load_dataset(
                args.val_dataset, split_dataset_ratio=1.0, shuffle=args.val_dataset_shuffle, **dataset_kwargs)
            assert args.split_dataset_ratio == 0.
        logger.info(f'train_dataset: {train_dataset}')
        logger.info(f'val_dataset: {val_dataset}')

        return train_dataset, val_dataset
98
+
99
+ def _get_loss_func(self):
100
+ args = self.args
101
+ loss_type = args.loss_type
102
+ if loss_type is None and args.loss_scale != 'default':
103
+ loss_type = 'loss_scale'
104
+ return get_loss_func(loss_type)
105
+
106
+ def _get_data_collator(self):
107
+ args = self.args
108
+ template = self.template
109
+ padding_to = args.max_length if args.train_type == 'longlora' else None
110
+ return partial(template.data_collator, padding_to=padding_to)
111
+
112
+ @staticmethod
113
+ def _save_val_dataset(output_dir: str, val_dataset):
114
+ if is_master() and isinstance(val_dataset, HfDataset):
115
+ os.makedirs(output_dir, exist_ok=True)
116
+ val_dataset_path = os.path.join(output_dir, 'val_dataset.jsonl')
117
+ append_to_jsonl(val_dataset_path, val_dataset.to_list())
118
+ logger.info(f'The split dataset from the training set will be saved at: {val_dataset_path}.')
119
+
120
    def run(self):
        """End-to-end training: encode datasets, wrap the model with tuners, train."""
        args = self.args

        train_dataset, val_dataset = self._get_dataset()
        train_dataset, val_dataset = self._encode_dataset(train_dataset, val_dataset)

        if args.task_type == 'seq_cls':
            # problem_type may come from the model config when not set explicitly.
            args.problem_type = args.problem_type or getattr(self.model.config, 'problem_type', None)
            logger.info(f'args.problem_type: {args.problem_type}')
        args.save_args()

        data_collator = self._get_data_collator()
        # Some tuners require train_dataset and data_collator for preparation: LoRA-GA
        self.model = self.prepare_model(self.args, self.model, template=self.template, train_dataset=train_dataset)
        logger.info(f'model: {self.model}')
        model_parameter_info = get_model_parameter_info(self.model)
        self.train_msg['model_parameter_info'] = model_parameter_info
        logger.info(f'model_parameter_info: {model_parameter_info}')

        trainer_cls = TrainerFactory.get_trainer_cls(args)
        trainer = trainer_cls(
            model=self.model,
            args=self.args.training_args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            callbacks=self.callbacks,
            template=self.template,
            **self._get_trainer_kwargs(),
        )
        return self.train(trainer)
151
+
152
+ def _get_trainer_kwargs(self):
153
+ args = self.args
154
+ if args.metric is not None:
155
+ compute_metrics, preprocess_logits_for_metrics = get_metric(args.metric)
156
+ elif args.predict_with_generate:
157
+ compute_metrics, preprocess_logits_for_metrics = get_metric('nlg')
158
+ else:
159
+ compute_metrics, preprocess_logits_for_metrics = get_metric('acc')
160
+ compute_metrics = partial(
161
+ compute_metrics, acc_strategy=args.acc_strategy, is_encoder_decoder=self.template.is_encoder_decoder)
162
+ return {
163
+ 'compute_metrics': compute_metrics,
164
+ 'preprocess_logits_for_metrics': preprocess_logits_for_metrics,
165
+ 'compute_loss_func': self._get_loss_func()
166
+ }
167
+
168
    def _save_trainer_state(self, trainer):
        """Record checkpoint paths/metrics, plot curves and append to logging.jsonl.

        Returns the accumulated `self.train_msg` dict.
        """
        training_args = trainer.args
        state = trainer.state
        if hasattr(state, 'last_model_checkpoint'):
            if self.args.create_checkpoint_symlink:
                # Stable 'last'/'best' symlinks next to the versioned checkpoints.
                last_checkpoint = os.path.join(self.args.output_dir, 'last')
                best_checkpoint = os.path.join(self.args.output_dir, 'best')
                os.symlink(state.last_model_checkpoint, last_checkpoint)
                os.symlink(state.best_model_checkpoint, best_checkpoint)
                state.last_model_checkpoint = last_checkpoint
                state.best_model_checkpoint = best_checkpoint
        else:
            # NOTE(review): only last_model_checkpoint is set here; the log line below
            # assumes state.best_model_checkpoint always exists — confirm on trainer state.
            state.last_model_checkpoint = None
            logger.warning('No training was carried out, which may be due to the dataset being too small '
                           'or incorrect usage of resume_from_checkpoint.')
        logger.info(f'last_model_checkpoint: {state.last_model_checkpoint}')
        logger.info(f'best_model_checkpoint: {state.best_model_checkpoint}')

        # Visualization
        if is_master() and not use_torchacc():
            if 'tensorboard' in training_args.report_to:
                images_dir = os.path.join(training_args.output_dir, 'images')
                logger.info(f'images_dir: {images_dir}')
                plot_images(images_dir, training_args.logging_dir, ['train/loss'], 0.9)
            if training_args.push_to_hub:
                trainer.push_to_hub()

        self.train_msg.update({
            'last_model_checkpoint': state.last_model_checkpoint,
            'best_model_checkpoint': state.best_model_checkpoint,
            'best_metric': state.best_metric,
            'global_step': state.global_step,
            'log_history': state.log_history,
            'memory': trainer.max_memory,
        })
        if is_master():
            jsonl_path = os.path.join(training_args.output_dir, 'logging.jsonl')
            append_to_jsonl(jsonl_path, self.train_msg)
        return self.train_msg
207
+
208
    def train(self, trainer):
        """Run trainer.train and always persist the trainer state afterwards."""
        logging_path = os.path.join(trainer.args.output_dir, 'logging.jsonl')
        logger.info(f'The logging file will be saved in: {logging_path}')
        try:
            trainer.train(trainer.args.resume_from_checkpoint)
        finally:
            # Save checkpoint info/visualizations even when training raised.
            res = self._save_trainer_state(trainer)
        return res
216
+
217
    def _prepare_callbacks(self):
        """Assemble trainer callbacks: LISA, AdaLoRA adapter hook and plugin extras."""
        from .callback import DynamicLayerActivationCallback, TrainerAdapterCallback
        args = self.args
        callbacks = []
        if args.lisa_activated_layers > 0:
            assert args.train_type == 'full', 'LISA only supports full parameter training.'
            lisa_callback = DynamicLayerActivationCallback(
                n_layers=args.lisa_activated_layers,  # Number of layers to activate
                step_interval=args.lisa_step_interval,  # Step interval to update active layers
                model=self.model)
            lisa_callback.switch_active_layers()  # Make trainable parameters printing a correct value
            callbacks.append(lisa_callback)

        if args.is_adapter and args.train_type == 'adalora':
            callbacks.append(TrainerAdapterCallback(args))
        callbacks += extra_callbacks
        self.callbacks = callbacks
234
+
235
+ def _stat_dataset(self, dataset: HfDataset):
236
+ args = self.args
237
+ if isinstance(dataset, HfDataset):
238
+ dataset = GetLengthPreprocessor()(dataset, num_proc=args.dataset_num_proc)
239
+ length = dataset['length']
240
+ else:
241
+ length = []
242
+ for row in dataset:
243
+ length.append(max([len(row[k]) for k in row.keys() if k.endswith('input_ids')]))
244
+ _, stat_str = stat_array(length)
245
+ logger.info(f'Dataset Token Length: {stat_str}')
246
+ return stat_str
247
+
248
    def _encode_dataset(self, train_dataset, val_dataset):
        """Tokenize/pack the datasets; GRPO keeps raw samples (encoding happens in the trainer)."""
        template = self.template
        args = self.args
        # Megatron-style args expose 'save' instead of 'output_dir'.
        output_dir = getattr(args, 'output_dir', None) or getattr(args, 'save')
        self._save_val_dataset(output_dir, val_dataset)
        is_grpo = hasattr(args, 'rlhf_type') and args.rlhf_type == 'grpo'
        predict_with_generate = getattr(args, 'predict_with_generate', False)
        if not is_grpo:
            if args.packing:
                packing_dataset_cls = IterablePackingDataset if args.streaming else PackingDataset
                train_dataset = packing_dataset_cls(
                    self.template, train_dataset, num_proc=args.dataset_num_proc, strict=args.strict)
                if val_dataset is not None:
                    val_dataset = packing_dataset_cls(
                        self.template, val_dataset, num_proc=args.dataset_num_proc, strict=args.strict)
            elif args.lazy_tokenize:
                # Encode on access instead of up front.
                train_dataset = LazyLLMDataset(
                    train_dataset, template.encode, strict=args.strict, random_state=args.data_seed)
                if val_dataset is not None and not predict_with_generate:
                    val_dataset = LazyLLMDataset(
                        val_dataset, template.encode, strict=args.strict, random_state=args.data_seed)
            else:
                preprocessor = EncodePreprocessor(template=template)
                train_dataset = preprocessor(train_dataset, num_proc=args.dataset_num_proc, strict=args.strict)
                if val_dataset is not None and not predict_with_generate:
                    val_dataset = preprocessor(val_dataset, num_proc=args.dataset_num_proc, strict=args.strict)

            if is_master():
                # Print one encoded sample for sanity checking.
                inputs = train_dataset[0] if hasattr(train_dataset, '__len__') else next(iter(train_dataset))
                template.print_inputs(inputs, tokenizer_kwargs=inputs.pop('tokenizer_kwargs', None) or {})
            if isinstance(train_dataset, (HfDataset, PackingDataset)):
                self.train_msg['train_dataset'] = self._stat_dataset(train_dataset)
                if val_dataset is not None and not predict_with_generate:
                    self.train_msg['val_dataset'] = self._stat_dataset(val_dataset)

        return train_dataset, val_dataset
284
+
285
+
286
def sft_main(args: Union[List[str], TrainArguments, None] = None):
    """CLI/programmatic entry point for supervised fine-tuning."""
    pipeline = SwiftSft(args)
    return pipeline.main()
ms-swift/swift/llm/train/tuner.py ADDED
@@ -0,0 +1,424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ import inspect
3
+ import os
4
+ from typing import List, Union
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import transformers
9
+ from packaging import version
10
+ from transformers import TrainingArguments
11
+
12
+ from swift.llm import TrainArguments, deep_getattr, get_model_arch
13
+ from swift.plugin import Tuner, extra_tuners
14
+ from swift.tuners import Swift
15
+ from swift.utils import (activate_parameters, find_all_linears, find_embedding, find_norm, freeze_parameters,
16
+ get_logger, use_torchacc)
17
+
18
+ logger = get_logger()
19
+
20
+
21
def apply_liger(model_type: str):
    """Patch the transformers implementation of `model_type` with Liger kernels.

    Args:
        model_type: a `swift.llm.ModelType` value identifying the architecture.

    Raises:
        ValueError: if no Liger patch exists for the given model type.
    """
    from liger_kernel.transformers import (apply_liger_kernel_to_llama, apply_liger_kernel_to_mistral,
                                           apply_liger_kernel_to_mixtral, apply_liger_kernel_to_gemma,
                                           apply_liger_kernel_to_qwen2, apply_liger_kernel_to_qwen3,
                                           apply_liger_kernel_to_qwen2_vl, apply_liger_kernel_to_qwen2_5_vl,
                                           apply_liger_kernel_to_phi3, apply_liger_kernel_to_mllama)
    from swift.llm import ModelType
    # BUGFIX: single-element membership tests must use real tuples. The previous
    # `model_type in (ModelType.mistral)` tested membership against a plain string,
    # which performs *substring* matching and can silently match the wrong types.
    if model_type in (ModelType.llama, ModelType.llama3, ModelType.llama3_1, ModelType.llama3_2):
        apply_liger_kernel_to_llama()
    elif model_type in (ModelType.mistral, ):
        apply_liger_kernel_to_mistral()
    elif model_type in (ModelType.mixtral, ):
        apply_liger_kernel_to_mixtral()
    elif model_type in (ModelType.gemma, ModelType.gemma2):
        apply_liger_kernel_to_gemma()
    elif model_type in (ModelType.qwen2, ModelType.qwen2_5):
        apply_liger_kernel_to_qwen2()
    elif model_type in (ModelType.qwen3, ):
        apply_liger_kernel_to_qwen3()
    elif model_type in (ModelType.phi3, ):
        apply_liger_kernel_to_phi3()
    elif model_type in (ModelType.llama3_2_vision, ):
        apply_liger_kernel_to_mllama()
    elif model_type in (ModelType.qwen2_vl, ):
        apply_liger_kernel_to_qwen2_vl()
    elif model_type in (ModelType.qwen2_5_vl, ):
        apply_liger_kernel_to_qwen2_5_vl()
    else:
        raise ValueError(f'Unsupported liger model_type: {model_type}')
50
+
51
+
52
def get_multimodal_target_regex(
        model,
        *,
        freeze_llm: bool = False,
        freeze_vit: bool = True,
        freeze_aligner: bool = True,
        include_embedding: bool = False,
) -> str:
    """Build a regex matching the linear layers to adapt in a multimodal model.

    Only the non-frozen components (LLM / vision tower / aligner) contribute
    modules; aligner submodules nested inside a trainable vision tower are
    excluded via a negative lookahead so they follow the aligner freeze flag.
    """
    model_arch = get_model_arch(model.model_meta.model_arch)
    modules = []
    if not freeze_llm:
        modules += model_arch.language_model
    if not freeze_vit:
        modules += model_arch.vision_tower
    if not freeze_aligner:
        modules += model_arch.aligner
    assert len(modules) > 0, f'modules: {modules}'

    extra_layers = []
    if include_embedding:
        extra_layers.append(nn.Embedding)
    res = []
    for module in modules:
        rejected_modules = []
        if not freeze_vit:
            # Aligners living under this (trainable) module must not be matched here.
            for aligner in model_arch.aligner:
                if aligner.startswith(f'{module}.'):
                    rejected_modules.append(aligner)

        sub_module = deep_getattr(model, module)
        target_modules = find_all_linears(sub_module, model_arch, extra_layers)
        target_modules = [tm for tm in target_modules if tm]
        # Empty target list means "match the module prefix itself".
        target_pattern = rf'.*\.({"|".join(target_modules)})' if target_modules else ''
        rejected_pattern = rf'(?!({"|".join(rejected_modules)}))' if rejected_modules else ''
        res.append(rf'{rejected_pattern}{module}{target_pattern}')

    return rf'^({"|".join(res)})$'
89
+
90
+
91
def get_target_modules(args, model) -> Union[str, List[str]]:
    """Replace all-linear to actual modules"""
    model_meta = model.model_meta
    # A plain string is treated as a ready-made regex and passed through untouched.
    if isinstance(args.target_modules, str):
        return args.target_modules
    resolved = list(args.target_modules)
    if 'all-linear' in resolved:
        if model_meta.is_multimodal:
            # Multimodal models are expressed as a single regex instead of a list.
            return get_multimodal_target_regex(
                model,
                freeze_llm=args.freeze_llm,
                freeze_vit=args.freeze_vit,
                freeze_aligner=args.freeze_aligner,
                include_embedding='all-embedding' in resolved)
        resolved.remove('all-linear')
        resolved += find_all_linears(model)
    if 'all-embedding' in resolved:
        resolved.remove('all-embedding')
        resolved += find_embedding(model)
    return resolved
112
+
113
+
114
def get_modules_to_save(args, model, task_type=None):
    """Expand the 'all-embedding'/'all-norm' shorthands into concrete module names.

    Sequence classification (reward model) additionally saves the value head.
    """
    result = list(args.modules_to_save)
    if 'all-embedding' in args.modules_to_save:
        result.remove('all-embedding')
        result.extend(find_embedding(model))
    if 'all-norm' in args.modules_to_save:
        result.remove('all-norm')
        result.extend(find_norm(model))
    if task_type and task_type.lower() == 'seq_cls':  # reward_model
        result.append('v_head')
    return result
125
+
126
+
127
def get_vera_target_modules(model, config):
    """This function is only useful on the vera tuner.

    Vera requires every targeted linear layer to share one weight shape; when
    the matched linears differ, keep only those whose shape matches the layer
    selected by the first target containing 'v'.
    """
    targets = config.target_modules
    shape_by_name = {}
    for name, module in model.named_modules():
        # only Linear for now
        if isinstance(module, torch.nn.Linear) and any(t in name for t in targets):
            shape_by_name[name] = module.weight.shape
    if len(set(shape_by_name.values())) > 1:
        v_targets = [t for t in targets if 'v' in t]
        if not v_targets:
            raise ValueError('Please manually pass in `vera_target_modules`, do not use `all-linear`,'
                             'because Vera need all target linears to be the same size.')
        anchor = v_targets[0]
        anchor_shape = next(shape for name, shape in shape_by_name.items() if anchor in name)
        matching_names = [name for name, shape in shape_by_name.items() if shape == anchor_shape]
        config.target_modules = [t for t in targets if any(t in name for name in matching_names)]
    return config
145
+
146
+
147
def prepare_adapter(args: TrainArguments, model, *, template=None, train_dataset=None, task_type=None):
    """Wrap `model` with the tuner selected by `args.train_type`.

    template/train_dataset are only needed by tuners that inspect data during
    preparation (LoRA-GA, AdaLoRA step counting). Returns the wrapped model.
    """
    from swift.tuners import (AdaLoraConfig, AdapterConfig, BOFTConfig, LLaMAProConfig, LongLoRAModelType, LoraConfig,
                              LoRAConfig, ReftConfig, Swift, VeraConfig)
    task_type = (task_type or args.task_type).upper()
    target_modules = get_target_modules(args, model)
    modules_to_save = get_modules_to_save(args, model, task_type)
    # Shared kwargs for the LoRA family (swift/peft/unsloth backends, adalora).
    lora_kwargs = {
        'r': args.lora_rank,
        'target_modules': target_modules,
        'lora_alpha': args.lora_alpha,
        'lora_dropout': args.lora_dropout,
        'bias': args.lora_bias,
        'modules_to_save': modules_to_save,
        'use_rslora': args.use_rslora,
        'use_dora': args.use_dora,
        'lorap_lr_ratio': args.lorap_lr_ratio,
        'init_lora_weights': args.init_weights,
    }
    if args.train_type in ('lora', 'longlora'):
        if args.use_swift_lora:
            lora_config = LoRAConfig(lora_dtype=args.lora_dtype, **lora_kwargs)
            model = Swift.prepare_model(model, lora_config)
            logger.info(f'lora_config: {lora_config}')
        elif args.tuner_backend == 'peft':
            # peft's LoraConfig has no EMBEDDING task type.
            if task_type == 'EMBEDDING':
                task_type = None
            lora_config = LoraConfig(task_type=task_type, lora_dtype=args.lora_dtype, **lora_kwargs)
            if args.init_weights == 'lora-ga':
                try:
                    import lora_ga
                except ImportError as e:
                    error_message = """
                    Since 'LoRA-GA' is not implemented by PEFT, you will need to install it directly from GitHub.
                    Command: 'pip install git+https://github.com/lxline/LoRA-GA.git'.
                    """
                    logger.info(error_message)
                    raise RuntimeError(error_message) from e
                # LoRA-GA estimates initialization from gradients over a few batches.
                model = lora_ga.entrypoint.get_lora_ga_model(
                    model=model,
                    data_collator=template.data_collator,
                    dataset=train_dataset,
                    batch_size=args.lora_ga_batch_size,
                    num_iters=args.lora_ga_iters,
                    max_length=args.lora_ga_max_length,
                    direction=args.lora_ga_direction,
                    dtype=args.lora_dtype,
                    scale=args.lora_ga_scale,
                    stable_gamma=args.lora_ga_stable_gamma,
                )
            else:
                model = Swift.prepare_model(model, lora_config)
            logger.info(f'lora_config: {lora_config}')
        elif args.tuner_backend == 'unsloth':
            # On resume, unsloth prepares/loads the adapter outside this function.
            if args.resume_from_checkpoint is None:
                if args.model_meta.is_multimodal:
                    from unsloth import FastVisionModel as UnslothModel
                else:
                    from unsloth import FastLanguageModel as UnslothModel
                assert args.train_type == 'lora', 'Unsloth does not support LongLoRA'
                lora_kwargs.pop('lorap_lr_ratio')
                model = UnslothModel.get_peft_model(
                    model,
                    use_gradient_checkpointing='unsloth',
                    max_seq_length=args.max_length or 2048,  # 2048 is the default value of unsloth
                    **lora_kwargs,
                )
                logger.info(f'unsloth_config: {lora_kwargs}')
        if args.train_type == 'longlora':
            # LongLoRA only supports llama-family attention.
            assert LongLoRAModelType.LLAMA in args.model_type
            assert version.parse(transformers.__version__) >= version.parse('4.39.3')
            from swift.tuners.longlora.llama import replace_llama_attn
            replace_llama_attn(model)
            model.config.group_size_ratio = 0.25
    elif args.train_type == 'adalora':
        lora_kwargs.pop('lorap_lr_ratio', None)
        lora_kwargs['rank_pattern'] = None
        from swift.plugin.optimizer import calculate_max_steps
        adalora_config = AdaLoraConfig(
            task_type=task_type,
            **lora_kwargs,
            target_r=args.adalora_target_r,
            init_r=args.adalora_init_r,
            tinit=args.adalora_tinit,
            tfinal=args.adalora_tfinal,
            deltaT=args.adalora_deltaT,
            beta1=args.adalora_beta1,
            beta2=args.adalora_beta2,
            orth_reg_weight=args.adalora_orth_reg_weight,
            total_step=calculate_max_steps(args.training_args, train_dataset),
        )
        model = Swift.prepare_model(model, adalora_config)
        logger.info(f'adalora_config: {adalora_config}')
    elif args.train_type == 'llamapro':
        llamapro_config = LLaMAProConfig(
            model_type=model.model_meta.model_arch,
            num_new_blocks=args.llamapro_num_new_blocks,
            num_groups=args.llamapro_num_groups)
        model = Swift.prepare_model(model, llamapro_config)
        logger.info(f'llamapro_config: {llamapro_config}')
    elif args.train_type == 'adapter':
        model_arch = get_model_arch(model.model_meta.model_arch)
        mlp_key = model_arch.mlp
        # e.g. 'model.layers.{}.mlp' -> 'mlp': the per-layer MLP attribute name.
        mlp_key = mlp_key.split('.{}.')[1]
        adapter_config = AdapterConfig(
            dim=model.config.hidden_size,
            target_modules=[mlp_key],
            hidden_pos=0,
            adapter_length=args.adapter_length,
            act_layer=args.adapter_act)
        model = Swift.prepare_model(model, adapter_config)
        logger.info(f'adapter_config: {adapter_config}')
    elif args.train_type == 'vera':
        vera_config = VeraConfig(
            r=args.vera_rank,
            target_modules=target_modules,
            projection_prng_key=args.vera_projection_prng_key,
            vera_dropout=args.vera_dropout,
            d_initial=args.vera_d_initial,
            modules_to_save=args.modules_to_save,
        )
        # Vera needs all targeted linears to share one shape; prune mismatches.
        vera_config = get_vera_target_modules(model, vera_config)
        model = Swift.prepare_model(model, vera_config)
        logger.info(f'vera_config: {vera_config}')
    elif args.train_type == 'boft':
        boft_config = BOFTConfig(
            boft_block_size=args.boft_block_size,
            boft_block_num=args.boft_block_num,
            boft_n_butterfly_factor=args.boft_n_butterfly_factor,
            target_modules=target_modules,
            boft_dropout=args.boft_dropout,
            modules_to_save=args.modules_to_save,
        )
        model = Swift.prepare_model(model, boft_config)
        logger.info(f'boft_config: {boft_config}')
    elif args.train_type == 'fourierft':
        from peft import FourierFTConfig
        fourier_config = FourierFTConfig(
            target_modules=target_modules,
            modules_to_save=args.modules_to_save,
            n_frequency=args.fourier_n_frequency,
            scaling=args.fourier_scaling,
        )
        model = Swift.prepare_model(model, fourier_config)
        logger.info(f'fourier_config: {fourier_config}')
    elif args.train_type == 'reft':
        reft_config = ReftConfig(
            model_type=model.model_meta.model_arch,
            layer_key=args.reft_layer_key,
            r=args.reft_rank,
            layers=args.reft_layers,
            intervention_type=args.reft_intervention_type,
            args=args.reft_args,
        )
        logger.info(f'reft config: {reft_config}')
        model = Swift.prepare_model(model, {'reft': reft_config})
    elif args.train_type == 'bone':
        # NOTE(review): loose peft version requirement for BoneConfig — confirm minimum.
        from peft import BoneConfig
        bone_config = BoneConfig(
            target_modules=target_modules,
            r=args.reft_rank,
            init_weights=args.init_weights,
        )
        logger.info(f'bone config: {bone_config}')
        model = Swift.prepare_model(model, bone_config)
    return model
313
+
314
+
315
def torchacc_resume_from_checkpoint(args, model):
    """Load model weights from `args.resume_from_checkpoint` for torchacc training.

    Handles both single-file (pytorch_model.bin / model.safetensors) and
    sharded checkpoints; loading is non-strict.
    """
    import safetensors
    weights_file = os.path.join(args.resume_from_checkpoint, 'pytorch_model.bin')
    safe_weights_file = os.path.join(args.resume_from_checkpoint, 'model.safetensors')
    if os.path.isfile(weights_file) or os.path.isfile(safe_weights_file):
        if args.save_safetensors and os.path.isfile(safe_weights_file):
            state_dict = safetensors.torch.load_file(safe_weights_file, device='cpu')
        else:
            state_dict = torch.load(weights_file, map_location='cpu')
        # Non-strict load: adapters/heads may legitimately be missing.
        model.load_state_dict(state_dict, False)
        del state_dict
    else:
        from transformers.modeling_utils import load_sharded_checkpoint
        # We load the sharded checkpoint
        load_result = load_sharded_checkpoint(
            model, args.resume_from_checkpoint, strict=False, prefer_safe=args.save_safetensors)
        if len(load_result.missing_keys) != 0:
            # Missing keys that are deliberately excluded from saving are handled
            # by re-tying weights; anything else is reported.
            if model._keys_to_ignore_on_save is not None and set(load_result.missing_keys) == set(
                    model._keys_to_ignore_on_save):
                model.tie_weights()
            else:
                logger.warning(f'There were missing keys in the checkpoint model loaded: {load_result.missing_keys}.')
        if len(load_result.unexpected_keys) != 0:
            logger.warning(f'There were unexpected keys in the checkpoint model loaded: {load_result.unexpected_keys}.')
339
+
340
+
341
class TunerMixin:
    """Mixin providing model wrapping (adapters / full training / GaLore / SP)."""

    @classmethod
    def prepare_model(cls, args, model, *, template=None, train_dataset=None, task_type=None):
        """Freeze/wrap `model` according to `args.train_type` and return it."""
        # Only patch liger manually when transformers cannot do it itself.
        if args.use_liger_kernel and 'use_liger_kernel' not in inspect.signature(TrainingArguments).parameters:
            # Apply liger
            apply_liger(args.model_type)

        if args.is_adapter:
            if args.tuner_backend != 'unsloth' and args.train_type not in extra_tuners:
                # Fix the name of the layer in xcomposer that contains Plora.
                # Unsloth prepares and loads lora outside this function when
                # resume_from_checkpoint, so do not disable grad here
                model.requires_grad_(False)
            if args.resume_from_checkpoint:
                if args.train_type in extra_tuners:
                    tuner: Tuner = extra_tuners[args.train_type]
                else:
                    tuner = Swift
                kwargs = {}
                if use_torchacc():
                    kwargs = {'adapter_name': 'default'}
                model = tuner.from_pretrained(model, args.resume_from_checkpoint, is_trainable=True, **kwargs)
            else:
                if args.train_type in extra_tuners:
                    tuner: Tuner = extra_tuners[args.train_type]
                    model = tuner.prepare_model(args, model)
                else:
                    model = prepare_adapter(
                        args, model, template=template, train_dataset=train_dataset, task_type=task_type)
            # fix bug: Attempting to unscale FP16 gradients.
            # peft: https://github.com/huggingface/peft/issues/1249
            for p in model.parameters():
                if p.requires_grad and p.dtype == torch.float16:
                    logger.info_once('Convert trainable parameters from fp16 to fp32.')
                    p.data = p.data.to(dtype=torch.float32)
        elif args.train_type == 'full':
            model.train()
            model.requires_grad_(True)

            freeze_parameters(model, args.freeze_parameters_ratio, args.freeze_parameters,
                              args.freeze_parameters_regex)
            if len(args.trainable_parameters) > 0 or args.trainable_parameters_regex is not None:
                activate_parameters(model, args.trainable_parameters, args.trainable_parameters_regex)
            if use_torchacc() and args.resume_from_checkpoint:
                torchacc_resume_from_checkpoint(args, model)
        else:
            raise ValueError(f'args.train_type: {args.train_type}')

        if args.resume_only_model:
            # Weights were already restored above; don't let the trainer resume state.
            args.training_args.resume_from_checkpoint = None
        if args.use_galore:
            from swift.trainers.optimizers.galore import GaLoreConfig
            if args.galore_target_modules is None:
                args.galore_target_modules = find_all_linears(model)
            if args.galore_with_embedding:
                args.galore_target_modules += find_embedding(model)
            args.galore_config = GaLoreConfig(
                target_modules=args.galore_target_modules,
                rank=args.galore_rank,
                update_proj_gap=args.galore_update_proj_gap,
                galore_scale=args.galore_scale,
                proj_type=args.galore_proj_type,
                optim_per_parameter=args.galore_optim_per_parameter,
                quantize=args.galore_quantization,
                proj_quant=args.galore_proj_quant,
                proj_bits=args.galore_proj_bits,
                proj_group_size=args.galore_proj_group_size,
                cos_threshold=args.galore_cos_threshold,
                gamma_proj=args.galore_gamma_proj,
                queue_size=args.galore_queue_size,
            )
            args.training_args.galore_config = args.galore_config

        if args.sequence_parallel_size > 1:
            from swift.trainers.sequence_parallel import sequence_parallel
            if hasattr(model, 'model_meta'):
                is_multimodal = model.model_meta.is_multimodal
            else:
                is_multimodal = model.model.model_meta.is_multimodal
            # multimodal model must do split in basemodel's forward
            # or the media embedding may occur error
            sequence_parallel.prepare_model(model, template.tokenizer, split_in_forward=is_multimodal)

        return model
ms-swift/swift/megatron/argument/train_args.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ import os
3
+ from dataclasses import dataclass
4
+
5
+ import torch
6
+
7
+ from swift.llm import BaseArguments
8
+ from swift.llm.argument.base_args import to_abspath
9
+ from swift.utils import add_version_to_work_dir, get_logger, init_process_group, is_master
10
+ from ..model import get_megatron_model_meta
11
+ from .megatron_args import MegatronArguments
12
+
13
+ logger = get_logger()
14
+
15
+
16
@dataclass
class MegatronTrainArguments(MegatronArguments, BaseArguments):
    """Training arguments for the Megatron backend.

    Combines the Megatron-specific options (``MegatronArguments``) with the
    generic swift options (``BaseArguments``).
    """
    # If True, the save directory is passed through add_version_to_work_dir
    # so repeated runs get distinct output paths.
    add_version: bool = True
    # dataset
    lazy_tokenize: bool = False
    packing: bool = False

    def init_model_args(self, config):
        """Fill still-unset Megatron fields from the HF model config, then finalize.

        Only attributes that are currently ``None`` are taken from the converted
        HF config, so explicit user-provided values take precedence.
        """
        self.megatron_model_meta = get_megatron_model_meta(self.model_type)
        kwargs = self.megatron_model_meta.convert_hf_config(config)
        for k, v in kwargs.items():
            if getattr(self, k) is None:
                setattr(self, k, v)
        # Run the Megatron post-init explicitly, now that every field is resolved.
        MegatronArguments.__post_init__(self)
        self.extra_args = self.parse_to_megatron()

    def _init_save(self):
        """Resolve the save directory and create it (on the master rank only)."""
        # The process group must be initialized before ranks resolve the save path.
        init_process_group()
        if self.save is None:
            self.save = f'megatron_output/{self.model_suffix}'
        self.save = to_abspath(self.save)
        if self.add_version:
            self.save = add_version_to_work_dir(self.save)
        logger.info(f'args.save: {self.save}')
        if is_master():
            os.makedirs(self.save, exist_ok=True)

    def __post_init__(self):
        # NOTE: statement order matters here — BaseArguments.__post_init__ is
        # called explicitly between the field fix-ups and _init_save().
        self.sequence_parallel_size = self.context_parallel_size
        self.load = to_abspath(self.load, check_path_exist=True)
        BaseArguments.__post_init__(self)
        self._init_save()
        self.seq_length = self.seq_length or self.max_length
        if self.streaming:
            # Streaming datasets feed Megatron through an external dataloader.
            self.dataloader_type = 'external'
            if self.num_workers > 1:
                self.num_workers = 1
                logger.info('Using streaming dataset, setting args.num_workers to 1.')
ms-swift/swift/megatron/model/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ from . import gpt
3
+ from .constant import MegatronModelType
4
+ from .register import MegatronModelMeta, get_megatron_model_meta, register_megatron_model
ms-swift/swift/megatron/model/config.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ from typing import Any, Dict
3
+
4
+ from swift.utils import get_logger
5
+
6
+ logger = get_logger()
7
# Mapping from Megatron argument names to candidate HF config attribute names.
# For each Megatron key, the first HF attribute present on the config wins.
config_mapping = {
    'num_layers': ['num_hidden_layers'],
    'hidden_size': ['hidden_size'],
    'ffn_hidden_size': ['intermediate_size'],
    'num_attention_heads': ['num_attention_heads'],
    'num_query_groups': ['num_key_value_heads'],
    'max_position_embeddings': ['max_position_embeddings'],
    'norm_epsilon': ['rms_norm_eps'],
    'rotary_base': ['rope_theta'],
    'padded_vocab_size': ['vocab_size'],
    'attention_dropout': ['attention_dropout'],
    'untie_embeddings_and_output_weights': ['tie_word_embeddings'],
    'swiglu': ['hidden_act'],
    'add_qkv_bias': ['attention_bias'],
    'disable_bias_linear': ['mlp_bias'],
    'kv_channels': ['head_dim'],
    'model_type': ['model_type'],
    # moe
    'moe_ffn_hidden_size': ['moe_intermediate_size'],
    'moe_shared_expert_intermediate_size': ['shared_expert_intermediate_size'],
    'moe_router_topk': ['num_experts_per_tok'],
    'num_experts': ['num_experts'],
    'moe_router_pre_softmax': ['norm_topk_prob'],
    'moe_aux_loss_coeff': ['router_aux_loss_coef'],
}


def convert_hf_config(config) -> Dict[str, Any]:
    """Translate a HF ``PretrainedConfig`` into Megatron argument kwargs.

    Flags with inverted semantics (``tie_word_embeddings``, ``mlp_bias``,
    ``norm_topk_prob``) are negated; ``hidden_act == 'silu'`` maps to
    ``swiglu=True`` (any other activation leaves ``swiglu`` unset).
    ``rope_scaling`` is forwarded as a dict; a bare-int value (legacy llama
    configs) is normalized into a linear-scaling dict.
    """
    megatron_config = {}
    for k, hf_keys in config_mapping.items():
        for hf_k in hf_keys:
            if hasattr(config, hf_k):
                hf_v = getattr(config, hf_k)
                if k == 'rotary_base':
                    megatron_config[k] = int(hf_v)
                elif k in {'untie_embeddings_and_output_weights', 'disable_bias_linear', 'moe_router_pre_softmax'}:
                    megatron_config[k] = not hf_v
                elif k == 'swiglu':
                    if hf_v == 'silu':
                        megatron_config[k] = True
                else:
                    megatron_config[k] = hf_v
                break
    # compat llama3
    if getattr(config, 'rope_scaling', None) is not None:
        if isinstance(config.rope_scaling, int):
            # Bugfix: the original line ended with a trailing comma, which
            # stored a 1-tuple containing the dict instead of the dict itself.
            # Provide both the legacy 'type' key and the modern 'rope_type' key
            # (the latter is what `update_rope_inv_freq` dispatches on).
            megatron_config['rope_scaling'] = {
                'factor': config.rope_scaling,
                'type': 'linear',
                'rope_type': 'linear',
            }
        elif isinstance(config.rope_scaling, dict):
            megatron_config['rope_scaling'] = config.rope_scaling
    logger.info(f'megatron_config: {megatron_config}')
    return megatron_config
ms-swift/swift/megatron/model/constant.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
class MegatronModelType:
    """Namespace of supported Megatron model-architecture identifiers."""
    gpt = 'gpt'
ms-swift/swift/megatron/model/gpt/__init__.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ from swift.llm import ModelType
3
+ from ..constant import MegatronModelType
4
+ from ..register import MegatronModelMeta, register_megatron_model
5
+ from .config import convert_gpt_hf_config
6
+ from .hf2mcore import convert_hf2mcore
7
+ from .mcore2hf import convert_mcore2hf
8
+ from .model import model_provider
9
+
10
# Register the GPT-style (decoder-only) architectures supported by the
# Megatron backend, together with their model builder and config/weight
# converters.
register_megatron_model(
    MegatronModelMeta(MegatronModelType.gpt, [
        ModelType.qwen2,
        ModelType.qwen2_5,
        ModelType.qwq,
        ModelType.qwq_preview,
        ModelType.qwen2_5_math,
        ModelType.llama,
        ModelType.llama3,
        ModelType.llama3_1,
        ModelType.llama3_2,
        ModelType.longwriter_llama3_1,
        ModelType.codefuse_codellama,
        ModelType.marco_o1,
        ModelType.deepseek,
        ModelType.deepseek_r1_distill,
        ModelType.yi,
        ModelType.yi_coder,
        ModelType.sus,
        ModelType.skywork_o1,
        ModelType.openbuddy_llama,
        ModelType.openbuddy_llama3,
        ModelType.megrez,
        ModelType.reflection,
        ModelType.numina,
        ModelType.ziya,
        ModelType.mengzi3,
        ModelType.qwen3,
        ModelType.qwen2_moe,
        ModelType.qwen3_moe,
    ], model_provider, convert_gpt_hf_config, convert_mcore2hf, convert_hf2mcore))
ms-swift/swift/megatron/model/gpt/config.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict
2
+
3
+ from ..config import convert_hf_config
4
+
5
+
6
def convert_gpt_hf_config(config) -> Dict[str, Any]:
    """Convert a HF config to Megatron kwargs, with GPT-family specific tweaks."""
    megatron_kwargs = convert_hf_config(config)
    model_type = megatron_kwargs.get('model_type')
    # qwen3 family applies layernorm to the query/key projections.
    if model_type in ('qwen3', 'qwen3_moe'):
        megatron_kwargs['qk_layernorm'] = True
    # MoE variants: drop the dense ffn size (the moe_ffn_hidden_size entry is
    # presumably the one that applies here — TODO confirm).
    if model_type in ('qwen2_moe', 'qwen3_moe'):
        megatron_kwargs.pop('ffn_hidden_size', None)
    return megatron_kwargs
ms-swift/swift/megatron/model/gpt/model.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ from megatron.core.models.gpt import GPTModel
3
+ from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
4
+ from megatron.training import get_args
5
+ from megatron.training.arguments import core_transformer_config_from_args
6
+
7
+ from ..rope import update_rope_inv_freq
8
+
9
+
10
def model_provider(pre_process=True, post_process=True):
    """Build a Megatron-core ``GPTModel`` from the current global args.

    Args:
        pre_process: forwarded to ``GPTModel`` (first pipeline stage flag).
        post_process: forwarded to ``GPTModel`` (last pipeline stage flag).

    Returns:
        The constructed ``GPTModel``.
    """
    args = get_args()
    config = core_transformer_config_from_args(args)
    # Allow batches whose sequence length differs from the configured maximum.
    config.variable_seq_lengths = True
    transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(args.num_experts, args.moe_grouped_gemm,
                                                                        args.qk_layernorm, args.multi_latent_attention)
    if args.num_experts and args.moe_shared_expert_intermediate_size:
        # qwen2_moe/qwen3_moe: enable the gate on the shared experts.
        transformer_layer_spec.submodules.mlp.submodules.shared_experts.params = {'gate': True}
    model = GPTModel(
        config=config,
        transformer_layer_spec=transformer_layer_spec,
        vocab_size=args.padded_vocab_size,
        max_sequence_length=args.max_position_embeddings,
        pre_process=pre_process,
        post_process=post_process,
        fp16_lm_cross_entropy=args.fp16_lm_cross_entropy,
        parallel_output=True,
        share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights,
        position_embedding_type=args.position_embedding_type,
        rotary_percent=args.rotary_percent,
        rotary_base=args.rotary_base,
        rope_scaling=args.use_rope_scaling,
        rope_scaling_factor=args.rope_scaling_factor,
        seq_len_interpolation_factor=args.rotary_seq_len_interpolation_factor)
    # NOTE(review): GPTModel receives `args.use_rope_scaling` above, while the
    # manual inv_freq patch below keys off `args.rope_scaling` (a dict) —
    # presumably two distinct mechanisms; confirm they cannot conflict.
    if args.rope_scaling:
        update_rope_inv_freq(model.rotary_pos_emb.inv_freq, args.rope_scaling)
    return model
ms-swift/swift/megatron/model/register.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ from dataclasses import dataclass, field
3
+ from typing import Any, Callable, Dict, List, Optional
4
+
5
+ import torch.nn as nn
6
+ from transformers import PretrainedConfig
7
+
8
+ from swift.llm import MODEL_MAPPING, ModelGroup
9
+
10
+ MEGATRON_MODEL_MAPPING = {}
11
+
12
+
13
@dataclass
class MegatronModelMeta:
    """Registration record tying a Megatron architecture to its HF model types."""
    # Architecture identifier, e.g. MegatronModelType.gpt.
    megatron_model_type: str
    # HF `model_type` strings handled by this architecture.
    model_types: List[str]

    # Builds the Megatron module (reads the global Megatron args).
    model_provider: Callable[[], nn.Module]
    # Maps a HF PretrainedConfig to Megatron argument kwargs.
    convert_hf_config: Callable[[PretrainedConfig], Dict[str, Any]]
    # In-place weight converters between the (hf_model, mg_model) pair.
    convert_mcore2hf: Callable[[nn.Module, nn.Module], None]
    convert_hf2mcore: Callable[[nn.Module, nn.Module], None]
22
+
23
+
24
def register_megatron_model(megatron_model_meta: MegatronModelMeta, *, exist_ok: bool = False):
    """Register ``megatron_model_meta`` under its ``megatron_model_type``.

    Also marks every covered HF model type as Megatron-capable in MODEL_MAPPING.

    Raises:
        ValueError: if the megatron_model_type is already registered and
            ``exist_ok`` is False.
    """
    megatron_model_type = megatron_model_meta.megatron_model_type
    # Validate first: the original checked for duplicates only after flipping
    # `support_megatron` on MODEL_MAPPING entries, leaving global state
    # half-mutated when the registration was rejected.
    if not exist_ok and megatron_model_type in MEGATRON_MODEL_MAPPING:
        raise ValueError(f'The `{megatron_model_type}` has already been registered in the MEGATRON_MODEL_MAPPING.')
    for model_type in megatron_model_meta.model_types:
        model_meta = MODEL_MAPPING[model_type]
        model_meta.support_megatron = True

    MEGATRON_MODEL_MAPPING[megatron_model_type] = megatron_model_meta
33
+
34
+
35
# Lazily-built reverse index: HF model_type -> megatron_model_type.
_MODEL_META_MAPPING = None


def get_megatron_model_meta(model_type: str) -> Optional[MegatronModelMeta]:
    """Look up the MegatronModelMeta registered for a HF ``model_type``.

    Returns None when the model type has no Megatron support. The reverse
    index is built once, on the first call.
    """
    global _MODEL_META_MAPPING
    if _MODEL_META_MAPPING is None:
        _MODEL_META_MAPPING = {
            hf_model_type: mg_model_type
            for mg_model_type, meta in MEGATRON_MODEL_MAPPING.items()
            for hf_model_type in meta.model_types
        }
    mg_model_type = _MODEL_META_MAPPING.get(model_type)
    if mg_model_type is None:
        return None
    return MEGATRON_MODEL_MAPPING[mg_model_type]
ms-swift/swift/megatron/model/rope.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Any, Dict
3
+
4
+ import torch
5
+
6
+
7
+ def _to_llama3_rope(inv_freq: torch.Tensor, rope_scaling: Dict[str, Any]):
8
+ # copy from transformers
9
+ factor = rope_scaling['factor'] # `8` in the original implementation
10
+ low_freq_factor = rope_scaling['low_freq_factor'] # `1` in the original implementation
11
+ high_freq_factor = rope_scaling['high_freq_factor'] # `4` in the original implementation
12
+ old_context_len = rope_scaling['original_max_position_embeddings'] # `8192` in the original implementation
13
+
14
+ low_freq_wavelen = old_context_len / low_freq_factor
15
+ high_freq_wavelen = old_context_len / high_freq_factor
16
+
17
+ wavelen = 2 * math.pi / inv_freq
18
+ # wavelen < high_freq_wavelen: do nothing
19
+ # wavelen > low_freq_wavelen: divide by factor
20
+ inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
21
+ # otherwise: interpolate between the two, using a smooth factor
22
+ smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
23
+ smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
24
+ is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
25
+ inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
26
+ return inv_freq_llama
27
+
28
+
29
+ def _to_linear_rope(inv_freq: torch.Tensor, rope_scaling: Dict[str, Any]):
30
+ factor = rope_scaling['factor']
31
+ inv_freq /= factor
32
+ return inv_freq
33
+
34
+
35
# Dispatch table from `rope_type` to its inv_freq conversion function.
ROPE_MAPPING = {'llama3': _to_llama3_rope, 'linear': _to_linear_rope}


def update_rope_inv_freq(inv_freq: torch.Tensor, rope_scaling: Dict[str, Any]) -> None:
    """Rescale ``inv_freq`` in place according to ``rope_scaling['rope_type']``."""
    convert_fn = ROPE_MAPPING[rope_scaling['rope_type']]
    inv_freq.data.copy_(convert_fn(inv_freq, rope_scaling))
ms-swift/swift/megatron/train/patcher.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ import os
3
+ from contextlib import contextmanager
4
+ from functools import wraps
5
+
6
+ import torch
7
+ from megatron.training import get_args, global_vars, initialize, training
8
+
9
+ from swift.utils import JsonlWriter, is_master
10
+
11
+
12
@contextmanager
def patch_training_log():
    """Context manager that tees Megatron training metrics into logging.jsonl.

    While active, wraps ``megatron.training.training.training_log`` so that on
    every ``log_interval`` iteration the master rank appends a JSON line with
    the losses, grad/params norms, learning rate, consumed samples and step
    progress to ``<args.save>/logging.jsonl``. The original function is always
    restored on exit.
    """
    jsonl_writer = None
    origin_training_log = training.training_log

    @wraps(origin_training_log)
    def training_log(loss_dict, total_loss_dict, learning_rate, decoupled_learning_rate, iteration, loss_scale,
                     report_memory_flag, skipped_iter, grad_norm, params_norm, num_zeros_in_grad, *_args, **kwargs):
        nonlocal jsonl_writer
        args = get_args()
        if is_master() and iteration % args.log_interval == 0:
            logging_path = os.path.join(args.save, 'logging.jsonl')
            logs = {}
            for k, v in loss_dict.items():
                if isinstance(v, torch.Tensor):
                    v = v.item()
                logs[k] = round(v, 8)
            # Explicit mapping instead of the original's fragile `locals()[k]`
            # lookup, which broke silently if a parameter was ever renamed and
            # relied on CPython's locals() snapshot semantics.
            for k, v in {'grad_norm': grad_norm, 'params_norm': params_norm, 'learning_rate': learning_rate}.items():
                if v is not None:
                    logs[k] = round(v, 8)
            logs['consumed_samples'] = args.consumed_train_samples
            logs['global_step/max_steps'] = f'{iteration}/{args.train_iters}'
            if jsonl_writer is None:
                # Created lazily so `args.save` is resolved before first use.
                jsonl_writer = JsonlWriter(logging_path, enable_async=True)
            jsonl_writer.append(logs)
        return origin_training_log(loss_dict, total_loss_dict, learning_rate, decoupled_learning_rate, iteration,
                                   loss_scale, report_memory_flag, skipped_iter, grad_norm, params_norm,
                                   num_zeros_in_grad, *_args, **kwargs)

    training.training_log = training_log
    try:
        yield
    finally:
        training.training_log = origin_training_log
+
48
+
49
+ @contextmanager
50
+ def patch_megatron_data_collator(data_collator):
51
+ origin_build_pretraining_data_loader = training.build_pretraining_data_loader
52
+
53
+ def build_pretraining_data_loader(*_args, **kwargs):
54
+ args = get_args()
55
+ res = origin_build_pretraining_data_loader(*_args, **kwargs)
56
+ if res is not None and args.dataloader_type != 'external':
57
+ res.collate_fn = data_collator
58
+ return res
59
+
60
+ training.build_pretraining_data_loader = build_pretraining_data_loader
61
+ try:
62
+ yield
63
+ finally:
64
+ training.build_pretraining_data_loader = origin_build_pretraining_data_loader
ms-swift/swift/megatron/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+
3
+ from .convert import convert_hf2mcore, convert_mcore2hf
4
+ from .patcher import patch_megatron_tokenizer
ms-swift/swift/megatron/utils/convert.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+
3
+ import math
4
+
5
+ import torch
6
+ from megatron.training.checkpointing import load_checkpoint
7
+ from megatron.training.checkpointing import save_checkpoint as mg_save_checkpoint
8
+ from megatron.training.initialize import initialize_megatron
9
+ from megatron.training.utils import get_ltor_masks_and_position_ids
10
+
11
+ from swift.llm import ExportArguments, get_model_tokenizer, get_template, save_checkpoint
12
+ from swift.utils import get_logger, get_n_params_grads
13
+ from ..argument import MegatronArguments
14
+ from ..model import get_megatron_model_meta
15
+ from .patcher import patch_megatron_tokenizer, patch_torch_dist_shard
16
+
17
+ logger = get_logger()
18
+
19
+
20
def test_convert_precision(hf_model, mg_model, processor):
    """Sanity-check a HF<->Megatron weight conversion by comparing logits.

    Runs both models in float32 on CUDA for a single prompt and asserts that
    the mean absolute logit difference is < 0.1 and that the argmax tokens
    match exactly. Both models are restored to their original dtype and moved
    back to CPU afterwards.
    """
    torch_dtype = hf_model.dtype
    template = get_template(hf_model.model_meta.template, processor)
    input_ids = template.encode({'messages': [{'role': 'user', 'content': 'who are you?'}]})['input_ids']
    input_ids = torch.tensor(input_ids)[None].to('cuda')
    # Forward in float32 to keep dtype noise out of the comparison.
    hf_model.to('cuda')
    hf_model.to(torch.float32)
    with torch.inference_mode():
        hf_logits = hf_model(input_ids).logits
    hf_model.to(torch_dtype)
    hf_model.to('cpu')

    attention_mask, _, position_ids = get_ltor_masks_and_position_ids(input_ids, -100, True, True, True)
    mg_model.to('cuda')
    mg_model.to(torch.float32)
    with torch.inference_mode():
        mg_logits = mg_model(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids)
    mg_model.to(torch_dtype)
    mg_model.to('cpu')

    mean_diff = (mg_logits - hf_logits).abs().mean().item()
    max_diff = (mg_logits - hf_logits).abs().max().item()
    print(f'mean_diff: {mean_diff}, max_diff: {max_diff}')
    hf_tokens = hf_logits.argmax(-1)
    mg_tokens = mg_logits.argmax(-1)
    print(f'hf_tokens: {hf_tokens[0].tolist()}\nmg_tokens: {mg_tokens[0].tolist()}')
    assert mean_diff < 0.1
    assert (hf_tokens == mg_tokens).all()
48
+
49
+
50
# Megatron argument overrides used only during weight conversion:
# CPU initialization, no optimizer/rng state on save or load, and all
# fused kernels disabled.
convert_kwargs = {
    'use_cpu_initialization': True,
    'no_save_optim': True,
    'no_save_rng': True,
    'no_load_optim': True,
    'no_load_rng': True,
    'no_masked_softmax_fusion': True,
    'no_bias_dropout_fusion': True,
    'no_bias_swiglu_fusion': True,
    'no_rope_fusion': True
}
61
+
62
+
63
def convert_hf2mcore(args: ExportArguments) -> None:
    """Convert a HF checkpoint into a Megatron-core checkpoint at ``args.output_dir``.

    Loads the HF model, initializes Megatron with a config derived from the HF
    config, copies the weights across, optionally verifies logit parity, and
    saves the Megatron checkpoint.
    """
    kwargs = args.get_model_kwargs()
    hf_model, processor = get_model_tokenizer(**kwargs)
    if args.thread_count is None:
        # Approximate checkpoint size in GB; use one save thread per ~10GB, at least 2.
        checkpoint_size = sum(get_n_params_grads(hf_model)[0]) * torch.finfo(args.torch_dtype).bits // 8e9
        args.thread_count = max(math.ceil(checkpoint_size / 10), 2)  # 10GB
    patch_torch_dist_shard(args.thread_count)

    megatron_model_meta = get_megatron_model_meta(args.model_type)
    assert megatron_model_meta is not None, f'Model: {args.model} is not supported.'
    kwargs = megatron_model_meta.convert_hf_config(processor.model_info.config)
    megatron_args = MegatronArguments(**kwargs, **convert_kwargs, save=args.output_dir, torch_dtype=args.torch_dtype)
    patch_megatron_tokenizer(processor)
    extra_args = megatron_args.parse_to_megatron()
    initialize_megatron(args_defaults=extra_args)

    mg_model = megatron_model_meta.model_provider()
    logger.info('Megatron model created successfully.')
    megatron_model_meta.convert_hf2mcore(hf_model, mg_model)
    if args.test_convert_precision:
        test_convert_precision(hf_model, mg_model, processor)
    logger.info('Successfully transferred HF model weights to MG model.')
    mg_save_checkpoint(1, [mg_model], None, None, 0)
    args.save_args()
    logger.info(f'Successfully saved Megatron model weights in `{args.output_dir}`.')
88
+
89
+
90
def convert_mcore2hf(args: ExportArguments) -> None:
    """Convert a Megatron-core checkpoint back into HF format at ``args.output_dir``.

    Loads the HF skeleton model and the Megatron checkpoint from
    ``args.mcore_model``, copies the weights into the HF model, optionally
    verifies logit parity, and saves the HF checkpoint.
    """
    kwargs = args.get_model_kwargs()
    hf_model, processor = get_model_tokenizer(**kwargs)
    if args.thread_count is None:
        # Approximate checkpoint size in GB; use one save thread per ~10GB, at least 2.
        checkpoint_size = sum(get_n_params_grads(hf_model)[0]) * torch.finfo(args.torch_dtype).bits // 8e9
        args.thread_count = max(math.ceil(checkpoint_size / 10), 2)  # 10GB
    patch_torch_dist_shard(args.thread_count)

    megatron_model_meta = get_megatron_model_meta(args.model_type)
    assert megatron_model_meta is not None, f'Model: {args.model} is not supported.'
    kwargs = megatron_model_meta.convert_hf_config(processor.model_info.config)
    megatron_args = MegatronArguments(**kwargs, **convert_kwargs, load=args.mcore_model, torch_dtype=args.torch_dtype)
    patch_megatron_tokenizer(processor)
    extra_args = megatron_args.parse_to_megatron()
    initialize_megatron(args_defaults=extra_args)

    mg_model = megatron_model_meta.model_provider()
    load_checkpoint([mg_model], None, None, strict=True)
    logger.info('Megatron model created successfully.')
    megatron_model_meta.convert_mcore2hf(hf_model, mg_model)
    if args.test_convert_precision:
        test_convert_precision(hf_model, mg_model, processor)
    logger.info('Successfully transferred MG model weights to HF model.')
    save_checkpoint(
        hf_model,
        processor,
        args.output_dir,
        safe_serialization=args.safe_serialization,
        model_dirs=[args.mcore_model, args.model_dir],
        max_shard_size=args.max_shard_size,
        additional_saved_files=hf_model.model_meta.additional_saved_files)
    args.save_args()
    logger.info(f'Successfully saved HF model weights in `{args.output_dir}`.')
ms-swift/swift/megatron/utils/patcher.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ from megatron.core.dist_checkpointing.strategies.torch import TorchDistSaveShardedStrategy
3
+ from megatron.training import get_args, global_vars, initialize, training
4
+
5
+ from swift.utils import get_logger
6
+
7
+ logger = get_logger()
8
+
9
+
10
def patch_megatron_tokenizer(tokenizer):
    """Replace Megatron's tokenizer factory so it returns ``tokenizer`` as-is."""

    def _patched_build_tokenizer(args):
        # Record how much the padded vocab exceeds the real tokenizer vocab.
        args.extra_vocab_size = args.padded_vocab_size - tokenizer.vocab_size
        return tokenizer

    global_vars.build_tokenizer = _patched_build_tokenizer
17
+
18
+
19
def patch_torch_dist_shard(thread_count):
    """Force every ``TorchDistSaveShardedStrategy`` to use ``thread_count`` threads."""
    origin_init = TorchDistSaveShardedStrategy.__init__

    def patched_init(*args, **kwargs):
        # Override whatever thread_count the caller supplied via kwargs.
        kwargs['thread_count'] = thread_count
        return origin_init(*args, **kwargs)

    TorchDistSaveShardedStrategy.__init__ = patched_init
ms-swift/swift/plugin/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.32 kB). View file
 
ms-swift/swift/plugin/__pycache__/callback.cpython-310.pyc ADDED
Binary file (1.31 kB). View file
 
ms-swift/swift/plugin/__pycache__/metric.cpython-310.pyc ADDED
Binary file (6.73 kB). View file