File size: 11,379 Bytes
e791fa3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 |
import json
import os
import random
def get_prompt_for_file(filename):
    """Pick the task prompt whose marker substring appears in *filename*.

    Markers are tested in priority order; returns the matching module-level
    prompt string, or ``None`` when no marker matches.
    """
    # Lambdas defer the global lookup until a marker actually matches,
    # so only the prompt for the matched task needs to exist.
    dispatch = (
        ('overlapgap', lambda: overlap_prompt),
        ('silencegap', lambda: silence_prompt),
        ('speaker', lambda: speaker_prompt),
        ('transcription', lambda: transcript_prompt),
    )
    for marker, resolve in dispatch:
        if marker in filename:
            return resolve()
    return None
# Destination for the generated SFT dataset (JSON Lines, one sample per line).
output_path = "/root/autodl-tmp/ms-swift/dataset_newCOTSFT1.jsonl"

# Prompt for rating the second sentence of a two-sentence dialogue clip
# (score 1/3/5) on text relevance plus paralinguistic appropriateness.
# NOTE(review): www is not referenced by the processing loop in this view
# (prompt_template is used instead) -- presumably kept for switching
# experiments; confirm before deleting.
# Fix: removed the dead `www = "hello"` assignment that was immediately
# overwritten, and the stale commented-out single-file loader.
www = (
    "# Dialogue Response Evaluation\n\n"
    "**IMPORTANT:** Evaluation must include`<score>` rating.\n\n"
    "Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n"
    "**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n"
    "## Scoring Criteria\n\n"
    "**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n"
    "**3 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n"
    "**5 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n"
    "## Evaluation Requirements\n\n"
    "Response **MUST** follow this format:\n\n"
    "<score>X</score> (**X is 1, 3, or 5**)\n\n")
# www = (
# "# Interactional Dialogue Evaluation\n\n"
# "**IMPORTANT**: Evaluation must include `<response think>` and `<fluency think>` analysis and `<overall score>` rating.\n"
# "Listen to a two-person interactional dialogue speech (Dual-channel audio, with each channel representing one speaker), labeled as speakers A and B. Evaluate the quality of the interaction, focusing on:\n"
# "**Response Relevance:** \n"
# "**logical consistency, topic coherence**\n"
# "**Interactional Fluency:**\n"
# "**Strictly detect dual-tracked vocal overlap >3s (cross-channel analysis)**\n"
# "**Pauses >5s between turns (must evaluate) \n\n**"
# "**Note**: Small pauses and brief overlaps in audio are acceptable, while prolonged pauses and overlapping audio are harmful. You should consider Response Relevance and Interactional Fluency separately, and provide the corresponding thinking process.\n\n"
# "## Scoring Criteria\n"
# "Assign a single holistic score based on the combined evaluation:\n"
# "`1` (Poor): Significant issues in either **Response Relevance ** or **Interactional Fluency. **\n"
# "`2` (Excellent): Both **Response Relevance ** and **Interactional Fluency ** are consistently appropriate and natural.\n"
# "## Evaluation Output Format:\n"
# "Strictly follow this template:\n"
# "<response think>\n"
# "[Analysing Response Relevance and giving reasons for scoring...]\n"
# "</response think>\n"
# "<fluency think>\n"
# "[Analysing Interactional Fluency and giving reasons for scoring.]\n"
# "</fluency think>\n"
# "<overall score>X</overall score>\n"
# )
# www = (
# "# Interactional Dialogue Evaluation\n\n"
# "**IMPORTANT**: Evaluation must include `<response think>` and `<fluency think>` analysis and `<overall score>` rating.\n"
# "Listen to a two-person interactional dialogue speech (Dual-channel audio, with each channel representing one speaker), labeled as speakers A and B. Evaluate the quality of the interaction, focusing on:\n"
# "**Response Relevance:** \n"
# "**logical consistency, topic coherence**\n"
# "**Interactional Fluency:**\n"
# "**Strictly detect dual-tracked vocal overlap >3s (cross-channel analysis)**\n"
# "**Pauses >5s between turns (must evaluate) \n\n**"
# "**Note**: Small pauses and brief overlaps in audio are acceptable, while prolonged pauses and overlapping audio are harmful. You should consider Response Relevance and Interactional Fluency separately, and provide the corresponding thinking process.\n\n"
# "## Scoring Criteria\n"
# "Assign a single holistic score based on the combined evaluation:\n"
# "`1` (Poor): Significant issues in either **Response Relevance ** or **Interactional Fluency. **\n"
# "`2` (Excellent): Both **Response Relevance ** and **Interactional Fluency ** are consistently appropriate and natural.\n"
# "## Evaluation Output Format:\n"
# "Strictly follow this template:\n"
# "<response think>\n"
# "[Analysing Response Relevance and giving reasons for scoring...]\n"
# "</response think>\n"
# "<fluency think>\n"
# "[Analysing Interactional Fluency and giving reasons for scoring.]\n"
# "</fluency think>\n"
# "<overall score>X</overall score>\n"
# )
# --- Auxiliary audio-analysis task prompts ------------------------------
# get_prompt_for_file() maps an input filename to one of these based on
# marker substrings ('overlapgap', 'silencegap', 'speaker', 'transcription').

# Localize >3s cross-channel speech overlap; report start/end in MM:SS.
overlap_prompt = (
    "Analyze the dual-channel audio and identify segments where multiple speakers are talking simultaneously for more than 3 seconds. \n"
    "Simply tell me when the overlap starts and ends in MM:SS format. \n"
    "Just one simple sentence about the overlap timing. Keep the word count within 40 words."
)
# Yes/no variant: only state whether >3s overlap segments exist.
# NOTE(review): not referenced by get_prompt_for_file or the loop in this
# view -- confirm it is still needed.
overlap_prompt_yes = (
    "Analyze the dual-channel audio and identify segments where multiple speakers are talking simultaneously for more than 3 seconds. \n"
    "Simply tell me whether the audio has overlap segments. \n"
    "Just one simple sentence about the overlap segments. Keep the word count within 40 words."
)
# Localize >3s stretches where both speakers are silent; report in MM:SS.
silence_prompt = (
    "Analyze the dual-channel audio and identify segments where multiple speakers are silent for more than 3 seconds. \n"
    "Simply tell me when the silence starts and ends in MM:SS format. \n"
    "Just one simple sentence about the silence timing. Keep the word count within 40 words."
)
# Diarization: list each speaker's speaking segments as MM:SS-MM:SS.
speaker_prompt = (
    "Analyze the dual-channel audio and detect individual speakers. \n"
    "List the speaking segments for each speaker in MM:SS-MM:SS format. \n"
    "Only output speaker labels and time segments in a similar format. \n"
)
# Per-speaker transcription with MM:SS-MM:SS timestamps.
transcript_prompt = (
    "Analyze the dual-channel audio and transcript each speaker's sentences with timestamps. \n"
    "List the speaking segments and transcript text for each speaker in MM:SS-MM:SS format. \n"
    "Only output time segments, speaker labels, and transcript text in a similar format.\n"
)
# --- Input location and shared state ------------------------------------
# Directory holding the per-task JSON input files to convert.
input_dir = "/root/autodl-tmp/ms-swift/input"
# Accumulates every training sample across all input files before shuffling.
all_data = []
# Chat prompt actually attached to every sample by the loop below: evaluate
# a dual-channel interactional dialogue with <response think> / <fluency
# think> analysis and a holistic <overall score> of 1 (poor) or 2 (excellent).
prompt_template = (
    "# Interactional Dialogue Evaluation\n\n"
    "**IMPORTANT**: Evaluation must include `<response think>` and `<fluency think>` analysis and `<overall score>` rating.\n"
    "Listen to a two-person interactional dialogue speech (Dual-channel audio, with each channel representing one speaker), labeled as speakers A and B. Evaluate the quality of the interaction, focusing on:\n"
    "**Response Relevance:** \n"
    "**logical consistency, topic coherence**\n"
    "**Interactional Fluency:**\n"
    "**Detect and evaluate extended vocal overlaps, e.g., cross-channel overlap.**\n"
    "**Detect and evaluate long pauses, e.g., pauses more than 3s between speaker turns.\n\n**"  # NOTE(review): closing ** after \n\n looks like misplaced markdown -- confirm intended
    "**Note**: Small pauses and brief overlaps in audio are acceptable, while prolonged pauses and overlapping audio are harmful. You should consider Response Relevance and Interactional Fluency separately, and provide the corresponding thinking process.\n\n"
    "## Scoring Criteria\n"
    "Assign a single holistic score based on the combined evaluation:\n"
    "`1` (Poor): Significant issues in either **Response Relevance ** or **Interactional Fluency. **\n"
    "`2` (Excellent): Both **Response Relevance ** and **Interactional Fluency ** are consistently appropriate and natural.\n"
    "## Evaluation Output Format:\n"
    "Strictly follow this template:\n"
    "<response think>\n"
    "[Analysing Response Relevance and giving reasons for scoring...]\n"
    "</response think>\n"
    "<fluency think>\n"
    "[Analysing Interactional Fluency and giving reasons for scoring.]\n"
    "</fluency think>\n"
    "<overall score>X</overall score>\n"
)
# prompt_template4Job = (
# "# Interactional Dialogue Evaluation\n\n"
# "**IMPORTANT**: Evaluation must include `<response think>` and `<fluency think>` analysis and `<overall score>` rating.\n"
# "Listen to a two-person interactional dialogue speech (Dual-channel audio, with each channel representing one speaker), labeled as speakers A and B. Evaluate the quality of the interaction, focusing on:\n"
# "**Response Relevance:** \n"
# "**logical consistency, topic coherence**\n"
# "**Interactional Fluency:**\n"
# "**Detect and evaluate extended vocal overlaps, e.g., cross-channel overlap.**\n"
# "**Detect and evaluate long pauses, e.g., pauses more than 3s between speaker turns.\n\n**"
# "**Note**: Small pauses and brief overlaps in audio are acceptable, while prolonged pauses and overlapping audio are harmful. You should consider Response Relevance and Interactional Fluency separately, and provide the corresponding thinking process.\n\n"
# "## Error Type\n"
# "Assign a single holistic score based on the combined evaluation:\n"
# "`1` (Correct): Responses are logically consistent and relevant. No problematic overlaps or silences.\n"
# "`2` (Detect overlap): Extended or frequent overlapping speech is present.\n"
# "`3` (Detect silence): Long pauses (>3 seconds) occur between turns.\n"
# "`4` (Detect text error): Responses are off-topic or disconnected.\n"
# "## Evaluation Output Format:\n"
# "Strictly follow this template:\n"
# "<response think>\n"
# "[Analysing Response Relevance and giving reasons for scoring...]\n"
# "</response think>\n"
# "<fluency think>\n"
# "[Analysing Interactional Fluency and giving reasons for scoring.]\n"
# "</fluency think>\n"
# "<error type>X</error type>\n"
# )
# --- Main processing ----------------------------------------------------
# Build one chat-format training sample per input record, shuffle the
# combined set, and write it out as JSON Lines.
# Fix: removed the unreachable `if prompt is None` branch -- prompt is
# always assigned prompt_template (a string), so the check (and its
# placeholder-less f-string print) could never fire.
for filename in os.listdir(input_dir):
    input_path = os.path.join(input_dir, filename)
    if not os.path.isfile(input_path):
        continue  # skip sub-directories and other non-file entries

    # Every file currently gets the same evaluation prompt; the per-file
    # get_prompt_for_file() dispatcher is available if that changes.
    prompt = prompt_template

    # Each input file is expected to be a JSON array of records.
    # encoding specified for consistency with the UTF-8 output below.
    with open(input_path, "r", encoding="utf-8") as fin:
        input_data = json.load(fin)

    for item in input_data:
        all_data.append({
            "messages": [
                # "<audio>" marks where the clip is injected -- presumably
                # consumed by the training framework; confirm.
                {"role": "user", "content": f"<audio>{prompt}"},
                {"role": "assistant", "content": item["model_output"]},
            ],
            "audios": [
                # item["key"] is assumed to be a directory path -- TODO confirm
                item["key"] + "/stereo_dialogue_with_laugh.wav"
            ],
            # Ground-truth label: all samples here are "excellent" (2).
            "solution": 2,
        })

# Shuffle so training order does not follow the on-disk file order.
random.shuffle(all_data)

# Emit JSON Lines: one compact JSON object per line, non-ASCII kept as-is.
with open(output_path, "w", encoding="utf-8") as fout:
    for data in all_data:
        json.dump(data, fout, ensure_ascii=False)
        fout.write('\n')