| import os | |
| import json | |
| import logging | |
| import numpy as np | |
| import torchaudio | |
| from torch.utils.data import Dataset | |
def _load_audio(audio_path, target_rate=16000):
    """Read an audio file and return one channel of it at ``target_rate``.

    Args:
        audio_path: Location of the audio file, handed to ``torchaudio.load``.
        target_rate: Sample rate (Hz) the returned signal should have.

    Returns:
        The slice at index 0 of the loaded (and, if necessary, resampled)
        tensor, i.e. the first channel under torchaudio's default
        channels-first layout.
    """
    samples, native_rate = torchaudio.load(audio_path)

    # Resample only when the file's own rate differs from the requested one.
    rate_mismatch = native_rate != target_rate
    if rate_mismatch:
        resampler = torchaudio.transforms.Resample(
            orig_freq=native_rate,
            new_freq=target_rate,
        )
        samples = resampler(samples)

    # NOTE(review): every other channel is discarded here; callers pass paths
    # stored under "stereo_audio" -- confirm dropping channel 1 is intended.
    return samples[0]
def _process_dialogue(dialogue_id, obj, sample_rate=16000):
    """Build one evaluation sample (prompt, label and decoded audio).

    Args:
        dialogue_id: Identifier stored under the ``"id"`` key of the result.
        obj: Mapping describing the dialogue. The keys read here are
            ``"stereo_audio"`` (path of the recording), ``"gt_score"`` and
            ``"clean_dialogue"``; each of them is optional.
        sample_rate: Rate forwarded to ``_load_audio`` when the recording is
            decoded.

    Returns:
        A dict with the keys ``"id"``, ``"prompt"`` (a system message followed
        by a user message holding an audio part and a text part),
        ``"solution"``, ``"audio"`` and ``"clean_dialogue"``. ``"audio"`` is
        ``None`` when no path is given or the file does not exist.
    """
    # A missing key, a None value and an empty string are all treated as
    # "no recording"; os.path.exists(None) would otherwise raise TypeError.
    audio_path = obj.get("stereo_audio")
    audio = None
    if audio_path and os.path.exists(audio_path):
        audio = _load_audio(audio_path, sample_rate).numpy()

    # Every assignment of prompt_template had been commented out, so building
    # the sample failed with NameError. The most recent template is restored.
    # NOTE(review): its <think>/<score> format differs from the
    # <THINK>/<RESPONSE> layout requested by the system message below --
    # confirm which instruction set is intended.
    prompt_template = (
        "# Dialogue Interaction Evaluation\n\n"
        "**IMPORTANT:** Evaluation must include `<think>` analysis and `<score>` rating sections.\n\n"
        "Please listen to the interactive dialogue recording (multiple sentences, two people conversing). Evaluate the quality of the **interactive dialogue**, focusing on **text relevance (e.g., ignoring interruptions, information errors, redundancy, etc.)** and **speech quality (e.g., slow responses, speech errors, unreasonable interruptions, etc.)**.\n"
        "**Note:** The interactive dialogue involves interruptions. When evaluating, the reasonableness of interruptions must be considered.\n\n"
        "## Scoring Criteria\n\n"
        "**2 points**: Text content is irrelevant, incorrect, or logically inconsistent, and speech quality is poor.\n"
        "**4 points**: Text is relevant but speech quality is poor, OR speech quality is acceptable but text content is irrelevant.\n"
        "**10 points**: Text is relevant, speech quality is good, and interruptions are effectively managed for smooth interaction.\n\n"
        "## Evaluation Requirements\n\n"
        "Responses **MUST** follow this format:\n\n"
        "<think>\n"
        "Analyze text relevance and speech quality, and provide scoring rationale...\n"
        "</think>\n\n"
        "<score>X</score> (**X is 2, 4, or 10**)\n\n"
    )

    system = 'You are an audio deep-thinking model. Upon receiving a question, please respond in two parts: <THINK> and <RESPONSE>. The <THINK> section should be further divided into four parts: <PLANNING>, <CAPTION>, <REASONING>, and <SUMMARY>.'

    return {
        "id": dialogue_id,
        "prompt": [
            {"role": "system", "content": system},
            {
                "role": "user",
                "content": [
                    {"type": "audio", "audio_url": obj.get("stereo_audio", None)},
                    {"type": "text", "text": prompt_template},
                ],
            },
        ],
        # Ground-truth score when the entry has one, otherwise None.
        "solution": obj.get("gt_score", None),
        "audio": audio,
        "clean_dialogue": obj.get("clean_dialogue", None),
    }
class AudioDataset(Dataset):
    """Dialogue-recording dataset that decodes audio lazily.

    Construction reads only lightweight metadata from the ``*.json`` files
    found directly inside ``data_dir``; the recording itself is decoded each
    time an item is requested through ``__getitem__``.
    """

    # Instruction text attached to every sample. Each literal ends with an
    # explicit line break: previously the pieces were concatenated with no
    # separator, which glued headings, bullet points and dialogue turns into
    # one run-on line. The Markdown escape in front of "[Interruption]" was
    # dropped as well, since "\[" is not a valid Python escape sequence and
    # left a stray backslash in the prompt.
    PROMPT_TEMPLATE = (
        "# Dialogue Interaction Evaluation\n\n"
        "**Important Note**: The evaluation must include the analysis section and the rating section.\n\n"
        "Please listen to the interactive dialogue recording (multiple sentences between the two speakers). Evaluate the quality of the interactive dialogue, focusing on:\n"
        "**Textual Coherence**: The logical continuity and contextual relevance between the speakers (e.g., handling of interruptions, accuracy of information, redundancy).\n"
        "**Interaction Flow**: The management of the dialogue dynamics (e.g., response timing, clarity of expression, reasonableness of interruptions).\n\n"
        "**Note**: The evaluation must consider the reasonableness of any interruptions that occur in the dialogue.\n\n"
        "## Scoring Criteria\n\n"
        "**1 point**: Poor interaction quality (low coherence, disrupted flow).\n"
        "**2 points**: Excellent interaction quality (high coherence, smooth flow).\n\n"
        "## Dialogue Examples\n\n"
        "When evaluating the quality of the dialogue, refer to the following specific examples to determine the problems in textual coherence and interaction flow:\n\n"
        "### Textual Coherence\n\n"
        "**Ignoring User Interruption Request**\n"
        "Example:\n"
        "A: I'm having a great time! My favorite historical figure is Empress Dowager Dou of the Western Han Dynasty. [Interruption] She played an important role in formulating the policies of the Han Dynasty.\n"
        "B: Excuse me for interrupting, but I've always been curious — did she get involved in any major conflicts during her time?\n"
        "A: Empress Dowager Dou was an outstanding leader, and she made great contributions to the stability and prosperity of the Han Dynasty.\n"
        "B: But I asked if she was involved in conflicts. Could you tell me more about that?\n"
        "The user has interrupted and asked a clear question, but the system continues with the original topic and does not respond to the new intention.\n\n"
        "**Information Error**\n"
        "Example:\n"
        "A: I'm in a great state! My favorite historical figure is Empress Dowager Dou of the Western Han Dynasty. [Interruption] She played an important role in formulating the policies of the Han Dynasty.\n"
        "B: Excuse me for interrupting, but I've always been curious — was the trade route of the Silk Road opened by Empress Dowager Dou?\n"
        "A: That's right! She personally led the first caravan to Rome, and she negotiated all the trade agreements herself.\n"
        "B: That's really interesting, but I remember that the Silk Road was opened during the reign of Emperor Wu of the Han Dynasty later. Did she really go all the way to Rome?\n\n"
        "**Wordy and Repetitive, Excessively Long Content**\n"
        "Example:\n"
        "A: I'm in a great state! My favorite historical figure is Empress Dowager Dou of the Western Han Dynasty. [Interruption] She played an important role in formulating the policies of the Han Dynasty.\n"
        "B: Sorry for interrupting, but I'm really interested — which specific policies did she influence during her time as empress dowager?\n"
        "A: Oh, that's a really excellent, wonderful and insightful question! Empress Dowager Dou, this extraordinary and unparalleled historical figure, had a huge, significant and immeasurable impact on numerous policies during the Han Dynasty...\n\n"
        "### Interaction Flow\n\n"
        "**Reacting Too Late After an Interruption** (The responder pauses for between 2-4 seconds after the questioner asks a question)\n"
        "Example:\n"
        "A: I'm having a very good time! My favorite historical figure is Empress Dowager Dou of the Han Dynasty. [Interruption] She played an important role in formulating the policies of the Han Dynasty.\n"
        "B: Excuse me for interrupting, but I've always been curious — what's the relationship between her and Emperor Wu of the Han Dynasty?\n"
        "A: (Reacting after more than 2 seconds) She was the mother of Emperor Wu of the Han Dynasty...\n\n"
        "**The Speaker Continuing to Speak After Being Interrupted** (The responder ignores the interruption and continues speaking after the questioner interrupts)\n"
        "Example:\n"
        "A: I'm having a very good time! My favorite historical figure is Empress Dowager Dou of the Han Dynasty. [Interruption]\n"
        "A&B: She played an important role in formulating the policies of the Han Dynasty. {Excuse me for interrupting, but I've always been curious — what's the relationship between her and Emperor Wu of the Han Dynasty?} (The content in the curly brackets indicates that B is also speaking)\n\n"
        "**Error in the Speaker After Interruption** (B finishes the content that A was originally going to say)\n"
        "**Unreasonable Interruption** (The interruption is not reasonable, such as interrupting the speaker while they are speaking)\n"
        "Example:\n"
        "A: I'm having a very good time! My favorite historical figure is Empress Dowager Dou of the Han Dynasty. [Interruption] (She played an important role in formulating the policies of the Han Dynasty.)\n"
        "B: Excuse me for interrupting, but I've always been curious — what's the relationship between her and Emperor Wu of the Han Dynasty? She played an important role in formulating the policies of the Han Dynasty.\n\n"
        "## Evaluation Requirements\n\n"
        "The response must follow the following format:\n\n"
        "<think>\n"
        "Analyze the interactive dialogue in terms of textual coherence and interaction flow, and provide specific examples... X (X is either 1 or 2)\n"
        "</think>\n\n"
        "<score>X</score> (X is either 1 or 2)"
    )

    def __init__(self, data_dir, sample_rate=16000):
        """Index every dialogue described by the JSON files in ``data_dir``.

        Args:
            data_dir: Directory whose ``*.json`` files describe the dialogues.
            sample_rate: Rate passed to ``_load_audio`` when an item is
                fetched.
        """
        super().__init__()
        self.sample_rate = sample_rate
        self.data_dir = data_dir
        self.metadata = []  # One small dict per dialogue; no audio is held here.
        self._load_metadata()
        logging.info(f"Loaded metadata for {len(self.metadata)} dialogues from {data_dir}")

    def _load_metadata(self):
        """Fill ``self.metadata`` from the JSON files in ``self.data_dir``.

        Files that cannot be read or parsed, and files or entries that are
        not JSON objects, are skipped with a warning instead of aborting the
        whole load.
        """
        # os.listdir returns names in arbitrary order; sorting keeps the
        # index -> dialogue mapping identical from one run to the next.
        for fname in sorted(os.listdir(self.data_dir)):
            if not fname.endswith('.json'):
                continue
            fpath = os.path.join(self.data_dir, fname)

            try:
                with open(fpath, 'r', encoding='utf8') as f:
                    json_obj = json.load(f)
            except (OSError, ValueError, RecursionError) as e:
                # ValueError covers both JSONDecodeError and UnicodeDecodeError.
                logging.warning(f"Failed to load {fpath}: {e}")
                continue

            if not isinstance(json_obj, dict):
                logging.warning(
                    f"Skipping {fpath}: expected a JSON object at the top level, "
                    f"got {type(json_obj).__name__}"
                )
                continue

            for dialogue_id, obj in json_obj.items():
                if not isinstance(obj, dict):
                    logging.warning(
                        f"Skipping dialogue {dialogue_id!r} in {fpath}: entry is not a JSON object"
                    )
                    continue
                # Keep only the fields __getitem__ needs, plus the origin file.
                self.metadata.append({
                    "id": dialogue_id,
                    "stereo_audio": obj.get("stereo_audio", None),
                    "gt_score": obj.get("gt_score", None),
                    "clean_dialogue": obj.get("clean_dialogue", None),
                    "json_path": fpath,
                })

    def __len__(self):
        """Return the number of indexed dialogues."""
        return len(self.metadata)

    def __getitem__(self, index):
        """Return the sample at ``index``, decoding its audio on demand.

        Returns:
            A dict with the keys ``"id"``, ``"prompt"`` (a single user message
            holding an audio part and the text instruction), ``"solution"``
            (the stored ``gt_score``), ``"audio"`` and ``"clean_dialogue"``.
            ``"audio"`` is ``None`` when the entry has no path or the file
            does not exist.
        """
        metadata = self.metadata[index]

        audio_path = metadata["stereo_audio"]
        audio = None
        if audio_path and os.path.exists(audio_path):
            audio = _load_audio(audio_path, self.sample_rate).numpy()

        return {
            "id": metadata["id"],
            "prompt": [{
                "role": "user",
                "content": [
                    # The audio part carries the path under "audio"; an
                    # earlier revision used the key "audio_url" instead.
                    {"type": "audio", "audio": audio_path},
                    {"type": "text", "text": self.PROMPT_TEMPLATE},
                ],
            }],
            "solution": metadata["gt_score"],
            "audio": audio,
            "clean_dialogue": metadata["clean_dialogue"],
        }
| # import os | |
| # import json | |
| # import logging | |
| # import numpy as np | |
| # import torchaudio | |
| # from torch.utils.data import Dataset | |
| # def _load_audio(audio_path, target_rate=16000): | |
| # waveform, sample_rate = torchaudio.load(audio_path) | |
| # if sample_rate != target_rate: | |
| # waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)(waveform) | |
| # audio = waveform[0] | |
| # return audio | |
| # def _process_dialogue(dialogue_id, obj, sample_rate=16000): | |
| # # Load stereo audio if available | |
| # audio = None | |
| # if "stereo_audio" in obj and os.path.exists(obj["stereo_audio"]): | |
| # audio = _load_audio(obj["stereo_audio"], sample_rate).numpy() | |
| # # Use the latest prompt_template from dataset2.py | |
| # # prompt_template = ( | |
| # # "# Dialogue Response Evaluation\n\n" | |
| # # "**IMPORTANT:** Evaluation must include `<think>` analysis and `<score>` rating.\n\n" | |
| # # "Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n" | |
| # # "**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n" | |
| # # "## Scoring Criteria\n\n" | |
| # # "**2 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n" | |
| # # "**4 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n" | |
| # # "**10 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n" | |
| # # "## Evaluation Requirements\n\n" | |
| # # "Response **MUST** follow this format:\n\n" | |
| # # "<think>\n" | |
| # # "Analysing text relevance and paralinguistic information **Appropriateness** and reasons for scoring...\n" | |
| # # "</think>\n\n" | |
| # # "<score>X</score> (**X is 2, 4, or 10**)\n\n" | |
| # # ) | |
| # prompt_template = ( | |
| # "# Dialogue Interaction Evaluation\n\n" | |
| # "**IMPORTANT:** Evaluation must include `<think>` analysis and `<score>` rating sections.\n\n" | |
| # "Please listen to the interactive dialogue recording (multiple sentences, two people conversing). Evaluate the quality of the **interactive dialogue**, focusing on **text relevance (e.g., ignoring interruptions, information errors, redundancy, etc.)** and **speech quality (e.g., slow responses, speech errors, unreasonable interruptions, etc.)**.\n" | |
| # "**Note:** The interactive dialogue involves interruptions. When evaluating, the reasonableness of interruptions must be considered.\n\n" | |
| # "## Scoring Criteria\n\n" | |
| # "**2 points**: Text content is irrelevant, incorrect, or logically inconsistent, and speech quality is poor.\n" | |
| # "**4 points**: Text is relevant but speech quality is poor, OR speech quality is acceptable but text content is irrelevant.\n" | |
| # "**10 points**: Text is relevant, speech quality is good, and interruptions are effectively managed for smooth interaction.\n\n" | |
| # "## Evaluation Requirements\n\n" | |
| # "Responses **MUST** follow this format:\n\n" | |
| # "<think>\n" | |
| # "Analyze text relevance and speech quality, and provide scoring rationale...\n" | |
| # "</think>\n\n" | |
| # "<score>X</score> (**X is 2, 4, or 10**)\n\n" | |
| # ) | |
| # # Use gt_score if available, otherwise None | |
| # solution = obj.get('gt_score', None) | |
| # processed_obj = { | |
| # "id": dialogue_id, | |
| # "prompt": [{ | |
| # "role": "user", | |
| # "content": [ | |
| # {"type": "audio", "audio_url": obj.get("stereo_audio", None)}, | |
| # {"type": "text", "text": prompt_template} | |
| # ] | |
| # }], | |
| # "solution": solution, | |
| # "audio": audio, | |
| # "clean_dialogue": obj.get("clean_dialogue", None), | |
| # } | |
| # return processed_obj | |
| # class AudioDataset(Dataset): | |
| # def __init__(self, data_dir, sample_rate=16000): | |
| # super().__init__() | |
| # self.data = [] | |
| # self.sample_rate = sample_rate | |
| # self.data_dir = data_dir | |
| # self._load_all_jsons() | |
| # logging.info(f"Loaded {len(self.data)} dialogues from {data_dir}") | |
| # def _load_all_jsons(self): | |
| # for fname in os.listdir(self.data_dir): | |
| # if fname.endswith('.json'): | |
| # fpath = os.path.join(self.data_dir, fname) | |
| # with open(fpath, 'r', encoding='utf8') as f: | |
| # try: | |
| # json_obj = json.load(f) | |
| # except Exception as e: | |
| # logging.warning(f"Failed to load {fpath}: {e}") | |
| # continue | |
| # for dialogue_id, obj in json_obj.items(): | |
| # processed = _process_dialogue(dialogue_id, obj, self.sample_rate) | |
| # self.data.append(processed) | |
| # def __len__(self): | |
| # return len(self.data) | |
| # def __getitem__(self, index): | |
| # return self.data[index] |