|
|
import json |
|
|
import logging |
|
|
import os |
|
|
|
|
|
import torchaudio |
|
|
from torch.utils.data import Dataset |
|
|
|
|
|
|
|
|
def _handle_wav(wav_path, target_rate=16000):
    """Load one wav file and return its first channel at ``target_rate``.

    Args:
        wav_path: Path to an audio file readable by ``torchaudio.load``.
        target_rate: Desired sample rate in Hz; audio whose native rate
            differs is resampled to this rate.

    Returns:
        1-D ``torch.Tensor`` containing the first channel of the
        (possibly resampled) waveform.
    """
    waveform, sample_rate = torchaudio.load(wav_path)
    # Bug fix: compare against target_rate rather than the hard-coded
    # 16000, so callers passing a non-default rate actually get resampling.
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_rate
        )
        waveform = resampler(waveform)
    # Keep only the first channel (mono output).
    audio = waveform[0]
    return audio
|
|
|
|
|
def _handle_qa(obj, is_think=True, think_max_len=50): |
|
|
|
|
|
system_prompt = 'You are an audio deep-thinking model. Upon receiving a question, please respond in two parts: <THINK> and <RESPONSE>. The <THINK> section should be further divided into four parts: <PLANNING>, <CAPTION>, <REASONING>, and <SUMMARY>.' |
|
|
prompt_template1 = ( |
|
|
system_prompt + "\n" + |
|
|
"# Dialogue Response Evaluation\n\n" |
|
|
"**IMPORTANT:** Evaluation must include `<think>` analysis and `<score>` rating.\n\n" |
|
|
"Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n" |
|
|
"**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n" |
|
|
"## Scoring Criteria\n\n" |
|
|
"**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n" |
|
|
"**3 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n" |
|
|
"**5 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n" |
|
|
"## Evaluation Requirements\n\n" |
|
|
"Response **MUST** follow this format:\n\n" |
|
|
"<think>\n" |
|
|
f"Analysing text relevance and paralinguistic information **Appropriateness** and reasons for scoring...(less than {think_max_len} words)\n" |
|
|
"</think>\n\n" |
|
|
"<score>X</score> (**X is 1, 3, or 5**)\n\n") |
|
|
|
|
|
|
|
|
prompt_template2 = ( |
|
|
system_prompt + "\n" + |
|
|
"# Dialogue Response Evaluation\n\n" |
|
|
"**IMPORTANT:** Evaluation must include `<think>` analysis and `<score>` rating.\n\n" |
|
|
"Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n" |
|
|
"**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n" |
|
|
"## Scoring Criteria\n\n" |
|
|
"**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n" |
|
|
"**3 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n" |
|
|
"**5 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n" |
|
|
"## Evaluation Requirements\n\n" |
|
|
"Response **MUST** follow this format:\n\n" |
|
|
"<think>\n" |
|
|
f"Analysing text relevance and paralinguistic information **Appropriateness** and reasons for scoring...(less than {think_max_len} words)\n" |
|
|
"</think>\n\n" |
|
|
"<score>X</score> (**X is 1, 3, or 5**)\n\n") |
|
|
|
|
|
|
|
|
|
|
|
obj["prompt1"] = [ |
|
|
{ |
|
|
"role": "system", |
|
|
"content": [ |
|
|
{"type": "text", "text": system_prompt} |
|
|
] |
|
|
}, |
|
|
{"role": "user", "content": [ |
|
|
{"type": "audio", "audio": obj["stereo_audio"]}, |
|
|
{"type": "text", "text": prompt_template1}] |
|
|
}] |
|
|
|
|
|
obj["prompt2"] = [ |
|
|
{ |
|
|
"role": "system", |
|
|
"content": [ |
|
|
{"type": "text", "text": system_prompt} |
|
|
] |
|
|
}, |
|
|
{"role": "user", "content": [ |
|
|
{"type": "audio", "audio": obj["stereo_audio"]}, |
|
|
{"type": "text", "text": prompt_template2} |
|
|
]}] |
|
|
|
|
|
|
|
|
obj["solution"] = obj["gt_score"] |
|
|
return obj |
|
|
|
|
|
|
|
|
class AudioDualPromptDataset(Dataset):
    """Dataset pairing stereo dialogue audio with dual evaluation prompts.

    Scans ``data_dir`` for ``*.json`` files, each mapping item ids to
    objects with ``stereo_audio`` (path) and ``gt_score`` fields, and
    serves prompt-annotated items built by ``_handle_qa``.
    """

    def __init__(self, data_dir, sample_rate=16000, is_think=True, think_max_len=50, load_audio=True):
        """Index all JSON metadata files under ``data_dir``.

        Args:
            data_dir: Directory containing ``*.json`` metadata files.
            sample_rate: Target audio sample rate.
                NOTE(review): stored but never read by __getitem__ — confirm
                whether audio loading was meant to happen here.
            is_think: Forwarded to ``_handle_qa`` (currently unused there).
            think_max_len: Word budget forwarded to ``_handle_qa``.
            load_audio: NOTE(review): stored but never read — presumably
                intended to gate waveform loading; verify against callers.
        """
        super().__init__()
        self.sample_rate = sample_rate
        self.is_think = is_think
        self.think_max_len = think_max_len
        self.load_audio = load_audio
        self.data_dir = data_dir
        self.metadata = []
        self._load_metadata()
        logging.info(f"Loaded metadata for {len(self.metadata)} items from {data_dir}")

    def _load_metadata(self):
        """Populate ``self.metadata`` from every JSON file in ``data_dir``.

        Unreadable or malformed files are logged and skipped rather than
        aborting the whole scan.
        """
        for fname in os.listdir(self.data_dir):
            if not fname.endswith('.json'):
                continue
            fpath = os.path.join(self.data_dir, fname)
            with open(fpath, 'r', encoding='utf8') as f:
                try:
                    json_obj = json.load(f)
                except Exception as e:
                    logging.warning(f"Failed to load {fpath}: {e}")
                    continue
            for item_id, obj in json_obj.items():
                self.metadata.append({
                    "id": item_id,
                    "stereo_audio": obj.get("stereo_audio", None),
                    "gt_score": obj.get("gt_score", None),
                    "json_path": fpath,
                })

    def __len__(self):
        """Number of indexed items."""
        return len(self.metadata)

    def __getitem__(self, index):
        """Return the prompt-annotated item at ``index``.

        Bug fix: pass a shallow copy to ``_handle_qa`` so its in-place
        additions (prompt1/prompt2/solution) do not mutate the cached
        ``self.metadata`` entries across repeated accesses.
        """
        item = dict(self.metadata[index])
        return _handle_qa(
            item,
            is_think=self.is_think,
            think_max_len=self.think_max_len,
        )