# NOTE: removed file-viewer extraction artifacts (byte size, blob hash,
# line-number ruler) that were not part of the Python source.
import json
import logging
import os
import torchaudio
from torch.utils.data import Dataset
def _handle_wav(wav_path, target_rate=16000):
    """Load one wav file and return its first channel, resampled if needed.

    Args:
        wav_path: path to the wav file.
        target_rate: desired sample rate in Hz (default 16000).

    Returns:
        1-D torch.Tensor with the first channel of the (possibly resampled)
        audio. (The original docstring claimed a numpy ndarray; torchaudio
        returns a torch tensor here.)
    """
    waveform, sample_rate = torchaudio.load(wav_path)
    # Bug fix: compare against target_rate (was hard-coded 16000), so that a
    # non-default target_rate actually triggers resampling.
    if sample_rate != target_rate:
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)(waveform)
    # Keep only the first channel (mono output).
    audio = waveform[0]
    return audio
def _handle_qa(obj, is_think=True, think_max_len=50):
# First prompt template
system_prompt = 'You are an audio deep-thinking model. Upon receiving a question, please respond in two parts: <THINK> and <RESPONSE>. The <THINK> section should be further divided into four parts: <PLANNING>, <CAPTION>, <REASONING>, and <SUMMARY>.'
prompt_template1 = (
system_prompt + "\n" +
"# Dialogue Response Evaluation\n\n"
"**IMPORTANT:** Evaluation must include `<think>` analysis and `<score>` rating.\n\n"
"Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n"
"**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n"
"## Scoring Criteria\n\n"
"**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n"
"**3 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n"
"**5 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n"
"## Evaluation Requirements\n\n"
"Response **MUST** follow this format:\n\n"
"<think>\n"
f"Analysing text relevance and paralinguistic information **Appropriateness** and reasons for scoring...(less than {think_max_len} words)\n"
"</think>\n\n"
"<score>X</score> (**X is 1, 3, or 5**)\n\n")
# Second prompt template
prompt_template2 = (
system_prompt + "\n" +
"# Dialogue Response Evaluation\n\n"
"**IMPORTANT:** Evaluation must include `<think>` analysis and `<score>` rating.\n\n"
"Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n"
"**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n"
"## Scoring Criteria\n\n"
"**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n"
"**3 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n"
"**5 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n"
"## Evaluation Requirements\n\n"
"Response **MUST** follow this format:\n\n"
"<think>\n"
f"Analysing text relevance and paralinguistic information **Appropriateness** and reasons for scoring...(less than {think_max_len} words)\n"
"</think>\n\n"
"<score>X</score> (**X is 1, 3, or 5**)\n\n")
# Create two prompts with different templates
obj["prompt1"] = [
{
"role": "system",
"content": [
{"type": "text", "text": system_prompt}
]
},
{"role": "user", "content": [
{"type": "audio", "audio": obj["stereo_audio"]},
{"type": "text", "text": prompt_template1}]
}]
obj["prompt2"] = [
{
"role": "system",
"content": [
{"type": "text", "text": system_prompt}
]
},
{"role": "user", "content": [
{"type": "audio", "audio": obj["stereo_audio"]},
{"type": "text", "text": prompt_template2}
]}]
# Store the ground truth score
obj["solution"] = obj["gt_score"]
return obj
class AudioDualPromptDataset(Dataset):
    """Dataset of dual-prompt evaluation items built from per-directory JSON files.

    Scans ``data_dir`` for ``*.json`` files, each mapping item ids to objects
    that carry at least ``stereo_audio`` (wav path) and ``gt_score``. Only
    lightweight metadata is held in memory; the prompts are constructed
    lazily in ``__getitem__`` via ``_handle_qa``.
    """

    def __init__(self, data_dir, sample_rate=16000, is_think=True, think_max_len=50, load_audio=True):
        super().__init__()
        self.sample_rate = sample_rate      # target rate for (currently disabled) audio loading
        self.is_think = is_think            # forwarded to _handle_qa
        self.think_max_len = think_max_len  # word budget for the <think> section of the prompt
        self.load_audio = load_audio        # reserved: waveform loading is commented out upstream
        self.data_dir = data_dir
        self.metadata = []  # one small dict per item instead of the full records
        self._load_metadata()
        logging.info(f"Loaded metadata for {len(self.metadata)} items from {data_dir}")

    def _load_metadata(self):
        """Collect per-item metadata from every ``*.json`` file in ``data_dir``."""
        for fname in os.listdir(self.data_dir):
            if not fname.endswith('.json'):
                continue
            fpath = os.path.join(self.data_dir, fname)
            # open() is inside the try so an unreadable file is skipped with a
            # warning instead of crashing the whole scan (the original only
            # guarded json.load).
            try:
                with open(fpath, 'r', encoding='utf8') as f:
                    json_obj = json.load(f)
            except Exception as e:
                logging.warning(f"Failed to load {fpath}: {e}")
                continue
            for item_id, obj in json_obj.items():
                # Store only the essentials; json_path lets callers re-read
                # the full record later if they need it.
                self.metadata.append({
                    "id": item_id,
                    "stereo_audio": obj.get("stereo_audio", None),
                    "gt_score": obj.get("gt_score", None),
                    "json_path": fpath,
                })

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, index):
        # Bug fix: copy the stored dict before handing it to _handle_qa, which
        # mutates its argument (adds prompt1/prompt2/solution). Without the
        # copy, prompt payloads silently accumulate inside self.metadata on
        # every access.
        item = dict(self.metadata[index])
        return _handle_qa(
            item,
            is_think=self.is_think,
            think_max_len=self.think_max_len,
        )