File size: 6,187 Bytes
8613355
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import os
import json
import logging
import numpy as np
import torchaudio
from torch.utils.data import Dataset


def _handle_wav(wav_path, target_rate=16000):
    """Load one wav file and resample it to ``target_rate`` if needed.

    Args:
        wav_path: path to an audio file readable by ``torchaudio.load``.
        target_rate: desired sample rate in Hz (default 16000).

    Returns:
        1-D ``torch.Tensor`` holding the first channel of the (possibly
        resampled) waveform.  Note this is a tensor, not a numpy array;
        callers convert with ``.numpy()`` when they need one.
    """
    waveform, sample_rate = torchaudio.load(wav_path)
    if sample_rate != target_rate:
        # functional.resample runs the same kernel as transforms.Resample
        # but avoids constructing a new transform module on every call.
        waveform = torchaudio.functional.resample(
            waveform, orig_freq=sample_rate, new_freq=target_rate
        )
    # Keep only the first channel (mono output).
    return waveform[0]


def _handle_qa(obj, is_think=True, think_max_len=50):
    if is_think:
        prompt_template = (
            "# Dialogue Response Evaluation\n\n"
            "**IMPORTANT:** Evaluation must include `<think>` analysis and `<score>` rating.\n\n"
            "Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n"
            "**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n"
            "## Scoring Criteria\n\n"
            "**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n"
            "**3 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n"
            "**5 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n"
            "## Evaluation Requirements\n\n"
            "Response **MUST** follow this format:\n\n"
            "<think>\n"
            f"Analysing text relevance and paralinguistic information **Appropriateness** and reasons for scoring...(less than {think_max_len} words)\n"
            "</think>\n\n"
            "<score>X</score> (**X is 1, 3, or 5**)\n\n")
    else:
        prompt_template = (
            "# Dialogue Response Evaluation\n\n"
            "**IMPORTANT:** Evaluation must include`<score>` rating.\n\n"
            "Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n"
            "**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n"
            "## Scoring Criteria\n\n"
            "**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n"
            "**3 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n"
            "**5 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n"
            "## Evaluation Requirements\n\n"
            "Response **MUST** follow this format:\n\n"
            "<score>X</score> (**X is 1, 3, or 5**)\n\n")

    # 构建处理后的对象
    processed_obj = {
        "id": obj["id"],
        "prompt": [{"role": "user", "content": [
            {"type": "audio", "audio": obj["merge_wav"]},
            {"type": "text", "text": prompt_template}
        ]}],
        "solution": obj["gt_score"],
        "audio": obj.get("audio", None),
        "clean_dialogue": obj.get("clean_dialogue", None)
    }
    return processed_obj


class AudioDataset(Dataset):
    """Dataset of dialogue-evaluation examples read from JSON files.

    Each ``*.json`` file in ``data_dir`` maps dialogue ids to objects with
    at least ``merge_wav`` and ``gt_score``.  ``__getitem__`` returns the
    prompt structure produced by ``_handle_qa``; the waveform itself is
    decoded lazily and only when ``load_audio`` is True.
    """

    def __init__(self, data_dir, sample_rate=16000, is_think=True, think_max_len=50, load_audio=False):
        """
        Args:
            data_dir: directory scanned (non-recursively) for ``*.json`` files.
            sample_rate: target sample rate used when ``load_audio`` is True.
            is_think: forwarded to ``_handle_qa`` (include a <think> section).
            think_max_len: forwarded to ``_handle_qa`` (word budget).
            load_audio: when True, decode the waveform in ``__getitem__``.
        """
        super().__init__()
        self.sample_rate = sample_rate
        self.data_dir = data_dir
        self.is_think = is_think
        self.think_max_len = think_max_len
        self.load_audio = load_audio
        self.metadata = []  # lightweight per-dialogue records; audio decoded on demand
        self._load_metadata()
        logging.info(f"Loaded metadata for {len(self.metadata)} dialogues from {data_dir}")

    def _load_metadata(self):
        """Scan ``data_dir`` for JSON files and collect per-dialogue metadata.

        Files are visited in sorted order so the dataset index is
        deterministic across runs (``os.listdir`` order is arbitrary).
        Files that fail to parse are logged and skipped.
        """
        for fname in sorted(os.listdir(self.data_dir)):
            if not fname.endswith('.json'):
                continue
            fpath = os.path.join(self.data_dir, fname)
            with open(fpath, 'r', encoding='utf8') as f:
                try:
                    json_obj = json.load(f)
                except Exception as e:
                    logging.warning(f"Failed to load {fpath}: {e}")
                    continue
            for dialogue_id, obj in json_obj.items():
                # Store only the fields needed later; the waveform itself
                # is loaded on demand in __getitem__.
                self.metadata.append({
                    "id": dialogue_id,
                    "merge_wav": obj.get("merge_wav", None),
                    "gt_score": obj.get("gt_score", None),
                    "clean_dialogue": obj.get("clean_dialogue", None),
                    "json_path": fpath,
                })

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, index):
        metadata = self.metadata[index]

        # Rebuild the object shape expected by _handle_qa.
        item = {
            "id": metadata["id"],
            "merge_wav": metadata["merge_wav"],
            "gt_score": metadata["gt_score"],
            "clean_dialogue": metadata["clean_dialogue"],
        }

        # Optionally decode the waveform now, as a 1-D numpy array.
        if self.load_audio and metadata["merge_wav"] and os.path.exists(metadata["merge_wav"]):
            item["audio"] = _handle_wav(metadata["merge_wav"], self.sample_rate).numpy()

        return _handle_qa(
            item,
            is_think=self.is_think,
            think_max_len=self.think_max_len,
        )