# NOTE: removed file-viewer extraction artifacts (byte size, blob hash,
# line-number ruler) that were not part of the Python source.
import json
import logging
import os
import torchaudio
from torch.utils.data import Dataset
def _handle_wav(wav_path, target_rate=16000):
    """Load one wav file and return its first channel, resampled if needed.

    Args:
        wav_path: path to the wav file.
        target_rate: desired sample rate in Hz (default 16000).

    Returns:
        1-D torch.Tensor with the first channel of the (possibly resampled)
        audio. (The original docstring claimed a numpy ndarray; torchaudio
        returns a torch tensor here.)
    """
    waveform, sample_rate = torchaudio.load(wav_path)
    # Bug fix: compare against target_rate (was hard-coded 16000), so that a
    # non-default target_rate actually triggers resampling.
    if sample_rate != target_rate:
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)(waveform)
    # Keep only the first channel (mono output).
    audio = waveform[0]
    return audio
def _handle_qa(obj, is_think=True, think_max_len=50):
# First prompt template
system_prompt = 'You are an audio deep-thinking model. Upon receiving a question, please respond in two parts: <THINK> and <RESPONSE>. The <THINK> section should be further divided into four parts: <PLANNING>, <CAPTION>, <REASONING>, and <SUMMARY>.'
prompt_template1 = (
system_prompt + "\n" +
"# Dialogue Response Evaluation\n\n"
"**IMPORTANT:** Evaluation must include `<think>` analysis and `<score>` rating.\n\n"
"Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n"
"**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n"
"## Scoring Criteria\n\n"
"**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n"
"**3 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n"
"**5 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n"
"## Evaluation Requirements\n\n"
"Response **MUST** follow this format:\n\n"
"<think>\n"
f"Analysing text relevance and paralinguistic information **Appropriateness** and reasons for scoring...(less than {think_max_len} words)\n"
"</think>\n\n"
"<score>X</score> (**X is 1, 3, or 5**)\n\n")
# Second prompt template
prompt_template2 = (
system_prompt + "\n" +
"# Dialogue Response Evaluation\n\n"
"**IMPORTANT:** Evaluation must include `<think>` analysis and `<score>` rating.\n\n"
"Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n"
"**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n"
"## Scoring Criteria\n\n"
"**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n"
"**3 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n"
"**5 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n"
"## Evaluation Requirements\n\n"
"Response **MUST** follow this format:\n\n"
"<think>\n"
f"Analysing text relevance and paralinguistic information **Appropriateness** and reasons for scoring...(less than {think_max_len} words)\n"
"</think>\n\n"
"<score>X</score> (**X is 1, 3, or 5**)\n\n")
# Create two prompts with different templates
obj["prompt1"] = [
{
"role": "system",
"content": [
{"type": "text", "text": system_prompt}
]
},
{"role": "user", "content": [
{"type": "audio", "audio": obj["stereo_audio"]},
{"type": "text", "text": prompt_template1}]
}]
obj["prompt2"] = [
{
"role": "system",
"content": [
{"type": "text", "text": system_prompt}
]
},
{"role": "user", "content": [
{"type": "audio", "audio": obj["stereo_audio"]},
{"type": "text", "text": prompt_template2}
]}]
# Store the ground truth score
obj["solution"] = obj["gt_score"]
return obj
class AudioDualPromptDataset(Dataset):
    """Dataset of dual-prompt evaluation items built from per-directory JSON files.

    Scans ``data_dir`` for ``*.json`` files, each mapping item ids to objects
    that carry at least ``stereo_audio`` (wav path) and ``gt_score``. Only
    lightweight metadata is held in memory; the prompts are constructed
    lazily in ``__getitem__`` via ``_handle_qa``.
    """

    def __init__(self, data_dir, sample_rate=16000, is_think=True, think_max_len=50, load_audio=True):
        super().__init__()
        self.sample_rate = sample_rate      # target rate for (currently disabled) audio loading
        self.is_think = is_think            # forwarded to _handle_qa
        self.think_max_len = think_max_len  # word budget for the <think> section of the prompt
        self.load_audio = load_audio        # reserved: waveform loading is commented out upstream
        self.data_dir = data_dir
        self.metadata = []  # one small dict per item instead of the full records
        self._load_metadata()
        logging.info(f"Loaded metadata for {len(self.metadata)} items from {data_dir}")

    def _load_metadata(self):
        """Collect per-item metadata from every ``*.json`` file in ``data_dir``."""
        for fname in os.listdir(self.data_dir):
            if not fname.endswith('.json'):
                continue
            fpath = os.path.join(self.data_dir, fname)
            # open() is inside the try so an unreadable file is skipped with a
            # warning instead of crashing the whole scan (the original only
            # guarded json.load).
            try:
                with open(fpath, 'r', encoding='utf8') as f:
                    json_obj = json.load(f)
            except Exception as e:
                logging.warning(f"Failed to load {fpath}: {e}")
                continue
            for item_id, obj in json_obj.items():
                # Store only the essentials; json_path lets callers re-read
                # the full record later if they need it.
                self.metadata.append({
                    "id": item_id,
                    "stereo_audio": obj.get("stereo_audio", None),
                    "gt_score": obj.get("gt_score", None),
                    "json_path": fpath,
                })

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, index):
        # Bug fix: copy the stored dict before handing it to _handle_qa, which
        # mutates its argument (adds prompt1/prompt2/solution). Without the
        # copy, prompt payloads silently accumulate inside self.metadata on
        # every access.
        item = dict(self.metadata[index])
        return _handle_qa(
            item,
            is_think=self.is_think,
            think_max_len=self.think_max_len,
        )