# File size: 22,979 Bytes
import os
import json
import logging
import numpy as np
import torchaudio
from torch.utils.data import Dataset
# prompt_template = """
#     **Task Goal:** 
#     You are provided with a recording of a two-person dialogue (or its transcript, where left and right channels represent different speakers). Your task is to evaluate this dialogue based on two key dimensions: "Response Quality" and "Interaction Quality." For each dimension, you must:
#     1.  Provide a detailed analysis (your "thinking process").
#     2.  Assign a score of 1 (Poor) or 2 (Excellent).

#     **Important Definitions:**
#     *   **Interruption:** Refers to an instance where one speaker begins talking while the other is still speaking, potentially causing a brief overlap in audio. Your evaluation should consider the reasonableness of the interruption and how both parties react to it. Brief interjections (e.g., "um," "ah") do not constitute an interruption for this evaluation unless they significantly disrupt the interaction flow (e.g., interjections cause audio pause for approximately 5 second).
#     *   **Audio overlap:** Audio overlap refers to portions where the left and right channel audio overlap.

#     **Evaluation Dimensions Explained:**

#     **1. Response Quality (Corresponds to "<response think>")**
#         *   **Core Focus:** The appropriateness, effectiveness, accuracy, and conciseness of the content of the responses.
#         *   **Analytical Guidance (to be detailed in `<response think>`):**
#             *   The response directly and appropriately addresses the other person's statement or question.
#             *   When an interruption occurs, the response handles the interruption appropriately.
#             *   Contextual relevance is maintained throughout the response.
#             *   Information is conveyed concisely, without unnecessary redundancy or verbosity.
#             *   There are no factual errors or logical fallacies in the response.
#     **2. Interaction Quality (Corresponds to "<react think>")**
#         *   **Core Focus:** The natural flow, timing, and smoothness of the conversational exchange and turn-taking.
#         *   **Analytical Guidance (to be detailed in `<react think>`):**
#             *   The overall conversational flow is natural and smooth.
#             *   Pauses, pace, and rhythm are appropriate and natural.
#             *   When interruptions occur, the reactions of both the interrupter and the interrupted party are timely and natural. (e.g., the interrupted person yields appropriately. The interrupter enters at a reasonable moment.)
#             *   Turn-taking is smooth, without long silences (the audio is silent more than 5 seconds) or excessive overlapping speech (the audio is overlapped more than 3 seconds).
#             *   One speaker does not continue talking for too long after being interrupted, avoiding prolonged audio overlap.
#     **3. Score Explanation: (Corresponds to "<overall score>")**
#     *   `1`: Indicates the evaluation for this dimension is Poor, meaning there are significant issues as described above (e.g., inappropriate responses, unnatural interaction flow, etc.).
#     *   `2`: Indicates the evaluation for this dimension is Excellent, meaning the responses and/or interaction are highly appropriate, effective, and natural, with no major issues.

#     **Reference Examples for Evaluation (Illustrative of common issues):**

#     *   **Poor Response Quality Examples:**
#         *   After A asks B a question, B's response completely ignores A's question and continues on a different topic.
#         *   B interrupts A with a question, and A's response contains factually incorrect information.
#         *   A's response is filled with unnecessary adjectives and filler words, making it overly verbose.
#     *   **Poor Interaction Quality Examples:**
#         *   After being interrupted, the responder pauses for an unnaturally long time (e.g.,the audio is silent more than 5 seconds) before speaking.
#         *   After being interrupted, the original speaker continues to speak for a significant duration (e.g.,the audio is overlapped more than 3 seconds), causing prolonged overlapping audio.

#     **Output Format Requirements:**
#     Strictly adhere to the following format and order. Within the `<... think>` tags, elaborate on your analysis. After that, please give the overall score.
    

#     <response think>
#     [Provide your detailed analysis of "Response Quality" here...]
#     </response think>
#     <react think>
#     [Provide your detailed analysis of "Interaction Quality" here...]
#     </react think>
#     <overall score>X</overall score>
    
   
#      """
# prompt_template = """
# # Interactional Dialogue Evaluation\n\n
# **IMPORTANT**: Evaluation must include `<response think>` and `<fluency think>` analysis and `<overall score>` rating.\n
# Listen to a two-person interactional dialogue speech (Dual-channel audio, with each channel representing one speaker), labeled as speakers A and B. Evaluate the quality of the interaction, focusing on:\n
# **Response Relevance** (accuracy, and logical consistency )\n
# **Interactional Fluency** (smoothness of turn-taking，avoiding pauses over 5s or overlaps over 3s)\n
# **Note**: Small pauses and brief overlaps in audio are acceptable, while prolonged pauses and overlapping audio are harmful. You should consider Response Relevance and Interactional Fluency separately, and provide the corresponding thinking process.\n\n
# ## Scoring Criteria\n
# Assign a single holistic score based on the combined evaluation:\n
# `1` (Poor): Significant issues in either  **Response Relevance ** or  **Interactional Fluency. **\n
# `2` (Excellent): Both **Response Relevance ** and  **Interactional Fluency ** are consistently appropriate and natural.\n
# ## Evaluation Output Format:\n
# Strictly follow this template:\n
# <response think>\n
# [Analysing Response Relevance and giving reasons for scoring...]\n
# </response think>\n
# <fluency think>\n
# [Analysing Interactional Fluency and giving reasons for scoring.]\n
# </fluency think>\n
# <overall score>X</overall score>\n
   
#     """
# prompt_template = (
#     "# Interactional Dialogue Evaluation\n\n"
#     "**IMPORTANT**: Evaluation must include `<response think>` and `<fluency think>` analysis and `<overall score>` rating.\n"  
#     "Listen to a two-person interactional dialogue speech (Dual-channel audio, with each channel representing one speaker), labeled as speakers A and B. Evaluate the quality of the interaction, focusing on:\n"
#     "**Response Relevance:** \n"
#     "**logical consistency, topic coherence**\n"
#     "**Interactional Fluency:**\n"
#     "**Strictly detect dual-tracked vocal overlap >3s (cross-channel analysis)**\n"
#     "**Pauses >5s between turns (must evaluate) \n\n**"
#     "**Note**: Small pauses and brief overlaps in audio are acceptable, while prolonged pauses and overlapping audio are harmful. You should consider Response Relevance and Interactional Fluency separately, and provide the corresponding thinking process.\n\n"
#     "## Scoring Criteria\n"
#     "Assign a single holistic score based on the combined evaluation:\n"
#     "`1` (Poor): Significant issues in either  **Response Relevance ** or  **Interactional Fluency. **\n"
#     "`2` (Excellent): Both **Response Relevance ** and  **Interactional Fluency ** are consistently appropriate and natural.\n"
#     "## Evaluation Output Format:\n"
#     "Strictly follow this template:\n"
#     "<response think>\n"
#     "[Analysing Response Relevance and giving reasons for scoring...]\n"
#     "</response think>\n"
#     "<fluency think>\n"
#     "[Analysing Interactional Fluency and giving reasons for scoring.]\n"
#     "</fluency think>\n"
#     "<overall score>X</overall score>\n"
# )
# Instruction text attached to every sample's user turn. It asks for a single
# short sentence stating whether the dual-channel audio contains simultaneous
# speech lasting more than 3 seconds.
prompt_template = "".join((
    "Analyze the dual-channel audio and identify segments where multiple speakers are talking simultaneously for more than 3 seconds. \n",
    "Simply tell me whether the audio has overlap segments. \n",
    "Just one simple sentence about the overlap segments. Keep the word count within 40 words.",
))
def _load_audio(audio_path, target_rate=16000):
    """Load an audio file and return one channel of it at ``target_rate``.

    Args:
        audio_path: Location of the audio file, handed to ``torchaudio.load``.
        target_rate: Desired sampling rate in Hz. The signal is resampled
            only when the file's own rate differs from this value.

    Returns:
        Index 0 along the first dimension of the loaded (and possibly
        resampled) tensor. Any further entries along that dimension are
        dropped.
        NOTE(review): with torchaudio's default layout this is presumably the
        first (left) channel of a stereo file -- confirm that discarding the
        other channel is intended.
    """
    samples, native_rate = torchaudio.load(audio_path)
    if native_rate == target_rate:
        # Already at the requested rate; no resampling needed.
        return samples[0]
    resampler = torchaudio.transforms.Resample(orig_freq=native_rate, new_freq=target_rate)
    return resampler(samples)[0]


def _process_dialogue(dialogue_id, obj, sample_rate=16000):
    """Turn one raw dialogue record into a prompt/solution sample.

    Args:
        dialogue_id: Identifier stored under ``"id"`` in the result.
        obj: Mapping describing the dialogue. The keys read here are
            ``"stereo_audio"`` (path of the recording), ``"gt_score"`` and
            ``"clean_dialogue"``; each may be absent or ``None``.
        sample_rate: Sampling rate in Hz forwarded to ``_load_audio``.

    Returns:
        A dict with the keys ``"id"``, ``"prompt"``, ``"solution"``,
        ``"audio"`` and ``"clean_dialogue"``. ``"prompt"`` is a one-element
        list holding a single user turn made of an audio entry and the
        module-level ``prompt_template`` text. ``"audio"`` is the result of
        ``_load_audio(...).numpy()``, or ``None`` when the record has no
        usable path or the file does not exist.
    """
    audio_path = obj.get("stereo_audio")

    # Check truthiness before touching the filesystem: a record can carry an
    # explicit null path, and os.path.exists(None) raises TypeError.
    audio = None
    if audio_path and os.path.exists(audio_path):
        audio = _load_audio(audio_path, sample_rate).numpy()

    # NOTE(review): the audio entry is keyed "audio_url" here, whereas
    # AudioDataset.__getitem__ keys it "audio" -- confirm which key the
    # downstream consumer expects.
    return {
        "id": dialogue_id,
        "prompt": [{
            "role": "user",
            "content": [
                {"type": "audio", "audio_url": audio_path},
                {"type": "text", "text": prompt_template},
            ],
        }],
        # Ground-truth score when the record provides one, otherwise None.
        "solution": obj.get("gt_score"),
        "audio": audio,
        "clean_dialogue": obj.get("clean_dialogue"),
    }


class AudioDataset(Dataset):
    """Map-style dataset over dialogue records described by JSON files.

    Every ``*.json`` file located directly in ``data_dir`` is expected to
    contain one JSON object mapping dialogue ids to record objects. Only
    lightweight metadata is kept in memory; the audio itself is loaded on
    demand in ``__getitem__``.
    """

    def __init__(self, data_dir, sample_rate=16000):
        """Index all dialogue records found in ``data_dir``.

        Args:
            data_dir: Directory scanned (non-recursively) for ``.json`` files.
            sample_rate: Sampling rate in Hz forwarded to ``_load_audio``
                when an item is fetched.
        """
        super().__init__()
        self.sample_rate = sample_rate
        self.data_dir = data_dir
        self.metadata = []  # One small dict per dialogue; no audio is held here.
        self._load_metadata()
        logging.info("Loaded metadata for %d dialogues from %s", len(self.metadata), data_dir)

    def _load_metadata(self):
        """Fill ``self.metadata`` from the JSON files in ``self.data_dir``.

        Files that cannot be parsed, files whose top level is not a JSON
        object, and entries that are not JSON objects are skipped with a
        warning instead of aborting the whole scan.
        """
        # os.listdir returns names in arbitrary order; sorting keeps the
        # index -> dialogue mapping identical across runs and machines.
        for fname in sorted(os.listdir(self.data_dir)):
            if not fname.endswith('.json'):
                continue
            fpath = os.path.join(self.data_dir, fname)
            with open(fpath, 'r', encoding='utf8') as f:
                try:
                    json_obj = json.load(f)
                except Exception as e:
                    logging.warning(f"Failed to load {fpath}: {e}")
                    continue

            if not isinstance(json_obj, dict):
                logging.warning(
                    "Skipping %s: expected a JSON object at top level, got %s",
                    fpath, type(json_obj).__name__,
                )
                continue

            for dialogue_id, obj in json_obj.items():
                if not isinstance(obj, dict):
                    logging.warning(
                        "Skipping dialogue %r in %s: expected a JSON object, got %s",
                        dialogue_id, fpath, type(obj).__name__,
                    )
                    continue
                # Store only essential metadata
                self.metadata.append({
                    "id": dialogue_id,
                    "stereo_audio": obj.get("stereo_audio", None),
                    "gt_score": obj.get("gt_score", None),
                    "clean_dialogue": obj.get("clean_dialogue", None),
                    "json_path": fpath,
                })

    def __len__(self):
        """Return the number of indexed dialogues."""
        return len(self.metadata)

    def __getitem__(self, index):
        """Build the sample for the dialogue at ``index``.

        Returns:
            A dict with the keys ``"id"``, ``"prompt"``, ``"solution"``,
            ``"audio"`` and ``"clean_dialogue"``. ``"prompt"`` is a
            one-element list holding a single user turn made of an audio
            entry and the module-level ``prompt_template`` text. ``"audio"``
            is the result of ``_load_audio(...).numpy()``, or ``None`` when
            the record has no path or the file does not exist.
        """
        metadata = self.metadata[index]

        # Load audio only when needed
        audio = None
        if metadata["stereo_audio"] and os.path.exists(metadata["stereo_audio"]):
            audio = _load_audio(metadata["stereo_audio"], self.sample_rate).numpy()

        return {
            "id": metadata["id"],
            "prompt": [{
                "role": "user",
                "content": [
                    # The audio path is passed under the "audio" key (an
                    # earlier variant of this entry used "audio_url").
                    {"type": "audio", "audio": metadata["stereo_audio"]},
                    {"type": "text", "text": prompt_template}
                ]
            }],
            "solution": metadata["gt_score"],
            "audio": audio,
            "clean_dialogue": metadata["clean_dialogue"],
        }