|
|
import json |
|
|
import logging |
|
|
import os |
|
|
|
|
|
import torchaudio |
|
|
from torch.utils.data import Dataset |
|
|
|
|
|
|
|
|
def _handle_wav(wav_path, target_rate=16000):
    """Load one wav file and return its first channel at ``target_rate``.

    Args:
        wav_path: Path to an audio file readable by ``torchaudio.load``.
        target_rate: Desired sample rate in Hz; audio whose native rate
            differs is resampled to this rate.

    Returns:
        1-D ``torch.Tensor`` containing the first channel of the
        (possibly resampled) waveform.
    """
    waveform, sample_rate = torchaudio.load(wav_path)
    # Bug fix: compare against target_rate rather than the hard-coded
    # 16000, so callers passing a non-default rate actually get resampling.
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_rate
        )
        waveform = resampler(waveform)
    # Keep only the first channel (mono output).
    audio = waveform[0]
    return audio
|
|
|
|
|
def _handle_qa(obj, is_think=True, think_max_len=50): |
|
|
|
|
|
system_prompt = 'You are an audio deep-thinking model. Upon receiving a question, please respond in two parts: <THINK> and <RESPONSE>. The <THINK> section should be further divided into four parts: <PLANNING>, <CAPTION>, <REASONING>, and <SUMMARY>.' |
|
|
prompt_template1 = ( |
|
|
system_prompt + "\n" + |
|
|
"# Dialogue Response Evaluation\n\n" |
|
|
"**IMPORTANT:** Evaluation must include `<think>` analysis and `<score>` rating.\n\n" |
|
|
"Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n" |
|
|
"**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n" |
|
|
"## Scoring Criteria\n\n" |
|
|
"**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n" |
|
|
"**3 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n" |
|
|
"**5 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n" |
|
|
"## Evaluation Requirements\n\n" |
|
|
"Response **MUST** follow this format:\n\n" |
|
|
"<think>\n" |
|
|
f"Analysing text relevance and paralinguistic information **Appropriateness** and reasons for scoring...(less than {think_max_len} words)\n" |
|
|
"</think>\n\n" |
|
|
"<score>X</score> (**X is 1, 3, or 5**)\n\n") |
|
|
|
|
|
|
|
|
prompt_template2 = ( |
|
|
system_prompt + "\n" + |
|
|
"# Dialogue Response Evaluation\n\n" |
|
|
"**IMPORTANT:** Evaluation must include `<think>` analysis and `<score>` rating.\n\n" |
|
|
"Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n" |
|
|
"**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n" |
|
|
"## Scoring Criteria\n\n" |
|
|
"**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n" |
|
|
"**3 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n" |
|
|
"**5 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n" |
|
|
"## Evaluation Requirements\n\n" |
|
|
"Response **MUST** follow this format:\n\n" |
|
|
"<think>\n" |
|
|
f"Analysing text relevance and paralinguistic information **Appropriateness** and reasons for scoring...(less than {think_max_len} words)\n" |
|
|
"</think>\n\n" |
|
|
"<score>X</score> (**X is 1, 3, or 5**)\n\n") |
|
|
|
|
|
|
|
|
|
|
|
obj["prompt1"] = [ |
|
|
{ |
|
|
"role": "system", |
|
|
"content": [ |
|
|
{"type": "text", "text": system_prompt} |
|
|
] |
|
|
}, |
|
|
{"role": "user", "content": [ |
|
|
{"type": "audio", "audio": obj["stereo_audio"]}, |
|
|
{"type": "text", "text": prompt_template1}] |
|
|
}] |
|
|
|
|
|
obj["prompt2"] = [ |
|
|
{ |
|
|
"role": "system", |
|
|
"content": [ |
|
|
{"type": "text", "text": system_prompt} |
|
|
] |
|
|
}, |
|
|
{"role": "user", "content": [ |
|
|
{"type": "audio", "audio": obj["stereo_audio"]}, |
|
|
{"type": "text", "text": prompt_template2} |
|
|
]}] |
|
|
|
|
|
|
|
|
obj["solution"] = obj["gt_score"] |
|
|
return obj |
|
|
|
|
|
|
|
|
class AudioDualPromptDataset(Dataset):
    """Dataset pairing stereo dialogue audio with dual evaluation prompts.

    Scans ``data_dir`` for ``*.json`` files, each mapping item ids to
    objects with ``stereo_audio`` (path) and ``gt_score`` fields, and
    serves prompt-annotated items built by ``_handle_qa``.
    """

    def __init__(self, data_dir, sample_rate=16000, is_think=True, think_max_len=50, load_audio=True):
        """Index all JSON metadata files under ``data_dir``.

        Args:
            data_dir: Directory containing ``*.json`` metadata files.
            sample_rate: Target audio sample rate.
                NOTE(review): stored but never read by __getitem__ — confirm
                whether audio loading was meant to happen here.
            is_think: Forwarded to ``_handle_qa`` (currently unused there).
            think_max_len: Word budget forwarded to ``_handle_qa``.
            load_audio: NOTE(review): stored but never read — presumably
                intended to gate waveform loading; verify against callers.
        """
        super().__init__()
        self.sample_rate = sample_rate
        self.is_think = is_think
        self.think_max_len = think_max_len
        self.load_audio = load_audio
        self.data_dir = data_dir
        self.metadata = []
        self._load_metadata()
        logging.info(f"Loaded metadata for {len(self.metadata)} items from {data_dir}")

    def _load_metadata(self):
        """Populate ``self.metadata`` from every JSON file in ``data_dir``.

        Unreadable or malformed files are logged and skipped rather than
        aborting the whole scan.
        """
        for fname in os.listdir(self.data_dir):
            if not fname.endswith('.json'):
                continue
            fpath = os.path.join(self.data_dir, fname)
            with open(fpath, 'r', encoding='utf8') as f:
                try:
                    json_obj = json.load(f)
                except Exception as e:
                    logging.warning(f"Failed to load {fpath}: {e}")
                    continue
            for item_id, obj in json_obj.items():
                self.metadata.append({
                    "id": item_id,
                    "stereo_audio": obj.get("stereo_audio", None),
                    "gt_score": obj.get("gt_score", None),
                    "json_path": fpath,
                })

    def __len__(self):
        """Number of indexed items."""
        return len(self.metadata)

    def __getitem__(self, index):
        """Return the prompt-annotated item at ``index``.

        Bug fix: pass a shallow copy to ``_handle_qa`` so its in-place
        additions (prompt1/prompt2/solution) do not mutate the cached
        ``self.metadata`` entries across repeated accesses.
        """
        item = dict(self.metadata[index])
        return _handle_qa(
            item,
            is_think=self.is_think,
            think_max_len=self.think_max_len,
        )