In [None]:
import datasets
import json

# Define the dataset features (audio, prompt, ground truth, source, and task).
# Adapt this schema to your needs; the only essential parts are declaring the audio column with datasets.Audio
# and supplying the path of each audio file for that column when the data is constructed.
# Once loaded through datasets.Audio, the audio data is accessible as an np.array(float32) via doc["audio"]["array"].
# Schema shared by every split: one audio column (declared with a 16000 sampling
# rate) followed by four plain string columns, in this exact order.
features = datasets.Features(
    {
        "audio": datasets.Audio(sampling_rate=16000),
        **{
            column: datasets.Value("string")
            for column in ("prompt", "gt", "source", "task")
        },
    }
)

In [None]:
# Load every record of the description file into a dict of column lists
def load_audio_data(data_path):
    """Read a JSON Lines description file into a column-oriented dict.

    Each non-blank line of the file is parsed as one JSON object that must
    provide the keys 'audio', 'prompt', 'gt', 'source' and 'task'.

    Args:
        data_path: Path of the JSON Lines file to read (decoded as UTF-8).

    Returns:
        A dict with the keys 'audio', 'prompt', 'gt', 'source' and 'task',
        each mapped to a list holding one entry per record, in file order.
        Every prompt is prefixed with the audio placeholder tokens
        "<|audio_bos|><|AUDIO|><|audio_eos|>".

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    audio_list = []
    prompt_list = []
    gt_list = []
    source_list = []
    task_list = []

    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(data_path, 'r', encoding='utf-8') as f:
        # Stream the file instead of materialising every line up front.
        for line in f:
            line = line.strip()
            if not line:
                # Blank (or whitespace-only) lines carry no record; feeding
                # them to json.loads would raise instead of being ignored.
                continue

            json_data = json.loads(line)

            audio_list.append(json_data['audio'])  # Path to the actual audio file
            prompt_list.append("<|audio_bos|><|AUDIO|><|audio_eos|>" + json_data['prompt'])
            gt_list.append(json_data['gt'])
            source_list.append(json_data['source'])
            task_list.append(json_data['task'])

    # Return a dictionary where keys are features and values are lists of data
    return {
        'audio': audio_list,
        'prompt': prompt_list,
        'gt': gt_list,
        'source': source_list,
        'task': task_list
    }

In [None]:
# Load only the records that belong to one task (matched against each record's 'source' field)
def load_audio_data_task(data_path, task):
    """Read the records of one task from a JSON Lines description file.

    Each non-blank line of the file is parsed as one JSON object. A record is
    kept only when its 'source' field equals ``task``; note that the filter is
    applied to 'source', not to the record's own 'task' field.

    Args:
        data_path: Path of the JSON Lines file to read (decoded as UTF-8).
        task: Value that a record's 'source' field must equal to be kept.

    Returns:
        A dict with the keys 'audio', 'prompt', 'gt', 'source' and 'task',
        each mapped to a list holding one entry per kept record, in file
        order. Every prompt is prefixed with the audio placeholder tokens
        "<|audio_bos|><|AUDIO|><|audio_eos|>". All lists are empty when no
        record matches.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks 'source', or a kept record lacks one of
            the other required keys.
    """
    audio_list = []
    prompt_list = []
    gt_list = []
    source_list = []
    task_list = []

    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(data_path, 'r', encoding='utf-8') as f:
        # Stream the file instead of materialising every line up front.
        for line in f:
            line = line.strip()
            if not line:
                # Blank (or whitespace-only) lines carry no record; feeding
                # them to json.loads would raise instead of being ignored.
                continue

            json_data = json.loads(line)
            if json_data['source'] != task:
                continue

            audio_list.append(json_data['audio'])  # Path to the actual audio file
            prompt_list.append("<|audio_bos|><|AUDIO|><|audio_eos|>" + json_data['prompt'])
            gt_list.append(json_data['gt'])
            source_list.append(json_data['source'])
            task_list.append(json_data['task'])

    # Return a dictionary where keys are features and values are lists of data
    return {
        'audio': audio_list,
        'prompt': prompt_list,
        'gt': gt_list,
        'source': source_list,
        'task': task_list
    }


# Names used both to select records (they are matched against each record's
# 'source' field) and as the keys of data_dict.
tasks = [
    'librispeech_test_other',
    'librispeech_dev_other',
    'librispeech_test_clean',
    'librispeech_dev_clean',
]

# JSON Lines file describing every evaluation sample
data_description_path = "./librispeech_eval.jsonl"

# Maps each task name to the Dataset built from its records
data_dict = {}
for task in tasks:
    # Each call re-opens the description file and keeps only this task's rows
    data = load_audio_data_task(data_description_path, task)

    # Apply the shared schema while turning the column dict into a Dataset
    dataset = datasets.Dataset.from_dict(data, features=features)

    # Print the Dataset summary as a quick sanity check
    print(dataset)

    data_dict[task] = dataset


In [None]:
# Bundle the per-task datasets into a single DatasetDict keyed by task name.
# Note: this rebinds `data`, which the loading loop used for the per-task column dict.
data = datasets.DatasetDict(data_dict)
# Upload the whole DatasetDict to the "Alarak/librispeech" repository on the Hugging Face Hub.
# NOTE(review): presumably requires being logged in with write access to that repository — confirm before running.
data.push_to_hub("Alarak/librispeech")