{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build the LibriSpeech evaluation dataset\n",
    "\n",
    "Reads the JSONL description file, builds one `datasets.Dataset` per entry in `tasks`, and pushes the resulting `DatasetDict` to the Hugging Face Hub."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "import datasets\n",
    "\n",
    "# Schema shared by every split (audio, text fields, and source).\n",
    "# Adapt the columns to your own data; the only essential piece is\n",
    "# datasets.Audio: the \"audio\" column is given a file path when the data is\n",
    "# constructed, and once loaded through datasets.Audio the waveform is\n",
    "# available as np.array(float32) via doc[\"audio\"][\"array\"].\n",
    "features = datasets.Features(\n",
    "    {\n",
    "        \"audio\": datasets.Audio(sampling_rate=16000),\n",
    "        \"prompt\": datasets.Value(\"string\"),\n",
    "        \"gt\": datasets.Value(\"string\"),\n",
    "        \"source\": datasets.Value(\"string\"),\n",
    "        \"task\": datasets.Value(\"string\"),\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _load_records(data_path, keep=None):\n",
    "    \"\"\"Read a JSONL description file into parallel column lists.\n",
    "\n",
    "    Args:\n",
    "        data_path: Path to a JSONL file whose records provide the keys\n",
    "            'audio', 'prompt', 'gt', 'source' and 'task'.\n",
    "        keep: Optional predicate called with each decoded record; records\n",
    "            for which it returns a falsy value are skipped. None keeps\n",
    "            every record.\n",
    "\n",
    "    Returns:\n",
    "        A dict mapping 'audio', 'prompt', 'gt', 'source' and 'task' to\n",
    "        lists of equal length, in file order.\n",
    "\n",
    "    Raises:\n",
    "        json.JSONDecodeError: If a non-blank line is not valid JSON.\n",
    "        KeyError: If a record lacks a key that is read from it.\n",
    "    \"\"\"\n",
    "    columns = {'audio': [], 'prompt': [], 'gt': [], 'source': [], 'task': []}\n",
    "\n",
    "    # Decode as UTF-8 explicitly instead of relying on the platform default.\n",
    "    with open(data_path, 'r', encoding='utf-8') as f:\n",
    "        # Stream line by line rather than materialising the file with readlines().\n",
    "        for line in f:\n",
    "            line = line.strip()\n",
    "            if not line:\n",
    "                # json.loads('') raises, so blank lines (e.g. an empty last line) are skipped.\n",
    "                continue\n",
    "\n",
    "            record = json.loads(line)\n",
    "            if keep is not None and not keep(record):\n",
    "                continue\n",
    "\n",
    "            columns['audio'].append(record['audio'])  # Path to the actual audio file\n",
    "            # Every prompt is prefixed with the audio placeholder tokens.\n",
    "            columns['prompt'].append(\"<|audio_bos|><|AUDIO|><|audio_eos|>\" + record['prompt'])\n",
    "            columns['gt'].append(record['gt'])\n",
    "            columns['source'].append(record['source'])\n",
    "            columns['task'].append(record['task'])\n",
    "\n",
    "    return columns\n",
    "\n",
    "\n",
    "def load_audio_data(data_path):\n",
    "    \"\"\"Load every record of a JSONL description file as a dict of columns.\n",
    "\n",
    "    Args:\n",
    "        data_path: Path to the JSONL description file.\n",
    "\n",
    "    Returns:\n",
    "        A dict whose keys are the feature names ('audio', 'prompt', 'gt',\n",
    "        'source', 'task') and whose values are lists of data.\n",
    "    \"\"\"\n",
    "    return _load_records(data_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_audio_data_task(data_path, task):\n",
    "    \"\"\"Load only the records whose 'source' field equals `task`.\n",
    "\n",
    "    The comparison is made against the record's 'source' key, not its\n",
    "    'task' key.\n",
    "\n",
    "    Args:\n",
    "        data_path: Path to the JSONL description file.\n",
    "        task: Value that a record's 'source' field must equal to be kept.\n",
    "\n",
    "    Returns:\n",
    "        A dict with the same layout as the one returned by load_audio_data,\n",
    "        restricted to the matching records (empty lists when none match).\n",
    "    \"\"\"\n",
    "    return _load_records(data_path, keep=lambda record: record['source'] == task)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tasks = ['librispeech_test_other', 'librispeech_dev_other', 'librispeech_test_clean', 'librispeech_dev_clean']\n",
    "\n",
    "# Path to the JSONL description file\n",
    "data_description_path = \"./librispeech_eval.jsonl\"\n",
    "\n",
    "data_dict = {}\n",
    "for task in tasks:\n",
    "    # Column lists restricted to this entry of `tasks`\n",
    "    split_columns = load_audio_data_task(data_description_path, task)\n",
    "\n",
    "    # Create a Dataset from the columns, typed with the shared schema\n",
    "    dataset = datasets.Dataset.from_dict(split_columns, features=features)\n",
    "\n",
    "    # Verify the dataset structure\n",
    "    print(dataset)\n",
    "\n",
    "    data_dict[task] = dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bundle the per-task datasets and upload them to the Hub repository\n",
    "data = datasets.DatasetDict(data_dict)\n",
    "data.push_to_hub(\"Alarak/librispeech\")"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}