{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "source": [ "#convert file audio to mp3 32k" ], "metadata": { "id": "XE8k_JyY5eBb" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Wyv972sc5Lb4" }, "outputs": [], "source": [ "import os\n", "from multiprocessing import Pool, cpu_count\n", "from tqdm import tqdm\n", "import subprocess\n", "\n", "def process_file_ffmpeg(args):\n", " file_path, input_folder, output_folder = args\n", " rel_path = os.path.relpath(file_path, input_folder)\n", " rel_path = os.path.splitext(rel_path)[0] + \".mp3\" # luôn xuất mp3\n", " out_path = os.path.join(output_folder, rel_path)\n", " os.makedirs(os.path.dirname(out_path), exist_ok=True)\n", "\n", " cmd = [\n", " \"ffmpeg\",\n", " \"-y\", # overwrite nếu đã tồn tại\n", " \"-i\", file_path,\n", " \"-ar\", \"32000\", # sample rate 32kHz\n", " \"-ac\", \"1\", # stereo set thành 2 còn mono set thành 1\n", " \"-b:a\", \"192k\", # bitrate\n", " out_path\n", " ]\n", "\n", " try:\n", " subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)\n", " return True\n", " except subprocess.CalledProcessError:\n", " print(f\"❌ Lỗi khi xử lý {file_path}\")\n", " return False\n", "\n", "def convert_audio_ffmpeg_multiprocessing(input_folder, output_folder, num_workers=None):\n", " audio_exts = ('.mp3', '.wav', '.flac', '.m4a', '.ogg')\n", "\n", " # Lấy danh sách tất cả file audio\n", " all_files = []\n", " for root, _, files in os.walk(input_folder):\n", " for f in files:\n", " if f.lower().endswith(audio_exts):\n", " all_files.append(os.path.join(root, f))\n", "\n", " if num_workers is None:\n", " num_workers = cpu_count()\n", "\n", " args_list = [(f, input_folder, output_folder) for f in all_files]\n", "\n", " # Multiprocessing + tqdm\n", " with Pool(num_workers) as pool:\n", " for _ in tqdm(pool.imap_unordered(process_file_ffmpeg, args_list),\n", " total=len(args_list), desc=\"Converting to 32kHz MP3\"):\n", " pass\n", "\n", "# --- Ví dụ sử dụng ---\n", "input_dir = \"/content/dataset\"\n", "output_dir = \"/content/dataset_process\"\n", "convert_audio_ffmpeg_multiprocessing(input_dir, output_dir)\n" ] }, { "cell_type": "markdown", "source": [ "#mp3 to hdf5" ], "metadata": { "id": "kIoHloKr5ky7" } }, { "cell_type": "markdown", "source": [ "##Audioset" ], "metadata": { "id": "pctubCgR5sli" } }, { "cell_type": "code", "source": [ "import h5py\n", "import pandas as pd\n", "import numpy as np\n", "import csv\n", "import os\n", "import io\n", "import av\n", "\n", "def decode_mp3(mp3_arr):\n", " \"\"\"\n", " Giải mã một mảng uint8 đại diện cho một file MP3.\n", " :rtype: np.array\n", " \"\"\"\n", " container = av.open(io.BytesIO(mp3_arr.tobytes())) # Đọc dữ liệu MP3\n", " stream = next(s for s in container.streams if s.type == 'audio') # Lấy stream âm thanh\n", " a = []\n", " for i, packet in enumerate(container.demux(stream)): # Demux các gói dữ liệu âm thanh\n", " for frame in packet.decode(): # Giải mã frame\n", " a.append(frame.to_ndarray().reshape(-1)) # Chuyển đổi frame thành mảng numpy\n", " waveform = np.concatenate(a) # Kết nối tất cả các frame lại\n", " if waveform.dtype != 'float32': # Kiểm tra loại dữ liệu\n", " raise RuntimeError(\"Unexpected wave type\")\n", " return waveform\n", "\n", "# %%\n", "base_dir = \"/content/output_\"\n", "balanced_csv= '/content/new_updated_balanced_train_segments.csv'\n", "eval_csv= '/content/new_eval_segments.csv'\n", "mp3_path = \"/content/dataset/\"\n", "\n", "\n", "# %%\n", "\n", "\n", "def read_metadata(csv_path, classes_num, id_to_ix):\n", " \"\"\"Read metadata of AudioSet from a csv file.\"\"\"\n", "\n", " audio_names = []\n", " targets = []\n", "\n", " with open(csv_path, 'r') as fr:\n", " reader = csv.reader(fr)\n", " next(reader) # Skip header line if exists\n", " next(reader) # Skip another potential header line\n", " next(reader) # Skip another potential header line\n", "\n", " for line in reader:\n", " if len(line) < 4:\n", " continue # Skip malformed lines\n", "\n", " audio_name = 'Y{}.mp3'.format(line[0]) # Assumed naming convention\n", " label_ids = line[3].strip('\"').split(',')\n", "\n", " audio_names.append(audio_name)\n", " target = np.zeros(classes_num, dtype=bool)\n", "\n", " for label_id in label_ids:\n", " if label_id in id_to_ix:\n", " ix = id_to_ix[label_id]\n", " target[ix] = 1\n", " else:\n", " print(f\"Warning: Label ID {label_id} not found in id_to_ix.\")\n", "\n", " targets.append(target)\n", "\n", " meta_dict = {'audio_name': np.array(audio_names), 'target': np.array(targets)}\n", " print(meta_dict)\n", " return meta_dict\n", "\n", "# Load label\n", "with open('/content/new_class_labels_indices_filter_discard.csv', 'r') as f:\n", " reader = csv.reader(f, delimiter=',')\n", " lines = list(reader)\n", "\n", "labels = []\n", "ids = [] # Each label has a unique id such as \"/m/068hy\"\n", "for i1 in range(1, len(lines)):\n", " id = lines[i1][1]\n", " label = lines[i1][2]\n", " ids.append(id)\n", " labels.append(label)\n", "\n", "classes_num = len(labels)\n", "\n", "lb_to_ix = {label : i for i, label in enumerate(labels)}\n", "ix_to_lb = {i : label for i, label in enumerate(labels)}\n", "\n", "id_to_ix = {id : i for i, id in enumerate(ids)}\n", "ix_to_id = {i : id for i, id in enumerate(ids)}\n", "\n", "# %%\n", "\n", "def check_available(balanced_csv,balanced_audio_path,prefix=None):\n", " meta_csv = read_metadata(balanced_csv,classes_num,id_to_ix)\n", " #print(meta_csv)\n", " audios_num = len(meta_csv['audio_name'])\n", " found=0\n", " notfound=0\n", " available_files=[]\n", " available_targets=[]\n", " if prefix is None:\n", " prefix = os.path.basename(balanced_csv)[:-4]\n", " for n in range(audios_num):\n", " audio_path = meta_csv['audio_name'][n]\n", " #print(balanced_audio_path + f\"{prefix}/{audio_path}\")\n", " if os.path.isfile(balanced_audio_path + f\"{prefix}/{audio_path}\" ):\n", " found+=1\n", " available_files.append(meta_csv['audio_name'][n])\n", " available_targets.append(meta_csv['target'][n])\n", " else:\n", " notfound+=1\n", " print(f\"Found {found} . not found {notfound}\")\n", " return available_files,available_targets\n", "# %%\n", "\n", "# %%\n", "\n", "# %%\n", "\n", "\n", "os.makedirs(os.path.dirname(base_dir + \"mp3\"), exist_ok=True)\n", "\n", "for read_file,prefix in [(balanced_csv,\"balanced_train_segments/\"), (eval_csv,\"eval_segments/\"),]:\n", " print(\"now working on \",read_file,prefix)\n", " #files, y = torch.load(read_file+\".pth\")\n", " files, y = check_available(read_file, mp3_path, prefix)\n", " y = np.packbits(y, axis=-1)\n", " packed_len = y.shape[1]\n", " print(files[0], \"classes: \",packed_len, y.dtype)\n", " available_size = len(files)\n", " f = files[0][:-3]+\"mp3\"\n", " a = np.fromfile(mp3_path+prefix + \"/\"+f, dtype='uint8')\n", "\n", " dt = h5py.vlen_dtype(np.dtype('uint8'))\n", " save_file = prefix.split(\"/\")[0]\n", " os.makedirs(os.path.dirname(base_dir + \"mp3/\" ), exist_ok=True)\n", " with h5py.File(base_dir+ \"mp3/\" + save_file+\"_mp3.hdf\", 'w') as hf:\n", " audio_name = hf.create_dataset('audio_name', shape=(0,), maxshape=(None,), dtype='S20')\n", " waveform = hf.create_dataset('mp3', shape=(0,), maxshape=(None,), dtype=dt)\n", " target = hf.create_dataset('target', shape=(0, packed_len), maxshape=(None, packed_len), dtype=y.dtype)\n", " for i,file in enumerate(files):\n", " if i%1000==0:\n", " print(f\"{i}/{available_size}\")\n", " f = file[:-3] + \"mp3\"\n", " a = np.fromfile(mp3_path + prefix + f, dtype='uint8')\n", " try:\n", " # Kiểm tra xem file audio có đọc được không\n", " decode_mp3(a) # Dùng hàm decode_mp3 của bạn\n", "\n", " audio_name.resize((i + 1,))\n", " waveform.resize((i + 1,))\n", " target.resize((i + 1, packed_len))\n", "\n", " audio_name[i]=f\n", " waveform[i] = a\n", " target[i] = y[i]\n", " except Exception as e:\n", " print(f\"File lỗi tại index {i} với file {file}: {e}\")\n", "\n", " print(a.shape)\n", " print(\"Done!\" , prefix)" ], "metadata": { "id": "8oFKEbtb5mzr" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "##For this structure folder/train/folder(class)/file" ], "metadata": { "id": "mCwcSx8y5v7q" } }, { "cell_type": "markdown", "source": [ "![image.png]()" ], "metadata": { "id": "cRAr5tkn566K" } }, { "cell_type": "code", "source": [ "import h5py\n", "import numpy as np\n", "import os\n", "import io\n", "import av\n", "from pathlib import Path\n", "from tqdm import tqdm\n", "\n", "def decode_mp3(mp3_arr):\n", " \"\"\"\n", " Giải mã một mảng uint8 đại diện cho một file MP3.\n", " :rtype: np.array\n", " \"\"\"\n", " try:\n", " container = av.open(io.BytesIO(mp3_arr.tobytes()))\n", " stream = next(s for s in container.streams if s.type == 'audio')\n", " a = []\n", " for packet in container.demux(stream):\n", " for frame in packet.decode():\n", " a.append(frame.to_ndarray().reshape(-1))\n", " waveform = np.concatenate(a)\n", " if waveform.dtype != 'float32':\n", " raise RuntimeError(\"Unexpected wave type\")\n", " return waveform\n", " except Exception as e:\n", " raise RuntimeError(f\"Cannot decode MP3: {e}\")\n", "\n", "def scan_dataset_structure(dataset_path):\n", " \"\"\"\n", " Quét cấu trúc thư mục dataset và tạo mapping cho classes\n", " Structure: dataset_path/train(or test)/class_name/*.mp3\n", " \"\"\"\n", " dataset_path = Path(dataset_path)\n", "\n", " # Lấy tất cả các class names từ thư mục train\n", " train_path = dataset_path / \"train\"\n", " if not train_path.exists():\n", " raise ValueError(f\"Train folder not found: {train_path}\")\n", "\n", " classes = sorted([d.name for d in train_path.iterdir() if d.is_dir()])\n", " classes_num = len(classes)\n", "\n", " # Tạo mapping\n", " lb_to_ix = {label: i for i, label in enumerate(classes)}\n", " ix_to_lb = {i: label for i, label in enumerate(classes)}\n", "\n", " print(f\"Found {classes_num} classes: {classes[:7]}...\" if len(classes) > 7 else f\"Found {classes_num} classes: {classes}\")\n", "\n", " return classes, classes_num, lb_to_ix, ix_to_lb\n", "\n", "def collect_audio_files(dataset_path, split='train', shuffle=True, random_seed=42):\n", " \"\"\"\n", " Thu thập tất cả audio files từ structure thư mục và shuffle để tránh grouping theo class\n", " \"\"\"\n", " dataset_path = Path(dataset_path)\n", " split_path = dataset_path / split\n", "\n", " if not split_path.exists():\n", " raise ValueError(f\"{split} folder not found: {split_path}\")\n", "\n", " audio_files = []\n", " labels = []\n", " class_counts = {}\n", "\n", " class_dirs = [d for d in split_path.iterdir() if d.is_dir()]\n", "\n", " print(f\"📁 Scanning {split} folder...\")\n", " for class_dir in tqdm(class_dirs, desc=f\"Scanning classes\"):\n", " class_name = class_dir.name\n", " mp3_files = list(class_dir.glob(\"*.mp3\"))\n", " class_counts[class_name] = len(mp3_files)\n", "\n", " for mp3_file in mp3_files:\n", " audio_files.append(str(mp3_file))\n", " labels.append(class_name)\n", "\n", " # Shuffle để tránh việc group theo class trong HDF5\n", " if shuffle:\n", " import random\n", " random.seed(random_seed)\n", "\n", " # Zip files và labels lại, sau đó shuffle\n", " combined = list(zip(audio_files, labels))\n", " random.shuffle(combined)\n", "\n", " # Unpack lại\n", " audio_files, labels = zip(*combined)\n", " audio_files = list(audio_files)\n", " labels = list(labels)\n", "\n", " print(f\"🔀 Files shuffled with seed={random_seed}\")\n", "\n", " # In class distribution\n", " print(f\"✅ Found {len(audio_files)} audio files in {split} set\")\n", " print(f\"📊 Class distribution:\")\n", " for class_name, count in sorted(class_counts.items()):\n", " percentage = count / len(audio_files) * 100\n", " print(f\" {class_name}: {count} files ({percentage:.1f}%)\")\n", "\n", " return audio_files, labels\n", "\n", "def create_target_array(labels, classes_num, lb_to_ix):\n", " \"\"\"\n", " Tạo target array từ danh sách labels\n", " \"\"\"\n", " targets = []\n", " for label in labels:\n", " target = np.zeros(classes_num, dtype=bool)\n", " if label in lb_to_ix:\n", " ix = lb_to_ix[label]\n", " target[ix] = 1\n", " targets.append(target)\n", "\n", " return np.array(targets)\n", "\n", "def convert_to_hdf5(dataset_path, output_dir):\n", " \"\"\"\n", " Convert audio dataset to HDF5 format\n", " \"\"\"\n", " # Tạo output directory\n", " os.makedirs(output_dir, exist_ok=True)\n", "\n", " # Quét cấu trúc dataset\n", " classes, classes_num, lb_to_ix, ix_to_lb = scan_dataset_structure(dataset_path)\n", "\n", " # Process both train and test splits\n", " for split in ['train', 'test']:\n", " print(f\"\\n=== Processing {split} set ===\")\n", "\n", " try:\n", " # Thu thập audio files\n", " audio_files, labels = collect_audio_files(dataset_path, split)\n", "\n", " if len(audio_files) == 0:\n", " print(f\"No audio files found in {split} set, skipping...\")\n", " continue\n", "\n", " # Tạo target array\n", " targets = create_target_array(labels, classes_num, lb_to_ix)\n", "\n", " # Pack targets để tiết kiệm memory\n", " packed_targets = np.packbits(targets, axis=-1)\n", " packed_len = packed_targets.shape[1]\n", "\n", " print(f\"Target shape: {targets.shape} -> Packed: {packed_targets.shape}\")\n", "\n", " # Tạo HDF5 file\n", " dt = h5py.vlen_dtype(np.dtype('uint8'))\n", " hdf5_path = os.path.join(output_dir, f\"{split}_mp3.hdf5\")\n", "\n", " with h5py.File(hdf5_path, 'w') as hf:\n", " # Tạo datasets\n", " audio_name_ds = hf.create_dataset('audio_name', shape=(0,), maxshape=(None,), dtype='S200')\n", " waveform_ds = hf.create_dataset('mp3', shape=(0,), maxshape=(None,), dtype=dt)\n", " target_ds = hf.create_dataset('target', shape=(0, packed_len), maxshape=(None, packed_len), dtype=packed_targets.dtype)\n", "\n", " # Lưu class info\n", " hf.attrs['classes'] = [c.encode('utf-8') for c in classes]\n", " hf.attrs['classes_num'] = classes_num\n", "\n", " valid_count = 0\n", " error_count = 0\n", "\n", " # Process từng file với tqdm\n", " pbar = tqdm(zip(audio_files, labels),\n", " total=len(audio_files),\n", " desc=f\"Converting {split}\")\n", "\n", " for i, (audio_file, label) in enumerate(pbar):\n", " try:\n", " # Đọc file MP3\n", " audio_data = np.fromfile(audio_file, dtype='uint8')\n", "\n", " # Kiểm tra tính hợp lệ bằng cách decode\n", " decode_mp3(audio_data)\n", "\n", " # Resize datasets\n", " audio_name_ds.resize((valid_count + 1,))\n", " waveform_ds.resize((valid_count + 1,))\n", " target_ds.resize((valid_count + 1, packed_len))\n", "\n", " # Lưu data\n", " filename = os.path.basename(audio_file).encode('utf-8')\n", " audio_name_ds[valid_count] = filename\n", " waveform_ds[valid_count] = audio_data\n", " target_ds[valid_count] = packed_targets[i]\n", "\n", " valid_count += 1\n", "\n", " # Update progress bar\n", " pbar.set_postfix({\n", " 'Valid': valid_count,\n", " 'Errors': error_count,\n", " 'Success Rate': f\"{valid_count/(i+1)*100:.1f}%\"\n", " })\n", "\n", " except Exception as e:\n", " error_count += 1\n", " pbar.set_postfix({\n", " 'Valid': valid_count,\n", " 'Errors': error_count,\n", " 'Success Rate': f\"{valid_count/(i+1)*100:.1f}%\"\n", " })\n", " if error_count <= 5: # Chỉ show 5 error đầu tiên\n", " tqdm.write(f\"❌ Error processing {os.path.basename(audio_file)}: {e}\")\n", " continue\n", "\n", " pbar.close()\n", "\n", " print(f\"Successfully processed {valid_count}/{len(audio_files)} files\")\n", " print(f\"Saved to: {hdf5_path}\")\n", "\n", " except Exception as e:\n", " print(f\"Error processing {split} set: {e}\")\n", "\n", "def main():\n", " # Cấu hình paths\n", " dataset_path = \"/content/dataset\" # Thay đổi path này\n", " output_dir = \"/content/dataset_hdf5\" # Thay đổi path này\n", "\n", " # Chạy conversion\n", " convert_to_hdf5(dataset_path, output_dir)\n", " print(\"\\n=== Conversion completed! ===\")\n", "\n", "if __name__ == \"__main__\":\n", " # Example usage:\n", " # dataset_path = \"/content/audio_dataset\"\n", " # output_dir = \"/content/output_hdf5\"\n", " # convert_to_hdf5(dataset_path, output_dir)\n", " main()" ], "metadata": { "id": "lcdNaKMx59ip" }, "execution_count": null, "outputs": [] } ] }