| import numpy as np |
| import tqdm |
| import matplotlib.pyplot as plt |
| import os |
| import shutil |
| import wave |
|
|
# Minimum clip length in seconds; check_duration() only accepts WAV files
# whose duration is strictly greater than this value.
WAV_MIN_LENGTH = 2

# Share of each directory's WAV files to sample, expressed as a percentage;
# main() divides it by 100 to obtain the ratio passed to split_data().
SAMPLE_RATE = 1

# Lower and upper bounds that split_data() applies to the number of files
# it selects from a single directory.
SAMPLE_MIN = 2
SAMPLE_MAX = 10
|
|
|
|
| |
def check_duration(wav_file):
    """Return True when *wav_file* is longer than ``WAV_MIN_LENGTH`` seconds.

    The duration is computed from the WAV header as
    ``number_of_frames / frame_rate``.

    Args:
        wav_file: Path of the WAV file to inspect (anything ``wave.open``
            accepts in ``"rb"`` mode).

    Returns:
        bool: True if the duration is strictly greater than
        ``WAV_MIN_LENGTH``; False otherwise, including when the header
        reports a non-positive frame rate.

    Raises:
        Any exception raised by ``wave.open`` for a missing or unreadable
        file is propagated unchanged.
    """
    # The context manager closes the file even if reading the header fails.
    with wave.open(wav_file, "rb") as wav:
        frames = wav.getnframes()
        rate = wav.getframerate()

    # A zero frame rate would make the division below raise
    # ZeroDivisionError; such a file cannot satisfy the minimum length.
    if rate <= 0:
        return False

    return frames / float(rate) > WAV_MIN_LENGTH
|
|
| |
def split_data(src_dir, dst_dir, ratio):
    """Move a random sample of WAV files from *src_dir* into *dst_dir*.

    The directory tree below *src_dir* is walked recursively and mirrored
    under *dst_dir*. In every directory that directly contains ``.wav``
    files, ``int(len(files) * ratio)`` files are chosen at random, with the
    count clamped to ``[SAMPLE_MIN, SAMPLE_MAX]`` (and naturally limited by
    the number of files present). Chosen files that fail
    ``check_duration`` are skipped and are not replaced by other files.

    Args:
        src_dir: Directory to sample from.
        dst_dir: Directory that receives the sampled files; created if it
            does not exist (even when nothing ends up being moved).
        ratio: Fraction of the files in each directory to move.

    Returns:
        None.

    Raises:
        OSError: Propagated from ``os.listdir``, ``os.makedirs`` or
            ``shutil.move`` (e.g. when *src_dir* does not exist).
    """
    # List the source first so a missing src_dir fails before any
    # destination directory is created.
    subdirs, files = _scan_dir(src_dir)
    os.makedirs(dst_dir, exist_ok=True)

    if files:
        _move_sample(src_dir, dst_dir, files, ratio)
    elif not any(_scan_dir(os.path.join(src_dir, d))[1] for d in subdirs):
        # Neither this directory nor its immediate children hold WAV files.
        # Report it, but keep going: deeper levels may still contain some.
        print(f"Error: No wav files found in {src_dir}")

    for subdir in subdirs:
        split_data(
            os.path.join(src_dir, subdir),
            os.path.join(dst_dir, subdir),
            ratio,
        )


def _scan_dir(directory):
    """Return ``(subdir_names, wav_file_names)`` found directly in *directory*."""
    subdirs, wav_files = [], []
    for entry in os.listdir(directory):
        entry_path = os.path.join(directory, entry)
        if os.path.isdir(entry_path):
            subdirs.append(entry)
        elif os.path.isfile(entry_path) and entry.endswith(".wav"):
            wav_files.append(entry)
    return subdirs, wav_files


def _move_sample(src_dir, dst_dir, files, ratio):
    """Shuffle *files* in place and move the clamped sample into *dst_dir*."""
    num_files = max(SAMPLE_MIN, min(SAMPLE_MAX, int(len(files) * ratio)))

    np.random.shuffle(files)
    selected_files = files[:num_files]

    # The total is the number of files actually selected, which can be
    # smaller than num_files when the directory holds fewer files.
    with tqdm.tqdm(total=len(selected_files)) as pbar:
        for name in selected_files:
            src_file = os.path.join(src_dir, name)

            if not check_duration(src_file):
                print(
                    f"Skipped {src_file} because its duration is less than "
                    f"{WAV_MIN_LENGTH} seconds."
                )
                continue

            shutil.move(src_file, os.path.join(dst_dir, name))
            pbar.update(1)
|
|
| |
|
|
def main():
    """Sample WAV files from ``data/train/audio`` into ``data/val/audio``.

    Both directories are resolved relative to the current working
    directory. ``SAMPLE_RATE`` is interpreted as a percentage and converted
    to the fractional ratio expected by ``split_data``.
    """
    root_dir = os.path.abspath('.')

    # os.path.join keeps the paths valid regardless of the platform's
    # path separator.
    src_dir = os.path.join(root_dir, "data", "train", "audio")
    dst_dir = os.path.join(root_dir, "data", "val", "audio")

    ratio = float(SAMPLE_RATE) / 100

    split_data(src_dir, dst_dir, ratio)
|
|
| |
# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()