Spaces:
Sleeping
Sleeping
File size: 1,495 Bytes
9583919 5f6b40b 9583919 5f6b40b 9583919 5f6b40b 9583919 5f6b40b 9583919 5f6b40b 9583919 5f6b40b 9583919 5f6b40b 9583919 5f6b40b 9583919 5f6b40b 9583919 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 | import json
import os
import random
from pathlib import Path
import argbind
from tqdm import tqdm
@argbind.bind(without_prefix=True)
def train_test_split(
    audio_folder: str = ".",
    test_size: float = 0.2,
    seed: int = 42,
    pattern: str = "**/*.mp3",
):
    """Split the audio files under a folder into train and test sets.

    Files matching ``pattern`` are collected, shuffled with ``seed`` and
    divided into two groups. After an interactive confirmation, each file is
    symlinked (by base name only) into a sibling directory named
    ``<audio_folder name>-train`` or ``<audio_folder name>-test``, and the
    list of files in each split is written to ``train.json`` / ``test.json``
    inside ``audio_folder``.

    Because links use the base name only, two matched files with the same
    name in different subfolders map to the same link path, and
    ``os.symlink`` raises ``FileExistsError`` for the second one (as it does
    when a link from a previous run is still present).

    Parameters
    ----------
    audio_folder : str
        Folder that is searched for audio files.
    test_size : float
        Fraction of the files, between 0 and 1, assigned to the test split.
    seed : int
        Seed for the shuffle that decides which files land in which split.
    pattern : str
        Glob pattern, relative to ``audio_folder``, selecting the files.

    Raises
    ------
    ValueError
        If ``test_size`` is not between 0 and 1.
    """
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    print("finding audio")
    audio_folder = Path(audio_folder)
    # glob yields entries in filesystem order, which differs between
    # machines; sorting first makes a given seed produce the same split.
    audio_files = sorted(tqdm(audio_folder.glob(pattern)))
    print(f"found {len(audio_files)} audio files")

    # split according to test_size
    n_test = int(len(audio_files) * test_size)
    n_train = len(audio_files) - n_test

    # shuffle with a private generator so the global RNG state is untouched
    random.Random(seed).shuffle(audio_files)

    train_files = audio_files[:n_train]
    test_files = audio_files[n_train:]

    print(f"Train files: {len(train_files)}")
    print(f"Test files: {len(test_files)}")

    answer = input("Continue [yn]? ").strip().lower() or "n"
    if answer != "y":
        return

    # Normalise the folder before deriving the output names: for "." the
    # plain Path has an empty name, which would yield a directory called
    # "-train". abspath is lexical, so a symlinked audio_folder keeps its
    # outputs next to the path the user actually passed.
    folder_abs = Path(os.path.abspath(audio_folder))

    for split, files in (("train", train_files), ("test", test_files)):
        out_dir = folder_abs.parent / f"{folder_abs.name}-{split}"
        out_dir.mkdir(exist_ok=True, parents=True)

        for file in tqdm(files):
            # A relative link target is interpreted relative to the link's
            # own directory, so link to the absolute path of the file.
            os.symlink(file.resolve(), out_dir / file.name)

        # save split as json
        with open(audio_folder / f"{split}.json", "w") as f:
            json.dump([str(path) for path in files], f)
if __name__ == "__main__":
    # Parse the command line once and run the split with those bound values.
    with argbind.scope(argbind.parse_args()):
        train_test_split()
|