# Spaces: Sleeping
# (page-status banner captured together with the script; not part of the code)
"""Divide the audio dataset into train / dev / test lists that pyannote will later use.

Every ``.wav`` file found in ``audio_dir`` contributes one URI (its file name
without the extension). The URIs are shuffled with a fixed seed and written,
one per line, to ``train.txt``, ``dev.txt`` and ``test.txt`` in ``splits_dir``.
"""
import os
import random

# Path to audio folder
audio_dir = "dataset/audio"
# Folder that receives the train / dev / test lists
splits_dir = "dataset/splits"
# Number of recordings the dataset is expected to contain
EXPECTED_FILES = 89
# Fixed seed so the shuffle gives the same result on every run
SEED = 42
WAV_SUFFIX = ".wav"


def collect_uris(directory):
    """Return the sorted URIs of all wav files directly inside *directory*.

    A URI is the file name with its trailing ``.wav`` removed. Only the
    extension is stripped, so a name such as ``take.wav.v2.wav`` keeps its
    inner ``.wav`` and becomes ``take.wav.v2``.

    Raises:
        OSError: propagated from ``os.listdir`` (e.g. the folder is missing).
    """
    # os.listdir() returns names in arbitrary order; sorting gives a stable
    # starting point for the seeded shuffle.
    return sorted(
        entry[: -len(WAV_SUFFIX)]
        for entry in os.listdir(directory)
        if entry.endswith(WAV_SUFFIX)
    )


def split_uris(uris, seed=SEED):
    """Shuffle *uris* reproducibly and return ``(train, dev, test)`` lists.

    Dev and test each receive one tenth of the items (rounded half up) and
    train receives everything else, so no item is ever dropped. For the
    expected 89 files this yields 71 / 9 / 9.

    The input is not modified. It is sorted before shuffling so that the
    result depends only on the set of names and the seed, not on the order in
    which the names were supplied.
    """
    shuffled = sorted(uris)
    # A private generator produces the same permutation as
    # random.seed(seed) + random.shuffle() without touching global RNG state.
    random.Random(seed).shuffle(shuffled)

    held_out = (len(shuffled) + 5) // 10  # size of dev and of test
    train_end = len(shuffled) - 2 * held_out
    dev_end = train_end + held_out
    return shuffled[:train_end], shuffled[train_end:dev_end], shuffled[dev_end:]


def write_split(name, data, out_dir=splits_dir):
    """Write the URIs in *data*, one per line, to ``<out_dir>/<name>.txt``.

    *out_dir* must already exist; an existing file of the same name is
    overwritten.
    """
    with open(os.path.join(out_dir, f"{name}.txt"), "w", encoding="utf-8") as f:
        f.writelines(f"{uri}\n" for uri in data)


def main():
    """Build the three split lists from ``audio_dir`` and print a summary."""
    # Collect all wav files
    uris = collect_uris(audio_dir)

    # Safety check: warn, but still split whatever was found
    if len(uris) != EXPECTED_FILES:
        print(f"Warning: expected {EXPECTED_FILES} files, found {len(uris)}")

    train, dev, test = split_uris(uris)

    # Create splits folder if not exists
    os.makedirs(splits_dir, exist_ok=True)
    write_split("train", train)
    write_split("dev", dev)
    write_split("test", test)

    # Print summary
    print("Dataset split completed:")
    print(f" Train: {len(train)} files")
    print(f" Dev : {len(dev)} files")
    print(f" Test : {len(test)} files")


if __name__ == "__main__":
    main()

# With the expected 89 files:
# 71 for training
# 9 for validation (dev)
# 9 for testing