| import os |
| import glob |
| import librosa |
| import soundfile as sf |
| import argparse |
| from tqdm import tqdm |
|
|
def _merge_intervals(intervals, sr, max_sec):
    """Greedily merge adjacent voiced intervals into runs of at most max_sec seconds.

    Returns a list of (start, end) sample-index tuples. A run is flushed as soon
    as extending it with the next interval would exceed max_sec.
    """
    merged = []
    cur_start = None
    cur_end = None
    for start, end in intervals:
        if cur_start is None:
            cur_start, cur_end = start, end
        elif (end - cur_start) / sr > max_sec:
            # Extending would overflow the budget: flush and start a new run.
            merged.append((cur_start, cur_end))
            cur_start, cur_end = start, end
        else:
            cur_end = end
    if cur_start is not None:
        merged.append((cur_start, cur_end))
    return merged


def _hard_split(chunks, sr, max_sec):
    """Cut any chunk longer than max_sec into max_sec-sized pieces.

    The greedy merge above never shortens a *single* voiced interval, so one
    long interval could otherwise produce a chunk far beyond max_sec.
    """
    max_samples = int(max_sec * sr)
    result = []
    for start, end in chunks:
        while (end - start) / sr > max_sec:
            result.append((start, start + max_samples))
            start += max_samples
        result.append((start, end))
    return result


def process_file(in_path, out_dir, min_sec=3.0, max_sec=15.0, top_db=40, sr=44100):
    """Split one audio file on silence and write voiced chunks into out_dir.

    Args:
        in_path: Path to the source audio file (expected .wav).
        out_dir: Destination directory; created if missing.
        min_sec: Chunks shorter than this are skipped, unless the file yields
            only a single chunk (then it is kept regardless).
        max_sec: Upper bound on chunk duration in seconds; longer spans are
            merged up to this limit and hard-split beyond it.
        top_db: Silence threshold (dB below peak) for librosa's splitter.
        sr: Sample rate to resample to on load.

    Returns:
        Number of chunk files written; 0 if the file failed to load.
    """
    os.makedirs(out_dir, exist_ok=True)
    try:
        y, _ = librosa.load(in_path, sr=sr)
    except Exception as e:
        print(f"Failed to load {in_path}: {e}")
        # Normalize the failure path to an int so callers can sum results.
        return 0

    intervals = librosa.effects.split(y, top_db=top_db)
    merged_intervals = _hard_split(_merge_intervals(intervals, sr, max_sec), sr, max_sec)

    # splitext strips any extension (case-insensitive), unlike the previous
    # literal ".wav" replace; dots in the stem become underscores for safe names.
    base_name = os.path.splitext(os.path.basename(in_path))[0].replace(".", "_")

    saved_chunks = 0
    for i, (start, end) in enumerate(merged_intervals):
        duration = (end - start) / sr
        # Drop too-short chunks, but keep a lone chunk so the file isn't lost.
        if duration < min_sec and len(merged_intervals) > 1:
            continue

        chunk_data = y[start:end]
        out_filename = os.path.join(out_dir, f"{base_name}_{i:04d}.wav")
        sf.write(out_filename, chunk_data, sr)
        saved_chunks += 1

    return saved_chunks
|
|
def segment_dataset(input_dir, output_dir, sr=44100, top_db=40):
    """Recursively slice every .wav under input_dir into clips below output_dir.

    Clips are grouped per speaker/domain: the first directory component of a
    file's path relative to input_dir names its output subfolder; files sitting
    directly in input_dir fall back to "singer_00".
    """
    wav_paths = glob.glob(os.path.join(input_dir, "**", "*.wav"), recursive=True)
    if not wav_paths:
        print(f"No .wav files found in {input_dir}.")
        return

    print(f"Found {len(wav_paths)} huge .wav files. Preparing to segment into clips...")

    total_clips = 0
    for wav_path in tqdm(wav_paths):
        # The leading path component below input_dir identifies the speaker.
        components = os.path.relpath(wav_path, input_dir).split(os.sep)
        speaker_domain = components[0] if len(components) > 1 else "singer_00"

        target_dir = os.path.join(output_dir, speaker_domain)
        produced = process_file(wav_path, target_dir, sr=sr, top_db=top_db)
        if produced:
            total_clips += produced

    print(f"\nSegmentation complete! Sliced into {total_clips} valid distillation chunks.")
    print(f"Check results in {output_dir}")
|
|
if __name__ == "__main__":
    # CLI entry point: collect paths and slicing parameters, then run the job.
    cli = argparse.ArgumentParser(
        description="Cleanly slice continuous massive dataset wavs into optimal batch lengths."
    )
    cli.add_argument("--input_dir", type=str, default="./opensinger",
                     help="Folder containing raw continuous dataset")
    cli.add_argument("--output_dir", type=str, default="./dataset_raw",
                     help="Folder mapping where slices go for train prep")
    cli.add_argument("--sr", type=int, default=44100,
                     help="Universal resample rate")
    cli.add_argument("--top_db", type=int, default=40,
                     help="DB threshold for silence trimming")
    opts = cli.parse_args()

    segment_dataset(opts.input_dir, opts.output_dir, sr=opts.sr, top_db=opts.top_db)
|
|