import argparse
import glob
import os

import librosa
import soundfile as sf
from tqdm import tqdm


def process_file(in_path, out_dir, min_sec=3.0, max_sec=15.0, top_db=40, sr=44100):
    """Split one audio file on silence and write bounded-length chunks.

    Loads ``in_path`` (mono, resampled to ``sr``), detects non-silent
    intervals with ``librosa.effects.split``, greedily merges adjacent
    intervals while the merged span stays within ``max_sec`` seconds, and
    writes each resulting chunk as a .wav file into ``out_dir``.

    Args:
        in_path: Path to the source audio file.
        out_dir: Directory chunks are written to (created if missing).
        min_sec: Chunks shorter than this are skipped, unless the file
            produced only a single chunk.
        max_sec: Soft upper bound on a merged chunk's duration. NOTE: a
            single non-silent interval longer than ``max_sec`` is still
            kept whole — merging never splits inside an interval.
        top_db: Silence threshold (dB below peak) for interval detection.
        sr: Target sample rate for loading and writing.

    Returns:
        Number of chunks written; 0 if the file failed to load.
    """
    os.makedirs(out_dir, exist_ok=True)
    try:
        # librosa converts to mono and resamples to `sr` on load
        y, _ = librosa.load(in_path, sr=sr)
    except Exception as e:
        print(f"Failed to load {in_path}: {e}")
        # Return 0 (not None) so callers can sum results uniformly.
        return 0

    # Non-silent spans as (start_sample, end_sample) index pairs.
    intervals = librosa.effects.split(y, top_db=top_db)

    # Greedily merge consecutive intervals so each committed chunk stays
    # at or below max_sec where possible.
    merged_intervals = []
    cur_start = cur_end = None
    for start, end in intervals:
        if cur_start is None:
            cur_start, cur_end = start, end
        elif (end - cur_start) / sr > max_sec:
            # Absorbing this interval would exceed max_sec: commit the
            # accumulated chunk and start a fresh one.
            merged_intervals.append((cur_start, cur_end))
            cur_start, cur_end = start, end
        else:
            cur_end = end
    if cur_start is not None:
        merged_intervals.append((cur_start, cur_end))

    # Filesystem-safe stem: strip only the real extension (the old
    # `.replace(".wav", "")` would also mangle ".wav" mid-name), then
    # replace any remaining dots.
    stem, _ext = os.path.splitext(os.path.basename(in_path))
    base_name = stem.replace(".", "_")

    saved_chunks = 0
    for i, (start, end) in enumerate(merged_intervals):
        duration = (end - start) / sr
        # Drop overly short chunks, unless it is the file's only chunk.
        if duration < min_sec and len(merged_intervals) > 1:
            continue
        out_filename = os.path.join(out_dir, f"{base_name}_{i:04d}.wav")
        sf.write(out_filename, y[start:end], sr)
        saved_chunks += 1
    return saved_chunks


def segment_dataset(input_dir, output_dir, sr=44100, top_db=40):
    """Recursively slice every .wav under ``input_dir`` into chunks.

    The first sub-folder under ``input_dir`` is used as the speaker
    namespace (e.g. ``OpenSinger/Singer_XX/song_YY.wav`` -> ``Singer_XX``);
    chunks land in ``output_dir/<speaker>/``.

    Args:
        input_dir: Root folder of the raw dataset.
        output_dir: Root folder the sliced chunks are written under.
        sr: Target sample rate, forwarded to ``process_file``.
        top_db: Silence threshold, forwarded to ``process_file``.
    """
    wavs = glob.glob(os.path.join(input_dir, "**", "*.wav"), recursive=True)
    if not wavs:
        print(f"No .wav files found in {input_dir}.")
        return

    print(f"Found {len(wavs)} huge .wav files.\nPreparing to segment into clips...")
    total_clips = 0
    for w in tqdm(wavs):
        # Derive the speaker namespace from the path relative to input_dir.
        rel_path = os.path.relpath(w, input_dir)
        parts = rel_path.split(os.sep)
        # Files sitting directly in input_dir get a default namespace.
        speaker_domain = parts[0] if len(parts) > 1 else "singer_00"

        out_d = os.path.join(output_dir, speaker_domain)
        chunks_created = process_file(w, out_d, sr=sr, top_db=top_db)
        if chunks_created:
            total_clips += chunks_created

    print(f"\nSegmentation complete! Sliced into {total_clips} valid distillation chunks.")
    print(f"Check results in {output_dir}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Cleanly slice continuous massive dataset wavs into optimal batch lengths."
    )
    parser.add_argument("--input_dir", type=str, default="./opensinger",
                        help="Folder containing raw continuous dataset")
    parser.add_argument("--output_dir", type=str, default="./dataset_raw",
                        help="Folder mapping where slices go for train prep")
    parser.add_argument("--sr", type=int, default=44100,
                        help="Universal resample rate")
    parser.add_argument("--top_db", type=int, default=40,
                        help="DB threshold for silence trimming")
    args = parser.parse_args()
    segment_dataset(args.input_dir, args.output_dir, sr=args.sr, top_db=args.top_db)