# cfm_svc / segment_opensinger.py
# Author: Hector Li — "Initial commit for Hugging Face" (commit df93d13)
import os
import glob
import librosa
import soundfile as sf
import argparse
from tqdm import tqdm
def process_file(in_path, out_dir, min_sec=3.0, max_sec=15.0, top_db=40, sr=44100):
    """Slice one audio file into voiced chunks of roughly min_sec..max_sec seconds.

    The file is loaded (mono, resampled to ``sr``), split on silence via
    ``librosa.effects.split``, and adjacent voiced intervals are merged until
    adding the next one would exceed ``max_sec``. Chunks shorter than
    ``min_sec`` are dropped unless they are the only chunk.

    Args:
        in_path: Path to the source .wav file.
        out_dir: Directory to write chunk .wav files into (created if missing).
        min_sec: Minimum chunk duration to keep, in seconds.
        max_sec: Maximum chunk duration, in seconds.
        top_db: Silence threshold (dB below peak) passed to librosa.effects.split.
        sr: Target sample rate for loading and writing.

    Returns:
        Number of chunk files written; 0 if the input could not be loaded.
    """
    os.makedirs(out_dir, exist_ok=True)
    try:
        # librosa.load converts to mono and resamples to sr by default.
        y, _ = librosa.load(in_path, sr=sr)
    except Exception as e:
        print(f"Failed to load {in_path}: {e}")
        # Consistent int return type (was None) so callers can sum directly.
        return 0

    # Voiced intervals as (start_idx, end_idx) sample pairs.
    intervals = librosa.effects.split(y, top_db=top_db)
    max_samples = int(max_sec * sr)

    # Greedily merge consecutive intervals while the running chunk stays
    # within max_sec; commit and start fresh when it would overflow.
    merged_intervals = []
    cur_start = None
    cur_end = None
    for start, end in intervals:
        if cur_start is None:
            cur_start, cur_end = start, end
        elif end - cur_start > max_samples:
            merged_intervals.append((cur_start, cur_end))
            cur_start, cur_end = start, end
        else:
            cur_end = end
    if cur_start is not None:
        merged_intervals.append((cur_start, cur_end))

    # BUG FIX: a single voiced interval longer than max_sec used to be emitted
    # unsplit, so chunks could exceed the cap arbitrarily. Hard-split any
    # oversized chunk into max_sec-sized pieces to actually enforce max_sec.
    bounded_intervals = []
    for start, end in merged_intervals:
        while end - start > max_samples:
            bounded_intervals.append((start, start + max_samples))
            start += max_samples
        bounded_intervals.append((start, end))

    # FIX: splitext only strips the real extension; the old
    # .replace(".wav", "") also mangled ".wav" appearing mid-name.
    base_name = os.path.splitext(os.path.basename(in_path))[0].replace(".", "_")
    saved_chunks = 0
    for i, (start, end) in enumerate(bounded_intervals):
        duration = (end - start) / sr
        # Drop too-short chunks unless this is the file's only chunk.
        if duration < min_sec and len(bounded_intervals) > 1:
            continue
        out_filename = os.path.join(out_dir, f"{base_name}_{i:04d}.wav")
        sf.write(out_filename, y[start:end], sr)
        saved_chunks += 1
    return saved_chunks
def segment_dataset(input_dir, output_dir, sr=44100, top_db=40):
    """Recursively find every .wav under input_dir and slice it into clips.

    Clips are grouped under output_dir by speaker, where the speaker name is
    the first path component below input_dir (OpenSinger layout:
    OpenSinger/Singer_XX/song_YY.wav); files sitting directly in input_dir
    fall back to the "singer_00" bucket.
    """
    wav_paths = glob.glob(os.path.join(input_dir, "**", "*.wav"), recursive=True)
    if not wav_paths:
        print(f"No .wav files found in {input_dir}.")
        return

    print(f"Found {len(wav_paths)} huge .wav files. Preparing to segment into clips...")
    total_clips = 0
    for wav_path in tqdm(wav_paths):
        # Speaker namespace = first sub-folder of the path relative to input_dir.
        path_parts = os.path.relpath(wav_path, input_dir).split(os.sep)
        speaker = path_parts[0] if len(path_parts) > 1 else "singer_00"
        clip_count = process_file(
            wav_path, os.path.join(output_dir, speaker), sr=sr, top_db=top_db
        )
        # process_file may return None on failure; treat that as zero clips.
        total_clips += clip_count or 0

    print(f"\nSegmentation complete! Sliced into {total_clips} valid distillation chunks.")
    print(f"Check results in {output_dir}")
if __name__ == "__main__":
    # CLI entry point: expose the segmentation knobs as flags and hand them
    # straight to segment_dataset.
    cli = argparse.ArgumentParser(description="Cleanly slice continuous massive dataset wavs into optimal batch lengths.")
    cli.add_argument("--input_dir", type=str, default="./opensinger", help="Folder containing raw continuous dataset")
    cli.add_argument("--output_dir", type=str, default="./dataset_raw", help="Folder mapping where slices go for train prep")
    cli.add_argument("--sr", type=int, default=44100, help="Universal resample rate")
    cli.add_argument("--top_db", type=int, default=40, help="DB threshold for silence trimming")
    opts = cli.parse_args()
    segment_dataset(opts.input_dir, opts.output_dir, sr=opts.sr, top_db=opts.top_db)