# cfm_svc / segment_opensinger.py
# Author: Hector Li — "Initial commit for Hugging Face" (commit df93d13)
import os
import glob
import librosa
import soundfile as sf
import argparse
from tqdm import tqdm
def process_file(in_path, out_dir, min_sec=3.0, max_sec=15.0, top_db=40, sr=44100):
    """Slice one audio file into voiced chunks of roughly min_sec..max_sec seconds.

    The file is loaded (mono, resampled to ``sr``), split on silence via
    ``librosa.effects.split``, and adjacent voiced intervals are merged until
    adding the next one would exceed ``max_sec``. Chunks shorter than
    ``min_sec`` are dropped unless they are the only chunk.

    Args:
        in_path: Path to the source .wav file.
        out_dir: Directory to write chunk .wav files into (created if missing).
        min_sec: Minimum chunk duration to keep, in seconds.
        max_sec: Maximum chunk duration, in seconds.
        top_db: Silence threshold (dB below peak) passed to librosa.effects.split.
        sr: Target sample rate for loading and writing.

    Returns:
        Number of chunk files written; 0 if the input could not be loaded.
    """
    os.makedirs(out_dir, exist_ok=True)
    try:
        # librosa.load converts to mono and resamples to sr by default.
        y, _ = librosa.load(in_path, sr=sr)
    except Exception as e:
        print(f"Failed to load {in_path}: {e}")
        # Consistent int return type (was None) so callers can sum directly.
        return 0

    # Voiced intervals as (start_idx, end_idx) sample pairs.
    intervals = librosa.effects.split(y, top_db=top_db)
    max_samples = int(max_sec * sr)

    # Greedily merge consecutive intervals while the running chunk stays
    # within max_sec; commit and start fresh when it would overflow.
    merged_intervals = []
    cur_start = None
    cur_end = None
    for start, end in intervals:
        if cur_start is None:
            cur_start, cur_end = start, end
        elif end - cur_start > max_samples:
            merged_intervals.append((cur_start, cur_end))
            cur_start, cur_end = start, end
        else:
            cur_end = end
    if cur_start is not None:
        merged_intervals.append((cur_start, cur_end))

    # BUG FIX: a single voiced interval longer than max_sec used to be emitted
    # unsplit, so chunks could exceed the cap arbitrarily. Hard-split any
    # oversized chunk into max_sec-sized pieces to actually enforce max_sec.
    bounded_intervals = []
    for start, end in merged_intervals:
        while end - start > max_samples:
            bounded_intervals.append((start, start + max_samples))
            start += max_samples
        bounded_intervals.append((start, end))

    # FIX: splitext only strips the real extension; the old
    # .replace(".wav", "") also mangled ".wav" appearing mid-name.
    base_name = os.path.splitext(os.path.basename(in_path))[0].replace(".", "_")
    saved_chunks = 0
    for i, (start, end) in enumerate(bounded_intervals):
        duration = (end - start) / sr
        # Drop too-short chunks unless this is the file's only chunk.
        if duration < min_sec and len(bounded_intervals) > 1:
            continue
        out_filename = os.path.join(out_dir, f"{base_name}_{i:04d}.wav")
        sf.write(out_filename, y[start:end], sr)
        saved_chunks += 1
    return saved_chunks
def segment_dataset(input_dir, output_dir, sr=44100, top_db=40):
    """Recursively find every .wav under input_dir and slice it into clips.

    Clips are grouped under output_dir by speaker, where the speaker name is
    the first path component below input_dir (OpenSinger layout:
    OpenSinger/Singer_XX/song_YY.wav); files sitting directly in input_dir
    fall back to the "singer_00" bucket.
    """
    wav_paths = glob.glob(os.path.join(input_dir, "**", "*.wav"), recursive=True)
    if not wav_paths:
        print(f"No .wav files found in {input_dir}.")
        return

    print(f"Found {len(wav_paths)} huge .wav files. Preparing to segment into clips...")
    total_clips = 0
    for wav_path in tqdm(wav_paths):
        # Speaker namespace = first sub-folder of the path relative to input_dir.
        path_parts = os.path.relpath(wav_path, input_dir).split(os.sep)
        speaker = path_parts[0] if len(path_parts) > 1 else "singer_00"
        clip_count = process_file(
            wav_path, os.path.join(output_dir, speaker), sr=sr, top_db=top_db
        )
        # process_file may return None on failure; treat that as zero clips.
        total_clips += clip_count or 0

    print(f"\nSegmentation complete! Sliced into {total_clips} valid distillation chunks.")
    print(f"Check results in {output_dir}")
if __name__ == "__main__":
    # CLI entry point: expose the segmentation knobs as flags and hand them
    # straight to segment_dataset.
    cli = argparse.ArgumentParser(description="Cleanly slice continuous massive dataset wavs into optimal batch lengths.")
    cli.add_argument("--input_dir", type=str, default="./opensinger", help="Folder containing raw continuous dataset")
    cli.add_argument("--output_dir", type=str, default="./dataset_raw", help="Folder mapping where slices go for train prep")
    cli.add_argument("--sr", type=int, default=44100, help="Universal resample rate")
    cli.add_argument("--top_db", type=int, default=40, help="DB threshold for silence trimming")
    opts = cli.parse_args()
    segment_dataset(opts.input_dir, opts.output_dir, sr=opts.sr, top_db=opts.top_db)