import argparse
import glob
import os

import librosa
import soundfile as sf
from tqdm import tqdm


def process_file(in_path, out_dir, min_sec=3.0, max_sec=15.0, top_db=40, sr=44100):
    """Split one audio file on silence and write bounded-length chunks.

    Loads ``in_path`` (mono, resampled to ``sr``), detects non-silent
    intervals with ``librosa.effects.split``, greedily merges adjacent
    intervals while the merged span stays within ``max_sec`` seconds, and
    writes each resulting chunk as a .wav file into ``out_dir``.

    Args:
        in_path: Path to the source audio file.
        out_dir: Directory chunks are written to (created if missing).
        min_sec: Chunks shorter than this are skipped, unless the file
            produced only a single chunk.
        max_sec: Soft upper bound on a merged chunk's duration. NOTE: a
            single non-silent interval longer than ``max_sec`` is still
            kept whole — merging never splits inside an interval.
        top_db: Silence threshold (dB below peak) for interval detection.
        sr: Target sample rate for loading and writing.

    Returns:
        Number of chunks written; 0 if the file failed to load.
    """
    os.makedirs(out_dir, exist_ok=True)
    try:
        # librosa converts to mono and resamples to `sr` on load
        y, _ = librosa.load(in_path, sr=sr)
    except Exception as e:
        print(f"Failed to load {in_path}: {e}")
        # Return 0 (not None) so callers can sum results uniformly.
        return 0

    # Non-silent spans as (start_sample, end_sample) index pairs.
    intervals = librosa.effects.split(y, top_db=top_db)

    # Greedily merge consecutive intervals so each committed chunk stays
    # at or below max_sec where possible.
    merged_intervals = []
    cur_start = cur_end = None
    for start, end in intervals:
        if cur_start is None:
            cur_start, cur_end = start, end
        elif (end - cur_start) / sr > max_sec:
            # Absorbing this interval would exceed max_sec: commit the
            # accumulated chunk and start a fresh one.
            merged_intervals.append((cur_start, cur_end))
            cur_start, cur_end = start, end
        else:
            cur_end = end
    if cur_start is not None:
        merged_intervals.append((cur_start, cur_end))

    # Filesystem-safe stem: strip only the real extension (the old
    # `.replace(".wav", "")` would also mangle ".wav" mid-name), then
    # replace any remaining dots.
    stem, _ext = os.path.splitext(os.path.basename(in_path))
    base_name = stem.replace(".", "_")

    saved_chunks = 0
    for i, (start, end) in enumerate(merged_intervals):
        duration = (end - start) / sr
        # Drop overly short chunks, unless it is the file's only chunk.
        if duration < min_sec and len(merged_intervals) > 1:
            continue
        out_filename = os.path.join(out_dir, f"{base_name}_{i:04d}.wav")
        sf.write(out_filename, y[start:end], sr)
        saved_chunks += 1
    return saved_chunks


def segment_dataset(input_dir, output_dir, sr=44100, top_db=40):
    """Recursively slice every .wav under ``input_dir`` into chunks.

    The first sub-folder under ``input_dir`` is used as the speaker
    namespace (e.g. ``OpenSinger/Singer_XX/song_YY.wav`` -> ``Singer_XX``);
    chunks land in ``output_dir/<speaker>/``.

    Args:
        input_dir: Root folder of the raw dataset.
        output_dir: Root folder the sliced chunks are written under.
        sr: Target sample rate, forwarded to ``process_file``.
        top_db: Silence threshold, forwarded to ``process_file``.
    """
    wavs = glob.glob(os.path.join(input_dir, "**", "*.wav"), recursive=True)
    if not wavs:
        print(f"No .wav files found in {input_dir}.")
        return

    print(f"Found {len(wavs)} huge .wav files.\nPreparing to segment into clips...")
    total_clips = 0
    for w in tqdm(wavs):
        # Derive the speaker namespace from the path relative to input_dir.
        rel_path = os.path.relpath(w, input_dir)
        parts = rel_path.split(os.sep)
        # Files sitting directly in input_dir get a default namespace.
        speaker_domain = parts[0] if len(parts) > 1 else "singer_00"

        out_d = os.path.join(output_dir, speaker_domain)
        chunks_created = process_file(w, out_d, sr=sr, top_db=top_db)
        if chunks_created:
            total_clips += chunks_created

    print(f"\nSegmentation complete! Sliced into {total_clips} valid distillation chunks.")
    print(f"Check results in {output_dir}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Cleanly slice continuous massive dataset wavs into optimal batch lengths."
    )
    parser.add_argument("--input_dir", type=str, default="./opensinger",
                        help="Folder containing raw continuous dataset")
    parser.add_argument("--output_dir", type=str, default="./dataset_raw",
                        help="Folder mapping where slices go for train prep")
    parser.add_argument("--sr", type=int, default=44100,
                        help="Universal resample rate")
    parser.add_argument("--top_db", type=int, default=40,
                        help="DB threshold for silence trimming")
    args = parser.parse_args()
    segment_dataset(args.input_dir, args.output_dir, sr=args.sr, top_db=args.top_db)