import os import pandas as pd from pathlib import Path from sklearn.model_selection import train_test_split from tqdm import tqdm import librosa def harmonize_data(raw_data_path, output_path): print(f"Scanning directory: {raw_data_path}") data = [] # Folder names are our labels emotion_folders = [f for f in os.listdir(raw_data_path) if os.path.isdir(os.path.join(raw_data_path, f))] # Map folder names to standard labels # Note: 'Suprised' is misspelled in the source, we'll keep it for mapping but label it 'surprised' label_map = {folder: folder.lower() for folder in emotion_folders} for folder in emotion_folders: folder_path = Path(raw_data_path) / folder files = list(folder_path.glob("*.wav")) print(f"Processing {folder}: {len(files)} files") for file_path in tqdm(files, desc=f"Processing {folder}"): try: # Basic validation: can librosa load it? # We don't load the whole file here to save time, just check existence if file_path.exists(): data.append({ "filename": file_path.name, "emotion": label_map[folder], "path": str(file_path.absolute()) }) except Exception as e: print(f"Error processing {file_path}: {e}") df = pd.DataFrame(data) if df.empty: print("No data found! Please check the raw_data_path.") return # --- Stratified Splitting (80/10/10) --- print("\nCreating stratified splits...") # First split: Train vs Temp (20%) train_df, temp_df = train_test_split( df, test_size=0.2, stratify=df['emotion'], random_state=42 ) # Second split: Val (10%) vs Test (10%) from the Temp (20%) val_df, test_df = train_test_split( temp_df, test_size=0.5, stratify=temp_df['emotion'], random_state=42 ) # Mark splits train_df = train_df.assign(split='train') val_df = val_df.assign(split='val') test_df = test_df.assign(split='test') # Combine back final_df = pd.concat([train_df, val_df, test_df]) # Save os.makedirs(os.path.dirname(output_path), exist_ok=True) final_df.to_csv(output_path, index=False) print(f"\nHarmonization Complete!") print(f"Total files: {len(final_df)}") print(f"Metadata saved to: {output_path}") print("\nSplit Statistics:") print(final_df.groupby(['split', 'emotion']).size().unstack(fill_value=0)) if __name__ == "__main__": RAW_PATH = r"C:\dev\archive\Emotions" OUTPUT_PATH = "data/processed/metadata.csv" harmonize_data(RAW_PATH, OUTPUT_PATH)