nice-bill committed on
Commit
70404f4
·
1 Parent(s): 2bc3291

initialize project structure and data harmonization pipeline

Browse files
Files changed (1) hide show
  1. src/data/harmonize.py +78 -0
src/data/harmonize.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ from sklearn.model_selection import train_test_split
5
+ from tqdm import tqdm
6
+ import librosa
7
+
8
def harmonize_data(raw_data_path, output_path):
    """Index an emotion-labelled audio folder tree and write a split manifest.

    Every immediate sub-directory of ``raw_data_path`` is treated as one
    emotion class; each ``.wav`` file directly inside it becomes one row of
    the manifest. Rows are assigned to ``train`` / ``val`` / ``test`` with a
    stratified 80/10/10 split (fixed seed) and written to ``output_path`` as
    CSV with the columns ``filename``, ``emotion``, ``path`` and ``split``.

    Args:
        raw_data_path: Directory whose sub-directories are named after the
            emotion they contain.
        output_path: Destination CSV file. Missing parent directories are
            created; a bare file name (no directory part) is accepted.

    Returns:
        None. When no ``.wav`` file is found a message is printed and nothing
        is written.

    Raises:
        FileNotFoundError: If ``raw_data_path`` does not exist.
        ValueError: Propagated from ``train_test_split`` when a class has too
            few samples to be stratified across the splits.
    """
    print(f"🔍 Scanning directory: {raw_data_path}")

    raw_root = Path(raw_data_path)

    # Folder names are the labels. Sorting makes the row order, and therefore
    # the seeded split below, independent of the file system's listing order.
    emotion_folders = sorted(
        entry for entry in os.listdir(raw_data_path)
        if (raw_root / entry).is_dir()
    )

    # Labels are the lower-cased folder names. The source data set misspells
    # one folder as 'Suprised'; it is mapped to the correct label here.
    label_corrections = {"suprised": "surprised"}
    label_map = {
        folder: label_corrections.get(folder.lower(), folder.lower())
        for folder in emotion_folders
    }

    data = []
    for folder in emotion_folders:
        folder_path = raw_root / folder
        # Match the extension case-insensitively so '.WAV' files are picked up
        # on case-sensitive file systems too.
        files = sorted(
            candidate for candidate in folder_path.iterdir()
            if candidate.is_file() and candidate.suffix.lower() == ".wav"
        )

        print(f"📂 Processing {folder}: {len(files)} files")

        # Audio is not decoded here; the manifest only records where each
        # file lives and which label it carries.
        for file_path in tqdm(files, desc=f"Processing {folder}"):
            data.append({
                "filename": file_path.name,
                "emotion": label_map[folder],
                "path": str(file_path.absolute()),
            })

    df = pd.DataFrame(data)

    if df.empty:
        print("❌ No data found! Please check the raw_data_path.")
        return

    # --- Stratified Splitting (80/10/10) ---
    print("\n⚖️ Creating stratified splits...")

    # First split: train (80%) vs. a temporary hold-out (20%).
    train_df, temp_df = train_test_split(
        df, test_size=0.2, stratify=df["emotion"], random_state=42
    )

    # Second split: halve the hold-out into val (10%) and test (10%).
    val_df, test_df = train_test_split(
        temp_df, test_size=0.5, stratify=temp_df["emotion"], random_state=42
    )

    # Tag every row with its split and stitch the parts back together.
    final_df = pd.concat([
        train_df.assign(split="train"),
        val_df.assign(split="val"),
        test_df.assign(split="test"),
    ])

    # Path.parent is '.' for a bare file name, so this never receives the
    # empty string that os.makedirs(os.path.dirname(...)) would choke on.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    final_df.to_csv(output_path, index=False)

    print("\n✅ Harmonization Complete!")
    print(f"📊 Total files: {len(final_df)}")
    print(f"📁 Metadata saved to: {output_path}")
    print("\nSplit Statistics:")
    print(final_df.groupby(["split", "emotion"]).size().unstack(fill_value=0))
74
+
75
if __name__ == "__main__":
    # Script entry point. Both paths can be overridden on the command line;
    # the defaults reproduce the previously hard-coded values, so running the
    # script without arguments behaves as before.
    import argparse

    RAW_PATH = r"C:\dev\archive\Emotions"
    OUTPUT_PATH = "data/processed/metadata.csv"

    parser = argparse.ArgumentParser(
        description="Build a stratified train/val/test manifest from an "
                    "emotion-labelled audio folder tree."
    )
    parser.add_argument(
        "--raw-path",
        default=RAW_PATH,
        help="directory containing one sub-folder per emotion "
             "(default: %(default)s)",
    )
    parser.add_argument(
        "--output-path",
        default=OUTPUT_PATH,
        help="CSV file to write the manifest to (default: %(default)s)",
    )
    args = parser.parse_args()

    harmonize_data(args.raw_path, args.output_path)