File size: 6,452 Bytes
016e82d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
"""
scripts/build_features.py

Extracts MFCC feature vectors (for Random Forest) and log-scaled mel
spectrograms (for EfficientNet-B0) from all audio clips in the filtered
metadata. Saves results as .npy arrays and a fitted LabelEncoder.

Usage:
    python scripts/build_features.py

Attribution:
    librosa audio analysis library — https://librosa.org
"""

import argparse
from pathlib import Path

import joblib
import librosa
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm


# ── Audio / feature constants ──────────────────────────────────────────────────
SAMPLE_RATE    = 22050   # Hz — librosa's default resampling rate
AUDIO_DURATION = 5       # seconds — clips are trimmed or padded to this length
N_MFCC         = 40      # number of MFCC coefficients
N_MELS         = 128     # mel frequency bins
N_FFT          = 2048    # FFT window size in samples
HOP_LENGTH     = 512     # hop between successive STFT frames in samples


def load_audio(filepath: str, sr: int = SAMPLE_RATE, duration: float = AUDIO_DURATION) -> np.ndarray:
    """
    Load an audio file and pad or trim to a fixed duration.

    Args:
        filepath: Path to the .ogg / .mp3 / .wav file.
        sr:       Target sample rate (Hz).
        duration: Desired clip length in seconds (may be fractional).

    Returns:
        1-D float32 numpy array of shape (int(sr * duration),).
    """
    # int() keeps target_len an integer even for fractional durations
    # (e.g. 2.5 s); a float here would crash np.pad and the slice below.
    target_len = int(sr * duration)
    audio, _   = librosa.load(filepath, sr=sr, duration=duration, mono=True)

    # Short clips are zero-padded at the end; long clips are trimmed below.
    if len(audio) < target_len:
        audio = np.pad(audio, (0, target_len - len(audio)), mode="constant")

    return audio[:target_len].astype(np.float32)


def extract_mfcc(audio: np.ndarray, sr: int = SAMPLE_RATE, n_mfcc: int = N_MFCC) -> np.ndarray:
    """
    Pool an MFCC matrix into a fixed-length clip descriptor.

    The per-coefficient mean and standard deviation over time are stacked
    side by side, so the output length is independent of frame count.

    Args:
        audio:  1-D audio signal.
        sr:     Sample rate.
        n_mfcc: Number of MFCC coefficients.

    Returns:
        Feature vector of shape (n_mfcc * 2,) laid out as [mean | std].
    """
    coeffs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    pooled = np.hstack([coeffs.mean(axis=1), coeffs.std(axis=1)])
    return pooled.astype(np.float32)


def compute_mel_spectrogram(
    audio: np.ndarray,
    sr: int = SAMPLE_RATE,
    n_mels: int = N_MELS,
    n_fft: int = N_FFT,
    hop_length: int = HOP_LENGTH,
) -> np.ndarray:
    """
    Compute a log-scaled (dB) mel spectrogram suitable as CNN input.

    Args:
        audio:      1-D audio signal.
        sr:         Sample rate.
        n_mels:     Number of mel filter banks.
        n_fft:      FFT window size.
        hop_length: Hop size between frames.

    Returns:
        2-D float32 array of shape (n_mels, time_frames).
    """
    power_spec = librosa.feature.melspectrogram(
        y=audio,
        sr=sr,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length,
    )
    # Convert power to decibels relative to the spectrogram's peak.
    db_spec = librosa.power_to_db(power_spec, ref=np.max)
    return db_spec.astype(np.float32)


def build_feature_arrays(
    df: pd.DataFrame,
    audio_root: Path,
    audio_subdir: str = "train_audio",
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Iterate over all rows in the metadata DataFrame and extract features.

    Clips that fail to decode are skipped (best-effort): the first five
    failures are printed and the total count is reported at the end.

    Args:
        df:           Filtered metadata with 'filename' and 'primary_label' columns.
        audio_root:   Root data directory (data/raw/).
        audio_subdir: Subdirectory containing species sub-folders of .ogg files.

    Returns:
        Tuple of (X_mfcc, X_mel, labels) as numpy arrays.
        X_mfcc shape: (N, N_MFCC * 2)
        X_mel  shape: (N, N_MELS, time_frames)
        labels shape: (N,)  — string species codes
    """
    X_mfcc, X_mel, labels = [], [], []
    failed = 0

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Extracting features"):
        filepath = audio_root / audio_subdir / row["filename"]
        try:
            audio = load_audio(str(filepath))
            X_mfcc.append(extract_mfcc(audio))
            X_mel.append(compute_mel_spectrogram(audio))
            labels.append(row["primary_label"])
        except Exception as exc:
            # A corrupt or unreadable clip must not abort the whole run.
            failed += 1
            if failed <= 5:  # avoid flooding the console on mass failure
                # NOTE: fixed mojibake ("β€”") in the original message.
                print(f"  Warning — failed to load {filepath}: {exc}")

    print(f"Feature extraction complete. Failed: {failed}/{len(df)}")
    return np.array(X_mfcc), np.array(X_mel), np.array(labels)


def save_features(
    X_mfcc: np.ndarray,
    X_mel: np.ndarray,
    y: np.ndarray,
    le: LabelEncoder,
    processed_dir: Path,
) -> None:
    """
    Persist feature arrays, encoded labels, and the LabelEncoder to disk.

    Args:
        X_mfcc:        MFCC feature matrix.
        X_mel:         Mel spectrogram array.
        y:             Integer-encoded label array.
        le:            Fitted LabelEncoder (needed to decode predictions later).
        processed_dir: Directory to save .npy files and encoder.
    """
    processed_dir.mkdir(parents=True, exist_ok=True)

    # Write each array under its canonical filename.
    arrays = {
        "X_mfcc.npy":  X_mfcc,
        "X_mel.npy":   X_mel,
        "y.npy":       y,
        "classes.npy": le.classes_,
    }
    for filename, array in arrays.items():
        np.save(processed_dir / filename, array)

    # The encoder itself is pickled so predictions can be decoded later.
    joblib.dump(le, processed_dir / "label_encoder.pkl")

    print(f"Saved features to {processed_dir}/")
    print(f"  X_mfcc : {X_mfcc.shape}")
    print(f"  X_mel  : {X_mel.shape}")
    print(f"  y      : {y.shape}  ({len(le.classes_)} classes)")


def main() -> None:
    """CLI entry point: load metadata, extract features, save artifacts.

    Raises:
        FileNotFoundError: If the filtered metadata CSV does not exist yet.
    """
    parser = argparse.ArgumentParser(description="Extract audio features from BirdCLEF 2023.")
    parser.add_argument(
        "--meta", type=str,
        default="data/processed/train_metadata_filtered.csv",
        help="Path to filtered metadata CSV (output of make_dataset.py)",
    )
    parser.add_argument(
        "--audio-root", type=str, default="data/raw",
        help="Root directory containing train_audio/",
    )
    # Previously hard-coded to data/processed; now configurable like the
    # other paths, with the same default for backward compatibility.
    parser.add_argument(
        "--out-dir", type=str, default="data/processed",
        help="Directory to write feature arrays and the label encoder",
    )
    args = parser.parse_args()

    meta_path  = Path(args.meta)
    audio_root = Path(args.audio_root)

    if not meta_path.exists():
        raise FileNotFoundError(
            f"Filtered metadata not found at {meta_path}. "
            "Run scripts/make_dataset.py first."
        )

    df = pd.read_csv(meta_path)
    print(f"Loaded metadata: {len(df)} rows, {df['primary_label'].nunique()} species")

    X_mfcc, X_mel, labels = build_feature_arrays(df, audio_root)

    # Integer-encode species codes; the fitted encoder is saved alongside
    # the arrays so predictions can be decoded back to species strings.
    le = LabelEncoder()
    y  = le.fit_transform(labels)

    save_features(X_mfcc, X_mel, y, le, Path(args.out_dir))


# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()