"""
scripts/build_features.py

Extracts MFCC feature vectors (for Random Forest) and log-scaled mel
spectrograms (for EfficientNet-B0) from all audio clips in the filtered
metadata. Saves results as .npy arrays and a fitted LabelEncoder.

Usage:
    python scripts/build_features.py

Attribution:
    librosa audio analysis library — https://librosa.org
"""
import argparse
from pathlib import Path
import joblib
import librosa
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
# ── Audio / feature constants ──────────────────────────────────────────────────
SAMPLE_RATE = 22050  # Hz — librosa's default rate; all clips resampled to this
AUDIO_DURATION = 5  # seconds — clips are trimmed or padded to this length
N_MFCC = 40  # number of MFCC coefficients
N_MELS = 128  # mel frequency bins
N_FFT = 2048  # FFT window size in samples
HOP_LENGTH = 512  # hop between successive STFT frames, in samples
def load_audio(filepath: str, sr: int = SAMPLE_RATE, duration: int = AUDIO_DURATION) -> np.ndarray:
    """
    Load an audio clip and force it to an exact fixed length.

    Clips shorter than the target are zero-padded at the end; longer
    clips are truncated, so every caller gets an identically-shaped array.

    Args:
        filepath: Path to the .ogg / .mp3 / .wav file.
        sr: Target sample rate (Hz).
        duration: Desired clip length in seconds.

    Returns:
        1-D float32 numpy array of shape (sr * duration,).
    """
    n_samples = sr * duration
    signal, _ = librosa.load(filepath, sr=sr, duration=duration, mono=True)
    shortfall = n_samples - len(signal)
    if shortfall > 0:
        signal = np.pad(signal, (0, shortfall), mode="constant")
    return signal[:n_samples].astype(np.float32)
def extract_mfcc(audio: np.ndarray, sr: int = SAMPLE_RATE, n_mfcc: int = N_MFCC) -> np.ndarray:
    """
    Summarize a clip as a fixed-length MFCC vector via mean/std pooling.

    Args:
        audio: 1-D audio signal.
        sr: Sample rate.
        n_mfcc: Number of MFCC coefficients.

    Returns:
        Feature vector of shape (n_mfcc * 2,) — [mean | std] pooled over time.
    """
    coeffs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    pooled = np.hstack([coeffs.mean(axis=1), coeffs.std(axis=1)])
    return pooled.astype(np.float32)
def compute_mel_spectrogram(
    audio: np.ndarray,
    sr: int = SAMPLE_RATE,
    n_mels: int = N_MELS,
    n_fft: int = N_FFT,
    hop_length: int = HOP_LENGTH,
) -> np.ndarray:
    """
    Compute a log-scaled (dB) mel spectrogram suitable as CNN input.

    Args:
        audio: 1-D audio signal.
        sr: Sample rate.
        n_mels: Number of mel filter banks.
        n_fft: FFT window size.
        hop_length: Hop size between frames.

    Returns:
        2-D float32 array of shape (n_mels, time_frames).
    """
    power_spec = librosa.feature.melspectrogram(
        y=audio,
        sr=sr,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length,
    )
    # Convert power to decibels relative to the spectrogram's own peak.
    db_spec = librosa.power_to_db(power_spec, ref=np.max)
    return db_spec.astype(np.float32)
def build_feature_arrays(
    df: pd.DataFrame,
    audio_root: Path,
    audio_subdir: str = "train_audio",
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Iterate over all rows in the metadata DataFrame and extract features.

    Clips that fail to load/decode are skipped (best-effort extraction);
    only the first five failures are printed so the log stays readable.

    Args:
        df: Filtered metadata with 'filename' and 'primary_label' columns.
        audio_root: Root data directory (data/raw/).
        audio_subdir: Subdirectory containing species sub-folders of .ogg files.

    Returns:
        Tuple of (X_mfcc, X_mel, labels) as numpy arrays.
        X_mfcc shape: (N, N_MFCC * 2)
        X_mel shape: (N, N_MELS, time_frames)
        labels shape: (N,) — string species codes
    """
    X_mfcc, X_mel, labels = [], [], []
    failed = 0
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Extracting features"):
        filepath = audio_root / audio_subdir / row["filename"]
        try:
            audio = load_audio(str(filepath))
            X_mfcc.append(extract_mfcc(audio))
            X_mel.append(compute_mel_spectrogram(audio))
            labels.append(row["primary_label"])
        except Exception as exc:  # best-effort: a bad clip must not abort the run
            failed += 1
            # Fix: warning text had a mojibake dash; also announce once when
            # further warnings are suppressed instead of going silent.
            if failed <= 5:
                print(f" Warning — failed to load {filepath}: {exc}")
            elif failed == 6:
                print(" (further load warnings suppressed)")
    print(f"Feature extraction complete. Failed: {failed}/{len(df)}")
    return np.array(X_mfcc), np.array(X_mel), np.array(labels)
def save_features(
    X_mfcc: np.ndarray,
    X_mel: np.ndarray,
    y: np.ndarray,
    le: LabelEncoder,
    processed_dir: Path,
) -> None:
    """
    Persist feature arrays, encoded labels, and the LabelEncoder to disk.

    Args:
        X_mfcc: MFCC feature matrix.
        X_mel: Mel spectrogram array.
        y: Integer-encoded label array.
        le: Fitted LabelEncoder (needed to decode predictions later).
        processed_dir: Directory to save .npy files and encoder.
    """
    processed_dir.mkdir(parents=True, exist_ok=True)
    # All plain arrays go out through np.save; the encoder needs pickling.
    arrays = {
        "X_mfcc.npy": X_mfcc,
        "X_mel.npy": X_mel,
        "y.npy": y,
        "classes.npy": le.classes_,
    }
    for fname, arr in arrays.items():
        np.save(processed_dir / fname, arr)
    joblib.dump(le, processed_dir / "label_encoder.pkl")
    print(f"Saved features to {processed_dir}/")
    print(f" X_mfcc : {X_mfcc.shape}")
    print(f" X_mel : {X_mel.shape}")
    print(f" y : {y.shape} ({len(le.classes_)} classes)")
def main() -> None:
    """Parse CLI arguments, load filtered metadata, and run feature extraction."""
    parser = argparse.ArgumentParser(description="Extract audio features from BirdCLEF 2023.")
    parser.add_argument(
        "--meta",
        type=str,
        default="data/processed/train_metadata_filtered.csv",
        help="Path to filtered metadata CSV (output of make_dataset.py)",
    )
    parser.add_argument(
        "--audio-root",
        type=str,
        default="data/raw",
        help="Root directory containing train_audio/",
    )
    args = parser.parse_args()

    meta_path, audio_root = Path(args.meta), Path(args.audio_root)
    # Fail fast with a pointer to the upstream step rather than a raw pandas error.
    if not meta_path.exists():
        raise FileNotFoundError(
            f"Filtered metadata not found at {meta_path}. "
            "Run scripts/make_dataset.py first."
        )

    df = pd.read_csv(meta_path)
    print(f"Loaded metadata: {len(df)} rows, {df['primary_label'].nunique()} species")

    X_mfcc, X_mel, labels = build_feature_arrays(df, audio_root)
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(labels)
    save_features(X_mfcc, X_mel, encoded, encoder, Path("data/processed"))
# Script entry point. Fix: removed a stray trailing "|" artifact after this
# guard that would have been a syntax error.
if __name__ == "__main__":
    main()