# chirp/scripts/build_features.py
# Commit 016e82d (mg643): added data setup, feature engineering, model building, outputs
"""
scripts/build_features.py
Extracts MFCC feature vectors (for Random Forest) and log-scaled mel
spectrograms (for EfficientNet-B0) from all audio clips in the filtered
metadata. Saves results as .npy arrays and a fitted LabelEncoder.
Usage:
python scripts/build_features.py
Attribution:
librosa audio analysis library — https://librosa.org
"""
import argparse
from pathlib import Path
import joblib
import librosa
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
# ── Audio / feature constants ──────────────────────────────────────────────────
SAMPLE_RATE: int = 22050  # Hz — target rate librosa resamples every clip to
AUDIO_DURATION: int = 5 # seconds — clips are trimmed or padded to this length
N_MFCC: int = 40 # number of MFCC coefficients per frame (pooled to mean|std)
N_MELS: int = 128 # mel frequency bins in the spectrogram (CNN input height)
N_FFT: int = 2048  # FFT window size in samples
HOP_LENGTH: int = 512  # hop between successive STFT frames, in samples
def load_audio(filepath: str, sr: int = SAMPLE_RATE, duration: int = AUDIO_DURATION) -> np.ndarray:
    """
    Read an audio file and force it to one exact fixed length.

    Args:
        filepath: Path to the .ogg / .mp3 / .wav file.
        sr: Target sample rate (Hz).
        duration: Desired clip length in seconds.
    Returns:
        1-D float32 numpy array of exactly ``sr * duration`` samples — shorter
        clips are zero-padded at the end, longer ones are truncated.
    """
    n_samples = sr * duration
    signal, _ = librosa.load(filepath, sr=sr, duration=duration, mono=True)
    shortfall = n_samples - len(signal)
    if shortfall > 0:
        # Zero-pad the tail so every clip yields an identically shaped array.
        signal = np.pad(signal, (0, shortfall), mode="constant")
    return signal[:n_samples].astype(np.float32)
def extract_mfcc(audio: np.ndarray, sr: int = SAMPLE_RATE, n_mfcc: int = N_MFCC) -> np.ndarray:
    """
    Summarise a clip as a fixed-length vector of pooled MFCC statistics.

    Args:
        audio: 1-D audio signal.
        sr: Sample rate.
        n_mfcc: Number of MFCC coefficients.
    Returns:
        Feature vector of shape (n_mfcc * 2,) laid out as [mean | std],
        pooled over the time axis.
    """
    coeffs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    pooled = np.concatenate([coeffs.mean(axis=1), coeffs.std(axis=1)])
    return pooled.astype(np.float32)
def compute_mel_spectrogram(
    audio: np.ndarray,
    sr: int = SAMPLE_RATE,
    n_mels: int = N_MELS,
    n_fft: int = N_FFT,
    hop_length: int = HOP_LENGTH,
) -> np.ndarray:
    """
    Turn a waveform into a log-scaled (dB) mel spectrogram for CNN input.

    Args:
        audio: 1-D audio signal.
        sr: Sample rate.
        n_mels: Number of mel filter banks.
        n_fft: FFT window size.
        hop_length: Hop size between frames.
    Returns:
        2-D float32 array of shape (n_mels, time_frames).
    """
    power_spec = librosa.feature.melspectrogram(
        y=audio, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length
    )
    # dB scale relative to the clip's peak power (ref=np.max) — stabilises
    # dynamic range across recordings of very different loudness.
    db_spec = librosa.power_to_db(power_spec, ref=np.max)
    return db_spec.astype(np.float32)
def build_feature_arrays(
    df: pd.DataFrame,
    audio_root: Path,
    audio_subdir: str = "train_audio",
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Iterate over all rows in the metadata DataFrame and extract features.

    Args:
        df: Filtered metadata with 'filename' and 'primary_label' columns.
        audio_root: Root data directory (data/raw/).
        audio_subdir: Subdirectory containing species sub-folders of .ogg files.
    Returns:
        Tuple of (X_mfcc, X_mel, labels) as numpy arrays.
        X_mfcc shape: (N, N_MFCC * 2)
        X_mel shape: (N, N_MELS, time_frames)
        labels shape: (N,) β€” string species codes
    """
    X_mfcc, X_mel, labels = [], [], []
    failed = 0
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Extracting features"):
        filepath = audio_root / audio_subdir / row["filename"]
        try:
            audio = load_audio(str(filepath))
            # Compute BOTH representations before appending anything: if the
            # second computation raises after the first has been appended,
            # the three lists would silently fall out of sync, misaligning
            # every subsequent feature row with its label.
            mfcc_vec = extract_mfcc(audio)
            mel_spec = compute_mel_spectrogram(audio)
        except Exception as exc:
            # Best-effort extraction: skip unreadable clips, report only the
            # first few to keep the log readable.
            failed += 1
            if failed <= 5:
                print(f" Warning β€” failed to load {filepath}: {exc}")
            continue
        X_mfcc.append(mfcc_vec)
        X_mel.append(mel_spec)
        labels.append(row["primary_label"])
    print(f"Feature extraction complete. Failed: {failed}/{len(df)}")
    return np.array(X_mfcc), np.array(X_mel), np.array(labels)
def save_features(
    X_mfcc: np.ndarray,
    X_mel: np.ndarray,
    y: np.ndarray,
    le: LabelEncoder,
    processed_dir: Path,
) -> None:
    """
    Write feature arrays, encoded labels, class names, and the LabelEncoder
    to disk so downstream training scripts can reload them.

    Args:
        X_mfcc: MFCC feature matrix.
        X_mel: Mel spectrogram array.
        y: Integer-encoded label array.
        le: Fitted LabelEncoder (needed to decode predictions later).
        processed_dir: Directory to save .npy files and encoder.
    """
    processed_dir.mkdir(parents=True, exist_ok=True)
    artefacts = {
        "X_mfcc.npy": X_mfcc,
        "X_mel.npy": X_mel,
        "y.npy": y,
        "classes.npy": le.classes_,
    }
    for fname, array in artefacts.items():
        np.save(processed_dir / fname, array)
    # Persist the fitted encoder itself so predictions can be decoded later.
    joblib.dump(le, processed_dir / "label_encoder.pkl")
    print(f"Saved features to {processed_dir}/")
    print(f" X_mfcc : {X_mfcc.shape}")
    print(f" X_mel : {X_mel.shape}")
    print(f" y : {y.shape} ({len(le.classes_)} classes)")
def main() -> None:
    """CLI entry point: load filtered metadata, extract features, persist them."""
    parser = argparse.ArgumentParser(description="Extract audio features from BirdCLEF 2023.")
    parser.add_argument(
        "--meta", type=str,
        default="data/processed/train_metadata_filtered.csv",
        help="Path to filtered metadata CSV (output of make_dataset.py)",
    )
    parser.add_argument(
        "--audio-root", type=str, default="data/raw",
        help="Root directory containing train_audio/",
    )
    args = parser.parse_args()

    meta_path = Path(args.meta)
    audio_root = Path(args.audio_root)
    # Fail fast with a hint when the upstream step has not been run yet.
    if not meta_path.exists():
        raise FileNotFoundError(
            f"Filtered metadata not found at {meta_path}. "
            "Run scripts/make_dataset.py first."
        )

    df = pd.read_csv(meta_path)
    print(f"Loaded metadata: {len(df)} rows, {df['primary_label'].nunique()} species")

    X_mfcc, X_mel, labels = build_feature_arrays(df, audio_root)
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(labels)
    save_features(X_mfcc, X_mel, encoded_y, encoder, Path("data/processed"))


if __name__ == "__main__":
    main()