| """ |
| scripts/build_features.py |
| |
| Extracts MFCC feature vectors (for Random Forest) and log-scaled mel |
| spectrograms (for EfficientNet-B0) from all audio clips in the filtered |
| metadata. Saves results as .npy arrays and a fitted LabelEncoder. |
| |
| Usage: |
| python scripts/build_features.py |
| |
| Attribution: |
    librosa audio analysis library — https://librosa.org
| """ |
|
|
| import argparse |
| from pathlib import Path |
|
|
| import joblib |
| import librosa |
| import numpy as np |
| import pandas as pd |
| from sklearn.preprocessing import LabelEncoder |
| from tqdm import tqdm |
|
|
|
|
| |
# Shared audio-processing configuration for all feature extractors below.
SAMPLE_RATE = 22050  # target sample rate in Hz (librosa's default)
AUDIO_DURATION = 5  # fixed clip length in seconds (clips are padded/trimmed to this)
N_MFCC = 40  # number of MFCC coefficients per frame
N_MELS = 128  # number of mel filter banks in the spectrogram
N_FFT = 2048  # FFT window size in samples
HOP_LENGTH = 512  # hop size between STFT frames in samples
|
|
|
|
def load_audio(filepath: str, sr: int = SAMPLE_RATE, duration: int = AUDIO_DURATION) -> np.ndarray:
    """
    Read an audio clip and force it to an exact fixed length.

    Shorter clips are zero-padded at the end; longer clips are truncated.

    Args:
        filepath: Path to the .ogg / .mp3 / .wav file.
        sr: Target sample rate (Hz).
        duration: Desired clip length in seconds.

    Returns:
        1-D float32 numpy array of shape (sr * duration,).
    """
    n_samples = sr * duration
    signal, _ = librosa.load(filepath, sr=sr, duration=duration, mono=True)

    # Zero-pad short clips up to the fixed length.
    shortfall = n_samples - len(signal)
    if shortfall > 0:
        signal = np.pad(signal, (0, shortfall), mode="constant")

    # Slice guards against any clip longer than requested.
    return signal[:n_samples].astype(np.float32)
|
|
|
|
def extract_mfcc(audio: np.ndarray, sr: int = SAMPLE_RATE, n_mfcc: int = N_MFCC) -> np.ndarray:
    """
    Summarize a clip as a fixed-length MFCC vector via mean and std pooling.

    Args:
        audio: 1-D audio signal.
        sr: Sample rate.
        n_mfcc: Number of MFCC coefficients.

    Returns:
        Feature vector of shape (n_mfcc * 2,) — [mean | std].
    """
    coeffs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    # Pool over the time axis so every clip yields the same-sized vector.
    pooled = np.hstack((coeffs.mean(axis=1), coeffs.std(axis=1)))
    return pooled.astype(np.float32)
|
|
|
|
def compute_mel_spectrogram(
    audio: np.ndarray,
    sr: int = SAMPLE_RATE,
    n_mels: int = N_MELS,
    n_fft: int = N_FFT,
    hop_length: int = HOP_LENGTH,
) -> np.ndarray:
    """
    Compute a log-scaled (dB) mel spectrogram suitable as CNN input.

    Args:
        audio: 1-D audio signal.
        sr: Sample rate.
        n_mels: Number of mel filter banks.
        n_fft: FFT window size.
        hop_length: Hop size between frames.

    Returns:
        2-D float32 array of shape (n_mels, time_frames).
    """
    power_spec = librosa.feature.melspectrogram(
        y=audio, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length
    )
    # Convert power to decibels relative to the clip's peak, per CNN convention.
    db_spec = librosa.power_to_db(power_spec, ref=np.max)
    return db_spec.astype(np.float32)
|
|
|
|
def build_feature_arrays(
    df: pd.DataFrame,
    audio_root: Path,
    audio_subdir: str = "train_audio",
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Iterate over all rows in the metadata DataFrame and extract features.

    Clips that fail to load are skipped. Only the first few failures are
    printed individually (with an explicit "suppressed" notice afterwards);
    the total failure count is always reported at the end.

    Args:
        df: Filtered metadata with 'filename' and 'primary_label' columns.
        audio_root: Root data directory (data/raw/).
        audio_subdir: Subdirectory containing species sub-folders of .ogg files.

    Returns:
        Tuple of (X_mfcc, X_mel, labels) as numpy arrays.
        X_mfcc shape: (N, N_MFCC * 2)
        X_mel shape: (N, N_MELS, time_frames)
        labels shape: (N,) — string species codes
    """
    max_printed_failures = 5  # avoid flooding the console on systemic errors
    X_mfcc, X_mel, labels = [], [], []
    failed = 0

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Extracting features"):
        filepath = audio_root / audio_subdir / row["filename"]
        try:
            audio = load_audio(str(filepath))
            X_mfcc.append(extract_mfcc(audio))
            X_mel.append(compute_mel_spectrogram(audio))
            labels.append(row["primary_label"])
        except Exception as exc:  # best-effort batch job: skip unreadable clips
            failed += 1
            if failed <= max_printed_failures:
                print(f"  Warning - failed to load {filepath}: {exc}")
            # Tell the user further warnings are being dropped instead of
            # silently going quiet after the cap.
            if failed == max_printed_failures:
                print("  (further load warnings suppressed)")

    print(f"Feature extraction complete. Failed: {failed}/{len(df)}")
    return np.array(X_mfcc), np.array(X_mel), np.array(labels)
|
|
|
|
def save_features(
    X_mfcc: np.ndarray,
    X_mel: np.ndarray,
    y: np.ndarray,
    le: LabelEncoder,
    processed_dir: Path,
) -> None:
    """
    Persist feature arrays, encoded labels, and the LabelEncoder to disk.

    Args:
        X_mfcc: MFCC feature matrix.
        X_mel: Mel spectrogram array.
        y: Integer-encoded label array.
        le: Fitted LabelEncoder (needed to decode predictions later).
        processed_dir: Directory to save .npy files and encoder.
    """
    processed_dir.mkdir(parents=True, exist_ok=True)

    # Write each array under its canonical filename.
    outputs = {
        "X_mfcc.npy": X_mfcc,
        "X_mel.npy": X_mel,
        "y.npy": y,
        "classes.npy": le.classes_,
    }
    for fname, arr in outputs.items():
        np.save(processed_dir / fname, arr)

    # The encoder itself is pickled so predictions can be decoded later.
    joblib.dump(le, processed_dir / "label_encoder.pkl")

    print(f"Saved features to {processed_dir}/")
    print(f" X_mfcc : {X_mfcc.shape}")
    print(f" X_mel : {X_mel.shape}")
    print(f" y : {y.shape} ({len(le.classes_)} classes)")
|
|
|
|
def main() -> None:
    """CLI entry point: load filtered metadata, extract features, persist outputs."""
    arg_parser = argparse.ArgumentParser(description="Extract audio features from BirdCLEF 2023.")
    arg_parser.add_argument(
        "--meta", type=str,
        default="data/processed/train_metadata_filtered.csv",
        help="Path to filtered metadata CSV (output of make_dataset.py)",
    )
    arg_parser.add_argument(
        "--audio-root", type=str, default="data/raw",
        help="Root directory containing train_audio/",
    )
    opts = arg_parser.parse_args()

    metadata_csv = Path(opts.meta)
    audio_dir = Path(opts.audio_root)

    # Fail fast with an actionable message if the upstream step was skipped.
    if not metadata_csv.exists():
        raise FileNotFoundError(
            f"Filtered metadata not found at {metadata_csv}. "
            "Run scripts/make_dataset.py first."
        )

    metadata = pd.read_csv(metadata_csv)
    print(f"Loaded metadata: {len(metadata)} rows, {metadata['primary_label'].nunique()} species")

    X_mfcc, X_mel, label_strings = build_feature_arrays(metadata, audio_dir)

    # Encode string species codes as integer class ids for model training.
    encoder = LabelEncoder()
    y_encoded = encoder.fit_transform(label_strings)

    save_features(X_mfcc, X_mel, y_encoded, encoder, Path("data/processed"))
|
|
|
|
# Script entry point: python scripts/build_features.py [--meta ...] [--audio-root ...]
if __name__ == "__main__":
    main()
|
|