# chirp / setup.py
# (commit 016e82d by mg643: added data setup, feature engineering, model building, outputs)
"""
setup.py
End-to-end pipeline script for Warbler β€” bird audio species classifier.
Runs in order:
1. Load pre-extracted features (or run build_features.py first)
2. Train / val / test split
3. Train all three models (Naive Baseline, Random Forest, EfficientNet-B0)
4. Evaluate and compare
5. Save best model + config for app.py
Usage:
# First time (download + feature extraction):
python scripts/make_dataset.py
python scripts/build_features.py
python setup.py
# If features already exist:
python setup.py --epochs 20
"""
import argparse
import json
from pathlib import Path
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scripts.model import EfficientNetModel, NaiveBaseline, RandomForestModel
# ── Defaults ──────────────────────────────────────────────────────────────────
PROCESSED_DIR = Path("data/processed")  # feature arrays + label encoder written by build_features.py
MODELS_DIR = Path("models")  # trained model artifacts + model_config.json consumed by app.py
OUTPUTS_DIR = Path("data/outputs")  # evaluation outputs (model_comparison.csv)
SEED = 42  # random seed so the train/val/test splits are reproducible
TEST_SIZE = 0.20  # fraction of ALL samples held out for the test set
VAL_SIZE = 0.10  # fraction of ALL samples used for validation
def load_features(processed_dir: Path) -> tuple:
    """
    Load pre-computed feature arrays and label encoder from disk.

    Args:
        processed_dir: Directory containing .npy files and label_encoder.pkl.

    Returns:
        Tuple of (X_mfcc, X_mel, y, label_encoder).

    Raises:
        FileNotFoundError: If any feature file is missing — run build_features.py first.
    """
    required = ["X_mfcc.npy", "X_mel.npy", "y.npy", "label_encoder.pkl"]
    # Collect every missing artifact and report them all at once, instead of
    # failing on the first one — a single rerun of build_features.py then
    # fixes everything the error names.
    missing = [f for f in required if not (processed_dir / f).exists()]
    if missing:
        raise FileNotFoundError(
            f"Missing {', '.join(missing)} in {processed_dir}. "
            "Run `python scripts/build_features.py` first."
        )
    X_mfcc = np.load(processed_dir / "X_mfcc.npy")
    X_mel = np.load(processed_dir / "X_mel.npy")
    y = np.load(processed_dir / "y.npy")
    le = joblib.load(processed_dir / "label_encoder.pkl")
    print(f"Loaded features: {len(y)} samples, {len(le.classes_)} classes")
    print(f" X_mfcc: {X_mfcc.shape} X_mel: {X_mel.shape}")
    return X_mfcc, X_mel, y, le
def make_splits(
    X_mfcc: np.ndarray,
    X_mel: np.ndarray,
    y: np.ndarray,
    test_size: float = TEST_SIZE,
    val_size: float = VAL_SIZE,
    seed: int = SEED,
) -> tuple[dict, dict, dict]:
    """
    Create stratified train / val / test splits.

    Args:
        X_mfcc: MFCC feature matrix.
        X_mel: Mel spectrogram array.
        y: Integer label array.
        test_size: Fraction of data for the test set.
        val_size: Fraction of data for the validation set.
        seed: Random seed for reproducibility.

    Returns:
        Three dicts each with keys 'mfcc', 'mel', 'y'.
    """
    all_idx = np.arange(len(y))

    # First carve off the test set, stratified on the full label vector.
    trainval_idx, test_idx = train_test_split(
        all_idx, test_size=test_size, stratify=y, random_state=seed
    )

    # val_size is expressed as a fraction of the WHOLE dataset; rescale it
    # to the remaining train+val pool so the final proportions come out right.
    val_frac_of_pool = val_size / (1 - test_size)
    train_idx, val_idx = train_test_split(
        trainval_idx,
        test_size=val_frac_of_pool,
        stratify=y[trainval_idx],
        random_state=seed,
    )

    def pack(indices):
        # Bundle the two feature views and labels for one split.
        return {"mfcc": X_mfcc[indices], "mel": X_mel[indices], "y": y[indices]}

    print(f"Split β€” Train: {len(train_idx)} Val: {len(val_idx)} Test: {len(test_idx)}")
    return pack(train_idx), pack(val_idx), pack(test_idx)
def save_results(results: list[dict], outputs_dir: Path) -> None:
    """
    Save model comparison table as CSV and print a summary.

    Args:
        results: List of result dicts from each model's .evaluate() call.
        outputs_dir: Directory to write model_comparison.csv.
    """
    outputs_dir.mkdir(parents=True, exist_ok=True)

    rows = []
    for entry in results:
        rows.append(
            {
                "Model": entry["model"],
                "Test Accuracy": entry["accuracy"],
                "Macro F1": entry["macro_f1"],
            }
        )
    comparison = pd.DataFrame(rows)
    comparison.to_csv(outputs_dir / "model_comparison.csv", index=False)

    print("\n=== MODEL COMPARISON ===")
    print(comparison.to_string(index=False))
def save_model_config(best: dict, le, models_dir: Path) -> None:
    """
    Persist the model config JSON consumed by app.py at startup.

    Args:
        best: Result dict of the winning model.
        le: Fitted LabelEncoder.
        models_dir: Directory to write model_config.json.
    """
    models_dir.mkdir(parents=True, exist_ok=True)

    class_list = le.classes_.tolist()
    # NOTE(review): the hard-coded audio parameters below presumably mirror
    # the settings in build_features.py — keep them in sync.
    config = {
        "best_model": best["model"],
        "test_accuracy": round(best["accuracy"], 4),
        "test_macro_f1": round(best["macro_f1"], 4),
        "classes": class_list,
        "num_classes": len(class_list),
        "sample_rate": 22050,
        "audio_duration": 5,
        "n_mels": 128,
        "n_fft": 2048,
        "hop_length": 512,
        "n_mfcc": 40,
    }

    config_path = models_dir / "model_config.json"
    with open(config_path, "w") as f:
        json.dump(config, f, indent=2)

    print(f"\nBest model: {best['model']} (Macro F1: {best['macro_f1']:.4f})")
    print(f"Config saved β†’ {config_path}")
def run_pipeline(epochs: int = 20) -> None:
    """
    Execute the full training pipeline.

    Args:
        epochs: Number of epochs for EfficientNet-B0 training.
    """
    # Step 1: load pre-extracted features (build_features.py must have run).
    X_mfcc, X_mel, y, le = load_features(PROCESSED_DIR)
    class_names = le.classes_.tolist()

    # Step 2: stratified train / val / test split.
    train, val, test = make_splits(X_mfcc, X_mel, y)

    results: list = []

    # Step 3a: naive baseline on MFCC features.
    print("\n── Naive Baseline ──")
    baseline = NaiveBaseline()
    baseline.train(train["mfcc"], train["y"])
    results.append(baseline.evaluate(test["mfcc"], test["y"], class_names))
    baseline.save(MODELS_DIR)

    # Step 3b: random forest on the same MFCC features.
    print("\n── Random Forest ──")
    forest = RandomForestModel(n_estimators=200)
    forest.train(train["mfcc"], train["y"])
    results.append(forest.evaluate(test["mfcc"], test["y"], class_names))
    forest.save(MODELS_DIR)

    # Step 3c: CNN on mel spectrograms; its train() also evaluates on the
    # test split and writes checkpoints into models_dir.
    print(f"\n── EfficientNet-B0 ({epochs} epochs) ──")
    cnn = EfficientNetModel(num_classes=len(class_names))
    results.append(
        cnn.train(
            train["mel"], train["y"],
            val["mel"], val["y"],
            test["mel"], test["y"],
            epochs=epochs,
            models_dir=MODELS_DIR,
        )
    )

    # Keep the label encoder next to the weights so app.py can decode predictions.
    joblib.dump(le, MODELS_DIR / "label_encoder.pkl")

    # Step 4: write the comparison table, then persist the macro-F1 winner's config.
    save_results(results, OUTPUTS_DIR)
    winner = max(results, key=lambda r: r["macro_f1"])
    save_model_config(winner, le, MODELS_DIR)
    print("\nβœ… Pipeline complete. Artifacts in models/ and data/outputs/")
def main() -> None:
    """CLI entry point: parse arguments and launch the training pipeline."""
    parser = argparse.ArgumentParser(description="Train and evaluate Warbler bird classifier.")
    parser.add_argument("--epochs", type=int, default=20, help="EfficientNet training epochs")
    cli_args = parser.parse_args()
    run_pipeline(epochs=cli_args.epochs)


if __name__ == "__main__":
    main()