# chirp/scripts/make_dataset.py
# (commit 016e82d, mg643: added data setup, feature engineering, model building, outputs)
"""
scripts/make_dataset.py
Downloads BirdCLEF 2023 from Kaggle and filters train_metadata.csv
to the top N North American species by sample count.
Usage:
python scripts/make_dataset.py --top-n 20 --min-samples 30
Attribution:
BirdCLEF 2023 dataset — Cornell Lab of Ornithology
https://www.kaggle.com/competitions/birdclef-2023
"""
import argparse
import os
import subprocess
import zipfile
from pathlib import Path

import pandas as pd
# North America approximate bounding box
# Inclusive lat/lon limits in decimal degrees, applied by filter_north_america.
# Deliberately generous — roughly southern Mexico/Caribbean up through Alaska.
NA_LAT_MIN, NA_LAT_MAX = 15.0, 72.0
NA_LON_MIN, NA_LON_MAX = -168.0, -52.0
def download_dataset(data_dir: Path) -> None:
    """
    Download BirdCLEF 2023 via the Kaggle CLI into data/raw/.

    Requires KAGGLE_USERNAME and KAGGLE_KEY environment variables
    or a valid ~/.kaggle/kaggle.json credential file.

    Args:
        data_dir: Destination directory for the downloaded zip.

    Raises:
        subprocess.CalledProcessError: If the Kaggle CLI exits non-zero.
        zipfile.BadZipFile: If the downloaded archive is corrupt.
    """
    data_dir.mkdir(parents=True, exist_ok=True)
    print("Downloading BirdCLEF 2023 from Kaggle...")
    subprocess.run(
        [
            "kaggle", "competitions", "download",
            "-c", "birdclef-2023",
            "-p", str(data_dir),
        ],
        check=True,
    )
    zip_path = data_dir / "birdclef-2023.zip"
    if zip_path.exists():
        print("Extracting archive...")
        # Extract with the stdlib instead of shelling out to `unzip`, so the
        # script also works on platforms without the unzip binary (e.g. Windows).
        with zipfile.ZipFile(zip_path) as archive:
            archive.extractall(data_dir)
        zip_path.unlink()
        print(f"Extracted to {data_dir}")
    else:
        print("Zip not found — data may already be extracted.")
def filter_north_america(df: pd.DataFrame) -> pd.DataFrame:
    """
    Keep only recordings geotagged inside the North America bounding box.

    Applies the module-level limits: lat [15, 72], lon [-168, -52].
    Rows missing either coordinate are discarded. If the coordinate
    columns are absent entirely, the frame is returned unchanged.

    Args:
        df: Raw train_metadata DataFrame.

    Returns:
        Filtered DataFrame with a fresh integer index.
    """
    if not {"latitude", "longitude"}.issubset(df.columns):
        print("Warning: no lat/lon columns found — skipping geographic filter.")
        return df

    n_before = len(df)
    geotagged = df.dropna(subset=["latitude", "longitude"])
    in_box = (
        geotagged["latitude"].between(NA_LAT_MIN, NA_LAT_MAX)
        & geotagged["longitude"].between(NA_LON_MIN, NA_LON_MAX)
    )
    kept = geotagged[in_box]
    print(f"Geographic filter: {n_before} → {len(kept)} rows (North America only)")
    return kept.reset_index(drop=True)
def select_top_species(df: pd.DataFrame, top_n: int, min_samples: int) -> pd.DataFrame:
    """
    Restrict metadata to the most-recorded species.

    Species are ranked by recording count (descending, per value_counts);
    those below min_samples are dropped, then the first top_n of the
    remainder are retained.

    Args:
        df: Filtered metadata DataFrame.
        top_n: Number of species to retain.
        min_samples: Minimum recordings required per species.

    Returns:
        Further-filtered DataFrame with a fresh integer index.
    """
    species_counts = df["primary_label"].value_counts()
    eligible = species_counts[species_counts >= min_samples]
    chosen = eligible.head(top_n).index.tolist()
    result = df[df["primary_label"].isin(chosen)].reset_index(drop=True)

    print(f"Selected {result['primary_label'].nunique()} species, {len(result)} total recordings.")
    print("Species included:")
    for species, count in result["primary_label"].value_counts().items():
        print(f"  {species}: {count}")
    return result
def save_filtered_metadata(df: pd.DataFrame, output_path: Path) -> None:
    """
    Write the filtered metadata to disk as CSV (no index column).

    Creates the destination's parent directory if it does not exist.

    Args:
        df: Filtered metadata DataFrame.
        output_path: Destination .csv path.
    """
    parent_dir = output_path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"Saved filtered metadata → {output_path}")
def main() -> None:
    """Entry point: optionally download, then filter and save the dataset."""
    parser = argparse.ArgumentParser(
        description="Download and scope BirdCLEF 2023 dataset."
    )
    parser.add_argument("--top-n", type=int, default=20,
                        help="Number of top species to keep")
    parser.add_argument("--min-samples", type=int, default=30,
                        help="Minimum samples per species")
    parser.add_argument("--skip-download", action="store_true",
                        help="Skip Kaggle download if data already exists")
    args = parser.parse_args()

    raw_dir = Path("data/raw")
    processed_dir = Path("data/processed")
    metadata_csv = raw_dir / "train_metadata.csv"
    filtered_csv = processed_dir / "train_metadata_filtered.csv"

    if not args.skip_download:
        download_dataset(raw_dir)

    if not metadata_csv.exists():
        raise FileNotFoundError(
            f"Metadata not found at {metadata_csv}. "
            "Run without --skip-download or check your data/raw/ directory."
        )

    metadata = pd.read_csv(metadata_csv)
    print(f"Loaded metadata: {len(metadata)} rows, {metadata['primary_label'].nunique()} species")
    metadata = filter_north_america(metadata)
    metadata = select_top_species(metadata, top_n=args.top_n, min_samples=args.min_samples)
    save_filtered_metadata(metadata, filtered_csv)


if __name__ == "__main__":
    main()