File size: 4,864 Bytes
016e82d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
"""
scripts/make_dataset.py

Downloads BirdCLEF 2023 from Kaggle and filters train_metadata.csv
to the top N North American species by sample count.

Usage:
    python scripts/make_dataset.py --top-n 20 --min-samples 30

Attribution:
    BirdCLEF 2023 dataset — Cornell Lab of Ornithology
    https://www.kaggle.com/competitions/birdclef-2023
"""

import argparse
import os
import subprocess
import zipfile
from pathlib import Path

import pandas as pd


# North America approximate bounding box
NA_LAT_MIN, NA_LAT_MAX =  15.0,  72.0
NA_LON_MIN, NA_LON_MAX = -168.0, -52.0


def download_dataset(data_dir: Path) -> None:
    """
    Download BirdCLEF 2023 via the Kaggle CLI into data/raw/.

    Requires KAGGLE_USERNAME and KAGGLE_KEY environment variables
    or a valid ~/.kaggle/kaggle.json credential file.

    Args:
        data_dir: Destination directory for the downloaded zip.

    Raises:
        subprocess.CalledProcessError: If the Kaggle CLI exits non-zero.
        zipfile.BadZipFile: If the downloaded archive is corrupt.
    """
    data_dir.mkdir(parents=True, exist_ok=True)
    print("Downloading BirdCLEF 2023 from Kaggle...")
    subprocess.run(
        [
            "kaggle", "competitions", "download",
            "-c", "birdclef-2023",
            "-p", str(data_dir),
        ],
        check=True,
    )

    zip_path = data_dir / "birdclef-2023.zip"
    if zip_path.exists():
        print("Extracting archive...")
        # Extract with the stdlib zipfile module rather than shelling out to
        # `unzip`, which is not guaranteed to exist (Windows, slim containers).
        with zipfile.ZipFile(zip_path) as archive:
            archive.extractall(data_dir)
        zip_path.unlink()
        print(f"Extracted to {data_dir}")
    else:
        print("Zip not found — data may already be extracted.")


def filter_north_america(df: pd.DataFrame) -> pd.DataFrame:
    """
    Filter recordings to those geotagged within North America.

    Uses a lat/lon bounding box: lat [15, 72], lon [-168, -52].
    Rows without coordinates are dropped; `between` is inclusive at
    both bounds.

    Args:
        df: Raw train_metadata DataFrame.

    Returns:
        Filtered DataFrame with a reset index, or the input unchanged
        (with a warning) when lat/lon columns are absent.
    """
    if "latitude" not in df.columns or "longitude" not in df.columns:
        print("Warning: no lat/lon columns found — skipping geographic filter.")
        return df

    before = len(df)
    df = df.dropna(subset=["latitude", "longitude"])
    df = df[
        df["latitude"].between(NA_LAT_MIN, NA_LAT_MAX) &
        df["longitude"].between(NA_LON_MIN, NA_LON_MAX)
    ]
    # Restore the "→" separator (the original f-string ran the two counts
    # together as "{before}{len(df)}"), matching the arrow style used by
    # save_filtered_metadata's status message.
    print(f"Geographic filter: {before} → {len(df)} rows (North America only)")
    return df.reset_index(drop=True)


def select_top_species(df: pd.DataFrame, top_n: int, min_samples: int) -> pd.DataFrame:
    """
    Restrict the metadata to the top N species by recording count.

    Species with fewer than min_samples recordings are excluded before
    the top-N cut is applied.

    Args:
        df:          Filtered metadata DataFrame.
        top_n:       Number of species to retain.
        min_samples: Minimum recordings required per species.

    Returns:
        Further-filtered DataFrame with a reset index.
    """
    # value_counts is sorted descending, so head(top_n) takes the largest.
    tally = df["primary_label"].value_counts()
    keep = tally[tally >= min_samples].head(top_n).index.tolist()
    df = df[df["primary_label"].isin(keep)].reset_index(drop=True)

    print(f"Selected {df['primary_label'].nunique()} species, {len(df)} total recordings.")
    print("Species included:")
    per_species = df["primary_label"].value_counts()
    for sp, cnt in per_species.items():
        print(f"  {sp}: {cnt}")

    return df


def save_filtered_metadata(df: pd.DataFrame, output_path: Path) -> None:
    """
    Write the filtered metadata CSV to the processed data directory.

    Creates any missing parent directories before writing.

    Args:
        df:          Filtered metadata DataFrame.
        output_path: Destination .csv path.
    """
    parent_dir = output_path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"Saved filtered metadata → {output_path}")


def main() -> None:
    """CLI entry point: download, geographically scope, and save the dataset."""
    parser = argparse.ArgumentParser(description="Download and scope BirdCLEF 2023 dataset.")
    parser.add_argument("--top-n", type=int, default=20, help="Number of top species to keep")
    parser.add_argument("--min-samples", type=int, default=30, help="Minimum samples per species")
    parser.add_argument("--skip-download", action="store_true", help="Skip Kaggle download if data already exists")
    args = parser.parse_args()

    raw_dir = Path("data/raw")
    meta_path = raw_dir / "train_metadata.csv"
    out_path = Path("data/processed") / "train_metadata_filtered.csv"

    if not args.skip_download:
        download_dataset(raw_dir)

    if not meta_path.exists():
        raise FileNotFoundError(
            f"Metadata not found at {meta_path}. "
            "Run without --skip-download or check your data/raw/ directory."
        )

    df = pd.read_csv(meta_path)
    print(f"Loaded metadata: {len(df)} rows, {df['primary_label'].nunique()} species")

    # Pipeline: geographic scope first, then species cut, then persist.
    scoped = filter_north_america(df)
    scoped = select_top_species(scoped, top_n=args.top_n, min_samples=args.min_samples)
    save_filtered_metadata(scoped, out_path)


if __name__ == "__main__":
    main()