| """ |
| scripts/make_dataset.py |
| |
| Downloads BirdCLEF 2023 from Kaggle and filters train_metadata.csv |
| to the top N North American species by sample count. |
| |
| Usage: |
| python scripts/make_dataset.py --top-n 20 --min-samples 30 |
| |
| Attribution: |
| BirdCLEF 2023 dataset β Cornell Lab of Ornithology |
| https://www.kaggle.com/competitions/birdclef-2023 |
| """ |

import argparse
import subprocess
from pathlib import Path

import pandas as pd

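# Rough bounding box for continental North America: roughly southern Mexico
# (15°N) up to northern Alaska (72°N), and the Bering Strait to Newfoundland.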
NA_LAT_MIN, NA_LAT_MAX = 15.0, 72.0
NA_LON_MIN, NA_LON_MAX = -168.0, -52.0


def download_dataset(data_dir: Path) -> None:
    """
    Download BirdCLEF 2023 via the Kaggle CLI into data/raw/.

    Requires KAGGLE_USERNAME and KAGGLE_KEY environment variables
    or a valid ~/.kaggle/kaggle.json credential file.

    Args:
        data_dir: Destination directory for the downloaded zip.
    """
    data_dir.mkdir(parents=True, exist_ok=True)
    print("Downloading BirdCLEF 2023 from Kaggle...")
    subprocess.run(
        [
            "kaggle", "competitions", "download",
            "-c", "birdclef-2023",
            "-p", str(data_dir),
        ],
        check=True,
    )

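    # Extraction shells out to `unzip`, which must be on PATH; the standard
    # library's zipfile module would be a portable alternative if that matters.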
    zip_path = data_dir / "birdclef-2023.zip"
    if zip_path.exists():
        print("Extracting archive...")
        subprocess.run(["unzip", "-q", str(zip_path), "-d", str(data_dir)], check=True)
        zip_path.unlink()
        print(f"Extracted to {data_dir}")
    else:
        print("Zip not found; data may already be extracted.")


def filter_north_america(df: pd.DataFrame) -> pd.DataFrame:
    """
    Filter recordings to those geotagged within North America.

    Uses a lat/lon bounding box: lat [15, 72], lon [-168, -52].
    Rows without coordinates are dropped.

    Args:
        df: Raw train_metadata DataFrame.

    Returns:
        Filtered DataFrame.
    """
    if "latitude" not in df.columns or "longitude" not in df.columns:
        print("Warning: no lat/lon columns found; skipping geographic filter.")
        return df

    before = len(df)
    df = df.dropna(subset=["latitude", "longitude"])
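    # Series.between() is inclusive of both endpoints by default, so boundary
    # coordinates (e.g. exactly 15.0 N) are kept.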
    df = df[
        df["latitude"].between(NA_LAT_MIN, NA_LAT_MAX) &
        df["longitude"].between(NA_LON_MIN, NA_LON_MAX)
    ]
    print(f"Geographic filter: {before} -> {len(df)} rows (North America only)")
    return df.reset_index(drop=True)


def select_top_species(df: pd.DataFrame, top_n: int, min_samples: int) -> pd.DataFrame:
    """
    Keep only the top N species by sample count, each with at least
    min_samples recordings.

    Args:
        df: Filtered metadata DataFrame.
        top_n: Number of species to retain.
        min_samples: Minimum recordings required per species.

    Returns:
        Further-filtered DataFrame.
    """
    counts = df["primary_label"].value_counts()
    valid = counts[counts >= min_samples].head(top_n).index.tolist()
    df = df[df["primary_label"].isin(valid)].reset_index(drop=True)

    print(f"Selected {df['primary_label'].nunique()} species, {len(df)} total recordings.")
    print("Species included:")
    for sp, cnt in df["primary_label"].value_counts().items():
        print(f"  {sp}: {cnt}")

    return df


def save_filtered_metadata(df: pd.DataFrame, output_path: Path) -> None:
    """
    Save the filtered metadata CSV to the processed data directory.

    Args:
        df: Filtered metadata DataFrame.
        output_path: Destination .csv path.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"Saved filtered metadata -> {output_path}")


def main() -> None:
    parser = argparse.ArgumentParser(description="Download and scope the BirdCLEF 2023 dataset.")
    parser.add_argument("--top-n", type=int, default=20, help="Number of top species to keep")
    parser.add_argument("--min-samples", type=int, default=30, help="Minimum samples per species")
    parser.add_argument("--skip-download", action="store_true", help="Skip Kaggle download if data already exists")
    args = parser.parse_args()

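    # Paths are relative to the current working directory, so run this script
    # from the repository root.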
    raw_dir = Path("data/raw")
    processed_dir = Path("data/processed")
    meta_path = raw_dir / "train_metadata.csv"
    out_path = processed_dir / "train_metadata_filtered.csv"

    if not args.skip_download:
        download_dataset(raw_dir)

    if not meta_path.exists():
        raise FileNotFoundError(
            f"Metadata not found at {meta_path}. "
            "Run without --skip-download or check your data/raw/ directory."
        )

    df = pd.read_csv(meta_path)
    print(f"Loaded metadata: {len(df)} rows, {df['primary_label'].nunique()} species")

    df = filter_north_america(df)
    df = select_top_species(df, top_n=args.top_n, min_samples=args.min_samples)
    save_filtered_metadata(df, out_path)


if __name__ == "__main__":
    main()