"""
scripts/make_dataset.py
Downloads BirdCLEF 2023 from Kaggle and filters train_metadata.csv
to the top N North American species by sample count.
Usage:
python scripts/make_dataset.py --top-n 20 --min-samples 30
Attribution:
BirdCLEF 2023 dataset — Cornell Lab of Ornithology
https://www.kaggle.com/competitions/birdclef-2023
"""
import argparse
import os
import subprocess
import zipfile
from pathlib import Path

import pandas as pd
# Approximate bounding box for North America (covers Alaska through
# southern Mexico); recordings outside it are dropped by filter_north_america().
NA_LAT_MIN, NA_LAT_MAX = 15.0, 72.0
NA_LON_MIN, NA_LON_MAX = -168.0, -52.0
def download_dataset(data_dir: Path) -> None:
    """
    Download BirdCLEF 2023 via the Kaggle CLI into *data_dir* and extract it.

    Requires KAGGLE_USERNAME and KAGGLE_KEY environment variables
    or a valid ~/.kaggle/kaggle.json credential file.

    Args:
        data_dir: Destination directory for the downloaded zip.

    Raises:
        subprocess.CalledProcessError: If the Kaggle CLI exits non-zero.
        zipfile.BadZipFile: If the downloaded archive is corrupt.
    """
    data_dir.mkdir(parents=True, exist_ok=True)
    print("Downloading BirdCLEF 2023 from Kaggle...")
    subprocess.run(
        [
            "kaggle", "competitions", "download",
            "-c", "birdclef-2023",
            "-p", str(data_dir),
        ],
        check=True,
    )
    zip_path = data_dir / "birdclef-2023.zip"
    if zip_path.exists():
        print("Extracting archive...")
        # Extract with the stdlib zipfile module rather than shelling out to
        # `unzip`, which is not installed on Windows or many minimal images.
        with zipfile.ZipFile(zip_path) as archive:
            archive.extractall(data_dir)
        zip_path.unlink()
        print(f"Extracted to {data_dir}")
    else:
        print("Zip not found — data may already be extracted.")
def filter_north_america(df: pd.DataFrame) -> pd.DataFrame:
    """
    Restrict recordings to those geotagged inside North America.

    Applies a lat/lon bounding box (lat [15, 72], lon [-168, -52]) and
    drops rows without coordinates. If the metadata has no
    latitude/longitude columns at all, the input is returned unchanged.

    Args:
        df: Raw train_metadata DataFrame.

    Returns:
        Filtered DataFrame with a fresh integer index.
    """
    if not {"latitude", "longitude"}.issubset(df.columns):
        print("Warning: no lat/lon columns found — skipping geographic filter.")
        return df

    n_before = len(df)
    geotagged = df.dropna(subset=["latitude", "longitude"])
    inside_box = (
        geotagged["latitude"].between(NA_LAT_MIN, NA_LAT_MAX)
        & geotagged["longitude"].between(NA_LON_MIN, NA_LON_MAX)
    )
    geotagged = geotagged[inside_box]
    print(f"Geographic filter: {n_before} → {len(geotagged)} rows (North America only)")
    return geotagged.reset_index(drop=True)
def select_top_species(df: pd.DataFrame, top_n: int, min_samples: int) -> pd.DataFrame:
    """
    Restrict the metadata to the most-recorded species.

    Species are ranked by recording count; any with fewer than
    min_samples recordings are excluded, and at most top_n of the
    remainder are kept.

    Args:
        df: Filtered metadata DataFrame.
        top_n: Number of species to retain.
        min_samples: Minimum recordings required per species.

    Returns:
        Further-filtered DataFrame with a fresh integer index.
    """
    tallies = df["primary_label"].value_counts()
    keep = list(tallies[tallies >= min_samples].head(top_n).index)
    df = df[df["primary_label"].isin(keep)].reset_index(drop=True)
    print(f"Selected {df['primary_label'].nunique()} species, {len(df)} total recordings.")
    print("Species included:")
    for species, n_recordings in df["primary_label"].value_counts().items():
        print(f" {species}: {n_recordings}")
    return df
def save_filtered_metadata(df: pd.DataFrame, output_path: Path) -> None:
    """
    Write the filtered metadata to CSV, creating parent directories as needed.

    Args:
        df: Filtered metadata DataFrame.
        output_path: Destination .csv path.
    """
    destination_dir = output_path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"Saved filtered metadata → {output_path}")
def main() -> None:
    """Command-line entry point: download, scope, and save the dataset."""
    parser = argparse.ArgumentParser(description="Download and scope BirdCLEF 2023 dataset.")
    parser.add_argument("--top-n", type=int, default=20, help="Number of top species to keep")
    parser.add_argument("--min-samples", type=int, default=30, help="Minimum samples per species")
    parser.add_argument("--skip-download", action="store_true", help="Skip Kaggle download if data already exists")
    args = parser.parse_args()

    raw_dir = Path("data/raw")
    metadata_csv = raw_dir / "train_metadata.csv"
    filtered_csv = Path("data/processed") / "train_metadata_filtered.csv"

    if not args.skip_download:
        download_dataset(raw_dir)

    if not metadata_csv.exists():
        raise FileNotFoundError(
            f"Metadata not found at {metadata_csv}. "
            "Run without --skip-download or check your data/raw/ directory."
        )

    metadata = pd.read_csv(metadata_csv)
    print(f"Loaded metadata: {len(metadata)} rows, {metadata['primary_label'].nunique()} species")
    metadata = filter_north_america(metadata)
    metadata = select_top_species(metadata, top_n=args.top_n, min_samples=args.min_samples)
    save_filtered_metadata(metadata, filtered_csv)


if __name__ == "__main__":
    main()