File size: 4,864 Bytes
016e82d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
"""
scripts/make_dataset.py

Downloads BirdCLEF 2023 from Kaggle and filters train_metadata.csv
to the top N North American species by sample count.

Usage:
    python scripts/make_dataset.py --top-n 20 --min-samples 30

Attribution:
    BirdCLEF 2023 dataset — Cornell Lab of Ornithology
    https://www.kaggle.com/competitions/birdclef-2023
"""

import argparse
import os
import subprocess
import zipfile
from pathlib import Path

import pandas as pd


# North America approximate bounding box
NA_LAT_MIN, NA_LAT_MAX =  15.0,  72.0
NA_LON_MIN, NA_LON_MAX = -168.0, -52.0


def download_dataset(data_dir: Path) -> None:
    """
    Download BirdCLEF 2023 via the Kaggle CLI into data/raw/.

    Requires KAGGLE_USERNAME and KAGGLE_KEY environment variables
    or a valid ~/.kaggle/kaggle.json credential file.

    Args:
        data_dir: Destination directory for the downloaded zip.

    Raises:
        subprocess.CalledProcessError: If the Kaggle CLI exits non-zero.
        zipfile.BadZipFile: If the downloaded archive is corrupt.
    """
    data_dir.mkdir(parents=True, exist_ok=True)
    print("Downloading BirdCLEF 2023 from Kaggle...")
    subprocess.run(
        [
            "kaggle", "competitions", "download",
            "-c", "birdclef-2023",
            "-p", str(data_dir),
        ],
        check=True,
    )

    zip_path = data_dir / "birdclef-2023.zip"
    if zip_path.exists():
        print("Extracting archive...")
        # Extract with the stdlib zipfile module rather than shelling out to
        # `unzip`, which is not guaranteed to exist (Windows, slim containers).
        with zipfile.ZipFile(zip_path) as archive:
            archive.extractall(data_dir)
        zip_path.unlink()
        print(f"Extracted to {data_dir}")
    else:
        print("Zip not found — data may already be extracted.")


def filter_north_america(df: pd.DataFrame) -> pd.DataFrame:
    """
    Filter recordings to those geotagged within North America.

    Uses a lat/lon bounding box: lat [15, 72], lon [-168, -52].
    Rows without coordinates are dropped; `between` is inclusive at
    both bounds.

    Args:
        df: Raw train_metadata DataFrame.

    Returns:
        Filtered DataFrame with a reset index, or the input unchanged
        (with a warning) when lat/lon columns are absent.
    """
    if "latitude" not in df.columns or "longitude" not in df.columns:
        print("Warning: no lat/lon columns found — skipping geographic filter.")
        return df

    before = len(df)
    df = df.dropna(subset=["latitude", "longitude"])
    df = df[
        df["latitude"].between(NA_LAT_MIN, NA_LAT_MAX) &
        df["longitude"].between(NA_LON_MIN, NA_LON_MAX)
    ]
    # Restore the "→" separator (the original f-string ran the two counts
    # together as "{before}{len(df)}"), matching the arrow style used by
    # save_filtered_metadata's status message.
    print(f"Geographic filter: {before} → {len(df)} rows (North America only)")
    return df.reset_index(drop=True)


def select_top_species(df: pd.DataFrame, top_n: int, min_samples: int) -> pd.DataFrame:
    """
    Restrict the metadata to the top N species by recording count.

    Species with fewer than min_samples recordings are excluded before
    the top-N cut is applied.

    Args:
        df:          Filtered metadata DataFrame.
        top_n:       Number of species to retain.
        min_samples: Minimum recordings required per species.

    Returns:
        Further-filtered DataFrame with a reset index.
    """
    # value_counts is sorted descending, so head(top_n) takes the largest.
    tally = df["primary_label"].value_counts()
    keep = tally[tally >= min_samples].head(top_n).index.tolist()
    df = df[df["primary_label"].isin(keep)].reset_index(drop=True)

    print(f"Selected {df['primary_label'].nunique()} species, {len(df)} total recordings.")
    print("Species included:")
    per_species = df["primary_label"].value_counts()
    for sp, cnt in per_species.items():
        print(f"  {sp}: {cnt}")

    return df


def save_filtered_metadata(df: pd.DataFrame, output_path: Path) -> None:
    """
    Write the filtered metadata CSV to the processed data directory.

    Creates any missing parent directories before writing.

    Args:
        df:          Filtered metadata DataFrame.
        output_path: Destination .csv path.
    """
    parent_dir = output_path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"Saved filtered metadata → {output_path}")


def main() -> None:
    """CLI entry point: download, geographically scope, and save the dataset."""
    parser = argparse.ArgumentParser(description="Download and scope BirdCLEF 2023 dataset.")
    parser.add_argument("--top-n", type=int, default=20, help="Number of top species to keep")
    parser.add_argument("--min-samples", type=int, default=30, help="Minimum samples per species")
    parser.add_argument("--skip-download", action="store_true", help="Skip Kaggle download if data already exists")
    args = parser.parse_args()

    raw_dir = Path("data/raw")
    meta_path = raw_dir / "train_metadata.csv"
    out_path = Path("data/processed") / "train_metadata_filtered.csv"

    if not args.skip_download:
        download_dataset(raw_dir)

    if not meta_path.exists():
        raise FileNotFoundError(
            f"Metadata not found at {meta_path}. "
            "Run without --skip-download or check your data/raw/ directory."
        )

    df = pd.read_csv(meta_path)
    print(f"Loaded metadata: {len(df)} rows, {df['primary_label'].nunique()} species")

    # Pipeline: geographic scope first, then species cut, then persist.
    scoped = filter_north_america(df)
    scoped = select_top_species(scoped, top_n=args.top_n, min_samples=args.min_samples)
    save_filtered_metadata(scoped, out_path)


if __name__ == "__main__":
    main()