# chirp/scripts/make_dataset.py
# (commit 016e82d, mg643: added data setup, feature engineering, model building, outputs)
"""
scripts/make_dataset.py
Downloads BirdCLEF 2023 from Kaggle and filters train_metadata.csv
to the top N North American species by sample count.
Usage:
python scripts/make_dataset.py --top-n 20 --min-samples 30
Attribution:
BirdCLEF 2023 dataset — Cornell Lab of Ornithology
https://www.kaggle.com/competitions/birdclef-2023
"""
import argparse
import os
import subprocess
import zipfile
from pathlib import Path

import pandas as pd
# North America approximate bounding box
# Inclusive lat/lon limits in decimal degrees, applied by filter_north_america.
# Deliberately generous — roughly southern Mexico/Caribbean up through Alaska.
NA_LAT_MIN, NA_LAT_MAX = 15.0, 72.0
NA_LON_MIN, NA_LON_MAX = -168.0, -52.0
def download_dataset(data_dir: Path) -> None:
    """
    Download BirdCLEF 2023 via the Kaggle CLI into data/raw/.

    Requires KAGGLE_USERNAME and KAGGLE_KEY environment variables
    or a valid ~/.kaggle/kaggle.json credential file.

    Args:
        data_dir: Destination directory for the downloaded zip.

    Raises:
        subprocess.CalledProcessError: If the Kaggle CLI exits non-zero.
        zipfile.BadZipFile: If the downloaded archive is corrupt.
    """
    data_dir.mkdir(parents=True, exist_ok=True)
    print("Downloading BirdCLEF 2023 from Kaggle...")
    subprocess.run(
        [
            "kaggle", "competitions", "download",
            "-c", "birdclef-2023",
            "-p", str(data_dir),
        ],
        check=True,
    )
    zip_path = data_dir / "birdclef-2023.zip"
    if zip_path.exists():
        print("Extracting archive...")
        # Extract with the stdlib instead of shelling out to `unzip`, so the
        # script also works on platforms without the unzip binary (e.g. Windows).
        with zipfile.ZipFile(zip_path) as archive:
            archive.extractall(data_dir)
        zip_path.unlink()
        print(f"Extracted to {data_dir}")
    else:
        print("Zip not found — data may already be extracted.")
def filter_north_america(df: pd.DataFrame) -> pd.DataFrame:
    """
    Keep only recordings geotagged inside the North America bounding box.

    Applies the module-level limits: lat [15, 72], lon [-168, -52].
    Rows missing either coordinate are discarded. If the coordinate
    columns are absent entirely, the frame is returned unchanged.

    Args:
        df: Raw train_metadata DataFrame.

    Returns:
        Filtered DataFrame with a fresh integer index.
    """
    if not {"latitude", "longitude"}.issubset(df.columns):
        print("Warning: no lat/lon columns found — skipping geographic filter.")
        return df

    n_before = len(df)
    geotagged = df.dropna(subset=["latitude", "longitude"])
    in_box = (
        geotagged["latitude"].between(NA_LAT_MIN, NA_LAT_MAX)
        & geotagged["longitude"].between(NA_LON_MIN, NA_LON_MAX)
    )
    kept = geotagged[in_box]
    print(f"Geographic filter: {n_before} → {len(kept)} rows (North America only)")
    return kept.reset_index(drop=True)
def select_top_species(df: pd.DataFrame, top_n: int, min_samples: int) -> pd.DataFrame:
    """
    Restrict metadata to the most-recorded species.

    Species are ranked by recording count (descending, per value_counts);
    those below min_samples are dropped, then the first top_n of the
    remainder are retained.

    Args:
        df: Filtered metadata DataFrame.
        top_n: Number of species to retain.
        min_samples: Minimum recordings required per species.

    Returns:
        Further-filtered DataFrame with a fresh integer index.
    """
    species_counts = df["primary_label"].value_counts()
    eligible = species_counts[species_counts >= min_samples]
    chosen = eligible.head(top_n).index.tolist()
    result = df[df["primary_label"].isin(chosen)].reset_index(drop=True)

    print(f"Selected {result['primary_label'].nunique()} species, {len(result)} total recordings.")
    print("Species included:")
    for species, count in result["primary_label"].value_counts().items():
        print(f"  {species}: {count}")
    return result
def save_filtered_metadata(df: pd.DataFrame, output_path: Path) -> None:
    """
    Write the filtered metadata to disk as CSV (no index column).

    Creates the destination's parent directory if it does not exist.

    Args:
        df: Filtered metadata DataFrame.
        output_path: Destination .csv path.
    """
    parent_dir = output_path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"Saved filtered metadata → {output_path}")
def main() -> None:
    """Entry point: optionally download, then filter and save the dataset."""
    parser = argparse.ArgumentParser(
        description="Download and scope BirdCLEF 2023 dataset."
    )
    parser.add_argument("--top-n", type=int, default=20,
                        help="Number of top species to keep")
    parser.add_argument("--min-samples", type=int, default=30,
                        help="Minimum samples per species")
    parser.add_argument("--skip-download", action="store_true",
                        help="Skip Kaggle download if data already exists")
    args = parser.parse_args()

    raw_dir = Path("data/raw")
    processed_dir = Path("data/processed")
    metadata_csv = raw_dir / "train_metadata.csv"
    filtered_csv = processed_dir / "train_metadata_filtered.csv"

    if not args.skip_download:
        download_dataset(raw_dir)

    if not metadata_csv.exists():
        raise FileNotFoundError(
            f"Metadata not found at {metadata_csv}. "
            "Run without --skip-download or check your data/raw/ directory."
        )

    metadata = pd.read_csv(metadata_csv)
    print(f"Loaded metadata: {len(metadata)} rows, {metadata['primary_label'].nunique()} species")
    metadata = filter_north_america(metadata)
    metadata = select_top_species(metadata, top_n=args.top_n, min_samples=args.min_samples)
    save_filtered_metadata(metadata, filtered_csv)


if __name__ == "__main__":
    main()