# Source: ml-intern / birdclef-2026-improved / nb01_data_prep.py
# Uploaded by hello9972 ("Upload nb01_data_prep.py"), commit 26e60fd (verified)
"""
╔══════════════════════════════════════════════════════════════════════════════╗
║ BirdCLEF+ 2026 — Notebook 1 (IMPROVED) ║
║ DATA PREPARATION & FOLD GENERATION ║
║ ║
║ Changes vs v1: ║
║ • StratifiedKFold(5) on primary_label (not GroupKFold) ║
║ • Energy-based crop strategy metadata (for NB2) ║
║ • Keep Silero-VAD, rating filter, dedup ║
╚══════════════════════════════════════════════════════════════════════════════╝
"""
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import StratifiedKFold
# Suppress every warning raised through the `warnings` module for the rest of
# the run, so the console output consists only of the script's own prints.
warnings.filterwarnings("ignore")
# --- Paths (Kaggle layout) ---
# Root of the competition dataset; every input path below is derived from it.
COMP_DIR = "/kaggle/input/competitions/birdclef-2026"
# Directory walked by the VAD step to find per-recording audio files.
TRAIN_AUDIO_DIR = os.path.join(COMP_DIR, "train_audio")
# NOTE(review): not referenced anywhere else in the visible script.
TRAIN_SOUNDSCAPES_DIR = os.path.join(COMP_DIR, "train_soundscapes")
# Per-recording metadata; the script reads `primary_label`, and `rating` /
# `filename` when those columns are present.
TRAIN_CSV = os.path.join(COMP_DIR, "train.csv")
TAXONOMY_CSV = os.path.join(COMP_DIR, "taxonomy.csv")
# Soundscape segment labels; the script reads its `filename` column.
SOUNDSCAPE_LABELS_CSV = os.path.join(COMP_DIR, "train_soundscapes_labels.csv")
# Only the header is used: every column except `row_id` is treated as a class.
SAMPLE_SUB_CSV = os.path.join(COMP_DIR, "sample_submission.csv")
# All CSV outputs are written here; the directory is created if missing.
OUTPUT_DIR = "/kaggle/working"
os.makedirs(OUTPUT_DIR, exist_ok=True)
# --- Control flags ---
# When True, the VAD step analyses only the first 50 discovered audio files.
DRY_RUN = False
# When False, the whole Silero-VAD section is skipped.
RUN_VAD = True
# Recordings rated below this are dropped; rows with rating == 0 are kept
# (presumably 0 means "unrated" — confirm against the dataset description).
MIN_RATING = 3.0
# Seeds NumPy's global RNG and the StratifiedKFold shuffle.
SEED = 42
# Number of CV folds, also used as the modulus for soundscape hash folds.
N_FOLDS = 5
np.random.seed(SEED)
print("=" * 70)
print("BirdCLEF+ 2026 — Data Preparation & StratifiedKFold")
print("=" * 70)
# ============================================================================
# 1. Load Raw Data
# ============================================================================
print("\n[1/6] Loading raw data...")
# Read the four competition tables in the same order as their path constants.
train_df, taxonomy_df, soundscape_labels_df, sample_sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_CSV, TAXONOMY_CSV, SOUNDSCAPE_LABELS_CSV, SAMPLE_SUB_CSV)
)

# The class list is defined by the submission header: every column that is not
# the row identifier is one species/class, in submission order.
SPECIES_COLS = [col for col in sample_sub_df.columns if col != "row_id"]
NUM_CLASSES = len(SPECIES_COLS)

print(f" train.csv: {len(train_df):,} recordings")
print(f" taxonomy.csv: {len(taxonomy_df)} species")
print(f" soundscape_labels.csv: {len(soundscape_labels_df):,} segments")
print(f" sample_sub columns: {NUM_CLASSES} species")
# ============================================================================
# 2. Quick EDA
# ============================================================================
print("\n[2/6] Quick EDA...")
# Number of recordings per primary label, most frequent first.
species_counts = train_df["primary_label"].value_counts()
print(f" Unique species in train: {len(species_counts)}")

most, fewest, typical = (
    species_counts.max(),
    species_counts.min(),
    species_counts.median(),
)
print(f" Max/min/median per species: {most}/{fewest}/{typical:.0f}")

# "Rare" means fewer than 20 recordings; the share is reported relative to the
# number of submission classes, not the number of species seen in train.
rare_species = species_counts.loc[species_counts < 20]
print(f" Rare species (<20): {len(rare_species)} ({len(rare_species)/NUM_CLASSES*100:.1f}%)")
# ============================================================================
# 3. Rating Filter (with species recovery)
# ============================================================================
print(f"\n[3/6] Rating filter (>= {MIN_RATING})...")
before = len(train_df)
if "rating" in train_df.columns:
    # Keep recordings at or above the threshold, plus rows whose rating is
    # exactly 0 (presumably "unrated" — confirm against the dataset notes).
    keep_mask = (train_df["rating"] >= MIN_RATING) | (train_df["rating"] == 0)
    train_df_filtered = train_df[keep_mask].copy()
else:
    # No rating column: nothing to filter on, work on a private copy.
    train_df_filtered = train_df.copy()
after = len(train_df_filtered)
print(f" Removed {before - after:,} low-quality recordings")

# Recover species that lost all recordings
remaining = set(train_df_filtered["primary_label"].unique())
lost = set(train_df["primary_label"].unique()) - remaining
if lost:
    print(f" ⚠️ {len(lost)} species lost all recordings → adding back top-5 rated")
    # Iterate in sorted order: a set of strings has a run-dependent iteration
    # order (hash randomisation), which would change the row order of
    # train_df_filtered and therefore the seeded fold assignment made later.
    # key=str keeps the sort well-defined even if labels have mixed types.
    recovered = [
        train_df[train_df["primary_label"] == sp]
        .sort_values("rating", ascending=False)
        .head(5)
        for sp in sorted(lost, key=str)
    ]
    # One concat instead of one per species avoids repeatedly copying the frame.
    train_df_filtered = pd.concat([train_df_filtered, *recovered], ignore_index=True)
    print(f" After recovery: {len(train_df_filtered):,}")
# ============================================================================
# 4. Deduplicate Soundscape Labels
# ============================================================================
print("\n[4/6] Deduplicating soundscape labels...")
before_dup = soundscape_labels_df.shape[0]
# A row is a repeat when all of its columns match an earlier row; the first
# occurrence of each distinct row is the one that is kept.
is_repeat = soundscape_labels_df.duplicated()
soundscape_labels_clean = soundscape_labels_df[~is_repeat]
after_dup = soundscape_labels_clean.shape[0]
print(f" Removed {before_dup - after_dup:,} duplicates")
# ============================================================================
# 5. Silero-VAD Speech Cleaning
# ============================================================================
print(f"\n[5/6] Silero-VAD speech cleaning...")
if RUN_VAD:
    # Imported lazily so that RUN_VAD=False needs neither package installed.
    import torch
    import torchaudio

    vad_model, vad_utils = torch.hub.load(
        repo_or_dir='snakers4/silero-vad',
        model='silero_vad',
        force_reload=False,
        onnx=False
    )
    # Only the first helper is needed; indexing (rather than unpacking exactly
    # five items) keeps working if the hub entry point returns more or fewer.
    get_speech_timestamps = vad_utils[0]

    # Collect every audio file under TRAIN_AUDIO_DIR. Sorting makes the
    # processing order, and the DRY_RUN subset, independent of os.walk order.
    audio_files = sorted(
        os.path.join(root, f)
        for root, _, files in os.walk(TRAIN_AUDIO_DIR)
        for f in files
        if f.endswith(('.ogg', '.wav', '.mp3', '.flac'))
    )
    if DRY_RUN:
        audio_files = audio_files[:50]
        print(f" [DRY RUN] {len(audio_files)} files")

    speech_files = []
    failed_files = []  # (path, error) pairs for files that could not be analysed
    VAD_SR = 16000
    resamplers = {}  # source sample rate -> reusable Resample transform

    for i, fpath in enumerate(audio_files):
        if (i + 1) % 2000 == 0 or i == 0:
            print(f" Progress: {i+1}/{len(audio_files)}")
        try:
            wav, sr = torchaudio.load(fpath)
            # Use the first channel only; selecting it before resampling
            # avoids resampling channels that are then thrown away.
            wav = wav[0:1]
            if sr != VAD_SR:
                if sr not in resamplers:
                    resamplers[sr] = torchaudio.transforms.Resample(sr, VAD_SR)
                wav = resamplers[sr](wav)
            # Index instead of squeeze(): a one-sample clip stays 1-D.
            wav = wav[0]
            if wav.shape[0] == 0:
                # Empty audio contains no speech; nothing to flag.
                continue
            ts = get_speech_timestamps(wav, vad_model, sampling_rate=VAD_SR)
        except Exception as exc:
            # Best effort: an unreadable file is kept in the training set, but
            # the failure is recorded so a systematic problem is visible.
            failed_files.append((fpath, repr(exc)))
            continue

        speech_samples = sum(t['end'] - t['start'] for t in ts)
        ratio = speech_samples / wav.shape[0]
        if ratio > 0.3:
            # Stored relative to TRAIN_AUDIO_DIR so it can be matched against
            # the `filename` column of the training table.
            speech_files.append(os.path.relpath(fpath, TRAIN_AUDIO_DIR))

    print(f" Files with >30% speech: {len(speech_files)}")
    if failed_files:
        print(f" ⚠️ {len(failed_files)} files could not be analysed and were kept")
        for bad_path, reason in failed_files[:5]:
            print(f"   {os.path.relpath(bad_path, TRAIN_AUDIO_DIR)}: {reason}")

    if speech_files and "filename" in train_df_filtered.columns:
        before_vad = len(train_df_filtered)
        species_before_vad = set(train_df_filtered["primary_label"].unique())
        train_df_filtered = train_df_filtered[~train_df_filtered["filename"].isin(set(speech_files))].copy()
        print(f" Removed {before_vad - len(train_df_filtered):,} speech-heavy recordings")
        # Unlike the rating filter there is no recovery step here, so at least
        # report species that no longer have any training recording.
        vad_lost = species_before_vad - set(train_df_filtered["primary_label"].unique())
        if vad_lost:
            print(f" ⚠️ {len(vad_lost)} species lost all recordings to the VAD filter: {sorted(vad_lost, key=str)}")
else:
    print(" [SKIPPED] Set RUN_VAD=True to enable")
# ============================================================================
# 6. StratifiedKFold on train_audio + Soundscape fold assignment
# ============================================================================
print(f"\n[6/6] StratifiedKFold({N_FOLDS}) on primary_label...")
# A fresh 0..n-1 index makes the positional indices returned by the splitter
# line up with the rows of the frame.
train_df_filtered = train_df_filtered.reset_index(drop=True)
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# Each row is held out in exactly one split; that split's number is its fold.
# -1 marks "not yet assigned".
fold_of_row = np.full(len(train_df_filtered), -1, dtype=np.int64)
splits = skf.split(train_df_filtered, train_df_filtered["primary_label"])
for fold_id, (_, held_out_rows) in enumerate(splits):
    fold_of_row[held_out_rows] = fold_id
train_df_filtered["fold"] = fold_of_row

fold_counts = train_df_filtered.groupby("fold").size()
print(f" Train audio fold sizes: {fold_counts.to_dict()}")
# Soundscape folds are derived from a hash of the filename. This is independent
# of the StratifiedKFold split used for train_audio: it is deterministic across
# runs and puts every segment of the same soundscape file in the same fold.
import hashlib


def filename_fold(fname, n_folds=None):
    """Map a filename to a fold index in ``range(n_folds)``.

    The fold is the MD5 digest of the UTF-8 encoded name, read as an integer,
    modulo ``n_folds``. The same name therefore always gets the same fold.

    Args:
        fname: Filename as a ``str``.
        n_folds: Number of folds. Defaults to the module-level ``N_FOLDS``,
            looked up at call time.

    Returns:
        An ``int`` in ``[0, n_folds)``.

    Raises:
        ValueError: If ``n_folds`` is smaller than 1.
    """
    if n_folds is None:
        n_folds = N_FOLDS
    if n_folds < 1:
        raise ValueError(f"n_folds must be >= 1, got {n_folds}")
    digest = hashlib.md5(fname.encode("utf-8")).hexdigest()
    return int(digest, 16) % n_folds
# Attach the hash-based fold to every soundscape segment. assign() builds a new
# frame, so nothing is written into the result of drop_duplicates() (which
# pandas may treat as a copy of the raw label table).
soundscape_labels_clean = soundscape_labels_clean.assign(
    fold=soundscape_labels_clean["filename"].apply(filename_fold)
)

# Final summary
print("\n" + "=" * 70)
print("DATA PREP COMPLETE")
print("=" * 70)
print(f" Clean train_audio: {len(train_df_filtered):,}")
print(f" Soundscape segments: {len(soundscape_labels_clean):,}")
print(f" Species/classes: {NUM_CLASSES}")

# Species with fewer than 20 recordings left after all filters.
rare = train_df_filtered["primary_label"].value_counts()
rare_list = rare[rare < 20].index.tolist()

# Save: output file name -> table, written in this order.
outputs = {
    "train_cleaned_stratified.csv": train_df_filtered,
    "soundscape_labels_with_folds.csv": soundscape_labels_clean,
    "species_list.csv": pd.DataFrame({"species": SPECIES_COLS, "idx": range(NUM_CLASSES)}),
    "rare_species.csv": pd.DataFrame({"species": rare_list}),
}
for out_name, frame in outputs.items():
    frame.to_csv(os.path.join(OUTPUT_DIR, out_name), index=False)

# Report the directory actually written to rather than a hard-coded path.
print(f"\n Output files saved to {OUTPUT_DIR}/")
for out_name in outputs:
    print(f" → {out_name}")