""" ╔══════════════════════════════════════════════════════════════════════════════╗ ║ BirdCLEF+ 2026 — Notebook 1 (IMPROVED) ║ ║ DATA PREPARATION & FOLD GENERATION ║ ║ ║ ║ Changes vs v1: ║ ║ • StratifiedKFold(5) on primary_label (not GroupKFold) ║ ║ • Energy-based crop strategy metadata (for NB2) ║ ║ • Keep Silero-VAD, rating filter, dedup ║ ╚══════════════════════════════════════════════════════════════════════════════╝ """ import os import numpy as np import pandas as pd import matplotlib.pyplot as plt import warnings from sklearn.model_selection import StratifiedKFold warnings.filterwarnings("ignore") # --- Paths (Kaggle layout) --- COMP_DIR = "/kaggle/input/competitions/birdclef-2026" TRAIN_AUDIO_DIR = os.path.join(COMP_DIR, "train_audio") TRAIN_SOUNDSCAPES_DIR = os.path.join(COMP_DIR, "train_soundscapes") TRAIN_CSV = os.path.join(COMP_DIR, "train.csv") TAXONOMY_CSV = os.path.join(COMP_DIR, "taxonomy.csv") SOUNDSCAPE_LABELS_CSV = os.path.join(COMP_DIR, "train_soundscapes_labels.csv") SAMPLE_SUB_CSV = os.path.join(COMP_DIR, "sample_submission.csv") OUTPUT_DIR = "/kaggle/working" os.makedirs(OUTPUT_DIR, exist_ok=True) # --- Control flags --- DRY_RUN = False RUN_VAD = True MIN_RATING = 3.0 SEED = 42 N_FOLDS = 5 np.random.seed(SEED) print("=" * 70) print("BirdCLEF+ 2026 — Data Preparation & StratifiedKFold") print("=" * 70) # ============================================================================ # 1. 
# ============================================================================
# 1. Load Raw Data
# ============================================================================
print("\n[1/6] Loading raw data...")

train_df = pd.read_csv(TRAIN_CSV)
taxonomy_df = pd.read_csv(TAXONOMY_CSV)
soundscape_labels_df = pd.read_csv(SOUNDSCAPE_LABELS_CSV)
sample_sub_df = pd.read_csv(SAMPLE_SUB_CSV)

# Every sample-submission column other than the row identifier is one class.
SPECIES_COLS = [col for col in sample_sub_df.columns if col != "row_id"]
NUM_CLASSES = len(SPECIES_COLS)

print(f" train.csv: {len(train_df):,} recordings")
print(f" taxonomy.csv: {len(taxonomy_df)} species")
print(f" soundscape_labels.csv: {len(soundscape_labels_df):,} segments")
print(f" sample_sub columns: {NUM_CLASSES} species")

# ============================================================================
# 2. Quick EDA
# ============================================================================
print("\n[2/6] Quick EDA...")

# Number of recordings per primary label, most frequent first.
species_counts = train_df["primary_label"].value_counts()
print(f" Unique species in train: {len(species_counts)}")
print(
    " Max/min/median per species: "
    f"{species_counts.max()}/{species_counts.min()}/{species_counts.median():.0f}"
)

# "Rare" here means fewer than 20 recordings; the percentage is taken over the
# number of submission classes, not over the species present in train.csv.
rare_species = species_counts.loc[species_counts < 20]
print(
    f" Rare species (<20): {len(rare_species)} "
    f"({len(rare_species) / NUM_CLASSES * 100:.1f}%)"
)
# ============================================================================
# 3. Rating Filter (with species recovery)
# ============================================================================
# Keeps recordings rated >= MIN_RATING plus those with rating == 0, then
# re-adds up to five best-rated recordings for any species the filter wiped
# out completely.
print(f"\n[3/6] Rating filter (>= {MIN_RATING})...")
before = len(train_df)

if "rating" in train_df.columns:
    rating = train_df["rating"]
    # NOTE: rows whose rating is NaN fail both comparisons and are dropped.
    keep_mask = (rating >= MIN_RATING) | (rating == 0)
    train_df_filtered = train_df[keep_mask].copy()
else:
    train_df_filtered = train_df.copy()

after = len(train_df_filtered)
print(f" Removed {before - after:,} low-quality recordings")

# Recover species that lost all recordings
remaining = set(train_df_filtered["primary_label"].unique())
lost = set(train_df["primary_label"].unique()) - remaining
if lost:
    print(f" ⚠️ {len(lost)} species lost all recordings → adding back top-5 rated")
    # Iterate in sorted order: the iteration order of a set of strings changes
    # between interpreter runs (hash randomisation), which would change the
    # row order of the output and therefore the shuffled fold assignment made
    # later, even with a fixed SEED. key=str keeps mixed-type labels sortable.
    recovered = [
        train_df[train_df["primary_label"] == sp]
        # Stable sort: recordings with equal rating keep their train.csv order.
        .sort_values("rating", ascending=False, kind="stable")
        .head(5)
        for sp in sorted(lost, key=str)
    ]
    # One concat instead of one per species avoids re-copying the frame.
    train_df_filtered = pd.concat([train_df_filtered, *recovered], ignore_index=True)
    print(f" After recovery: {len(train_df_filtered):,}")

# ============================================================================
# 4. Deduplicate Soundscape Labels
# ============================================================================
print(f"\n[4/6] Deduplicating soundscape labels...")
before_dup = len(soundscape_labels_df)
# Explicit copy: this frame gets a new column later, and it should not be
# treated as a slice of soundscape_labels_df.
soundscape_labels_clean = soundscape_labels_df.drop_duplicates().copy()
after_dup = len(soundscape_labels_clean)
print(f" Removed {before_dup - after_dup:,} duplicates")
# ============================================================================
# 5. Silero-VAD Speech Cleaning
# ============================================================================
# Runs Silero-VAD over every audio file under TRAIN_AUDIO_DIR and removes from
# train_df_filtered the recordings in which detected speech covers more than
# SPEECH_RATIO_THRESHOLD of the samples.
print("\n[5/6] Silero-VAD speech cleaning...")

if RUN_VAD:
    import torch
    import torchaudio

    VAD_SR = 16000                # every clip is resampled to this rate before VAD
    SPEECH_RATIO_THRESHOLD = 0.3  # fraction of samples flagged as speech
    AUDIO_EXTENSIONS = (".ogg", ".wav", ".mp3", ".flac")

    vad_model, vad_utils = torch.hub.load(
        repo_or_dir="snakers4/silero-vad",
        model="silero_vad",
        force_reload=False,
        onnx=False,
    )
    # Only the first helper of the returned tuple is needed here.
    get_speech_timestamps = vad_utils[0]

    # Sorted so that the DRY_RUN subset and the progress output do not depend
    # on the order in which the filesystem lists directory entries.
    audio_files = sorted(
        os.path.join(root, fname)
        for root, _, files in os.walk(TRAIN_AUDIO_DIR)
        for fname in files
        if fname.endswith(AUDIO_EXTENSIONS)
    )

    if DRY_RUN:
        audio_files = audio_files[:50]
        print(f" [DRY RUN] {len(audio_files)} files")

    speech_files = []
    failed_files = []
    resamplers = {}  # source sample rate -> Resample module, built once per rate

    for i, fpath in enumerate(audio_files):
        if (i + 1) % 2000 == 0 or i == 0:
            print(f" Progress: {i+1}/{len(audio_files)}")

        # Path relative to the audio root; compared against train.csv "filename".
        rel = os.path.relpath(fpath, TRAIN_AUDIO_DIR)
        try:
            wav, sr = torchaudio.load(fpath)
            # Keep the first channel only, as a 1-D tensor, *before* resampling
            # so the other channels are never resampled just to be discarded.
            wav = wav[0]
            if sr != VAD_SR:
                if sr not in resamplers:
                    resamplers[sr] = torchaudio.transforms.Resample(sr, VAD_SR)
                wav = resamplers[sr](wav)
            if wav.numel() == 0:
                continue  # nothing to analyse; treated as speech-free

            ts = get_speech_timestamps(wav, vad_model, sampling_rate=VAD_SR)
            speech_samples = sum(t["end"] - t["start"] for t in ts)
            ratio = speech_samples / wav.shape[0]
            if ratio > SPEECH_RATIO_THRESHOLD:
                speech_files.append(rel)
        except Exception as exc:
            # Best effort: one unreadable file must not abort the whole scan,
            # but the failure is recorded instead of being silently dropped.
            failed_files.append(rel)
            if len(failed_files) <= 5:
                print(f" ⚠️ VAD failed for {rel}: {exc!r}")

    print(f" Files with >{SPEECH_RATIO_THRESHOLD:.0%} speech: {len(speech_files)}")
    if failed_files:
        print(f" ⚠️ {len(failed_files)} files could not be analysed and were kept")

    if speech_files and "filename" in train_df_filtered.columns:
        before_vad = len(train_df_filtered)
        species_before_vad = set(train_df_filtered["primary_label"].unique())
        speech_set = set(speech_files)
        train_df_filtered = train_df_filtered[
            ~train_df_filtered["filename"].isin(speech_set)
        ].copy()
        print(f" Removed {before_vad - len(train_df_filtered):,} speech-heavy recordings")

        # This step runs after the rating-filter recovery, so it can leave a
        # species with no recordings at all; make that visible.
        vad_lost = species_before_vad - set(train_df_filtered["primary_label"].unique())
        if vad_lost:
            print(f" ⚠️ {len(vad_lost)} species lost all recordings to speech removal")
else:
    print(" [SKIPPED] Set RUN_VAD=True to enable")
# ============================================================================
# 6. StratifiedKFold on train_audio + Soundscape fold assignment
# ============================================================================
import hashlib

print(f"\n[6/6] StratifiedKFold({N_FOLDS}) on primary_label...")

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
train_df_filtered = train_df_filtered.reset_index(drop=True)

# split() yields positional indices, so the fold ids are written into a plain
# array and attached as a column afterwards; -1 marks "not assigned".
fold_ids = np.full(len(train_df_filtered), -1, dtype=np.int64)
for fold, (_, val_idx) in enumerate(
    skf.split(train_df_filtered, train_df_filtered["primary_label"])
):
    fold_ids[val_idx] = fold
train_df_filtered["fold"] = fold_ids

fold_counts = train_df_filtered.groupby("fold").size()
print(f" Train audio fold sizes: {fold_counts.to_dict()}")

# A species with fewer than N_FOLDS recordings cannot appear in every
# validation fold. Warnings are silenced globally in this script, so report
# the situation explicitly.
label_counts = train_df_filtered["primary_label"].value_counts()
n_small = int((label_counts < N_FOLDS).sum())
if n_small:
    print(
        f" ⚠️ {n_small} species have fewer than {N_FOLDS} recordings "
        "and will be missing from some validation folds"
    )


def filename_fold(fname):
    """Return a fold index in ``range(N_FOLDS)`` derived from a filename.

    The fold is the MD5 digest of the name, read as an integer, modulo
    ``N_FOLDS``. It depends only on the name, so every row that shares a
    filename lands in the same fold and the result is identical across runs.
    It is unrelated to the stratified split used for ``train_df_filtered``.

    Args:
        fname: Filename to hash; non-string values are converted with ``str``.

    Returns:
        Integer fold index between 0 and ``N_FOLDS - 1``.
    """
    digest = hashlib.md5(str(fname).encode()).hexdigest()
    return int(digest, 16) % N_FOLDS


# Soundscapes: fold by filename hash, so all segments of one file stay together.
soundscape_labels_clean = soundscape_labels_clean.assign(
    fold=soundscape_labels_clean["filename"].apply(filename_fold)
)

# Final summary
print("\n" + "=" * 70)
print("DATA PREP COMPLETE")
print("=" * 70)
print(f" Clean train_audio: {len(train_df_filtered):,}")
print(f" Soundscape segments: {len(soundscape_labels_clean):,}")
print(f" Species/classes: {NUM_CLASSES}")

# Save
train_df_filtered.to_csv(
    os.path.join(OUTPUT_DIR, "train_cleaned_stratified.csv"), index=False
)
soundscape_labels_clean.to_csv(
    os.path.join(OUTPUT_DIR, "soundscape_labels_with_folds.csv"), index=False
)
# Class index follows the column order of the sample submission.
pd.DataFrame({"species": SPECIES_COLS, "idx": range(NUM_CLASSES)}).to_csv(
    os.path.join(OUTPUT_DIR, "species_list.csv"), index=False
)

# Species with fewer than 20 recordings left after all cleaning steps.
rare = train_df_filtered["primary_label"].value_counts()
rare_list = rare[rare < 20].index.tolist()
pd.DataFrame({"species": rare_list}).to_csv(
    os.path.join(OUTPUT_DIR, "rare_species.csv"), index=False
)

# Report the directory actually written to rather than a hard-coded path.
print(f"\n Output files saved to {OUTPUT_DIR}/")
print(" → train_cleaned_stratified.csv")
print(" → soundscape_labels_with_folds.csv")
print(" → species_list.csv")
print(" → rare_species.csv")