| """ |
| ╔══════════════════════════════════════════════════════════════════════════════╗ |
| ║ BirdCLEF+ 2026 — Notebook 1 (IMPROVED) ║ |
| ║ DATA PREPARATION & FOLD GENERATION ║ |
| ║ ║ |
| ║ Changes vs v1: ║ |
| ║ • StratifiedKFold(5) on primary_label (not GroupKFold) ║ |
| ║ • Energy-based crop strategy metadata (for NB2) ║ |
| ║ • Keep Silero-VAD, rating filter, dedup ║ |
| ╚══════════════════════════════════════════════════════════════════════════════╝ |
| """ |
|
|
| import os |
| import numpy as np |
| import pandas as pd |
| import matplotlib.pyplot as plt |
| import warnings |
| from sklearn.model_selection import StratifiedKFold |
|
|
# Suppress every Python warning for the remainder of the run.
warnings.filterwarnings("ignore")


# Root directory of the competition input data.
COMP_DIR = "/kaggle/input/competitions/birdclef-2026"


def _competition_path(entry):
    """Return the path of *entry* inside the competition input directory."""
    return os.path.join(COMP_DIR, entry)


# Audio folders.
TRAIN_AUDIO_DIR = _competition_path("train_audio")
TRAIN_SOUNDSCAPES_DIR = _competition_path("train_soundscapes")

# Metadata tables.
TRAIN_CSV = _competition_path("train.csv")
TAXONOMY_CSV = _competition_path("taxonomy.csv")
SOUNDSCAPE_LABELS_CSV = _competition_path("train_soundscapes_labels.csv")
SAMPLE_SUB_CSV = _competition_path("sample_submission.csv")
|
|
# Directory that receives every CSV written by this script. It is created up
# front; exist_ok=True makes the call a no-op when it is already present.
OUTPUT_DIR = "/kaggle/working"
os.makedirs(name=OUTPUT_DIR, exist_ok=True)
|
|
| |
# Run switches and hyper-parameters.
DRY_RUN = False    # True: the VAD step only scans the first 50 audio files
RUN_VAD = True     # toggles the Silero-VAD speech-cleaning step
MIN_RATING = 3.0   # threshold applied by the rating filter
SEED = 42          # seeds NumPy and the StratifiedKFold shuffle
N_FOLDS = 5        # number of cross-validation folds

np.random.seed(SEED)

# Start-up banner: a title framed by two 70-character rules.
_banner_rule = "=" * 70
print("\n".join([
    _banner_rule,
    "BirdCLEF+ 2026 — Data Preparation & StratifiedKFold",
    _banner_rule,
]))
|
|
| |
| |
| |
print("\n[1/6] Loading raw data...")

# Read the four competition tables in one pass; the unpacking order matches
# the order of the paths.
train_df, taxonomy_df, soundscape_labels_df, sample_sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_CSV, TAXONOMY_CSV, SOUNDSCAPE_LABELS_CSV, SAMPLE_SUB_CSV)
)

# Every sample-submission column other than "row_id" is treated as a class.
SPECIES_COLS = [column for column in sample_sub_df.columns if column != "row_id"]
NUM_CLASSES = len(SPECIES_COLS)

n_recordings = train_df.shape[0]
n_taxa = taxonomy_df.shape[0]
n_segments = soundscape_labels_df.shape[0]
print(f" train.csv: {n_recordings:,} recordings")
print(f" taxonomy.csv: {n_taxa} species")
print(f" soundscape_labels.csv: {n_segments:,} segments")
print(f" sample_sub columns: {NUM_CLASSES} species")
|
|
| |
| |
| |
print("\n[2/6] Quick EDA...")

# Number of recordings per primary label, most frequent first.
species_counts = train_df["primary_label"].value_counts()
count_max = species_counts.max()
count_min = species_counts.min()
count_median = species_counts.median()
print(f" Unique species in train: {species_counts.shape[0]}")
print(f" Max/min/median per species: {count_max}/{count_min}/{count_median:.0f}")

# "Rare" means fewer than 20 recordings; the share is reported relative to
# the number of submission classes, not the number of labels seen in train.
is_rare = species_counts < 20
rare_species = species_counts[is_rare]
rare_share = len(rare_species) / NUM_CLASSES * 100
print(f" Rare species (<20): {len(rare_species)} ({rare_share:.1f}%)")
|
|
| |
| |
| |
print(f"\n[3/6] Rating filter (>= {MIN_RATING})...")

before = len(train_df)
if "rating" in train_df.columns:
    # Keep recordings rated at least MIN_RATING, plus those rated exactly 0.
    # NOTE(review): 0 presumably means "unrated" rather than "worst" — confirm.
    keep_mask = (train_df["rating"] >= MIN_RATING) | (train_df["rating"] == 0)
    train_df_filtered = train_df[keep_mask].copy()
else:
    # No rating information available: nothing to filter on.
    train_df_filtered = train_df.copy()

after = len(train_df_filtered)
print(f" Removed {before - after:,} low-quality recordings")

# A species whose every recording was filtered out would disappear from the
# training set, so each such species gets its five best-rated recordings back.
remaining = set(train_df_filtered["primary_label"].unique())
lost = set(train_df["primary_label"].unique()) - remaining
if lost:
    print(f" ⚠️ {len(lost)} species lost all recordings → adding back top-5 rated")
    # Iterate in sorted order: the iteration order of a set of strings can
    # change between interpreter runs, and the row order produced here is the
    # order the seeded fold split later operates on. Sorting keeps the output
    # reproducible. key=str tolerates labels of mixed types.
    recovered = [
        train_df[train_df["primary_label"] == sp]
        .sort_values("rating", ascending=False)
        .head(5)
        for sp in sorted(lost, key=str)
    ]
    # Single concat instead of re-copying the growing frame once per species.
    train_df_filtered = pd.concat([train_df_filtered, *recovered], ignore_index=True)
    print(f" After recovery: {len(train_df_filtered):,}")
|
|
| |
| |
| |
print("\n[4/6] Deduplicating soundscape labels...")

before_dup = len(soundscape_labels_df)
# drop_duplicates() without arguments removes rows that are identical in every
# column. The explicit copy() makes the result an independent frame, so the
# "fold" column assigned to it later in the script is a plain write rather
# than a chained assignment on a filtered selection.
soundscape_labels_clean = soundscape_labels_df.drop_duplicates().copy()
after_dup = len(soundscape_labels_clean)
print(f" Removed {before_dup - after_dup:,} duplicates")
|
|
| |
| |
| |
print("\n[5/6] Silero-VAD speech cleaning...")

if RUN_VAD:
    # torch / torchaudio are imported only when the VAD step actually runs.
    import torch
    import torchaudio

    vad_model, vad_utils = torch.hub.load(
        repo_or_dir='snakers4/silero-vad',
        model='silero_vad',
        force_reload=False,
        onnx=False,
    )
    # Only the first helper of the returned utilities is needed; indexing
    # avoids depending on how many helpers the hub entry point returns.
    get_speech_timestamps = vad_utils[0]

    # Every audio file below TRAIN_AUDIO_DIR. Sorted so that the DRY_RUN subset
    # is the same on every run (directory listing order is arbitrary).
    audio_files = sorted(
        os.path.join(root, f)
        for root, _, files in os.walk(TRAIN_AUDIO_DIR)
        for f in files
        if f.endswith(('.ogg', '.wav', '.mp3', '.flac'))
    )

    if DRY_RUN:
        audio_files = audio_files[:50]
        print(f" [DRY RUN] {len(audio_files)} files")

    speech_files = []   # paths relative to TRAIN_AUDIO_DIR with >30% speech
    failed_files = []   # (path, error) for files that could not be analysed
    VAD_SR = 16000
    resamplers = {}     # one Resample transform per source sample rate

    for i, fpath in enumerate(audio_files):
        if (i + 1) % 2000 == 0 or i == 0:
            print(f" Progress: {i+1}/{len(audio_files)}")
        try:
            wav, sr = torchaudio.load(fpath)
            # Use the first channel only. Selecting it before resampling gives
            # the same samples as resampling all channels and then selecting.
            wav = wav[0]
            if sr != VAD_SR:
                if sr not in resamplers:
                    resamplers[sr] = torchaudio.transforms.Resample(sr, VAD_SR)
                wav = resamplers[sr](wav)
            ts = get_speech_timestamps(wav, vad_model, sampling_rate=VAD_SR)
            speech_samples = sum(t['end'] - t['start'] for t in ts)
            ratio = speech_samples / wav.shape[0] if wav.shape[0] > 0 else 0
        except Exception as exc:
            # Best effort: a file that cannot be loaded or analysed stays in
            # the training set, but the failure is recorded and reported
            # instead of being swallowed silently.
            failed_files.append((fpath, repr(exc)))
            continue
        if ratio > 0.3:
            speech_files.append(os.path.relpath(fpath, TRAIN_AUDIO_DIR))

    print(f" Files with >30% speech: {len(speech_files)}")
    if failed_files:
        print(f" ⚠️ {len(failed_files)} files could not be analysed and were kept")
        for failed_path, reason in failed_files[:5]:
            print(f"   {failed_path}: {reason}")

    if speech_files and "filename" in train_df_filtered.columns:
        before_vad = len(train_df_filtered)
        labels_before_vad = set(train_df_filtered["primary_label"].unique())
        train_df_filtered = train_df_filtered[~train_df_filtered["filename"].isin(set(speech_files))].copy()
        print(f" Removed {before_vad - len(train_df_filtered):,} speech-heavy recordings")
        # Unlike the rating filter, nothing is added back here, so at least
        # make it visible when a species ends up without any recording.
        emptied = labels_before_vad - set(train_df_filtered["primary_label"].unique())
        if emptied:
            print(f" ⚠️ {len(emptied)} species have no recordings left after VAD cleaning")
else:
    print(" [SKIPPED] Set RUN_VAD=True to enable")
|
|
| |
| |
| |
print(f"\n[6/6] StratifiedKFold({N_FOLDS}) on primary_label...")

# A fresh 0..n-1 index lets the positional indices returned by the splitter be
# used directly as row labels below.
train_df_filtered = train_df_filtered.reset_index(drop=True)
train_df_filtered["fold"] = -1  # -1 = not assigned yet

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
stratify_on = train_df_filtered["primary_label"]
fold_splits = skf.split(train_df_filtered, stratify_on)
for fold_id, split in enumerate(fold_splits):
    # split is (train positions, validation positions); a row's fold is the
    # split in which it is held out for validation.
    held_out_rows = split[1]
    train_df_filtered.loc[held_out_rows, "fold"] = fold_id

fold_counts = train_df_filtered.groupby("fold").size()
print(f" Train audio fold sizes: {fold_counts.to_dict()}")
|
|
| |
| import hashlib |
|
|
def filename_fold(fname, n_folds=None):
    """Map a filename to a fold index that depends only on the name.

    The fold is the MD5 digest of the UTF-8 encoded name, read as a base-16
    integer, modulo the fold count. Identical names therefore always receive
    the same fold, independent of row order or random seeds.

    Args:
        fname: Filename (``str``) to hash.
        n_folds: Number of folds. When omitted, the module-level ``N_FOLDS``
            is used, which matches the original single-argument behaviour.

    Returns:
        An ``int`` in ``range(n_folds)``.

    Raises:
        ValueError: If ``n_folds`` is smaller than 1.
    """
    if n_folds is None:
        n_folds = N_FOLDS
    if n_folds < 1:
        raise ValueError(f"n_folds must be a positive integer, got {n_folds!r}")
    digest = hashlib.md5(fname.encode("utf-8")).hexdigest()
    return int(digest, 16) % n_folds
|
|
# The fold is derived from the filename alone, so all label rows that share a
# filename end up in the same fold.
soundscape_filenames = soundscape_labels_clean["filename"]
soundscape_labels_clean["fold"] = soundscape_filenames.apply(filename_fold)
|
|
| |
# Final summary: framed title followed by the three headline counts.
summary_rule = "=" * 70
summary_lines = [
    "\n" + summary_rule,
    "DATA PREP COMPLETE",
    summary_rule,
    f" Clean train_audio: {len(train_df_filtered):,}",
    f" Soundscape segments: {len(soundscape_labels_clean):,}",
    f" Species/classes: {NUM_CLASSES}",
]
print("\n".join(summary_lines))
|
|
| |
def _save_csv(frame, csv_name):
    """Write *frame* to OUTPUT_DIR/*csv_name* without the index column."""
    frame.to_csv(os.path.join(OUTPUT_DIR, csv_name), index=False)


# Cleaned recordings and soundscape labels, both carrying their "fold" column.
_save_csv(train_df_filtered, "train_cleaned_stratified.csv")
_save_csv(soundscape_labels_clean, "soundscape_labels_with_folds.csv")

# Class name -> column index, in sample-submission order.
species_index = pd.DataFrame({"species": SPECIES_COLS, "idx": range(NUM_CLASSES)})
_save_csv(species_index, "species_list.csv")

# Species with fewer than 20 recordings left after cleaning.
rare = train_df_filtered["primary_label"].value_counts()
rare_list = rare[rare < 20].index.tolist()
_save_csv(pd.DataFrame({"species": rare_list}), "rare_species.csv")

print("\n Output files saved to /kaggle/working/")
for saved_name in (
    "train_cleaned_stratified.csv",
    "soundscape_labels_with_folds.csv",
    "species_list.csv",
    "rare_species.csv",
):
    print(f" → {saved_name}")
|
|