Vivek Vaddina committed on
Commit 254b144 · unverified · 1 Parent(s): f240b3a

initial working commit

.gitignore ADDED
@@ -0,0 +1,212 @@
+ # Created by https://www.toptal.com/developers/gitignore/api/python,jupyternotebooks,visualstudiocode
+ # Edit at https://www.toptal.com/developers/gitignore?templates=python,jupyternotebooks,visualstudiocode
+
+ ### JupyterNotebooks ###
+ # gitignore template for Jupyter Notebooks
+ # website: http://jupyter.org/
+
+ .ipynb_checkpoints
+ */.ipynb_checkpoints/*
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # Remove previous ipynb_checkpoints
+ # git rm -r .ipynb_checkpoints/
+
+ ### Python ###
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+
+ # IPython
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ ### Python Patch ###
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+ poetry.toml
+
+ # ruff
+ .ruff_cache/
+
+ # LSP config files
+ pyrightconfig.json
+
+ ### VisualStudioCode ###
+ .vscode/*
+ !.vscode/settings.json
+ !.vscode/tasks.json
+ !.vscode/launch.json
+ !.vscode/extensions.json
+ !.vscode/*.code-snippets
+
+ # Local History for Visual Studio Code
+ .history/
+
+ # Built Visual Studio Code Extensions
+ *.vsix
+
+ ### VisualStudioCode Patch ###
+ # Ignore all local history of files
+ .history
+ .ionide
+
+ # End of https://www.toptal.com/developers/gitignore/api/python,jupyternotebooks,visualstudiocode
+
+ ## Custom
+ data/
+ # pixi environments
+ .pixi/*
+ !.pixi/config.toml
.pixi/config.toml ADDED
@@ -0,0 +1,4 @@
+ run-post-link-scripts = "insecure"
+
+ [shell]
+ change-ps1 = false
app.py ADDED
@@ -0,0 +1,73 @@
+ import gradio as gr
+ from pathlib import Path
+
+ from src.config import CKPT_PATH
+ from src.modeling import Model
+
+
+ # -------------------------------------------------
+ # Load model once at startup
+ # -------------------------------------------------
+ MODEL = Model(device="cpu")
+ MODEL.load_from_chkpt(Path(CKPT_PATH))
+
+
+ # -------------------------------------------------
+ # Inference function used by Gradio
+ # -------------------------------------------------
+ def run_inference(audio_file):
+     # This handler feeds a single output component, so it returns one value
+     if audio_file is None:
+         return ""
+
+     # audio_file is a filepath provided by Gradio
+     audio_fp = Path(audio_file)
+
+     result = MODEL.make_preds(audio_fp)
+     name = ' '.join(result.upper().split('_'))
+     return f"# 🐦 Identified species: **{name}**"
+
+ def clear_outputs():
+     # One value per output component: the audio input and the Markdown text
+     return None, ""
+
+
+ # -------------------------------------------------
+ # Gradio UI
+ # -------------------------------------------------
+ with gr.Blocks(title="Bird Species Identification") as demo:
+     gr.Markdown(
+         """
+         ### 🐦 Bird Species Identification
+         Upload an audio recording of a bird call to identify the species.
+         """
+     )
+
+     audio_input = gr.Audio(
+         sources=["upload"],
+         type="filepath",
+         label="Upload bird audio"
+     )
+
+     output_text = gr.Markdown(
+         label="Identified species",
+     )
+
+     with gr.Row():
+         submit_btn = gr.Button("Identify")
+         clear_btn = gr.Button("Clear")
+
+     submit_btn.click(
+         fn=run_inference,
+         inputs=audio_input,
+         outputs=output_text
+     )
+
+     clear_btn.click(
+         fn=clear_outputs,
+         outputs=[audio_input, output_text]
+     )
+
+
+ if __name__ == "__main__":
+     demo.launch()
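
With the checkpoint in place, the app runs on the stock Gradio dev server; a minimal local-run sketch (7860 is Gradio's default port):

    python app.py
    # then open http://127.0.0.1:7860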
models/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7f58c1413fe19595def2cbcb4ba01fced3bd84418874b253ba5529510a677550
+ size 85613285
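
This is a Git LFS pointer, not the ~85 MB checkpoint itself; the sample MP3s below are stored the same way. On a fresh clone, the real files are fetched with the standard LFS commands:

    git lfs install
    git lfs pull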
pixi.lock ADDED
The diff for this file is too large to render. See raw diff
 
pixi.toml ADDED
@@ -0,0 +1,20 @@
+ [workspace]
+ channels = ["conda-forge", "pytorch"]
+ name = "munich_bird_identifier"
+ platforms = ["linux-64"]
+ version = "0.1.0"
+
+ [tasks]
+
+ [dependencies]
+ python = "3.12.*"
+ librosa = ">=0.11.0,<0.12"
+ click = ">=8.3.1,<9"
+ gradio = ">=6.2.0,<7"
+ ipython = ">=9.9.0,<10"
+ python-dotenv = "*"  # required by src/config.py (load_dotenv)
+
+ [pypi-dependencies]
+ torch = { version = "*", index = "https://download.pytorch.org/whl/cpu" }
+ torchvision = { version = "*", index = "https://download.pytorch.org/whl/cpu" }
+ torchaudio = { version = "*", index = "https://download.pytorch.org/whl/cpu" }
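
Assuming a working pixi installation, the environment described by this manifest is created and used with the standard pixi commands:

    pixi install            # solve/create the environment from pixi.toml + pixi.lock
    pixi run python app.py  # run the Gradio app inside it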
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio
+ torch
+ torchvision
+ torchaudio
+ librosa
+ click
+ python-dotenv  # required by src/config.py (load_dotenv)
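
For a plain pip setup (e.g. the Hugging Face Spaces runtime), the equivalent is:

    pip install -r requirements.txt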
samples/corvus_corone_XC592284.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:db23dd9478e7dbd8dfd878fb08d900fad694ea93556c2013ab4b954553507957
+ size 180652
samples/scolopax_rusticola_XC795042.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c1273f5edb7971248c08c78d7bc25ee30a6f9c475e27d204be9fc223c016faac
+ size 628162
src/__init__.py ADDED
File without changes
src/audio.py ADDED
@@ -0,0 +1,18 @@
+ import librosa
+ from src.config import N_MELS, SR
+
+ # SR and N_MELS are fixed to match what the downstream model expects
+
+ def load_audio(audio_fp, sr=None, res_type='soxr_hq'):
+     wave, sr = librosa.load(audio_fp, sr=sr, res_type=res_type)
+     return wave, sr
+
+
+ def get_melspec(y, sr=None, plot=False):
+     if not sr:
+         sr = SR  # fall back to the project default
+     mel_power = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=N_MELS, hop_length=1000)
+     mel_dB = librosa.power_to_db(mel_power)
+     if plot:
+         pass  # plotting hook intentionally left unimplemented in the app build
+     return mel_dB
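
A minimal sketch of the two helpers together, assuming the repo root is the working directory and the LFS sample files have been pulled:

    from src.audio import load_audio, get_melspec
    from src.config import SR

    wave, sr = load_audio("samples/corvus_corone_XC592284.mp3", sr=SR)  # resampled to 32 kHz
    mel_dB = get_melspec(wave, sr)
    print(mel_dB.shape)  # (N_MELS, n_frames) = (256, ...)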
src/config.py ADDED
@@ -0,0 +1,90 @@
+ import logging
+
+ from pathlib import Path
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ def get_logger(log_level="INFO"):
+     log_path = Path("logs.log")
+     formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+
+     log = logging.getLogger("munich_bird_identifier")
+     log.setLevel(log_level)
+
+     file_handler = logging.FileHandler(log_path)
+     file_handler.setLevel(log_level)
+     file_handler.setFormatter(formatter)
+
+     log.addHandler(file_handler)
+
+     return log
+
+
+ log = get_logger("DEBUG")
+
+ CKPT_PATH = Path('models/checkpoint.pth')
+ N_MELS = 256
+ SR = 32_000
+
+ # these are the bird species the model was trained on
+ IDX2CODE = {
+     0: 'accipiter_gentilis',
+     1: 'acrocephalus_scirpaceus',
+     2: 'aegolius_funereus',
+     3: 'alauda_arvensis',
+     4: 'anthus_cervinus',
+     5: 'anthus_trivialis',
+     6: 'asio_otus',
+     7: 'charadrius_dubius',
+     8: 'chloris_chloris',
+     9: 'coccothraustes_coccothraustes',
+     10: 'corvus_corone',
+     11: 'corvus_frugilegus',
+     12: 'crex_crex',
+     13: 'cuculus_canorus',
+     14: 'curruca_communis',
+     15: 'cyanistes_caeruleus',
+     16: 'dendrocopos_major',
+     17: 'dryocopus_martius',
+     18: 'emberiza_citrinella',
+     19: 'erithacus_rubecula',
+     20: 'falco_peregrinus',
+     21: 'fringilla_coelebs',
+     22: 'garrulus_glandarius',
+     23: 'lanius_collurio',
+     24: 'larus_michahellis',
+     25: 'linaria_cannabina',
+     26: 'locustella_fluviatilis',
+     27: 'locustella_naevia',
+     28: 'lullula_arborea',
+     29: 'luscinia_megarhynchos',
+     30: 'mareca_penelope',
+     31: 'motacilla_flava',
+     32: 'muscicapa_striata',
+     33: 'nucifraga_caryocatactes',
+     34: 'nycticorax_nycticorax',
+     35: 'nymphicus_hollandicus',
+     36: 'parus_major',
+     37: 'perdix_perdix',
+     38: 'periparus_ater',
+     39: 'phoenicurus_phoenicurus',
+     40: 'phylloscopus_collybita',
+     41: 'phylloscopus_sibilatrix',
+     42: 'phylloscopus_trochilus',
+     43: 'picus_canus',
+     44: 'picus_viridis',
+     45: 'poecile_montanus',
+     46: 'poecile_palustris',
+     47: 'prunella_modularis',
+     48: 'saxicola_rubicola',
+     49: 'scolopax_rusticola',
+     50: 'serinus_serinus',
+     51: 'strix_aluco',
+     52: 'sylvia_atricapilla',
+     53: 'sylvia_borin',
+     54: 'troglodytes_troglodytes',
+     55: 'turdus_merula',
+     56: 'turdus_philomelos'
+ }
+ CODE2IDX = {v: k for k, v in IDX2CODE.items()}
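
The two lookup tables are exact inverses, so a label round-trips between class index and species code:

    from src.config import IDX2CODE, CODE2IDX

    print(IDX2CODE[10])               # corvus_corone
    print(CODE2IDX["corvus_corone"])  # 10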
src/modeling.py ADDED
@@ -0,0 +1,88 @@
+ import torch
+ from torch import nn, optim
+ from torchvision.models import resnet34, ResNet34_Weights
+ from src.processing import generate_test_images
+ from src.config import IDX2CODE
+
+
+ class BirdNet(nn.Module):
+     def __init__(self, n_out=len(IDX2CODE), pretrained=True, freeze_backbone=True, dropout=.25):
+         super().__init__()
+         self.model = resnet34(weights=ResNet34_Weights.DEFAULT if pretrained else None)
+
+         # Modify the first convolution layer to accept 1-channel input:
+         # the original ResNet34 expects 3-channel RGB, but we feed it a
+         # 1-channel grayscale melspectrogram
+         original_conv1 = self.model.conv1
+         self.model.conv1 = nn.Conv2d(
+             in_channels=1,  # grayscale input
+             out_channels=original_conv1.out_channels,
+             kernel_size=original_conv1.kernel_size,
+             stride=original_conv1.stride,
+             padding=original_conv1.padding,
+             bias=original_conv1.bias is not None  # Conv2d expects a bool, not the Parameter
+         )
+
+         if pretrained:
+             with torch.no_grad():
+                 self.model.conv1.weight.data = original_conv1.weight.data.mean(dim=1, keepdim=True)
+
+         # in_features = self.model.fc.in_features
+         # layers = list(self.model.children())[:-2]
+         # layers.append(nn.AdaptiveMaxPool2d(1))
+         # self.encoder = nn.Sequential(*layers)
+
+         self.model.fc = nn.Linear(self.model.fc.in_features, n_out)
+         # self.model.fc = nn.Sequential(
+         #     nn.Linear(self.model.fc.in_features, 256),
+         #     nn.ReLU(),
+         #     nn.Dropout(dropout),
+         #     nn.Linear(256, n_out)
+         # )
+         # Optional: freeze the backbone for fine-tuning (train only the final layer)
+         if freeze_backbone:
+             for param in self.model.parameters():
+                 param.requires_grad = False
+             # Unfreeze the final layer
+             for param in self.model.fc.parameters():
+                 param.requires_grad = True
+
+     def forward(self, x):
+         return self.model(x)
+
+ class Model:
+     def __init__(self, device, n_out=len(IDX2CODE), loss_fn=nn.CrossEntropyLoss(),
+                  pretrained=True, freeze_backbone=True, dropout=.1):
+         self.n_out = n_out
+         self.device = device
+         self.model = BirdNet(self.n_out, pretrained=pretrained,
+                              freeze_backbone=freeze_backbone, dropout=dropout).to(self.device)
+         self.lr = 5e-3
+         self.loss_fn = loss_fn
+         self.opt = optim.Adam(self.model.parameters(), lr=self.lr)
+         # self.lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.opt, mode='min', factor=.5, patience=3, min_lr=1e-5)
+         self.epoch_train_losses = []
+         self.epoch_val_losses = []
+         self.epoch_train_accs = []
+         self.epoch_val_accs = []
+         self.epoch = 0
+
+     def load_from_chkpt(self, chkpt_path):
+         chkpt = torch.load(chkpt_path, weights_only=False, map_location=torch.device(self.device))
+         self.epoch = chkpt['epoch']
+         self.model.load_state_dict(chkpt['model'])
+         self.opt.load_state_dict(chkpt['optim'])
+         self.epoch_train_losses = chkpt['train_losses']
+         self.epoch_val_losses = chkpt['valid_losses']
+         self.epoch_train_accs = chkpt['train_accs']
+         self.epoch_val_accs = chkpt['valid_accs']
+
+     def make_preds(self, fp):
+         # Majority vote over the random crops produced by generate_test_images
+         arrs = generate_test_images(fp)
+         self.model.eval()
+         with torch.no_grad():
+             out = self.model(arrs.to(self.device).float())
+         labels = out.argmax(dim=1)
+         vc = labels.unique(return_counts=True)
+         return IDX2CODE[vc[0][vc[1].argmax()].item()]
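
A minimal sketch of scripted (non-Gradio) inference with this class, assuming the LFS checkpoint has been pulled and the repo root is the working directory:

    from pathlib import Path

    from src.config import CKPT_PATH
    from src.modeling import Model

    model = Model(device="cpu")
    model.load_from_chkpt(Path(CKPT_PATH))
    print(model.make_preds(Path("samples/scolopax_rusticola_XC795042.mp3")))
    # e.g. 'scolopax_rusticola' -- the majority class over 10 random crops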
src/processing.py ADDED
@@ -0,0 +1,53 @@
+ import torch
+ import numpy as np
+ import soundfile as sf
+ from src.audio import load_audio, get_melspec
+ from src.config import SR
+ from src.utils import get_idx, to_square
+
+ # https://www.kaggle.com/code/tarunpaparaju/birdcall-identification-spectrogram-loader
+ def to_imagenet(X, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6):
+     mean = mean or X.mean()
+     X = X - mean
+     std = std or X.std()
+     Xstd = X / (std + eps)
+     _min, _max = Xstd.min(), Xstd.max()
+     norm_max = norm_max or _max
+     norm_min = norm_min or _min
+     if (_max - _min) > eps:
+         # Clip to [norm_min, norm_max], then rescale to [0, 1]
+         V = np.clip(Xstd, norm_min, norm_max)
+         V = (V - norm_min) / (norm_max - norm_min)
+     else:
+         # Degenerate (near-constant) input: just zeros
+         V = np.zeros_like(Xstd, dtype=np.uint8)
+     return V  # np.stack([V]*3, axis=-1) for a 3-channel variant
+
+ def extract_melspec_as_imgarr(fp, n_secs=8, random_chunk=True, convert_to_int8=False):
+     info = sf.info(fp)
+     y, _ = load_audio(fp, SR)  # , offset=start, duration=n_secs
+     # Redraw chunks until a non-empty slice is found
+     while True:
+         start, end = get_idx(info.duration, n_secs, random_chunk=random_chunk)
+         y2 = y[start:end]
+         if len(y2):
+             y = y2
+             break
+     mel_dB = to_square(get_melspec(y, SR))
+     try:
+         normalised_db = to_imagenet(mel_dB)  # replaced minmax_scale
+     except Exception:
+         normalised_db = torch.zeros_like(torch.as_tensor(mel_dB))
+     db_array = np.asarray(normalised_db) * 255
+     if convert_to_int8:
+         db_array = db_array.astype(np.uint8)
+     # Flip vertically so low frequencies end up at the bottom of the image
+     return db_array[::-1].astype(float)
+
+
+ def generate_test_images(fp, n=10):
+     # Stack n crops into an (n, 1, H, W) tensor for the CNN
+     arrs = []
+     for _ in range(n):
+         arrs.append(extract_melspec_as_imgarr(fp))
+     return torch.as_tensor(np.array(arrs)).unsqueeze(1)
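
A quick shape sanity check of the batch that make_preds consumes, assuming pulled samples: with 8-second crops at SR=32000 and hop length 1000, each mel image is squared to 256×256 by to_square:

    from src.processing import generate_test_images

    batch = generate_test_images("samples/corvus_corone_XC592284.mp3", n=4)
    print(batch.shape)  # torch.Size([4, 1, 256, 256])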
src/utils.py ADDED
@@ -0,0 +1,35 @@
+ import torch
+ import numpy as np
+
+ from src.config import SR, CODE2IDX
+
+
+ def get_idx(duration, n_secs=5, sr=SR, random_chunk=True):
+     num_frames = int(np.ceil(sr * duration))
+     chunk_len = n_secs * sr
+     DEFAULT_OFFSET = 10
+     # Guard against clips shorter than the requested chunk, where
+     # np.random.randint would otherwise see an empty range
+     if random_chunk and num_frames - chunk_len > DEFAULT_OFFSET:
+         start = np.random.randint(DEFAULT_OFFSET, num_frames - chunk_len)
+     else:
+         start = DEFAULT_OFFSET
+     return start, start + chunk_len
+
+ def to_square(arr):
+     """Convert an (almost square) array to a square array by padding/truncating columns."""
+     rows, cols = arr.shape
+
+     if cols < rows:
+         pad_width = ((0, 0), (0, rows - cols))
+         return np.pad(arr, pad_width, mode='constant')
+     else:
+         return arr[:, :rows]
+
+ def to_tensor(data):
+     return [torch.FloatTensor(x) for x in data]
+
+ def one_hot(idx):
+     y = torch.zeros(len(CODE2IDX))
+     y[idx] = 1.
+     return y
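
A small illustration of to_square on wide and narrow inputs:

    import numpy as np
    from src.utils import to_square

    print(to_square(np.ones((3, 5))).shape)  # (3, 3) -- extra columns truncated
    print(to_square(np.ones((3, 2))).shape)  # (3, 3) -- missing columns zero-padded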