Spaces:

EarthSpeciesProject
/

NatureLM-Audio

Running on Zero

App Files Files Community

Fix loading

by Titouan - opened Aug 28, 2025

base: refs/heads/main

←

from: refs/pr/5

Discussion Files changed

+263

-416

Files changed (9) hide show

.gitattributes +0 -1
NatureLM/config.py +0 -1
NatureLM/models/NatureLM.py +19 -194
README.md +15 -11
app.py +228 -145
assets/American Crow - Corvus brachyrhynchos.mp3 +0 -3
configs/inference.yml +0 -1
data_store.py +0 -58
requirements.txt +1 -2

.gitattributes CHANGED Viewed

@@ -50,4 +50,3 @@ assets/Humpback[[:space:]]Whale[[:space:]]-[[:space:]]Megaptera[[:space:]]novaea
 assets/Lazuli_Bunting_yell-YELLLAZB20160625SM303143.m4a filter=lfs diff=lfs merge=lfs -text
 assets/Walrus[[:space:]]-[[:space:]]Odobenus[[:space:]]rosmarus.wav filter=lfs diff=lfs merge=lfs -text
 assets/ESP_logo_white.png filter=lfs diff=lfs merge=lfs -text
-assets/American[[:space:]]Crow[[:space:]]-[[:space:]]Corvus[[:space:]]brachyrhynchos.mp3 filter=lfs diff=lfs merge=lfs -text

 assets/Lazuli_Bunting_yell-YELLLAZB20160625SM303143.m4a filter=lfs diff=lfs merge=lfs -text
 assets/Walrus[[:space:]]-[[:space:]]Odobenus[[:space:]]rosmarus.wav filter=lfs diff=lfs merge=lfs -text
 assets/ESP_logo_white.png filter=lfs diff=lfs merge=lfs -text

NatureLM/config.py CHANGED Viewed

@@ -136,7 +136,6 @@ class GenerateConfig(BaseModel, extra="forbid", validate_assignment=True):
     temperature: float
     repetition_penalty: float
     length_penalty: float
-    merging_alpha: float = 1.0
 class ModelConfig(BaseModel, extra="forbid", validate_assignment=True):

     temperature: float
     repetition_penalty: float
     length_penalty: float
 class ModelConfig(BaseModel, extra="forbid", validate_assignment=True):

NatureLM/models/NatureLM.py CHANGED Viewed

@@ -12,10 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import hashlib
 import logging
 import os
-from collections import OrderedDict
 from pathlib import Path
 from typing import Literal, Union
@@ -37,98 +35,8 @@ from .Qformer import BertConfig, BertLMHeadModel
 from .utils import StoppingCriteriaSub
 torch.backends.cuda.matmul.allow_tf32 = True
-auth_token = os.getenv("llama", None)
-class AudioEncodingCache:
-    """LRU cache for audio encoding with content-based hashing."""
-    def __init__(self, capacity: int = 100):
-        self.capacity = capacity
-        self.cache = OrderedDict()
-        self.hits = 0
-        self.misses = 0
-    def _compute_hash(
-        self, raw_wav: torch.Tensor, audio_padding_mask: torch.Tensor | None = None
-    ) -> str:
-        """Compute a hash key from the audio tensor and padding mask."""
-        # Use a sample of the tensor for efficiency (first, middle, last portions)
-        B, L = raw_wav.shape
-        sample_size = min(1000, L)  # Sample 1000 points or entire length if smaller
-        # Sample from beginning, middle, and end
-        indices = torch.cat(
-            [
-                torch.arange(min(sample_size // 3, L)),
-                torch.arange(L // 2, min(L // 2 + sample_size // 3, L)),
-                torch.arange(max(0, L - sample_size // 3), L),
-            ]
-        )
-        sampled_wav = raw_wav[:, indices].cpu().numpy().tobytes()
-        # Create hash from audio data, shape, and padding mask presence
-        hash_obj = hashlib.sha256(sampled_wav)
-        hash_obj.update(str(raw_wav.shape).encode())
-        hash_obj.update(str(raw_wav.dtype).encode())
-        if audio_padding_mask is not None:
-            mask_sample = audio_padding_mask[:, indices].cpu().numpy().tobytes()
-            hash_obj.update(mask_sample)
-            hash_obj.update(str(audio_padding_mask.shape).encode())
-        else:
-            hash_obj.update(b"no_mask")
-        return hash_obj.hexdigest()
-    def get(self, raw_wav: torch.Tensor, audio_padding_mask: torch.Tensor = None):
-        """Retrieve cached encoding if available."""
-        key = self._compute_hash(raw_wav, audio_padding_mask)
-        if key in self.cache:
-            self.hits += 1
-            # Move to end (most recently used)
-            self.cache.move_to_end(key)
-            return self.cache[key]
-        self.misses += 1
-        return None
-    def put(self, raw_wav: torch.Tensor, audio_padding_mask: torch.Tensor, value: tuple):
-        """Store encoding in cache (on CPU to save GPU memory)."""
-        key = self._compute_hash(raw_wav, audio_padding_mask)
-        # Move tensors to CPU for storage
-        audio_embeds, audio_atts = value
-        cached_value = (audio_embeds.cpu(), audio_atts.cpu())
-        # Add to cache
-        self.cache[key] = cached_value
-        self.cache.move_to_end(key)
-        # Evict oldest if over capacity
-        if len(self.cache) > self.capacity:
-            self.cache.popitem(last=False)
-    def clear(self):
-        """Clear the cache."""
-        self.cache.clear()
-        self.hits = 0
-        self.misses = 0
-    def get_stats(self):
-        """Get cache statistics."""
-        total = self.hits + self.misses
-        hit_rate = self.hits / total if total > 0 else 0
-        return {
-            "hits": self.hits,
-            "misses": self.misses,
-            "hit_rate": hit_rate,
-            "size": len(self.cache),
-            "capacity": self.capacity,
-        }
 class NatureLM(nn.Module, PyTorchModelHubMixin):
     def __init__(
@@ -157,16 +65,9 @@ class NatureLM(nn.Module, PyTorchModelHubMixin):
         max_txt_len: int = 128,
         end_sym: str = "</s>",
         device: str = "cuda",
-        audio_encoding_cache_size: int = 100,
     ):
         super().__init__()
-        self.audio_encoding_cache = (
-            AudioEncodingCache(capacity=audio_encoding_cache_size)
-            if audio_encoding_cache_size > 0
-            else None
-        )
         self.beats_path = beats_path
         self.beats_cfg = beats_cfg
         self.use_audio_Qformer = use_audio_Qformer
@@ -183,9 +84,7 @@ class NatureLM(nn.Module, PyTorchModelHubMixin):
         logging.info(f"Llama path: {llama_path}")
         logging.info("Loading Llama Tokenizer")
-        self.llama_tokenizer = AutoTokenizer.from_pretrained(
-            llama_path, use_fast=False, use_auth_token=auth_token
-        )
         self.llama_tokenizer.add_special_tokens({"pad_token": "[PAD]"})
         self.llama_tokenizer.padding_side = "right"
@@ -196,6 +95,7 @@ class NatureLM(nn.Module, PyTorchModelHubMixin):
                 torch_dtype=torch.float32,
                 attn_implementation="eager",
                 device_map="cpu",
             )
             # An issue with tiny-llama is that pad_token_id was set to -1, but
             # model.save_pretrained checks generation configs and does not allow -1 as
@@ -206,6 +106,7 @@ class NatureLM(nn.Module, PyTorchModelHubMixin):
                 llama_path,
                 torch_dtype=torch.bfloat16,
                 attn_implementation=flash_attn,
             )
         self.llama_model.resize_token_embeddings(len(self.llama_tokenizer))
@@ -234,9 +135,7 @@ class NatureLM(nn.Module, PyTorchModelHubMixin):
         self.beats = BEATs(cfg=BEATsConfig(dict(self.beats_cfg)))
         if self.beats_path:
-            beats_ckpt = universal_torch_load(
-                self.beats_path, cache_mode="none", map_location="cpu"
-            )
             self.beats.load_state_dict(beats_ckpt["model"])
         self.ln_audio = nn.LayerNorm(self.beats.cfg.encoder_embed_dim)
@@ -437,15 +336,11 @@ class NatureLM(nn.Module, PyTorchModelHubMixin):
             audio_embeds = self.ln_audio(audio_embeds)
             # Generate attention mask
-            audio_atts = torch.ones(audio_embeds.size()[:-1], dtype=torch.long).to(
-                audio_embeds.device
-            )
             if self.window_level_Qformer:
                 B, T, C = audio_embeds.shape  # batch, T, Channels
-                kernel = round(
-                    1500 * self.second_per_window / 30.0
-                )  # 160 ms patches; calculate kernel size
                 stride = round(1500 * self.second_stride / 30.0)  # Calculate stride size
                 kernel = (1, kernel)
                 stride = (1, stride)
@@ -465,9 +360,7 @@ class NatureLM(nn.Module, PyTorchModelHubMixin):
                     audio_embeds_overlap, [0, 3, 2, 1]
                 )  # (B, num_windows, kernel_size, C)
                 audio_embeds = audio_embeds_overlap.reshape(-1, kernel[1], C)
-                audio_atts = torch.ones(audio_embeds.size()[:-1], dtype=torch.long).to(
-                    audio_embeds.device
-                )
                 # Q-Former mechanism
                 query_tokens = self.audio_query_tokens.expand(audio_embeds.shape[0], -1, -1)
@@ -483,19 +376,13 @@ class NatureLM(nn.Module, PyTorchModelHubMixin):
                 if self.window_level_Qformer:
                     audio_embeds = audio_embeds.view(B, -1, audio_embeds.size(2)).contiguous()
-            audio_atts = torch.ones(audio_embeds.size()[:-1], dtype=torch.long).to(
-                audio_embeds.device
-            )
         elif self.htsat:
             # HTSAT processing
             audio_embeds = self.ln_audio(audio_embeds)
-            audio_embeds = self.audio_llama_proj(audio_embeds).reshape(
-                -1, 30, self.llama_model.config.hidden_size
-            )
-            audio_atts = torch.ones(audio_embeds.size()[:-1], dtype=torch.long).to(
-                audio_embeds.device
-            )
         else:
             raise NotImplementedError("no audio qformer or max pooling")
@@ -503,32 +390,9 @@ class NatureLM(nn.Module, PyTorchModelHubMixin):
         return audio_embeds, audio_atts
     def encode_audio(self, raw_wav, audio_padding_mask=None):
-        # Only use cache during inference (not training)
-        if self.audio_encoding_cache is not None and not self.training:
-            cached_result = self.audio_encoding_cache.get(raw_wav, audio_padding_mask)
-            if cached_result is not None:
-                print("#### Audio encoding cache hit ####")
-                # Move cached tensors back to the model's device
-                audio_embeds, audio_atts = cached_result
-                return audio_embeds.to(self.device), audio_atts.to(self.device)
-        # Compute encoding if not cached
         with torch.autocast(self.device.type, dtype=torch.bfloat16):
             audio_embeds, audio_pad_mask = self.beats(raw_wav, padding_mask=audio_padding_mask)
-            result = self._encode_auditory_feature(
-                audio_embeds=audio_embeds, audio_pad_mask=audio_pad_mask
-            )
-        # Store in cache if enabled and in inference mode
-        if self.audio_encoding_cache is not None and not self.training:
-            self.audio_encoding_cache.put(raw_wav, audio_padding_mask, result)
-        return result
-    def clear_audio_embed_cache(self):
-        """Clear the audio encoding cache."""
-        if self.audio_encoding_cache is not None:
-            self.audio_encoding_cache.clear()
     def prompt_wrap(self, audio_embeds, audio_atts, prompt: list[str]):
         """Merge audio embeddings with embeddings of the tokens in the prompt.
@@ -576,9 +440,7 @@ class NatureLM(nn.Module, PyTorchModelHubMixin):
             wrapped_atts = []
             for part in prompt_parts:
-                tokens = self.llama_tokenizer(
-                    part, return_tensors="pt", add_special_tokens=False
-                ).to(device)
                 part_embeds = self.llama_embed_tokens(tokens.input_ids).squeeze(0)
                 part_atts = tokens.attention_mask.squeeze(0)
                 wrapped_embeds.append(part_embeds)
@@ -644,9 +506,7 @@ class NatureLM(nn.Module, PyTorchModelHubMixin):
         # BOS token embeddings
         bos_token_id = self.llama_tokenizer.bos_token_id
-        bos = torch.full(
-            (batch_size, 1), bos_token_id, dtype=torch.long, device=audio_embeds.device
-        )
         bos_embeds = self.llama_embed_tokens(bos)
         # Prepare lists to collect per-sample embeddings, attention masks, and targets
@@ -661,9 +521,7 @@ class NatureLM(nn.Module, PyTorchModelHubMixin):
             # Extract non-padded text embeddings and attention mask
             text_embed = to_regress_embeds[i][to_regress_tokens.attention_mask[i].bool()]
-            text_att = to_regress_tokens.attention_mask[i][
-                to_regress_tokens.attention_mask[i].bool()
-            ]
             # Extract corresponding targets for the text tokens
             target = targets[i][to_regress_tokens.attention_mask[i].bool()]
@@ -723,9 +581,7 @@ class NatureLM(nn.Module, PyTorchModelHubMixin):
             shift_logits.view(-1, nvocab),  # Flatten to [batch_size * (seq_len-1), vocab_size]
             shift_labels.view(-1),  # Flatten to [batch_size * (seq_len-1)]
         )
-        loss_per_token = loss_per_token.view(
-            shift_labels.size()
-        )  # Reshape back to [batch_size, seq_len-1]
         # Create mask
         mask = shift_labels != -100  # [batch_size, seq_len-1]
@@ -741,9 +597,7 @@ class NatureLM(nn.Module, PyTorchModelHubMixin):
             predicted_tokens = shift_logits.argmax(dim=-1)  # [batch_size, seq_len-1]
             # Compute per-example correct counts
-            correct_per_sample = (
-                ((predicted_tokens == shift_labels) & mask).sum(dim=1).float()
-            )  # [batch_size]
             total_tokens_per_sample = mask.sum(dim=1).float()  # [batch_size]
             # Total correct and total tokens across the batch
@@ -761,37 +615,8 @@ class NatureLM(nn.Module, PyTorchModelHubMixin):
         return {"loss": loss, "per_example_loss": loss_per_example}
-    def model_merging_scaling(self, merging_alpha, adapter_name="default"):
-        """
-        Performs model merging with the base model by adjusting the scaling of the LoRA adapters as described in
-        "Model Merging Improves Zero-Shot Generalization in Bioacoustic Foundation Models"
-        (https://arxiv.org/abs/2511.05171).
-        The best value for alpha is task- and dataset-specific, but the paper found alpha values between
-        0.4 and 0.6 to perform generally well.
-        Args:
-            merging_alpha: The merging_alpha used for interpolation.
-            adapter_name (str): The name of the adapter to rescale when merging.
-        """
-        # Store original scaling on first call, then always scale relative to original
-        if not hasattr(self, "_original_lora_scaling"):
-            self._original_lora_scaling = {}
-            for name, module in self.llama_model.named_modules():
-                if hasattr(module, "r") and isinstance(module.r, dict) and adapter_name in module.r:
-                    self._original_lora_scaling[name] = module.scaling[adapter_name]
-        for name, module in self.llama_model.named_modules():
-            if name in self._original_lora_scaling:
-                module.scaling[adapter_name] = merging_alpha * self._original_lora_scaling[name]
     @torch.inference_mode()
-    def generate(self, samples, generate_cfg, prompts) -> list[str]:
-        merging_alpha = getattr(generate_cfg, "merging_alpha", 1.0)
-        if merging_alpha != 1.0:
-            self.model_merging_scaling(merging_alpha)
         batch_size = len(prompts)
         raw_wav = samples["raw_wav"]
@@ -820,7 +645,7 @@ class NatureLM(nn.Module, PyTorchModelHubMixin):
         stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
         with torch.autocast(self.device.type, dtype=torch.bfloat16):
-            outputs = self.llama_model.generate(  # TODO: Wrap the llama_model with outlines https://outlines-dev.github.io/outlines/reference/models/transformers/
                 inputs_embeds=embeds.bfloat16(),
                 max_new_tokens=generate_cfg.max_new_tokens,
                 stopping_criteria=stopping_criteria,

 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
 import os
 from pathlib import Path
 from typing import Literal, Union
 from .utils import StoppingCriteriaSub
 torch.backends.cuda.matmul.allow_tf32 = True
+auth_token = os.getenv('llama')
 class NatureLM(nn.Module, PyTorchModelHubMixin):
     def __init__(
         max_txt_len: int = 128,
         end_sym: str = "</s>",
         device: str = "cuda",
     ):
         super().__init__()
         self.beats_path = beats_path
         self.beats_cfg = beats_cfg
         self.use_audio_Qformer = use_audio_Qformer
         logging.info(f"Llama path: {llama_path}")
         logging.info("Loading Llama Tokenizer")
+        self.llama_tokenizer = AutoTokenizer.from_pretrained(llama_path, use_fast=False, use_auth_token=auth_token)
         self.llama_tokenizer.add_special_tokens({"pad_token": "[PAD]"})
         self.llama_tokenizer.padding_side = "right"
                 torch_dtype=torch.float32,
                 attn_implementation="eager",
                 device_map="cpu",
+                use_auth_token=auth_token
             )
             # An issue with tiny-llama is that pad_token_id was set to -1, but
             # model.save_pretrained checks generation configs and does not allow -1 as
                 llama_path,
                 torch_dtype=torch.bfloat16,
                 attn_implementation=flash_attn,
+                use_auth_token=auth_token
             )
         self.llama_model.resize_token_embeddings(len(self.llama_tokenizer))
         self.beats = BEATs(cfg=BEATsConfig(dict(self.beats_cfg)))
         if self.beats_path:
+            beats_ckpt = universal_torch_load(self.beats_path, cache_mode="none", map_location="cpu")
             self.beats.load_state_dict(beats_ckpt["model"])
         self.ln_audio = nn.LayerNorm(self.beats.cfg.encoder_embed_dim)
             audio_embeds = self.ln_audio(audio_embeds)
             # Generate attention mask
+            audio_atts = torch.ones(audio_embeds.size()[:-1], dtype=torch.long).to(audio_embeds.device)
             if self.window_level_Qformer:
                 B, T, C = audio_embeds.shape  # batch, T, Channels
+                kernel = round(1500 * self.second_per_window / 30.0)  # 160 ms patches; calculate kernel size
                 stride = round(1500 * self.second_stride / 30.0)  # Calculate stride size
                 kernel = (1, kernel)
                 stride = (1, stride)
                     audio_embeds_overlap, [0, 3, 2, 1]
                 )  # (B, num_windows, kernel_size, C)
                 audio_embeds = audio_embeds_overlap.reshape(-1, kernel[1], C)
+                audio_atts = torch.ones(audio_embeds.size()[:-1], dtype=torch.long).to(audio_embeds.device)
                 # Q-Former mechanism
                 query_tokens = self.audio_query_tokens.expand(audio_embeds.shape[0], -1, -1)
                 if self.window_level_Qformer:
                     audio_embeds = audio_embeds.view(B, -1, audio_embeds.size(2)).contiguous()
+            audio_atts = torch.ones(audio_embeds.size()[:-1], dtype=torch.long).to(audio_embeds.device)
         elif self.htsat:
             # HTSAT processing
             audio_embeds = self.ln_audio(audio_embeds)
+            audio_embeds = self.audio_llama_proj(audio_embeds).reshape(-1, 30, self.llama_model.config.hidden_size)
+            audio_atts = torch.ones(audio_embeds.size()[:-1], dtype=torch.long).to(audio_embeds.device)
         else:
             raise NotImplementedError("no audio qformer or max pooling")
         return audio_embeds, audio_atts
     def encode_audio(self, raw_wav, audio_padding_mask=None):
         with torch.autocast(self.device.type, dtype=torch.bfloat16):
             audio_embeds, audio_pad_mask = self.beats(raw_wav, padding_mask=audio_padding_mask)
+            return self._encode_auditory_feature(audio_embeds=audio_embeds, audio_pad_mask=audio_pad_mask)
     def prompt_wrap(self, audio_embeds, audio_atts, prompt: list[str]):
         """Merge audio embeddings with embeddings of the tokens in the prompt.
             wrapped_atts = []
             for part in prompt_parts:
+                tokens = self.llama_tokenizer(part, return_tensors="pt", add_special_tokens=False).to(device)
                 part_embeds = self.llama_embed_tokens(tokens.input_ids).squeeze(0)
                 part_atts = tokens.attention_mask.squeeze(0)
                 wrapped_embeds.append(part_embeds)
         # BOS token embeddings
         bos_token_id = self.llama_tokenizer.bos_token_id
+        bos = torch.full((batch_size, 1), bos_token_id, dtype=torch.long, device=audio_embeds.device)
         bos_embeds = self.llama_embed_tokens(bos)
         # Prepare lists to collect per-sample embeddings, attention masks, and targets
             # Extract non-padded text embeddings and attention mask
             text_embed = to_regress_embeds[i][to_regress_tokens.attention_mask[i].bool()]
+            text_att = to_regress_tokens.attention_mask[i][to_regress_tokens.attention_mask[i].bool()]
             # Extract corresponding targets for the text tokens
             target = targets[i][to_regress_tokens.attention_mask[i].bool()]
             shift_logits.view(-1, nvocab),  # Flatten to [batch_size * (seq_len-1), vocab_size]
             shift_labels.view(-1),  # Flatten to [batch_size * (seq_len-1)]
         )
+        loss_per_token = loss_per_token.view(shift_labels.size())  # Reshape back to [batch_size, seq_len-1]
         # Create mask
         mask = shift_labels != -100  # [batch_size, seq_len-1]
             predicted_tokens = shift_logits.argmax(dim=-1)  # [batch_size, seq_len-1]
             # Compute per-example correct counts
+            correct_per_sample = ((predicted_tokens == shift_labels) & mask).sum(dim=1).float()  # [batch_size]
             total_tokens_per_sample = mask.sum(dim=1).float()  # [batch_size]
             # Total correct and total tokens across the batch
         return {"loss": loss, "per_example_loss": loss_per_example}
     @torch.inference_mode()
+    def generate(self, samples, generate_cfg, prompts):
         batch_size = len(prompts)
         raw_wav = samples["raw_wav"]
         stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
         with torch.autocast(self.device.type, dtype=torch.bfloat16):
+            outputs = self.llama_model.generate(
                 inputs_embeds=embeds.bfloat16(),
                 max_new_tokens=generate_cfg.max_new_tokens,
                 stopping_criteria=stopping_criteria,

README.md CHANGED Viewed

@@ -1,21 +1,25 @@
 ---
-title: NatureLM-audio Demo
-emoji: 🔈
-colorFrom: green
-colorTo: green
 sdk: gradio
 sdk_version: 5.38.2
 app_file: app.py
-pinned: true
 license: apache-2.0
-short_description: Analyze your bioacoustic data with NatureLM-audio
-thumbnail: >-
-  https://cdn-uploads.huggingface.co/production/uploads/67e0630403121d657d96b0a4/VwZf6xhy8xz-AIr8rykvB.png
 ---
-# NatureLM-audio Demo
-This is a demo of the NatureLM-audio model. Users can upload an audio file containing animal vocalizations and ask questions about them in a chat interface.
 ## Usage
@@ -27,4 +31,4 @@ This is a demo of the NatureLM-audio model. Users can upload an audio file conta
 The app uses lazy loading to start quickly. The model is only loaded when you first interact with it, not during app initialization. This prevents timeout issues on HuggingFace Spaces.
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: NatureLM Audio Demo
+emoji: 🎵
+colorFrom: purple
+colorTo: purple
 sdk: gradio
 sdk_version: 5.38.2
 app_file: app.py
+pinned: false
 license: apache-2.0
+short_description: Audio analysis with NatureLM model
 ---
+# NatureLM Audio Demo
+This is a demo of the NatureLM audio analysis model. The app provides three main features:
+## Features
+1. **Chat Interface**: Upload audio files and ask questions about them
+2. **Batch Processing**: Process multiple audio files with the same task
+3. **Long Recording Analysis**: Analyze long audio recordings by chunking them
 ## Usage
 The app uses lazy loading to start quickly. The model is only loaded when you first interact with it, not during app initialization. This prevents timeout issues on HuggingFace Spaces.
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import spaces
-import uuid
 import warnings
 import traceback
 import numpy as np
@@ -10,37 +9,16 @@ from collections import Counter
 import gradio as gr
 import torch
 import torchaudio
-import soundfile as sf
 import matplotlib.pyplot as plt
 from NatureLM.config import Config
 from NatureLM.models.NatureLM import NatureLM
 from NatureLM.infer import Pipeline
-from data_store import upload_data
 warnings.filterwarnings("ignore")
 SAMPLE_RATE = 16000  # Default sample rate for NatureLM-audio
-DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
-MIN_AUDIO_DURATION: float = 0.5  # seconds
-MAX_HISTORY_TURNS = (
-    3  # Maximum number of conversation turns to include in context (user + assistant pairs)
-)
-# Load model at startup if CUDA is available
-print(f"Device: {DEVICE}")
-model = NatureLM.from_pretrained("EarthSpeciesProject/NatureLM-audio")
-model = model.eval().to(DEVICE)
-model = Pipeline(model)
-def check_audio_duration_greater(audio_path: str) -> bool:
-    """Check the duration of the audio file."""
-    info = sf.info(audio_path)
-    duration = info.duration # info.num_frames / info.sample_rate
-    if not duration >= MIN_AUDIO_DURATION:
-        raise gr.Error(f"Audio duration must be at least {MIN_AUDIO_DURATION} seconds.")
 def get_spectrogram(audio: torch.Tensor) -> plt.Figure:
@@ -86,6 +64,85 @@ def get_spectrogram(audio: torch.Tensor) -> plt.Figure:
     return fig
 def take_majority_vote(results: list[list[dict]]) -> list[str]:
     """For each audio file, take the majority vote of the labels across all windows"""
     outputs = []
@@ -110,19 +167,35 @@ def prompt_lm(
     hop_length_seconds: float = 10.0,
 ) -> list[str]:
     """Generate response using the model
     Args:
         audios (list[str]): List of audio file paths
         queries (list[str] | str): Query or list of queries to process
         window_length_seconds (float): Length of the window for processing audio
         hop_length_seconds (float): Hop length for processing audio
     Returns:
         list[str]: List of generated responses for each audio-query pair
     """
-    if model is None:
-        return "❌ Model not loaded. Please check the model configuration."
     with torch.amp.autocast(device_type="cuda", dtype=torch.float16):
-        results: list[list[dict]] = model(
             audios,
             queries,
             window_length_seconds=window_length_seconds,
@@ -164,65 +237,33 @@ def add_user_query(chatbot_history: list[dict], chat_input: str) -> list[dict]:
         return chatbot_history
     chatbot_history.append({"role": "user", "content": chat_input.strip()})
     return chatbot_history
-def send_data_to_hub(chatbot_history: list[dict], audio: str, session_id: str):
-    """Upload data to hub"""
-    if not chatbot_history or len(chatbot_history) < 2:
-        return
-    user_text = chatbot_history[-2]["content"]
-    model_response = chatbot_history[-1]["content"]
-    upload_data(audio, user_text, model_response, session_id)
 def get_response(chatbot_history: list[dict], audio_input: str) -> list[dict]:
-    """Generate response from the model based on user input and audio file with conversation history"""
     try:
-        # Warn if conversation is getting long
-        num_turns = len(chatbot_history)
-        if num_turns > MAX_HISTORY_TURNS * 2:  # Each turn = user + assistant message
-            gr.Warning(
-                "⚠️ Long conversations may affect response quality. Consider starting a new conversation with the Clear button."
-            )
-        # Build conversation context from history
-        conversation_context = []
-        for message in chatbot_history:
-            if message["role"] == "user":
-                conversation_context.append(f"User: {message['content']}")
-            elif message["role"] == "assistant":
-                conversation_context.append(f"Assistant: {message['content']}")
-        # Get the last user message
         last_user_message = ""
         for message in reversed(chatbot_history):
             if message["role"] == "user":
                 last_user_message = message["content"]
                 break
-        # Format the full prompt with conversation history
-        if len(conversation_context) > 2:  # More than just the current query
-            # Include previous turns (limit to last MAX_HISTORY_TURNS exchanges)
-            # recent_context = conversation_context[
-            #    -(MAX_HISTORY_TURNS + 1) : -1
-            # ]  # Exclude current message
-            recent_context = conversation_context
-            full_prompt = (
-                "Previous conversation:\n"
-                + "\n".join(recent_context)
-                + "\n\nCurrent question: "
-                + last_user_message
-            )
-        else:
-            full_prompt = last_user_message
-        print("\nFull prompt with history:", full_prompt)
         response = prompt_lm(
             audios=[audio_input],
-            queries=[full_prompt.strip()],
             window_length_seconds=100_000,
             hop_length_seconds=100_000,
         )
@@ -236,7 +277,7 @@ def get_response(chatbot_history: list[dict], audio_input: str) -> list[dict]:
         print(f"Error generating response: {e}")
         traceback.print_exc()
         response = "Error generating response. Please try again."
     # Add model response to chat history
     chatbot_history.append({"role": "assistant", "content": response})
@@ -245,7 +286,19 @@ def get_response(chatbot_history: list[dict], audio_input: str) -> list[dict]:
 def main(
     assets_dir: Path,
 ):
     # Check if assets directory exists, if not create a placeholder
     if not assets_dir.exists():
         print(f"Warning: Assets directory {assets_dir} does not exist")
@@ -255,8 +308,7 @@ def main(
     laz_audio = assets_dir / "Lazuli_Bunting_yell-YELLLAZB20160625SM303143.mp3"
     frog_audio = assets_dir / "nri-GreenTreeFrogEvergladesNP.mp3"
     robin_audio = assets_dir / "yell-YELLAMRO20160506SM3.mp3"
-    whale_audio = assets_dir / "Humpback Whale - Megaptera novaeangliae.wav"
-    crow_audio = assets_dir / "American Crow - Corvus brachyrhynchos.mp3"
     examples = {
         "Identifying Focal Species (Lazuli Bunting)": [
@@ -271,30 +323,35 @@ def main(
             str(robin_audio),
             "Caption the audio, using the scientific name for any animal species.",
         ],
-        "Identifying Focal Species (Megaptera novaeangliae)": [
-            str(whale_audio),
-            "What is the scientific name for the focal species in the audio?",
-        ],
-        "Speaker Count (American Crow)": [
-            str(crow_audio),
             "How many individuals are vocalizing in this audio?",
         ],
-        "Caption the audio (Humpback Whale)": [str(whale_audio), "Caption the audio."],
     }
-    gr.set_static_paths(paths=[Path.cwd().absolute() / "assets"])
     with gr.Blocks(
         title="NatureLM-audio",
-        theme=gr.themes.Base(primary_hue="blue", font=[gr.themes.GoogleFont("Noto Sans")]),
     ) as app:
-        with gr.Row():
-            gr.HTML("""
         <div style="display: flex; align-items: center; gap: 12px;">
             <picture>
-                <source srcset="/gradio_api/file=assets/ESP_logo_white.png" media="(prefers-color-scheme: dark)">
-                <source srcset="/gradio_api/file=assets/esp_logo.png" media="(prefers-color-scheme: light)">
-                <img src="/gradio_api/file=assets/esp_logo.png"
                     alt="ESP Logo"
                     style="height: 40px; width: auto;">
             </picture>
@@ -304,8 +361,7 @@ def main(
         with gr.Tabs():
             with gr.Tab("Analyze Audio"):
-                session_id = gr.State(str(uuid.uuid4()))
-                # uploaded_audio = gr.State()
                 # Status indicator
                 # status_text = gr.Textbox(
                 #     value=model_manager.get_status(),
@@ -325,7 +381,7 @@ def main(
                                 <div class="banner-text">Upload your first audio file below or select a pre-loaded example below.</div>
                             </div>
                         </div>
-                        <a href="https://huggingface.co/blog/EarthSpeciesProject/nature-lm-audio-ui-demo/" target="_blank" class="link-btn">View Tutorial</a>
                     </div>
                     """,
                         padding=False,
@@ -338,14 +394,6 @@ def main(
                         interactive=True,
                         sources=["upload"],
                     )
-                    # check that audio duration is greater than MIN_AUDIO_DURATION
-                    # raise
-                    audio_input.change(
-                        fn=check_audio_duration_greater,
-                        inputs=[audio_input],
-                        outputs=[],
-                    )
                 with gr.Accordion(
                     label="Toggle Spectrogram", open=False, visible=False
                 ) as spectrogram:
@@ -398,7 +446,7 @@ def main(
                             lines=1,
                             show_label=False,
                             submit_btn="Send",
-                            container=True,
                             autofocus=False,
                             elem_id="chat-input",
                         )
@@ -420,6 +468,11 @@ def main(
                     updated_history = add_user_query(chatbot_history, chat_input)
                     return updated_history, ""
                 clear_button = gr.ClearButton(
                     components=[chatbot, chat_input, audio_input, plotter],
                     visible=False,
@@ -459,11 +512,19 @@ def main(
                         chat,
                         plotter,
                     ],
                 ).then(
                     fn=make_spectrogram_figure,
                     inputs=[audio_input],
                     outputs=[plotter],
-                )
                 # When submit clicked first:
                 # 1. Validate and add user query to chat history
@@ -482,20 +543,18 @@ def main(
                     lambda: gr.update(visible=True),  # Show clear button
                     None,
                     [clear_button],
-                ).then(
-                    send_data_to_hub,
-                    [chatbot, audio_input, session_id],
-                    None,
                 )
-                clear_button.click(lambda: gr.ClearButton(visible=False), None, [clear_button])
             with gr.Tab("Sample Library"):
                 with gr.Row():
                     with gr.Column():
                         gr.Markdown("### Download Sample Audio")
                         gr.Markdown(
-                            """Feel free to explore these sample audio files. To download, click the button in the top-right corner of each audio file. You can also find a large collection of publicly available animal sounds on
                             [Xenocanto](https://xeno-canto.org/explore/taxonomy) and [Watkins Marine Mammal Sound Database](https://whoicf2.whoi.edu/science/B/whalesounds/index.cfm)."""
                         )
                         samples = [
@@ -508,8 +567,8 @@ def main(
                                 "Green Tree Frog",
                             ),
                             (
-                                "assets/American Crow - Corvus brachyrhynchos.mp3",
-                                "American Crow",
                             ),
                             (
                                 "assets/Gray Wolf - Canis lupus italicus.m4a",
@@ -531,46 +590,33 @@ def main(
                                             type="filepath",
                                             show_download_button=True,
                                         )
             with gr.Tab("💡 Help"):
-                gr.HTML("""
-                        <div class="banner">
-                            <div style="display: flex; padding: 0px; align-items: center; flex: 1;">
-                                <div style="font-size: 20px; margin-right: 12px;"></div>
-                                <div style="flex: 1;">
-                                    <div class="banner-header">Help us improve the model!</div>
-                                    <div class="banner-text">Found an issue or have suggestions? Join us on Discourse to share feedback and questions.</div>
-                                </div>
-                            </div>
-                            <a href="https://earthspeciesproject.discourse.group/t/feedback-for-naturelm-audio-ui-hugging-face-spaces-demo/17" target="_blank" class="link-btn">Share Feedback</a>
-                        </div>
                         <div class="guide-section">
-                            <h3>Getting Started</h3>
                             <ol style="margin-top: 12px; padding-left: 20px; color: #6b7280; font-size: 14px; line-height: 1.6;">
-                                <li style="margin-bottom: 8px;"><strong>Upload your audio</strong> or click on a pre-loaded example. Drag and drop your audio file containing animal vocalizations, or click on an example.</li>
-                                <li style="margin-bottom: 8px;"><strong>Trim your audio (if needed)</strong> by clicking the scissors icon on the bottom right of the audio panel. Try to keep your audio to 10 seconds or less.</li>
-                                <li style="margin-bottom: 8px;"><strong>View the Spectrogram (optional)</strong>. You can easily view/hide the spectrogram of your audio for closer analysis.</li>
-                                <li style="margin-bottom: 8px;"><strong>Select a task or write your own</strong>. Select an option from pre-loaded tasks. This will auto-fill the text box with a prompt, so all you have to do is hit Send. Or, type a custom prompt directly into the chat.</li>
-                                <li style="margin-bottom: 0;"><strong>Send and Analyze Audio</strong>. Press "Send" or type Enter to begin processing your audio. Ask follow-up questions or press "Clear" to start a new conversation.</li>
                             </ol>
                         <p></p>
                         </div>
                         <div class="guide-section">
-                            <h3>Tips</h3>
                                 <b>Prompting Best Practices</b>
-                                <ul style="margin-top: 12px; padding-left: 20px; color: #6b7280; font-size: 14px; line-height: 1.6;">
-                                    <li>When possible, use scientific or taxonomic names and mention the context if known (geographic area/location, time of day or year, habitat type)</li>
-                                    <li>Ask one question at a time, and be specific about what you want to know</li>
-                                        <ul>❌ Don't ask: <i>"Analyze this audio and tell me all you know about it."</i></ul>
-                                        <ul>✅ Do ask: <i>"What species made this sound?"</i></ul>
-                                    <li>Keep prompts more open-ended and avoid asking Yes/No or very targeted questions</li>
-                                        <ul>❌ Don't ask: <i>"Is there a bottlenose dolphin vocalizing in the audio? Yes or No."</i></ul>
-                                        <ul>✅ Do ask: <i>"What focal species, if any, are heard in the audio?"</i></em></ul>
-                                    <li>Giving the model options to choose works well for broader categories (less so for specific species)</li>
-                                        <ul>❌ Don't ask: <i>"Classify the audio into one of the following species: Bottlenose Dolphin, Orca, Great Gray Owl"‍</i></ul>
-                                        <ul>✅ Do ask: <i>"Classify the audio into one of the following categories: Cetaceans, Aves, or None."</i></ul>
                                 </ul>
-                                <br>
                                 <b>Audio Files</b>
                                 <ul style="margin-top: 12px; padding-left: 20px; color: #6b7280; font-size: 14px; line-height: 1.6;">
                                     <li>Supported formats: .wav, .mp3, .aac, .flac, .ogg,  .webm,  .midi, .aiff,  .wma, .opus, .amr</li>
@@ -582,20 +628,32 @@ def main(
                             <div class="guide-section">
                                     <h3>Learn More</h3>
                                     <ul style="margin-top: 12px; padding-left: 20px; color: #6b7280; font-size: 14px; line-height: 1.6;">
-                                        <li>Read our <a href="https://huggingface.co/blog/EarthSpeciesProject/nature-lm-audio-ui-demo/" target="_blank">recent blog post</a> with a step-by-step tutorial</li>
                                         <li>Check out the <a href="https://arxiv.org/abs/2411.07186" target="_blank">published paper</a> for a deeper technical dive on NatureLM-audio.</li>
                                         <li>Visit the <a href="https://earthspecies.github.io/naturelm-audio-demo/" target="_blank">NatureLM-audio Demo Page</a> for additional context, a demo video, and more examples of the model in action.</li>
                                         <li>Sign up for our <a href="https://forms.gle/WjrbmFhKkzmEgwvY7" target="_blank">closed beta waitlist</a>, if you’re interested in testing upcoming features like longer audio files and batch processing.</li>
                                     </ul>
                             </div>
                     </div>
                         """)
             app.css = """
                 #chat-input textarea {
                     background: white;
                     flex: 1;
                 }
                 #chat-input .submit-button {
                     padding: 10px;
                     margin: 2px 6px;
@@ -624,6 +682,7 @@ def main(
                     color: #374151;
                     margin-bottom: 4px;
                 }
                 .banner .banner-text {
                     style="font-size: 14px;
                     color: #6b7280;
@@ -642,10 +701,30 @@ def main(
                     display: inline-block;
                     transition: background 0.2s ease;
                 }
                 .link-btn:hover {
                     background: #2563eb;
                 }
                 .guide-section {
                     margin-bottom: 32px;
                     border-radius: 8px;
@@ -667,10 +746,12 @@ def main(
                     #chat-input {
                         background: #1e1e1e;
                     }
                     #chat-input textarea {
                         background: #1e1e1e;
                         color: white;
                     }
                     .banner {
                         background: #1e1e1e;
                         color: white;
@@ -687,6 +768,8 @@ def main(
 # Create and launch the app
 app = main(
     assets_dir=Path("assets"),
 )
 if __name__ == "__main__":

 import spaces
 import warnings
 import traceback
 import numpy as np
 import gradio as gr
 import torch
 import torchaudio
 import matplotlib.pyplot as plt
 from NatureLM.config import Config
 from NatureLM.models.NatureLM import NatureLM
 from NatureLM.infer import Pipeline
 warnings.filterwarnings("ignore")
 SAMPLE_RATE = 16000  # Default sample rate for NatureLM-audio
+CURRENT_AUDIO = ""  # Placeholder for current audio file
+FIRST_QUERY: bool = True
 def get_spectrogram(audio: torch.Tensor) -> plt.Figure:
     return fig
class ModelManager:
    """Manage lazy loading of the NatureLM inference pipeline and its state.

    The manager starts empty and builds the pipeline on the first call to
    ``load_model``. Three boolean flags describe where that process stands:

    * ``is_loaded``   - the pipeline was built and is stored in ``model``.
    * ``is_loading``  - a ``load_model`` call is currently in progress.
    * ``load_failed`` - the last attempt raised; further attempts are refused
      until ``reset_state`` is called.
    """

    # Hugging Face Hub repository the weights are pulled from.
    MODEL_ID = "EarthSpeciesProject/NatureLM-audio"

    def __init__(self):
        # Annotations are quoted so the class does not depend on which typing
        # names the surrounding module happens to import.
        self.model: "Pipeline | None" = None  # set by load_model()
        self.config: "Config | None" = None  # assigned from outside the class
        self.is_loaded = False
        self.is_loading = False
        self.load_failed = False

    def check_availability(self) -> tuple[bool, str]:
        """Check whether the model repository can be reached on the Hub.

        Returns:
            ``(True, "Model is available")`` when the lookup succeeds, otherwise
            ``(False, "Model not available: <error>")``. Never raises.
        """
        try:
            # Imported lazily so a missing/broken huggingface_hub install is
            # reported as "not available" instead of failing at import time.
            from huggingface_hub import model_info

            model_info(self.MODEL_ID)
        except Exception as e:
            return False, f"Model not available: {e}"
        return True, "Model is available"

    def reset_state(self):
        """Reset the loading state so a failed load can be retried.

        Drops any stored pipeline and clears all three flags; ``config`` is
        left untouched.

        Returns:
            The status string for the freshly reset state.
        """
        self.model = None
        self.is_loaded = False
        self.is_loading = False
        self.load_failed = False
        return self.get_status()

    def get_status(self) -> str:
        """Return a human-readable description of the current loading state."""
        if self.is_loaded:
            return "✅ Model loaded and ready"
        if self.is_loading:
            return "🔄 Loading model... Please wait"
        if self.load_failed:
            return "❌ Model failed to load. Please check the configuration."
        return "⏳ Ready to load model on first use"

    def load_model(self) -> "Pipeline | None":
        """Build the inference pipeline if it has not been built yet.

        Returns:
            The stored pipeline when it is (or has just been) loaded. ``None``
            when a load is already in progress, when a previous attempt failed
            and ``reset_state`` has not been called, or when this attempt
            fails. Errors are printed, never propagated.
        """
        if self.is_loaded:
            return self.model
        if self.is_loading or self.load_failed:
            return None

        self.is_loading = True
        try:
            print("Loading model...")

            # Check if model is available first
            available, message = self.check_availability()
            if not available:
                # `message` already carries the "Model not available: ..."
                # prefix, so it is raised as-is instead of being wrapped again.
                raise RuntimeError(message)

            model = NatureLM.from_pretrained(self.MODEL_ID)
            model = model.eval().to("cuda")
            self.model = Pipeline(model)
            self.is_loaded = True
            print("Model loaded successfully!")
            # Return the pipeline on the first successful load too, matching
            # what later (cached) calls return.
            return self.model
        except Exception as e:
            print(f"Error loading model: {e}")
            self.load_failed = True
            return None
        finally:
            # Cleared on every exit path so the manager cannot get stuck in
            # the "loading" state.
            self.is_loading = False
+# Global model manager instance
+model_manager = ModelManager()
+# @spaces.GPU
+# def load_model():
+#    model_manager.load_model()
 def take_majority_vote(results: list[list[dict]]) -> list[str]:
     """For each audio file, take the majority vote of the labels across all windows"""
     outputs = []
     hop_length_seconds: float = 10.0,
 ) -> list[str]:
     """Generate response using the model
     Args:
         audios (list[str]): List of audio file paths
         queries (list[str] | str): Query or list of queries to process
         window_length_seconds (float): Length of the window for processing audio
         hop_length_seconds (float): Hop length for processing audio
     Returns:
         list[str]: List of generated responses for each audio-query pair
     """
+    if model_manager.model is None:
+        model_manager.load_model()
+    if model_manager.model is None:
+        if model_manager.is_loading:
+            return "🔄 Loading model for the first query. This takes 20-30 seconds..👷🏽‍♂️🔨🪚"
+            # while True:
+            #    if model_manager.is_loaded:
+            #        model = model_manager.model
+            #        break
+            #    elif model_manager.load_failed:
+            #        return "❌ Model failed to load. This could be due to:\n• No internet connection\n• Insufficient disk space\n• Model repository access issues\n\nPlease check your connection and try again using the retry button."
+        elif model_manager.load_failed:
+            return "❌ Model failed to load. This could be due to:\n• No internet connection\n• Insufficient disk space\n• Model repository access issues\n\nPlease check your connection and try again using the retry button."
+        else:
+            return "Demo mode: Model not loaded. Please check the model configuration."
     with torch.amp.autocast(device_type="cuda", dtype=torch.float16):
+        results: list[list[dict]] = model_manager.model(
             audios,
             queries,
             window_length_seconds=window_length_seconds,
         return chatbot_history
     chatbot_history.append({"role": "user", "content": chat_input.strip()})
+    global FIRST_QUERY
+    if FIRST_QUERY:
+        # Add an assistant message indicating model is loading
+        chatbot_history.append(
+            {
+                "role": "assistant",
+                "content": "🔄 Loading model for the first query. This takes 30-40 seconds..👷🏽‍♂️🔨🪚",
+            }
+        )
+        FIRST_QUERY = False
     return chatbot_history
 def get_response(chatbot_history: list[dict], audio_input: str) -> list[dict]:
+    """Generate response from the model based on user input and audio file"""
     try:
+        # Get the last user message from chat history
         last_user_message = ""
         for message in reversed(chatbot_history):
             if message["role"] == "user":
                 last_user_message = message["content"]
                 break
+        print("\nUser message:", last_user_message)
         response = prompt_lm(
             audios=[audio_input],
+            queries=[last_user_message.strip()],
             window_length_seconds=100_000,
             hop_length_seconds=100_000,
         )
         print(f"Error generating response: {e}")
         traceback.print_exc()
         response = "Error generating response. Please try again."
     # Add model response to chat history
     chatbot_history.append({"role": "assistant", "content": response})
 def main(
     assets_dir: Path,
+    cfg_path: str | Path,
+    options: list[str] = [],
 ):
+    # Load configuration
+    try:
+        cfg = Config.from_sources(yaml_file=cfg_path, cli_args=options)
+        model_manager.config = cfg
+        print("Configuration loaded successfully")
+    except Exception as e:
+        print(f"Warning: Could not load config: {e}")
+        print("Running in demo mode")
+        model_manager.config = None
     # Check if assets directory exists, if not create a placeholder
     if not assets_dir.exists():
         print(f"Warning: Assets directory {assets_dir} does not exist")
     laz_audio = assets_dir / "Lazuli_Bunting_yell-YELLLAZB20160625SM303143.mp3"
     frog_audio = assets_dir / "nri-GreenTreeFrogEvergladesNP.mp3"
     robin_audio = assets_dir / "yell-YELLAMRO20160506SM3.mp3"
+    vireo_audio = assets_dir / "yell-YELLWarblingVireoMammoth20150614T29ms.mp3"
     examples = {
         "Identifying Focal Species (Lazuli Bunting)": [
             str(robin_audio),
             "Caption the audio, using the scientific name for any animal species.",
         ],
+        "Caption the audio (Warbling Vireo)": [str(vireo_audio), "Caption the audio."],
+        "Speaker Count (Lazuli Bunting)": [
+            str(laz_audio),
             "How many individuals are vocalizing in this audio?",
         ],
+        "Caption the audio (Green Tree Frog)": [
+            str(frog_audio),
+            "Caption the audio, using the common name for any animal species.",
+        ],
+        "Caption the audio (American Robin)": [
+            str(robin_audio),
+            "Caption the audio, using the scientific name for any animal species.",
+        ],
+        "Caption the audio (Warbling Vireo)": [str(vireo_audio), "Caption the audio."],
     }
     with gr.Blocks(
         title="NatureLM-audio",
+        theme=gr.themes.Base(
+            primary_hue="blue", font=[gr.themes.GoogleFont("Noto Sans")]
+        ),
+        css="styles.css",
     ) as app:
+        header = gr.HTML("""
         <div style="display: flex; align-items: center; gap: 12px;">
             <picture>
+                <source srcset="https://huggingface.co/spaces/EarthSpeciesProject/NatureLM-Audio/resolve/main/assets/ESP_logo_white.png" media="(prefers-color-scheme: dark)">
+                <source srcset="https://huggingface.co/spaces/EarthSpeciesProject/NatureLM-Audio/resolve/main/assets/esp_logo.png" media="(prefers-color-scheme: light)">
+                <img src="https://huggingface.co/spaces/EarthSpeciesProject/NatureLM-Audio/resolve/main/assets/esp_logo.png"
                     alt="ESP Logo"
                     style="height: 40px; width: auto;">
             </picture>
         with gr.Tabs():
             with gr.Tab("Analyze Audio"):
+                uploaded_audio = gr.State()
                 # Status indicator
                 # status_text = gr.Textbox(
                 #     value=model_manager.get_status(),
                                 <div class="banner-text">Upload your first audio file below or select a pre-loaded example below.</div>
                             </div>
                         </div>
+                        <a href="https://www.earthspecies.org/blog" target="_blank" class="link-btn">View Tutorial</a>
                     </div>
                     """,
                         padding=False,
                         interactive=True,
                         sources=["upload"],
                     )
                 with gr.Accordion(
                     label="Toggle Spectrogram", open=False, visible=False
                 ) as spectrogram:
                             lines=1,
                             show_label=False,
                             submit_btn="Send",
+                            container=False,
                             autofocus=False,
                             elem_id="chat-input",
                         )
                     updated_history = add_user_query(chatbot_history, chat_input)
                     return updated_history, ""
+                def update_current_audio(audio_input):
+                    global CURRENT_AUDIO
+                    if audio_input != CURRENT_AUDIO:
+                        CURRENT_AUDIO = audio_input
                 clear_button = gr.ClearButton(
                     components=[chatbot, chat_input, audio_input, plotter],
                     visible=False,
                         chat,
                         plotter,
                     ],
+                ).then(
+                    fn=update_current_audio,
+                    inputs=[audio_input],
+                    outputs=[],
                 ).then(
                     fn=make_spectrogram_figure,
                     inputs=[audio_input],
                     outputs=[plotter],
+                )# .then(
+                #    fn=load_model,
+                #    inputs=[],
+                #    outputs=[],
+                # )
                 # When submit clicked first:
                 # 1. Validate and add user query to chat history
                     lambda: gr.update(visible=True),  # Show clear button
                     None,
                     [clear_button],
                 )
+                clear_button.click(
+                    lambda: gr.ClearButton(visible=False), None, [clear_button]
+                )
             with gr.Tab("Sample Library"):
                 with gr.Row():
                     with gr.Column():
                         gr.Markdown("### Download Sample Audio")
                         gr.Markdown(
+                            """Feel free to explore these sample audio files. To download, click the button in the top-right corner of each audio file, or **Download All**. You can also find a large collection of publicly available animal sounds on
                             [Xenocanto](https://xeno-canto.org/explore/taxonomy) and [Watkins Marine Mammal Sound Database](https://whoicf2.whoi.edu/science/B/whalesounds/index.cfm)."""
                         )
                         samples = [
                                 "Green Tree Frog",
                             ),
                             (
+                                "assets/Eastern Gray Squirrel - Sciurus carolinensis.wav",
+                                "Eastern Gray Squirrel",
                             ),
                             (
                                 "assets/Gray Wolf - Canis lupus italicus.m4a",
                                             type="filepath",
                                             show_download_button=True,
                                         )
+                        with gr.Row():
+                            gr.HTML("""<center>
+                                <a href="https://huggingface.co/spaces/EarthSpeciesProject/NatureLM-Audio/resolve/main/assets/Sample_Audio_Files_NatureLM_audio.zip" download class="download-btn">Download All</a></center>
+                            """)
             with gr.Tab("💡 Help"):
+                gr.HTML("""
                         <div class="guide-section">
+                            <h3>Getting Started</h3>
                             <ol style="margin-top: 12px; padding-left: 20px; color: #6b7280; font-size: 14px; line-height: 1.6;">
+                                <li style="margin-bottom: 8px;"><strong>Upload your audio</strong> - Click the upload area or drag and drop your audio file containing animal vocalizations.</li>
+                                <li style="margin-bottom: 8px;"><strong>Trim your audio (if needed)</strong> - Try to keep your audio to 10 seconds or less.</li>
+                                <li style="margin-bottom: 8px;"><strong>View the Spectrogram (optional)</strong> - You can easily view/hide the spectrogram of your audio for closer analysis.</li>
+                                <li style="margin-bottom: 8px;"><strong>Select a task or write your own</strong> - Select an option from pre-loaded tasks. This will auto-fill the text box with a prompt, so all you have to do is hit Send. Or, type a custom prompt directly into the chat.</li>
+                                <li style="margin-bottom: 0;"><strong>Send and Analyze Audio</strong> - Press "Send" or type Enter to begin processing your audio. Ask follow-up questions or press "Clear" to start a new conversation.</li>
                             </ol>
                         <p></p>
                         </div>
                         <div class="guide-section">
+                            <h3>Tips & Tricks</h3>
                                 <b>Prompting Best Practices</b>
+                                <ul style="margin-top: 12px; padding-left: 20px; color: #6b7280; font-size: 14px; line-height: 1.6;">
+                                    <li>Be specific about what you want to know (e.g., "What species made this call?" vs "Analyze this audio")</li>
+                                    <li>Mention the context if known (geographic area/location, time of day or year, habitat type)</li>
+                                    <li>[TO ADD: examples of classification prompts that do and don't work well]</li>
                                 </ul>
                                 <b>Audio Files</b>
                                 <ul style="margin-top: 12px; padding-left: 20px; color: #6b7280; font-size: 14px; line-height: 1.6;">
                                     <li>Supported formats: .wav, .mp3, .aac, .flac, .ogg,  .webm,  .midi, .aiff,  .wma, .opus, .amr</li>
                             <div class="guide-section">
                                     <h3>Learn More</h3>
                                     <ul style="margin-top: 12px; padding-left: 20px; color: #6b7280; font-size: 14px; line-height: 1.6;">
+                                        <li>Read our <a href="https://earthspecies.org/blog" target="_blank">recent blog post</a> with a step-by-step tutorial</li>
                                         <li>Check out the <a href="https://arxiv.org/abs/2411.07186" target="_blank">published paper</a> for a deeper technical dive on NatureLM-audio.</li>
                                         <li>Visit the <a href="https://earthspecies.github.io/naturelm-audio-demo/" target="_blank">NatureLM-audio Demo Page</a> for additional context, a demo video, and more examples of the model in action.</li>
                                         <li>Sign up for our <a href="https://forms.gle/WjrbmFhKkzmEgwvY7" target="_blank">closed beta waitlist</a>, if you’re interested in testing upcoming features like longer audio files and batch processing.</li>
                                     </ul>
+                            </div>
+                            <div class="guide-section">
+                                    <h4>Help us improve the model!</h4>
+                                    <p>Found an issue or have suggestions? Please join us on <a href="https://earthspeciesproject.discourse.group/" target="_blank">Discourse</a> to share any feedback, questions, bug reports, or other ideas. Your input helps make NatureLM-audio better for everyone.</p>
                             </div>
                     </div>
                         """)
             app.css = """
+                #chat-input {
+                    background: white;
+                    padding: 10px;
+                    min-height: 44px;
+                    display: flex;
+                    align-items: center;
+                }
                 #chat-input textarea {
                     background: white;
                     flex: 1;
                 }
                 #chat-input .submit-button {
                     padding: 10px;
                     margin: 2px 6px;
                     color: #374151;
                     margin-bottom: 4px;
                 }
                 .banner .banner-text {
                     style="font-size: 14px;
                     color: #6b7280;
                     display: inline-block;
                     transition: background 0.2s ease;
                 }
                 .link-btn:hover {
                     background: #2563eb;
+                }
+                .download-btn {
+                    padding: 10px 20px;
+                    border-radius: 6px;
+                    font-size: 13px;
+                    font-weight: 500;
+                    cursor: pointer;
+                    border: none;
+                    background: #3b82f6;
+                    color: white;
+                    text-decoration: none;
+                    display: block;
+                    text-align: center;
+                    transition: background 0.2s ease;
+                    width: 200px;
+                    box-sizing: border-box;
+                }
+                .download-btn:hover {
+                    background: #2563eb;
                 }
                 .guide-section {
                     margin-bottom: 32px;
                     border-radius: 8px;
                     #chat-input {
                         background: #1e1e1e;
                     }
                     #chat-input textarea {
                         background: #1e1e1e;
                         color: white;
                     }
                     .banner {
                         background: #1e1e1e;
                         color: white;
 # Create and launch the app
 app = main(
     assets_dir=Path("assets"),
+    cfg_path=Path("configs/inference.yml"),
+    options=[],
 )
 if __name__ == "__main__":

assets/American Crow - Corvus brachyrhynchos.mp3 DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d0f76bff28d3e3021be495754b28ef3924bc32ff0c657b67bd4ee6bb177a1f8e
-size 2164626

configs/inference.yml CHANGED Viewed

@@ -59,4 +59,3 @@ generate:
   temperature: 0.1
   repetition_penalty: 1.0
   length_penalty: 1.0
-  merging_alpha: 0.5

   temperature: 0.1
   repetition_penalty: 1.0
   length_penalty: 1.0

data_store.py DELETED Viewed

@@ -1,58 +0,0 @@
-import os
-from pathlib import Path
-import uuid
-import json
-from huggingface_hub import HfApi, HfFileSystem
-DATASET_REPO = "EarthSpeciesProject/naturelm-audio-space-logs"
-SPLIT = "test"
-TESTING = os.getenv("TESTING", "0") == "1"
-api = HfApi(token=os.getenv("HF_TOKEN", None))
-# Upload audio
-# check if file exists
-hf_fs = HfFileSystem(token=os.getenv("HF_TOKEN", None))
-def upload_data(audio: str | Path, user_text: str, model_response: str, session_id: str = ""):
-    data_id = str(uuid.uuid4())
-    if TESTING:
-        data_id = "test-" + data_id
-        session_id = "test-" + session_id
-    # Audio path in repo
-    suffix = Path(audio).suffix
-    audio_p = f"{SPLIT}/audio/" + session_id + suffix
-    if not hf_fs.exists(f"datasets/{DATASET_REPO}/{audio_p}"):
-        api.upload_file(
-            path_or_fileobj=str(audio),
-            path_in_repo=audio_p,
-            repo_id=DATASET_REPO,
-            repo_type="dataset",
-        )
-    text = {
-        "user_message": user_text,
-        "model_response": model_response,
-        "file_name": "audio/" + session_id + suffix,  # has to be relative to metadata.jsonl
-        "original_fn": os.path.basename(audio),
-        "id": data_id,
-        "session_id": session_id,
-    }
-    # Append to a jsonl file in the repo
-    # Appending in place doesn't work, so read the existing file first and rewrite it in full
-    if hf_fs.exists(f"datasets/{DATASET_REPO}/{SPLIT}/metadata.jsonl"):
-        with hf_fs.open(f"datasets/{DATASET_REPO}/{SPLIT}/metadata.jsonl", "r") as f:
-            lines = f.readlines()
-        lines.append(json.dumps(text) + "\n")
-        with hf_fs.open(f"datasets/{DATASET_REPO}/{SPLIT}/metadata.jsonl", "w") as f:
-            f.writelines(lines)
-    else:
-        with hf_fs.open(f"datasets/{DATASET_REPO}/{SPLIT}/metadata.jsonl", "w") as f:
-            f.write(json.dumps(text) + "\n")
-    # Write a separate file instead
-    # with hf_fs.open(f"datasets/{DATASET_REPO}/{data_id}.json", "w") as f:
-    #     json.dump(text, f)

requirements.txt CHANGED Viewed

@@ -14,7 +14,6 @@ soundfile>=0.13.1
 spaces>=0.40.0
 torch>=2.8.0
 torchaudio>=2.8.0
-torchcodec>=0.8.0
 tqdm>=4.67.1
-transformers[sentencepiece]==4.55.3
 matplotlib>=3.10.5

 spaces>=0.40.0
 torch>=2.8.0
 torchaudio>=2.8.0
 tqdm>=4.67.1
+transformers[sentencepiece]>=4.55.2
 matplotlib>=3.10.5