"""Species Audio Classifier (Gradio app).

Pipeline: an uploaded recording is split into 5-second chunks, each chunk is
turned into a dB-scaled spectrogram, and a Keras classifier (downloaded from
the Hugging Face Hub) predicts a species per chunk.  A confidence-weighted
vote across chunks yields the final prediction.  Users may optionally submit
their audio plus written feedback to a private Hugging Face dataset for model
improvement.
"""

import gc
import json
import os
import shutil
import uuid
from datetime import datetime

import gradio as gr
import librosa
import numpy as np
import pandas as pd
import tensorflow as tf
import torchaudio
from huggingface_hub import HfApi, hf_hub_download

# All classifiers live in a single Hugging Face model repository.
_REPO_ID = "RonaldCeballos/SpeciesClassifiers"

# Spectrogram shape the models were trained on:
# 1025 frequency bins (n_fft=2048 -> n_fft/2 + 1) x 313 time frames.
TARGET_SPEC_SHAPE = (1025, 313)

# Length of each analysis window, in seconds.
CHUNK_SECONDS = 5


def _species_models(folder):
    """Build the per-version file map for one species folder in the repo.

    The repository layout is identical for every species: the "Base" model
    sits in ``models/<folder>/base/`` with its own classes file, while M1/M2
    live in ``models/<folder>/models/`` and share a classes file under
    ``models/<folder>/class/``.
    """
    entries = {
        "Base": {
            "repo_id": _REPO_ID,
            "model_file": f"models/{folder}/base/model_base.h5",
            "classes_file": f"models/{folder}/base/classes.npy",
        }
    }
    for version in ("M1", "M2"):
        entries[version] = {
            "repo_id": _REPO_ID,
            "model_file": f"models/{folder}/models/model_{version}.h5",
            "classes_file": f"models/{folder}/class/classes.npy",
        }
    return entries


# Species category -> model version -> Hugging Face file locations.
MODEL_CONFIG = {
    "Amphibians": _species_models("amphibia"),
    "Mammals": _species_models("mammals"),
    "Insects": _species_models("insects"),
}

# Cache of loaded (model, classes) pairs, keyed by "<species>_<version>".
model_cache = {}
current_audio_path = None  # path of the audio currently being analyzed
current_audio_name = None  # original file name of that audio


def load_model_from_hub(species_type, model_version):
    """Download (or fetch from cache) a classifier and its class labels.

    Returns:
        (model, classes) on success, (None, None) on any failure — callers
        must check for None before use.
    """
    cache_key = f"{species_type}_{model_version}"
    if cache_key in model_cache:
        return model_cache[cache_key]
    try:
        config = MODEL_CONFIG[species_type][model_version]
        print(f"Downloading {species_type} - {model_version}...")
        # BUGFIX: the original passed cache_dir=tempfile.mkdtemp(), creating a
        # fresh directory per call — that defeats the hub's download cache and
        # leaks temp dirs.  The default shared HF cache is used instead.
        model_path = hf_hub_download(
            repo_id=config["repo_id"],
            filename=config["model_file"],
        )
        classes_path = hf_hub_download(
            repo_id=config["repo_id"],
            filename=config["classes_file"],
        )
        # compile=False: inference only, no optimizer state needed.
        model = tf.keras.models.load_model(model_path, compile=False)
        classes = np.load(classes_path)
        model_cache[cache_key] = (model, classes)
        gc.collect()
        print(f"Model {species_type} - {model_version} loaded successfully")
        return model, classes
    except Exception as e:
        print(f"Error loading {species_type} - {model_version}: {str(e)}")
        return None, None


def predict_with_model(spec, model, classes):
    """Classify one spectrogram; return (predicted_class, probability)."""
    # The models expect a fixed input shape; resize if the chunk's sample
    # rate produced a different spectrogram geometry.
    if spec.shape != TARGET_SPEC_SHAPE:
        spec = resize_spectrogram(spec, TARGET_SPEC_SHAPE)
    # Add channel and batch axes: (H, W) -> (1, H, W, 1), as in the notebook.
    arr = np.expand_dims(spec[..., np.newaxis], axis=0).astype('float32')
    # Peak scaling, as in the training notebook.  BUGFIX: guard against a
    # zero peak (e.g. an all-zero spectrogram) to avoid NaNs.
    peak = np.max(arr)
    X = arr / peak if peak != 0 else arr
    pred = model.predict(X, verbose=0)
    pred_class_idx = int(np.argmax(pred))
    pred_class = str(classes[pred_class_idx])
    prob = float(pred[0][pred_class_idx])
    return pred_class, prob


def extract_chunks(audio_clean, sr, time=CHUNK_SECONDS):
    """Split ``audio_clean`` into fixed-length chunks of ``time`` seconds.

    The last chunk is padded circularly (wrapping back to the start of the
    recording) so every chunk has exactly ``sr * time`` samples, matching the
    notebook's ``ext_chunks``.

    Returns:
        2-D numpy array of shape (n_chunks, sr * time); empty for empty input.
    """
    n_samples = sr * time
    total = len(audio_clean)
    if total == 0:
        # BUGFIX: empty input previously produced a malformed array; return a
        # well-shaped empty batch so callers report "no valid segments".
        return np.empty((0, n_samples), dtype=np.float32)
    chunks = []
    for start in range(0, total, n_samples):
        end = start + n_samples
        if end <= total:
            chunk = audio_clean[start:end]
        else:
            # Circular padding.  np.resize tiles the wrapped signal cyclically,
            # which also covers recordings shorter than a single chunk (the
            # original produced an under-length chunk in that case).
            wrapped = np.concatenate([audio_clean[start:], audio_clean])
            chunk = np.resize(wrapped, n_samples)
        chunks.append(chunk)
    return np.array(chunks)


def create_spectrogram(array_audio, n_fft=2048):
    """Return a dB-scaled magnitude spectrogram of ``array_audio``.

    Accepts a numpy array or a torch tensor (converted via ``.numpy()``).
    The original's tensor branch silently ignored ``n_fft``; since librosa's
    default is also 2048, unifying the branches preserves behavior.
    """
    if not isinstance(array_audio, np.ndarray):
        array_audio = array_audio.numpy()
    magnitude = np.abs(librosa.stft(array_audio, n_fft=n_fft))
    return librosa.amplitude_to_db(magnitude, ref=np.max)


def resize_spectrogram(spec, target_shape):
    """Bilinearly resize a 2-D spectrogram to ``target_shape``."""
    from scipy import ndimage
    zoom_factors = (target_shape[0] / spec.shape[0],
                    target_shape[1] / spec.shape[1])
    return ndimage.zoom(spec, zoom_factors, order=1)  # order=1: bilinear


def _analyze_chunks(model, classes, audio_file):
    """Run the classifier over every 5-second chunk of ``audio_file``.

    Shared core of predict_species_all_chunks / predict_species_final
    (previously duplicated in both).

    Returns:
        (rows, votes, n_chunks): per-chunk result dicts for the UI table, a
        dict of confidence-weighted votes per species, and the chunk count.
    """
    wav, sr = torchaudio.load(audio_file)
    wav = wav.mean(dim=0)  # downmix to mono
    chunks = extract_chunks(wav.numpy(), sr, time=CHUNK_SECONDS)
    rows = []
    votes = {}
    for i, chunk in enumerate(chunks):
        spectrogram = create_spectrogram(chunk)
        # Z-score normalization, as in the training notebook.  BUGFIX: guard
        # against a zero std (perfectly uniform/silent chunk) -> NaNs.
        std = np.std(spectrogram)
        spectrogram = (spectrogram - np.mean(spectrogram)) / (std if std else 1.0)
        species, confidence = predict_with_model(spectrogram, model, classes)
        votes[species] = votes.get(species, 0.0) + confidence
        time_start = i * CHUNK_SECONDS
        time_end = (i + 1) * CHUNK_SECONDS
        rows.append({
            'Segment': f'{i+1}',
            'Time': f'{time_start}s - {time_end}s',
            'Species': species,
            'Confidence': f'{confidence:.1%}',
        })
    return rows, votes, len(chunks)


def predict_species_all_chunks(species_type, model_version, audio_file):
    """Per-chunk analysis without a final vote (kept for compatibility).

    Returns a DataFrame of per-segment predictions, or a one-cell
    Info/Error DataFrame on failure.
    """
    global current_audio_path, current_audio_name
    if audio_file is None:
        return pd.DataFrame({"Info": ["Please upload an audio file"]})
    try:
        # Remember the audio so the feedback form can upload it later.
        current_audio_path = audio_file
        current_audio_name = os.path.basename(audio_file)
        model, classes = load_model_from_hub(species_type, model_version)
        if model is None or classes is None:
            return pd.DataFrame({"Error": [f"Could not load {species_type} - {model_version} model"]})
        rows, _votes, _n_chunks = _analyze_chunks(model, classes, audio_file)
        # NOTE: the model stays alive in model_cache; the original's
        # `del model` only dropped the local reference and freed nothing.
        gc.collect()
        if not rows:
            return pd.DataFrame({"Info": ["No valid segments detected in the audio"]})
        return pd.DataFrame(rows)
    except Exception as e:
        print(f"Prediction error: {str(e)}")
        return pd.DataFrame({"Error": [f"Error during analysis: {str(e)}"]})


def predict_species_final(species_type, model_version, audio_file):
    """Analyze all chunks and append a confidence-weighted final prediction.

    The final row's species is the one with the largest summed confidence;
    its displayed confidence is that sum averaged over ALL chunks, so
    dissenting chunks lower the final score.
    """
    global current_audio_path, current_audio_name
    if audio_file is None:
        return pd.DataFrame({"Info": ["Please upload an audio file"]})
    try:
        current_audio_path = audio_file
        current_audio_name = os.path.basename(audio_file)
        model, classes = load_model_from_hub(species_type, model_version)
        if model is None or classes is None:
            return pd.DataFrame({"Error": [f"Could not load {species_type} - {model_version} model"]})
        rows, votes, n_chunks = _analyze_chunks(model, classes, audio_file)
        results_df = pd.DataFrame(rows)
        if votes:
            final_species = max(votes, key=votes.get)
            final_confidence = votes[final_species] / n_chunks
            final_row = pd.DataFrame({
                'Segment': ['FINAL'],
                'Time': ['Full Audio'],
                'Species': [final_species],
                'Confidence': [f'{final_confidence:.1%}'],
            })
            results_df = pd.concat([results_df, final_row], ignore_index=True)
        gc.collect()
        if results_df.empty:
            return pd.DataFrame({"Info": ["No valid segments detected in the audio"]})
        return results_df
    except Exception as e:
        print(f"Prediction error: {str(e)}")
        return pd.DataFrame({"Error": [f"Error during analysis: {str(e)}"]})


def save_feedback_to_dataset(audio_file_path, original_audio_name, feedback_text,
                             consent_given, species_type, model_version, results_df):
    """Upload the analyzed audio plus user feedback to a private HF dataset.

    Requires an ``HF_TOKEN`` Space secret with write access to the dataset
    repo.  Returns a user-facing status string and never raises; temp copies
    are always removed (success or failure).
    """
    if not consent_given:
        return "❌ You must accept the consent to submit feedback."
    new_audio_path = None
    metadata_path = None
    try:
        hf_token = os.getenv('HF_TOKEN')
        if not hf_token:
            return "❌ HF_TOKEN not found. Please set it in Space secrets."
        api = HfApi(token=hf_token)
        repo_id = "RonaldCeballos/Audios-Feedback"  # private feedback dataset

        temp_dir = "temp_uploads"
        os.makedirs(temp_dir, exist_ok=True)

        # Unique suffix keeps concurrent submissions from clobbering each
        # other while preserving the original base name and extension.
        original_base = os.path.splitext(original_audio_name)[0]
        unique_id = uuid.uuid4().hex[:8]
        extension = os.path.splitext(original_audio_name)[1] or '.wav'
        audio_filename = f"{original_base}_{unique_id}{extension}"
        new_audio_path = os.path.join(temp_dir, audio_filename)
        shutil.copy(audio_file_path, new_audio_path)

        metadata = {
            'timestamp': datetime.now().isoformat(),
            'original_audio_name': original_audio_name,
            'audio_file': audio_filename,
            'feedback': feedback_text,
            'consent_given': consent_given,
            'species_type': species_type,
            'model_version': model_version,
            'analysis_results': results_df.to_dict() if results_df is not None else {},
        }
        metadata_filename = f"{original_base}_{unique_id}.json"
        metadata_path = os.path.join(temp_dir, metadata_filename)
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)

        api.upload_file(
            path_or_fileobj=new_audio_path,
            path_in_repo=f"audios/{audio_filename}",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=f"Add audio feedback: {audio_filename}",
        )
        api.upload_file(
            path_or_fileobj=metadata_path,
            path_in_repo=f"metadata/{metadata_filename}",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=f"Add metadata for: {audio_filename}",
        )
        return f"✅ Audio '{original_audio_name}' and feedback saved successfully for model improvement!"
    except Exception as e:
        print(f"Error saving to dataset: {e}")
        return f"❌ Error saving feedback: {str(e)}"
    finally:
        # Cleanup was previously duplicated on the success and error paths;
        # a finally block covers both.
        for path in (new_audio_path, metadata_path):
            if path and os.path.exists(path):
                os.remove(path)


def submit_feedback(feedback_text, consent_checkbox, species_type, model_version, results_df):
    """Validate and forward a feedback submission to the private dataset."""
    global current_audio_path, current_audio_name
    if not feedback_text or not feedback_text.strip():
        return "📝 Please write your comment"
    if current_audio_path is None:
        return "❌ No audio file available for feedback"
    return save_feedback_to_dataset(
        current_audio_path,
        current_audio_name,
        feedback_text,
        consent_checkbox,
        species_type,
        model_version,
        results_df,
    )


def clear_interface():
    """Reset the UI to its defaults and drop the remembered audio."""
    global current_audio_path, current_audio_name
    current_audio_path = None
    current_audio_name = None
    gc.collect()
    # Order matches clear_btn's outputs: audio, results, species, model,
    # feedback text, consent checkbox.
    return None, pd.DataFrame(), "Amphibians", "Base", "", False


# ---------------------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------------------
with gr.Blocks(
    title="Species Audio Classifier",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container { max-width: 1200px; margin: auto; }
    .consent-text { font-size: 0.9em; color: #666; }
    .final-prediction { background-color: #e8f5e8 !important; font-weight: bold; }
    """,
) as demo:
    # Holds the last results DataFrame so the feedback form can attach it.
    current_results = gr.State(value=pd.DataFrame())

    gr.Markdown("""
    ## Species Audio Classifier
    **Upload an audio file to identify species using AI models**

    *Based on your notebook implementation - Models are loaded from: [RonaldCeballos/SpeciesClassifiers](https://huggingface.co/RonaldCeballos/SpeciesClassifiers)*

    🔍 **How it works:**
    - Audio is split into 5-second segments
    - Each segment is converted to a spectrogram
    - AI model predicts species for each segment
    - Final prediction is based on voting across all segments
    """)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Configuration")
            species_selector = gr.Dropdown(
                choices=list(MODEL_CONFIG.keys()),
                label="🎯 Species Category",
                value="Amphibians",
                info="Select the type of species to identify",
            )
            model_selector = gr.Dropdown(
                choices=["Base", "M1", "M2"],
                label="🔧 Model Version",
                value="Base",
                info="Choose the model version to use",
            )
            audio_input = gr.Audio(
                label="Upload Audio File",
                type="filepath",
                sources=["upload"],
                waveform_options={"show_controls": True},
            )
            with gr.Row():
                predict_btn = gr.Button("🔍 Analyze Audio", variant="primary")
                clear_btn = gr.Button("🔄 Clear", variant="secondary")
            gr.Markdown("""
            ### 💡 Instructions:
            1. Select species category
            2. Choose model version
            3. Upload audio file (WAV, MP3, etc.)
            4. Click "Analyze Audio"
            5. Review results by 5-second segments
            6. Final prediction shown at the bottom
            """)

        with gr.Column(scale=2):
            gr.Markdown("### 📊 Results")
            results_display = gr.Dataframe(
                label="🎧 Analyzed Chunks",
                headers=["Chunks", "Time", "Species", "Confidence"],
                wrap=True,
                max_height=500,
                datatype=["str", "str", "str", "str"],
            )

            with gr.Accordion("💬 Submit Feedback for Model Improvement", open=False):
                gr.Markdown("""
                **Help us improve!** Submit your audio and feedback to our private dataset for model training.

                *Using the same approach as your notebook implementation*
                """)
                consent_checkbox = gr.Checkbox(
                    label="I consent to share this audio and my feedback for research and model improvement purposes",
                    info="Your data will be stored in a private Hugging Face dataset",
                )
                feedback_input = gr.Textbox(
                    label="Your Feedback",
                    placeholder="Example: Species X was misidentified as Y...",
                    lines=3,
                )
                feedback_btn = gr.Button("📤 Submit Feedback & Audio", variant="primary")
                feedback_status = gr.Textbox(
                    label="Submission Status",
                    interactive=False,
                )

    # Event handlers.
    predict_btn.click(
        fn=predict_species_final,  # enhanced version with voting
        inputs=[species_selector, model_selector, audio_input],
        outputs=results_display,
    ).then(
        # Mirror the displayed results into state for the feedback upload.
        fn=lambda results: results,
        inputs=[results_display],
        outputs=[current_results],
    )

    feedback_btn.click(
        fn=submit_feedback,
        inputs=[feedback_input, consent_checkbox, species_selector, model_selector, current_results],
        outputs=feedback_status,
    )

    clear_btn.click(
        fn=clear_interface,
        inputs=None,
        outputs=[audio_input, results_display, species_selector, model_selector, feedback_input, consent_checkbox],
    )


# Configuration for Spaces.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )