import os
import tempfile

import gradio as gr
import librosa
import numpy as np
import torch
import uvicorn
from fastapi import FastAPI, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM

# === CONFIGURATION ===
# Get the token from an environment variable (set this in your Space secrets).
HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")

# Use a smaller Gemma model for faster loading.
MODEL_NAME = "google/gemma-2b-it"  # the 2B version loads faster than 7B

# Log in to Hugging Face (required for gated models).
try:
    if HF_TOKEN and HF_TOKEN != "your_hf_token_here":
        login(token=HF_TOKEN)
        print("✅ Authenticated with Hugging Face Hub")
    else:
        print("⚠️ No HF_TOKEN provided, using fallback method")
except Exception as e:
    print(f"⚠️ Authentication warning: {e}")


class GemmaAudioEmotionAnalyzer:
    def __init__(self, model_name: str = MODEL_NAME):
        self.model_name = model_name
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"🚀 Using device: {self.device}")

        try:
            print("📥 Loading Gemma tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                token=HF_TOKEN if HF_TOKEN != "your_hf_token_here" else None,
                trust_remote_code=True,
            )

            print("📥 Loading Gemma model...")
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                token=HF_TOKEN if HF_TOKEN != "your_hf_token_here" else None,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                device_map="auto" if self.device == "cuda" else None,
                trust_remote_code=True,
            )

            # Gemma ships without a pad token; fall back to EOS.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            print("✅ Gemma model loaded successfully!")
        except Exception as e:
            print(f"❌ Failed to load Gemma: {e}")
            print("🔧 Using fallback rule-based analyzer")
            self.model = None
            self.tokenizer = None

    def extract_fast_features(self, audio_path: str) -> dict:
        """Extract a minimal feature set quickly."""
        try:
            # Load only the first 3 seconds for speed.
            y, sr = librosa.load(audio_path, sr=16000, duration=3)

            # Run piptrack once and keep only voiced (non-zero) pitch bins;
            # np.median of an empty array is NaN, so guard explicitly.
            pitches, _ = librosa.piptrack(y=y, sr=sr)
            voiced = pitches[pitches > 0]
            pitch = float(np.median(voiced)) if voiced.size > 0 else 150.0

            features = {
                'energy': float(np.mean(librosa.feature.rms(y=y))),
                'brightness': float(np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))),
                'pitch': pitch,
                'tempo': float(librosa.beat.tempo(y=y, sr=sr)[0]),
                'speech_rate': float(np.mean(librosa.feature.zero_crossing_rate(y))),
            }
            return features
        except Exception as e:
            print(f"❌ Feature extraction error: {e}")
            return {'energy': 0.05, 'brightness': 1500, 'pitch': 200, 'tempo': 100, 'speech_rate': 0.1}

    def create_gemma_prompt(self, features: dict) -> str:
        """Create an optimized prompt for Gemma."""
        prompt = f"""Analyze the emotional content from these audio features:

Audio Characteristics:
- Energy Level: {"High" if features['energy'] > 0.08 else "Low" if features['energy'] < 0.03 else "Medium"}
- Brightness: {"Bright" if features['brightness'] > 2000 else "Dark" if features['brightness'] < 1000 else "Neutral"}
- Average Pitch: {"High" if features['pitch'] > 250 else "Low" if features['pitch'] < 150 else "Medium"}
- Tempo: {"Fast" if features['tempo'] > 140 else "Slow" if features['tempo'] < 90 else "Moderate"}
- Speech Rate: {"Rapid" if features['speech_rate'] > 0.15 else "Slow" if features['speech_rate'] < 0.08 else "Normal"}

Based on these acoustic properties, identify the primary emotion. Choose ONE from: happy, sad, angry, fearful, neutral, excited, calm.
Respond in this exact format:
Emotion: [emotion]
Confidence: [high/medium/low]
Reason: [brief reason based on features]

Analysis:"""
        return prompt

    def generate_with_gemma(self, prompt: str) -> str:
        """Generate a response from Gemma with settings tuned for speed."""
        if self.model is None:
            return "Emotion: neutral\nConfidence: medium\nReason: Using fallback analysis"

        try:
            # Tokenize.
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                max_length=512,
                truncation=True,
            ).to(self.device)

            # Generate with settings optimized for speed.
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=100,  # keep the response short
                    temperature=0.7,
                    do_sample=True,
                    top_p=0.9,
                    pad_token_id=self.tokenizer.eos_token_id,
                    repetition_penalty=1.1,
                )

            # Decode and strip the echoed prompt.
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            return response[len(prompt):].strip()
        except Exception as e:
            print(f"❌ Gemma generation error: {e}")
            return "Emotion: neutral\nConfidence: low\nReason: Analysis unavailable"

    def parse_gemma_response(self, response: str) -> dict:
        """Parse Gemma's response into a structured result."""
        result = {
            'emotion': 'neutral',
            'confidence': 'medium',
            'reason': 'No analysis provided',
            'raw_response': response,
        }

        for line in response.split('\n'):
            line = line.strip()
            if line.startswith('Emotion:'):
                result['emotion'] = line.split(':', 1)[1].strip().lower()
            elif line.startswith('Confidence:'):
                result['confidence'] = line.split(':', 1)[1].strip().lower()
            elif line.startswith('Reason:'):
                result['reason'] = line.split(':', 1)[1].strip()

        return result

    def analyze_emotion(self, audio_path: str) -> dict:
        """Main analysis pipeline."""
        print(f"🎵 Analyzing: {os.path.basename(audio_path)}")

        # Step 1: extract features (fast).
        features = self.extract_fast_features(audio_path)

        # Step 2: build the prompt.
        prompt = self.create_gemma_prompt(features)

        # Step 3: get Gemma's analysis.
        print("🤖 Querying Gemma...")
        gemma_response = self.generate_with_gemma(prompt)

        # Step 4: parse the response.
        result = self.parse_gemma_response(gemma_response)
        result['features'] = features

        print(f"✅ Gemma result: {result['emotion']}")
        return result


# Initialize the analyzer once at startup.
print("🔄 Initializing Gemma Audio Analyzer...")
analyzer = GemmaAudioEmotionAnalyzer()


def process_audio(audio_path: str) -> str:
    """Gradio interface function."""
    if not audio_path:
        return "❌ Please provide an audio file"

    try:
        result = analyzer.analyze_emotion(audio_path)

        # Format the output.
        emotion_icons = {
            'happy': '😊', 'sad': '😢', 'angry': '😠', 'fearful': '😨',
            'neutral': '😐', 'excited': '🤩', 'calm': '😌',
        }
        icon = emotion_icons.get(result['emotion'], '🎭')

        output = f"""
{icon} **Emotion**: {result['emotion'].title()}
📊 **Confidence**: {result['confidence'].title()}
💭 **Reason**: {result['reason']}

🔬 **Audio Analysis**:
• Energy: {result['features']['energy']:.3f}
• Brightness: {result['features']['brightness']:.0f} Hz
• Pitch: {result['features']['pitch']:.0f} Hz
• Tempo: {result['features']['tempo']:.0f} BPM

🤖 **Powered by Google Gemma**
"""
        return output
    except Exception as e:
        return f"❌ Error: {str(e)}"


# Create the Gradio interface.
demo = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(
        sources=["upload"],
        type="filepath",
        label="Upload Audio File",
        max_length=10,  # limit to 10 seconds for faster processing
    ),
    outputs=gr.Markdown(label="Gemma Emotion Analysis"),
    title="🎵 Audio Emotion Analysis with Google Gemma",
    description="Upload audio to analyze emotions using Google's Gemma model",
    examples=[],
    allow_flagging="never",
)
app = FastAPI()

# Enable CORS so your gateway can call this app.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.post("/api/analyze")
async def api_analyze(audio: UploadFile = File(...)):
    """API endpoint for programmatic access."""
    # Save the uploaded file to a temporary path.
    suffix = os.path.splitext(audio.filename or "audio.wav")[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f:
        f.write(await audio.read())
        temp_path = f.name

    try:
        # Reuse the existing emotion-detection pipeline.
        return analyzer.analyze_emotion(temp_path)
    finally:
        os.remove(temp_path)


# Mount Gradio onto FastAPI (this combines both!).
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    print("🚀 Starting Gemma Audio Emotion Analyzer...")
    # Serve the combined FastAPI + Gradio app with uvicorn; calling
    # demo.launch() here would start a second, separate server and the
    # /api/analyze endpoint would never be reachable.
    uvicorn.run(app, host="0.0.0.0", port=7860)
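# --- Usage sketch (illustrative, not executed) --------------------------------
# A minimal client for the /api/analyze endpoint above, assuming the server is
# running locally on port 7860 (as configured in uvicorn.run). The filename
# "sample.wav" and the use of the `requests` library are assumptions for the
# example, not part of this app.
#
#   import requests
#
#   with open("sample.wav", "rb") as f:
#       resp = requests.post(
#           "http://localhost:7860/api/analyze",
#           files={"audio": ("sample.wav", f, "audio/wav")},
#       )
#   print(resp.json())  # {'emotion': ..., 'confidence': ..., 'reason': ..., 'features': {...}}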