jatinsabari committed
Commit 5c4445f · verified · 1 parent: f03f067

Create app.py

Files changed (1): app.py (+371 lines)
app.py ADDED

import gradio as gr
import librosa
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
from typing import Dict, Any

# Model configuration
MODEL_NAME = "google/gemma-2-2b-it"  # small enough to run responsively on Hugging Face hardware
# Note: the gemma-3n checkpoints may not be available on the Hub, so gemma-2-2b-it is used instead
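
# Gemma checkpoints on the Hugging Face Hub are gated, so the Space will likely
# need the model license accepted and an HF_TOKEN secret configured before
# from_pretrained() can download the weights.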

class AudioEmotionAnalyzer:
    def __init__(self, model_name: str = MODEL_NAME):
        self.model_name = model_name
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"🚀 Using device: {self.device}")

        # Load tokenizer and model
        print("📥 Loading tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        print("📥 Loading model...")
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
            device_map="auto",
            trust_remote_code=True
        )

        # Add a padding token if the tokenizer doesn't define one
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print("✅ Model loaded successfully!")

    def extract_audio_features(self, audio_path: str) -> Dict[str, Any]:
        """Extract comprehensive audio features for emotion analysis"""
        try:
            # Load audio file (resampled to 22.05 kHz, limited to 10 seconds)
            y, sr = librosa.load(audio_path, sr=22050, duration=10)

            features = {}

            # MFCC features (the most informative for speech emotion)
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
            features['mfcc_mean'] = np.mean(mfcc, axis=1).tolist()
            features['mfcc_std'] = np.std(mfcc, axis=1).tolist()

            # Spectral features
            spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
            features['spectral_centroid_mean'] = float(np.mean(spectral_centroid))
            features['spectral_centroid_std'] = float(np.std(spectral_centroid))

            # Zero crossing rate
            zcr = librosa.feature.zero_crossing_rate(y)
            features['zcr_mean'] = float(np.mean(zcr))
            features['zcr_std'] = float(np.std(zcr))

            # RMS energy
            rms = librosa.feature.rms(y=y)
            features['rms_mean'] = float(np.mean(rms))
            features['rms_std'] = float(np.std(rms))

            # Pitch features
            pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
            features['pitch_mean'] = float(np.mean(pitches[pitches > 0])) if np.any(pitches > 0) else 0.0
            features['pitch_std'] = float(np.std(pitches[pitches > 0])) if np.any(pitches > 0) else 0.0

            # Tempo (recent librosa returns a one-element array, so unwrap it)
            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
            features['tempo'] = float(np.atleast_1d(tempo)[0])

            # Duration in seconds
            features['duration'] = len(y) / sr

            print(f"✅ Extracted {len(features)} audio features")
            return features

        except Exception as e:
            print(f"❌ Error extracting audio features: {e}")
            return {}
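
    # features_to_text_description() below consumes this dict; it reads the
    # 'spectral_centroid_mean', 'rms_mean', 'zcr_mean', 'pitch_std', and 'tempo'
    # keys via .get(), so a partially filled dict is still safe to pass in.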

    def features_to_text_description(self, features: Dict[str, Any]) -> str:
        """Convert audio features into a descriptive text prompt"""
        description_parts = []

        # Spectral characteristics
        if features.get('spectral_centroid_mean', 0) > 2000:
            description_parts.append("high-frequency content")
        else:
            description_parts.append("low-frequency content")

        # Energy level
        rms_mean = features.get('rms_mean', 0)
        if rms_mean > 0.1:
            description_parts.append("high energy")
        elif rms_mean < 0.01:
            description_parts.append("low energy")
        else:
            description_parts.append("moderate energy")

        # Speaking rate, approximated via zero crossing rate
        zcr_mean = features.get('zcr_mean', 0)
        if zcr_mean > 0.1:
            description_parts.append("rapid speech")
        elif zcr_mean < 0.05:
            description_parts.append("slow speech")

        # Pitch variation
        pitch_std = features.get('pitch_std', 0)
        if pitch_std > 100:
            description_parts.append("variable pitch")
        else:
            description_parts.append("steady pitch")

        # Tempo
        tempo = features.get('tempo', 0)
        if tempo > 120:
            description_parts.append("fast tempo")
        elif tempo < 80:
            description_parts.append("slow tempo")

        return "This audio has: " + ", ".join(description_parts)
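
    # For a quiet, low-pitched clip this produces something like:
    #   "This audio has: low-frequency content, low energy, slow speech, steady pitch"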

    def analyze_emotion(self, audio_path: str) -> Dict[str, Any]:
        """Analyze emotion from an audio file using the Gemma model"""
        try:
            print(f"🎡 Analyzing audio: {audio_path}")

            # Extract audio features
            features = self.extract_audio_features(audio_path)
            if not features:
                return {"error": "Failed to extract audio features"}

            # Create feature description
            feature_description = self.features_to_text_description(features)

            # Build the emotion-analysis prompt
            prompt = f"""Analyze the emotional content of this audio based on its acoustic features.

Audio Characteristics: {feature_description}

Based on these acoustic properties, analyze the emotional content and provide:
1. Primary emotion (choose from: happy, sad, angry, fearful, disgusted, surprised, neutral, excited, calm, anxious)
2. Confidence level (0-100%)
3. Detailed reasoning based on the audio features
4. Secondary emotions if present

Format your response as:
Primary Emotion: [emotion]
Confidence: [percentage]%
Reasoning: [detailed explanation]
Secondary Emotions: [comma-separated list]

Analysis:"""

            print("🤖 Generating emotion analysis with Gemma...")

            # Tokenize input
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                max_length=1024,
                truncation=True,
                padding=True
            ).to(self.device)

            # Generate response
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=256,
                    temperature=0.7,
                    do_sample=True,
                    top_p=0.9,
                    pad_token_id=self.tokenizer.eos_token_id
                )

            # Decode the full sequence, then keep only the newly generated part
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            generated_text = response[len(prompt):].strip()

            print(f"✅ Gemma response: {generated_text}")

            # Parse the response into structured fields
            return self.parse_emotion_response(generated_text, features)

        except Exception as e:
            print(f"❌ Error in emotion analysis: {e}")
            return {"error": f"Analysis failed: {str(e)}"}
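
    # A possible refinement (an assumption, not in this commit): gemma-2-2b-it is
    # instruction-tuned, so routing the same prompt through the tokenizer's chat
    # template may yield answers that follow the requested format more reliably:
    #
    #   messages = [{"role": "user", "content": prompt}]
    #   input_ids = self.tokenizer.apply_chat_template(
    #       messages, add_generation_prompt=True, return_tensors="pt"
    #   ).to(self.device)
    #   outputs = self.model.generate(input_ids, max_new_tokens=256)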

    def parse_emotion_response(self, response: str, features: Dict[str, Any]) -> Dict[str, Any]:
        """Parse Gemma's response to extract structured emotion data"""
        try:
            result = {
                "primary_emotion": "unknown",
                "confidence": 0,
                "reasoning": "",
                "secondary_emotions": [],
                "audio_features": features,
                "raw_response": response
            }

            for line in response.split('\n'):
                line = line.strip()
                if line.startswith('Primary Emotion:'):
                    result["primary_emotion"] = line.split(':', 1)[1].strip()
                elif line.startswith('Confidence:'):
                    conf_text = line.split(':', 1)[1].strip().replace('%', '')
                    try:
                        result["confidence"] = float(conf_text)
                    except ValueError:
                        result["confidence"] = 50
                elif line.startswith('Reasoning:'):
                    result["reasoning"] = line.split(':', 1)[1].strip()
                elif line.startswith('Secondary Emotions:'):
                    sec_emotions = line.split(':', 1)[1].strip()
                    result["secondary_emotions"] = [e.strip() for e in sec_emotions.split(',')]

            # If parsing failed, fall back to the raw response as the reasoning
            if not result["reasoning"]:
                result["reasoning"] = response

            return result

        except Exception as e:
            print(f"❌ Error parsing response: {e}")
            return {
                "primary_emotion": "unknown",
                "confidence": 0,
                "reasoning": response,
                "secondary_emotions": [],
                "audio_features": features,
                "raw_response": response,
                "error": f"Parsing error: {str(e)}"
            }

# Initialize the analyzer once at startup
print("🔄 Initializing Audio Emotion Analyzer...")
analyzer = AudioEmotionAnalyzer()

def process_audio(audio_path: str) -> Dict[str, Any]:
    """Gradio-compatible function to process audio"""
    if audio_path is None:
        return {"error": "No audio file provided"}

    try:
        return analyzer.analyze_emotion(audio_path)
    except Exception as e:
        return {"error": f"Processing error: {str(e)}"}

# Create Gradio interface
def create_interface():
    """Create the Gradio interface"""

    # Custom CSS for better styling
    css = """
    .emotion-result {
        padding: 20px;
        border-radius: 10px;
        margin: 10px 0;
    }
    .primary-emotion {
        font-size: 24px;
        font-weight: bold;
        margin: 10px 0;
    }
    .confidence-bar {
        height: 20px;
        background: linear-gradient(90deg, #ff6b6b, #4ecdc4);
        border-radius: 10px;
        margin: 10px 0;
    }
    """

    # Emotion color mapping
    emotion_colors = {
        "happy": "#4ecdc4",
        "sad": "#6c5ce7",
        "angry": "#ff6b6b",
        "fearful": "#a29bfe",
        "disgusted": "#00b894",
        "surprised": "#fdcb6e",
        "neutral": "#b2bec3",
        "excited": "#e17055",
        "calm": "#74b9ff",
        "anxious": "#fd79a8"
    }

    def process_audio_wrapper(audio_path):
        """Wrapper that formats the analysis result as HTML for Gradio"""
        result = process_audio(audio_path)

        if "error" in result:
            return f"❌ Error: {result['error']}"

        # Pull out the fields for the formatted output
        emotion = result.get("primary_emotion", "unknown")
        confidence = result.get("confidence", 0)
        reasoning = result.get("reasoning", "")
        secondary = result.get("secondary_emotions", [])

        color = emotion_colors.get(emotion.lower(), "#b2bec3")

        output = f"""
        <div class="emotion-result" style="border-left: 5px solid {color};">
            <div class="primary-emotion" style="color: {color};">
                🎭 {emotion.title()}
            </div>
            <div>
                <strong>Confidence:</strong> {confidence}%
            </div>
            <div class="confidence-bar" style="width: {confidence}%;"></div>
            <div>
                <strong>Reasoning:</strong> {reasoning}
            </div>
            {f"<div><strong>Secondary Emotions:</strong> {', '.join(secondary)}</div>" if secondary else ""}
        </div>
        """

        return output

    # Only offer example files that actually exist in the repo; a list
    # containing None entries is not a valid `examples` value for gr.Interface
    example_files = [
        [path] for path in ("examples/happy_sample.wav", "examples/sad_sample.wav")
        if os.path.exists(path)
    ]

    # Create interface
    interface = gr.Interface(
        fn=process_audio_wrapper,
        inputs=gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Upload Audio File or Record",
        ),
        outputs=gr.HTML(label="Emotion Analysis Result"),
        title="🎡 Audio Emotion Analyzer with Gemma",
        description="""
        Upload an audio file or record your voice to analyze emotional content using Google's Gemma model.
        The AI analyzes acoustic features such as pitch, energy, tempo, and spectral characteristics to detect emotions.
        """,
        examples=example_files or None,
        css=css
    )

    return interface

# Main execution
if __name__ == "__main__":
    print("🚀 Starting Audio Emotion Analyzer...")
    print(f"📊 Using model: {MODEL_NAME}")
    print("🎡 Supported formats: WAV, MP3, FLAC, etc.")

    # Create and launch the interface
    demo = create_interface()

    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        debug=True
    )
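
# Deployment note (an assumption about the environment, not part of the code
# above): device_map="auto" depends on the accelerate package, so the Space's
# requirements.txt would need gradio, librosa, torch, transformers, and
# accelerate. On Hugging Face Spaces the app URL is already public, so
# share=True is unnecessary there and Gradio typically ignores it with a warning.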