jatinsabari committed
Commit 56a4063 · verified · parent: a1009a6

Update app.py

Files changed (1): app.py +192 -169
app.py CHANGED
@@ -1,194 +1,198 @@
  import gradio as gr
  import librosa
  import numpy as np
- import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
  import tempfile
  import os
  from typing import Dict, Any
- from huggingface_hub import login

- # Your Hugging Face token - REPLACE WITH YOUR ACTUAL TOKEN
- HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
-
- # Login to Hugging Face
- try:
-     login(token=HF_TOKEN)
-     print("✅ Successfully authenticated with Hugging Face")
- except Exception as e:
-     print(f"❌ Authentication failed: {e}")
-
- # Model configuration
- MODEL_NAME = "google/gemma-2-2b-it"
-
- class AudioEmotionAnalyzer:
-     def __init__(self, model_name: str = MODEL_NAME):
-         self.model_name = model_name
-         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-         print(f"🚀 Using device: {self.device}")
-
-         # Load tokenizer and model with authentication
-         print("📥 Loading tokenizer...")
-         self.tokenizer = AutoTokenizer.from_pretrained(
-             model_name,
-             token=HF_TOKEN,
-             trust_remote_code=True
-         )
-
-         print("📥 Loading model...")
-         self.model = AutoModelForCausalLM.from_pretrained(
-             model_name,
-             token=HF_TOKEN,
-             torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
-             device_map="auto",
-             trust_remote_code=True
-         )
-
-         # Add padding token if it doesn't exist
-         if self.tokenizer.pad_token is None:
-             self.tokenizer.pad_token = self.tokenizer.eos_token
-
-         print("✅ Gemma model loaded successfully!")

      def extract_audio_features(self, audio_path: str) -> Dict[str, Any]:
-         """Extract audio features for emotion analysis"""
          try:
-             y, sr = librosa.load(audio_path, sr=22050, duration=10)

              features = {}

-             # MFCC features
-             mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
-             features['mfcc_mean'] = np.mean(mfcc, axis=1).tolist()

-             # Spectral features
              spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
-             features['spectral_centroid'] = float(np.mean(spectral_centroid))

-             # Zero crossing rate
              zcr = librosa.feature.zero_crossing_rate(y)
-             features['zcr'] = float(np.mean(zcr))

-             # RMS energy
-             rms = librosa.feature.rms(y=y)
-             features['rms'] = float(np.mean(rms))

-             # Pitch
-             pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
-             features['pitch'] = float(np.mean(pitches[pitches > 0])) if np.any(pitches > 0) else 0.0

-             print(f"✅ Extracted audio features")
              return features

          except Exception as e:
-             print(f"❌ Error extracting audio features: {e}")
-             return {}

-     def features_to_prompt(self, features: Dict[str, Any]) -> str:
-         """Convert audio features to a prompt for Gemma"""

-         prompt = f"""Analyze the emotional content of audio based on these acoustic features:
-
- Audio Features:
- - Spectral Centroid: {features.get('spectral_centroid', 0):.1f} Hz (brightness)
- - Zero Crossing Rate: {features.get('zcr', 0):.3f} (speech rate)
- - RMS Energy: {features.get('rms', 0):.3f} (loudness)
- - Pitch: {features.get('pitch', 0):.1f} Hz
-
- Based on these acoustic properties, determine the primary emotion from: happy, sad, angry, fearful, disgusted, surprised, neutral, excited, calm, anxious.
-
- Provide analysis in this format:
- Primary Emotion: [emotion]
- Confidence: [high/medium/low]
- Reasoning: [brief explanation based on features]
-
- Analysis:"""

-         return prompt

      def analyze_emotion(self, audio_path: str) -> Dict[str, Any]:
-         """Analyze emotion from audio file using Gemma"""
          try:
-             print(f"🎵 Analyzing audio: {audio_path}")

-             # Extract audio features
              features = self.extract_audio_features(audio_path)
-             if not features:
-                 return {"error": "Failed to extract audio features"}
-
-             # Create prompt
-             prompt = self.features_to_prompt(features)
-
-             print("🤖 Generating emotion analysis with Gemma...")
-
-             # Tokenize input
-             inputs = self.tokenizer(
-                 prompt,
-                 return_tensors="pt",
-                 max_length=512,
-                 truncation=True
-             ).to(self.device)

-             # Generate response
-             with torch.no_grad():
-                 outputs = self.model.generate(
-                     **inputs,
-                     max_new_tokens=150,
-                     temperature=0.7,
-                     do_sample=True,
-                     top_p=0.9,
-                     pad_token_id=self.tokenizer.eos_token_id
-                 )
-
-             # Decode response
-             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-             generated_text = response[len(prompt):].strip()
-
-             print(f"✅ Gemma response: {generated_text}")
-
-             return self.parse_emotion_response(generated_text, features)
-
-         except Exception as e:
-             print(f"❌ Error in emotion analysis: {e}")
-             return {"error": f"Analysis failed: {str(e)}"}
-
-     def parse_emotion_response(self, response: str, features: Dict[str, Any]) -> Dict[str, Any]:
-         """Parse Gemma's response"""
-         try:
-             result = {
-                 "primary_emotion": "unknown",
-                 "confidence": "unknown",
-                 "reasoning": response,
-                 "audio_features": features
-             }
-
-             lines = response.split('\n')
-             for line in lines:
-                 line = line.strip()
-                 if line.startswith('Primary Emotion:'):
-                     result["primary_emotion"] = line.split(':', 1)[1].strip()
-                 elif line.startswith('Confidence:'):
-                     result["confidence"] = line.split(':', 1)[1].strip()
-                 elif line.startswith('Reasoning:'):
-                     result["reasoning"] = line.split(':', 1)[1].strip()

              return result

          except Exception as e:
              return {
-                 "primary_emotion": "unknown",
-                 "confidence": "unknown",
-                 "reasoning": response,
-                 "audio_features": features,
-                 "error": f"Parsing error: {str(e)}"
              }

- # Initialize the analyzer
- print("🔄 Initializing Gemma Audio Emotion Analyzer...")
- analyzer = AudioEmotionAnalyzer()

  def process_audio(audio_path: str) -> str:
-     """Gradio-compatible function to process audio"""
      if audio_path is None:
          return "❌ No audio file provided"

@@ -198,22 +202,38 @@ def process_audio(audio_path: str) -> str:
      if "error" in result:
          return f"❌ Error: {result['error']}"

-     # Format output
-     emotion = result.get("primary_emotion", "unknown")
-     confidence = result.get("confidence", "unknown")
-     reasoning = result.get("reasoning", "")

-     output = f"""
- 🎭 **Primary Emotion**: {emotion.title()}
- 📊 **Confidence**: {confidence}
- 💭 **Reasoning**: {reasoning}
-
- 📈 **Audio Features Analyzed**:
- - Spectral Brightness: {result['audio_features'].get('spectral_centroid', 0):.1f} Hz
- - Speech Rate: {result['audio_features'].get('zcr', 0):.3f}
- - Loudness: {result['audio_features'].get('rms', 0):.3f}
- - Pitch: {result['audio_features'].get('pitch', 0):.1f} Hz
- """

      return output

@@ -226,16 +246,19 @@ demo = gr.Interface(
      inputs=gr.Audio(
          sources=["upload", "microphone"],
          type="filepath",
-         label="Upload Audio File or Record"
      ),
-     outputs=gr.Textbox(label="Emotion Analysis Result"),
-     title="🎵 Audio Emotion Analyzer with Google Gemma",
-     description="Upload audio or record to analyze emotions using Google's Gemma-2-2B model",
      examples=[],
  )

  if __name__ == "__main__":
-     print("🚀 Starting Gemma Audio Emotion Analyzer...")
      demo.launch(
          server_name="0.0.0.0",
          server_port=7860,
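The full rewritten app.py follows. Its core decision step, analyze_emotion_rules, scores each emotion as the fraction of that emotion's rule conditions found among the detected conditions, then picks the highest scorer. A standalone illustration of that arithmetic, with the condition sets copied from the listing below:

    # score(emotion) = matched conditions / total conditions for that emotion
    detected = {'bright_timbre', 'high_energy', 'high_pitch', 'fast_tempo', 'fast_speech'}
    happy = {'high_pitch', 'high_energy', 'fast_tempo', 'bright_timbre'}
    excited = {'very_high_energy', 'fast_tempo', 'bright_timbre', 'high_pitch'}
    print(len(detected & happy) / len(happy))      # 1.0 -> 'happy' wins
    print(len(detected & excited) / len(excited))  # 0.75

Note that several rule conditions (e.g. 'very_high_energy', 'harsh_timbre', 'tremolo_effect', and the pitch-variability labels) are never emitted by the detector, so emotions that rely on them can never reach a full score.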
  import gradio as gr
  import librosa
  import numpy as np
  import tempfile
  import os
  from typing import Dict, Any
+ import json

+ class FastAudioEmotionAnalyzer:
+     def __init__(self):
+         print("🚀 Initializing Fast Audio Emotion Analyzer...")
+
+         # Pre-defined emotion rules based on audio features (no model loading)
+         self.emotion_rules = {
+             'happy': {
+                 'conditions': ['high_pitch', 'high_energy', 'fast_tempo', 'bright_timbre'],
+                 'description': 'Characterized by high energy, bright tones, and fast pace'
+             },
+             'sad': {
+                 'conditions': ['low_pitch', 'low_energy', 'slow_tempo', 'dark_timbre'],
+                 'description': 'Characterized by low energy, slow pace, and dark tones'
+             },
+             'angry': {
+                 'conditions': ['high_energy', 'harsh_timbre', 'irregular_rhythm', 'high_pitch_variability'],
+                 'description': 'Characterized by high energy, harsh tones, and irregular patterns'
+             },
+             'fearful': {
+                 'conditions': ['high_pitch', 'irregular_energy', 'fast_tempo', 'tremolo_effect'],
+                 'description': 'Characterized by high pitch, irregular energy, and nervous tempo'
+             },
+             'neutral': {
+                 'conditions': ['medium_energy', 'medium_pitch', 'steady_tempo', 'balanced_timbre'],
+                 'description': 'Characterized by balanced features and steady patterns'
+             },
+             'excited': {
+                 'conditions': ['very_high_energy', 'fast_tempo', 'bright_timbre', 'high_pitch'],
+                 'description': 'Characterized by very high energy and fast, bright patterns'
+             },
+             'calm': {
+                 'conditions': ['low_energy', 'slow_tempo', 'smooth_timbre', 'low_pitch_variability'],
+                 'description': 'Characterized by low energy, smooth tones, and steady pace'
+             }
+         }
+
+         print("✅ Fast analyzer ready! (No heavy models to load)")

      def extract_audio_features(self, audio_path: str) -> Dict[str, Any]:
+         """Extract audio features quickly"""
          try:
+             # Load only the first 5 seconds for faster processing
+             y, sr = librosa.load(audio_path, sr=22050, duration=5)

              features = {}

+             # Basic MFCC (fast)
+             mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=5)  # Reduced from 13 to 5
+             features['mfcc_mean'] = float(np.mean(mfcc))

+             # Spectral centroid (brightness)
              spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
+             features['brightness'] = float(np.mean(spectral_centroid))
+
+             # RMS energy (loudness)
+             rms = librosa.feature.rms(y=y)
+             features['energy'] = float(np.mean(rms))

+             # Zero crossing rate (noisiness/speech rate)
              zcr = librosa.feature.zero_crossing_rate(y)
+             features['speech_rate'] = float(np.mean(zcr))

+             # Tempo (pace); newer librosa returns tempo as an array, so coerce defensively
+             tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
+             tempo = float(np.atleast_1d(tempo)[0])
+             features['tempo'] = tempo if tempo > 0 else 80.0

+             # Pitch mean
+             pitches = librosa.piptrack(y=y, sr=sr, fmin=50, fmax=500)[0]
+             pitches = pitches[pitches > 0]
+             features['pitch'] = float(np.mean(pitches)) if len(pitches) > 0 else 150.0

+             print("✅ Features extracted")
              return features

          except Exception as e:
+             print(f"❌ Feature extraction error: {e}")
+             # Return default features
+             return {
+                 'brightness': 1500.0,
+                 'energy': 0.05,
+                 'speech_rate': 0.1,
+                 'tempo': 100.0,
+                 'pitch': 200.0,
+                 'mfcc_mean': 0.0
+             }

+     def analyze_emotion_rules(self, features: Dict[str, Any]) -> Dict[str, Any]:
+         """Analyze emotion using rule-based system (very fast)"""

+         # Define feature thresholds
+         conditions = []
+
+         # Brightness conditions
+         if features['brightness'] > 2000:
+             conditions.append('bright_timbre')
+         elif features['brightness'] < 1000:
+             conditions.append('dark_timbre')
+         else:
+             conditions.append('balanced_timbre')
+
+         # Energy conditions
+         if features['energy'] > 0.1:
+             conditions.append('high_energy')
+         elif features['energy'] > 0.05:
+             conditions.append('medium_energy')
+         else:
+             conditions.append('low_energy')
+
+         # Pitch conditions
+         if features['pitch'] > 250:
+             conditions.append('high_pitch')
+         elif features['pitch'] < 150:
+             conditions.append('low_pitch')
+         else:
+             conditions.append('medium_pitch')
+
+         # Tempo conditions
+         if features['tempo'] > 140:
+             conditions.append('fast_tempo')
+         elif features['tempo'] < 90:
+             conditions.append('slow_tempo')
+         else:
+             conditions.append('steady_tempo')
+
+         # Speech rate conditions
+         if features['speech_rate'] > 0.15:
+             conditions.append('fast_speech')
+         elif features['speech_rate'] < 0.08:
+             conditions.append('slow_speech')
+         else:
+             conditions.append('normal_speech')

+         # Score each emotion based on matching conditions
+         emotion_scores = {}
+         for emotion, data in self.emotion_rules.items():
+             score = 0
+             for condition in data['conditions']:
+                 if condition in conditions:
+                     score += 1
+             emotion_scores[emotion] = score / len(data['conditions'])
+
+         # Get top emotion
+         top_emotion = max(emotion_scores, key=emotion_scores.get)
+         confidence = emotion_scores[top_emotion]
+
+         # Generate reasoning
+         reasoning = f"Audio shows {conditions[0]}, {conditions[1]}, {conditions[2]}. "
+         reasoning += f"Pattern matches {top_emotion} emotion ({self.emotion_rules[top_emotion]['description']})."
+
+         return {
+             'primary_emotion': top_emotion,
+             'confidence': confidence,
+             'reasoning': reasoning,
+             'all_scores': emotion_scores,
+             'detected_conditions': conditions
+         }

      def analyze_emotion(self, audio_path: str) -> Dict[str, Any]:
+         """Fast emotion analysis (usually < 2 seconds)"""
          try:
+             print(f"🎵 Fast analyzing: {os.path.basename(audio_path)}")

+             # Extract features (fast)
              features = self.extract_audio_features(audio_path)

+             # Rule-based analysis (instant)
+             result = self.analyze_emotion_rules(features)
+             result['audio_features'] = features

+             print(f"✅ Analysis complete: {result['primary_emotion']} ({result['confidence']:.1%})")
              return result

          except Exception as e:
+             print(f"❌ Analysis error: {e}")
              return {
+                 'primary_emotion': 'neutral',
+                 'confidence': 0.5,
+                 'reasoning': f'Analysis failed: {str(e)}',
+                 'error': str(e)
              }

+ # Initialize the fast analyzer
+ print("🔄 Initializing Fast Audio Emotion Analyzer...")
+ analyzer = FastAudioEmotionAnalyzer()

  def process_audio(audio_path: str) -> str:
+     """Gradio-compatible function"""
      if audio_path is None:
          return "❌ No audio file provided"

@@ -198,22 +202,38 @@ def process_audio(audio_path: str) -> str:
      if "error" in result:
          return f"❌ Error: {result['error']}"

+     # Format beautiful output
+     emotion = result['primary_emotion']
+     confidence = result['confidence']

+     # Emotion emojis
+     emotion_emojis = {
+         'happy': '😊',
+         'sad': '😢',
+         'angry': '😠',
+         'fearful': '😨',
+         'neutral': '😐',
+         'excited': '🤩',
+         'calm': '😌'
+     }

+     emoji = emotion_emojis.get(emotion, '🎭')
+
+     output = f"""
+ {emoji} **Primary Emotion**: {emotion.title()}
+ 📊 **Confidence**: {confidence:.1%}
+
+ 💭 **Reasoning**: {result['reasoning']}
+
+ 📈 **Audio Analysis**:
+ • Brightness: {result['audio_features']['brightness']:.0f} Hz
+ • Energy: {result['audio_features']['energy']:.3f}
+ • Pitch: {result['audio_features']['pitch']:.0f} Hz
+ • Tempo: {result['audio_features']['tempo']:.0f} BPM
+ • Speech Rate: {result['audio_features']['speech_rate']:.3f}
+
+ 🔍 **Detected Patterns**: {', '.join(result['detected_conditions'][:3])}
+ """

      return output

@@ -226,16 +246,19 @@ demo = gr.Interface(
      inputs=gr.Audio(
          sources=["upload", "microphone"],
          type="filepath",
+         label="Upload Audio File or Record",
+         max_length=30  # Limit to 30 seconds for faster processing
      ),
+     outputs=gr.Markdown(label="Emotion Analysis Result"),
+     title="🎵 Fast Audio Emotion Analyzer",
+     description="**Lightning-fast emotion detection from audio** ⚡ (processes in 1-2 seconds)",
      examples=[],
+     allow_flagging="never"
  )

  if __name__ == "__main__":
+     print("🚀 Starting Fast Audio Emotion Analyzer...")
+     print("⚡ Ready to process audio in seconds!")
      demo.launch(
          server_name="0.0.0.0",
          server_port=7860,
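For a quick smoke test of the rewritten analyzer, a minimal sketch; it assumes app.py's definitions are in scope (e.g. a Python session after importing the module) and that soundfile, a librosa dependency, is installed. The 440 Hz tone and 2-second duration are arbitrary choices:

    import tempfile
    import numpy as np
    import soundfile as sf

    sr = 22050
    t = np.linspace(0, 2.0, int(sr * 2.0), endpoint=False)
    tone = 0.2 * np.sin(2 * np.pi * 440.0 * t)  # quiet 440 Hz sine

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        wav_path = f.name
    sf.write(wav_path, tone, sr)
    print(process_audio(wav_path))  # formatted Markdown report from the analyzer

A steady pure tone exercises the whole pipeline, but the rule thresholds look tuned for speech, so treat the returned label as a plumbing check rather than a meaningful classification.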