jatinsabari committed on
Commit a1009a6 · verified · 1 Parent(s): 5c4445f

Update app.py

Files changed (1)
  app.py +94 -222
app.py CHANGED
@@ -5,11 +5,21 @@ import torch
  from transformers import AutoTokenizer, AutoModelForCausalLM
  import tempfile
  import os
- from typing import List, Dict, Any

  # Model configuration
- MODEL_NAME = "google/gemma-2-2b-it" # Using Gemma 2B for better performance on Hugging Face
- # Note: gemma-3n model might not be available, using gemma-2-2b-it instead

  class AudioEmotionAnalyzer:
      def __init__(self, model_name: str = MODEL_NAME):
@@ -17,13 +27,18 @@ class AudioEmotionAnalyzer:
          self.device = "cuda" if torch.cuda.is_available() else "cpu"
          print(f"🚀 Using device: {self.device}")

-         # Load tokenizer and model
          print("📥 Loading tokenizer...")
-         self.tokenizer = AutoTokenizer.from_pretrained(model_name)

          print("📥 Loading model...")
          self.model = AutoModelForCausalLM.from_pretrained(
              model_name,
              torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
              device_map="auto",
              trust_remote_code=True
@@ -33,103 +48,66 @@ class AudioEmotionAnalyzer:
          if self.tokenizer.pad_token is None:
              self.tokenizer.pad_token = self.tokenizer.eos_token

-         print("✅ Model loaded successfully!")

      def extract_audio_features(self, audio_path: str) -> Dict[str, Any]:
-         """Extract comprehensive audio features for emotion analysis"""
          try:
-             # Load audio file
-             y, sr = librosa.load(audio_path, sr=22050, duration=10) # Limit to 10 seconds

-             # Extract various audio features
              features = {}

-             # MFCC features (most important for speech emotion)
              mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
              features['mfcc_mean'] = np.mean(mfcc, axis=1).tolist()
-             features['mfcc_std'] = np.std(mfcc, axis=1).tolist()

              # Spectral features
              spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
-             features['spectral_centroid_mean'] = float(np.mean(spectral_centroid))
-             features['spectral_centroid_std'] = float(np.std(spectral_centroid))

              # Zero crossing rate
              zcr = librosa.feature.zero_crossing_rate(y)
-             features['zcr_mean'] = float(np.mean(zcr))
-             features['zcr_std'] = float(np.std(zcr))

              # RMS energy
              rms = librosa.feature.rms(y=y)
-             features['rms_mean'] = float(np.mean(rms))
-             features['rms_std'] = float(np.std(rms))

-             # Pitch features
              pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
-             features['pitch_mean'] = float(np.mean(pitches[pitches > 0])) if np.any(pitches > 0) else 0.0
-             features['pitch_std'] = float(np.std(pitches[pitches > 0])) if np.any(pitches > 0) else 0.0
-
-             # Tempo
-             tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
-             features['tempo'] = float(tempo) if tempo else 0.0

-             # Duration
-             features['duration'] = len(y) / sr
-
-             print(f"✅ Extracted {len(features)} audio features")
              return features

          except Exception as e:
              print(f"❌ Error extracting audio features: {e}")
              return {}

-     def features_to_text_description(self, features: Dict[str, Any]) -> str:
-         """Convert audio features to a descriptive text prompt"""
-
-         # Create a descriptive prompt based on audio features
-         description_parts = []
-
-         # Analyze spectral characteristics
-         if features.get('spectral_centroid_mean', 0) > 2000:
-             description_parts.append("high-frequency content")
-         else:
-             description_parts.append("low-frequency content")
-
-         # Analyze energy levels
-         rms_mean = features.get('rms_mean', 0)
-         if rms_mean > 0.1:
-             description_parts.append("high energy")
-         elif rms_mean < 0.01:
-             description_parts.append("low energy")
-         else:
-             description_parts.append("moderate energy")
-
-         # Analyze speaking rate through zero crossing rate
-         zcr_mean = features.get('zcr_mean', 0)
-         if zcr_mean > 0.1:
-             description_parts.append("rapid speech")
-         elif zcr_mean < 0.05:
-             description_parts.append("slow speech")

-         # Analyze pitch variation
-         pitch_std = features.get('pitch_std', 0)
-         if pitch_std > 100:
-             description_parts.append("variable pitch")
-         else:
-             description_parts.append("steady pitch")
-
-         # Analyze tempo
-         tempo = features.get('tempo', 0)
-         if tempo > 120:
-             description_parts.append("fast tempo")
-         elif tempo < 80:
-             description_parts.append("slow tempo")

-         description = "This audio has: " + ", ".join(description_parts)
-         return description

      def analyze_emotion(self, audio_path: str) -> Dict[str, Any]:
-         """Analyze emotion from audio file using Gemma model"""
          try:
              print(f"🎵 Analyzing audio: {audio_path}")
@@ -138,27 +116,8 @@ class AudioEmotionAnalyzer:
              if not features:
                  return {"error": "Failed to extract audio features"}

-             # Create feature description
-             feature_description = self.features_to_text_description(features)
-
-             # Create comprehensive prompt for emotion analysis
-             prompt = f"""Analyze the emotional content of this audio based on its acoustic features.
-
- Audio Characteristics: {feature_description}
-
- Based on these acoustic properties, analyze the emotional content and provide:
- 1. Primary emotion (choose from: happy, sad, angry, fearful, disgusted, surprised, neutral, excited, calm, anxious)
- 2. Confidence level (0-100%)
- 3. Detailed reasoning based on the audio features
- 4. Secondary emotions if present
-
- Format your response as:
- Primary Emotion: [emotion]
- Confidence: [percentage]%
- Reasoning: [detailed explanation]
- Secondary Emotions: [comma-separated list]
-
- Analysis:"""

              print("🤖 Generating emotion analysis with Gemma...")
@@ -166,16 +125,15 @@ Analysis:"""
              inputs = self.tokenizer(
                  prompt,
                  return_tensors="pt",
-                 max_length=1024,
-                 truncation=True,
-                 padding=True
              ).to(self.device)

              # Generate response
              with torch.no_grad():
                  outputs = self.model.generate(
                      **inputs,
-                     max_new_tokens=256,
                      temperature=0.7,
                      do_sample=True,
                      top_p=0.9,
@@ -184,13 +142,10 @@ Analysis:"""
              # Decode response
              response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-             # Extract just the new generated part (after the prompt)
              generated_text = response[len(prompt):].strip()

              print(f"✅ Gemma response: {generated_text}")

-             # Parse the response
              return self.parse_emotion_response(generated_text, features)

          except Exception as e:
@@ -198,15 +153,13 @@ Analysis:"""
              return {"error": f"Analysis failed: {str(e)}"}

      def parse_emotion_response(self, response: str, features: Dict[str, Any]) -> Dict[str, Any]:
-         """Parse Gemma's response to extract structured emotion data"""
          try:
              result = {
                  "primary_emotion": "unknown",
-                 "confidence": 0,
-                 "reasoning": "",
-                 "secondary_emotions": [],
-                 "audio_features": features,
-                 "raw_response": response
              }

              lines = response.split('\n')
@@ -215,157 +168,76 @@ Analysis:"""
                  if line.startswith('Primary Emotion:'):
                      result["primary_emotion"] = line.split(':', 1)[1].strip()
                  elif line.startswith('Confidence:'):
-                     conf_text = line.split(':', 1)[1].strip().replace('%', '')
-                     try:
-                         result["confidence"] = float(conf_text)
-                     except:
-                         result["confidence"] = 50
                  elif line.startswith('Reasoning:'):
                      result["reasoning"] = line.split(':', 1)[1].strip()
-                 elif line.startswith('Secondary Emotions:'):
-                     sec_emotions = line.split(':', 1)[1].strip()
-                     result["secondary_emotions"] = [e.strip() for e in sec_emotions.split(',')]

-             # If parsing failed, use the raw response as reasoning
-             if not result["reasoning"]:
-                 result["reasoning"] = response
-
              return result

          except Exception as e:
-             print(f"❌ Error parsing response: {e}")
              return {
                  "primary_emotion": "unknown",
-                 "confidence": 0,
                  "reasoning": response,
-                 "secondary_emotions": [],
                  "audio_features": features,
-                 "raw_response": response,
                  "error": f"Parsing error: {str(e)}"
              }

  # Initialize the analyzer
- print("🔄 Initializing Audio Emotion Analyzer...")
  analyzer = AudioEmotionAnalyzer()

- def process_audio(audio_path: str) -> Dict[str, Any]:
      """Gradio-compatible function to process audio"""
      if audio_path is None:
-         return {"error": "No audio file provided"}

      try:
          result = analyzer.analyze_emotion(audio_path)
-         return result
-     except Exception as e:
-         return {"error": f"Processing error: {str(e)}"}
-
- # Create Gradio interface
- def create_interface():
-     """Create the Gradio interface"""
-
-     # Custom CSS for better styling
-     css = """
-     .emotion-result {
-         padding: 20px;
-         border-radius: 10px;
-         margin: 10px 0;
-     }
-     .primary-emotion {
-         font-size: 24px;
-         font-weight: bold;
-         margin: 10px 0;
-     }
-     .confidence-bar {
-         height: 20px;
-         background: linear-gradient(90deg, #ff6b6b, #4ecdc4);
-         border-radius: 10px;
-         margin: 10px 0;
-     }
-     """
-
-     # Emotion color mapping
-     emotion_colors = {
-         "happy": "#4ecdc4",
-         "sad": "#6c5ce7",
-         "angry": "#ff6b6b",
-         "fearful": "#a29bfe",
-         "disgusted": "#00b894",
-         "surprised": "#fdcb6e",
-         "neutral": "#b2bec3",
-         "excited": "#e17055",
-         "calm": "#74b9ff",
-         "anxious": "#fd79a8"
-     }
-
-     def process_audio_wrapper(audio_path):
-         """Wrapper function for Gradio"""
-         result = process_audio(audio_path)

          if "error" in result:
              return f"❌ Error: {result['error']}"

-         # Create formatted output
          emotion = result.get("primary_emotion", "unknown")
-         confidence = result.get("confidence", 0)
          reasoning = result.get("reasoning", "")
-         secondary = result.get("secondary_emotions", [])
-
-         color = emotion_colors.get(emotion.lower(), "#b2bec3")

          output = f"""
- <div class="emotion-result" style="border-left: 5px solid {color};">
-     <div class="primary-emotion" style="color: {color};">
-         🎭 {emotion.title()}
-     </div>
-     <div>
-         <strong>Confidence:</strong> {confidence}%
-     </div>
-     <div class="confidence-bar" style="width: {confidence}%;"></div>
-     <div>
-         <strong>Reasoning:</strong> {reasoning}
-     </div>
-     {f"<div><strong>Secondary Emotions:</strong> {', '.join(secondary)}</div>" if secondary else ""}
- </div>
          """

          return output
-
-     # Create interface
-     interface = gr.Interface(
-         fn=process_audio_wrapper,
-         inputs=gr.Audio(
-             sources=["upload", "microphone"],
-             type="filepath",
-             label="Upload Audio File or Record",
-         ),
-         outputs=gr.HTML(label="Emotion Analysis Result"),
-         title="🎵 Audio Emotion Analyzer with Gemma",
-         description="""
-         Upload an audio file or record your voice to analyze emotional content using Google's Gemma model.
-         The AI will analyze acoustic features like pitch, energy, tempo, and spectral characteristics to detect emotions.
-         """,
-         examples=[
-             ["examples/happy_sample.wav"] if os.path.exists("examples/happy_sample.wav") else None,
-             ["examples/sad_sample.wav"] if os.path.exists("examples/sad_sample.wav") else None,
-         ],
-         css=css
-     )
-
-     return interface

- # Main execution
  if __name__ == "__main__":
-     print("🚀 Starting Audio Emotion Analyzer...")
-     print(f"📊 Using model: {MODEL_NAME}")
-     print(f"🎵 Supported formats: WAV, MP3, FLAC, etc.")
-
-     # Create and launch interface
-     demo = create_interface()
-
-     # Launch with appropriate settings
      demo.launch(
          server_name="0.0.0.0",
          server_port=7860,
-         share=True,
-         debug=True
      )
  from transformers import AutoTokenizer, AutoModelForCausalLM
  import tempfile
  import os
+ from typing import Dict, Any
+ from huggingface_hub import login
+
+ # Your Hugging Face token - REPLACE WITH YOUR ACTUAL TOKEN
+ HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
+
+ # Login to Hugging Face
+ try:
+     login(token=HF_TOKEN)
+     print("✅ Successfully authenticated with Hugging Face")
+ except Exception as e:
+     print(f"❌ Authentication failed: {e}")

  # Model configuration
+ MODEL_NAME = "google/gemma-2-2b-it"

  class AudioEmotionAnalyzer:
      def __init__(self, model_name: str = MODEL_NAME):

          self.device = "cuda" if torch.cuda.is_available() else "cpu"
          print(f"🚀 Using device: {self.device}")

+         # Load tokenizer and model with authentication
          print("📥 Loading tokenizer...")
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             model_name,
+             token=HF_TOKEN,
+             trust_remote_code=True
+         )

          print("📥 Loading model...")
          self.model = AutoModelForCausalLM.from_pretrained(
              model_name,
+             token=HF_TOKEN,
              torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
              device_map="auto",
              trust_remote_code=True

          if self.tokenizer.pad_token is None:
              self.tokenizer.pad_token = self.tokenizer.eos_token

+         print("✅ Gemma model loaded successfully!")

      def extract_audio_features(self, audio_path: str) -> Dict[str, Any]:
+         """Extract audio features for emotion analysis"""
          try:
+             y, sr = librosa.load(audio_path, sr=22050, duration=10)

              features = {}

+             # MFCC features
              mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
              features['mfcc_mean'] = np.mean(mfcc, axis=1).tolist()

              # Spectral features
              spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
+             features['spectral_centroid'] = float(np.mean(spectral_centroid))

              # Zero crossing rate
              zcr = librosa.feature.zero_crossing_rate(y)
+             features['zcr'] = float(np.mean(zcr))

              # RMS energy
              rms = librosa.feature.rms(y=y)
+             features['rms'] = float(np.mean(rms))

+             # Pitch
              pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
+             features['pitch'] = float(np.mean(pitches[pitches > 0])) if np.any(pitches > 0) else 0.0

+             print(f"✅ Extracted audio features")
              return features

          except Exception as e:
              print(f"❌ Error extracting audio features: {e}")
              return {}
+     def features_to_prompt(self, features: Dict[str, Any]) -> str:
+         """Convert audio features to a prompt for Gemma"""

+         prompt = f"""Analyze the emotional content of audio based on these acoustic features:
+
+ Audio Features:
+ - Spectral Centroid: {features.get('spectral_centroid', 0):.1f} Hz (brightness)
+ - Zero Crossing Rate: {features.get('zcr', 0):.3f} (speech rate)
+ - RMS Energy: {features.get('rms', 0):.3f} (loudness)
+ - Pitch: {features.get('pitch', 0):.1f} Hz
+
+ Based on these acoustic properties, determine the primary emotion from: happy, sad, angry, fearful, disgusted, surprised, neutral, excited, calm, anxious.
+
+ Provide analysis in this format:
+ Primary Emotion: [emotion]
+ Confidence: [high/medium/low]
+ Reasoning: [brief explanation based on features]
+
+ Analysis:"""

+         return prompt

      def analyze_emotion(self, audio_path: str) -> Dict[str, Any]:
+         """Analyze emotion from audio file using Gemma"""
          try:
              print(f"🎵 Analyzing audio: {audio_path}")

              if not features:
                  return {"error": "Failed to extract audio features"}

+             # Create prompt
+             prompt = self.features_to_prompt(features)

              print("🤖 Generating emotion analysis with Gemma...")

              inputs = self.tokenizer(
                  prompt,
                  return_tensors="pt",
+                 max_length=512,
+                 truncation=True
              ).to(self.device)

              # Generate response
              with torch.no_grad():
                  outputs = self.model.generate(
                      **inputs,
+                     max_new_tokens=150,
                      temperature=0.7,
                      do_sample=True,
                      top_p=0.9,

              # Decode response
              response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
              generated_text = response[len(prompt):].strip()

              print(f"✅ Gemma response: {generated_text}")

              return self.parse_emotion_response(generated_text, features)

          except Exception as e:
              return {"error": f"Analysis failed: {str(e)}"}

      def parse_emotion_response(self, response: str, features: Dict[str, Any]) -> Dict[str, Any]:
+         """Parse Gemma's response"""
          try:
              result = {
                  "primary_emotion": "unknown",
+                 "confidence": "unknown",
+                 "reasoning": response,
+                 "audio_features": features
              }

              lines = response.split('\n')
                  if line.startswith('Primary Emotion:'):
                      result["primary_emotion"] = line.split(':', 1)[1].strip()
                  elif line.startswith('Confidence:'):
+                     result["confidence"] = line.split(':', 1)[1].strip()
                  elif line.startswith('Reasoning:'):
                      result["reasoning"] = line.split(':', 1)[1].strip()

              return result

          except Exception as e:
              return {
                  "primary_emotion": "unknown",
+                 "confidence": "unknown",
                  "reasoning": response,
                  "audio_features": features,
                  "error": f"Parsing error: {str(e)}"
              }

  # Initialize the analyzer
+ print("🔄 Initializing Gemma Audio Emotion Analyzer...")
  analyzer = AudioEmotionAnalyzer()

+ def process_audio(audio_path: str) -> str:
      """Gradio-compatible function to process audio"""
      if audio_path is None:
+         return "❌ No audio file provided"

      try:
          result = analyzer.analyze_emotion(audio_path)

          if "error" in result:
              return f"❌ Error: {result['error']}"

+         # Format output
          emotion = result.get("primary_emotion", "unknown")
+         confidence = result.get("confidence", "unknown")
          reasoning = result.get("reasoning", "")

          output = f"""
+ 🎭 **Primary Emotion**: {emotion.title()}
+ 📊 **Confidence**: {confidence}
+ 💭 **Reasoning**: {reasoning}
+
+ 📈 **Audio Features Analyzed**:
+ - Spectral Brightness: {result['audio_features'].get('spectral_centroid', 0):.1f} Hz
+ - Speech Rate: {result['audio_features'].get('zcr', 0):.3f}
+ - Loudness: {result['audio_features'].get('rms', 0):.3f}
+ - Pitch: {result['audio_features'].get('pitch', 0):.1f} Hz
          """

          return output
+
+     except Exception as e:
+         return f"❌ Processing error: {str(e)}"
+
+ # Create Gradio interface
+ demo = gr.Interface(
+     fn=process_audio,
+     inputs=gr.Audio(
+         sources=["upload", "microphone"],
+         type="filepath",
+         label="Upload Audio File or Record"
+     ),
+     outputs=gr.Textbox(label="Emotion Analysis Result"),
+     title="🎵 Audio Emotion Analyzer with Google Gemma",
+     description="Upload audio or record to analyze emotions using Google's Gemma-2-2B model",
+     examples=[],
+ )

  if __name__ == "__main__":
+     print("🚀 Starting Gemma Audio Emotion Analyzer...")
      demo.launch(
          server_name="0.0.0.0",
          server_port=7860,
+         share=True
      )