import librosa
import numpy as np
from transformers import pipeline
from config import config
from models_config import get_model_config
import os
from collections import Counter
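
# Note: based on how they are used below, `config` is expected to expose
# MODEL_NAME, CHUNK_DURATION, and SAMPLE_RATE, and get_model_config(name)
# is expected to return a dict with "description", "task", and
# "label_mapping" keys.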
class AudioEmotionProcessor:
"""Process audio files and extract emotions using ML models"""
def __init__(self):
self.model = None
self.model_name = config.MODEL_NAME
self.chunk_duration = config.CHUNK_DURATION
self.sample_rate = config.SAMPLE_RATE
# Get model-specific configuration
self.model_config = get_model_config(self.model_name)
self.label_mapping = self.model_config.get("label_mapping", {})
def load_model(self):
"""Load the emotion detection model"""
if self.model is None:
print(f"Loading model: {self.model_name}")
print(f"Model config: {self.model_config['description']}")
# Get task type from model config
task = self.model_config.get("task", "audio-classification")
try:
# Load model with configured task
self.model = pipeline(
task=task,
model=self.model_name
)
print("Model loaded successfully!")
            except Exception as e:
                print(f"Failed to load with task '{task}' ({e}); falling back to audio-classification...")
try:
# Fallback: Try audio-classification
self.model = pipeline(
"audio-classification",
model=self.model_name
)
print("Model loaded successfully with audio-classification!")
except Exception as e2:
print(f"Error loading model: {e2}")
raise
return self.model
def load_audio(self, filepath):
"""Load audio file and resample to target sample rate"""
audio, sr = librosa.load(filepath, sr=self.sample_rate)
# Normalize audio volume (boost quiet recordings)
audio = self.normalize_audio(audio)
return audio, sr
def normalize_audio(self, audio):
"""Normalize audio to increase volume"""
# Get max absolute value
max_val = np.max(np.abs(audio))
# Avoid division by zero
if max_val > 0:
# Normalize to 0.95 to prevent clipping
audio = audio / max_val * 0.95
return audio
def get_audio_duration(self, audio, sr):
"""Get duration of audio in seconds"""
return librosa.get_duration(y=audio, sr=sr)
def split_into_chunks(self, audio, sr):
"""Split audio into fixed-duration chunks"""
chunk_samples = int(self.chunk_duration * sr)
chunks = []
for i in range(0, len(audio), chunk_samples):
chunk = audio[i:i + chunk_samples]
# Pad last chunk if it's shorter
if len(chunk) < chunk_samples:
chunk = np.pad(chunk, (0, chunk_samples - len(chunk)), mode='constant')
chunks.append(chunk)
return chunks
def predict_emotion(self, audio_chunk):
"""Predict emotion for a single audio chunk"""
if self.model is None:
self.load_model()
        # Pass the sampling rate explicitly so the pipeline can resample the
        # raw numpy array if the model expects a different rate
        predictions = self.model({"raw": audio_chunk, "sampling_rate": self.sample_rate})
        # The pipeline returns label/score dicts sorted by score, highest first
        top_prediction = predictions[0]
# Debug: Print raw model output
print(f"DEBUG - Raw prediction: {top_prediction}")
# Map model output to our emotion labels
emotion_label = self.map_emotion_label(top_prediction['label'])
confidence = top_prediction['score']
return emotion_label, confidence
def map_emotion_label(self, model_label):
"""Map model output labels to standardized emotion names"""
# Different models may have different label formats
label_lower = model_label.lower()
# Use model-specific label mapping first
if label_lower in self.label_mapping:
return self.label_mapping[label_lower]
# Fallback to common variations
emotion_map = {
'hap': 'Happy',
'happy': 'Happy',
'happiness': 'Happy',
'sad': 'Sad',
'sadness': 'Sad',
'ang': 'Angry',
'angry': 'Angry',
'anger': 'Angry',
'neu': 'Neutral',
'neutral': 'Neutral',
'calm': 'Neutral',
'fear': 'Fear',
'fearful': 'Fear',
'surprise': 'Surprise',
'surprised': 'Surprise',
'disgust': 'Disgust'
}
# Try to find a match
for key, value in emotion_map.items():
if key in label_lower:
return value
# Default: capitalize first letter
return model_label.capitalize()
def format_time(self, seconds):
"""Format seconds to MM:SS format"""
mins = int(seconds // 60)
secs = int(seconds % 60)
return f"{mins:02d}:{secs:02d}"
def process_audio_file(self, filepath, progress_callback=None):
"""
Process entire audio file and return emotion timeline
Args:
filepath: Path to audio file
progress_callback: Optional callback function(progress, message)
Returns:
dict: Results containing timeline and metadata
"""
try:
# Load model
if progress_callback:
progress_callback(10, "Loading model...")
self.load_model()
# Load audio
if progress_callback:
progress_callback(20, "Loading audio file...")
audio, sr = self.load_audio(filepath)
# Get duration
duration = self.get_audio_duration(audio, sr)
duration_formatted = self.format_time(duration)
# Split into chunks
if progress_callback:
progress_callback(30, "Splitting audio into segments...")
chunks = self.split_into_chunks(audio, sr)
# Process each chunk
timeline = []
total_chunks = len(chunks)
for i, chunk in enumerate(chunks):
# Calculate progress (30% to 90%)
progress = 30 + int((i / total_chunks) * 60)
if progress_callback:
progress_callback(
progress,
f"Analyzing chunk {i+1}/{total_chunks}..."
)
# Predict emotion
emotion, confidence = self.predict_emotion(chunk)
# Calculate timestamp
time_seconds = i * self.chunk_duration
time_formatted = self.format_time(time_seconds)
timeline.append({
"time": time_formatted,
"emotion": emotion,
"confidence": float(confidence)
})
# Calculate statistics
if progress_callback:
progress_callback(95, "Calculating statistics...")
emotions_list = [item['emotion'] for item in timeline]
unique_emotions = len(set(emotions_list))
# Find dominant emotion
emotion_counts = Counter(emotions_list)
dominant_emotion = emotion_counts.most_common(1)[0][0]
# Build results
results = {
"duration": duration_formatted,
"total_chunks": total_chunks,
"emotions_detected": unique_emotions,
"dominant_emotion": dominant_emotion,
"timeline": timeline
}
if progress_callback:
progress_callback(100, "Analysis complete!")
return results
        except Exception as e:
            raise RuntimeError(f"Audio processing failed: {e}") from e
# Global processor instance
_processor = None
def get_processor():
"""Get or create global processor instance"""
global _processor
if _processor is None:
_processor = AudioEmotionProcessor()
return _processor
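
# Minimal usage sketch. The file path and the print-based progress callback
# below are illustrative assumptions, not part of the module's API.
if __name__ == "__main__":
    def print_progress(progress, message):
        print(f"[{progress:3d}%] {message}")

    processor = get_processor()
    results = processor.process_audio_file("sample.wav", progress_callback=print_progress)
    print(f"Dominant emotion: {results['dominant_emotion']} "
          f"across {results['total_chunks']} chunk(s), duration {results['duration']}")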