# audioSentiment/audio_processor.py
import librosa
import numpy as np
from transformers import pipeline
from config import config
from models_config import get_model_config
from collections import Counter
class AudioEmotionProcessor:
"""Process audio files and extract emotions using ML models"""
def __init__(self):
self.model = None
self.model_name = config.MODEL_NAME
self.chunk_duration = config.CHUNK_DURATION
self.sample_rate = config.SAMPLE_RATE
# Get model-specific configuration
self.model_config = get_model_config(self.model_name)
self.label_mapping = self.model_config.get("label_mapping", {})
def load_model(self):
"""Load the emotion detection model"""
if self.model is None:
print(f"Loading model: {self.model_name}")
print(f"Model config: {self.model_config['description']}")
# Get task type from model config
task = self.model_config.get("task", "audio-classification")
try:
# Load model with configured task
self.model = pipeline(
task=task,
model=self.model_name
)
print("Model loaded successfully!")
        except Exception as e:
            print(f"Failed to load with task '{task}' ({e}); retrying with 'audio-classification'...")
try:
# Fallback: Try audio-classification
self.model = pipeline(
"audio-classification",
model=self.model_name
)
print("Model loaded successfully with audio-classification!")
except Exception as e2:
print(f"Error loading model: {e2}")
raise
return self.model
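    # Note: loading is lazy; predict_emotion() also calls load_model() on first
    # use, so the first chunk processed pays the model download/startup cost.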
def load_audio(self, filepath):
"""Load audio file and resample to target sample rate"""
audio, sr = librosa.load(filepath, sr=self.sample_rate)
# Normalize audio volume (boost quiet recordings)
audio = self.normalize_audio(audio)
return audio, sr
def normalize_audio(self, audio):
"""Normalize audio to increase volume"""
# Get max absolute value
max_val = np.max(np.abs(audio))
# Avoid division by zero
if max_val > 0:
# Normalize to 0.95 to prevent clipping
audio = audio / max_val * 0.95
return audio
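    # e.g., a quiet take peaking at 0.2 is scaled by 0.95 / 0.2 = 4.75x, so its
    # new peak is 0.95; a silent buffer (max 0) is returned unchanged.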
def get_audio_duration(self, audio, sr):
"""Get duration of audio in seconds"""
return librosa.get_duration(y=audio, sr=sr)
def split_into_chunks(self, audio, sr):
"""Split audio into fixed-duration chunks"""
chunk_samples = int(self.chunk_duration * sr)
chunks = []
for i in range(0, len(audio), chunk_samples):
chunk = audio[i:i + chunk_samples]
# Pad last chunk if it's shorter
if len(chunk) < chunk_samples:
chunk = np.pad(chunk, (0, chunk_samples - len(chunk)), mode='constant')
chunks.append(chunk)
return chunks
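    # e.g., with hypothetical config values chunk_duration=3 and sr=16000, a
    # 10 s clip yields 4 chunks of 48000 samples each, the last zero-padded.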
def predict_emotion(self, audio_chunk):
"""Predict emotion for a single audio chunk"""
if self.model is None:
self.load_model()
        # Run inference; pass the sampling rate explicitly so the pipeline can
        # resample if the model's feature extractor expects a different rate
        predictions = self.model({"raw": audio_chunk, "sampling_rate": self.sample_rate})
# Get top prediction
top_prediction = predictions[0]
# Debug: Print raw model output
print(f"DEBUG - Raw prediction: {top_prediction}")
# Map model output to our emotion labels
emotion_label = self.map_emotion_label(top_prediction['label'])
confidence = top_prediction['score']
return emotion_label, confidence
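    # The audio-classification pipeline returns a score-sorted list such as
    # [{"label": "hap", "score": 0.91}, ...], so predictions[0] is the top label.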
def map_emotion_label(self, model_label):
"""Map model output labels to standardized emotion names"""
# Different models may have different label formats
label_lower = model_label.lower()
# Use model-specific label mapping first
if label_lower in self.label_mapping:
return self.label_mapping[label_lower]
# Fallback to common variations
emotion_map = {
'hap': 'Happy',
'happy': 'Happy',
'happiness': 'Happy',
'sad': 'Sad',
'sadness': 'Sad',
'ang': 'Angry',
'angry': 'Angry',
'anger': 'Angry',
'neu': 'Neutral',
'neutral': 'Neutral',
'calm': 'Neutral',
'fear': 'Fear',
'fearful': 'Fear',
'surprise': 'Surprise',
'surprised': 'Surprise',
'disgust': 'Disgust'
}
# Try to find a match
for key, value in emotion_map.items():
if key in label_lower:
return value
# Default: capitalize first letter
return model_label.capitalize()
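    # e.g., "ang" or "ANGER" maps to "Angry"; a label like "boredom" (assuming
    # it is absent from both mappings) falls through to capitalize() -> "Boredom".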
def format_time(self, seconds):
"""Format seconds to MM:SS format"""
mins = int(seconds // 60)
secs = int(seconds % 60)
return f"{mins:02d}:{secs:02d}"
def process_audio_file(self, filepath, progress_callback=None):
"""
Process entire audio file and return emotion timeline
Args:
filepath: Path to audio file
progress_callback: Optional callback function(progress, message)
Returns:
dict: Results containing timeline and metadata
"""
try:
# Load model
if progress_callback:
progress_callback(10, "Loading model...")
self.load_model()
# Load audio
if progress_callback:
progress_callback(20, "Loading audio file...")
audio, sr = self.load_audio(filepath)
# Get duration
duration = self.get_audio_duration(audio, sr)
duration_formatted = self.format_time(duration)
# Split into chunks
if progress_callback:
progress_callback(30, "Splitting audio into segments...")
chunks = self.split_into_chunks(audio, sr)
# Process each chunk
timeline = []
total_chunks = len(chunks)
for i, chunk in enumerate(chunks):
# Calculate progress (30% to 90%)
progress = 30 + int((i / total_chunks) * 60)
if progress_callback:
progress_callback(
progress,
f"Analyzing chunk {i+1}/{total_chunks}..."
)
# Predict emotion
emotion, confidence = self.predict_emotion(chunk)
# Calculate timestamp
time_seconds = i * self.chunk_duration
time_formatted = self.format_time(time_seconds)
timeline.append({
"time": time_formatted,
"emotion": emotion,
"confidence": float(confidence)
})
# Calculate statistics
if progress_callback:
progress_callback(95, "Calculating statistics...")
emotions_list = [item['emotion'] for item in timeline]
unique_emotions = len(set(emotions_list))
# Find dominant emotion
emotion_counts = Counter(emotions_list)
dominant_emotion = emotion_counts.most_common(1)[0][0]
# Build results
results = {
"duration": duration_formatted,
"total_chunks": total_chunks,
"emotions_detected": unique_emotions,
"dominant_emotion": dominant_emotion,
"timeline": timeline
}
if progress_callback:
progress_callback(100, "Analysis complete!")
return results
        except Exception as e:
            raise RuntimeError(f"Audio processing failed: {e}") from e
# Global processor instance
_processor = None
def get_processor():
"""Get or create global processor instance"""
global _processor
if _processor is None:
_processor = AudioEmotionProcessor()
return _processor
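
# Minimal usage sketch (the file path and callback below are hypothetical; this
# assumes config and models_config provide a valid model name and settings):
if __name__ == "__main__":
    def print_progress(progress, message):
        # Console progress reporter matching the progress_callback signature
        print(f"[{progress:3d}%] {message}")

    processor = get_processor()
    results = processor.process_audio_file("sample.wav", progress_callback=print_progress)
    print(f"Dominant emotion: {results['dominant_emotion']} over {results['duration']}")
    for entry in results["timeline"]:
        print(f"  {entry['time']}  {entry['emotion']}  ({entry['confidence']:.2f})")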