Spaces:
Runtime error
Runtime error
| # voice_emotion_classification.py | |
| import os | |
| import subprocess | |
| import sys | |
| import pkg_resources | |
| import time | |
| import tempfile | |
| import numpy as np | |
| import warnings | |
| from pathlib import Path | |
| warnings.filterwarnings("ignore") | |
def install_package(package, version=None):
    """Install a distribution into the running interpreter's environment with pip.

    Args:
        package: Name of the distribution to install.
        version: Optional exact version. When truthy, the requirement is
            pinned as ``package==version``; otherwise any version is accepted.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
            The failure is printed before being re-raised.
    """
    requirement = package if not version else f"{package}=={version}"
    print(f"Installing {requirement}...")

    # Run pip through the current interpreter so the package lands in the
    # same environment this script is executing in.
    pip_command = [
        sys.executable, "-m", "pip", "install", "--no-cache-dir", requirement,
    ]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError as err:
        print(f"Failed to install {requirement}: {err}")
        raise
# Required packages mapped to an optional version pin (None means "any
# version"). Add a version string here if a specific release is needed.
required_packages = {
    "gradio": None,
    "torch": None,
    "torchaudio": None,
    "transformers": None,
    "librosa": None,
    "scipy": None,
    "matplotlib": None,
    "pydub": None,
}

# importlib.metadata is the standard-library replacement for the deprecated
# pkg_resources.working_set lookup and does not depend on setuptools.
import importlib
from importlib import metadata as _dist_metadata


def _normalized_name(name):
    """Return a distribution name lower-cased with underscores as hyphens."""
    return name.lower().replace("_", "-")


# Names of every distribution visible to this interpreter.
installed_packages = {
    _normalized_name(dist_name)
    for dist_name in (
        dist.metadata.get("Name") for dist in _dist_metadata.distributions()
    )
    if dist_name  # skip broken installs that carry no Name field
}

_installed_something = False
for package, version in required_packages.items():
    if _normalized_name(package) not in installed_packages:
        install_package(package, version)
        _installed_something = True

if _installed_something:
    # Packages installed while the process is running are only guaranteed to
    # be importable after the import system's finder caches are cleared.
    importlib.invalidate_caches()
| # Now import all necessary packages | |
| import gradio as gr | |
| import torch | |
| import torchaudio | |
| import librosa | |
| import matplotlib.pyplot as plt | |
| from matplotlib.colors import LinearSegmentedColormap | |
| from pydub import AudioSegment | |
| import scipy | |
| import io | |
| from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification | |
| from pathlib import Path | |
| import matplotlib | |
| matplotlib.use('Agg') # Use non-interactive backend | |
# Human-readable description for each canonical emotion label.
# The key order is significant: it is the label order used when charting.
EMOTION_DESCRIPTIONS = {
    "angry":
        "Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.",
    "disgust":
        "Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.",
    "fear":
        "Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.",
    "happy":
        "Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.",
    "neutral":
        "Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.",
    "sad":
        "Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.",
    "surprise":
        "Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic.",
}

# Coarse tone bucket -> canonical emotions that belong to it.
TONE_MAPPING = {
    "positive": ["happy", "surprise"],
    "neutral": ["neutral"],
    "negative": ["angry", "sad", "fear", "disgust"],
}

# Abbreviated labels emitted by some Hugging Face models ("hap", "ang", ...)
# translated to the canonical labels used as keys of EMOTION_DESCRIPTIONS.
MODEL_TO_EMOTION_MAP = {
    "hap": "happy",
    "ang": "angry",
    "sad": "sad",
    "dis": "disgust",
    "fea": "fear",
    "neu": "neutral",
    "sur": "surprise",
}
# Module-wide cache for the audio-classification pipeline; stays None until
# load_emotion_model() succeeds.
audio_emotion_classifier = None


def load_emotion_model():
    """Create the emotion classifier on first use and reuse it afterwards.

    Returns:
        True when the classifier is available (freshly loaded or already
        cached), False when loading raised an exception. The exception is
        printed, not propagated.
    """
    global audio_emotion_classifier

    # Fast path: an earlier call already built the pipeline.
    if audio_emotion_classifier is not None:
        return True

    try:
        print("Loading emotion classification model...")
        checkpoint = "superb/hubert-large-superb-er"
        audio_emotion_classifier = pipeline("audio-classification", model=checkpoint)
        print("Emotion classification model loaded successfully")
    except Exception as err:
        print(f"Error loading emotion model: {err}")
        return False
    return True
def convert_audio_to_wav(audio_file):
    """Convert an audio file to WAV and return the path of the converted copy.

    Args:
        audio_file: Source audio handed to ``AudioSegment.from_file``.

    Returns:
        Path of a newly created temporary ``.wav`` file, or ``None`` when the
        input could not be read or exported (the error is printed). The
        temporary file is not deleted automatically.
    """
    wav_path = None
    try:
        audio = AudioSegment.from_file(audio_file)
        # Reserve a unique file name, then close the handle before exporting
        # so the exporter can open the path itself.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
            wav_path = temp_wav.name
        audio.export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        print(f"Error converting audio: {e}")
        # The temp file was created with delete=False; remove it ourselves so
        # a failed export does not leave an orphaned file behind.
        if wav_path is not None:
            try:
                os.unlink(wav_path)
            except OSError:
                pass
        return None
def analyze_audio_emotions(audio_file, progress=gr.Progress(), chunk_duration=5):
    """Classify the emotion of an audio file chunk by chunk.

    The audio is loaded at 16 kHz, split into ``chunk_duration``-second
    pieces, and each piece of at least 0.5 s is written to a temporary WAV
    file and passed to the global emotion classifier.

    Args:
        audio_file: Path of the audio file. Files whose name does not end in
            ``.wav`` are converted with ``convert_audio_to_wav`` first.
        progress: Callable invoked as ``progress(fraction, description)``
            after each chunk is scheduled.
        chunk_duration: Length of each analysis window in seconds.

    Returns:
        A 4-tuple ``(image_path, audio_path, summary, detailed_results)``.
        On any failure the tuple is ``(None, None, error_message, None)`` so
        callers can always unpack four values.
    """
    # Imported explicitly because, on older SciPy releases, a bare
    # ``import scipy`` does not load the ``scipy.io.wavfile`` submodule.
    from scipy.io import wavfile

    if not load_emotion_model():
        return (
            None,
            None,
            "Failed to load emotion classification model. Please check console for details.",
            None,
        )

    # If the file is already a WAV (any letter case), use it directly.
    if audio_file.lower().endswith('.wav'):
        audio_path = audio_file
    else:
        audio_path = convert_audio_to_wav(audio_file)
        if not audio_path:
            return (
                None,
                None,
                "Failed to process audio file. Unsupported format or corrupted file.",
                None,
            )

    try:
        audio_data, sample_rate = librosa.load(audio_path, sr=16000)
        duration = len(audio_data) / sample_rate

        chunk_samples = int(chunk_duration * sample_rate)
        num_chunks = max(1, int(np.ceil(len(audio_data) / chunk_samples)))
        min_chunk_samples = 0.5 * sample_rate  # skip fragments under 0.5 s

        all_emotions = []
        time_points = []
        for i in range(num_chunks):
            progress((i + 1) / num_chunks, "Analyzing audio emotions...")
            start_idx = i * chunk_samples
            end_idx = min(start_idx + chunk_samples, len(audio_data))
            chunk = audio_data[start_idx:end_idx]
            if len(chunk) < min_chunk_samples:
                continue

            # Reserve a temp file name for this chunk; the handle is closed
            # before the WAV writer opens the path.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_chunk:
                chunk_path = temp_chunk.name
            try:
                # Clip before scaling so out-of-range samples cannot wrap
                # around when cast to 16-bit integers.
                pcm = (np.clip(chunk, -1.0, 1.0) * 32767).astype(np.int16)
                wavfile.write(chunk_path, sample_rate, pcm)
                results = audio_emotion_classifier(chunk_path)
            finally:
                # Always remove the chunk file, even if classification fails.
                os.unlink(chunk_path)

            all_emotions.append(results)
            time_points.append((start_idx / sample_rate, end_idx / sample_rate))

        # Render the chart to a PNG that the UI can display.
        fig, detailed_results = generate_emotion_timeline(all_emotions, time_points, duration)
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img:
            img_path = temp_img.name
        fig.savefig(img_path, dpi=100, bbox_inches='tight')
        plt.close(fig)

        summary = generate_emotion_summary(all_emotions, time_points)
        return img_path, audio_path, summary, detailed_results
    except Exception as e:
        print(f"Error analyzing audio: {e}")
        import traceback
        traceback.print_exc()
        return None, None, f"Error analyzing audio: {str(e)}", None
def _top_canonical_emotion(chunk_predictions):
    """Return ``(canonical_label, score)`` for a chunk's best prediction.

    ``chunk_predictions`` must be a non-empty list of dicts with ``'label'``
    and ``'score'`` keys. The winning label is lower-cased, stripped and
    translated through ``MODEL_TO_EMOTION_MAP``; labels without a mapping are
    returned as-is.
    """
    best = max(chunk_predictions, key=lambda item: item['score'])
    raw_label = best['label'].lower().strip()
    return MODEL_TO_EMOTION_MAP.get(raw_label, raw_label), best['score']


def generate_emotion_timeline(all_emotions, time_points, duration):
    """Build the emotion/tone bar charts and the per-chunk result rows.

    Args:
        all_emotions: One entry per analysed chunk; each entry is a list of
            ``{'label': ..., 'score': ...}`` dicts. Empty entries are ignored.
        time_points: ``(start_seconds, end_seconds)`` tuples parallel to
            ``all_emotions``.
        duration: Accepted for interface compatibility; not used.

    Returns:
        ``(fig, detailed_results)`` where ``fig`` is the matplotlib figure
        (emotion distribution on top, tone analysis below) and
        ``detailed_results`` is a list of dicts with the keys ``'Time Range'``,
        ``'Emotion'``, ``'Tone'``, ``'Confidence'`` and ``'Description'``.
    """
    # Resolve each chunk's winning label once; None marks a chunk for which
    # the classifier returned nothing.
    chunk_tops = [
        _top_canonical_emotion(predictions) if predictions else None
        for predictions in all_emotions
    ]

    emotion_counts = {}
    for top in chunk_tops:
        if top is None:
            continue
        emotion_counts[top[0]] = emotion_counts.get(top[0], 0) + 1

    # Divide by the number of chunks that were actually classified so the
    # shares add up to 100% even when some chunks produced no prediction.
    classified_chunks = sum(emotion_counts.values())
    emotion_percentages = {
        label: count / classified_chunks * 100
        for label, count in emotion_counts.items()
    }
    # Known emotions that never appeared still get a (zero-height) bar.
    for label in EMOTION_DESCRIPTIONS:
        emotion_percentages.setdefault(label, 0.0)

    sorted_emotions = sorted(emotion_percentages.items(), key=lambda x: x[1], reverse=True)

    # height_ratios goes through gridspec_kw, which is accepted by Matplotlib
    # releases that predate the direct ``height_ratios`` keyword.
    fig, (ax1, ax2) = plt.subplots(
        2, 1, figsize=(12, 10),
        gridspec_kw={'height_ratios': [3, 1], 'hspace': 0.3},
    )

    display_names = [label.capitalize() for label, _ in sorted_emotions]
    percentages = [share for _, share in sorted_emotions]

    # Colours are keyed by emotion so each emotion keeps the same colour no
    # matter where it lands after sorting; unknown labels fall back to grey.
    emotion_colors = {
        'angry': 'red',
        'disgust': 'brown',
        'fear': 'purple',
        'happy': 'green',
        'neutral': 'gray',
        'sad': 'blue',
        'surprise': 'orange',
    }
    bar_colors = [emotion_colors.get(label, '#666666') for label, _ in sorted_emotions]

    bars = ax1.bar(display_names, percentages, color=bar_colors)
    for bar in bars:
        height = bar.get_height()
        ax1.annotate(f'{height:.1f}%',
                     xy=(bar.get_x() + bar.get_width() / 2, height),
                     xytext=(0, 3),  # 3 points vertical offset
                     textcoords="offset points",
                     ha='center', va='bottom')
    ax1.set_ylim(0, 100)  # fixed 100% scale
    ax1.set_ylabel('Percentage (%)')
    ax1.set_title('Emotion Distribution')
    ax1.grid(axis='y', linestyle='--', alpha=0.7)

    # Fold the emotion shares into the three tone buckets.
    tone_percentages = {"positive": 0, "neutral": 0, "negative": 0}
    for emotion_label, percentage in emotion_percentages.items():
        for tone, emotions_list in TONE_MAPPING.items():
            if emotion_label in emotions_list:
                tone_percentages[tone] += percentage

    tones = list(tone_percentages.keys())
    tone_values = list(tone_percentages.values())
    tone_colors = {'positive': 'green', 'neutral': 'gray', 'negative': 'red'}
    tone_bars = ax2.bar(tones, tone_values, color=[tone_colors[t] for t in tones])
    for bar in tone_bars:
        height = bar.get_height()
        if height > 0:  # only label bars that are visible
            ax2.annotate(f'{height:.1f}%',
                         xy=(bar.get_x() + bar.get_width() / 2, height),
                         xytext=(0, 3),
                         textcoords="offset points",
                         ha='center', va='bottom')
    ax2.set_ylim(0, 100)
    ax2.set_ylabel('Percentage (%)')
    ax2.set_title('Tone Analysis')
    ax2.grid(axis='y', linestyle='--', alpha=0.7)
    fig.tight_layout()

    # One row per classified chunk for the detailed table.
    detailed_results = []
    for top, (start_time, end_time) in zip(chunk_tops, time_points):
        if top is None:
            continue
        canonical_label, score = top
        tone = next(
            (t for t, e_list in TONE_MAPPING.items() if canonical_label in e_list),
            "unknown",
        )
        detailed_results.append({
            'Time Range': f"{start_time:.1f}s - {end_time:.1f}s",
            'Emotion': canonical_label,
            'Tone': tone.capitalize(),
            'Confidence': f"{score:.2f}",
            'Description': EMOTION_DESCRIPTIONS.get(canonical_label, "")
        })
    return fig, detailed_results
def generate_emotion_summary(all_emotions, time_points):
    """Create a Markdown summary of the per-chunk emotion predictions.

    Args:
        all_emotions: One entry per analysed chunk; each entry is a list of
            ``{'label': ..., 'score': ...}`` dicts. Empty entries are ignored.
        time_points: Accepted for interface compatibility; not used.

    Returns:
        Markdown text naming the dominant emotion, its description and the
        full distribution, or ``"No emotional content detected."`` when no
        chunk carries a prediction.
    """
    emotion_counts = {}
    for predictions in all_emotions:
        if not predictions:
            continue
        best = max(predictions, key=lambda item: item['score'])
        # Normalise the model's label to the canonical label set.
        raw_label = best['label'].lower().strip()
        label = MODEL_TO_EMOTION_MAP.get(raw_label, raw_label)
        emotion_counts[label] = emotion_counts.get(label, 0) + 1

    if not emotion_counts:
        return "No emotional content detected."

    # Use the number of classified chunks as the denominator so the listed
    # shares add up to 100% even if some chunks had no prediction.
    classified_chunks = sum(emotion_counts.values())
    emotion_percentages = {
        label: count / classified_chunks * 100
        for label, count in emotion_counts.items()
    }

    # Stable sort: ties keep first-seen order, so the first row is the same
    # emotion a plain max() over the dict would pick.
    ranked = sorted(emotion_percentages.items(), key=lambda item: item[1], reverse=True)
    dominant_emotion, dominant_share = ranked[0]

    parts = [
        "### Voice Emotion Analysis Summary\n\n",
        f"**Dominant emotion:** {dominant_emotion.capitalize()} ({dominant_share:.1f}%)\n\n",
        f"**Description:** {EMOTION_DESCRIPTIONS.get(dominant_emotion, '')}\n\n",
        "**Emotion distribution:**\n",
    ]
    parts.extend(
        f"- {emotion.capitalize()}: {percentage:.1f}%\n"
        for emotion, percentage in ranked
    )
    parts.append(
        f"\n**Interpretation:** The voice predominantly expresses {dominant_emotion} emotion"
    )
    return "".join(parts)
def record_audio(audio):
    """Write recorded audio bytes to a temporary ``.wav`` file.

    This only saves the data; it does not run any analysis.

    Args:
        audio: Bytes-like object containing the recording.

    Returns:
        Path of the temporary file, or ``None`` when the data could not be
        written (the error is printed). The file is not deleted automatically.
    """
    audio_path = None
    try:
        # Write through the handle we already hold rather than closing the
        # temp file and reopening it by name.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
            audio_path = temp_file.name
            temp_file.write(audio)
        return audio_path
    except Exception as e:
        print(f"Error saving recorded audio: {e}")
        # delete=False means nothing else will clean up after a failed write.
        if audio_path is not None:
            try:
                os.unlink(audio_path)
            except OSError:
                pass
        return None
def process_audio(audio_file, progress=gr.Progress()):
    """Run the emotion analysis for the UI and shape its outputs.

    Args:
        audio_file: Path of the uploaded or recorded audio, or ``None`` when
            the user supplied nothing.
        progress: Progress callback forwarded to ``analyze_audio_emotions``.

    Returns:
        A 4-tuple ``(image_path, audio_path, summary, table_rows)``. On
        failure the first, second and fourth items are ``None`` and the third
        carries the error message. ``table_rows`` is a list of row lists in
        the column order of the results table.
    """
    fallback_message = "Failed to analyze audio emotions."
    if audio_file is None:
        return None, None, "No audio file provided.", None

    outcome = analyze_audio_emotions(audio_file, progress)
    if len(outcome) != 4:
        # Some analyzer failure paths return only (None, message); unpacking
        # those into four names would raise instead of showing the message.
        return None, None, outcome[-1] or fallback_message, None

    img_path, processed_audio, summary, results = outcome
    if img_path is None:
        # Keep the analyzer's specific error text when it provided one.
        return None, None, summary or fallback_message, None

    # The results table is declared with these headers; convert dict rows to
    # plain lists in that order so the table receives row lists, not dicts.
    columns = ["Time Range", "Emotion", "Tone", "Confidence", "Description"]
    table_rows = [
        [row.get(column, "") for column in columns] if isinstance(row, dict) else row
        for row in (results or [])
    ]
    return img_path, processed_audio, summary, table_rows
# Build the Gradio interface: one tab for uploaded files and one for
# microphone recordings. Both tabs feed process_audio and show the same four
# outputs (chart image, audio playback, Markdown summary, results table).
with gr.Blocks(title="Voice Emotion Analysis System") as demo:
    # The emoji below were restored from mis-decoded UTF-8 byte sequences.
    gr.Markdown("""
    # 🎙️ Voice Emotion Analysis System
    This app analyzes the emotional content of voice recordings.
    It detects emotions including:
    * 😡 **Anger**
    * 🤢 **Disgust**
    * 😨 **Fear**
    * 😊 **Happiness**
    * 😐 **Neutral**
    * 😢 **Sadness**
    * 😲 **Surprise**
    And provides a detailed analysis and timeline.
    """)
    with gr.Tabs():
        with gr.TabItem("Upload Audio"):
            with gr.Row():
                with gr.Column(scale=1):
                    audio_input = gr.Audio(
                        label="Upload Audio File",
                        type="filepath",
                        sources=["upload"]
                    )
                    process_btn = gr.Button("Analyze Voice Emotions")
                with gr.Column(scale=2):
                    emotion_timeline = gr.Image(label="Emotion Timeline", show_label=True)
            with gr.Row():
                audio_playback = gr.Audio(label="Processed Audio", show_label=True)
                emotion_summary = gr.Markdown(label="Emotion Summary")
            with gr.Row():
                emotion_results = gr.DataFrame(
                    headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
                    label="Detailed Emotion Analysis"
                )
            process_btn.click(
                fn=process_audio,
                inputs=[audio_input],
                outputs=[emotion_timeline, audio_playback, emotion_summary, emotion_results]
            )
        with gr.TabItem("Record Voice"):
            with gr.Row():
                with gr.Column(scale=1):
                    record_input = gr.Audio(
                        label="Record Your Voice",
                        sources=["microphone"],
                        type="filepath"
                    )
                    analyze_btn = gr.Button("Analyze Recording")
                with gr.Column(scale=2):
                    rec_emotion_timeline = gr.Image(label="Emotion Timeline", show_label=True)
            with gr.Row():
                rec_audio_playback = gr.Audio(label="Processed Audio", show_label=True)
                rec_emotion_summary = gr.Markdown(label="Emotion Summary")
            with gr.Row():
                rec_emotion_results = gr.DataFrame(
                    headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
                    label="Detailed Emotion Analysis"
                )
            analyze_btn.click(
                fn=process_audio,
                inputs=[record_input],
                outputs=[rec_emotion_timeline, rec_audio_playback, rec_emotion_summary, rec_emotion_results]
            )
    gr.Markdown("""
    ### How to Use
    1. **Upload Audio Tab:** Upload an audio file and click "Analyze Voice Emotions".
    2. **Record Voice Tab:** Record your voice and click "Analyze Recording".
    **Tips:**
    - Use clear recordings with minimal background noise.
    - Longer recordings yield more consistent results.
    """)
def initialize_app():
    """Load the emotion model ahead of the first request and report the result."""
    print("Initializing voice emotion analysis app...")
    outcome = (
        "Emotion model loaded successfully!"
        if load_emotion_model()
        else "Failed to load emotion model."
    )
    print(outcome)
# Script entry point: warm up the model first, then start the Gradio server.
if __name__ == "__main__":
    initialize_app()
    demo.launch()