Spaces:

Natwar
/

VoiceAnalysis

Runtime error

App Files Files Community

Natwar committed on Apr 13, 2025

Commit

e89b55b

verified ·

1 Parent(s): 343474c

Update app.py

Browse files

Files changed (1) hide show

app.py +286 -665

app.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import os
 import subprocess
 import sys
@@ -18,7 +20,7 @@ def install_package(package, version=None):
         print(f"Failed to install {package_spec}: {e}")
         raise
-# Required packages (add version pins if needed)
 required_packages = {
     "gradio": None,
     "torch": None,
@@ -27,8 +29,7 @@ required_packages = {
     "librosa": None,
     "scipy": None,
     "matplotlib": None,
-    "pydub": None,
-    "plotly": None
 }
 installed_packages = {pkg.key for pkg in pkg_resources.working_set}
@@ -36,18 +37,20 @@ for package, version in required_packages.items():
     if package not in installed_packages:
         install_package(package, version)
-# Now import necessary packages
 import gradio as gr
 import torch
 import torchaudio
 import librosa
-import matplotlib
-matplotlib.use('Agg')  # non-interactive backend for any fallback
 from pydub import AudioSegment
 import scipy
 import io
 from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
-import plotly.graph_objects as go
 # Define emotion labels, tone mapping, and descriptions
 EMOTION_DESCRIPTIONS = {
@@ -60,22 +63,35 @@ EMOTION_DESCRIPTIONS = {
     "surprise": "Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic."
 }
-# If you wish to group emotions by tone, you can do so here:
 TONE_MAPPING = {
     "positive": ["happy", "surprise"],
     "neutral": ["neutral"],
     "negative": ["angry", "sad", "fear", "disgust"]
 }
 # Global variable for the emotion classifier
 audio_emotion_classifier = None
 def load_emotion_model():
-    """Load and cache the speech emotion classification model."""
     global audio_emotion_classifier
     if audio_emotion_classifier is None:
         try:
             print("Loading emotion classification model...")
             model_name = "superb/hubert-large-superb-er"
             audio_emotion_classifier = pipeline("audio-classification", model=model_name)
             print("Emotion classification model loaded successfully")
@@ -86,7 +102,7 @@ def load_emotion_model():
     return True
 def convert_audio_to_wav(audio_file):
-    """Convert uploaded audio to WAV format."""
     try:
         audio = AudioSegment.from_file(audio_file)
         with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
@@ -97,749 +113,354 @@ def convert_audio_to_wav(audio_file):
         print(f"Error converting audio: {e}")
         return None
-def analyze_voice_tone(audio_file):
-    """
-    Analyze the tone characteristics of the voice using more robust measurements.
-    Includes pitch variation, energy dynamics, and spectral features.
-    """
-    try:
-        audio_data, sample_rate = librosa.load(audio_file, sr=16000)
-        # 1. Basic audio features
-        audio_duration = librosa.get_duration(y=audio_data, sr=sample_rate)
-        if audio_duration < 1.0:  # Too short for reliable analysis
-            return "Audio too short for reliable tone analysis. Please provide at least 3 seconds."
-        # 2. Pitch analysis with more robust handling
-        f0, voiced_flag, voiced_prob = librosa.pyin(
-            audio_data,
-            fmin=librosa.note_to_hz('C2'),
-            fmax=librosa. note_to_hz('C7'),
-            sr=sample_rate
-        )
-        # Filter out NaN values and get valid pitch points
-        valid_f0 = f0[~np.isnan(f0)]
-        # If no pitch detected, may be noise or silence
-        if len(valid_f0) < 10:
-            return "**Voice Tone Analysis:** Unable to detect sufficient pitched content for analysis. The audio may contain primarily noise, silence, or non-speech sounds."
-        # 3. Calculate improved statistics
-        mean_pitch = np.mean(valid_f0)
-        median_pitch = np.median(valid_f0)
-        std_pitch = np.std(valid_f0)
-        pitch_range = np.percentile(valid_f0, 95) - np.percentile(valid_f0, 5)
-        # 4. Energy/volume dynamics
-        rms_energy = librosa.feature.rms(y=audio_data)[0]
-        mean_energy = np.mean(rms_energy)
-        std_energy = np.std(rms_energy)
-        energy_range = np.percentile(rms_energy, 95) - np.percentile(rms_energy, 5)
-        # 5. Speaking rate approximation (zero-crossing rate can help estimate this)
-        zcr = librosa.feature.zero_crossing_rate(audio_data)[0]
-        mean_zcr = np.mean(zcr)
-        # 6. Calculate pitch variability relative to the mean (coefficient of variation)
-        # This gives a better measure than raw std dev
-        pitch_cv = (std_pitch / mean_pitch) * 100 if mean_pitch > 0 else 0
-        # 7. Tone classification logic using multiple features
-        # Define tone characteristics based on combinations of features
-        tone_class = ""
-        tone_details = []
-        # Pitch-based characteristics
-        if pitch_cv < 5:
-            tone_class = "Monotone"
-            tone_details.append("Very little pitch variation - sounds flat and unexpressive")
-        elif pitch_cv < 12:
-            tone_class = "Steady"
-            tone_details.append("Moderate pitch variation - sounds controlled and measured")
-        elif pitch_cv < 20:
-            tone_class = "Expressive"
-            tone_details.append("Good pitch variation - sounds naturally engaging")
-        else:
-            tone_class = "Highly Dynamic"
-            tone_details.append("Strong pitch variation - sounds animated and emphatic")
-        # Pitch range classification
-        if mean_pitch > 180:
-            tone_details.append("Higher pitched voice - may convey excitement or tension")
-        elif mean_pitch < 120:
-            tone_details.append("Lower pitched voice - may convey calmness or authority")
-        else:
-            tone_details.append("Mid-range pitch - typically perceived as balanced")
-        # Energy/volume characteristics
-        energy_cv = (std_energy / mean_energy) * 100 if mean_energy > 0 else 0
-        if energy_cv < 10:
-            tone_details.append("Consistent volume - sounds controlled and measured")
-        elif energy_cv > 30:
-            tone_details.append("Variable volume - suggests emotional emphasis or expressiveness")
-        # Speech rate approximation
-        if mean_zcr > 0.1:
-            tone_details.append("Faster speech rate - may convey urgency or enthusiasm")
-        elif mean_zcr < 0.05:
-            tone_details.append("Slower speech rate - may convey thoughtfulness or hesitation")
-        # Generate tone summary and interpretation
-        tone_analysis = f"### Voice Tone Analysis\n\n"
-        tone_analysis += f"**Primary tone quality:** {tone_class}\n\n"
-        tone_analysis += "**Tone characteristics:**\n"
-        for detail in tone_details:
-            tone_analysis += f"- {detail}\n"
-        tone_analysis += "\n**Interpretation:**\n"
-        # Generate interpretation based on the classified tone
-        if tone_class == "Monotone":
-            tone_analysis += ("A monotone delivery can create distance and reduce engagement. "
-                             "Consider adding more vocal variety to sound more engaging and authentic.")
-        elif tone_class == "Steady":
-            tone_analysis += ("Your steady tone suggests reliability and control. "
-                             "This can be effective in professional settings or when conveying serious information.")
-        elif tone_class == "Expressive":
-            tone_analysis += ("Your expressive tone helps maintain listener interest and emphasize key points. "
-                             "This naturally engaging quality helps convey authenticity and conviction.")
-        else:  # Highly Dynamic
-            tone_analysis += ("Your highly dynamic vocal style conveys strong emotion and energy. "
-                             "This can be powerful for storytelling and persuasion, though in some contexts "
-                             "a more measured approach might be appropriate.")
-        return tone_analysis
-    except Exception as e:
-        print(f"Error in tone analysis: {e}")
-        return "Tone analysis unavailable due to an error processing the audio."
-def analyze_audio_emotions(audio_file, progress=gr.Progress(), chunk_duration=2):
     """
-    Analyze speech emotions in short chunks,
-    building a timeline of confidence for each emotion.
-    Returns a Plotly figure, summary text, detailed results.
     """
     if not load_emotion_model():
-        return None, "Failed to load emotion classifier.", None
-    # Use existing WAV if possible, else convert
-    if audio_file.endswith(".wav"):
         audio_path = audio_file
     else:
         audio_path = convert_audio_to_wav(audio_file)
         if not audio_path:
-            return None, "Could not process audio file", None
     try:
-        # Load with librosa
         audio_data, sample_rate = librosa.load(audio_path, sr=16000)
         duration = len(audio_data) / sample_rate
-        # Use shorter chunks for more granular analysis
         chunk_samples = int(chunk_duration * sample_rate)
         num_chunks = max(1, int(np.ceil(len(audio_data) / chunk_samples)))
         all_emotions = []
         time_points = []
-        # For each chunk, run emotion classification
         for i in range(num_chunks):
             progress((i + 1) / num_chunks, "Analyzing audio emotions...")
             start_idx = i * chunk_samples
             end_idx = min(start_idx + chunk_samples, len(audio_data))
             chunk = audio_data[start_idx:end_idx]
-            # Skip very short chunks
             if len(chunk) < 0.5 * sample_rate:
                 continue
-            # Write chunk to temp WAV
             with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_chunk:
                 chunk_path = temp_chunk.name
                 scipy.io.wavfile.write(chunk_path, sample_rate, (chunk * 32767).astype(np.int16))
-            # Classify - extract top-n predictions for each chunk
-            raw_results = audio_emotion_classifier(chunk_path, top_k=7)  # Get all 7 emotions
-            os.unlink(chunk_path)
-            all_emotions.append(raw_results)
             time_points.append((start_idx / sample_rate, end_idx / sample_rate))
-        # Skip if no valid emotions detected
-        if not all_emotions:
-            return None, "No speech detected in the audio.", None
-        # Build Plotly chart with improved styling
-        fig = build_plotly_line_chart(all_emotions, time_points, duration)
-        # Build summary and detailed results
-        summary_text = generate_emotion_summary(all_emotions)
-        detailed_results = build_detailed_results(all_emotions, time_points)
-        return fig, summary_text, detailed_results
     except Exception as e:
         import traceback
         traceback.print_exc()
-        return None, f"Error analyzing audio: {str(e)}", None
-def smooth_data(data, window_size=3):
-    """Apply a moving average smoothing to the data"""
-    smoothed = np.convolve(data, np.ones(window_size)/window_size, mode='valid')
-    # Add back points that were lost in the convolution
-    padding = len(data) - len(smoothed)
-    if padding > 0:
-        # Add padding at the beginning
-        padding_front = padding // 2
-        padding_back = padding - padding_front
-        # Use the first/last values for padding
-        front_padding = [smoothed[0]] * padding_front
-        back_padding = [smoothed[-1]] * padding_back
-        smoothed = np.concatenate([front_padding, smoothed, back_padding])
-    return smoothed
-def build_plotly_line_chart(all_emotions, time_points, duration):
     """
-    Create an improved Plotly line chart with toggles for each emotion.
-    Shows all emotions for each time point rather than just the top one.
     """
     emotion_labels = list(EMOTION_DESCRIPTIONS.keys())
-    # Custom color scheme for emotions
-    colors = {
-        "angry": "#E53935",     # Red
-        "disgust": "#8E24AA",   # Purple
-        "fear": "#7B1FA2",      # Deep Purple
-        "happy": "#FFC107",     # Amber/Yellow
-        "neutral": "#78909C",   # Blue Grey
-        "sad": "#1E88E5",       # Blue
-        "surprise": "#43A047"   # Green
-    }
-    # Prepare data structure for all emotions
-    emotion_data = {label: [] for label in emotion_labels}
-    timeline_times = [(start + end) / 2 for start, end in time_points]
-    # Process emotion scores - ensure all emotions have values
-    for chunk_emotions in all_emotions:
-        # Create a mapping of label to score for this chunk
-        scores = {item["label"]: item["score"] for item in chunk_emotions}
-        # Ensure all emotion labels have a value (default to 0.0)
-        for label in emotion_labels:
-            emotion_data[label].append(scores.get(label, 0.0))
-    # Smooth the data
-    for label in emotion_labels:
-        if len(emotion_data[label]) > 2:
-            emotion_data[label] = smooth_data(emotion_data[label])
-    # Build the chart
-    fig = go.Figure()
-    # Add traces for each emotion
-    for label in emotion_labels:
-        fig.add_trace(
-            go.Scatter(
-                x=timeline_times,
-                y=emotion_data[label],
-                mode='lines',
-                name=label.capitalize(),
-                line=dict(
-                    color=colors.get(label, None),
-                    width=3,
-                    shape='spline',  # Curved lines
-                    smoothing=1.3
-                ),
-                hovertemplate=f'{label.capitalize()}: %{{y:.2f}}<extra></extra>',
-            )
-        )
-    # Add markers for dominant emotion at each point
-    dominant_markers_x = []
-    dominant_markers_y = []
-    dominant_markers_text = []
-    dominant_markers_color = []
-    for i, time in enumerate(timeline_times):
-        scores = {label: emotion_data[label][i] for label in emotion_labels}
-        dominant = max(scores.items(), key=lambda x: x[1])
-        dominant_markers_x.append(time)
-        dominant_markers_y.append(dominant[1])
-        dominant_markers_text.append(f"{dominant[0].capitalize()}: {dominant[1]:.2f}")
-        dominant_markers_color.append(colors.get(dominant[0], "#000000"))
-    fig.add_trace(
-        go.Scatter(
-            x=dominant_markers_x,
-            y=dominant_markers_y,
-            mode='markers',
-            marker=dict(
-                size=10,
-                color=dominant_markers_color,
-                line=dict(width=2, color='white')
-            ),
-            name="Dominant Emotion",
-            text=dominant_markers_text,
-            hoverinfo="text",
-            hovertemplate='%{text}<extra></extra>'
-        )
-    )
-    # Add area chart for better visualization
-    for label in emotion_labels:
-        fig.add_trace(
-            go.Scatter(
-                x=timeline_times,
-                y=emotion_data[label],
-                mode='none',
-                name=f"{label.capitalize()} Area",
-                fill='tozeroy',
-                fillcolor=f"rgba{tuple(list(int(colors.get(label, '#000000').lstrip('#')[i:i+2], 16) for i in (0, 2, 4)) + [0.1])}",
-                showlegend=False,
-                hoverinfo='skip'
-            )
-        )
-    # Improve layout
-    fig.update_layout(
-        title={
-            'text': "Voice Emotion Analysis Over Time",
-            'font': {'size': 22, 'family': 'Arial, sans-serif'}
-        },
-        xaxis_title="Time (seconds)",
-        yaxis_title="Confidence Score",
-        yaxis=dict(
-            range=[0, 1.0],
-            showgrid=True,
-            gridcolor='rgba(230, 230, 230, 0.8)'
-        ),
-        xaxis=dict(
-            showgrid=True,
-            gridcolor='rgba(230, 230, 230, 0.8)'
-        ),
-        plot_bgcolor='white',
-        legend=dict(
-            bordercolor='rgba(0,0,0,0.1)',
-            borderwidth=1,
-            orientation="h",
-            yanchor="bottom",
-            y=1.02,
-            xanchor="right",
-            x=1
-        ),
-        hovermode='closest',
-        height=500,  # Larger size for better viewing
-        margin=dict(l=10, r=10, t=80, b=50)
-    )
-    return fig
-def generate_alternative_chart(all_emotions, time_points):
-    """
-    Create a stacked area chart to better visualize emotion changes over time
-    """
-    emotion_labels = list(EMOTION_DESCRIPTIONS.keys())
-    # Custom color scheme for emotions - more visible/distinct
-    colors = {
-        "angry": "#F44336",     # Red
-        "disgust": "#9C27B0",   # Purple
-        "fear": "#673AB7",      # Deep Purple
-        "happy": "#FFC107",     # Amber
-        "neutral": "#607D8B",   # Blue Grey
-        "sad": "#2196F3",       # Blue
-        "surprise": "#4CAF50"   # Green
     }
-    # Prepare timeline points
-    timeline_times = [(start + end) / 2 for start, end in time_points]
-    # Prepare data structure for all emotions
-    emotion_data = {label: [] for label in emotion_labels}
-    # Process emotion scores - ensure all emotions have values
-    for chunk_emotions in all_emotions:
-        # Create a mapping of label to score for this chunk
-        scores = {item["label"]: item["score"] for item in chunk_emotions}
-        # Ensure all emotion labels have a value (default to 0.0)
-        for label in emotion_labels:
-            emotion_data[label].append(scores.get(label, 0.0))
-    # Create the stacked area chart
-    fig = go.Figure()
-    # Add each emotion as a separate trace
     for label in emotion_labels:
-        fig.add_trace(
-            go.Scatter(
-                x=timeline_times,
-                y=emotion_data[label],
-                mode='lines',
-                name=label.capitalize(),
-                line=dict(width=0.5, color=colors.get(label, None)),
-                stackgroup='one',  # This makes it a stacked area chart
-                fillcolor=colors.get(label, None),
-                hovertemplate=f'{label.capitalize()}: %{{y:.2f}}<extra></extra>'
-            )
-        )
-    # Improve layout
-    fig.update_layout(
-        title={
-            'text': "Voice Emotion Distribution Over Time",
-            'font': {'size': 22, 'family': 'Arial, sans-serif'}
-        },
-        xaxis_title="Time (seconds)",
-        yaxis_title="Emotion Intensity",
-        yaxis=dict(
-            showgrid=True,
-            gridcolor='rgba(230, 230, 230, 0.8)'
-        ),
-        xaxis=dict(
-            showgrid=True,
-            gridcolor='rgba(230, 230, 230, 0.8)'
-        ),
-        plot_bgcolor='white',
-        legend=dict(
-            bordercolor='rgba(0,0,0,0.1)',
-            borderwidth=1,
-            orientation="h",
-            yanchor="bottom",
-            y=1.02,
-            xanchor="right",
-            x=1
-        ),
-        hovermode='closest',
-        height=500,
-        margin=dict(l=10, r=10, t=80, b=50)
-    )
-    return fig
-def generate_emotion_summary(all_emotions):
     """
-    Produce an improved textual summary of the overall emotion distribution.
     """
     if not all_emotions:
         return "No emotional content detected."
     emotion_counts = {}
-    emotion_confidence = {}
     total_chunks = len(all_emotions)
-    for chunk_emotions in all_emotions:
-        top_emotion = max(chunk_emotions, key=lambda x: x['score'])
-        label = top_emotion["label"]
-        confidence = top_emotion["score"]
-        emotion_counts[label] = emotion_counts.get(label, 0) + 1
-        emotion_confidence[label] = emotion_confidence.get(label, 0) + confidence
-    # Calculate average confidence for each emotion
-    for emotion in emotion_confidence:
-        if emotion_counts[emotion] > 0:
-            emotion_confidence[emotion] /= emotion_counts[emotion]
-    # Dominant emotion (highest percentage)
-    dominant_emotion = max(emotion_counts, key=emotion_counts.get)
-    dominant_pct = (emotion_counts[dominant_emotion] / total_chunks) * 100
-    # Most confident emotion (might differ from dominant)
-    most_confident = max(emotion_confidence, key=emotion_confidence.get)
-    # Tone grouping analysis
-    tone_group_counts = {group: 0 for group in TONE_MAPPING}
-    for emotion, count in emotion_counts.items():
-        for tone_group, emotions in TONE_MAPPING.items():
-            if emotion in emotions:
-                tone_group_counts[tone_group] += count
-    dominant_tone = max(tone_group_counts, key=tone_group_counts.get)
-    dominant_tone_pct = (tone_group_counts[dominant_tone] / total_chunks) * 100
-    # Build summary with markdown formatting
     summary = f"### Voice Emotion Analysis Summary\n\n"
-    summary += f"**Dominant emotion:** {dominant_emotion.capitalize()} ({dominant_pct:.1f}%)\n\n"
-    if dominant_emotion != most_confident and emotion_confidence[most_confident] > 0.7:
-        summary += f"**Most confident detection:** {most_confident.capitalize()} "
-        summary += f"(avg. confidence: {emotion_confidence[most_confident]:.2f})\n\n"
-    summary += f"**Overall tone:** {dominant_tone.capitalize()} ({dominant_tone_pct:.1f}%)\n\n"
     summary += f"**Description:** {EMOTION_DESCRIPTIONS.get(dominant_emotion, '')}\n\n"
-    # Show emotion distribution as sorted list
     summary += "**Emotion distribution:**\n"
-    for emotion, count in sorted(emotion_counts.items(), key=lambda x: x[1], reverse=True):
-        percentage = (count / total_chunks) * 100
-        avg_conf = emotion_confidence[emotion]
-        summary += f"- {emotion.capitalize()}: {percentage:.1f}% (confidence: {avg_conf:.2f})\n"
-    # Add interpretation based on dominant emotion
-    summary += f"\n**Interpretation:**\n"
-    if dominant_emotion == "happy":
-        summary += "The voice conveys primarily positive emotions, suggesting enthusiasm, satisfaction, or joy."
-    elif dominant_emotion == "neutral":
-        summary += "The voice maintains an even emotional tone, suggesting composure or professional delivery."
-    elif dominant_emotion == "sad":
-        summary += "The voice conveys melancholy or disappointment, potentially indicating concern or distress."
-    elif dominant_emotion == "angry":
-        summary += "The voice shows frustration or assertiveness, suggesting strong conviction or displeasure."
-    elif dominant_emotion == "fear":
-        summary += "The voice reveals anxiety or nervousness, suggesting uncertainty or concern."
-    elif dominant_emotion == "disgust":
-        summary += "The voice expresses disapproval or aversion, suggesting rejection of discussed concepts."
-    elif dominant_emotion == "surprise":
-        summary += "The voice shows unexpected reactions, suggesting discovery of new information or astonishment."
     return summary
-def build_detailed_results(all_emotions, time_points):
-    """
-    Return a list of dictionaries containing chunk start-end, top emotion, confidence, description.
-    Suitable for Gradio DataFrame display.
-    """
-    results_list = []
-    for (emotions, (start_time, end_time)) in zip(all_emotions, time_points):
-        top_emotion = max(emotions, key=lambda x: x['score'])
-        label = top_emotion["label"]
-        # Find second highest emotion if available
-        if len(emotions) > 1:
-            sorted_emotions = sorted(emotions, key=lambda x: x['score'], reverse=True)
-            second_emotion = sorted_emotions[1]["label"].capitalize()
-            second_score = sorted_emotions[1]["score"]
-            secondary = f" ({second_emotion}: {second_score:.2f})"
-        else:
-            secondary = ""
-        results_list.append({
-            "Time Range": f"{start_time:.1f}s - {end_time:.1f}s",
-            "Primary Emotion": label.capitalize(),
-            "Confidence": f"{top_emotion['score']:.2f}{secondary}",
-            "Description": EMOTION_DESCRIPTIONS.get(label, "")
-        })
-    return results_list
 def process_audio(audio_file, progress=gr.Progress()):
-    """
-    Main handler for Gradio:
-      1) Emotion analysis (returns Plotly figure).
-      2) Tone analysis (returns descriptive text).
-    """
-    if not audio_file:
-        return None, None, "No audio file provided.", None, "No tone analysis."
-    # 1) Analyze emotions
-    fig, summary_text, detailed_results = analyze_audio_emotions(audio_file, progress)
-    if not fig:  # Error or missing
-        return None, None, "Failed to analyze audio emotions.", None, "Tone analysis unavailable."
-    # 2) Generate alternative chart
-    # Extract the necessary data from detailed_results to create time_points
-    time_points = []
-    for result in detailed_results:
-        time_range = result["Time Range"]
-        start_time = float(time_range.split("s")[0])
-        end_time = float(time_range.split(" - ")[1].split("s")[0])
-        time_points.append((start_time, end_time))
-    # Extract emotion data from detailed_results
-    all_emotions = []
-    for result in detailed_results:
-        # Parse the primary emotion and confidence
-        primary_emotion = result["Primary Emotion"].lower()
-        confidence_str = result["Confidence"].split("(")[0].strip()
-        primary_confidence = float(confidence_str)
-        # Create a list of emotion dictionaries for this time point
-        emotions_at_time = [{"label": primary_emotion, "score": primary_confidence}]
-        # Check if there's a secondary emotion
-        if "(" in result["Confidence"]:
-            secondary_part = result["Confidence"].split("(")[1].split(")")[0]
-            secondary_emotion = secondary_part.split(":")[0].strip().lower()
-            secondary_confidence = float(secondary_part.split(":")[1].strip())
-            emotions_at_time.append({"label": secondary_emotion, "score": secondary_confidence})
-        # Add remaining emotions with zero confidence
-        for emotion in EMOTION_DESCRIPTIONS.keys():
-            if emotion not in [e["label"] for e in emotions_at_time]:
-                emotions_at_time.append({"label": emotion, "score": 0.0})
-        all_emotions.append(emotions_at_time)
-    # Now we can generate the alternative chart
-    alt_fig = generate_alternative_chart(all_emotions, time_points)
-    # 3) Analyze tone
-    tone_analysis = analyze_voice_tone(audio_file)
-    return fig, alt_fig, summary_text, detailed_results, tone_analysis
-# Create Gradio interface with improved UI/UX
-with gr.Blocks(title="Voice Emotion & Tone Analysis System", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # 🎙️ Voice Emotion & Tone Analysis System
-    This app provides professional analysis of:
-    - **Emotions** in your voice (Anger, Disgust, Fear, Happy, Neutral, Sad, Surprise)
-    - **Tone characteristics** (based on pitch, energy, and speech patterns)
-    The interactive timeline shows emotion confidence scores throughout your audio.
     """)
     with gr.Tabs():
-        # Tab 1: Upload
         with gr.TabItem("Upload Audio"):
             with gr.Row():
                 with gr.Column(scale=1):
                     audio_input = gr.Audio(
                         label="Upload Audio File",
                         type="filepath",
-                        sources=["upload"],
-                        elem_id="audio_upload"
                     )
-                    process_btn = gr.Button("Analyze Voice", variant="primary")
-                    gr.Markdown("""
-                    **Supports:** MP3, WAV, M4A, and most audio formats
-                    **For best results:** Use a clear voice recording with minimal background noise
-                    """)
                 with gr.Column(scale=2):
-                    with gr.Tabs():
-                        with gr.TabItem("Line Chart"):
-                            emotion_timeline = gr.Plot(label="Emotion Timeline",
-                                                      elem_id="emotion_plot",
-                                                      container=True)
-                        with gr.TabItem("Area Chart"):
-                            emotion_area_chart = gr.Plot(label="Emotion Distribution",
-                                                        elem_id="emotion_area_plot",
-                                                        container=True)
             with gr.Row():
-                with gr.Column():
-                    emotion_summary = gr.Markdown(label="Emotion Summary")
-                with gr.Column():
-                    tone_analysis_output = gr.Markdown(label="Tone Analysis")
             with gr.Row():
                 emotion_results = gr.DataFrame(
-                    headers=["Time Range", "Primary Emotion", "Confidence", "Description"],
                     label="Detailed Emotion Analysis"
                 )
             process_btn.click(
                 fn=process_audio,
                 inputs=[audio_input],
-                outputs=[emotion_timeline, emotion_area_chart, emotion_summary, emotion_results, tone_analysis_output]
             )
-        # Tab 2: Record
         with gr.TabItem("Record Voice"):
             with gr.Row():
                 with gr.Column(scale=1):
                     record_input = gr.Audio(
                         label="Record Your Voice",
                         sources=["microphone"],
-                        type="filepath",
-                        elem_id="record_audio"
                     )
-                    analyze_btn = gr.Button("Analyze Recording", variant="primary")
-                    gr.Markdown("""
-                    **Tips:**
-                    - Speak clearly and at a normal pace
-                    - Record at least 10-15 seconds for more accurate analysis
-                    - Try different emotional tones to see how they're detected
-                    """)
                 with gr.Column(scale=2):
-                    with gr.Tabs():
-                        with gr.TabItem("Line Chart"):
-                            rec_emotion_timeline = gr.Plot(label="Emotion Timeline",
-                                                          elem_id="record_emotion_plot",
-                                                          container=True)
-                        with gr.TabItem("Area Chart"):
-                            rec_emotion_area_chart = gr.Plot(label="Emotion Distribution",
-                                                           elem_id="record_emotion_area_plot",
-                                                           container=True)
             with gr.Row():
-                with gr.Column():
-                    rec_emotion_summary = gr.Markdown(label="Emotion Summary")
-                with gr.Column():
-                    rec_tone_analysis_output = gr.Markdown(label="Tone Analysis")
             with gr.Row():
                 rec_emotion_results = gr.DataFrame(
-                    headers=["Time Range", "Primary Emotion", "Confidence", "Description"],
                     label="Detailed Emotion Analysis"
                 )
             analyze_btn.click(
                 fn=process_audio,
                 inputs=[record_input],
-                outputs=[rec_emotion_timeline, rec_emotion_area_chart, rec_emotion_summary, rec_emotion_results, rec_tone_analysis_output]
             )
-        # Tab 3: About & Help
-        with gr.TabItem("About & Help"):
-            gr.Markdown("""
-            ## About This System
-            This voice emotion & tone analysis system uses state-of-the-art deep learning models to detect emotions and analyze vocal characteristics. The system is built on HuBERT (Hidden Unit BERT) architecture trained on speech emotion recognition tasks.
-            ### How It Works
-            1. **Audio Processing**: Your audio is processed in short segments (chunks) to capture emotion variations over time.
-            2. **Emotion Classification**: Each segment is analyzed by a neural network to detect emotional patterns.
-            3. **Tone Analysis**: Acoustic features like pitch, energy, and rhythm are analyzed to describe voice tone characteristics.
-            ### Emotion Categories
-            The system detects seven standard emotions:
-            - **Angry**: Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.
-            - **Disgust**: Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.
-            - **Fear**: Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.
-            - **Happy**: Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.
-            - **Neutral**: Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.
-            - **Sad**: Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.
-            - **Surprise**: Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic.
-            ### Tips for Best Results
-            - Use clear audio with minimal background noise
-            - Speak naturally at a comfortable volume
-            - Record at least 10-15 seconds of speech
-            - For tone analysis, longer recordings (30+ seconds) provide more accurate results
-            ### Privacy Notice
-            All audio processing happens on your device. No audio recordings or analysis results are stored or transmitted to external servers.
-            """)
     gr.Markdown("""
-    ---
-    ### System Information
-    - **Model**: HuBERT Large for Speech Emotion Recognition
-    - **Version**: 1.2.0
-    - **Libraries**: PyTorch, Transformers, Librosa, Plotly
-    This application demonstrates the use of AI for speech emotion recognition and acoustic analysis. For research and educational purposes only.
     """)
-# Check if model can load before launching interface
-print("Checking model availability...")
-load_success = load_emotion_model()
-if not load_success:
-    print("Warning: Emotion model failed to load. Application may have limited functionality.")
-# Launch the demo
 if __name__ == "__main__":
     demo.launch()

+# voice_emotion_classification.py
 import os
 import subprocess
 import sys
         print(f"Failed to install {package_spec}: {e}")
         raise
+# Required packages (you may add version pins if necessary)
 required_packages = {
     "gradio": None,
     "torch": None,
     "librosa": None,
     "scipy": None,
     "matplotlib": None,
+    "pydub": None
 }
 installed_packages = {pkg.key for pkg in pkg_resources.working_set}
     if package not in installed_packages:
         install_package(package, version)
+# Now import all necessary packages
 import gradio as gr
 import torch
 import torchaudio
 import librosa
+import matplotlib.pyplot as plt
+from matplotlib.colors import LinearSegmentedColormap
 from pydub import AudioSegment
 import scipy
 import io
 from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
+from pathlib import Path
+import matplotlib
+matplotlib.use('Agg')  # Use non-interactive backend
 # Define emotion labels, tone mapping, and descriptions
 EMOTION_DESCRIPTIONS = {
     "surprise": "Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic."
 }
# Group the canonical emotion labels under a coarse tone.
# Keys are tone names; each value lists the emotions counted toward that tone.
TONE_MAPPING = {
    "positive": ["happy", "surprise"],
    "neutral": ["neutral"],
    "negative": ["angry", "sad", "fear", "disgust"]
}
# Some Hugging Face models return short labels (e.g., "hap", "ang", etc.).
# This mapping translates them into the full canonical labels used as keys of
# EMOTION_DESCRIPTIONS. Lookups are done on the lower-cased, stripped model
# label; a label with no entry here is used unchanged.
MODEL_TO_EMOTION_MAP = {
    "hap": "happy",
    "ang": "angry",
    "sad": "sad",
    "dis": "disgust",
    "fea": "fear",
    "neu": "neutral",
    "sur": "surprise"
}
# Global variable for the emotion classifier.
# Starts as None; load_emotion_model() assigns the pipeline on first use.
audio_emotion_classifier = None
 def load_emotion_model():
+    """Load the emotion classification model once and cache it."""
     global audio_emotion_classifier
     if audio_emotion_classifier is None:
         try:
             print("Loading emotion classification model...")
+            # Using the Hugging Face pipeline with the new model that classifies speech emotion
             model_name = "superb/hubert-large-superb-er"
             audio_emotion_classifier = pipeline("audio-classification", model=model_name)
             print("Emotion classification model loaded successfully")
     return True
 def convert_audio_to_wav(audio_file):
+    """Convert the uploaded audio to WAV format."""
     try:
         audio = AudioSegment.from_file(audio_file)
         with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
         print(f"Error converting audio: {e}")
         return None
def analyze_audio_emotions(audio_file, progress=gr.Progress(), chunk_duration=5):
    """
    Analyze emotions in an audio file by processing it in fixed-length chunks.

    Args:
        audio_file: Path to the input audio. A path ending in ``.wav`` is read
            directly; anything else is first passed to convert_audio_to_wav().
        progress: Gradio progress tracker, updated once per chunk.
        chunk_duration: Length of each analysis window, in seconds.

    Returns:
        Always a 4-tuple ``(img_path, audio_path, summary, detailed_results)``.
        On any failure the tuple is ``(None, None, <error message>, None)`` so
        callers can unpack it unconditionally.
    """
    if not load_emotion_model():
        return None, None, "Failed to load emotion classification model. Please check console for details.", None

    # If the file is already a WAV, use it directly; else convert it.
    if audio_file.endswith('.wav'):
        audio_path = audio_file
    else:
        audio_path = convert_audio_to_wav(audio_file)
        if not audio_path:
            return None, None, "Failed to process audio file. Unsupported format or corrupted file.", None

    try:
        # Load the audio using librosa, resampled to 16 kHz.
        audio_data, sample_rate = librosa.load(audio_path, sr=16000)
        duration = len(audio_data) / sample_rate

        # Process in chunks for long files.
        chunk_samples = int(chunk_duration * sample_rate)
        num_chunks = max(1, int(np.ceil(len(audio_data) / chunk_samples)))
        all_emotions = []
        time_points = []

        for i in range(num_chunks):
            progress((i + 1) / num_chunks, "Analyzing audio emotions...")
            start_idx = i * chunk_samples
            end_idx = min(start_idx + chunk_samples, len(audio_data))
            chunk = audio_data[start_idx:end_idx]

            # Skip too-short chunks (<0.5 seconds).
            if len(chunk) < 0.5 * sample_rate:
                continue

            # Reserve a temporary file name for this chunk; the handle is
            # closed before the file is written by name.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_chunk:
                chunk_path = temp_chunk.name
            try:
                scipy.io.wavfile.write(chunk_path, sample_rate, (chunk * 32767).astype(np.int16))
                # Get emotion classification results on this chunk.
                results = audio_emotion_classifier(chunk_path)
            finally:
                # Remove the temporary file even if writing/classifying failed.
                os.unlink(chunk_path)

            all_emotions.append(results)
            time_points.append((start_idx / sample_rate, end_idx / sample_rate))

        # Generate visualization and summary.
        fig, detailed_results = generate_emotion_timeline(all_emotions, time_points, duration)
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img:
            img_path = temp_img.name
        try:
            fig.savefig(img_path, dpi=100, bbox_inches='tight')
        finally:
            # Always release the figure so repeated requests do not pile up
            # open matplotlib figures.
            plt.close(fig)

        summary = generate_emotion_summary(all_emotions, time_points)
        return img_path, audio_path, summary, detailed_results
    except Exception as e:
        print(f"Error analyzing audio: {e}")
        import traceback
        traceback.print_exc()
        return None, None, f"Error analyzing audio: {str(e)}", None
def _top_canonical_emotion(emotions):
    """Return ``(canonical_label, top_item)`` for one chunk's classifier output."""
    top_item = max(emotions, key=lambda item: item['score'])
    raw_label = top_item['label'].lower().strip()  # e.g., "hap", "ang", ...
    # Labels without a mapping are kept as returned by the model.
    return MODEL_TO_EMOTION_MAP.get(raw_label, raw_label), top_item


def _annotate_bars(ax, bars, skip_empty=False):
    """Write each bar's height as a percentage label just above the bar."""
    for bar in bars:
        height = bar.get_height()
        if skip_empty and not height > 0:
            continue
        ax.annotate(f'{height:.1f}%',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')


def generate_emotion_timeline(all_emotions, time_points, duration):
    """
    Generate a bar chart visualization of emotion percentages with tone analysis.

    Args:
        all_emotions: One entry per analyzed chunk; each entry is a list of
            ``{"label": ..., "score": ...}`` items. Empty entries are ignored
            when counting but still count toward the chunk total.
        time_points: ``(start_seconds, end_seconds)`` pairs aligned with
            ``all_emotions``.
        duration: Accepted for interface compatibility; not used here.

    Returns:
        ``(fig, detailed_results)``: the matplotlib figure (emotion bars on top,
        tone bars below) and a list of per-chunk dicts with the keys
        'Time Range', 'Emotion', 'Tone', 'Confidence' and 'Description'.
    """
    # Count how often each canonical label is the top-scoring emotion.
    emotion_counts = {}
    for emotions in all_emotions:
        if not emotions:
            continue
        canonical_label, _ = _top_canonical_emotion(emotions)
        emotion_counts[canonical_label] = emotion_counts.get(canonical_label, 0) + 1

    total_chunks = len(all_emotions)
    emotion_percentages = {
        e: (count / total_chunks * 100) for e, count in emotion_counts.items()
    }
    # Emotions that never appeared still get a (zero-height) bar.
    for label in EMOTION_DESCRIPTIONS:
        emotion_percentages.setdefault(label, 0.0)

    # Sort emotions by percentage, highest first.
    sorted_emotions = sorted(emotion_percentages.items(), key=lambda x: x[1], reverse=True)

    # Two stacked subplots: emotions (3/4 of the height) and tone (1/4).
    # height_ratios goes through gridspec_kw because the direct keyword on
    # plt.subplots only exists in Matplotlib >= 3.6.
    fig, (ax1, ax2) = plt.subplots(
        2, 1, figsize=(12, 10),
        gridspec_kw={'height_ratios': [3, 1], 'hspace': 0.3},
    )

    # Capitalize each label for a nice display.
    emotions = [item[0].capitalize() for item in sorted_emotions]
    percentages = [item[1] for item in sorted_emotions]

    # Colors are assigned by rank (enough for 7 bars); any extra bars fall
    # back to a neutral gray.
    colors = ['red', 'brown', 'purple', 'green', 'gray', 'blue', 'orange']
    extra = max(0, len(emotions) - len(colors))
    bar_colors = (colors + ['#666666'] * extra)[:len(emotions)]

    # Plot emotion bars with a percentage label on every bar.
    bars = ax1.bar(emotions, percentages, color=bar_colors)
    _annotate_bars(ax1, bars)
    ax1.set_ylim(0, 100)  # Fixed 100% scale
    ax1.set_ylabel('Percentage (%)')
    ax1.set_title('Emotion Distribution')
    ax1.grid(axis='y', linestyle='--', alpha=0.7)

    # Sum emotion percentages into their tone buckets; labels that belong to
    # no tone in TONE_MAPPING are not counted.
    tone_percentages = {"positive": 0, "neutral": 0, "negative": 0}
    for emotion_label, percentage in emotion_percentages.items():
        for tone, emotions_list in TONE_MAPPING.items():
            if emotion_label in emotions_list:
                tone_percentages[tone] += percentage

    # Plot tone bars; only visible bars get a label.
    tones = list(tone_percentages.keys())
    tone_values = list(tone_percentages.values())
    tone_colors = {'positive': 'green', 'neutral': 'gray', 'negative': 'red'}
    tone_bars = ax2.bar(tones, tone_values, color=[tone_colors[t] for t in tones])
    _annotate_bars(ax2, tone_bars, skip_empty=True)
    ax2.set_ylim(0, 100)
    ax2.set_ylabel('Percentage (%)')
    ax2.set_title('Tone Analysis')
    ax2.grid(axis='y', linestyle='--', alpha=0.7)

    # Lay out this figure explicitly rather than whatever pyplot considers current.
    fig.tight_layout()

    # Generate a more detailed time-segmented result.
    detailed_results = []
    for emotions, (start_time, end_time) in zip(all_emotions, time_points):
        if not emotions:
            continue
        canonical_label, top_emotion = _top_canonical_emotion(emotions)
        # Determine the tone from the canonical label, not the raw model label.
        tone = next((t for t, e_list in TONE_MAPPING.items() if canonical_label in e_list), "unknown")
        detailed_results.append({
            'Time Range': f"{start_time:.1f}s - {end_time:.1f}s",
            'Emotion': canonical_label,
            'Tone': tone.capitalize(),
            'Confidence': f"{top_emotion['score']:.2f}",
            'Description': EMOTION_DESCRIPTIONS.get(canonical_label, "")
        })

    return fig, detailed_results
def generate_emotion_summary(all_emotions, time_points):
    """
    Build the Markdown summary shown next to the chart.

    For every non-empty chunk result the top-scoring label is normalized to a
    canonical emotion and tallied; shares are expressed as a percentage of the
    total number of chunk results. Returns a fixed "no content" message when
    there is nothing to tally. ``time_points`` is accepted but not used.
    """
    if not all_emotions:
        return "No emotional content detected."

    chunk_total = len(all_emotions)
    tallies = {}
    for chunk_scores in all_emotions:
        if chunk_scores:
            best = max(chunk_scores, key=lambda entry: entry['score'])
            short_name = best['label'].lower().strip()
            full_name = MODEL_TO_EMOTION_MAP.get(short_name, short_name)
            tallies[full_name] = tallies.get(full_name, 0) + 1

    if not tallies:
        return "No emotional content detected."

    shares = {name: (hits / chunk_total * 100) for name, hits in tallies.items()}
    # Stable descending sort: among ties the first-seen emotion stays first,
    # so the head of the ranking is the dominant emotion.
    ranked = sorted(shares.items(), key=lambda pair: pair[1], reverse=True)
    dominant_emotion = ranked[0][0]

    pieces = [
        "### Voice Emotion Analysis Summary\n\n",
        f"**Dominant emotion:** {dominant_emotion.capitalize()} ({shares[dominant_emotion]:.1f}%)\n\n",
        f"**Description:** {EMOTION_DESCRIPTIONS.get(dominant_emotion, '')}\n\n",
        "**Emotion distribution:**\n",
    ]
    pieces.extend(f"- {name.capitalize()}: {share:.1f}%\n" for name, share in ranked)
    pieces.append(f"\n**Interpretation:** The voice predominantly expresses {dominant_emotion} emotion")
    return "".join(pieces)
def record_audio(audio):
    """Save recorded audio bytes to a new temporary ``.wav`` file.

    Args:
        audio: Bytes-like object holding the recording; written unchanged.

    Returns:
        The path of the file that was written, or None if saving failed.
        The file is created with ``delete=False``, so the caller owns it.
    """
    audio_path = None
    try:
        # Write through the handle we already hold instead of reopening the
        # same file by name while it is still open.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
            audio_path = temp_file.name
            temp_file.write(audio)
        return audio_path
    except Exception as e:
        print(f"Error saving recorded audio: {e}")
        if audio_path is not None:
            # Best-effort cleanup so a failed write does not leave an orphan file.
            try:
                os.unlink(audio_path)
            except OSError:
                pass
        return None
 def process_audio(audio_file, progress=gr.Progress()):
+    """Process the audio file and analyze emotions."""
+    if audio_file is None:
+        return None, None, "No audio file provided.", None
+    img_path, processed_audio, summary, results = analyze_audio_emotions(audio_file, progress)
+    if img_path is None:
+        return None, None, "Failed to analyze audio emotions.", None
+    return img_path, processed_audio, summary, results
# Create Gradio interface.
# Everything below is declared inside one Blocks context bound to `demo`;
# components appear in the page in the order they are created here.
with gr.Blocks(title="Voice Emotion Analysis System") as demo:
    # Page header / introduction.
    gr.Markdown("""
    # 🎙️ Voice Emotion Analysis System
    This app analyzes the emotional content of voice recordings.
    It detects emotions including:
    * 😡 **Anger**
    * 🤢 **Disgust**
    * 😨 **Fear**
    * 😊 **Happiness**
    * 😐 **Neutral**
    * 😢 **Sadness**
    * 😲 **Surprise**
    And provides a detailed analysis and timeline.
    """)
    with gr.Tabs():
        # Tab 1: analyze an uploaded file.
        with gr.TabItem("Upload Audio"):
            with gr.Row():
                with gr.Column(scale=1):
                    # type="filepath" hands process_audio a path string.
                    audio_input = gr.Audio(
                        label="Upload Audio File",
                        type="filepath",
                        sources=["upload"]
                    )
                    process_btn = gr.Button("Analyze Voice Emotions")
                with gr.Column(scale=2):
                    emotion_timeline = gr.Image(label="Emotion Timeline", show_label=True)
            with gr.Row():
                audio_playback = gr.Audio(label="Processed Audio", show_label=True)
                emotion_summary = gr.Markdown(label="Emotion Summary")
            with gr.Row():
                emotion_results = gr.DataFrame(
                    headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
                    label="Detailed Emotion Analysis"
                )
            # The outputs list lines up with the 4 values process_audio returns.
            process_btn.click(
                fn=process_audio,
                inputs=[audio_input],
                outputs=[emotion_timeline, audio_playback, emotion_summary, emotion_results]
            )
        # Tab 2: same analysis, fed from a microphone recording.
        with gr.TabItem("Record Voice"):
            with gr.Row():
                with gr.Column(scale=1):
                    record_input = gr.Audio(
                        label="Record Your Voice",
                        sources=["microphone"],
                        type="filepath"
                    )
                    analyze_btn = gr.Button("Analyze Recording")
                with gr.Column(scale=2):
                    rec_emotion_timeline = gr.Image(label="Emotion Timeline", show_label=True)
            with gr.Row():
                rec_audio_playback = gr.Audio(label="Processed Audio", show_label=True)
                rec_emotion_summary = gr.Markdown(label="Emotion Summary")
            with gr.Row():
                rec_emotion_results = gr.DataFrame(
                    headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
                    label="Detailed Emotion Analysis"
                )
            # Reuses the same handler as the upload tab.
            analyze_btn.click(
                fn=process_audio,
                inputs=[record_input],
                outputs=[rec_emotion_timeline, rec_audio_playback, rec_emotion_summary, rec_emotion_results]
            )
    # Footer with usage instructions, shown below the tabs.
    gr.Markdown("""
    ### How to Use
    1. **Upload Audio Tab:** Upload an audio file and click "Analyze Voice Emotions".
    2. **Record Voice Tab:** Record your voice and click "Analyze Recording".
    **Tips:**
    - Use clear recordings with minimal background noise.
    - Longer recordings yield more consistent results.
    """)
def initialize_app():
    """Load the emotion model once at startup and print whether it succeeded."""
    print("Initializing voice emotion analysis app...")
    model_ready = load_emotion_model()
    status = "Emotion model loaded successfully!" if model_ready else "Failed to load emotion model."
    print(status)


if __name__ == "__main__":
    initialize_app()
    demo.launch()