import gradio as gr
import torch
import librosa
import numpy as np
import os
import traceback


# Define the PyTorch model class (must match the architecture used during conversion)
class EmotionClassifier(torch.nn.Module):
    def __init__(self, input_shape, num_classes):
        super().__init__()
        # Adjust this architecture to match your converted model
        self.flatten = torch.nn.Flatten()
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(input_shape, 128),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(64, num_classes),
        )

    def forward(self, x):
        x = self.flatten(x)
        return self.layers(x)


# Create the model instance
input_shape = 13 * 128  # n_mfcc * max_length
num_classes = 7  # number of emotions
model = EmotionClassifier(input_shape, num_classes)

# Load the saved model weights
model_path = os.path.join(os.path.dirname(__file__), "emotion_model.pt")
model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu")))
model.eval()

# Emotion labels (order must match the labels used during training)
emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]


def extract_features(audio_path, sample_rate=16000, n_mfcc=13, max_length=128):
    """Extract MFCC features from an audio file."""
    try:
        audio, sr = librosa.load(audio_path, sr=sample_rate)
        # Extract MFCCs
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
        # Pad or truncate to a fixed length
        if mfccs.shape[1] < max_length:
            pad_width = max_length - mfccs.shape[1]
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode="constant")
        else:
            mfccs = mfccs[:, :max_length]
        return mfccs
    except Exception as e:
        print(f"Error in feature extraction: {e}")
        return None


def predict_emotion(audio):
    """Predict emotion probabilities from audio input."""
    try:
        if isinstance(audio, str):
            # File path (what gr.Audio delivers with type="filepath")
            features = extract_features(audio)
        else:
            # Raw microphone input: Gradio's numpy audio arrives as (sample_rate, data)
            if isinstance(audio, tuple):
                sample_rate, audio_array = audio
            else:
                audio_array = audio
                sample_rate = 16000

            audio_array = np.asarray(audio_array)
            if audio_array.dtype.kind == "i":
                # Gradio delivers integer PCM; scale to [-1, 1] floats for librosa
                audio_array = audio_array.astype(np.float32) / np.iinfo(audio_array.dtype).max
            else:
                audio_array = audio_array.astype(np.float32)

            # Convert to mono if stereo
            if audio_array.ndim > 1:
                audio_array = np.mean(audio_array, axis=1)

            # Extract MFCCs directly from the array
            mfccs = librosa.feature.mfcc(y=audio_array, sr=sample_rate, n_mfcc=13)

            # Pad or truncate to a fixed length
            max_length = 128
            if mfccs.shape[1] < max_length:
                pad_width = max_length - mfccs.shape[1]
                mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode="constant")
            else:
                mfccs = mfccs[:, :max_length]
            features = mfccs

        if features is None:
            return {emotion: 0.0 for emotion in emotions}

        # Flatten the features to the model's expected (batch, n_mfcc * max_length) shape
        features_flat = features.reshape(1, -1)
        features_tensor = torch.tensor(features_flat, dtype=torch.float32)

        # Get predictions
        with torch.no_grad():
            outputs = model(features_tensor)
            probabilities = torch.nn.functional.softmax(outputs, dim=1)

        # Map each emotion label to its predicted probability
        return {emotion: probabilities[0][i].item() for i, emotion in enumerate(emotions)}
    except Exception as e:
        print(f"Error in prediction: {e}")
        traceback.print_exc()
        # Fall back to a uniform distribution so the UI still renders
        return {emotion: 1 / len(emotions) for emotion in emotions}


# Create the Gradio interface
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=gr.Label(num_top_classes=7),
    title="Speech Emotion Recognition",
    description=(
        "Upload an audio file or record your voice to identify the emotion. "
        "This model can detect neutral, happy, sad, angry, fearful, disgust, "
        "and surprised emotions."
    ),
)

demo.launch()
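
# Optional local smoke test (a hedged sketch, not part of the app itself):
# "sample.wav" is a hypothetical test clip placed next to this script, and
# demo.launch() above blocks, so comment it out before running this check.
# It should print a dict mapping each emotion label to a probability.
#
#     from pprint import pprint
#     pprint(predict_emotion("sample.wav"))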