import torch
import librosa
import numpy as np
import gradio as gr
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor


# Spanish speech-emotion-recognition model; the feature extractor is loaded
# from the base SUPERB emotion-recognition checkpoint.
model = Wav2Vec2ForSequenceClassification.from_pretrained("pollitoconpapass/superb-ser-finetuned-spanish-v5")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-er")


# Map the model's generic output labels to Spanish emotion names.
EMOTIONS_DICT = {
    "LABEL_0": "miedo",
    "LABEL_1": "triste",
    "LABEL_2": "neutral",
    "LABEL_3": "enojo",
    "LABEL_4": "disgusto",
    "LABEL_5": "feliz",
}
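# NOTE: this table assumes the checkpoint's id2label entries are the generic
# "LABEL_i" names; predict_emotion() falls back to the raw label in case the
# config ships human-readable names instead.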


def predict_emotion(audio):
    try:
        if audio is None:
            return "Por favor, graba o sube un audio", {}
        # Gradio's type="numpy" audio arrives as a (sample_rate, samples) tuple.
        if isinstance(audio, tuple):
            sr, speech = audio

            # Scale integer PCM to [-1, 1] according to its dtype; the dtype
            # must be inspected *before* casting to float32, otherwise the
            # integer check can never match and scaling is silently skipped.
            if speech.dtype == np.int16:
                speech = speech.astype(np.float32) / 32768.0
            elif speech.dtype == np.int32:
                speech = speech.astype(np.float32) / 2147483648.0
            else:
                speech = speech.astype(np.float32)

            # Downmix stereo (samples, channels) to mono.
            if speech.ndim > 1:
                speech = np.mean(speech, axis=1)

        else:
            # Filepath input: librosa handles resampling and downmixing.
            speech, sr = librosa.load(audio, sr=16000, mono=True)

        # Resample microphone/upload audio to the 16 kHz the model expects.
        if sr != 16000:
            speech = librosa.resample(speech, orig_sr=sr, target_sr=16000)
            sr = 16000

        # Peak-normalize anything still outside [-1, 1].
        if np.abs(speech).max() > 1.0:
            speech = speech / np.abs(speech).max()

        # Build model inputs and run a forward pass without gradients.
        inputs = feature_extractor(speech, sampling_rate=sr, padding=True, return_tensors="pt")

        with torch.no_grad():
            logits = model(**inputs).logits

        # Convert logits to probabilities and pick the top class.
        probs = torch.nn.functional.softmax(logits, dim=-1)[0]
        predicted_id = torch.argmax(probs).item()
        label = model.config.id2label[predicted_id]
        emotion = EMOTIONS_DICT.get(label, label)  # fall back to the raw label

        # Per-emotion probabilities for the gr.Label component.
        confidence_dict = {}
        for i, p in enumerate(probs):
            raw_label = model.config.id2label[i]
            confidence_dict[EMOTIONS_DICT.get(raw_label, raw_label)] = float(p)

        return emotion.upper(), confidence_dict

    except Exception as e:
        return f"Error: {str(e)}", {}


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎙️ Reconocimiento de Emociones en Voz

        Sube un archivo de audio o graba tu voz para detectar la emoción expresada.

        **Emociones detectadas:** Miedo, Triste, Neutral, Enojo, Disgusto, Feliz
        """
    )

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="numpy",
                label="Subir o Grabar Audio"
            )
            predict_btn = gr.Button("🔍 Analizar Emoción", variant="primary")

        with gr.Column():
            emotion_output = gr.Textbox(
                label="Emoción Detectada",
                placeholder="La emoción aparecerá aquí..."
            )
            confidence_output = gr.Label(
                label="Niveles de Confianza",
                num_top_classes=6
            )

    gr.Markdown(
        """
        ### 📝 Instrucciones:
        - **Subir:** Haz clic en el área de audio y selecciona un archivo
        - **Grabar:** Haz clic en el micrófono y permite el acceso para grabar
        - El modelo funciona mejor con audio claro y expresivo
        - **Nota:** Después de grabar, espera unos segundos para el procesamiento
        """
    )

    predict_btn.click(
        fn=predict_emotion,
        inputs=audio_input,
        outputs=[emotion_output, confidence_output]
    )


if __name__ == "__main__":
    demo.launch()
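
# demo.launch() serves the app locally on Gradio's default address
# (http://127.0.0.1:7860); pass share=True for a temporary public link.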