Spaces:
No application file
No application file
| import sounddevice as sd | |
| import numpy as np | |
| import librosa | |
| import torch | |
| from transformers import pipeline | |
class VoiceHandler:
    """Record microphone audio and classify the speaker's emotion.

    Audio is captured with sounddevice, peak-normalized with librosa, and
    classified by a wav2vec2-based Hugging Face audio-classification pipeline.
    """

    def __init__(self, sample_rate=16000,
                 model_name="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"):
        """
        Args:
            sample_rate: Capture/inference sample rate in Hz. The default
                16 kHz matches the wav2vec2 model's expected input rate.
            model_name: Hugging Face model id for the emotion classifier.
        """
        self.sample_rate = sample_rate
        self.emotion_classifier = pipeline("audio-classification",
                                           model=model_name)

    def record_audio(self, duration=5):
        """Record `duration` seconds of mono audio from the default input.

        Args:
            duration: Recording length in seconds.

        Returns:
            np.ndarray of shape (n_samples, 1), float32, values in [-1, 1].
        """
        frames = int(duration * self.sample_rate)
        # Request float32 explicitly so the dtype is consistent with what
        # the downstream feature extractor expects, regardless of the
        # sounddevice default configuration.
        recording = sd.rec(frames,
                           samplerate=self.sample_rate,
                           channels=1,
                           dtype="float32")
        sd.wait()  # block until the capture completes
        return recording

    def process_audio(self, audio_data):
        """Normalize audio and detect the dominant emotion.

        Args:
            audio_data: 1-D or (n_samples, n_channels) array of samples.

        Returns:
            Tuple of (normalized mono float32 audio, top emotion label).

        Raises:
            ValueError: If `audio_data` contains no samples.
        """
        audio_data = np.asarray(audio_data, dtype=np.float32)
        if audio_data.size == 0:
            raise ValueError("audio_data is empty")
        # Down-mix to mono; the classifier expects a single channel.
        if audio_data.ndim > 1:
            audio_data = np.mean(audio_data, axis=1)
        audio_data = librosa.util.normalize(audio_data)
        # Pass the sampling rate explicitly so the pipeline can resample if
        # needed, instead of silently assuming the array is already at the
        # model's expected rate.
        emotion = self.emotion_classifier(
            {"raw": audio_data, "sampling_rate": self.sample_rate})
        return audio_data, emotion[0]['label']

    def enhance_audio(self, audio_data):
        """Enhance audio quality via pre-emphasis and peak normalization.

        Args:
            audio_data: 1-D array of audio samples.

        Returns:
            Enhanced, peak-normalized audio array.
        """
        # Pre-emphasis boosts high frequencies, improving speech clarity.
        y = librosa.effects.preemphasis(audio_data)
        return librosa.util.normalize(y)