Spaces:
Running
Running
| import streamlit as st | |
| import librosa | |
| import librosa.display | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import soundfile as sf | |
| import wave | |
| import json | |
| from vosk import Model, KaldiRecognizer | |
| from transformers import pipeline | |
| import os | |
| from pydub import AudioSegment | |
| import noisereduce as nr | |
| import streamlit as st | |
| import subprocess | |
# Make sure librosa can be imported, installing it on the fly when it is missing.
# NOTE(review): the leading "β" in the messages below looks like a mis-decoded
# emoji — confirm against the original file before changing it.
try:
    import librosa
except ImportError:
    import sys

    st.write("β Librosa is missing! Installing now...")
    # Invoke pip through the running interpreter so the package is installed
    # into the environment this app actually uses; check=True surfaces a
    # failed install instead of letting the re-import below fail confusingly.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "librosa"],
        check=True,
    )
    import librosa
    st.write("β Librosa installed successfully!")
else:
    st.write("β Librosa is installed successfully!")
# Load Vosk model
# MODEL_PATH is resolved relative to the working directory. When it does not
# exist the app reports the problem and halts this script run, so the Model
# constructor below is only reached with an existing path.
MODEL_PATH = "vosk-model-small-en-us-0.15"

model_missing = not os.path.exists(MODEL_PATH)
if model_missing:
    st.error("Vosk model not found! Please download and extract it.")
    st.stop()

model = Model(MODEL_PATH)
# Streamlit UI: page title, a short description, and the audio upload widget.
# NOTE(review): the leading characters of the title look like a mis-decoded
# emoji — confirm against the original file before changing them.
st.title("ποΈ Speech Detection System using Mozilla Common Voice")
st.write(
    "Upload an audio file and get real-time speech-to-text, "
    "noise filtering, and emotion analysis."
)

# Only MP3 and WAV uploads are offered by the widget.
uploaded_file = st.file_uploader(
    "Upload an MP3/WAV file",
    type=["mp3", "wav"],
)
if uploaded_file:
    # Persist the upload to disk so the path-based calls below can read it.
    os.makedirs("temp", exist_ok=True)
    # The file name is supplied by the client: keep only its last path
    # component so the upload cannot be written outside of temp/.
    safe_name = os.path.basename(uploaded_file.name)
    file_path = f"temp/{safe_name}"
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Convert MP3 to WAV if needed. The extension is split off (rather than
    # str.replace'd) and compared case-insensitively, so names such as
    # "Talk.MP3" or "a.mp3.mp3" are handled correctly.
    stem, extension = os.path.splitext(file_path)
    if extension.lower() == ".mp3":
        wav_path = f"{stem}.wav"
        audio = AudioSegment.from_mp3(file_path)
        audio.export(wav_path, format="wav")
        file_path = wav_path

    # Load audio (librosa is asked for a 16 kHz sample rate)
    y, sr = librosa.load(file_path, sr=16000)

    # Display waveform
    fig, ax = plt.subplots(figsize=(10, 4))
    librosa.display.waveshow(y, sr=sr, ax=ax)
    st.pyplot(fig)
    # pyplot keeps figures alive until they are closed; release this one now
    # that it has been rendered.
    plt.close(fig)

    # Noise Reduction
    y_denoised = nr.reduce_noise(y=y, sr=sr)
    # Build the output name from the stem so it can never equal file_path
    # (a ".WAV" upload previously made str.replace a no-op, and the original
    # recording was overwritten by the denoised one).
    denoised_path = f"{stem}_denoised.wav"
    sf.write(denoised_path, y_denoised, sr)
# Speech-to-Text using Vosk
def transcribe_audio(audio_path):
    """Transcribe a WAV file to text with the module-level Vosk ``model``.

    The file is fed to a ``KaldiRecognizer`` in chunks of 4000 frames. Every
    segment the recognizer finalizes is collected, and the recognizer is
    flushed at end of file so trailing speech is not lost.

    NOTE(review): the Vosk examples expect mono 16-bit PCM input; this
    function does not check the format — confirm what callers pass in.

    Args:
        audio_path: Path to a WAV file readable by the ``wave`` module.

    Returns:
        The recognized text of the whole recording as one space-joined
        string; an empty string when nothing was recognized.
    """
    segments = []
    # The context manager closes the file even if recognition raises.
    with wave.open(audio_path, "rb") as wf:
        rec = KaldiRecognizer(model, wf.getframerate())
        while True:
            data = wf.readframes(4000)
            if not data:
                break
            # AcceptWaveform is truthy once a segment is complete; keep
            # reading afterwards instead of returning the first segment only.
            if rec.AcceptWaveform(data):
                segments.append(json.loads(rec.Result()).get("text", ""))
        # Flush whatever audio is still buffered in the recognizer.
        segments.append(json.loads(rec.FinalResult()).get("text", ""))
    return " ".join(segment for segment in segments if segment)
# Everything below works on the files prepared for the current upload, so it
# only runs once a file has been provided.
# NOTE(review): the leading "π" in the subheaders looks like a mis-decoded
# emoji — confirm against the original file before changing it.
if uploaded_file:
    # Speech-to-text on the uploaded recording.
    transcription = transcribe_audio(file_path)
    st.subheader("π Transcribed Text:")
    st.write(transcription)

    # Emotion Detection
    # NOTE(review): confirm this checkpoint id exists on the Hub and is an
    # emotion classifier; it is kept exactly as written.
    emotion_model = pipeline(
        "audio-classification",
        model="superb/wav2vec2-large-xlsr-53",
    )
    emotion_result = emotion_model(file_path)
    st.subheader("π Emotion Analysis:")
    st.write(emotion_result)

    # Play original and denoised audio
    st.audio(file_path, format="audio/wav", start_time=0)
    st.subheader("π Denoised Audio:")
    st.audio(denoised_path, format="audio/wav", start_time=0)