Spaces:

ojas121
/

speech_emotion_project

Running

App Files Files Community

speech_emotion_project / app.py

ojas121

Update app.py

032610a verified 9 months ago

raw

history blame

2.87 kB

	import streamlit as st
	import librosa
	import librosa.display
	import numpy as np
	import matplotlib.pyplot as plt
	import soundfile as sf
	import wave
	import json
	from vosk import Model, KaldiRecognizer
	from transformers import pipeline
	import os
	from pydub import AudioSegment
	import noisereduce as nr

	import streamlit as st
	import subprocess

	# Confirm librosa is importable; if it is not, install it and retry the import.
	# NOTE(review): the import block above already imports librosa unconditionally,
	# so a missing package would fail there first -- confirm this fallback is reachable.
	try:
	    import librosa
	    st.write("✅ Librosa is installed successfully!")
	except ImportError:
	    import sys

	    st.write("❌ Librosa is missing! Installing now...")
	    # Invoke pip through the running interpreter so the package is installed
	    # into this app's environment rather than whichever "pip" is first on PATH.
	    # check=True surfaces a failed install instead of silently continuing.
	    subprocess.run([sys.executable, "-m", "pip", "install", "librosa"], check=True)
	    import librosa
	    st.write("✅ Librosa installed successfully!")



	# Location of the Vosk speech-recognition model on disk.
	MODEL_PATH = "vosk-model-small-en-us-0.15"

	# Report a missing model to the user and stop before trying to load it.
	model_is_present = os.path.exists(MODEL_PATH)
	if not model_is_present:
	    st.error("Vosk model not found! Please download and extract it.")
	    st.stop()

	# Recognizer model used by the transcription step further down.
	model = Model(MODEL_PATH)

	# Streamlit UI: page heading, a one-line description, then the upload widget.
	st.title("🎙️ Speech Detection System using Mozilla Common Voice")
	st.write(
	    "Upload an audio file and get real-time speech-to-text, noise filtering, and emotion analysis."
	)

	# The uploader is restricted to MP3 and WAV files.
	accepted_extensions = ["mp3", "wav"]
	uploaded_file = st.file_uploader("Upload an MP3/WAV file", type=accepted_extensions)

	# Speech-to-Text using Vosk
	def transcribe_audio(audio_path):
	    """Return the full Vosk transcription of the WAV file at *audio_path*.

	    The file is streamed to a ``KaldiRecognizer`` built from the module-level
	    ``model`` in 4000-frame chunks. Text from every utterance the recognizer
	    finalizes is kept, and ``FinalResult()`` is read once the stream ends so
	    the trailing (or only) utterance is included as well.

	    Args:
	        audio_path: Path of a WAV file readable by the ``wave`` module.

	    Returns:
	        The recognized text joined with single spaces, or an empty string
	        when nothing was recognized.

	    NOTE(review): the file is handed to Vosk unchanged; presumably it needs
	    to be mono 16-bit PCM for reliable results -- confirm against Vosk docs.
	    """
	    segments = []
	    # The context manager closes the wave file even if recognition raises.
	    with wave.open(audio_path, "rb") as wf:
	        rec = KaldiRecognizer(model, wf.getframerate())
	        while True:
	            data = wf.readframes(4000)
	            if not data:
	                break
	            # True means an utterance was finalized; collect it and keep going
	            # instead of stopping at the first one.
	            if rec.AcceptWaveform(data):
	                segments.append(json.loads(rec.Result()).get("text", ""))
	        # Flush whatever audio is still buffered in the recognizer.
	        segments.append(json.loads(rec.FinalResult()).get("text", ""))
	    return " ".join(text for text in segments if text)


	if uploaded_file:
	    # The upload name comes from the client: keep only its final component so
	    # it cannot point outside the temp directory.
	    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
	    file_path = f"temp/{safe_name}"
	    os.makedirs("temp", exist_ok=True)
	    with open(file_path, "wb") as f:
	        f.write(uploaded_file.getbuffer())

	    # Convert MP3 to WAV if needed. The extension is split off once and compared
	    # case-insensitively, so names such as "CLIP.MP3" are converted too.
	    stem, extension = os.path.splitext(file_path)
	    if extension.lower() == ".mp3":
	        wav_path = stem + ".wav"
	        audio = AudioSegment.from_mp3(file_path)
	        audio.export(wav_path, format="wav")
	        file_path = wav_path

	    # Load audio
	    y, sr = librosa.load(file_path, sr=16000)

	    # Display waveform
	    fig, ax = plt.subplots(figsize=(10, 4))
	    librosa.display.waveshow(y, sr=sr, ax=ax)
	    st.pyplot(fig)
	    # Release the figure once rendered so figures do not accumulate.
	    plt.close(fig)

	    # Noise Reduction. The output path is built from the stem so it can never
	    # be the same file as the audio being read.
	    y_denoised = nr.reduce_noise(y=y, sr=sr)
	    denoised_path = stem + "_denoised.wav"
	    sf.write(denoised_path, y_denoised, sr)

	    transcription = transcribe_audio(file_path)
	    st.subheader("📝 Transcribed Text:")
	    st.write(transcription)

	    # Emotion Detection
	    emotion_model = pipeline("audio-classification", model="superb/wav2vec2-large-xlsr-53")
	    emotion_result = emotion_model(file_path)

	    st.subheader("😊 Emotion Analysis:")
	    st.write(emotion_result)

	    # Play original and denoised audio
	    st.audio(file_path, format="audio/wav", start_time=0)
	    st.subheader("🔊 Denoised Audio:")
	    st.audio(denoised_path, format="audio/wav", start_time=0)