Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -10,16 +10,52 @@ import wave
|
|
| 10 |
import json
|
| 11 |
from vosk import Model, KaldiRecognizer
|
| 12 |
from transformers import pipeline
|
| 13 |
-
from huggingface_hub import snapshot_download
|
| 14 |
from pydub import AudioSegment
|
| 15 |
import noisereduce as nr
|
| 16 |
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
# ✅ Auto-Download Vosk Model (Speech-to-Text)
|
| 20 |
VOSK_MODEL = "vosk-model-small-en-us-0.15"
|
| 21 |
if not os.path.exists(VOSK_MODEL):
|
| 22 |
-
st.write("Downloading Vosk Model...")
|
| 23 |
subprocess.run(["wget", "-O", "vosk.zip", "https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip"])
|
| 24 |
subprocess.run(["unzip", "vosk.zip"])
|
| 25 |
subprocess.run(["rm", "vosk.zip"])
|
|
@@ -30,15 +66,15 @@ model = Model(VOSK_MODEL)
|
|
| 30 |
# ✅ Auto-Download Wav2Vec2 Model (Emotion Detection)
|
| 31 |
WAV2VEC_MODEL = "facebook/wav2vec2-large-xlsr-53"
|
| 32 |
if not os.path.exists(WAV2VEC_MODEL):
|
| 33 |
-
st.write(f"Downloading {WAV2VEC_MODEL}...")
|
| 34 |
snapshot_download(repo_id=WAV2VEC_MODEL, local_dir=WAV2VEC_MODEL)
|
| 35 |
|
| 36 |
# Load emotion detection model
|
| 37 |
emotion_model = pipeline("audio-classification", model=WAV2VEC_MODEL)
|
| 38 |
|
| 39 |
# ✅ Streamlit UI
|
| 40 |
-
st.
|
| 41 |
-
st.
|
| 42 |
|
| 43 |
uploaded_file = st.file_uploader("Upload an MP3/WAV file", type=["mp3", "wav"])
|
| 44 |
|
|
@@ -58,12 +94,14 @@ if uploaded_file:
|
|
| 58 |
# Load audio
|
| 59 |
y, sr = librosa.load(file_path, sr=16000)
|
| 60 |
|
| 61 |
-
# Display waveform
|
|
|
|
| 62 |
fig, ax = plt.subplots(figsize=(10, 4))
|
| 63 |
librosa.display.waveshow(y, sr=sr, ax=ax)
|
| 64 |
st.pyplot(fig)
|
| 65 |
|
| 66 |
# ✅ Noise Reduction
|
|
|
|
| 67 |
y_denoised = nr.reduce_noise(y=y, sr=sr)
|
| 68 |
denoised_path = file_path.replace(".wav", "_denoised.wav")
|
| 69 |
sf.write(denoised_path, y_denoised, sr)
|
|
@@ -82,16 +120,18 @@ if uploaded_file:
|
|
| 82 |
return result["text"]
|
| 83 |
|
| 84 |
transcription = transcribe_audio(file_path)
|
| 85 |
-
|
| 86 |
-
st.
|
|
|
|
| 87 |
|
| 88 |
# ✅ Emotion Detection
|
|
|
|
| 89 |
emotion_result = emotion_model(file_path)
|
| 90 |
-
|
| 91 |
-
st.subheader("π Emotion Analysis:")
|
| 92 |
st.write(emotion_result)
|
| 93 |
|
| 94 |
# ✅ Play Original & Denoised Audio
|
|
|
|
| 95 |
st.audio(file_path, format="audio/wav", start_time=0)
|
| 96 |
-
|
|
|
|
| 97 |
st.audio(denoised_path, format="audio/wav", start_time=0)
|
|
|
|
| 10 |
import json
|
| 11 |
from vosk import Model, KaldiRecognizer
|
| 12 |
from transformers import pipeline
|
| 13 |
+
from huggingface_hub import snapshot_download
|
| 14 |
from pydub import AudioSegment
|
| 15 |
import noisereduce as nr
|
| 16 |
|
| 17 |
+
# 🎨 Apply Custom CSS Styling
|
| 18 |
+
st.markdown(
|
| 19 |
+
"""
|
| 20 |
+
<style>
|
| 21 |
+
.stApp {
|
| 22 |
+
background-color: #f0f2f6;
|
| 23 |
+
}
|
| 24 |
+
.title {
|
| 25 |
+
font-size: 32px;
|
| 26 |
+
text-align: center;
|
| 27 |
+
color: #4A90E2;
|
| 28 |
+
font-weight: bold;
|
| 29 |
+
}
|
| 30 |
+
.subheader {
|
| 31 |
+
font-size: 20px;
|
| 32 |
+
font-weight: bold;
|
| 33 |
+
color: #333;
|
| 34 |
+
}
|
| 35 |
+
.stButton>button {
|
| 36 |
+
background-color: #4A90E2 !important;
|
| 37 |
+
color: white !important;
|
| 38 |
+
font-size: 18px !important;
|
| 39 |
+
padding: 10px 24px !important;
|
| 40 |
+
border-radius: 10px !important;
|
| 41 |
+
border: none !important;
|
| 42 |
+
}
|
| 43 |
+
.stAudio {
|
| 44 |
+
width: 100% !important;
|
| 45 |
+
}
|
| 46 |
+
.stMarkdown {
|
| 47 |
+
font-size: 16px;
|
| 48 |
+
color: #333;
|
| 49 |
+
}
|
| 50 |
+
</style>
|
| 51 |
+
""",
|
| 52 |
+
unsafe_allow_html=True
|
| 53 |
+
)
|
| 54 |
|
| 55 |
# ✅ Auto-Download Vosk Model (Speech-to-Text)
|
| 56 |
VOSK_MODEL = "vosk-model-small-en-us-0.15"
|
| 57 |
if not os.path.exists(VOSK_MODEL):
|
| 58 |
+
st.write("π₯ Downloading Vosk Model...")
|
| 59 |
subprocess.run(["wget", "-O", "vosk.zip", "https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip"])
|
| 60 |
subprocess.run(["unzip", "vosk.zip"])
|
| 61 |
subprocess.run(["rm", "vosk.zip"])
|
|
|
|
| 66 |
# ✅ Auto-Download Wav2Vec2 Model (Emotion Detection)
|
| 67 |
WAV2VEC_MODEL = "facebook/wav2vec2-large-xlsr-53"
|
| 68 |
if not os.path.exists(WAV2VEC_MODEL):
|
| 69 |
+
st.write(f"π₯ Downloading {WAV2VEC_MODEL}...")
|
| 70 |
snapshot_download(repo_id=WAV2VEC_MODEL, local_dir=WAV2VEC_MODEL)
|
| 71 |
|
| 72 |
# Load emotion detection model
|
| 73 |
emotion_model = pipeline("audio-classification", model=WAV2VEC_MODEL)
|
| 74 |
|
| 75 |
# ✅ Streamlit UI
|
| 76 |
+
st.markdown("<div class='title'>ποΈ Speech Detection System</div>", unsafe_allow_html=True)
|
| 77 |
+
st.markdown("<div class='subheader'>π Upload an audio file for speech-to-text, noise filtering, and emotion analysis.</div>", unsafe_allow_html=True)
|
| 78 |
|
| 79 |
uploaded_file = st.file_uploader("Upload an MP3/WAV file", type=["mp3", "wav"])
|
| 80 |
|
|
|
|
| 94 |
# Load audio
|
| 95 |
y, sr = librosa.load(file_path, sr=16000)
|
| 96 |
|
| 97 |
+
# 🎵 Display waveform
|
| 98 |
+
st.markdown("<div class='subheader'>πΌ Audio Waveform:</div>", unsafe_allow_html=True)
|
| 99 |
fig, ax = plt.subplots(figsize=(10, 4))
|
| 100 |
librosa.display.waveshow(y, sr=sr, ax=ax)
|
| 101 |
st.pyplot(fig)
|
| 102 |
|
| 103 |
# ✅ Noise Reduction
|
| 104 |
+
st.markdown("<div class='subheader'>π Applying Noise Reduction...</div>", unsafe_allow_html=True)
|
| 105 |
y_denoised = nr.reduce_noise(y=y, sr=sr)
|
| 106 |
denoised_path = file_path.replace(".wav", "_denoised.wav")
|
| 107 |
sf.write(denoised_path, y_denoised, sr)
|
|
|
|
| 120 |
return result["text"]
|
| 121 |
|
| 122 |
transcription = transcribe_audio(file_path)
|
| 123 |
+
|
| 124 |
+
st.markdown("<div class='subheader'>π Transcribed Text:</div>", unsafe_allow_html=True)
|
| 125 |
+
st.markdown(f"<div class='stMarkdown'>{transcription}</div>", unsafe_allow_html=True)
|
| 126 |
|
| 127 |
# ✅ Emotion Detection
|
| 128 |
+
st.markdown("<div class='subheader'>π Emotion Analysis:</div>", unsafe_allow_html=True)
|
| 129 |
emotion_result = emotion_model(file_path)
|
|
|
|
|
|
|
| 130 |
st.write(emotion_result)
|
| 131 |
|
| 132 |
# ✅ Play Original & Denoised Audio
|
| 133 |
+
st.markdown("<div class='subheader'>π Play Audio:</div>", unsafe_allow_html=True)
|
| 134 |
st.audio(file_path, format="audio/wav", start_time=0)
|
| 135 |
+
|
| 136 |
+
st.markdown("<div class='subheader'>π Denoised Audio:</div>", unsafe_allow_html=True)
|
| 137 |
st.audio(denoised_path, format="audio/wav", start_time=0)
|