Spaces: Build error
Update app.py
app.py CHANGED
@@ -23,8 +23,8 @@ from transformers import pipeline
 
 @st.cache_resource
 def load_voice_model():
-
-
+    if 'whisper_model' not in st.session_state:
+        st.session_state.whisper_model = pipeline("automatic-speech-recognition", model="openai/whisper-base")
 
 def process_audio(audio_bytes):
     waveform, sample_rate = torchaudio.load(BytesIO(audio_bytes))
@@ -35,71 +35,33 @@ def process_audio(audio_bytes):
     waveform = resampler(waveform)
     return {"raw": waveform.numpy().squeeze(), "sampling_rate": 16000}
 
-
 def get_voice_transcription(state_key):
     """Display audio recorder for a given key.
     If new audio is recorded, transcribe it and update the session state.
     """
     if state_key not in st.session_state:
         st.session_state[state_key] = ""
-
     # Use a unique key for the recorder widget
     audio_bytes = audio_recorder(key=state_key + "_audio",
-
-
-
-
-
+                                 pause_threshold=0.8,
+                                 text="Speak to type",
+                                 recording_color="#e8b62c",
+                                 neutral_color="#6aa36f")
     if audio_bytes:
         current_hash = hashlib.md5(audio_bytes).hexdigest()
         last_hash_key = state_key + "_last_hash"
-
         if st.session_state.get(last_hash_key, "") != current_hash:
             st.session_state[last_hash_key] = current_hash
-
-            # Create a status element
-            status = st.empty()
             try:
-                # Show loading message
-                status.markdown("""
-                <div style="display: flex; align-items: center; gap: 0.5rem; padding: 0.5rem;
-                background: #f0f2f6; border-radius: 8px;">
-                <div class="loader"></div>
-                <span>Processing your voice...</span>
-                </div>
-                <style>
-                .loader {
-                    border: 3px solid #f3f3f3;
-                    border-radius: 50%;
-                    border-top: 3px solid #6C63FF;
-                    width: 20px;
-                    height: 20px;
-                    animation: spin 1s linear infinite;
-                }
-                @keyframes spin {
-                    0% { transform: rotate(0deg); }
-                    100% { transform: rotate(360deg); }
-                }
-                </style>
-                """, unsafe_allow_html=True)
-
-                # Process audio
                 audio_input = process_audio(audio_bytes)
                 whisper = load_voice_model()
                 transcribed_text = whisper(audio_input)["text"]
-
-                # Clear loading and show result
-                status.empty()
                 st.info(f"📝 Transcribed: {transcribed_text}")
-
-                # Update session state
+                # Append (or set) new transcription
                 st.session_state[state_key] += (" " + transcribed_text).strip()
                 st.experimental_rerun()
-
             except Exception as e:
-                status.empty()
                 st.error(f"Voice input error: {str(e)}")
-
     return st.session_state[state_key]
 
 ######################################
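For context, the second hunk elides lines 31-34 of `process_audio`; the visible lines (a `resampler` and a fixed 16 kHz return value) imply the usual Whisper preprocessing. A hedged sketch of the full function under that assumption, where the mono downmix and the `Resample` construction are guesses at the elided part, not code from this commit:

import torchaudio
from io import BytesIO

def process_audio(audio_bytes):
    waveform, sample_rate = torchaudio.load(BytesIO(audio_bytes))
    # Assumed content of the elided lines: Whisper models expect
    # 16 kHz mono input, so downmix and resample if needed.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
    return {"raw": waveform.numpy().squeeze(), "sampling_rate": 16000}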
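Why the md5 digest: Streamlit reruns the whole script on every widget interaction, and the recorder keeps returning the bytes of the last recording, so without remembering a digest the same clip would be transcribed again on every rerun. The pattern in isolation, as a hypothetical `is_new_payload` helper (the name is illustrative, not from the commit):

import hashlib
import streamlit as st

def is_new_payload(payload: bytes, hash_key: str) -> bool:
    # True only the first time a given payload is seen under hash_key.
    digest = hashlib.md5(payload).hexdigest()
    if st.session_state.get(hash_key, "") != digest:
        st.session_state[hash_key] = digest
        return True
    return False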
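One more thing worth checking against the Space's Streamlit version: `st.experimental_rerun()` was deprecated in favor of `st.rerun()` (available since Streamlit 1.27) and is gone from recent releases, so on an unpinned Space image this call can itself fail. A version-tolerant sketch:

# Call whichever rerun API the installed Streamlit provides.
if hasattr(st, "rerun"):
    st.rerun()
else:
    st.experimental_rerun()

Pinning streamlit in requirements.txt is the other way to keep this stable.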