# app.py — Streamlit + Whisper speech-to-text demo
# (web-copy residue preserved: author "ricklon", commit "Update app.py", 244b56f)
import os
import tempfile

import streamlit as st
import whisper
# Module-level cache: Streamlit re-executes the script on every interaction,
# so loading the model inside the function would reread the weights from disk
# for every uploaded file.
_model = None


def _get_model():
    """Lazily load the Whisper 'base' model once, pinned to CPU in float32."""
    global _model
    if _model is None:
        # .float() keeps the model in full precision on CPU.
        _model = whisper.load_model("base").to("cpu").float()
    return _model


def transcribe_and_detect_language(audio_file):
    """Transcribe an audio file with Whisper and detect its spoken language.

    Parameters
    ----------
    audio_file : str
        Path to an audio file that ffmpeg can decode (e.g. wav or mp3).

    Returns
    -------
    tuple[str, str]
        (detected language code, transcribed text).
    """
    model = _get_model()

    # Load the waveform and pad/trim it to Whisper's fixed 30-second window.
    audio = whisper.load_audio(audio_file)
    audio = whisper.pad_or_trim(audio)

    # Log-Mel spectrogram, forced to float32 to match the CPU model's precision.
    mel = whisper.log_mel_spectrogram(audio).to(model.device).float()

    # detect_language returns (language tokens, per-language probabilities);
    # pick the most probable language code.
    _, probs = model.detect_language(mel)
    detected_language = max(probs, key=probs.get)

    # fp16=False: DecodingOptions defaults to fp16=True, which warns and falls
    # back on a CPU/float32 model — make the full-precision choice explicit.
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)

    return detected_language, result.text
# ---- Streamlit UI ----
st.title("Speech to Text with Whisper")

# File uploader widget; wav/mp3 are the formats ffmpeg decodes here.
uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])

if uploaded_file is not None:
    # Keep the upload's extension so ffmpeg can sniff the container format,
    # and close the handle before handing the path to Whisper (required on
    # Windows, where an open NamedTemporaryFile cannot be reopened by name).
    suffix = os.path.splitext(uploaded_file.name)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        tmp_path = tmp_file.name

    try:
        with st.spinner('Processing...'):
            language, transcribed_text = transcribe_and_detect_language(tmp_path)
    finally:
        # delete=False means nothing removes the file automatically — the
        # original code leaked one temp file per upload. Clean it up here.
        os.unlink(tmp_path)

    st.write(f"Detected language: {language}")
    st.text_area("Transcribed Text:", value=transcribed_text, height=300)