import streamlit as st import whisper import tempfile import os import torchaudio from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration import numpy as np import soundfile as sf from io import BytesIO # Title and description st.title("๐ŸŽง Whisper Audio Transcriber") st.markdown("Upload a `.wav` or `.mp3` file or record audio using your microphone to get transcribed text with timestamps using Whisper.") # Load Whisper model @st.cache_resource def load_model(): return whisper.load_model("base") model = load_model() st.success("โœ… Whisper model loaded!") # File uploader audio_file = st.file_uploader("Upload audio file", type=["wav", "mp3"]) # Microphone recording st.subheader("๐ŸŽ™๏ธ Record Audio") RTC_CONFIGURATION = RTCConfiguration({"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}) class AudioProcessor: def __init__(self): self.audio_buffer = [] def recv(self, frame): self.audio_buffer.append(frame.to_ndarray()) return frame ctx = webrtc_streamer( key="audio-recorder", mode=WebRtcMode.SENDONLY, rtc_configuration=RTC_CONFIGURATION, media_stream_constraints={"audio": True, "video": False}, audio_processor_factory=AudioProcessor, ) if ctx.audio_processor: if st.button("Stop and Transcribe Recording"): if ctx.audio_processor.audio_buffer: st.info("๐Ÿ“ Processing recorded audio...") # Combine audio frames audio_data = np.concatenate(ctx.audio_processor.audio_buffer, axis=0) # Save as WAV file with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file: sf.write(tmp_file.name, audio_data, 16000) # WebRTC typically uses 16kHz temp_path = tmp_file.name # Transcription st.info("๐Ÿ“ Transcribing...") result = model.transcribe(temp_path) # Display segments st.subheader("๐Ÿ•’ Segments with Timestamps") for segment in result["segments"]: st.markdown(f"**[{segment['start']:.2f}s - {segment['end']:.2f}s]**: {segment['text']}") # Full transcription st.subheader("๐Ÿงพ Full Transcript") st.text_area("Transcribed Text", result["text"], height=250, key="recorded_transcript") # Clean up os.remove(temp_path) ctx.audio_processor.audio_buffer = [] # Clear buffer else: st.warning("โš ๏ธ No audio recorded.") # Process uploaded file if audio_file is not None: # Save uploaded file temporarily with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file: tmp_file.write(audio_file.read()) temp_path = tmp_file.name # Convert MP3 to WAV if needed if audio_file.name.endswith(".mp3"): waveform, sample_rate = torchaudio.load(temp_path) wav_path = temp_path.replace(".wav", "_converted.wav") torchaudio.save(wav_path, waveform, sample_rate) os.remove(temp_path) temp_path = wav_path # Transcription st.info("๐Ÿ“ Transcribing...") result = model.transcribe(temp_path) # Display segments st.subheader("๐Ÿ•’ Segments with Timestamps") for segment in result["segments"]: st.markdown(f"**[{segment['start']:.2f}s - {segment['end']:.2f}s]**: {segment['text']}") # Full transcription st.subheader("๐Ÿงพ Full Transcript") st.text_area("Transcribed Text", result["text"], height=250, key="uploaded_transcript") # Clean up os.remove(temp_path)