File size: 3,590 Bytes
6280a78
 
fc896e5
0e34747
ac965e5
43f0036
 
 
 
6280a78
fe5d88c
 
43f0036
fe5d88c
 
6280a78
fc896e5
 
6280a78
fc896e5
fe5d88c
 
 
 
 
43f0036
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4bd759c
43f0036
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe5d88c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43f0036
fe5d88c
 
43f0036
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import streamlit as st
import whisper
import tempfile
import os
import torchaudio
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
import numpy as np
import soundfile as sf
from io import BytesIO

# Page header: app name first, then a one-paragraph usage hint.
st.title("🎧 Whisper Audio Transcriber")
st.markdown(
    "Upload a `.wav` or `.mp3` file or record audio using your microphone "
    "to get transcribed text with timestamps using Whisper."
)

# Load Whisper model
@st.cache_resource
def load_model():
    """Load and return the Whisper ``"base"`` model.

    The function is wrapped in ``st.cache_resource`` so the loaded model
    object can be reused instead of being rebuilt on every call.
    """
    base_model = whisper.load_model("base")
    return base_model

# Load the model before any widget needs it; both the recording flow and the
# upload flow below transcribe through this single instance.
model = load_model()
# The banner text previously contained a mis-decoded emoji; restored to the
# intended check mark.
st.success("✅ Whisper model loaded!")

# File uploader (restricted to the two formats the upload flow handles)
audio_file = st.file_uploader("Upload audio file", type=["wav", "mp3"])

# Microphone recording
# The heading previously contained a mis-decoded emoji; restored to the
# intended studio-microphone glyph.
st.subheader("🎙️ Record Audio")
# ICE configuration handed to webrtc_streamer: a single STUN server entry.
RTC_CONFIGURATION = RTCConfiguration(
    {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
)
class AudioProcessor:
    """Collect microphone audio frames handed over by ``webrtc_streamer``.

    Each received frame is converted to a ``(samples, channels)`` array and
    appended to ``audio_buffer``; the frame itself is returned unchanged.
    """

    def __init__(self):
        # One array per received frame, each shaped (samples, channels), so
        # the list can be joined with ``np.concatenate(..., axis=0)`` and
        # written directly as multi-channel PCM.
        self.audio_buffer = []
        # Sample rate reported by the most recent frame; None until the
        # first frame arrives.
        self.sample_rate = None

    @staticmethod
    def _to_samples(frame):
        """Return the frame's PCM data as a ``(samples, channels)`` array."""
        pcm = frame.to_ndarray()
        if frame.format.is_planar:
            # Planar formats yield one row per channel: (channels, samples).
            return pcm.T
        # Packed formats yield a single row with the channels interleaved:
        # (1, samples * channels).
        return pcm.reshape(-1, len(frame.layout.channels))

    def recv(self, frame):
        """Store one frame's samples and pass the frame through unchanged.

        Args:
            frame: an audio frame exposing ``to_ndarray()``, ``format``,
                ``layout`` and ``sample_rate``.

        Returns:
            The same ``frame`` object.
        """
        self.sample_rate = frame.sample_rate
        self.audio_buffer.append(self._to_samples(frame))
        return frame

    async def recv_queued(self, frames):
        """Store every frame of a queued batch and return them unchanged.

        Args:
            frames: iterable of audio frames, oldest first.

        Returns:
            A list with the same frames, in the same order.
        """
        # NOTE(review): streamlit-webrtc is understood to call recv() with
        # only the newest frame of a batch when this hook is missing, which
        # would silently drop audio - confirm against the installed version.
        return [self.recv(frame) for frame in frames]

# Audio-only, send-only WebRTC widget; received frames are collected by the
# AudioProcessor instance that the factory creates.
ctx = webrtc_streamer(
    key="audio-recorder",
    mode=WebRtcMode.SENDONLY,
    rtc_configuration=RTC_CONFIGURATION,
    media_stream_constraints={"audio": True, "video": False},
    audio_processor_factory=AudioProcessor,
)
if ctx.audio_processor:
    if st.button("Stop and Transcribe Recording"):
        # Work on a snapshot so frames appended while this run is busy
        # cannot change the list mid-concatenation.
        recorded_frames = list(ctx.audio_processor.audio_buffer)
        if recorded_frames:
            st.info("📝 Processing recorded audio...")
            # Combine audio frames
            audio_data = np.concatenate(recorded_frames, axis=0)
            # Use the rate the processor reports when it exposes one; fall
            # back to 16 kHz only when no rate is available.
            sample_rate = getattr(ctx.audio_processor, "sample_rate", None) or 16000

            # Reserve a temp path and close the handle before writing:
            # writing by name to a still-open NamedTemporaryFile fails on
            # Windows.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                temp_path = tmp_file.name
            try:
                # Save as WAV file
                sf.write(temp_path, audio_data, sample_rate)

                # Transcription
                st.info("📝 Transcribing...")
                result = model.transcribe(temp_path)

                # Display segments
                st.subheader("🕒 Segments with Timestamps")
                for segment in result["segments"]:
                    st.markdown(f"**[{segment['start']:.2f}s - {segment['end']:.2f}s]**: {segment['text']}")

                # Full transcription
                st.subheader("🧾 Full Transcript")
                st.text_area("Transcribed Text", result["text"], height=250, key="recorded_transcript")
            finally:
                # Clean up even when writing or transcription raises.
                os.remove(temp_path)

            # Clear the buffer only after a successful transcription so a
            # failed attempt does not discard the recording.
            ctx.audio_processor.audio_buffer = []
        else:
            st.warning("⚠️ No audio recorded.")

# Process uploaded file
if audio_file is not None:
    # Keep the upload's real extension (lower-cased) so an MP3 is not stored
    # under a misleading ".wav" name and "SONG.MP3" is still recognised.
    upload_suffix = os.path.splitext(audio_file.name)[1].lower() or ".wav"
    # Every temp file created below is registered here and removed in the
    # finally block, including when loading or transcription fails.
    cleanup_paths = []
    try:
        # Save uploaded file temporarily. getvalue() returns the full
        # contents regardless of the current read position of the upload.
        with tempfile.NamedTemporaryFile(delete=False, suffix=upload_suffix) as tmp_file:
            tmp_file.write(audio_file.getvalue())
            temp_path = tmp_file.name
        cleanup_paths.append(temp_path)

        # Convert MP3 to WAV if needed
        if upload_suffix == ".mp3":
            waveform, sample_rate = torchaudio.load(temp_path)
            # Swap only the extension; str.replace would also rewrite any
            # earlier ".wav"/".mp3" occurrence inside the path.
            wav_path = os.path.splitext(temp_path)[0] + "_converted.wav"
            cleanup_paths.append(wav_path)
            torchaudio.save(wav_path, waveform, sample_rate)
            temp_path = wav_path

        # Transcription
        st.info("📝 Transcribing...")
        result = model.transcribe(temp_path)

        # Display segments
        st.subheader("🕒 Segments with Timestamps")
        for segment in result["segments"]:
            st.markdown(f"**[{segment['start']:.2f}s - {segment['end']:.2f}s]**: {segment['text']}")

        # Full transcription
        st.subheader("🧾 Full Transcript")
        st.text_area("Transcribed Text", result["text"], height=250, key="uploaded_transcript")
    finally:
        # Clean up; a path may be missing if the failing step never
        # created the file.
        for stale_path in cleanup_paths:
            if os.path.exists(stale_path):
                os.remove(stale_path)