try interactive audio player and clickable transcript

- src/asr.py +21 -8
- src/streamlit_app.py +119 -32
src/asr.py
CHANGED

@@ -4,14 +4,13 @@ import soundfile as sf
 from scipy.signal import resample_poly
 from silero_vad import load_silero_vad, VADIterator
 from moonshine_onnx import MoonshineOnnxModel, load_tokenizer
-import opencc
+from utils import s2tw_converter
 import re
 
 SAMPLING_RATE = 16000
 CHUNK_SIZE = 512
 
 tokenizer = load_tokenizer()
-s2tw_converter = opencc.OpenCC('s2twp')
 
 def clean_transcript(text):
     text = re.sub(r'[�\uFFFD��]', '', text)
@@ -33,8 +32,9 @@ def transcribe_file(audio_path, vad_threshold, model_name):
     if wav.ndim > 1:
         wav = wav.mean(axis=1)
 
+    utterances = []  # Store all utterances (start, end, text)
     speech_buffer = np.array([], dtype=np.float32)
-
+    segment_start = 0.0  # Track start time of current segment
 
     i = 0
     while i < len(wav):
@@ -48,19 +48,32 @@ def transcribe_file(audio_path, vad_threshold, model_name):
 
         if speech_dict:
             if "end" in speech_dict:
+                # Calculate timestamps
+                segment_end = i / SAMPLING_RATE
                 text = model.generate(speech_buffer[np.newaxis, :].astype(np.float32))
                 text = tokenizer.decode_batch(text)[0].strip()
                 if text:
-
-
+                    cleaned_text = clean_transcript(s2tw_converter.convert(text))
+                    utterances.append((segment_start, segment_end, cleaned_text))
+                    # Yield current utterance + all accumulated utterances
+                    yield utterances[-1], utterances.copy()
+                # Reset for next segment
                 speech_buffer = np.array([], dtype=np.float32)
+                segment_start = i / SAMPLING_RATE  # Start of next segment
                 vad_iterator.reset_states()
 
+    # Process final segment
     if len(speech_buffer) > SAMPLING_RATE * 0.5:
+        segment_end = len(wav) / SAMPLING_RATE
        text = model.generate(speech_buffer[np.newaxis, :].astype(np.float32))
        text = tokenizer.decode_batch(text)[0].strip()
        if text:
-
-
+            cleaned_text = clean_transcript(s2tw_converter.convert(text))
+            utterances.append((segment_start, segment_end, cleaned_text))
+            yield utterances[-1], utterances.copy()
 
-
+    # Final yield with all utterances
+    if utterances:
+        yield None, utterances
+    else:
+        yield None, [(-1, -1, "No speech detected")]
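For reference, transcribe_file now behaves as a generator of (latest_utterance, all_utterances) pairs, where each utterance is a (start_seconds, end_seconds, text) tuple, and a final (None, all_utterances) is yielded after the last segment. A minimal consumer sketch, assuming hypothetical inputs (the path "episode.mp3", threshold 0.5, and model name "moonshine/base" are illustrative, not from this commit):

from asr import transcribe_file

# Placeholder arguments: audio path, VAD threshold, Moonshine model name.
for current_utt, all_utts in transcribe_file("episode.mp3", 0.5, "moonshine/base"):
    if current_utt is None:
        break  # final yield: all_utts holds the complete utterance list
    start, end, text = current_utt
    print(f"[{start:.1f}-{end:.1f}] {text}")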
src/streamlit_app.py
CHANGED

@@ -4,6 +4,8 @@ from asr import transcribe_file
 from summarization import summarize_transcript
 from podcast import search_podcast_series, fetch_episodes, download_podcast_audio, fetch_audio
 from utils import model_names, available_gguf_llms
+import base64
+import time
 
 # Session state init
 if "transcript" not in st.session_state:
@@ -14,6 +16,10 @@ if "status" not in st.session_state:
     st.session_state.status = "Ready"
 if "audio_path" not in st.session_state:
     st.session_state.audio_path = None
+if "utterances" not in st.session_state:
+    st.session_state.utterances = []
+if "audio_base64" not in st.session_state:
+    st.session_state.audio_base64 = None
 
 st.set_page_config(page_title="🎙️ Moonshine ASR + LLM", layout="wide")
 st.title("🎙️ Speech Summarization with Moonshine ASR & LLM")
@@ -73,22 +79,47 @@ with tab2:
 
 with tab3:
     st.subheader("Transcription & Summary")
-
-    #
-    if st.session_state.audio_path:
-        st.
-
+
+    # Initialize audio player
+    if st.session_state.audio_path and not st.session_state.audio_base64:
+        with open(st.session_state.audio_path, "rb") as f:
+            audio_bytes = f.read()
+        st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+
+    if st.session_state.audio_base64:
+        # Audio player with time update handler
+        audio_html = f"""
+        <audio id="audioPlayer" controls ontimeupdate="updateTime(this)">
+            <source src="data:audio/mp3;base64,{st.session_state.audio_base64}" type="audio/mp3">
+        </audio>
+        <script>
+        function seekAudio(time) {{
+            const player = document.getElementById('audioPlayer');
+            player.currentTime = time;
+            player.play();
+        }}
+
+        function updateTime(audio) {{
+            const time = audio.currentTime;
+            window.parent.postMessage({{
+                type: "currentTimeUpdate",
+                time: time
+            }}, "*");
+        }}
+
+        // Handle transcript click messages
+        window.addEventListener('message', (event) => {{
+            if (event.data.type === 'seekToTime') {{
+                seekAudio(event.data.time);
+            }}
+        }});
+        </script>
+        """
+        st.markdown(audio_html, unsafe_allow_html=True)
 
     status_placeholder = st.empty()
-
+    transcript_placeholder = st.empty()
 
-    # Persistent transcript display
-    if st.session_state.get('transcript'):
-        st.text_area("Transcription",
-                     value=st.session_state.transcript,
-                     height=300,
-                     key="transcript_display")
-
     # Persistent summary display
     if st.session_state.get('summary'):
         st.markdown("### Summary")
@@ -97,31 +128,91 @@ with tab3:
     if st.button("🎙️ Transcribe"):
         if st.session_state.audio_path:
             status_placeholder.text("Transcribing...")
-
-            # Temporary placeholder for incremental updates
-            live_transcript_placeholder = st.empty()
+            st.session_state.utterances = []
             st.session_state.transcript = ""
 
-
+            # Initialize transcript display
+            transcript_placeholder.empty()
+            transcript_display = st.empty()
+
+            # Generate transcript HTML
+            def generate_transcript_html(utterances, current_time=0):
+                html = """
+                <style>
+                .utterance {
+                    padding: 8px;
+                    margin: 4px 0;
+                    border-radius: 4px;
+                    cursor: pointer;
+                    transition: background 0.2s;
+                }
+                .utterance:hover { background-color: #f0f0f0; }
+                .current-utterance {
+                    background-color: #ffebee;
+                    border-left: 3px solid #f44336;
+                    font-weight: 500;
+                }
+                </style>
+                <div id="transcript-container">
+                """
+                for start, end, text in utterances:
+                    is_current = start <= current_time < end
+                    html += f"""
+                    <div class="utterance {'current-utterance' if is_current else ''}"
+                         onclick="parent.postMessage({{type: 'seekToTime', time: {start}}}, '*')">
+                        <b>[{time.strftime('%M:%S', time.gmtime(start))}-{time.strftime('%M:%S', time.gmtime(end))}]</b> {text}
+                    </div>
+                    """
+                html += "</div>"
+                return html
+
+            # Process ASR output
+            for current_utt, all_utts in transcribe_file(
                 st.session_state.audio_path, vad_threshold, model_names[model_name]
             ):
-                st.session_state.
-
+                st.session_state.utterances = all_utts
+                st.session_state.transcript = "\n".join(
+                    f"[{start:.1f}-{end:.1f}] {text}"
+                    for start, end, text in all_utts
+                )
+
+                # Update transcript display
+                transcript_html = generate_transcript_html(
+                    all_utts,
+                    st.session_state.get('current_time', 0)
+                )
+                transcript_display.markdown(transcript_html, unsafe_allow_html=True)
 
-            # Clear temporary placeholder after completion
-            live_transcript_placeholder.empty()
             status_placeholder.empty()
 
-
-
-
+    # Time update handling
+    current_time_js = """
+    <script>
+    window.addEventListener('message', (event) => {
+        if (event.data.type === 'currentTimeUpdate') {
+            Streamlit.setComponentValue(event.data.time);
+        }
+    });
+    </script>
+    """
+    current_time = st.components.v1.html(current_time_js, height=0)
+
+    if current_time:
+        st.session_state.current_time = current_time
+        # Update highlighting when time changes
+        if st.session_state.utterances:
+            transcript_html = generate_transcript_html(
+                st.session_state.utterances,
+                current_time
+            )
+            transcript_placeholder.markdown(transcript_html, unsafe_allow_html=True)
+
+    # Summarization button
     if st.button("📝 Summarize"):
         if st.session_state.transcript:
             status_placeholder.text("Summarizing...")
-
-            # Temporary placeholder for incremental updates
            live_summary_placeholder = st.empty()
-
+            st.session_state.summary = ""
 
            for accumulated_summary in summarize_transcript(
                st.session_state.transcript, llm_model, prompt_input
@@ -129,9 +220,5 @@ with tab3:
                st.session_state.summary = accumulated_summary
                live_summary_placeholder.markdown(accumulated_summary)
 
-            # Clear temporary placeholder after completion
            live_summary_placeholder.empty()
-            status_placeholder.empty()
-
-            # Force UI update to show persistent summary
-            st.rerun()
+            status_placeholder.empty()
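The highlight rule in generate_transcript_html is plain interval membership (start <= current_time < end), and the [MM:SS] labels come from time.strftime over time.gmtime. A self-contained sketch of that logic, using made-up utterance data rather than anything from this commit:

import time

def fmt(seconds):
    # Same MM:SS formatting used in generate_transcript_html
    return time.strftime('%M:%S', time.gmtime(seconds))

utterances = [(0.0, 3.2, "hello"), (3.2, 7.9, "world")]  # made-up data
current_time = 4.0  # pretend the audio player reported this position
for start, end, text in utterances:
    marker = ">" if start <= current_time < end else " "
    print(f"{marker} [{fmt(start)}-{fmt(end)}] {text}")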
|