Luigi committed
Commit c029cee · 1 Parent(s): a76c0df

try interactive audio player and clickable transcript

Files changed (2)
  1. src/asr.py +21 -8
  2. src/streamlit_app.py +119 -32
src/asr.py CHANGED

@@ -4,14 +4,13 @@ import soundfile as sf
  from scipy.signal import resample_poly
  from silero_vad import load_silero_vad, VADIterator
  from moonshine_onnx import MoonshineOnnxModel, load_tokenizer
- import opencc
+ from utils import s2tw_converter
  import re

  SAMPLING_RATE = 16000
  CHUNK_SIZE = 512

  tokenizer = load_tokenizer()
- s2tw_converter = opencc.OpenCC('s2twp')

  def clean_transcript(text):
      text = re.sub(r'[�\uFFFD��]', '', text)
@@ -33,8 +32,9 @@ def transcribe_file(audio_path, vad_threshold, model_name):
      if wav.ndim > 1:
          wav = wav.mean(axis=1)

+     utterances = []  # Store all utterances (start, end, text)
      speech_buffer = np.array([], dtype=np.float32)
-     full_transcript = []
+     segment_start = 0.0  # Track start time of current segment

      i = 0
      while i < len(wav):
@@ -48,19 +48,32 @@ def transcribe_file(audio_path, vad_threshold, model_name):

          if speech_dict:
              if "end" in speech_dict:
+                 # Calculate timestamps
+                 segment_end = i / SAMPLING_RATE
                  text = model.generate(speech_buffer[np.newaxis, :].astype(np.float32))
                  text = tokenizer.decode_batch(text)[0].strip()
                  if text:
-                     full_transcript.append(clean_transcript(s2tw_converter.convert(text)))
-                     yield " ".join(full_transcript)  # , "Transcribing"
+                     cleaned_text = clean_transcript(s2tw_converter.convert(text))
+                     utterances.append((segment_start, segment_end, cleaned_text))
+                     # Yield current utterance + all accumulated utterances
+                     yield utterances[-1], utterances.copy()
+                 # Reset for next segment
                  speech_buffer = np.array([], dtype=np.float32)
+                 segment_start = i / SAMPLING_RATE  # Start of next segment
                  vad_iterator.reset_states()

+     # Process final segment
      if len(speech_buffer) > SAMPLING_RATE * 0.5:
+         segment_end = len(wav) / SAMPLING_RATE
          text = model.generate(speech_buffer[np.newaxis, :].astype(np.float32))
          text = tokenizer.decode_batch(text)[0].strip()
          if text:
-             full_transcript.append(clean_transcript(s2tw_converter.convert(text)))
-             yield " ".join(full_transcript)  # , "Transcription complete"
+             cleaned_text = clean_transcript(s2tw_converter.convert(text))
+             utterances.append((segment_start, segment_end, cleaned_text))
+             yield utterances[-1], utterances.copy()

-     yield " ".join(full_transcript) if full_transcript else "No speech detected."  # , "Transcription complete"
+     # Final yield with all utterances
+     if utterances:
+         yield None, utterances
+     else:
+         yield None, [(-1, -1, "No speech detected")]
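For reference, a minimal sketch (assumed caller code, not part of the commit) of how the generator's new interface is consumed: transcribe_file now yields (current_utterance, all_utterances) tuples instead of a single accumulated string, and ends with a final (None, all_utterances) yield. The audio path, VAD threshold, and model name below are placeholders.

    from asr import transcribe_file

    # Each utterance is a (start_seconds, end_seconds, text) tuple.
    for current_utt, all_utts in transcribe_file(
        "audio.wav", vad_threshold=0.5, model_name="moonshine/base"
    ):
        if current_utt is not None:
            start, end, text = current_utt  # the utterance just finalized
            print(f"[{start:.1f}-{end:.1f}] {text}")
        else:
            # Final yield: all_utts holds the complete utterance list
            # (or [(-1, -1, "No speech detected")] if nothing was found).
            print(f"Done: {len(all_utts)} utterance(s)")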
src/streamlit_app.py CHANGED

@@ -4,6 +4,8 @@ from asr import transcribe_file
  from summarization import summarize_transcript
  from podcast import search_podcast_series, fetch_episodes, download_podcast_audio, fetch_audio
  from utils import model_names, available_gguf_llms
+ import base64
+ import time

  # Session state init
  if "transcript" not in st.session_state:
@@ -14,6 +16,10 @@ if "status" not in st.session_state:
      st.session_state.status = "Ready"
  if "audio_path" not in st.session_state:
      st.session_state.audio_path = None
+ if "utterances" not in st.session_state:
+     st.session_state.utterances = []
+ if "audio_base64" not in st.session_state:
+     st.session_state.audio_base64 = None

  st.set_page_config(page_title="🎙️ Moonshine ASR + LLM", layout="wide")
  st.title("🎙️ Speech Summarization with Moonshine ASR & LLM")
@@ -73,22 +79,47 @@ with tab2:

  with tab3:
      st.subheader("Transcription & Summary")
-
-     # Display audio player if audio is available
-     if st.session_state.audio_path:
-         st.markdown("### 🔊 Audio Preview")
-         st.audio(st.session_state.audio_path)
+
+     # Initialize audio player
+     if st.session_state.audio_path and not st.session_state.audio_base64:
+         with open(st.session_state.audio_path, "rb") as f:
+             audio_bytes = f.read()
+         st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+
+     if st.session_state.audio_base64:
+         # Audio player with time update handler
+         audio_html = f"""
+         <audio id="audioPlayer" controls ontimeupdate="updateTime(this)">
+             <source src="data:audio/mp3;base64,{st.session_state.audio_base64}" type="audio/mp3">
+         </audio>
+         <script>
+             function seekAudio(time) {{
+                 const player = document.getElementById('audioPlayer');
+                 player.currentTime = time;
+                 player.play();
+             }}
+
+             function updateTime(audio) {{
+                 const time = audio.currentTime;
+                 window.parent.postMessage({{
+                     type: "currentTimeUpdate",
+                     time: time
+                 }}, "*");
+             }}
+
+             // Handle transcript click messages
+             window.addEventListener('message', (event) => {{
+                 if (event.data.type === 'seekToTime') {{
+                     seekAudio(event.data.time);
+                 }}
+             }});
+         </script>
+         """
+         st.markdown(audio_html, unsafe_allow_html=True)

      status_placeholder = st.empty()
-     status_placeholder.text(st.session_state.get('status', ''))
+     transcript_placeholder = st.empty()

-     # Persistent transcript display
-     if st.session_state.get('transcript'):
-         st.text_area("Transcription",
-                      value=st.session_state.transcript,
-                      height=300,
-                      key="transcript_display")
-
      # Persistent summary display
      if st.session_state.get('summary'):
          st.markdown("### Summary")
@@ -97,31 +128,91 @@ with tab3:
      if st.button("🎙️ Transcribe"):
          if st.session_state.audio_path:
              status_placeholder.text("Transcribing...")
-
-             # Temporary placeholder for incremental updates
-             live_transcript_placeholder = st.empty()
+             st.session_state.utterances = []
              st.session_state.transcript = ""

-             for accumulated_transcript in transcribe_file(
+             # Initialize transcript display
+             transcript_placeholder.empty()
+             transcript_display = st.empty()
+
+             # Generate transcript HTML
+             def generate_transcript_html(utterances, current_time=0):
+                 html = """
+                 <style>
+                     .utterance {
+                         padding: 8px;
+                         margin: 4px 0;
+                         border-radius: 4px;
+                         cursor: pointer;
+                         transition: background 0.2s;
+                     }
+                     .utterance:hover { background-color: #f0f0f0; }
+                     .current-utterance {
+                         background-color: #ffebee;
+                         border-left: 3px solid #f44336;
+                         font-weight: 500;
+                     }
+                 </style>
+                 <div id="transcript-container">
+                 """
+                 for start, end, text in utterances:
+                     is_current = start <= current_time < end
+                     html += f"""
+                     <div class="utterance {'current-utterance' if is_current else ''}"
+                          onclick="parent.postMessage({{type: 'seekToTime', time: {start}}}, '*')">
+                         <b>[{time.strftime('%M:%S', time.gmtime(start))}-{time.strftime('%M:%S', time.gmtime(end))}]</b> {text}
+                     </div>
+                     """
+                 html += "</div>"
+                 return html
+
+             # Process ASR output
+             for current_utt, all_utts in transcribe_file(
                  st.session_state.audio_path, vad_threshold, model_names[model_name]
              ):
-                 st.session_state.transcript = accumulated_transcript
-                 live_transcript_placeholder.text(accumulated_transcript)
+                 st.session_state.utterances = all_utts
+                 st.session_state.transcript = "\n".join(
+                     f"[{start:.1f}-{end:.1f}] {text}"
+                     for start, end, text in all_utts
+                 )
+
+                 # Update transcript display
+                 transcript_html = generate_transcript_html(
+                     all_utts,
+                     st.session_state.get('current_time', 0)
+                 )
+                 transcript_display.markdown(transcript_html, unsafe_allow_html=True)

-             # Clear temporary placeholder after completion
-             live_transcript_placeholder.empty()
              status_placeholder.empty()

-             # Force UI update to show persistent text area
-             st.rerun()
-
+     # Time update handling
+     current_time_js = """
+     <script>
+         window.addEventListener('message', (event) => {
+             if (event.data.type === 'currentTimeUpdate') {
+                 Streamlit.setComponentValue(event.data.time);
+             }
+         });
+     </script>
+     """
+     current_time = st.components.v1.html(current_time_js, height=0)
+
+     if current_time:
+         st.session_state.current_time = current_time
+         # Update highlighting when time changes
+         if st.session_state.utterances:
+             transcript_html = generate_transcript_html(
+                 st.session_state.utterances,
+                 current_time
+             )
+             transcript_placeholder.markdown(transcript_html, unsafe_allow_html=True)
+
+     # Summarization button
      if st.button("📝 Summarize"):
          if st.session_state.transcript:
              status_placeholder.text("Summarizing...")
-
-             # Temporary placeholder for incremental updates
              live_summary_placeholder = st.empty()
-             st.session_state.summary = ""  # Reset previous summary
+             st.session_state.summary = ""

              for accumulated_summary in summarize_transcript(
                  st.session_state.transcript, llm_model, prompt_input
@@ -129,9 +220,5 @@ with tab3:
                  st.session_state.summary = accumulated_summary
                  live_summary_placeholder.markdown(accumulated_summary)

-             # Clear temporary placeholder after completion
              live_summary_placeholder.empty()
-             status_placeholder.empty()
-
-             # Force UI update to show persistent summary
-             st.rerun()
+             status_placeholder.empty()
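For context, a standalone sketch (not from the commit) of the same click-to-seek idea. The commit wires the <audio> element and the clickable transcript together across frames with window.postMessage; the variant below keeps the player, the seekAudio() handler, and the clickable lines inside a single components.html call, so everything shares one document and no cross-frame messaging is needed. The file name "example.mp3" and the utterance list are placeholders.

    import base64
    import streamlit.components.v1 as components

    with open("example.mp3", "rb") as f:  # placeholder audio file
        b64 = base64.b64encode(f.read()).decode("utf-8")

    # Dummy (start_seconds, end_seconds, text) utterances
    utterances = [(0.0, 2.5, "First utterance"), (2.5, 6.0, "Second utterance")]

    # One clickable line per utterance; clicking seeks the shared player.
    rows = "".join(
        f'<div style="cursor:pointer" onclick="seekAudio({start})">'
        f"[{start:.1f}-{end:.1f}] {text}</div>"
        for start, end, text in utterances
    )

    components.html(f"""
        <audio id="audioPlayer" controls
               src="data:audio/mp3;base64,{b64}"></audio>
        <script>
            function seekAudio(t) {{
                const p = document.getElementById("audioPlayer");
                p.currentTime = t;  // jump to the utterance start
                p.play();
            }}
        </script>
        {rows}
    """, height=300)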