hmdliu committed on
Commit
4062687
·
verified ·
1 Parent(s): 5dc062f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +173 -67
app.py CHANGED
@@ -21,9 +21,8 @@ elevenlabs_client = None
21
  if ELEVENLABS_API_KEY:
22
  elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
23
 
24
-
25
  # ----------------------------
26
- # Prompt templates (placeholders)
27
  # ----------------------------
28
  PROMPT_TEMPLATE_1 = """\
29
  You are a speech-language assistant. Given the ORIGINAL script and the TRANSCRIPT (imperfect ASR),
@@ -49,17 +48,53 @@ Diagnosis notes on easy-to-stutter scenarios:
49
  ORIGINAL:
50
  {original_text}
51
 
52
- First convert the speech script into phonetic symbols (to guide the revision),
53
- then return the revised script.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  """
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  # ----------------------------
58
  # Helpers: STT & LLM calls
59
  # ----------------------------
60
  def transcribe_audio(record_path: str | None) -> str:
61
  """
62
- Prioritize uploaded file if both provided.
63
  Returns the transcribed text (or an error message).
64
  """
65
  audio_path = record_path
@@ -69,7 +104,6 @@ def transcribe_audio(record_path: str | None) -> str:
69
  if not ELEVENLABS_API_KEY:
70
  return "ELEVENLABS_API_KEY not set. Please configure your environment."
71
 
72
- # Read file as bytes -> BytesIO
73
  try:
74
  with open(audio_path, "rb") as f:
75
  audio_data = BytesIO(f.read())
@@ -84,12 +118,10 @@ def transcribe_audio(record_path: str | None) -> str:
84
  language_code="eng",
85
  diarize=True,
86
  )
87
- # Minimal output: just return text
88
  return transcription.text or ""
89
  except Exception as e:
90
  return f"Transcription error: {e}"
91
 
92
-
93
  def call_llm_302(model: str, prompt: str) -> str:
94
  """
95
  Minimal wrapper around 302.ai /v1/chat/completions.
@@ -117,29 +149,23 @@ def call_llm_302(model: str, prompt: str) -> str:
117
  conn.close()
118
 
119
  output = json.loads(raw)
120
- # Defensive parsing
121
  msg = output.get("choices", [{}])[0].get("message", {})
122
  text = msg.get("content") or msg.get("text") or str(msg)
123
  return text.strip()
124
  except Exception as e:
125
  return f"LLM API error: {e}"
126
 
127
-
128
  # ----------------------------
129
- # Button handlers
130
  # ----------------------------
131
  def on_click_transcribe(record_path):
132
- """
133
- Button 1: Transcribe audio -> fill Textbox1 (transcribed text, non-editable).
134
- """
135
  text = transcribe_audio(record_path)
136
  return gr.update(value=text)
137
 
138
-
139
- def on_click_analyze(selected_model, original_text, transcribed_text):
140
  """
141
- Button 2: Analyze easy-to-stutter words -> fill Textbox3 using PROMPT_TEMPLATE_1.
142
- Respects the selected LLM model.
143
  """
144
  prompt = PROMPT_TEMPLATE_1.format(
145
  original_text=original_text or "",
@@ -148,11 +174,33 @@ def on_click_analyze(selected_model, original_text, transcribed_text):
148
  analysis = call_llm_302(selected_model, prompt)
149
  return gr.update(value=analysis)
150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
- def on_click_rewrite(selected_model, original_text, transcribed_text, summary):
 
 
153
  """
154
- Button 3: Rewrite script -> always use PROMPT_TEMPLATE_2 (annotated version).
155
- Respects the selected LLM model.
156
  """
157
  prompt = PROMPT_TEMPLATE_2.format(
158
  notes=summary or "",
@@ -161,56 +209,114 @@ def on_click_rewrite(selected_model, original_text, transcribed_text, summary):
161
  revised = call_llm_302(selected_model, prompt)
162
  return gr.update(value=revised)
163
 
 
 
 
164
 
165
  # ----------------------------
166
- # Gradio UI
167
  # ----------------------------
168
  with gr.Blocks(title="DeStammerer: AI-assisted Speech Script Revision") as demo:
169
-
170
- # Row 1: [audio upload, audio record, button1]
171
- with gr.Row():
172
- audio_record = gr.Audio(label="Record Audio", sources=["microphone"], type="filepath")
173
- audio_download = gr.File(label="Audio Download", interactive=False)
174
- btn_transcribe = gr.Button("1) Transcribe")
175
-
176
- # Row 2: [textbox1 (ASR, readonly), textbox2 (original input), dropdown (model), button2]
177
- with gr.Row():
178
- txt_transcribed = gr.Textbox(label="Transcribed Text (ASR)", interactive=False, lines=6, placeholder="ASR output appears here.")
179
- txt_original = gr.Textbox(label="Original Script (input)", lines=6, placeholder="Paste your original script here.")
180
- model_selector = gr.Dropdown(
181
- choices=["gpt-4o-mini", "gpt-5"],
182
- value="gpt-4o-mini",
183
- label="LLM Model"
184
- )
185
- btn_analyze = gr.Button("2) Analyze")
186
-
187
- # Row 3: [textbox3 (LLM summary), textbox4 (revised script), button3]
188
- with gr.Row():
189
- txt_summary = gr.Textbox(label="LLM Summary: Easy-to-Stutter Words", lines=8, placeholder="Analysis will appear here.")
190
- txt_revised = gr.Textbox(label="Revised Script", lines=8, placeholder="Rewritten script will appear here.")
191
- btn_rewrite = gr.Button("3) Revise Script")
192
-
193
- # Wiring
194
- passthrough = lambda audio_path: audio_path
195
- audio_record.change(fn=passthrough, inputs=audio_record, outputs=audio_download)
196
-
197
- btn_transcribe.click(
198
- fn=on_click_transcribe,
199
- inputs=[audio_record],
200
- outputs=[txt_transcribed],
201
- )
202
-
203
- btn_analyze.click(
204
- fn=on_click_analyze,
205
- inputs=[model_selector, txt_original, txt_transcribed],
206
- outputs=[txt_summary],
207
- )
208
-
209
- btn_rewrite.click(
210
- fn=on_click_rewrite,
211
- inputs=[model_selector, txt_original, txt_transcribed, txt_summary],
212
- outputs=[txt_revised],
213
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
 
215
  if __name__ == "__main__":
216
  demo.launch()
 
21
  if ELEVENLABS_API_KEY:
22
  elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
23
 
 
24
  # ----------------------------
25
+ # Prompt templates
26
  # ----------------------------
27
  PROMPT_TEMPLATE_1 = """\
28
  You are a speech-language assistant. Given the ORIGINAL script and the TRANSCRIPT (imperfect ASR),
 
48
  ORIGINAL:
49
  {original_text}
50
 
51
+ Only return the revised full script, nothing else.
52
+ """
53
+
54
+ # New: IPA-only prompt (Baseline+IPA, step 1)
55
+ PROMPT_TEMPLATE_IPA = """\
56
+ Convert BOTH the ORIGINAL script and the ASR TRANSCRIPT into IPA with syllable boundaries.
57
+ Return ONLY the IPA text in a clearly labeled, compact format, such as:
58
+
59
+ ORIGINAL_IPA:
60
+ <ipa for original with syllable markers>
61
+
62
+ TRANSCRIPT_IPA:
63
+ <ipa for transcript with syllable markers>
64
+
65
+ Do not include any additional commentary.
66
+
67
+ ORIGINAL:
68
+ {original_text}
69
+
70
+ TRANSCRIPT:
71
+ {transcribed_text}
72
  """
73
 
74
+ # New: Diagnosis that uses IPA as extra signal (Baseline+IPA, step 2)
75
+ PROMPT_TEMPLATE_1_WITH_IPA = """\
76
+ You are a speech-language assistant. Given the ORIGINAL script, the TRANSCRIPT (imperfect ASR),
77
+ and their IPA annotations, list words/phrases likely to trigger stuttering (e.g., consonant clusters,
78
+ long multisyllabic words, difficult onsets). Output a short, structured summary and diagnosis for
79
+ easy-to-stutter scenarios.
80
+
81
+ ORIGINAL:
82
+ {original_text}
83
+
84
+ TRANSCRIPT:
85
+ {transcribed_text}
86
+
87
+ IPA_ANNOTATIONS:
88
+ {ipa_text}
89
+
90
+ Never give any suggestion. Only return a concise, principled diagnosis notes with easy-to-stutter scenarios.
91
+ """
92
 
93
  # ----------------------------
94
  # Helpers: STT & LLM calls
95
  # ----------------------------
96
  def transcribe_audio(record_path: str | None) -> str:
97
  """
 
98
  Returns the transcribed text (or an error message).
99
  """
100
  audio_path = record_path
 
104
  if not ELEVENLABS_API_KEY:
105
  return "ELEVENLABS_API_KEY not set. Please configure your environment."
106
 
 
107
  try:
108
  with open(audio_path, "rb") as f:
109
  audio_data = BytesIO(f.read())
 
118
  language_code="eng",
119
  diarize=True,
120
  )
 
121
  return transcription.text or ""
122
  except Exception as e:
123
  return f"Transcription error: {e}"
124
 
 
125
  def call_llm_302(model: str, prompt: str) -> str:
126
  """
127
  Minimal wrapper around 302.ai /v1/chat/completions.
 
149
  conn.close()
150
 
151
  output = json.loads(raw)
 
152
  msg = output.get("choices", [{}])[0].get("message", {})
153
  text = msg.get("content") or msg.get("text") or str(msg)
154
  return text.strip()
155
  except Exception as e:
156
  return f"LLM API error: {e}"
157
 
 
158
  # ----------------------------
159
+ # Button handlers (shared)
160
  # ----------------------------
161
  def on_click_transcribe(record_path):
162
+ """Row 1: Transcribe audio."""
 
 
163
  text = transcribe_audio(record_path)
164
  return gr.update(value=text)
165
 
166
+ def on_click_analyze_baseline(selected_model, original_text, transcribed_text):
 
167
  """
168
+ Baseline Tab: Single-call analysis using PROMPT_TEMPLATE_1.
 
169
  """
170
  prompt = PROMPT_TEMPLATE_1.format(
171
  original_text=original_text or "",
 
174
  analysis = call_llm_302(selected_model, prompt)
175
  return gr.update(value=analysis)
176
 
177
+ def on_click_analyze_ipa(selected_model, original_text, transcribed_text):
178
+ """
179
+ Baseline+IPA Tab: Two-step analysis.
180
+ 1) Generate IPA annotations.
181
+ 2) Use IPA + original + transcript for diagnosis.
182
+ Returns (ipa_box_update, summary_update)
183
+ """
184
+ # Step 1: IPA
185
+ ipa_prompt = PROMPT_TEMPLATE_IPA.format(
186
+ original_text=original_text or "",
187
+ transcribed_text=transcribed_text or "",
188
+ )
189
+ ipa_text = call_llm_302(selected_model, ipa_prompt)
190
+
191
+ # Step 2: Diagnosis with IPA
192
+ diag_prompt = PROMPT_TEMPLATE_1_WITH_IPA.format(
193
+ original_text=original_text or "",
194
+ transcribed_text=transcribed_text or "",
195
+ ipa_text=ipa_text or "",
196
+ )
197
+ summary = call_llm_302(selected_model, diag_prompt)
198
 
199
+ return gr.update(value=ipa_text), gr.update(value=summary)
200
+
201
+ def on_click_rewrite(selected_model, original_text, _transcribed_text_unused, summary):
202
  """
203
+ Row 3: Rewrite script (always annotated version) -> PROMPT_TEMPLATE_2.
 
204
  """
205
  prompt = PROMPT_TEMPLATE_2.format(
206
  notes=summary or "",
 
209
  revised = call_llm_302(selected_model, prompt)
210
  return gr.update(value=revised)
211
 
212
+ # Simple pass-through to mirror recorded file into a Gradio File component
213
+ def passthrough_file(path):
214
+ return path
215
 
216
  # ----------------------------
217
+ # Gradio UI (Tabs)
218
  # ----------------------------
219
  with gr.Blocks(title="DeStammerer: AI-assisted Speech Script Revision") as demo:
220
+ # gr.Markdown("### DeStammerer\nChoose a mode below. Both tabs share the same LLM selector semantics.")
221
+
222
+ with gr.Tabs():
223
+ # ------------------------ Tab 1: Baseline ------------------------
224
+ with gr.Tab("Baseline"):
225
+ # Row 1: Record + Download + Transcribe
226
+ with gr.Row():
227
+ audio_record_b = gr.Audio(label="Record Audio", sources=["microphone"], type="filepath")
228
+ audio_download_b = gr.File(label="Audio Download", interactive=False)
229
+ btn_transcribe_b = gr.Button("1) Transcribe")
230
+
231
+ # Row 2: ASR, Original, Model selector, Analyze
232
+ with gr.Row():
233
+ txt_transcribed_b = gr.Textbox(label="Transcribed Text (ASR)", interactive=False, lines=6, placeholder="ASR output appears here.")
234
+ txt_original_b = gr.Textbox(label="Original Script (input)", lines=6, placeholder="Paste your original script here.")
235
+ model_selector_b = gr.Dropdown(
236
+ choices=["gpt-4o-mini", "gpt-5"],
237
+ value="gpt-4o-mini",
238
+ label="LLM Model"
239
+ )
240
+ btn_analyze_b = gr.Button("2) Analyze")
241
+
242
+ # Row 3: Summary, Revised, Revise button
243
+ with gr.Row():
244
+ txt_summary_b = gr.Textbox(label="LLM Summary: Easy-to-Stutter Words", lines=8, placeholder="Analysis will appear here.")
245
+ txt_revised_b = gr.Textbox(label="Revised Script", lines=8, placeholder="Rewritten script will appear here.")
246
+ btn_rewrite_b = gr.Button("3) Revise Script")
247
+
248
+ # Row 4: Post-hoc audio record and download
249
+ with gr.Row():
250
+ posthoc_record_b = gr.Audio(label="Post-hoc Record Audio", sources=["microphone"], type="filepath")
251
+ posthoc_download_b = gr.File(label="Post-hoc Audio Download", interactive=False)
252
+
253
+ # Wiring (Baseline)
254
+ audio_record_b.change(fn=passthrough_file, inputs=audio_record_b, outputs=audio_download_b)
255
+ btn_transcribe_b.click(fn=on_click_transcribe, inputs=[audio_record_b], outputs=[txt_transcribed_b])
256
+ btn_analyze_b.click(
257
+ fn=on_click_analyze_baseline,
258
+ inputs=[model_selector_b, txt_original_b, txt_transcribed_b],
259
+ outputs=[txt_summary_b],
260
+ )
261
+ btn_rewrite_b.click(
262
+ fn=on_click_rewrite,
263
+ inputs=[model_selector_b, txt_original_b, txt_transcribed_b, txt_summary_b],
264
+ outputs=[txt_revised_b],
265
+ )
266
+ posthoc_record_b.change(fn=passthrough_file, inputs=posthoc_record_b, outputs=posthoc_download_b)
267
+
268
+ # -------------------- Tab 2: Baseline+IPA --------------------
269
+ with gr.Tab("Baseline+IPA"):
270
+ # Row 1: Record + Download + Transcribe
271
+ with gr.Row():
272
+ audio_record_i = gr.Audio(label="Record Audio", sources=["microphone"], type="filepath")
273
+ audio_download_i = gr.File(label="Audio Download", interactive=False)
274
+ btn_transcribe_i = gr.Button("1) Transcribe")
275
+
276
+ # Row 2: ASR, Original, IPA box, Model selector, Analyze
277
+ with gr.Row():
278
+ txt_transcribed_i = gr.Textbox(label="Transcribed Text (ASR)", interactive=False, lines=6, placeholder="ASR output appears here.")
279
+ txt_original_i = gr.Textbox(label="Original Script (input)", lines=6, placeholder="Paste your original script here.")
280
+ txt_ipa_i = gr.Textbox(label="IPA Annotations (LLM Output)", interactive=False, lines=6, placeholder="IPA for Original & Transcript will appear here.")
281
+ model_selector_i = gr.Dropdown(
282
+ choices=["gpt-4o-mini", "gpt-5"],
283
+ value="gpt-4o-mini",
284
+ label="LLM Model"
285
+ )
286
+ btn_analyze_i = gr.Button("2) Analyze (IPA → Diagnosis)")
287
+
288
+ # Row 3: Summary, Revised, Revise button
289
+ with gr.Row():
290
+ txt_summary_i = gr.Textbox(label="LLM Summary: Easy-to-Stutter Words (IPA-aware)", lines=8, placeholder="Analysis will appear here.")
291
+ txt_revised_i = gr.Textbox(label="Revised Script", lines=8, placeholder="Rewritten script will appear here.")
292
+ btn_rewrite_i = gr.Button("3) Revise Script")
293
+
294
+ # Row 4: Post-hoc audio record and download
295
+ with gr.Row():
296
+ posthoc_record_i = gr.Audio(label="Post-hoc Record Audio", sources=["microphone"], type="filepath")
297
+ posthoc_download_i = gr.File(label="Post-hoc Audio Download", interactive=False)
298
+
299
+ # Wiring (Baseline+IPA)
300
+ audio_record_i.change(fn=passthrough_file, inputs=audio_record_i, outputs=audio_download_i)
301
+ btn_transcribe_i.click(fn=on_click_transcribe, inputs=[audio_record_i], outputs=[txt_transcribed_i])
302
+
303
+ # Analyze in two steps: IPA then Diagnosis
304
+ def analyze_ipa_pipeline(model, original_text, transcribed_text):
305
+ ipa_update, summary_update = on_click_analyze_ipa(model, original_text, transcribed_text)
306
+ return ipa_update, summary_update
307
+
308
+ btn_analyze_i.click(
309
+ fn=analyze_ipa_pipeline,
310
+ inputs=[model_selector_i, txt_original_i, txt_transcribed_i],
311
+ outputs=[txt_ipa_i, txt_summary_i],
312
+ )
313
+
314
+ btn_rewrite_i.click(
315
+ fn=on_click_rewrite,
316
+ inputs=[model_selector_i, txt_original_i, txt_transcribed_i, txt_summary_i],
317
+ outputs=[txt_revised_i],
318
+ )
319
+ posthoc_record_i.change(fn=passthrough_file, inputs=posthoc_record_i, outputs=posthoc_download_i)
320
 
321
  if __name__ == "__main__":
322
  demo.launch()