hmdliu committed on
Commit
4062687
·
verified ·
1 Parent(s): 5dc062f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +173 -67
app.py CHANGED
@@ -21,9 +21,8 @@ elevenlabs_client = None
21
  if ELEVENLABS_API_KEY:
22
  elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
23
 
24
-
25
  # ----------------------------
26
- # Prompt templates (placeholders)
27
  # ----------------------------
28
  PROMPT_TEMPLATE_1 = """\
29
  You are a speech-language assistant. Given the ORIGINAL script and the TRANSCRIPT (imperfect ASR),
@@ -49,17 +48,53 @@ Diagnosis notes on easy-to-stutter scenarios:
49
  ORIGINAL:
50
  {original_text}
51
 
52
- First convert the speech script into phonetic symbols (to guide the revision),
53
- then return the revised script.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  """
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  # ----------------------------
58
  # Helpers: STT & LLM calls
59
  # ----------------------------
60
  def transcribe_audio(record_path: str | None) -> str:
61
  """
62
- Prioritize uploaded file if both provided.
63
  Returns the transcribed text (or an error message).
64
  """
65
  audio_path = record_path
@@ -69,7 +104,6 @@ def transcribe_audio(record_path: str | None) -> str:
69
  if not ELEVENLABS_API_KEY:
70
  return "ELEVENLABS_API_KEY not set. Please configure your environment."
71
 
72
- # Read file as bytes -> BytesIO
73
  try:
74
  with open(audio_path, "rb") as f:
75
  audio_data = BytesIO(f.read())
@@ -84,12 +118,10 @@ def transcribe_audio(record_path: str | None) -> str:
84
  language_code="eng",
85
  diarize=True,
86
  )
87
- # Minimal output: just return text
88
  return transcription.text or ""
89
  except Exception as e:
90
  return f"Transcription error: {e}"
91
 
92
-
93
  def call_llm_302(model: str, prompt: str) -> str:
94
  """
95
  Minimal wrapper around 302.ai /v1/chat/completions.
@@ -117,29 +149,23 @@ def call_llm_302(model: str, prompt: str) -> str:
117
  conn.close()
118
 
119
  output = json.loads(raw)
120
- # Defensive parsing
121
  msg = output.get("choices", [{}])[0].get("message", {})
122
  text = msg.get("content") or msg.get("text") or str(msg)
123
  return text.strip()
124
  except Exception as e:
125
  return f"LLM API error: {e}"
126
 
127
-
128
  # ----------------------------
129
- # Button handlers
130
  # ----------------------------
131
  def on_click_transcribe(record_path):
132
- """
133
- Button 1: Transcribe audio -> fill Textbox1 (transcribed text, non-editable).
134
- """
135
  text = transcribe_audio(record_path)
136
  return gr.update(value=text)
137
 
138
-
139
- def on_click_analyze(selected_model, original_text, transcribed_text):
140
  """
141
- Button 2: Analyze easy-to-stutter words -> fill Textbox3 using PROMPT_TEMPLATE_1.
142
- Respects the selected LLM model.
143
  """
144
  prompt = PROMPT_TEMPLATE_1.format(
145
  original_text=original_text or "",
@@ -148,11 +174,33 @@ def on_click_analyze(selected_model, original_text, transcribed_text):
148
  analysis = call_llm_302(selected_model, prompt)
149
  return gr.update(value=analysis)
150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
- def on_click_rewrite(selected_model, original_text, transcribed_text, summary):
 
 
153
  """
154
- Button 3: Rewrite script -> always use PROMPT_TEMPLATE_2 (annotated version).
155
- Respects the selected LLM model.
156
  """
157
  prompt = PROMPT_TEMPLATE_2.format(
158
  notes=summary or "",
@@ -161,56 +209,114 @@ def on_click_rewrite(selected_model, original_text, transcribed_text, summary):
161
  revised = call_llm_302(selected_model, prompt)
162
  return gr.update(value=revised)
163
 
 
 
 
164
 
165
  # ----------------------------
166
- # Gradio UI
167
  # ----------------------------
168
  with gr.Blocks(title="DeStammerer: AI-assisted Speech Script Revision") as demo:
169
-
170
- # Row 1: [audio upload, audio record, button1]
171
- with gr.Row():
172
- audio_record = gr.Audio(label="Record Audio", sources=["microphone"], type="filepath")
173
- audio_download = gr.File(label="Audio Download", interactive=False)
174
- btn_transcribe = gr.Button("1) Transcribe")
175
-
176
- # Row 2: [textbox1 (ASR, readonly), textbox2 (original input), dropdown (model), button2]
177
- with gr.Row():
178
- txt_transcribed = gr.Textbox(label="Transcribed Text (ASR)", interactive=False, lines=6, placeholder="ASR output appears here.")
179
- txt_original = gr.Textbox(label="Original Script (input)", lines=6, placeholder="Paste your original script here.")
180
- model_selector = gr.Dropdown(
181
- choices=["gpt-4o-mini", "gpt-5"],
182
- value="gpt-4o-mini",
183
- label="LLM Model"
184
- )
185
- btn_analyze = gr.Button("2) Analyze")
186
-
187
- # Row 3: [textbox3 (LLM summary), textbox4 (revised script), button3]
188
- with gr.Row():
189
- txt_summary = gr.Textbox(label="LLM Summary: Easy-to-Stutter Words", lines=8, placeholder="Analysis will appear here.")
190
- txt_revised = gr.Textbox(label="Revised Script", lines=8, placeholder="Rewritten script will appear here.")
191
- btn_rewrite = gr.Button("3) Revise Script")
192
-
193
- # Wiring
194
- passthrough = lambda audio_path: audio_path
195
- audio_record.change(fn=passthrough, inputs=audio_record, outputs=audio_download)
196
-
197
- btn_transcribe.click(
198
- fn=on_click_transcribe,
199
- inputs=[audio_record],
200
- outputs=[txt_transcribed],
201
- )
202
-
203
- btn_analyze.click(
204
- fn=on_click_analyze,
205
- inputs=[model_selector, txt_original, txt_transcribed],
206
- outputs=[txt_summary],
207
- )
208
-
209
- btn_rewrite.click(
210
- fn=on_click_rewrite,
211
- inputs=[model_selector, txt_original, txt_transcribed, txt_summary],
212
- outputs=[txt_revised],
213
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
 
215
  if __name__ == "__main__":
216
  demo.launch()
 
21
  if ELEVENLABS_API_KEY:
22
  elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
23
 
 
24
  # ----------------------------
25
+ # Prompt templates
26
  # ----------------------------
27
  PROMPT_TEMPLATE_1 = """\
28
  You are a speech-language assistant. Given the ORIGINAL script and the TRANSCRIPT (imperfect ASR),
 
48
  ORIGINAL:
49
  {original_text}
50
 
51
+ Only return the revised full script, nothing else.
52
+ """
53
+
54
+ # New: IPA-only prompt (Baseline+IPA, step 1)
55
+ PROMPT_TEMPLATE_IPA = """\
56
+ Convert BOTH the ORIGINAL script and the ASR TRANSCRIPT into IPA with syllable boundaries.
57
+ Return ONLY the IPA text in a clearly labeled, compact format, such as:
58
+
59
+ ORIGINAL_IPA:
60
+ <ipa for original with syllable markers>
61
+
62
+ TRANSCRIPT_IPA:
63
+ <ipa for transcript with syllable markers>
64
+
65
+ Do not include any additional commentary.
66
+
67
+ ORIGINAL:
68
+ {original_text}
69
+
70
+ TRANSCRIPT:
71
+ {transcribed_text}
72
  """
73
 
74
+ # New: Diagnosis that uses IPA as extra signal (Baseline+IPA, step 2)
75
+ PROMPT_TEMPLATE_1_WITH_IPA = """\
76
+ You are a speech-language assistant. Given the ORIGINAL script, the TRANSCRIPT (imperfect ASR),
77
+ and their IPA annotations, list words/phrases likely to trigger stuttering (e.g., consonant clusters,
78
+ long multisyllabic words, difficult onsets). Output a short, structured summary and diagnosis for
79
+ easy-to-stutter scenarios.
80
+
81
+ ORIGINAL:
82
+ {original_text}
83
+
84
+ TRANSCRIPT:
85
+ {transcribed_text}
86
+
87
+ IPA_ANNOTATIONS:
88
+ {ipa_text}
89
+
90
+ Never give any suggestion. Only return a concise, principled diagnosis notes with easy-to-stutter scenarios.
91
+ """
92
 
93
  # ----------------------------
94
  # Helpers: STT & LLM calls
95
  # ----------------------------
96
  def transcribe_audio(record_path: str | None) -> str:
97
  """
 
98
  Returns the transcribed text (or an error message).
99
  """
100
  audio_path = record_path
 
104
  if not ELEVENLABS_API_KEY:
105
  return "ELEVENLABS_API_KEY not set. Please configure your environment."
106
 
 
107
  try:
108
  with open(audio_path, "rb") as f:
109
  audio_data = BytesIO(f.read())
 
118
  language_code="eng",
119
  diarize=True,
120
  )
 
121
  return transcription.text or ""
122
  except Exception as e:
123
  return f"Transcription error: {e}"
124
 
 
125
  def call_llm_302(model: str, prompt: str) -> str:
126
  """
127
  Minimal wrapper around 302.ai /v1/chat/completions.
 
149
  conn.close()
150
 
151
  output = json.loads(raw)
 
152
  msg = output.get("choices", [{}])[0].get("message", {})
153
  text = msg.get("content") or msg.get("text") or str(msg)
154
  return text.strip()
155
  except Exception as e:
156
  return f"LLM API error: {e}"
157
 
 
158
  # ----------------------------
159
+ # Button handlers (shared)
160
  # ----------------------------
161
  def on_click_transcribe(record_path):
162
+ """Row 1: Transcribe audio."""
 
 
163
  text = transcribe_audio(record_path)
164
  return gr.update(value=text)
165
 
166
+ def on_click_analyze_baseline(selected_model, original_text, transcribed_text):
 
167
  """
168
+ Baseline Tab: Single-call analysis using PROMPT_TEMPLATE_1.
 
169
  """
170
  prompt = PROMPT_TEMPLATE_1.format(
171
  original_text=original_text or "",
 
174
  analysis = call_llm_302(selected_model, prompt)
175
  return gr.update(value=analysis)
176
 
177
+ def on_click_analyze_ipa(selected_model, original_text, transcribed_text):
178
+ """
179
+ Baseline+IPA Tab: Two-step analysis.
180
+ 1) Generate IPA annotations.
181
+ 2) Use IPA + original + transcript for diagnosis.
182
+ Returns (ipa_box_update, summary_update)
183
+ """
184
+ # Step 1: IPA
185
+ ipa_prompt = PROMPT_TEMPLATE_IPA.format(
186
+ original_text=original_text or "",
187
+ transcribed_text=transcribed_text or "",
188
+ )
189
+ ipa_text = call_llm_302(selected_model, ipa_prompt)
190
+
191
+ # Step 2: Diagnosis with IPA
192
+ diag_prompt = PROMPT_TEMPLATE_1_WITH_IPA.format(
193
+ original_text=original_text or "",
194
+ transcribed_text=transcribed_text or "",
195
+ ipa_text=ipa_text or "",
196
+ )
197
+ summary = call_llm_302(selected_model, diag_prompt)
198
 
199
+ return gr.update(value=ipa_text), gr.update(value=summary)
200
+
201
+ def on_click_rewrite(selected_model, original_text, _transcribed_text_unused, summary):
202
  """
203
+ Row 3: Rewrite script (always annotated version) -> PROMPT_TEMPLATE_2.
 
204
  """
205
  prompt = PROMPT_TEMPLATE_2.format(
206
  notes=summary or "",
 
209
  revised = call_llm_302(selected_model, prompt)
210
  return gr.update(value=revised)
211
 
212
+ # Simple pass-through to mirror recorded file into a Gradio File component
213
+ def passthrough_file(path):
214
+ return path
215
 
216
  # ----------------------------
217
+ # Gradio UI (Tabs)
218
  # ----------------------------
219
  with gr.Blocks(title="DeStammerer: AI-assisted Speech Script Revision") as demo:
220
+ # gr.Markdown("### DeStammerer\nChoose a mode below. Both tabs share the same LLM selector semantics.")
221
+
222
+ with gr.Tabs():
223
+ # ------------------------ Tab 1: Baseline ------------------------
224
+ with gr.Tab("Baseline"):
225
+ # Row 1: Record + Download + Transcribe
226
+ with gr.Row():
227
+ audio_record_b = gr.Audio(label="Record Audio", sources=["microphone"], type="filepath")
228
+ audio_download_b = gr.File(label="Audio Download", interactive=False)
229
+ btn_transcribe_b = gr.Button("1) Transcribe")
230
+
231
+ # Row 2: ASR, Original, Model selector, Analyze
232
+ with gr.Row():
233
+ txt_transcribed_b = gr.Textbox(label="Transcribed Text (ASR)", interactive=False, lines=6, placeholder="ASR output appears here.")
234
+ txt_original_b = gr.Textbox(label="Original Script (input)", lines=6, placeholder="Paste your original script here.")
235
+ model_selector_b = gr.Dropdown(
236
+ choices=["gpt-4o-mini", "gpt-5"],
237
+ value="gpt-4o-mini",
238
+ label="LLM Model"
239
+ )
240
+ btn_analyze_b = gr.Button("2) Analyze")
241
+
242
+ # Row 3: Summary, Revised, Revise button
243
+ with gr.Row():
244
+ txt_summary_b = gr.Textbox(label="LLM Summary: Easy-to-Stutter Words", lines=8, placeholder="Analysis will appear here.")
245
+ txt_revised_b = gr.Textbox(label="Revised Script", lines=8, placeholder="Rewritten script will appear here.")
246
+ btn_rewrite_b = gr.Button("3) Revise Script")
247
+
248
+ # Row 4: Post-hoc audio record and download
249
+ with gr.Row():
250
+ posthoc_record_b = gr.Audio(label="Post-hoc Record Audio", sources=["microphone"], type="filepath")
251
+ posthoc_download_b = gr.File(label="Post-hoc Audio Download", interactive=False)
252
+
253
+ # Wiring (Baseline)
254
+ audio_record_b.change(fn=passthrough_file, inputs=audio_record_b, outputs=audio_download_b)
255
+ btn_transcribe_b.click(fn=on_click_transcribe, inputs=[audio_record_b], outputs=[txt_transcribed_b])
256
+ btn_analyze_b.click(
257
+ fn=on_click_analyze_baseline,
258
+ inputs=[model_selector_b, txt_original_b, txt_transcribed_b],
259
+ outputs=[txt_summary_b],
260
+ )
261
+ btn_rewrite_b.click(
262
+ fn=on_click_rewrite,
263
+ inputs=[model_selector_b, txt_original_b, txt_transcribed_b, txt_summary_b],
264
+ outputs=[txt_revised_b],
265
+ )
266
+ posthoc_record_b.change(fn=passthrough_file, inputs=posthoc_record_b, outputs=posthoc_download_b)
267
+
268
+ # -------------------- Tab 2: Baseline+IPA --------------------
269
+ with gr.Tab("Baseline+IPA"):
270
+ # Row 1: Record + Download + Transcribe
271
+ with gr.Row():
272
+ audio_record_i = gr.Audio(label="Record Audio", sources=["microphone"], type="filepath")
273
+ audio_download_i = gr.File(label="Audio Download", interactive=False)
274
+ btn_transcribe_i = gr.Button("1) Transcribe")
275
+
276
+ # Row 2: ASR, Original, IPA box, Model selector, Analyze
277
+ with gr.Row():
278
+ txt_transcribed_i = gr.Textbox(label="Transcribed Text (ASR)", interactive=False, lines=6, placeholder="ASR output appears here.")
279
+ txt_original_i = gr.Textbox(label="Original Script (input)", lines=6, placeholder="Paste your original script here.")
280
+ txt_ipa_i = gr.Textbox(label="IPA Annotations (LLM Output)", interactive=False, lines=6, placeholder="IPA for Original & Transcript will appear here.")
281
+ model_selector_i = gr.Dropdown(
282
+ choices=["gpt-4o-mini", "gpt-5"],
283
+ value="gpt-4o-mini",
284
+ label="LLM Model"
285
+ )
286
+ btn_analyze_i = gr.Button("2) Analyze (IPA → Diagnosis)")
287
+
288
+ # Row 3: Summary, Revised, Revise button
289
+ with gr.Row():
290
+ txt_summary_i = gr.Textbox(label="LLM Summary: Easy-to-Stutter Words (IPA-aware)", lines=8, placeholder="Analysis will appear here.")
291
+ txt_revised_i = gr.Textbox(label="Revised Script", lines=8, placeholder="Rewritten script will appear here.")
292
+ btn_rewrite_i = gr.Button("3) Revise Script")
293
+
294
+ # Row 4: Post-hoc audio record and download
295
+ with gr.Row():
296
+ posthoc_record_i = gr.Audio(label="Post-hoc Record Audio", sources=["microphone"], type="filepath")
297
+ posthoc_download_i = gr.File(label="Post-hoc Audio Download", interactive=False)
298
+
299
+ # Wiring (Baseline+IPA)
300
+ audio_record_i.change(fn=passthrough_file, inputs=audio_record_i, outputs=audio_download_i)
301
+ btn_transcribe_i.click(fn=on_click_transcribe, inputs=[audio_record_i], outputs=[txt_transcribed_i])
302
+
303
+ # Analyze in two steps: IPA then Diagnosis
304
+ def analyze_ipa_pipeline(model, original_text, transcribed_text):
305
+ ipa_update, summary_update = on_click_analyze_ipa(model, original_text, transcribed_text)
306
+ return ipa_update, summary_update
307
+
308
+ btn_analyze_i.click(
309
+ fn=analyze_ipa_pipeline,
310
+ inputs=[model_selector_i, txt_original_i, txt_transcribed_i],
311
+ outputs=[txt_ipa_i, txt_summary_i],
312
+ )
313
+
314
+ btn_rewrite_i.click(
315
+ fn=on_click_rewrite,
316
+ inputs=[model_selector_i, txt_original_i, txt_transcribed_i, txt_summary_i],
317
+ outputs=[txt_revised_i],
318
+ )
319
+ posthoc_record_i.change(fn=passthrough_file, inputs=posthoc_record_i, outputs=posthoc_download_i)
320
 
321
  if __name__ == "__main__":
322
  demo.launch()