|
|
|
|
|
import os |
|
|
import json |
|
|
import http.client |
|
|
from io import BytesIO |
|
|
|
|
|
import gradio as gr |
|
|
from dotenv import load_dotenv |
|
|
from elevenlabs.client import ElevenLabs |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load environment variables from a local .env file (no-op if absent).
load_dotenv()

# API credentials; an empty string means "not configured".
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
API_KEY_302 = os.getenv("API_KEY_302", "")

# Build the ElevenLabs client only when a key is present; downstream
# code treats None as "speech-to-text unavailable".
elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY) if ELEVENLABS_API_KEY else None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Prompt for the baseline diagnosis step: given the original script and its
# (imperfect) ASR transcript, the LLM lists likely stutter triggers.
# Placeholders: {original_text}, {transcribed_text}.
PROMPT_TEMPLATE_1 = """\
You are a speech-language assistant. Given the ORIGINAL script and the TRANSCRIPT (imperfect ASR),
list words/phrases likely to trigger stuttering (e.g., consonant clusters, long multisyllabic words).
Output a short, structured summary and diagnosis for easy-to-stutter scenarios.

ORIGINAL:
{original_text}

TRANSCRIPT:
{transcribed_text}

Never give any suggestion. Only return a concise, principled diagnosis notes with easy-to-stutter scenarios.
"""

# Prompt for the rewrite step: turn the diagnosis notes plus the original
# script into a lower-stutter-risk revision.
# Placeholders: {notes}, {original_text}.
PROMPT_TEMPLATE_2 = """\
You are a speech-language assistant. Rewrite the ORIGINAL script to reduce stuttering risk, while
preserving meaning and tone. Prefer simpler synonyms, shorter clauses, easier onsets. Keep it concise.

Diagnosis notes on easy-to-stutter scenarios:
{notes}

ORIGINAL:
{original_text}

Only return the revised full script, nothing else.
"""

# Prompt asking the LLM to emit IPA (with syllable boundaries) for both the
# script and the transcript; output is fed into the IPA-aware diagnosis.
# Placeholders: {original_text}, {transcribed_text}.
PROMPT_TEMPLATE_IPA = """\
Convert BOTH the ORIGINAL script and the ASR TRANSCRIPT into IPA with syllable boundaries.
Return ONLY the IPA text in a clearly labeled, compact format, such as:

ORIGINAL_IPA:
<ipa for original with syllable markers>

TRANSCRIPT_IPA:
<ipa for transcript with syllable markers>

Do not include any additional commentary.

ORIGINAL:
{original_text}

TRANSCRIPT:
{transcribed_text}
"""

# IPA-aware variant of the diagnosis prompt: same contract as
# PROMPT_TEMPLATE_1 but with the IPA annotations as extra evidence.
# Placeholders: {original_text}, {transcribed_text}, {ipa_text}.
PROMPT_TEMPLATE_1_WITH_IPA = """\
You are a speech-language assistant. Given the ORIGINAL script, the TRANSCRIPT (imperfect ASR),
and their IPA annotations, list words/phrases likely to trigger stuttering (e.g., consonant clusters,
long multisyllabic words, difficult onsets). Output a short, structured summary and diagnosis for
easy-to-stutter scenarios.

ORIGINAL:
{original_text}

TRANSCRIPT:
{transcribed_text}

IPA_ANNOTATIONS:
{ipa_text}

Never give any suggestion. Only return a concise, principled diagnosis notes with easy-to-stutter scenarios.
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def transcribe_audio(record_path: str | None) -> str: |
|
|
""" |
|
|
Returns the transcribed text (or an error message). |
|
|
""" |
|
|
audio_path = record_path |
|
|
if not audio_path: |
|
|
return "No audio provided. Please upload or record audio." |
|
|
|
|
|
if not ELEVENLABS_API_KEY: |
|
|
return "ELEVENLABS_API_KEY not set. Please configure your environment." |
|
|
|
|
|
try: |
|
|
with open(audio_path, "rb") as f: |
|
|
audio_data = BytesIO(f.read()) |
|
|
except Exception as e: |
|
|
return f"Failed to read audio: {e}" |
|
|
|
|
|
try: |
|
|
transcription = elevenlabs_client.speech_to_text.convert( |
|
|
file=audio_data, |
|
|
model_id="scribe_v1", |
|
|
tag_audio_events=True, |
|
|
language_code="eng", |
|
|
diarize=True, |
|
|
) |
|
|
return transcription.text or "" |
|
|
except Exception as e: |
|
|
return f"Transcription error: {e}" |
|
|
|
|
|
def call_llm_302(model: str, prompt: str) -> str:
    """Minimal wrapper around the 302.ai ``/v1/chat/completions`` endpoint.

    Args:
        model: Model identifier, e.g. ``"gpt-4o-mini"``.
        prompt: User prompt, sent as a single chat message.

    Returns:
        The assistant's reply text, or an error string (errors are
        returned rather than raised so the UI can display them).
    """
    if not API_KEY_302:
        return "API_KEY_302 not set. Please configure your environment."

    conn = None
    try:
        # Timeout so a hung request cannot block the UI thread forever.
        conn = http.client.HTTPSConnection("api.302.ai", timeout=30)
        payload = json.dumps({
            "model": model,
            "messages": [
                {"role": "user", "content": prompt}
            ],
        })
        headers = {
            "Accept": "application/json",
            "Authorization": f"Bearer {API_KEY_302}",
            "Content-Type": "application/json",
        }
        conn.request("POST", "/v1/chat/completions", payload, headers)
        res = conn.getresponse()
        raw = res.read().decode("utf-8")

        output = json.loads(raw)
        # Defensive parse of the OpenAI-style response: prefer
        # message.content, fall back to message.text, else stringify.
        msg = output.get("choices", [{}])[0].get("message", {})
        text = msg.get("content") or msg.get("text") or str(msg)
        return text.strip()
    except Exception as e:
        return f"LLM API error: {e}"
    finally:
        # Always release the socket, even when the request or parse fails
        # (the original leaked the connection on any exception).
        if conn is not None:
            conn.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def on_click_transcribe(record_path):
    """Row 1: run ASR on the recorded audio and push the text to the UI."""
    return gr.update(value=transcribe_audio(record_path))
|
|
|
|
|
def on_click_analyze_baseline(selected_model, original_text, transcribed_text):
    """Baseline tab: one-shot stutter-risk diagnosis via PROMPT_TEMPLATE_1."""
    filled_prompt = PROMPT_TEMPLATE_1.format(
        original_text=original_text or "",
        transcribed_text=transcribed_text or "",
    )
    return gr.update(value=call_llm_302(selected_model, filled_prompt))
|
|
|
|
|
def on_click_analyze_ipa(selected_model, original_text, transcribed_text):
    """Baseline+IPA tab: two chained LLM calls.

    1) Generate IPA annotations for script and transcript.
    2) Diagnose stutter risk using the IPA plus both texts.

    Returns:
        (ipa_box_update, summary_update) gradio updates.
    """
    script = original_text or ""
    transcript = transcribed_text or ""

    # Step 1: IPA annotations.
    ipa_text = call_llm_302(
        selected_model,
        PROMPT_TEMPLATE_IPA.format(original_text=script, transcribed_text=transcript),
    )

    # Step 2: IPA-aware diagnosis.
    summary = call_llm_302(
        selected_model,
        PROMPT_TEMPLATE_1_WITH_IPA.format(
            original_text=script,
            transcribed_text=transcript,
            ipa_text=ipa_text or "",
        ),
    )

    return gr.update(value=ipa_text), gr.update(value=summary)
|
|
|
|
|
def on_click_rewrite(selected_model, original_text, _transcribed_text_unused, summary):
    """Row 3: rewrite the script to lower stuttering risk (PROMPT_TEMPLATE_2)."""
    rewrite_prompt = PROMPT_TEMPLATE_2.format(
        notes=summary or "",
        original_text=original_text or "",
    )
    return gr.update(value=call_llm_302(selected_model, rewrite_prompt))
|
|
|
|
|
|
|
|
def passthrough_file(path):
    """Return *path* unchanged; mirrors a recording into a gr.File widget."""
    mirrored = path
    return mirrored
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI. Two tabs share the same three-step workflow:
#   1) record & transcribe   2) analyze stutter risk   3) rewrite the script
# The "Baseline+IPA" tab inserts an IPA-annotation step before diagnosis.
# ---------------------------------------------------------------------------
with gr.Blocks(title="DeStammerer: AI-assisted Speech Script Revision") as demo:

    with gr.Tabs():

        with gr.Tab("Baseline"):

            # Step 1: record audio, expose it for download, transcribe.
            with gr.Row():
                audio_record_b = gr.Audio(label="Record Audio", sources=["microphone"], type="filepath")
                audio_download_b = gr.File(label="Audio Download", interactive=False)
                btn_transcribe_b = gr.Button("1) Transcribe")

            # Step 2: ASR output, original script input, model choice.
            with gr.Row():
                txt_transcribed_b = gr.Textbox(label="Transcribed Text (ASR)", interactive=False, lines=6, placeholder="ASR output appears here.")
                txt_original_b = gr.Textbox(label="Original Script (input)", lines=6, placeholder="Paste your original script here.")
                model_selector_b = gr.Dropdown(
                    choices=["gpt-4o-mini", "gpt-5"],
                    value="gpt-4o-mini",
                    label="LLM Model"
                )
                btn_analyze_b = gr.Button("2) Analyze")

            # Step 3: diagnosis summary and the rewritten script.
            with gr.Row():
                txt_summary_b = gr.Textbox(label="LLM Summary: Easy-to-Stutter Words", lines=8, placeholder="Analysis will appear here.")
                txt_revised_b = gr.Textbox(label="Revised Script", lines=8, placeholder="Rewritten script will appear here.")
                btn_rewrite_b = gr.Button("3) Revise Script")

            # Optional post-hoc recording of the revised script.
            with gr.Row():
                posthoc_record_b = gr.Audio(label="Post-hoc Record Audio", sources=["microphone"], type="filepath")
                posthoc_download_b = gr.File(label="Post-hoc Audio Download", interactive=False)

            # Wiring: mirror recordings into the download widgets and
            # attach each button to its handler.
            audio_record_b.change(fn=passthrough_file, inputs=audio_record_b, outputs=audio_download_b)
            btn_transcribe_b.click(fn=on_click_transcribe, inputs=[audio_record_b], outputs=[txt_transcribed_b])
            btn_analyze_b.click(
                fn=on_click_analyze_baseline,
                inputs=[model_selector_b, txt_original_b, txt_transcribed_b],
                outputs=[txt_summary_b],
            )
            btn_rewrite_b.click(
                fn=on_click_rewrite,
                inputs=[model_selector_b, txt_original_b, txt_transcribed_b, txt_summary_b],
                outputs=[txt_revised_b],
            )
            posthoc_record_b.change(fn=passthrough_file, inputs=posthoc_record_b, outputs=posthoc_download_b)

        with gr.Tab("Baseline+IPA"):

            # Step 1: same record/download/transcribe row as Baseline.
            with gr.Row():
                audio_record_i = gr.Audio(label="Record Audio", sources=["microphone"], type="filepath")
                audio_download_i = gr.File(label="Audio Download", interactive=False)
                btn_transcribe_i = gr.Button("1) Transcribe")

            # Step 2: adds a read-only box showing the LLM-generated IPA.
            with gr.Row():
                txt_transcribed_i = gr.Textbox(label="Transcribed Text (ASR)", interactive=False, lines=6, placeholder="ASR output appears here.")
                txt_original_i = gr.Textbox(label="Original Script (input)", lines=6, placeholder="Paste your original script here.")
                txt_ipa_i = gr.Textbox(label="IPA Annotations (LLM Output)", interactive=False, lines=6, placeholder="IPA for Original & Transcript will appear here.")
                model_selector_i = gr.Dropdown(
                    choices=["gpt-4o-mini", "gpt-5"],
                    value="gpt-4o-mini",
                    label="LLM Model"
                )
                btn_analyze_i = gr.Button("2) Analyze (IPA → Diagnosis)")

            # Step 3: IPA-aware diagnosis and rewrite outputs.
            with gr.Row():
                txt_summary_i = gr.Textbox(label="LLM Summary: Easy-to-Stutter Words (IPA-aware)", lines=8, placeholder="Analysis will appear here.")
                txt_revised_i = gr.Textbox(label="Revised Script", lines=8, placeholder="Rewritten script will appear here.")
                btn_rewrite_i = gr.Button("3) Revise Script")

            # Optional post-hoc recording of the revised script.
            with gr.Row():
                posthoc_record_i = gr.Audio(label="Post-hoc Record Audio", sources=["microphone"], type="filepath")
                posthoc_download_i = gr.File(label="Post-hoc Audio Download", interactive=False)

            audio_record_i.change(fn=passthrough_file, inputs=audio_record_i, outputs=audio_download_i)
            btn_transcribe_i.click(fn=on_click_transcribe, inputs=[audio_record_i], outputs=[txt_transcribed_i])

            # Thin pass-through wrapper around on_click_analyze_ipa; it adds
            # no behavior of its own.
            def analyze_ipa_pipeline(model, original_text, transcribed_text):
                ipa_update, summary_update = on_click_analyze_ipa(model, original_text, transcribed_text)
                return ipa_update, summary_update

            btn_analyze_i.click(
                fn=analyze_ipa_pipeline,
                inputs=[model_selector_i, txt_original_i, txt_transcribed_i],
                outputs=[txt_ipa_i, txt_summary_i],
            )

            btn_rewrite_i.click(
                fn=on_click_rewrite,
                inputs=[model_selector_i, txt_original_i, txt_transcribed_i, txt_summary_i],
                outputs=[txt_revised_i],
            )
            posthoc_record_i.change(fn=passthrough_file, inputs=posthoc_record_i, outputs=posthoc_download_i)
|
|
|
|
|
# Launch the Gradio server only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()
|
|
|