Spaces:

Teera
/

conversation-extraction

Sleeping

File size: 7,582 Bytes

8b1d8cc

import os
import sys
import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv

# Let python-dotenv populate the process environment (typically from a local
# .env file) before the settings below are read.
load_dotenv()

# Azure Speech credentials, read once at import time.
# SPEECH_KEY is None when the variable is unset; SPEECH_REGION falls back to
# "eastus" when the variable is unset.
SPEECH_KEY = os.getenv("SPEECH_KEY")
SPEECH_REGION = os.getenv("SPEECH_REGION", "eastus")


def create_speech_config(language="th-TH"):
    """Build the Azure ``SpeechConfig`` shared by the recognizers in this module.

    Args:
        language: Locale assigned to ``speech_recognition_language``
            (defaults to ``"th-TH"``).

    Returns:
        A ``speechsdk.SpeechConfig`` created from the module-level
        ``SPEECH_KEY`` and ``SPEECH_REGION`` values.
    """
    credentials = {"subscription": SPEECH_KEY, "region": SPEECH_REGION}
    speech_cfg = speechsdk.SpeechConfig(**credentials)
    speech_cfg.speech_recognition_language = language
    return speech_cfg


def transcribe_from_mic():
    """Recognize a single utterance from the default microphone and print the outcome.

    Uses Thai ("th-TH") recognition. Nothing is returned; the recognized text,
    the no-match details, or the cancellation reason (plus error details when
    the cancellation is an error) is written to stdout.
    """
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=create_speech_config("th-TH"),
        audio_config=speechsdk.audio.AudioConfig(use_default_microphone=True),
    )

    print("🎤 Listening... Speak into your microphone.")
    outcome = recognizer.recognize_once()
    reason = outcome.reason

    if reason == speechsdk.ResultReason.RecognizedSpeech:
        print(f"✅ Recognized: {outcome.text}")
        return

    if reason == speechsdk.ResultReason.NoMatch:
        print(f"❌ No speech could be recognized: {outcome.no_match_details!s}")
        return

    # Any reason other than Canceled is ignored without output.
    if reason != speechsdk.ResultReason.Canceled:
        return

    details = outcome.cancellation_details
    print(f"⚠️ Speech recognition canceled: {details.reason!s}")
    if details.reason == speechsdk.CancellationReason.Error:
        print(f"Error details: {details.error_details!s}")
        print("Did you set the speech resource key and region?")

def transcribe_audio_file(audio_path, language="th-TH"):
    """Transcribe an audio file using Azure continuous speech recognition.

    Args:
        audio_path: Path of the audio file to transcribe. ``None`` or an empty
            string means no audio was provided.
        language: Recognition locale passed to ``create_speech_config``
            (defaults to ``"th-TH"``).

    Returns:
        The recognized phrases joined with newlines. When there is nothing to
        return, a user-facing message is returned instead: it starts with
        "⚠️" when no audio was given, and with "❌" when nothing was recognized
        or the service canceled the session with an error.
    """
    if not audio_path:
        return "⚠️ กรุณาอัดเสียงก่อน"

    # Local import, in line with the other function-scope imports in this file.
    import threading

    recognizer = speechsdk.SpeechRecognizer(
        speech_config=create_speech_config(language),
        audio_config=speechsdk.audio.AudioConfig(filename=audio_path),
    )

    # Continuous recognition delivers results through callbacks, so the
    # phrases are collected here and an Event marks the end of the session.
    phrases = []
    errors = []
    finished = threading.Event()

    def on_recognized(evt):
        if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            phrases.append(evt.result.text)

    def on_canceled(evt):
        # Reaching the end of the file is also reported as a cancellation;
        # only genuine errors (bad key/region, unreadable file, network
        # failure) are recorded so they can be shown to the user.
        try:
            details = evt.cancellation_details
            if details.reason == speechsdk.CancellationReason.Error:
                errors.append(str(details.error_details))
        finally:
            # Always release the waiter, even if reading the details fails.
            finished.set()

    def on_stopped(evt):
        finished.set()

    recognizer.recognized.connect(on_recognized)
    recognizer.canceled.connect(on_canceled)
    recognizer.session_stopped.connect(on_stopped)

    recognizer.start_continuous_recognition()
    try:
        # Block until the SDK signals cancellation or session stop.
        finished.wait()
    finally:
        # Stop the recognizer even if the wait is interrupted.
        recognizer.stop_continuous_recognition()

    if phrases:
        return "\n".join(phrases)
    if errors:
        # Keep the "❌" prefix: transcribe_and_analyze relies on it to detect failures.
        return f"❌ เกิดข้อผิดพลาดจากบริการถอดเสียง: {errors[0]}"
    return "❌ ไม่สามารถถอดเสียงได้ — ลองพูดดังขึ้นหรือตรวจสอบไมค์"


def transcribe_and_analyze(audio_path, language):
    """Transcribe the audio and run the LLM football analysis on the result.

    Args:
        audio_path: Path of the audio file, forwarded to ``transcribe_audio_file``.
        language: Recognition locale, forwarded to ``transcribe_audio_file``.

    Returns:
        A ``(transcript, analysis)`` tuple. When the transcript is a status
        message (it starts with "❌" or "⚠️"), the analysis is an empty string
        and the LLM is not called.
    """
    text = transcribe_audio_file(audio_path, language)

    # Status messages from the transcriber carry a marker prefix; pass them
    # through untouched instead of analyzing them.
    if text.startswith(("❌", "⚠️")):
        return text, ""

    from llm_client import analyze_football_content, format_analysis_result

    return text, format_analysis_result(analyze_football_content(text))


def analyze_text_only(transcript):
    """Run the LLM football analysis on text that is already transcribed.

    Args:
        transcript: Text to analyze.

    Returns:
        The formatted analysis from ``llm_client``, or a "⚠️" message when the
        transcript is ``None``, empty, or only whitespace.
    """
    if transcript and transcript.strip():
        from llm_client import analyze_football_content, format_analysis_result

        analysis = analyze_football_content(transcript)
        return format_analysis_result(analysis)

    return "⚠️ กรุณาใส่ข้อความก่อน"


def run_web():
    """Build the Gradio web UI, wire its events, and launch it.

    The page contains a language dropdown, an audio input (microphone or
    upload), a "transcribe only" button, a "transcribe + analyze" button, a
    transcript textbox, a "re-analyze" button, and a JSON code view for the
    analysis. Components are created in display order inside the ``gr.Blocks``
    context, so the statement order below is the page layout.
    """
    # Imported here rather than at module top, so importing this module does
    # not itself import gradio.
    import gradio as gr

    with gr.Blocks(
        title="ASR - Football Analysis",
        theme=gr.themes.Soft(
            primary_hue=gr.themes.colors.indigo,
            secondary_hue=gr.themes.colors.purple,
            neutral_hue=gr.themes.colors.slate,
        ),
        css="""
        .gradio-container {
            max-width: 900px !important;
            margin: auto !important;
        }
        """,
    ) as app:

        # Page header (Thai subtitle: "Transcribe speech + analyze football content with AI").
        gr.Markdown(
            """
            # ⚽ Football Speech Analyzer
            ### ถอดเสียงพูด + วิเคราะห์เนื้อหาฟุตบอลด้วย AI
            ---
            """
        )

        # Recognition language selector; each choice is a (display label, locale code)
        # pair and the selected locale code is what the event handlers receive.
        with gr.Row():
            language = gr.Dropdown(
                choices=[
                    ("🇹🇭 ไทย", "th-TH"),
                    ("🇺🇸 English", "en-US"),
                    ("🇯🇵 日本語", "ja-JP"),
                    ("🇨🇳 中文", "zh-CN"),
                    ("🇰🇷 한국어", "ko-KR"),
                ],
                value="th-TH",
                label="ภาษา",
                interactive=True,
            )

        # Audio source (heading: "Record from microphone"). type="filepath" makes
        # the handlers receive a file path, which is what transcribe_audio_file expects.
        gr.Markdown("### 🎤 อัดเสียงจากไมค์")
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="กดปุ่มอัดเสียง หรืออัปโหลดไฟล์เสียง",
        )

        # Action buttons: "Transcribe only" and "Transcribe + analyze football".
        with gr.Row():
            transcribe_btn = gr.Button(
                "✨ ถอดเสียงอย่างเดียว",
                variant="secondary",
                size="lg",
            )
            full_btn = gr.Button(
                "⚽ ถอดเสียง + วิเคราะห์ฟุตบอล",
                variant="primary",
                size="lg",
            )

        # Transcript output (heading: "Transcribed text"). It is also the input
        # of the re-analyze button below.
        gr.Markdown("### 📝 ข้อความที่ถอดได้")
        output_text = gr.Textbox(
            label="Transcript",
            lines=6,
            show_copy_button=True,
            placeholder="ผลการถอดเสียงจะแสดงที่นี่...",
        )

        # Analysis section (heading: "Analysis result from AI") with a button
        # labeled "Analyze the text above again".
        gr.Markdown("### 🧠 ผลวิเคราะห์จาก AI")
        with gr.Row():
            analyze_btn = gr.Button(
                "🔄 วิเคราะห์ข้อความข้างบนอีกครั้ง",
                variant="secondary",
                size="sm",
            )

        # The analysis is shown in a code view with JSON highlighting.
        analysis_output = gr.Code(
            label="Football Analysis (JSON)",
            language="json",
            lines=20,
        )

        # --- Events ---

        # Transcribe only: fills the transcript box and leaves the analysis untouched.
        transcribe_btn.click(
            fn=transcribe_audio_file,
            inputs=[audio_input, language],
            outputs=output_text,
        )

        # Transcribe + Analyze: the handler returns (transcript, analysis),
        # mapped onto the two outputs in that order.
        full_btn.click(
            fn=transcribe_and_analyze,
            inputs=[audio_input, language],
            outputs=[output_text, analysis_output],
        )

        # Re-analyze existing transcript: uses the current textbox content,
        # so no transcription happens.
        analyze_btn.click(
            fn=analyze_text_only,
            inputs=output_text,
            outputs=analysis_output,
        )

        # Auto-transcribe + analyze on recording stop (same handler and
        # outputs as the "transcribe + analyze" button).
        audio_input.stop_recording(
            fn=transcribe_and_analyze,
            inputs=[audio_input, language],
            outputs=[output_text, analysis_output],
        )

    # Launch after the Blocks context has closed, i.e. once the layout is complete.
    app.launch()



# Script entry point: running this file starts the web UI.
# transcribe_from_mic() is not invoked from here.
if __name__ == "__main__":
    run_web()