# Teera's picture
# 8b1d8cc verified
import os
import sys
import threading

import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv
# Load variables from a local .env file into the environment (no-op if absent).
load_dotenv()

# Azure Speech credentials. SPEECH_KEY must be set in the environment;
# the region falls back to "eastus" when SPEECH_REGION is not set.
SPEECH_KEY = os.getenv("SPEECH_KEY")
SPEECH_REGION = os.getenv("SPEECH_REGION", "eastus")
def create_speech_config(language="th-TH"):
    """Build an Azure SpeechConfig bound to the requested recognition language."""
    speech_config = speechsdk.SpeechConfig(subscription=SPEECH_KEY, region=SPEECH_REGION)
    speech_config.speech_recognition_language = language
    return speech_config
def transcribe_from_mic():
    """Run one-shot speech recognition from the default microphone (CLI mode)."""
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=create_speech_config("th-TH"),
        audio_config=speechsdk.audio.AudioConfig(use_default_microphone=True),
    )
    print("🎤 Listening... Speak into your microphone.")
    result = recognizer.recognize_once()

    reason = result.reason
    if reason == speechsdk.ResultReason.RecognizedSpeech:
        print("✅ Recognized: " + result.text)
        return
    if reason == speechsdk.ResultReason.NoMatch:
        print("❌ No speech could be recognized: " + str(result.no_match_details))
        return
    if reason == speechsdk.ResultReason.Canceled:
        details = result.cancellation_details
        print("⚠️ Speech recognition canceled: " + str(details.reason))
        if details.reason == speechsdk.CancellationReason.Error:
            print("Error details: " + str(details.error_details))
            print("Did you set the speech resource key and region?")
def transcribe_audio_file(audio_path, language="th-TH"):
    """Transcribe an audio file using the Azure Speech SDK.

    Uses continuous recognition so multi-utterance files yield the full
    transcript instead of only the first recognized phrase.

    Args:
        audio_path: Path to the audio file, or None when nothing was recorded.
        language: BCP-47 recognition language code (default Thai).

    Returns:
        The recognized text, one line per utterance, or a Thai-language
        warning/error string when there is no input or nothing was recognized.
    """
    if audio_path is None:
        return "⚠️ กรุณาอัดเสียงก่อน"

    speech_config = create_speech_config(language)
    audio_config = speechsdk.audio.AudioConfig(filename=audio_path)
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        audio_config=audio_config,
    )

    all_results = []
    # Set when the session ends (end of file) or recognition is canceled
    # (e.g. bad credentials) — replaces the previous sleep/poll busy-wait.
    finished = threading.Event()

    def on_recognized(evt):
        # Collect each finalized utterance.
        if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            all_results.append(evt.result.text)

    def on_done(evt):
        finished.set()

    recognizer.recognized.connect(on_recognized)
    recognizer.canceled.connect(on_done)
    recognizer.session_stopped.connect(on_done)

    recognizer.start_continuous_recognition()
    try:
        finished.wait()
    finally:
        # Always stop the recognizer, even if the wait is interrupted.
        recognizer.stop_continuous_recognition()

    if all_results:
        return "\n".join(all_results)
    return "❌ ไม่สามารถถอดเสียงได้ — ลองพูดดังขึ้นหรือตรวจสอบไมค์"
def transcribe_and_analyze(audio_path, language):
    """Transcribe audio, then analyze it with the LLM.

    Returns a (transcript, analysis_json) pair; analysis_json is the empty
    string when transcription produced a warning/error instead of text.
    """
    transcript = transcribe_audio_file(audio_path, language)
    if transcript.startswith(("❌", "⚠️")):
        return transcript, ""
    from llm_client import analyze_football_content, format_analysis_result
    analysis_json = format_analysis_result(analyze_football_content(transcript))
    return transcript, analysis_json
def analyze_text_only(transcript):
    """Analyze an existing transcript string without re-transcribing.

    Returns the formatted analysis JSON, or a Thai warning string when the
    input is missing or whitespace-only.
    """
    if not (transcript and transcript.strip()):
        return "⚠️ กรุณาใส่ข้อความก่อน"
    from llm_client import analyze_football_content, format_analysis_result
    return format_analysis_result(analyze_football_content(transcript))
def run_web():
    """Build and launch the Gradio web UI (record/upload → transcribe → analyze)."""
    import gradio as gr

    with gr.Blocks(
        title="ASR - Football Analysis",
        theme=gr.themes.Soft(
            primary_hue=gr.themes.colors.indigo,
            secondary_hue=gr.themes.colors.purple,
            neutral_hue=gr.themes.colors.slate,
        ),
        # Center the app and cap its width.
        css="""
        .gradio-container {
            max-width: 900px !important;
            margin: auto !important;
        }
        """,
    ) as app:
        gr.Markdown(
            """
            # ⚽ Football Speech Analyzer
            ### ถอดเสียงพูด + วิเคราะห์เนื้อหาฟุตบอลด้วย AI
            ---
            """
        )

        # Recognition language; values are BCP-47 codes passed to Azure.
        with gr.Row():
            language = gr.Dropdown(
                choices=[
                    ("🇹🇭 ไทย", "th-TH"),
                    ("🇺🇸 English", "en-US"),
                    ("🇯🇵 日本語", "ja-JP"),
                    ("🇨🇳 中文", "zh-CN"),
                    ("🇰🇷 한국어", "ko-KR"),
                ],
                value="th-TH",
                label="ภาษา",
                interactive=True,
            )

        gr.Markdown("### 🎤 อัดเสียงจากไมค์")
        # type="filepath" hands the handlers a temp-file path on disk.
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="กดปุ่มอัดเสียง หรืออัปโหลดไฟล์เสียง",
        )

        with gr.Row():
            transcribe_btn = gr.Button(
                "✨ ถอดเสียงอย่างเดียว",
                variant="secondary",
                size="lg",
            )
            full_btn = gr.Button(
                "⚽ ถอดเสียง + วิเคราะห์ฟุตบอล",
                variant="primary",
                size="lg",
            )

        gr.Markdown("### 📝 ข้อความที่ถอดได้")
        output_text = gr.Textbox(
            label="Transcript",
            lines=6,
            show_copy_button=True,
            placeholder="ผลการถอดเสียงจะแสดงที่นี่...",
        )

        gr.Markdown("### 🧠 ผลวิเคราะห์จาก AI")
        with gr.Row():
            analyze_btn = gr.Button(
                "🔄 วิเคราะห์ข้อความข้างบนอีกครั้ง",
                variant="secondary",
                size="sm",
            )
        analysis_output = gr.Code(
            label="Football Analysis (JSON)",
            language="json",
            lines=20,
        )

        # --- Events ---
        # Transcribe only
        transcribe_btn.click(
            fn=transcribe_audio_file,
            inputs=[audio_input, language],
            outputs=output_text,
        )
        # Transcribe + Analyze
        full_btn.click(
            fn=transcribe_and_analyze,
            inputs=[audio_input, language],
            outputs=[output_text, analysis_output],
        )
        # Re-analyze existing transcript
        analyze_btn.click(
            fn=analyze_text_only,
            inputs=output_text,
            outputs=analysis_output,
        )
        # Auto-transcribe + analyze on recording stop
        audio_input.stop_recording(
            fn=transcribe_and_analyze,
            inputs=[audio_input, language],
            outputs=[output_text, analysis_output],
        )

    app.launch()
# Script entry point: launch the Gradio web UI.
if __name__ == "__main__":
    run_web()