# TTS / app.py — Hugging Face Space (commit ea2f315, "Update app.py" by Thiophai)
import os
import re # ← เพิ่ม
from fastapi import FastAPI, HTTPException, Body
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse, JSONResponse
import azure.cognitiveservices.speech as speechsdk
from typing import Optional
# --- Configuration from environment variables ---
AZURE_SPEECH_KEY = os.getenv("AZURE_SPEECH_KEY")  # Azure Cognitive Services subscription key
AZURE_SPEECH_REGION = os.getenv("AZURE_SPEECH_REGION", "southeastasia")  # default region for the Speech resource

if not AZURE_SPEECH_KEY:
    # Warn at import time so a missing secret is visible in the Space logs.
    print("[WARN] AZURE_SPEECH_KEY is not set. Set it in HF Spaces (Settings → Repository secrets).")

app = FastAPI(title="Azure TTS API", version="1.0.0")

# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive — confirm this open-CORS setup is intended for a public demo API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Default Thai neural voice used when the caller does not specify one.
DEFAULT_VOICE = "th-TH-PremwadeeNeural"
# Helper for removing emoji from text before it is sent to the TTS engine.
# Covers the main emoji blocks plus joiners/variation selectors (ZWJ/VS-16),
# skin-tone modifiers, and regional-indicator (flag) characters.
EMOJI_RE = re.compile(
    "["
    "\U0001F300-\U0001F5FF"   # symbols & pictographs
    "\U0001F600-\U0001F64F"   # emoticons
    "\U0001F680-\U0001F6FF"   # transport & map symbols
    "\U0001F700-\U0001F77F"
    "\U0001F780-\U0001F7FF"
    "\U0001F800-\U0001F8FF"
    "\U0001F900-\U0001F9FF"
    "\U0001FA00-\U0001FA6F"
    "\U0001FA70-\U0001FAFF"
    "\U00002600-\U000026FF"   # misc symbols
    "\U000023F0-\U000023FA"   # clocks / media controls
    "\U00002700-\U000027BF"   # dingbats
    "\U0001F1E6-\U0001F1FF"   # regional indicators (flags)
    "\U0001F3FB-\U0001F3FF"   # skin-tone modifiers
    "]",
    flags=re.UNICODE,
)


def strip_emoji(s: str) -> str:
    """Return *s* with emoji removed and whitespace tidied.

    Deletes characters matched by EMOJI_RE, drops any leftover zero-width
    joiners (U+200D) and variation selector-16 (U+FE0F), collapses runs of
    two or more whitespace characters into a single space, and trims the ends.
    """
    without_emoji = EMOJI_RE.sub("", s)
    for leftover in ("\u200d", "\ufe0f"):
        without_emoji = without_emoji.replace(leftover, "")
    return re.sub(r"\s{2,}", " ", without_emoji).strip()
# Map the user-facing format string → Azure SDK output format enum.
FORMAT_MAP = {
    "wav": speechsdk.SpeechSynthesisOutputFormat.Riff16Khz16BitMonoPcm,
    "mp3": speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3,
}
@app.get("/")
def root():
    """Service banner: confirms the API is up and reports the configured region."""
    payload = {
        "ok": True,
        "service": "Azure TTS API",
        "region": AZURE_SPEECH_REGION,
    }
    return payload
@app.get("/health")
def health():
    """Liveness probe endpoint."""
    status_payload = {"status": "healthy"}
    return status_payload
@app.post("/synthesize")
def synthesize(
    text: str = Body(..., embed=True, description="ข้อความที่จะสังเคราะห์เสียง"),
    voice: Optional[str] = Body(DEFAULT_VOICE, embed=True),
    audio_format: Optional[str] = Body("mp3", embed=True, description="mp3 หรือ wav"),
    # Default True = emoji are not read aloud; pass False to keep them.
    strip_emoji_before_tts: Optional[bool] = Body(True, embed=True)
):
    """Synthesize speech with Azure TTS and stream the audio bytes back.

    Returns a StreamingResponse whose Content-Type matches the requested
    format ("audio/mpeg" for mp3, "audio/wav" for wav).

    Raises:
        HTTPException 400: unsupported audio_format, or text empty after
            emoji removal.
        HTTPException 500: missing key, synthesis canceled, or any
            unexpected SDK error.
    """
    if not AZURE_SPEECH_KEY:
        raise HTTPException(status_code=500, detail="AZURE_SPEECH_KEY not set")

    # Normalize the requested format; tolerate an explicit JSON null by
    # falling back to the documented default instead of crashing on .lower().
    audio_format = (audio_format or "mp3").lower()
    if audio_format not in FORMAT_MAP:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported audio_format: {audio_format}. Use one of {list(FORMAT_MAP.keys())}",
        )

    try:
        # 1) Build the SpeechConfig from the subscription key/region.
        speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
        speech_config.speech_synthesis_voice_name = voice or DEFAULT_VOICE
        speech_config.set_speech_synthesis_output_format(FORMAT_MAP[audio_format])

        # 2) Prepare the text (emoji are stripped only for synthesis).
        text_for_tts = strip_emoji(text) if strip_emoji_before_tts else text
        if not text_for_tts:
            raise HTTPException(status_code=400, detail="Text contains only emojis after sanitization")

        # 3) No audio_config -> the audio bytes come back in result.audio_data.
        synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)

        # 4) Synthesize (block on the async call's result).
        result = synthesizer.speak_text_async(text_for_tts).get()

        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            audio_bytes = result.audio_data  # bytes
            if not audio_bytes:
                raise HTTPException(status_code=500, detail="No audio data produced")
            mime = "audio/mpeg" if audio_format == "mp3" else "audio/wav"
            filename = f"speech.{audio_format}"
            # BUG FIX: the header previously hard-coded filename="(unknown)"
            # while the computed filename above went unused.
            return StreamingResponse(
                iter([audio_bytes]),
                media_type=mime,
                headers={"Content-Disposition": f'inline; filename="{filename}"'},
            )
        elif result.reason == speechsdk.ResultReason.Canceled:
            details = result.cancellation_details
            msg = f"Synthesis canceled: {details.reason}"
            if details.reason == speechsdk.CancellationReason.Error:
                msg += f" | error: {details.error_details}"
            raise HTTPException(status_code=500, detail=msg)
        else:
            raise HTTPException(status_code=500, detail=f"Unknown result: {result.reason}")
    except HTTPException:
        # BUG FIX: re-raise deliberate HTTP errors (e.g. the 400 above) instead
        # of letting the generic handler below rewrap them as opaque 500s.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/voices")
def list_voices():
    """List the voices available in the configured Azure region.

    Returns a JSONResponse: {"count": <int>, "voices": [<voice dict>, ...]}.

    Raises:
        HTTPException 500: missing key, retrieval failure, or SDK error.
    """
    if not AZURE_SPEECH_KEY:
        raise HTTPException(status_code=500, detail="AZURE_SPEECH_KEY not set")
    try:
        speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
        synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
        voices_result = synthesizer.get_voices_async().get()
        if voices_result.reason == speechsdk.ResultReason.VoicesListRetrieved:
            voices = [
                {
                    "name": v.name,
                    "locale": v.locale,
                    "gender": v.gender.name if v.gender else None,
                    # short_name may be absent on older SDK versions.
                    "shortName": getattr(v, "short_name", None),
                }
                for v in voices_result.voices
            ]
            return JSONResponse(content={"count": len(voices), "voices": voices})
        raise HTTPException(status_code=500, detail=f"Failed to list voices: {voices_result.reason}")
    except HTTPException:
        # BUG FIX: the intended 500 with its specific detail message was being
        # caught below and rewrapped; re-raise it untouched instead.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))