malek-messaoudii
Add tts/stt services
c7fc3b6
raw
history blame
2.51 kB
"""Speech-to-Text & Text-to-Speech API Endpoints"""
from fastapi import APIRouter, UploadFile, File, HTTPException
from fastapi.responses import StreamingResponse
import logging
from models.audio import (
STTResponse,
TTSRequest,
TTSResponse
)
from services.stt_service import transcribe_audio
from services.tts_service import text_to_speech
# Router for the audio endpoints in this module; every route registered on it
# is served under the "/audio" prefix and tagged "Audio".
router = APIRouter(prefix="/audio", tags=["Audio"])
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# ============================================================
# SPEECH TO TEXT (Whisper)
# ============================================================
@router.post("/speech-to-text", response_model=STTResponse)
async def speech_to_text_endpoint(file: UploadFile = File(...)):
    """
    Convert speech to text using openai/whisper-large-v3.

    - Upload an audio file (wav, mp3, m4a…)
    - Returns transcribed English text

    Args:
        file: Uploaded audio file. Its entire content is read into memory
            and handed to ``transcribe_audio``.

    Returns:
        An ``STTResponse`` carrying the transcribed text, the model name,
        the language code ``"en"`` and an unset ``duration_seconds``.

    Raises:
        HTTPException: 400 when the uploaded file is empty; 500 when
            reading the upload, transcribing it, or building the response
            fails.
    """
    try:
        audio_bytes = await file.read()

        # Reject empty uploads up front with a client error instead of
        # letting the transcriber fail (or return nothing) on zero bytes.
        if not audio_bytes:
            raise HTTPException(
                status_code=400,
                detail="Uploaded audio file is empty",
            )

        # NOTE(review): transcribe_audio is called synchronously inside this
        # coroutine; if it is slow it will stall the event loop. Consider
        # offloading it to a worker thread once the service is confirmed to
        # be thread-safe.
        result = transcribe_audio(audio_bytes)

        response_data = STTResponse(
            text=result,
            model_name="openai/whisper-large-v3",
            language="en",
            duration_seconds=None,  # optional filler
        )
        logger.info("STT completed: %s...", response_data.text[:40])
        return response_data
    except HTTPException:
        # Deliberate HTTP errors (e.g. the 400 above) must reach the client
        # unchanged rather than being rewrapped as a 500.
        raise
    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception("STT error: %s", e)
        raise HTTPException(
            status_code=500,
            detail=f"Speech-to-text failed: {e}",
        ) from e
# ============================================================
# TEXT TO SPEECH (Bark)
# ============================================================
@router.post(
    "/text-to-speech",
    # The handler returns raw WAV audio, not a JSON TTSResponse body, so the
    # route is documented as a streaming audio response.
    response_class=StreamingResponse,
    responses={
        200: {
            "content": {"audio/wav": {}},
            "description": (
                "Synthesized WAV audio. Generation metadata is sent as JSON "
                "in the X-Audio-Metadata response header."
            ),
        }
    },
)
async def text_to_speech_endpoint(request: TTSRequest):
    """
    Convert text to synthesized speech using Bark.

    Returns streamed audio.

    Args:
        request: Request body; ``request.text`` is the text to synthesize.

    Returns:
        A ``StreamingResponse`` with media type ``audio/wav`` whose body is
        the generated audio. A ``TTSResponse`` serialized as JSON is attached
        in the ``X-Audio-Metadata`` header.

    Raises:
        HTTPException: 400 when the text is empty or whitespace-only; 500
            when synthesis or response construction fails.
    """
    try:
        # Reject blank input up front with a client error instead of sending
        # it to the synthesizer.
        if not request.text or not request.text.strip():
            raise HTTPException(
                status_code=400,
                detail="Text to synthesize must not be empty",
            )

        # NOTE(review): text_to_speech is called synchronously inside this
        # coroutine; if it is slow it will stall the event loop. Consider
        # offloading it to a worker thread once the service is confirmed to
        # be thread-safe.
        audio_bytes = text_to_speech(request.text)

        metadata = TTSResponse(
            message="Audio generated successfully",
            audio_format="wav",
            length_seconds=None,
            model_name="suno/bark",
        )
        logger.info("TTS generated for text: %s...", request.text[:40])

        # The whole payload is already in memory, so it is emitted as a
        # single chunk.
        return StreamingResponse(
            iter([audio_bytes]),
            media_type="audio/wav",
            headers={
                "X-Audio-Metadata": metadata.model_dump_json(),
            },
        )
    except HTTPException:
        # Deliberate HTTP errors (e.g. the 400 above) must reach the client
        # unchanged rather than being rewrapped as a 500.
        raise
    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception("TTS error: %s", e)
        raise HTTPException(
            status_code=500,
            detail=f"Text-to-speech failed: {e}",
        ) from e