"""Speech-to-Text & Text-to-Speech API Endpoints"""

import logging

from fastapi import APIRouter, UploadFile, File, HTTPException
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import StreamingResponse

from models.audio import (
    STTResponse,
    TTSRequest,
    TTSResponse
)
from services.stt_service import transcribe_audio
from services.tts_service import text_to_speech

router = APIRouter(prefix="/audio", tags=["Audio"])
logger = logging.getLogger(__name__)


# ============================================================
# SPEECH TO TEXT (Whisper)
# ============================================================
@router.post("/speech-to-text", response_model=STTResponse)
async def speech_to_text_endpoint(file: UploadFile = File(...)):
    """
    Convert speech to text using openai/whisper-large-v3.

    - Upload an audio file (wav, mp3, m4a…)
    - Returns transcribed English text
    - Responds with 400 if the uploaded file is empty, 500 if transcription fails
    """
    try:
        audio_bytes = await file.read()
        if not audio_bytes:
            # An empty upload is a client mistake, not a server failure.
            raise HTTPException(
                status_code=400,
                detail="Uploaded audio file is empty"
            )

        # transcribe_audio is a synchronous call; run it in the worker
        # threadpool so it does not block the event loop for other requests.
        result = await run_in_threadpool(transcribe_audio, audio_bytes)

        response_data = STTResponse(
            text=result,
            model_name="openai/whisper-large-v3",
            language="en",
            duration_seconds=None  # duration is not computed here
        )

        logger.info("STT completed: %s...", response_data.text[:40])
        return response_data

    except HTTPException:
        # Let deliberate HTTP errors (e.g. the 400 above) pass through
        # instead of being rewritten as a 500 by the generic handler below.
        raise
    except Exception as e:
        # logger.exception logs at ERROR level and keeps the traceback.
        logger.exception("STT error: %s", e)
        raise HTTPException(
            status_code=500,
            detail=f"Speech-to-text failed: {e}"
        ) from e


# ============================================================
# TEXT TO SPEECH (Bark)
# ============================================================
# NOTE(review): response_model is TTSResponse, but the handler returns a
# StreamingResponse with a WAV body; the TTSResponse data is only exposed via
# the "X-Audio-Metadata" header. The generated OpenAPI schema therefore does
# not describe the actual response body — confirm whether this is intended.
@router.post("/text-to-speech", response_model=TTSResponse)
async def text_to_speech_endpoint(request: TTSRequest):
    """
    Convert text to synthesized speech using Bark.

    Returns streamed audio (audio/wav). Generation metadata is sent as JSON
    in the `X-Audio-Metadata` response header.
    Responds with 500 if synthesis fails.
    """
    try:
        # text_to_speech is a synchronous call; run it in the worker
        # threadpool so it does not block the event loop for other requests.
        audio_bytes = await run_in_threadpool(text_to_speech, request.text)

        metadata = TTSResponse(
            message="Audio generated successfully",
            audio_format="wav",
            length_seconds=None,
            model_name="suno/bark"
        )

        logger.info("TTS generated for text: %s...", request.text[:40])

        # The audio is already fully in memory; it is sent as a single chunk.
        return StreamingResponse(
            iter([audio_bytes]),
            media_type="audio/wav",
            headers={
                "X-Audio-Metadata": metadata.model_dump_json()
            }
        )

    except Exception as e:
        # logger.exception logs at ERROR level and keeps the traceback.
        logger.exception("TTS error: %s", e)
        raise HTTPException(
            status_code=500,
            detail=f"Text-to-speech failed: {e}"
        ) from e