File size: 2,512 Bytes
c7fc3b6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
"""Speech-to-Text & Text-to-Speech API Endpoints"""
import asyncio
import logging

from fastapi import APIRouter, UploadFile, File, HTTPException
from fastapi.responses import StreamingResponse

from models.audio import (
    STTResponse,
    TTSRequest,
    TTSResponse,
)
from services.stt_service import transcribe_audio
from services.tts_service import text_to_speech
router = APIRouter(prefix="/audio", tags=["Audio"])
logger = logging.getLogger(__name__)
# ============================================================
# SPEECH TO TEXT (Whisper)
# ============================================================
@router.post("/speech-to-text", response_model=STTResponse)
async def speech_to_text_endpoint(file: UploadFile = File(...)):
    """
    Convert an uploaded audio file to text.

    The upload is read fully into memory and passed to ``transcribe_audio``.
    The response is labelled with model name "openai/whisper-large-v3" and
    language "en"; ``duration_seconds`` is not computed and is always None.

    Args:
        file: Audio file sent as the multipart form field ``file``.

    Returns:
        STTResponse whose ``text`` holds the transcription.

    Raises:
        HTTPException: 400 if the upload contains no bytes; 500 if reading
            the upload or transcribing it fails.
    """
    try:
        audio_bytes = await file.read()
        if not audio_bytes:
            # Reject empty uploads up front instead of letting the
            # transcription service fail and surface as a 500.
            raise HTTPException(
                status_code=400,
                detail="Uploaded audio file is empty",
            )

        # transcribe_audio is a plain (non-awaited) call; running it in the
        # default executor keeps the event loop free for other requests.
        # NOTE(review): assumes transcribe_audio tolerates being called from
        # a worker thread - confirm against the service implementation.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, transcribe_audio, audio_bytes)

        response_data = STTResponse(
            text=result,
            model_name="openai/whisper-large-v3",
            language="en",
            duration_seconds=None,  # optional filler
        )
        logger.info("STT completed: %s...", response_data.text[:40])
        return response_data
    except HTTPException:
        # Deliberate HTTP errors (e.g. the 400 above) must not be
        # rewrapped as a 500 by the generic handler below.
        raise
    except Exception as e:
        # logger.exception logs at ERROR level and keeps the traceback.
        logger.exception("STT error: %s", e)
        raise HTTPException(
            status_code=500,
            detail=f"Speech-to-text failed: {str(e)}",
        ) from e
# ============================================================
# TEXT TO SPEECH (Bark)
# ============================================================
@router.post(
    "/text-to-speech",
    # The handler returns raw audio, not a JSON TTSResponse body, so the
    # OpenAPI schema is declared as audio/wav rather than via response_model.
    response_class=StreamingResponse,
    responses={
        200: {
            "description": "Synthesized speech as WAV audio.",
            "content": {"audio/wav": {}},
        }
    },
)
async def text_to_speech_endpoint(request: TTSRequest):
    """
    Convert text to synthesized speech.

    ``request.text`` is passed to ``text_to_speech`` and the resulting bytes
    are returned as an ``audio/wav`` streaming response. A JSON-serialized
    ``TTSResponse`` (message, audio format "wav", model name "suno/bark",
    ``length_seconds`` always None) is attached in the ``X-Audio-Metadata``
    response header.

    Args:
        request: Request body carrying the ``text`` to synthesize.

    Returns:
        StreamingResponse with media type ``audio/wav``.

    Raises:
        HTTPException: 500 if synthesis or response construction fails.
    """
    try:
        # text_to_speech is a plain (non-awaited) call; running it in the
        # default executor keeps the event loop free for other requests.
        # NOTE(review): assumes text_to_speech tolerates being called from
        # a worker thread - confirm against the service implementation.
        loop = asyncio.get_running_loop()
        audio_bytes = await loop.run_in_executor(
            None, text_to_speech, request.text
        )

        metadata = TTSResponse(
            message="Audio generated successfully",
            audio_format="wav",
            length_seconds=None,
            model_name="suno/bark",
        )
        logger.info("TTS generated for text: %s...", request.text[:40])
        return StreamingResponse(
            iter([audio_bytes]),
            media_type="audio/wav",
            headers={
                "X-Audio-Metadata": metadata.model_dump_json(),
            },
        )
    except Exception as e:
        # logger.exception logs at ERROR level and keeps the traceback.
        logger.exception("TTS error: %s", e)
        raise HTTPException(
            status_code=500,
            detail=f"Text-to-speech failed: {str(e)}",
        ) from e
|