|
|
"""Speech-to-Text & Text-to-Speech API Endpoints""" |
|
|
|
|
|
from fastapi import APIRouter, UploadFile, File, HTTPException |
|
|
from fastapi.responses import StreamingResponse |
|
|
import logging |
|
|
|
|
|
from models.audio import ( |
|
|
STTResponse, |
|
|
TTSRequest, |
|
|
TTSResponse |
|
|
) |
|
|
|
|
|
from services.stt_service import transcribe_audio |
|
|
from services.tts_service import text_to_speech |
|
|
|
|
|
# Module-level logger, named after this module so log records can be filtered by origin.
logger = logging.getLogger(__name__)

# Router collecting the audio endpoints declared below; every route is
# registered under the "/audio" prefix and tagged "Audio".
router = APIRouter(
    prefix="/audio",
    tags=["Audio"],
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@router.post("/speech-to-text", response_model=STTResponse)
async def speech_to_text_endpoint(file: UploadFile = File(...)):
    """
    Convert speech to text using openai/whisper-large-v3.

    - Upload an audio file (wav, mp3, m4a…)
    - Returns transcribed English text
    - Responds with 400 when the uploaded file is empty
    - Responds with 500 when transcription fails
    """
    # Imported locally so this handler is self-contained with respect to the
    # module's import block.
    from fastapi.concurrency import run_in_threadpool

    try:
        audio_bytes = await file.read()

        # Reject empty uploads up front instead of handing them to the model.
        if not audio_bytes:
            raise HTTPException(
                status_code=400,
                detail="Uploaded audio file is empty",
            )

        # transcribe_audio is invoked as a plain (synchronous) call; running it
        # in a worker thread keeps the event loop free to serve other requests
        # while the transcription is in progress.
        # NOTE(review): assumes transcribe_audio is safe to call from a worker
        # thread — confirm against the service implementation.
        result = await run_in_threadpool(transcribe_audio, audio_bytes)

        response_data = STTResponse(
            text=result,
            model_name="openai/whisper-large-v3",
            language="en",
            duration_seconds=None,
        )

        # Only the first 40 characters of the transcript are logged.
        logger.info("STT completed: %s...", response_data.text[:40])
        return response_data

    except HTTPException:
        # Deliberate HTTP errors (such as the 400 above) must reach the client
        # unchanged rather than being rewrapped as a 500 below.
        raise
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception("STT error: %s", e)
        raise HTTPException(
            status_code=500,
            detail=f"Speech-to-text failed: {str(e)}",
        ) from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# NOTE: the handler returns a StreamingResponse directly, so FastAPI sends it
# as-is; response_model only affects the generated OpenAPI schema here. The
# TTSResponse metadata travels in the "X-Audio-Metadata" response header.
@router.post("/text-to-speech", response_model=TTSResponse)
async def text_to_speech_endpoint(request: TTSRequest):
    """
    Convert text to synthesized speech using Bark.

    Returns streamed audio (audio/wav). Generation metadata is attached as
    JSON in the "X-Audio-Metadata" response header.
    Responds with 500 when synthesis fails.
    """
    # Imported locally so this handler is self-contained with respect to the
    # module's import block.
    from fastapi.concurrency import run_in_threadpool

    try:
        # text_to_speech is invoked as a plain (synchronous) call; running it
        # in a worker thread keeps the event loop free to serve other requests
        # while the audio is being generated.
        # NOTE(review): assumes text_to_speech is safe to call from a worker
        # thread — confirm against the service implementation.
        audio_bytes = await run_in_threadpool(text_to_speech, request.text)

        metadata = TTSResponse(
            message="Audio generated successfully",
            audio_format="wav",
            length_seconds=None,
            model_name="suno/bark",
        )

        # Only the first 40 characters of the input text are logged.
        logger.info("TTS generated for text: %s...", request.text[:40])

        # The audio is already fully in memory; it is sent as a single chunk.
        return StreamingResponse(
            iter([audio_bytes]),
            media_type="audio/wav",
            headers={
                "X-Audio-Metadata": metadata.model_dump_json(),
            },
        )

    except HTTPException:
        # Let deliberate HTTP errors propagate unchanged instead of being
        # rewrapped as a 500 below.
        raise
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception("TTS error: %s", e)
        raise HTTPException(
            status_code=500,
            detail=f"Text-to-speech failed: {str(e)}",
        ) from e
|
|
|