"""Pydantic schemas for Speech-to-Text and Text-to-Speech endpoints""" from pydantic import BaseModel, Field, ConfigDict from typing import Optional # ================================ # SPEECH TO TEXT # ================================ class STTResponse(BaseModel): """Response model for Whisper speech → text""" model_config = ConfigDict( json_schema_extra={ "example": { "text": "hello how are you", "model_name": "openai/whisper-large-v3", "language": "en", "duration_seconds": 3.2 } } ) text: str = Field(..., description="Transcribed text from the input audio") model_name: str = Field(..., description="STT model used for inference") language: Optional[str] = Field(None, description="Detected language") duration_seconds: Optional[float] = Field( None, description="Approximate audio duration in seconds" ) # ================================ # TEXT TO SPEECH # ================================ class TTSRequest(BaseModel): """Text input for TTS conversion""" model_config = ConfigDict( json_schema_extra={ "example": { "text": "Hello, welcome to our AI system." } } ) text: str = Field( ..., min_length=1, max_length=500, description="Text that will be converted into speech" ) class TTSResponse(BaseModel): """Metadata response for TTS generation""" model_config = ConfigDict( json_schema_extra={ "example": { "message": "Audio generated successfully", "audio_format": "wav", "length_seconds": 2.5, "model_name": "suno/bark" } } ) message: str audio_format: str length_seconds: Optional[float] = None model_name: str