from pydantic import BaseModel, Field, ConfigDict
from typing import Optional


# ==============================
# SPEECH TO TEXT RESPONSE
# ==============================
class STTResponse(BaseModel):
    """Response schema for a speech-to-text transcription result."""

    # protected_namespaces=() suppresses pydantic v2's UserWarning about the
    # "model_" prefix on the `model_name` field, which otherwise collides with
    # pydantic's protected `model_*` attribute namespace.
    model_config = ConfigDict(
        protected_namespaces=(),
        json_schema_extra={
            "example": {
                "text": "hello how are you",
                "model_name": "openai/whisper-large-v3",
                "language": "en",
                "duration_seconds": 3.2,
            }
        },
    )

    text: str = Field(..., description="Transcribed text from the input audio")
    model_name: str = Field(..., description="STT model used for inference")
    language: Optional[str] = Field(None, description="Detected language")
    duration_seconds: Optional[float] = Field(
        None, description="Approximate audio duration in seconds"
    )


# ==============================
# TEXT TO SPEECH REQUEST / RESPONSE
# ==============================
class TTSRequest(BaseModel):
    """Request schema for text-to-speech synthesis."""

    model_config = ConfigDict(
        json_schema_extra={"example": {"text": "Hello, welcome to our AI system."}}
    )

    # Bounded length keeps synthesis requests to a manageable size.
    text: str = Field(
        ..., min_length=1, max_length=500, description="Text to convert to speech"
    )


class TTSResponse(BaseModel):
    """Response metadata returned after audio generation."""

    # protected_namespaces=() — same `model_` prefix collision as STTResponse.
    model_config = ConfigDict(
        protected_namespaces=(),
        json_schema_extra={
            "example": {
                "message": "Audio generated successfully",
                "audio_format": "wav",
                "length_seconds": 2.5,
                "model_name": "suno/bark",
            }
        },
    )

    message: str          # human-readable status message
    audio_format: str     # e.g. "wav" (see example above)
    length_seconds: Optional[float] = None  # duration of the generated audio, if known
    model_name: str       # TTS model used for generation