# Extracted from commit 9aa985d (malek-messaoudii):
# "Refactor audio models and services for improved error handling and response streaming"
from pydantic import BaseModel, Field, ConfigDict
from typing import Optional
# ==============================
# SPEECH TO TEXT RESPONSE
# ==============================
class STTResponse(BaseModel):
    """Response payload returned by the speech-to-text endpoint."""

    # Required transcription result and the model that produced it.
    text: str = Field(..., description="Transcribed text from the input audio")
    model_name: str = Field(..., description="STT model used for inference")
    # Optional metadata; absent when the backend does not report them.
    language: Optional[str] = Field(default=None, description="Detected language")
    duration_seconds: Optional[float] = Field(
        default=None,
        description="Approximate audio duration in seconds",
    )

    # Example shown in the generated OpenAPI schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "text": "hello how are you",
                "model_name": "openai/whisper-large-v3",
                "language": "en",
                "duration_seconds": 3.2,
            }
        }
    )
# ==============================
# TEXT TO SPEECH REQUEST / RESPONSE
# ==============================
class TTSRequest(BaseModel):
    """Request payload for the text-to-speech endpoint."""

    # Bounded length guards the TTS backend against empty or oversized inputs.
    text: str = Field(
        ...,
        min_length=1,
        max_length=500,
        description="Text to convert to speech",
    )

    # Example shown in the generated OpenAPI schema.
    model_config = ConfigDict(
        json_schema_extra={"example": {"text": "Hello, welcome to our AI system."}}
    )
class TTSResponse(BaseModel):
    """Response metadata describing a completed text-to-speech generation."""

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "message": "Audio generated successfully",
                "audio_format": "wav",
                "length_seconds": 2.5,
                "model_name": "suno/bark"
            }
        }
    )
    # Fields use Field(...) with descriptions for consistency with the
    # sibling STTResponse/TTSRequest models and a self-documenting schema.
    message: str = Field(..., description="Human-readable status message")
    audio_format: str = Field(..., description="Format of the generated audio (e.g. 'wav')")
    length_seconds: Optional[float] = Field(
        None,
        description="Approximate duration of the generated audio in seconds"
    )
    model_name: str = Field(..., description="TTS model used for inference")