Samfy001 committed on
Commit
eb29d75
·
verified ·
1 Parent(s): f5a2aaa

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +453 -4
main.py CHANGED
@@ -4,6 +4,7 @@ import uuid
4
  from datetime import datetime
5
  from typing import Optional, List, Literal
6
  from fastapi import FastAPI, HTTPException, BackgroundTasks
 
7
  from pydantic import BaseModel, Field
8
  import logging
9
  import os
@@ -13,13 +14,14 @@ logging.basicConfig(level=logging.INFO)
13
  logger = logging.getLogger(__name__)
14
 
15
  app = FastAPI(
16
- title="OpenAI Compatible Image Generation API",
17
- description="OpenAI-compatible API for image generation using Captions backend",
18
  version="1.0.0"
19
  )
20
 
21
  # Configuration
22
  CAPTIONS_BASE_URL = "https://core.captions-web-api.xyz/proxy/v1/gen-ai/image"
 
23
  BEARER_TOKEN = os.getenv("CAPTIONS_BEARER_TOKEN", "eyJhbGciOiJSUzI1NiIsImtpZCI6IjU3YmZiMmExMWRkZmZjMGFkMmU2ODE0YzY4NzYzYjhjNjg3NTgxZDgiLCJ0eXAiOiJKV1QifQ.eyJnb29nbGUiOnRydWUsImlzcyI6Imh0dHBzOi8vc2VjdXJldG9rZW4uZ29vZ2xlLmNvbS9jYXB0aW9ucy1mNmRlOSIsImF1ZCI6ImNhcHRpb25zLWY2ZGU5IiwiYXV0aF90aW1lIjoxNzU1MzYyODEzLCJ1c2VyX2lkIjoic3hWek5XaUYyempXYmUxTjNjd3UiLCJzdWIiOiJzeFZ6TldpRjJ6aldiZTFOM2N3dSIsImlhdCI6MTc1NTM2MjgxMywiZXhwIjoxNzU1MzY2NDEzLCJmaXJlYmFzZSI6eyJpZGVudGl0aWVzIjp7fSwic2lnbl9pbl9wcm92aWRlciI6ImN1c3RvbSJ9fQ.jGuhWp-w8jlGy8xmMjqOyig_LVcr53udFgMjrQTJtKtE_J_iVkvMLncO2TnJ2BquoEp9pwVlZIG-imlFe6Uhtz95-t1oHENf5yzUWu3HocFsNVeAZh9avi_iObSYM_pFOT9lwRNzk1oMa6LbwViuVgTXvHDse9T4_nDfmCBbWngWksh1_JGtnrK2qPb5YD8Hr26itDRMx8mzUr2cQqtU9mU0R910CROqsNaQ9ovemeGe-2RT-hZku4VVYAMDOdvcFsgcf_BJTLRikmc3T7Ekx8T0KM6ZpTgr34wtnl7rpDBNOX0cOSYu3NEUDBnhNJKmPl5qL08gcYEur1ijP2mcTA")
24
 
25
  # Model mappings from OpenAI model names to Captions model IDs
@@ -42,6 +44,142 @@ MODEL_MAPPINGS = {
42
  "stable-diffusion": "stable-diffusion-3-5-large"
43
  }
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  # Available models information
46
  AVAILABLE_MODELS = {
47
  "google-imagen-3": {"name": "Imagen 3", "provider": "Google"},
@@ -65,6 +203,14 @@ class ImageGenerationRequest(BaseModel):
65
  style: Optional[Literal["vivid", "natural"]] = Field("vivid", description="Style of the generated images")
66
  user: Optional[str] = Field(None, description="A unique identifier representing your end-user")
67
 
 
 
 
 
 
 
 
 
68
  # OpenAI-compatible response models
69
  class ImageData(BaseModel):
70
  url: Optional[str] = None
@@ -86,6 +232,16 @@ class CaptionsSubmitRequest(BaseModel):
86
  class CaptionsStatusRequest(BaseModel):
87
  operationId: str
88
 
 
 
 
 
 
 
 
 
 
 
89
  # In-memory storage for operation tracking (use Redis in production)
90
  operations_store = {}
91
 
@@ -104,6 +260,10 @@ def get_aspect_ratio_from_size(size: str) -> int:
104
  }
105
  return size_map.get(size, 1)
106
 
 
 
 
 
107
  async def submit_image_generation(prompt: str, model: str = "dall-e-3", size: str = "1024x1024") -> str:
108
  """Submit image generation request to Captions API"""
109
  headers = {
@@ -420,6 +580,286 @@ async def get_generation_status(operation_id: str):
420
  logger.error(f"Error checking generation status: {e}")
421
  raise HTTPException(status_code=500, detail="Failed to check generation status")
422
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
423
  @app.get("/health")
424
  async def health_check():
425
  """Health check endpoint"""
@@ -429,21 +869,30 @@ async def health_check():
429
  async def root():
430
  """Root endpoint with API information"""
431
  return {
432
- "message": "OpenAI Compatible Image Generation API",
433
  "version": "1.0.0",
434
  "supported_models": list(AVAILABLE_MODELS.keys()),
435
  "openai_aliases": list(MODEL_MAPPINGS.keys()),
 
 
436
  "endpoints": {
437
  "models": "/v1/models",
 
438
  "image_generation": "/v1/images/generations",
439
  "async_generation": "/v1/images/generations/async",
440
  "status_check": "/v1/images/generations/status/{operation_id}",
 
 
 
 
441
  "health": "/health",
442
  "docs": "/docs"
443
  },
444
  "example_curl": {
445
  "generate_image": "curl -X POST 'http://localhost:8000/v1/images/generations' -H 'Content-Type: application/json' -d '{\"prompt\": \"a cat\", \"model\": \"dall-e-3\", \"size\": \"1024x1024\"}'",
446
- "list_models": "curl -X GET 'http://localhost:8000/v1/models'"
 
 
447
  }
448
  }
449
 
 
4
  from datetime import datetime
5
  from typing import Optional, List, Literal
6
  from fastapi import FastAPI, HTTPException, BackgroundTasks
7
+ from fastapi.responses import StreamingResponse
8
  from pydantic import BaseModel, Field
9
  import logging
10
  import os
 
14
  logger = logging.getLogger(__name__)
15
 
16
# FastAPI application exposing OpenAI-compatible image-generation and
# text-to-speech endpoints backed by the Captions service.
app = FastAPI(
    title="OpenAI Compatible API - Images & TTS",
    description="OpenAI-compatible API for image generation and text-to-speech using Captions backend",
    version="1.0.0"
)
21
 
22
# Configuration
# SECURITY NOTE(review): a real-looking JWT is committed below as the env-var
# fallback. Rotate that token and remove the hardcoded default (fail fast when
# CAPTIONS_BEARER_TOKEN is unset) before this ships anywhere public.
CAPTIONS_BASE_URL = "https://core.captions-web-api.xyz/proxy/v1/gen-ai/image"
CAPTIONS_TTS_BASE_URL = "https://core.captions-web-api.xyz/proxy/v1/voiceover/tts"
BEARER_TOKEN = os.getenv("CAPTIONS_BEARER_TOKEN", "eyJhbGciOiJSUzI1NiIsImtpZCI6IjU3YmZiMmExMWRkZmZjMGFkMmU2ODE0YzY4NzYzYjhjNjg3NTgxZDgiLCJ0eXAiOiJKV1QifQ.eyJnb29nbGUiOnRydWUsImlzcyI6Imh0dHBzOi8vc2VjdXJldG9rZW4uZ29vZ2xlLmNvbS9jYXB0aW9ucy1mNmRlOSIsImF1ZCI6ImNhcHRpb25zLWY2ZGU5IiwiYXV0aF90aW1lIjoxNzU1MzYyODEzLCJ1c2VyX2lkIjoic3hWek5XaUYyempXYmUxTjNjd3UiLCJzdWIiOiJzeFZ6TldpRjJ6aldiZTFOM2N3dSIsImlhdCI6MTc1NTM2MjgxMywiZXhwIjoxNzU1MzY2NDEzLCJmaXJlYmFzZSI6eyJpZGVudGl0aWVzIjp7fSwic2lnbl9pbl9wcm92aWRlciI6ImN1c3RvbSJ9fQ.jGuhWp-w8jlGy8xmMjqOyig_LVcr53udFgMjrQTJtKtE_J_iVkvMLncO2TnJ2BquoEp9pwVlZIG-imlFe6Uhtz95-t1oHENf5yzUWu3HocFsNVeAZh9avi_iObSYM_pFOT9lwRNzk1oMa6LbwViuVgTXvHDse9T4_nDfmCBbWngWksh1_JGtnrK2qPb5YD8Hr26itDRMx8mzUr2cQqtU9mU0R910CROqsNaQ9ovemeGe-2RT-hZku4VVYAMDOdvcFsgcf_BJTLRikmc3T7Ekx8T0KM6ZpTgr34wtnl7rpDBNOX0cOSYu3NEUDBnhNJKmPl5qL08gcYEur1ijP2mcTA")
26
 
27
  # Model mappings from OpenAI model names to Captions model IDs
 
44
  "stable-diffusion": "stable-diffusion-3-5-large"
45
  }
46
 
47
+ # TTS Voice mappings from OpenAI voice names to Captions voice IDs
48
+ VOICE_MAPPINGS = {
49
+ "alloy": "0s0tckZNA4EDjsNWIGpn", # Brandon (OpenAI)
50
+ "echo": "VfJEoIjcuedwbnVocfwS", # John (OpenAI)
51
+ "fable": "aIJGQIEdPBlV4bWoLgiC", # Jordan (OpenAI)
52
+ "onyx": "NkxXZNRZuGVagP3gLTlk", # James (OpenAI)
53
+ "nova": "dEcutGbESImg8uIOJOb3", # Julie (OpenAI)
54
+ "shimmer": "OsLeLksKZUcYFR6Rj3AV", # Lea (OpenAI)
55
+ # Additional popular voices
56
+ "brandon": "0s0tckZNA4EDjsNWIGpn",
57
+ "nicole": "2OMmjuvizlUUkgCLYrEU",
58
+ "jamal": "4VCohb9n7kc8qQAMbC9T",
59
+ "xavier": "6LVJ04FKnALQY4vuI3xi",
60
+ "emma": "7pjl1PlCtijY5E7k9nex",
61
+ "alexandra": "8OwpkBz4OXvyOgg6uSVM",
62
+ "josh": "9H5PLh8sHyc4NiQba2sO",
63
+ "vincent": "A6YwaBVPdqMuPU5guI31",
64
+ "bella": "DVkGI1gOEQwhI9D98kgV",
65
+ "sophia": "Dw4Y69nCUd0lijzanffn",
66
+ "ethan": "FNrD9UXPRmnlfELyZfOH",
67
+ "greg": "GFvARbVuizGj4jkdG1iN",
68
+ "isabella": "GNliQ6gOp8Y96hz0uPSY",
69
+ "mason": "Jc5LFEs9ONmW3vilHdpg",
70
+ "justin": "LWoskltOczE5nVUCPFCl",
71
+ "bradford": "Lvu57Tdi6WU0LrCkf3W0",
72
+ "ally": "NJSANg1RFfytiL3apSc0",
73
+ "maddy": "NX9RZUSep3h9RzDoipkJ",
74
+ "george": "NmypOAkKcWovPSbjMJPk",
75
+ "brian": "Pt04qYLGmK9HateRrrdh",
76
+ "taylor": "QQ0vIwK2AgVtbHZk3wYq",
77
+ "samara": "QyFFVFY5hzA5T7sVv9JI",
78
+ "linda": "RzrSQgnXwblMgDyOeOuy",
79
+ "liam": "SveSw38zJT860NRIeiVk",
80
+ "hope": "UfOKaDAlzOMjZnyEhPH1",
81
+ "william": "VesROIDY8lJS6zz8xTRb",
82
+ "dwight": "W76fVeloaQcuN71bIQF6",
83
+ "lisa": "ZbuIjlIzHpIc8oO17kWW",
84
+ "arial": "aCWKe1NzicFCAkohj7TY",
85
+ "elliot": "arGkfQC5Z0yNlNrYLlE8",
86
+ "rhea": "blo9kiIBaFNr0UCI2gpA",
87
+ "leo": "bqvJyFf80waIYPYiv6zX",
88
+ "eve": "cQ0q3hcj9Bm4IccGDY9C",
89
+ "serena": "e3zFWWHHfNk6vOh5kbBX",
90
+ "domi": "eSojoW8lMv5whHRCJugk",
91
+ "alex": "eXjri1H442qcs35pWaTr",
92
+ "blondie": "fHmK4z2cR0VXxvQmd7ei",
93
+ "nathan": "gO0Do5f1lCvLoIvbl6dx",
94
+ "daniel": "grqhFog58KWjgcO6t4ya",
95
+ "tara": "iBsjG6Kk8tmO0ldX7Aho",
96
+ "maya": "iWBJcyi2qdFpXYRGt42f",
97
+ "ashley": "j51tO8Upz9wEVIUkynCJ",
98
+ "matthew": "lJQLBnDNpkkc4RIgqhIZ",
99
+ "andrew": "lQS5Hszd1P0W2m18M4ME",
100
+ "olivia": "ltYBSrCwVJp0I99DmLfq",
101
+ "adam": "m1t6JeyI9DXRhnCg8kuX",
102
+ "mark": "okc8JAt7Vb3u20k4soKB",
103
+ "micah": "r0ZdS6QBWDxmcRN7HxWq",
104
+ "elli": "r4gww888sYU82aKZSUHy",
105
+ "sylvia": "rJmVxgRa6YI9bALBqvtC",
106
+ "noah": "rgqCbvqWKIaxYs54d7xS",
107
+ "kayla": "s1YBw3dmanbLNCq7MXI8",
108
+ "carla": "sUXCiUMyEVHBC7sRlPZY",
109
+ "owen": "tijk10imWq7nGRawDD62",
110
+ "lila": "wjOnivHr3V1ZGNuCMZJI",
111
+ "sam": "xpkvvHUyS37s3f84MObW",
112
+ "antoni": "y5nGwtfzvQ2OhrBXZnj5",
113
+ "ava": "zYqKDc8tFTIsAhJFpTaC"
114
+ }
115
+
116
+ # Available voices information
117
+ AVAILABLE_VOICES = {
118
+ "0s0tckZNA4EDjsNWIGpn": {"name": "Brandon", "gender": "male", "accent": "american", "provider": "OpenAI"},
119
+ "2OMmjuvizlUUkgCLYrEU": {"name": "Nicole", "gender": "female", "accent": "australian", "provider": "Cartesia"},
120
+ "4VCohb9n7kc8qQAMbC9T": {"name": "Jamal", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
121
+ "6LVJ04FKnALQY4vuI3xi": {"name": "Xavier", "gender": "male", "accent": "american", "provider": "PlayHT"},
122
+ "7pjl1PlCtijY5E7k9nex": {"name": "Emma", "gender": "female", "accent": "american", "provider": "Google"},
123
+ "8OwpkBz4OXvyOgg6uSVM": {"name": "Alexandra", "gender": "female", "accent": "american", "provider": "ElevenLabs"},
124
+ "9H5PLh8sHyc4NiQba2sO": {"name": "Josh", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
125
+ "A6YwaBVPdqMuPU5guI31": {"name": "Vincent", "gender": "male", "accent": "american", "provider": "PlayHT"},
126
+ "DVkGI1gOEQwhI9D98kgV": {"name": "Bella", "gender": "female", "accent": "american", "provider": "ElevenLabs"},
127
+ "Dw4Y69nCUd0lijzanffn": {"name": "Sophia", "gender": "female", "accent": "american", "provider": "ElevenLabs"},
128
+ "FNrD9UXPRmnlfELyZfOH": {"name": "Ethan", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
129
+ "GFvARbVuizGj4jkdG1iN": {"name": "Greg", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
130
+ "GNliQ6gOp8Y96hz0uPSY": {"name": "Isabella", "gender": "female", "accent": "american", "provider": "Google"},
131
+ "Jc5LFEs9ONmW3vilHdpg": {"name": "Mason", "gender": "male", "accent": "american", "provider": "Google"},
132
+ "LWoskltOczE5nVUCPFCl": {"name": "Justin", "gender": "male", "accent": "american", "provider": "Cartesia"},
133
+ "Lvu57Tdi6WU0LrCkf3W0": {"name": "Bradford", "gender": "male", "accent": "british", "provider": "ElevenLabs"},
134
+ "NJSANg1RFfytiL3apSc0": {"name": "Ally", "gender": "female", "accent": "american", "provider": "PlayHT"},
135
+ "NX9RZUSep3h9RzDoipkJ": {"name": "Maddy", "gender": "female", "accent": "american", "provider": "PlayHT"},
136
+ "NkxXZNRZuGVagP3gLTlk": {"name": "James", "gender": "male", "accent": "british", "provider": "OpenAI"},
137
+ "NmypOAkKcWovPSbjMJPk": {"name": "George", "gender": "male", "accent": "british", "provider": "Cartesia"},
138
+ "OsLeLksKZUcYFR6Rj3AV": {"name": "Lea", "gender": "female", "accent": "american", "provider": "OpenAI"},
139
+ "Pt04qYLGmK9HateRrrdh": {"name": "Brian", "gender": "male", "accent": "american", "provider": "Cartesia"},
140
+ "QQ0vIwK2AgVtbHZk3wYq": {"name": "Taylor", "gender": "female", "accent": "british", "provider": "ElevenLabs"},
141
+ "QyFFVFY5hzA5T7sVv9JI": {"name": "Samara", "gender": "female", "accent": "british", "provider": "ElevenLabs"},
142
+ "RzrSQgnXwblMgDyOeOuy": {"name": "Linda", "gender": "female", "accent": "british", "provider": "PlayHT"},
143
+ "SveSw38zJT860NRIeiVk": {"name": "Liam", "gender": "male", "accent": "american", "provider": "Google"},
144
+ "UfOKaDAlzOMjZnyEhPH1": {"name": "Hope", "gender": "female", "accent": "american", "provider": "ElevenLabs"},
145
+ "VesROIDY8lJS6zz8xTRb": {"name": "William", "gender": "male", "accent": "american", "provider": "Google"},
146
+ "VfJEoIjcuedwbnVocfwS": {"name": "John", "gender": "male", "accent": "american", "provider": "OpenAI"},
147
+ "W76fVeloaQcuN71bIQF6": {"name": "Dwight", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
148
+ "ZbuIjlIzHpIc8oO17kWW": {"name": "Lisa", "gender": "female", "accent": "american", "provider": "PlayHT"},
149
+ "aCWKe1NzicFCAkohj7TY": {"name": "Arial", "gender": "female", "accent": "american", "provider": "Cartesia"},
150
+ "aIJGQIEdPBlV4bWoLgiC": {"name": "Jordan", "gender": "male", "accent": "american", "provider": "OpenAI"},
151
+ "arGkfQC5Z0yNlNrYLlE8": {"name": "Elliot", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
152
+ "blo9kiIBaFNr0UCI2gpA": {"name": "Rhea", "gender": "female", "accent": "australian", "provider": "PlayHT"},
153
+ "bqvJyFf80waIYPYiv6zX": {"name": "Leo", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
154
+ "cQ0q3hcj9Bm4IccGDY9C": {"name": "Eve", "gender": "female", "accent": "american", "provider": "ElevenLabs"},
155
+ "dEcutGbESImg8uIOJOb3": {"name": "Julie", "gender": "female", "accent": "american", "provider": "OpenAI"},
156
+ "e3zFWWHHfNk6vOh5kbBX": {"name": "Serena", "gender": "female", "accent": "american", "provider": "ElevenLabs"},
157
+ "eSojoW8lMv5whHRCJugk": {"name": "Domi", "gender": "female", "accent": "american", "provider": "ElevenLabs"},
158
+ "eXjri1H442qcs35pWaTr": {"name": "Alex", "gender": "female", "accent": "american", "provider": "ElevenLabs"},
159
+ "fHmK4z2cR0VXxvQmd7ei": {"name": "Blondie", "gender": "female", "accent": "british", "provider": "ElevenLabs"},
160
+ "gO0Do5f1lCvLoIvbl6dx": {"name": "Nathan", "gender": "male", "accent": "british", "provider": "PlayHT"},
161
+ "grqhFog58KWjgcO6t4ya": {"name": "Daniel", "gender": "male", "accent": "american", "provider": "PlayHT"},
162
+ "iBsjG6Kk8tmO0ldX7Aho": {"name": "Tara", "gender": "female", "accent": "american", "provider": "Cartesia"},
163
+ "iWBJcyi2qdFpXYRGt42f": {"name": "Maya", "gender": "female", "accent": "american", "provider": "Cartesia"},
164
+ "j51tO8Upz9wEVIUkynCJ": {"name": "Ashley", "gender": "female", "accent": "american", "provider": "OpenAI"},
165
+ "lJQLBnDNpkkc4RIgqhIZ": {"name": "Matthew", "gender": "male", "accent": "australian", "provider": "Cartesia"},
166
+ "lQS5Hszd1P0W2m18M4ME": {"name": "Andrew", "gender": "male", "accent": "american", "provider": "Cartesia"},
167
+ "ltYBSrCwVJp0I99DmLfq": {"name": "Olivia", "gender": "female", "accent": "american", "provider": "Google"},
168
+ "m1t6JeyI9DXRhnCg8kuX": {"name": "Adam", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
169
+ "okc8JAt7Vb3u20k4soKB": {"name": "Mark", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
170
+ "r0ZdS6QBWDxmcRN7HxWq": {"name": "Micah", "gender": "male", "accent": "british", "provider": "ElevenLabs"},
171
+ "r4gww888sYU82aKZSUHy": {"name": "Elli", "gender": "female", "accent": "american", "provider": "ElevenLabs"},
172
+ "rJmVxgRa6YI9bALBqvtC": {"name": "Sylvia", "gender": "female", "accent": "american", "provider": "OpenAI"},
173
+ "rgqCbvqWKIaxYs54d7xS": {"name": "Noah", "gender": "male", "accent": "australian", "provider": "ElevenLabs"},
174
+ "s1YBw3dmanbLNCq7MXI8": {"name": "Kayla", "gender": "female", "accent": "american", "provider": "OpenAI"},
175
+ "sUXCiUMyEVHBC7sRlPZY": {"name": "Carla", "gender": "female", "accent": "american", "provider": "Cartesia"},
176
+ "tijk10imWq7nGRawDD62": {"name": "Owen", "gender": "male", "accent": "american", "provider": "Google"},
177
+ "wjOnivHr3V1ZGNuCMZJI": {"name": "Lila", "gender": "female", "accent": "american", "provider": "ElevenLabs"},
178
+ "xpkvvHUyS37s3f84MObW": {"name": "Sam", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
179
+ "y5nGwtfzvQ2OhrBXZnj5": {"name": "Antoni", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
180
+ "zYqKDc8tFTIsAhJFpTaC": {"name": "Ava", "gender": "female", "accent": "american", "provider": "Google"}
181
+ }
182
+
183
  # Available models information
184
  AVAILABLE_MODELS = {
185
  "google-imagen-3": {"name": "Imagen 3", "provider": "Google"},
 
203
  style: Optional[Literal["vivid", "natural"]] = Field("vivid", description="Style of the generated images")
204
  user: Optional[str] = Field(None, description="A unique identifier representing your end-user")
205
 
206
# TTS request models
class TTSRequest(BaseModel):
    """OpenAI-compatible request body for POST /v1/audio/speech."""

    model: str = Field("tts-1", description="The TTS model to use")
    input: str = Field(..., description="The text to generate audio for")
    voice: str = Field("alloy", description="The voice to use for generation")
    # Fixed garbled description ("The format to audio in"). NOTE(review): the
    # handlers use this only for the download filename; it is not forwarded to
    # the backend.
    response_format: Optional[Literal["mp3", "opus", "aac", "flac"]] = Field("mp3", description="The format to return audio in")
    # NOTE(review): validated (0.25-4.0) but not currently forwarded to the
    # backend by the speech endpoints.
    speed: Optional[float] = Field(1.0, ge=0.25, le=4.0, description="The speed of the generated audio")
213
+
214
  # OpenAI-compatible response models
215
  class ImageData(BaseModel):
216
  url: Optional[str] = None
 
232
  class CaptionsStatusRequest(BaseModel):
233
  operationId: str
234
 
235
# TTS models for Captions API
class CaptionsTTSSubmitRequest(BaseModel):
    """Payload sent to the Captions TTS ``/generate/submit`` endpoint."""

    # Text to synthesize.
    text: str
    # Captions voice identifier; default is Jamal.
    voiceId: str = "4VCohb9n7kc8qQAMbC9T"
    # Captions TTS model identifier; default TTS model.
    modelId: str = "QHwZJt6xARgiV04YqEFY"
    # Client-generated project id attached to the submission.
    optimisticProjectId: str


class CaptionsTTSStatusRequest(BaseModel):
    """Payload for polling a Captions TTS operation by its id."""

    operationId: str
244
+
245
  # In-memory storage for operation tracking (use Redis in production)
246
  operations_store = {}
247
 
 
260
  }
261
  return size_map.get(size, 1)
262
 
263
def get_captions_voice_id(openai_voice: str) -> str:
    """Map an OpenAI voice name to the matching Captions voice ID.

    Lookup is case-insensitive; unrecognized names fall back to Brandon.
    """
    fallback = "0s0tckZNA4EDjsNWIGpn"  # Brandon
    return VOICE_MAPPINGS.get(openai_voice.lower(), fallback)
266
+
267
  async def submit_image_generation(prompt: str, model: str = "dall-e-3", size: str = "1024x1024") -> str:
268
  """Submit image generation request to Captions API"""
269
  headers = {
 
580
  logger.error(f"Error checking generation status: {e}")
581
  raise HTTPException(status_code=500, detail="Failed to check generation status")
582
 
583
# TTS Endpoints
@app.post("/v1/audio/speech")
async def create_speech(request: TTSRequest):
    """
    Generate speech from text using OpenAI-compatible API.

    Submits the text to the Captions TTS backend, polls the operation until it
    completes, then streams the resulting audio back to the caller.

    Raises:
        HTTPException: backend status code on submit failure, 500 on
            generation/fetch failure, 408 when polling times out.

    NOTE(review): ``request.response_format`` and ``request.speed`` are not
    forwarded to the backend; the format only affects the download filename.
    """
    try:
        # Convert OpenAI voice to Captions voice ID
        voice_id = get_captions_voice_id(request.voice)

        # Prepare the request for Captions API
        captions_request = CaptionsTTSSubmitRequest(
            text=request.input,
            voiceId=voice_id,
            modelId="QHwZJt6xARgiV04YqEFY",  # Default TTS model
            optimisticProjectId=f"tts-{uuid.uuid4().hex[:8]}"
        )

        # All Captions calls share the same auth/identification headers
        # (previously duplicated inline three times).
        captions_headers = {
            "Authorization": f"Bearer {BEARER_TOKEN}",
            "Content-Type": "application/json",
            "x-app-version": "1.0.0",
            "x-device-id": "api-client"
        }

        async with httpx.AsyncClient() as client:
            # Submit TTS generation request
            response = await client.post(
                f"{CAPTIONS_TTS_BASE_URL}/generate/submit",
                json=captions_request.dict(),
                headers=captions_headers,
                timeout=30.0
            )

            if response.status_code != 200:
                logger.error(f"TTS submit failed: {response.text}")
                raise HTTPException(status_code=response.status_code, detail="TTS generation failed")

            result = response.json()
            operation_id = result["data"]["operationId"]

            # Track the operation. "status" is now recorded here too, keeping
            # the entry shape consistent with the async endpoint so the
            # status/download endpoints work for sync-initiated operations.
            operations_store[operation_id] = {
                "type": "tts",
                "voice_id": voice_id,
                "text": request.input,
                "format": request.response_format,
                "created_at": datetime.now(),
                "status": "processing"
            }

            # Poll for completion. Each iteration sleeps ~1s, but the status
            # call itself may take up to 30s, so worst-case wall time can
            # exceed max_retries seconds.
            max_retries = 60
            retry_count = 0

            while retry_count < max_retries:
                status_response = await client.post(
                    f"{CAPTIONS_TTS_BASE_URL}/generate/status",
                    json={"operationId": operation_id},
                    headers=captions_headers,
                    timeout=30.0
                )

                if status_response.status_code != 200:
                    # Transient status failure: back off and retry.
                    await asyncio.sleep(1)
                    retry_count += 1
                    continue

                status_result = status_response.json()
                state = status_result["data"]["state"]

                if state == "COMPLETE":
                    operations_store[operation_id]["status"] = "completed"
                    audio_url = status_result["data"]["url"]
                    operations_store[operation_id]["url"] = audio_url

                    # Fetch the audio file and return it directly.
                    audio_response = await client.get(audio_url)
                    if audio_response.status_code != 200:
                        raise HTTPException(status_code=500, detail="Failed to fetch generated audio")
                    return StreamingResponse(
                        iter([audio_response.content]),
                        media_type="audio/mpeg",
                        headers={
                            "Content-Disposition": f"attachment; filename=speech.{request.response_format}"
                        }
                    )

                if state == "FAILED":
                    operations_store[operation_id]["status"] = "failed"
                    raise HTTPException(status_code=500, detail="TTS generation failed")

                # Still processing, wait and retry
                await asyncio.sleep(1)
                retry_count += 1

            # Timeout
            raise HTTPException(status_code=408, detail="TTS generation timed out")

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error in TTS generation: {e}")
        raise HTTPException(status_code=500, detail="Internal server error")
688
+
689
@app.post("/v1/audio/speech/async")
async def create_speech_async(request: TTSRequest, background_tasks: BackgroundTasks):
    """
    Start async TTS generation and return operation ID.

    Unlike ``/v1/audio/speech`` this does not wait for completion; callers
    poll ``/v1/audio/speech/status/{operation_id}`` themselves.
    """
    try:
        captions_voice = get_captions_voice_id(request.voice)

        # Build the Captions submit payload.
        submit_payload = CaptionsTTSSubmitRequest(
            text=request.input,
            voiceId=captions_voice,
            modelId="QHwZJt6xARgiV04YqEFY",  # Default TTS model
            optimisticProjectId=f"tts-{uuid.uuid4().hex[:8]}"
        )

        auth_headers = {
            "Authorization": f"Bearer {BEARER_TOKEN}",
            "Content-Type": "application/json",
            "x-app-version": "1.0.0",
            "x-device-id": "api-client"
        }

        async with httpx.AsyncClient() as client:
            submit_response = await client.post(
                f"{CAPTIONS_TTS_BASE_URL}/generate/submit",
                json=submit_payload.dict(),
                headers=auth_headers,
                timeout=30.0
            )

            if submit_response.status_code != 200:
                logger.error(f"TTS submit failed: {submit_response.text}")
                raise HTTPException(status_code=submit_response.status_code, detail="TTS generation failed")

            operation_id = submit_response.json()["data"]["operationId"]

            # Record the pending operation for the status/download endpoints.
            operations_store[operation_id] = {
                "type": "tts",
                "voice_id": captions_voice,
                "text": request.input,
                "format": request.response_format,
                "created_at": datetime.now(),
                "status": "processing"
            }

            return {"operation_id": operation_id, "status": "processing"}

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error in async TTS generation: {e}")
        raise HTTPException(status_code=500, detail="Internal server error")
744
+
745
@app.get("/v1/audio/speech/status/{operation_id}")
async def get_tts_status(operation_id: str):
    """
    Check the status of a TTS generation operation.

    Returns a dict whose ``status`` is ``completed`` (with the audio ``url``),
    ``failed``, ``processing``, or ``error`` when the backend poll itself fails.
    """
    # Guard clauses: the operation must exist and must be a TTS operation.
    if operation_id not in operations_store:
        raise HTTPException(status_code=404, detail="Operation not found")

    operation = operations_store[operation_id]
    if operation["type"] != "tts":
        raise HTTPException(status_code=400, detail="Invalid operation type")

    try:
        async with httpx.AsyncClient() as client:
            poll_response = await client.post(
                f"{CAPTIONS_TTS_BASE_URL}/generate/status",
                json={"operationId": operation_id},
                headers={
                    "Authorization": f"Bearer {BEARER_TOKEN}",
                    "Content-Type": "application/json",
                    "x-app-version": "1.0.0",
                    "x-device-id": "api-client"
                },
                timeout=30.0
            )

            if poll_response.status_code != 200:
                return {"status": "error", "error": "Failed to check status"}

            payload = poll_response.json()
            state = payload["data"]["state"]

            if state == "COMPLETE":
                audio_url = payload["data"]["url"]
                # Cache the result so the download endpoint can serve it.
                operation["status"] = "completed"
                operation["url"] = audio_url
                return {
                    "status": "completed",
                    "url": audio_url,
                    "operation_id": operation_id
                }

            if state == "FAILED":
                operation["status"] = "failed"
                return {"status": "failed", "operation_id": operation_id}

            operation["status"] = "processing"
            return {"status": "processing", "operation_id": operation_id}

    except Exception as e:
        logger.error(f"Error checking TTS status: {e}")
        raise HTTPException(status_code=500, detail="Failed to check TTS status")
796
+
797
@app.get("/v1/audio/speech/download/{operation_id}")
async def download_tts_audio(operation_id: str):
    """
    Download the generated audio file.

    The operation must have been reported ``completed`` by the status
    endpoint (which caches the backend's audio URL on the stored operation).

    Raises:
        HTTPException: 404 for unknown operations or a missing URL, 400 for
            non-TTS or not-yet-complete operations, 500 on fetch failure.
    """
    if operation_id not in operations_store:
        raise HTTPException(status_code=404, detail="Operation not found")

    operation = operations_store[operation_id]
    if operation["type"] != "tts":
        raise HTTPException(status_code=400, detail="Invalid operation type")

    if operation.get("status") != "completed":
        raise HTTPException(status_code=400, detail="Audio not ready yet")

    audio_url = operation.get("url")
    if not audio_url:
        raise HTTPException(status_code=404, detail="Audio URL not found")

    try:
        async with httpx.AsyncClient() as client:
            audio_response = await client.get(audio_url)
            if audio_response.status_code != 200:
                raise HTTPException(status_code=500, detail="Failed to fetch generated audio")

            # The requested format only controls the advertised filename; the
            # audio bytes are passed through from the backend unchanged.
            format_type = operation.get("format", "mp3")
            return StreamingResponse(
                iter([audio_response.content]),
                media_type="audio/mpeg",
                headers={
                    "Content-Disposition": f"attachment; filename=speech.{format_type}"
                }
            )

    except HTTPException:
        # Fix: re-raise intentional HTTP errors instead of collapsing them
        # into the generic 500 below (the sibling endpoints already do this).
        raise
    except Exception as e:
        logger.error(f"Error downloading TTS audio: {e}")
        raise HTTPException(status_code=500, detail="Failed to download audio")
834
+
835
@app.get("/v1/voices")
async def list_voices():
    """
    List available TTS voices.

    Each entry carries the Captions voice id, its display metadata, and the
    first OpenAI-style alias (if any) that maps to it.
    """
    # Invert VOICE_MAPPINGS once; setdefault keeps the FIRST alias per voice
    # id, matching the original first-match scan order.
    alias_by_id = {}
    for alias, mapped_id in VOICE_MAPPINGS.items():
        alias_by_id.setdefault(mapped_id, alias)

    voices = [
        {
            "id": voice_id,
            "name": info["name"],
            "openai_name": alias_by_id.get(voice_id),
            "gender": info["gender"],
            "accent": info["accent"],
            "provider": info["provider"]
        }
        for voice_id, info in AVAILABLE_VOICES.items()
    ]

    return {
        "voices": voices,
        "openai_compatible": ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
    }
862
+
863
  @app.get("/health")
864
  async def health_check():
865
  """Health check endpoint"""
 
869
async def root():
    """Root endpoint with API information"""
    # Fix: the original built this via list(set([...])), which produced a
    # nondeterministic ordering; a plain filtered list keeps the same members
    # in stable VOICE_MAPPINGS order.
    openai_voice_aliases = [
        k for k in VOICE_MAPPINGS
        if k in ("alloy", "echo", "fable", "onyx", "nova", "shimmer")
    ]
    return {
        "message": "OpenAI Compatible Image Generation & TTS API",
        "version": "1.0.0",
        "supported_models": list(AVAILABLE_MODELS.keys()),
        "openai_aliases": list(MODEL_MAPPINGS.keys()),
        "supported_voices": len(AVAILABLE_VOICES),
        "openai_voice_aliases": openai_voice_aliases,
        "endpoints": {
            "models": "/v1/models",
            "voices": "/v1/voices",
            "image_generation": "/v1/images/generations",
            "async_generation": "/v1/images/generations/async",
            "status_check": "/v1/images/generations/status/{operation_id}",
            "tts": "/v1/audio/speech",
            "tts_async": "/v1/audio/speech/async",
            "tts_status": "/v1/audio/speech/status/{operation_id}",
            "tts_download": "/v1/audio/speech/download/{operation_id}",
            "health": "/health",
            "docs": "/docs"
        },
        "example_curl": {
            "generate_image": "curl -X POST 'http://localhost:8000/v1/images/generations' -H 'Content-Type: application/json' -d '{\"prompt\": \"a cat\", \"model\": \"dall-e-3\", \"size\": \"1024x1024\"}'",
            "list_models": "curl -X GET 'http://localhost:8000/v1/models'",
            "generate_speech": "curl -X POST 'http://localhost:8000/v1/audio/speech' -H 'Content-Type: application/json' -d '{\"model\": \"tts-1\", \"input\": \"Hello world\", \"voice\": \"alloy\"}' --output speech.mp3",
            "list_voices": "curl -X GET 'http://localhost:8000/v1/voices'"
        }
    }
898