Samfy001 committed on
Commit
eb29d75
·
verified ·
1 Parent(s): f5a2aaa

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +453 -4
main.py CHANGED
@@ -4,6 +4,7 @@ import uuid
4
  from datetime import datetime
5
  from typing import Optional, List, Literal
6
  from fastapi import FastAPI, HTTPException, BackgroundTasks
 
7
  from pydantic import BaseModel, Field
8
  import logging
9
  import os
@@ -13,13 +14,14 @@ logging.basicConfig(level=logging.INFO)
13
  logger = logging.getLogger(__name__)
14
 
15
  app = FastAPI(
16
- title="OpenAI Compatible Image Generation API",
17
- description="OpenAI-compatible API for image generation using Captions backend",
18
  version="1.0.0"
19
  )
20
 
21
  # Configuration
22
  CAPTIONS_BASE_URL = "https://core.captions-web-api.xyz/proxy/v1/gen-ai/image"
 
23
  BEARER_TOKEN = os.getenv("CAPTIONS_BEARER_TOKEN", "eyJhbGciOiJSUzI1NiIsImtpZCI6IjU3YmZiMmExMWRkZmZjMGFkMmU2ODE0YzY4NzYzYjhjNjg3NTgxZDgiLCJ0eXAiOiJKV1QifQ.eyJnb29nbGUiOnRydWUsImlzcyI6Imh0dHBzOi8vc2VjdXJldG9rZW4uZ29vZ2xlLmNvbS9jYXB0aW9ucy1mNmRlOSIsImF1ZCI6ImNhcHRpb25zLWY2ZGU5IiwiYXV0aF90aW1lIjoxNzU1MzYyODEzLCJ1c2VyX2lkIjoic3hWek5XaUYyempXYmUxTjNjd3UiLCJzdWIiOiJzeFZ6TldpRjJ6aldiZTFOM2N3dSIsImlhdCI6MTc1NTM2MjgxMywiZXhwIjoxNzU1MzY2NDEzLCJmaXJlYmFzZSI6eyJpZGVudGl0aWVzIjp7fSwic2lnbl9pbl9wcm92aWRlciI6ImN1c3RvbSJ9fQ.jGuhWp-w8jlGy8xmMjqOyig_LVcr53udFgMjrQTJtKtE_J_iVkvMLncO2TnJ2BquoEp9pwVlZIG-imlFe6Uhtz95-t1oHENf5yzUWu3HocFsNVeAZh9avi_iObSYM_pFOT9lwRNzk1oMa6LbwViuVgTXvHDse9T4_nDfmCBbWngWksh1_JGtnrK2qPb5YD8Hr26itDRMx8mzUr2cQqtU9mU0R910CROqsNaQ9ovemeGe-2RT-hZku4VVYAMDOdvcFsgcf_BJTLRikmc3T7Ekx8T0KM6ZpTgr34wtnl7rpDBNOX0cOSYu3NEUDBnhNJKmPl5qL08gcYEur1ijP2mcTA")
24
 
25
  # Model mappings from OpenAI model names to Captions model IDs
@@ -42,6 +44,142 @@ MODEL_MAPPINGS = {
42
  "stable-diffusion": "stable-diffusion-3-5-large"
43
  }
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  # Available models information
46
  AVAILABLE_MODELS = {
47
  "google-imagen-3": {"name": "Imagen 3", "provider": "Google"},
@@ -65,6 +203,14 @@ class ImageGenerationRequest(BaseModel):
65
  style: Optional[Literal["vivid", "natural"]] = Field("vivid", description="Style of the generated images")
66
  user: Optional[str] = Field(None, description="A unique identifier representing your end-user")
67
 
 
 
 
 
 
 
 
 
68
  # OpenAI-compatible response models
69
  class ImageData(BaseModel):
70
  url: Optional[str] = None
@@ -86,6 +232,16 @@ class CaptionsSubmitRequest(BaseModel):
86
  class CaptionsStatusRequest(BaseModel):
87
  operationId: str
88
 
 
 
 
 
 
 
 
 
 
 
89
  # In-memory storage for operation tracking (use Redis in production)
90
  operations_store = {}
91
 
@@ -104,6 +260,10 @@ def get_aspect_ratio_from_size(size: str) -> int:
104
  }
105
  return size_map.get(size, 1)
106
 
 
 
 
 
107
  async def submit_image_generation(prompt: str, model: str = "dall-e-3", size: str = "1024x1024") -> str:
108
  """Submit image generation request to Captions API"""
109
  headers = {
@@ -420,6 +580,286 @@ async def get_generation_status(operation_id: str):
420
  logger.error(f"Error checking generation status: {e}")
421
  raise HTTPException(status_code=500, detail="Failed to check generation status")
422
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
423
  @app.get("/health")
424
  async def health_check():
425
  """Health check endpoint"""
@@ -429,21 +869,30 @@ async def health_check():
429
  async def root():
430
  """Root endpoint with API information"""
431
  return {
432
- "message": "OpenAI Compatible Image Generation API",
433
  "version": "1.0.0",
434
  "supported_models": list(AVAILABLE_MODELS.keys()),
435
  "openai_aliases": list(MODEL_MAPPINGS.keys()),
 
 
436
  "endpoints": {
437
  "models": "/v1/models",
 
438
  "image_generation": "/v1/images/generations",
439
  "async_generation": "/v1/images/generations/async",
440
  "status_check": "/v1/images/generations/status/{operation_id}",
 
 
 
 
441
  "health": "/health",
442
  "docs": "/docs"
443
  },
444
  "example_curl": {
445
  "generate_image": "curl -X POST 'http://localhost:8000/v1/images/generations' -H 'Content-Type: application/json' -d '{\"prompt\": \"a cat\", \"model\": \"dall-e-3\", \"size\": \"1024x1024\"}'",
446
- "list_models": "curl -X GET 'http://localhost:8000/v1/models'"
 
 
447
  }
448
  }
449
 
 
4
  from datetime import datetime
5
  from typing import Optional, List, Literal
6
  from fastapi import FastAPI, HTTPException, BackgroundTasks
7
+ from fastapi.responses import StreamingResponse
8
  from pydantic import BaseModel, Field
9
  import logging
10
  import os
 
14
  logger = logging.getLogger(__name__)
15
 
16
# FastAPI application exposing OpenAI-compatible image-generation and
# text-to-speech endpoints backed by the Captions service.
app = FastAPI(
    title="OpenAI Compatible API - Images & TTS",
    description="OpenAI-compatible API for image generation and text-to-speech using Captions backend",
    version="1.0.0"
)
21
 
22
# Configuration
# SECURITY NOTE(review): a real-looking JWT is committed below as the env-var
# fallback. Rotate that token and remove the hardcoded default (fail fast when
# CAPTIONS_BEARER_TOKEN is unset) before this ships anywhere public.
CAPTIONS_BASE_URL = "https://core.captions-web-api.xyz/proxy/v1/gen-ai/image"
CAPTIONS_TTS_BASE_URL = "https://core.captions-web-api.xyz/proxy/v1/voiceover/tts"
BEARER_TOKEN = os.getenv("CAPTIONS_BEARER_TOKEN", "eyJhbGciOiJSUzI1NiIsImtpZCI6IjU3YmZiMmExMWRkZmZjMGFkMmU2ODE0YzY4NzYzYjhjNjg3NTgxZDgiLCJ0eXAiOiJKV1QifQ.eyJnb29nbGUiOnRydWUsImlzcyI6Imh0dHBzOi8vc2VjdXJldG9rZW4uZ29vZ2xlLmNvbS9jYXB0aW9ucy1mNmRlOSIsImF1ZCI6ImNhcHRpb25zLWY2ZGU5IiwiYXV0aF90aW1lIjoxNzU1MzYyODEzLCJ1c2VyX2lkIjoic3hWek5XaUYyempXYmUxTjNjd3UiLCJzdWIiOiJzeFZ6TldpRjJ6aldiZTFOM2N3dSIsImlhdCI6MTc1NTM2MjgxMywiZXhwIjoxNzU1MzY2NDEzLCJmaXJlYmFzZSI6eyJpZGVudGl0aWVzIjp7fSwic2lnbl9pbl9wcm92aWRlciI6ImN1c3RvbSJ9fQ.jGuhWp-w8jlGy8xmMjqOyig_LVcr53udFgMjrQTJtKtE_J_iVkvMLncO2TnJ2BquoEp9pwVlZIG-imlFe6Uhtz95-t1oHENf5yzUWu3HocFsNVeAZh9avi_iObSYM_pFOT9lwRNzk1oMa6LbwViuVgTXvHDse9T4_nDfmCBbWngWksh1_JGtnrK2qPb5YD8Hr26itDRMx8mzUr2cQqtU9mU0R910CROqsNaQ9ovemeGe-2RT-hZku4VVYAMDOdvcFsgcf_BJTLRikmc3T7Ekx8T0KM6ZpTgr34wtnl7rpDBNOX0cOSYu3NEUDBnhNJKmPl5qL08gcYEur1ijP2mcTA")
26
 
27
  # Model mappings from OpenAI model names to Captions model IDs
 
44
  "stable-diffusion": "stable-diffusion-3-5-large"
45
  }
46
 
47
+ # TTS Voice mappings from OpenAI voice names to Captions voice IDs
48
+ VOICE_MAPPINGS = {
49
+ "alloy": "0s0tckZNA4EDjsNWIGpn", # Brandon (OpenAI)
50
+ "echo": "VfJEoIjcuedwbnVocfwS", # John (OpenAI)
51
+ "fable": "aIJGQIEdPBlV4bWoLgiC", # Jordan (OpenAI)
52
+ "onyx": "NkxXZNRZuGVagP3gLTlk", # James (OpenAI)
53
+ "nova": "dEcutGbESImg8uIOJOb3", # Julie (OpenAI)
54
+ "shimmer": "OsLeLksKZUcYFR6Rj3AV", # Lea (OpenAI)
55
+ # Additional popular voices
56
+ "brandon": "0s0tckZNA4EDjsNWIGpn",
57
+ "nicole": "2OMmjuvizlUUkgCLYrEU",
58
+ "jamal": "4VCohb9n7kc8qQAMbC9T",
59
+ "xavier": "6LVJ04FKnALQY4vuI3xi",
60
+ "emma": "7pjl1PlCtijY5E7k9nex",
61
+ "alexandra": "8OwpkBz4OXvyOgg6uSVM",
62
+ "josh": "9H5PLh8sHyc4NiQba2sO",
63
+ "vincent": "A6YwaBVPdqMuPU5guI31",
64
+ "bella": "DVkGI1gOEQwhI9D98kgV",
65
+ "sophia": "Dw4Y69nCUd0lijzanffn",
66
+ "ethan": "FNrD9UXPRmnlfELyZfOH",
67
+ "greg": "GFvARbVuizGj4jkdG1iN",
68
+ "isabella": "GNliQ6gOp8Y96hz0uPSY",
69
+ "mason": "Jc5LFEs9ONmW3vilHdpg",
70
+ "justin": "LWoskltOczE5nVUCPFCl",
71
+ "bradford": "Lvu57Tdi6WU0LrCkf3W0",
72
+ "ally": "NJSANg1RFfytiL3apSc0",
73
+ "maddy": "NX9RZUSep3h9RzDoipkJ",
74
+ "george": "NmypOAkKcWovPSbjMJPk",
75
+ "brian": "Pt04qYLGmK9HateRrrdh",
76
+ "taylor": "QQ0vIwK2AgVtbHZk3wYq",
77
+ "samara": "QyFFVFY5hzA5T7sVv9JI",
78
+ "linda": "RzrSQgnXwblMgDyOeOuy",
79
+ "liam": "SveSw38zJT860NRIeiVk",
80
+ "hope": "UfOKaDAlzOMjZnyEhPH1",
81
+ "william": "VesROIDY8lJS6zz8xTRb",
82
+ "dwight": "W76fVeloaQcuN71bIQF6",
83
+ "lisa": "ZbuIjlIzHpIc8oO17kWW",
84
+ "arial": "aCWKe1NzicFCAkohj7TY",
85
+ "elliot": "arGkfQC5Z0yNlNrYLlE8",
86
+ "rhea": "blo9kiIBaFNr0UCI2gpA",
87
+ "leo": "bqvJyFf80waIYPYiv6zX",
88
+ "eve": "cQ0q3hcj9Bm4IccGDY9C",
89
+ "serena": "e3zFWWHHfNk6vOh5kbBX",
90
+ "domi": "eSojoW8lMv5whHRCJugk",
91
+ "alex": "eXjri1H442qcs35pWaTr",
92
+ "blondie": "fHmK4z2cR0VXxvQmd7ei",
93
+ "nathan": "gO0Do5f1lCvLoIvbl6dx",
94
+ "daniel": "grqhFog58KWjgcO6t4ya",
95
+ "tara": "iBsjG6Kk8tmO0ldX7Aho",
96
+ "maya": "iWBJcyi2qdFpXYRGt42f",
97
+ "ashley": "j51tO8Upz9wEVIUkynCJ",
98
+ "matthew": "lJQLBnDNpkkc4RIgqhIZ",
99
+ "andrew": "lQS5Hszd1P0W2m18M4ME",
100
+ "olivia": "ltYBSrCwVJp0I99DmLfq",
101
+ "adam": "m1t6JeyI9DXRhnCg8kuX",
102
+ "mark": "okc8JAt7Vb3u20k4soKB",
103
+ "micah": "r0ZdS6QBWDxmcRN7HxWq",
104
+ "elli": "r4gww888sYU82aKZSUHy",
105
+ "sylvia": "rJmVxgRa6YI9bALBqvtC",
106
+ "noah": "rgqCbvqWKIaxYs54d7xS",
107
+ "kayla": "s1YBw3dmanbLNCq7MXI8",
108
+ "carla": "sUXCiUMyEVHBC7sRlPZY",
109
+ "owen": "tijk10imWq7nGRawDD62",
110
+ "lila": "wjOnivHr3V1ZGNuCMZJI",
111
+ "sam": "xpkvvHUyS37s3f84MObW",
112
+ "antoni": "y5nGwtfzvQ2OhrBXZnj5",
113
+ "ava": "zYqKDc8tFTIsAhJFpTaC"
114
+ }
115
+
116
+ # Available voices information
117
+ AVAILABLE_VOICES = {
118
+ "0s0tckZNA4EDjsNWIGpn": {"name": "Brandon", "gender": "male", "accent": "american", "provider": "OpenAI"},
119
+ "2OMmjuvizlUUkgCLYrEU": {"name": "Nicole", "gender": "female", "accent": "australian", "provider": "Cartesia"},
120
+ "4VCohb9n7kc8qQAMbC9T": {"name": "Jamal", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
121
+ "6LVJ04FKnALQY4vuI3xi": {"name": "Xavier", "gender": "male", "accent": "american", "provider": "PlayHT"},
122
+ "7pjl1PlCtijY5E7k9nex": {"name": "Emma", "gender": "female", "accent": "american", "provider": "Google"},
123
+ "8OwpkBz4OXvyOgg6uSVM": {"name": "Alexandra", "gender": "female", "accent": "american", "provider": "ElevenLabs"},
124
+ "9H5PLh8sHyc4NiQba2sO": {"name": "Josh", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
125
+ "A6YwaBVPdqMuPU5guI31": {"name": "Vincent", "gender": "male", "accent": "american", "provider": "PlayHT"},
126
+ "DVkGI1gOEQwhI9D98kgV": {"name": "Bella", "gender": "female", "accent": "american", "provider": "ElevenLabs"},
127
+ "Dw4Y69nCUd0lijzanffn": {"name": "Sophia", "gender": "female", "accent": "american", "provider": "ElevenLabs"},
128
+ "FNrD9UXPRmnlfELyZfOH": {"name": "Ethan", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
129
+ "GFvARbVuizGj4jkdG1iN": {"name": "Greg", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
130
+ "GNliQ6gOp8Y96hz0uPSY": {"name": "Isabella", "gender": "female", "accent": "american", "provider": "Google"},
131
+ "Jc5LFEs9ONmW3vilHdpg": {"name": "Mason", "gender": "male", "accent": "american", "provider": "Google"},
132
+ "LWoskltOczE5nVUCPFCl": {"name": "Justin", "gender": "male", "accent": "american", "provider": "Cartesia"},
133
+ "Lvu57Tdi6WU0LrCkf3W0": {"name": "Bradford", "gender": "male", "accent": "british", "provider": "ElevenLabs"},
134
+ "NJSANg1RFfytiL3apSc0": {"name": "Ally", "gender": "female", "accent": "american", "provider": "PlayHT"},
135
+ "NX9RZUSep3h9RzDoipkJ": {"name": "Maddy", "gender": "female", "accent": "american", "provider": "PlayHT"},
136
+ "NkxXZNRZuGVagP3gLTlk": {"name": "James", "gender": "male", "accent": "british", "provider": "OpenAI"},
137
+ "NmypOAkKcWovPSbjMJPk": {"name": "George", "gender": "male", "accent": "british", "provider": "Cartesia"},
138
+ "OsLeLksKZUcYFR6Rj3AV": {"name": "Lea", "gender": "female", "accent": "american", "provider": "OpenAI"},
139
+ "Pt04qYLGmK9HateRrrdh": {"name": "Brian", "gender": "male", "accent": "american", "provider": "Cartesia"},
140
+ "QQ0vIwK2AgVtbHZk3wYq": {"name": "Taylor", "gender": "female", "accent": "british", "provider": "ElevenLabs"},
141
+ "QyFFVFY5hzA5T7sVv9JI": {"name": "Samara", "gender": "female", "accent": "british", "provider": "ElevenLabs"},
142
+ "RzrSQgnXwblMgDyOeOuy": {"name": "Linda", "gender": "female", "accent": "british", "provider": "PlayHT"},
143
+ "SveSw38zJT860NRIeiVk": {"name": "Liam", "gender": "male", "accent": "american", "provider": "Google"},
144
+ "UfOKaDAlzOMjZnyEhPH1": {"name": "Hope", "gender": "female", "accent": "american", "provider": "ElevenLabs"},
145
+ "VesROIDY8lJS6zz8xTRb": {"name": "William", "gender": "male", "accent": "american", "provider": "Google"},
146
+ "VfJEoIjcuedwbnVocfwS": {"name": "John", "gender": "male", "accent": "american", "provider": "OpenAI"},
147
+ "W76fVeloaQcuN71bIQF6": {"name": "Dwight", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
148
+ "ZbuIjlIzHpIc8oO17kWW": {"name": "Lisa", "gender": "female", "accent": "american", "provider": "PlayHT"},
149
+ "aCWKe1NzicFCAkohj7TY": {"name": "Arial", "gender": "female", "accent": "american", "provider": "Cartesia"},
150
+ "aIJGQIEdPBlV4bWoLgiC": {"name": "Jordan", "gender": "male", "accent": "american", "provider": "OpenAI"},
151
+ "arGkfQC5Z0yNlNrYLlE8": {"name": "Elliot", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
152
+ "blo9kiIBaFNr0UCI2gpA": {"name": "Rhea", "gender": "female", "accent": "australian", "provider": "PlayHT"},
153
+ "bqvJyFf80waIYPYiv6zX": {"name": "Leo", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
154
+ "cQ0q3hcj9Bm4IccGDY9C": {"name": "Eve", "gender": "female", "accent": "american", "provider": "ElevenLabs"},
155
+ "dEcutGbESImg8uIOJOb3": {"name": "Julie", "gender": "female", "accent": "american", "provider": "OpenAI"},
156
+ "e3zFWWHHfNk6vOh5kbBX": {"name": "Serena", "gender": "female", "accent": "american", "provider": "ElevenLabs"},
157
+ "eSojoW8lMv5whHRCJugk": {"name": "Domi", "gender": "female", "accent": "american", "provider": "ElevenLabs"},
158
+ "eXjri1H442qcs35pWaTr": {"name": "Alex", "gender": "female", "accent": "american", "provider": "ElevenLabs"},
159
+ "fHmK4z2cR0VXxvQmd7ei": {"name": "Blondie", "gender": "female", "accent": "british", "provider": "ElevenLabs"},
160
+ "gO0Do5f1lCvLoIvbl6dx": {"name": "Nathan", "gender": "male", "accent": "british", "provider": "PlayHT"},
161
+ "grqhFog58KWjgcO6t4ya": {"name": "Daniel", "gender": "male", "accent": "american", "provider": "PlayHT"},
162
+ "iBsjG6Kk8tmO0ldX7Aho": {"name": "Tara", "gender": "female", "accent": "american", "provider": "Cartesia"},
163
+ "iWBJcyi2qdFpXYRGt42f": {"name": "Maya", "gender": "female", "accent": "american", "provider": "Cartesia"},
164
+ "j51tO8Upz9wEVIUkynCJ": {"name": "Ashley", "gender": "female", "accent": "american", "provider": "OpenAI"},
165
+ "lJQLBnDNpkkc4RIgqhIZ": {"name": "Matthew", "gender": "male", "accent": "australian", "provider": "Cartesia"},
166
+ "lQS5Hszd1P0W2m18M4ME": {"name": "Andrew", "gender": "male", "accent": "american", "provider": "Cartesia"},
167
+ "ltYBSrCwVJp0I99DmLfq": {"name": "Olivia", "gender": "female", "accent": "american", "provider": "Google"},
168
+ "m1t6JeyI9DXRhnCg8kuX": {"name": "Adam", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
169
+ "okc8JAt7Vb3u20k4soKB": {"name": "Mark", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
170
+ "r0ZdS6QBWDxmcRN7HxWq": {"name": "Micah", "gender": "male", "accent": "british", "provider": "ElevenLabs"},
171
+ "r4gww888sYU82aKZSUHy": {"name": "Elli", "gender": "female", "accent": "american", "provider": "ElevenLabs"},
172
+ "rJmVxgRa6YI9bALBqvtC": {"name": "Sylvia", "gender": "female", "accent": "american", "provider": "OpenAI"},
173
+ "rgqCbvqWKIaxYs54d7xS": {"name": "Noah", "gender": "male", "accent": "australian", "provider": "ElevenLabs"},
174
+ "s1YBw3dmanbLNCq7MXI8": {"name": "Kayla", "gender": "female", "accent": "american", "provider": "OpenAI"},
175
+ "sUXCiUMyEVHBC7sRlPZY": {"name": "Carla", "gender": "female", "accent": "american", "provider": "Cartesia"},
176
+ "tijk10imWq7nGRawDD62": {"name": "Owen", "gender": "male", "accent": "american", "provider": "Google"},
177
+ "wjOnivHr3V1ZGNuCMZJI": {"name": "Lila", "gender": "female", "accent": "american", "provider": "ElevenLabs"},
178
+ "xpkvvHUyS37s3f84MObW": {"name": "Sam", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
179
+ "y5nGwtfzvQ2OhrBXZnj5": {"name": "Antoni", "gender": "male", "accent": "american", "provider": "ElevenLabs"},
180
+ "zYqKDc8tFTIsAhJFpTaC": {"name": "Ava", "gender": "female", "accent": "american", "provider": "Google"}
181
+ }
182
+
183
  # Available models information
184
  AVAILABLE_MODELS = {
185
  "google-imagen-3": {"name": "Imagen 3", "provider": "Google"},
 
203
  style: Optional[Literal["vivid", "natural"]] = Field("vivid", description="Style of the generated images")
204
  user: Optional[str] = Field(None, description="A unique identifier representing your end-user")
205
 
206
# TTS request models
class TTSRequest(BaseModel):
    """OpenAI-compatible request body for POST /v1/audio/speech."""

    model: str = Field("tts-1", description="The TTS model to use")
    input: str = Field(..., description="The text to generate audio for")
    voice: str = Field("alloy", description="The voice to use for generation")
    # Fixed garbled description ("The format to audio in"). NOTE(review): the
    # handlers use this only for the download filename; it is not forwarded to
    # the backend.
    response_format: Optional[Literal["mp3", "opus", "aac", "flac"]] = Field("mp3", description="The format to return audio in")
    # NOTE(review): validated (0.25-4.0) but not currently forwarded to the
    # backend by the speech endpoints.
    speed: Optional[float] = Field(1.0, ge=0.25, le=4.0, description="The speed of the generated audio")
213
+
214
  # OpenAI-compatible response models
215
  class ImageData(BaseModel):
216
  url: Optional[str] = None
 
232
  class CaptionsStatusRequest(BaseModel):
233
  operationId: str
234
 
235
# TTS models for Captions API
class CaptionsTTSSubmitRequest(BaseModel):
    """Payload sent to the Captions TTS ``/generate/submit`` endpoint."""

    # Text to synthesize.
    text: str
    # Captions voice identifier; default is Jamal.
    voiceId: str = "4VCohb9n7kc8qQAMbC9T"
    # Captions TTS model identifier; default TTS model.
    modelId: str = "QHwZJt6xARgiV04YqEFY"
    # Client-generated project id attached to the submission.
    optimisticProjectId: str


class CaptionsTTSStatusRequest(BaseModel):
    """Payload for polling a Captions TTS operation by its id."""

    operationId: str
244
+
245
  # In-memory storage for operation tracking (use Redis in production)
246
  operations_store = {}
247
 
 
260
  }
261
  return size_map.get(size, 1)
262
 
263
def get_captions_voice_id(openai_voice: str) -> str:
    """Map an OpenAI voice name to the matching Captions voice ID.

    Lookup is case-insensitive; unrecognized names fall back to Brandon.
    """
    fallback = "0s0tckZNA4EDjsNWIGpn"  # Brandon
    return VOICE_MAPPINGS.get(openai_voice.lower(), fallback)
266
+
267
  async def submit_image_generation(prompt: str, model: str = "dall-e-3", size: str = "1024x1024") -> str:
268
  """Submit image generation request to Captions API"""
269
  headers = {
 
580
  logger.error(f"Error checking generation status: {e}")
581
  raise HTTPException(status_code=500, detail="Failed to check generation status")
582
 
583
# TTS Endpoints
@app.post("/v1/audio/speech")
async def create_speech(request: TTSRequest):
    """
    Generate speech from text using OpenAI-compatible API.

    Submits the text to the Captions TTS backend, polls the operation until it
    completes, then streams the resulting audio back to the caller.

    Raises:
        HTTPException: backend status code on submit failure, 500 on
            generation/fetch failure, 408 when polling times out.

    NOTE(review): ``request.response_format`` and ``request.speed`` are not
    forwarded to the backend; the format only affects the download filename.
    """
    try:
        # Convert OpenAI voice to Captions voice ID
        voice_id = get_captions_voice_id(request.voice)

        # Prepare the request for Captions API
        captions_request = CaptionsTTSSubmitRequest(
            text=request.input,
            voiceId=voice_id,
            modelId="QHwZJt6xARgiV04YqEFY",  # Default TTS model
            optimisticProjectId=f"tts-{uuid.uuid4().hex[:8]}"
        )

        # All Captions calls share the same auth/identification headers
        # (previously duplicated inline three times).
        captions_headers = {
            "Authorization": f"Bearer {BEARER_TOKEN}",
            "Content-Type": "application/json",
            "x-app-version": "1.0.0",
            "x-device-id": "api-client"
        }

        async with httpx.AsyncClient() as client:
            # Submit TTS generation request
            response = await client.post(
                f"{CAPTIONS_TTS_BASE_URL}/generate/submit",
                json=captions_request.dict(),
                headers=captions_headers,
                timeout=30.0
            )

            if response.status_code != 200:
                logger.error(f"TTS submit failed: {response.text}")
                raise HTTPException(status_code=response.status_code, detail="TTS generation failed")

            result = response.json()
            operation_id = result["data"]["operationId"]

            # Track the operation. "status" is now recorded here too, keeping
            # the entry shape consistent with the async endpoint so the
            # status/download endpoints work for sync-initiated operations.
            operations_store[operation_id] = {
                "type": "tts",
                "voice_id": voice_id,
                "text": request.input,
                "format": request.response_format,
                "created_at": datetime.now(),
                "status": "processing"
            }

            # Poll for completion. Each iteration sleeps ~1s, but the status
            # call itself may take up to 30s, so worst-case wall time can
            # exceed max_retries seconds.
            max_retries = 60
            retry_count = 0

            while retry_count < max_retries:
                status_response = await client.post(
                    f"{CAPTIONS_TTS_BASE_URL}/generate/status",
                    json={"operationId": operation_id},
                    headers=captions_headers,
                    timeout=30.0
                )

                if status_response.status_code != 200:
                    # Transient status failure: back off and retry.
                    await asyncio.sleep(1)
                    retry_count += 1
                    continue

                status_result = status_response.json()
                state = status_result["data"]["state"]

                if state == "COMPLETE":
                    operations_store[operation_id]["status"] = "completed"
                    audio_url = status_result["data"]["url"]
                    operations_store[operation_id]["url"] = audio_url

                    # Fetch the audio file and return it directly.
                    audio_response = await client.get(audio_url)
                    if audio_response.status_code != 200:
                        raise HTTPException(status_code=500, detail="Failed to fetch generated audio")
                    return StreamingResponse(
                        iter([audio_response.content]),
                        media_type="audio/mpeg",
                        headers={
                            "Content-Disposition": f"attachment; filename=speech.{request.response_format}"
                        }
                    )

                if state == "FAILED":
                    operations_store[operation_id]["status"] = "failed"
                    raise HTTPException(status_code=500, detail="TTS generation failed")

                # Still processing, wait and retry
                await asyncio.sleep(1)
                retry_count += 1

            # Timeout
            raise HTTPException(status_code=408, detail="TTS generation timed out")

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error in TTS generation: {e}")
        raise HTTPException(status_code=500, detail="Internal server error")
688
+
689
@app.post("/v1/audio/speech/async")
async def create_speech_async(request: TTSRequest, background_tasks: BackgroundTasks):
    """
    Start async TTS generation and return operation ID.

    Unlike ``/v1/audio/speech`` this does not wait for completion; callers
    poll ``/v1/audio/speech/status/{operation_id}`` themselves.
    """
    try:
        captions_voice = get_captions_voice_id(request.voice)

        # Build the Captions submit payload.
        submit_payload = CaptionsTTSSubmitRequest(
            text=request.input,
            voiceId=captions_voice,
            modelId="QHwZJt6xARgiV04YqEFY",  # Default TTS model
            optimisticProjectId=f"tts-{uuid.uuid4().hex[:8]}"
        )

        auth_headers = {
            "Authorization": f"Bearer {BEARER_TOKEN}",
            "Content-Type": "application/json",
            "x-app-version": "1.0.0",
            "x-device-id": "api-client"
        }

        async with httpx.AsyncClient() as client:
            submit_response = await client.post(
                f"{CAPTIONS_TTS_BASE_URL}/generate/submit",
                json=submit_payload.dict(),
                headers=auth_headers,
                timeout=30.0
            )

            if submit_response.status_code != 200:
                logger.error(f"TTS submit failed: {submit_response.text}")
                raise HTTPException(status_code=submit_response.status_code, detail="TTS generation failed")

            operation_id = submit_response.json()["data"]["operationId"]

            # Record the pending operation for the status/download endpoints.
            operations_store[operation_id] = {
                "type": "tts",
                "voice_id": captions_voice,
                "text": request.input,
                "format": request.response_format,
                "created_at": datetime.now(),
                "status": "processing"
            }

            return {"operation_id": operation_id, "status": "processing"}

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error in async TTS generation: {e}")
        raise HTTPException(status_code=500, detail="Internal server error")
744
+
745
@app.get("/v1/audio/speech/status/{operation_id}")
async def get_tts_status(operation_id: str):
    """
    Check the status of a TTS generation operation.

    Returns a dict whose ``status`` is ``completed`` (with the audio ``url``),
    ``failed``, ``processing``, or ``error`` when the backend poll itself fails.
    """
    # Guard clauses: the operation must exist and must be a TTS operation.
    if operation_id not in operations_store:
        raise HTTPException(status_code=404, detail="Operation not found")

    operation = operations_store[operation_id]
    if operation["type"] != "tts":
        raise HTTPException(status_code=400, detail="Invalid operation type")

    try:
        async with httpx.AsyncClient() as client:
            poll_response = await client.post(
                f"{CAPTIONS_TTS_BASE_URL}/generate/status",
                json={"operationId": operation_id},
                headers={
                    "Authorization": f"Bearer {BEARER_TOKEN}",
                    "Content-Type": "application/json",
                    "x-app-version": "1.0.0",
                    "x-device-id": "api-client"
                },
                timeout=30.0
            )

            if poll_response.status_code != 200:
                return {"status": "error", "error": "Failed to check status"}

            payload = poll_response.json()
            state = payload["data"]["state"]

            if state == "COMPLETE":
                audio_url = payload["data"]["url"]
                # Cache the result so the download endpoint can serve it.
                operation["status"] = "completed"
                operation["url"] = audio_url
                return {
                    "status": "completed",
                    "url": audio_url,
                    "operation_id": operation_id
                }

            if state == "FAILED":
                operation["status"] = "failed"
                return {"status": "failed", "operation_id": operation_id}

            operation["status"] = "processing"
            return {"status": "processing", "operation_id": operation_id}

    except Exception as e:
        logger.error(f"Error checking TTS status: {e}")
        raise HTTPException(status_code=500, detail="Failed to check TTS status")
796
+
797
@app.get("/v1/audio/speech/download/{operation_id}")
async def download_tts_audio(operation_id: str):
    """
    Download the generated audio file.

    The operation must have been reported ``completed`` by the status
    endpoint (which caches the backend's audio URL on the stored operation).

    Raises:
        HTTPException: 404 for unknown operations or a missing URL, 400 for
            non-TTS or not-yet-complete operations, 500 on fetch failure.
    """
    if operation_id not in operations_store:
        raise HTTPException(status_code=404, detail="Operation not found")

    operation = operations_store[operation_id]
    if operation["type"] != "tts":
        raise HTTPException(status_code=400, detail="Invalid operation type")

    if operation.get("status") != "completed":
        raise HTTPException(status_code=400, detail="Audio not ready yet")

    audio_url = operation.get("url")
    if not audio_url:
        raise HTTPException(status_code=404, detail="Audio URL not found")

    try:
        async with httpx.AsyncClient() as client:
            audio_response = await client.get(audio_url)
            if audio_response.status_code != 200:
                raise HTTPException(status_code=500, detail="Failed to fetch generated audio")

            # The requested format only controls the advertised filename; the
            # audio bytes are passed through from the backend unchanged.
            format_type = operation.get("format", "mp3")
            return StreamingResponse(
                iter([audio_response.content]),
                media_type="audio/mpeg",
                headers={
                    "Content-Disposition": f"attachment; filename=speech.{format_type}"
                }
            )

    except HTTPException:
        # Fix: re-raise intentional HTTP errors instead of collapsing them
        # into the generic 500 below (the sibling endpoints already do this).
        raise
    except Exception as e:
        logger.error(f"Error downloading TTS audio: {e}")
        raise HTTPException(status_code=500, detail="Failed to download audio")
834
+
835
@app.get("/v1/voices")
async def list_voices():
    """
    List available TTS voices.

    Each entry carries the Captions voice id, its display metadata, and the
    first OpenAI-style alias (if any) that maps to it.
    """
    # Invert VOICE_MAPPINGS once; setdefault keeps the FIRST alias per voice
    # id, matching the original first-match scan order.
    alias_by_id = {}
    for alias, mapped_id in VOICE_MAPPINGS.items():
        alias_by_id.setdefault(mapped_id, alias)

    voices = [
        {
            "id": voice_id,
            "name": info["name"],
            "openai_name": alias_by_id.get(voice_id),
            "gender": info["gender"],
            "accent": info["accent"],
            "provider": info["provider"]
        }
        for voice_id, info in AVAILABLE_VOICES.items()
    ]

    return {
        "voices": voices,
        "openai_compatible": ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
    }
862
+
863
  @app.get("/health")
864
  async def health_check():
865
  """Health check endpoint"""
 
869
async def root():
    """Root endpoint with API information"""
    # Fix: the original built this via list(set([...])), which produced a
    # nondeterministic ordering; a plain filtered list keeps the same members
    # in stable VOICE_MAPPINGS order.
    openai_voice_aliases = [
        k for k in VOICE_MAPPINGS
        if k in ("alloy", "echo", "fable", "onyx", "nova", "shimmer")
    ]
    return {
        "message": "OpenAI Compatible Image Generation & TTS API",
        "version": "1.0.0",
        "supported_models": list(AVAILABLE_MODELS.keys()),
        "openai_aliases": list(MODEL_MAPPINGS.keys()),
        "supported_voices": len(AVAILABLE_VOICES),
        "openai_voice_aliases": openai_voice_aliases,
        "endpoints": {
            "models": "/v1/models",
            "voices": "/v1/voices",
            "image_generation": "/v1/images/generations",
            "async_generation": "/v1/images/generations/async",
            "status_check": "/v1/images/generations/status/{operation_id}",
            "tts": "/v1/audio/speech",
            "tts_async": "/v1/audio/speech/async",
            "tts_status": "/v1/audio/speech/status/{operation_id}",
            "tts_download": "/v1/audio/speech/download/{operation_id}",
            "health": "/health",
            "docs": "/docs"
        },
        "example_curl": {
            "generate_image": "curl -X POST 'http://localhost:8000/v1/images/generations' -H 'Content-Type: application/json' -d '{\"prompt\": \"a cat\", \"model\": \"dall-e-3\", \"size\": \"1024x1024\"}'",
            "list_models": "curl -X GET 'http://localhost:8000/v1/models'",
            "generate_speech": "curl -X POST 'http://localhost:8000/v1/audio/speech' -H 'Content-Type: application/json' -d '{\"model\": \"tts-1\", \"input\": \"Hello world\", \"voice\": \"alloy\"}' --output speech.mp3",
            "list_voices": "curl -X GET 'http://localhost:8000/v1/voices'"
        }
    }
898