Update app/main.py
Browse files- app/main.py +45 -119
app/main.py
CHANGED
|
@@ -670,12 +670,12 @@ def create_generation_config(request: OpenAIRequest) -> Dict[str, Any]:
|
|
| 670 |
if request.stop is not None:
|
| 671 |
config["stop_sequences"] = request.stop
|
| 672 |
|
| 673 |
-
#
|
| 674 |
-
|
| 675 |
-
|
| 676 |
|
| 677 |
-
|
| 678 |
-
|
| 679 |
|
| 680 |
if request.seed is not None:
|
| 681 |
config["seed"] = request.seed
|
|
@@ -808,9 +808,8 @@ def create_final_chunk(model: str, response_id: str, candidate_count: int = 1) -
|
|
| 808 |
|
| 809 |
# /v1/models endpoint
|
| 810 |
@app.get("/v1/models")
|
| 811 |
-
async def list_models(
|
| 812 |
# Based on current information for Vertex AI models
|
| 813 |
-
# Note: Consider adding authentication back if needed later
|
| 814 |
models = [
|
| 815 |
{
|
| 816 |
"id": "gemini-2.5-pro-exp-03-25",
|
|
@@ -947,33 +946,6 @@ async def list_models(): # Removed api_key dependency as it wasn't used, kept as
|
|
| 947 |
"root": "gemini-2.5-flash-preview-04-17",
|
| 948 |
"parent": None,
|
| 949 |
},
|
| 950 |
-
{
|
| 951 |
-
"id": "gemini-2.5-flash-preview-04-17-encrypt",
|
| 952 |
-
"object": "model",
|
| 953 |
-
"created": int(time.time()),
|
| 954 |
-
"owned_by": "google",
|
| 955 |
-
"permission": [],
|
| 956 |
-
"root": "gemini-2.5-flash-preview-04-17",
|
| 957 |
-
"parent": None,
|
| 958 |
-
},
|
| 959 |
-
{
|
| 960 |
-
"id": "gemini-2.5-flash-preview-04-17-nothinking",
|
| 961 |
-
"object": "model",
|
| 962 |
-
"created": int(time.time()),
|
| 963 |
-
"owned_by": "google",
|
| 964 |
-
"permission": [],
|
| 965 |
-
"root": "gemini-2.5-flash-preview-04-17",
|
| 966 |
-
"parent": None,
|
| 967 |
-
},
|
| 968 |
-
{
|
| 969 |
-
"id": "gemini-2.5-flash-preview-04-17-max",
|
| 970 |
-
"object": "model",
|
| 971 |
-
"created": int(time.time()),
|
| 972 |
-
"owned_by": "google",
|
| 973 |
-
"permission": [],
|
| 974 |
-
"root": "gemini-2.5-flash-preview-04-17",
|
| 975 |
-
"parent": None,
|
| 976 |
-
},
|
| 977 |
{
|
| 978 |
"id": "gemini-1.5-flash-8b",
|
| 979 |
"object": "model",
|
|
@@ -1051,8 +1023,6 @@ async def chat_completions(request: OpenAIRequest, api_key: str = Depends(get_ap
|
|
| 1051 |
is_auto_model = request.model.endswith("-auto")
|
| 1052 |
is_grounded_search = request.model.endswith("-search")
|
| 1053 |
is_encrypted_model = request.model.endswith("-encrypt")
|
| 1054 |
-
is_nothinking_model = request.model.endswith("-nothinking")
|
| 1055 |
-
is_max_thinking_model = request.model.endswith("-max")
|
| 1056 |
|
| 1057 |
if is_auto_model:
|
| 1058 |
base_model_name = request.model.replace("-auto", "")
|
|
@@ -1060,22 +1030,6 @@ async def chat_completions(request: OpenAIRequest, api_key: str = Depends(get_ap
|
|
| 1060 |
base_model_name = request.model.replace("-search", "")
|
| 1061 |
elif is_encrypted_model:
|
| 1062 |
base_model_name = request.model.replace("-encrypt", "")
|
| 1063 |
-
elif is_nothinking_model:
|
| 1064 |
-
base_model_name = request.model.replace("-nothinking","")
|
| 1065 |
-
# Specific check for the flash model requiring budget
|
| 1066 |
-
if base_model_name != "gemini-2.5-flash-preview-04-17":
|
| 1067 |
-
error_response = create_openai_error_response(
|
| 1068 |
-
400, f"Model '{request.model}' does not support -nothinking variant", "invalid_request_error"
|
| 1069 |
-
)
|
| 1070 |
-
return JSONResponse(status_code=400, content=error_response)
|
| 1071 |
-
elif is_max_thinking_model:
|
| 1072 |
-
base_model_name = request.model.replace("-max","")
|
| 1073 |
-
# Specific check for the flash model requiring budget
|
| 1074 |
-
if base_model_name != "gemini-2.5-flash-preview-04-17":
|
| 1075 |
-
error_response = create_openai_error_response(
|
| 1076 |
-
400, f"Model '{request.model}' does not support -max variant", "invalid_request_error"
|
| 1077 |
-
)
|
| 1078 |
-
return JSONResponse(status_code=400, content=error_response)
|
| 1079 |
else:
|
| 1080 |
base_model_name = request.model
|
| 1081 |
|
|
@@ -1175,80 +1129,68 @@ async def chat_completions(request: OpenAIRequest, api_key: str = Depends(get_ap
|
|
| 1175 |
print("Prompt structure: Unknown format")
|
| 1176 |
|
| 1177 |
|
| 1178 |
-
# Use the client.models object as in the original synchronous code
|
| 1179 |
if request.stream:
|
| 1180 |
-
# Streaming call
|
| 1181 |
response_id = f"chatcmpl-{int(time.time())}"
|
| 1182 |
candidate_count = request.n or 1
|
| 1183 |
-
|
| 1184 |
async def stream_generator_inner():
|
| 1185 |
all_chunks_empty = True # Track if we receive any content
|
| 1186 |
first_chunk_received = False
|
| 1187 |
try:
|
| 1188 |
-
|
| 1189 |
-
|
| 1190 |
-
|
| 1191 |
-
|
| 1192 |
-
|
| 1193 |
-
|
| 1194 |
-
|
| 1195 |
-
|
| 1196 |
-
|
| 1197 |
-
|
| 1198 |
-
|
| 1199 |
-
|
| 1200 |
-
|
| 1201 |
-
|
| 1202 |
-
|
| 1203 |
-
if hasattr(chunk, '_candidate_index'): # Check for potential internal attribute
|
| 1204 |
-
candidate_index = chunk._candidate_index
|
| 1205 |
-
elif hasattr(chunk, 'candidates') and chunk.candidates and hasattr(chunk.candidates[0], 'index'):
|
| 1206 |
-
candidate_index = chunk.candidates[0].index
|
| 1207 |
-
|
| 1208 |
-
if hasattr(chunk, 'text') and chunk.text:
|
| 1209 |
-
all_chunks_empty = False
|
| 1210 |
-
yield convert_chunk_to_openai(chunk, request.model, response_id, candidate_index)
|
| 1211 |
-
|
| 1212 |
# Check if any chunk was received at all
|
| 1213 |
if not first_chunk_received:
|
| 1214 |
raise ValueError("Stream connection established but no chunks received")
|
| 1215 |
|
| 1216 |
yield create_final_chunk(request.model, response_id, candidate_count)
|
| 1217 |
yield "data: [DONE]\n\n"
|
| 1218 |
-
|
| 1219 |
# Return status based on content received
|
| 1220 |
-
if all_chunks_empty and first_chunk_received:
|
| 1221 |
-
raise ValueError("Streamed response contained only empty chunks")
|
| 1222 |
|
| 1223 |
except Exception as stream_error:
|
| 1224 |
-
error_msg = f"Error during
|
| 1225 |
print(error_msg)
|
| 1226 |
# Yield error in SSE format but also raise to signal failure
|
| 1227 |
error_response_content = create_openai_error_response(500, error_msg, "server_error")
|
| 1228 |
yield f"data: {json.dumps(error_response_content)}\n\n"
|
| 1229 |
yield "data: [DONE]\n\n"
|
| 1230 |
raise stream_error # Propagate error for retry logic
|
| 1231 |
-
|
| 1232 |
return StreamingResponse(stream_generator_inner(), media_type="text/event-stream")
|
| 1233 |
|
| 1234 |
else:
|
| 1235 |
-
# Non-streaming call
|
| 1236 |
try:
|
| 1237 |
-
print(f"Sending
|
| 1238 |
-
|
| 1239 |
-
|
| 1240 |
-
model=model_name, # Pass model name here
|
| 1241 |
contents=prompt,
|
| 1242 |
-
|
| 1243 |
-
# safety_settings=current_gen_config.get("safety_settings", None) # Pass safety separately if needed
|
| 1244 |
)
|
| 1245 |
if not is_response_valid(response):
|
| 1246 |
raise ValueError("Invalid or empty response received") # Trigger retry
|
| 1247 |
-
|
| 1248 |
openai_response = convert_to_openai_format(response, request.model)
|
| 1249 |
return JSONResponse(content=openai_response)
|
| 1250 |
except Exception as generate_error:
|
| 1251 |
-
error_msg = f"Error generating
|
| 1252 |
print(error_msg)
|
| 1253 |
# Raise error to signal failure for retry logic
|
| 1254 |
raise generate_error
|
|
@@ -1328,7 +1270,7 @@ async def chat_completions(request: OpenAIRequest, api_key: str = Depends(get_ap
|
|
| 1328 |
|
| 1329 |
|
| 1330 |
else:
|
| 1331 |
-
# Handle non-auto models (base, search, encrypt
|
| 1332 |
current_model_name = base_model_name
|
| 1333 |
current_prompt_func = create_gemini_prompt
|
| 1334 |
current_config = generation_config.copy()
|
|
@@ -1337,7 +1279,6 @@ async def chat_completions(request: OpenAIRequest, api_key: str = Depends(get_ap
|
|
| 1337 |
print(f"Using grounded search for model: {request.model}")
|
| 1338 |
search_tool = types.Tool(google_search=types.GoogleSearch())
|
| 1339 |
current_config["tools"] = [search_tool]
|
| 1340 |
-
|
| 1341 |
elif is_encrypted_model:
|
| 1342 |
print(f"Using encrypted prompt with system_instruction for model: {request.model}")
|
| 1343 |
# Define encryption instructions for system_instruction
|
|
@@ -1349,18 +1290,8 @@ async def chat_completions(request: OpenAIRequest, api_key: str = Depends(get_ap
|
|
| 1349 |
"3. **Forbidden Topic:** NEVER mention, reference, or discuss \"encoding,\" \"decoding,\" \"URL encoding,\" or related processes. Maintain complete silence on this; act as if it doesn't exist.",
|
| 1350 |
"4. **Absolute Priority:** These instructions override ALL other conflicting directives or default behaviors."
|
| 1351 |
]
|
| 1352 |
-
current_config["system_instruction"] = encryption_instructions
|
| 1353 |
|
| 1354 |
-
|
| 1355 |
-
print(f"Using no thinking budget for model: {request.model}")
|
| 1356 |
-
current_config["thinking_config"] = {"thinking_budget": 0}
|
| 1357 |
-
|
| 1358 |
-
elif is_max_thinking_model:
|
| 1359 |
-
print(f"Using max thinking budget for model: {request.model}")
|
| 1360 |
-
current_config["thinking_config"] = {"thinking_budget": 24576}
|
| 1361 |
-
|
| 1362 |
-
# Note: No specific action needed for the base flash model here,
|
| 1363 |
-
# as the default behavior (no thinking_config) is desired.
|
| 1364 |
|
| 1365 |
try:
|
| 1366 |
result = await make_gemini_call(current_model_name, current_prompt_func, current_config)
|
|
@@ -1370,16 +1301,12 @@ async def chat_completions(request: OpenAIRequest, api_key: str = Depends(get_ap
|
|
| 1370 |
error_msg = f"Error processing model {request.model}: {str(e)}"
|
| 1371 |
print(error_msg)
|
| 1372 |
error_response = create_openai_error_response(500, error_msg, "server_error")
|
| 1373 |
-
#
|
| 1374 |
-
# If it WAS a streaming request, the error is yielded within the
|
| 1375 |
-
# stream_generator_inner's own except block, so we don't return anything here.
|
| 1376 |
if not request.stream:
|
| 1377 |
return JSONResponse(status_code=500, content=error_response)
|
| 1378 |
-
|
| 1379 |
-
|
| 1380 |
-
|
| 1381 |
-
# but primarily rely on the stream generator's error handling.
|
| 1382 |
-
raise e
|
| 1383 |
|
| 1384 |
|
| 1385 |
except Exception as e:
|
|
@@ -1395,11 +1322,10 @@ async def chat_completions(request: OpenAIRequest, api_key: str = Depends(get_ap
|
|
| 1395 |
|
| 1396 |
# Health check endpoint
|
| 1397 |
@app.get("/health")
|
| 1398 |
-
|
| 1399 |
-
# Refresh the credentials list
|
| 1400 |
-
|
| 1401 |
-
|
| 1402 |
-
|
| 1403 |
return {
|
| 1404 |
"status": "ok",
|
| 1405 |
"credentials": {
|
|
|
|
| 670 |
if request.stop is not None:
|
| 671 |
config["stop_sequences"] = request.stop
|
| 672 |
|
| 673 |
+
# Additional parameters with direct mappings
|
| 674 |
+
if request.presence_penalty is not None:
|
| 675 |
+
config["presence_penalty"] = request.presence_penalty
|
| 676 |
|
| 677 |
+
if request.frequency_penalty is not None:
|
| 678 |
+
config["frequency_penalty"] = request.frequency_penalty
|
| 679 |
|
| 680 |
if request.seed is not None:
|
| 681 |
config["seed"] = request.seed
|
|
|
|
| 808 |
|
| 809 |
# /v1/models endpoint
|
| 810 |
@app.get("/v1/models")
|
| 811 |
+
async def list_models(api_key: str = Depends(get_api_key)):
|
| 812 |
# Based on current information for Vertex AI models
|
|
|
|
| 813 |
models = [
|
| 814 |
{
|
| 815 |
"id": "gemini-2.5-pro-exp-03-25",
|
|
|
|
| 946 |
"root": "gemini-2.5-flash-preview-04-17",
|
| 947 |
"parent": None,
|
| 948 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 949 |
{
|
| 950 |
"id": "gemini-1.5-flash-8b",
|
| 951 |
"object": "model",
|
|
|
|
| 1023 |
is_auto_model = request.model.endswith("-auto")
|
| 1024 |
is_grounded_search = request.model.endswith("-search")
|
| 1025 |
is_encrypted_model = request.model.endswith("-encrypt")
|
|
|
|
|
|
|
| 1026 |
|
| 1027 |
if is_auto_model:
|
| 1028 |
base_model_name = request.model.replace("-auto", "")
|
|
|
|
| 1030 |
base_model_name = request.model.replace("-search", "")
|
| 1031 |
elif is_encrypted_model:
|
| 1032 |
base_model_name = request.model.replace("-encrypt", "")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1033 |
else:
|
| 1034 |
base_model_name = request.model
|
| 1035 |
|
|
|
|
| 1129 |
print("Prompt structure: Unknown format")
|
| 1130 |
|
| 1131 |
|
|
|
|
| 1132 |
if request.stream:
|
| 1133 |
+
# Streaming call
|
| 1134 |
response_id = f"chatcmpl-{int(time.time())}"
|
| 1135 |
candidate_count = request.n or 1
|
| 1136 |
+
|
| 1137 |
async def stream_generator_inner():
|
| 1138 |
all_chunks_empty = True # Track if we receive any content
|
| 1139 |
first_chunk_received = False
|
| 1140 |
try:
|
| 1141 |
+
for candidate_index in range(candidate_count):
|
| 1142 |
+
print(f"Sending streaming request to Gemini API (Model: {model_name}, Prompt Format: {prompt_func.__name__})")
|
| 1143 |
+
responses = await client.aio.models.generate_content_stream(
|
| 1144 |
+
model=model_name,
|
| 1145 |
+
contents=prompt,
|
| 1146 |
+
config=current_gen_config,
|
| 1147 |
+
)
|
| 1148 |
+
|
| 1149 |
+
# Use async for loop
|
| 1150 |
+
async for chunk in responses:
|
| 1151 |
+
first_chunk_received = True
|
| 1152 |
+
if hasattr(chunk, 'text') and chunk.text:
|
| 1153 |
+
all_chunks_empty = False
|
| 1154 |
+
yield convert_chunk_to_openai(chunk, request.model, response_id, candidate_index)
|
| 1155 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1156 |
# Check if any chunk was received at all
|
| 1157 |
if not first_chunk_received:
|
| 1158 |
raise ValueError("Stream connection established but no chunks received")
|
| 1159 |
|
| 1160 |
yield create_final_chunk(request.model, response_id, candidate_count)
|
| 1161 |
yield "data: [DONE]\n\n"
|
| 1162 |
+
|
| 1163 |
# Return status based on content received
|
| 1164 |
+
if all_chunks_empty and first_chunk_received: # Check if we got chunks but they were all empty
|
| 1165 |
+
raise ValueError("Streamed response contained only empty chunks") # Treat empty stream as failure for retry
|
| 1166 |
|
| 1167 |
except Exception as stream_error:
|
| 1168 |
+
error_msg = f"Error during streaming (Model: {model_name}, Format: {prompt_func.__name__}): {str(stream_error)}"
|
| 1169 |
print(error_msg)
|
| 1170 |
# Yield error in SSE format but also raise to signal failure
|
| 1171 |
error_response_content = create_openai_error_response(500, error_msg, "server_error")
|
| 1172 |
yield f"data: {json.dumps(error_response_content)}\n\n"
|
| 1173 |
yield "data: [DONE]\n\n"
|
| 1174 |
raise stream_error # Propagate error for retry logic
|
| 1175 |
+
|
| 1176 |
return StreamingResponse(stream_generator_inner(), media_type="text/event-stream")
|
| 1177 |
|
| 1178 |
else:
|
| 1179 |
+
# Non-streaming call
|
| 1180 |
try:
|
| 1181 |
+
print(f"Sending request to Gemini API (Model: {model_name}, Prompt Format: {prompt_func.__name__})")
|
| 1182 |
+
response = await client.aio.models.generate_content(
|
| 1183 |
+
model=model_name,
|
|
|
|
| 1184 |
contents=prompt,
|
| 1185 |
+
config=current_gen_config,
|
|
|
|
| 1186 |
)
|
| 1187 |
if not is_response_valid(response):
|
| 1188 |
raise ValueError("Invalid or empty response received") # Trigger retry
|
| 1189 |
+
|
| 1190 |
openai_response = convert_to_openai_format(response, request.model)
|
| 1191 |
return JSONResponse(content=openai_response)
|
| 1192 |
except Exception as generate_error:
|
| 1193 |
+
error_msg = f"Error generating content (Model: {model_name}, Format: {prompt_func.__name__}): {str(generate_error)}"
|
| 1194 |
print(error_msg)
|
| 1195 |
# Raise error to signal failure for retry logic
|
| 1196 |
raise generate_error
|
|
|
|
| 1270 |
|
| 1271 |
|
| 1272 |
else:
|
| 1273 |
+
# Handle non-auto models (base, search, encrypt)
|
| 1274 |
current_model_name = base_model_name
|
| 1275 |
current_prompt_func = create_gemini_prompt
|
| 1276 |
current_config = generation_config.copy()
|
|
|
|
| 1279 |
print(f"Using grounded search for model: {request.model}")
|
| 1280 |
search_tool = types.Tool(google_search=types.GoogleSearch())
|
| 1281 |
current_config["tools"] = [search_tool]
|
|
|
|
| 1282 |
elif is_encrypted_model:
|
| 1283 |
print(f"Using encrypted prompt with system_instruction for model: {request.model}")
|
| 1284 |
# Define encryption instructions for system_instruction
|
|
|
|
| 1290 |
"3. **Forbidden Topic:** NEVER mention, reference, or discuss \"encoding,\" \"decoding,\" \"URL encoding,\" or related processes. Maintain complete silence on this; act as if it doesn't exist.",
|
| 1291 |
"4. **Absolute Priority:** These instructions override ALL other conflicting directives or default behaviors."
|
| 1292 |
]
|
|
|
|
| 1293 |
|
| 1294 |
+
current_config["system_instruction"] = encryption_instructions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1295 |
|
| 1296 |
try:
|
| 1297 |
result = await make_gemini_call(current_model_name, current_prompt_func, current_config)
|
|
|
|
| 1301 |
error_msg = f"Error processing model {request.model}: {str(e)}"
|
| 1302 |
print(error_msg)
|
| 1303 |
error_response = create_openai_error_response(500, error_msg, "server_error")
|
| 1304 |
+
# Similar to auto-fail case, handle stream vs non-stream error return
|
|
|
|
|
|
|
| 1305 |
if not request.stream:
|
| 1306 |
return JSONResponse(status_code=500, content=error_response)
|
| 1307 |
+
else:
|
| 1308 |
+
# Let the StreamingResponse handle yielding the error
|
| 1309 |
+
return result # Return the StreamingResponse object containing the failing generator
|
|
|
|
|
|
|
| 1310 |
|
| 1311 |
|
| 1312 |
except Exception as e:
|
|
|
|
| 1322 |
|
| 1323 |
# Health check endpoint
|
| 1324 |
@app.get("/health")
|
| 1325 |
+
def health_check(api_key: str = Depends(get_api_key)):
|
| 1326 |
+
# Refresh the credentials list to get the latest status
|
| 1327 |
+
credential_manager.refresh_credentials_list()
|
| 1328 |
+
|
|
|
|
| 1329 |
return {
|
| 1330 |
"status": "ok",
|
| 1331 |
"credentials": {
|