Commit 5c95d1b
Parent(s): 61a24c9

added global region to express mode

Files changed:
- app/message_processing.py +7 -4
- app/requirements.txt +2 -1
- app/routes/chat_api.py +26 -4
app/message_processing.py (CHANGED)

@@ -241,7 +241,7 @@ def parse_gemini_response_for_reasoning_and_content(gemini_response_candidate: A
                 reasoning_text_parts.append(part_text)
             else:
                 normal_text_parts.append(part_text)
-
+    elif candidate_part_text: # Candidate had text but no parts and was not a thought itself
         normal_text_parts.append(candidate_part_text)
     # If no parts and no direct text on candidate, both lists remain empty.

@@ -291,10 +291,14 @@ def convert_to_openai_format(gemini_response: Any, model: str) -> Dict[str, Any]
 def convert_chunk_to_openai(chunk: Any, model: str, response_id: str, candidate_index: int = 0) -> str:
     is_encrypt_full = model.endswith("-encrypt-full")
     delta_payload = {}
-    finish_reason = None
+    finish_reason = None
 
     if hasattr(chunk, 'candidates') and chunk.candidates:
-        candidate = chunk.candidates[0]
+        candidate = chunk.candidates[0]
+
+        # Check for finish reason
+        if hasattr(candidate, 'finishReason') and candidate.finishReason:
+            finish_reason = "stop" # Convert Gemini finish reasons to OpenAI format
 
         # For a streaming chunk, candidate might be simpler, or might have candidate.content with parts.
         # parse_gemini_response_for_reasoning_and_content is designed to handle both candidate and candidate.content

@@ -308,7 +312,6 @@ def convert_chunk_to_openai(chunk: Any, model: str, response_id: str, candidate_
     if normal_text or (not reasoning_text and not delta_payload): # Ensure content key if nothing else
         delta_payload['content'] = normal_text if normal_text else ""
 
-
     chunk_data = {
         "id": response_id, "object": "chat.completion.chunk", "created": int(time.time()), "model": model,
         "choices": [{"index": candidate_index, "delta": delta_payload, "finish_reason": finish_reason}]
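Note that the new `finishReason` check collapses every Gemini finish reason to `"stop"`. The two vocabularies do not map one-to-one, so a fuller translation could look like the sketch below; the `map_finish_reason` helper and its mapping table are illustrative only and are not part of this commit.

```python
# Hypothetical helper (not in this commit): map Gemini finish reasons to
# their closest OpenAI equivalents instead of always emitting "stop".
GEMINI_TO_OPENAI_FINISH_REASON = {
    "STOP": "stop",                  # natural end of generation
    "MAX_TOKENS": "length",          # output token limit reached
    "SAFETY": "content_filter",      # blocked by safety settings
    "RECITATION": "content_filter",  # blocked for recitation
    "OTHER": "stop",                 # unspecified; fall back to "stop"
}

def map_finish_reason(gemini_reason: str) -> str:
    """Return the OpenAI-style finish_reason for a Gemini finishReason value."""
    return GEMINI_TO_OPENAI_FINISH_REASON.get(gemini_reason, "stop")
```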
app/requirements.txt (CHANGED)

@@ -6,4 +6,5 @@ pydantic==2.6.1
 google-genai==1.17.0
 httpx>=0.25.0
 openai
-google-auth-oauthlib
+google-auth-oauthlib
+aiohttp
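The new aiohttp dependency presumably backs the DirectVertexClient introduced in app/routes/chat_api.py below, which is why that route code now closes the client explicitly. For reference, an aiohttp session follows the same create/close lifecycle; this `fetch_json` helper is only an illustration, not code from this repository.

```python
import aiohttp

async def fetch_json(url: str) -> dict:
    # Create the session once and close it explicitly -- the same discipline
    # behind the client_to_use.close() calls added in chat_api.py.
    session = aiohttp.ClientSession()
    try:
        async with session.get(url) as resp:
            resp.raise_for_status()
            return await resp.json()
    finally:
        await session.close()
```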
app/routes/chat_api.py (CHANGED)

@@ -24,6 +24,7 @@ from api_helpers import (
     execute_gemini_call,
 )
 from openai_handler import OpenAIDirectHandler
+from direct_vertex_client import DirectVertexClient
 
 router = APIRouter()
 
@@ -115,8 +116,14 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
             if key_tuple:
                 original_idx, key_val = key_tuple
                 try:
-                    client_to_use = genai.Client(vertexai=True, api_key=key_val)
-                    print(f"INFO: Attempt {attempt+1}/{total_keys} - Using Vertex Express Mode SDK for model {request.model} (base: {base_model_name}) with API key (original index: {original_idx}).")
+                    # Check if model contains "gemini-2.5-pro" for direct URL approach
+                    if "gemini-2.5-pro" in base_model_name:
+                        client_to_use = DirectVertexClient(api_key=key_val)
+                        await client_to_use.discover_project_id()
+                        print(f"INFO: Attempt {attempt+1}/{total_keys} - Using DirectVertexClient for model {request.model} (base: {base_model_name}) with API key (original index: {original_idx}).")
+                    else:
+                        client_to_use = genai.Client(vertexai=True, api_key=key_val)
+                        print(f"INFO: Attempt {attempt+1}/{total_keys} - Using Vertex Express Mode SDK for model {request.model} (base: {base_model_name}) with API key (original index: {original_idx}).")
                     break # Successfully initialized client
                 except Exception as e:
                     print(f"WARNING: Attempt {attempt+1}/{total_keys} - Vertex Express Mode client init failed for API key (original index: {original_idx}) for model {request.model}: {e}. Trying next key.")

@@ -177,7 +184,11 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
                 current_gen_config = attempt["config_modifier"](generation_config.copy())
                 try:
                     # Pass is_auto_attempt=True for auto-mode calls
-                    return await execute_gemini_call(client_to_use, attempt["model"], attempt["prompt_func"], current_gen_config, request, is_auto_attempt=True)
+                    result = await execute_gemini_call(client_to_use, attempt["model"], attempt["prompt_func"], current_gen_config, request, is_auto_attempt=True)
+                    # Clean up DirectVertexClient session if used
+                    if isinstance(client_to_use, DirectVertexClient):
+                        await client_to_use.close()
+                    return result
                 except Exception as e_auto:
                     last_err = e_auto
                     print(f"Auto-attempt '{attempt['name']}' for model {attempt['model']} failed: {e_auto}")

@@ -185,6 +196,9 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
 
         print(f"All auto attempts failed. Last error: {last_err}")
         err_msg = f"All auto-mode attempts failed for model {request.model}. Last error: {str(last_err)}"
+        # Clean up DirectVertexClient session if used
+        if isinstance(client_to_use, DirectVertexClient):
+            await client_to_use.close()
         if not request.stream and last_err:
             return JSONResponse(status_code=500, content=create_openai_error_response(500, err_msg, "server_error"))
         elif request.stream:

@@ -231,9 +245,17 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
             # but the API call might need the full "gemini-1.5-pro-search".
             # Let's use `request.model` for the API call here, and `base_model_name` for checks like Express eligibility.
             # For non-auto mode, is_auto_attempt defaults to False in execute_gemini_call
-            return await execute_gemini_call(client_to_use, base_model_name, current_prompt_func, generation_config, request)
+            try:
+                return await execute_gemini_call(client_to_use, base_model_name, current_prompt_func, generation_config, request)
+            finally:
+                # Clean up DirectVertexClient session if used
+                if isinstance(client_to_use, DirectVertexClient):
+                    await client_to_use.close()
 
     except Exception as e:
         error_msg = f"Unexpected error in chat_completions endpoint: {str(e)}"
         print(error_msg)
+        # Clean up DirectVertexClient session if it exists
+        if 'client_to_use' in locals() and isinstance(client_to_use, DirectVertexClient):
+            await client_to_use.close()
         return JSONResponse(status_code=500, content=create_openai_error_response(500, error_msg, "server_error"))
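direct_vertex_client.py itself is not touched by this commit, so only the surface the route code relies on is certain: a constructor taking `api_key`, an awaitable `discover_project_id()`, and an awaitable `close()`. Combined with the new aiohttp dependency and the commit title's mention of a global region, a plausible sketch of the class follows; everything beyond those three members is an assumption.

```python
# Hypothetical sketch of direct_vertex_client.py -- the real module is not
# shown in this diff. Only __init__(api_key=...), discover_project_id(),
# and close() are implied by chat_api.py; the endpoint and the rest are
# illustrative assumptions.
import aiohttp

# Assumed target: the global (non-regional) Vertex AI REST endpoint.
VERTEX_GLOBAL_BASE = "https://aiplatform.googleapis.com/v1"

class DirectVertexClient:
    """Async client that calls the Vertex AI REST API directly, bypassing the SDK."""

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.project_id: str | None = None
        self._session = aiohttp.ClientSession()

    async def discover_project_id(self) -> None:
        """Resolve the project bound to the express-mode API key.

        chat_api.py awaits this before the client is first used; the actual
        discovery mechanism is not visible in this commit.
        """
        ...

    async def close(self) -> None:
        """Release the HTTP session (the explicit cleanup chat_api.py adds)."""
        await self._session.close()
```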