Spaces:
Paused
Paused
Commit
·
cdf27f4
1
Parent(s):
71235a6
added reasoning support
Browse files- app/message_processing.py +102 -39
- app/routes/chat_api.py +66 -6
app/message_processing.py
CHANGED
|
@@ -342,38 +342,81 @@ def convert_to_openai_format(gemini_response, model: str) -> Dict[str, Any]:
|
|
| 342 |
|
| 343 |
if hasattr(gemini_response, 'candidates') and gemini_response.candidates:
|
| 344 |
for i, candidate in enumerate(gemini_response.candidates):
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
elif hasattr(candidate, 'content') and hasattr(candidate.content, 'parts'):
|
| 349 |
-
# Ensure content remains a string even if parts have None text
|
| 350 |
-
parts_texts = []
|
| 351 |
-
for part_item in candidate.content.parts:
|
| 352 |
-
if hasattr(part_item, 'text') and part_item.text is not None:
|
| 353 |
-
parts_texts.append(part_item.text)
|
| 354 |
-
content = "".join(parts_texts)
|
| 355 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 356 |
if is_encrypt_full:
|
| 357 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
|
| 359 |
choices.append({
|
| 360 |
"index": i,
|
| 361 |
-
"message":
|
| 362 |
-
"finish_reason": "stop"
|
| 363 |
})
|
|
|
|
|
|
|
| 364 |
elif hasattr(gemini_response, 'text'):
|
| 365 |
-
|
| 366 |
if is_encrypt_full:
|
| 367 |
-
|
| 368 |
choices.append({
|
| 369 |
"index": 0,
|
| 370 |
-
"message": {"role": "assistant", "content":
|
| 371 |
"finish_reason": "stop"
|
| 372 |
})
|
| 373 |
-
else:
|
| 374 |
choices.append({
|
| 375 |
"index": 0,
|
| 376 |
-
"message": {"role": "assistant", "content": ""},
|
| 377 |
"finish_reason": "stop"
|
| 378 |
})
|
| 379 |
|
|
@@ -395,32 +438,49 @@ def convert_to_openai_format(gemini_response, model: str) -> Dict[str, Any]:
|
|
| 395 |
def convert_chunk_to_openai(chunk, model: str, response_id: str, candidate_index: int = 0) -> str:
|
| 396 |
"""Converts Gemini stream chunk to OpenAI format, applying deobfuscation if needed."""
|
| 397 |
is_encrypt_full = model.endswith("-encrypt-full")
|
| 398 |
-
|
|
|
|
|
|
|
| 399 |
|
|
|
|
|
|
|
|
|
|
| 400 |
try:
|
| 401 |
-
if hasattr(
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
# Ensure part_item.text exists, is not None, and convert to string
|
| 405 |
if hasattr(part_item, 'text') and part_item.text is not None:
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
except Exception as e_chunk_extract:
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
chunk_content_str = "" # Default to empty string in case of any error
|
| 416 |
|
| 417 |
-
|
| 418 |
-
|
| 419 |
|
| 420 |
if is_encrypt_full:
|
| 421 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
|
| 423 |
-
finish_reason = None
|
| 424 |
# Actual finish reason handling would be more complex if Gemini provides it mid-stream
|
| 425 |
|
| 426 |
chunk_data = {
|
|
@@ -431,13 +491,16 @@ def convert_chunk_to_openai(chunk, model: str, response_id: str, candidate_index
|
|
| 431 |
"choices": [
|
| 432 |
{
|
| 433 |
"index": candidate_index,
|
| 434 |
-
"delta":
|
| 435 |
"finish_reason": finish_reason
|
| 436 |
}
|
| 437 |
]
|
| 438 |
}
|
| 439 |
-
|
| 440 |
-
|
|
|
|
|
|
|
|
|
|
| 441 |
return f"data: {json.dumps(chunk_data)}\n\n"
|
| 442 |
|
| 443 |
def create_final_chunk(model: str, response_id: str, candidate_count: int = 1) -> str:
|
|
|
|
| 342 |
|
| 343 |
if hasattr(gemini_response, 'candidates') and gemini_response.candidates:
|
| 344 |
for i, candidate in enumerate(gemini_response.candidates):
|
| 345 |
+
print(candidate) # Existing print statement
|
| 346 |
+
reasoning_text_parts = []
|
| 347 |
+
normal_text_parts = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
|
| 349 |
+
gemini_candidate_content = None
|
| 350 |
+
if hasattr(candidate, 'content'):
|
| 351 |
+
gemini_candidate_content = candidate.content
|
| 352 |
+
|
| 353 |
+
if gemini_candidate_content:
|
| 354 |
+
try:
|
| 355 |
+
if hasattr(gemini_candidate_content, 'parts') and gemini_candidate_content.parts:
|
| 356 |
+
for part_item in gemini_candidate_content.parts:
|
| 357 |
+
part_text = ""
|
| 358 |
+
if hasattr(part_item, 'text') and part_item.text is not None:
|
| 359 |
+
part_text = str(part_item.text)
|
| 360 |
+
|
| 361 |
+
# Check for 'thought' attribute on part_item and append directly
|
| 362 |
+
if hasattr(part_item, 'thought') and part_item.thought is True:
|
| 363 |
+
reasoning_text_parts.append(part_text)
|
| 364 |
+
else:
|
| 365 |
+
normal_text_parts.append(part_text)
|
| 366 |
+
elif hasattr(gemini_candidate_content, 'text') and gemini_candidate_content.text is not None:
|
| 367 |
+
# If no 'parts', but 'text' exists on content, it's normal content
|
| 368 |
+
normal_text_parts.append(str(gemini_candidate_content.text))
|
| 369 |
+
except Exception as e_extract:
|
| 370 |
+
print(f"WARNING: Error extracting from candidate.content: {e_extract}. Content: {str(gemini_candidate_content)[:200]}")
|
| 371 |
+
# Fallback: if candidate.content is not informative, but candidate.text exists directly
|
| 372 |
+
elif hasattr(candidate, 'text') and candidate.text is not None:
|
| 373 |
+
normal_text_parts.append(str(candidate.text))
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
final_reasoning_content_str = "".join(reasoning_text_parts)
|
| 377 |
+
final_normal_content_str = "".join(normal_text_parts)
|
| 378 |
+
|
| 379 |
if is_encrypt_full:
|
| 380 |
+
final_reasoning_content_str = deobfuscate_text(final_reasoning_content_str)
|
| 381 |
+
final_normal_content_str = deobfuscate_text(final_normal_content_str)
|
| 382 |
+
|
| 383 |
+
message_payload = {"role": "assistant"}
|
| 384 |
+
if final_reasoning_content_str:
|
| 385 |
+
message_payload['reasoning_content'] = final_reasoning_content_str
|
| 386 |
+
|
| 387 |
+
# Ensure 'content' key is present, even if empty or None, as per OpenAI spec for assistant messages
|
| 388 |
+
# if not final_normal_content_str and not final_reasoning_content_str:
|
| 389 |
+
# message_payload['content'] = ""
|
| 390 |
+
# elif final_reasoning_content_str and not final_normal_content_str:
|
| 391 |
+
# message_payload['content'] = None
|
| 392 |
+
# else: # final_normal_content_str has content
|
| 393 |
+
# message_payload['content'] = final_normal_content_str
|
| 394 |
+
|
| 395 |
+
# Simplified logic for content: always include it. If it was empty, it'll be empty string.
|
| 396 |
+
# If only reasoning was present, content will be empty string.
|
| 397 |
+
message_payload['content'] = final_normal_content_str
|
| 398 |
+
|
| 399 |
|
| 400 |
choices.append({
|
| 401 |
"index": i,
|
| 402 |
+
"message": message_payload,
|
| 403 |
+
"finish_reason": "stop" # Assuming "stop" as Gemini doesn't always map directly
|
| 404 |
})
|
| 405 |
+
|
| 406 |
+
# This elif handles cases where gemini_response itself might be a simple text response
|
| 407 |
elif hasattr(gemini_response, 'text'):
|
| 408 |
+
content_str = gemini_response.text or ""
|
| 409 |
if is_encrypt_full:
|
| 410 |
+
content_str = deobfuscate_text(content_str)
|
| 411 |
choices.append({
|
| 412 |
"index": 0,
|
| 413 |
+
"message": {"role": "assistant", "content": content_str},
|
| 414 |
"finish_reason": "stop"
|
| 415 |
})
|
| 416 |
+
else: # Fallback for empty or unexpected response structure
|
| 417 |
choices.append({
|
| 418 |
"index": 0,
|
| 419 |
+
"message": {"role": "assistant", "content": ""}, # Ensure content key
|
| 420 |
"finish_reason": "stop"
|
| 421 |
})
|
| 422 |
|
|
|
|
| 438 |
def convert_chunk_to_openai(chunk, model: str, response_id: str, candidate_index: int = 0) -> str:
|
| 439 |
"""Converts Gemini stream chunk to OpenAI format, applying deobfuscation if needed."""
|
| 440 |
is_encrypt_full = model.endswith("-encrypt-full")
|
| 441 |
+
|
| 442 |
+
# This is original_chunk.candidates[0].content after your reassignment
|
| 443 |
+
gemini_content_part = chunk.candidates[0].content
|
| 444 |
|
| 445 |
+
reasoning_text_parts = []
|
| 446 |
+
normal_text_parts = []
|
| 447 |
+
|
| 448 |
try:
|
| 449 |
+
if hasattr(gemini_content_part, 'parts') and gemini_content_part.parts:
|
| 450 |
+
for part_item in gemini_content_part.parts:
|
| 451 |
+
part_text = ""
|
|
|
|
| 452 |
if hasattr(part_item, 'text') and part_item.text is not None:
|
| 453 |
+
part_text = str(part_item.text)
|
| 454 |
+
|
| 455 |
+
# Check for the 'thought' attribute on the part_item itself and append directly
|
| 456 |
+
if hasattr(part_item, 'thought') and part_item.thought is True: # Corrected to 'thought'
|
| 457 |
+
reasoning_text_parts.append(part_text)
|
| 458 |
+
else:
|
| 459 |
+
normal_text_parts.append(part_text)
|
| 460 |
+
elif hasattr(gemini_content_part, 'text') and gemini_content_part.text is not None:
|
| 461 |
+
# If no 'parts', but 'text' exists, it's normal content
|
| 462 |
+
normal_text_parts.append(str(gemini_content_part.text))
|
| 463 |
+
# If gemini_content_part has neither .parts nor .text, or if .text is None, both lists remain empty
|
| 464 |
except Exception as e_chunk_extract:
|
| 465 |
+
print(f"WARNING: Error extracting content from Gemini content part in convert_chunk_to_openai: {e_chunk_extract}. Content part type: {type(gemini_content_part)}. Data: {str(gemini_content_part)[:200]}")
|
| 466 |
+
# Fallback to empty if extraction fails, lists will remain empty
|
|
|
|
| 467 |
|
| 468 |
+
final_reasoning_content_str = "".join(reasoning_text_parts)
|
| 469 |
+
final_normal_content_str = "".join(normal_text_parts)
|
| 470 |
|
| 471 |
if is_encrypt_full:
|
| 472 |
+
final_reasoning_content_str = deobfuscate_text(final_reasoning_content_str)
|
| 473 |
+
final_normal_content_str = deobfuscate_text(final_normal_content_str)
|
| 474 |
+
|
| 475 |
+
# Construct delta payload
|
| 476 |
+
delta_payload = {}
|
| 477 |
+
if final_reasoning_content_str: # Only add if there's content
|
| 478 |
+
delta_payload['reasoning_content'] = final_reasoning_content_str
|
| 479 |
+
if final_normal_content_str: # Only add if there's content
|
| 480 |
+
delta_payload['content'] = final_normal_content_str
|
| 481 |
+
# If both are empty, delta_payload will be an empty dict {}, which is valid for OpenAI stream (empty update)
|
| 482 |
|
| 483 |
+
finish_reason = None
|
| 484 |
# Actual finish reason handling would be more complex if Gemini provides it mid-stream
|
| 485 |
|
| 486 |
chunk_data = {
|
|
|
|
| 491 |
"choices": [
|
| 492 |
{
|
| 493 |
"index": candidate_index,
|
| 494 |
+
"delta": delta_payload, # Use the new delta_payload
|
| 495 |
"finish_reason": finish_reason
|
| 496 |
}
|
| 497 |
]
|
| 498 |
}
|
| 499 |
+
# Note: The original 'chunk' variable in the broader scope was the full Gemini GenerateContentResponse chunk.
|
| 500 |
+
# The 'logprobs' would be on the candidate, not on gemini_content_part.
|
| 501 |
+
# We need to access logprobs from the original chunk's candidate.
|
| 502 |
+
if hasattr(chunk, 'candidates') and chunk.candidates and hasattr(chunk.candidates[0], 'logprobs'):
|
| 503 |
+
chunk_data["choices"][0]["logprobs"] = getattr(chunk.candidates[0], 'logprobs', None)
|
| 504 |
return f"data: {json.dumps(chunk_data)}\n\n"
|
| 505 |
|
| 506 |
def create_final_chunk(model: str, response_id: str, candidate_count: int = 1) -> str:
|
app/routes/chat_api.py
CHANGED
|
@@ -228,16 +228,42 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
|
|
| 228 |
)
|
| 229 |
async for chunk in stream_response:
|
| 230 |
try:
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
print(f"ERROR: {error_msg_chunk}")
|
| 235 |
# Truncate
|
| 236 |
if len(error_msg_chunk) > 1024:
|
| 237 |
error_msg_chunk = error_msg_chunk[:1024] + "..."
|
| 238 |
error_response_chunk = create_openai_error_response(500, error_msg_chunk, "server_error")
|
| 239 |
-
json_payload_for_chunk_error = json.dumps(error_response_chunk)
|
| 240 |
-
print(f"DEBUG: Yielding chunk
|
| 241 |
yield f"data: {json_payload_for_chunk_error}\n\n"
|
| 242 |
yield "data: [DONE]\n\n"
|
| 243 |
return # Stop further processing for this request
|
|
@@ -263,7 +289,41 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
|
|
| 263 |
**openai_params,
|
| 264 |
extra_body=openai_extra_body
|
| 265 |
)
|
| 266 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
except Exception as generate_error:
|
| 268 |
error_msg_generate = f"Error calling OpenAI client for {request.model}: {str(generate_error)}"
|
| 269 |
print(f"ERROR: {error_msg_generate}")
|
|
|
|
| 228 |
)
|
| 229 |
async for chunk in stream_response:
|
| 230 |
try:
|
| 231 |
+
chunk_as_dict = chunk.model_dump(exclude_unset=True, exclude_none=True)
|
| 232 |
+
print(chunk_as_dict)
|
| 233 |
+
|
| 234 |
+
# Safely navigate and check for thought flag
|
| 235 |
+
choices = chunk_as_dict.get('choices')
|
| 236 |
+
if choices and isinstance(choices, list) and len(choices) > 0:
|
| 237 |
+
delta = choices[0].get('delta')
|
| 238 |
+
if delta and isinstance(delta, dict):
|
| 239 |
+
extra_content = delta.get('extra_content')
|
| 240 |
+
if isinstance(extra_content, dict):
|
| 241 |
+
google_content = extra_content.get('google')
|
| 242 |
+
if isinstance(google_content, dict) and google_content.get('thought') is True:
|
| 243 |
+
# This is a thought chunk, modify chunk_as_dict's delta in place
|
| 244 |
+
reasoning_text = delta.get('content')
|
| 245 |
+
if reasoning_text is not None:
|
| 246 |
+
delta['reasoning_content'] = reasoning_text
|
| 247 |
+
|
| 248 |
+
if 'content' in delta:
|
| 249 |
+
del delta['content']
|
| 250 |
+
|
| 251 |
+
# Always delete extra_content for thought chunks
|
| 252 |
+
if 'extra_content' in delta:
|
| 253 |
+
del delta['extra_content']
|
| 254 |
+
|
| 255 |
+
# Yield the (potentially modified) dictionary as JSON
|
| 256 |
+
yield f"data: {json.dumps(chunk_as_dict)}\n\n"
|
| 257 |
+
|
| 258 |
+
except Exception as chunk_processing_error: # Catch errors from dict manipulation or json.dumps
|
| 259 |
+
error_msg_chunk = f"Error processing or serializing OpenAI chunk for {request.model}: {str(chunk_processing_error)}. Chunk: {str(chunk)[:200]}"
|
| 260 |
print(f"ERROR: {error_msg_chunk}")
|
| 261 |
# Truncate
|
| 262 |
if len(error_msg_chunk) > 1024:
|
| 263 |
error_msg_chunk = error_msg_chunk[:1024] + "..."
|
| 264 |
error_response_chunk = create_openai_error_response(500, error_msg_chunk, "server_error")
|
| 265 |
+
json_payload_for_chunk_error = json.dumps(error_response_chunk) # Ensure json is imported
|
| 266 |
+
print(f"DEBUG: Yielding chunk processing error JSON payload (OpenAI path): {json_payload_for_chunk_error}")
|
| 267 |
yield f"data: {json_payload_for_chunk_error}\n\n"
|
| 268 |
yield "data: [DONE]\n\n"
|
| 269 |
return # Stop further processing for this request
|
|
|
|
| 289 |
**openai_params,
|
| 290 |
extra_body=openai_extra_body
|
| 291 |
)
|
| 292 |
+
response_dict = response.model_dump(exclude_unset=True, exclude_none=True)
|
| 293 |
+
|
| 294 |
+
# Process reasoning_tokens for non-streaming response
|
| 295 |
+
try:
|
| 296 |
+
usage = response_dict.get('usage')
|
| 297 |
+
if usage and isinstance(usage, dict):
|
| 298 |
+
completion_details = usage.get('completion_tokens_details')
|
| 299 |
+
if completion_details and isinstance(completion_details, dict):
|
| 300 |
+
num_reasoning_tokens = completion_details.get('reasoning_tokens')
|
| 301 |
+
|
| 302 |
+
if isinstance(num_reasoning_tokens, int) and num_reasoning_tokens > 0:
|
| 303 |
+
choices = response_dict.get('choices')
|
| 304 |
+
if choices and isinstance(choices, list) and len(choices) > 0:
|
| 305 |
+
# Ensure choices[0] and message are dicts, model_dump makes them so
|
| 306 |
+
message_dict = choices[0].get('message')
|
| 307 |
+
if message_dict and isinstance(message_dict, dict):
|
| 308 |
+
full_content = message_dict.get('content')
|
| 309 |
+
if isinstance(full_content, str): # Ensure content is a string
|
| 310 |
+
reasoning_text = full_content[:num_reasoning_tokens]
|
| 311 |
+
actual_content = full_content[num_reasoning_tokens:]
|
| 312 |
+
|
| 313 |
+
message_dict['reasoning_content'] = reasoning_text
|
| 314 |
+
message_dict['content'] = actual_content
|
| 315 |
+
|
| 316 |
+
# Clean up Vertex-specific field
|
| 317 |
+
del completion_details['reasoning_tokens']
|
| 318 |
+
if not completion_details: # If dict is now empty
|
| 319 |
+
del usage['completion_tokens_details']
|
| 320 |
+
if not usage: # If dict is now empty
|
| 321 |
+
del response_dict['usage']
|
| 322 |
+
except Exception as e_non_stream_reasoning:
|
| 323 |
+
print(f"WARNING: Could not process non-streaming reasoning tokens for model {request.model}: {e_non_stream_reasoning}. Response will be returned as is from Vertex.")
|
| 324 |
+
# Fallthrough to return response_dict as is if processing fails
|
| 325 |
+
|
| 326 |
+
return JSONResponse(content=response_dict)
|
| 327 |
except Exception as generate_error:
|
| 328 |
error_msg_generate = f"Error calling OpenAI client for {request.model}: {str(generate_error)}"
|
| 329 |
print(f"ERROR: {error_msg_generate}")
|